From 5d0c0a665df683df2021767baa70bf0d41b263f8 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 7 Apr 2018 12:27:07 +0200 Subject: [PATCH 01/24] x17: apply echo512 improvement add a tiny 1% on x17, better than nothing... --- x16/x16r.cu | 3 ++- x16/x16s.cu | 3 ++- x17/x17.cu | 18 +++++++++++++++--- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/x16/x16r.cu b/x16/x16r.cu index 1319c22..0a42be0 100644 --- a/x16/x16r.cu +++ b/x16/x16r.cu @@ -491,8 +491,9 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, case ECHO: if (use_compat_kernels[thr_id]) x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - else + else { x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } TRACE("echo :"); break; case HAMSI: diff --git a/x16/x16s.cu b/x16/x16s.cu index 36aeacb..080ff74 100644 --- a/x16/x16s.cu +++ b/x16/x16s.cu @@ -489,8 +489,9 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, case ECHO: if (use_compat_kernels[thr_id]) x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - else + else { x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } TRACE("echo :"); break; case HAMSI: diff --git a/x17/x17.cu b/x17/x17.cu index 816e5e0..3536cdc 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -32,6 +32,8 @@ extern "C" { static uint32_t *d_hash[MAX_GPUS]; +extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -151,12 +153,14 @@ extern "C" void x17hash(void *output, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); @@ -166,7 +170,7 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage @@ -174,6 +178,11 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); @@ -183,7 +192,6 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u x11_luffaCubehash512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); x13_hamsi512_cpu_init(thr_id, throughput); x13_fugue512_cpu_init(thr_id, throughput); x14_shabal512_cpu_init(thr_id, throughput); @@ -220,7 +228,11 @@ extern "C" 
int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); From 16ebe53b72d7c02f95fc948579876f08971bfc66 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 9 Apr 2018 17:45:20 +0200 Subject: [PATCH 02/24] x12: apply echo512 optimised kernel on recent cards --- x12/x12.cu | 20 ++++++++++++++++---- x16/x16r.cu | 2 +- x16/x16s.cu | 2 +- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/x12/x12.cu b/x12/x12.cu index 1cf862b..c0fd623 100644 --- a/x12/x12.cu +++ b/x12/x12.cu @@ -22,6 +22,8 @@ extern "C" { static uint32_t *d_hash[MAX_GPUS]; +extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -96,13 +98,15 @@ extern "C" void x12hash(void *output, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - int intensity = (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19; + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
20 : 19; uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8; //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); @@ -111,7 +115,7 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage @@ -120,13 +124,17 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); x11_luffaCubehash512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); if (x11_simd512_cpu_init(thr_id, throughput) != 0) { return 0; } - x11_echo512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); @@ -156,7 +164,11 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x16/x16r.cu b/x16/x16r.cu index 0a42be0..2caa5d0 100644 --- a/x16/x16r.cu +++ b/x16/x16r.cu @@ -250,7 +250,7 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage diff --git a/x16/x16s.cu b/x16/x16s.cu index 080ff74..382de41 100644 --- a/x16/x16s.cu +++ b/x16/x16s.cu @@ -248,7 +248,7 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage From bfcf7a9e52e9bf8277893b6e66dd038998d88610 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 22 Apr 2018 23:12:37 +0200 Subject: [PATCH 03/24] neoscrypt: add extra space for recent vstudio madness --- neoscrypt/cuda_neoscrypt.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu index 9ea3b75..59d73b7 100644 --- a/neoscrypt/cuda_neoscrypt.cu +++ b/neoscrypt/cuda_neoscrypt.cu @@ -179,7 +179,7 @@ static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2) idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ a += b; d = rotateL(d^a, 16); \ c += d; b = 
rotateR(b^c, 12); \ - idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } @@ -392,7 +392,7 @@ void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const ui idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ a += b; d = __byte_perm(d^a, 0, 0x1032); \ c += d; b = rotateR(b^c, 12); \ - idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ a += b; d = __byte_perm(d^a, 0, 0x0321); \ c += d; b = rotateR(b^c, 7); \ } @@ -1260,7 +1260,7 @@ uint32_t fastkdf32_v3(uint32_t thread, const uint32_t nonce, uint32_t* const sal idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ a += b; d = ROTR32(d^a,16); \ c += d; b = ROTR32(b^c, 12); \ - idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ + idx = BLAKE2S_SIGMA_host[idx0][idx1 + 1]; a += key[idx]; \ a += b; d = ROTR32(d^a,8); \ c += d; b = ROTR32(b^c, 7); \ } From b97567a451f0069612d20db71912f7b85996d52f Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 6 May 2018 18:04:10 +0200 Subject: [PATCH 04/24] allium algo --- Makefile.am | 1 + algos.h | 2 + bench.cpp | 1 + ccminer.cpp | 5 + ccminer.vcxproj | 1 + ccminer.vcxproj.filters | 3 + configure.ac | 2 +- lyra2/allium.cu | 213 ++++++++++++++++++++++++++++++++++++++++ miner.h | 3 + util.cpp | 3 + 10 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 lyra2/allium.cu diff --git a/Makefile.am b/Makefile.am index d7d2a0b..8f33d48 100644 --- a/Makefile.am +++ b/Makefile.am @@ -39,6 +39,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \ lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \ + lyra2/allium.cu \ Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu \ Algo256/cuda_keccak256_sm3.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ diff --git a/algos.h b/algos.h index ed0ff83..f141086 100644 --- a/algos.h +++ b/algos.h @@ -8,6 +8,7 @@ enum sha_algos { ALGO_BLAKECOIN = 0, ALGO_BLAKE, ALGO_BLAKE2S, + ALGO_ALLIUM, ALGO_BMW, ALGO_BASTION, ALGO_C11, @@ -80,6 +81,7 @@ static const char *algo_names[] = { "blakecoin", "blake", "blake2s", + "allium", "bmw", "bastion", "c11", diff --git a/bench.cpp b/bench.cpp index eeeee60..84f9bc5 100644 --- a/bench.cpp +++ b/bench.cpp @@ -49,6 +49,7 @@ void bench_free() void algo_free_all(int thr_id) { // only initialized algos will be freed + free_allium(thr_id); free_bastion(thr_id); free_bitcore(thr_id); free_blake256(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 87cd26c..770d5d5 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1698,6 +1698,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_SCRYPT_JANE: work_set_target(work, sctx->job.diff / (65536.0 * opt_difficulty)); break; + case ALGO_ALLIUM: case ALGO_DMD_GR: case ALGO_FRESH: case ALGO_FUGUE256: @@ -2234,6 +2235,7 @@ static void *miner_thread(void *userdata) case ALGO_TRIBUS: minmax = 0x1000000; break; + case ALGO_ALLIUM: case ALGO_C11: case ALGO_DEEP: case ALGO_HEAVY: @@ -2323,6 +2325,9 @@ static void *miner_thread(void *userdata) /* scan nonces for a proof-of-work hash */ switch (opt_algo) { + case ALGO_ALLIUM: + rc = scanhash_allium(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_BASTION: rc = scanhash_bastion(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 
f995f4a..1db063e 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -519,6 +519,7 @@ + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 4c1b8d6..b2ee453 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -910,6 +910,9 @@ Source Files\CUDA\Algo256 + + Source Files\CUDA\lyra2 + Source Files\CUDA\lyra2 diff --git a/configure.ac b/configure.ac index 08a340f..e164456 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2.5], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.2.6], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/lyra2/allium.cu b/lyra2/allium.cu new file mode 100644 index 0000000..931e6bc --- /dev/null +++ b/lyra2/allium.cu @@ -0,0 +1,213 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_keccak.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_skein.h" +#include "sph/sph_groestl.h" +#include "lyra2/Lyra2.h" +} + +#include +#include + +static uint64_t* d_hash[MAX_GPUS]; +static uint64_t* d_matrix[MAX_GPUS]; + +extern void blake256_cpu_init(int thr_id, uint32_t threads); +extern void blake256_cpu_setBlock_80(uint32_t *pdata); +//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + +//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +//extern void keccak256_sm3_init(int thr_id, uint32_t threads); +//extern void keccak256_sm3_free(int thr_id); + +extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + +extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); + +extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); + +extern void skein256_cpu_init(int thr_id, uint32_t threads); + +extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); + +extern void groestl256_cpu_init(int thr_id, uint32_t threads); +extern void groestl256_cpu_free(int thr_id); +extern void groestl256_setTarget(const void *ptarget); +extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order); +extern uint32_t groestl256_getSecNonce(int thr_id, int num); + + +extern "C" void allium_hash(void *state, const void *input) +{ + uint32_t hashA[8], hashB[8]; + + sph_blake256_context ctx_blake; + sph_keccak256_context ctx_keccak; + sph_cubehash256_context ctx_cube; + sph_skein256_context ctx_skein; + sph_groestl256_context ctx_groestl; + + sph_blake256_set_rounds(14); + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, hashA); + + sph_keccak256_init(&ctx_keccak); + sph_keccak256(&ctx_keccak, hashA, 32); + sph_keccak256_close(&ctx_keccak, hashB); + + LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashA, 32); + sph_cubehash256_close(&ctx_cube, hashB); + + LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + + sph_skein256_init(&ctx_skein); + sph_skein256(&ctx_skein, hashA, 32); + sph_skein256_close(&ctx_skein, hashB); + + sph_groestl256_init(&ctx_groestl); + 
sph_groestl256(&ctx_groestl, hashB, 32); + sph_groestl256_close(&ctx_groestl, hashA); + + memcpy(state, hashA, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static __thread uint32_t throughput = 0; + +extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + if (opt_benchmark) + ptarget[7] = 0x00ff; + + static __thread bool gtx750ti; + if (!init[thr_id]) + { + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); + CUDA_LOG_ERROR(); + + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; + if (device_sm[device_map[thr_id]] == 500) intensity = 15; + throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); + + if (strstr(props.name, "750 Ti")) gtx750ti = true; + else gtx750ti = false; + + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + blake256_cpu_init(thr_id, throughput); + //keccak256_sm3_init(thr_id, throughput); + skein256_cpu_init(thr_id, throughput); + groestl256_cpu_init(thr_id, throughput); + + //cuda_get_arch(thr_id); + if (device_sm[dev_id] >= 500) + { + size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); + lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + init[thr_id] = true; + } + + uint32_t _ALIGN(128) endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + blake256_cpu_setBlock_80(pdata); + groestl256_setTarget(ptarget); + + do { + int order = 0; + + //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], work->nonces[0]); + allium_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = groestl256_getSecNonce(thr_id, 1); + if (work->nonces[1] != UINT32_MAX) { + be32enc(&endiandata[19], work->nonces[1]); + allium_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); 
+ pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_allium(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_matrix[thr_id]); + + //keccak256_sm3_free(thr_id); + groestl256_cpu_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/miner.h b/miner.h index 6d90518..16f57ab 100644 --- a/miner.h +++ b/miner.h @@ -273,6 +273,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len); struct work; +extern int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bastion(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds); extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -339,6 +340,7 @@ extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonc /* free device allocated memory per algo */ void algo_free_all(int thr_id); +extern void free_allium(int thr_id); extern void free_bastion(int thr_id); extern void free_bitcore(int thr_id); extern void free_blake256(int thr_id); @@ -887,6 +889,7 @@ void applog_hash64(void *hash); void applog_compare_hash(void *hash, void *hash_ref); void print_hash_tests(void); +void allium_hash(void *state, const void *input); void bastionhash(void* output, const unsigned char* input); void blake256hash(void *output, const void *input, int8_t rounds); void blake2b_hash(void *output, const void *input); diff --git a/util.cpp b/util.cpp index dc20c2a..70dc626 100644 --- a/util.cpp +++ b/util.cpp @@ -2164,6 +2164,9 @@ void print_hash_tests(void) printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n"); + allium_hash(&hash[0], &buf[0]); + printpfx("allium", hash); + bastionhash(&hash[0], &buf[0]); printpfx("bastion", hash); From ffd6cf38bf43387cda0fab567ba360f637f8fa18 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 6 May 2018 18:16:35 +0200 Subject: [PATCH 05/24] update readme and win ver --- README.txt | 6 +++++- ccminer.cpp | 1 + compat/ccminer-config.h | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.txt b/README.txt index 59a2cec..af0718d 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.2.5 (Apr 2018) "x12, x16r and x16s algos" +ccminer 2.2.6 (Under Dev) --------------------------------------------------------------- *************************************************************** @@ -73,6 +73,7 @@ This code is based on the pooler cpuminer and inherits its command line interface and options. -a, --algo=ALGO specify the algorithm to use + allium use to mine Garlic bastion use to mine Joincoin bitcore use to mine Bitcore's Timetravel10 blake use to mine Saffroncoin (Blake256) @@ -281,6 +282,9 @@ so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< + 2018 v2.2.6 + New allium algo for Garlic + Apr. 
02nd 2018 v2.2.5 New x16r algo for Raven New x16s algo for Pigeon and Eden diff --git a/ccminer.cpp b/ccminer.cpp index 770d5d5..a48b194 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -236,6 +236,7 @@ static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the hash algorithm to use\n\ + allium Garlic double lyra2\n\ bastion Hefty bastion\n\ bitcore Timetravel-10\n\ blake Blake 256 (SFR)\n\ diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index 17efd4c..5b36078 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.2.5" +#define PACKAGE_VERSION "2.2.6" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be From 57f8f776fb1819d253e34f615d46a83c0490f77b Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 9 May 2018 14:57:49 +0200 Subject: [PATCH 06/24] timetravel: cleanup, remove unused algos + cubehash 80 midstate --- x11/cuda_x11_cubehash512.cu | 54 +++++++++++++++++++++++++----- x11/timetravel.cu | 67 +++---------------------------------- 2 files changed, 51 insertions(+), 70 deletions(-) diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu index f7ce97c..b5aa534 100644 --- a/x11/cuda_x11_cubehash512.cu +++ b/x11/cuda_x11_cubehash512.cu @@ -259,16 +259,32 @@ void x11_cubehash512_cpu_init(int thr_id, uint32_t threads) { } /***************************************************/ -#define WANT_CUBEHASH80 -#ifdef WANT_CUBEHASH80 +/** + * Timetravel and x16 CUBEHASH-80 CUDA implementation + * by tpruvot@github - Jan 2017 / May 2018 + */ -__constant__ -static uint32_t c_PaddedMessage80[20]; +__constant__ static uint32_t c_midstate128[32]; +__constant__ static uint32_t c_PaddedMessage80[20]; + +#undef SPH_C32 +#undef SPH_C64 +#undef SPH_T32 +#undef SPH_T64 +#include "sph/sph_cubehash.h" __host__ void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata) { + sph_cubehash512_context ctx_cubehash; + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64); +#ifndef NO_MIDSTATE + cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_PaddedMessage80, &endiandata[16], 16, 0, cudaMemcpyHostToDevice); +#else cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +#endif } __global__ @@ -278,11 +294,11 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, if (thread < threads) { const uint32_t nonce = startNounce + thread; - + uint32_t message[8]; uint32_t x[2][2][2][2][2]; +#ifdef NO_MIDSTATE Init(x); - uint32_t message[8]; // first 32 bytes AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]); AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[4]); @@ -293,8 +309,31 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[12]); Update32(x, message); - // last 16 bytes + Padding + // last 16 bytes AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[16]); +#else + AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]); + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]); + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]); + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]); + AS_UINT2(x[0][1][0][0]) = 
AS_UINT2(&c_midstate128[ 8]); + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]); + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]); + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]); + + AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_midstate128[16]); + AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]); + AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]); + AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]); + AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]); + AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]); + AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]); + AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]); + + // last 16 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]); +#endif + // nonce + Padding message[3] = cuda_swab32(nonce); message[4] = 0x80; message[5] = 0; @@ -317,4 +356,3 @@ void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const ui cubehash512_gpu_hash_80 <<>> (threads, startNounce, (uint64_t*) d_hash); } -#endif \ No newline at end of file diff --git a/x11/timetravel.cu b/x11/timetravel.cu index 93c3fd1..8d157f2 100644 --- a/x11/timetravel.cu +++ b/x11/timetravel.cu @@ -20,11 +20,6 @@ extern "C" { #include "sph/sph_keccak.h" #include "sph/sph_luffa.h" #include "sph/sph_cubehash.h" -#if HASH_FUNC_COUNT > 8 -#include "sph/sph_shavite.h" -#include "sph/sph_simd.h" -#include "sph/sph_echo.h" -#endif } #include "miner.h" @@ -42,11 +37,6 @@ enum Algo { KECCAK, LUFFA, CUBEHASH, -#if HASH_FUNC_COUNT > 8 - SHAVITE, - SIMD, - ECHO, -#endif MAX_ALGOS_COUNT }; @@ -153,11 +143,6 @@ extern "C" void timetravel_hash(void *output, const void *input) sph_keccak512_context ctx_keccak; sph_luffa512_context ctx_luffa1; sph_cubehash512_context ctx_cubehash1; -#if HASH_FUNC_COUNT > 8 - sph_shavite512_context ctx_shavite1; - sph_simd512_context ctx_simd1; - sph_echo512_context ctx_echo1; -#endif if (s_sequence == UINT32_MAX) { uint32_t *data = (uint32_t*) input; @@ -175,11 +160,6 @@ extern "C" void timetravel_hash(void *output, const void *input) const char elem = hashOrder[i]; uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; - if (i > 0) { - in = (void*) hash; - size = 64; - } - switch (algo) { case BLAKE: sph_blake512_init(&ctx_blake); @@ -195,7 +175,6 @@ extern "C" void timetravel_hash(void *output, const void *input) sph_groestl512_init(&ctx_groestl); sph_groestl512(&ctx_groestl, in, size); sph_groestl512_close(&ctx_groestl, hash); - //applog_hex((void*)hash, 32); break; case SKEIN: sph_skein512_init(&ctx_skein); @@ -222,24 +201,10 @@ extern "C" void timetravel_hash(void *output, const void *input) sph_cubehash512(&ctx_cubehash1, in, size); sph_cubehash512_close(&ctx_cubehash1, hash); break; -#if HASH_FUNC_COUNT > 8 - case SHAVITE: - sph_shavite512_init(&ctx_shavite1); - sph_shavite512(&ctx_shavite1, in, size); - sph_shavite512_close(&ctx_shavite1, hash); - break; - case SIMD: - sph_simd512_init(&ctx_simd1); - sph_simd512(&ctx_simd1, in, size); - sph_simd512_close(&ctx_simd1, hash); - break; - case ECHO: - sph_echo512_init(&ctx_echo1); - sph_echo512(&ctx_echo1, in, size); - sph_echo512_close(&ctx_echo1, hash); - break; -#endif } + + in = (void*) hash; + size = 64; } memcpy(output, hash, 32); @@ -330,13 +295,7 @@ extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_n qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes) x11_luffa512_cpu_init(thr_id, throughput); x11_cubehash512_cpu_init(thr_id, throughput); -#if HASH_FUNC_COUNT > 8 - x11_shavite512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - if (x11_simd512_cpu_init(thr_id, throughput) != 0) { - return 0; - } -#endif + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1); CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1); @@ -471,20 +430,6 @@ extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_n x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("cube :"); break; -#if HASH_FUNC_COUNT > 8 - case SHAVITE: - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("shavite:"); - break; - case SIMD: - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("simd :"); - break; - case ECHO: - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("echo :"); - break; -#endif } } @@ -544,9 +489,7 @@ extern "C" void free_timetravel(int thr_id) quark_blake512_cpu_free(thr_id); quark_groestl512_cpu_free(thr_id); -#if HASH_FUNC_COUNT > 8 - x11_simd512_cpu_free(thr_id); -#endif + cuda_check_cpu_free(thr_id); init[thr_id] = false; From a9357e1ec84fa0be584f126f9c852bc3372ada27 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 10 May 2018 06:31:25 +0200 Subject: [PATCH 07/24] lyra2: remove unused nonce param --- lyra2/allium.cu | 6 +++--- lyra2/cuda_lyra2.cu | 30 +++++++++++++++--------------- lyra2/cuda_lyra2_sm2.cuh | 4 ++-- lyra2/cuda_lyra2_sm5.cuh | 12 ++++++------ lyra2/lyra2RE.cu | 4 ++-- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/lyra2/allium.cu b/lyra2/allium.cu index 931e6bc..6492c92 100644 --- a/lyra2/allium.cu +++ b/lyra2/allium.cu @@ -30,7 +30,7 @@ extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t start extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); -extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); 
+extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti); extern void groestl256_cpu_init(int thr_id, uint32_t threads); extern void groestl256_cpu_free(int thr_id); @@ -141,9 +141,9 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti); cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti); skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); *hashes_done = pdata[19] - first_nonce + throughput; diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index 7905d23..a280200 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -409,7 +409,7 @@ __constant__ uint2x4 blake2b_IV[2] = { }; __global__ __launch_bounds__(64, 1) -void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -436,7 +436,7 @@ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) __global__ __launch_bounds__(TPB52, 1) -void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) { const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; @@ -481,7 +481,7 @@ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_has } __global__ __launch_bounds__(64, 1) -void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) { const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; @@ -502,7 +502,7 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) g_hash[thread + threads * 2] = state[0].z; g_hash[thread + threads * 3] = state[0].w; - } //thread + } } #else #if __CUDA_ARCH__ < 500 @@ -510,9 +510,9 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) /* for unsupported SM arch */ __device__ void* DMatrix; #endif -__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} -__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} -__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {} +__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {} #endif __host__ @@ -523,7 +523,7 @@ void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) } __host__ -void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti) +void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx750ti) { int dev_id = device_map[thr_id % MAX_GPUS]; @@ -544,11 +544,11 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t 
startNounce, uint6 if (cuda_arch[dev_id] >= 520) { - lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, startNounce, d_hash); + lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash); - lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash); } else if (cuda_arch[dev_id] >= 500) { @@ -561,12 +561,12 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6 // suitable amount to adjust for 10warp shared_mem = 6144; - lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); } else - lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash); + lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash); } diff --git a/lyra2/cuda_lyra2_sm2.cuh b/lyra2/cuda_lyra2_sm2.cuh index 18263b2..da621d0 100644 --- a/lyra2/cuda_lyra2_sm2.cuh +++ b/lyra2/cuda_lyra2_sm2.cuh @@ -131,7 +131,7 @@ void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, } __global__ __launch_bounds__(TPB30, 1) -void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -224,5 +224,5 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h #else /* if __CUDA_ARCH__ < 200 .. 
host */ -__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} +__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {} #endif diff --git a/lyra2/cuda_lyra2_sm5.cuh b/lyra2/cuda_lyra2_sm5.cuh index fc13172..4a3caeb 100644 --- a/lyra2/cuda_lyra2_sm5.cuh +++ b/lyra2/cuda_lyra2_sm5.cuh @@ -589,7 +589,7 @@ void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thr } __global__ __launch_bounds__(64, 1) -void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -622,7 +622,7 @@ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha } __global__ __launch_bounds__(TPB50, 1) -void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); @@ -662,7 +662,7 @@ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha } __global__ __launch_bounds__(64, 1) -void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -687,7 +687,7 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha #else /* if __CUDA_ARCH__ != 500 .. host */ -__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} -__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} -__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {} #endif diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index b3ad49f..b435371 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -26,7 +26,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); -extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti); extern void groestl256_cpu_init(int thr_id, uint32_t threads); extern void groestl256_cpu_free(int thr_id); @@ -130,7 +130,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti); skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); *hashes_done = pdata[19] - first_nonce + throughput; From b8190e4aa77c2925b0199f3fbfc22571f279925b Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 17 May 2018 19:26:00 +0200 Subject: 
[PATCH 08/24] allium: add missing device cpu flag for linux --- lyra2/allium.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lyra2/allium.cu b/lyra2/allium.cu index 6492c92..65dbbe3 100644 --- a/lyra2/allium.cu +++ b/lyra2/allium.cu @@ -95,7 +95,11 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce { int dev_id = device_map[thr_id]; cudaSetDevice(dev_id); - CUDA_LOG_ERROR(); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; if (device_sm[device_map[thr_id]] == 500) intensity = 15; From 3d03a1b9fd0a14c65f231ed65f9cebb267e63a4f Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 28 May 2018 15:21:00 +0200 Subject: [PATCH 09/24] phi2 algo --- Makefile.am | 2 +- algos.h | 2 + ccminer.cpp | 8 +- ccminer.vcxproj | 5 +- ccminer.vcxproj.filters | 20 ++- lyra2/cuda_lyra2.cu | 122 ++++++++++++++--- lyra2/cuda_lyra2_sm2.cuh | 65 ++++++++- lyra2/cuda_lyra2_sm5.cuh | 64 ++++++++- miner.h | 5 +- phi/cuda_phi2.cu | 89 ++++++++++++ {x11 => phi}/phi.cu | 8 +- phi/phi2.cu | 255 +++++++++++++++++++++++++++++++++++ util.cpp | 2 +- x11/cuda_streebog_maxwell.cu | 21 ++- x16/cuda_x16_echo512_64.cu | 26 +++- 15 files changed, 648 insertions(+), 46 deletions(-) create mode 100644 phi/cuda_phi2.cu rename {x11 => phi}/phi.cu (97%) create mode 100644 phi/phi2.cu diff --git a/Makefile.am b/Makefile.am index 8f33d48..5d5652c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,7 +81,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \ x16/cuda_x16_echo512_64.cu \ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ - x11/phi.cu x11/cuda_streebog_maxwell.cu \ + phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu x11/cuda_streebog_maxwell.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu # scrypt diff --git a/algos.h b/algos.h index f141086..229d8e9 100644 --- a/algos.h +++ b/algos.h @@ -39,6 +39,7 @@ enum sha_algos { ALGO_NIST5, ALGO_PENTABLAKE, ALGO_PHI, + ALGO_PHI2, ALGO_POLYTIMOS, ALGO_QUARK, ALGO_QUBIT, @@ -112,6 +113,7 @@ static const char *algo_names[] = { "nist5", "penta", "phi", + "phi2", "polytimos", "quark", "qubit", diff --git a/ccminer.cpp b/ccminer.cpp index a48b194..c1567a1 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -269,7 +269,8 @@ Options:\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ - phi BHCoin\n\ + phi LUX initial algo\n\ + phi2 LUX v2 with lyra2\n\ polytimos Politimos\n\ quark Quark\n\ qubit Qubit\n\ @@ -1708,6 +1709,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_LBRY: case ALGO_LYRA2v2: case ALGO_LYRA2Z: + case ALGO_PHI2: case ALGO_TIMETRAVEL: case ALGO_BITCORE: case ALGO_X16R: @@ -2245,6 +2247,7 @@ static void *miner_thread(void *userdata) case ALGO_HSR: case ALGO_LYRA2v2: case ALGO_PHI: + case ALGO_PHI2: case ALGO_POLYTIMOS: case ALGO_S3: case ALGO_SKUNK: @@ -2436,6 +2439,9 @@ static void *miner_thread(void *userdata) case ALGO_PHI: rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_PHI2: + rc = scanhash_phi2(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_POLYTIMOS: rc = scanhash_polytimos(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 1db063e..f20449a 100644 --- a/ccminer.vcxproj +++ 
b/ccminer.vcxproj @@ -525,6 +525,7 @@ + @@ -537,6 +538,9 @@ 48 + + + compute_50,sm_50;compute_52,sm_52 @@ -567,7 +571,6 @@ - diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index b2ee453..96220ae 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -115,7 +115,10 @@ {1e548d79-c217-4203-989a-a592fe2b2de3} - + + {311e8d79-1612-4f0f-8591-23a592f2b2d3} + + {xde48d89-fx12-1323-129a-b592fe2b2de3} @@ -545,6 +548,9 @@ Source Files\CUDA\lyra2 + + Source Files\CUDA\lyra2 + Source Files\CUDA\lyra2 @@ -781,6 +787,15 @@ Source Files\CUDA + + Source Files\CUDA\phi + + + Source Files\CUDA\phi + + + Source Files\CUDA\phi + Source Files\CUDA\skunk @@ -799,9 +814,6 @@ Source Files\CUDA\tribus - - Source Files\CUDA\x11 - Source Files\CUDA\x11 diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index a280200..5cdb6ee 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -1,6 +1,7 @@ /** * Lyra2 (v1) cuda implementation based on djm34 work * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2) + * tpruvot@github 2018 for phi2 double lyra2-32 support */ #include @@ -228,9 +229,7 @@ void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads) { uint2 state1[3]; -#if __CUDA_ARCH__ > 500 -#pragma unroll -#endif + #pragma unroll for (int i = 0; i < Nrow; i++) { ST4S(0, Ncol - i - 1, state, thread, threads); @@ -305,7 +304,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin LD4S(state1, rowIn, i, thread, threads); LD4S(state2, rowInOut, i, thread, threads); -#pragma unroll + #pragma unroll for (int j = 0; j < 3; j++) state[j] ^= state1[j] + state2[j]; @@ -334,7 +333,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin LD4S(state1, rowOut, i, thread, threads); -#pragma unroll + #pragma unroll for (int j = 0; j < 3; j++) state1[j] ^= state[j]; @@ -412,11 +411,9 @@ __global__ __launch_bounds__(64, 1) void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { uint2x4 state[4]; - state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]); state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]); state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]); @@ -436,10 +433,9 @@ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) __global__ __launch_bounds__(TPB52, 1) -void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) +void lyra2_gpu_hash_32_2(const uint32_t threads, uint64_t *g_hash) { const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; - if (thread < threads) { uint2 state[4]; @@ -484,11 +480,9 @@ __global__ __launch_bounds__(64, 1) void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) { const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; - - uint28 state[4]; - if (thread < threads) { + uint2x4 state[4]; state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); @@ -501,7 +495,57 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) g_hash[thread + threads * 1] = state[0].y; g_hash[thread + threads * 2] = state[0].z; g_hash[thread + threads * 3] = state[0].w; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + const 
size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *psrc = (uint2*)(&d_hash_512[offset]); + state[0].x = state[1].x = __ldg(&psrc[0]); + state[0].y = state[1].y = __ldg(&psrc[1]); + state[0].z = state[1].z = __ldg(&psrc[2]); + state[0].w = state[1].w = __ldg(&psrc[3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + for (int i = 0; i<24; i++) + round_lyra(state); + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) +{ + // This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 rounds + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *pdst = (uint2*)(&d_hash_512[offset]); + pdst[0] = state[0].x; + pdst[1] = state[0].y; + pdst[2] = state[0].z; + pdst[3] = state[0].w; } } #else @@ -513,6 +557,8 @@ __device__ void* DMatrix; __global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {} __global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {} __global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {} +__global__ void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {} #endif __host__ @@ -545,9 +591,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7 if (cuda_arch[dev_id] >= 520) { lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash); - lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash); } else if (cuda_arch[dev_id] >= 500) @@ -562,11 +606,57 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7 shared_mem = 6144; lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); } else lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash); } + +__host__ +void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t tpb = TPB52; + if (cuda_arch[dev_id] >= 520) tpb = TPB52; + else if (cuda_arch[dev_id] >= 500) tpb = TPB50; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; + + dim3 grid1((size_t(threads) * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); + + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + if (cuda_arch[dev_id] >= 520) + { + const size_t shared_mem = sizeof(uint2) * tpb * 192; // 49152; + lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256); + 
lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + + lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256); + lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + } + else if (cuda_arch[dev_id] >= 500) + { + size_t shared_mem = gtx750ti ? 8192 : 6144; // 8 or 10 warps + lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256); + lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + + lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256); + lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + } + else { + // alternative method for SM 3.x + hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0); + lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti); + hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0); + hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1); + lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti); + hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1); + } +} diff --git a/lyra2/cuda_lyra2_sm2.cuh b/lyra2/cuda_lyra2_sm2.cuh index da621d0..cc0bd82 100644 --- a/lyra2/cuda_lyra2_sm2.cuh +++ b/lyra2/cuda_lyra2_sm2.cuh @@ -3,7 +3,7 @@ #ifdef __INTELLISENSE__ /* just for vstudio code colors, only uncomment that temporary, dont commit it */ //#undef __CUDA_ARCH__ -//#define __CUDA_ARCH__ 500 +//#define __CUDA_ARCH__ 300 #endif #include "cuda_helper.h" @@ -226,3 +226,66 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) /* if __CUDA_ARCH__ < 200 .. host */ __global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {} #endif + +// ------------------------------------------------------------------------------------------------------------------------- + +// lyra2 cant be used as-is in 512-bits hash chains, tx to djm for these weird offsets since first lyra2 algo... + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350 + +__global__ __launch_bounds__(128, 8) +void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const size_t offset = (size_t) 16 * thread + (round * 8U); + uint2 *psrc = (uint2*) (&d_hash64[offset]); + uint2 *pdst = (uint2*) (&d_hash_lyra[thread]); + pdst[threads*0] = __ldg(&psrc[0]); + pdst[threads*1] = __ldg(&psrc[1]); + pdst[threads*2] = __ldg(&psrc[2]); + pdst[threads*3] = __ldg(&psrc[3]); + } +} + +__global__ __launch_bounds__(128, 8) +void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const size_t offset = (size_t) 16 * thread + (round * 8U); + uint2 *psrc = (uint2*) (&d_hash_lyra[thread]); + uint2 *pdst = (uint2*) (&d_hash64[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[threads*1]; + pdst[2] = psrc[threads*2]; + pdst[3] = psrc[threads*3]; + } +} +#else +/* if __CUDA_ARCH__ < 200 .. 
host */ +__global__ void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {} +__global__ void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {} +#endif + +__host__ +void hash64_to_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + hash64_to_lyra32_gpu <<>> (threads, d_hash64, (uint2*) d_hash_lyra, round); +} + +__host__ +void hash64_from_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + hash64_from_lyra32_gpu <<>> (threads, d_hash64, (uint2*) d_hash_lyra, round); +} diff --git a/lyra2/cuda_lyra2_sm5.cuh b/lyra2/cuda_lyra2_sm5.cuh index 4a3caeb..85adfd9 100644 --- a/lyra2/cuda_lyra2_sm5.cuh +++ b/lyra2/cuda_lyra2_sm5.cuh @@ -591,13 +591,12 @@ void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thr __global__ __launch_bounds__(64, 1) void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) { - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - const uint2x4 blake2b_IV[2] = { { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } }; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint2x4 state[4]; @@ -629,7 +628,6 @@ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) if (thread < threads) { uint2 state[4]; - state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]); state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]); state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]); @@ -669,7 +667,6 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) if (thread < threads) { uint2x4 state[4]; - state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]); state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]); state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]); @@ -685,9 +682,68 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) } } +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) +{ + const uint2x4 blake2b_IV[2] = { + { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, + { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } + }; + // This kernel loads 2x 256-bits hashes from 512-bits chain offsets in 2 steps + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *psrc = (uint2*)(&d_hash_512[offset]); + state[0].x = state[1].x = __ldg(&psrc[0]); + state[0].y = state[1].y = __ldg(&psrc[1]); + state[0].z = state[1].z = __ldg(&psrc[2]); + state[0].w = state[1].w = __ldg(&psrc[3]); + + state[1] = state[0]; + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; 
i<24; i++) + round_lyra(state); + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) +{ + // This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 steps + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *pdst = (uint2*)(&d_hash_512[offset]); + pdst[0] = state[0].x; + pdst[1] = state[0].y; + pdst[2] = state[0].z; + pdst[3] = state[0].w; + } +} #else /* if __CUDA_ARCH__ != 500 .. host */ __global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {} __global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {} __global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {} +__global__ void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {} #endif diff --git a/miner.h b/miner.h index 16f57ab..d3118dc 100644 --- a/miner.h +++ b/miner.h @@ -303,6 +303,7 @@ extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_polytimos(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -371,6 +372,7 @@ extern void free_neoscrypt(int thr_id); extern void free_nist5(int thr_id); extern void free_pentablake(int thr_id); extern void free_phi(int thr_id); +extern void free_phi2(int thr_id); extern void free_polytimos(int thr_id); extern void free_quark(int thr_id); extern void free_qubit(int thr_id); @@ -918,7 +920,8 @@ void myriadhash(void *state, const void *input); void neoscrypt(uchar *output, const uchar *input, uint32_t profile); void nist5hash(void *state, const void *input); void pentablakehash(void *output, const void *input); -void phihash(void *output, const void *input); +void phi_hash(void *output, const void *input); +void phi2_hash(void *output, const void *input); void polytimos_hash(void *output, const void *input); void quarkhash(void *state, const void *input); void qubithash(void *state, const void *input); diff --git a/phi/cuda_phi2.cu b/phi/cuda_phi2.cu new file mode 100644 index 0000000..a0bcf6d --- /dev/null +++ b/phi/cuda_phi2.cu @@ -0,0 +1,89 @@ 
+#include <stdio.h> +#include <memory.h> + +#include "cuda_helper.h" + +__global__ __launch_bounds__(128, 8) +void phi_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_hash[offset]); + d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 1; + if (d_NonceBranch[thread]) return; + if (d_branch2) { + uint4 *pdst = (uint4*)(&d_branch2[offset]); + uint4 data; + data = psrc[0]; pdst[0] = data; + data = psrc[1]; pdst[1] = data; + data = psrc[2]; pdst[2] = data; + data = psrc[3]; pdst[3] = data; + } + } +} + +__global__ __launch_bounds__(128, 8) +void phi_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads && !d_NonceBranch[thread]) + { + const uint32_t offset = thread * 16U; + uint4 *psrc = (uint4*) (&d_branch2[offset]); + uint4 *pdst = (uint4*) (&d_hash[offset]); + uint4 data; + data = psrc[0]; pdst[0] = data; + data = psrc[1]; pdst[1] = data; + data = psrc[2]; pdst[2] = data; + data = psrc[3]; pdst[3] = data; + } +} + +__global__ +void phi_final_compress_gpu(const uint32_t threads, uint32_t* d_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; + uint2 *psrc = (uint2*) (&d_hash[offset]); + uint2 *pdst = (uint2*) (&d_hash[offset]); + uint2 data; + data = psrc[4]; pdst[0] ^= data; + data = psrc[5]; pdst[1] ^= data; + data = psrc[6]; pdst[2] ^= data; + data = psrc[7]; pdst[3] ^= data; + } +} + +__host__ +uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // extract algo permutation hashes to a second branch buffer + phi_filter_gpu <<<grid, block>>> (threads, inpHashes, d_br2, d_nonces); + return threads; +} + +__host__ +void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // copy the second branch hashes back to the common buffer d_hash + phi_merge_gpu <<<grid, block>>> (threads, outpHashes, d_br2, d_nonces); +} + +__host__ +void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + phi_final_compress_gpu <<<grid, block>>> (threads, d_hashes); +} diff --git a/x11/phi.cu b/phi/phi.cu similarity index 97% rename from x11/phi.cu rename to phi/phi.cu index ab1f308..ba2a967 100644 --- a/x11/phi.cu +++ b/phi/phi.cu @@ -19,7 +19,7 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" -#include "cuda_x11.h" +#include "x11/cuda_x11.h" extern void skein512_cpu_setBlock_80(void *pdata); extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap); @@ -38,7 +38,7 @@ extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_resNonce[MAX_GPUS]; -extern "C" void phihash(void *output, 
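/*
 * Recap of the cuda_phi2.cu helpers above, which implement the phi2 branch
 * split: phi_filter_cuda() records (hash byte 0 & 1) per nonce and, when a
 * second buffer is passed (pre-SM50 compat path), snapshots the even-branch
 * hashes into it; the filtered gost/echo kernels then only touch their own
 * lanes; phi_merge_cuda() copies the echo results back into the main buffer.
 * Hedged usage sketch, mirroring the scanhash_phi2() loop later in this patch:
 *
 *   phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], NULL, d_nonce_br[thr_id]);
 *   phi_streebog_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); // odd hashes
 *   phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); // even, run twice
 */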
const void *input) +extern "C" void phi_hash(void *output, const void *input) { unsigned char _ALIGN(128) hash[128] = { 0 }; @@ -162,7 +162,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u uint32_t _ALIGN(64) vhash[8]; if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce; be32enc(&endiandata[19], work->nonces[0]); - phihash(vhash, endiandata); + phi_hash(vhash, endiandata); if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { work->valid_nonces = 1; @@ -173,7 +173,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u if (work->nonces[1] != UINT32_MAX) { work->nonces[1] += startNonce; be32enc(&endiandata[19], work->nonces[1]); - phihash(vhash, endiandata); + phi_hash(vhash, endiandata); bn_set_target_ratio(work, vhash, 1); work->valid_nonces++; pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; diff --git a/phi/phi2.cu b/phi/phi2.cu new file mode 100644 index 0000000..537217f --- /dev/null +++ b/phi/phi2.cu @@ -0,0 +1,255 @@ +// +// PHI2 algo +// CubeHash + Lyra2 x2 + JH + Gost or Echo + Skein +// +// Implemented by tpruvot in May 2018 +// + +extern "C" { +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_streebog.h" +#include "sph/sph_echo.h" +#include "lyra2/Lyra2.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +#include +#include + +extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); +extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); +extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti); + +extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter); +extern void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter); + +extern uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces); +extern void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces); +extern void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes); + +static uint64_t* d_matrix[MAX_GPUS]; +static uint32_t* d_hash_512[MAX_GPUS]; +static uint64_t* d_hash_256[MAX_GPUS]; +static uint32_t* d_hash_br2[MAX_GPUS]; +static uint32_t* d_nonce_br[MAX_GPUS]; + +extern "C" void phi2_hash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[128] = { 0 }; + unsigned char _ALIGN(128) hashA[64] = { 0 }; + unsigned char _ALIGN(128) hashB[64] = { 0 }; + + sph_cubehash512_context ctx_cubehash; + sph_jh512_context ctx_jh; + sph_gost512_context ctx_gost; + sph_echo512_context ctx_echo; + sph_skein512_context ctx_skein; + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, input, 80); + sph_cubehash512_close(&ctx_cubehash, (void*)hashB); + + LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8); + LYRA2(&hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hashA, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + if (hash[0] & 1) { + sph_gost512_init(&ctx_gost); + 
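/* phi2 branch point: the parity of JH's first output byte selects either this
   single streebog (GOST R 34.11-2012) pass or the two chained echo512 passes
   in the else branch below -- the CPU reference for the filtered GPU kernels
   added by this patch. */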
sph_gost512(&ctx_gost, (const void*)hash, 64); + sph_gost512_close(&ctx_gost, (void*)hash); + } else { + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + } + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + for (int i=0; i<32; i++) + hash[i] ^= hash[i+32]; + + memcpy(output, hash, 32); +} + +//#define _DEBUG +#define _DEBUG_PREFIX "phi-" +#include "cuda_debug.cuh" + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; +static __thread bool gtx750ti = false; + +extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16; + if (device_sm[dev_id] == 500) intensity = 15; + if (device_sm[dev_id] == 600) intensity = 17; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + if (init[thr_id]) throughput = max(throughput & 0xffffff80, 128); // for shared mem + + if (opt_benchmark) + ptarget[7] = 0xff; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + gtx750ti = (strstr(device_name[dev_id], "GTX 750 Ti") != NULL); + + size_t matrix_sz = device_sm[dev_id] > 500 ? 
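/* hedged sizing note: the second operand of this ternary,
   8 * 8 * 3 * 4 uint64_t (~6 KB per thread), is the full Lyra2
   nRows=8 x nCols=8 matrix of 12-word blocks used by the generic kernels,
   while on SM > 5.0 the matrix appears to stay on-chip and only a 16-word
   (128-byte) per-thread state spill is kept in global memory: */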
sizeof(uint64_t) * 16 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_CALL_OR_RET_X(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_256[thr_id], (size_t)32 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_512[thr_id], (size_t)64 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_nonce_br[thr_id], sizeof(uint32_t) * throughput), -1); + if (use_compat_kernels[thr_id]) { + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1); + } + + x11_cubehash512_cpu_init(thr_id, throughput); + lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + quark_jh512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + cuda_check_cpu_setTarget(ptarget); + cubehash512_setBlock_80(thr_id, endiandata); + + do { + int order = 0; + + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); order++; + TRACE("cube "); + + lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti); + order++; + TRACE("lyra "); + + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++); + TRACE("jh "); + + order++; + if (!use_compat_kernels[thr_id]) { + phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], NULL, d_nonce_br[thr_id]); + phi_streebog_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + } else { + // todo: nonces vector to reduce amount of hashes to compute + phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]); + streebog_cpu_hash_64(thr_id, throughput, d_hash_512[thr_id]); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order); + phi_merge_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]); + } + TRACE("mix "); + + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++); + TRACE("skein "); + + phi_final_compress_cuda(thr_id, throughput, d_hash_512[thr_id]); + TRACE("xor "); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash_512[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + phi2_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + *hashes_done = pdata[19] - first_nonce + throughput; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash_512[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + phi2_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + if (pdata[19] > max_nonce) pdata[19] = max_nonce; + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + 
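/* Every candidate nonce returned by cuda_check_hash() is re-hashed with the
   CPU phi2_hash() above; reaching this branch means the GPU and CPU results
   disagree (corrupted kernel output, unstable overclock...), so the share is
   counted as a hardware error below instead of being submitted. */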
gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! thr=%x", work->nonces[0], throughput); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_phi2(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + cudaFree(d_matrix[thr_id]); + cudaFree(d_hash_512[thr_id]); + cudaFree(d_hash_256[thr_id]); + cudaFree(d_nonce_br[thr_id]); + if (use_compat_kernels[thr_id]) cudaFree(d_hash_br2[thr_id]); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/util.cpp b/util.cpp index 70dc626..ee1c1ee 100644 --- a/util.cpp +++ b/util.cpp @@ -2250,7 +2250,7 @@ void print_hash_tests(void) pentablakehash(&hash[0], &buf[0]); printpfx("pentablake", hash); - phihash(&hash[0], &buf[0]); + phi2_hash(&hash[0], &buf[0]); printpfx("phi", hash); polytimos_hash(&hash[0], &buf[0]); diff --git a/x11/cuda_streebog_maxwell.cu b/x11/cuda_streebog_maxwell.cu index 6a06332..4ff580b 100644 --- a/x11/cuda_streebog_maxwell.cu +++ b/x11/cuda_streebog_maxwell.cu @@ -207,7 +207,7 @@ __launch_bounds__(TPB, 3) #else __launch_bounds__(TPB, 3) #endif -void streebog_gpu_hash_64_maxwell(uint64_t *g_hash) +void streebog_gpu_hash_64_sm5(uint64_t *g_hash, uint32_t* const d_filter, const uint32_t filter_val) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint2 buf[8], t[8], temp[8], K0[8], hash[8]; @@ -222,13 +222,16 @@ void streebog_gpu_hash_64_maxwell(uint64_t *g_hash) shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]); shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]); + //__threadfence_block(); + __syncthreads(); + + if (d_filter && d_filter[thread] != filter_val) return; + uint64_t* inout = &g_hash[thread<<3]; *(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]); *(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]); - __threadfence_block(); - K0[0] = vectorize(0x74a5d4ce2efc83b3); #pragma unroll 8 @@ -301,9 +304,17 @@ void streebog_gpu_hash_64_maxwell(uint64_t *g_hash) } __host__ -void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash) +void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *g_hash) +{ + dim3 grid((threads + TPB-1) / TPB); + dim3 block(TPB); + streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, NULL, 0); +} + +__host__ +void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter) { dim3 grid((threads + TPB-1) / TPB); dim3 block(TPB); - streebog_gpu_hash_64_maxwell <<<grid, block>>> ((uint64_t*)d_hash); + streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, d_filter, 1); } diff --git a/x16/cuda_x16_echo512_64.cu b/x16/cuda_x16_echo512_64.cu index ac18ff6..3a0f268 100644 --- a/x16/cuda_x16_echo512_64.cu +++ b/x16/cuda_x16_echo512_64.cu @@ -79,11 +79,12 @@ static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W, } __global__ __launch_bounds__(128, 5) /* will force 80 registers */ -static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) +static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t* g_hash, uint32_t* const d_filter, const uint32_t filter_val) { __shared__ uint32_t sharedMemory[4][256]; aes_gpu_init128(sharedMemory); + __syncthreads(); const uint32_t thread = (blockDim.x * blockIdx.x + 
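/* Same barrier discipline as streebog_gpu_hash_64_sm5 above: every thread of
   the block must pass the __syncthreads() that follows the shared AES table
   fill before any filtered thread may return, because exiting early around a
   barrier is undefined behaviour. The d_filter test therefore comes after
   the barrier, inside the thread-range guard below. */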
threadIdx.x); uint32_t k0; @@ -91,6 +92,9 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) uint32_t hash[16]; if (thread < threads) { + // phi2 filter (2 hash chain branches) + if (d_filter && d_filter[thread] != filter_val) return; + uint32_t *Hash = &g_hash[thread<<4]; *(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]); @@ -99,8 +103,6 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) *(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0]; *(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8]; - __syncthreads(); - const uint32_t P[48] = { 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, //8-12 @@ -217,7 +219,6 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) W[48 + i + 4] = a ^ cd ^ bcx; W[48 + i + 8] = d ^ ab ^ cdx; W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; - } for (int k = 1; k < 10; k++) @@ -237,12 +238,23 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) } __host__ -void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){ - +void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) +{ const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x16_echo512_gpu_hash_64<<<grid, block>>>(threads, d_hash); + x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, d_hash, NULL, 0); } + +__host__ +void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter) +{ + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, g_hash, d_filter, 0); +} \ No newline at end of file From 07859f93cef68072d8011f3f9a60d443ae11289e Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 10 Jun 2018 18:32:37 +0200 Subject: [PATCH 10/24] update windows version + 2.2.6 readme --- README.txt | 7 ++++--- res/ccminer.rc | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README.txt b/README.txt index af0718d..0862870 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.2.6 (Under Dev) +ccminer 2.2.6 "phi2 and allium" --------------------------------------------------------------- *************************************************************** @@ -103,7 +103,7 @@ its command line interface and options. neoscrypt use to mine FeatherCoin, Trezarcoin, Orbitcoin, etc nist5 use to mine TalkCoin penta use to mine Joincoin / Pentablake - phi use to mine LUXCoin + phi2 use to mine LUXCoin polytimos use to mine Polytimos quark use to mine Quarkcoin qubit use to mine Qubit @@ -282,7 +282,8 @@ so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< - 2018 v2.2.6 + June 10th 2018 v2.2.6 + New phi2 algo for LUX New allium algo for Garlic Apr. 
02nd 2018 v2.2.5 diff --git a/res/ccminer.rc b/res/ccminer.rc index e031f82..78be94c 100644 --- a/res/ccminer.rc +++ b/res/ccminer.rc @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" // VS_VERSION_INFO VERSIONINFO - FILEVERSION 2,2,5,0 - PRODUCTVERSION 2,2,5,0 + FILEVERSION 2,2,6,0 + PRODUCTVERSION 2,2,6,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x21L @@ -76,10 +76,10 @@ BEGIN BEGIN BLOCK "040904e4" BEGIN - VALUE "FileVersion", "2.2.5" + VALUE "FileVersion", "2.2.6" VALUE "LegalCopyright", "Copyright (C) 2018" VALUE "ProductName", "ccminer" - VALUE "ProductVersion", "2.2.5" + VALUE "ProductVersion", "2.2.6" END END BLOCK "VarFileInfo" From 9fd5b04af628dc395a964ee4bb3126fcdd5f65da Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 18 Jun 2018 08:31:55 +0200 Subject: [PATCH 11/24] phi2: handle stratum sc hashes --- Makefile.am | 2 +- ccminer.cpp | 23 ++- ccminer.vcxproj | 1 + ccminer.vcxproj.filters | 3 + configure.ac | 2 +- equi/equi-stratum.cpp | 2 +- miner.h | 2 +- phi/cuda_phi2_cubehash512.cu | 319 +++++++++++++++++++++++++++++++++++ phi/phi2.cu | 35 ++-- util.cpp | 18 +- 10 files changed, 386 insertions(+), 21 deletions(-) create mode 100644 phi/cuda_phi2_cubehash512.cu diff --git a/Makefile.am b/Makefile.am index 5d5652c..80a80c8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,7 +81,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \ x16/cuda_x16_echo512_64.cu \ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ - phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu x11/cuda_streebog_maxwell.cu \ + phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu phi/cuda_phi2_cubehash512.cu x11/cuda_streebog_maxwell.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu # scrypt diff --git a/ccminer.cpp b/ccminer.cpp index c1567a1..7f01a80 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -103,6 +103,7 @@ bool submit_old = false; bool use_syslog = false; bool use_colors = true; int use_pok = 0; +int use_roots = 0; static bool opt_background = false; bool opt_quiet = false; int opt_maxlograte = 3; @@ -698,6 +699,10 @@ static bool work_decode(const json_t *val, struct work *work) data_size = 192; adata_sz = 180/4; break; + case ALGO_PHI2: + data_size = 144; + adata_sz = data_size / 4; + break; case ALGO_NEOSCRYPT: case ALGO_ZR5: data_size = 80; @@ -743,6 +748,12 @@ static bool work_decode(const json_t *val, struct work *work) for (i = 0; i < atarget_sz; i++) work->target[i] = le32dec(work->target + i); + if (opt_algo == ALGO_PHI2) { + for (i = 20; i < 36; i++) if (work->data[i]) { + use_roots = 1; break; + } + } + if ((opt_showdiff || opt_max_diff > 0.) 
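/* phi2 work layout recap: data[0..19] is the usual 80-byte header (nonce at
   data[19]) and data[20..35] carries the optional 64-byte LUX smart-contract
   roots (state root + UTXO root), hence data_size = 144 above; use_roots is
   raised on any non-zero word of that tail so submission keeps the long
   header too. */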
&& !allow_mininginfo) calc_network_diff(work); @@ -1066,6 +1077,9 @@ static bool submit_upstream_work(CURL *curl, struct work *work) else if (opt_algo == ALGO_DECRED) { data_size = 192; adata_sz = 180/4; } + else if (opt_algo == ALGO_PHI2 && use_roots) { + data_size = 144; adata_sz = 36; + } else if (opt_algo == ALGO_SIA) { return sia_submit(curl, pool, work); } @@ -1629,10 +1643,17 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) for (i = 0; i < 8; i++) work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); for (i = 0; i < 8; i++) - work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i]; + work->data[17 + i] = ((uint32_t*)sctx->job.extra)[i]; work->data[25] = le32dec(sctx->job.ntime); work->data[26] = le32dec(sctx->job.nbits); work->data[28] = 0x80000000; + } else if (opt_algo == ALGO_PHI2) { + for (i = 0; i < 8; i++) + work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); + work->data[17] = le32dec(sctx->job.ntime); + work->data[18] = le32dec(sctx->job.nbits); + for (i = 0; i < 16; i++) + work->data[20 + i] = be32dec((uint32_t*)sctx->job.extra + i); } else if (opt_algo == ALGO_SIA) { uint32_t extra = 0; memcpy(&extra, &sctx->job.coinbase[32], 2); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index f20449a..c0aa954 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -541,6 +541,7 @@ + compute_50,sm_50;compute_52,sm_52 diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 96220ae..667331a 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -796,6 +796,9 @@ Source Files\CUDA\phi + + Source Files\CUDA\phi + Source Files\CUDA\skunk diff --git a/configure.ac b/configure.ac index e164456..5489e9c 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2.6], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.2.7], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/equi/equi-stratum.cpp b/equi/equi-stratum.cpp index 403c185..26433cc 100644 --- a/equi/equi-stratum.cpp +++ b/equi/equi-stratum.cpp @@ -101,7 +101,7 @@ bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params) target_be[31-i] = target_bin[i]; if (target_bin[i]) filled++; } - memcpy(sctx->job.claim, target_be, 32); // hack, unused struct field + memcpy(sctx->job.extra, target_be, 32); pthread_mutex_lock(&stratum_work_lock); sctx->next_diff = target_to_diff_equi((uint32_t*) &target_be); diff --git a/miner.h b/miner.h index d3118dc..2853906 100644 --- a/miner.h +++ b/miner.h @@ -669,7 +669,7 @@ struct stratum_job { unsigned char version[4]; unsigned char nbits[4]; unsigned char ntime[4]; - unsigned char claim[32]; // lbry + unsigned char extra[64]; // like lbry claimtrie bool clean; unsigned char nreward[2]; uint32_t height; diff --git a/phi/cuda_phi2_cubehash512.cu b/phi/cuda_phi2_cubehash512.cu new file mode 100644 index 0000000..e0e7fd7 --- /dev/null +++ b/phi/cuda_phi2_cubehash512.cu @@ -0,0 +1,319 @@ +/* phi2 cubehash-512 144-bytes input (80 + 64) */ + +#include +#include + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ + +#if __CUDA_ARCH__ < 350 +#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) +#else +#define LROT(x, bits) __funnelshift_l(x, x, bits) +#endif + +#define ROTATEUPWARDS7(a) LROT(a,7) +#define ROTATEUPWARDS11(a) LROT(a,11) + +#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } + +#ifdef NO_MIDSTATE + +__device__ __constant__ +static const uint32_t 
c_IV_512[32] = { + 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E, + 0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695, + 0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537, + 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE, + 0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532, + 0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, + 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576, + 0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44 +}; + +#endif + +__device__ __forceinline__ +static void rrounds(uint32_t x[2][2][2][2][2]) +{ + int r; + int j; + int k; + int l; + int m; + +//#pragma unroll 16 + for (r = 0;r < CUBEHASH_ROUNDS;++r) { + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + SWAP(x[0][0][k][l][m],x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (m = 0;m < 2;++m) + SWAP(x[1][j][k][0][m],x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + SWAP(x[0][j][0][l][m],x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) + SWAP(x[1][j][k][l][0],x[1][j][k][l][1]) + + } +} + +__device__ __forceinline__ +static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2]) +{ + // read 32 bytes input from global mem with uint2 chunks + AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]); + AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]); + AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]); + AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]); +} + +__device__ __forceinline__ +static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2]) +{ + // used to write final hash to global mem + AS_UINT2(&hash[ 0]) = 
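/* CubeHash-512 keeps a 1024-bit state in x[2][2][2][2][2]; the digest is
   simply its first 512 bits, i.e. the sixteen x[0][...] words streamed out
   here two at a time: */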
AS_UINT2(x[0][0][0][0]); + AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]); + AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]); + AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]); + AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]); + AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]); + AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]); + AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]); +} + +#define Init(x) \ + AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \ + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \ + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \ + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \ + AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \ + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \ + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \ + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \ + AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \ + AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \ + AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \ + AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \ + AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \ + AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \ + AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \ + AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]); + +__device__ __forceinline__ +static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data) +{ + /* "xor the block into the first b bytes of the state" */ + block_tox(data, x); + /* "and then transform the state invertibly through r identical rounds" */ + rrounds(x); +} + +__device__ __forceinline__ +static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) +{ + /* "the integer 1 is xored into the last state word x_11111" */ + x[1][1][1][1][1] ^= 1; + + /* "the state is then transformed invertibly through 10r identical rounds" */ + #pragma unroll 10 + for (int i = 0; i < 10; i++) rrounds(x); + + /* "output the first h/8 bytes of the state" */ + hash_fromx(hashval, x); +} + +__host__ void phi2_cubehash512_cpu_init(int thr_id, uint32_t threads) { } + +/***************************************************/ + +/** + * Timetravel and x16 CUBEHASH-80 CUDA implementation + * by tpruvot@github - Jan 2017 / May 2018 + */ + +__constant__ static uint32_t c_midstate128[32]; +__constant__ static uint32_t c_PaddedMessage_144[36]; + +#undef SPH_C32 +#undef SPH_C64 +#undef SPH_T32 +#undef SPH_T64 +#include "sph/sph_cubehash.h" + +__host__ +void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata) +{ + sph_cubehash512_context ctx_cubehash; + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64); +#ifndef NO_MIDSTATE + cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice); +#endif + cudaMemcpyToSymbol(c_PaddedMessage_144, endiandata, sizeof(c_PaddedMessage_144), 0, cudaMemcpyHostToDevice); +} + +__global__ +void cubehash512_gpu_hash_144(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNounce + thread; + uint32_t message[8]; + uint32_t x[2][2][2][2][2]; +#ifdef NO_MIDSTATE + Init(x); + + // first 32 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[0]); + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[4]); + Update32(x, message); + + // second 32 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[8]); + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[12]); + Update32(x, message); +#else + 
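/* midstate path: the first 64 bytes of the 144-byte phi2 header do not
   depend on the nonce, so cubehash512_setBlock_144() hashed those two
   32-byte blocks once on the CPU (sph_cubehash512) and uploaded the
   resulting 32-word state; each thread resumes from it, skipping Init()
   and two Update32() calls: */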
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]); + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]); + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]); + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]); + AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_midstate128[ 8]); + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]); + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]); + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]); + + AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_midstate128[16]); + AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]); + AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]); + AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]); + AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]); + AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]); + AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]); + AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]); +#endif + // nonce + state root + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[16]); + message[3] = cuda_swab32(nonce); + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[20]); // state + Update32(x, message); + + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[24]); // state + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[28]); // utxo + Update32(x, message); + + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[32]); // utxo + message[4] = 0x80; + message[5] = 0; + message[6] = 0; + message[7] = 0; + Update32(x, message); + + uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]); + Final(x, output); + } +} + +__host__ +void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + cubehash512_gpu_hash_144 <<<grid, block>>> (threads, startNounce, (uint64_t*) d_hash); +} + diff --git a/phi/phi2.cu b/phi/phi2.cu index 537217f..fbdb9c4 100644 --- a/phi/phi2.cu +++ b/phi/phi2.cu @@ -1,5 +1,5 @@ // -// PHI2 algo +// PHI2 algo (with smart contracts header) // CubeHash + Lyra2 x2 + JH + Gost or Echo + Skein // // Implemented by tpruvot in May 2018 @@ -24,6 +24,9 @@ extern "C" { extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); +extern void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata); +extern void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti); @@ -41,11 +44,13 @@ static uint64_t* d_hash_256[MAX_GPUS]; static uint32_t* d_hash_br2[MAX_GPUS]; static uint32_t* d_nonce_br[MAX_GPUS]; +static bool has_roots; + extern "C" void phi2_hash(void *output, const void *input) { - unsigned char _ALIGN(128) hash[128] = { 0 }; - unsigned char _ALIGN(128) hashA[64] = { 0 }; - unsigned char _ALIGN(128) hashB[64] = { 0 }; + unsigned char _ALIGN(128) hash[64]; + unsigned char _ALIGN(128) hashA[64]; + unsigned char _ALIGN(128) hashB[64]; sph_cubehash512_context ctx_cubehash; sph_jh512_context ctx_jh; @@ -54,7 +59,7 @@ extern "C" void phi2_hash(void *output, const void *input) sph_skein512_context ctx_skein; sph_cubehash512_init(&ctx_cubehash); - 
sph_cubehash512(&ctx_cubehash, input, 80); + sph_cubehash512(&ctx_cubehash, input, has_roots ? 144 : 80); sph_cubehash512_close(&ctx_cubehash, (void*)hashB); LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8); @@ -137,7 +142,6 @@ extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1); } - x11_cubehash512_cpu_init(thr_id, throughput); lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); quark_jh512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); @@ -147,17 +151,26 @@ extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, init[thr_id] = true; } - uint32_t endiandata[20]; - for (int k = 0; k < 20; k++) + has_roots = false; + uint32_t endiandata[36]; + for (int k = 0; k < 36; k++) { be32enc(&endiandata[k], pdata[k]); + if (k >= 20 && pdata[k]) has_roots = true; + } cuda_check_cpu_setTarget(ptarget); - cubehash512_setBlock_80(thr_id, endiandata); + if (has_roots) + cubehash512_setBlock_144(thr_id, endiandata); + else + cubehash512_setBlock_80(thr_id, endiandata); do { int order = 0; - - cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); order++; + if (has_roots) + cubehash512_cuda_hash_144(thr_id, throughput, pdata[19], d_hash_512[thr_id]); + else + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); + order++; TRACE("cube "); lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti); diff --git a/util.cpp b/util.cpp index ee1c1ee..49cd854 100644 --- a/util.cpp +++ b/util.cpp @@ -1442,7 +1442,7 @@ static uint32_t getblocheight(struct stratum_ctx *sctx) static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) { const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime; - const char *claim = NULL, *nreward = NULL; + const char *extradata = NULL, *nreward = NULL; size_t coinb1_size, coinb2_size; bool clean, ret = false; int merkle_count, i, p=0; @@ -1452,7 +1452,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) int ntime; char algo[64] = { 0 }; get_currentalgo(algo, sizeof(algo)); - bool has_claim = !strcasecmp(algo, "lbry"); + bool has_claim = !strcmp(algo, "lbry"); + bool has_roots = !strcmp(algo, "phi2") && json_array_size(params) == 10; if (sctx->is_equihash) { return equi_stratum_notify(sctx, params); @@ -1461,11 +1462,17 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) job_id = json_string_value(json_array_get(params, p++)); prevhash = json_string_value(json_array_get(params, p++)); if (has_claim) { - claim = json_string_value(json_array_get(params, p++)); - if (!claim || strlen(claim) != 64) { + extradata = json_string_value(json_array_get(params, p++)); + if (!extradata || strlen(extradata) != 64) { applog(LOG_ERR, "Stratum notify: invalid claim parameter"); goto out; } + } else if (has_roots) { + extradata = json_string_value(json_array_get(params, p++)); + if (!extradata || strlen(extradata) != 128) { + applog(LOG_ERR, "Stratum notify: invalid UTXO root parameter"); + goto out; + } } coinb1 = json_string_value(json_array_get(params, p++)); coinb2 = json_string_value(json_array_get(params, p++)); @@ -1529,7 +1536,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) free(sctx->job.job_id); sctx->job.job_id = strdup(job_id); hex2bin(sctx->job.prevhash, prevhash, 32); - if (has_claim) hex2bin(sctx->job.claim, claim, 32); + if (has_claim) 
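/* sctx->job.extra is shared by both features: lbry claims are 32 bytes (the
   64 hex chars checked above) while phi2 state/UTXO roots are 64 bytes (128
   hex chars), which is why the old claim[32] field was widened to extra[64]
   in miner.h. */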
hex2bin(sctx->job.extra, extradata, 32); + if (has_roots) hex2bin(sctx->job.extra, extradata, 64); sctx->job.height = getblocheight(sctx); From 968d2ba0499ac49cd67518a4d8e3d5e9d017c6bb Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 21 Jun 2018 10:11:22 +0200 Subject: [PATCH 12/24] phi2: fix the double endian swap on roots --- ccminer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ccminer.cpp b/ccminer.cpp index 7f01a80..00fe1cd 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1653,7 +1653,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) work->data[17] = le32dec(sctx->job.ntime); work->data[18] = le32dec(sctx->job.nbits); for (i = 0; i < 16; i++) - work->data[20 + i] = be32dec((uint32_t*)sctx->job.extra + i); + work->data[20 + i] = ((uint32_t*)sctx->job.extra)[i]; } else if (opt_algo == ALGO_SIA) { uint32_t extra = 0; memcpy(&extra, &sctx->job.coinbase[32], 2); From 77c4b8724ea0d0d53c00f27beba66c603d9ff4d4 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 23 Jun 2018 13:29:22 +0200 Subject: [PATCH 13/24] handle new cryptonight variants, stellite, aeon special thanks for klausT changes and ystarnaud who helped me to adapt my kernel variants... Signed-off-by: Tanguy Pruvot --- algos.h | 31 +++++ ccminer.cpp | 41 ++++++- crypto/cn_aes.cuh | 1 + crypto/cn_blake.cuh | 2 +- crypto/cn_groestl.cuh | 9 +- crypto/cn_jh.cuh | 20 +-- crypto/cn_keccak.cuh | 2 +- crypto/cn_skein.cuh | 22 ++-- crypto/cryptolight-core.cu | 86 +++++++++++-- crypto/cryptolight-cpu.cpp | 39 ++++-- crypto/cryptolight.cu | 30 +++-- crypto/cryptolight.h | 13 +- crypto/cryptonight-core.cu | 236 ++++++++++++++++++++++++++++-------- crypto/cryptonight-cpu.cpp | 62 ++++++++-- crypto/cryptonight-extra.cu | 175 +++++++++++--------------- crypto/cryptonight.cu | 36 +++--- crypto/cryptonight.h | 13 +- crypto/xmr-rpc.cpp | 10 +- miner.h | 14 ++- util.cpp | 10 +- 20 files changed, 593 insertions(+), 259 deletions(-) diff --git a/algos.h b/algos.h index 229d8e9..c484bcc 100644 --- a/algos.h +++ b/algos.h @@ -72,6 +72,9 @@ enum sha_algos { ALGO_WHIRLPOOLX, ALGO_WILDKECCAK, ALGO_ZR5, + ALGO_MONERO, + ALGO_GRAFT, + ALGO_STELLITE, ALGO_AUTO, ALGO_COUNT }; @@ -146,6 +149,9 @@ static const char *algo_names[] = { "whirlpoolx", "wildkeccak", "zr5", + "monero", + "graft", + "stellite", "auto", /* reserved for multi algo */ "" }; @@ -206,4 +212,29 @@ static inline int algo_to_int(char* arg) return i; } +static inline int get_cryptonight_algo(int fork) +{ + int algo = ALGO_COUNT; + + switch (fork) { + case 8: + algo = ALGO_GRAFT; + break; + + case 7: + algo = ALGO_MONERO; + break; + + case 3: + algo = ALGO_STELLITE; + break; + + default: + algo = ALGO_CRYPTONIGHT; + break; + } + + return algo; +} + #endif diff --git a/ccminer.cpp b/ccminer.cpp index 00fe1cd..6521284 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -233,6 +233,8 @@ int opt_api_mcast_port = 4068; bool opt_stratum_stats = false; +int cryptonight_fork = 1; + static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ @@ -245,7 +247,7 @@ Options:\n\ blakecoin Fast Blake 256 (8 rounds)\n\ bmw BMW 256\n\ cryptolight AEON cryptonight (MEM/2)\n\ - cryptonight XMR cryptonight\n\ + cryptonight XMR cryptonight v1 (old)\n\ c11/flax X11 variant\n\ decred Decred Blake256\n\ deep Deepcoin\n\ @@ -253,6 +255,7 @@ Options:\n\ dmd-gr Diamond-Groestl\n\ fresh Freshcoin (shavite 80)\n\ fugue256 Fuguecoin\n\ + graft Cryptonight v8\n\ groestl Groestlcoin\n" #ifdef WITH_HEAVY_ALGO " heavy Heavycoin\n" @@ -267,6 +270,7 @@ 
Options:\n\ lyra2v2 VertCoin\n\ lyra2z ZeroCoin (3rd impl)\n\ myr-gr Myriad-Groestl\n\ + monero XMR cryptonight (v7)\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ @@ -284,6 +288,7 @@ Options:\n\ skein Skein SHA2 (Skeincoin)\n\ skein2 Double Skein (Woodcoin)\n\ skunk Skein Cube Fugue Streebog\n\ + stellite Cryptonight v3\n\ s3 S3 (1Coin)\n\ timetravel Machinecoin permuted x8\n\ tribus Denarius\n\ @@ -573,7 +578,10 @@ static bool get_blocktemplate(CURL *curl, struct work *work); void get_currentalgo(char* buf, int sz) { - snprintf(buf, sz, "%s", algo_names[opt_algo]); + int algo = opt_algo; + if (algo == ALGO_CRYPTONIGHT) + algo = get_cryptonight_algo(cryptonight_fork); + snprintf(buf, sz, "%s", algo_names[algo]); } void format_hashrate(double hashrate, char *output) @@ -2372,11 +2380,16 @@ static void *miner_thread(void *userdata) rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_CRYPTOLIGHT: - rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done); + rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1); break; case ALGO_CRYPTONIGHT: - rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done); + { + int cn_variant = 0; + if (cryptonight_fork > 1 && ((unsigned char*)work.data)[0] >= cryptonight_fork) + cn_variant = ((unsigned char*)work.data)[0] - cryptonight_fork + 1; + rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, cn_variant); break; + } case ALGO_DECRED: rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done); break; @@ -3138,6 +3151,26 @@ void parse_arg(int key, char *arg) case ALGO_SCRYPT_JANE: opt_nfactor = 14; break; } } + + // cryptonight variants + switch (opt_algo) { + case ALGO_MONERO: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 7; + break; + case ALGO_GRAFT: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 8; + break; + case ALGO_STELLITE: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 3; + break; + case ALGO_CRYPTONIGHT: + cryptonight_fork = 1; + break; + } + break; case 'b': p = strstr(arg, ":"); diff --git a/crypto/cn_aes.cuh b/crypto/cn_aes.cuh index df419b3..99ad212 100644 --- a/crypto/cn_aes.cuh +++ b/crypto/cn_aes.cuh @@ -138,6 +138,7 @@ static const __device__ __align__(16) uint32_t d_t_fn[1024] = { */ #define AS_U32(addr) *((uint32_t*)(addr)) +#define AS_U64(addr) *((uint64_t*)(addr)) #define AS_UINT2(addr) *((uint2*)(addr)) #define AS_UINT4(addr) *((uint4*)(addr)) #define AS_UL2(addr) *((ulonglong2*)(addr)) diff --git a/crypto/cn_blake.cuh b/crypto/cn_blake.cuh index 5c0d09f..bd2ba43 100644 --- a/crypto/cn_blake.cuh +++ b/crypto/cn_blake.cuh @@ -164,7 +164,7 @@ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest) } __device__ -void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out) +void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint32_t * out) { blake_state bs; blake_state *S = (blake_state *)&bs; diff --git a/crypto/cn_groestl.cuh b/crypto/cn_groestl.cuh index 62530d4..425e062 100644 --- a/crypto/cn_groestl.cuh +++ b/crypto/cn_groestl.cuh @@ -274,13 +274,14 @@ void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restri for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) { output[j] = s[i]; } - +#if 0 for (i = 0; i < GROESTL_COLS512; i++) { ctx->chaining[i] = 0; } for (i = 0; i < GROESTL_SIZE512; i++) { ctx->buffer[i] = 0; } +#endif } __device__ @@ -336,12 +337,12 @@ void 
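/* Tying the ccminer.cpp hunks above together: the first blob byte is the
   cryptonote major block version, and --algo picks the fork base through
   cryptonight_fork (monero = 7, graft = 8, stellite = 3, plain = 1); once a
   blob reaches that version the kernel variant becomes version - fork + 1,
   so a version-8 graft blob runs the variant-1 kernels while older blobs
   keep the legacy variant-0 path. */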
cn_groestl_init(groestlHashState* ctx) } __device__ -void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +void cn_groestl(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval) { DataLength databitlen = len << 3; groestlHashState context; cn_groestl_init(&context); - cn_groestl_update(&context, data, databitlen); - cn_groestl_final(&context, hashval); + cn_groestl_update(&context, (BitSequence*) data, databitlen); + cn_groestl_final(&context, (BitSequence*) hashval); } diff --git a/crypto/cn_jh.cuh b/crypto/cn_jh.cuh index c2df763..b05380d 100644 --- a/crypto/cn_jh.cuh +++ b/crypto/cn_jh.cuh @@ -198,8 +198,9 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__ databitlen = 0; } - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) { - memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ; + if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) + { + memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ); index = 64-(state->datasize_in_buffer >> 3); databitlen = databitlen - (512 - state->datasize_in_buffer); cn_jh_F8(state); @@ -222,7 +223,7 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__ /* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */ __device__ -void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval) +void cn_jh_final(jhHashState * __restrict__ state, uint8_t * __restrict__ hashval) { unsigned int i; //uint32_t *bufptr = (uint32_t *)state->buffer; @@ -244,7 +245,7 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv } else { - /*set the rest of the bytes in the buffer to 0*/ + /* set the rest of the bytes in the buffer to 0 */ if ( (state->datasize_in_buffer & 7) == 0) { for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0; } else { @@ -268,7 +269,8 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv cn_jh_F8(state); } - MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8); + memcpy(hashval, ((unsigned char*)state->x) + 64 + 32, 32); + //MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8); } __device__ @@ -277,12 +279,12 @@ void cn_jh_init(jhHashState *state, int hashbitlen) state->databitlen = 0; state->datasize_in_buffer = 0; state->hashbitlen = hashbitlen; - //memcpy(state->x, d_JH256_H0, 128); - MEMCPY8(state->x, d_JH256_H0, 128 / 8); + memcpy(state->x, d_JH256_H0, 128); + //MEMCPY8(state->x, d_JH256_H0, 128 / 8); } __device__ -void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval) +void cn_jh(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval) { const int hashbitlen = 256; DataLength databitlen = len << 3; @@ -290,5 +292,5 @@ void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re cn_jh_init(&state, hashbitlen); cn_jh_update(&state, data, databitlen); - cn_jh_final(&state, hashval); + cn_jh_final(&state, (uint8_t*) hashval); } diff --git a/crypto/cn_keccak.cuh b/crypto/cn_keccak.cuh index 3acef7a..c6f5908 100644 --- a/crypto/cn_keccak.cuh +++ b/crypto/cn_keccak.cuh @@ -195,7 +195,7 @@ void cn_keccakf(uint64_t *s) } __device__ __forceinline__ -void cn_keccak(const uint8_t * __restrict__ in, uint8_t * 
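/* The cn_* prototype cleanups in this patch converge blake/groestl/jh/skein
   on one (input bytes, length, 32-bit word output) shape, presumably to keep
   the cryptonight finalizer's choice of extra hash simple. Hypothetical
   dispatch sketch (illustrative only, not code from this patch):

     switch (((uint8_t*)state)[0] & 3) {
       case 0: cn_blake((uint8_t*)state, 200, hash); break;   // BLAKE-256
       case 1: cn_groestl((uint8_t*)state, 200, hash); break; // Groestl-256
       case 2: cn_jh((uint8_t*)state, 200, hash); break;      // JH-256
       case 3: cn_skein((uint8_t*)state, 200, hash); break;   // Skein-256
     }
*/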
__restrict__ md) +void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md) { uint64_t st[25]; diff --git a/crypto/cn_skein.cuh b/crypto/cn_skein.cuh index 2096467..0e68143 100644 --- a/crypto/cn_skein.cuh +++ b/crypto/cn_skein.cuh @@ -4,19 +4,15 @@ typedef unsigned int uint_t; /* native unsigned integer */ #define SKEIN_256_STATE_WORDS ( 4) #define SKEIN_512_STATE_WORDS ( 8) -#define SKEIN1024_STATE_WORDS (16) #define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) #define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) #define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) #define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS) #define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) #define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) #define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) #define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) @@ -119,7 +115,7 @@ typedef struct { } skeinHashState; __device__ -void cn_skein256_init(skeinHashState *state, size_t hashBitLen) +void cn_skein_init(skeinHashState *state, size_t hashBitLen) { const uint64_t SKEIN_512_IV_256[] = { @@ -258,14 +254,12 @@ void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restr } __device__ -void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen) +void cn_skein_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen) { if ((databitlen & 7) == 0) { - cn_skein_block(&state->u.ctx_512, data, databitlen >> 3); } else { - size_t bCnt = (databitlen >> 3) + 1; uint8_t b,mask; @@ -280,7 +274,7 @@ void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __r } __device__ -void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal) +void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal) { uint64_t X[SKEIN_512_STATE_WORDS]; Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512; @@ -305,13 +299,13 @@ void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restric ((uint64_t *)ctx->b)[0] = (uint64_t)i; Skein_Start_New_Type(ctx, OUT_FINAL); cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t)); - memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES/sizeof(uint32_t)), ctx->X, n); + memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES), ctx->X, n); memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time } } __device__ -void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval) +void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval) { int hashbitlen = 256; DataLength databitlen = len << 3; @@ -319,7 +313,7 @@ void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re state.statebits = 64*SKEIN_512_STATE_WORDS; - cn_skein256_init(&state, hashbitlen); - cn_skein256_update(&state, data, databitlen); - cn_skein256_final(&state, hashval); + cn_skein_init(&state, hashbitlen); + cn_skein_update(&state, data, databitlen); + cn_skein_final(&state, (uint8_t*) hashval); } diff --git a/crypto/cryptolight-core.cu b/crypto/cryptolight-core.cu index 3891768..8f0bb75 100644 --- a/crypto/cryptolight-core.cu +++ b/crypto/cryptolight-core.cu @@ -36,7 +36,7 @@ void cryptolight_core_gpu_phase1(int threads, 
uint32_t * long_state, uint32_t * if(thread < threads) { - const int oft = thread * 52 + sub + 16; // not aligned 16! + const int oft = thread * 50 + sub + 16; // not aligned 16! const int long_oft = (thread << LONG_SHL_IDX) + sub; uint32_t __align__(16) key[40]; uint32_t __align__(16) text[4]; @@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t * } } +// -------------------------------------------------------------------------------------------------------------- + __global__ -void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) +void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) { __shared__ uint32_t __align__(16) sharedMemory[1024]; @@ -209,6 +211,70 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int #endif // __CUDA_ARCH__ >= 300 } +__device__ __forceinline__ void store_variant1(uint32_t* long_state) +{ + uint4* Z = (uint4*) long_state; + const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773); + const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1; + Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24); +} + +#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \ + uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \ + hi += ((uint64_t *)c)[0]; \ + ((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \ + ((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \ + ((uint64_t *)dst)[0] = hi; \ + ((uint64_t *)dst)[1] = lo ^ tweak; } + +__global__ +void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint64_t * __restrict__ d_tweak) +{ + __shared__ __align__(16) uint32_t sharedMemory[1024]; + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + const uint32_t batchsize = ITER >> (2 + bfactor); + const uint32_t start = partidx * batchsize; + const uint32_t end = start + batchsize; + const uint32_t longptr = thread << LONG_SHL_IDX; + uint32_t * long_state = &d_long_state[longptr]; + uint64_t tweak = d_tweak[thread]; + + void * ctx_a = (void*)(&d_ctx_a[thread << 2]); + void * ctx_b = (void*)(&d_ctx_b[thread << 2]); + uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4 + uint4 B = AS_UINT4(ctx_b); + uint32_t* a = (uint32_t*)&A; + uint32_t* b = (uint32_t*)&B; + + for (int i = start; i < end; i++) + { + uint32_t c[4]; + uint32_t j = (A.x >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], c, a); + XOR_BLOCKS_DST(c, b, &long_state[j]); + store_variant1(&long_state[j]); + MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak); + + j = (A.x >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], b, a); + XOR_BLOCKS_DST(b, c, &long_state[j]); + store_variant1(&long_state[j]); + MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak); + } + if (bfactor) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +} + __global__ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2) { @@ -222,7 +288,7 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3 if(thread < 
threads) { const int long_oft = (thread << LONG_SHL_IDX) + sub; - const int oft = thread * 52 + sub + 16; + const int oft = thread * 50 + sub + 16; uint32_t __align__(16) key[40]; uint32_t __align__(16) text[4]; @@ -251,8 +317,8 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3 extern int device_bfactor[MAX_GPUS]; __host__ -void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, - uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2) +void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, + uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak) { dim3 grid(blocks); dim3 block(threads); @@ -265,17 +331,21 @@ void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_ int i, partcount = 1 << bfactor; int dev_id = device_map[thr_id]; - cryptolight_core_gpu_phase1 <<<grid, block8>>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key1); + cryptolight_core_gpu_phase1 <<<grid, block8>>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); for(i = 0; i < partcount; i++) { - cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + dim3 b = device_sm[dev_id] >= 300 ? block4 : block; + if (variant == 0) + cryptolight_old_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + else + cryptolight_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); } - cryptolight_core_gpu_phase3 <<<grid, block8>>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key2); + cryptolight_core_gpu_phase3 <<<grid, block8>>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } diff --git a/crypto/cryptolight-cpu.cpp b/crypto/cryptolight-cpu.cpp index b0ee386..f995b4c 100644 --- a/crypto/cryptolight-cpu.cpp +++ b/crypto/cryptolight-cpu.cpp @@ -22,6 +22,16 @@ struct cryptonight_ctx { oaes_ctx* aes_ctx; }; + +static void cryptolight_store_variant(void* state, int variant) { + if (variant == 1) { + // use variant 1 like monero since june 2018 + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); + } +} + static void do_blake_hash(const void* input, int len, void* output) { uchar hash[32]; @@ -132,14 +142,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui ((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; } -static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) { +static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) { uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; hi += ((uint64_t*) c)[0]; ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; ((uint64_t*) dst)[0] = hi; - ((uint64_t*) dst)[1] = lo; + ((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo; }
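/*
 * Editor's note: a minimal standalone sketch (not part of the patch; names
 * are illustrative) of the two variant-1 building blocks used above. The
 * per-job tweak XORs blob bytes 35..42 with the last 64-bit word of the
 * 25-word Keccak state, and the stored-block substitution flips bits 4..5
 * of byte 11 of each 16-byte AES block via the packed nibble table 0x75310.
 */
#include <stdint.h>
#include <string.h>

static uint64_t variant1_tweak(const uint8_t *blob, const uint64_t keccak_w[25])
{
	uint64_t t;
	memcpy(&t, blob + 35, sizeof(t)); /* bytes 35..42 of the 76-byte blob */
	return t ^ keccak_w[24];          /* ctx->state.hs.w[24] in the code above */
}

static void variant1_sub_byte11(uint8_t block[16])
{
	const uint8_t tmp = block[11];
	/* even index in 0..14 selects one nibble of 0x75310 */
	const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
	block[11] = tmp ^ ((0x75310 >> index) & 0x30);
}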
static void copy_block(uint8_t* dst, const uint8_t* src) { @@ -157,13 +167,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; } -static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx) +static int cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant) { size_t i, j; + if (variant && len < 43) + return 0; + keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); + const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0; + oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { #undef RND @@ -186,14 +201,16 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len, j = e2i(ctx->a); aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); + cryptolight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]); + mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak); j = e2i(ctx->a); aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); + cryptolight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]); + mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak); } memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); @@ -219,11 +236,19 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len, if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); oaes_free((OAES_CTX **) &ctx->aes_ctx); + return 1; } -void cryptolight_hash(void* output, const void* input, int len) +int cryptolight_hash_variant(void* output, const void* input, int len, int variant) { struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); - cryptolight_hash_ctx(output, input, len, ctx); + int rc = cryptolight_hash_ctx(output, input, len, ctx, variant); free(ctx); + return rc; } + +void cryptolight_hash(void* output, const void* input) +{ + cryptolight_hash_variant(output, input, 76, 1); +} + diff --git a/crypto/cryptolight.cu b/crypto/cryptolight.cu index c8ab8ea..c2a10e4 100644 --- a/crypto/cryptolight.cu +++ b/crypto/cryptolight.cu @@ -7,16 +7,17 @@ static __thread uint32_t cn_blocks = 32; static __thread uint32_t cn_threads = 16; static uint32_t *d_long_state[MAX_GPUS]; -static uint64_t *d_ctx_state[MAX_GPUS]; +static uint32_t *d_ctx_state[MAX_GPUS]; static uint32_t *d_ctx_key1[MAX_GPUS]; static uint32_t *d_ctx_key2[MAX_GPUS]; static uint32_t *d_ctx_text[MAX_GPUS]; +static uint64_t *d_ctx_tweak[MAX_GPUS]; static uint32_t *d_ctx_a[MAX_GPUS]; static uint32_t *d_ctx_b[MAX_GPUS]; static bool init[MAX_GPUS] = { 0 }; -extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant) { int res = 0; uint32_t throughput = 0; @@ -26,6 +27,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
const uint32_t first_nonce = *nonceptr; uint32_t nonce = first_nonce; + int dev_id = device_map[thr_id]; if(opt_benchmark) { ptarget[7] = 0x00ff; @@ -33,6 +35,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ if(!init[thr_id]) { + if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) { + device_config[thr_id] = strdup("80x32"); + } + if (device_config[thr_id]) { sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads); throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); @@ -63,11 +69,11 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ } const size_t alloc = MEMORY * throughput; - cryptonight_extra_cpu_init(thr_id, throughput); + cryptonight_extra_init(thr_id); cudaMalloc(&d_long_state[thr_id], alloc); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cudaMalloc(&d_ctx_state[thr_id], 26 * sizeof(uint64_t) * throughput); + cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); @@ -79,6 +85,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput); init[thr_id] = true; } @@ -90,10 +97,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ const uint32_t Htarg = ptarget[7]; uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; - cryptonight_extra_cpu_setData(thr_id, pdata, ptarget); - cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); - cryptolight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); - cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); + cryptonight_extra_setData(thr_id, pdata, ptarget); + cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); *hashes_done = nonce - first_nonce + throughput; @@ -104,7 +111,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39); memcpy(tempdata, pdata, 76); *tempnonceptr = resNonces[0]; - cryptolight_hash(vhash, tempdata, 76); + cryptolight_hash_variant(vhash, tempdata, 76, variant); if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { res = 1; @@ -114,7 +121,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ if(resNonces[1] != UINT32_MAX) { *tempnonceptr = resNonces[1]; - cryptolight_hash(vhash, tempdata, 76); + cryptolight_hash_variant(vhash, tempdata, 76, variant); if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { res++; work->nonces[1] = resNonces[1]; @@ -157,10 +164,11 @@ void 
free_cryptolight(int thr_id) cudaFree(d_ctx_key1[thr_id]); cudaFree(d_ctx_key2[thr_id]); cudaFree(d_ctx_text[thr_id]); + cudaFree(d_ctx_tweak[thr_id]); cudaFree(d_ctx_a[thr_id]); cudaFree(d_ctx_b[thr_id]); - cryptonight_extra_cpu_free(thr_id); + cryptonight_extra_free(thr_id); cudaDeviceSynchronize(); diff --git a/crypto/cryptolight.h b/crypto/cryptolight.h index 443cf5b..482d0f8 100644 --- a/crypto/cryptolight.h +++ b/crypto/cryptolight.h @@ -134,10 +134,11 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line) exit(1); } } -void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); -void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn); -void cryptonight_extra_cpu_init(int thr_id, uint32_t threads); -void cryptonight_extra_cpu_free(int thr_id); -void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); -void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state); +void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); + +void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget); +void cryptonight_extra_init(int thr_id/*, uint32_t threads*/); +void cryptonight_extra_free(int thr_id); +void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); +void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state); diff --git a/crypto/cryptonight-core.cu b/crypto/cryptonight-core.cu index 4780f37..90f024f 100644 --- a/crypto/cryptonight-core.cu +++ b/crypto/cryptonight-core.cu @@ -2,47 +2,55 @@ #include <stdio.h> #include <stdint.h> #include <string.h> +#ifndef _WIN32 #include <unistd.h> +#endif + +#include <cuda.h> +#include <cuda_runtime.h> + +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#undef __shfl +#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width) +#endif #include "cryptonight.h" -#define LONG_SHL32 19 // 1<<19 +#define LONG_SHL32 19 // 1<<19 (uint32_t* index) #define LONG_SHL64 18 // 1<<18 (uint64_t* index) #define LONG_LOOPS32 0x80000U -#define LONG_LOOPS64 0x40000U #include "cn_aes.cuh" __global__ -//__launch_bounds__(128, 9) // 56 registers -void cryptonight_core_gpu_phase1(const uint32_t threads, uint64_t * long_state, uint64_t * const ctx_state, uint32_t * ctx_key1) +void cryptonight_gpu_phase1(const uint32_t threads, uint32_t * __restrict__ d_long_state, + uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1) { - __shared__ __align__(16) uint32_t sharedMemory[1024]; - cn_aes_gpu_init(sharedMemory); - __syncthreads(); + __shared__ uint32_t sharedMemory[1024]; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; - const uint32_t sub = (threadIdx.x & 7) << 1; // 0 2 ..
14 - if(thread < threads) { - const uint32_t long_oft = (thread << LONG_SHL64) + sub; - - const uint32_t* ctx_key = &ctx_key1[thread * 40U]; - uint4 keys[10]; - #pragma unroll 10 // load 160 bytes - for (int i = 0; i < 10; i ++) - keys[i] = AS_UINT4(&ctx_key[i*4]); + cn_aes_gpu_init(sharedMemory); + __syncthreads(); - uint4 text = AS_UINT4(&ctx_state[thread * 26U + sub + 8U]); + const uint32_t sub = (threadIdx.x & 0x7U) << 2; + uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub]; + uint32_t __align__(8) key[40]; + MEMCPY8(key, &ctx_key1[thread * 40U], 20); + uint32_t __align__(8) text[4]; + MEMCPY8(text, &ctx_state[thread * 50U + sub + 16U], 2); - for (uint32_t i = 0; i < LONG_LOOPS64; i += 16U) { - cn_aes_pseudo_round_mut_uint4(sharedMemory, text, keys); - AS_UINT4(&long_state[long_oft + i]) = text; + for(int i = 0; i < LONG_LOOPS32; i += 32) + { + cn_aes_pseudo_round_mut(sharedMemory, text, key); + MEMCPY8(&longstate[i], text, 2); } } } +// -------------------------------------------------------------------------------------------------------------- + __device__ __forceinline__ ulonglong2 cuda_mul128(const uint64_t multiplier, const uint64_t multiplicand) { ulonglong2 product; @@ -59,8 +67,7 @@ static __forceinline__ __device__ ulonglong2 operator ^ (const ulonglong2 &a, co return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); } -#undef MUL_SUM_XOR_DST -__device__ __forceinline__ void MUL_SUM_XOR_DST(const uint64_t m, uint4 &a, void* far_dst) +__device__ __forceinline__ void MUL_SUM_XOR_DST_0(const uint64_t m, uint4 &a, void* far_dst) { ulonglong2 d = AS_UL2(far_dst); ulonglong2 p = cuda_mul128(m, d.x); @@ -73,8 +80,8 @@ __global__ #if __CUDA_ARCH__ >= 500 //__launch_bounds__(128,12) /* force 40 regs to allow -l ...x32 */ #endif -void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx, - uint64_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) +void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b) { __shared__ __align__(16) uint32_t sharedMemory[1024]; cn_aes_gpu_init(sharedMemory); @@ -84,7 +91,7 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, if (thread < threads) { - const uint32_t batchsize = ITER >> (2U + bfactor); + const uint32_t batchsize = ITER >> (2 + bfactor); const uint32_t start = partidx * batchsize; const uint32_t end = start + batchsize; @@ -101,12 +108,12 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, uint32_t j = (A.x & E2I_MASK) >> 3; cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C); AS_UINT4(&long_state[j]) = C ^ B; // st.global.u32.v4 - MUL_SUM_XOR_DST((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]); + MUL_SUM_XOR_DST_0((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]); j = (A.x & E2I_MASK) >> 3; cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B); AS_UINT4(&long_state[j]) = C ^ B; - MUL_SUM_XOR_DST((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]); + MUL_SUM_XOR_DST_0((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]); } if (bfactor) { @@ -116,71 +123,194 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, } } +// -------------------------------------------------------------------------------------------------------------- + +__device__ __forceinline__ void store_variant1(uint64_t* long_state, 
uint4 Z) +{ + const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773); + const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1; + Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24); + AS_UINT4(long_state) = Z; +} + +__device__ __forceinline__ void store_variant2(uint64_t* long_state, uint4 Z) +{ + const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773); + const uint32_t index = (((tmp >> 4) & 6u) | (tmp & 1u)) << 1; + Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75312u >> index) & 0x30u)) << 24); + AS_UINT4(long_state) = Z; +} + +__device__ __forceinline__ void MUL_SUM_XOR_DST_1(const uint64_t m, uint4 &a, void* far_dst, uint64_t tweak) +{ + ulonglong2 d = AS_UL2(far_dst); + ulonglong2 p = cuda_mul128(m, d.x); + p += AS_UL2(&a); + AS_UL2(&a) = p ^ d; + p.y = p.y ^ tweak; + AS_UL2(far_dst) = p; +} + __global__ -void cryptonight_core_gpu_phase3(const uint32_t threads, const uint64_t * long_state, uint64_t * ctx_state, uint32_t * __restrict__ ctx_key2) +void monero_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint64_t * __restrict__ d_tweak) { __shared__ __align__(16) uint32_t sharedMemory[1024]; cn_aes_gpu_init(sharedMemory); __syncthreads(); - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3U; - const uint32_t sub = (threadIdx.x & 7U) << 1U; + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + const uint32_t batchsize = ITER >> (2 + bfactor); + const uint32_t start = partidx * batchsize; + const uint32_t end = start + batchsize; + uint64_t tweak = d_tweak[thread]; + + void * ctx_a = (void*)(&d_ctx_a[thread << 2]); + void * ctx_b = (void*)(&d_ctx_b[thread << 2]); + uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4 + uint4 B = AS_UINT4(ctx_b); - if(thread < threads) + uint64_t * long_state = &d_long_state[thread << LONG_SHL64]; + for (int i = start; i < end; i++) // end = 262144 + { + uint4 C; + uint32_t j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C); + store_variant1(&long_state[j], C ^ B); // st.global + MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak); + + j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B); + store_variant1(&long_state[j], C ^ B); + MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak); + } + if (bfactor) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +} + +// -------------------------------------------------------------------------------------------------------------- + +__global__ +void stellite_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint64_t * __restrict__ d_tweak) +{ + __shared__ __align__(16) uint32_t sharedMemory[1024]; + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) { - const uint32_t long_oft = (thread << LONG_SHL64) + sub; - const uint32_t st_oft = (thread * 26U) + sub + 8U; + const uint32_t batchsize = ITER >> (2 + bfactor); + const uint32_t start = partidx * batchsize; + const uint32_t end = start + batchsize; + uint64_t tweak = d_tweak[thread]; + + void * ctx_a = (void*)(&d_ctx_a[thread << 2]); + void * ctx_b = 
(void*)(&d_ctx_b[thread << 2]); + uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4 + uint4 B = AS_UINT4(ctx_b); + + uint64_t * long_state = &d_long_state[thread << LONG_SHL64]; + for (int i = start; i < end; i++) // end = 262144 + { + uint4 C; + uint32_t j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C); + store_variant2(&long_state[j], C ^ B); // st.global + MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak); + + j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B); + store_variant2(&long_state[j], C ^ B); + MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak); + } + if (bfactor) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +} + +// -------------------------------------------------------------------------------------------------------------- - uint4 key[10]; - const uint32_t* ctx_key = &ctx_key2[thread * 40U]; - #pragma unroll 10 // 160 bytes - for (int i = 0; i < 10; i++) - key[i] = AS_UINT4(&ctx_key[i*4U]); +__global__ +void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict__ d_long_state, + uint32_t * __restrict__ d_ctx_state, const uint32_t * __restrict__ d_ctx_key2) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; - uint4 text = AS_UINT4(&ctx_state[st_oft]); + if(thread < threads) + { + const int sub = (threadIdx.x & 7) << 2; + const uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub]; + uint32_t key[40], text[4]; + MEMCPY8(key, d_ctx_key2 + thread * 40, 20); + MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2); - for(uint32_t i = 0; i < LONG_LOOPS64; i += 16U) + for(int i = 0; i < LONG_LOOPS32; i += 32) { - uint4 st = AS_UINT4(&long_state[long_oft + i]); - text = text ^ st; - cn_aes_pseudo_round_mut_uint4(sharedMemory, text, key); + #pragma unroll + for(int j = 0; j < 4; ++j) + text[j] ^= longstate[i + j]; + + cn_aes_pseudo_round_mut(sharedMemory, text, key); } - AS_UINT4(&ctx_state[st_oft]) = text; + MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2); } } +// -------------------------------------------------------------------------------------------------------------- + extern int device_bfactor[MAX_GPUS]; __host__ -void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, - uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2) +void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, + uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak) { dim3 grid(blocks); dim3 block(threads); - //dim3 block2(threads << 1); dim3 block4(threads << 2); dim3 block8(threads << 3); - const uint32_t bfactor = (uint32_t) device_bfactor[thr_id]; - const uint32_t partcount = 1 << bfactor; + const uint16_t bfactor = (uint16_t) device_bfactor[thr_id]; + const uint32_t partcount = 1U << bfactor; const uint32_t throughput = (uint32_t) (blocks*threads); const int bsleep = bfactor ? 
100 : 0; const int dev_id = device_map[thr_id]; - cryptonight_core_gpu_phase1 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key1); + cryptonight_gpu_phase1 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key1); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); for (uint32_t i = 0; i < partcount; i++) { dim3 b = device_sm[dev_id] >= 300 ? block4 : block; - cryptonight_core_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + if (variant == 0) + cryptonight_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + else if (variant == 1 || cryptonight_fork == 8) + monero_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak); + else if (variant == 2 && cryptonight_fork == 3) + stellite_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); } - - cryptonight_core_gpu_phase3 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key2); + //cudaDeviceSynchronize(); + //exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cryptonight_gpu_phase3 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key2); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } diff --git a/crypto/cryptonight-cpu.cpp b/crypto/cryptonight-cpu.cpp index 66b3cf4..b60798f 100644 --- a/crypto/cryptonight-cpu.cpp +++ b/crypto/cryptonight-cpu.cpp @@ -12,6 +12,20 @@ extern "C" { #include "cpu/c_keccak.h" } +static void cryptonight_store_variant(void* state, int variant) { + if (variant == 1 || cryptonight_fork == 8) { + // monero, and graft ? + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); + } else if (variant == 2 && cryptonight_fork == 3) { + // stellite + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30); + } +} + struct cryptonight_ctx { uint8_t long_state[MEMORY]; union cn_slow_hash_state state; @@ -130,14 +144,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui ((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; } -static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) { +static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak1_2) { uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; hi += ((uint64_t*) c)[0]; ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; ((uint64_t*) dst)[0] = hi; - ((uint64_t*) dst)[1] = lo; + ((uint64_t*) dst)[1] = variant ? lo ^ tweak1_2 : lo; }
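/*
 * Editor's note: a portable sketch of the multiply-sum-xor step defined just
 * above (illustrative only, not part of the patch; assumes a compiler with
 * unsigned __int128). hi:lo is the 128-bit product a[0]*dst[0]; each half is
 * summed with c without carry propagation, swap-xored into c, and written
 * back to dst, variant 1 additionally folding tweak1_2 into the low half.
 */
#include <stdint.h>

static void mul_sum_xor_sketch(const uint64_t a[2], uint64_t c[2],
                               uint64_t dst[2], int variant, uint64_t tweak1_2)
{
	const unsigned __int128 p = (unsigned __int128) a[0] * dst[0];
	const uint64_t hi = (uint64_t)(p >> 64) + c[0];
	const uint64_t lo = (uint64_t) p + c[1];
	c[0] = dst[0] ^ hi;
	c[1] = dst[1] ^ lo;
	dst[0] = hi;
	dst[1] = variant ? (lo ^ tweak1_2) : lo;
}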
static void copy_block(uint8_t* dst, const uint8_t* src) { @@ -155,13 +169,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; } -static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx) +static int cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant) { size_t i, j; + if (variant && len < 43) + return 0; + keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); + const uint64_t tweak1_2 = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0; + oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { #undef RND @@ -184,14 +203,16 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st j = e2i(ctx->a) * AES_BLOCK_SIZE; aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); + cryptonight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]); + mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak1_2); j = e2i(ctx->a) * AES_BLOCK_SIZE; aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); + cryptonight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]); + mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak1_2); } memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); @@ -217,11 +238,38 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); oaes_free((OAES_CTX **) &ctx->aes_ctx); + return 1; } -void cryptonight_hash(void* output, const void* input, size_t len) +int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant) { struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); - cryptonight_hash_ctx(output, input, len, ctx); + int rc = cryptonight_hash_ctx(output, input, len, ctx, variant); free(ctx); + return rc; +} + +void cryptonight_hash(void* output, const void* input) +{ + cryptonight_fork = 1; + cryptonight_hash_variant(output, input, 76, 0); +} + +void graft_hash(void* output, const void* input) +{ + cryptonight_fork = 8; + cryptonight_hash_variant(output, input, 76, 1); +} + +void monero_hash(void* output, const void* input) +{ + cryptonight_fork = 7; + cryptonight_hash_variant(output, input, 76, 1); } + +void stellite_hash(void* output, const void* input) +{ + cryptonight_fork = 3; + cryptonight_hash_variant(output, input, 76, 2); +} + diff --git a/crypto/cryptonight-extra.cu b/crypto/cryptonight-extra.cu index 6d3c131..c55c518 100644 --- a/crypto/cryptonight-extra.cu +++ b/crypto/cryptonight-extra.cu @@ -7,15 +7,15 @@ #include #include -#include "cryptonight.h" -typedef uint8_t BitSequence; -typedef uint64_t DataLength; +#include "cryptonight.h" -static uint32_t *d_input[MAX_GPUS] = { 0 }; +static uint32_t *d_input[MAX_GPUS]; static uint32_t *d_target[MAX_GPUS]; static uint32_t *d_result[MAX_GPUS]; +typedef uint8_t BitSequence; +typedef uint32_t
DataLength; #include "cn_keccak.cuh" #include "cn_blake.cuh" #include "cn_groestl.cuh" @@ -44,13 +44,11 @@ __constant__ uint8_t d_sub_byte[16][16] = { __device__ __forceinline__ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data) { - const uint32_t aes_gf[] = { + const uint32_t aes_gf[10] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 }; - MEMSET4(key, 0, 40); MEMCPY4(key, data, 8); - #pragma unroll for(int i = 8; i < 40; i++) { @@ -74,15 +72,14 @@ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __res } __global__ -void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce, - uint64_t * d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, - uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2) +void cryptonight_extra_gpu_prepare(const uint32_t threads, const uint32_t * __restrict__ d_input, uint32_t startNonce, + uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint64_t * d_ctx_tweak) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if(thread < threads) { - uint32_t ctx_state[50]; + uint64_t ctx_state[25]; uint32_t ctx_a[4]; uint32_t ctx_b[4]; uint32_t ctx_key1[40]; @@ -90,92 +87,62 @@ void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict uint32_t input[19]; MEMCPY4(input, d_input, 19); - *((uint32_t *)(((char *)input) + 39)) = startNonce + thread; - - cn_keccak((uint8_t *)input, (uint8_t *)ctx_state); - cryptonight_aes_set_key(ctx_key1, ctx_state); - cryptonight_aes_set_key(ctx_key2, ctx_state + 8); - XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a); - XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b); - - MEMCPY8(&d_ctx_state[thread * 26], ctx_state, 25); - MEMCPY4(d_ctx_a + thread * 4, ctx_a, 4); - MEMCPY4(d_ctx_b + thread * 4, ctx_b, 4); - MEMCPY4(d_ctx_key1 + thread * 40, ctx_key1, 40); - MEMCPY4(d_ctx_key2 + thread * 40, ctx_key2, 40); - } -} -__global__ -void cryptonight_extra_gpu_keccak(uint32_t threads, uint32_t * d_ctx_state) -{ - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if(thread < threads) - { - uint64_t* ctx_state = (uint64_t*) (&d_ctx_state[thread * 52U]); - uint64_t state[25]; - #pragma unroll - for(int i = 0; i < 25; i++) - state[i] = ctx_state[i]; - - cn_keccakf2(state); - - // to reduce the final kernel stack frame, cut algos in 2 kernels - // ps: these 2 final kernels are not important for the overall xmr hashrate (< 1%) - switch (((uint8_t*)state)[0] & 0x03) - { - case 0: { - uint32_t hash[8]; - cn_blake((uint8_t*)state, 200, (uint8_t*)hash); - ((uint32_t*)ctx_state)[0] = 0; - ((uint32_t*)ctx_state)[6] = hash[6]; - ((uint32_t*)ctx_state)[7] = hash[7]; - break; - } - case 1: { - uint32_t hash[8]; - cn_groestl((BitSequence*)state, 200, (BitSequence*)hash); - ((uint32_t*)ctx_state)[0] = 0; - ((uint32_t*)ctx_state)[6] = hash[6]; - ((uint32_t*)ctx_state)[7] = hash[7]; - break; - } - default: { - #pragma unroll - for(int i = 0; i < 25; i++) - ctx_state[i] = state[i]; - } + uint32_t nonce = startNonce + thread; + *(((uint8_t *)input) + 39) = nonce & 0xff; + *(((uint8_t *)input) + 40) = (nonce >> 8) & 0xff; + *(((uint8_t *)input) + 41) = (nonce >> 16) & 0xff; + *(((uint8_t *)input) + 42) = (nonce >> 24) & 0xff; + + cn_keccak(input, ctx_state); + MEMCPY4(&d_ctx_state[thread * 50U], 
ctx_state, 50); + + cryptonight_aes_set_key(ctx_key1, (uint32_t*)(&ctx_state[0])); + cryptonight_aes_set_key(ctx_key2, (uint32_t*)(&ctx_state[4])); + MEMCPY4(&d_ctx_key1[thread * 40U], ctx_key1, 40); + MEMCPY4(&d_ctx_key2[thread * 40U], ctx_key2, 40); + + XOR_BLOCKS_DST(&ctx_state[0], &ctx_state[4], ctx_a); + XOR_BLOCKS_DST(&ctx_state[2], &ctx_state[6], ctx_b); + MEMCPY4(&d_ctx_a[thread * 4U], ctx_a, 4); + MEMCPY4(&d_ctx_b[thread * 4U], ctx_b, 4); + + if (variant) { + uint2 tweak = AS_UINT2(&ctx_state[24]); + //tweak.x ^= (input[8] >> 24) | (input[9] << 8); + tweak.x ^= __byte_perm(input[8], input[ 9], 0x6543); + tweak.y ^= __byte_perm(input[9], input[10], 0x6543); + MEMCPY4(&d_ctx_tweak[thread], &tweak, 2); } } } __global__ -void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state, - const uint32_t* d_target, uint32_t * resNonces) +void cryptonight_extra_gpu_final(const uint32_t threads, uint32_t startNonce, const uint32_t * __restrict__ d_target, + uint32_t * __restrict__ resNonces, uint32_t * __restrict__ d_ctx_state) { - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; if(thread < threads) { - uint64_t* const state = &d_ctx_state[thread * 26U]; - + uint32_t *ctx_state = &d_ctx_state[thread * 50U]; uint32_t hash[8]; - switch(((uint8_t *)state)[0] & 0x03) - { - case 0: { - uint32_t* h32 = (uint32_t*)state; - hash[6] = h32[6]; - hash[7] = h32[7]; - break; - } - case 2: { - cn_jh256((uint8_t*)state, 200, hash); - break; - } - case 3: { - cn_skein((uint8_t*)state, 200, hash); - break; - } - } + uint32_t state[50]; + + #pragma unroll 25 + for(int i = 0; i < 50; i+=2) + AS_UINT2(&state[i]) = AS_UINT2(&ctx_state[i]); + + cn_keccakf2((uint64_t *)state); + + int branch = ((uint8_t *)state)[0] & 0x03; + if(branch == 0) + cn_blake((const uint8_t *)state, 200, hash); + if(branch == 1) + cn_groestl((const uint8_t *)state, 200, hash); + if(branch == 2) + cn_jh((const uint8_t *)state, 200, hash); + if(branch == 3) + cn_skein((const uint8_t *)state, 200, hash); if(hash[7] <= d_target[1] && hash[6] <= d_target[0]) { @@ -188,55 +155,53 @@ void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, ui } __host__ -void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *ptarget) +void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget) { uint32_t *pTargetIn = (uint32_t*) ptarget; - cudaMemcpy(d_input[thr_id], data, 19 * sizeof(uint32_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2*sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_input[thr_id], data, 20 * sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2 * sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemset(d_result[thr_id], 0xFF, 2 * sizeof(uint32_t)); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } __host__ -void cryptonight_extra_cpu_init(int thr_id, uint32_t threads) +void cryptonight_extra_init(int thr_id) { - cudaMalloc(&d_input[thr_id], 19 * sizeof(uint32_t)); - cudaMalloc(&d_target[thr_id], 2*sizeof(uint32_t)); - cudaMalloc(&d_result[thr_id], 2*sizeof(uint32_t)); + cudaMalloc(&d_input[thr_id], 20 * sizeof(uint32_t)); + cudaMalloc(&d_target[thr_id], 2 * sizeof(uint32_t)); + cudaMalloc(&d_result[thr_id], 2 * sizeof(uint32_t)); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } __host__ -void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t 
startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2) +void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak) { uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2); + cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } __host__ -void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint64_t *d_ctx_state) +void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state) { uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cudaMemset(d_result[thr_id], 0xFF, 2*sizeof(uint32_t)); - exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cryptonight_extra_gpu_keccak <<<grid, block>>> (threads, (uint32_t*)d_ctx_state); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_ctx_state, d_target[thr_id], d_result[thr_id]); + cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_target[thr_id], d_result[thr_id], d_ctx_state); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(resNonces, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } __host__ -void cryptonight_extra_cpu_free(int thr_id) +void cryptonight_extra_free(int thr_id) { if (d_input[thr_id]) { cudaFree(d_input[thr_id]); @@ -244,4 +209,4 @@ void cryptonight_extra_cpu_free(int thr_id) cudaFree(d_result[thr_id]); d_input[thr_id] = NULL; } -} \ No newline at end of file +} diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu index 0214ce4..5f92972 100644 --- a/crypto/cryptonight.cu +++ b/crypto/cryptonight.cu @@ -12,16 +12,17 @@ static __thread bool gpu_init_shown = false; gpulog(p, thr, fmt, ##__VA_ARGS__) static uint64_t *d_long_state[MAX_GPUS]; -static uint64_t *d_ctx_state[MAX_GPUS]; +static uint32_t *d_ctx_state[MAX_GPUS]; static uint32_t *d_ctx_key1[MAX_GPUS]; static uint32_t *d_ctx_key2[MAX_GPUS]; static uint32_t *d_ctx_text[MAX_GPUS]; +static uint64_t *d_ctx_tweak[MAX_GPUS]; static uint32_t *d_ctx_a[MAX_GPUS]; static uint32_t *d_ctx_b[MAX_GPUS]; static bool init[MAX_GPUS] = { 0 }; -extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant) { int res = 0; uint32_t throughput = 0; @@ -49,6 +50,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id], mem, device_mpcount[dev_id]); + if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) { + device_config[thr_id] = strdup("80x24"); + } + if (device_config[thr_id]) { int res = sscanf(device_config[thr_id],
"%ux%u", &cn_blocks, &cn_threads); throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); @@ -70,7 +75,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ exit(1); } - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); @@ -79,11 +84,11 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ } const size_t alloc = MEMORY * throughput; - cryptonight_extra_cpu_init(thr_id, throughput); + cryptonight_extra_init(thr_id); cudaMalloc(&d_long_state[thr_id], alloc); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 52*4 (200 is not aligned 16) + cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); @@ -95,6 +100,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput); + exit_if_cudaerror(thr_id, __FILE__, __LINE__); gpu_init_shown = true; init[thr_id] = true; @@ -107,10 +114,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ const uint32_t Htarg = ptarget[7]; uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; - cryptonight_extra_cpu_setData(thr_id, pdata, ptarget); - cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); - cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); - cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); + cryptonight_extra_setData(thr_id, pdata, ptarget); + cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); *hashes_done = nonce - first_nonce + throughput; @@ -121,8 +128,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39); memcpy(tempdata, pdata, 76); *tempnonceptr = resNonces[0]; - cryptonight_hash(vhash, tempdata, 76); - if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) + const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget)) { res = 1; work->nonces[0] = resNonces[0]; @@ -131,8 +138,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ if(resNonces[1] != UINT32_MAX) { *tempnonceptr = resNonces[1]; - cryptonight_hash(vhash, tempdata, 76); - if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(rc && (vhash[7] 
<= Htarg) && fulltest(vhash, ptarget)) { res++; work->nonces[1] = resNonces[1]; } else { @@ -174,10 +181,11 @@ void free_cryptonight(int thr_id) cudaFree(d_ctx_key1[thr_id]); cudaFree(d_ctx_key2[thr_id]); cudaFree(d_ctx_text[thr_id]); + cudaFree(d_ctx_tweak[thr_id]); cudaFree(d_ctx_a[thr_id]); cudaFree(d_ctx_b[thr_id]); - cryptonight_extra_cpu_free(thr_id); + cryptonight_extra_free(thr_id); cudaDeviceSynchronize(); diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h index 4a31832..00417b9 100644 --- a/crypto/cryptonight.h +++ b/crypto/cryptonight.h @@ -20,7 +20,6 @@ struct uint3 blockDim; #define __umul64hi(a,b) a*b #endif - #define MEMORY (1U << 21) // 2 MiB / 2097152 B #define ITER (1U << 20) // 1048576 #define E2I_MASK 0x1FFFF0u @@ -136,10 +135,10 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line) exit(1); } } -void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); +void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); -void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn); -void cryptonight_extra_cpu_init(int thr_id, uint32_t threads); -void cryptonight_extra_cpu_free(int thr_id); -void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); -void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state); +void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget); +void cryptonight_extra_init(int thr_id); +void cryptonight_extra_free(int thr_id); +void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); +void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state); diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp index 82b7845..433caa7 100644 --- a/crypto/xmr-rpc.cpp +++ b/crypto/xmr-rpc.cpp @@ -550,18 +550,24 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work) } else if (opt_algo == ALGO_CRYPTOLIGHT) { + int variant = 1; uint32_t nonce = work->nonces[idnonce]; noncestr = bin2hex((unsigned char*) &nonce, 4); last_found_nonce = nonce; - cryptolight_hash(hash, data, 76); + //if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork) + // variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1; + cryptolight_hash_variant(hash, data, 76, variant); work_set_target_ratio(work, (uint32_t*) hash); } else if (opt_algo == ALGO_CRYPTONIGHT) { + int variant = 0; uint32_t nonce = work->nonces[idnonce]; noncestr = bin2hex((unsigned char*) &nonce, 4); last_found_nonce = nonce; - cryptonight_hash(hash, data, 76); + if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork) + variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1; + cryptonight_hash_variant(hash, data, 76, variant); work_set_target_ratio(work, (uint32_t*) hash); } diff --git a/miner.h b/miner.h index 2853906..86088cb 100644 --- 
a/miner.h +++ b/miner.h @@ -279,8 +279,8 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant); +extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant); extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -575,6 +575,8 @@ extern uint32_t device_plimit[MAX_GPUS]; extern uint32_t gpus_intensity[MAX_GPUS]; extern int opt_cudaschedule; +extern int cryptonight_fork; + // cuda.cpp int cuda_num_devices(); void cuda_devicenames(); @@ -898,8 +900,12 @@ void blake2b_hash(void *output, const void *input); void blake2s_hash(void *output, const void *input); void bmw_hash(void *state, const void *input); void c11hash(void *output, const void *input); -void cryptolight_hash(void* output, const void* input, int len); -void cryptonight_hash(void* output, const void* input, size_t len); +int cryptolight_hash_variant(void* output, const void* input, int len, int variant); +void cryptolight_hash(void* output, const void* input); +int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant); +void cryptonight_hash(void* output, const void* input); +void monero_hash(void* output, const void* input); +void stellite_hash(void* output, const void* input); void decred_hash(void *state, const void *input); void deephash(void *state, const void *input); void luffa_hash(void *state, const void *input); diff --git a/util.cpp b/util.cpp index 49cd854..9c2194d 100644 --- a/util.cpp +++ b/util.cpp @@ -2193,10 +2193,10 @@ void print_hash_tests(void) c11hash(&hash[0], &buf[0]); printpfx("c11", hash); - cryptolight_hash(&hash[0], &buf[0], 76); + cryptolight_hash(&hash[0], &buf[0]); printpfx("cryptolight", hash); - cryptonight_hash(&hash[0], &buf[0], 76); + cryptonight_hash(&hash[0], &buf[0]); printpfx("cryptonight", hash); memset(buf, 0, 180); @@ -2246,6 +2246,9 @@ void print_hash_tests(void) lyra2Z_hash(&hash[0], &buf[0]); printpfx("lyra2z", hash); + monero_hash(&hash[0], &buf[0]); + printpfx("monero", hash); + myriadhash(&hash[0], &buf[0]); printpfx("myriad", hash); @@ -2297,6 +2300,9 @@ void print_hash_tests(void) skunk_hash(&hash[0], &buf[0]); printpfx("skunk", hash); + stellite_hash(&hash[0], &buf[0]); + printpfx("stellite", hash); + s3hash(&hash[0], &buf[0]); printpfx("S3", hash); From 6dc1bbdd47bbd4b85850ab43540c71875ef95e92 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 23 Jun 2018 13:39:38 +0200 Subject: [PATCH 14/24] prepare the new release --- README.txt | 13 ++++++++++--- compat/ccminer-config.h | 2 +- configure.ac | 2 +- res/ccminer.rc | 8 ++++---- 4 files
changed, 16 insertions(+), 9 deletions(-) diff --git a/README.txt b/README.txt index 0862870..148f089 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.2.6 "phi2 and allium" +ccminer 2.3 "phi2 and cryptonight variants" --------------------------------------------------------------- *************************************************************** @@ -80,8 +80,8 @@ its command line interface and options. blakecoin use to mine Old Blake 256 blake2s use to mine Nevacoin (Blake2-S 256) bmw use to mine Midnight - cryptolight use to mine AEON cryptonight (MEM/2) - cryptonight use to mine XMR cryptonight, Bytecoin, Dash, DigitalNote, etc + cryptolight use to mine AEON cryptonight variant 1 (MEM/2) + cryptonight use to mine original cryptonight c11/flax use to mine Chaincoin and Flax decred use to mine Decred 180 bytes Blake256-14 deep use to mine Deepcoin @@ -99,10 +99,12 @@ its command line interface and options. lyra2 use to mine CryptoCoin lyra2v2 use to mine Vertcoin lyra2z use to mine Zerocoin (XZC) + monero use to mine Monero (XMR) myr-gr use to mine Myriad-Groestl neoscrypt use to mine FeatherCoin, Trezarcoin, Orbitcoin, etc nist5 use to mine TalkCoin penta use to mine Joincoin / Pentablake + phi1612 use to mine Seraph phi2 use to mine LUXCoin polytimos use to mine Polytimos quark use to mine Quarkcoin @@ -117,6 +119,7 @@ its command line interface and options. skein use to mine Skeincoin skein2 use to mine Woodcoin skunk use to mine Signatum + stellite use to mine Stellite (a cryptonight variant) timetravel use to mine MachineCoin tribus use to mine Denarius x11evo use to mine Revolver @@ -282,6 +285,10 @@ so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< + June 23rd 2018 v2.3 + Handle phi2 header variation for smart contracts + Handle monero, stellite, graft and cryptolight variants + June 10th 2018 v2.2.6 New phi2 algo for LUX New allium algo for Garlic diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index 5b36078..030e89f 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package.
*/ -#define PACKAGE_VERSION "2.2.6" +#define PACKAGE_VERSION "2.3" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/configure.ac b/configure.ac index 5489e9c..9030e7e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2.7], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.3], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/res/ccminer.rc b/res/ccminer.rc index 78be94c..18eb1d2 100644 --- a/res/ccminer.rc +++ b/res/ccminer.rc @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" // VS_VERSION_INFO VERSIONINFO - FILEVERSION 2,2,6,0 - PRODUCTVERSION 2,2,6,0 + FILEVERSION 2,3,0,0 + PRODUCTVERSION 2,3,0,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x21L @@ -76,10 +76,10 @@ BEGIN BEGIN BLOCK "040904e4" BEGIN - VALUE "FileVersion", "2.2.6" + VALUE "FileVersion", "2.3" VALUE "LegalCopyright", "Copyright (C) 2018" VALUE "ProductName", "ccminer" - VALUE "ProductVersion", "2.2.6" + VALUE "ProductVersion", "2.3" END END BLOCK "VarFileInfo" From d9f242b8d1a1ef46e584f69666450fbc4431db15 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 23 Jun 2018 14:40:29 +0200 Subject: [PATCH 15/24] add sonoa algo, heavy x17 hashes seem to work, more or less correctly (a few validation errors) --- Makefile.am | 2 +- README.txt | 10 +- algos.h | 2 + bench.cpp | 2 + ccminer.cpp | 7 +- ccminer.vcxproj | 3 +- ccminer.vcxproj.filters | 3 + miner.h | 2 + x17/sonoa.cu | 632 ++++++++++++++++++++++++++++++++++++++++ 9 files changed, 657 insertions(+), 6 deletions(-) create mode 100644 x17/sonoa.cu diff --git a/Makefile.am b/Makefile.am index 80a80c8..ddfbec6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -80,7 +80,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x16/x16r.cu x16/x16s.cu x16/cuda_x16_echo512.cu x16/cuda_x16_fugue512.cu \ x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \ x16/cuda_x16_echo512_64.cu \ - x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ + x17/x17.cu x17/hmq17.cu x17/sonoa.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu phi/cuda_phi2_cubehash512.cu x11/cuda_streebog_maxwell.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu diff --git a/README.txt b/README.txt index 148f089..cb60fca 100644 --- a/README.txt +++ b/README.txt @@ -41,19 +41,21 @@ Keccak (Maxcoin) Pentablake (Blake 512 x5) 1Coin Triple S Neoscrypt (FeatherCoin) -Revolver (X11evo) +x11evo (Revolver) +phi2 (LUXCoin) Scrypt and Scrypt:N Scrypt-Jane (Chacha) -Sibcoin (sib) +sib (Sibcoin) Skein (Skein + SHA) Signatum (Skein cubehash fugue Streebog) +SonoA (Sono) Tribus (JH, keccak, simd) Woodcoin (Double Skein) Vanilla (Blake256 8-rounds - double sha256) Vertcoin Lyra2RE Ziftrcoin (ZR5) Boolberry (Wild Keccak) -Monero (Cryptonight) +Monero (Cryptonight v7 with -a monero) Aeon (Cryptonight-lite) where some of these coins have a VERY NOTABLE nVidia advantage @@ -119,6 +121,7 @@ its command line interface and options. skein use to mine Skeincoin skein2 use to mine Woodcoin skunk use to mine Signatum + sonoa use to mine Sono stellite use to mine Stellite (a cryptonight variant) timetravel use to mine MachineCoin tribus use to mine Denarius x11evo use to mine Revolver @@ -288,6 +291,7 @@ features.
June 23rd 2018 v2.3 Handle phi2 header variation for smart contracts Handle monero, stellite, graft and cryptolight variants + Handle SonoA algo June 10th 2018 v2.2.6 New phi2 algo for LUX diff --git a/algos.h b/algos.h index c484bcc..dfbf7d8 100644 --- a/algos.h +++ b/algos.h @@ -52,6 +52,7 @@ enum sha_algos { ALGO_SKEIN, ALGO_SKEIN2, ALGO_SKUNK, + ALGO_SONOA, ALGO_S3, ALGO_TIMETRAVEL, ALGO_TRIBUS, @@ -129,6 +130,7 @@ static const char *algo_names[] = { "skein", "skein2", "skunk", + "sonoa", "s3", "timetravel", "tribus", diff --git a/bench.cpp b/bench.cpp index 84f9bc5..894fd8a 100644 --- a/bench.cpp +++ b/bench.cpp @@ -82,6 +82,7 @@ void algo_free_all(int thr_id) free_nist5(thr_id); free_pentablake(thr_id); free_phi(thr_id); + free_phi2(thr_id); free_polytimos(thr_id); free_quark(thr_id); free_qubit(thr_id); @@ -92,6 +93,7 @@ free_sha256t(thr_id); free_sia(thr_id); free_sib(thr_id); + free_sonoa(thr_id); free_s3(thr_id); free_vanilla(thr_id); free_veltor(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 6521284..c2b34f8 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -274,7 +274,7 @@ Options:\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ - phi LUX initial algo\n\ + phi1612 LUX initial algo, for Seraph\n\ phi2 LUX v2 with lyra2\n\ polytimos Politimos\n\ quark Quark\n\ @@ -288,6 +288,7 @@ Options:\n\ skein Skein SHA2 (Skeincoin)\n\ skein2 Double Skein (Woodcoin)\n\ skunk Skein Cube Fugue Streebog\n\ + sonoa 97 hashes based on X17 ones (Sono)\n\ stellite Cryptonight v3\n\ s3 S3 (1Coin)\n\ timetravel Machinecoin permuted x8\n\ @@ -2299,6 +2300,7 @@ static void *miner_thread(void *userdata) case ALGO_NEOSCRYPT: case ALGO_SIB: case ALGO_SCRYPT: + case ALGO_SONOA: case ALGO_VELTOR: minmax = 0x80000; break; @@ -2508,6 +2510,9 @@ static void *miner_thread(void *userdata) case ALGO_SIB: rc = scanhash_sib(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_SONOA: + rc = scanhash_sonoa(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_S3: rc = scanhash_s3(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index c0aa954..f3d3e28 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -591,7 +591,6 @@ - @@ -604,6 +603,8 @@ compute_50,sm_50;compute_52,sm_52 + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 667331a..a1b9e86 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -778,6 +778,9 @@ Source Files\CUDA\x17 + + Source Files\CUDA\x17 + Source Files\CUDA\x17 diff --git a/miner.h b/miner.h index 86088cb..368b3cb 100644 --- a/miner.h +++ b/miner.h @@ -315,6 +315,7 @@ extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_skunk(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_tribus(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bitcore(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -384,6 +385,7 @@
extern void free_skeincoin(int thr_id); extern void free_skein2(int thr_id); extern void free_skunk(int thr_id); extern void free_s3(int thr_id); +extern void free_sonoa(int thr_id); extern void free_timetravel(int thr_id); extern void free_tribus(int thr_id); extern void free_bitcore(int thr_id); diff --git a/x17/sonoa.cu b/x17/sonoa.cu new file mode 100644 index 0000000..153f787 --- /dev/null +++ b/x17/sonoa.cu @@ -0,0 +1,632 @@ +/** + * x97 SONO + **/ + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_sha2.h" +#include "sph/sph_haval.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +#define NBN 2 + +static uint32_t *d_hash[MAX_GPUS]; + +extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + +extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen); + +// CPU Hash Validation +extern "C" void sonoa_hash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[64]; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + sph_sha512_context ctx_sha512; + sph_haval256_5_context ctx_haval; + + + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*)hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const 
void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const 
void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, 
(void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512, (const void*)hash, 64); + sph_sha512_close(&ctx_sha512, (void*)hash); + + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + sph_sha512(&ctx_sha512, (const void*)hash, 64); + sph_sha512_close(&ctx_sha512, (void*)hash); + + sph_haval256_5_init(&ctx_haval); + sph_haval256_5(&ctx_haval, (const void*)hash, 64); + sph_haval256_5_close(&ctx_haval, (void*)hash); + + memcpy(output, hash, 32); +} + +#define x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash) \ + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \ + if (use_compat_kernels[thr_id]) x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \ + else x16_echo512_cpu_hash_64(thr_id, throughput, d_hash) + + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t 
*ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + + uint32_t default_throughput = 1 << 18; + if (device_sm[dev_id] <= 500) default_throughput = 1 << 18; + else if (device_sm[dev_id] <= 520) default_throughput = 1 << 18; + else if (device_sm[dev_id] > 520) default_throughput = (1 << 19) + (1 << 18); + + uint32_t throughput = cuda_default_throughput(thr_id, default_throughput); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + throughput &= 0xFFFFFF00; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x00ff; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO,thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x17_sha512_cpu_init(thr_id, throughput); + x17_haval256_cpu_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 8 * sizeof(uint64_t) * throughput)); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + int warn = 0; + uint32_t _ALIGN(64) endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, 
d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + 
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], 256); order++; + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + sonoa_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + sonoa_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + warn = 0; + } + } + } + + if ((uint64_t)throughput 
+ pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +extern "C" void free_sonoa(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaDeviceSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x15_whirlpool_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +}
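[Reviewer note, not part of the patches] Counting the sph calls in the sonoa_hash() CPU validator above, the chain makes seven passes over the x17 primitives with 11, 11, 12, 16, 15, 16 and 16 stages respectively, which is where the 97 of the "x97 SONO" header comment comes from. A trivial self-check of that count:

/* sonoa pass lengths, counted from the CPU validation code above */
#include <stdio.h>

int main(void)
{
	const int pass_len[7] = { 11, 11, 12, 16, 15, 16, 16 };
	int total = 0;
	for (int i = 0; i < 7; i++)
		total += pass_len[i];
	printf("sonoa stages: %d\n", total); /* prints 97 */
	return 0;
}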
From 654e8a10ec3d5924099a68f0d7ef3d928f126b8f Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 24 Jun 2018 11:54:49 +0200 Subject: [PATCH 16/24] fix g++ 7.3 warnings (ubuntu 18.04) --- api.cpp | 2 +- scrypt.cpp | 14 ++++++++++++-- scrypt/test_kernel.cu | 5 ++--- scrypt/titan_kernel.cu | 4 ++-- sia/sia-rpc.cpp | 8 ++++---- util.cpp | 4 ++-- 6 files changed, 23 insertions(+), 14 deletions(-) diff --git a/api.cpp b/api.cpp index 9014f3a..6edfd31 100644 --- a/api.cpp +++ b/api.cpp @@ -257,7 +257,7 @@ static char *getpoolnfo(char *params) static void gpuhwinfos(int gpu_id) { - char buf[256]; + char buf[512]; char pstate[8]; char* card; struct cgpu_info *cgpu = NULL; diff --git a/scrypt.cpp b/scrypt.cpp index a6b9b70..68e81e4 100644 --- a/scrypt.cpp +++ b/scrypt.cpp @@ -50,7 +50,17 @@ using namespace Concurrency; #if _MSC_VER > 1800 #undef _THROW1 +#if __cplusplus < 201103L #define _THROW1(x) throw(std::bad_alloc) +#else +#define _THROW1(x) noexcept(false) +#endif +#elif !defined(_MSC_VER) +#if __cplusplus < 201103L +#define _THROW1(x) throw(std::bad_alloc) +#else +#define _THROW1(x) noexcept(false) +#endif #endif // A thin wrapper around the builtin __m128i type @@ -63,9 +73,9 @@ public: void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); } void operator delete[](void *p) { _aligned_free(p); } #else - void * operator new(size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); } + void * operator new(size_t size) _THROW1(_STD bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); } void operator delete(void *p) { free(p); } - void * operator new[](size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); } + void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); } void operator delete[](void *p) { free(p); } #endif uint32x4_t() { }; diff --git a/scrypt/test_kernel.cu b/scrypt/test_kernel.cu index e4467d1..ab5b03c 100644 --- a/scrypt/test_kernel.cu +++ b/scrypt/test_kernel.cu @@ -47,7 +47,7 @@ texture<uint4, 2, cudaReadModeElementType> texRef2D_4_V; template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); -static __host__ __device__ uint4& operator^=(uint4& left, const uint4& right) { +static __device__ uint4& operator^=(uint4& left, const uint4& right) { left.x ^= right.x; left.y ^= right.y; left.z ^= right.z; @@ -55,7 +55,7 @@ static __host__ __device__ uint4& operator^=(uint4& left, const uint4& right) { return left; } -static __host__ __device__ uint4& operator+=(uint4& left, const uint4& right) { +static __device__ uint4& operator+=(uint4& left, const uint4& right) { left.x += right.x; left.y += right.y; left.z += right.z; @@ -63,7 +63,6 @@ static __host__ __device__ uint4& operator+=(uint4& left, const uint4& right) { return left; } - /* write_keys writes the 8 keys being processed by a warp to the global * scratchpad. To effectively use memory bandwidth, it performs the writes * (and reads, for read_keys) 128 bytes at a time per memory location diff --git a/scrypt/titan_kernel.cu b/scrypt/titan_kernel.cu index 1758722..57672a2 100644 --- a/scrypt/titan_kernel.cu +++ b/scrypt/titan_kernel.cu @@ -50,7 +50,7 @@ __constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP)-1 template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); -static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) { +static __device__ uint4& operator ^= (uint4& left, const uint4& right) { left.x ^= right.x; left.y ^= right.y; left.z ^= right.z; @@ -58,7 +58,7 @@ static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) return left; } -static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) { +static __device__ uint4& operator += (uint4& left, const uint4& right) { left.x += right.x; left.y += right.y; left.z += right.z; diff --git a/sia/sia-rpc.cpp b/sia/sia-rpc.cpp index 5eafe9e..4770426 100644 --- a/sia/sia-rpc.cpp +++ b/sia/sia-rpc.cpp @@ -74,10 +74,10 @@ char* sia_getheader(CURL *curl, struct pool_infos *pool) struct data_buffer all_data = { 0 }; struct curl_slist *headers = NULL; char data[256] = { 0 }; - char url[512]; + char url[512*3]; // nanopool - snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", //&longpoll + snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s", //&longpoll pool->url, pool->user, pool->pass); if (opt_protocol) @@ -148,7 +148,7 @@ bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work) struct data_buffer all_data = { 0 }; struct curl_slist *headers = NULL; char buf[256] = { 0 }; - char url[512]; + char url[512*3]; if (opt_protocol) applog_hex(work->data, 80); @@ -156,7 +156,7 @@ bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work) //applog_hex(&work->data[10], 4); // nanopool - snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", + snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s", pool->url, pool->user, pool->pass); if (opt_protocol) diff --git a/util.cpp b/util.cpp index 9c2194d..66617af 100644 --- a/util.cpp +++ b/util.cpp @@ -616,7 +616,7 @@ err_out: json_t *json_rpc_call_pool(CURL *curl, struct pool_infos *pool, const char *req, bool longpoll_scan, bool longpoll, int *curl_err) { - char userpass[512]; + char userpass[768]; // todo, malloc and store that in pool array snprintf(userpass, sizeof(userpass), "%s%c%s", pool->user, strlen(pool->pass)?':':'\0', pool->pass); @@ -627,7 +627,7 @@ json_t *json_rpc_call_pool(CURL *curl, struct pool_infos *pool, const char *req, /* called only from longpoll thread, we have the lp_url */ json_t *json_rpc_longpoll(CURL *curl, char *lp_url, struct pool_infos *pool, const char *req, int *curl_err) { - char userpass[512]; + char userpass[768]; snprintf(userpass, sizeof(userpass), "%s%c%s", pool->user, strlen(pool->pass)?':':'\0', pool->pass); From 370684f7435d1256cbabef4410a57ed5bc705fdc Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 24 Jun 2018 12:25:42 +0200 Subject: [PATCH 17/24] cryptonight: some code finishing touches ---
crypto/cryptolight-cpu.cpp | 42 +++++++++++--------------- crypto/cryptonight-cpu.cpp | 60 +++++++++++++++++--------------------- crypto/cryptonight.cu | 8 ++--- miner.h | 4 +-- 4 files changed, 50 insertions(+), 64 deletions(-) diff --git a/crypto/cryptolight-cpu.cpp b/crypto/cryptolight-cpu.cpp index f995b4c..14cd3af 100644 --- a/crypto/cryptolight-cpu.cpp +++ b/crypto/cryptolight-cpu.cpp @@ -22,16 +22,6 @@ struct cryptonight_ctx { oaes_ctx* aes_ctx; }; - -static void cryptolight_store_variant(void* state, int variant) { - if (variant == 1) { - // use variant 1 like monero since june 2018 - const uint8_t tmp = ((const uint8_t*)(state))[11]; - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); - } -} - static void do_blake_hash(const void* input, int len, void* output) { uchar hash[32]; @@ -145,7 +135,6 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) { uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; hi += ((uint64_t*) c)[0]; - ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; ((uint64_t*) dst)[0] = hi; @@ -167,11 +156,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; } -static int cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant) +static void cryptolight_store_variant(void* state, int variant) { + if (variant == 1) { + // use variant 1 like monero since june 2018 + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); + } +} + +static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant) { size_t i, j; - if (variant && len < 43) - return 0; keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); @@ -181,8 +177,8 @@ static int cryptolight_hash_ctx(void* output, const void* input, const int len, oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { -#undef RND -#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data); + #undef RND + #define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data); RND(0); RND(1); RND(2); @@ -202,23 +198,21 @@ static int cryptolight_hash_ctx(void* output, const void* input, const int len, aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); cryptolight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak); j = e2i(ctx->a); aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); cryptolight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak); } memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { -#undef RND -#define RND(p) 
xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \ - aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data); + #undef RND + #define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \ + aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data); RND(0); RND(1); RND(2); @@ -236,15 +230,13 @@ static int cryptolight_hash_ctx(void* output, const void* input, const int len, if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); oaes_free((OAES_CTX **) &ctx->aes_ctx); - return 1; } -int cryptolight_hash_variant(void* output, const void* input, int len, int variant) +void cryptolight_hash_variant(void* output, const void* input, int len, int variant) { struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); - int rc = cryptolight_hash_ctx(output, input, len, ctx, variant); + cryptolight_hash_ctx(output, input, len, ctx, variant); free(ctx); - return rc; } void cryptolight_hash(void* output, const void* input) diff --git a/crypto/cryptonight-cpu.cpp b/crypto/cryptonight-cpu.cpp index b60798f..582d096 100644 --- a/crypto/cryptonight-cpu.cpp +++ b/crypto/cryptonight-cpu.cpp @@ -12,20 +12,6 @@ extern "C" { #include "cpu/c_keccak.h" } -static void cryptonight_store_variant(void* state, int variant) { - if (variant == 1 || cryptonight_fork == 8) { - // monero, and graft ? - const uint8_t tmp = ((const uint8_t*)(state))[11]; - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); - } else if (variant == 2 && cryptonight_fork == 3) { - // stellite - const uint8_t tmp = ((const uint8_t*)(state))[11]; - const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30); - } -} - struct cryptonight_ctx { uint8_t long_state[MEMORY]; union cn_slow_hash_state state; @@ -144,14 +130,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui ((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; } -static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak1_2) { +static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) { uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; hi += ((uint64_t*) c)[0]; ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; ((uint64_t*) dst)[0] = hi; - ((uint64_t*) dst)[1] = variant ? lo ^ tweak1_2 : lo; + ((uint64_t*) dst)[1] = variant ? 
lo ^ tweak : lo; } static void copy_block(uint8_t* dst, const uint8_t* src) { @@ -169,22 +155,34 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; } -static int cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant) +static void cryptonight_store_variant(void* state, int variant) { + if (variant == 1 || cryptonight_fork == 8) { + // monero and graft + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); + } else if (variant == 2 && cryptonight_fork == 3) { + // stellite + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30); + } +} + +static void cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant) { size_t i, j; - if (variant && len < 43) - return 0; keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); - const uint64_t tweak1_2 = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0; + const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0; oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { -#undef RND -#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data); + #undef RND + #define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data); RND(0); RND(1); RND(2); @@ -204,23 +202,21 @@ static int cryptonight_hash_ctx(void* output, const void* input, const size_t le aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); cryptonight_store_variant(&ctx->long_state[j], variant); - - mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak1_2); + mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak); j = e2i(ctx->a) * AES_BLOCK_SIZE; aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); cryptonight_store_variant(&ctx->long_state[j], variant); - - mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak1_2); + mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak); } memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { -#undef RND -#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \ - aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data); + #undef RND + #define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \ + aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data); RND(0); RND(1); RND(2); @@ -238,15 +234,13 @@ static int cryptonight_hash_ctx(void* output, const void* input, const size_t le if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); oaes_free((OAES_CTX **) 
&ctx->aes_ctx); - return 1; } -int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant) +void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant) { struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); - int rc = cryptonight_hash_ctx(output, input, len, ctx, variant); + cryptonight_hash_ctx(output, input, len, ctx, variant); free(ctx); - return rc; } void cryptonight_hash(void* output, const void* input) diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu index 5f92972..52d0e97 100644 --- a/crypto/cryptonight.cu +++ b/crypto/cryptonight.cu @@ -128,8 +128,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39); memcpy(tempdata, pdata, 76); *tempnonceptr = resNonces[0]; - const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant); - if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget)) + cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { res = 1; work->nonces[0] = resNonces[0]; @@ -138,8 +138,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ if(resNonces[1] != UINT32_MAX) { *tempnonceptr = resNonces[1]; - const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant); - if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget)) { + cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { res++; work->nonces[1] = resNonces[1]; } else { diff --git a/miner.h b/miner.h index 368b3cb..f866cd9 100644 --- a/miner.h +++ b/miner.h @@ -902,9 +902,9 @@ void blake2b_hash(void *output, const void *input); void blake2s_hash(void *output, const void *input); void bmw_hash(void *state, const void *input); void c11hash(void *output, const void *input); -int cryptolight_hash_variant(void* output, const void* input, int len, int variant); +void cryptolight_hash_variant(void* output, const void* input, int len, int variant); void cryptolight_hash(void* output, const void* input); -int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant); +void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant); void cryptonight_hash(void* output, const void* input); void monero_hash(void* output, const void* input); void stellite_hash(void* output, const void* input); From 4a76ca5cb6e1f555621effec3880465124f2e386 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 3 Aug 2018 20:01:14 +0200 Subject: [PATCH 18/24] bench: handle cryptonight variants + V100 fix --- bench.cpp | 16 ++++++++++++++++ ccminer.cpp | 3 +++ crypto/cryptonight.cu | 9 ++++++--- equi/equihash.cpp | 3 +-- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/bench.cpp b/bench.cpp index 894fd8a..e573a08 100644 --- a/bench.cpp +++ b/bench.cpp @@ -156,6 +156,22 @@ bool bench_algo_switch_next(int thr_id) if (algo == ALGO_SCRYPT) algo++; if (algo == ALGO_SCRYPT_JANE) algo++; + // Set cryptonight variant + switch (algo) { + case ALGO_MONERO: + cryptonight_fork = 7; + break; + case ALGO_GRAFT: + cryptonight_fork = 8; + break; + case ALGO_STELLITE: + cryptonight_fork = 3; + break; + case ALGO_CRYPTONIGHT: + cryptonight_fork = 1; + break; + } + // free current algo memory and track mem usage mused = cuda_available_memory(thr_id); algo_free_all(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index c2b34f8..f4c1039 100644 --- 
a/ccminer.cpp +++ b/ccminer.cpp @@ -2384,6 +2384,9 @@ static void *miner_thread(void *userdata) case ALGO_CRYPTOLIGHT: rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1); break; + case ALGO_MONERO: + case ALGO_STELLITE: + case ALGO_GRAFT: case ALGO_CRYPTONIGHT: { int cn_variant = 0; diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu index 52d0e97..2c3a6cd 100644 --- a/crypto/cryptonight.cu +++ b/crypto/cryptonight.cu @@ -50,8 +50,11 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id], mem, device_mpcount[dev_id]); - if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) { - device_config[thr_id] = strdup("80x24"); + if (!device_config[thr_id]) { + if(strcmp(device_name[dev_id], "TITAN V") == 0) + device_config[thr_id] = strdup("80x24"); + if(strstr(device_name[dev_id], "V100")) + device_config[thr_id] = strdup("80x24"); } if (device_config[thr_id]) { @@ -83,7 +86,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ CUDA_LOG_ERROR(); } - const size_t alloc = MEMORY * throughput; + const size_t alloc = MEMORY * size_t(throughput); cryptonight_extra_init(thr_id); cudaMalloc(&d_long_state[thr_id], alloc); diff --git a/equi/equihash.cpp b/equi/equihash.cpp index c9ac1fc..3209546 100644 --- a/equi/equihash.cpp +++ b/equi/equihash.cpp @@ -183,8 +183,7 @@ extern "C" int scanhash_equihash(int thr_id, struct work *work, uint32_t max_non return -1; } size_t memSz = solvers[thr_id]->equi_mem_sz / (1024*1024); - gpus_intensity[thr_id] = (uint32_t) solvers[thr_id]->throughput; - api_set_throughput(thr_id, gpus_intensity[thr_id]); + api_set_throughput(thr_id, (uint32_t) solvers[thr_id]->throughput); gpulog(LOG_DEBUG, thr_id, "Allocated %u MB of context memory", (u32) memSz); cuda_get_arch(thr_id); init[thr_id] = true;
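[Reviewer note, not part of the patches] The size_t cast introduced above ("MEMORY * size_t(throughput)") matters because the old product was computed in 32 bits. Assuming the usual 2 MiB cryptonight scratchpad (MEMORY = 1 << 21; the define lives outside this diff, so this value is an assumption), a throughput of 2048 wraps the 32-bit product to zero while the widened one is the intended 4 GiB:

/* why the size_t cast: 32-bit wraparound demo (MEMORY value assumed) */
#include <stdint.h>
#include <stdio.h>

#define MEMORY (1 << 21) /* 2 MiB scratchpad, assumed from cryptonight */

int main(void)
{
	uint32_t throughput = 2048;
	uint32_t wrapped = MEMORY * throughput;       /* 2^32 wraps to 0 */
	size_t widened = MEMORY * (size_t)throughput; /* 4294967296 on 64-bit */
	printf("wrapped=%u widened=%zu\n", wrapped, widened);
	return 0;
}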
From 1f5efa7d3622f7c9efdf0d67e00200483a42e891 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 20 Sep 2018 20:13:31 +0200 Subject: [PATCH 19/24] makefile: add new cuda arch and remove sm5.0 by default --- Makefile.am | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index ddfbec6..1d13556 100644 --- a/Makefile.am +++ b/Makefile.am @@ -116,9 +116,11 @@ endif ccminer_LDADD += -lcuda nvcc_ARCH := -#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\" +#nvcc_ARCH += -gencode=arch=compute_75,code=\"sm_75,compute_75\" # CUDA 10 req. +#nvcc_ARCH += -gencode=arch=compute_70,code=\"sm_70,compute_70\" # CUDA 9.1 +#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\" # CUDA 8 nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\" -nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\" +#nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\" #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\" From b36d174554591ca3572529846abd1d84df4cb41f Mon Sep 17 00:00:00 2001 From: opensourcerulez Date: Mon, 22 Oct 2018 22:11:33 +0300 Subject: [PATCH 20/24] Add exosis algo (#69) --- Makefile.am | 2 +- README.txt | 1 + algos.h | 2 + bench.cpp | 1 + ccminer.cpp | 6 + ccminer.vcxproj | 1 + ccminer.vcxproj.filters | 3 + miner.h | 3 + util.cpp | 3 + x11/exosis.cu | 497 ++++++++++++++++++++++++++++++++++++++++ 10 files changed, 518 insertions(+), 1 deletion(-) create mode 100644 x11/exosis.cu diff --git a/Makefile.am b/Makefile.am index 1d13556..d34ac78 100644 --- a/Makefile.am +++ b/Makefile.am @@ -71,7 +71,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ tribus/tribus.cu tribus/cuda_echo512_final.cu \ x11/x11.cu x12/x12.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ - x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ + x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu x11/exosis.cu \ x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \ diff --git a/README.txt b/README.txt index cb60fca..285b04d 100644 --- a/README.txt +++ b/README.txt @@ -89,6 +89,7 @@ its command line interface and options.
deep use to mine Deepcoin dmd-gr use to mine Diamond-Groestl equihash use to mine ZEC, HUSH and KMD + exosis use to mine EXO fresh use to mine Freshcoin fugue256 use to mine Fuguecoin groestl use to mine Groestlcoin diff --git a/algos.h b/algos.h index dfbf7d8..2d2da2d 100644 --- a/algos.h +++ b/algos.h @@ -18,6 +18,7 @@ enum sha_algos { ALGO_DECRED, ALGO_DMD_GR, ALGO_EQUIHASH, + ALGO_EXOSIS, ALGO_FRESH, ALGO_FUGUE256, /* Fugue256 */ ALGO_GROESTL, @@ -96,6 +97,7 @@ static const char *algo_names[] = { "decred", "dmd-gr", "equihash", + "exosis", "fresh", "fugue256", "groestl", diff --git a/bench.cpp b/bench.cpp index e573a08..be53bbc 100644 --- a/bench.cpp +++ b/bench.cpp @@ -61,6 +61,7 @@ void algo_free_all(int thr_id) free_decred(thr_id); free_deep(thr_id); free_equihash(thr_id); + free_exosis(thr_id); free_keccak256(thr_id); free_fresh(thr_id); free_fugue256(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index f4c1039..bf5399c 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -252,6 +252,7 @@ Options:\n\ decred Decred Blake256\n\ deep Deepcoin\n\ equihash Zcash Equihash\n\ + exosis Exosis timetravel\n\ dmd-gr Diamond-Groestl\n\ fresh Freshcoin (shavite 80)\n\ fugue256 Fuguecoin\n\ @@ -1742,6 +1743,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_PHI2: case ALGO_TIMETRAVEL: case ALGO_BITCORE: + case ALGO_EXOSIS: case ALGO_X16R: case ALGO_X16S: work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty)); @@ -2283,6 +2285,7 @@ static void *miner_thread(void *userdata) case ALGO_SKUNK: case ALGO_TIMETRAVEL: case ALGO_BITCORE: + case ALGO_EXOSIS: case ALGO_X11EVO: case ALGO_X11: case ALGO_X12: @@ -2544,6 +2547,9 @@ static void *miner_thread(void *userdata) case ALGO_BITCORE: rc = scanhash_bitcore(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_EXOSIS: + rc = scanhash_exosis(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_X11EVO: rc = scanhash_x11evo(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index f3d3e28..01a598f 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -576,6 +576,7 @@ + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index a1b9e86..88252ec 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -841,6 +841,9 @@ Source Files\CUDA\x11 + + Source Files\CUDA\x11 + Source Files\CUDA\x11 diff --git a/miner.h b/miner.h index f866cd9..cbc766b 100644 --- a/miner.h +++ b/miner.h @@ -319,6 +319,7 @@ extern int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, uns extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_tribus(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bitcore(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_exosis(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blake_rounds); extern int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -354,6 +355,7 @@ extern void free_cryptonight(int thr_id); extern void free_decred(int thr_id); extern void free_deep(int thr_id); extern void free_equihash(int thr_id); +extern void free_exosis(int 
thr_id);
 extern void free_keccak256(int thr_id);
 extern void free_fresh(int thr_id);
 extern void free_fugue256(int thr_id);
@@ -944,6 +946,7 @@ void skunk_hash(void *state, const void *input);
 void s3hash(void *output, const void *input);
 void timetravel_hash(void *output, const void *input);
 void bitcore_hash(void *output, const void *input);
+void exosis_hash(void *output, const void *input);
 void tribus_hash(void *output, const void *input);
 void veltorhash(void *output, const void *input);
 void wcoinhash(void *state, const void *input);
diff --git a/util.cpp b/util.cpp
index 66617af..7a67ea6 100644
--- a/util.cpp
+++ b/util.cpp
@@ -2311,6 +2311,9 @@ void print_hash_tests(void)
 	bitcore_hash(&hash[0], &buf[0]);
 	printpfx("bitcore", hash);
+
+	exosis_hash(&hash[0], &buf[0]);
+	printpfx("exosis", hash);
 
 	blake256hash(&hash[0], &buf[0], 8);
 	printpfx("vanilla", hash);
diff --git a/x11/exosis.cu b/x11/exosis.cu
new file mode 100644
index 0000000..e4dcfe5
--- /dev/null
+++ b/x11/exosis.cu
@@ -0,0 +1,497 @@
+/**
+ * Timetravel (exosis) CUDA implementation
+ * by tpruvot@github, exosis
+ */
+
+#include <stdio.h>
+#include <memory.h>
+#include <unistd.h>
+
+#define HASH_FUNC_BASE_TIMESTAMP 1538556426U
+#define HASH_FUNC_COUNT 8
+#define HASH_FUNC_COUNT_PERMUTATIONS 40320U
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+}
+
+#include "miner.h"
+#include "cuda_helper.h"
+#include "cuda_x11.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+
+enum Algo {
+	BLAKE = 0,
+	BMW,
+	GROESTL,
+	SKEIN,
+	JH,
+	KECCAK,
+	LUFFA,
+	CUBEHASH,
+	MAX_ALGOS_COUNT
+};
+
+static const char* algo_strings[] = {
+	"blake",
+	"bmw512",
+	"groestl",
+	"skein",
+	"jh512",
+	"keccak",
+	"luffa",
+	"cube",
+	NULL
+};
+
+inline void swap8(uint8_t *a, uint8_t *b)
+{
+	uint8_t t = *a;
+	*a = *b;
+	*b = t;
+}
+
+inline void initPerm(uint8_t n[], int count)
+{
+	for (int i = 0; i < count; i++)
+		n[i] = i;
+}
+
+static int nextPerm(uint8_t n[], int count)
+{
+	int tail, i, j;
+
+	if (count <= 1)
+		return 0;
+
+	for (i = count - 1; i>0 && n[i - 1] >= n[i]; i--);
+	tail = i;
+
+	if (tail > 0) {
+		for (j = count - 1; j>tail && n[j] <= n[tail - 1]; j--);
+		swap8(&n[tail - 1], &n[j]);
+	}
+
+	for (i = tail, j = count - 1; i < j; i++, j--)
+		swap8(&n[i], &n[j]);
+
+	return (tail != 0);
+}
+
+static void getAlgoString(char *str, uint32_t count)
+{
+	uint8_t algoList[HASH_FUNC_COUNT];
+	char *sptr;
+
+	initPerm(algoList, HASH_FUNC_COUNT);
+
+	for (uint32_t k = 0; k < count; k++)
+		nextPerm(algoList, HASH_FUNC_COUNT);
+
+	sptr = str;
+	for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) {
+		if (algoList[j] >= 10)
+			sprintf(sptr, "%c", 'A' + (algoList[j] - 10));
+		else
+			sprintf(sptr, "%u", (uint32_t) algoList[j]);
+		sptr++;
+	}
+	*sptr = '\0';
+}
+
+static __thread uint32_t s_ntime = 0;
+static uint32_t s_sequence = UINT32_MAX;
+static uint8_t s_firstalgo = 0xFF;
+static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
+
+#define INITIAL_DATE HASH_FUNC_BASE_TIMESTAMP
+static inline uint32_t getCurrentAlgoSeq(uint32_t ntime)
+{
+	// unlike x11evo, the permutation changes often (with ntime)
+	return (uint32_t) (ntime - INITIAL_DATE) % HASH_FUNC_COUNT_PERMUTATIONS;
+}
+
+// To finish...
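+// Worked example (illustrative, not from the upstream file): with
+// HASH_FUNC_COUNT = 8 there are 8! = 40320 orderings, so the schedule wraps
+// roughly every 11.2 hours of ntime. On the host side:
+//   char order[HASH_FUNC_COUNT + 1];
+//   getAlgoString(order, getCurrentAlgoSeq(1538646426U));
+//   // seq = (1538646426 - 1538556426) % 40320 = 9360: nextPerm() is applied
+//   // 9360 times to the identity order "01234567"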
+static void get_travel_order(uint32_t ntime, char *permstr) +{ + uint32_t seq = getCurrentAlgoSeq(ntime); + if (s_sequence != seq) { + getAlgoString(permstr, seq); + s_sequence = seq; + } +} + +// CPU Hash +extern "C" void exosis_hash(void *output, const void *input) +{ + uint32_t _ALIGN(64) hash[64/4] = { 0 }; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_luffa512_context ctx_luffa1; + sph_cubehash512_context ctx_cubehash1; + + if (s_sequence == UINT32_MAX) { + uint32_t *data = (uint32_t*) input; + const uint32_t ntime = (opt_benchmark || !data[17]) ? (uint32_t) time(NULL) : data[17]; + get_travel_order(ntime, hashOrder); + } + + void *in = (void*) input; + int size = 80; + + const int hashes = (int) strlen(hashOrder); + + for (int i = 0; i < hashes; i++) + { + const char elem = hashOrder[i]; + uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa1); + sph_luffa512(&ctx_luffa1, in, size); + sph_luffa512_close(&ctx_luffa1, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash1); + sph_cubehash512(&ctx_cubehash1, in, size); + sph_cubehash512_close(&ctx_cubehash1, hash); + break; + } + + in = (void*) hash; + size = 64; + } + + memcpy(output, hash, 32); +} + +static uint32_t get_next_time(uint32_t ntime, char* curOrder) +{ + char nextOrder[HASH_FUNC_COUNT + 1] = { 0 }; + uint32_t secs = 15; + do { + uint32_t nseq = getCurrentAlgoSeq(ntime+secs); + getAlgoString(nextOrder, nseq); + secs += 15; + } while (curOrder[0] == nextOrder[0]); + return secs; +} + +//#define _DEBUG +#define _DEBUG_PREFIX "tt-" +#include "cuda_debug.cuh" + +void quark_bmw512_cpu_setBlock_80(void *pdata); +void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +void groestl512_setBlock_80(int thr_id, uint32_t *endiandata); +void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void skein512_cpu_setBlock_80(void *pdata); +void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); + +void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); +void qubit_luffa512_cpu_setBlock_80(void *pdata); +void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +void jh512_setBlock_80(int thr_id, uint32_t *endiandata); +void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void keccak512_setBlock_80(int thr_id, uint32_t 
*endiandata); +void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); +void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_exosis(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + // if (opt_benchmark) pdata[17] = swab32(0x5886a4be); // TO DEBUG GROESTL 80 + + if (opt_debug || s_ntime != pdata[17] || s_sequence == UINT32_MAX) { + uint32_t ntime = swab32(work->data[17]); + get_travel_order(ntime, hashOrder); + s_ntime = pdata[17]; + if (opt_debug && !thr_id) { + applog(LOG_DEBUG, "exosis hash order %s (%08x)", hashOrder, ntime); + } + } + + if (opt_benchmark) + ptarget[7] = 0x5; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes) + x11_luffa512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + cuda_check_cpu_setTarget(ptarget); + + const int hashes = (int) strlen(hashOrder); + const char first = hashOrder[0]; + const uint8_t algo80 = first >= 'A' ? 
first - 'A' + 10 : first - '0'; + if (algo80 != s_firstalgo) { + s_firstalgo = algo80; + applog(LOG_INFO, "Exosis first algo is now %s", algo_strings[algo80 % HASH_FUNC_COUNT]); + } + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + default: { + uint32_t next = get_next_time(swab32(s_ntime), hashOrder); + if (!thr_id) + applog(LOG_WARNING, "kernel %c unimplemented, next in %u mn", first, next/60); + sleep(next > 30 ? 60 : 10); + return -1; + } + } + + do { + int order = 0; + + // Hash with CUDA + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake80:"); + break; + case BMW: + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw80 :"); + break; + case GROESTL: + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("grstl80:"); + break; + case SKEIN: + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + TRACE("skein80:"); + break; + case JH: + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("jh51280:"); + break; + case KECCAK: + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("kecck80:"); + break; + case LUFFA: + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("luffa80:"); + break; + case CUBEHASH: + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("cube 80:"); + break; + } + + for (int i = 1; i < hashes; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("cube :"); + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + be32enc(&endiandata[19], work->nonces[0]); + exosis_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + pdata[19] = work->nonces[0]; + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + exosis_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + } + pdata[19] = max(pdata[19], work->nonces[1]) + 1; + } + return work->valid_nonces; + } else if (vhash[7] > Htarg) { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_exosis(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} From 01e632cf05d51fdc898838976297339594db6769 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 25 Nov 2018 04:00:26 +0100 Subject: [PATCH 21/24] handle standard blake2b stratum algo no weird protocol or reversed endian like sia... 
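
A minimal sketch of the difference, reusing the helpers renamed in this
commit (illustrative only, names as declared below): standard blake2b can
use the usual tail-word test, while sia keeps checking the byte-swapped
head word against its reversed target.

    uint32_t hash[8];
    blake2b_hash(hash, endiandata);      // standard: 80-byte header in, 32-byte hash out
    bool ok = hash[7] <= ptarget[7] && fulltest(hash, ptarget);

    sia_blake2b_hash(hash, inputdata);   // sia variant
    bool sia_ok = swab32(hash[0]) <= ptarget[7];  // sia target is reversed (start of hash)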
---
 Makefile.am             |   1 +
 algos.h                 |   2 +
 bench.cpp               |   1 +
 blake2b.cu              | 273 ++++++++++++++++++++++++++++++++++++++++
 ccminer.cpp             |   5 +
 ccminer.vcxproj         |   1 +
 ccminer.vcxproj.filters |   3 +
 miner.h                 |   3 +
 sia/sia.cu              |  18 +-
 util.cpp                |   5 +-
 10 files changed, 302 insertions(+), 10 deletions(-)
 create mode 100644 blake2b.cu

diff --git a/Makefile.am b/Makefile.am
index d34ac78..6a15836 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -46,6 +46,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
 	Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \
 	Algo256/blake2s.cu sph/blake2s.c \
 	Algo256/bmw.cu Algo256/cuda_bmw.cu \
+	blake2b.cu \
 	crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \
 	crypto/cryptolight.cu crypto/cryptolight-core.cu crypto/cryptolight-cpu.cpp \
 	crypto/cryptonight.cu crypto/cryptonight-core.cu crypto/cryptonight-extra.cu \
diff --git a/algos.h b/algos.h
index 2d2da2d..b084eeb 100644
--- a/algos.h
+++ b/algos.h
@@ -7,6 +7,7 @@ enum sha_algos {
 	ALGO_BLAKECOIN = 0,
 	ALGO_BLAKE,
+	ALGO_BLAKE2B,
 	ALGO_BLAKE2S,
 	ALGO_ALLIUM,
 	ALGO_BMW,
@@ -86,6 +87,7 @@ extern volatile enum sha_algos opt_algo;
 static const char *algo_names[] = {
 	"blakecoin",
 	"blake",
+	"blake2b",
 	"blake2s",
 	"allium",
 	"bmw",
diff --git a/bench.cpp b/bench.cpp
index be53bbc..e2c26be 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -53,6 +53,7 @@ void algo_free_all(int thr_id)
 	free_bastion(thr_id);
 	free_bitcore(thr_id);
 	free_blake256(thr_id);
+	free_blake2b(thr_id);
 	free_blake2s(thr_id);
 	free_bmw(thr_id);
 	free_c11(thr_id);
diff --git a/blake2b.cu b/blake2b.cu
new file mode 100644
index 0000000..2be74f8
--- /dev/null
+++ b/blake2b.cu
@@ -0,0 +1,273 @@
+/**
+ * Blake2-B CUDA Implementation
+ *
+ * tpruvot@github July 2016
+ *
+ */
+
+#include <stdio.h>
+
+#include <string.h>
+#include <stdint.h>
+
+#include <sph/blake2b.h>
+
+#include <miner.h>
+#include <cuda_helper.h>
+
+#define TPB 512
+#define NBN 2
+
+static uint32_t *d_resNonces[MAX_GPUS];
+
+__device__ uint64_t d_data[10];
+
+static __constant__ const int8_t blake2b_sigma[12][16] = {
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 } ,
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
+};
+
+// host mem align
+#define A 64
+
+extern "C" void blake2b_hash(void *output, const void *input)
+{
+	uint8_t _ALIGN(A) hash[32];
+	blake2b_ctx ctx;
+
+	blake2b_init(&ctx, 32, NULL, 0);
+	blake2b_update(&ctx, input, 80);
+	blake2b_final(&ctx, hash);
+
+	memcpy(output, hash, 32);
+}
+
+// ----------------------------------------------------------------
+
+__device__ __forceinline__
+static void G(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d, uint64_t const m[16])
+{
+	a = a + b + m[ blake2b_sigma[r][2*i] ];
+	((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
+	c = c + d;
+	((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
+	a = a + b + m[ blake2b_sigma[r][2*i+1] ];
+	((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
+	c = c + d;
+	((uint2*)&b)[0] =
ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +#define ROUND(r) \ + G(r, 0, v[0], v[4], v[ 8], v[12], m); \ + G(r, 1, v[1], v[5], v[ 9], v[13], m); \ + G(r, 2, v[2], v[6], v[10], v[14], m); \ + G(r, 3, v[3], v[7], v[11], v[15], m); \ + G(r, 4, v[0], v[5], v[10], v[15], m); \ + G(r, 5, v[1], v[6], v[11], v[12], m); \ + G(r, 6, v[2], v[7], v[ 8], v[13], m); \ + G(r, 7, v[3], v[4], v[ 9], v[14], m); + +__global__ +//__launch_bounds__(128, 8) /* to force 64 regs */ +void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2) +{ + const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce; + + uint64_t m[16]; + + m[0] = d_data[0]; + m[1] = d_data[1]; + m[2] = d_data[2]; + m[3] = d_data[3]; + m[4] = d_data[4]; + m[5] = d_data[5]; + m[6] = d_data[6]; + m[7] = d_data[7]; + m[8] = d_data[8]; + ((uint32_t*)m)[18] = AS_U32(&d_data[9]); + ((uint32_t*)m)[19] = nonce; + + m[10] = m[11] = 0; + m[12] = m[13] = 0; + m[14] = m[15] = 0; + + uint64_t v[16] = { + 0x6a09e667f2bdc928, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade68281, 0x9b05688c2b3e6c1f, 0xe07c265404be4294, 0x5be0cd19137e2179 + }; + + ROUND( 0); + ROUND( 1); + ROUND( 2); + ROUND( 3); + ROUND( 4); + ROUND( 5); + ROUND( 6); + ROUND( 7); + ROUND( 8); + ROUND( 9); + ROUND(10); + ROUND(11); + + uint2 last = vectorize(v[3] ^ v[11] ^ 0xa54ff53a5f1d36f1); + if (last.y <= target2.y && last.x <= target2.x) { + resNonce[1] = resNonce[0]; + resNonce[0] = nonce; + } +} + +__host__ +uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce) +{ + uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX }; + uint32_t result = UINT32_MAX; + + dim3 grid((threads + TPB-1)/TPB); + dim3 block(TPB); + + /* Check error on Ctrl+C or kill to prevent segfaults on exit */ + if (cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) + return result; + + blake2b_gpu_hash <<>> (threads, startNonce, d_resNonces[thr_id], target2); + cudaThreadSynchronize(); + + if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + result = resNonces[0]; + secNonce = resNonces[1]; + if (secNonce == result) secNonce = UINT32_MAX; + } + return result; +} + +__host__ +void blake2b_setBlock(uint32_t *data) +{ + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice)); +} + +static bool init[MAX_GPUS] = { 0 }; + +int scanhash_blake2b(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(A) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 
28 : 25; + if (device_sm[dev_id] >= 520 && is_windows()) intensity = 26; + if (device_sm[dev_id] < 350) intensity = 22; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonces[thr_id], NBN * sizeof(uint32_t)), -1); + init[thr_id] = true; + } + + for (int i=0; i < 20; i++) + be32enc(&endiandata[i], pdata[i]); + + const uint2 target = make_uint2(ptarget[6], ptarget[7]); + blake2b_setBlock(endiandata); + + do { + work->nonces[0] = blake2b_hash_cuda(thr_id, throughput, pdata[19], target, work->nonces[1]); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(A) vhash[8]; + work->valid_nonces = 0; + endiandata[19] = work->nonces[0]; + blake2b_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work_set_target_ratio(work, vhash); + work->valid_nonces++; + pdata[19] = work->nonces[0] + 1; + } else { + gpu_increment_reject(thr_id); + } + + if (work->nonces[1] != UINT32_MAX) { + endiandata[19] = work->nonces[1]; + blake2b_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) { + work->sharediff[1] = work->sharediff[0]; + work->shareratio[1] = work->shareratio[0]; + xchg(work->nonces[1], work->nonces[0]); + work_set_target_ratio(work, vhash); + } else { + bn_set_target_ratio(work, vhash, 1); + } + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan start + } else { + gpu_increment_reject(thr_id); + } + } + + if (work->valid_nonces) { + work->nonces[0] = cuda_swab32(work->nonces[0]); + work->nonces[1] = cuda_swab32(work->nonces[1]); + return work->valid_nonces; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_blake2b(int thr_id) +{ + if (!init[thr_id]) + return; + + //cudaThreadSynchronize(); + + cudaFree(d_resNonces[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/ccminer.cpp b/ccminer.cpp index bf5399c..46d9fac 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -243,6 +243,7 @@ Options:\n\ bastion Hefty bastion\n\ bitcore Timetravel-10\n\ blake Blake 256 (SFR)\n\ + blake2b Blake2-B 512 (BCX)\n\ blake2s Blake2-S 256 (NEVA)\n\ blakecoin Fast Blake 256 (8 rounds)\n\ bmw BMW 256\n\ @@ -2260,6 +2261,7 @@ static void *miner_thread(void *userdata) //case ALGO_WHIRLPOOLX: minmax = 0x40000000U; break; + case ALGO_BLAKE2B: case ALGO_KECCAK: case ALGO_KECCAKC: case ALGO_LBRY: @@ -2375,6 +2377,9 @@ static void *miner_thread(void *userdata) case ALGO_BLAKE: rc = scanhash_blake256(thr_id, &work, max_nonce, &hashes_done, 14); break; + case ALGO_BLAKE2B: + rc = scanhash_blake2b(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_BLAKE2S: rc = scanhash_blake2s(thr_id, &work, max_nonce, &hashes_done); break; diff --git 
a/ccminer.vcxproj b/ccminer.vcxproj index 01a598f..67820ad 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -460,6 +460,7 @@ + 64 --ptxas-options="-dlcm=cg" %(AdditionalOptions) diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 88252ec..c353d21 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -952,6 +952,9 @@ Source Files\CUDA\lyra2 + + Source Files\CUDA\ + Source Files\CUDA\Algo256 diff --git a/miner.h b/miner.h index cbc766b..bbd4c8c 100644 --- a/miner.h +++ b/miner.h @@ -276,6 +276,7 @@ struct work; extern int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bastion(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds); +extern int scanhash_blake2b(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -347,6 +348,7 @@ extern void free_allium(int thr_id); extern void free_bastion(int thr_id); extern void free_bitcore(int thr_id); extern void free_blake256(int thr_id); +extern void free_blake2b(int thr_id); extern void free_blake2s(int thr_id); extern void free_bmw(int thr_id); extern void free_c11(int thr_id); @@ -939,6 +941,7 @@ void scrypthash(void* output, const void* input); void scryptjane_hash(void* output, const void* input); void sha256d_hash(void *output, const void *input); void sha256t_hash(void *output, const void *input); +void sia_blake2b_hash(void *output, const void *input); void sibhash(void *output, const void *input); void skeincoinhash(void *output, const void *input); void skein2hash(void *output, const void *input); diff --git a/sia/sia.cu b/sia/sia.cu index 8e4f483..4ffdccb 100644 --- a/sia/sia.cu +++ b/sia/sia.cu @@ -40,7 +40,7 @@ static __constant__ const int8_t blake2b_sigma[12][16] = { // host mem align #define A 64 -extern "C" void blake2b_hash(void *output, const void *input) +extern "C" void sia_blake2b_hash(void *output, const void *input) { uint8_t _ALIGN(A) hash[32]; blake2b_ctx ctx; @@ -102,7 +102,7 @@ static void H(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, u __global__ //__launch_bounds__(128, 8) /* to force 64 regs */ -void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2) +void sia_blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2) { const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce; __shared__ uint64_t s_target; @@ -154,7 +154,7 @@ void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_ } __host__ -uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce) +uint32_t sia_blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce) { uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX }; uint32_t result = UINT32_MAX; @@ -166,7 +166,7 @@ uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint3 if 
(cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) return result; - blake2b_gpu_hash <<>> (threads, startNonce, d_resNonces[thr_id], target2); + sia_blake2b_gpu_hash <<>> (threads, startNonce, d_resNonces[thr_id], target2); cudaThreadSynchronize(); if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { @@ -178,7 +178,7 @@ uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint3 } __host__ -void blake2b_setBlock(uint32_t *data) +void sia_blake2b_setBlock(uint32_t *data) { CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice)); } @@ -224,10 +224,10 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon const uint2 target = make_uint2(ptarget[6], ptarget[7]); - blake2b_setBlock(inputdata); + sia_blake2b_setBlock(inputdata); do { - work->nonces[0] = blake2b_hash_cuda(thr_id, throughput, pdata[8], target, work->nonces[1]); + work->nonces[0] = sia_blake2b_hash_cuda(thr_id, throughput, pdata[8], target, work->nonces[1]); *hashes_done = pdata[8] - first_nonce + throughput; @@ -235,7 +235,7 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon { work->valid_nonces = 0; inputdata[8] = work->nonces[0]; - blake2b_hash(hash, inputdata); + sia_blake2b_hash(hash, inputdata); if (swab32(hash[0]) <= Htarg) { // sia hash target is reversed (start of hash) swab256(vhashcpu, hash); @@ -250,7 +250,7 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon if (work->nonces[1] != UINT32_MAX) { inputdata[8] = work->nonces[1]; - blake2b_hash(hash, inputdata); + sia_blake2b_hash(hash, inputdata); if (swab32(hash[0]) <= Htarg) { swab256(vhashcpu, hash); if (fulltest(vhashcpu, ptarget)) { diff --git a/util.cpp b/util.cpp index 7a67ea6..f661d52 100644 --- a/util.cpp +++ b/util.cpp @@ -2184,6 +2184,9 @@ void print_hash_tests(void) blake256hash(&hash[0], &buf[0], 14); printpfx("blake", hash); + blake2b_hash(&hash[0], &buf[0]); + printpfx("blake2b", hash); + blake2s_hash(&hash[0], &buf[0]); printpfx("blake2s", hash); @@ -2285,7 +2288,7 @@ void print_hash_tests(void) sha256t_hash(&hash[0], &buf[0]); printpfx("sha256t", hash); - blake2b_hash(&hash[0], &buf[0]); + sia_blake2b_hash(&hash[0], &buf[0]); printpfx("sia", hash); sibhash(&hash[0], &buf[0]); From c59bc2438a6b0404c3199972fecd45123480f792 Mon Sep 17 00:00:00 2001 From: pyritepirate <44350183+pyritepirate@users.noreply.github.com> Date: Sun, 27 Jan 2019 08:24:53 +0100 Subject: [PATCH 22/24] sha256q (#70) --- Makefile.am | 2 +- README.txt | 1 + algos.h | 2 + bench.cpp | 1 + ccminer.cpp | 6 + ccminer.vcxproj | 2 + ccminer.vcxproj.filters | 6 + miner.h | 3 + sha256/cuda_sha256q.cu | 507 ++++++++++++++++++++++++++++++++++++++++ sha256/sha256q.cu | 136 +++++++++++ util.cpp | 3 + 11 files changed, 668 insertions(+), 1 deletion(-) create mode 100644 sha256/cuda_sha256q.cu create mode 100644 sha256/sha256q.cu diff --git a/Makefile.am b/Makefile.am index 6a15836..ecc8e30 100644 --- a/Makefile.am +++ b/Makefile.am @@ -60,7 +60,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \ pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \ skunk/skunk.cu skunk/cuda_skunk.cu skunk/cuda_skunk_streebog.cu \ - sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu \ + sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu sha256/sha256q.cu 
sha256/cuda_sha256q.cu \ sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ diff --git a/README.txt b/README.txt index 285b04d..321bfb4 100644 --- a/README.txt +++ b/README.txt @@ -117,6 +117,7 @@ its command line interface and options. scrypt-jane use to mine Chacha coins like Cache and Ultracoin s3 use to mine 1coin (ONE) sha256t use to mine OneCoin (OC) + sha256q use to mine Pyrite sia use to mine SIA sib use to mine Sibcoin skein use to mine Skeincoin diff --git a/algos.h b/algos.h index b084eeb..aa03ecd 100644 --- a/algos.h +++ b/algos.h @@ -49,6 +49,7 @@ enum sha_algos { ALGO_SCRYPT_JANE, ALGO_SHA256D, ALGO_SHA256T, + ALGO_SHA256Q, ALGO_SIA, ALGO_SIB, ALGO_SKEIN, @@ -129,6 +130,7 @@ static const char *algo_names[] = { "scrypt-jane", "sha256d", "sha256t", + "sha256q", "sia", "sib", "skein", diff --git a/bench.cpp b/bench.cpp index e2c26be..f674f77 100644 --- a/bench.cpp +++ b/bench.cpp @@ -93,6 +93,7 @@ void algo_free_all(int thr_id) free_skunk(thr_id); free_sha256d(thr_id); free_sha256t(thr_id); + free_sha256q(thr_id); free_sia(thr_id); free_sib(thr_id); free_sonoa(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 46d9fac..596a924 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -283,6 +283,7 @@ Options:\n\ qubit Qubit\n\ sha256d SHA256d (bitcoin)\n\ sha256t SHA256 x3\n\ + sha256q SHA256 x4\n\ sia SIA (Blake2B)\n\ sib Sibcoin (X11+Streebog)\n\ scrypt Scrypt\n\ @@ -977,6 +978,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) case ALGO_BMW: case ALGO_SHA256D: case ALGO_SHA256T: + case ALGO_SHA256Q: case ALGO_VANILLA: // fast algos require that... (todo: regen hash) check_dups = true; @@ -2258,6 +2260,7 @@ static void *miner_thread(void *userdata) case ALGO_DECRED: case ALGO_SHA256D: case ALGO_SHA256T: + case ALGO_SHA256Q: //case ALGO_WHIRLPOOLX: minmax = 0x40000000U; break; @@ -2515,6 +2518,9 @@ static void *miner_thread(void *userdata) case ALGO_SHA256T: rc = scanhash_sha256t(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_SHA256Q: + rc = scanhash_sha256q(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_SIA: rc = scanhash_sia(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 67820ad..5ef6551 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -435,6 +435,8 @@ + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index c353d21..8ed886a 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -982,6 +982,12 @@ Source Files\CUDA\sha256 + + Source Files\CUDA\sha256 + + + Source Files\CUDA\sha256 + Source Files\sia diff --git a/miner.h b/miner.h index bbd4c8c..7f52d55 100644 --- a/miner.h +++ b/miner.h @@ -310,6 +310,7 @@ extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, uns extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sha256d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sha256t(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_sha256q(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); 
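 // Illustrative note, inferred from the call sites rather than stated in this
 // header: each scanhash_*() scans nonces from pdata[19] up to max_nonce,
 // returns the number of valid nonces found (0 when the range is exhausted,
 // negative on error) and reports its progress through *hashes_done.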
extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -383,6 +384,7 @@ extern void free_quark(int thr_id);
 extern void free_qubit(int thr_id);
 extern void free_sha256d(int thr_id);
 extern void free_sha256t(int thr_id);
+extern void free_sha256q(int thr_id);
 extern void free_sia(int thr_id);
 extern void free_sib(int thr_id);
 extern void free_skeincoin(int thr_id);
@@ -941,6 +943,7 @@ void scrypthash(void* output, const void* input);
 void scryptjane_hash(void* output, const void* input);
 void sha256d_hash(void *output, const void *input);
 void sha256t_hash(void *output, const void *input);
+void sha256q_hash(void *output, const void *input);
 void sia_blake2b_hash(void *output, const void *input);
 void sibhash(void *output, const void *input);
 void skeincoinhash(void *output, const void *input);
diff --git a/sha256/cuda_sha256q.cu b/sha256/cuda_sha256q.cu
new file mode 100644
index 0000000..80733ac
--- /dev/null
+++ b/sha256/cuda_sha256q.cu
@@ -0,0 +1,507 @@
+/*
+ * sha256(-q) CUDA implementation.
+ * pyritepirate 2018
+ * tpruvot 2017
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include <cuda_helper.h>
+#include <miner.h>
+
+__constant__ static uint32_t __align__(8) c_midstate76[8];
+__constant__ static uint32_t __align__(8) c_dataEnd80[4];
+
+const __constant__ uint32_t __align__(8) c_H256[8] = {
+	0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
+	0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
+};
+__constant__ static uint32_t __align__(8) c_K[64];
+__constant__ static uint32_t __align__(8) c_target[2];
+__device__ uint64_t d_target[1];
+
+static uint32_t* d_resNonces[MAX_GPUS] = { 0 };
+
+// ------------------------------------------------------------------------------------------------
+
+static const uint32_t cpu_H256[8] = {
+	0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
+	0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
+};
+
+static const uint32_t cpu_K[64] = {
+	0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+	0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+	0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+	0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+	0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+	0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+	0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+	0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
+};
+
+#define ROTR ROTR32
+
+__host__
+static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
+	uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
+	uint32_t in, const uint32_t Kshared)
+{
+	uint32_t t1,t2;
+	uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
+	uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
+	uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
+	uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
+
+	t1 = h + bsg21 + vxandx + Kshared + in;
+	t2 = bsg20 + andorv;
+	d = d + t1;
+	h = t1 + t2;
+}
+
+__host__
+static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
+	uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
+	uint32_t* in, uint32_t pc, const uint32_t Kshared)
+{
+	uint32_t
t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); + uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha256_step1_host(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); + sha256_step1_host(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + for (int i=0; i<3; i++) + { + sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +#define xor3b(a,b,c) (a ^ b ^ c) + +__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22)); +} + +__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25)); +} + +__device__ __forceinline__ uint32_t 
ssg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3)); +} + +__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10)); +} + +__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c) +{ + uint32_t result; + asm("{\n\t" + ".reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c) + ); + return result; +} + +__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.y), "=r"(result.x) : "l"(v)); + return result; +} + +__device__ +static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t in, const uint32_t Kshared) +{ + uint32_t t1,t2; + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ssg2_1(inx1); + uint32_t ssg20 = ssg2_0(inx3); + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step1(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); + sha2_step1(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); + sha2_step1(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); + sha2_step1(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); + sha2_step1(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); + sha2_step1(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); + sha2_step1(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); + sha2_step1(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); + sha2_step1(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); + sha2_step1(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); + sha2_step1(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha2_step1(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha2_step1(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha2_step1(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha2_step1(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha2_step1(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + #pragma unroll + for (int i=0; i<3; i++) + { + sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha2_step2(a,b,c,d,e,f,g,h,in,8, 
Kshared[24+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +__device__ +static void sha256_round_last(uint32_t* in, uint32_t* state, uint32_t* const Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step1(a,b,c,d, e,f,g,h, in[ 0], Kshared[ 0]); + sha2_step1(h,a,b,c, d,e,f,g, in[ 1], Kshared[ 1]); + sha2_step1(g,h,a,b, c,d,e,f, in[ 2], Kshared[ 2]); + sha2_step1(f,g,h,a, b,c,d,e, in[ 3], Kshared[ 3]); + sha2_step1(e,f,g,h, a,b,c,d, in[ 4], Kshared[ 4]); + sha2_step1(d,e,f,g, h,a,b,c, in[ 5], Kshared[ 5]); + sha2_step1(c,d,e,f, g,h,a,b, in[ 6], Kshared[ 6]); + sha2_step1(b,c,d,e, f,g,h,a, in[ 7], Kshared[ 7]); + sha2_step1(a,b,c,d, e,f,g,h, in[ 8], Kshared[ 8]); + sha2_step1(h,a,b,c, d,e,f,g, in[ 9], Kshared[ 9]); + sha2_step1(g,h,a,b, c,d,e,f, in[10], Kshared[10]); + sha2_step1(f,g,h,a, b,c,d,e, in[11], Kshared[11]); + sha2_step1(e,f,g,h, a,b,c,d, in[12], Kshared[12]); + sha2_step1(d,e,f,g, h,a,b,c, in[13], Kshared[13]); + sha2_step1(c,d,e,f, g,h,a,b, in[14], Kshared[14]); + sha2_step1(b,c,d,e, f,g,h,a, in[15], Kshared[15]); + + #pragma unroll + for (int i=0; i<2; i++) + { + sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*i]); + sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*i]); + sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*i]); + sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*i]); + sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*i]); + sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*i]); + sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*i]); + sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*i]); + sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*i]); + sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*i]); + sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*i]); + sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*i]); + sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*i]); + sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*i]); + sha2_step2(c,d,e,f, g,h,a,b, in,14, Kshared[30+16*i]); + sha2_step2(b,c,d,e, f,g,h,a, in,15, Kshared[31+16*i]); + } + + sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*2]); + sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*2]); + sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*2]); + sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*2]); + sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*2]); + sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*2]); + sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*2]); + sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*2]); + sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*2]); + sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*2]); + sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*2]); + sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*2]); + sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*2]); + sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*2]); + + state[6] += g; + state[7] += h; +} + +__device__ __forceinline__ +uint64_t 
cuda_swab32ll(uint64_t x) {
+	return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
+}
+
+__global__
+/*__launch_bounds__(256,3)*/
+void sha256q_gpu_hash_shared(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	__shared__ uint32_t s_K[64*4];
+	//s_K[thread & 63] = c_K[thread & 63];
+	if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
+
+	if (thread < threads)
+	{
+		const uint32_t nonce = startNonce + thread;
+
+		uint32_t dat[16];
+		AS_UINT2(dat) = AS_UINT2(c_dataEnd80);
+		dat[ 2] = c_dataEnd80[2];
+		dat[ 3] = nonce;
+		dat[ 4] = 0x80000000;
+		dat[15] = 0x280;
+		#pragma unroll
+		for (int i=5; i<15; i++) dat[i] = 0;
+
+		uint32_t buf[8];
+		#pragma unroll
+		for (int i=0; i<8; i+=2) AS_UINT2(&buf[i]) = AS_UINT2(&c_midstate76[i]);
+		//for (int i=0; i<8; i++) buf[i] = c_midstate76[i];
+
+		sha256_round_body(dat, buf, s_K);
+
+		// second sha256
+
+		#pragma unroll
+		for (int i=0; i<8; i++) dat[i] = buf[i];
+		dat[8] = 0x80000000;
+		#pragma unroll
+		for (int i=9; i<15; i++) dat[i] = 0;
+		dat[15] = 0x100;
+
+		#pragma unroll
+		for (int i=0; i<8; i++) buf[i] = c_H256[i];
+
+		sha256_round_body(dat, buf, s_K);
+
+		// third sha256
+
+		#pragma unroll
+		for (int i=0; i<8; i++) dat[i] = buf[i];
+		dat[8] = 0x80000000;
+		#pragma unroll
+		for (int i=9; i<15; i++) dat[i] = 0;
+		dat[15] = 0x100;
+
+		#pragma unroll
+		for (int i=0; i<8; i++) buf[i] = c_H256[i];
+
+		sha256_round_body(dat, buf, s_K);
+
+		// last sha256
+
+		#pragma unroll
+		for (int i=0; i<8; i++) dat[i] = buf[i];
+		dat[8] = 0x80000000;
+		#pragma unroll
+		for (int i=9; i<15; i++) dat[i] = 0;
+		dat[15] = 0x100;
+
+		#pragma unroll
+		for (int i=0; i<8; i++) buf[i] = c_H256[i];
+
+		sha256_round_last(dat, buf, s_K);
+
+
+		// valid nonces
+		uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]);
+		if (high <= c_target[0]) {
+			//printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
+			resNonces[1] = atomicExch(resNonces, nonce);
+			//d_target[0] = high;
+		}
+	}
+}
+
+__host__
+void sha256q_init(int thr_id)
+{
+	cuda_get_arch(thr_id);
+	cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice);
+	CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t)));
+}
+
+__host__
+void sha256q_free(int thr_id)
+{
+	if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]);
+	d_resNonces[thr_id] = NULL;
+}
+
+__host__
+void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget)
+{
+	uint32_t _ALIGN(64) in[16], buf[8], end[4];
+	for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]);
+	for (int i=0;i<8;i++) buf[i] = cpu_H256[i];
+	for (int i=0;i<4;i++) end[i] = cuda_swab32(pdata[16+i]);
+	sha256_round_body_host(in, buf, cpu_K);
+
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate76, buf, 32, 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd80, end, sizeof(end), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
+}
+
+__host__
+void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces)
+{
+	const uint32_t threadsperblock = 128;
+
+	dim3 grid(threads/threadsperblock);
+	dim3 block(threadsperblock);
+
+	CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t)));
+	cudaThreadSynchronize();
+	sha256q_gpu_hash_shared <<<grid, block>>> (threads, startNonce, d_resNonces[thr_id]);
+	cudaThreadSynchronize();
+
+	CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+	if (resNonces[0] == resNonces[1]) {
+		resNonces[1] = UINT32_MAX;
+	}
+}
diff --git a/sha256/sha256q.cu b/sha256/sha256q.cu
new file mode 100644
index 0000000..d3efa40
--- /dev/null
+++ b/sha256/sha256q.cu
@@ -0,0 +1,136 @@
+/**
+ * SHA256 4x
+ * by pyritepirate - 2018
+ * by tpruvot@github - 2017
+ */
+
+#include <miner.h>
+#include <cuda_helper.h>
+#include <openssl/sha.h>
+
+// CPU Check
+extern "C" void sha256q_hash(void *output, const void *input)
+{
+	unsigned char _ALIGN(64) hash[64];
+	SHA256_CTX sha256;
+
+	SHA256_Init(&sha256);
+	SHA256_Update(&sha256, (unsigned char *)input, 80);
+	SHA256_Final(hash, &sha256);
+
+	SHA256_Init(&sha256);
+	SHA256_Update(&sha256, hash, 32);
+	SHA256_Final(hash, &sha256);
+
+	SHA256_Init(&sha256);
+	SHA256_Update(&sha256, hash, 32);
+	SHA256_Final(hash, &sha256);
+
+	SHA256_Init(&sha256);
+	SHA256_Update(&sha256, hash, 32);
+	SHA256_Final((unsigned char *)output, &sha256);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+extern void sha256q_init(int thr_id);
+extern void sha256q_free(int thr_id);
+extern void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget);
+extern void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces);
+
+extern "C" int scanhash_sha256q(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t _ALIGN(64) endiandata[20];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 23);
+	if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce));
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x03;
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		sha256q_init(thr_id);
+
+		init[thr_id] = true;
+	}
+
+	for (int k=0; k < 19; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	sha256q_setBlock_80(endiandata, ptarget);
+
+	do {
+		// Hash with CUDA
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		sha256q_hash_80(thr_id, throughput, pdata[19], work->nonces);
+		if (work->nonces[0] != UINT32_MAX)
+		{
+			uint32_t _ALIGN(64) vhash[8];
+
+			endiandata[19] = swab32(work->nonces[0]);
+			sha256q_hash(vhash, endiandata);
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != UINT32_MAX) {
+					endiandata[19] = swab32(work->nonces[1]);
+					sha256q_hash(vhash, endiandata);
+					if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+						work->valid_nonces++;
+						bn_set_target_ratio(work, vhash, 1);
+					}
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1;
+				}
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > ptarget[7]) {
+				gpu_increment_reject(thr_id);
+				if (!opt_quiet)
+					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
+			}
+		}
+
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
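+	// Note (illustrative): falling out of the loop means the nonce range was
+	// exhausted or a restart was requested; *hashes_done below only counts
+	// scanned nonces, while valid shares are returned through work->valid_nonces
+	// (up to two per launch via resNonces[0]/resNonces[1] in sha256q_hash_80).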
*hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_sha256q(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + sha256q_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/util.cpp b/util.cpp index f661d52..79799b0 100644 --- a/util.cpp +++ b/util.cpp @@ -2288,6 +2288,9 @@ void print_hash_tests(void) sha256t_hash(&hash[0], &buf[0]); printpfx("sha256t", hash); + sha256q_hash(&hash[0], &buf[0]); + printpfx("sha256q", hash); + sia_blake2b_hash(&hash[0], &buf[0]); printpfx("sia", hash); From 9a1f20d455d27c44bedcb80c30f1d2e2a50c0913 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 30 Jan 2019 14:28:23 +0100 Subject: [PATCH 23/24] Handle lyra2v3 algo, for VTC fork mostly imported from opensourced vertcoin-miner with a few fixes --- Makefile.am | 1 + README.txt | 7 +- algos.h | 4 + bench.cpp | 1 + ccminer.cpp | 8 +- ccminer.vcxproj | 3 + ccminer.vcxproj.filters | 9 + compat/ccminer-config.h | 2 +- lyra2/Lyra2.c | 173 +++++++++++++ lyra2/Lyra2.h | 1 + lyra2/cuda_lyra2v3.cu | 481 +++++++++++++++++++++++++++++++++++++ lyra2/cuda_lyra2v3_sm3.cuh | 348 +++++++++++++++++++++++++++ lyra2/lyra2REv3.cu | 182 ++++++++++++++ miner.h | 3 + util.cpp | 3 + 15 files changed, 1221 insertions(+), 5 deletions(-) create mode 100644 lyra2/cuda_lyra2v3.cu create mode 100644 lyra2/cuda_lyra2v3_sm3.cuh create mode 100644 lyra2/lyra2REv3.cu diff --git a/Makefile.am b/Makefile.am index ecc8e30..4749f57 100644 --- a/Makefile.am +++ b/Makefile.am @@ -38,6 +38,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ lyra2/Lyra2.c lyra2/Sponge.c \ lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \ lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ + lyra2/lyra2REv3.cu lyra2/cuda_lyra2v3.cu \ lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \ lyra2/allium.cu \ Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ diff --git a/README.txt b/README.txt index 321bfb4..0ee3313 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.3 "phi2 and cryptonight variants" +ccminer 2.3.1 "lyra2v3, exosis and sha256q" --------------------------------------------------------------- *************************************************************** @@ -100,7 +100,8 @@ its command line interface and options. lbry use to mine LBRY Credits luffa use to mine Joincoin lyra2 use to mine CryptoCoin - lyra2v2 use to mine Vertcoin + lyra2v2 use to mine Monacoin + lyra2v3 use to mine Vertcoin lyra2z use to mine Zerocoin (XZC) monero use to mine Monero (XMR) myr-gr use to mine Myriad-Groest @@ -117,7 +118,7 @@ its command line interface and options. 
scrypt-jane use to mine Chacha coins like Cache and Ultracoin s3 use to mine 1coin (ONE) sha256t use to mine OneCoin (OC) - sha256q use to mine Pyrite + sha256q use to mine Pyrite sia use to mine SIA sib use to mine Sibcoin skein use to mine Skeincoin diff --git a/algos.h b/algos.h index aa03ecd..e33d182 100644 --- a/algos.h +++ b/algos.h @@ -34,6 +34,7 @@ enum sha_algos { ALGO_LUFFA, ALGO_LYRA2, ALGO_LYRA2v2, + ALGO_LYRA2v3, ALGO_LYRA2Z, ALGO_MJOLLNIR, /* Hefty hash */ ALGO_MYR_GR, @@ -115,6 +116,7 @@ static const char *algo_names[] = { "luffa", "lyra2", "lyra2v2", + "lyra2v3", "lyra2z", "mjollnir", "myr-gr", @@ -199,6 +201,8 @@ static inline int algo_to_int(char* arg) i = ALGO_LYRA2; else if (!strcasecmp("lyra2rev2", arg)) i = ALGO_LYRA2v2; + else if (!strcasecmp("lyra2rev3", arg)) + i = ALGO_LYRA2v3; else if (!strcasecmp("phi1612", arg)) i = ALGO_PHI; else if (!strcasecmp("bitcoin", arg)) diff --git a/bench.cpp b/bench.cpp index f674f77..d3c7701 100644 --- a/bench.cpp +++ b/bench.cpp @@ -78,6 +78,7 @@ void algo_free_all(int thr_id) free_luffa(thr_id); free_lyra2(thr_id); free_lyra2v2(thr_id); + free_lyra2v3(thr_id); free_lyra2Z(thr_id); free_myriad(thr_id); free_neoscrypt(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 596a924..2695074 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -269,7 +269,8 @@ Options:\n\ lbry LBRY Credits (Sha/Ripemd)\n\ luffa Joincoin\n\ lyra2 CryptoCoin\n\ - lyra2v2 VertCoin\n\ + lyra2v2 MonaCoin\n\ + lyra2v3 Vertcoin\n\ lyra2z ZeroCoin (3rd impl)\n\ myr-gr Myriad-Groestl\n\ monero XMR cryptonight (v7)\n\ @@ -1742,6 +1743,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_KECCAKC: case ALGO_LBRY: case ALGO_LYRA2v2: + case ALGO_LYRA2v3: case ALGO_LYRA2Z: case ALGO_PHI2: case ALGO_TIMETRAVEL: @@ -2283,6 +2285,7 @@ static void *miner_thread(void *userdata) case ALGO_JHA: case ALGO_HSR: case ALGO_LYRA2v2: + case ALGO_LYRA2v3: case ALGO_PHI: case ALGO_PHI2: case ALGO_POLYTIMOS: @@ -2474,6 +2477,9 @@ static void *miner_thread(void *userdata) case ALGO_LYRA2v2: rc = scanhash_lyra2v2(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_LYRA2v3: + rc = scanhash_lyra2v3(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_LYRA2Z: rc = scanhash_lyra2Z(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 5ef6551..26c9cd1 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -530,6 +530,9 @@ + + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 8ed886a..3df7871 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -946,6 +946,15 @@ Source Files\CUDA\lyra2 + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + Source Files\CUDA\lyra2 diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index 030e89f..d110201 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "2.3" +#define PACKAGE_VERSION "2.3.1" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/lyra2/Lyra2.c b/lyra2/Lyra2.c index 1f0a953..256af78 100644 --- a/lyra2/Lyra2.c +++ b/lyra2/Lyra2.c @@ -212,3 +212,176 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa return 0; } + +int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols) +{ + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy + uint64_t instance = 0; + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; + + size_t sz = (size_t)ROW_LEN_BYTES * nRows; + uint64_t *wholeMatrix = malloc(sz); + if (wholeMatrix == NULL) { + return -1; + } + memset(wholeMatrix, 0, sz); + + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows); + if (memMatrix == NULL) { + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + + byte *ptrByte = (byte*) wholeMatrix; + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen))); + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, 
sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+	v64 = nRows;
+	memcpy(ptrByte, &v64, sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+	v64 = nCols;
+	memcpy(ptrByte, &v64, sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+
+	//Now comes the padding
+	*ptrByte = 0x80; //first byte of padding: right after the password
+	ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+	ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+	*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+	//==========================================================================/
+
+	//======================= Initializing the Sponge State ====================//
+	//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+	uint64_t state[16];
+	initState(state);
+	//==========================================================================/
+
+	//================================ Setup Phase =============================//
+	//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+	ptrWord = wholeMatrix;
+	for (i = 0; i < nBlocksInput; i++) {
+		absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
+		ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+	}
+
+	//Initializes M[0] and M[1]
+	reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
+
+	reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
+
+	do {
+		//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+		reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+
+		//updates the value of row* (deterministically picked during Setup)
+		rowa = (rowa + step) & (window - 1);
+		//update prev: it now points to the last row ever computed
+		prev = row;
+		//updates row: goes to the next row to be computed
+		row++;
+
+		//Checks if all rows in the window were visited.
+		if (rowa == 0) {
+			step = window + gap; //changes the step: approximately doubles its value
+			window *= 2; //doubles the size of the re-visitation window
+			gap = -gap; //inverts the modifier to the step
+		}
+
+	} while (row < nRows);
+	//==========================================================================/
+
+	//============================ Wandering Phase =============================//
+	row = 0; //Resets the visitation to the first row of the memory matrix
+	for (tau = 1; tau <= timeCost; tau++) {
+		//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+		step = ((tau & 1) == 0) ? 
-1 : (nRows >> 1) - 1;
+		do {
+			//Selects a pseudorandom index row* (the only change in REv3)
+			//------------------------------------------------------------------------------------------
+			instance = state[instance & 0xF];
+			rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
+
+			//rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+			//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+			//------------------------------------------------------------------------------------------
+
+			//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+			reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+
+			//update prev: it now points to the last row ever computed
+			prev = row;
+
+			//updates row: goes to the next row to be computed
+			//------------------------------------------------------------------------------------------
+			row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+			//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+			//------------------------------------------------------------------------------------------
+
+		} while (row != 0);
+	}
+
+	//============================ Wrap-up Phase ===============================//
+	//Absorbs the last block of the memory matrix
+	absorbBlock(state, memMatrix[rowa]);
+
+	//Squeezes the key
+	squeeze(state, K, (unsigned int) kLen);
+
+	//========================= Freeing the memory =============================//
+	free(memMatrix);
+	free(wholeMatrix);
+
+	return 0;
+}
diff --git a/lyra2/Lyra2.h b/lyra2/Lyra2.h
index edf9179..f866462 100644
--- a/lyra2/Lyra2.h
+++ b/lyra2/Lyra2.h
@@ -38,5 +38,6 @@ typedef unsigned char byte;
 #endif
 
 int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
+int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
 
 #endif /* LYRA2_H_ */
diff --git a/lyra2/cuda_lyra2v3.cu b/lyra2/cuda_lyra2v3.cu
new file mode 100644
index 0000000..0278cab
--- /dev/null
+++ b/lyra2/cuda_lyra2v3.cu
@@ -0,0 +1,481 @@
+/**
+ * Lyra2 (v3) CUDA Implementation
+ *
+ * Based on VTC sources
+ */
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+#include "cuda_helper.h"
+
+#include "cuda_lyra2v3_sm3.cuh"
+
+
+
+#ifdef __INTELLISENSE__
+/* just for vstudio code colors */
+#define __CUDA_ARCH__ 500
+#endif
+
+#define TPB 32
+
+#if __CUDA_ARCH__ >= 500
+
+#include "cuda_lyra2_vectors.h"
+
+#define Nrow 4
+#define Ncol 4
+#define memshift 3
+
+
+__device__ uint2x4 *DMatrix;
+
+__device__ __forceinline__ uint2 LD4S(const int index)
+{
+	extern __shared__ uint2 shared_mem[];
+	return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
+}
+
+__device__ __forceinline__ void ST4S(const int index, const uint2 data)
+{
+	extern __shared__ uint2 shared_mem[];
+	shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
+}
+
+__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c)
+{
+	return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
+}
+
+__device__ __forceinline__
+void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
+{
+	a += b; d ^= a; d = SWAPUINT2(d);
+	c += d; b ^= c; b = ROR2(b, 24);
+	a += b; d ^= a; d = ROR2(d, 16);
+	c += d; b ^= c; b = ROR2(b, 63);
+}
+
+__device__ __forceinline__
+void round_lyra_v5(uint2x4 s[4])
+{
+	
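+	// note: one reduced Lyra2 round below is BLAKE2b's G function applied to
+	// the four columns and then the four diagonals of a 4x4 matrix of 64-bit
+	// words; each uint2x4 register holds one matrix row, so for instance the
+	// second diagonal call is
+	//
+	//   Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x); // wraps around the matrix
+	//
+	// i.e. the BLAKE2b round minus the message-word injection.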
Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2 s[4]) +{ + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 1, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 3, 4); + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 3, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 1, 4); +} + +__device__ __forceinline__ +void reduceDuplexRowSetup2(uint2 state[4]) +{ + uint2 state1[Ncol][3], state0[Ncol][3], state2[3]; + int i, j; + + #pragma unroll + for (int i = 0; i < Ncol; i++) + { + #pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] = state[j]; + round_lyra_v5(state); + } + + //#pragma unroll 4 + for (i = 0; i < Ncol; i++) + { + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state0[i][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] = state0[i][j]; + + #pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] ^= state[j]; + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[i][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state2[j] = state1[i][j]; + + #pragma unroll + for (j = 0; j < 3; j++) + state2[j] ^= state[j]; + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state0[i][0] ^= Data2; + state0[i][1] ^= Data0; + state0[i][2] ^= Data1; + } else { + state0[i][0] ^= Data0; + state0[i][1] ^= Data1; + state0[i][2] ^= Data2; + } + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s0 + j, state0[i][j]); + + #pragma unroll + for (j = 0; j < 3; j++) + state0[i][j] = state2[j]; + + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s1 = memshift * Ncol * 1 + i*memshift; + const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[Ncol - i - 1][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] ^= state[j]; + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s3 + j, state0[Ncol - i - 1][j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state1[i][0] ^= Data2; + state1[i][1] ^= Data0; + state1[i][2] ^= Data1; + } else { + state1[i][0] ^= Data0; + state1[i][1] ^= Data1; + state1[i][2] ^= Data2; + } + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s1 + j, state1[i][j]); + } +} + +__device__ +void reduceDuplexRowt2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4]) +{ + uint2 state1[3], state2[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * 
Ncol * rowInOut; + const uint32_t ps3 = memshift * Ncol * rowOut; + + for (int i = 0; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + const uint32_t s3 = ps3 + i*memshift; + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = LD4S(s1 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s2 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s3 + j, LD4S(s3 + j) ^ state[j]); + } +} + +__device__ +void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4]) +{ + const int rowIn = 2; + const int rowOut = 3; + + int i, j; + uint2 last[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] = LD4S(ps2 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= LD4S(ps1 + j) + last[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == rowOut) + { + #pragma unroll + for (j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (i = 1; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= LD4S(s1 + j) + LD4S(s2 + j); + + round_lyra_v5(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + const uint2x4 blake2b_IV[2] = { + 0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL, + 0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL, + 0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL, + 0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL + }; + + const uint2x4 Mask[2] = { + 0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL, + 0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL, + 0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL, + 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL + }; + + uint2x4 state[4]; + + if (thread < threads) + { + state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<12; i++) + round_lyra_v5(state); + + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; + + for (int i = 0; i<12; i++) + round_lyra_v5(state); + + DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0]; + DMatrix[blockDim.x * gridDim.x * 1 + 
thread] = state[1]; + DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2]; + DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v3_gpu_hash_32_2(uint32_t threads) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + + if (thread < threads) + { + uint2 state[4]; + state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + + reduceDuplexRowSetup2(state); + + uint32_t rowa; + int prev = 3; + unsigned int instance = 0; + for (int i = 0; i < 3; i++) + { + instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4); + rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3; + + //rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2(prev, rowa, i, state); + prev = i; + } + + instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4); + rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3; + + //rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2x4(rowa, state); + + ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0]; + ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1]; + ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2]; + ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint2x4 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]); + state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]); + state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]); + state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra_v5(state); + + outputHash[thread + threads * 0] = state[0].x; + outputHash[thread + threads * 1] = state[0].y; + outputHash[thread + threads * 2] = state[0].z; + outputHash[thread + threads * 3] = state[0].w; + } +} + +#else +#include "cuda_helper.h" +#if __CUDA_ARCH__ < 200 +__device__ void* DMatrix; +#endif +__global__ void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) {} +__global__ void lyra2v3_gpu_hash_32_2(uint32_t threads) {} +__global__ void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) {} +#endif + + +__host__ +void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) +{ + cuda_get_arch(thr_id); + // just assign the device pointer allocated in main loop + cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); +} + +__host__ +void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + + if (device_sm[dev_id] >= 500) { + + const uint32_t tpb = TPB; + + dim3 grid2((threads + tpb - 1) / tpb); + dim3 block2(tpb); + dim3 grid4((threads * 4 + tpb - 1) / tpb); + dim3 block4(4, tpb / 4); + + lyra2v3_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)g_hash); + 
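+	// note: phase 2 runs 4 cooperating x-lanes per hash (block4 = 4 x tpb/4)
+	// and keeps the whole 4x4 matrix of 12-word blocks in shared memory; each
+	// lane stores memshift (3) uint2 per block, i.e. Nrow * Ncol * memshift =
+	// 48 uint2 per lane and 48 * tpb entries per thread block, hence the
+	// dynamic shared memory size of 48 * sizeof(uint2) * tpb bytes below.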
lyra2v3_gpu_hash_32_2 <<< grid4, block4, 48 * sizeof(uint2) * tpb >>> (threads); + lyra2v3_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)g_hash); + + } else { + + uint32_t tpb = 16; + if (cuda_arch[dev_id] >= 350) tpb = TPB35; + else if (cuda_arch[dev_id] >= 300) tpb = TPB30; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); + lyra2v3_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash); + + } +} + + diff --git a/lyra2/cuda_lyra2v3_sm3.cuh b/lyra2/cuda_lyra2v3_sm3.cuh new file mode 100644 index 0000000..f84521c --- /dev/null +++ b/lyra2/cuda_lyra2v3_sm3.cuh @@ -0,0 +1,348 @@ +/* SM 2/3/3.5 Variant for lyra2REv2 */ + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors, only uncomment that temporary, dont commit it */ +//#undef __CUDA_ARCH__ +//#define __CUDA_ARCH__ 500 +#endif + +#define TPB20 64 +#define TPB30 64 +#define TPB35 64 + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 + +#define vectype ulonglong4 +#define memshift 4 + +__device__ vectype *DMatrix; + +static __device__ __forceinline__ +void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) +{ + a += b; d ^= a; d = ROTR64(d, 32); + c += d; b ^= c; b = ROTR64(b, 24); + a += b; d ^= a; d = ROTR64(d, 16); + c += d; b ^= c; b = ROTR64(b, 63); +} + +static __device__ __forceinline__ +void round_lyra_v35(vectype* s) +{ + Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV3(vectype state[4], uint32_t thread) +{ + vectype state1[3]; + uint32_t ps1 = (Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); + + #pragma unroll 4 + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i *memshift; + uint32_t s2 = ps2 - Nrow * i *memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3], state1[3]; + + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread); + + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow*i*memshift; + uint32_t s2 = ps2 + Nrow*i*memshift; + uint32_t s3 = ps3 - Nrow*i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1 )[j]); + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2 )[j]); + for (int j = 0; j < 3; j++) { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; + } + + 
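+		// note: the lines below implement the spec's M[row*] ^= rotW(rand),
+		// rotating the 12-word duplex output by one 64-bit word before the
+		// XOR into the rowInOut block, equivalent to:
+		//
+		//   state2[w] ^= state[(w + 11) % 12]; // w = 0..11, words as uint2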
((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ + vectype state1[3], state2[3]; + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); + + #pragma nounroll + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i*memshift; + uint32_t s2 = ps2 + Nrow * i*memshift; + uint32_t s3 = ps3 + Nrow * i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + for (int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if (rowInOut != rowOut) { + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } else { + + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } + } +} + +#if __CUDA_ARCH__ >= 300 +__global__ __launch_bounds__(TPB35, 1) +void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + + if (threadIdx.x == 0) { + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0, + 0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 + ); + } + + if (thread < threads) + { + ((uint2*)state)[0] = __ldg(&outputHash[thread]); + ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); + ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); + ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); + + state[1] = state[0]; + state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); + state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= shuffle4(((vectype*)padding)[0], 0); + state[1] ^= shuffle4(((vectype*)padding)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + unsigned int instance = 0; + uint32_t rowa; + int prev = 3; + for (int i = 0; i < 4; i++) + { + //rowa = ((uint2*)state)[0].x & 3; + + instance = ((uint2*)state)[instance & 0xf].x; + rowa = ((uint2*)state)[instance & 0xf].x & 0x3; + reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t 
shift = (memshift * rowa + 16 * memshift * thread); + + for (int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for (int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} +#elif __CUDA_ARCH__ >= 200 +__global__ __launch_bounds__(TPB20, 1) +void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0, + 0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000 + ); + + if (thread < threads) + { + + ((uint2*)state)[0] = outputHash[thread]; + ((uint2*)state)[1] = outputHash[thread + threads]; + ((uint2*)state)[2] = outputHash[thread + 2 * threads]; + ((uint2*)state)[3] = outputHash[thread + 3 * threads]; + + state[1] = state[0]; + state[2] = ((vectype*)blake2b_IV)[0]; + state[3] = ((vectype*)blake2b_IV)[1]; + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= ((vectype*)padding)[0]; + state[1] ^= ((vectype*)padding)[1]; + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + uint instance = 0; + uint32_t rowa; + int prev = 3; + for (int i = 0; i < 4; i++) + { + // rowa = ((uint2*)state)[0].x & 3; + + instance = ((uint2*)state)[instance & 0xf]; + rowa = ((uint2*)state)[instance & 0xf] & 0x3; + reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t shift = (memshift * rowa + 16 * memshift * thread); + + for (int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for (int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} +#endif + +#else +/* host & sm5+ */ +__global__ void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {} +#endif diff --git a/lyra2/lyra2REv3.cu b/lyra2/lyra2REv3.cu new file mode 100644 index 0000000..21ad3cb --- /dev/null +++ b/lyra2/lyra2REv3.cu @@ -0,0 +1,182 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_cubehash.h" +#include "lyra2/Lyra2.h" +} + +#include +#include + +static uint64_t *d_hash[MAX_GPUS]; +static uint64_t* d_matrix[MAX_GPUS]; + +extern void blake256_cpu_init(int thr_id, uint32_t threads); +extern void blake256_cpu_setBlock_80(uint32_t *pdata); +extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + +extern void 
cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); + +extern void lyra2v3_setTarget(const void *pTargetIn); +extern void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix); +extern void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); + +extern void lyra2v3_cpu_hash_32_targ(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces); + +extern void bmw256_setTarget(const void *ptarget); +extern void bmw256_cpu_init(int thr_id, uint32_t threads); +extern void bmw256_cpu_free(int thr_id); +extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces); + +extern "C" void lyra2v3_hash(void *state, const void *input) +{ + uint32_t hashA[8], hashB[8]; + + sph_blake256_context ctx_blake; + sph_cubehash256_context ctx_cube; + sph_bmw256_context ctx_bmw; + + sph_blake256_set_rounds(14); + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, hashA); + + LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashB, 32); + sph_cubehash256_close(&ctx_cube, hashA); + + LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_bmw256_init(&ctx_bmw); + sph_bmw256(&ctx_bmw, hashB, 32); + sph_bmw256_close(&ctx_bmw, hashA); + + memcpy(state, hashA, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 
19 : 20; + if (strstr(device_name[dev_id], "GTX 10")) intensity = 20; + uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + + if (!init[thr_id]) + { + size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3; + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + blake256_cpu_init(thr_id, throughput); + bmw256_cpu_init(thr_id, throughput); + + cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256 + + // SM 3 implentation requires a bit more memory + if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500) + matrix_sz = 16 * sizeof(uint64_t) * 4 * 4; + + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); + lyra2v3_cpu_init(thr_id, throughput, d_matrix[thr_id]); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + api_set_throughput(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + blake256_cpu_setBlock_80(pdata); + bmw256_setTarget(ptarget); + + do { + int order = 0; + + blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + memset(work->nonces, 0, sizeof(work->nonces)); + bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + lyra2v3_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + lyra2v3_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart && !abort_flag); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_lyra2v3(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_matrix[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/miner.h b/miner.h index 7f52d55..1d75855 100644 --- a/miner.h +++ b/miner.h @@ -298,6 +298,7 @@ extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsi extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long 
*hashes_done); extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -372,6 +373,7 @@ extern void free_lbry(int thr_id); extern void free_luffa(int thr_id); extern void free_lyra2(int thr_id); extern void free_lyra2v2(int thr_id); +extern void free_lyra2v3(int thr_id); extern void free_lyra2Z(int thr_id); extern void free_myriad(int thr_id); extern void free_neoscrypt(int thr_id); @@ -929,6 +931,7 @@ void jha_hash(void *output, const void *input); void lbry_hash(void *output, const void *input); void lyra2re_hash(void *state, const void *input); void lyra2v2_hash(void *state, const void *input); +void lyra2v3_hash(void *state, const void *input); void lyra2Z_hash(void *state, const void *input); void myriadhash(void *state, const void *input); void neoscrypt(uchar *output, const uchar *input, uint32_t profile); diff --git a/util.cpp b/util.cpp index 79799b0..fca1b5c 100644 --- a/util.cpp +++ b/util.cpp @@ -2246,6 +2246,9 @@ void print_hash_tests(void) lyra2v2_hash(&hash[0], &buf[0]); printpfx("lyra2v2", hash); + lyra2v3_hash(&hash[0], &buf[0]); + printpfx("lyra2v3", hash); + lyra2Z_hash(&hash[0], &buf[0]); printpfx("lyra2z", hash); From 6ff4e50987e59a70056324a94ed8667cc0bf598d Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 30 Jan 2019 16:01:24 +0100 Subject: [PATCH 24/24] v2.3.1 release --- README.txt | 9 +++++++-- configure.ac | 2 +- lyra2/lyra2REv3.cu | 3 ++- res/ccminer.rc | 10 +++++----- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/README.txt b/README.txt index 0ee3313..c2470bd 100644 --- a/README.txt +++ b/README.txt @@ -52,8 +52,7 @@ SonoA (Sono) Tribus (JH, keccak, simd) Woodcoin (Double Skein) Vanilla (Blake256 8-rounds - double sha256) -Vertcoin Lyra2RE -Ziftrcoin (ZR5) +Vertcoin Lyra2REv3 Boolberry (Wild Keccak) Monero (Cryptonight v7 with -a monero) Aeon (Cryptonight-lite) @@ -291,6 +290,12 @@ so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< + Jan. 30th 2019 v2.3.1 + Handle Lyra2v3 algo + Handle sha256q algo + Handle exosis algo + Handle blake2b standard algo + June 23th 2018 v2.3 Handle phi2 header variation for smart contracts Handle monero, stellite, graft and cryptolight variants diff --git a/configure.ac b/configure.ac index 9030e7e..6bb2209 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.3], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.3.1], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/lyra2/lyra2REv3.cu b/lyra2/lyra2REv3.cu index 21ad3cb..7e1b4a7 100644 --- a/lyra2/lyra2REv3.cu +++ b/lyra2/lyra2REv3.cu @@ -66,7 +66,8 @@ extern "C" int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonc const uint32_t first_nonce = pdata[19]; int dev_id = device_map[thr_id]; int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 
19 : 20; - if (strstr(device_name[dev_id], "GTX 10")) intensity = 20; + if (strstr(device_name[dev_id], "GTX 1")) intensity = 20; + if (strstr(device_name[dev_id], "RTX 20")) intensity = 20; uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); diff --git a/res/ccminer.rc b/res/ccminer.rc index 18eb1d2..bc285bf 100644 --- a/res/ccminer.rc +++ b/res/ccminer.rc @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" // VS_VERSION_INFO VERSIONINFO - FILEVERSION 2,3,0,0 - PRODUCTVERSION 2,3,0,0 + FILEVERSION 2,3,1,0 + PRODUCTVERSION 2,3,1,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x21L @@ -76,10 +76,10 @@ BEGIN BEGIN BLOCK "040904e4" BEGIN - VALUE "FileVersion", "2.3" - VALUE "LegalCopyright", "Copyright (C) 2018" + VALUE "FileVersion", "2.3.1" + VALUE "LegalCopyright", "Copyright (C) 2019" VALUE "ProductName", "ccminer" - VALUE "ProductVersion", "2.3" + VALUE "ProductVersion", "2.3.1" END END BLOCK "VarFileInfo"