From 8cf21599d4c5ff104130b21a766353c7f2d35462 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Sun, 14 May 2017 11:57:56 +0200
Subject: [PATCH] Interface nicehash djeZo equihash solver (squashed)

Todo:
- send block height via stratum protocol (encoded in jobid?)
- remove equi/blake2 cpu algorithm to use the common one

The extranonce incompatibility is related to the solver nonce data;
the offsets may be reversed in nheqminer, to be checked...

The solver was adapted for SM 3.0+ support (no perf changes)

Note: The solver was not improved on purpose, to be able to compare the
performance of the two miners (nheqminer 0.5c, the last open-sourced
version, and ccminer)

Signed-off-by: Tanguy Pruvot

stratum: code cleanup, move equi fns into the equi folder
---
 Makefile.am | 6 +
 algos.h | 4 +
 api.cpp | 1 +
 ccminer.cpp | 56 +-
 ccminer.vcxproj | 20 +-
 ccminer.vcxproj.filters | 26 +-
 compat/ccminer-config.h | 2 +-
 configure.ac | 2 +-
 equi/blake2/blake2-config.h | 81 ++
 equi/blake2/blake2-impl.h | 136 ++
 equi/blake2/blake2-round.h | 85 ++
 equi/blake2/blake2.h | 85 ++
 equi/blake2/blake2b-load-sse2.h | 68 +
 equi/blake2/blake2b-load-sse41.h | 402 ++++++
 equi/blake2/blake2b-round.h | 170 +++
 equi/blake2/blake2bx.cpp | 262 ++++
 equi/cuda_equi.cu | 2117 ++++++++++++++++++++++++++++++
 equi/eqcuda.hpp | 136 ++
 equi/equi-stratum.cpp | 241 ++++
 equi/equi.cpp | 171 +++
 equi/equihash.cpp | 294 +++++
 equi/equihash.h | 19 +
 miner.h | 14 +
 util.cpp | 48 +-
 24 files changed, 4422 insertions(+), 24 deletions(-)
 create mode 100644 equi/blake2/blake2-config.h
 create mode 100644 equi/blake2/blake2-impl.h
 create mode 100644 equi/blake2/blake2-round.h
 create mode 100644 equi/blake2/blake2.h
 create mode 100644 equi/blake2/blake2b-load-sse2.h
 create mode 100644 equi/blake2/blake2b-load-sse41.h
 create mode 100644 equi/blake2/blake2b-round.h
 create mode 100644 equi/blake2/blake2bx.cpp
 create mode 100644 equi/cuda_equi.cu
 create mode 100644 equi/eqcuda.hpp
 create mode 100644 equi/equi-stratum.cpp
 create mode 100644 equi/equi.cpp
 create mode 100644 equi/equihash.cpp
 create mode 100644 equi/equihash.h

diff --git a/Makefile.am b/Makefile.am
index 307d86e..683e21d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -21,6 +21,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
	crc32.c hefty1.c \
	ccminer.cpp pools.cpp util.cpp bench.cpp bignum.cpp \
	api.cpp hashlog.cpp nvml.cpp stats.cpp sysinfos.cpp cuda.cpp \
+	equi/equi-stratum.cpp equi/equi.cpp equi/blake2/blake2bx.cpp \
+	equi/equihash.cpp equi/cuda_equi.cu \
	heavy/heavy.cu \
	heavy/cuda_blake512.cu heavy/cuda_blake512.h \
	heavy/cuda_combine.cu heavy/cuda_combine.h \
@@ -97,6 +99,10 @@ ccminer_LDFLAGS += -L/usr/local/llvm/lib
 ccminer_LDADD += -lomp
 endif

+#ccminer_CPPFLAGS += -DUSE_LIBSODIUM
+#ccminer_LDFLAGS += -Lequi/lib
+#ccminer_LDADD += -lsodium
+ccminer_LDADD += -lcuda

 nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\"
diff --git a/algos.h b/algos.h
index e4dc4bb..163b193 100644
--- a/algos.h
+++ b/algos.h
@@ -16,6 +16,7 @@ enum sha_algos {
	ALGO_DEEP,
	ALGO_DECRED,
	ALGO_DMD_GR,
+	ALGO_EQUIHASH,
	ALGO_FRESH,
	ALGO_FUGUE256, /* Fugue256 */
	ALGO_GROESTL,
@@ -78,6 +79,7 @@ static const char *algo_names[] = {
	"deep",
	"decred",
	"dmd-gr",
+	"equihash",
	"fresh",
	"fugue256",
	"groestl",
@@ -149,6 +151,8 @@ static inline int algo_to_int(char* arg)
		i = ALGO_C11;
	else if (!strcasecmp("diamond", arg))
		i = ALGO_DMD_GR;
+	else if (!strcasecmp("equi", arg))
+		i = ALGO_EQUIHASH;
	else if (!strcasecmp("doom", arg))
		i = ALGO_LUFFA;
	else if (!strcasecmp("hmq17", arg))
diff --git a/api.cpp b/api.cpp
index 0ec023f..8353127 100644
--- a/api.cpp +++ b/api.cpp @@ -1341,6 +1341,7 @@ void api_set_throughput(int thr_id, uint32_t throughput) if (thr_id < MAX_GPUS && thr_info) { struct cgpu_info *cgpu = &thr_info[thr_id].gpu; cgpu->intensity = throughput2intensity(throughput); + if (cgpu->throughput != throughput) cgpu->throughput = throughput; } // to display in bench results if (opt_benchmark) diff --git a/ccminer.cpp b/ccminer.cpp index 652a884..85ec68e 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -44,6 +44,7 @@ #include "algos.h" #include "sia/sia-rpc.h" #include "crypto/xmr-rpc.h" +#include "equi/equihash.h" #include @@ -243,6 +244,7 @@ Options:\n\ c11/flax X11 variant\n\ decred Decred Blake256\n\ deep Deepcoin\n\ + equihash Zcash Equihash\n\ dmd-gr Diamond-Groestl\n\ fresh Freshcoin (shavite 80)\n\ fugue256 Fuguecoin\n\ @@ -558,6 +560,14 @@ void get_currentalgo(char* buf, int sz) snprintf(buf, sz, "%s", algo_names[opt_algo]); } +void format_hashrate(double hashrate, char *output) +{ + if (opt_algo == ALGO_EQUIHASH) + format_hashrate_unit(hashrate, output, "Sol/s"); + else + format_hashrate_unit(hashrate, output, "H/s"); +} + /** * Exit app */ @@ -634,6 +644,10 @@ static void calc_network_diff(struct work *work) if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]); if (opt_algo == ALGO_DECRED) nbits = work->data[29]; if (opt_algo == ALGO_SIA) nbits = work->data[11]; // unsure if correct + if (opt_algo == ALGO_EQUIHASH) { + net_diff = equi_network_diff(work); + return; + } uint32_t bits = (nbits & 0xffffff); int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 @@ -856,6 +870,17 @@ static bool submit_upstream_work(CURL *curl, struct work *work) return true; } + if (pool->type & POOL_STRATUM && stratum.is_equihash) { + struct work submit_work; + memcpy(&submit_work, work, sizeof(struct work)); + //if (!hashlog_already_submittted(submit_work.job_id, submit_work.nonces[idnonce])) { + if (equi_stratum_submit(pool, &submit_work)) + hashlog_remember_submit(&submit_work, submit_work.nonces[idnonce]); + stratum.job.shares_count++; + //} + return true; + } + /* discard if a newer block was received */ stale_work = work->height && work->height < g_work.height; if (have_stratum && !stale_work && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) { @@ -1508,6 +1533,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) /* Generate merkle root */ switch (opt_algo) { case ALGO_DECRED: + case ALGO_EQUIHASH: case ALGO_SIA: // getwork over stratum, no merkle to generate break; @@ -1566,6 +1592,13 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) memcpy(&work->data[44], &sctx->job.coinbase[sctx->job.coinbase_size-4], 4); sctx->job.height = work->data[32]; //applog_hex(work->data, 180); + } else if (opt_algo == ALGO_EQUIHASH) { + memcpy(&work->data[9], sctx->job.coinbase, 32+32); // merkle [9..16] + reserved + work->data[25] = le32dec(sctx->job.ntime); + work->data[26] = le32dec(sctx->job.nbits); + memcpy(&work->data[27], sctx->xnonce1, sctx->xnonce1_size & 0x1F); // pool extranonce + work->data[35] = 0x80; + //applog_hex(work->data, 140); } else if (opt_algo == ALGO_LBRY) { for (i = 0; i < 8; i++) work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); @@ -1618,7 +1651,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) pthread_mutex_unlock(&stratum_work_lock); - if (opt_debug && opt_algo != ALGO_DECRED && opt_algo != ALGO_SIA) { + if (opt_debug && opt_algo != ALGO_DECRED && opt_algo != ALGO_EQUIHASH && opt_algo != ALGO_SIA) { uint32_t utm = 
work->data[17]; if (opt_algo != ALGO_ZR5) utm = swab32(utm); char *tm = atime2str(utm - sctx->srvtime_diff); @@ -1656,6 +1689,9 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_LYRA2: work_set_target(work, sctx->job.diff / (128.0 * opt_difficulty)); break; + case ALGO_EQUIHASH: + equi_work_set_target(work, sctx->job.diff / opt_difficulty); + break; default: work_set_target(work, sctx->job.diff / opt_difficulty); } @@ -1830,6 +1866,9 @@ static void *miner_thread(void *userdata) } else if (opt_algo == ALGO_CRYPTOLIGHT || opt_algo == ALGO_CRYPTONIGHT) { nonceptr = (uint32_t*) (((char*)work.data) + 39); wcmplen = 39; + } else if (opt_algo == ALGO_EQUIHASH) { + nonceptr = &work.data[EQNONCE_OFFSET]; // 27 is pool extranonce (256bits nonce space) + wcmplen = 4+32+32; } if (have_stratum) { @@ -1962,6 +2001,10 @@ static void *miner_thread(void *userdata) nonceptr[1] += 1; nonceptr[2] |= thr_id; + } else if (opt_algo == ALGO_EQUIHASH) { + nonceptr[1]++; + nonceptr[1] |= thr_id << 24; + //applog_hex(&work.data[27], 32); } else if (opt_algo == ALGO_WILDKECCAK) { //nonceptr[1] += 1; } else if (opt_algo == ALGO_SIA) { @@ -2264,6 +2307,9 @@ static void *miner_thread(void *userdata) case ALGO_DEEP: rc = scanhash_deep(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_EQUIHASH: + rc = scanhash_equihash(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_FRESH: rc = scanhash_fresh(thr_id, &work, max_nonce, &hashes_done); break; @@ -2431,7 +2477,7 @@ static void *miner_thread(void *userdata) work.nonces[1] = nonceptr[2]; } - if (stratum.rpc2 && rc == -EBUSY || work_restart[thr_id].restart) { + if (stratum.rpc2 && (rc == -EBUSY || work_restart[thr_id].restart)) { // bbr scratchpad download or stale result sleep(1); if (!thr_id) pools[cur_pooln].wait_time += 1; @@ -2489,7 +2535,7 @@ static void *miner_thread(void *userdata) } // only required to debug purpose - if (opt_debug && check_dups && opt_algo != ALGO_DECRED && opt_algo != ALGO_SIA) + if (opt_debug && check_dups && opt_algo != ALGO_DECRED && opt_algo != ALGO_EQUIHASH && opt_algo != ALGO_SIA) hashlog_remember_scan_range(&work); /* output */ @@ -3854,6 +3900,10 @@ int main(int argc, char *argv[]) allow_mininginfo = false; } + if (opt_algo == ALGO_EQUIHASH) { + opt_extranonce = false; // disable subscribe + } + if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) { rpc2_init(); if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0"); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 5aad65e..1316052 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -156,7 +156,7 @@ true true compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_20,sm_21 - --ptxas-options="-O2" %(AdditionalOptions) + --ptxas-options="-O2" --Wno-deprecated-gpu-targets %(AdditionalOptions) O2 @@ -199,9 +199,10 @@ true true compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_20,sm_21 - $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 + $(NVTOOLSEXT_PATH)\include O3 64 + --Wno-deprecated-gpu-targets %(AdditionalOptions) O3 @@ -235,6 +236,12 @@ + + StreamingSIMDExtensions + + + + @@ -254,6 +261,8 @@ + + @@ -283,6 +292,11 @@ 128 + + compute_52,sm_52;compute_50,sm_50;compute_30,sm_30 + -Xptxas -dlcm=ca -Xptxas -dscm=cs %(AdditionalOptions) + 0 + 160 @@ -552,7 +566,7 @@ - + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index f37da23..a6b39c4 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -103,6 +103,9 @@ {0f9aec5e-5409-488f-992a-2c108590d1ac} + + 
{031afae7-2a78-4e32-9738-4b589b6f7ff3} + @@ -321,6 +324,18 @@ Source Files\crypto\bbr + + Source Files\equi + + + Source Files\equi + + + Source Files\equi + + + Source Files\equi + @@ -563,6 +578,12 @@ Source Files\CUDA\xmr + + Source Files\equi + + + Source Files\equi + @@ -895,6 +916,9 @@ Source Files\CUDA + + Source Files\equi + @@ -911,4 +935,4 @@ Ressources - + \ No newline at end of file diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index 26cfea4..dfd973c 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.0" +#define PACKAGE_VERSION "2.1" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/configure.ac b/configure.ac index 1c76b70..c92a6fc 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.0], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.1], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/equi/blake2/blake2-config.h b/equi/blake2/blake2-config.h new file mode 100644 index 0000000..2a8f9ed --- /dev/null +++ b/equi/blake2/blake2-config.h @@ -0,0 +1,81 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2_CONFIG_H__ +#define __BLAKE2_CONFIG_H__ + +// These don't work everywhere +#if (defined(__SSE2__) || defined(_M_AMD_64) || defined(_M_X64)) +#define HAVE_SSE2 +#endif + +#if defined(__SSSE3__) +#define HAVE_SSSE3 +#endif + +#if defined(__SSE4_1__) +#define HAVE_SSE41 +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#endif + +#if defined(__XOP__) +#define HAVE_XOP +#endif + + +#ifdef HAVE_AVX2 +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_XOP +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_AVX +#ifndef HAVE_SSE41 +#define HAVE_SSE41 +#endif +#endif + +#ifdef HAVE_SSE41 +#ifndef HAVE_SSSE3 +#define HAVE_SSSE3 +#endif +#endif + +#ifdef HAVE_SSSE3 +#define HAVE_SSE2 +#endif + +#if !defined(HAVE_SSE2) + +#ifdef _MSC_VER +// enforce required stuff for now +#define HAVE_SSE2 +//#define HAVE_SSSE3 +#define HAVE_SSE41 +#else +# error "This code requires at least SSE 4.1" +#endif + +#endif + +#endif + diff --git a/equi/blake2/blake2-impl.h b/equi/blake2/blake2-impl.h new file mode 100644 index 0000000..16219db --- /dev/null +++ b/equi/blake2/blake2-impl.h @@ -0,0 +1,136 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#pragma once +#ifndef __BLAKE2_IMPL_H__ +#define __BLAKE2_IMPL_H__ + +#include + +static inline uint32_t load32( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + uint32_t w = *p++; + w |= ( uint32_t )( *p++ ) << 8; + w |= ( uint32_t )( *p++ ) << 16; + w |= ( uint32_t )( *p++ ) << 24; + return w; +#endif +} + +static inline uint64_t load64( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + uint64_t w = *p++; + w |= ( uint64_t )( *p++ ) << 8; + w |= ( uint64_t )( *p++ ) << 16; + w |= ( uint64_t )( *p++ ) << 24; + w |= ( uint64_t )( *p++ ) << 32; + w |= ( uint64_t )( *p++ ) << 40; + w |= ( uint64_t )( *p++ ) << 48; + w |= ( uint64_t )( *p++ ) << 56; + return w; +#endif +} + +static inline void store32( void *dst, uint32_t w ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +#endif +} + +static inline void store64( void *dst, uint64_t w ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +#endif +} + +static inline uint64_t load48( const void *src ) +{ + const uint8_t *p = ( const uint8_t * )src; + uint64_t w = *p++; + w |= ( uint64_t )( *p++ ) << 8; + w |= ( uint64_t )( *p++ ) << 16; + w |= ( uint64_t )( *p++ ) << 24; + w |= ( uint64_t )( *p++ ) << 32; + w |= ( uint64_t )( *p++ ) << 40; + return w; +} + +static inline void store48( void *dst, uint64_t w ) +{ + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +} + +static inline uint32_t rotl32( const uint32_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 32 - c ) ); +} + +static inline uint64_t rotl64( const uint64_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 64 - c ) ); +} + +static inline uint32_t rotr32( const uint32_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 32 - c ) ); +} + +static inline uint64_t rotr64( const uint64_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/* prevents compiler optimizing out memset() */ +static inline void secure_zero_memory( void *v, size_t n ) +{ + volatile uint8_t *p = ( volatile uint8_t * )v; + while( n-- ) *p++ = 0; +} + +#endif + diff --git a/equi/blake2/blake2-round.h b/equi/blake2/blake2-round.h new file mode 100644 index 0000000..400ed20 --- /dev/null +++ b/equi/blake2/blake2-round.h @@ -0,0 +1,85 @@ +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? 
_mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + row1l = _mm_add_epi64(row1l, row2l); \ + row1h = _mm_add_epi64(row1h, row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + row1l = _mm_add_epi64(row1l, row2l); \ + row1h = _mm_add_epi64(row1h, row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); \ + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; + +#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); diff --git a/equi/blake2/blake2.h b/equi/blake2/blake2.h new file mode 100644 index 0000000..3206576 --- /dev/null +++ b/equi/blake2/blake2.h @@ -0,0 +1,85 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#pragma once +#ifndef __BLAKE2_H__ +#define __BLAKE2_H__ + +#include +#include + +#if defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__ ((__aligned__(x))) +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + + enum blake2b_constant + { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 + }; + +#pragma pack(push, 1) + typedef struct __blake2b_param + { + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint64_t node_offset; // 16 + uint8_t node_depth; // 17 + uint8_t inner_length; // 18 + uint8_t reserved[14]; // 32 + uint8_t salt[BLAKE2B_SALTBYTES]; // 48 + uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64 + } blake2b_param; + + ALIGN( 64 ) typedef struct __blake2b_state + { + uint64_t h[8]; + uint8_t buf[BLAKE2B_BLOCKBYTES]; + uint16_t counter; + uint8_t buflen; + uint8_t lastblock; + } blake2b_state; + +#pragma pack(pop) + + int eq_blake2b_init( blake2b_state *S, const uint8_t outlen ); + int eq_blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int eq_blake2b_init_param( blake2b_state *S, const blake2b_param *P ); + int eq_blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ); + int eq_blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen ); + + // Simple API + int eq_blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + + static inline int eq_blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) + { + return eq_blake2b( out, in, key, outlen, inlen, keylen ); + } + +#if defined(__cplusplus) +} +#endif + +#endif + diff --git a/equi/blake2/blake2b-load-sse2.h b/equi/blake2/blake2b-load-sse2.h new file mode 100644 index 0000000..1ba153c --- /dev/null +++ b/equi/blake2/blake2b-load-sse2.h @@ -0,0 +1,68 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#pragma once +#ifndef __BLAKE2B_LOAD_SSE2_H__ +#define __BLAKE2B_LOAD_SSE2_H__ + +#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) +#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5) +#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2) +#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7) +#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1) +#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13) +#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2) +#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6) +#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8) +#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11) +#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15) +#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14) +#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14) +#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13) +#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9) +#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2) +#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12) +#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1) +#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8) +#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6) +#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11) +#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3) +#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1) +#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4) +#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7) +#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6) +#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3) +#define 
LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12) +#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) + + +#endif + diff --git a/equi/blake2/blake2b-load-sse41.h b/equi/blake2/blake2b-load-sse41.h new file mode 100644 index 0000000..f6c1bc8 --- /dev/null +++ b/equi/blake2/blake2b-load-sse41.h @@ -0,0 +1,402 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2B_LOAD_SSE41_H__ +#define __BLAKE2B_LOAD_SSE41_H__ + +#define LOAD_MSG_0_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_0_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_1_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_1_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_1_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_1_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#define LOAD_MSG_2_1(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m5, 8); \ +b1 = _mm_unpackhi_epi64(m2, m7); \ +} while(0) + + +#define LOAD_MSG_2_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m0); \ +b1 = _mm_blend_epi16(m1, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_2_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m5, m1, 0xF0); \ +b1 = _mm_unpackhi_epi64(m3, m4); \ +} while(0) + + +#define LOAD_MSG_2_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m3); \ +b1 = _mm_alignr_epi8(m2, m0, 8); \ +} while(0) + + +#define LOAD_MSG_3_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_unpackhi_epi64(m6, m5); \ +} while(0) + + +#define LOAD_MSG_3_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m0); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_3_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m2, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define 
LOAD_MSG_3_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m5); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_4_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m2); \ +b1 = _mm_unpacklo_epi64(m1, m5); \ +} while(0) + + +#define LOAD_MSG_4_2(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m0, m3, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m7, m5, 0xF0); \ +b1 = _mm_blend_epi16(m3, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m0, 8); \ +b1 = _mm_blend_epi16(m4, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_5_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m3); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_5_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m5); \ +b1 = _mm_unpackhi_epi64(m5, m1); \ +} while(0) + + +#define LOAD_MSG_5_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m2, m3, 0xF0); \ +b1 = _mm_unpackhi_epi64(m7, m0); \ +} while(0) + + +#define LOAD_MSG_5_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m2); \ +b1 = _mm_blend_epi16(m7, m4, 0xF0); \ +} while(0) + + +#define LOAD_MSG_6_1(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m6, m0, 0xF0); \ +b1 = _mm_unpacklo_epi64(m7, m2); \ +} while(0) + + +#define LOAD_MSG_6_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_alignr_epi8(m5, m6, 8); \ +} while(0) + + +#define LOAD_MSG_6_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m3); \ +b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ +} while(0) + + +#define LOAD_MSG_6_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_blend_epi16(m1, m5, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m3); \ +b1 = _mm_blend_epi16(m6, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_2(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpackhi_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_7_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_unpacklo_epi64(m4, m1); \ +} while(0) + + +#define LOAD_MSG_7_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m2); \ +b1 = _mm_unpacklo_epi64(m3, m5); \ +} while(0) + + +#define LOAD_MSG_8_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m7); \ +b1 = _mm_alignr_epi8(m0, m5, 8); \ +} while(0) + + +#define LOAD_MSG_8_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_alignr_epi8(m4, m1, 8); \ +} while(0) + + +#define LOAD_MSG_8_3(b0, b1) \ +do \ +{ \ +b0 = m6; \ +b1 = _mm_alignr_epi8(m5, m0, 8); \ +} while(0) + + +#define LOAD_MSG_8_4(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m3, 0xF0); \ +b1 = m2; \ +} while(0) + + +#define LOAD_MSG_9_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_unpackhi_epi64(m3, m0); \ +} while(0) + + +#define LOAD_MSG_9_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m2); \ +b1 = _mm_blend_epi16(m3, m2, 0xF0); \ +} while(0) + + +#define LOAD_MSG_9_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_unpackhi_epi64(m1, m6); \ +} while(0) + + +#define LOAD_MSG_9_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpacklo_epi64(m6, m0); \ +} while(0) + + +#define LOAD_MSG_10_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define 
LOAD_MSG_10_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_10_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_11_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_11_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_11_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_11_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#endif + diff --git a/equi/blake2/blake2b-round.h b/equi/blake2/blake2b-round.h new file mode 100644 index 0000000..b39106b --- /dev/null +++ b/equi/blake2/blake2b-round.h @@ -0,0 +1,170 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2B_ROUND_H__ +#define __BLAKE2B_ROUND_H__ + +#define LOAD(p) _mm_load_si128( (const __m128i *)(p) ) +#define STORE(p,r) _mm_store_si128((__m128i *)(p), r) + +#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) + +#define TOF(reg) _mm_castsi128_ps((reg)) +#define TOI(reg) _mm_castps_si128((reg)) + +#define LIKELY(x) __builtin_expect((x),1) + + +/* Microarchitecture-specific macros */ +#ifndef HAVE_XOP +#ifdef HAVE_SSSE3 +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) +#else +#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) )) +#endif +#else +/* ... 
*/ +#endif + + + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, (-32)); \ + row4h = _mm_roti_epi64(row4h, (-32)); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, (-24)); \ + row2h = _mm_roti_epi64(row2h, (-24)); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, (-16)); \ + row4h = _mm_roti_epi64(row4h, (-16)); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, (-63)); \ + row2h = _mm_roti_epi64(row2h, (-63)); \ + +#if defined(HAVE_SSSE3) +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; +#else + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row4l;\ + t1 = row2l;\ + row4l = row3l;\ + row3l = row3h;\ + row3h = row4l;\ + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row3l;\ + row3l = row3h;\ + row3h = t0;\ + t0 = row2l;\ + t1 = row4l;\ + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) + +#endif + +#if defined(HAVE_SSE41) +#include "blake2b-load-sse41.h" +#else +#include "blake2b-load-sse2.h" +#endif + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + 
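+/*
+  Usage sketch (this simply mirrors blake2b_compress() in blake2bx.cpp below,
+  it is not an extra API): the caller keeps the 1024-bit working state in
+  eight __m128i values row1l..row4h, with b0, b1, t0, t1 as scratch, and
+  unrolls the twelve BLAKE2b rounds:
+
+	ROUND(0); ROUND(1); ... ROUND(11);
+
+  Each ROUND(r) expands to the LOAD_MSG_r_* message loads for schedule r and
+  the column/diagonal G1/G2 mixes defined above.
+*/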
+#endif + +#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); diff --git a/equi/blake2/blake2bx.cpp b/equi/blake2/blake2bx.cpp new file mode 100644 index 0000000..6f5c041 --- /dev/null +++ b/equi/blake2/blake2bx.cpp @@ -0,0 +1,262 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ + +#include +#include +#include + + +#include "blake2.h" +#include "blake2-impl.h" + +#include "blake2-config.h" + +#ifdef WIN32 +#include +#endif + +#include +#if defined(HAVE_SSSE3) +#include +#endif +#if defined(HAVE_SSE41) +#include +#endif +#if defined(HAVE_AVX) +#include +#endif +#if defined(HAVE_XOP) +#include +#endif + +#include "blake2b-round.h" + +ALIGN(64) static const uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/* init xors IV with input parameter block */ +int eq_blake2b_init_param(blake2b_state *S, const blake2b_param *P) +{ + //blake2b_init0( S ); + const uint8_t * v = (const uint8_t *)(blake2b_IV); + const uint8_t * p = (const uint8_t *)(P); + uint8_t * h = (uint8_t *)(S->h); + /* IV XOR ParamBlock */ + memset(S, 0, sizeof(blake2b_state)); + + for (int i = 0; i < BLAKE2B_OUTBYTES; ++i) h[i] = v[i] ^ p[i]; + + return 0; +} + +/* Some sort of default parameter block initialization, for sequential blake2b */ +int eq_blake2b_init(blake2b_state *S, const uint8_t outlen) +{ + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1; + + const blake2b_param P = + { + outlen, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + { 0 }, + { 0 }, + { 0 } + }; + return eq_blake2b_init_param(S, &P); +} + +int eq_blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen) +{ + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1; + + if ((!keylen) || keylen > BLAKE2B_KEYBYTES) return -1; + + const blake2b_param P = + { + outlen, + keylen, + 1, + 1, + 0, + 0, + 0, + 0, + { 0 }, + { 0 }, + { 0 } + }; + + if (eq_blake2b_init_param(S, &P) < 0) + return 0; + + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset(block, 0, BLAKE2B_BLOCKBYTES); + memcpy(block, key, keylen); + eq_blake2b_update(S, block, BLAKE2B_BLOCKBYTES); + secure_zero_memory(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */ + } + return 0; +} + +static inline int blake2b_compress(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES]) +{ + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; +#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) + const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + const __m128i r24 = 
_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); +#endif +#if defined(HAVE_SSE41) + const __m128i m0 = LOADU(block + 00); + const __m128i m1 = LOADU(block + 16); + const __m128i m2 = LOADU(block + 32); + const __m128i m3 = LOADU(block + 48); + const __m128i m4 = LOADU(block + 64); + const __m128i m5 = LOADU(block + 80); + const __m128i m6 = LOADU(block + 96); + const __m128i m7 = LOADU(block + 112); +#else + const uint64_t m0 = ( ( uint64_t * )block )[ 0]; + const uint64_t m1 = ( ( uint64_t * )block )[ 1]; + const uint64_t m2 = ( ( uint64_t * )block )[ 2]; + const uint64_t m3 = ( ( uint64_t * )block )[ 3]; + const uint64_t m4 = ( ( uint64_t * )block )[ 4]; + const uint64_t m5 = ( ( uint64_t * )block )[ 5]; + const uint64_t m6 = ( ( uint64_t * )block )[ 6]; + const uint64_t m7 = ( ( uint64_t * )block )[ 7]; + const uint64_t m8 = ( ( uint64_t * )block )[ 8]; + const uint64_t m9 = ( ( uint64_t * )block )[ 9]; + const uint64_t m10 = ( ( uint64_t * )block )[10]; + const uint64_t m11 = ( ( uint64_t * )block )[11]; + const uint64_t m12 = ( ( uint64_t * )block )[12]; + const uint64_t m13 = ( ( uint64_t * )block )[13]; + const uint64_t m14 = ( ( uint64_t * )block )[14]; + const uint64_t m15 = ( ( uint64_t * )block )[15]; +#endif + row1l = LOADU(&S->h[0]); + row1h = LOADU(&S->h[2]); + row2l = LOADU(&S->h[4]); + row2h = LOADU(&S->h[6]); + row3l = LOADU(&blake2b_IV[0]); + row3h = LOADU(&blake2b_IV[2]); + row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), _mm_set_epi32(0, 0, 0, S->counter)); + row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), _mm_set_epi32(0, 0, 0L - S->lastblock, 0L - S->lastblock)); + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); + STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); + STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); + return 0; +} + +int eq_blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen) +{ + while (inlen > 0) + { + size_t left = S->buflen; + size_t fill = BLAKE2B_BLOCKBYTES - left; + + if (inlen > fill) + { + memcpy(S->buf + left, in, fill); // Fill buffer + in += fill; + inlen -= fill; + S->counter += BLAKE2B_BLOCKBYTES; + blake2b_compress(S, S->buf); // Compress + S->buflen = 0; + } + else // inlen <= fill + { + memcpy(S->buf + left, in, inlen); + S->buflen += (uint8_t) inlen; // not enough to compress + in += inlen; + inlen = 0; + } + } + + return 0; +} + +int eq_blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen) +{ + if (outlen > BLAKE2B_OUTBYTES) + return -1; + + if (S->buflen > BLAKE2B_BLOCKBYTES) + { + S->counter += BLAKE2B_BLOCKBYTES; + blake2b_compress(S, S->buf); + S->buflen -= BLAKE2B_BLOCKBYTES; + memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen); + } + + S->counter += S->buflen; + S->lastblock = 1; + memset(S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ + blake2b_compress(S, S->buf); + memcpy(out, &S->h[0], outlen); + S->lastblock = 0; + return 0; +} + +int eq_blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen) +{ + blake2b_state S[1]; + + /* Verify parameters */ + if (!in || !out) return -1; + if (NULL == key) keylen = 0; + + if (keylen) + { + if 
(eq_blake2b_init_key(S, outlen, key, keylen) < 0) return -1;
+	}
+	else
+	{
+		if (eq_blake2b_init(S, outlen) < 0) return -1;
+	}
+
+	eq_blake2b_update(S, (const uint8_t *)in, inlen);
+	eq_blake2b_final(S, out, outlen);
+	return 0;
+}
diff --git a/equi/cuda_equi.cu b/equi/cuda_equi.cu
new file mode 100644
index 0000000..7a45ac8
--- /dev/null
+++ b/equi/cuda_equi.cu
@@ -0,0 +1,2117 @@
+/*
+ * Equihash solver created by djeZo (l33tsoftw@gmail.com) for NiceHash
+ * Adapted to be more compatible with older C++ compilers
+ *
+ * cuda_djezo solver was released by NiceHash (www.nicehash.com) under
+ * GPL 3.0 license. If you don't have a copy, you can obtain one from
+ * https://www.gnu.org/licenses/gpl-3.0.txt
+ *
+ * Based on CUDA solver by John Tromp released under MIT license.
+ * Some helper functions taken out of OpenCL solver by Marc Bevand
+ * released under MIT license.
+ *
+ * Copyright (c) 2016 John Tromp, Marc Bevand
+ * Copyright (c) 2017 djeZo, Tanguy Pruvot (GPL v3)
+ */
+
+#ifdef WIN32
+#include
+#endif
+
+#include
+#include
+//#include
+
+#include "equihash.h"
+#include "eqcuda.hpp" // eq_cuda_context
+
+#include "blake2/blake2.h"
+
+//#define WN 200
+//#define WK 9
+#ifndef MAX_GPUS
+#define MAX_GPUS 16
+#endif
+
+#define NDIGITS (WK+1)
+#define DIGITBITS (WN/(NDIGITS))
+#define PROOFSIZE (1<
+#include
+#define __launch_bounds__(max_tpb, min_blocks)
+#define __CUDA_ARCH__ 520
+uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
+uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);
+uint32_t atomicExch(uint32_t *x, uint32_t y);
+uint32_t atomicAdd(uint32_t *x, uint32_t y);
+void __syncthreads(void);
+void __threadfence(void);
+void __threadfence_block(void);
+uint32_t __ldg(const uint32_t* address);
+uint64_t __ldg(const uint64_t* address);
+uint4 __ldca(const uint4 *ptr);
+u32 __ldca(const u32 *ptr);
+u32 umin(const u32, const u32);
+u32 umax(const u32, const u32);
+#endif
+
+typedef u32 proof[PROOFSIZE];
+
+struct __align__(32) slot {
+	u32 hash[8];
+};
+
+struct __align__(16) slotsmall {
+	u32 hash[4];
+};
+
+struct __align__(8) slottiny {
+	u32 hash[2];
+};
+
+template
+struct equi
+{
+	slot round0trees[4096][RB8_NSLOTS];
+	slot trees[1][NBUCKETS][NSLOTS];
+	struct {
+		slotsmall treessmall[NSLOTS];
+		slottiny treestiny[NSLOTS];
+	} round2trees[NBUCKETS];
+	struct {
+		slotsmall treessmall[NSLOTS];
+		slottiny treestiny[NSLOTS];
+	} round3trees[NBUCKETS];
+	slotsmall treessmall[4][NBUCKETS][NSLOTS];
+	slottiny treestiny[1][4096][RB8_NSLOTS_LD];
+	u32 round4bidandsids[NBUCKETS][NSLOTS];
+	union {
+		u64 blake_h[8];
+		u32 blake_h32[16];
+	};
+	struct {
+		u32 nslots8[4096];
+		u32 nslots0[4096];
+		u32 nslots[9][NBUCKETS];
+		scontainerreal srealcont;
+	} edata;
+};
+
+// todo: use cuda_helper.h and/or cuda_vector.h
+__device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b)
+{
+	return make_uint2(a.x ^ b.x, a.y ^ b.y);
+}
+
+__device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b)
+{
+	return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
+}
+
+// for ROR 63 (or ROL 1); this function only supports (32 <= offset < 64)
+__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset)
+{
+	uint2 result;
+#if __CUDA_ARCH__ > 300
+	{
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
+	}
+#else
+	result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
+	result.x = ((a.y >>
(offset - 32)) | (a.x << (64 - offset))); +#endif + return result; +} + + +__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +__device__ __forceinline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + return result; +} + +__device__ __forceinline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + return result; +} + +__device__ __forceinline__ void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) +{ + a = a + b + x; + ((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]); + a = a + b + y; + ((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +// untested.. +struct packer_default +{ + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (((bucketid << SLOTBITS) | s0) << SLOTBITS) | s1; + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + // BUCKMASK-ed to prevent illegal memory accesses in case of memory errors + return (bid >> (2 * SLOTBITS)) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return bid & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> SLOTBITS) & SLOTMASK; + } +}; + + +struct packer_cantor +{ + __device__ __forceinline__ static u32 cantor(const u32 s0, const u32 s1) + { + u32 a = umax(s0, s1); + u32 b = umin(s0, s1); + return a * (a + 1) / 2 + b; + } + + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (bucketid << CANTORBITS) | cantor(s0, s1); + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> CANTORBITS) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return ((bid & CANTORMASK) - cantor(0, s1)) & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + u32 k, q, sqr = 8 * (bid & CANTORMASK) + 1; + // this k=sqrt(sqr) computing loop averages 3.4 iterations out of maximum 9 + for (k = CANTORMAXSQRT; (q = sqr / k) < k; k = (k + q) / 2); + return ((k - 1) / 2) & SLOTMASK; + } +}; + +__device__ __constant__ const u64 blake_iv[] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +#if CUDART_VERSION < 8000 || !defined(__ldca) +#define __ldca(ptr) *(ptr) +#endif + +template +__global__ void digit_first(equi* eq, u32 nonce) +{ + const u32 block = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ u64 hash_h[8]; + u32* hash_h32 = (u32*)hash_h; + + if (threadIdx.x < 16) + hash_h32[threadIdx.x] = __ldca(&eq->blake_h32[threadIdx.x]); + + __syncthreads(); + + u64 m = (u64)block << 32 | (u64)nonce; + + union + { + u64 v[16]; + u32 v32[32]; + uint4 v128[8]; + }; + + v[0] = hash_h[0]; + v[1] = hash_h[1]; + v[2] = hash_h[2]; + v[3] = hash_h[3]; + 
v[4] = hash_h[4]; + v[5] = hash_h[5]; + v[6] = hash_h[6]; + v[7] = hash_h[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4] ^ (128 + 16); + v[13] = blake_iv[5]; + v[14] = blake_iv[6] ^ 0xffffffffffffffff; + v[15] = blake_iv[7]; + + // mix 1 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 2 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 3 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, m); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 4 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, m); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 5 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, m); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 6 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], m, 0); + + // mix 7 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], m, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 8 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, m); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 9 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], m, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 10 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], m, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 11 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + 
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], 0, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	// mix 12
+	G2(v[0], v[4], v[8], v[12], 0, 0);
+	G2(v[1], v[5], v[9], v[13], 0, 0);
+	G2(v[2], v[6], v[10], v[14], 0, 0);
+	G2(v[3], v[7], v[11], v[15], 0, 0);
+	G2(v[0], v[5], v[10], v[15], m, 0);
+	G2(v[1], v[6], v[11], v[12], 0, 0);
+	G2(v[2], v[7], v[8], v[13], 0, 0);
+	G2(v[3], v[4], v[9], v[14], 0, 0);
+
+	v[0] ^= hash_h[0] ^ v[8];
+	v[1] ^= hash_h[1] ^ v[9];
+	v[2] ^= hash_h[2] ^ v[10];
+	v[3] ^= hash_h[3] ^ v[11];
+	v[4] ^= hash_h[4] ^ v[12];
+	v[5] ^= hash_h[5] ^ v[13];
+	v32[12] ^= hash_h32[12] ^ v32[28];
+
+	u32 bexor = __byte_perm(v32[0], 0, 0x4012); // first 20 bits
+	u32 bucketid;
+	asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor));
+	u32 slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1);
+	if (slotp < RB8_NSLOTS)
+	{
+		slot* s = &eq->round0trees[bucketid][slotp];
+
+		uint4 tt;
+		tt.x = __byte_perm(v32[0], v32[1], 0x1234);
+		tt.y = __byte_perm(v32[1], v32[2], 0x1234);
+		tt.z = __byte_perm(v32[2], v32[3], 0x1234);
+		tt.w = __byte_perm(v32[3], v32[4], 0x1234);
+		*(uint4*)(&s->hash[0]) = tt;
+
+		tt.x = __byte_perm(v32[4], v32[5], 0x1234);
+		tt.y = __byte_perm(v32[5], v32[6], 0x1234);
+		tt.z = 0;
+		tt.w = block << 1;
+		*(uint4*)(&s->hash[4]) = tt;
+	}
+
+	bexor = __byte_perm(v32[6], 0, 0x0123);
+	asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor));
+	slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1);
+	if (slotp < RB8_NSLOTS)
+	{
+		slot* s = &eq->round0trees[bucketid][slotp];
+
+		uint4 tt;
+		tt.x = __byte_perm(v32[6], v32[7], 0x2345);
+		tt.y = __byte_perm(v32[7], v32[8], 0x2345);
+		tt.z = __byte_perm(v32[8], v32[9], 0x2345);
+		tt.w = __byte_perm(v32[9], v32[10], 0x2345);
+		*(uint4*)(&s->hash[0]) = tt;
+
+		tt.x = __byte_perm(v32[10], v32[11], 0x2345);
+		tt.y = __byte_perm(v32[11], v32[12], 0x2345);
+		tt.z = 0;
+		tt.w = (block << 1) + 1;
+		*(uint4*)(&s->hash[4]) = tt;
+	}
+}
+
+/*
+  Functions digit_1 to digit_8 work on the same principle:
+  each thread does 2-3 slot loads (loads are coalesced).
+  The xorwork of the slots is loaded into shared memory and kept in registers
+  (except for digit_1). At the same time, the restbits (8 or 9 bits) of the
+  xorwork are used for collisions: they determine the position in ht.
+  Pair creation follows. The first one (or two) pairs' xorworks are written to
+  global memory as soon as possible; the remaining pairs are saved in shared
+  memory (one u32 per pair - two 16-bit slot indices). In most cases all
+  threads have one (or two) pairs, so this trick offloads some of the memory
+  writes from the last step. In the last step we write the xorwork of the
+  queued pairs to memory.
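+
+  Illustrative per-round skeleton (pseudo-code added for readability, using
+  the names from the kernels below; not from the original solver notes):
+
+    load slot si        -> lastword[si] in shared mem, xorwork in registers
+    hr  = restbits(xorwork)                 // 8 or 9 bits
+    pos = atomicAdd(&ht_len[hr], 1);        // row fill count
+    if (pos < SSM-1) ht[hr][pos] = si;      // bucket by restbits
+    partner ht[hr][0] is XORed and stored to the next round right away;
+    extra partners are queued as one packed u32 per pair and drained by
+    the whole block after __syncthreads().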
+*/ +template +__global__ void digit_1(equi* eq) +{ + __shared__ u16 ht[256][SSM - 1]; + __shared__ uint2 lastword1[RB8_NSLOTS]; + __shared__ uint4 lastword2[RB8_NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < 256) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots0[bucketid], RB8_NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + uint2 ta[2]; + uint4 tb[2]; + + u32 si[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slot* pslot1 = eq->round0trees[bucketid] + si[i]; + + // get xhash + uint4 a1 = *(uint4*)(&pslot1->hash[0]); + uint2 a2 = *(uint2*)(&pslot1->hash[4]); + ta[i].x = a1.x; + ta[i].y = a1.y; + lastword1[si[i]] = ta[i]; + tb[i].x = a1.z; + tb[i].y = a1.w; + tb[i].z = a2.x; + tb[i].w = a2.y; + lastword2[si[i]] = tb[i]; + + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(hr[i]) : "r"(ta[i].x)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + int* pairs = ht_len; + + u32 xors[6]; + u32 xorbucketid, xorslot; + + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = ta[i] ^ lastword1[p]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[si[i]] ^ lastword2[p]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, si[i], p, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[i] ^ lastword2[k]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, i, k, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + } +} + + +template +__global__ void digit_2(equi* eq) +{ + __shared__ u16 ht[NRESTS][SSM - 1]; + __shared__ u32 lastword1[NSLOTS]; + __shared__ uint4 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + 
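+	// pairs_len counts the queued pairs; next_pair (declared below) is the
+	// shared cursor the whole block uses to drain them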
__shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slot* buck = eq->trees[0][bucketid]; + u32 bsize = umin(eq->edata.nslots[1][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 ta[2]; + uint4 tt[2]; + + u32 si[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + // get slot + const slot* pslot1 = buck + si[i]; + + uint4 ttx = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = ta[i] = ttx.x; + uint2 tty = *(uint2*)(&pslot1->hash[4]); + tt[i].x = ttx.y; + tt[i].y = ttx.z; + tt[i].z = ttx.w; + tt[i].w = tty.x; + lastword2[si[i]] = tt[i]; + + hr[i] = tty.y & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[0] = ta[i] ^ lastword1[p]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = tt[i] ^ lastword2[p]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[0] = lastword1[i] ^ lastword1[k]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = lastword2[i] ^ lastword2[k]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } +} + + +template +__global__ void digit_3(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[2][bucketid], NSLOTS); + + u32 hr[2]; + int 
pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + u32 ta[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round2trees[bucketid].treessmall[si[i]]; + slottiny &xst = eq->round2trees[bucketid].treestiny[si[i]]; + + tt[i] = *(uint4*)(&xs.hash[0]); + lastword1[si[i]] = tt[i]; + ta[i] = xst.hash[0]; + lastword2[si[i]] = ta[i]; + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[4] = ta[i] ^ lastword2[p]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = tt[i] ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[4] = lastword2[i] ^ lastword2[k]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + } +} + + +template +__global__ void digit_4(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[3][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely 
cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round3trees[bucketid].treessmall[si[i]]; + slottiny &xst = eq->round3trees[bucketid].treestiny[si[i]]; + + // get xhash + tt[i] = *(uint4*)(&xs.hash[0]); + lastword[si[i]] = tt[i]; + hr[i] = xst.hash[0] & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + } + } + } +} + + +template +__global__ void digit_5(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slotsmall* buck = eq->treessmall[3][bucketid]; + u32 bsize = umin(eq->edata.nslots[4][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword[si[i]] = tt[i]; + asm("bfe.u32 %0, %1, 4, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = 
ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = xors[3]; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[1]; + tt.y = xors[2]; + tt.z = xors[3]; + tt.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = tt; + } + } + } +} + + +template +__global__ void digit_6(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint2 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[5][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[2][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = *(uint2*)(&tt[i].x); + lastword2[si[i]] = tt[i].z; + asm("bfe.u32 %0, %1, 16, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + // doing this to save shared memory + int* pairs = ht_len; + __syncthreads(); + + u32 xors[3]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, 
SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + u32 pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + xors[2] = lastword2[i] ^ lastword2[k]; + if (xors[2] == 0) + continue; + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } +} + + +template +__global__ void digit_7(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[6][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[0][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint4*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].z), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + 
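+					// commit the assembled slot with a single 128-bit store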
*(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]); + + if (xors[1] == 0) + continue; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[0]; + tt.y = xors[1]; + tt.z = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + tt.w = 0; + *(uint4*)(&xs.hash[0]) = tt; + } +} + + +template +__global__ void digit_8(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + next_pair = 0; + pairs_len = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[7][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[1][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint2 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint2*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 8, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + 
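+			// xor against the second collision candidate, mirroring the
+			// ht[hr[i]][0] case above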
+				*(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]);
+
+				if (xors[1] != 0)
+				{
+					bexor = __byte_perm(xors[0], xors[1], 0x0765);
+					xorbucketid = bexor >> (12 + 8);
+					xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1);
+					if (xorslot < RB8_NSLOTS_LD)
+					{
+						slottiny &xs = eq->treestiny[0][xorbucketid][xorslot];
+						uint2 tt;
+						tt.x = xors[1];
+						tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM);
+						*(uint2*)(&xs.hash[0]) = tt;
+					}
+				}
+
+				for (int k = 2; k != pos[i]; ++k)
+				{
+					u32 pindex = atomicAdd(&pairs_len, 1);
+					if (pindex >= MAXPAIRS) break;
+					u16 prev = ht[hr[i]][k];
+					pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
+				}
+			}
+		}
+	}
+
+	__syncthreads();
+
+	// process pairs
+	u32 plen = umin(pairs_len, MAXPAIRS);
+	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
+	{
+		int pair = pairs[s];
+		u32 i = __byte_perm(pair, 0, 0x4510);
+		u32 k = __byte_perm(pair, 0, 0x4532);
+
+		*(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]);
+
+		if (xors[1] == 0)
+			continue;
+
+		bexor = __byte_perm(xors[0], xors[1], 0x0765);
+		xorbucketid = bexor >> (12 + 8);
+		xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1);
+		if (xorslot >= RB8_NSLOTS_LD) continue;
+		slottiny &xs = eq->treestiny[0][xorbucketid][xorslot];
+		uint2 tt;
+		tt.x = xors[1];
+		tt.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM);
+		*(uint2*)(&xs.hash[0]) = tt;
+	}
+}
+
+/*
+  The last-round function is similar to the previous ones, but has a different
+  ending. Warps process the final candidates, one candidate per warp. The
+  first two bidandsids (u32 values holding a bucketid and two slot ids) are
+  retrieved by lanes 0 and 16, the next four by lanes 0, 8, 16 and 24, and so
+  on, until all lanes in the warp hold bidandsids from round 4. Each thread
+  then retrieves 16 indices. While doing so, the indices are fed through
+  atomicExch comparisons to detect duplicates (tromp's method). At the end, if
+  no duplicates were found, the candidate solution (all indices) is saved.
+  Note that this duplicate check is not exact, so CPU-side duplicate checking
+  is still required afterwards.
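+
+  Added note on the duplicate check (an interpretation of the code below, not
+  original solver documentation): each index is hashed into a small per-warp
+  scratch table by its low DUPBITS bits via atomicExch; only an exact repeat
+  of the same index returns a matching occupant, and __any() then discards
+  the warp's whole candidate. Distinct indices that collide on the low bits
+  evict each other, so a later true duplicate can go unnoticed - hence the
+  CPU-side recheck with duped().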
+*/ +template +__global__ void digit_last_wdc(equi* eq) +{ + __shared__ u8 shared_data[8192]; + int* ht_len = (int*)(&shared_data[0]); + int* pairs = ht_len; + u32* lastword = (u32*)(&shared_data[256 * 4]); + u16* ht = (u16*)(&shared_data[256 * 4 + RB8_NSLOTS_LD * 4]); + u32* pairs_len = (u32*)(&shared_data[8188]); + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + #pragma unroll + for (u32 i = 0; i < FCT; i++) + ht_len[(i * (256 / FCT)) + threadid] = 0; + + if (threadid == ((256 / FCT) - 1)) + *pairs_len = 0; + + slottiny* buck = eq->treestiny[0][bucketid]; + u32 bsize = umin(eq->edata.nslots8[bucketid], RB8_NSLOTS_LD); + + u32 si[3 * FCT]; + u32 hr[3 * FCT]; + int pos[3 * FCT]; + u32 lw[3 * FCT]; + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + pos[i] = SSM; + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + si[i] = i * (256 / FCT) + threadid; + if (si[i] >= bsize) break; + + const slottiny* pslot1 = buck + si[i]; + + // get xhash + uint2 tt = *(uint2*)(&pslot1->hash[0]); + lw[i] = tt.x; + lastword[si[i]] = lw[i]; + + u32 a; + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(a) : "r"(lw[i])); + hr[i] = a; + + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) + ht[hr[i] * (SSM - 1) + pos[i]] = si[i]; + } + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + if (pos[i] >= SSM) continue; + + for (int k = 0; k != pos[i]; ++k) + { + u16 prev = ht[hr[i] * (SSM - 1) + k]; + if (lw[i] != lastword[prev]) continue; + u32 pindex = atomicAdd(pairs_len, 1); + if (pindex >= MAXPAIRS) break; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + + __syncthreads(); + u32 plen = umin(*pairs_len, 64); + +#define CALC_LEVEL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + u32* bucks = eq->round4bidandsids[PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1]; \ + levels[c] = bucks[slot0]; \ + } + +#define CALC_LEVEL_SMALL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + slotsmall* bucks = eq->treessmall[a][PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1].hash[d]; \ + levels[c] = bucks[slot0].hash[d]; \ + } + + u32 lane = threadIdx.x & 0x1f; + u32 par = threadIdx.x >> 5; + + u32* levels = (u32*)&pairs[MAXPAIRS + (par << DUPBITS)]; + u32* susp = levels; + + while (par < plen) + { + int pair = pairs[par]; + par += W; + + if (lane % 16 == 0) + { + u32 plvl; + if (lane == 0) plvl = buck[__byte_perm(pair, 0, 0x4510)].hash[1]; + else plvl = buck[__byte_perm(pair, 0, 0x4532)].hash[1]; + slotsmall* bucks = eq->treessmall[1][PACKER::get_bucketid(plvl, RB, SM)]; + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); + levels[lane] = bucks[slot1].hash[2]; + levels[lane + 8] = bucks[slot0].hash[2]; + } + + if (lane % 8 == 0) + CALC_LEVEL_SMALL(0, lane, lane + 4, 3); + + if (lane % 4 == 0) + CALC_LEVEL_SMALL(2, lane, lane + 2, 3); + + if (lane % 2 == 0) + CALC_LEVEL(0, lane, lane + 1, 4); + + u32 ind[16]; + + u32 f1 = levels[lane]; + const slottiny* buck_v4 = &eq->round3trees[PACKER::get_bucketid(f1, RB, SM)].treestiny[0]; + const u32 slot1_v4 = PACKER::get_slot1(f1, RB, SM); + const u32 slot0_v4 = PACKER::get_slot0(f1, slot1_v4, RB, SM); + + susp[lane] = 0xffffffff; + susp[32 + lane] = 0xffffffff; + +#define 
CHECK_DUP(a) \ + __any(atomicExch(&susp[(ind[a] & ((1 << DUPBITS) - 1))], (ind[a] >> DUPBITS)) == (ind[a] >> DUPBITS)) + + u32 f2 = buck_v4[slot1_v4].hash[1]; + const slottiny* buck_v3_1 = &eq->round2trees[PACKER::get_bucketid(f2, RB, SM)].treestiny[0]; + const u32 slot1_v3_1 = PACKER::get_slot1(f2, RB, SM); + const u32 slot0_v3_1 = PACKER::get_slot0(f2, slot1_v3_1, RB, SM); + + susp[64 + lane] = 0xffffffff; + susp[96 + lane] = 0xffffffff; + + u32 f0 = buck_v3_1[slot1_v3_1].hash[1]; + const slot* buck_v2_1 = eq->trees[0][PACKER::get_bucketid(f0, RB, SM)]; + const u32 slot1_v2_1 = PACKER::get_slot1(f0, RB, SM); + const u32 slot0_v2_1 = PACKER::get_slot0(f0, slot1_v2_1, RB, SM); + + susp[128 + lane] = 0xffffffff; + susp[160 + lane] = 0xffffffff; + + u32 f3 = buck_v2_1[slot1_v2_1].hash[6]; + const slot* buck_fin_1 = eq->round0trees[packer_default::get_bucketid(f3, 8, RB8_NSLOTS)]; + const u32 slot1_fin_1 = packer_default::get_slot1(f3, 8, RB8_NSLOTS); + const u32 slot0_fin_1 = packer_default::get_slot0(f3, slot1_fin_1, 8, RB8_NSLOTS); + + susp[192 + lane] = 0xffffffff; + susp[224 + lane] = 0xffffffff; + + ind[0] = buck_fin_1[slot1_fin_1].hash[7]; + if (CHECK_DUP(0)) continue; + ind[1] = buck_fin_1[slot0_fin_1].hash[7]; + if (CHECK_DUP(1)) continue; + + u32 f4 = buck_v2_1[slot0_v2_1].hash[6]; + const slot* buck_fin_2 = eq->round0trees[packer_default::get_bucketid(f4, 8, RB8_NSLOTS)]; + const u32 slot1_fin_2 = packer_default::get_slot1(f4, 8, RB8_NSLOTS); + const u32 slot0_fin_2 = packer_default::get_slot0(f4, slot1_fin_2, 8, RB8_NSLOTS); + + ind[2] = buck_fin_2[slot1_fin_2].hash[7]; + if (CHECK_DUP(2)) continue; + ind[3] = buck_fin_2[slot0_fin_2].hash[7]; + if (CHECK_DUP(3)) continue; + + u32 f5 = buck_v3_1[slot0_v3_1].hash[1]; + const slot* buck_v2_2 = eq->trees[0][PACKER::get_bucketid(f5, RB, SM)]; + const u32 slot1_v2_2 = PACKER::get_slot1(f5, RB, SM); + const u32 slot0_v2_2 = PACKER::get_slot0(f5, slot1_v2_2, RB, SM); + + u32 f6 = buck_v2_2[slot1_v2_2].hash[6]; + const slot* buck_fin_3 = eq->round0trees[packer_default::get_bucketid(f6, 8, RB8_NSLOTS)]; + const u32 slot1_fin_3 = packer_default::get_slot1(f6, 8, RB8_NSLOTS); + const u32 slot0_fin_3 = packer_default::get_slot0(f6, slot1_fin_3, 8, RB8_NSLOTS); + + ind[4] = buck_fin_3[slot1_fin_3].hash[7]; + if (CHECK_DUP(4)) continue; + ind[5] = buck_fin_3[slot0_fin_3].hash[7]; + if (CHECK_DUP(5)) continue; + + u32 f7 = buck_v2_2[slot0_v2_2].hash[6]; + const slot* buck_fin_4 = eq->round0trees[packer_default::get_bucketid(f7, 8, RB8_NSLOTS)]; + const u32 slot1_fin_4 = packer_default::get_slot1(f7, 8, RB8_NSLOTS); + const u32 slot0_fin_4 = packer_default::get_slot0(f7, slot1_fin_4, 8, RB8_NSLOTS); + + ind[6] = buck_fin_4[slot1_fin_4].hash[7]; + if (CHECK_DUP(6)) continue; + ind[7] = buck_fin_4[slot0_fin_4].hash[7]; + if (CHECK_DUP(7)) continue; + + u32 f8 = buck_v4[slot0_v4].hash[1]; + const slottiny* buck_v3_2 = &eq->round2trees[PACKER::get_bucketid(f8, RB, SM)].treestiny[0]; + const u32 slot1_v3_2 = PACKER::get_slot1(f8, RB, SM); + const u32 slot0_v3_2 = PACKER::get_slot0(f8, slot1_v3_2, RB, SM); + + u32 f9 = buck_v3_2[slot1_v3_2].hash[1]; + const slot* buck_v2_3 = eq->trees[0][PACKER::get_bucketid(f9, RB, SM)]; + const u32 slot1_v2_3 = PACKER::get_slot1(f9, RB, SM); + const u32 slot0_v2_3 = PACKER::get_slot0(f9, slot1_v2_3, RB, SM); + + u32 f10 = buck_v2_3[slot1_v2_3].hash[6]; + const slot* buck_fin_5 = eq->round0trees[packer_default::get_bucketid(f10, 8, RB8_NSLOTS)]; + const u32 slot1_fin_5 = packer_default::get_slot1(f10, 8, RB8_NSLOTS); + 
const u32 slot0_fin_5 = packer_default::get_slot0(f10, slot1_fin_5, 8, RB8_NSLOTS); + + ind[8] = buck_fin_5[slot1_fin_5].hash[7]; + if (CHECK_DUP(8)) continue; + ind[9] = buck_fin_5[slot0_fin_5].hash[7]; + if (CHECK_DUP(9)) continue; + + u32 f11 = buck_v2_3[slot0_v2_3].hash[6]; + const slot* buck_fin_6 = eq->round0trees[packer_default::get_bucketid(f11, 8, RB8_NSLOTS)]; + const u32 slot1_fin_6 = packer_default::get_slot1(f11, 8, RB8_NSLOTS); + const u32 slot0_fin_6 = packer_default::get_slot0(f11, slot1_fin_6, 8, RB8_NSLOTS); + + ind[10] = buck_fin_6[slot1_fin_6].hash[7]; + if (CHECK_DUP(10)) continue; + ind[11] = buck_fin_6[slot0_fin_6].hash[7]; + if (CHECK_DUP(11)) continue; + + u32 f12 = buck_v3_2[slot0_v3_2].hash[1]; + const slot* buck_v2_4 = eq->trees[0][PACKER::get_bucketid(f12, RB, SM)]; + const u32 slot1_v2_4 = PACKER::get_slot1(f12, RB, SM); + const u32 slot0_v2_4 = PACKER::get_slot0(f12, slot1_v2_4, RB, SM); + + u32 f13 = buck_v2_4[slot1_v2_4].hash[6]; + const slot* buck_fin_7 = eq->round0trees[packer_default::get_bucketid(f13, 8, RB8_NSLOTS)]; + const u32 slot1_fin_7 = packer_default::get_slot1(f13, 8, RB8_NSLOTS); + const u32 slot0_fin_7 = packer_default::get_slot0(f13, slot1_fin_7, 8, RB8_NSLOTS); + + ind[12] = buck_fin_7[slot1_fin_7].hash[7]; + if (CHECK_DUP(12)) continue; + ind[13] = buck_fin_7[slot0_fin_7].hash[7]; + if (CHECK_DUP(13)) continue; + + u32 f14 = buck_v2_4[slot0_v2_4].hash[6]; + const slot* buck_fin_8 = eq->round0trees[packer_default::get_bucketid(f14, 8, RB8_NSLOTS)]; + const u32 slot1_fin_8 = packer_default::get_slot1(f14, 8, RB8_NSLOTS); + const u32 slot0_fin_8 = packer_default::get_slot0(f14, slot1_fin_8, 8, RB8_NSLOTS); + + ind[14] = buck_fin_8[slot1_fin_8].hash[7]; + if (CHECK_DUP(14)) continue; + ind[15] = buck_fin_8[slot0_fin_8].hash[7]; + if (CHECK_DUP(15)) continue; + + u32 soli; + if (lane == 0) { + soli = atomicAdd(&eq->edata.srealcont.nsols, 1); + } +#if __CUDA_ARCH__ >= 300 + // useful ? 
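+		// broadcast lane 0's solution slot index to every lane in the warp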
+		soli = __shfl(soli, 0);
+#else
+		__syncthreads();
+#endif
+		if (soli < MAXREALSOLS)
+		{
+			u32 pos = lane << 4;
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos     ]) = *(uint4*)(&ind[ 0]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos +  4]) = *(uint4*)(&ind[ 4]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos +  8]) = *(uint4*)(&ind[ 8]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 12]) = *(uint4*)(&ind[12]);
+		}
+	}
+}
+
+//std::mutex dev_init;
+int dev_init_done[MAX_GPUS] = { 0 };
+
+__host__
+static int compu32(const void *pa, const void *pb)
+{
+	uint32_t a = *(uint32_t *)pa, b = *(uint32_t *)pb;
+	return a<b ? -1 : a==b ? 0 : +1;
+}
+
+__host__
+static bool duped(uint32_t* prf)
+{
+	uint32_t sortprf[512];
+	memcpy(sortprf, prf, sizeof(uint32_t) * 512);
+	qsort(sortprf, 512, sizeof(uint32_t), &compu32);
+	for (uint32_t i = 1; i < 512; i++) {
+		if (sortprf[i] <= sortprf[i - 1])
+			return true;
+	}
+	return false;
+}
+
+__host__
+static void sort_pair(uint32_t *a, uint32_t len)
+{
+	uint32_t *b = a + len;
+	uint32_t tmp, need_sorting = 0;
+	for (uint32_t i = 0; i < len; i++)
+	{
+		if (need_sorting || a[i] > b[i])
+		{
+			need_sorting = 1;
+			tmp = a[i];
+			a[i] = b[i];
+			b[i] = tmp;
+		}
+		else if (a[i] < b[i])
+			return;
+	}
+}
+
+__host__
+static void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen)
+{
+	uint32_t le_N = WN;
+	uint32_t le_K = WK;
+	uchar personal[] = "ZcashPoW01230123";
+	memcpy(personal + 8, &le_N, 4);
+	memcpy(personal + 12, &le_K, 4);
+	blake2b_param P[1];
+	P->digest_length = HASHOUT;
+	P->key_length = 0;
+	P->fanout = 1;
+	P->depth = 1;
+	P->leaf_length = 0;
+	P->node_offset = 0;
+	P->node_depth = 0;
+	P->inner_length = 0;
+	memset(P->reserved, 0, sizeof(P->reserved));
+	memset(P->salt, 0, sizeof(P->salt));
+	memcpy(P->personal, (const uint8_t *)personal, 16);
+	eq_blake2b_init_param(ctx, P);
+	eq_blake2b_update(ctx, (const uchar *)header, headerLen);
+	if (nonceLen) eq_blake2b_update(ctx, (const uchar *)nce, nonceLen);
+}
+
+#ifdef WIN32
+typedef CUresult(CUDAAPI *dec_cuDeviceGet)(CUdevice*, int);
+typedef CUresult(CUDAAPI *dec_cuCtxCreate)(CUcontext*, unsigned int, CUdevice);
+typedef CUresult(CUDAAPI *dec_cuCtxPushCurrent)(CUcontext);
+typedef CUresult(CUDAAPI *dec_cuCtxDestroy)(CUcontext);
+
+dec_cuDeviceGet _cuDeviceGet = nullptr;
+dec_cuCtxCreate _cuCtxCreate = nullptr;
+dec_cuCtxPushCurrent _cuCtxPushCurrent = nullptr;
+dec_cuCtxDestroy _cuCtxDestroy = nullptr;
+#endif
+
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__ eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::eq_cuda_context(int thr_id, int dev_id)
+{
+	thread_id = thr_id;
+	device_id = dev_id;
+	solutions = nullptr;
+	equi_mem_sz = sizeof(equi<RB, SM>);
+	throughput = NBLOCKS;
+	totalblocks = NBLOCKS/FD_THREADS;
+	threadsperblock = FD_THREADS;
+	threadsperblock_digits = THREADS;
+
+	//dev_init.lock();
+	if (!dev_init_done[device_id])
+	{
+		// only first thread shall init device
+		checkCudaErrors(cudaSetDevice(device_id));
+		checkCudaErrors(cudaDeviceReset());
+		checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+
+		pctx = nullptr;
+	}
+	else
+	{
+		// create new context
+		CUdevice dev;
+
+#ifdef WIN32
+		if (_cuDeviceGet == nullptr)
+		{
+			HMODULE hmod = LoadLibraryA("nvcuda.dll");
+			if (hmod == NULL)
+				throw std::runtime_error("Failed to load nvcuda.dll");
+			_cuDeviceGet = (dec_cuDeviceGet)GetProcAddress(hmod, "cuDeviceGet");
+			if (_cuDeviceGet == nullptr)
+				throw std::runtime_error("Failed to get cuDeviceGet address");
+			_cuCtxCreate = (dec_cuCtxCreate)GetProcAddress(hmod, "cuCtxCreate_v2");
+			if (_cuCtxCreate == nullptr)
+				throw std::runtime_error("Failed to get cuCtxCreate address");
+			_cuCtxPushCurrent = (dec_cuCtxPushCurrent)GetProcAddress(hmod, "cuCtxPushCurrent_v2");
+			if (_cuCtxPushCurrent == nullptr)
+				throw std::runtime_error("Failed to get cuCtxPushCurrent address");
+			_cuCtxDestroy = (dec_cuCtxDestroy)GetProcAddress(hmod, "cuCtxDestroy_v2");
+			if (_cuCtxDestroy == nullptr)
+				throw std::runtime_error("Failed to get cuCtxDestroy address");
+		}
+
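+		// every extra mining thread gets its own driver context, created
+		// through the dynamically resolved *_v2 entry points above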
+		checkCudaDriverErrors(_cuDeviceGet(&dev, device_id));
+		checkCudaDriverErrors(_cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev));
+		checkCudaDriverErrors(_cuCtxPushCurrent(pctx));
+#else
+		checkCudaDriverErrors(cuDeviceGet(&dev, device_id));
+		checkCudaDriverErrors(cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev));
+		checkCudaDriverErrors(cuCtxPushCurrent(pctx));
+#endif
+	}
+	++dev_init_done[device_id];
+	//dev_init.unlock();
+
+	if (cudaMalloc((void**)&device_eq, equi_mem_sz) != cudaSuccess)
+		throw std::runtime_error("CUDA: failed to alloc memory");
+
+	solutions = (scontainerreal*) malloc(sizeof(scontainerreal));
+	if (!solutions)
+		throw std::runtime_error("EOM: failed to alloc solutions memory");
+}
+
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__ void eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::solve(const char *tequihash_header,
+	unsigned int tequihash_header_len,
+	const char* nonce,
+	unsigned int nonce_len,
+	fn_cancel cancelf,
+	fn_solution solutionf,
+	fn_hashdone hashdonef)
+{
+	blake2b_state blake_ctx;
+
+	int blocks = NBUCKETS;
+
+	setheader(&blake_ctx, tequihash_header, tequihash_header_len, nonce, nonce_len);
+
+	// todo: improve
+	// the djezo solver allows the last 4 bytes of the nonce to be iterated;
+	// this can be used to create an internal loop - compute the initial blake
+	// hash only once, load the 8*8 bytes of blake state h on the device once,
+	// then just iterate nn++
+	// less CPU load, one cudaMemcpy less -> faster
+	//u32 nn = *(u32*)&nonce[28];
+	u32 nn = 0;
+
+	checkCudaErrors(cudaMemcpy(&device_eq->blake_h, &blake_ctx.h, sizeof(u64) * 8, cudaMemcpyHostToDevice));
+
+	checkCudaErrors(cudaMemset(&device_eq->edata, 0, sizeof(device_eq->edata)));
+
+	digit_first <<<totalblocks, threadsperblock>>>(device_eq, nn);
+
+	digit_1 <<<4096, 512>>>(device_eq);
+	digit_2 <<<blocks, threadsperblock_digits>>>(device_eq);
+	digit_3 <<<blocks, threadsperblock_digits>>>(device_eq);
+
+	if (cancelf(thread_id)) return;
+
+	digit_4 <<<blocks, threadsperblock_digits>>>(device_eq);
+	digit_5 <<<blocks, threadsperblock_digits>>>(device_eq);
+
+	digit_6 <<<blocks, NRESTS>>>(device_eq);
+	digit_7 <<<blocks, NRESTS>>>(device_eq);
+	digit_8 <<<blocks, NRESTS>>>(device_eq);
+
+	digit_last_wdc <<<4096, 256 / 2>>>(device_eq);
+
+	checkCudaErrors(cudaMemcpy(solutions, &device_eq->edata.srealcont, (MAXREALSOLS * (512 * 4)) + 4, cudaMemcpyDeviceToHost));
+
+	//printf("T%d nsols: %u\n", thread_id, solutions->nsols);
+	//if (solutions->nsols > 9)
+	//	printf("missing sol, total: %u\n", solutions->nsols);
+
+	for (u32 s = 0; (s < solutions->nsols) && (s < MAXREALSOLS); s++)
+	{
+		// remove dups on CPU (dup removal on GPU is not fully exact and can pass on some invalid solutions)
+		if (duped(solutions->sols[s])) continue;
+
+		// perform sort of pairs
+		for (uint32_t level = 0; level < 9; level++)
+			for (uint32_t i = 0; i < (1 << 9); i += (2 << level))
+				sort_pair(&solutions->sols[s][i], 1 << level);
+
+		std::vector<uint32_t> index_vector(PROOFSIZE);
+		for (u32 i = 0; i < PROOFSIZE; i++) {
+			index_vector[i] = solutions->sols[s][i];
+		}
+
+		solutionf(thread_id, index_vector, DIGITBITS, nullptr);
+	}
+
+	// ccminer: only use hashdonef if no solutions...
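+	// (each valid solution was already reported through solutionf above)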
+	if (!solutions->nsols)
+		hashdonef(thread_id);
+}
+
+// destructor
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__
+eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::~eq_cuda_context()
+{
+	if (solutions)
+		free(solutions);
+
+	if (device_eq) {
+		cudaFree(device_eq);
+		device_eq = NULL;
+	}
+
+	if (pctx) {
+		// non primary thread, destroy context
+#ifdef WIN32
+		checkCudaDriverErrors(_cuCtxDestroy(pctx));
+#else
+		checkCudaDriverErrors(cuCtxDestroy(pctx));
+#endif
+	} else {
+		checkCudaErrors(cudaDeviceReset());
+		dev_init_done[device_id] = 0;
+	}
+}
+
+
+#ifdef CONFIG_MODE_1
+template class eq_cuda_context<CONFIG_MODE_1>;
+#endif
+
+#ifdef CONFIG_MODE_2
+template class eq_cuda_context<CONFIG_MODE_2>;
+#endif
+
+#ifdef CONFIG_MODE_3
+template class eq_cuda_context<CONFIG_MODE_3>;
+#endif
diff --git a/equi/eqcuda.hpp b/equi/eqcuda.hpp
new file mode 100644
index 0000000..68bdaf0
--- /dev/null
+++ b/equi/eqcuda.hpp
@@ -0,0 +1,136 @@
+#pragma once
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdexcept>
+#include <functional>
+#include <vector>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <device_launch_parameters.h>
+
+#ifdef WIN32
+#define _SNPRINTF _snprintf
+#else
+#define _SNPRINTF snprintf
+#endif
+
+#ifndef nullptr
+#define nullptr NULL
+#endif
+
+#ifdef WIN32
+#define rt_error std::runtime_error
+#else
+class rt_error : public std::runtime_error
+{
+public:
+	explicit rt_error(const std::string& str) : std::runtime_error(str) {}
+};
+#endif
+
+#define checkCudaErrors(call) \
+do { \
+	cudaError_t err = call; \
+	if (cudaSuccess != err) { \
+		char errorBuff[512]; \
+		_SNPRINTF(errorBuff, sizeof(errorBuff) - 1, \
+			"CUDA error '%s' in func '%s' line %d", \
+			cudaGetErrorString(err), __FUNCTION__, __LINE__); \
+		throw rt_error(errorBuff); \
+	} \
+} while (0)
+
+#define checkCudaDriverErrors(call) \
+do { \
+	CUresult err = call; \
+	if (CUDA_SUCCESS != err) { \
+		char errorBuff[512]; \
+		_SNPRINTF(errorBuff, sizeof(errorBuff) - 1, \
+			"CUDA error DRIVER: '%d' in func '%s' line %d", \
+			err, __FUNCTION__, __LINE__); \
+		throw rt_error(errorBuff); \
+	} \
+} while (0)
+
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+typedef unsigned char uchar;
+
+struct packer_default;
+struct packer_cantor;
+
+#define MAXREALSOLS 9
+
+struct scontainerreal {
+	u32 sols[MAXREALSOLS][512];
+	u32 nsols;
+};
+
+#if 0
+#include <functional>
+#define fn_solution std::function<void(int, const std::vector<uint32_t>&, size_t, const unsigned char*)>
+#define fn_hashdone std::function<void(int)>
+#define fn_cancel std::function<bool(int)>
+#else
+typedef void (*fn_solution)(int thr_id, const std::vector<uint32_t>&, size_t, const unsigned char*);
+typedef void (*fn_hashdone)(int thr_id);
+typedef bool (*fn_cancel)(int thr_id);
+#endif
+
+template <u32 RB, u32 SM> struct equi;
+
+// ---------------------------------------------------------------------------------------------------
+
+struct eq_cuda_context_interface
+{
+	virtual ~eq_cuda_context_interface();
+
+	virtual void solve(const char *tequihash_header,
+		unsigned int tequihash_header_len,
+		const char* nonce,
+		unsigned int nonce_len,
+		fn_cancel cancelf,
+		fn_solution solutionf,
+		fn_hashdone hashdonef);
+public:
+	int thread_id;
+	int device_id;
+	int throughput;
+	int totalblocks;
+	int threadsperblock;
+	int threadsperblock_digits;
+	size_t equi_mem_sz;
+};
+
+// ---------------------------------------------------------------------------------------------------
+
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+class eq_cuda_context : public eq_cuda_context_interface
+{
+	equi<RB, SM>* device_eq;
+	scontainerreal* solutions;
+	CUcontext pctx;
+
+	void solve(const char *tequihash_header,
+		unsigned int tequihash_header_len,
+		const char* nonce,
+		unsigned int nonce_len,
+		fn_cancel cancelf,
+		fn_solution solutionf,
+		fn_hashdone hashdonef);
+
+public:
+	eq_cuda_context(int thr_id, int dev_id);
+	~eq_cuda_context();
+};
+
+// RB, SM, SSM, TPB, PACKER... but changing the config only here will fail..
+#define CONFIG_MODE_1	9, 1248, 12, 640, packer_cantor
+//#define CONFIG_MODE_2	8, 640, 12, 512, packer_default
diff --git a/equi/equi-stratum.cpp b/equi/equi-stratum.cpp
new file mode 100644
index 0000000..c9f6587
--- /dev/null
+++ b/equi/equi-stratum.cpp
@@ -0,0 +1,241 @@
+/**
+ * Equihash specific stratum protocol
+ * tpruvot@github - 2017 - Part under GPLv3 Licence
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <miner.h>
+
+#include "equihash.h"
+
+extern struct stratum_ctx stratum;
+extern pthread_mutex_t stratum_work_lock;
+
+// ZEC uses a different scale to compute diff...
+// sample targets to diff (stored in the reverse byte order in work->target)
+// 0007fff800000000000000000000000000000000000000000000000000000000 is stratum diff 32
+// 003fffc000000000000000000000000000000000000000000000000000000000 is stratum diff 4
+// 00ffff0000000000000000000000000000000000000000000000000000000000 is stratum diff 1
+double target_to_diff_equi(uint32_t* target)
+{
+	uchar* tgt = (uchar*) target;
+	uint64_t m =
+		(uint64_t)tgt[30] << 24 |
+		(uint64_t)tgt[29] << 16 |
+		(uint64_t)tgt[28] << 8  |
+		(uint64_t)tgt[27] << 0;
+
+	if (!m)
+		return 0.;
+	else
+		return (double)0xffff0000UL/m;
+}
+
+void diff_to_target_equi(uint32_t *target, double diff)
+{
+	uint64_t m;
+	int k;
+
+	for (k = 6; k > 0 && diff > 1.0; k--)
+		diff /= 4294967296.0;
+	m = (uint64_t)(4294901760.0 / diff);
+	if (m == 0 && k == 6)
+		memset(target, 0xff, 32);
+	else {
+		memset(target, 0, 32);
+		target[k + 1] = (uint32_t)(m >> 8);
+		target[k + 2] = (uint32_t)(m >> 40);
+		//memset(target, 0xff, 6*sizeof(uint32_t));
+		for (k = 0; k < 28 && ((uint8_t*)target)[k] == 0; k++)
+			((uint8_t*)target)[k] = 0xff;
+	}
+}
+
+/* compute nbits to get the network diff */
+double equi_network_diff(struct work *work)
+{
+	//KMD bits: "1e 015971",
+	//KMD target: "00 00 015971000000000000000000000000000000000000000000000000000000",
+	//KMD bits: "1d 686aaf",
+	//KMD target: "00 0000 686aaf0000000000000000000000000000000000000000000000000000",
+	uint32_t nbits = work->data[26];
+	uint32_t bits = (nbits & 0xffffff);
+	int16_t shift = (swab32(nbits) & 0xff);
+	shift = (31 - shift) * 8; // 8 bits shift for 0x1e, 16 for 0x1d
+	uint64_t tgt64 = swab32(bits);
+	tgt64 = tgt64 << shift;
+	// applog_hex(&tgt64, 8);
+	uint8_t net_target[32] = { 0 };
+	for (int b=0; b<8; b++)
+		net_target[31-b] = ((uint8_t*)&tgt64)[b];
+	// applog_hex(net_target, 32);
+	double d = target_to_diff_equi((uint32_t*)net_target);
+	return d;
+}
+
+void equi_work_set_target(struct work* work, double diff)
+{
+	// target is given as data by the equihash stratum
+	// memcpy(work->target, stratum.job.claim, 32); // claim field is only used for lbry
+	diff_to_target_equi(work->target, diff);
+	//applog(LOG_BLUE, "diff %f to target :", diff);
+	//applog_hex(work->target, 32);
+	work->targetdiff = diff;
+}
+
+bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params)
+{
+	uint8_t target_bin[32], target_be[32];
+
+	const char *target_hex = json_string_value(json_array_get(params, 0));
+	if (!target_hex || strlen(target_hex) == 0)
+		return false;
+
+	hex2bin(target_bin, target_hex, 32);
+	memset(target_be, 0xff, 32);
+	int filled = 0;
+	for (int i=0; i<32; i++) {
+		if (filled == 3) break;
+		target_be[31-i] = target_bin[i];
+		if (target_bin[i]) filled++;
+	}
+	memcpy(sctx->job.claim, target_be, 32); // hack, unused
struct field + + pthread_mutex_lock(&stratum_work_lock); + sctx->next_diff = target_to_diff_equi((uint32_t*) &target_be); + pthread_mutex_unlock(&stratum_work_lock); + + //applog(LOG_BLUE, "low diff %f", sctx->next_diff); + //applog_hex(target_be, 32); + + return true; +} + +bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params) +{ + const char *job_id, *version, *prevhash, *coinb1, *coinb2, *nbits, *stime; + size_t coinb1_size, coinb2_size; + bool clean, ret = false; + int ntime, i, p=0; + job_id = json_string_value(json_array_get(params, p++)); + version = json_string_value(json_array_get(params, p++)); + prevhash = json_string_value(json_array_get(params, p++)); + coinb1 = json_string_value(json_array_get(params, p++)); //merkle + coinb2 = json_string_value(json_array_get(params, p++)); //blank (reserved) + stime = json_string_value(json_array_get(params, p++)); + nbits = json_string_value(json_array_get(params, p++)); + clean = json_is_true(json_array_get(params, p)); p++; + + if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || + strlen(prevhash) != 64 || strlen(version) != 8 || + strlen(coinb1) != 64 || strlen(coinb2) != 64 || + strlen(nbits) != 8 || strlen(stime) != 8) { + applog(LOG_ERR, "Stratum notify: invalid parameters"); + goto out; + } + + /* store stratum server time diff */ + hex2bin((uchar *)&ntime, stime, 4); + ntime = ntime - (int) time(0); + if (ntime > sctx->srvtime_diff) { + sctx->srvtime_diff = ntime; + if (opt_protocol && ntime > 20) + applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime); + } + + pthread_mutex_lock(&stratum_work_lock); + hex2bin(sctx->job.version, version, 4); + hex2bin(sctx->job.prevhash, prevhash, 32); + + coinb1_size = strlen(coinb1) / 2; + coinb2_size = strlen(coinb2) / 2; + sctx->job.coinbase_size = coinb1_size + coinb2_size + // merkle + reserved + sctx->xnonce1_size + sctx->xnonce2_size; // extranonce and... 
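+	// resulting layout: merkle root (32) | reserved (32) | xnonce1 | xnonce2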
+ + sctx->job.coinbase = (uchar*) realloc(sctx->job.coinbase, sctx->job.coinbase_size); + hex2bin(sctx->job.coinbase, coinb1, coinb1_size); + hex2bin(sctx->job.coinbase + coinb1_size, coinb2, coinb2_size); + + sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + coinb2_size + sctx->xnonce1_size; + if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) + memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); + memcpy(sctx->job.coinbase + coinb1_size + coinb2_size, sctx->xnonce1, sctx->xnonce1_size); + + for (i = 0; i < sctx->job.merkle_count; i++) + free(sctx->job.merkle[i]); + free(sctx->job.merkle); + sctx->job.merkle = NULL; + sctx->job.merkle_count = 0; + + free(sctx->job.job_id); + sctx->job.job_id = strdup(job_id); + + hex2bin(sctx->job.nbits, nbits, 4); + hex2bin(sctx->job.ntime, stime, 4); + sctx->job.clean = clean; + + sctx->job.diff = sctx->next_diff; + pthread_mutex_unlock(&stratum_work_lock); + + ret = true; + +out: + return ret; +} + +void equi_store_work_solution(struct work* work, uint32_t* hash, void* sol_data) +{ + int nonce = work->valid_nonces-1; + memcpy(work->extra, sol_data, 1347); + bn_store_hash_target_ratio(hash, work->target, work, nonce); + //work->sharediff[nonce] = target_to_diff_equi(hash); +} + +#define JSON_SUBMIT_BUF_LEN (4*1024) +// called by submit_upstream_work() +bool equi_stratum_submit(struct pool_infos *pool, struct work *work) +{ + char _ALIGN(64) s[JSON_SUBMIT_BUF_LEN]; + char _ALIGN(64) timehex[16] = { 0 }; + char *jobid, *noncestr, *solhex; + int idnonce = work->submit_nonce_id; + + // scanned nonce + work->data[EQNONCE_OFFSET] = work->nonces[idnonce]; + unsigned char * nonce = (unsigned char*) (&work->data[27]); + size_t nonce_len = 32 - stratum.xnonce1_size; + // long nonce without pool prefix (extranonce) + noncestr = bin2hex(&nonce[stratum.xnonce1_size], nonce_len); + + solhex = (char*) calloc(1, 1344*2 + 64); + if (!solhex || !noncestr) { + applog(LOG_ERR, "unable to alloc share memory"); + return false; + } + cbin2hex(solhex, (const char*) work->extra, 1347); + + jobid = work->job_id + 8; + sprintf(timehex, "%08x", swab32(work->data[25])); + + snprintf(s, sizeof(s), "{\"method\":\"mining.submit\",\"params\":" + "[\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"], \"id\":%u}", + pool->user, jobid, timehex, noncestr, solhex, + stratum.job.shares_count + 10); + + free(solhex); + free(noncestr); + + gettimeofday(&stratum.tv_submit, NULL); + + if(!stratum_send_line(&stratum, s)) { + applog(LOG_ERR, "%s stratum_send_line failed", __func__); + return false; + } + + stratum.sharediff = work->sharediff[idnonce]; + stratum.job.shares_count++; + + return true; +} diff --git a/equi/equi.cpp b/equi/equi.cpp new file mode 100644 index 0000000..0a1d919 --- /dev/null +++ b/equi/equi.cpp @@ -0,0 +1,171 @@ +/* + * Port to Generic C of C++ implementation of the Equihash Proof-of-Work + * algorithm from zcashd. + * + * Copyright (c) 2016 abc at openwall dot com + * Copyright (c) 2016 Jack Grigg + * Copyright (c) 2016 The Zcash developers + * Copyright (c) 2017 tpruvot + * + * Distributed under the MIT software license, see the accompanying + * file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "equihash.h"
+
+//#define USE_LIBSODIUM
+
+#ifdef USE_LIBSODIUM
+#include "sodium.h"
+#define blake2b_state crypto_generichash_blake2b_state
+#else
+#include "blake2/blake2.h"
+#define be32toh(x) swab32(x)
+#define htole32(x) (x)
+#define HASHOUT 50
+#endif
+
+#include <miner.h>
+
+static void digestInit(blake2b_state *S, const uint32_t n, const uint32_t k)
+{
+	uint32_t le_N = htole32(n);
+	uint32_t le_K = htole32(k);
+#ifdef USE_LIBSODIUM
+	uint8_t personalization[crypto_generichash_blake2b_PERSONALBYTES] = { 0 };
+
+	memcpy(personalization, "ZcashPoW", 8);
+	memcpy(personalization + 8, &le_N, 4);
+	memcpy(personalization + 12, &le_K, 4);
+
+	crypto_generichash_blake2b_init_salt_personal(S,
+		NULL, 0, (512 / n) * n / 8, NULL, personalization);
+#else
+	unsigned char personal[] = "ZcashPoW01230123";
+	memcpy(personal + 8, &le_N, 4);
+	memcpy(personal + 12, &le_K, 4);
+	blake2b_param P[1];
+	P->digest_length = HASHOUT;
+	P->key_length = 0;
+	P->fanout = 1;
+	P->depth = 1;
+	P->leaf_length = 0;
+	P->node_offset = 0;
+	P->node_depth = 0;
+	P->inner_length = 0;
+	memset(P->reserved, 0, sizeof(P->reserved));
+	memset(P->salt, 0, sizeof(P->salt));
+	memcpy(P->personal, (const uint8_t *)personal, 16);
+	eq_blake2b_init_param(S, P);
+#endif
+}
+
+static void expandArray(const unsigned char *in, const uint32_t in_len,
+	unsigned char *out, const uint32_t out_len,
+	const uint32_t bit_len, const uint32_t byte_pad)
+{
+	assert(bit_len >= 8);
+	assert(8 * sizeof(uint32_t) >= 7 + bit_len);
+
+	const uint32_t out_width = (bit_len + 7) / 8 + byte_pad;
+	assert(out_len == 8 * out_width * in_len / bit_len);
+
+	const uint32_t bit_len_mask = ((uint32_t)1 << bit_len) - 1;
+
+	// The acc_bits least-significant bits of acc_value represent a bit sequence
+	// in big-endian order.
+	uint32_t acc_bits = 0;
+	uint32_t acc_value = 0;
+	uint32_t j = 0;
+
+	for (uint32_t i = 0; i < in_len; i++)
+	{
+		acc_value = (acc_value << 8) | in[i];
+		acc_bits += 8;
+
+		// When we have bit_len or more bits in the accumulator, write the next
+		// output element.
+		if (acc_bits >= bit_len) {
+			acc_bits -= bit_len;
+			for (uint32_t x = 0; x < byte_pad; x++) {
+				out[j + x] = 0;
+			}
+			for (uint32_t x = byte_pad; x < out_width; x++) {
+				out[j + x] = (
+					// Big-endian
+					acc_value >> (acc_bits + (8 * (out_width - x - 1)))
+				) & (
+					// Apply bit_len_mask across byte boundaries
+					(bit_len_mask >> (8 * (out_width - x - 1))) & 0xFF
+				);
+			}
+			j += out_width;
+		}
+	}
+}
+
+static void generateHash(blake2b_state *S, const uint32_t g, uint8_t *hash, const size_t hashLen)
+{
+	const uint32_t le_g = htole32(g);
+	blake2b_state digest = *S; /* copy */
+#ifdef USE_LIBSODIUM
+	crypto_generichash_blake2b_update(&digest, (uint8_t *)&le_g, sizeof(le_g));
+	crypto_generichash_blake2b_final(&digest, hash, hashLen);
+#else
+	eq_blake2b_update(&digest, (const uint8_t*) &le_g, sizeof(le_g));
+	eq_blake2b_final(&digest, hash, (uint8_t) (hashLen & 0xFF));
+#endif
+}
+
+static int isZero(const uint8_t *hash, size_t len)
+{
+	// This doesn't need to be constant time.
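+	// (it only checks the public verification result, nothing secret)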
+	for (size_t i = 0; i < len; i++) {
+		if (hash[i] != 0) return 0;
+	}
+	return 1;
+}
+
+// hdr -> header including nonce (140 bytes)
+// soln -> equihash solution (excluding 3 bytes with size, so 1344 bytes length)
+bool equi_verify(uint8_t* const hdr, uint8_t* const soln)
+{
+	const uint32_t n = WN; // 200
+	const uint32_t k = WK; // 9
+	const uint32_t collisionBitLength = n / (k + 1);
+	const uint32_t collisionByteLength = (collisionBitLength + 7) / 8;
+	const uint32_t hashLength = (k + 1) * collisionByteLength;
+	const uint32_t indicesPerHashOutput = 512 / n;
+	const uint32_t hashOutput = indicesPerHashOutput * n / 8;
+	const uint32_t equihashSolutionSize = (1 << k) * (n / (k + 1) + 1) / 8;
+	const uint32_t solnr = 1 << k;
+
+	uint32_t indices[512] = { 0 };
+	uint8_t vHash[hashLength] = { 0 };
+
+	blake2b_state state;
+	digestInit(&state, n, k);
+#ifdef USE_LIBSODIUM
+	crypto_generichash_blake2b_update(&state, hdr, 140);
+#else
+	eq_blake2b_update(&state, hdr, 140);
+#endif
+
+	expandArray(soln, equihashSolutionSize, (uint8_t*) &indices, sizeof(indices), collisionBitLength + 1, 1);
+
+	for (uint32_t j = 0; j < solnr; j++) {
+		uint8_t tmpHash[hashOutput];
+		uint8_t hash[hashLength];
+		uint32_t i = be32toh(indices[j]);
+		generateHash(&state, i / indicesPerHashOutput, tmpHash, hashOutput);
+		expandArray(tmpHash + (i % indicesPerHashOutput * n / 8), n / 8, hash, hashLength, collisionBitLength, 0);
+		for (uint32_t k = 0; k < hashLength; k++)
+			vHash[k] ^= hash[k];
+	}
+	return isZero(vHash, sizeof(vHash));
+}
diff --git a/equi/equihash.cpp b/equi/equihash.cpp
new file mode 100644
index 0000000..2a6e514
--- /dev/null
+++ b/equi/equihash.cpp
@@ -0,0 +1,294 @@
+/**
+ * Equihash solver interface for ccminer (compatible with linux and windows)
+ * Solver taken from nheqminer, by djeZo (and NiceHash)
+ * tpruvot - 2017 (GPL v3)
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <vector>
+#include <stdexcept>
+
+#include <miner.h>
+
+#include "eqcuda.hpp"
+#include "equihash.h" // equi_verify()
+
+#include <sph/sph_sha2.h>
+
+// All solutions (BLOCK_HEADER_LEN + SOLSIZE_LEN + SOL_LEN) sha256d should be under the target
+extern "C" void equi_hash(const void* input, void* output, int len)
+{
+	uint8_t _ALIGN(64) hash0[32], hash1[32];
+
+	sph_sha256_context ctx_sha256;
+
+	sph_sha256_init(&ctx_sha256);
+	sph_sha256(&ctx_sha256, input, len);
+	sph_sha256_close(&ctx_sha256, hash0);
+	sph_sha256(&ctx_sha256, hash0, 32);
+	sph_sha256_close(&ctx_sha256, hash1);
+
+	memcpy(output, hash1, 32);
+}
+
+// input here is 140 for the header and 1344 for the solution (equi.cpp)
+extern "C" int equi_verify_sol(void * const hdr, void * const sol)
+{
+	bool res = equi_verify((uint8_t*) hdr, (uint8_t*) sol);
+
+	//applog_hex((void*)hdr, 140);
+	//applog_hex((void*)sol, 1344);
+
+	return res ? 1 : 0;
+}
+
+#include <cuda_helper.h>
+
+//#define EQNONCE_OFFSET 30 /* 27:34 */
+#define NONCE_OFT EQNONCE_OFFSET
+
+static bool init[MAX_GPUS] = { 0 };
+static int valid_sols[MAX_GPUS] = { 0 };
+static uint8_t _ALIGN(64) data_sols[MAX_GPUS][MAXREALSOLS][1536] = { 0 }; // 140+3+1344 required
+static eq_cuda_context_interface* solvers[MAX_GPUS] = { NULL };
+
+static void CompressArray(const unsigned char* in, size_t in_len,
+	unsigned char* out, size_t out_len, size_t bit_len, size_t byte_pad)
+{
+	assert(bit_len >= 8);
+	assert(8 * sizeof(uint32_t) >= 7 + bit_len);
+
+	size_t in_width = (bit_len + 7) / 8 + byte_pad;
+	assert(out_len == bit_len*in_len / (8 * in_width));
+
+	uint32_t bit_len_mask = (1UL << bit_len) - 1;
+
+	// The acc_bits least-significant bits of acc_value represent a bit sequence
+	// in big-endian order.
+	size_t acc_bits = 0;
+	uint32_t acc_value = 0;
+
+	size_t j = 0;
+	for (size_t i = 0; i < out_len; i++) {
+		// When we have fewer than 8 bits left in the accumulator, read the next
+		// input element.
+		if (acc_bits < 8) {
+			acc_value = acc_value << bit_len;
+			for (size_t x = byte_pad; x < in_width; x++) {
+				acc_value = acc_value | (
+					(
+						// Apply bit_len_mask across byte boundaries
+						in[j + x] & ((bit_len_mask >> (8 * (in_width - x - 1))) & 0xFF)
+					) << (8 * (in_width - x - 1))); // Big-endian
+			}
+			j += in_width;
+			acc_bits += bit_len;
+		}
+
+		acc_bits -= 8;
+		out[i] = (acc_value >> acc_bits) & 0xFF;
+	}
+}
+
+#ifndef htobe32
+#define htobe32(x) swab32(x)
+#endif
+
+static void EhIndexToArray(const u32 i, unsigned char* arr)
+{
+	u32 bei = htobe32(i);
+	memcpy(arr, &bei, sizeof(u32));
+}
+
+static std::vector<unsigned char> GetMinimalFromIndices(std::vector<u32> indices, size_t cBitLen)
+{
+	assert(((cBitLen + 1) + 7) / 8 <= sizeof(u32));
+	size_t lenIndices = indices.size()*sizeof(u32);
+	size_t minLen = (cBitLen + 1)*lenIndices / (8 * sizeof(u32));
+	size_t bytePad = sizeof(u32) - ((cBitLen + 1) + 7) / 8;
+	std::vector<unsigned char> array(lenIndices);
+	for (size_t i = 0; i < indices.size(); i++) {
+		EhIndexToArray(indices[i], array.data() + (i*sizeof(u32)));
+	}
+	std::vector<unsigned char> ret(minLen);
+	CompressArray(array.data(), lenIndices, ret.data(), minLen, cBitLen + 1, bytePad);
+	return ret;
+}
+
+// solver callbacks
+static void cb_solution(int thr_id, const std::vector<uint32_t>& solutions, size_t cbitlen, const unsigned char *compressed_sol)
+{
+	std::vector<unsigned char> nSolution;
+	if (!compressed_sol) {
+		nSolution = GetMinimalFromIndices(solutions, cbitlen);
+	} else {
+		gpulog(LOG_INFO, thr_id, "compressed_sol");
+		nSolution = std::vector<unsigned char>(1344);
+		for (size_t i = 0; i < cbitlen; i++)
+			nSolution[i] = compressed_sol[i];
+	}
+	int nsol = valid_sols[thr_id];
+	if (nsol < 0) nsol = 0;
+	if(nSolution.size() == 1344) {
+		// todo, only store solution data here...
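+		// 0x000540fd stored little-endian gives fd 40 05: the bitcoin
+		// CompactSize prefix for the 1344-byte solution (0xfd marker
+		// followed by uint16 0x0540 = 1344)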
+		le32enc(&data_sols[thr_id][nsol][140], 0x000540fd); // sol sz header
+		memcpy(&data_sols[thr_id][nsol][143], nSolution.data(), 1344);
+		valid_sols[thr_id] = nsol + 1;
+	}
+}
+static void cb_hashdone(int thr_id) {
+	if (!valid_sols[thr_id]) valid_sols[thr_id] = -1;
+}
+static bool cb_cancel(int thr_id) {
+	if (work_restart[thr_id].restart)
+		valid_sols[thr_id] = -1;
+	return work_restart[thr_id].restart;
+}
+
+extern "C" int scanhash_equihash(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t _ALIGN(64) endiandata[35];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[NONCE_OFT];
+	uint32_t nonce_increment = rand() & 0xFF; // nonce randomizer
+	struct timeval tv_start, tv_end, diff;
+	double secs, solps;
+	uint32_t soluce_count = 0;
+
+	if (opt_benchmark)
+		ptarget[7] = 0xfffff;
+
+	if (!init[thr_id]) {
+		try {
+			int mode = 1;
+			switch (mode) {
+			case 1:
+				solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_1>(thr_id, device_map[thr_id]);
+				break;
+#ifdef CONFIG_MODE_2
+			case 2:
+				solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_2>(thr_id, device_map[thr_id]);
+				break;
+#endif
+#ifdef CONFIG_MODE_3
+			case 3:
+				solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_3>(thr_id, device_map[thr_id]);
+				break;
+#endif
+			default:
+				proper_exit(EXIT_CODE_SW_INIT_ERROR);
+				return -1;
+			}
+			size_t memSz = solvers[thr_id]->equi_mem_sz / (1024*1024);
+			gpus_intensity[thr_id] = (uint32_t) solvers[thr_id]->throughput;
+			api_set_throughput(thr_id, gpus_intensity[thr_id]);
+			gpulog(LOG_DEBUG, thr_id, "Allocated %u MB of context memory", (u32) memSz);
+			cuda_get_arch(thr_id);
+			init[thr_id] = true;
+		} catch (const std::exception & e) {
+			CUDA_LOG_ERROR();
+			gpulog(LOG_ERR, thr_id, "init: %s", e.what());
+			proper_exit(EXIT_CODE_CUDA_ERROR);
+		}
+	}
+
+	gettimeofday(&tv_start, NULL);
+	memcpy(endiandata, pdata, 140);
+	work->valid_nonces = 0;
+
+	do {
+
+		try {
+
+			valid_sols[thr_id] = 0;
+			solvers[thr_id]->solve(
+				(const char *) endiandata, (unsigned int) (140 - 32),
+				(const char *) &endiandata[27], (unsigned int) 32,
+				&cb_cancel, &cb_solution, &cb_hashdone
+			);
+
+			*hashes_done = soluce_count;
+
+		} catch (const std::exception & e) {
+			gpulog(LOG_WARNING, thr_id, "solver: %s", e.what());
+			free_equihash(thr_id);
+			sleep(1);
+			return -1;
+		}
+
+		if (valid_sols[thr_id] > 0)
+		{
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			uint8_t _ALIGN(64) full_data[140+3+1344] = { 0 };
+			uint8_t* sol_data = &full_data[140];
+
+			soluce_count += valid_sols[thr_id];
+
+			for (int nsol=0; nsol < valid_sols[thr_id]; nsol++)
+			{
+				memcpy(full_data, endiandata, 140);
+				memcpy(sol_data, &data_sols[thr_id][nsol][140], 1347);
+				equi_hash(full_data, vhash, 140+3+1344);
+
+				if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
+				{
+					bool valid = equi_verify_sol(endiandata, &sol_data[3]);
+					if (valid && work->valid_nonces < MAX_NONCES) {
+						work->valid_nonces++;
+						memcpy(work->data, endiandata, 140);
+						equi_store_work_solution(work, vhash, sol_data);
+						work->nonces[work->valid_nonces-1] = endiandata[NONCE_OFT];
+						pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1;
+						//applog_hex(vhash, 32);
+						//applog_hex(&work->data[27], 32);
+						goto out; // second solution storage not handled..
+					}
+				}
+				if (work->valid_nonces == MAX_NONCES) goto out;
+			}
+			if (work->valid_nonces)
+				goto out;
+
+			valid_sols[thr_id] = 0;
+		}
+
+		endiandata[NONCE_OFT] += nonce_increment;
+
+	} while (!work_restart[thr_id].restart);
+
+out:
+	gettimeofday(&tv_end, NULL);
+	timeval_subtract(&diff, &tv_end, &tv_start);
+	secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec);
+	solps = (double)soluce_count / secs;
+	gpulog(LOG_DEBUG, thr_id, "%d solutions in %.2f s (%.2f Sol/s)", soluce_count, secs, solps);
+
+	// H/s
+	*hashes_done = soluce_count;
+
+	pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1;
+
+	return work->valid_nonces;
+}
+
+// cleanup
+void free_equihash(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	delete(solvers[thr_id]);
+	solvers[thr_id] = NULL;
+
+	init[thr_id] = false;
+}
+
+// mmm... viva c++ junk
+void eq_cuda_context_interface::solve(const char *tequihash_header, unsigned int tequihash_header_len,
+	const char* nonce, unsigned int nonce_len,
+	fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef) { }
+eq_cuda_context_interface::~eq_cuda_context_interface() { }
diff --git a/equi/equihash.h b/equi/equihash.h
new file mode 100644
index 0000000..cdf47fa
--- /dev/null
+++ b/equi/equihash.h
@@ -0,0 +1,19 @@
+#ifndef EQUIHASH_H
+#define EQUIHASH_H
+
+#include <stdint.h>
+
+// miner nonce "cursor" unique for each thread
+#define EQNONCE_OFFSET 30 /* 27:34 */
+
+#define WK 9
+#define WN 200
+//#define CONFIG_MODE_1 9, 1248, 12, 640, packer_cantor /* eqcuda.hpp */
+
+extern "C" {
+	void equi_hash(const void* input, void* output, int len);
+	int equi_verify_sol(void* const hdr, void* const soln);
+	bool equi_verify(uint8_t* const hdr, uint8_t* const soln);
+}
+
+#endif
diff --git a/miner.h b/miner.h
index f720bc7..307ff7d 100644
--- a/miner.h
+++ b/miner.h
@@ -282,6 +282,7 @@ extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonc
 extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -340,6 +341,7 @@ extern void free_cryptolight(int thr_id);
 extern void free_cryptonight(int thr_id);
 extern void free_decred(int thr_id);
 extern void free_deep(int thr_id);
+extern void free_equihash(int thr_id);
 extern void free_keccak256(int thr_id);
 extern void free_fresh(int thr_id);
 extern void free_fugue256(int thr_id);
@@ -604,6 +606,7 @@ void cuda_clear_lasterror();
 #define CL_WHT "\x1B[01;37m" /* white */
 
 extern void format_hashrate(double hashrate, char *output);
+extern void format_hashrate_unit(double hashrate, char *output, const char* unit);
 extern void applog(int prio, const char *fmt, ...);
 extern void gpulog(int prio, int thr_id, const char *fmt, ...);
 
@@ -679,6 +682,7 @@ struct stratum_ctx {
 	time_t tm_connected;
 
 	int rpc2;
+	int is_equihash;
 	int srvtime_diff;
 };
 
@@ -722,6 +726,8 @@ struct work {
 	/* pok getwork txs */
 	uint32_t tx_count;
 	struct tx txs[POK_MAX_TXS];
+
+	// zec solution
+	uint8_t extra[1388];
 };
 
 #define POK_BOOL_MASK 0x00008000
@@ -804,6 +810,14 @@ void stratum_free_job(struct stratum_ctx *sctx);
 
 bool rpc2_stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
 
+bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params);
+bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params);
+bool equi_stratum_submit(struct pool_infos *pool, struct work *work);
+void equi_work_set_target(struct work* work, double diff);
+void equi_store_work_solution(struct work* work, uint32_t* hash, void* sol_data);
+int equi_verify_sol(void * const hdr, void * const sol);
+double equi_network_diff(struct work *work);
+
 void hashlog_remember_submit(struct work* work, uint32_t nonce);
 void hashlog_remember_scan_range(struct work* work);
 double hashlog_get_sharediff(char* jobid, int idnonce, double defvalue);
diff --git a/util.cpp b/util.cpp
index 7c2878b..e15fbf6 100644
--- a/util.cpp
+++ b/util.cpp
@@ -211,35 +211,31 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0)
 #endif
 }
 
-void format_hashrate(double hashrate, char *output)
+void format_hashrate_unit(double hashrate, char *output, const char *unit)
 {
-	char prefix = '\0';
+	char prefix[2] = { 0, 0 };
 
 	if (hashrate < 10000) {
 		// nop
 	}
 	else if (hashrate < 1e7) {
-		prefix = 'k';
+		prefix[0] = 'k';
 		hashrate *= 1e-3;
 	}
 	else if (hashrate < 1e10) {
-		prefix = 'M';
+		prefix[0] = 'M';
 		hashrate *= 1e-6;
 	}
 	else if (hashrate < 1e13) {
-		prefix = 'G';
+		prefix[0] = 'G';
 		hashrate *= 1e-9;
 	}
 	else {
-		prefix = 'T';
+		prefix[0] = 'T';
 		hashrate *= 1e-12;
 	}
 
-	sprintf(
-		output,
-		prefix ? "%.2f %cH/s" : "%.2f H/s%c",
-		hashrate, prefix
-	);
+	sprintf(output, "%.2f %s%s", hashrate, prefix, unit);
 }
 
 static void databuf_free(struct data_buffer *db)
@@ -1179,14 +1175,27 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i
 	}
 	xn2_size = (int) json_integer_value(json_array_get(params, pndx+1));
 	if (!xn2_size) {
-		applog(LOG_ERR, "Failed to get extranonce2_size");
-		goto out;
+		char algo[64] = { 0 };
+		get_currentalgo(algo, sizeof(algo));
+		if (strcmp(algo, "equihash") == 0) {
+			int xn1_size = (int)strlen(xnonce1) / 2;
+			xn2_size = 32 - xn1_size;
+			if (xn1_size < 4 || xn1_size > 12) {
+				// This miner iterates the nonces at data32[30]
+				applog(LOG_ERR, "Unsupported extranonce size of %d (12 maxi)", xn1_size);
+				goto out;
+			}
+			goto skip_n2;
+		} else {
+			applog(LOG_ERR, "Failed to get extranonce2_size");
+			goto out;
+		}
 	}
 	if (xn2_size < 2 || xn2_size > 16) {
-		applog(LOG_INFO, "Failed to get valid n2size in parse_extranonce");
+		applog(LOG_ERR, "Failed to get valid n2size in parse_extranonce (%d)", xn2_size);
 		goto out;
 	}
-
+skip_n2:
 	pthread_mutex_lock(&stratum_work_lock);
 	if (sctx->xnonce1)
 		free(sctx->xnonce1);
@@ -1441,6 +1450,10 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	get_currentalgo(algo, sizeof(algo));
 	bool has_claim = !strcasecmp(algo, "lbry");
 
+	if (sctx->is_equihash) {
+		return equi_stratum_notify(sctx, params);
+	}
+
 	job_id = json_string_value(json_array_get(params, p++));
 	prevhash = json_string_value(json_array_get(params, p++));
 	if (has_claim) {
@@ -1842,6 +1855,11 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s)
 		ret = stratum_set_difficulty(sctx, params);
 		goto out;
 	}
+	if (!strcasecmp(method, "mining.set_target")) {
+		sctx->is_equihash = true;
+		ret = equi_stratum_set_target(sctx, params);
+		goto out;
+	}
 	if (!strcasecmp(method, "mining.set_extranonce")) {
 		ret = stratum_parse_extranonce(sctx, params, 0);
 		goto out;