diff --git a/Makefile.am b/Makefile.am index 307d86e..683e21d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -21,6 +21,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \ crc32.c hefty1.c \ ccminer.cpp pools.cpp util.cpp bench.cpp bignum.cpp \ api.cpp hashlog.cpp nvml.cpp stats.cpp sysinfos.cpp cuda.cpp \ + equi/equi-stratum.cpp equi/equi.cpp equi/blake2/blake2bx.cpp \ + equi/equihash.cpp equi/cuda_equi.cu \ heavy/heavy.cu \ heavy/cuda_blake512.cu heavy/cuda_blake512.h \ heavy/cuda_combine.cu heavy/cuda_combine.h \ @@ -97,6 +99,10 @@ ccminer_LDFLAGS += -L/usr/local/llvm/lib ccminer_LDADD += -lomp endif +#ccminer_CPPFLAGS += -DUSE_LIBSODIUM +#ccminer_LDFLAGS += -Lequi/lib +#ccminer_LDADD += -lsodium +ccminer_LDADD += -lcuda nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\" diff --git a/algos.h b/algos.h index e4dc4bb..163b193 100644 --- a/algos.h +++ b/algos.h @@ -16,6 +16,7 @@ enum sha_algos { ALGO_DEEP, ALGO_DECRED, ALGO_DMD_GR, + ALGO_EQUIHASH, ALGO_FRESH, ALGO_FUGUE256, /* Fugue256 */ ALGO_GROESTL, @@ -78,6 +79,7 @@ static const char *algo_names[] = { "deep", "decred", "dmd-gr", + "equihash", "fresh", "fugue256", "groestl", @@ -149,6 +151,8 @@ static inline int algo_to_int(char* arg) i = ALGO_C11; else if (!strcasecmp("diamond", arg)) i = ALGO_DMD_GR; + else if (!strcasecmp("equi", arg)) + i = ALGO_EQUIHASH; else if (!strcasecmp("doom", arg)) i = ALGO_LUFFA; else if (!strcasecmp("hmq17", arg)) diff --git a/api.cpp b/api.cpp index 0ec023f..8353127 100644 --- a/api.cpp +++ b/api.cpp @@ -1341,6 +1341,7 @@ void api_set_throughput(int thr_id, uint32_t throughput) if (thr_id < MAX_GPUS && thr_info) { struct cgpu_info *cgpu = &thr_info[thr_id].gpu; cgpu->intensity = throughput2intensity(throughput); + if (cgpu->throughput != throughput) cgpu->throughput = throughput; } // to display in bench results if (opt_benchmark) diff --git a/ccminer.cpp b/ccminer.cpp index 652a884..85ec68e 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -44,6 +44,7 @@ #include "algos.h" #include "sia/sia-rpc.h" #include "crypto/xmr-rpc.h" +#include "equi/equihash.h" #include @@ -243,6 +244,7 @@ Options:\n\ c11/flax X11 variant\n\ decred Decred Blake256\n\ deep Deepcoin\n\ + equihash Zcash Equihash\n\ dmd-gr Diamond-Groestl\n\ fresh Freshcoin (shavite 80)\n\ fugue256 Fuguecoin\n\ @@ -558,6 +560,14 @@ void get_currentalgo(char* buf, int sz) snprintf(buf, sz, "%s", algo_names[opt_algo]); } +void format_hashrate(double hashrate, char *output) +{ + if (opt_algo == ALGO_EQUIHASH) + format_hashrate_unit(hashrate, output, "Sol/s"); + else + format_hashrate_unit(hashrate, output, "H/s"); +} + /** * Exit app */ @@ -634,6 +644,10 @@ static void calc_network_diff(struct work *work) if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]); if (opt_algo == ALGO_DECRED) nbits = work->data[29]; if (opt_algo == ALGO_SIA) nbits = work->data[11]; // unsure if correct + if (opt_algo == ALGO_EQUIHASH) { + net_diff = equi_network_diff(work); + return; + } uint32_t bits = (nbits & 0xffffff); int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 @@ -856,6 +870,17 @@ static bool submit_upstream_work(CURL *curl, struct work *work) return true; } + if (pool->type & POOL_STRATUM && stratum.is_equihash) { + struct work submit_work; + memcpy(&submit_work, work, sizeof(struct work)); + //if (!hashlog_already_submittted(submit_work.job_id, submit_work.nonces[idnonce])) { + if (equi_stratum_submit(pool, &submit_work)) + hashlog_remember_submit(&submit_work, submit_work.nonces[idnonce]); + stratum.job.shares_count++; + //} + return true; + } + /* 
discard if a newer block was received */ stale_work = work->height && work->height < g_work.height; if (have_stratum && !stale_work && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) { @@ -1508,6 +1533,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) /* Generate merkle root */ switch (opt_algo) { case ALGO_DECRED: + case ALGO_EQUIHASH: case ALGO_SIA: // getwork over stratum, no merkle to generate break; @@ -1566,6 +1592,13 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) memcpy(&work->data[44], &sctx->job.coinbase[sctx->job.coinbase_size-4], 4); sctx->job.height = work->data[32]; //applog_hex(work->data, 180); + } else if (opt_algo == ALGO_EQUIHASH) { + memcpy(&work->data[9], sctx->job.coinbase, 32+32); // merkle [9..16] + reserved + work->data[25] = le32dec(sctx->job.ntime); + work->data[26] = le32dec(sctx->job.nbits); + memcpy(&work->data[27], sctx->xnonce1, sctx->xnonce1_size & 0x1F); // pool extranonce + work->data[35] = 0x80; + //applog_hex(work->data, 140); } else if (opt_algo == ALGO_LBRY) { for (i = 0; i < 8; i++) work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); @@ -1618,7 +1651,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) pthread_mutex_unlock(&stratum_work_lock); - if (opt_debug && opt_algo != ALGO_DECRED && opt_algo != ALGO_SIA) { + if (opt_debug && opt_algo != ALGO_DECRED && opt_algo != ALGO_EQUIHASH && opt_algo != ALGO_SIA) { uint32_t utm = work->data[17]; if (opt_algo != ALGO_ZR5) utm = swab32(utm); char *tm = atime2str(utm - sctx->srvtime_diff); @@ -1656,6 +1689,9 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_LYRA2: work_set_target(work, sctx->job.diff / (128.0 * opt_difficulty)); break; + case ALGO_EQUIHASH: + equi_work_set_target(work, sctx->job.diff / opt_difficulty); + break; default: work_set_target(work, sctx->job.diff / opt_difficulty); } @@ -1830,6 +1866,9 @@ static void *miner_thread(void *userdata) } else if (opt_algo == ALGO_CRYPTOLIGHT || opt_algo == ALGO_CRYPTONIGHT) { nonceptr = (uint32_t*) (((char*)work.data) + 39); wcmplen = 39; + } else if (opt_algo == ALGO_EQUIHASH) { + nonceptr = &work.data[EQNONCE_OFFSET]; // 27 is pool extranonce (256bits nonce space) + wcmplen = 4+32+32; } if (have_stratum) { @@ -1962,6 +2001,10 @@ static void *miner_thread(void *userdata) nonceptr[1] += 1; nonceptr[2] |= thr_id; + } else if (opt_algo == ALGO_EQUIHASH) { + nonceptr[1]++; + nonceptr[1] |= thr_id << 24; + //applog_hex(&work.data[27], 32); } else if (opt_algo == ALGO_WILDKECCAK) { //nonceptr[1] += 1; } else if (opt_algo == ALGO_SIA) { @@ -2264,6 +2307,9 @@ static void *miner_thread(void *userdata) case ALGO_DEEP: rc = scanhash_deep(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_EQUIHASH: + rc = scanhash_equihash(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_FRESH: rc = scanhash_fresh(thr_id, &work, max_nonce, &hashes_done); break; @@ -2431,7 +2477,7 @@ static void *miner_thread(void *userdata) work.nonces[1] = nonceptr[2]; } - if (stratum.rpc2 && rc == -EBUSY || work_restart[thr_id].restart) { + if (stratum.rpc2 && (rc == -EBUSY || work_restart[thr_id].restart)) { // bbr scratchpad download or stale result sleep(1); if (!thr_id) pools[cur_pooln].wait_time += 1; @@ -2489,7 +2535,7 @@ static void *miner_thread(void *userdata) } // only required to debug purpose - if (opt_debug && check_dups && opt_algo != ALGO_DECRED && opt_algo != ALGO_SIA) + if (opt_debug && check_dups && opt_algo != ALGO_DECRED && 
opt_algo != ALGO_EQUIHASH && opt_algo != ALGO_SIA) hashlog_remember_scan_range(&work); /* output */ @@ -3854,6 +3900,10 @@ int main(int argc, char *argv[]) allow_mininginfo = false; } + if (opt_algo == ALGO_EQUIHASH) { + opt_extranonce = false; // disable subscribe + } + if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) { rpc2_init(); if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0"); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 5aad65e..1316052 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -156,7 +156,7 @@ true true compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_20,sm_21 - --ptxas-options="-O2" %(AdditionalOptions) + --ptxas-options="-O2" --Wno-deprecated-gpu-targets %(AdditionalOptions) O2 @@ -199,9 +199,10 @@ true true compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_20,sm_21 - $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 + $(NVTOOLSEXT_PATH)\include O3 64 + --Wno-deprecated-gpu-targets %(AdditionalOptions) O3 @@ -235,6 +236,12 @@ + + StreamingSIMDExtensions + + + + @@ -254,6 +261,8 @@ + + @@ -283,6 +292,11 @@ 128 + + compute_52,sm_52;compute_50,sm_50;compute_30,sm_30 + -Xptxas -dlcm=ca -Xptxas -dscm=cs %(AdditionalOptions) + 0 + 160 @@ -552,7 +566,7 @@ - + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index f37da23..a6b39c4 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -103,6 +103,9 @@ {0f9aec5e-5409-488f-992a-2c108590d1ac} + + {031afae7-2a78-4e32-9738-4b589b6f7ff3} + @@ -321,6 +324,18 @@ Source Files\crypto\bbr + + Source Files\equi + + + Source Files\equi + + + Source Files\equi + + + Source Files\equi + @@ -563,6 +578,12 @@ Source Files\CUDA\xmr + + Source Files\equi + + + Source Files\equi + @@ -895,6 +916,9 @@ Source Files\CUDA + + Source Files\equi + @@ -911,4 +935,4 @@ Ressources - + \ No newline at end of file diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index 26cfea4..dfd973c 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.0" +#define PACKAGE_VERSION "2.1" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/configure.ac b/configure.ac index 1c76b70..c92a6fc 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.0], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.1], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/equi/blake2/blake2-config.h b/equi/blake2/blake2-config.h new file mode 100644 index 0000000..2a8f9ed --- /dev/null +++ b/equi/blake2/blake2-config.h @@ -0,0 +1,81 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#pragma once +#ifndef __BLAKE2_CONFIG_H__ +#define __BLAKE2_CONFIG_H__ + +// These don't work everywhere +#if (defined(__SSE2__) || defined(_M_AMD_64) || defined(_M_X64)) +#define HAVE_SSE2 +#endif + +#if defined(__SSSE3__) +#define HAVE_SSSE3 +#endif + +#if defined(__SSE4_1__) +#define HAVE_SSE41 +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#endif + +#if defined(__XOP__) +#define HAVE_XOP +#endif + + +#ifdef HAVE_AVX2 +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_XOP +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_AVX +#ifndef HAVE_SSE41 +#define HAVE_SSE41 +#endif +#endif + +#ifdef HAVE_SSE41 +#ifndef HAVE_SSSE3 +#define HAVE_SSSE3 +#endif +#endif + +#ifdef HAVE_SSSE3 +#define HAVE_SSE2 +#endif + +#if !defined(HAVE_SSE2) + +#ifdef _MSC_VER +// enforce required stuff for now +#define HAVE_SSE2 +//#define HAVE_SSSE3 +#define HAVE_SSE41 +#else +# error "This code requires at least SSE 4.1" +#endif + +#endif + +#endif + diff --git a/equi/blake2/blake2-impl.h b/equi/blake2/blake2-impl.h new file mode 100644 index 0000000..16219db --- /dev/null +++ b/equi/blake2/blake2-impl.h @@ -0,0 +1,136 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2_IMPL_H__ +#define __BLAKE2_IMPL_H__ + +#include + +static inline uint32_t load32( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + uint32_t w = *p++; + w |= ( uint32_t )( *p++ ) << 8; + w |= ( uint32_t )( *p++ ) << 16; + w |= ( uint32_t )( *p++ ) << 24; + return w; +#endif +} + +static inline uint64_t load64( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + uint64_t w = *p++; + w |= ( uint64_t )( *p++ ) << 8; + w |= ( uint64_t )( *p++ ) << 16; + w |= ( uint64_t )( *p++ ) << 24; + w |= ( uint64_t )( *p++ ) << 32; + w |= ( uint64_t )( *p++ ) << 40; + w |= ( uint64_t )( *p++ ) << 48; + w |= ( uint64_t )( *p++ ) << 56; + return w; +#endif +} + +static inline void store32( void *dst, uint32_t w ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +#endif +} + +static inline void store64( void *dst, uint64_t w ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +#endif +} + +static inline uint64_t load48( const void *src ) +{ + const uint8_t *p = ( const uint8_t * )src; + uint64_t w = *p++; + w |= ( uint64_t )( *p++ ) << 8; + w |= ( uint64_t )( *p++ ) << 16; + w |= ( uint64_t )( *p++ ) << 24; + w |= ( uint64_t )( *p++ ) << 32; + w |= ( uint64_t )( *p++ ) << 40; + return w; +} + +static inline 
void store48( void *dst, uint64_t w ) +{ + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +} + +static inline uint32_t rotl32( const uint32_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 32 - c ) ); +} + +static inline uint64_t rotl64( const uint64_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 64 - c ) ); +} + +static inline uint32_t rotr32( const uint32_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 32 - c ) ); +} + +static inline uint64_t rotr64( const uint64_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/* prevents compiler optimizing out memset() */ +static inline void secure_zero_memory( void *v, size_t n ) +{ + volatile uint8_t *p = ( volatile uint8_t * )v; + while( n-- ) *p++ = 0; +} + +#endif + diff --git a/equi/blake2/blake2-round.h b/equi/blake2/blake2-round.h new file mode 100644 index 0000000..400ed20 --- /dev/null +++ b/equi/blake2/blake2-round.h @@ -0,0 +1,85 @@ +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + row1l = _mm_add_epi64(row1l, row2l); \ + row1h = _mm_add_epi64(row1h, row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + row1l = _mm_add_epi64(row1l, row2l); \ + row1h = _mm_add_epi64(row1h, row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); \ + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; + +#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + 
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); diff --git a/equi/blake2/blake2.h b/equi/blake2/blake2.h new file mode 100644 index 0000000..3206576 --- /dev/null +++ b/equi/blake2/blake2.h @@ -0,0 +1,85 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2_H__ +#define __BLAKE2_H__ + +#include +#include + +#if defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__ ((__aligned__(x))) +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + + enum blake2b_constant + { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 + }; + +#pragma pack(push, 1) + typedef struct __blake2b_param + { + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint64_t node_offset; // 16 + uint8_t node_depth; // 17 + uint8_t inner_length; // 18 + uint8_t reserved[14]; // 32 + uint8_t salt[BLAKE2B_SALTBYTES]; // 48 + uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64 + } blake2b_param; + + ALIGN( 64 ) typedef struct __blake2b_state + { + uint64_t h[8]; + uint8_t buf[BLAKE2B_BLOCKBYTES]; + uint16_t counter; + uint8_t buflen; + uint8_t lastblock; + } blake2b_state; + +#pragma pack(pop) + + int eq_blake2b_init( blake2b_state *S, const uint8_t outlen ); + int eq_blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int eq_blake2b_init_param( blake2b_state *S, const blake2b_param *P ); + int eq_blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ); + int eq_blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen ); + + // Simple API + int eq_blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + + static inline int eq_blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) + { + return eq_blake2b( out, in, key, outlen, inlen, keylen ); + } + +#if defined(__cplusplus) +} +#endif + +#endif + diff --git a/equi/blake2/blake2b-load-sse2.h b/equi/blake2/blake2b-load-sse2.h new file mode 100644 index 0000000..1ba153c --- /dev/null +++ b/equi/blake2/blake2b-load-sse2.h @@ -0,0 +1,68 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#pragma once +#ifndef __BLAKE2B_LOAD_SSE2_H__ +#define __BLAKE2B_LOAD_SSE2_H__ + +#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) +#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5) +#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2) +#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7) +#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1) +#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13) +#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2) +#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6) +#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8) +#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11) +#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15) +#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14) +#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14) +#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13) +#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9) +#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2) +#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12) +#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1) +#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8) +#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6) +#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11) +#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3) +#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1) +#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4) +#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7) +#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6) +#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3) +#define 
LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12) +#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) + + +#endif + diff --git a/equi/blake2/blake2b-load-sse41.h b/equi/blake2/blake2b-load-sse41.h new file mode 100644 index 0000000..f6c1bc8 --- /dev/null +++ b/equi/blake2/blake2b-load-sse41.h @@ -0,0 +1,402 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2B_LOAD_SSE41_H__ +#define __BLAKE2B_LOAD_SSE41_H__ + +#define LOAD_MSG_0_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_0_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_1_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_1_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_1_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_1_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#define LOAD_MSG_2_1(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m5, 8); \ +b1 = _mm_unpackhi_epi64(m2, m7); \ +} while(0) + + +#define LOAD_MSG_2_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m0); \ +b1 = _mm_blend_epi16(m1, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_2_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m5, m1, 0xF0); \ +b1 = _mm_unpackhi_epi64(m3, m4); \ +} while(0) + + +#define LOAD_MSG_2_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m3); \ +b1 = _mm_alignr_epi8(m2, m0, 8); \ +} while(0) + + +#define LOAD_MSG_3_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_unpackhi_epi64(m6, m5); \ +} while(0) + + +#define LOAD_MSG_3_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m0); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_3_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m2, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define 
LOAD_MSG_3_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m5); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_4_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m2); \ +b1 = _mm_unpacklo_epi64(m1, m5); \ +} while(0) + + +#define LOAD_MSG_4_2(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m0, m3, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m7, m5, 0xF0); \ +b1 = _mm_blend_epi16(m3, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m0, 8); \ +b1 = _mm_blend_epi16(m4, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_5_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m3); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_5_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m5); \ +b1 = _mm_unpackhi_epi64(m5, m1); \ +} while(0) + + +#define LOAD_MSG_5_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m2, m3, 0xF0); \ +b1 = _mm_unpackhi_epi64(m7, m0); \ +} while(0) + + +#define LOAD_MSG_5_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m2); \ +b1 = _mm_blend_epi16(m7, m4, 0xF0); \ +} while(0) + + +#define LOAD_MSG_6_1(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m6, m0, 0xF0); \ +b1 = _mm_unpacklo_epi64(m7, m2); \ +} while(0) + + +#define LOAD_MSG_6_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_alignr_epi8(m5, m6, 8); \ +} while(0) + + +#define LOAD_MSG_6_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m3); \ +b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ +} while(0) + + +#define LOAD_MSG_6_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_blend_epi16(m1, m5, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m3); \ +b1 = _mm_blend_epi16(m6, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_2(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpackhi_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_7_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_unpacklo_epi64(m4, m1); \ +} while(0) + + +#define LOAD_MSG_7_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m2); \ +b1 = _mm_unpacklo_epi64(m3, m5); \ +} while(0) + + +#define LOAD_MSG_8_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m7); \ +b1 = _mm_alignr_epi8(m0, m5, 8); \ +} while(0) + + +#define LOAD_MSG_8_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_alignr_epi8(m4, m1, 8); \ +} while(0) + + +#define LOAD_MSG_8_3(b0, b1) \ +do \ +{ \ +b0 = m6; \ +b1 = _mm_alignr_epi8(m5, m0, 8); \ +} while(0) + + +#define LOAD_MSG_8_4(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m3, 0xF0); \ +b1 = m2; \ +} while(0) + + +#define LOAD_MSG_9_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_unpackhi_epi64(m3, m0); \ +} while(0) + + +#define LOAD_MSG_9_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m2); \ +b1 = _mm_blend_epi16(m3, m2, 0xF0); \ +} while(0) + + +#define LOAD_MSG_9_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_unpackhi_epi64(m1, m6); \ +} while(0) + + +#define LOAD_MSG_9_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpacklo_epi64(m6, m0); \ +} while(0) + + +#define LOAD_MSG_10_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define 
LOAD_MSG_10_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_10_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_11_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_11_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_11_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_11_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#endif + diff --git a/equi/blake2/blake2b-round.h b/equi/blake2/blake2b-round.h new file mode 100644 index 0000000..b39106b --- /dev/null +++ b/equi/blake2/blake2b-round.h @@ -0,0 +1,170 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2B_ROUND_H__ +#define __BLAKE2B_ROUND_H__ + +#define LOAD(p) _mm_load_si128( (const __m128i *)(p) ) +#define STORE(p,r) _mm_store_si128((__m128i *)(p), r) + +#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) + +#define TOF(reg) _mm_castsi128_ps((reg)) +#define TOI(reg) _mm_castps_si128((reg)) + +#define LIKELY(x) __builtin_expect((x),1) + + +/* Microarchitecture-specific macros */ +#ifndef HAVE_XOP +#ifdef HAVE_SSSE3 +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) +#else +#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) )) +#endif +#else +/* ... 
*/ +#endif + + + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, (-32)); \ + row4h = _mm_roti_epi64(row4h, (-32)); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, (-24)); \ + row2h = _mm_roti_epi64(row2h, (-24)); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, (-16)); \ + row4h = _mm_roti_epi64(row4h, (-16)); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, (-63)); \ + row2h = _mm_roti_epi64(row2h, (-63)); \ + +#if defined(HAVE_SSSE3) +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; +#else + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row4l;\ + t1 = row2l;\ + row4l = row3l;\ + row3l = row3h;\ + row3h = row4l;\ + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row3l;\ + row3l = row3h;\ + row3h = t0;\ + t0 = row2l;\ + t1 = row4l;\ + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) + +#endif + +#if defined(HAVE_SSE41) +#include "blake2b-load-sse41.h" +#else +#include "blake2b-load-sse2.h" +#endif + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + 
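For readers comparing this header against the BLAKE2b specification: the G1/G2 macro pair above is the standard BLAKE2b mixing function G split into two halves (rotations by 32/24, then 16/63), each half applied to two column or diagonal pairs packed per __m128i register. A scalar sketch of the same step, for reference only (blake2b_g and ROTR64 are illustrative names, not part of the patch):

#include <stdint.h>

#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

/* One BLAKE2b G mixing step on the 16-word working state v.
 * The SIMD G1 macro above is the first half (rotates by 32 and 24),
 * G2 the second half (rotates by 16 and 63). */
static void blake2b_g(uint64_t v[16], int a, int b, int c, int d,
                      uint64_t x, uint64_t y)
{
	v[a] = v[a] + v[b] + x;  v[d] = ROTR64(v[d] ^ v[a], 32);
	v[c] = v[c] + v[d];      v[b] = ROTR64(v[b] ^ v[c], 24);
	v[a] = v[a] + v[b] + y;  v[d] = ROTR64(v[d] ^ v[a], 16);
	v[c] = v[c] + v[d];      v[b] = ROTR64(v[b] ^ v[c], 63);
}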
+#endif + +#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); diff --git a/equi/blake2/blake2bx.cpp b/equi/blake2/blake2bx.cpp new file mode 100644 index 0000000..6f5c041 --- /dev/null +++ b/equi/blake2/blake2bx.cpp @@ -0,0 +1,262 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ + +#include +#include +#include + + +#include "blake2.h" +#include "blake2-impl.h" + +#include "blake2-config.h" + +#ifdef WIN32 +#include +#endif + +#include +#if defined(HAVE_SSSE3) +#include +#endif +#if defined(HAVE_SSE41) +#include +#endif +#if defined(HAVE_AVX) +#include +#endif +#if defined(HAVE_XOP) +#include +#endif + +#include "blake2b-round.h" + +ALIGN(64) static const uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/* init xors IV with input parameter block */ +int eq_blake2b_init_param(blake2b_state *S, const blake2b_param *P) +{ + //blake2b_init0( S ); + const uint8_t * v = (const uint8_t *)(blake2b_IV); + const uint8_t * p = (const uint8_t *)(P); + uint8_t * h = (uint8_t *)(S->h); + /* IV XOR ParamBlock */ + memset(S, 0, sizeof(blake2b_state)); + + for (int i = 0; i < BLAKE2B_OUTBYTES; ++i) h[i] = v[i] ^ p[i]; + + return 0; +} + +/* Some sort of default parameter block initialization, for sequential blake2b */ +int eq_blake2b_init(blake2b_state *S, const uint8_t outlen) +{ + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1; + + const blake2b_param P = + { + outlen, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + { 0 }, + { 0 }, + { 0 } + }; + return eq_blake2b_init_param(S, &P); +} + +int eq_blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen) +{ + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1; + + if ((!keylen) || keylen > BLAKE2B_KEYBYTES) return -1; + + const blake2b_param P = + { + outlen, + keylen, + 1, + 1, + 0, + 0, + 0, + 0, + { 0 }, + { 0 }, + { 0 } + }; + + if (eq_blake2b_init_param(S, &P) < 0) + return 0; + + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset(block, 0, BLAKE2B_BLOCKBYTES); + memcpy(block, key, keylen); + eq_blake2b_update(S, block, BLAKE2B_BLOCKBYTES); + secure_zero_memory(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */ + } + return 0; +} + +static inline int blake2b_compress(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES]) +{ + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; +#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) + const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + const __m128i r24 = 
_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); +#endif +#if defined(HAVE_SSE41) + const __m128i m0 = LOADU(block + 00); + const __m128i m1 = LOADU(block + 16); + const __m128i m2 = LOADU(block + 32); + const __m128i m3 = LOADU(block + 48); + const __m128i m4 = LOADU(block + 64); + const __m128i m5 = LOADU(block + 80); + const __m128i m6 = LOADU(block + 96); + const __m128i m7 = LOADU(block + 112); +#else + const uint64_t m0 = ( ( uint64_t * )block )[ 0]; + const uint64_t m1 = ( ( uint64_t * )block )[ 1]; + const uint64_t m2 = ( ( uint64_t * )block )[ 2]; + const uint64_t m3 = ( ( uint64_t * )block )[ 3]; + const uint64_t m4 = ( ( uint64_t * )block )[ 4]; + const uint64_t m5 = ( ( uint64_t * )block )[ 5]; + const uint64_t m6 = ( ( uint64_t * )block )[ 6]; + const uint64_t m7 = ( ( uint64_t * )block )[ 7]; + const uint64_t m8 = ( ( uint64_t * )block )[ 8]; + const uint64_t m9 = ( ( uint64_t * )block )[ 9]; + const uint64_t m10 = ( ( uint64_t * )block )[10]; + const uint64_t m11 = ( ( uint64_t * )block )[11]; + const uint64_t m12 = ( ( uint64_t * )block )[12]; + const uint64_t m13 = ( ( uint64_t * )block )[13]; + const uint64_t m14 = ( ( uint64_t * )block )[14]; + const uint64_t m15 = ( ( uint64_t * )block )[15]; +#endif + row1l = LOADU(&S->h[0]); + row1h = LOADU(&S->h[2]); + row2l = LOADU(&S->h[4]); + row2h = LOADU(&S->h[6]); + row3l = LOADU(&blake2b_IV[0]); + row3h = LOADU(&blake2b_IV[2]); + row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), _mm_set_epi32(0, 0, 0, S->counter)); + row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), _mm_set_epi32(0, 0, 0L - S->lastblock, 0L - S->lastblock)); + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); + STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); + STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); + return 0; +} + +int eq_blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen) +{ + while (inlen > 0) + { + size_t left = S->buflen; + size_t fill = BLAKE2B_BLOCKBYTES - left; + + if (inlen > fill) + { + memcpy(S->buf + left, in, fill); // Fill buffer + in += fill; + inlen -= fill; + S->counter += BLAKE2B_BLOCKBYTES; + blake2b_compress(S, S->buf); // Compress + S->buflen = 0; + } + else // inlen <= fill + { + memcpy(S->buf + left, in, inlen); + S->buflen += (uint8_t) inlen; // not enough to compress + in += inlen; + inlen = 0; + } + } + + return 0; +} + +int eq_blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen) +{ + if (outlen > BLAKE2B_OUTBYTES) + return -1; + + if (S->buflen > BLAKE2B_BLOCKBYTES) + { + S->counter += BLAKE2B_BLOCKBYTES; + blake2b_compress(S, S->buf); + S->buflen -= BLAKE2B_BLOCKBYTES; + memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen); + } + + S->counter += S->buflen; + S->lastblock = 1; + memset(S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ + blake2b_compress(S, S->buf); + memcpy(out, &S->h[0], outlen); + S->lastblock = 0; + return 0; +} + +int eq_blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen) +{ + blake2b_state S[1]; + + /* Verify parameters */ + if (!in || !out) return -1; + if (NULL == key) keylen = 0; + + if (keylen) + { + if 
(eq_blake2b_init_key(S, outlen, key, keylen) < 0) return -1; + } + else + { + if (eq_blake2b_init(S, outlen) < 0) return -1; + } + + eq_blake2b_update(S, (const uint8_t *)in, inlen); + eq_blake2b_final(S, out, outlen); + return 0; +} diff --git a/equi/cuda_equi.cu b/equi/cuda_equi.cu new file mode 100644 index 0000000..7a45ac8 --- /dev/null +++ b/equi/cuda_equi.cu @@ -0,0 +1,2117 @@ +/* + * Equihash solver created by djeZo (l33tsoftw@gmail.com) for NiceHash + * Adapted to be more compatible with older C++ compilers + * + * cuda_djezo solver was released by NiceHash (www.nicehash.com) under + * GPL 3.0 license. If you don't have a copy, you can obtain one from + * https://www.gnu.org/licenses/gpl-3.0.txt + * + * Based on CUDA solver by John Tromp released under MIT license. + * Some helper functions taken out of OpenCL solver by Marc Bevand + * released under MIT license. + * + * Copyright (c) 2016 John Tromp, Marc Bevand + * Copyright (c) 2017 djeZo, Tanguy Pruvot (GPL v3) + */ + +#ifdef WIN32 +#include +#endif + +#include +#include +//#include + +#include "equihash.h" +#include "eqcuda.hpp" // eq_cuda_context + +#include "blake2/blake2.h" + +//#define WN 200 +//#define WK 9 +#ifndef MAX_GPUS +#define MAX_GPUS 16 +#endif + +#define NDIGITS (WK+1) +#define DIGITBITS (WN/(NDIGITS)) +#define PROOFSIZE (1< +#include +#define __launch_bounds__(max_tpb, min_blocks) +#define __CUDA_ARCH__ 520 +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); +uint32_t atomicExch(uint32_t *x, uint32_t y); +uint32_t atomicAdd(uint32_t *x, uint32_t y); +void __syncthreads(void); +void __threadfence(void); +void __threadfence_block(void); +uint32_t __ldg(const uint32_t* address); +uint64_t __ldg(const uint64_t* address); +uint4 __ldca(const uint4 *ptr); +u32 __ldca(const u32 *ptr); +u32 umin(const u32, const u32); +u32 umax(const u32, const u32); +#endif + +typedef u32 proof[PROOFSIZE]; + +struct __align__(32) slot { + u32 hash[8]; +}; + +struct __align__(16) slotsmall { + u32 hash[4]; +}; + +struct __align__(8) slottiny { + u32 hash[2]; +}; + +template +struct equi +{ + slot round0trees[4096][RB8_NSLOTS]; + slot trees[1][NBUCKETS][NSLOTS]; + struct { + slotsmall treessmall[NSLOTS]; + slottiny treestiny[NSLOTS]; + } round2trees[NBUCKETS]; + struct { + slotsmall treessmall[NSLOTS]; + slottiny treestiny[NSLOTS]; + } round3trees[NBUCKETS]; + slotsmall treessmall[4][NBUCKETS][NSLOTS]; + slottiny treestiny[1][4096][RB8_NSLOTS_LD]; + u32 round4bidandsids[NBUCKETS][NSLOTS]; + union { + u64 blake_h[8]; + u32 blake_h32[16]; + }; + struct { + u32 nslots8[4096]; + u32 nslots0[4096]; + u32 nslots[9][NBUCKETS]; + scontainerreal srealcont; + } edata; +}; + +// todo: use cuda_helper.h and/or cuda_vector.h +__device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) +{ + return make_uint2(a.x ^ b.x, a.y ^ b.y); +} + +__device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b) +{ + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +// for ROR 63 (or ROL 1); this func only support (32 <= offset < 64) +__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) +{ + uint2 result; +#if __CUDA_ARCH__ > 300 + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } +#else + result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); + result.x = ((a.y >> 
(offset - 32)) | (a.x << (64 - offset))); +#endif + return result; +} + + +__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +__device__ __forceinline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + return result; +} + +__device__ __forceinline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + return result; +} + +__device__ __forceinline__ void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) +{ + a = a + b + x; + ((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]); + a = a + b + y; + ((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +// untested.. +struct packer_default +{ + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (((bucketid << SLOTBITS) | s0) << SLOTBITS) | s1; + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + // BUCKMASK-ed to prevent illegal memory accesses in case of memory errors + return (bid >> (2 * SLOTBITS)) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return bid & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> SLOTBITS) & SLOTMASK; + } +}; + + +struct packer_cantor +{ + __device__ __forceinline__ static u32 cantor(const u32 s0, const u32 s1) + { + u32 a = umax(s0, s1); + u32 b = umin(s0, s1); + return a * (a + 1) / 2 + b; + } + + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (bucketid << CANTORBITS) | cantor(s0, s1); + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> CANTORBITS) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return ((bid & CANTORMASK) - cantor(0, s1)) & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + u32 k, q, sqr = 8 * (bid & CANTORMASK) + 1; + // this k=sqrt(sqr) computing loop averages 3.4 iterations out of maximum 9 + for (k = CANTORMAXSQRT; (q = sqr / k) < k; k = (k + q) / 2); + return ((k - 1) / 2) & SLOTMASK; + } +}; + +__device__ __constant__ const u64 blake_iv[] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +#if CUDART_VERSION < 8000 || !defined(__ldca) +#define __ldca(ptr) *(ptr) +#endif + +template +__global__ void digit_first(equi* eq, u32 nonce) +{ + const u32 block = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ u64 hash_h[8]; + u32* hash_h32 = (u32*)hash_h; + + if (threadIdx.x < 16) + hash_h32[threadIdx.x] = __ldca(&eq->blake_h32[threadIdx.x]); + + __syncthreads(); + + u64 m = (u64)block << 32 | (u64)nonce; + + union + { + u64 v[16]; + u32 v32[32]; + uint4 v128[8]; + }; + + v[0] = hash_h[0]; + v[1] = hash_h[1]; + v[2] = hash_h[2]; + v[3] = hash_h[3]; + 
v[4] = hash_h[4]; + v[5] = hash_h[5]; + v[6] = hash_h[6]; + v[7] = hash_h[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4] ^ (128 + 16); + v[13] = blake_iv[5]; + v[14] = blake_iv[6] ^ 0xffffffffffffffff; + v[15] = blake_iv[7]; + + // mix 1 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 2 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 3 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, m); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 4 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, m); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 5 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, m); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 6 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], m, 0); + + // mix 7 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], m, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 8 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, m); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 9 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], m, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 10 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], m, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 11 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + 
G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 12 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + v[0] ^= hash_h[0] ^ v[8]; + v[1] ^= hash_h[1] ^ v[9]; + v[2] ^= hash_h[2] ^ v[10]; + v[3] ^= hash_h[3] ^ v[11]; + v[4] ^= hash_h[4] ^ v[12]; + v[5] ^= hash_h[5] ^ v[13]; + v32[12] ^= hash_h32[12] ^ v32[28]; + + u32 bexor = __byte_perm(v32[0], 0, 0x4012); // first 20 bits + u32 bucketid; + asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor)); + u32 slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1); + if (slotp < RB8_NSLOTS) + { + slot* s = &eq->round0trees[bucketid][slotp]; + + uint4 tt; + tt.x = __byte_perm(v32[0], v32[1], 0x1234); + tt.y = __byte_perm(v32[1], v32[2], 0x1234); + tt.z = __byte_perm(v32[2], v32[3], 0x1234); + tt.w = __byte_perm(v32[3], v32[4], 0x1234); + *(uint4*)(&s->hash[0]) = tt; + + tt.x = __byte_perm(v32[4], v32[5], 0x1234); + tt.y = __byte_perm(v32[5], v32[6], 0x1234); + tt.z = 0; + tt.w = block << 1; + *(uint4*)(&s->hash[4]) = tt; + } + + bexor = __byte_perm(v32[6], 0, 0x0123); + asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor)); + slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1); + if (slotp < RB8_NSLOTS) + { + slot* s = &eq->round0trees[bucketid][slotp]; + + uint4 tt; + tt.x = __byte_perm(v32[6], v32[7], 0x2345); + tt.y = __byte_perm(v32[7], v32[8], 0x2345); + tt.z = __byte_perm(v32[8], v32[9], 0x2345); + tt.w = __byte_perm(v32[9], v32[10], 0x2345); + *(uint4*)(&s->hash[0]) = tt; + + tt.x = __byte_perm(v32[10], v32[11], 0x2345); + tt.y = __byte_perm(v32[11], v32[12], 0x2345); + tt.z = 0; + tt.w = (block << 1) + 1; + *(uint4*)(&s->hash[4]) = tt; + } +} + +/* + Functions digit_1 to digit_8 works by the same principle; + Each thread does 2-3 slot loads (loads are coalesced). + Xorwork of slots is loaded into shared memory and is kept in registers (except for digit_1). + At the same time, restbits (8 or 9 bits) in xorwork are used for collisions. + Restbits determine position in ht. + Following next is pair creation. First one (or two) pairs' xorworks are put into global memory + as soon as possible, the rest pairs are saved in shared memory (one u32 per pair - 16 bit indices). + In most cases, all threads have one (or two) pairs so with this trick, we offload memory writes a bit in last step. + In last step we save xorwork of pairs in memory. 
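  Concretely, as the code below shows: each 20-bit digit of a slot's xorwork is split into
  a bucket index (BUCKBITS bits extracted with bfe at offset RB, selecting one of the
  NBUCKETS global-memory buckets) and the remaining 8 or 9 restbits, which select a row of
  the per-block shared-memory table ht[]. Slots whose restbits collide are paired; the pair's
  xorwork is computed, and its top bits pick the bucket and slot written for the next digit.
  Rows beyond SSM-1 entries and buckets beyond NSLOTS entries are simply dropped.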
+*/ +template +__global__ void digit_1(equi* eq) +{ + __shared__ u16 ht[256][SSM - 1]; + __shared__ uint2 lastword1[RB8_NSLOTS]; + __shared__ uint4 lastword2[RB8_NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < 256) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots0[bucketid], RB8_NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + uint2 ta[2]; + uint4 tb[2]; + + u32 si[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slot* pslot1 = eq->round0trees[bucketid] + si[i]; + + // get xhash + uint4 a1 = *(uint4*)(&pslot1->hash[0]); + uint2 a2 = *(uint2*)(&pslot1->hash[4]); + ta[i].x = a1.x; + ta[i].y = a1.y; + lastword1[si[i]] = ta[i]; + tb[i].x = a1.z; + tb[i].y = a1.w; + tb[i].z = a2.x; + tb[i].w = a2.y; + lastword2[si[i]] = tb[i]; + + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(hr[i]) : "r"(ta[i].x)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + int* pairs = ht_len; + + u32 xors[6]; + u32 xorbucketid, xorslot; + + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = ta[i] ^ lastword1[p]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[si[i]] ^ lastword2[p]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, si[i], p, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[i] ^ lastword2[k]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, i, k, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + } +} + + +template +__global__ void digit_2(equi* eq) +{ + __shared__ u16 ht[NRESTS][SSM - 1]; + __shared__ u32 lastword1[NSLOTS]; + __shared__ uint4 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + 
__shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slot* buck = eq->trees[0][bucketid]; + u32 bsize = umin(eq->edata.nslots[1][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 ta[2]; + uint4 tt[2]; + + u32 si[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + // get slot + const slot* pslot1 = buck + si[i]; + + uint4 ttx = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = ta[i] = ttx.x; + uint2 tty = *(uint2*)(&pslot1->hash[4]); + tt[i].x = ttx.y; + tt[i].y = ttx.z; + tt[i].z = ttx.w; + tt[i].w = tty.x; + lastword2[si[i]] = tt[i]; + + hr[i] = tty.y & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[0] = ta[i] ^ lastword1[p]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = tt[i] ^ lastword2[p]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[0] = lastword1[i] ^ lastword1[k]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = lastword2[i] ^ lastword2[k]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } +} + + +template +__global__ void digit_3(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[2][bucketid], NSLOTS); + + u32 hr[2]; + int 
pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + u32 ta[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round2trees[bucketid].treessmall[si[i]]; + slottiny &xst = eq->round2trees[bucketid].treestiny[si[i]]; + + tt[i] = *(uint4*)(&xs.hash[0]); + lastword1[si[i]] = tt[i]; + ta[i] = xst.hash[0]; + lastword2[si[i]] = ta[i]; + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[4] = ta[i] ^ lastword2[p]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = tt[i] ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[4] = lastword2[i] ^ lastword2[k]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + } +} + + +template +__global__ void digit_4(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[3][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely 
cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round3trees[bucketid].treessmall[si[i]]; + slottiny &xst = eq->round3trees[bucketid].treestiny[si[i]]; + + // get xhash + tt[i] = *(uint4*)(&xs.hash[0]); + lastword[si[i]] = tt[i]; + hr[i] = xst.hash[0] & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + } + } + } +} + + +template +__global__ void digit_5(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slotsmall* buck = eq->treessmall[3][bucketid]; + u32 bsize = umin(eq->edata.nslots[4][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword[si[i]] = tt[i]; + asm("bfe.u32 %0, %1, 4, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = 
ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = xors[3]; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[1]; + tt.y = xors[2]; + tt.z = xors[3]; + tt.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = tt; + } + } + } +} + + +template +__global__ void digit_6(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint2 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[5][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[2][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = *(uint2*)(&tt[i].x); + lastword2[si[i]] = tt[i].z; + asm("bfe.u32 %0, %1, 16, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + // doing this to save shared memory + int* pairs = ht_len; + __syncthreads(); + + u32 xors[3]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, 
SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + u32 pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + xors[2] = lastword2[i] ^ lastword2[k]; + if (xors[2] == 0) + continue; + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } +} + + +template +__global__ void digit_7(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[6][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[0][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint4*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].z), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + 
*(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]); + + if (xors[1] == 0) + continue; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[0]; + tt.y = xors[1]; + tt.z = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + tt.w = 0; + *(uint4*)(&xs.hash[0]) = tt; + } +} + + +template +__global__ void digit_8(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + next_pair = 0; + pairs_len = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[7][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[1][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint2 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint2*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 8, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + 
*(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]); + + if (xors[1] == 0) + continue; + + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot >= RB8_NSLOTS_LD) continue; + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } +} + +/* + Last round function is similar to previous ones but has different ending. + We use warps to process final candidates. Each warp process one candidate. + First two bidandsids (u32 of stored bucketid and two slotids) are retreived by + lane 0 and lane 16, next four bidandsids by lane 0, 8, 16 and 24, ... until + all lanes in warp have bidandsids from round 4. Next, each thread retreives + 16 indices. While doing so, indices are put into comparison using atomicExch + to determine if there are duplicates (tromp's method). At the end, if no + duplicates are found, candidate solution is saved (all indices). Note that this + dup check method is not exact so CPU dup checking is needed after. 
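+  In rough pseudo-code, the per-index test performed by the CHECK_DUP macro below is:
+    tag  = index >> DUPBITS;  slot = index & ((1 << DUPBITS) - 1);
+    if any lane in the warp sees atomicExch(&susp[slot], tag) == tag, the candidate is dropped.
+  A real duplicate can still slip through when another index overwrites the slot in between,
+  which is why the CPU-side duped() re-check is still required.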
+*/ +template +__global__ void digit_last_wdc(equi* eq) +{ + __shared__ u8 shared_data[8192]; + int* ht_len = (int*)(&shared_data[0]); + int* pairs = ht_len; + u32* lastword = (u32*)(&shared_data[256 * 4]); + u16* ht = (u16*)(&shared_data[256 * 4 + RB8_NSLOTS_LD * 4]); + u32* pairs_len = (u32*)(&shared_data[8188]); + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + #pragma unroll + for (u32 i = 0; i < FCT; i++) + ht_len[(i * (256 / FCT)) + threadid] = 0; + + if (threadid == ((256 / FCT) - 1)) + *pairs_len = 0; + + slottiny* buck = eq->treestiny[0][bucketid]; + u32 bsize = umin(eq->edata.nslots8[bucketid], RB8_NSLOTS_LD); + + u32 si[3 * FCT]; + u32 hr[3 * FCT]; + int pos[3 * FCT]; + u32 lw[3 * FCT]; + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + pos[i] = SSM; + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + si[i] = i * (256 / FCT) + threadid; + if (si[i] >= bsize) break; + + const slottiny* pslot1 = buck + si[i]; + + // get xhash + uint2 tt = *(uint2*)(&pslot1->hash[0]); + lw[i] = tt.x; + lastword[si[i]] = lw[i]; + + u32 a; + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(a) : "r"(lw[i])); + hr[i] = a; + + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) + ht[hr[i] * (SSM - 1) + pos[i]] = si[i]; + } + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + if (pos[i] >= SSM) continue; + + for (int k = 0; k != pos[i]; ++k) + { + u16 prev = ht[hr[i] * (SSM - 1) + k]; + if (lw[i] != lastword[prev]) continue; + u32 pindex = atomicAdd(pairs_len, 1); + if (pindex >= MAXPAIRS) break; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + + __syncthreads(); + u32 plen = umin(*pairs_len, 64); + +#define CALC_LEVEL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + u32* bucks = eq->round4bidandsids[PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1]; \ + levels[c] = bucks[slot0]; \ + } + +#define CALC_LEVEL_SMALL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + slotsmall* bucks = eq->treessmall[a][PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1].hash[d]; \ + levels[c] = bucks[slot0].hash[d]; \ + } + + u32 lane = threadIdx.x & 0x1f; + u32 par = threadIdx.x >> 5; + + u32* levels = (u32*)&pairs[MAXPAIRS + (par << DUPBITS)]; + u32* susp = levels; + + while (par < plen) + { + int pair = pairs[par]; + par += W; + + if (lane % 16 == 0) + { + u32 plvl; + if (lane == 0) plvl = buck[__byte_perm(pair, 0, 0x4510)].hash[1]; + else plvl = buck[__byte_perm(pair, 0, 0x4532)].hash[1]; + slotsmall* bucks = eq->treessmall[1][PACKER::get_bucketid(plvl, RB, SM)]; + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); + levels[lane] = bucks[slot1].hash[2]; + levels[lane + 8] = bucks[slot0].hash[2]; + } + + if (lane % 8 == 0) + CALC_LEVEL_SMALL(0, lane, lane + 4, 3); + + if (lane % 4 == 0) + CALC_LEVEL_SMALL(2, lane, lane + 2, 3); + + if (lane % 2 == 0) + CALC_LEVEL(0, lane, lane + 1, 4); + + u32 ind[16]; + + u32 f1 = levels[lane]; + const slottiny* buck_v4 = &eq->round3trees[PACKER::get_bucketid(f1, RB, SM)].treestiny[0]; + const u32 slot1_v4 = PACKER::get_slot1(f1, RB, SM); + const u32 slot0_v4 = PACKER::get_slot0(f1, slot1_v4, RB, SM); + + susp[lane] = 0xffffffff; + susp[32 + lane] = 0xffffffff; + +#define 
CHECK_DUP(a) \ + __any(atomicExch(&susp[(ind[a] & ((1 << DUPBITS) - 1))], (ind[a] >> DUPBITS)) == (ind[a] >> DUPBITS)) + + u32 f2 = buck_v4[slot1_v4].hash[1]; + const slottiny* buck_v3_1 = &eq->round2trees[PACKER::get_bucketid(f2, RB, SM)].treestiny[0]; + const u32 slot1_v3_1 = PACKER::get_slot1(f2, RB, SM); + const u32 slot0_v3_1 = PACKER::get_slot0(f2, slot1_v3_1, RB, SM); + + susp[64 + lane] = 0xffffffff; + susp[96 + lane] = 0xffffffff; + + u32 f0 = buck_v3_1[slot1_v3_1].hash[1]; + const slot* buck_v2_1 = eq->trees[0][PACKER::get_bucketid(f0, RB, SM)]; + const u32 slot1_v2_1 = PACKER::get_slot1(f0, RB, SM); + const u32 slot0_v2_1 = PACKER::get_slot0(f0, slot1_v2_1, RB, SM); + + susp[128 + lane] = 0xffffffff; + susp[160 + lane] = 0xffffffff; + + u32 f3 = buck_v2_1[slot1_v2_1].hash[6]; + const slot* buck_fin_1 = eq->round0trees[packer_default::get_bucketid(f3, 8, RB8_NSLOTS)]; + const u32 slot1_fin_1 = packer_default::get_slot1(f3, 8, RB8_NSLOTS); + const u32 slot0_fin_1 = packer_default::get_slot0(f3, slot1_fin_1, 8, RB8_NSLOTS); + + susp[192 + lane] = 0xffffffff; + susp[224 + lane] = 0xffffffff; + + ind[0] = buck_fin_1[slot1_fin_1].hash[7]; + if (CHECK_DUP(0)) continue; + ind[1] = buck_fin_1[slot0_fin_1].hash[7]; + if (CHECK_DUP(1)) continue; + + u32 f4 = buck_v2_1[slot0_v2_1].hash[6]; + const slot* buck_fin_2 = eq->round0trees[packer_default::get_bucketid(f4, 8, RB8_NSLOTS)]; + const u32 slot1_fin_2 = packer_default::get_slot1(f4, 8, RB8_NSLOTS); + const u32 slot0_fin_2 = packer_default::get_slot0(f4, slot1_fin_2, 8, RB8_NSLOTS); + + ind[2] = buck_fin_2[slot1_fin_2].hash[7]; + if (CHECK_DUP(2)) continue; + ind[3] = buck_fin_2[slot0_fin_2].hash[7]; + if (CHECK_DUP(3)) continue; + + u32 f5 = buck_v3_1[slot0_v3_1].hash[1]; + const slot* buck_v2_2 = eq->trees[0][PACKER::get_bucketid(f5, RB, SM)]; + const u32 slot1_v2_2 = PACKER::get_slot1(f5, RB, SM); + const u32 slot0_v2_2 = PACKER::get_slot0(f5, slot1_v2_2, RB, SM); + + u32 f6 = buck_v2_2[slot1_v2_2].hash[6]; + const slot* buck_fin_3 = eq->round0trees[packer_default::get_bucketid(f6, 8, RB8_NSLOTS)]; + const u32 slot1_fin_3 = packer_default::get_slot1(f6, 8, RB8_NSLOTS); + const u32 slot0_fin_3 = packer_default::get_slot0(f6, slot1_fin_3, 8, RB8_NSLOTS); + + ind[4] = buck_fin_3[slot1_fin_3].hash[7]; + if (CHECK_DUP(4)) continue; + ind[5] = buck_fin_3[slot0_fin_3].hash[7]; + if (CHECK_DUP(5)) continue; + + u32 f7 = buck_v2_2[slot0_v2_2].hash[6]; + const slot* buck_fin_4 = eq->round0trees[packer_default::get_bucketid(f7, 8, RB8_NSLOTS)]; + const u32 slot1_fin_4 = packer_default::get_slot1(f7, 8, RB8_NSLOTS); + const u32 slot0_fin_4 = packer_default::get_slot0(f7, slot1_fin_4, 8, RB8_NSLOTS); + + ind[6] = buck_fin_4[slot1_fin_4].hash[7]; + if (CHECK_DUP(6)) continue; + ind[7] = buck_fin_4[slot0_fin_4].hash[7]; + if (CHECK_DUP(7)) continue; + + u32 f8 = buck_v4[slot0_v4].hash[1]; + const slottiny* buck_v3_2 = &eq->round2trees[PACKER::get_bucketid(f8, RB, SM)].treestiny[0]; + const u32 slot1_v3_2 = PACKER::get_slot1(f8, RB, SM); + const u32 slot0_v3_2 = PACKER::get_slot0(f8, slot1_v3_2, RB, SM); + + u32 f9 = buck_v3_2[slot1_v3_2].hash[1]; + const slot* buck_v2_3 = eq->trees[0][PACKER::get_bucketid(f9, RB, SM)]; + const u32 slot1_v2_3 = PACKER::get_slot1(f9, RB, SM); + const u32 slot0_v2_3 = PACKER::get_slot0(f9, slot1_v2_3, RB, SM); + + u32 f10 = buck_v2_3[slot1_v2_3].hash[6]; + const slot* buck_fin_5 = eq->round0trees[packer_default::get_bucketid(f10, 8, RB8_NSLOTS)]; + const u32 slot1_fin_5 = packer_default::get_slot1(f10, 8, RB8_NSLOTS); + 
const u32 slot0_fin_5 = packer_default::get_slot0(f10, slot1_fin_5, 8, RB8_NSLOTS); + + ind[8] = buck_fin_5[slot1_fin_5].hash[7]; + if (CHECK_DUP(8)) continue; + ind[9] = buck_fin_5[slot0_fin_5].hash[7]; + if (CHECK_DUP(9)) continue; + + u32 f11 = buck_v2_3[slot0_v2_3].hash[6]; + const slot* buck_fin_6 = eq->round0trees[packer_default::get_bucketid(f11, 8, RB8_NSLOTS)]; + const u32 slot1_fin_6 = packer_default::get_slot1(f11, 8, RB8_NSLOTS); + const u32 slot0_fin_6 = packer_default::get_slot0(f11, slot1_fin_6, 8, RB8_NSLOTS); + + ind[10] = buck_fin_6[slot1_fin_6].hash[7]; + if (CHECK_DUP(10)) continue; + ind[11] = buck_fin_6[slot0_fin_6].hash[7]; + if (CHECK_DUP(11)) continue; + + u32 f12 = buck_v3_2[slot0_v3_2].hash[1]; + const slot* buck_v2_4 = eq->trees[0][PACKER::get_bucketid(f12, RB, SM)]; + const u32 slot1_v2_4 = PACKER::get_slot1(f12, RB, SM); + const u32 slot0_v2_4 = PACKER::get_slot0(f12, slot1_v2_4, RB, SM); + + u32 f13 = buck_v2_4[slot1_v2_4].hash[6]; + const slot* buck_fin_7 = eq->round0trees[packer_default::get_bucketid(f13, 8, RB8_NSLOTS)]; + const u32 slot1_fin_7 = packer_default::get_slot1(f13, 8, RB8_NSLOTS); + const u32 slot0_fin_7 = packer_default::get_slot0(f13, slot1_fin_7, 8, RB8_NSLOTS); + + ind[12] = buck_fin_7[slot1_fin_7].hash[7]; + if (CHECK_DUP(12)) continue; + ind[13] = buck_fin_7[slot0_fin_7].hash[7]; + if (CHECK_DUP(13)) continue; + + u32 f14 = buck_v2_4[slot0_v2_4].hash[6]; + const slot* buck_fin_8 = eq->round0trees[packer_default::get_bucketid(f14, 8, RB8_NSLOTS)]; + const u32 slot1_fin_8 = packer_default::get_slot1(f14, 8, RB8_NSLOTS); + const u32 slot0_fin_8 = packer_default::get_slot0(f14, slot1_fin_8, 8, RB8_NSLOTS); + + ind[14] = buck_fin_8[slot1_fin_8].hash[7]; + if (CHECK_DUP(14)) continue; + ind[15] = buck_fin_8[slot0_fin_8].hash[7]; + if (CHECK_DUP(15)) continue; + + u32 soli; + if (lane == 0) { + soli = atomicAdd(&eq->edata.srealcont.nsols, 1); + } +#if __CUDA_ARCH__ >= 300 + // useful ? 
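+		// lane 0 obtained soli from the atomicAdd above; broadcast it so that every lane
+		// in the warp writes its 16 of the 512 indices into the same sols[soli] entry below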
+		soli = __shfl(soli, 0);
+#else
+		__syncthreads();
+#endif
+		if (soli < MAXREALSOLS)
+		{
+			u32 pos = lane << 4;
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos ]) = *(uint4*)(&ind[ 0]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 4]) = *(uint4*)(&ind[ 4]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 8]) = *(uint4*)(&ind[ 8]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 12]) = *(uint4*)(&ind[12]);
+		}
+	}
+}
+
+//std::mutex dev_init;
+int dev_init_done[MAX_GPUS] = { 0 };
+
+__host__
+static int compu32(const void *pa, const void *pb)
+{
+	uint32_t a = *(uint32_t *)pa, b = *(uint32_t *)pb;
+	return a < b ? -1 : a == b ? 0 : +1;
+}
+
+__host__
+static bool duped(uint32_t* prf)
+{
+	uint32_t sortprf[512];
+	memcpy(sortprf, prf, sizeof(uint32_t) * 512);
+	qsort(sortprf, 512, sizeof(uint32_t), &compu32);
+	for (uint32_t i = 1; i < 512; i++)
+		if (sortprf[i] <= sortprf[i - 1])
+			return true;
+	return false;
+}
+
+__host__
+static void sort_pair(uint32_t *a, uint32_t len)
+{
+	uint32_t *b = a + len;
+	uint32_t tmp, need_sorting = 0;
+	for (uint32_t i = 0; i < len; i++)
+		if (need_sorting || a[i] > b[i])
+		{
+			need_sorting = 1;
+			tmp = a[i];
+			a[i] = b[i];
+			b[i] = tmp;
+		}
+		else if (a[i] < b[i])
+			return;
+}
+
+__host__
+static void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen)
+{
+	uint32_t le_N = WN;
+	uint32_t le_K = WK;
+	uchar personal[] = "ZcashPoW01230123";
+	memcpy(personal + 8, &le_N, 4);
+	memcpy(personal + 12, &le_K, 4);
+	blake2b_param P[1];
+	P->digest_length = HASHOUT;
+	P->key_length = 0;
+	P->fanout = 1;
+	P->depth = 1;
+	P->leaf_length = 0;
+	P->node_offset = 0;
+	P->node_depth = 0;
+	P->inner_length = 0;
+	memset(P->reserved, 0, sizeof(P->reserved));
+	memset(P->salt, 0, sizeof(P->salt));
+	memcpy(P->personal, (const uint8_t *)personal, 16);
+	eq_blake2b_init_param(ctx, P);
+	eq_blake2b_update(ctx, (const uchar *)header, headerLen);
+	if (nonceLen) eq_blake2b_update(ctx, (const uchar *)nce, nonceLen);
+}
+
+#ifdef WIN32
+typedef CUresult(CUDAAPI *dec_cuDeviceGet)(CUdevice*, int);
+typedef CUresult(CUDAAPI *dec_cuCtxCreate)(CUcontext*, unsigned int, CUdevice);
+typedef CUresult(CUDAAPI *dec_cuCtxPushCurrent)(CUcontext);
+typedef CUresult(CUDAAPI *dec_cuCtxDestroy)(CUcontext);
+
+dec_cuDeviceGet _cuDeviceGet = nullptr;
+dec_cuCtxCreate _cuCtxCreate = nullptr;
+dec_cuCtxPushCurrent _cuCtxPushCurrent = nullptr;
+dec_cuCtxDestroy _cuCtxDestroy = nullptr;
+#endif
+
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__ eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::eq_cuda_context(int thr_id, int dev_id)
+{
+	thread_id = thr_id;
+	device_id = dev_id;
+	solutions = nullptr;
+	equi_mem_sz = sizeof(equi<RB, SM>);
+	throughput = NBLOCKS;
+	totalblocks = NBLOCKS/FD_THREADS;
+	threadsperblock = FD_THREADS;
+	threadsperblock_digits = THREADS;
+
+	//dev_init.lock();
+	if (!dev_init_done[device_id])
+	{
+		// only first thread shall init device
+		checkCudaErrors(cudaSetDevice(device_id));
+		checkCudaErrors(cudaDeviceReset());
+		checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+
+		pctx = nullptr;
+	}
+	else
+	{
+		// create new context
+		CUdevice dev;
+
+#ifdef WIN32
+		if (_cuDeviceGet == nullptr)
+		{
+			HMODULE hmod = LoadLibraryA("nvcuda.dll");
+			if (hmod == NULL)
+				throw std::runtime_error("Failed to load nvcuda.dll");
+			_cuDeviceGet = (dec_cuDeviceGet)GetProcAddress(hmod, "cuDeviceGet");
+			if (_cuDeviceGet == nullptr)
+				throw std::runtime_error("Failed to get cuDeviceGet address");
+			_cuCtxCreate = (dec_cuCtxCreate)GetProcAddress(hmod, "cuCtxCreate_v2");
+			if (_cuCtxCreate == nullptr)
+				throw std::runtime_error("Failed to get cuCtxCreate address");
+			_cuCtxPushCurrent = (dec_cuCtxPushCurrent)GetProcAddress(hmod, "cuCtxPushCurrent_v2");
+			if (_cuCtxPushCurrent == nullptr)
+				throw std::runtime_error("Failed to get cuCtxPushCurrent address");
+			_cuCtxDestroy = (dec_cuCtxDestroy)GetProcAddress(hmod, "cuCtxDestroy_v2");
+			if (_cuCtxDestroy == nullptr)
+				throw std::runtime_error("Failed to get cuCtxDestroy address");
+		}
+
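+		// note: non-primary threads attach to the already initialized device by creating
+		// and pushing their own driver-API context; on Windows the driver entry points
+		// are resolved at run time from nvcuda.dll (loaded above)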
checkCudaDriverErrors(_cuDeviceGet(&dev, device_id)); + checkCudaDriverErrors(_cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev)); + checkCudaDriverErrors(_cuCtxPushCurrent(pctx)); +#else + checkCudaDriverErrors(cuDeviceGet(&dev, device_id)); + checkCudaDriverErrors(cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev)); + checkCudaDriverErrors(cuCtxPushCurrent(pctx)); +#endif + } + ++dev_init_done[device_id]; + //dev_init.unlock(); + + if (cudaMalloc((void**)&device_eq, equi_mem_sz) != cudaSuccess) + throw std::runtime_error("CUDA: failed to alloc memory"); + + solutions = (scontainerreal*) malloc(sizeof(scontainerreal)); + if (!solutions) + throw std::runtime_error("EOM: failed to alloc solutions memory"); +} + +template +__host__ void eq_cuda_context::solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + fn_cancel cancelf, + fn_solution solutionf, + fn_hashdone hashdonef) +{ + blake2b_state blake_ctx; + + int blocks = NBUCKETS; + + setheader(&blake_ctx, tequihash_header, tequihash_header_len, nonce, nonce_len); + + // todo: improve + // djezo solver allows last 4 bytes of nonce to be iterrated + // this can be used to create internal loop - calc initial blake hash only once, then load 8*8 bytes on device (blake state h) + // then just iterate nn++ + // less CPU load, 1 cudaMemcpy less -> faster + //u32 nn = *(u32*)&nonce[28]; + u32 nn = 0; + + checkCudaErrors(cudaMemcpy(&device_eq->blake_h, &blake_ctx.h, sizeof(u64) * 8, cudaMemcpyHostToDevice)); + + checkCudaErrors(cudaMemset(&device_eq->edata, 0, sizeof(device_eq->edata))); + + digit_first <<>>(device_eq, nn); + + digit_1 <<<4096, 512 >>>(device_eq); + digit_2 <<>>(device_eq); + digit_3 <<>>(device_eq); + + if (cancelf(thread_id)) return; + + digit_4 <<>>(device_eq); + digit_5 <<>>(device_eq); + + digit_6 <<>>(device_eq); + digit_7 <<>>(device_eq); + digit_8 <<>>(device_eq); + + digit_last_wdc <<<4096, 256 / 2 >>>(device_eq); + + checkCudaErrors(cudaMemcpy(solutions, &device_eq->edata.srealcont, (MAXREALSOLS * (512 * 4)) + 4, cudaMemcpyDeviceToHost)); + + //printf("T%d nsols: %u\n", thread_id, solutions->nsols); + //if (solutions->nsols > 9) + // printf("missing sol, total: %u\n", solutions->nsols); + + for (u32 s = 0; (s < solutions->nsols) && (s < MAXREALSOLS); s++) + { + // remove dups on CPU (dup removal on GPU is not fully exact and can pass on some invalid solutions) + if (duped(solutions->sols[s])) continue; + + // perform sort of pairs + for (uint32_t level = 0; level < 9; level++) + for (uint32_t i = 0; i < (1 << 9); i += (2 << level)) + sort_pair(&solutions->sols[s][i], 1 << level); + + std::vector index_vector(PROOFSIZE); + for (u32 i = 0; i < PROOFSIZE; i++) { + index_vector[i] = solutions->sols[s][i]; + } + + solutionf(thread_id, index_vector, DIGITBITS, nullptr); + } + + // ccminer: only use hashdonef if no solutions... 
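+	// every kept solution was already reported through solutionf() in the loop above,
+	// so this callback only signals a round that produced no solution at all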
+ if (!solutions->nsols) + hashdonef(thread_id); +} + +// destructor +template +__host__ +eq_cuda_context::~eq_cuda_context() +{ + if (solutions) + free(solutions); + + if (device_eq) { + cudaFree(device_eq); + device_eq = NULL; + } + + if (pctx) { + // non primary thread, destroy context +#ifdef WIN32 + checkCudaDriverErrors(_cuCtxDestroy(pctx)); +#else + checkCudaDriverErrors(cuCtxDestroy(pctx)); +#endif + } else { + checkCudaErrors(cudaDeviceReset()); + dev_init_done[device_id] = 0; + } +} + + +#ifdef CONFIG_MODE_1 +template class eq_cuda_context; +#endif + +#ifdef CONFIG_MODE_2 +template class eq_cuda_context; +#endif + +#ifdef CONFIG_MODE_3 +template class eq_cuda_context; +#endif diff --git a/equi/eqcuda.hpp b/equi/eqcuda.hpp new file mode 100644 index 0000000..68bdaf0 --- /dev/null +++ b/equi/eqcuda.hpp @@ -0,0 +1,136 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef WIN32 +#define _SNPRINTF _snprintf +#else +#define _SNPRINTF snprintf +#endif + +#ifndef nullptr +#define nullptr NULL +#endif + +#ifdef WIN32 +#define rt_error std::runtime_error +#else +class rt_error : public std::runtime_error +{ +public: + explicit rt_error(const std::string& str) : std::runtime_error(str) {} +}; +#endif + +#define checkCudaErrors(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + char errorBuff[512]; \ + _SNPRINTF(errorBuff, sizeof(errorBuff) - 1, \ + "CUDA error '%s' in func '%s' line %d", \ + cudaGetErrorString(err), __FUNCTION__, __LINE__); \ + throw rt_error(errorBuff); \ + } \ +} while (0) + +#define checkCudaDriverErrors(call) \ +do { \ + CUresult err = call; \ + if (CUDA_SUCCESS != err) { \ + char errorBuff[512]; \ + _SNPRINTF(errorBuff, sizeof(errorBuff) - 1, \ + "CUDA error DRIVER: '%d' in func '%s' line %d", \ + err, __FUNCTION__, __LINE__); \ + throw rt_error(errorBuff); \ + } \ +} while (0) + +typedef uint64_t u64; +typedef uint32_t u32; +typedef uint16_t u16; +typedef uint8_t u8; +typedef unsigned char uchar; + +struct packer_default; +struct packer_cantor; + +#define MAXREALSOLS 9 + +struct scontainerreal { + u32 sols[MAXREALSOLS][512]; + u32 nsols; +}; + +#if 0 +#include +#define fn_solution std::function&, size_t, const unsigned char*)> +#define fn_hashdone std::function +#define fn_cancel std::function +#else +typedef void (*fn_solution)(int thr_id, const std::vector&, size_t, const unsigned char*); +typedef void (*fn_hashdone)(int thr_id); +typedef bool (*fn_cancel)(int thr_id); +#endif + +template struct equi; + +// --------------------------------------------------------------------------------------------------- + +struct eq_cuda_context_interface +{ + virtual ~eq_cuda_context_interface(); + + virtual void solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + fn_cancel cancelf, + fn_solution solutionf, + fn_hashdone hashdonef); +public: + int thread_id; + int device_id; + int throughput; + int totalblocks; + int threadsperblock; + int threadsperblock_digits; + size_t equi_mem_sz; +}; + +// --------------------------------------------------------------------------------------------------- + +template +class eq_cuda_context : public eq_cuda_context_interface +{ + equi* device_eq; + scontainerreal* solutions; + CUcontext pctx; + + void solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + fn_cancel cancelf, + fn_solution solutionf, 
+ fn_hashdone hashdonef); + +public: + eq_cuda_context(int thr_id, int dev_id); + ~eq_cuda_context(); +}; + +// RB, SM, SSM, TPB, PACKER... but any change only here will fail.. +#define CONFIG_MODE_1 9, 1248, 12, 640, packer_cantor +//#define CONFIG_MODE_2 8, 640, 12, 512, packer_default diff --git a/equi/equi-stratum.cpp b/equi/equi-stratum.cpp new file mode 100644 index 0000000..c9f6587 --- /dev/null +++ b/equi/equi-stratum.cpp @@ -0,0 +1,241 @@ +/** + * Equihash specific stratum protocol + * tpruvot@github - 2017 - Part under GPLv3 Licence + */ + +#include +#include +#include +#include + +#include "equihash.h" + +extern struct stratum_ctx stratum; +extern pthread_mutex_t stratum_work_lock; + +// ZEC uses a different scale to compute diff... +// sample targets to diff (stored in the reverse byte order in work->target) +// 0007fff800000000000000000000000000000000000000000000000000000000 is stratum diff 32 +// 003fffc000000000000000000000000000000000000000000000000000000000 is stratum diff 4 +// 00ffff0000000000000000000000000000000000000000000000000000000000 is stratum diff 1 +double target_to_diff_equi(uint32_t* target) +{ + uchar* tgt = (uchar*) target; + uint64_t m = + (uint64_t)tgt[30] << 24 | + (uint64_t)tgt[29] << 16 | + (uint64_t)tgt[28] << 8 | + (uint64_t)tgt[27] << 0; + + if (!m) + return 0.; + else + return (double)0xffff0000UL/m; +} + +void diff_to_target_equi(uint32_t *target, double diff) +{ + uint64_t m; + int k; + + for (k = 6; k > 0 && diff > 1.0; k--) + diff /= 4294967296.0; + m = (uint64_t)(4294901760.0 / diff); + if (m == 0 && k == 6) + memset(target, 0xff, 32); + else { + memset(target, 0, 32); + target[k + 1] = (uint32_t)(m >> 8); + target[k + 2] = (uint32_t)(m >> 40); + //memset(target, 0xff, 6*sizeof(uint32_t)); + for (k = 0; k < 28 && ((uint8_t*)target)[k] == 0; k++) + ((uint8_t*)target)[k] = 0xff; + } +} + +/* compute nbits to get the network diff */ +double equi_network_diff(struct work *work) +{ + //KMD bits: "1e 015971", + //KMD target: "00 00 015971000000000000000000000000000000000000000000000000000000", + //KMD bits: "1d 686aaf", + //KMD target: "00 0000 686aaf0000000000000000000000000000000000000000000000000000", + uint32_t nbits = work->data[26]; + uint32_t bits = (nbits & 0xffffff); + int16_t shift = (swab32(nbits) & 0xff); + shift = (31 - shift) * 8; // 8 bits shift for 0x1e, 16 for 0x1d + uint64_t tgt64 = swab32(bits); + tgt64 = tgt64 << shift; + // applog_hex(&tgt64, 8); + uint8_t net_target[32] = { 0 }; + for (int b=0; b<8; b++) + net_target[31-b] = ((uint8_t*)&tgt64)[b]; + // applog_hex(net_target, 32); + double d = target_to_diff_equi((uint32_t*)net_target); + return d; +} + +void equi_work_set_target(struct work* work, double diff) +{ + // target is given as data by the equihash stratum + // memcpy(work->target, stratum.job.claim, 32); // claim field is only used for lbry + diff_to_target_equi(work->target, diff); + //applog(LOG_BLUE, "diff %f to target :", diff); + //applog_hex(work->target, 32); + work->targetdiff = diff; +} + +bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params) +{ + uint8_t target_bin[32], target_be[32]; + + const char *target_hex = json_string_value(json_array_get(params, 0)); + if (!target_hex || strlen(target_hex) == 0) + return false; + + hex2bin(target_bin, target_hex, 32); + memset(target_be, 0xff, 32); + int filled = 0; + for (int i=0; i<32; i++) { + if (filled == 3) break; + target_be[31-i] = target_bin[i]; + if (target_bin[i]) filled++; + } + memcpy(sctx->job.claim, target_be, 32); // hack, unused 
struct field + + pthread_mutex_lock(&stratum_work_lock); + sctx->next_diff = target_to_diff_equi((uint32_t*) &target_be); + pthread_mutex_unlock(&stratum_work_lock); + + //applog(LOG_BLUE, "low diff %f", sctx->next_diff); + //applog_hex(target_be, 32); + + return true; +} + +bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params) +{ + const char *job_id, *version, *prevhash, *coinb1, *coinb2, *nbits, *stime; + size_t coinb1_size, coinb2_size; + bool clean, ret = false; + int ntime, i, p=0; + job_id = json_string_value(json_array_get(params, p++)); + version = json_string_value(json_array_get(params, p++)); + prevhash = json_string_value(json_array_get(params, p++)); + coinb1 = json_string_value(json_array_get(params, p++)); //merkle + coinb2 = json_string_value(json_array_get(params, p++)); //blank (reserved) + stime = json_string_value(json_array_get(params, p++)); + nbits = json_string_value(json_array_get(params, p++)); + clean = json_is_true(json_array_get(params, p)); p++; + + if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || + strlen(prevhash) != 64 || strlen(version) != 8 || + strlen(coinb1) != 64 || strlen(coinb2) != 64 || + strlen(nbits) != 8 || strlen(stime) != 8) { + applog(LOG_ERR, "Stratum notify: invalid parameters"); + goto out; + } + + /* store stratum server time diff */ + hex2bin((uchar *)&ntime, stime, 4); + ntime = ntime - (int) time(0); + if (ntime > sctx->srvtime_diff) { + sctx->srvtime_diff = ntime; + if (opt_protocol && ntime > 20) + applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime); + } + + pthread_mutex_lock(&stratum_work_lock); + hex2bin(sctx->job.version, version, 4); + hex2bin(sctx->job.prevhash, prevhash, 32); + + coinb1_size = strlen(coinb1) / 2; + coinb2_size = strlen(coinb2) / 2; + sctx->job.coinbase_size = coinb1_size + coinb2_size + // merkle + reserved + sctx->xnonce1_size + sctx->xnonce2_size; // extranonce and... 
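+	// resulting "coinbase" layout for equihash stratum:
+	// [32-byte merkle root (coinb1)][32-byte reserved field (coinb2)][xnonce1 (pool prefix)][xnonce2]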
+ + sctx->job.coinbase = (uchar*) realloc(sctx->job.coinbase, sctx->job.coinbase_size); + hex2bin(sctx->job.coinbase, coinb1, coinb1_size); + hex2bin(sctx->job.coinbase + coinb1_size, coinb2, coinb2_size); + + sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + coinb2_size + sctx->xnonce1_size; + if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) + memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); + memcpy(sctx->job.coinbase + coinb1_size + coinb2_size, sctx->xnonce1, sctx->xnonce1_size); + + for (i = 0; i < sctx->job.merkle_count; i++) + free(sctx->job.merkle[i]); + free(sctx->job.merkle); + sctx->job.merkle = NULL; + sctx->job.merkle_count = 0; + + free(sctx->job.job_id); + sctx->job.job_id = strdup(job_id); + + hex2bin(sctx->job.nbits, nbits, 4); + hex2bin(sctx->job.ntime, stime, 4); + sctx->job.clean = clean; + + sctx->job.diff = sctx->next_diff; + pthread_mutex_unlock(&stratum_work_lock); + + ret = true; + +out: + return ret; +} + +void equi_store_work_solution(struct work* work, uint32_t* hash, void* sol_data) +{ + int nonce = work->valid_nonces-1; + memcpy(work->extra, sol_data, 1347); + bn_store_hash_target_ratio(hash, work->target, work, nonce); + //work->sharediff[nonce] = target_to_diff_equi(hash); +} + +#define JSON_SUBMIT_BUF_LEN (4*1024) +// called by submit_upstream_work() +bool equi_stratum_submit(struct pool_infos *pool, struct work *work) +{ + char _ALIGN(64) s[JSON_SUBMIT_BUF_LEN]; + char _ALIGN(64) timehex[16] = { 0 }; + char *jobid, *noncestr, *solhex; + int idnonce = work->submit_nonce_id; + + // scanned nonce + work->data[EQNONCE_OFFSET] = work->nonces[idnonce]; + unsigned char * nonce = (unsigned char*) (&work->data[27]); + size_t nonce_len = 32 - stratum.xnonce1_size; + // long nonce without pool prefix (extranonce) + noncestr = bin2hex(&nonce[stratum.xnonce1_size], nonce_len); + + solhex = (char*) calloc(1, 1344*2 + 64); + if (!solhex || !noncestr) { + applog(LOG_ERR, "unable to alloc share memory"); + return false; + } + cbin2hex(solhex, (const char*) work->extra, 1347); + + jobid = work->job_id + 8; + sprintf(timehex, "%08x", swab32(work->data[25])); + + snprintf(s, sizeof(s), "{\"method\":\"mining.submit\",\"params\":" + "[\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"], \"id\":%u}", + pool->user, jobid, timehex, noncestr, solhex, + stratum.job.shares_count + 10); + + free(solhex); + free(noncestr); + + gettimeofday(&stratum.tv_submit, NULL); + + if(!stratum_send_line(&stratum, s)) { + applog(LOG_ERR, "%s stratum_send_line failed", __func__); + return false; + } + + stratum.sharediff = work->sharediff[idnonce]; + stratum.job.shares_count++; + + return true; +} diff --git a/equi/equi.cpp b/equi/equi.cpp new file mode 100644 index 0000000..0a1d919 --- /dev/null +++ b/equi/equi.cpp @@ -0,0 +1,171 @@ +/* + * Port to Generic C of C++ implementation of the Equihash Proof-of-Work + * algorithm from zcashd. + * + * Copyright (c) 2016 abc at openwall dot com + * Copyright (c) 2016 Jack Grigg + * Copyright (c) 2016 The Zcash developers + * Copyright (c) 2017 tpruvot + * + * Distributed under the MIT software license, see the accompanying + * file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+ */ + +#include +#include +#include +#include + +#include "equihash.h" + +//#define USE_LIBSODIUM + +#ifdef USE_LIBSODIUM +#include "sodium.h" +#define blake2b_state crypto_generichash_blake2b_state +#else +#include "blake2/blake2.h" +#define be32toh(x) swab32(x) +#define htole32(x) (x) +#define HASHOUT 50 +#endif + +#include + +static void digestInit(blake2b_state *S, const uint32_t n, const uint32_t k) +{ + uint32_t le_N = htole32(n); + uint32_t le_K = htole32(k); +#ifdef USE_LIBSODIUM + uint8_t personalization[crypto_generichash_blake2b_PERSONALBYTES] = { 0 }; + + memcpy(personalization, "ZcashPoW", 8); + memcpy(personalization + 8, &le_N, 4); + memcpy(personalization + 12, &le_K, 4); + + crypto_generichash_blake2b_init_salt_personal(S, + NULL, 0, (512 / n) * n / 8, NULL, personalization); +#else + unsigned char personal[] = "ZcashPoW01230123"; + memcpy(personal + 8, &le_N, 4); + memcpy(personal + 12, &le_K, 4); + blake2b_param P[1]; + P->digest_length = HASHOUT; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + P->leaf_length = 0; + P->node_offset = 0; + P->node_depth = 0; + P->inner_length = 0; + memset(P->reserved, 0, sizeof(P->reserved)); + memset(P->salt, 0, sizeof(P->salt)); + memcpy(P->personal, (const uint8_t *)personal, 16); + eq_blake2b_init_param(S, P); +#endif +} + +static void expandArray(const unsigned char *in, const uint32_t in_len, + unsigned char *out, const uint32_t out_len, + const uint32_t bit_len, const uint32_t byte_pad) +{ + assert(bit_len >= 8); + assert(8 * sizeof(uint32_t) >= 7 + bit_len); + + const uint32_t out_width = (bit_len + 7) / 8 + byte_pad; + assert(out_len == 8 * out_width * in_len / bit_len); + + const uint32_t bit_len_mask = ((uint32_t)1 << bit_len) - 1; + + // The acc_bits least-significant bits of acc_value represent a bit sequence + // in big-endian order. + uint32_t acc_bits = 0; + uint32_t acc_value = 0; + uint32_t j = 0; + + for (uint32_t i = 0; i < in_len; i++) + { + acc_value = (acc_value << 8) | in[i]; + acc_bits += 8; + + // When we have bit_len or more bits in the accumulator, write the next + // output element. + if (acc_bits >= bit_len) { + acc_bits -= bit_len; + for (uint32_t x = 0; x < byte_pad; x++) { + out[j + x] = 0; + } + for (uint32_t x = byte_pad; x < out_width; x++) { + out[j + x] = ( + // Big-endian + acc_value >> (acc_bits + (8 * (out_width - x - 1))) + ) & ( + // Apply bit_len_mask across byte boundaries + (bit_len_mask >> (8 * (out_width - x - 1))) & 0xFF + ); + } + j += out_width; + } + } +} + +static void generateHash(blake2b_state *S, const uint32_t g, uint8_t *hash, const size_t hashLen) +{ + const uint32_t le_g = htole32(g); + blake2b_state digest = *S; /* copy */ +#ifdef USE_LIBSODIUM + crypto_generichash_blake2b_update(&digest, (uint8_t *)&le_g, sizeof(le_g)); + crypto_generichash_blake2b_final(&digest, hash, hashLen); +#else + eq_blake2b_update(&digest, (const uint8_t*) &le_g, sizeof(le_g)); + eq_blake2b_final(&digest, hash, (uint8_t) (hashLen & 0xFF)); +#endif +} + +static int isZero(const uint8_t *hash, size_t len) +{ + // This doesn't need to be constant time. 
+ for (size_t i = 0; i < len; i++) { + if (hash[i] != 0) return 0; + } + return 1; +} + +// hdr -> header including nonce (140 bytes) +// soln -> equihash solution (excluding 3 bytes with size, so 1344 bytes long) +bool equi_verify(uint8_t* const hdr, uint8_t* const soln) +{ + const uint32_t n = WN; // 200 + const uint32_t k = WK; // 9 + const uint32_t collisionBitLength = n / (k + 1); + const uint32_t collisionByteLength = (collisionBitLength + 7) / 8; + const uint32_t hashLength = (k + 1) * collisionByteLength; + const uint32_t indicesPerHashOutput = 512 / n; + const uint32_t hashOutput = indicesPerHashOutput * n / 8; + const uint32_t equihashSolutionSize = (1 << k) * (n / (k + 1) + 1) / 8; + const uint32_t solnr = 1 << k; + + uint32_t indices[512] = { 0 }; + uint8_t vHash[hashLength] = { 0 }; + + blake2b_state state; + digestInit(&state, n, k); +#ifdef USE_LIBSODIUM + crypto_generichash_blake2b_update(&state, hdr, 140); +#else + eq_blake2b_update(&state, hdr, 140); +#endif + + expandArray(soln, equihashSolutionSize, (uint8_t*) &indices, sizeof(indices), collisionBitLength + 1, 1); + + for (uint32_t j = 0; j < solnr; j++) { + uint8_t tmpHash[hashOutput]; + uint8_t hash[hashLength]; + uint32_t i = be32toh(indices[j]); + generateHash(&state, i / indicesPerHashOutput, tmpHash, hashOutput); + expandArray(tmpHash + (i % indicesPerHashOutput * n / 8), n / 8, hash, hashLength, collisionBitLength, 0); + for (uint32_t k = 0; k < hashLength; k++) + vHash[k] ^= hash[k]; + } + return isZero(vHash, sizeof(vHash)); +} diff --git a/equi/equihash.cpp b/equi/equihash.cpp new file mode 100644 index 0000000..2a6e514 --- /dev/null +++ b/equi/equihash.cpp @@ -0,0 +1,294 @@ +/** + * Equihash solver interface for ccminer (compatible with linux and windows) + * Solver taken from nheqminer, by djeZo (and NiceHash) + * tpruvot - 2017 (GPL v3) + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <vector> +#include <stdexcept> + +#include <miner.h> + +#include "eqcuda.hpp" +#include "equihash.h" // equi_verify() + +#include <sph/sph_sha2.h> + +// All solutions (BLOCK_HEADER_LEN + SOLSIZE_LEN + SOL_LEN) sha256d should be under the target +extern "C" void equi_hash(const void* input, void* output, int len) +{ + uint8_t _ALIGN(64) hash0[32], hash1[32]; + + sph_sha256_context ctx_sha256; + + sph_sha256_init(&ctx_sha256); + sph_sha256(&ctx_sha256, input, len); + sph_sha256_close(&ctx_sha256, hash0); + sph_sha256(&ctx_sha256, hash0, 32); + sph_sha256_close(&ctx_sha256, hash1); + + memcpy(output, hash1, 32); +} + +// input here is 140 for the header and 1344 for the solution (equi.cpp) +extern "C" int equi_verify_sol(void * const hdr, void * const sol) +{ + bool res = equi_verify((uint8_t*) hdr, (uint8_t*) sol); + + //applog_hex((void*)hdr, 140); + //applog_hex((void*)sol, 1344); + + return res ?
1 : 0; +} + +#include <assert.h> + +//#define EQNONCE_OFFSET 30 /* 27:34 */ +#define NONCE_OFT EQNONCE_OFFSET + +static bool init[MAX_GPUS] = { 0 }; +static int valid_sols[MAX_GPUS] = { 0 }; +static uint8_t _ALIGN(64) data_sols[MAX_GPUS][MAXREALSOLS][1536] = { 0 }; // 140+3+1344 required +static eq_cuda_context_interface* solvers[MAX_GPUS] = { NULL }; + +static void CompressArray(const unsigned char* in, size_t in_len, + unsigned char* out, size_t out_len, size_t bit_len, size_t byte_pad) +{ + assert(bit_len >= 8); + assert(8 * sizeof(uint32_t) >= 7 + bit_len); + + size_t in_width = (bit_len + 7) / 8 + byte_pad; + assert(out_len == bit_len*in_len / (8 * in_width)); + + uint32_t bit_len_mask = (1UL << bit_len) - 1; + + // The acc_bits least-significant bits of acc_value represent a bit sequence + // in big-endian order. + size_t acc_bits = 0; + uint32_t acc_value = 0; + + size_t j = 0; + for (size_t i = 0; i < out_len; i++) { + // When we have fewer than 8 bits left in the accumulator, read the next + // input element. + if (acc_bits < 8) { + acc_value = acc_value << bit_len; + for (size_t x = byte_pad; x < in_width; x++) { + acc_value = acc_value | ( + ( + // Apply bit_len_mask across byte boundaries + in[j + x] & ((bit_len_mask >> (8 * (in_width - x - 1))) & 0xFF) + ) << (8 * (in_width - x - 1))); // Big-endian + } + j += in_width; + acc_bits += bit_len; + } + + acc_bits -= 8; + out[i] = (acc_value >> acc_bits) & 0xFF; + } +} + +#ifndef htobe32 +#define htobe32(x) swab32(x) +#endif + +static void EhIndexToArray(const u32 i, unsigned char* arr) +{ + u32 bei = htobe32(i); + memcpy(arr, &bei, sizeof(u32)); +} + +static std::vector<unsigned char> GetMinimalFromIndices(std::vector<uint32_t> indices, size_t cBitLen) +{ + assert(((cBitLen + 1) + 7) / 8 <= sizeof(u32)); + size_t lenIndices = indices.size()*sizeof(u32); + size_t minLen = (cBitLen + 1)*lenIndices / (8 * sizeof(u32)); + size_t bytePad = sizeof(u32) - ((cBitLen + 1) + 7) / 8; + std::vector<unsigned char> array(lenIndices); + for (size_t i = 0; i < indices.size(); i++) { + EhIndexToArray(indices[i], array.data() + (i*sizeof(u32))); + } + std::vector<unsigned char> ret(minLen); + CompressArray(array.data(), lenIndices, ret.data(), minLen, cBitLen + 1, bytePad); + return ret; +} + +// solver callbacks +static void cb_solution(int thr_id, const std::vector<uint32_t>& solutions, size_t cbitlen, const unsigned char *compressed_sol) +{ + std::vector<unsigned char> nSolution; + if (!compressed_sol) { + nSolution = GetMinimalFromIndices(solutions, cbitlen); + } else { + gpulog(LOG_INFO, thr_id, "compressed_sol"); + nSolution = std::vector<unsigned char>(1344); + for (size_t i = 0; i < cbitlen; i++) + nSolution[i] = compressed_sol[i]; + } + int nsol = valid_sols[thr_id]; + if (nsol < 0) nsol = 0; + if(nSolution.size() == 1344) { + // todo, only store solution data here...
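+ // data_sols slot layout: 140-byte block header, 3-byte CompactSize length (fd 40 05 = 1344), then the 1344-byte solution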
+ le32enc(&data_sols[thr_id][nsol][140], 0x000540fd); // sol sz header + memcpy(&data_sols[thr_id][nsol][143], nSolution.data(), 1344); + valid_sols[thr_id] = nsol + 1; + } +} +static void cb_hashdone(int thr_id) { + if (!valid_sols[thr_id]) valid_sols[thr_id] = -1; +} +static bool cb_cancel(int thr_id) { + if (work_restart[thr_id].restart) + valid_sols[thr_id] = -1; + return work_restart[thr_id].restart; +} + +extern "C" int scanhash_equihash(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[35]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[NONCE_OFT]; + uint32_t nonce_increment = rand() & 0xFF; // nonce randomizer + struct timeval tv_start, tv_end, diff; + double secs, solps; + uint32_t soluce_count = 0; + + if (opt_benchmark) + ptarget[7] = 0xfffff; + + if (!init[thr_id]) { + try { + int mode = 1; + switch (mode) { + case 1: + solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_1>(thr_id, device_map[thr_id]); + break; +#ifdef CONFIG_MODE_2 + case 2: + solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_2>(thr_id, device_map[thr_id]); + break; +#endif +#ifdef CONFIG_MODE_3 + case 3: + solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_3>(thr_id, device_map[thr_id]); + break; +#endif + default: + proper_exit(EXIT_CODE_SW_INIT_ERROR); + return -1; + } + size_t memSz = solvers[thr_id]->equi_mem_sz / (1024*1024); + gpus_intensity[thr_id] = (uint32_t) solvers[thr_id]->throughput; + api_set_throughput(thr_id, gpus_intensity[thr_id]); + gpulog(LOG_DEBUG, thr_id, "Allocated %u MB of context memory", (u32) memSz); + cuda_get_arch(thr_id); + init[thr_id] = true; + } catch (const std::exception & e) { + CUDA_LOG_ERROR(); + gpulog(LOG_ERR, thr_id, "init: %s", e.what()); + proper_exit(EXIT_CODE_CUDA_ERROR); + } + } + + gettimeofday(&tv_start, NULL); + memcpy(endiandata, pdata, 140); + work->valid_nonces = 0; + + do { + + try { + + valid_sols[thr_id] = 0; + solvers[thr_id]->solve( + (const char *) endiandata, (unsigned int) (140 - 32), + (const char *) &endiandata[27], (unsigned int) 32, + &cb_cancel, &cb_solution, &cb_hashdone + ); + + *hashes_done = soluce_count; + + } catch (const std::exception & e) { + gpulog(LOG_WARNING, thr_id, "solver: %s", e.what()); + free_equihash(thr_id); + sleep(1); + return -1; + } + + if (valid_sols[thr_id] > 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + uint8_t _ALIGN(64) full_data[140+3+1344] = { 0 }; + uint8_t* sol_data = &full_data[140]; + + soluce_count += valid_sols[thr_id]; + + for (int nsol=0; nsol < valid_sols[thr_id]; nsol++) + { + memcpy(full_data, endiandata, 140); + memcpy(sol_data, &data_sols[thr_id][nsol][140], 1347); + equi_hash(full_data, vhash, 140+3+1344); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + bool valid = equi_verify_sol(endiandata, &sol_data[3]); + if (valid && work->valid_nonces < MAX_NONCES) { + work->valid_nonces++; + memcpy(work->data, endiandata, 140); + equi_store_work_solution(work, vhash, sol_data); + work->nonces[work->valid_nonces-1] = endiandata[NONCE_OFT]; + pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1; + //applog_hex(vhash, 32); + //applog_hex(&work->data[27], 32); + goto out; // second solution storage not handled..
+ } + } + if (work->valid_nonces == MAX_NONCES) goto out; + } + if (work->valid_nonces) + goto out; + + valid_sols[thr_id] = 0; + } + + endiandata[NONCE_OFT] += nonce_increment; + + } while (!work_restart[thr_id].restart); + +out: + gettimeofday(&tv_end, NULL); + timeval_subtract(&diff, &tv_end, &tv_start); + secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec); + solps = (double)soluce_count / secs; + gpulog(LOG_DEBUG, thr_id, "%d solutions in %.2f s (%.2f Sol/s)", soluce_count, secs, solps); + + // report the solution count as hashes done (displayed as Sol/s) + *hashes_done = soluce_count; + + pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1; + + return work->valid_nonces; +} + +// cleanup +void free_equihash(int thr_id) +{ + if (!init[thr_id]) + return; + + delete(solvers[thr_id]); + solvers[thr_id] = NULL; + + init[thr_id] = false; +} + +// default no-op implementations for the abstract solver interface +void eq_cuda_context_interface::solve(const char *tequihash_header, unsigned int tequihash_header_len, + const char* nonce, unsigned int nonce_len, + fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef) { } +eq_cuda_context_interface::~eq_cuda_context_interface() { } diff --git a/equi/equihash.h b/equi/equihash.h new file mode 100644 index 0000000..cdf47fa --- /dev/null +++ b/equi/equihash.h @@ -0,0 +1,19 @@ +#ifndef EQUIHASH_H +#define EQUIHASH_H + +#include <stdint.h> + +// miner nonce "cursor" unique for each thread +#define EQNONCE_OFFSET 30 /* 27:34 */ + +#define WK 9 +#define WN 200 +//#define CONFIG_MODE_1 9, 1248, 12, 640, packer_cantor /* eqcuda.hpp */ + +extern "C" { + void equi_hash(const void* input, void* output, int len); + int equi_verify_sol(void* const hdr, void* const soln); + bool equi_verify(uint8_t* const hdr, uint8_t* const soln); +} + +#endif diff --git a/miner.h b/miner.h index f720bc7..307ff7d 100644 --- a/miner.h +++ b/miner.h @@ -282,6 +282,7 @@ extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonc extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -340,6 +341,7 @@ extern void free_cryptolight(int thr_id); extern void free_cryptonight(int thr_id); extern void free_decred(int thr_id); extern void free_deep(int thr_id); +extern void free_equihash(int thr_id); extern void free_keccak256(int thr_id); extern void free_fresh(int thr_id); extern void free_fugue256(int thr_id); @@ -604,6 +606,7 @@ void cuda_clear_lasterror(); #define CL_WHT "\x1B[01;37m" /* white */ extern void format_hashrate(double hashrate, char *output); +extern void format_hashrate_unit(double hashrate, char *output, const char* unit); extern void applog(int prio, const char *fmt, ...); extern void gpulog(int prio, int thr_id, const char *fmt, ...); @@ -679,6 +682,7 @@ struct stratum_ctx { time_t tm_connected; int rpc2; + int is_equihash; int srvtime_diff; }; @@ -722,6 +726,8 @@ struct work { /* pok getwork txs */ uint32_t tx_count; struct tx txs[POK_MAX_TXS]; + // zec
solution + uint8_t extra[1388]; }; #define POK_BOOL_MASK 0x00008000 @@ -804,6 +810,14 @@ void stratum_free_job(struct stratum_ctx *sctx); bool rpc2_stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); +bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params); +bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params); +bool equi_stratum_submit(struct pool_infos *pool, struct work *work); +void equi_work_set_target(struct work* work, double diff); +void equi_store_work_solution(struct work* work, uint32_t* hash, void* sol_data); +int equi_verify_sol(void * const hdr, void * const sol); +double equi_network_diff(struct work *work); + void hashlog_remember_submit(struct work* work, uint32_t nonce); void hashlog_remember_scan_range(struct work* work); double hashlog_get_sharediff(char* jobid, int idnonce, double defvalue); diff --git a/util.cpp b/util.cpp index 7c2878b..e15fbf6 100644 --- a/util.cpp +++ b/util.cpp @@ -211,35 +211,31 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0) #endif } -void format_hashrate(double hashrate, char *output) +void format_hashrate_unit(double hashrate, char *output, const char *unit) { - char prefix = '\0'; + char prefix[2] = { 0, 0 }; if (hashrate < 10000) { // nop } else if (hashrate < 1e7) { - prefix = 'k'; + prefix[0] = 'k'; hashrate *= 1e-3; } else if (hashrate < 1e10) { - prefix = 'M'; + prefix[0] = 'M'; hashrate *= 1e-6; } else if (hashrate < 1e13) { - prefix = 'G'; + prefix[0] = 'G'; hashrate *= 1e-9; } else { - prefix = 'T'; + prefix[0] = 'T'; hashrate *= 1e-12; } - sprintf( - output, - prefix ? "%.2f %cH/s" : "%.2f H/s%c", - hashrate, prefix - ); + sprintf(output, "%.2f %s%s", hashrate, prefix, unit); } static void databuf_free(struct data_buffer *db) @@ -1179,14 +1175,27 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i } xn2_size = (int) json_integer_value(json_array_get(params, pndx+1)); if (!xn2_size) { - applog(LOG_ERR, "Failed to get extranonce2_size"); - goto out; + char algo[64] = { 0 }; + get_currentalgo(algo, sizeof(algo)); + if (strcmp(algo, "equihash") == 0) { + int xn1_size = (int)strlen(xnonce1) / 2; + xn2_size = 32 - xn1_size; + if (xn1_size < 4 || xn1_size > 12) { + // This miner iterates the nonces at data32[30] + applog(LOG_ERR, "Unsupported extranonce size of %d (12 maxi)", xn1_size); + goto out; + } + goto skip_n2; + } else { + applog(LOG_ERR, "Failed to get extranonce2_size"); + goto out; + } } if (xn2_size < 2 || xn2_size > 16) { - applog(LOG_INFO, "Failed to get valid n2size in parse_extranonce"); + applog(LOG_ERR, "Failed to get valid n2size in parse_extranonce (%d)", xn2_size); goto out; } - +skip_n2: pthread_mutex_lock(&stratum_work_lock); if (sctx->xnonce1) free(sctx->xnonce1); @@ -1441,6 +1450,10 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) get_currentalgo(algo, sizeof(algo)); bool has_claim = !strcasecmp(algo, "lbry"); + if (sctx->is_equihash) { + return equi_stratum_notify(sctx, params); + } + job_id = json_string_value(json_array_get(params, p++)); prevhash = json_string_value(json_array_get(params, p++)); if (has_claim) { @@ -1842,6 +1855,11 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) ret = stratum_set_difficulty(sctx, params); goto out; } + if (!strcasecmp(method, "mining.set_target")) { + sctx->is_equihash = true; + ret = equi_stratum_set_target(sctx, params); + goto out; + } if (!strcasecmp(method, "mining.set_extranonce")) { ret = stratum_parse_extranonce(sctx, 
params, 0); goto out;