Interface nicehash djeZo equihash solver (squashed)

Todo: - send block height via stratum protocol (encoded in jobid?) - remove equi/blake2 cpu algorithm to use common one the extranonce imcompatibility is related to the solver nonce data, offsets may be reversed in nheqminer, to check... The solver was adapted for SM 3.0+ support (no perf changes) Note: The solver was not improved on purpose, to be able compare the two miners performances (nheqminer 0.5c the last open sourced, and ccminer) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> stratum: code cleanup, move equi fns in equi folder
8 years ago · 8cf21599d4
24 changed files with 4422 additions and 24 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -21,6 +21,8 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  crc32.c hefty1.c \
 			  ccminer.cpp pools.cpp util.cpp bench.cpp bignum.cpp \
 			  api.cpp hashlog.cpp nvml.cpp stats.cpp sysinfos.cpp cuda.cpp \
 			  equi/equi-stratum.cpp equi/equi.cpp equi/blake2/blake2bx.cpp \
 			  equi/equihash.cpp equi/cuda_equi.cu \
 			  heavy/heavy.cu \
 			  heavy/cuda_blake512.cu heavy/cuda_blake512.h \
 			  heavy/cuda_combine.cu heavy/cuda_combine.h \
@ -97,6 +99,10 @@ ccminer_LDFLAGS += -L/usr/local/llvm/lib
 ccminer_LDADD += -lomp
 endif
 #ccminer_CPPFLAGS += -DUSE_LIBSODIUM
 #ccminer_LDFLAGS += -Lequi/lib
 #ccminer_LDADD += -lsodium
 ccminer_LDADD += -lcuda
 nvcc_ARCH  = -gencode=arch=compute_50,code=\"sm_50,compute_50\"
--- a/algos.h
+++ b/algos.h
@ -16,6 +16,7 @@ enum sha_algos {
 	ALGO_DEEP,
 	ALGO_DECRED,
 	ALGO_DMD_GR,
 	ALGO_EQUIHASH,
 	ALGO_FRESH,
 	ALGO_FUGUE256,		/* Fugue256 */
 	ALGO_GROESTL,
@ -78,6 +79,7 @@ static const char *algo_names[] = {
 	"deep",
 	"decred",
 	"dmd-gr",
 	"equihash",
 	"fresh",
 	"fugue256",
 	"groestl",
@ -149,6 +151,8 @@ static inline int algo_to_int(char* arg)
 			i = ALGO_C11;
 		else if (!strcasecmp("diamond", arg))
 			i = ALGO_DMD_GR;
 		else if (!strcasecmp("equi", arg))
 			i = ALGO_EQUIHASH;
 		else if (!strcasecmp("doom", arg))
 			i = ALGO_LUFFA;
 		else if (!strcasecmp("hmq17", arg))
--- a/api.cpp
+++ b/api.cpp
@ -1341,6 +1341,7 @@ void api_set_throughput(int thr_id, uint32_t throughput)
 	if (thr_id < MAX_GPUS && thr_info) {
 		struct cgpu_info *cgpu = &thr_info[thr_id].gpu;
 		cgpu->intensity = throughput2intensity(throughput);
 		if (cgpu->throughput != throughput) cgpu->throughput = throughput;
 	}
 	// to display in bench results
 	if (opt_benchmark)
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -44,6 +44,7 @@
 #include "algos.h"
 #include "sia/sia-rpc.h"
 #include "crypto/xmr-rpc.h"
 #include "equi/equihash.h"
 #include <cuda_runtime.h>
@ -243,6 +244,7 @@ Options:\n\
 			c11/flax    X11 variant\n\
 			decred      Decred Blake256\n\
 			deep        Deepcoin\n\
 			equihash    Zcash Equihash\n\
 			dmd-gr      Diamond-Groestl\n\
 			fresh       Freshcoin (shavite 80)\n\
 			fugue256    Fuguecoin\n\
@ -558,6 +560,14 @@ void get_currentalgo(char* buf, int sz)
 	snprintf(buf, sz, "%s", algo_names[opt_algo]);
 }
 void format_hashrate(double hashrate, char *output)
 {
 	if (opt_algo == ALGO_EQUIHASH)
 		format_hashrate_unit(hashrate, output, "Sol/s");
 	else
 		format_hashrate_unit(hashrate, output, "H/s");
 }
 /**
 * Exit app
 */
@ -634,6 +644,10 @@ static void calc_network_diff(struct work *work)
 	if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]);
 	if (opt_algo == ALGO_DECRED) nbits = work->data[29];
 	if (opt_algo == ALGO_SIA) nbits = work->data[11]; // unsure if correct
 	if (opt_algo == ALGO_EQUIHASH) {
 		net_diff = equi_network_diff(work);
 		return;
 	}
 	uint32_t bits = (nbits & 0xffffff);
 	int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
@ -856,6 +870,17 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		return true;
 	}
 	if (pool->type & POOL_STRATUM && stratum.is_equihash) {
 		struct work submit_work;
 		memcpy(&submit_work, work, sizeof(struct work));
 		//if (!hashlog_already_submittted(submit_work.job_id, submit_work.nonces[idnonce])) {
 			if (equi_stratum_submit(pool, &submit_work))
 				hashlog_remember_submit(&submit_work, submit_work.nonces[idnonce]);
 			stratum.job.shares_count++;
 		//}
 		return true;
 	}
 	/* discard if a newer block was received */
 	stale_work = work->height && work->height < g_work.height;
 	if (have_stratum && !stale_work && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) {
@ -1508,6 +1533,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	/* Generate merkle root */
 	switch (opt_algo) {
 		case ALGO_DECRED:
 		case ALGO_EQUIHASH:
 		case ALGO_SIA:
 			// getwork over stratum, no merkle to generate
 			break;
@ -1566,6 +1592,13 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		memcpy(&work->data[44], &sctx->job.coinbase[sctx->job.coinbase_size-4], 4);
 		sctx->job.height = work->data[32];
 		//applog_hex(work->data, 180);
 	} else if (opt_algo == ALGO_EQUIHASH) {
 		memcpy(&work->data[9], sctx->job.coinbase, 32+32); // merkle [9..16] + reserved
 		work->data[25] = le32dec(sctx->job.ntime);
 		work->data[26] = le32dec(sctx->job.nbits);
 		memcpy(&work->data[27], sctx->xnonce1, sctx->xnonce1_size & 0x1F); // pool extranonce
 		work->data[35] = 0x80;
 		//applog_hex(work->data, 140);
 	} else if (opt_algo == ALGO_LBRY) {
 		for (i = 0; i < 8; i++)
 			work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
@ -1618,7 +1651,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	pthread_mutex_unlock(&stratum_work_lock);
-	if (opt_debug && opt_algo != ALGO_DECRED && opt_algo != ALGO_SIA) {
+	if (opt_debug && opt_algo != ALGO_DECRED && opt_algo != ALGO_EQUIHASH && opt_algo != ALGO_SIA) {
 		uint32_t utm = work->data[17];
 		if (opt_algo != ALGO_ZR5) utm = swab32(utm);
 		char *tm = atime2str(utm - sctx->srvtime_diff);
@ -1656,6 +1689,9 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		case ALGO_LYRA2:
 			work_set_target(work, sctx->job.diff / (128.0 * opt_difficulty));
 			break;
 		case ALGO_EQUIHASH:
 			equi_work_set_target(work, sctx->job.diff / opt_difficulty);
 			break;
 		default:
 			work_set_target(work, sctx->job.diff / opt_difficulty);
 	}
@ -1830,6 +1866,9 @@ static void *miner_thread(void *userdata)
 		} else if (opt_algo == ALGO_CRYPTOLIGHT || opt_algo == ALGO_CRYPTONIGHT) {
 			nonceptr = (uint32_t*) (((char*)work.data) + 39);
 			wcmplen = 39;
 		} else if (opt_algo == ALGO_EQUIHASH) {
 			nonceptr = &work.data[EQNONCE_OFFSET]; // 27 is pool extranonce (256bits nonce space)
 			wcmplen = 4+32+32;
 		}
 		if (have_stratum) {
@ -1962,6 +2001,10 @@ static void *miner_thread(void *userdata)
 			nonceptr[1] += 1;
 			nonceptr[2] |= thr_id;
 		} else if (opt_algo == ALGO_EQUIHASH) {
 			nonceptr[1]++;
 			nonceptr[1] |= thr_id << 24;
 			//applog_hex(&work.data[27], 32);
 		} else if (opt_algo == ALGO_WILDKECCAK) {
 			//nonceptr[1] += 1;
 		} else if (opt_algo == ALGO_SIA) {
@ -2264,6 +2307,9 @@ static void *miner_thread(void *userdata)
 		case ALGO_DEEP:
 			rc = scanhash_deep(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_EQUIHASH:
 			rc = scanhash_equihash(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_FRESH:
 			rc = scanhash_fresh(thr_id, &work, max_nonce, &hashes_done);
 			break;
@ -2431,7 +2477,7 @@ static void *miner_thread(void *userdata)
 				work.nonces[1] = nonceptr[2];
 		}
-		if (stratum.rpc2 && rc == -EBUSY || work_restart[thr_id].restart) {
+		if (stratum.rpc2 && (rc == -EBUSY || work_restart[thr_id].restart)) {
 			// bbr scratchpad download or stale result
 			sleep(1);
 			if (!thr_id) pools[cur_pooln].wait_time += 1;
@ -2489,7 +2535,7 @@ static void *miner_thread(void *userdata)
 		}
 		// only required to debug purpose
-		if (opt_debug && check_dups && opt_algo != ALGO_DECRED && opt_algo != ALGO_SIA)
+		if (opt_debug && check_dups && opt_algo != ALGO_DECRED && opt_algo != ALGO_EQUIHASH && opt_algo != ALGO_SIA)
 			hashlog_remember_scan_range(&work);
 		/* output */
@ -3854,6 +3900,10 @@ int main(int argc, char *argv[])
 		allow_mininginfo = false;
 	}
 	if (opt_algo == ALGO_EQUIHASH) {
 		opt_extranonce = false; // disable subscribe
 	}
 	if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) {
 		rpc2_init();
 		if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0");
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -156,7 +156,7 @@
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>true</Keep>
      <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_20,sm_21</CodeGeneration>
-      <AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions>--ptxas-options="-O2" --Wno-deprecated-gpu-targets %(AdditionalOptions)</AdditionalOptions>
      <Optimization>O2</Optimization>
    </CudaCompile>
    <CudaLink>
@ -199,9 +199,10 @@
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>true</Keep>
      <CodeGeneration>compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_20,sm_21</CodeGeneration>
-      <Include>$(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99</Include>
+      <Include>$(NVTOOLSEXT_PATH)\include</Include>
      <Optimization>O3</Optimization>
      <TargetMachinePlatform>64</TargetMachinePlatform>
      <AdditionalOptions>--Wno-deprecated-gpu-targets %(AdditionalOptions)</AdditionalOptions>
    </CudaCompile>
    <CudaLink>
      <Optimization>O3</Optimization>
@ -235,6 +236,12 @@
    <ClCompile Include="crypto\cryptolight-cpu.cpp" />
    <ClCompile Include="crypto\cryptonight-cpu.cpp" />
    <ClCompile Include="crypto\cpu\c_keccak.c" />
    <ClCompile Include="equi\blake2\blake2bx.cpp">
      <EnableEnhancedInstructionSet Condition="'$(Platform)'=='Win32'">StreamingSIMDExtensions</EnableEnhancedInstructionSet>
    </ClCompile>
    <ClCompile Include="equi\equi-stratum.cpp" />
    <ClCompile Include="equi\equi.cpp" />
    <ClCompile Include="equi\equihash.cpp" />
    <ClCompile Include="nvapi.cpp" />
    <ClCompile Include="pools.cpp" />
    <ClCompile Include="util.cpp" />
@ -254,6 +261,8 @@
    <ClCompile Include="lyra2\Lyra2.c" />
    <ClCompile Include="lyra2\Sponge.c" />
    <ClCompile Include="lyra2\Lyra2Z.c" />
    <ClInclude Include="equi\eqcuda.hpp" />
    <ClInclude Include="equi\equihash.h" />
    <ClInclude Include="neoscrypt\neoscrypt.h" />
    <ClCompile Include="neoscrypt\neoscrypt.cpp" />
    <ClCompile Include="neoscrypt\neoscrypt-cpu.c" />
@ -283,6 +292,11 @@
    <CudaCompile Include="crypto\wildkeccak.cu">
      <MaxRegCount>128</MaxRegCount>
    </CudaCompile>
    <CudaCompile Include="equi\cuda_equi.cu">
      <CodeGeneration>compute_52,sm_52;compute_50,sm_50;compute_30,sm_30</CodeGeneration>
      <AdditionalOptions> -Xptxas -dlcm=ca -Xptxas -dscm=cs %(AdditionalOptions)</AdditionalOptions>
      <MaxRegCount>0</MaxRegCount>
    </CudaCompile>
    <CudaCompile Include="neoscrypt\cuda_neoscrypt.cu">
      <MaxRegCount>160</MaxRegCount>
    </CudaCompile>
@ -552,7 +566,7 @@
    </CudaCompile>
    <CudaCompile Include="x15\cuda_x14_shabal512.cu" />
    <CudaCompile Include="x15\cuda_x15_whirlpool.cu" />
-    <CudaCompile Include="x17\hmq17.cu"/>
+    <CudaCompile Include="x17\hmq17.cu" />
    <CudaCompile Include="x15\x15.cu" />
    <CudaCompile Include="x15\whirlpool.cu" />
    <CudaCompile Include="x17\x17.cu">
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -103,6 +103,9 @@
    <Filter Include="Source Files\CUDA\xmr">
      <UniqueIdentifier>{0f9aec5e-5409-488f-992a-2c108590d1ac}</UniqueIdentifier>
    </Filter>
    <Filter Include="Source Files\equi">
      <UniqueIdentifier>{031afae7-2a78-4e32-9738-4b589b6f7ff3}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="compat\jansson\dump.c">
@ -321,6 +324,18 @@
    <ClCompile Include="crypto\wildkeccak-cpu.cpp">
      <Filter>Source Files\crypto\bbr</Filter>
    </ClCompile>
    <ClCompile Include="equi\equi.cpp">
      <Filter>Source Files\equi</Filter>
    </ClCompile>
    <ClCompile Include="equi\equihash.cpp">
      <Filter>Source Files\equi</Filter>
    </ClCompile>
    <ClCompile Include="equi\equi-stratum.cpp">
      <Filter>Source Files\equi</Filter>
    </ClCompile>
    <ClCompile Include="equi\blake2\blake2bx.cpp">
      <Filter>Source Files\equi</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="algos.h">
@ -563,6 +578,12 @@
    <ClInclude Include="crypto\cryptonight.h">
      <Filter>Source Files\CUDA\xmr</Filter>
    </ClInclude>
    <ClInclude Include="equi\eqcuda.hpp">
      <Filter>Source Files\equi</Filter>
    </ClInclude>
    <ClInclude Include="equi\equihash.h">
      <Filter>Source Files\equi</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="cuda.cpp">
@ -895,6 +916,9 @@
    <CudaCompile Include="crypto\wildkeccak.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="equi\cuda_equi.cu">
      <Filter>Source Files\equi</Filter>
    </CudaCompile>
  </ItemGroup>
  <ItemGroup>
    <Image Include="res\ccminer.ico">
@ -911,4 +935,4 @@
      <Filter>Ressources</Filter>
    </Text>
  </ItemGroup>
-</Project>
+</Project>
--- a/compat/ccminer-config.h
+++ b/compat/ccminer-config.h
@ -164,7 +164,7 @@
 #define PACKAGE_URL "http://github.com/tpruvot/ccminer"
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2.0"
+#define PACKAGE_VERSION "2.1"
 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2.0], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [2.1], [], [ccminer], [http://github.com/tpruvot/ccminer])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/equi/blake2/blake2-config.h
+++ b/equi/blake2/blake2-config.h
@ -0,0 +1,81 @@
 /*
   BLAKE2 reference source code package - optimized C implementations
   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.
   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
 #pragma once
 #ifndef __BLAKE2_CONFIG_H__
 #define __BLAKE2_CONFIG_H__
 // These don't work everywhere
 #if (defined(__SSE2__) || defined(_M_AMD_64) || defined(_M_X64))
 #define HAVE_SSE2
 #endif
 #if defined(__SSSE3__)
 #define HAVE_SSSE3
 #endif
 #if defined(__SSE4_1__)
 #define HAVE_SSE41
 #endif
 #if defined(__AVX__)
 #define HAVE_AVX
 #endif
 #if defined(__XOP__)
 #define HAVE_XOP
 #endif
 #ifdef HAVE_AVX2
 #ifndef HAVE_AVX
 #define HAVE_AVX
 #endif
 #endif
 #ifdef HAVE_XOP
 #ifndef HAVE_AVX
 #define HAVE_AVX
 #endif
 #endif
 #ifdef HAVE_AVX
 #ifndef HAVE_SSE41
 #define HAVE_SSE41
 #endif
 #endif
 #ifdef HAVE_SSE41
 #ifndef HAVE_SSSE3
 #define HAVE_SSSE3
 #endif
 #endif
 #ifdef HAVE_SSSE3
 #define HAVE_SSE2
 #endif
 #if !defined(HAVE_SSE2)
 #ifdef _MSC_VER
 // enforce required stuff for now
 #define HAVE_SSE2
 //#define HAVE_SSSE3
 #define HAVE_SSE41
 #else
 # error "This code requires at least SSE 4.1"
 #endif
 #endif
 #endif
--- a/equi/blake2/blake2-impl.h
+++ b/equi/blake2/blake2-impl.h
@ -0,0 +1,136 @@
 /*
   BLAKE2 reference source code package - optimized C implementations
   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.
   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
 #pragma once
 #ifndef __BLAKE2_IMPL_H__
 #define __BLAKE2_IMPL_H__
 #include <stdint.h>
 static inline uint32_t load32( const void *src )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
  uint32_t w;
  memcpy(&w, src, sizeof w);
  return w;
 #else
  const uint8_t *p = ( const uint8_t * )src;
  uint32_t w = *p++;
  w |= ( uint32_t )( *p++ ) <<  8;
  w |= ( uint32_t )( *p++ ) << 16;
  w |= ( uint32_t )( *p++ ) << 24;
  return w;
 #endif
 }
 static inline uint64_t load64( const void *src )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
  uint64_t w;
  memcpy(&w, src, sizeof w);
  return w;
 #else
  const uint8_t *p = ( const uint8_t * )src;
  uint64_t w = *p++;
  w |= ( uint64_t )( *p++ ) <<  8;
  w |= ( uint64_t )( *p++ ) << 16;
  w |= ( uint64_t )( *p++ ) << 24;
  w |= ( uint64_t )( *p++ ) << 32;
  w |= ( uint64_t )( *p++ ) << 40;
  w |= ( uint64_t )( *p++ ) << 48;
  w |= ( uint64_t )( *p++ ) << 56;
  return w;
 #endif
 }
 static inline void store32( void *dst, uint32_t w )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
  memcpy(dst, &w, sizeof w);
 #else
  uint8_t *p = ( uint8_t * )dst;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w;
 #endif
 }
 static inline void store64( void *dst, uint64_t w )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
  memcpy(dst, &w, sizeof w);
 #else
  uint8_t *p = ( uint8_t * )dst;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w;
 #endif
 }
 static inline uint64_t load48( const void *src )
 {
  const uint8_t *p = ( const uint8_t * )src;
  uint64_t w = *p++;
  w |= ( uint64_t )( *p++ ) <<  8;
  w |= ( uint64_t )( *p++ ) << 16;
  w |= ( uint64_t )( *p++ ) << 24;
  w |= ( uint64_t )( *p++ ) << 32;
  w |= ( uint64_t )( *p++ ) << 40;
  return w;
 }
 static inline void store48( void *dst, uint64_t w )
 {
  uint8_t *p = ( uint8_t * )dst;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w;
 }
 static inline uint32_t rotl32( const uint32_t w, const unsigned c )
 {
  return ( w << c ) | ( w >> ( 32 - c ) );
 }
 static inline uint64_t rotl64( const uint64_t w, const unsigned c )
 {
  return ( w << c ) | ( w >> ( 64 - c ) );
 }
 static inline uint32_t rotr32( const uint32_t w, const unsigned c )
 {
  return ( w >> c ) | ( w << ( 32 - c ) );
 }
 static inline uint64_t rotr64( const uint64_t w, const unsigned c )
 {
  return ( w >> c ) | ( w << ( 64 - c ) );
 }
 /* prevents compiler optimizing out memset() */
 static inline void secure_zero_memory( void *v, size_t n )
 {
  volatile uint8_t *p = ( volatile uint8_t * )v;
  while( n-- ) *p++ = 0;
 }
 #endif
--- a/equi/blake2/blake2-round.h
+++ b/equi/blake2/blake2-round.h
@ -0,0 +1,85 @@
 #define _mm_roti_epi64(x, c) \
 	(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
 	: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
 	: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
 	: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
 	: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
 #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
 	row1l = _mm_add_epi64(row1l, row2l); \
 	row1h = _mm_add_epi64(row1h, row2h); \
 	\
 	row4l = _mm_xor_si128(row4l, row1l); \
 	row4h = _mm_xor_si128(row4h, row1h); \
 	\
 	row4l = _mm_roti_epi64(row4l, -32); \
 	row4h = _mm_roti_epi64(row4h, -32); \
 	\
 	row3l = _mm_add_epi64(row3l, row4l); \
 	row3h = _mm_add_epi64(row3h, row4h); \
 	\
 	row2l = _mm_xor_si128(row2l, row3l); \
 	row2h = _mm_xor_si128(row2h, row3h); \
 	\
 	row2l = _mm_roti_epi64(row2l, -24); \
 	row2h = _mm_roti_epi64(row2h, -24); \
 #define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
 	row1l = _mm_add_epi64(row1l, row2l); \
 	row1h = _mm_add_epi64(row1h, row2h); \
 	\
 	row4l = _mm_xor_si128(row4l, row1l); \
 	row4h = _mm_xor_si128(row4h, row1h); \
 	\
 	row4l = _mm_roti_epi64(row4l, -16); \
 	row4h = _mm_roti_epi64(row4h, -16); \
 	\
 	row3l = _mm_add_epi64(row3l, row4l); \
 	row3h = _mm_add_epi64(row3h, row4h); \
 	\
 	row2l = _mm_xor_si128(row2l, row3l); \
 	row2h = _mm_xor_si128(row2h, row3h); \
 	\
 	row2l = _mm_roti_epi64(row2l, -63); \
 	row2h = _mm_roti_epi64(row2h, -63); \
 #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
 	t0 = _mm_alignr_epi8(row2h, row2l, 8); \
 	t1 = _mm_alignr_epi8(row2l, row2h, 8); \
 	row2l = t0; \
 	row2h = t1; \
 	\
 	t0 = row3l; \
 	row3l = row3h; \
 	row3h = t0;    \
 	\
 	t0 = _mm_alignr_epi8(row4h, row4l, 8); \
 	t1 = _mm_alignr_epi8(row4l, row4h, 8); \
 	row4l = t1; \
 	row4h = t0;
 #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
 	t0 = _mm_alignr_epi8(row2l, row2h, 8); \
 	t1 = _mm_alignr_epi8(row2h, row2l, 8); \
 	row2l = t0; \
 	row2h = t1; \
 	\
 	t0 = row3l; \
 	row3l = row3h; \
 	row3h = t0; \
 	\
 	t0 = _mm_alignr_epi8(row4l, row4h, 8); \
 	t1 = _mm_alignr_epi8(row4h, row4l, 8); \
 	row4l = t1; \
 	row4h = t0;
 #define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
 	G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
 	G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
 	\
 	DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
 	\
 	G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
 	G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
 	\
 	UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
--- a/equi/blake2/blake2.h
+++ b/equi/blake2/blake2.h
@ -0,0 +1,85 @@
 /*
   BLAKE2 reference source code package - optimized C implementations
   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.
   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
 #pragma once
 #ifndef __BLAKE2_H__
 #define __BLAKE2_H__
 #include <stddef.h>
 #include <stdint.h>
 #if defined(_MSC_VER)
 #define ALIGN(x) __declspec(align(x))
 #else
 #define ALIGN(x) __attribute__ ((__aligned__(x)))
 #endif
 #if defined(__cplusplus)
 extern "C" {
 #endif
  enum blake2b_constant
  {
    BLAKE2B_BLOCKBYTES = 128,
    BLAKE2B_OUTBYTES   = 64,
    BLAKE2B_KEYBYTES   = 64,
    BLAKE2B_SALTBYTES  = 16,
    BLAKE2B_PERSONALBYTES = 16
  };
 #pragma pack(push, 1)
  typedef struct __blake2b_param
  {
    uint8_t  digest_length; // 1
    uint8_t  key_length;    // 2
    uint8_t  fanout;        // 3
    uint8_t  depth;         // 4
    uint32_t leaf_length;   // 8
    uint64_t node_offset;   // 16
    uint8_t  node_depth;    // 17
    uint8_t  inner_length;  // 18
    uint8_t  reserved[14];  // 32
    uint8_t  salt[BLAKE2B_SALTBYTES]; // 48
    uint8_t  personal[BLAKE2B_PERSONALBYTES];  // 64
  } blake2b_param;
  ALIGN( 64 ) typedef struct __blake2b_state
  {
    uint64_t h[8];
    uint8_t  buf[BLAKE2B_BLOCKBYTES];
    uint16_t counter;
    uint8_t  buflen;
    uint8_t  lastblock;
  } blake2b_state;
 #pragma pack(pop)
  int eq_blake2b_init( blake2b_state *S, const uint8_t outlen );
  int eq_blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
  int eq_blake2b_init_param( blake2b_state *S, const blake2b_param *P );
  int eq_blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen );
  int eq_blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen );
  // Simple API
  int eq_blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
  static inline int eq_blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
  {
    return eq_blake2b( out, in, key, outlen, inlen, keylen );
  }
 #if defined(__cplusplus)
 }
 #endif
 #endif
--- a/equi/blake2/blake2b-load-sse2.h
+++ b/equi/blake2/blake2b-load-sse2.h
@ -0,0 +1,68 @@
 /*
   BLAKE2 reference source code package - optimized C implementations
   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.
   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
 #pragma once
 #ifndef __BLAKE2B_LOAD_SSE2_H__
 #define __BLAKE2B_LOAD_SSE2_H__
 #define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
 #define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
 #define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
 #define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
 #define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
 #define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
 #define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
 #define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
 #define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5)
 #define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2)
 #define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7)
 #define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1)
 #define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13)
 #define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12)
 #define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4)
 #define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0)
 #define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2)
 #define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4)
 #define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6)
 #define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8)
 #define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0)
 #define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11)
 #define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15)
 #define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14)
 #define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14)
 #define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13)
 #define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9)
 #define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2)
 #define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12)
 #define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1)
 #define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8)
 #define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6)
 #define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11)
 #define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3)
 #define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1)
 #define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4)
 #define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7)
 #define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6)
 #define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3)
 #define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12)
 #define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
 #define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
 #define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
 #define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
 #define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
 #define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
 #define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
 #define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
 #endif
--- a/equi/blake2/blake2b-load-sse41.h
+++ b/equi/blake2/blake2b-load-sse41.h
@ -0,0 +1,402 @@
 /*
   BLAKE2 reference source code package - optimized C implementations
   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.
   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
 #pragma once
 #ifndef __BLAKE2B_LOAD_SSE41_H__
 #define __BLAKE2B_LOAD_SSE41_H__
 #define LOAD_MSG_0_1(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m0, m1); \
 b1 = _mm_unpacklo_epi64(m2, m3); \
 } while(0)
 #define LOAD_MSG_0_2(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m0, m1); \
 b1 = _mm_unpackhi_epi64(m2, m3); \
 } while(0)
 #define LOAD_MSG_0_3(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m4, m5); \
 b1 = _mm_unpacklo_epi64(m6, m7); \
 } while(0)
 #define LOAD_MSG_0_4(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m4, m5); \
 b1 = _mm_unpackhi_epi64(m6, m7); \
 } while(0)
 #define LOAD_MSG_1_1(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m7, m2); \
 b1 = _mm_unpackhi_epi64(m4, m6); \
 } while(0)
 #define LOAD_MSG_1_2(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m5, m4); \
 b1 = _mm_alignr_epi8(m3, m7, 8); \
 } while(0)
 #define LOAD_MSG_1_3(b0, b1) \
 do \
 { \
 b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
 b1 = _mm_unpackhi_epi64(m5, m2); \
 } while(0)
 #define LOAD_MSG_1_4(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m6, m1); \
 b1 = _mm_unpackhi_epi64(m3, m1); \
 } while(0)
 #define LOAD_MSG_2_1(b0, b1) \
 do \
 { \
 b0 = _mm_alignr_epi8(m6, m5, 8); \
 b1 = _mm_unpackhi_epi64(m2, m7); \
 } while(0)
 #define LOAD_MSG_2_2(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m4, m0); \
 b1 = _mm_blend_epi16(m1, m6, 0xF0); \
 } while(0)
 #define LOAD_MSG_2_3(b0, b1) \
 do \
 { \
 b0 = _mm_blend_epi16(m5, m1, 0xF0); \
 b1 = _mm_unpackhi_epi64(m3, m4); \
 } while(0)
 #define LOAD_MSG_2_4(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m7, m3); \
 b1 = _mm_alignr_epi8(m2, m0, 8); \
 } while(0)
 #define LOAD_MSG_3_1(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m3, m1); \
 b1 = _mm_unpackhi_epi64(m6, m5); \
 } while(0)
 #define LOAD_MSG_3_2(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m4, m0); \
 b1 = _mm_unpacklo_epi64(m6, m7); \
 } while(0)
 #define LOAD_MSG_3_3(b0, b1) \
 do \
 { \
 b0 = _mm_blend_epi16(m1, m2, 0xF0); \
 b1 = _mm_blend_epi16(m2, m7, 0xF0); \
 } while(0)
 #define LOAD_MSG_3_4(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m3, m5); \
 b1 = _mm_unpacklo_epi64(m0, m4); \
 } while(0)
 #define LOAD_MSG_4_1(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m4, m2); \
 b1 = _mm_unpacklo_epi64(m1, m5); \
 } while(0)
 #define LOAD_MSG_4_2(b0, b1) \
 do \
 { \
 b0 = _mm_blend_epi16(m0, m3, 0xF0); \
 b1 = _mm_blend_epi16(m2, m7, 0xF0); \
 } while(0)
 #define LOAD_MSG_4_3(b0, b1) \
 do \
 { \
 b0 = _mm_blend_epi16(m7, m5, 0xF0); \
 b1 = _mm_blend_epi16(m3, m1, 0xF0); \
 } while(0)
 #define LOAD_MSG_4_4(b0, b1) \
 do \
 { \
 b0 = _mm_alignr_epi8(m6, m0, 8); \
 b1 = _mm_blend_epi16(m4, m6, 0xF0); \
 } while(0)
 #define LOAD_MSG_5_1(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m1, m3); \
 b1 = _mm_unpacklo_epi64(m0, m4); \
 } while(0)
 #define LOAD_MSG_5_2(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m6, m5); \
 b1 = _mm_unpackhi_epi64(m5, m1); \
 } while(0)
 #define LOAD_MSG_5_3(b0, b1) \
 do \
 { \
 b0 = _mm_blend_epi16(m2, m3, 0xF0); \
 b1 = _mm_unpackhi_epi64(m7, m0); \
 } while(0)
 #define LOAD_MSG_5_4(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m6, m2); \
 b1 = _mm_blend_epi16(m7, m4, 0xF0); \
 } while(0)
 #define LOAD_MSG_6_1(b0, b1) \
 do \
 { \
 b0 = _mm_blend_epi16(m6, m0, 0xF0); \
 b1 = _mm_unpacklo_epi64(m7, m2); \
 } while(0)
 #define LOAD_MSG_6_2(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m2, m7); \
 b1 = _mm_alignr_epi8(m5, m6, 8); \
 } while(0)
 #define LOAD_MSG_6_3(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m0, m3); \
 b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
 } while(0)
 #define LOAD_MSG_6_4(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m3, m1); \
 b1 = _mm_blend_epi16(m1, m5, 0xF0); \
 } while(0)
 #define LOAD_MSG_7_1(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m6, m3); \
 b1 = _mm_blend_epi16(m6, m1, 0xF0); \
 } while(0)
 #define LOAD_MSG_7_2(b0, b1) \
 do \
 { \
 b0 = _mm_alignr_epi8(m7, m5, 8); \
 b1 = _mm_unpackhi_epi64(m0, m4); \
 } while(0)
 #define LOAD_MSG_7_3(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m2, m7); \
 b1 = _mm_unpacklo_epi64(m4, m1); \
 } while(0)
 #define LOAD_MSG_7_4(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m0, m2); \
 b1 = _mm_unpacklo_epi64(m3, m5); \
 } while(0)
 #define LOAD_MSG_8_1(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m3, m7); \
 b1 = _mm_alignr_epi8(m0, m5, 8); \
 } while(0)
 #define LOAD_MSG_8_2(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m7, m4); \
 b1 = _mm_alignr_epi8(m4, m1, 8); \
 } while(0)
 #define LOAD_MSG_8_3(b0, b1) \
 do \
 { \
 b0 = m6; \
 b1 = _mm_alignr_epi8(m5, m0, 8); \
 } while(0)
 #define LOAD_MSG_8_4(b0, b1) \
 do \
 { \
 b0 = _mm_blend_epi16(m1, m3, 0xF0); \
 b1 = m2; \
 } while(0)
 #define LOAD_MSG_9_1(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m5, m4); \
 b1 = _mm_unpackhi_epi64(m3, m0); \
 } while(0)
 #define LOAD_MSG_9_2(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m1, m2); \
 b1 = _mm_blend_epi16(m3, m2, 0xF0); \
 } while(0)
 #define LOAD_MSG_9_3(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m7, m4); \
 b1 = _mm_unpackhi_epi64(m1, m6); \
 } while(0)
 #define LOAD_MSG_9_4(b0, b1) \
 do \
 { \
 b0 = _mm_alignr_epi8(m7, m5, 8); \
 b1 = _mm_unpacklo_epi64(m6, m0); \
 } while(0)
 #define LOAD_MSG_10_1(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m0, m1); \
 b1 = _mm_unpacklo_epi64(m2, m3); \
 } while(0)
 #define LOAD_MSG_10_2(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m0, m1); \
 b1 = _mm_unpackhi_epi64(m2, m3); \
 } while(0)
 #define LOAD_MSG_10_3(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m4, m5); \
 b1 = _mm_unpacklo_epi64(m6, m7); \
 } while(0)
 #define LOAD_MSG_10_4(b0, b1) \
 do \
 { \
 b0 = _mm_unpackhi_epi64(m4, m5); \
 b1 = _mm_unpackhi_epi64(m6, m7); \
 } while(0)
 #define LOAD_MSG_11_1(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m7, m2); \
 b1 = _mm_unpackhi_epi64(m4, m6); \
 } while(0)
 #define LOAD_MSG_11_2(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m5, m4); \
 b1 = _mm_alignr_epi8(m3, m7, 8); \
 } while(0)
 #define LOAD_MSG_11_3(b0, b1) \
 do \
 { \
 b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
 b1 = _mm_unpackhi_epi64(m5, m2); \
 } while(0)
 #define LOAD_MSG_11_4(b0, b1) \
 do \
 { \
 b0 = _mm_unpacklo_epi64(m6, m1); \
 b1 = _mm_unpackhi_epi64(m3, m1); \
 } while(0)
 #endif
--- a/equi/blake2/blake2b-round.h
+++ b/equi/blake2/blake2b-round.h
@ -0,0 +1,170 @@
 /*
   BLAKE2 reference source code package - optimized C implementations
   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.
   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
 #pragma once
 #ifndef __BLAKE2B_ROUND_H__
 #define __BLAKE2B_ROUND_H__
 #define LOAD(p)  _mm_load_si128( (const __m128i *)(p) )
 #define STORE(p,r) _mm_store_si128((__m128i *)(p), r)
 #define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) )
 #define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
 #define TOF(reg) _mm_castsi128_ps((reg))
 #define TOI(reg) _mm_castps_si128((reg))
 #define LIKELY(x) __builtin_expect((x),1)
 /* Microarchitecture-specific macros */
 #ifndef HAVE_XOP
 #ifdef HAVE_SSSE3
 #define _mm_roti_epi64(x, c) \
    (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
 #else
 #define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) ))
 #endif
 #else
 /* ... */
 #endif
 #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
  \
  row4l = _mm_xor_si128(row4l, row1l); \
  row4h = _mm_xor_si128(row4h, row1h); \
  \
  row4l = _mm_roti_epi64(row4l, (-32)); \
  row4h = _mm_roti_epi64(row4h, (-32)); \
  \
  row3l = _mm_add_epi64(row3l, row4l); \
  row3h = _mm_add_epi64(row3h, row4h); \
  \
  row2l = _mm_xor_si128(row2l, row3l); \
  row2h = _mm_xor_si128(row2h, row3h); \
  \
  row2l = _mm_roti_epi64(row2l, (-24)); \
  row2h = _mm_roti_epi64(row2h, (-24)); \
 #define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
  \
  row4l = _mm_xor_si128(row4l, row1l); \
  row4h = _mm_xor_si128(row4h, row1h); \
  \
  row4l = _mm_roti_epi64(row4l, (-16)); \
  row4h = _mm_roti_epi64(row4h, (-16)); \
  \
  row3l = _mm_add_epi64(row3l, row4l); \
  row3h = _mm_add_epi64(row3h, row4h); \
  \
  row2l = _mm_xor_si128(row2l, row3l); \
  row2h = _mm_xor_si128(row2h, row3h); \
  \
  row2l = _mm_roti_epi64(row2l, (-63)); \
  row2h = _mm_roti_epi64(row2h, (-63)); \
 #if defined(HAVE_SSSE3)
 #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = _mm_alignr_epi8(row2h, row2l, 8); \
  t1 = _mm_alignr_epi8(row2l, row2h, 8); \
  row2l = t0; \
  row2h = t1; \
  \
  t0 = row3l; \
  row3l = row3h; \
  row3h = t0;    \
  \
  t0 = _mm_alignr_epi8(row4h, row4l, 8); \
  t1 = _mm_alignr_epi8(row4l, row4h, 8); \
  row4l = t1; \
  row4h = t0;
 #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = _mm_alignr_epi8(row2l, row2h, 8); \
  t1 = _mm_alignr_epi8(row2h, row2l, 8); \
  row2l = t0; \
  row2h = t1; \
  \
  t0 = row3l; \
  row3l = row3h; \
  row3h = t0; \
  \
  t0 = _mm_alignr_epi8(row4l, row4h, 8); \
  t1 = _mm_alignr_epi8(row4h, row4l, 8); \
  row4l = t1; \
  row4h = t0;
 #else
 #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = row4l;\
  t1 = row2l;\
  row4l = row3l;\
  row3l = row3h;\
  row3h = row4l;\
  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
 #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = row3l;\
  row3l = row3h;\
  row3h = t0;\
  t0 = row2l;\
  t1 = row4l;\
  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
 #endif
 #if defined(HAVE_SSE41)
 #include "blake2b-load-sse41.h"
 #else
 #include "blake2b-load-sse2.h"
 #endif
 #define ROUND(r) \
  LOAD_MSG_ ##r ##_1(b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  LOAD_MSG_ ##r ##_2(b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
  LOAD_MSG_ ##r ##_3(b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  LOAD_MSG_ ##r ##_4(b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
 #endif
 #define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \
 	G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
 	G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
 	\
 	DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
 	\
 	G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
 	G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
 	\
 	UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
--- a/equi/blake2/blake2bx.cpp
+++ b/equi/blake2/blake2bx.cpp
@ -0,0 +1,262 @@
 /*
   BLAKE2 reference source code package - optimized C implementations
   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.
   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
 #include "blake2.h"
 #include "blake2-impl.h"
 #include "blake2-config.h"
 #ifdef WIN32
 #include <intrin.h>
 #endif
 #include <emmintrin.h>
 #if defined(HAVE_SSSE3)
 #include <tmmintrin.h>
 #endif
 #if defined(HAVE_SSE41)
 #include <smmintrin.h>
 #endif
 #if defined(HAVE_AVX)
 #include <immintrin.h>
 #endif
 #if defined(HAVE_XOP)
 #include <x86intrin.h>
 #endif
 #include "blake2b-round.h"
 ALIGN(64) static const uint64_t blake2b_IV[8] = {
 	0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
 	0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
 	0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
 	0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };
 /* init xors IV with input parameter block */
 int eq_blake2b_init_param(blake2b_state *S, const blake2b_param *P)
 {
 	//blake2b_init0( S );
 	const uint8_t * v = (const uint8_t *)(blake2b_IV);
 	const uint8_t * p = (const uint8_t *)(P);
 	uint8_t * h = (uint8_t *)(S->h);
 	/* IV XOR ParamBlock */
 	memset(S, 0, sizeof(blake2b_state));
 	for (int i = 0; i < BLAKE2B_OUTBYTES; ++i) h[i] = v[i] ^ p[i];
 	return 0;
 }
 /* Some sort of default parameter block initialization, for sequential blake2b */
 int eq_blake2b_init(blake2b_state *S, const uint8_t outlen)
 {
 	if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1;
 	const blake2b_param P =
 	{
 		outlen,
 		0,
 		1,
 		1,
 		0,
 		0,
 		0,
 		0,
 		{ 0 },
 		{ 0 },
 		{ 0 }
 	};
 	return eq_blake2b_init_param(S, &P);
 }
 int eq_blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen)
 {
 	if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1;
 	if ((!keylen) || keylen > BLAKE2B_KEYBYTES) return -1;
 	const blake2b_param P =
 	{
 		outlen,
 		keylen,
 		1,
 		1,
 		0,
 		0,
 		0,
 		0,
 		{ 0 },
 		{ 0 },
 		{ 0 }
 	};
 	if (eq_blake2b_init_param(S, &P) < 0)
 		return 0;
 	{
 		uint8_t block[BLAKE2B_BLOCKBYTES];
 		memset(block, 0, BLAKE2B_BLOCKBYTES);
 		memcpy(block, key, keylen);
 		eq_blake2b_update(S, block, BLAKE2B_BLOCKBYTES);
 		secure_zero_memory(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */
 	}
 	return 0;
 }
 static inline int blake2b_compress(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES])
 {
 	__m128i row1l, row1h;
 	__m128i row2l, row2h;
 	__m128i row3l, row3h;
 	__m128i row4l, row4h;
 	__m128i b0, b1;
 	__m128i t0, t1;
 #if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
 	const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
 	const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
 #endif
 #if defined(HAVE_SSE41)
 	const __m128i m0 = LOADU(block + 00);
 	const __m128i m1 = LOADU(block + 16);
 	const __m128i m2 = LOADU(block + 32);
 	const __m128i m3 = LOADU(block + 48);
 	const __m128i m4 = LOADU(block + 64);
 	const __m128i m5 = LOADU(block + 80);
 	const __m128i m6 = LOADU(block + 96);
 	const __m128i m7 = LOADU(block + 112);
 #else
 	const uint64_t  m0 = ( ( uint64_t * )block )[ 0];
 	const uint64_t  m1 = ( ( uint64_t * )block )[ 1];
 	const uint64_t  m2 = ( ( uint64_t * )block )[ 2];
 	const uint64_t  m3 = ( ( uint64_t * )block )[ 3];
 	const uint64_t  m4 = ( ( uint64_t * )block )[ 4];
 	const uint64_t  m5 = ( ( uint64_t * )block )[ 5];
 	const uint64_t  m6 = ( ( uint64_t * )block )[ 6];
 	const uint64_t  m7 = ( ( uint64_t * )block )[ 7];
 	const uint64_t  m8 = ( ( uint64_t * )block )[ 8];
 	const uint64_t  m9 = ( ( uint64_t * )block )[ 9];
 	const uint64_t m10 = ( ( uint64_t * )block )[10];
 	const uint64_t m11 = ( ( uint64_t * )block )[11];
 	const uint64_t m12 = ( ( uint64_t * )block )[12];
 	const uint64_t m13 = ( ( uint64_t * )block )[13];
 	const uint64_t m14 = ( ( uint64_t * )block )[14];
 	const uint64_t m15 = ( ( uint64_t * )block )[15];
 #endif
 	row1l = LOADU(&S->h[0]);
 	row1h = LOADU(&S->h[2]);
 	row2l = LOADU(&S->h[4]);
 	row2h = LOADU(&S->h[6]);
 	row3l = LOADU(&blake2b_IV[0]);
 	row3h = LOADU(&blake2b_IV[2]);
 	row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), _mm_set_epi32(0, 0, 0, S->counter));
 	row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), _mm_set_epi32(0, 0, 0L - S->lastblock, 0L - S->lastblock));
 	ROUND(0);
 	ROUND(1);
 	ROUND(2);
 	ROUND(3);
 	ROUND(4);
 	ROUND(5);
 	ROUND(6);
 	ROUND(7);
 	ROUND(8);
 	ROUND(9);
 	ROUND(10);
 	ROUND(11);
 	row1l = _mm_xor_si128(row3l, row1l);
 	row1h = _mm_xor_si128(row3h, row1h);
 	STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
 	STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
 	row2l = _mm_xor_si128(row4l, row2l);
 	row2h = _mm_xor_si128(row4h, row2h);
 	STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
 	STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
 	return 0;
 }
 int eq_blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen)
 {
 	while (inlen > 0)
 	{
 		size_t left = S->buflen;
 		size_t fill = BLAKE2B_BLOCKBYTES - left;
 		if (inlen > fill)
 		{
 			memcpy(S->buf + left, in, fill); // Fill buffer
 			in += fill;
 			inlen -= fill;
 			S->counter += BLAKE2B_BLOCKBYTES;
 			blake2b_compress(S, S->buf); // Compress
 			S->buflen = 0;
 		}
 		else // inlen <= fill
 		{
 			memcpy(S->buf + left, in, inlen);
 			S->buflen += (uint8_t) inlen; // not enough to compress
 			in += inlen;
 			inlen = 0;
 		}
 	}
 	return 0;
 }
 int eq_blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen)
 {
 	if (outlen > BLAKE2B_OUTBYTES)
 		return -1;
 	if (S->buflen > BLAKE2B_BLOCKBYTES)
 	{
 		S->counter += BLAKE2B_BLOCKBYTES;
 		blake2b_compress(S, S->buf);
 		S->buflen -= BLAKE2B_BLOCKBYTES;
 		memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen);
 	}
 	S->counter += S->buflen;
 	S->lastblock = 1;
 	memset(S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
 	blake2b_compress(S, S->buf);
 	memcpy(out, &S->h[0], outlen);
 	S->lastblock = 0;
 	return 0;
 }
 int eq_blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen)
 {
 	blake2b_state S[1];
 	/* Verify parameters */
 	if (!in || !out) return -1;
 	if (NULL == key) keylen = 0;
 	if (keylen)
 	{
 		if (eq_blake2b_init_key(S, outlen, key, keylen) < 0) return -1;
 	}
 	else
 	{
 		if (eq_blake2b_init(S, outlen) < 0) return -1;
 	}
 	eq_blake2b_update(S, (const uint8_t *)in, inlen);
 	eq_blake2b_final(S, out, outlen);
 	return 0;
 }
--- a/equi/cuda_equi.cu
+++ b/equi/cuda_equi.cu
--- a/equi/eqcuda.hpp
+++ b/equi/eqcuda.hpp
@ -0,0 +1,136 @@
 #pragma once
 #include <stdint.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdexcept>
 #include <vector>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <device_launch_parameters.h>
 #include <device_functions_decls.h>
 #ifdef WIN32
 #define _SNPRINTF _snprintf
 #else
 #define _SNPRINTF snprintf
 #endif
 #ifndef nullptr
 #define nullptr NULL
 #endif
 #ifdef WIN32
 #define rt_error std::runtime_error
 #else
 class rt_error : public std::runtime_error
 {
 public:
 	explicit rt_error(const std::string& str) : std::runtime_error(str) {}
 };
 #endif
 #define checkCudaErrors(call)								\
 do {														\
 	cudaError_t err = call;									\
 	if (cudaSuccess != err) {								\
 		char errorBuff[512];								\
 		_SNPRINTF(errorBuff, sizeof(errorBuff) - 1,			\
 			"CUDA error '%s' in func '%s' line %d",			\
 			cudaGetErrorString(err), __FUNCTION__, __LINE__); \
 		throw rt_error(errorBuff);				\
 		}													\
 } while (0)
 #define checkCudaDriverErrors(call)							\
 do {														\
 	CUresult err = call;									\
 	if (CUDA_SUCCESS != err) {								\
 		char errorBuff[512];								\
 		_SNPRINTF(errorBuff, sizeof(errorBuff) - 1,			\
 			"CUDA error DRIVER: '%d' in func '%s' line %d", \
 			err, __FUNCTION__, __LINE__);	\
 		throw rt_error(errorBuff);				\
 				}											\
 } while (0)
 typedef uint64_t u64;
 typedef uint32_t u32;
 typedef uint16_t u16;
 typedef uint8_t u8;
 typedef unsigned char uchar;
 struct packer_default;
 struct packer_cantor;
 #define MAXREALSOLS 9
 struct scontainerreal {
 	u32 sols[MAXREALSOLS][512];
 	u32 nsols;
 };
 #if 0
 #include <functional>
 #define fn_solution std::function<void(int thr_id, const std::vector<uint32_t>&, size_t, const unsigned char*)>
 #define fn_hashdone std::function<void(int thr_id)>
 #define fn_cancel   std::function<bool(int thr_id)>
 #else
 typedef void (*fn_solution)(int thr_id, const std::vector<uint32_t>&, size_t, const unsigned char*);
 typedef void (*fn_hashdone)(int thr_id);
 typedef bool (*fn_cancel)(int thr_id);
 #endif
 template <u32 RB, u32 SM> struct equi;
 // ---------------------------------------------------------------------------------------------------
 struct eq_cuda_context_interface
 {
 	virtual ~eq_cuda_context_interface();
 	virtual void solve(const char *tequihash_header,
 		unsigned int tequihash_header_len,
 		const char* nonce,
 		unsigned int nonce_len,
 		fn_cancel cancelf,
 		fn_solution solutionf,
 		fn_hashdone hashdonef);
 public:
 	int thread_id;
 	int device_id;
 	int throughput;
 	int totalblocks;
 	int threadsperblock;
 	int threadsperblock_digits;
 	size_t equi_mem_sz;
 };
 // ---------------------------------------------------------------------------------------------------
 template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
 class eq_cuda_context : public eq_cuda_context_interface
 {
 	equi<RB, SM>* device_eq;
 	scontainerreal* solutions;
 	CUcontext pctx;
 	void solve(const char *tequihash_header,
 		unsigned int tequihash_header_len,
 		const char* nonce,
 		unsigned int nonce_len,
 		fn_cancel cancelf,
 		fn_solution solutionf,
 		fn_hashdone hashdonef);
 public:
 	eq_cuda_context(int thr_id, int dev_id);
 	~eq_cuda_context();
 };
 // RB, SM, SSM, TPB, PACKER... but any change only here will fail..
 #define CONFIG_MODE_1	9, 1248, 12, 640, packer_cantor
 //#define CONFIG_MODE_2	8, 640, 12, 512, packer_default
--- a/equi/equi-stratum.cpp
+++ b/equi/equi-stratum.cpp
@ -0,0 +1,241 @@
 /**
 * Equihash specific stratum protocol
 * tpruvot@github - 2017 - Part under GPLv3 Licence
 */
 #include <errno.h>
 #include <string.h>
 #include <unistd.h>
 #include <miner.h>
 #include "equihash.h"
 extern struct stratum_ctx stratum;
 extern pthread_mutex_t stratum_work_lock;
 // ZEC uses a different scale to compute diff... 
 // sample targets to diff (stored in the reverse byte order in work->target)
 // 0007fff800000000000000000000000000000000000000000000000000000000 is stratum diff 32
 // 003fffc000000000000000000000000000000000000000000000000000000000 is stratum diff 4
 // 00ffff0000000000000000000000000000000000000000000000000000000000 is stratum diff 1
 double target_to_diff_equi(uint32_t* target)
 {
 	uchar* tgt = (uchar*) target;
 	uint64_t m =
 		(uint64_t)tgt[30] << 24 |
 		(uint64_t)tgt[29] << 16 |
 		(uint64_t)tgt[28] << 8  |
 		(uint64_t)tgt[27] << 0;
 	if (!m)
 		return 0.;
 	else
 		return (double)0xffff0000UL/m;
 }
 void diff_to_target_equi(uint32_t *target, double diff)
 {
 	uint64_t m;
 	int k;
 	for (k = 6; k > 0 && diff > 1.0; k--)
 		diff /= 4294967296.0;
 	m = (uint64_t)(4294901760.0 / diff);
 	if (m == 0 && k == 6)
 		memset(target, 0xff, 32);
 	else {
 		memset(target, 0, 32);
 		target[k + 1] = (uint32_t)(m >> 8);
 		target[k + 2] = (uint32_t)(m >> 40);
 		//memset(target, 0xff, 6*sizeof(uint32_t));
 		for (k = 0; k < 28 && ((uint8_t*)target)[k] == 0; k++)
 			((uint8_t*)target)[k] = 0xff;
 	}
 }
 /* compute nbits to get the network diff */
 double equi_network_diff(struct work *work)
 {
 	//KMD bits: "1e 015971",
 	//KMD target: "00 00 015971000000000000000000000000000000000000000000000000000000",
 	//KMD bits: "1d 686aaf",
 	//KMD target: "00 0000 686aaf0000000000000000000000000000000000000000000000000000",
 	uint32_t nbits = work->data[26];
 	uint32_t bits = (nbits & 0xffffff);
 	int16_t shift = (swab32(nbits) & 0xff);
 	shift = (31 - shift) * 8; // 8 bits shift for 0x1e, 16 for 0x1d
 	uint64_t tgt64 = swab32(bits);
 	tgt64 = tgt64 << shift;
 	// applog_hex(&tgt64, 8);
 	uint8_t net_target[32] = { 0 };
 	for (int b=0; b<8; b++)
 		net_target[31-b] = ((uint8_t*)&tgt64)[b];
 	// applog_hex(net_target, 32);
 	double d = target_to_diff_equi((uint32_t*)net_target);
 	return d;
 }
 void equi_work_set_target(struct work* work, double diff)
 {
 	// target is given as data by the equihash stratum
 	// memcpy(work->target, stratum.job.claim, 32); // claim field is only used for lbry
 	diff_to_target_equi(work->target, diff);
 	//applog(LOG_BLUE, "diff %f to target :", diff);
 	//applog_hex(work->target, 32);
 	work->targetdiff = diff;
 }
 bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params)
 {
 	uint8_t target_bin[32], target_be[32];
 	const char *target_hex = json_string_value(json_array_get(params, 0));
 	if (!target_hex || strlen(target_hex) == 0)
 		return false;
 	hex2bin(target_bin, target_hex, 32);
 	memset(target_be, 0xff, 32);
 	int filled = 0;
 	for (int i=0; i<32; i++) {
 		if (filled == 3) break;
 		target_be[31-i] = target_bin[i];
 		if (target_bin[i]) filled++;
 	}
 	memcpy(sctx->job.claim, target_be, 32); // hack, unused struct field
 	pthread_mutex_lock(&stratum_work_lock);
 	sctx->next_diff = target_to_diff_equi((uint32_t*) &target_be);
 	pthread_mutex_unlock(&stratum_work_lock);
 	//applog(LOG_BLUE, "low diff %f", sctx->next_diff);
 	//applog_hex(target_be, 32);
 	return true;
 }
 bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params)
 {
 	const char *job_id, *version, *prevhash, *coinb1, *coinb2, *nbits, *stime;
 	size_t coinb1_size, coinb2_size;
 	bool clean, ret = false;
 	int ntime, i, p=0;
 	job_id = json_string_value(json_array_get(params, p++));
 	version = json_string_value(json_array_get(params, p++));
 	prevhash = json_string_value(json_array_get(params, p++));
 	coinb1 = json_string_value(json_array_get(params, p++)); //merkle
 	coinb2 = json_string_value(json_array_get(params, p++)); //blank (reserved)
 	stime = json_string_value(json_array_get(params, p++));
 	nbits = json_string_value(json_array_get(params, p++));
 	clean = json_is_true(json_array_get(params, p)); p++;
 	if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime ||
 	    strlen(prevhash) != 64 || strlen(version) != 8 ||
 	    strlen(coinb1) != 64 || strlen(coinb2) != 64 ||
 	    strlen(nbits) != 8 || strlen(stime) != 8) {
 		applog(LOG_ERR, "Stratum notify: invalid parameters");
 		goto out;
 	}
 	/* store stratum server time diff */
 	hex2bin((uchar *)&ntime, stime, 4);
 	ntime = ntime - (int) time(0);
 	if (ntime > sctx->srvtime_diff) {
 		sctx->srvtime_diff = ntime;
 		if (opt_protocol && ntime > 20)
 			applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime);
 	}
 	pthread_mutex_lock(&stratum_work_lock);
 	hex2bin(sctx->job.version, version, 4);
 	hex2bin(sctx->job.prevhash, prevhash, 32);
 	coinb1_size = strlen(coinb1) / 2;
 	coinb2_size = strlen(coinb2) / 2;
 	sctx->job.coinbase_size = coinb1_size + coinb2_size + // merkle + reserved
 		sctx->xnonce1_size + sctx->xnonce2_size; // extranonce and...
 	sctx->job.coinbase = (uchar*) realloc(sctx->job.coinbase, sctx->job.coinbase_size);
 	hex2bin(sctx->job.coinbase, coinb1, coinb1_size);
 	hex2bin(sctx->job.coinbase + coinb1_size, coinb2, coinb2_size);
 	sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + coinb2_size + sctx->xnonce1_size;
 	if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id))
 		memset(sctx->job.xnonce2, 0, sctx->xnonce2_size);
 	memcpy(sctx->job.coinbase + coinb1_size + coinb2_size, sctx->xnonce1, sctx->xnonce1_size);
 	for (i = 0; i < sctx->job.merkle_count; i++)
 		free(sctx->job.merkle[i]);
 	free(sctx->job.merkle);
 	sctx->job.merkle = NULL;
 	sctx->job.merkle_count = 0;
 	free(sctx->job.job_id);
 	sctx->job.job_id = strdup(job_id);
 	hex2bin(sctx->job.nbits, nbits, 4);
 	hex2bin(sctx->job.ntime, stime, 4);
 	sctx->job.clean = clean;
 	sctx->job.diff = sctx->next_diff;
 	pthread_mutex_unlock(&stratum_work_lock);
 	ret = true;
 out:
 	return ret;
 }
 void equi_store_work_solution(struct work* work, uint32_t* hash, void* sol_data)
 {
 	int nonce = work->valid_nonces-1;
 	memcpy(work->extra, sol_data, 1347);
 	bn_store_hash_target_ratio(hash, work->target, work, nonce);
 	//work->sharediff[nonce] = target_to_diff_equi(hash);
 }
 #define JSON_SUBMIT_BUF_LEN (4*1024)
 // called by submit_upstream_work()
 bool equi_stratum_submit(struct pool_infos *pool, struct work *work)
 {
 	char _ALIGN(64) s[JSON_SUBMIT_BUF_LEN];
 	char _ALIGN(64) timehex[16] = { 0 };
 	char *jobid, *noncestr, *solhex;
 	int idnonce = work->submit_nonce_id;
 	// scanned nonce
 	work->data[EQNONCE_OFFSET] = work->nonces[idnonce];
 	unsigned char * nonce = (unsigned char*) (&work->data[27]);
 	size_t nonce_len = 32 - stratum.xnonce1_size;
 	// long nonce without pool prefix (extranonce)
 	noncestr = bin2hex(&nonce[stratum.xnonce1_size], nonce_len);
 	solhex = (char*) calloc(1, 1344*2 + 64);
 	if (!solhex || !noncestr) {
 		applog(LOG_ERR, "unable to alloc share memory");
 		return false;
 	}
 	cbin2hex(solhex, (const char*) work->extra, 1347);
 	jobid = work->job_id + 8;
 	sprintf(timehex, "%08x", swab32(work->data[25]));
 	snprintf(s, sizeof(s), "{\"method\":\"mining.submit\",\"params\":"
 		"[\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"], \"id\":%u}",
 		pool->user, jobid, timehex, noncestr, solhex,
 		stratum.job.shares_count + 10);
 	free(solhex);
 	free(noncestr);
 	gettimeofday(&stratum.tv_submit, NULL);
 	if(!stratum_send_line(&stratum, s)) {
 		applog(LOG_ERR, "%s stratum_send_line failed", __func__);
 		return false;
 	}
 	stratum.sharediff = work->sharediff[idnonce];
 	stratum.job.shares_count++;
 	return true;
 }
--- a/equi/equi.cpp
+++ b/equi/equi.cpp
@ -0,0 +1,171 @@
 /*
 * Port to Generic C of C++ implementation of the Equihash Proof-of-Work
 * algorithm from zcashd.
 *
 * Copyright (c) 2016 abc at openwall dot com
 * Copyright (c) 2016 Jack Grigg
 * Copyright (c) 2016 The Zcash developers
 * Copyright (c) 2017 tpruvot
 *
 * Distributed under the MIT software license, see the accompanying
 * file COPYING or http://www.opensource.org/licenses/mit-license.php.
 */
 #include <string.h>
 #include <stdint.h>
 #include <stdbool.h>
 #include <assert.h>
 #include "equihash.h"
 //#define USE_LIBSODIUM
 #ifdef USE_LIBSODIUM
 #include "sodium.h"
 #define blake2b_state crypto_generichash_blake2b_state
 #else
 #include "blake2/blake2.h"
 #define be32toh(x) swab32(x)
 #define htole32(x) (x)
 #define HASHOUT 50
 #endif
 #include <miner.h>
 static void digestInit(blake2b_state *S, const uint32_t n, const uint32_t k)
 {
 	uint32_t le_N = htole32(n);
 	uint32_t le_K = htole32(k);
 #ifdef USE_LIBSODIUM
 	uint8_t personalization[crypto_generichash_blake2b_PERSONALBYTES] = { 0 };
 	memcpy(personalization, "ZcashPoW", 8);
 	memcpy(personalization + 8,  &le_N, 4);
 	memcpy(personalization + 12, &le_K, 4);
 	crypto_generichash_blake2b_init_salt_personal(S,
 		NULL, 0, (512 / n) * n / 8, NULL, personalization);
 #else
 	unsigned char personal[] = "ZcashPoW01230123";
 	memcpy(personal + 8, &le_N, 4);
 	memcpy(personal + 12, &le_K, 4);
 	blake2b_param P[1];
 	P->digest_length = HASHOUT;
 	P->key_length = 0;
 	P->fanout = 1;
 	P->depth = 1;
 	P->leaf_length = 0;
 	P->node_offset = 0;
 	P->node_depth = 0;
 	P->inner_length = 0;
 	memset(P->reserved, 0, sizeof(P->reserved));
 	memset(P->salt, 0, sizeof(P->salt));
 	memcpy(P->personal, (const uint8_t *)personal, 16);
 	eq_blake2b_init_param(S, P);
 #endif
 }
 static void expandArray(const unsigned char *in, const uint32_t in_len,
 	unsigned char *out, const uint32_t out_len,
 	const uint32_t bit_len, const uint32_t byte_pad)
 {
 	assert(bit_len >= 8);
 	assert(8 * sizeof(uint32_t) >= 7 + bit_len);
 	const uint32_t out_width = (bit_len + 7) / 8 + byte_pad;
 	assert(out_len == 8 * out_width * in_len / bit_len);
 	const uint32_t bit_len_mask = ((uint32_t)1 << bit_len) - 1;
 	// The acc_bits least-significant bits of acc_value represent a bit sequence
 	// in big-endian order.
 	uint32_t acc_bits = 0;
 	uint32_t acc_value = 0;
 	uint32_t j = 0;
 	for (uint32_t i = 0; i < in_len; i++)
 	{
 		acc_value = (acc_value << 8) | in[i];
 		acc_bits += 8;
 		// When we have bit_len or more bits in the accumulator, write the next
 		// output element.
 		if (acc_bits >= bit_len) {
 			acc_bits -= bit_len;
 			for (uint32_t x = 0; x < byte_pad; x++) {
 				out[j + x] = 0;
 			}
 			for (uint32_t x = byte_pad; x < out_width; x++) {
 				out[j + x] = (
 					// Big-endian
 					acc_value >> (acc_bits + (8 * (out_width - x - 1)))
 				) & (
 					// Apply bit_len_mask across byte boundaries
 					(bit_len_mask >> (8 * (out_width - x - 1))) & 0xFF
 				);
 			}
 			j += out_width;
 		}
 	}
 }
 static void generateHash(blake2b_state *S, const uint32_t g, uint8_t *hash, const size_t hashLen)
 {
 	const uint32_t le_g = htole32(g);
 	blake2b_state digest = *S; /* copy */
 #ifdef USE_LIBSODIUM
 	crypto_generichash_blake2b_update(&digest, (uint8_t *)&le_g, sizeof(le_g));
 	crypto_generichash_blake2b_final(&digest, hash, hashLen);
 #else
 	eq_blake2b_update(&digest, (const uint8_t*) &le_g, sizeof(le_g));
 	eq_blake2b_final(&digest, hash, (uint8_t) (hashLen & 0xFF));
 #endif
 }
 static int isZero(const uint8_t *hash, size_t len)
 {
 	// This doesn't need to be constant time.
 	for (size_t i = 0; i < len; i++) {
 		if (hash[i] != 0) return 0;
 	}
 	return 1;
 }
 // hdr -> header including nonce (140 bytes)
 // soln -> equihash solution (excluding 3 bytes with size, so 1344 bytes length)
 bool equi_verify(uint8_t* const hdr, uint8_t* const soln)
 {
 	const uint32_t n = WN; // 200
 	const uint32_t k = WK; // 9
 	const uint32_t collisionBitLength = n / (k + 1);
 	const uint32_t collisionByteLength = (collisionBitLength + 7) / 8;
 	const uint32_t hashLength = (k + 1) * collisionByteLength;
 	const uint32_t indicesPerHashOutput = 512 / n;
 	const uint32_t hashOutput = indicesPerHashOutput * n / 8;
 	const uint32_t equihashSolutionSize = (1 << k) * (n / (k + 1) + 1) / 8;
 	const uint32_t solnr = 1 << k;
 	uint32_t indices[512] = { 0 };
 	uint8_t vHash[hashLength] = { 0 };
 	blake2b_state state;
 	digestInit(&state, n, k);
 #ifdef USE_LIBSODIUM
 	crypto_generichash_blake2b_update(&state, hdr, 140);
 #else
 	eq_blake2b_update(&state, hdr, 140);
 #endif
 	expandArray(soln, equihashSolutionSize, (uint8_t*) &indices, sizeof(indices), collisionBitLength + 1, 1);
 	for (uint32_t j = 0; j < solnr; j++) {
 		uint8_t tmpHash[hashOutput];
 		uint8_t hash[hashLength];
 		uint32_t i = be32toh(indices[j]);
 		generateHash(&state, i / indicesPerHashOutput, tmpHash, hashOutput);
 		expandArray(tmpHash + (i % indicesPerHashOutput * n / 8), n / 8, hash, hashLength, collisionBitLength, 0);
 		for (uint32_t k = 0; k < hashLength; k++)
 			vHash[k] ^= hash[k];
 	}
 	return isZero(vHash, sizeof(vHash));
 }
--- a/equi/equihash.cpp
+++ b/equi/equihash.cpp
@ -0,0 +1,294 @@
 /**
 * Equihash solver interface for ccminer (compatible with linux and windows)
 * Solver taken from nheqminer, by djeZo (and NiceHash)
 * tpruvot - 2017 (GPL v3)
 */
 #include <stdio.h>
 #include <unistd.h>
 #include <assert.h>
 #include <stdexcept>
 #include <vector>
 #include <sph/sph_sha2.h>
 #include "eqcuda.hpp"
 #include "equihash.h" // equi_verify()
 #include <miner.h>
 // All solutions (BLOCK_HEADER_LEN + SOLSIZE_LEN + SOL_LEN) sha256d should be under the target
 extern "C" void equi_hash(const void* input, void* output, int len)
 {
 	uint8_t _ALIGN(64) hash0[32], hash1[32];
 	sph_sha256_context ctx_sha256;
 	sph_sha256_init(&ctx_sha256);
 	sph_sha256(&ctx_sha256, input, len);
 	sph_sha256_close(&ctx_sha256, hash0);
 	sph_sha256(&ctx_sha256, hash0, 32);
 	sph_sha256_close(&ctx_sha256, hash1);
 	memcpy(output, hash1, 32);
 }
 // input here is 140 for the header and 1344 for the solution (equi.cpp)
 extern "C" int equi_verify_sol(void * const hdr, void * const sol)
 {
 	bool res = equi_verify((uint8_t*) hdr, (uint8_t*) sol);
 	//applog_hex((void*)hdr, 140);
 	//applog_hex((void*)sol, 1344);
 	return res ? 1 : 0;
 }
 #include <cuda_helper.h>
 //#define EQNONCE_OFFSET 30 /* 27:34 */
 #define NONCE_OFT EQNONCE_OFFSET
 static bool init[MAX_GPUS] = { 0 };
 static int valid_sols[MAX_GPUS] = { 0 };
 static uint8_t _ALIGN(64) data_sols[MAX_GPUS][MAXREALSOLS][1536] = { 0 }; // 140+3+1344 required
 static eq_cuda_context_interface* solvers[MAX_GPUS] = { NULL };
 static void CompressArray(const unsigned char* in, size_t in_len,
 	unsigned char* out, size_t out_len, size_t bit_len, size_t byte_pad)
 {
 	assert(bit_len >= 8);
 	assert(8 * sizeof(uint32_t) >= 7 + bit_len);
 	size_t in_width = (bit_len + 7) / 8 + byte_pad;
 	assert(out_len == bit_len*in_len / (8 * in_width));
 	uint32_t bit_len_mask = (1UL << bit_len) - 1;
 	// The acc_bits least-significant bits of acc_value represent a bit sequence
 	// in big-endian order.
 	size_t acc_bits = 0;
 	uint32_t acc_value = 0;
 	size_t j = 0;
 	for (size_t i = 0; i < out_len; i++) {
 		// When we have fewer than 8 bits left in the accumulator, read the next
 		// input element.
 		if (acc_bits < 8) {
 			acc_value = acc_value << bit_len;
 			for (size_t x = byte_pad; x < in_width; x++) {
 				acc_value = acc_value | (
 					(
 					// Apply bit_len_mask across byte boundaries
 					in[j + x] & ((bit_len_mask >> (8 * (in_width - x - 1))) & 0xFF)
 					) << (8 * (in_width - x - 1))); // Big-endian
 			}
 			j += in_width;
 			acc_bits += bit_len;
 		}
 		acc_bits -= 8;
 		out[i] = (acc_value >> acc_bits) & 0xFF;
 	}
 }
 #ifndef htobe32
 #define htobe32(x) swab32(x)
 #endif
 static void EhIndexToArray(const u32 i, unsigned char* arr)
 {
 	u32 bei = htobe32(i);
 	memcpy(arr, &bei, sizeof(u32));
 }
 static std::vector<unsigned char> GetMinimalFromIndices(std::vector<u32> indices, size_t cBitLen)
 {
 	assert(((cBitLen + 1) + 7) / 8 <= sizeof(u32));
 	size_t lenIndices = indices.size()*sizeof(u32);
 	size_t minLen = (cBitLen + 1)*lenIndices / (8 * sizeof(u32));
 	size_t bytePad = sizeof(u32) - ((cBitLen + 1) + 7) / 8;
 	std::vector<unsigned char> array(lenIndices);
 	for (size_t i = 0; i < indices.size(); i++) {
 		EhIndexToArray(indices[i], array.data() + (i*sizeof(u32)));
 	}
 	std::vector<unsigned char> ret(minLen);
 	CompressArray(array.data(), lenIndices, ret.data(), minLen, cBitLen + 1, bytePad);
 	return ret;
 }
 // solver callbacks
 static void cb_solution(int thr_id, const std::vector<uint32_t>& solutions, size_t cbitlen, const unsigned char *compressed_sol)
 {
 	std::vector<unsigned char> nSolution;
 	if (!compressed_sol) {
 		nSolution = GetMinimalFromIndices(solutions, cbitlen);
 	} else {
 		gpulog(LOG_INFO, thr_id, "compressed_sol");
 		nSolution = std::vector<unsigned char>(1344);
 		for (size_t i = 0; i < cbitlen; i++)
 			nSolution[i] = compressed_sol[i];
 	}
 	int nsol = valid_sols[thr_id];
 	if (nsol < 0) nsol = 0;
 	if(nSolution.size() == 1344) {
 		// todo, only store solution data here...
 		le32enc(&data_sols[thr_id][nsol][140], 0x000540fd); // sol sz header
 		memcpy(&data_sols[thr_id][nsol][143], nSolution.data(), 1344);
 		valid_sols[thr_id] = nsol + 1;
 	}
 }
 static void cb_hashdone(int thr_id) {
 	if (!valid_sols[thr_id]) valid_sols[thr_id] = -1;
 }
 static bool cb_cancel(int thr_id) {
 	if (work_restart[thr_id].restart)
 		valid_sols[thr_id] = -1;
 	return work_restart[thr_id].restart;
 }
 extern "C" int scanhash_equihash(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	uint32_t _ALIGN(64) endiandata[35];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[NONCE_OFT];
 	uint32_t nonce_increment = rand() & 0xFF; // nonce randomizer
 	struct timeval tv_start, tv_end, diff;
 	double secs, solps;
 	uint32_t soluce_count = 0;
 	if (opt_benchmark)
 		ptarget[7] = 0xfffff;
 	if (!init[thr_id]) {
 		try {
 			int mode = 1;
 			switch (mode) {
 			case 1:
 				solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_1>(thr_id, device_map[thr_id]);
 				break;
 #ifdef CONFIG_MODE_2
 			case 2:
 				solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_2>(thr_id, device_map[thr_id]);
 				break;
 #endif
 #ifdef CONFIG_MODE_3
 			case 3:
 				solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_3>(thr_id, device_map[thr_id]);
 				break;
 #endif
 			default:
 				proper_exit(EXIT_CODE_SW_INIT_ERROR);
 				return -1;
 			}
 			size_t memSz = solvers[thr_id]->equi_mem_sz / (1024*1024);
 			gpus_intensity[thr_id] = (uint32_t) solvers[thr_id]->throughput;
 			api_set_throughput(thr_id, gpus_intensity[thr_id]);
 			gpulog(LOG_DEBUG, thr_id, "Allocated %u MB of context memory", (u32) memSz);
 			cuda_get_arch(thr_id);
 			init[thr_id] = true;
 		} catch (const std::exception & e) {
 			CUDA_LOG_ERROR();
 			gpulog(LOG_ERR, thr_id, "init: %s", e.what());
 			proper_exit(EXIT_CODE_CUDA_ERROR);
 		}
 	}
 	gettimeofday(&tv_start, NULL);
 	memcpy(endiandata, pdata, 140);
 	work->valid_nonces = 0;
 	do {
 		try {
 			valid_sols[thr_id] = 0;
 			solvers[thr_id]->solve(
 				(const char *) endiandata, (unsigned int) (140 - 32),
 				(const char *) &endiandata[27], (unsigned int) 32,
 				&cb_cancel, &cb_solution, &cb_hashdone
 			);
 			*hashes_done = soluce_count;
 		} catch (const std::exception & e) {
 			gpulog(LOG_WARNING, thr_id, "solver: %s", e.what());
 			free_equihash(thr_id);
 			sleep(1);
 			return -1;
 		}
 		if (valid_sols[thr_id] > 0)
 		{
 			const uint32_t Htarg = ptarget[7];
 			uint32_t _ALIGN(64) vhash[8];
 			uint8_t _ALIGN(64) full_data[140+3+1344] = { 0 };
 			uint8_t* sol_data = &full_data[140];
 			soluce_count += valid_sols[thr_id];
 			for (int nsol=0; nsol < valid_sols[thr_id]; nsol++)
 			{
 				memcpy(full_data, endiandata, 140);
 				memcpy(sol_data, &data_sols[thr_id][nsol][140], 1347);
 				equi_hash(full_data, vhash, 140+3+1344);
 				if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
 				{
 					bool valid = equi_verify_sol(endiandata, &sol_data[3]);
 					if (valid && work->valid_nonces < MAX_NONCES) {
 						work->valid_nonces++;
 						memcpy(work->data, endiandata, 140);
 						equi_store_work_solution(work, vhash, sol_data);
 						work->nonces[work->valid_nonces-1] = endiandata[NONCE_OFT];
 						pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1;
 						//applog_hex(vhash, 32);
 						//applog_hex(&work->data[27], 32);
 						goto out; // second solution storage not handled..
 					}
 				}
 				if (work->valid_nonces == MAX_NONCES) goto out;
 			}
 			if (work->valid_nonces)
 				goto out;
 			valid_sols[thr_id] = 0;
 		}
 		endiandata[NONCE_OFT] += nonce_increment;
 	} while (!work_restart[thr_id].restart);
 out:
 	gettimeofday(&tv_end, NULL);
 	timeval_subtract(&diff, &tv_end, &tv_start);
 	secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec);
 	solps = (double)soluce_count / secs;
 	gpulog(LOG_DEBUG, thr_id, "%d solutions in %.2f s (%.2f Sol/s)", soluce_count, secs, solps);
 	// H/s
 	*hashes_done = soluce_count;
 	pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1;
 	return work->valid_nonces;
 }
 // cleanup
 void free_equihash(int thr_id)
 {
 	if (!init[thr_id])
 		return;
 	delete(solvers[thr_id]);
 	solvers[thr_id] = NULL;
 	init[thr_id] = false;
 }
 // mmm... viva c++ junk
 void eq_cuda_context_interface::solve(const char *tequihash_header, unsigned int tequihash_header_len,
 	const char* nonce, unsigned int nonce_len,
 	fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef) { }
 eq_cuda_context_interface::~eq_cuda_context_interface() { }
--- a/equi/equihash.h
+++ b/equi/equihash.h
@ -0,0 +1,19 @@
 #ifndef EQUIHASH_H
 #define EQUIHASH_H
 #include <stdint.h>
 // miner nonce "cursor" unique for each thread
 #define EQNONCE_OFFSET 30 /* 27:34 */
 #define WK 9
 #define WN 200
 //#define CONFIG_MODE_1 9, 1248, 12, 640, packer_cantor /* eqcuda.hpp */
 extern "C" {
 	void equi_hash(const void* input, void* output, int len);
 	int  equi_verify_sol(void* const hdr, void* const soln);
 	bool equi_verify(uint8_t* const hdr, uint8_t* const soln);
 }
 #endif
--- a/miner.h
+++ b/miner.h
@ -282,6 +282,7 @@ extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonc
 extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@ -340,6 +341,7 @@ extern void free_cryptolight(int thr_id);
 extern void free_cryptonight(int thr_id);
 extern void free_decred(int thr_id);
 extern void free_deep(int thr_id);
 extern void free_equihash(int thr_id);
 extern void free_keccak256(int thr_id);
 extern void free_fresh(int thr_id);
 extern void free_fugue256(int thr_id);
@ -604,6 +606,7 @@ void cuda_clear_lasterror();
 #define CL_WHT  "\x1B[01;37m" /* white */
 extern void format_hashrate(double hashrate, char *output);
 extern void format_hashrate_unit(double hashrate, char *output, const char* unit);
 extern void applog(int prio, const char *fmt, ...);
 extern void gpulog(int prio, int thr_id, const char *fmt, ...);
@ -679,6 +682,7 @@ struct stratum_ctx {
 	time_t tm_connected;
 	int rpc2;
 	int is_equihash;
 	int srvtime_diff;
 };
@ -722,6 +726,8 @@ struct work {
 	/* pok getwork txs */
 	uint32_t tx_count;
 	struct tx txs[POK_MAX_TXS];
 	// zec solution
 	uint8_t extra[1388];
 };
 #define POK_BOOL_MASK 0x00008000
@ -804,6 +810,14 @@ void stratum_free_job(struct stratum_ctx *sctx);
 bool rpc2_stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
 bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params);
 bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params);
 bool equi_stratum_submit(struct pool_infos *pool, struct work *work);
 void equi_work_set_target(struct work* work, double diff);
 void equi_store_work_solution(struct work* work, uint32_t* hash, void* sol_data);
 int equi_verify_sol(void * const hdr, void * const sol);
 double equi_network_diff(struct work *work);
 void hashlog_remember_submit(struct work* work, uint32_t nonce);
 void hashlog_remember_scan_range(struct work* work);
 double hashlog_get_sharediff(char* jobid, int idnonce, double defvalue);
--- a/util.cpp
+++ b/util.cpp
@ -211,35 +211,31 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0)
 #endif
 }
-void format_hashrate(double hashrate, char *output)
+void format_hashrate_unit(double hashrate, char *output, const char *unit)
 {
-	char prefix = '\0';
+	char prefix[2] = { 0, 0 };
 	if (hashrate < 10000) {
 		// nop
 	}
 	else if (hashrate < 1e7) {
-		prefix = 'k';
+		prefix[0] = 'k';
 		hashrate *= 1e-3;
 	}
 	else if (hashrate < 1e10) {
-		prefix = 'M';
+		prefix[0] = 'M';
 		hashrate *= 1e-6;
 	}
 	else if (hashrate < 1e13) {
-		prefix = 'G';
+		prefix[0] = 'G';
 		hashrate *= 1e-9;
 	}
 	else {
-		prefix = 'T';
+		prefix[0] = 'T';
 		hashrate *= 1e-12;
 	}
-	sprintf(
+	sprintf(output, "%.2f %s%s", hashrate, prefix, unit);
 		output,
 		prefix ? "%.2f %cH/s" : "%.2f H/s%c",
 		hashrate, prefix
 	);
 }
 static void databuf_free(struct data_buffer *db)
@ -1179,14 +1175,27 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i
 	}
 	xn2_size = (int) json_integer_value(json_array_get(params, pndx+1));
 	if (!xn2_size) {
-		applog(LOG_ERR, "Failed to get extranonce2_size");
+		char algo[64] = { 0 };
-		goto out;
+		get_currentalgo(algo, sizeof(algo));
 		if (strcmp(algo, "equihash") == 0) {
 			int xn1_size = (int)strlen(xnonce1) / 2;
 			xn2_size = 32 - xn1_size;
 			if (xn1_size < 4 || xn1_size > 12) {
 				// This miner iterates the nonces at data32[30]
 				applog(LOG_ERR, "Unsupported extranonce size of %d (12 maxi)", xn1_size);
 				goto out;
 			}
 			goto skip_n2;
 		} else {
 			applog(LOG_ERR, "Failed to get extranonce2_size");
 			goto out;
 		}
 	}
 	if (xn2_size < 2 || xn2_size > 16) {
-		applog(LOG_INFO, "Failed to get valid n2size in parse_extranonce");
+		applog(LOG_ERR, "Failed to get valid n2size in parse_extranonce (%d)", xn2_size);
 		goto out;
 	}
-
+skip_n2:
 	pthread_mutex_lock(&stratum_work_lock);
 	if (sctx->xnonce1)
 		free(sctx->xnonce1);
@ -1441,6 +1450,10 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	get_currentalgo(algo, sizeof(algo));
 	bool has_claim = !strcasecmp(algo, "lbry");
 	if (sctx->is_equihash) {
 		return equi_stratum_notify(sctx, params);
 	}
 	job_id = json_string_value(json_array_get(params, p++));
 	prevhash = json_string_value(json_array_get(params, p++));
 	if (has_claim) {
@ -1842,6 +1855,11 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s)
 		ret = stratum_set_difficulty(sctx, params);
 		goto out;
 	}
 	if (!strcasecmp(method, "mining.set_target")) {
 		sctx->is_equihash = true;
 		ret = equi_stratum_set_target(sctx, params);
 		goto out;
 	}
 	if (!strcasecmp(method, "mining.set_extranonce")) {
 		ret = stratum_parse_extranonce(sctx, params, 0);
 		goto out;