From 7ffe65c262645b9542cd11ec5d3f36eec8befa49 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Wed, 9 Mar 2016 22:58:47 +0000
Subject: [PATCH] blake2s algo

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
---
 Algo256/blake2s.cu      | 514 ++++++++++++++++++++++++++++++++++++++++
 Makefile.am             |   1 +
 README.txt              |   7 +-
 algos.h                 |   2 +
 bench.cpp               |   1 +
 ccminer.cpp             |   8 +-
 ccminer.vcxproj         |   2 +
 ccminer.vcxproj.filters |   6 +
 configure.ac            |   2 +-
 cpuminer-config.h       |   6 +-
 miner.h                 |   3 +
 sph/blake2s.c           | 387 ++++++++++++++++++++++++++++++
 sph/blake2s.h           | 150 ++++++++++++
 util.cpp                |   3 +
 14 files changed, 1086 insertions(+), 6 deletions(-)
 create mode 100644 Algo256/blake2s.cu
 create mode 100644 sph/blake2s.c
 create mode 100644 sph/blake2s.h

diff --git a/Algo256/blake2s.cu b/Algo256/blake2s.cu
new file mode 100644
index 0000000..4851abb
--- /dev/null
+++ b/Algo256/blake2s.cu
@@ -0,0 +1,514 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include "miner.h"
+
+#define NATIVE_LITTLE_ENDIAN
+
+extern "C" {
+#include <sph/blake2s.h>
+}
+
+static __thread blake2s_state ALIGN(64) s_midstate;
+static __thread blake2s_state ALIGN(64) s_ctx;
+
+//#define GPU_MIDSTATE
+#define MIDLEN 76
+#define A 64
+
+#include "cuda_helper.h"
+
+#ifdef __INTELLISENSE__
+#define __byte_perm(x, y, b) x
+#endif
+
+#ifndef GPU_MIDSTATE
+__constant__ uint32_t d_data[20];
+#else
+__constant__ blake2s_state ALIGN(8) d_state[1];
+#endif
+
+/* 16 adapters max */
+static uint32_t *d_resNonce[MAX_GPUS];
+static uint32_t *h_resNonce[MAX_GPUS];
+
+/* threads per block */
+#define TPB 512
+
+/* max count of found nonces in one call */
+#define NBN 2
+static uint32_t extra_results[NBN] = { UINT32_MAX };
+
+extern "C" void blake2s_hash(void *output, const void *input)
+{
+	uint8_t _ALIGN(A) hash[BLAKE2S_OUTBYTES];
+	blake2s_state blake2_ctx;
+
+	blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES);
+	blake2s_update(&blake2_ctx, (uint8_t*) input, 80);
+	blake2s_final(&blake2_ctx, hash, BLAKE2S_OUTBYTES);
+
+	memcpy(output, hash, 32);
+}
+
+__host__
+inline void blake2s_hash_end(uint32_t *output, const uint32_t *input)
+{
+	s_ctx.buflen = MIDLEN;
+	memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
+	blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80-MIDLEN);
+	blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
+}
+
+__host__
+void blake2s_cpu_setBlock(uint32_t *penddata, blake2s_state *pstate)
+{
+#ifndef GPU_MIDSTATE
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, penddata, 80, 0, cudaMemcpyHostToDevice));
+#else
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_state, pstate, sizeof(blake2s_state), 0, cudaMemcpyHostToDevice));
+#endif
+}
+
+__device__ __forceinline__
+uint32_t gpu_load32(const void *src) {
+	return *(uint32_t *)(src);
+}
+
+__device__ __forceinline__
+void gpu_store32(void *dst, uint32_t dw) {
+	*(uint32_t *)(dst) = dw;
+}
+
+__device__ __forceinline__
+void gpu_store64(void *dst, uint64_t lw) {
+	*(uint64_t *)(dst) = lw;
+}
+
+__device__ __forceinline__
+uint64_t gpu_load48(const void *src)
+{
+	const uint8_t *p = (const uint8_t *)src;
+	uint64_t w = *p++;
+	w |= (uint64_t)(*p++) << 8;
+	w |= (uint64_t)(*p++) << 16;
+	w |= (uint64_t)(*p++) << 24;
+	w |= (uint64_t)(*p++) << 32;
+	w |= (uint64_t)(*p++) << 40;
+	return w;
+}
+
+__device__ __forceinline__
+void gpu_blake2s_set_lastnode(blake2s_state *S) {
+	S->f[1] = ~0U;
+}
+
+__device__ __forceinline__
+void gpu_blake2s_clear_lastnode(blake2s_state *S) {
+	S->f[1] = 0U;
+}
+
+__device__ __forceinline__
+void gpu_blake2s_increment_counter(blake2s_state *S, const uint32_t inc)
+{
+	S->t[0] += inc;
+	S->t[1] += ( S->t[0] < inc );
+}
+
+__device__ __forceinline__
+void gpu_blake2s_set_lastblock(blake2s_state *S)
+{
+	if (S->last_node) gpu_blake2s_set_lastnode(S);
+	S->f[0] = ~0U;
+}
+
+__device__
+void gpu_blake2s_compress(blake2s_state *S, const uint32_t *block)
+{
+	uint32_t m[16];
+	uint32_t v[16];
+
+	const uint32_t blake2s_IV[8] = {
+		0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+		0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+	};
+
+	const uint8_t blake2s_sigma[10][16] = {
+		{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+		{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+		{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+		{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+		{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+		{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+		{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+		{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+		{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+		{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 },
+	};
+
+	#pragma unroll
+	for(int i = 0; i < 16; i++)
+		m[i] = block[i];
+
+	#pragma unroll
+	for(int i = 0; i < 8; i++)
+		v[i] = S->h[i];
+
+	v[ 8] = blake2s_IV[0];
+	v[ 9] = blake2s_IV[1];
+	v[10] = blake2s_IV[2];
+	v[11] = blake2s_IV[3];
+	v[12] = S->t[0] ^ blake2s_IV[4];
+	v[13] = S->t[1] ^ blake2s_IV[5];
+	v[14] = S->f[0] ^ blake2s_IV[6];
+	v[15] = S->f[1] ^ blake2s_IV[7];
+
+	#define G(r,i,a,b,c,d) { \
+		a += b + m[blake2s_sigma[r][2*i+0]]; \
+		d = __byte_perm(d ^ a, 0, 0x1032); /* d = ROTR32(d ^ a, 16); */ \
+		c = c + d; \
+		b = ROTR32(b ^ c, 12); \
+		a += b + m[blake2s_sigma[r][2*i+1]]; \
+		d = __byte_perm(d ^ a, 0, 0x0321); /* ROTR32(d ^ a, 8); */ \
+		c = c + d; \
+		b = ROTR32(b ^ c, 7); \
+	}
+
+	#define ROUND(r) { \
+		G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+		G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+		G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+		G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+		G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+		G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+		G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+		G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+	}
+
+	ROUND( 0 );
+	ROUND( 1 );
+	ROUND( 2 );
+	ROUND( 3 );
+	ROUND( 4 );
+	ROUND( 5 );
+	ROUND( 6 );
+	ROUND( 7 );
+	ROUND( 8 );
+	ROUND( 9 );
+
+	#pragma unroll
+	for(int i = 0; i < 8; i++)
+		S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
+
+#undef G
+#undef ROUND
+}
+
+__device__ __forceinline__
+void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen)
+{
+	while(inlen > 0)
+	{
+		const int left = S->buflen;
+		size_t fill = 2 * BLAKE2S_BLOCKBYTES - left;
+		if(inlen > fill)
+		{
+			memcpy(S->buf + left, in, fill); // Fill buffer
+			S->buflen += fill;
+
+			gpu_blake2s_increment_counter(S, BLAKE2S_BLOCKBYTES);
+			gpu_blake2s_compress(S, (uint32_t*) S->buf); // Compress
+			memcpy(S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES); // Shift buffer left
+			S->buflen -= BLAKE2S_BLOCKBYTES;
+			in += fill;
+			inlen -= fill;
+		}
+		else // inlen <= fill
+		{
+			memcpy(S->buf + left, in, (size_t) inlen);
+			S->buflen += (size_t) inlen; // Be lazy, do not compress
+			in += inlen;
+			inlen -= inlen;
+		}
+	}
+}
+
+__device__ __forceinline__
+void gpu_blake2s_update76(blake2s_state *S, const void *in)
+{
+	uint64_t *b64 = (uint64_t*) S->buf;
+	uint64_t *i64 = (uint64_t*) in;
+	#pragma unroll
+	for (int i=0; i < 80/8; i++)
+		b64[i] = i64[i];
+	//S->buflen = 76;
+}
+
+__device__ __forceinline__
+void gpu_blake2s_update_nonce(blake2s_state *S, const uint32_t nonce)
+{
+	gpu_store32(&S->buf[76], nonce);
+	S->buflen = 80;
+}
+
+__device__ __forceinline__
+void gpu_blake2s_final(blake2s_state *S, uint32_t *out)
+{
+	//if (S->buflen > BLAKE2S_BLOCKBYTES)
+	{
+		gpu_blake2s_increment_counter(S, BLAKE2S_BLOCKBYTES);
+		gpu_blake2s_compress(S, (uint32_t*) S->buf);
+		S->buflen -= BLAKE2S_BLOCKBYTES;
+		//memcpy(S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen);
+	}
+
+	gpu_blake2s_increment_counter(S, (uint32_t)S->buflen);
+	gpu_blake2s_set_lastblock(S);
+	//memset(&S->buf[S->buflen], 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen); /* Padding */
+	gpu_blake2s_compress(S, (uint32_t*) (S->buf + BLAKE2S_BLOCKBYTES));
+
+	#pragma unroll
+	for (int i = 0; i < 8; i++)
+		out[i] = S->h[i];
+}
+
+/* init2 xors IV with input parameter block */
+__device__ __forceinline__
+void gpu_blake2s_init_param(blake2s_state *S, const blake2s_param *P)
+{
+	//blake2s_IV
+	S->h[0] = 0x6A09E667UL;
+	S->h[1] = 0xBB67AE85UL;
+	S->h[2] = 0x3C6EF372UL;
+	S->h[3] = 0xA54FF53AUL;
+	S->h[4] = 0x510E527FUL;
+	S->h[5] = 0x9B05688CUL;
+	S->h[6] = 0x1F83D9ABUL;
+	S->h[7] = 0x5BE0CD19UL;
+
+	S->t[0] = 0; S->t[1] = 0;
+	S->f[0] = 0; S->f[1] = 0;
+	S->last_node = 0;
+
+	S->buflen = 0;
+
+	#pragma unroll
+	for (int i = 0; i < sizeof(S->buf)/4; i++)
+		gpu_store32(S->buf + (4*i), 0);
+
+	uint32_t *p = (uint32_t*) P;
+
+	/* IV XOR ParamBlock */
+	for (int i = 0; i < 8; i++)
+		S->h[i] ^= gpu_load32(&p[i]);
+}
+
+// Sequential blake2s initialization
+__device__ __forceinline__
+void gpu_blake2s_init(blake2s_state *S, const uint8_t outlen)
+{
+	blake2s_param P[1];
+
+	// if (!outlen || outlen > BLAKE2S_OUTBYTES) return;
+
+	P->digest_length = outlen;
+	P->key_length    = 0;
+	P->fanout        = 1;
+	P->depth         = 1;
+
+	P->leaf_length = 0;
+	gpu_store64(P->node_offset, 0);
+	//P->node_depth    = 0;
+	//P->inner_length  = 0;
+
+	gpu_store64(&P->salt, 0);
+	gpu_store64(&P->personal, 0);
+
+	gpu_blake2s_init_param(S, P);
+}
+
+__device__ __forceinline__
+void gpu_copystate(blake2s_state *dst, blake2s_state *src)
+{
+	uint64_t* d64 = (uint64_t*) dst;
+	uint64_t* s64 = (uint64_t*) src;
+	#pragma unroll
+	for (int i=0; i < (32 + 16 + 2 * BLAKE2S_BLOCKBYTES)/8; i++)
+		gpu_store64(&d64[i], s64[i]);
+	dst->buflen = src->buflen;
+	dst->last_node = src->last_node;
+}
+
+__global__
+void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	const uint32_t nonce = startNonce + thread;
+	blake2s_state ALIGN(8) blake2_ctx;
+
+#ifndef GPU_MIDSTATE
+	gpu_blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES);
+	//gpu_blake2s_update(&blake2_ctx, (uint8_t*) d_data, 76);
+	gpu_blake2s_update76(&blake2_ctx, (uint64_t*) d_data);
+#else
+	gpu_copystate(&blake2_ctx, &d_state[0]);
+#endif
+	gpu_blake2s_update_nonce(&blake2_ctx, nonce);
+
+	uint32_t hash[8];
+	gpu_blake2s_final(&blake2_ctx, hash);
+
+	if (hash[7] <= target2.x && hash[6] <= target2.y) {
+#if NBN == 2
+		if (resNonce[0] != UINT32_MAX)
+			resNonce[1] = nonce;
+		else
+			resNonce[0] = nonce;
+#else
+		resNonce[0] = nonce;
+#endif
+	}
+}
+
+__host__
+uint32_t blake2s_host_hash(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2)
+{
+	uint32_t result = UINT32_MAX;
+
+	dim3 grid((threads + TPB-1)/TPB);
+	dim3 block(TPB);
+
+	/* Check error on Ctrl+C or kill to prevent segfaults on exit */
+	if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
+		return result;
+
+	blake2s_gpu_hash <<<grid, block>>> (threads, startNonce, d_resNonce[thr_id], target2);
+	cudaThreadSynchronize();
+
+	if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
+		result = h_resNonce[thr_id][0];
+		for (int n=0; n < (NBN-1); n++)
+			extra_results[n] = h_resNonce[thr_id][n+1];
+	}
+	return result;
+}
+
+static bool init[MAX_GPUS] = { 0 };
+
+extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t _ALIGN(64) endiandata[20];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+
+	const uint32_t first_nonce = pdata[19];
+
+	int dev_id = device_map[thr_id];
+	int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 26 : 22;
+	if (device_sm[dev_id] < 350) intensity = 22;
+
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	if (opt_benchmark) {
+		ptarget[7] = swab32(0xff);
+	}
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage (linux)
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+			CUDA_LOG_ERROR();
+		}
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
+		CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
+		init[thr_id] = true;
+	}
+
+	for (int i=0; i < 19; i++) {
+		be32enc(&endiandata[i], pdata[i]);
+	}
+
+	// midstate
+	memset(s_midstate.buf, 0, sizeof(s_midstate.buf));
+	blake2s_init(&s_midstate, BLAKE2S_OUTBYTES);
+	blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
+	memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state));
+
+	blake2s_cpu_setBlock(endiandata, &s_midstate);
+
+	uint2 gpu_target = make_uint2(ptarget[7], ptarget[6]);
+	const uint32_t Htarg = ptarget[7];
+
+	do {
+		uint32_t foundNonce = blake2s_host_hash(thr_id, throughput, pdata[19], gpu_target);
+
+		if (foundNonce != UINT32_MAX)
+		{
+			uint32_t _ALIGN(A) vhashcpu[8];
+
+			//blake2s_hash(vhashcpu, endiandata);
+			le32enc(&endiandata[19], foundNonce);
+			blake2s_hash_end(vhashcpu, endiandata);
+
+			if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
+				work_set_target_ratio(work, vhashcpu);
+				*hashes_done = pdata[19] + throughput - first_nonce + 1;
+				pdata[19] = work->nonces[0] = swab32(foundNonce);
+#if NBN > 1
+				if (extra_results[0] != UINT32_MAX) {
+					le32enc(&endiandata[19], extra_results[0]);
+					blake2s_hash_end(vhashcpu, endiandata);
+					if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
+						work->nonces[1] = swab32(extra_results[0]);
+						if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio) {
+							work_set_target_ratio(work, vhashcpu);
+							xchg(work->nonces[1], pdata[19]);
+						}
+						return 2;
+					}
+					extra_results[0] = UINT32_MAX;
+				}
+#endif
+				return 1;
+			} else {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				applog_hex(pdata, 80);
+				applog_hex(ptarget, 32);
+				applog_hex(vhashcpu, 32);
+			}
+		}
+
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + pdata[19]);
+
+	*hashes_done = pdata[19] - first_nonce + 1;
+
+	MyStreamSynchronize(NULL, 0, device_map[thr_id]);
+
+	return 0;
+}
+
+// cleanup
+extern "C" void free_blake2s(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaDeviceSynchronize();
+
+	cudaFreeHost(h_resNonce[thr_id]);
+	cudaFree(d_resNonce[thr_id]);
+
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
diff --git a/Makefile.am b/Makefile.am
index 1545619..9b80922 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -37,6 +37,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \
 			  Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
 			  Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \
+			  Algo256/blake2s.cu sph/blake2s.c \
 			  Algo256/bmw.cu Algo256/cuda_bmw.cu \
 			  JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
 			  JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \
diff --git a/README.txt b/README.txt
index 995bfbc..a30d6c9 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
 
-ccMiner release 1.7.4 (Feb 2015) "Decred Stratum and MrM4D VNL"
+ccMiner release 1.7.5 (Mar 2015) "Blake2-S"
 ---------------------------------------------------------------
 
 ***************************************************************
@@ -70,6 +70,7 @@ its command line interface and options.
   -a, --algo=ALGO       specify the algorithm to use
                           blake       use to mine Saffroncoin (Blake256)
                           blakecoin   use to mine Old Blake 256
+                          blake2s     use to mine Nevacoin (Blake2-S 256)
                           bmw         use to mine Midnight
                           c11/flax    use to mine Chaincoin and Flax
                           decred      use to mine Decred 180 bytes Blake256-14
@@ -237,6 +238,10 @@ features.
 
 >>> RELEASE HISTORY <<<
 
+  Mar. 12th 2015  v1.7.5
+                  Blake2S Algo
+                  ...
+
   Feb. 28th 2015  v1.7.4 (1.7.3 was a preview, not official)
                   Decred simplified stratum (getwork over stratum)
                   Vanilla kernel by MrMad
diff --git a/algos.h b/algos.h
index a0cefb0..6e116b2 100644
--- a/algos.h
+++ b/algos.h
@@ -7,6 +7,7 @@
 enum sha_algos {
 	ALGO_BLAKECOIN = 0,
 	ALGO_BLAKE,
+	ALGO_BLAKE2S,
 	ALGO_BMW,
 	ALGO_C11,
 	ALGO_DEEP,
@@ -53,6 +54,7 @@ extern volatile enum sha_algos opt_algo;
 static const char *algo_names[] = {
 	"blakecoin",
 	"blake",
+	"blake2s",
 	"bmw",
 	"c11",
 	"deep",
diff --git a/bench.cpp b/bench.cpp
index c203f27..da8eecd 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -45,6 +45,7 @@ void algo_free_all(int thr_id)
 {
 	// only initialized algos will be freed
 	free_blake256(thr_id);
+	free_blake2s(thr_id);
 	free_bmw(thr_id);
 	free_c11(thr_id);
 	free_decred(thr_id);
diff --git a/ccminer.cpp b/ccminer.cpp
index ecdff2a..fd6c6a0 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -209,6 +209,7 @@ Usage: " PROGRAM_NAME " [OPTIONS]\n\
 Options:\n\
   -a, --algo=ALGO       specify the hash algorithm to use\n\
 			blake       Blake 256 (SFR)\n\
+			blake2s     Blake2-S 256 (NEVA)\n\
 			blakecoin   Fast Blake 256 (8 rounds)\n\
 			bmw         BMW 256\n\
 			c11/flax    X11 variant\n\
@@ -803,6 +804,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 			break;
 		case ALGO_BLAKE:
 		case ALGO_BLAKECOIN:
+		case ALGO_BLAKE2S:
 		case ALGO_BMW:
 		case ALGO_VANILLA:
 			// fast algos require that...
@@ -1828,6 +1830,7 @@ static void *miner_thread(void *userdata)
 				minmax = 0x80000000U;
 				break;
 			case ALGO_BLAKE:
+			case ALGO_BLAKE2S:
 			case ALGO_BMW:
 			case ALGO_DECRED:
 			//case ALGO_WHIRLPOOLX:
@@ -1911,6 +1914,9 @@ static void *miner_thread(void *userdata)
 		case ALGO_BLAKE:
 			rc = scanhash_blake256(thr_id, &work, max_nonce, &hashes_done, 14);
 			break;
+		case ALGO_BLAKE2S:
+			rc = scanhash_blake2s(thr_id, &work, max_nonce, &hashes_done);
+			break;
 		case ALGO_BMW:
 			rc = scanhash_bmw(thr_id, &work, max_nonce, &hashes_done);
 			break;
@@ -2041,7 +2047,7 @@ static void *miner_thread(void *userdata)
 
 		// todo: update all algos to use work->nonces
 		work.nonces[0] = nonceptr[0];
-		if (opt_algo != ALGO_DECRED) {
+		if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S) {
 			work.nonces[1] = nonceptr[2];
 		}
 
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 3d9b559..c072aaa 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -262,6 +262,7 @@
     <ClCompile Include="skein2.cpp" />
     <ClCompile Include="sph\aes_helper.c" />
     <ClCompile Include="sph\blake.c" />
+    <ClCompile Include="sph\blake2s.c" />
     <ClCompile Include="sph\bmw.c" />
     <ClCompile Include="sph\cubehash.c" />
     <ClCompile Include="sph\echo.c" />
@@ -408,6 +409,7 @@
       <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
       <FastMath>true</FastMath>
     </CudaCompile>
+    <CudaCompile Include="Algo256\blake2s.cu" />
     <CudaCompile Include="Algo256\decred.cu" />
     <CudaCompile Include="Algo256\vanilla.cu" />
     <CudaCompile Include="Algo256\keccak256.cu" />
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index 7f24a1d..42c1287 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -258,6 +258,9 @@
     <ClCompile Include="bignum.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="sph\blake2s.c">
+      <Filter>Source Files\sph</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="algos.h">
@@ -706,6 +709,9 @@
     <CudaCompile Include="lyra2\lyra2REv2.cu">
       <Filter>Source Files\CUDA\lyra2</Filter>
     </CudaCompile>
+    <CudaCompile Include="Algo256\blake2s.cu">
+      <Filter>Source Files\CUDA\Algo256</Filter>
+    </CudaCompile>
   </ItemGroup>
   <ItemGroup>
     <Image Include="res\ccminer.ico">
diff --git a/configure.ac b/configure.ac
index 53ae463..7f2b345 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [1.7.4], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [1.7.5], [], [ccminer], [http://github.com/tpruvot/ccminer])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpuminer-config.h b/cpuminer-config.h
index 64783ce..fff627b 100644
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@@ -162,7 +162,7 @@
 #define PACKAGE_NAME "ccminer"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 1.7.4"
+#define PACKAGE_STRING "ccminer 1.7.5"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "ccminer"
@@ -171,7 +171,7 @@
 #define PACKAGE_URL "http://github.com/tpruvot/ccminer"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.7.4"
+#define PACKAGE_VERSION "1.7.5"
 
 /* If using the C implementation of alloca, define if you know the
    direction of stack growth for your system; otherwise it will be
@@ -185,7 +185,7 @@
 #define STDC_HEADERS 1
 
 /* Version number of package */
-#define VERSION "1.7.4"
+#define VERSION "1.7.5"
 
 /* Define curl_free() as free() if our version of curl lacks curl_free. */
 /* #undef curl_free */
diff --git a/miner.h b/miner.h
index 2f99bf9..fc2bdba 100644
--- a/miner.h
+++ b/miner.h
@@ -262,6 +262,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len);
 struct work;
 
 extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds);
+extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -304,6 +305,7 @@ extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonc
 void algo_free_all(int thr_id);
 
 extern void free_blake256(int thr_id);
+extern void free_blake2s(int thr_id);
 extern void free_bmw(int thr_id);
 extern void free_c11(int thr_id);
 extern void free_decred(int thr_id);
@@ -773,6 +775,7 @@ void applog_compare_hash(void *hash, void *hash_ref);
 
 void print_hash_tests(void);
 void blake256hash(void *output, const void *input, int8_t rounds);
+void blake2s_hash(void *output, const void *input);
 void bmw_hash(void *state, const void *input);
 void c11hash(void *output, const void *input);
 void decred_hash(void *state, const void *input);
diff --git a/sph/blake2s.c b/sph/blake2s.c
new file mode 100644
index 0000000..62b5a39
--- /dev/null
+++ b/sph/blake2s.c
@@ -0,0 +1,387 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include "sph_types.h"
+#include "blake2s.h"
+
+static const uint32_t blake2s_IV[8] =
+{
+	0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+	0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static const uint8_t blake2s_sigma[10][16] =
+{
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
+};
+
+static inline int blake2s_set_lastnode( blake2s_state *S )
+{
+	S->f[1] = ~0U;
+	return 0;
+}
+
+static inline int blake2s_clear_lastnode( blake2s_state *S )
+{
+	S->f[1] = 0U;
+	return 0;
+}
+
+/* Some helper functions, not necessarily useful */
+static inline int blake2s_set_lastblock( blake2s_state *S )
+{
+	if( S->last_node ) blake2s_set_lastnode( S );
+
+	S->f[0] = ~0U;
+	return 0;
+}
+
+static inline int blake2s_clear_lastblock( blake2s_state *S )
+{
+	if( S->last_node ) blake2s_clear_lastnode( S );
+
+	S->f[0] = 0U;
+	return 0;
+}
+
+static inline int blake2s_increment_counter( blake2s_state *S, const uint32_t inc )
+{
+	S->t[0] += inc;
+	S->t[1] += ( S->t[0] < inc );
+	return 0;
+}
+
+// Parameter-related functions
+static inline int blake2s_param_set_digest_length( blake2s_param *P, const uint8_t digest_length )
+{
+	P->digest_length = digest_length;
+	return 0;
+}
+
+static inline int blake2s_param_set_fanout( blake2s_param *P, const uint8_t fanout )
+{
+	P->fanout = fanout;
+	return 0;
+}
+
+static inline int blake2s_param_set_max_depth( blake2s_param *P, const uint8_t depth )
+{
+	P->depth = depth;
+	return 0;
+}
+
+static inline int blake2s_param_set_leaf_length( blake2s_param *P, const uint32_t leaf_length )
+{
+	store32( &P->leaf_length, leaf_length );
+	return 0;
+}
+
+static inline int blake2s_param_set_node_offset( blake2s_param *P, const uint64_t node_offset )
+{
+	store48( P->node_offset, node_offset );
+	return 0;
+}
+
+static inline int blake2s_param_set_node_depth( blake2s_param *P, const uint8_t node_depth )
+{
+	P->node_depth = node_depth;
+	return 0;
+}
+
+static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_t inner_length )
+{
+	P->inner_length = inner_length;
+	return 0;
+}
+
+static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] )
+{
+	memcpy( P->salt, salt, BLAKE2S_SALTBYTES );
+	return 0;
+}
+
+static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] )
+{
+	memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES );
+	return 0;
+}
+
+static inline int blake2s_init0( blake2s_state *S )
+{
+	memset( S, 0, sizeof( blake2s_state ) );
+
+	for( int i = 0; i < 8; ++i ) S->h[i] = blake2s_IV[i];
+
+	return 0;
+}
+
+/* init2 xors IV with input parameter block */
+int blake2s_init_param( blake2s_state *S, const blake2s_param *P )
+{
+	blake2s_init0( S );
+	uint32_t *p = ( uint32_t * )( P );
+
+	/* IV XOR ParamBlock */
+	for( size_t i = 0; i < 8; ++i )
+		S->h[i] ^= load32( &p[i] );
+
+	return 0;
+}
+
+
+// Sequential blake2s initialization
+int blake2s_init( blake2s_state *S, const uint8_t outlen )
+{
+	blake2s_param P[1];
+
+	/* Move interval verification here? */
+	if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
+
+	P->digest_length = outlen;
+	P->key_length    = 0;
+	P->fanout        = 1;
+	P->depth         = 1;
+	store32( &P->leaf_length, 0 );
+	store48( &P->node_offset, 0 );
+	P->node_depth    = 0;
+	P->inner_length  = 0;
+	// memset(P->reserved, 0, sizeof(P->reserved) );
+	memset( P->salt,     0, sizeof( P->salt ) );
+	memset( P->personal, 0, sizeof( P->personal ) );
+	return blake2s_init_param( S, P );
+}
+
+int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
+{
+	blake2s_param P[1];
+
+	if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
+
+	if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1;
+
+	P->digest_length = outlen;
+	P->key_length    = keylen;
+	P->fanout        = 1;
+	P->depth         = 1;
+	store32( &P->leaf_length, 0 );
+	store48( &P->node_offset, 0 );
+	P->node_depth    = 0;
+	P->inner_length  = 0;
+	// memset(P->reserved, 0, sizeof(P->reserved) );
+	memset( P->salt,     0, sizeof( P->salt ) );
+	memset( P->personal, 0, sizeof( P->personal ) );
+
+	if( blake2s_init_param( S, P ) < 0 ) return -1;
+
+	{
+		uint8_t block[BLAKE2S_BLOCKBYTES];
+		memset( block, 0, BLAKE2S_BLOCKBYTES );
+		memcpy( block, key, keylen );
+		blake2s_update( S, block, BLAKE2S_BLOCKBYTES );
+		secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
+	}
+	return 0;
+}
+
+int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
+{
+	uint32_t m[16];
+	uint32_t v[16];
+
+	for( size_t i = 0; i < 16; ++i )
+		m[i] = load32( block + i * sizeof( m[i] ) );
+
+	for( size_t i = 0; i < 8; ++i )
+		v[i] = S->h[i];
+
+	v[ 8] = blake2s_IV[0];
+	v[ 9] = blake2s_IV[1];
+	v[10] = blake2s_IV[2];
+	v[11] = blake2s_IV[3];
+	v[12] = S->t[0] ^ blake2s_IV[4];
+	v[13] = S->t[1] ^ blake2s_IV[5];
+	v[14] = S->f[0] ^ blake2s_IV[6];
+	v[15] = S->f[1] ^ blake2s_IV[7];
+#define G(r,i,a,b,c,d) \
+	do { \
+		a = a + b + m[blake2s_sigma[r][2*i+0]]; \
+		d = SPH_ROTR32(d ^ a, 16); \
+		c = c + d; \
+		b = SPH_ROTR32(b ^ c, 12); \
+		a = a + b + m[blake2s_sigma[r][2*i+1]]; \
+		d = SPH_ROTR32(d ^ a, 8); \
+		c = c + d; \
+		b = SPH_ROTR32(b ^ c, 7); \
+	} while(0)
+#define ROUND(r)  \
+	do { \
+		G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+		G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+		G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+		G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+		G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+		G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+		G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+		G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+	} while(0)
+	ROUND( 0 );
+	ROUND( 1 );
+	ROUND( 2 );
+	ROUND( 3 );
+	ROUND( 4 );
+	ROUND( 5 );
+	ROUND( 6 );
+	ROUND( 7 );
+	ROUND( 8 );
+	ROUND( 9 );
+
+	for( size_t i = 0; i < 8; ++i )
+		S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
+
+#undef G
+#undef ROUND
+	return 0;
+}
+
+
+int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen )
+{
+	while( inlen > 0 )
+	{
+		size_t left = S->buflen;
+		size_t fill = 2 * BLAKE2S_BLOCKBYTES - left;
+
+		if( inlen > fill )
+		{
+			memcpy( S->buf + left, in, fill ); // Fill buffer
+			S->buflen += fill;
+			blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
+			blake2s_compress( S, S->buf ); // Compress
+			memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left
+			S->buflen -= BLAKE2S_BLOCKBYTES;
+			in += fill;
+			inlen -= fill;
+		}
+		else // inlen <= fill
+		{
+			memcpy(S->buf + left, in, (size_t) inlen);
+			S->buflen += (size_t) inlen; // Be lazy, do not compress
+			in += inlen;
+			inlen -= inlen;
+		}
+	}
+
+	return 0;
+}
+
+int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
+{
+	uint8_t buffer[BLAKE2S_OUTBYTES];
+
+	if( S->buflen > BLAKE2S_BLOCKBYTES )
+	{
+		blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
+		blake2s_compress( S, S->buf );
+		S->buflen -= BLAKE2S_BLOCKBYTES;
+		memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen );
+	}
+
+	blake2s_increment_counter( S, ( uint32_t )S->buflen );
+	blake2s_set_lastblock( S );
+	memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
+	blake2s_compress( S, S->buf );
+
+	for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
+		store32( buffer + sizeof( S->h[i] ) * i, S->h[i] );
+
+	memcpy( out, buffer, outlen );
+	return 0;
+}
+
+int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+{
+	blake2s_state S[1];
+
+	/* Verify parameters */
+	if ( NULL == in ) return -1;
+
+	if ( NULL == out ) return -1;
+
+	if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */
+
+	if( keylen > 0 )
+	{
+		if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+	}
+	else
+	{
+		if( blake2s_init( S, outlen ) < 0 ) return -1;
+	}
+
+	blake2s_update( S, ( uint8_t * )in, inlen );
+	blake2s_final( S, out, outlen );
+	return 0;
+}
+
+#if defined(__cplusplus)
+}
+#endif
+
+
+#if defined(BLAKE2S_SELFTEST)
+#include <string.h>
+#include "blake2-kat.h" /* test data not included */
+int main( int argc, char **argv )
+{
+	uint8_t key[BLAKE2S_KEYBYTES];
+	uint8_t buf[KAT_LENGTH];
+
+	for( size_t i = 0; i < BLAKE2S_KEYBYTES; ++i )
+		key[i] = ( uint8_t )i;
+
+	for( size_t i = 0; i < KAT_LENGTH; ++i )
+		buf[i] = ( uint8_t )i;
+
+	for( size_t i = 0; i < KAT_LENGTH; ++i )
+	{
+		uint8_t hash[BLAKE2S_OUTBYTES];
+		blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
+
+		if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) )
+		{
+			puts( "error" );
+			return -1;
+		}
+	}
+
+	puts( "ok" );
+	return 0;
+}
+#endif
diff --git a/sph/blake2s.h b/sph/blake2s.h
new file mode 100644
index 0000000..64aa25b
--- /dev/null
+++ b/sph/blake2s.h
@@ -0,0 +1,150 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+#pragma once
+#ifndef __BLAKE2_H__
+#define __BLAKE2_H__
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <inttypes.h>
+#define inline __inline
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+/* blake2-impl.h */
+
+static inline uint32_t load32(const void *src)
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return *(uint32_t *)(src);
+#else
+	const uint8_t *p = (uint8_t *)src;
+	uint32_t w = *p++;
+	w |= (uint32_t)(*p++) << 8;
+	w |= (uint32_t)(*p++) << 16;
+	w |= (uint32_t)(*p++) << 24;
+	return w;
+#endif
+}
+
+static inline void store32(void *dst, uint32_t w)
+{
+#if defined(NATIVE_LITTLE_ENDIAN)
+	*(uint32_t *)(dst) = w;
+#else
+	uint8_t *p = (uint8_t *)dst;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w;
+#endif
+}
+
+static inline uint64_t load48(const void *src)
+{
+	const uint8_t *p = (const uint8_t *)src;
+	uint64_t w = *p++;
+	w |= (uint64_t)(*p++) << 8;
+	w |= (uint64_t)(*p++) << 16;
+	w |= (uint64_t)(*p++) << 24;
+	w |= (uint64_t)(*p++) << 32;
+	w |= (uint64_t)(*p++) << 40;
+	return w;
+}
+
+static inline void store48(void *dst, uint64_t w)
+{
+	uint8_t *p = (uint8_t *)dst;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w; w >>= 8;
+	*p++ = (uint8_t)w;
+}
+
+/* prevents compiler optimizing out memset() */
+static inline void secure_zero_memory(void *v, size_t n)
+{
+	volatile uint8_t *p = ( volatile uint8_t * )v;
+
+	while( n-- ) *p++ = 0;
+}
+
+/* blake2.h */
+
+enum blake2s_constant
+{
+	BLAKE2S_BLOCKBYTES = 64,
+	BLAKE2S_OUTBYTES   = 32,
+	BLAKE2S_KEYBYTES   = 32,
+	BLAKE2S_SALTBYTES  = 8,
+	BLAKE2S_PERSONALBYTES = 8
+};
+
+#pragma pack(push, 1)
+typedef struct __blake2s_param
+{
+	uint8_t  digest_length; // 1
+	uint8_t  key_length;    // 2
+	uint8_t  fanout;        // 3
+	uint8_t  depth;         // 4
+	uint32_t leaf_length;   // 8
+	uint8_t  node_offset[6];// 14
+	uint8_t  node_depth;    // 15
+	uint8_t  inner_length;  // 16
+	// uint8_t  reserved[0];
+	uint8_t  salt[BLAKE2S_SALTBYTES]; // 24
+	uint8_t  personal[BLAKE2S_PERSONALBYTES];  // 32
+} blake2s_param;
+
+ALIGN( 64 ) typedef struct __blake2s_state
+{
+	uint32_t h[8];
+	uint32_t t[2];
+	uint32_t f[2];
+	uint8_t  buf[2 * BLAKE2S_BLOCKBYTES];
+	size_t   buflen;
+	uint8_t  last_node;
+} blake2s_state;
+#pragma pack(pop)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+	int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] );
+
+	// Streaming API
+	int blake2s_init( blake2s_state *S, const uint8_t outlen );
+	int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+	int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
+	int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen );
+	int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen );
+
+	// Simple API
+	int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+
+	// Direct Hash Mining Helpers
+	#define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */
+	#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/util.cpp b/util.cpp
index 31988f5..526c9bb 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1915,6 +1915,9 @@ void print_hash_tests(void)
 	blake256hash(&hash[0], &buf[0], 14);
 	printpfx("blake", hash);
 
+	blake2s_hash(&hash[0], &buf[0]);
+	printpfx("blake2s", hash);
+
 	bmw_hash(&hash[0], &buf[0]);
 	printpfx("bmw", hash);