From c17d11e37758c37762a7664a731fda6e9a5454b1 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sun, 31 Aug 2014 08:57:48 +0200
Subject: [PATCH 01/44] add "blake" 256, 14 rounds (for NEOS blake, not
 BlakeCoin)

also remove the "missing" file, it's old and not compatible with ubuntu 14.04
---
 Makefile.am             |   3 +-
 blake32.cu              | 494 ++++++++++++++++++++++++++++++++++++++++
 build.sh                |   2 +-
 ccminer.vcxproj         |   6 +-
 ccminer.vcxproj.filters |   3 +
 configure.sh            |   2 +-
 cpu-miner.c             |  48 +++-
 miner.h                 |   5 +
 missing                 | 367 -----------------------------
 quark/cuda_checkhash.cu |   4 +-
 util.c                  |   4 +
 11 files changed, 554 insertions(+), 384 deletions(-)
 create mode 100644 blake32.cu
 delete mode 100644 missing

diff --git a/Makefile.am b/Makefile.am
index 5e539a9..3935afe 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -32,7 +32,7 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
 			  quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \
 			  quark/cuda_quark_compactionTest.cu \
-			  cuda_nist5.cu \
+			  cuda_nist5.cu blake32.cu \
 			  sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
@@ -43,6 +43,7 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu x15/whirlpool.cu \
 			  x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu
 
+
 ccminer_LDFLAGS		= $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
 ccminer_LDADD		= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
 ccminer_CPPFLAGS	= -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME
diff --git a/blake32.cu b/blake32.cu
new file mode 100644
index 0000000..882594f
--- /dev/null
+++ b/blake32.cu
@@ -0,0 +1,494 @@
+/**
+ * Blake-256 Cuda Kernel (Tested on SM 5.0)
+ *
+ * Tanguy Pruvot - Aug. 2014
+ */
+
+#include "miner.h"
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include <stdint.h>
+#include <memory.h>
+}
+
+/* CPU reference hash: Blake-256 (sph) over the first 80 bytes of input, writes 32 bytes to output */
+extern "C" void blake32hash(void *output, const void *input)
+{
+	unsigned char hash[64]; // scratch digest buffer, only its first 32 bytes are copied out
+	sph_blake256_context ctx;
+	sph_blake256_init(&ctx);
+	sph_blake256(&ctx, input, 80); // fixed input length of 80 bytes
+	sph_blake256_close(&ctx, hash);
+	memcpy(output, hash, 32);
+}
+
+#include "cuda_helper.h"
+
+#if __CUDA_ARCH__ < 350
+	// Kepler (Compute 3.0) + Host
+	#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#else
+	// Kepler (Compute 3.5 / 5.0)
+	#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
+#endif
+
+// in cpu-miner.c
+extern bool opt_benchmark;
+extern bool opt_debug;
+extern int device_map[8];
+
+extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+
+// shared for 8 threads of addresses (cudaMalloc)
+uint32_t* d_hash[8];
+
+__constant__
+static uint32_t pTarget[8];
+
+__constant__
+static uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
+static uint32_t *d_resNounce[8];
+static uint32_t *h_resNounce[8];
+
+__constant__
+static uint8_t c_sigma[16][16];
+const uint8_t host_sigma[16][16] = // host copy of the sigma index table; blake256_cpu_init() uploads it to c_sigma
+{
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+  {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+  {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+  {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+  {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+  { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+  {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
+  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, // rows 10..15 repeat rows 0..5
+  {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+  {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } // the 14-round loop in blake256_compress only reads rows 0..13
+};
+
+__device__ __constant__
+static const uint32_t c_IV256[8] = {
+	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
+	SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
+	SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
+	SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
+};
+
+__device__ __constant__
+
+static const uint32_t c_u256[16] = {
+	SPH_C32(0x243F6A88), SPH_C32(0x85A308D3),
+	SPH_C32(0x13198A2E), SPH_C32(0x03707344),
+	SPH_C32(0xA4093822), SPH_C32(0x299F31D0),
+	SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89),
+	SPH_C32(0x452821E6), SPH_C32(0x38D01377),
+	SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
+	SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
+	SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917)
+};
+
+#if 0
+#define GS(m0, m1, c0, c1, a, b, c, d)   do { \
+		a = SPH_T32(a + b + (m0 ^ c1)); \
+		d = SPH_ROTR32(d ^ a, 16); \
+		c = SPH_T32(c + d); \
+		b = SPH_ROTR32(b ^ c, 12); \
+		a = SPH_T32(a + b + (m1 ^ c0)); \
+		d = SPH_ROTR32(d ^ a, 8); \
+		c = SPH_T32(c + d); \
+		b = SPH_ROTR32(b ^ c, 7); \
+	} while (0)
+
+#define ROUND_S(r)   do { \
+	GS(Mx(r, 0x0), Mx(r, 0x1), CSx(r, 0x0), CSx(r, 0x1), v[0], v[4], v[0x8], v[0xC]); \
+	GS(Mx(r, 0x2), Mx(r, 0x3), CSx(r, 0x2), CSx(r, 0x3), v[1], v[5], v[0x9], v[0xD]); \
+	GS(Mx(r, 0x4), Mx(r, 0x5), CSx(r, 0x4), CSx(r, 0x5), v[2], v[6], v[0xA], v[0xE]); \
+	GS(Mx(r, 0x6), Mx(r, 0x7), CSx(r, 0x6), CSx(r, 0x7), v[3], v[7], v[0xB], v[0xF]); \
+	GS(Mx(r, 0x8), Mx(r, 0x9), CSx(r, 0x8), CSx(r, 0x9), v[0], v[5], v[0xA], v[0xF]); \
+	GS(Mx(r, 0xA), Mx(r, 0xB), CSx(r, 0xA), CSx(r, 0xB), v[1], v[6], v[0xB], v[0xC]); \
+	GS(Mx(r, 0xC), Mx(r, 0xD), CSx(r, 0xC), CSx(r, 0xD), v[2], v[7], v[0x8], v[0xD]); \
+	GS(Mx(r, 0xE), Mx(r, 0xF), CSx(r, 0xE), CSx(r, 0xF), v[3], v[4], v[0x9], v[0xE]); \
+} while (0)
+#endif
+// GS(a,b,c,d,e): one mixing step on v[a], v[b], v[c], v[d] using the index pair sigma[i][e], sigma[i][e+1]; expects v, m, u256, sigma and the round index i in the enclosing scope
+#define GS(a,b,c,d,e) { \
+	v[a] += (m[sigma[i][e]] ^ u256[sigma[i][e+1]]) + v[b]; \
+	v[d] = ROTR32(v[d] ^ v[a], 16); \
+	v[c] += v[d]; \
+	v[b] = ROTR32(v[b] ^ v[c], 12); \
+\
+	v[a] += (m[sigma[i][e+1]] ^ u256[sigma[i][e]]) + v[b]; \
+	v[d] = ROTR32(v[d] ^ v[a], 8); \
+	v[c] += v[d]; \
+	v[b] = ROTR32(v[b] ^ v[c], 7); \
+}
+// Blake-256 compression, 14 rounds: mixes the 16-word block into the 8-word chaining value h, which is updated in place. nullt is not used by the body.
+__device__ static
+void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), const uint32_t *u256, const uint32_t T0, uint8_t nullt = 1)
+{
+	uint32_t /* __align__(8) */ v[16]; // working state
+	uint32_t /* __align__(8) */ m[16]; // byte-swapped copy of the message block
+
+	//#pragma unroll
+	for (int i = 0; i < 16; ++i) {
+		m[i] = cuda_swab32(block[i]);
+		//m[i] = block[i];
+	}
+
+	#pragma unroll
+	for(int i = 0; i < 8; i++)
+		v[i] = h[i];
+
+	v[ 8] = u256[0];
+	v[ 9] = u256[1];
+	v[10] = u256[2];
+	v[11] = u256[3];
+
+	v[12] = u256[4] ^ T0; // T0 is xored into both v[12] and v[13]; callers pass 0x200 and 0x280 (presumably the bit counter - confirm)
+	v[13] = u256[5] ^ T0;
+	v[14] = u256[6]; // v[14] and v[15] get no counter word here
+	v[15] = u256[7];
+
+	// on a 80-bytes null buffer :
+	// first : v = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, ...}
+	// second : v = {0xb5bfb2f9, 0x14cfcc63, 0xb85c549c, 0xc9b4184e, ..., 0x299f3350, 0x082efa98, 0xec4e6c89}
+
+	//#pragma unroll
+	for (int i = 0; i < 14; i++) { // i is the round index read by the GS macro
+		/* column step */
+		GS(0, 4, 0x8, 0xC, 0);
+		GS(1, 5, 0x9, 0xD, 2);
+		GS(2, 6, 0xA, 0xE, 4);
+		GS(3, 7, 0xB, 0xF, 6);
+		/* diagonal step */
+		GS(0, 5, 0xA, 0xF, 0x8);
+		GS(1, 6, 0xB, 0xC, 0xA);
+		GS(2, 7, 0x8, 0xD, 0xC);
+		GS(3, 4, 0x9, 0xE, 0xE);
+	}
+
+	//#pragma unroll 16
+	for(int i = 0; i < 16; i++)
+		h[i % 8] ^= v[i]; // folds v[i] and v[i+8] into h[i]
+
+	//second H0 = 0x0c7b1594 ... H7 = 0x9051b305
+}
+
+#if __CUDA_ARCH__ >= 200
+#if (__NV_POINTER_SIZE == 64)
+# define SZCT uint64_t
+#else
+# define SZCT uint32_t
+#endif
+extern __device__ __device_builtin__ void __nvvm_memset(uint8_t *, unsigned char, SZCT, int);
+#endif
+// One thread per nonce: hashes the 80-byte header held in c_PaddedMessage80 with nonce startNounce + thread and stores 8 digest words at outputHash + 16*thread (uint32_t units).
+__global__
+void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t /* __align__(16) */ h[8];
+		uint32_t /* __align__(16) */ msg[16];
+		const uint32_t nounce = startNounce + thread;
+
+		#pragma unroll
+		for(int i=0; i<8; i++)
+			h[i] = c_IV256[i];
+		// first block: words 0..15 of the header; its inputs are the same for every thread
+		blake256_compress(h, c_PaddedMessage80, c_sigma, c_u256, 0x200); /* 512 = 0x200 */
+
+		// ------ Close: Bytes 64 to 80 ------
+
+#if 0 /* __CUDA_ARCH__ >= 200 */
+		__nvvm_memset((uint8_t*)(&msg[4]), 0, sizeof(msg)-16, 16);
+#else
+		msg[5] = 0;
+		msg[6] = 0;
+		msg[7] = 0;
+		msg[8] = 0;
+		msg[9] = 0;
+		msg[10] = 0;
+		msg[11] = 0;
+		msg[12] = 0;
+		msg[14] = 0;
+#endif
+		msg[0] = c_PaddedMessage80[16]; // header words 16..18 come from constant memory
+		msg[1] = c_PaddedMessage80[17];
+		msg[2] = c_PaddedMessage80[18];
+		msg[3] = cuda_swab32(nounce); // here or at 80 ?
+
+		msg[4] = 0x80; // uchar[16] after buffer
+		msg[13] = 0x01000000; //((uint8_t*)msg)[55] = 1; // uchar[17 to 55]
+		msg[15] = 0x80020000; // 60-63 0x280
+
+		//h => {0xb5bfb2f9, 0x14cfcc63, 0xb85c549c, 0xc9b4184e, 0x67dfc6ce, 0x29e9904b, 0xd59ee74e, 0xfaa9c653}
+		//msg  {0, 0, 0, 0, 0x80, 0...}
+
+		blake256_compress(h, msg, c_sigma, c_u256, 0x280); // or 0x80
+		//h => {0x0c7b1594, 0x52328517, 0x463db487, 0xdf5e39b7, 0x1322afaf, 0x14ed562c, 0xe9d18d7d, 0x9051b305}
+
+		uint32_t *outHash = (uint32_t*) outputHash + 16*thread; // stride of 16 words per thread, only the first 8 are written below
+		//#pragma unroll
+		for (int i=0; i < 8; i++) {
+			outHash[i] = cuda_swab32(h[i]);
+		}
+	}
+}
+// Host wrapper: launches blake256_gpu_hash_80 over `threads` nonces (256 threads per block, no stream argument), then calls MyStreamSynchronize(NULL, order, thr_id).
+__host__
+void blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
+{
+	const int threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock); // ceil(threads / threadsperblock)
+	dim3 block(threadsperblock);
+
+	size_t shared_size = 0; // no dynamic shared memory requested
+
+	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
+
+	MyStreamSynchronize(NULL, order, thr_id);
+}
+// Checks one hash per thread against pTarget; resNounce[0] is lowered to the smallest nonce whose hash passes (the caller presets it to 0xffffffff).
+__global__
+void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+
+		int hashPosition = nounce - startNounce;
+		uint32_t *inpHash = &g_hash[16 * hashPosition]; // hashes are stored 16 words apart, 8 words are read
+		uint32_t hash[8];
+
+		#pragma unroll 8
+		for (int i=0; i < 8; i++)
+			hash[i] = inpHash[i];
+
+		int i, position = -1;
+		bool rc = true; // stays true when hash equals the target
+
+		#pragma unroll 8
+		for (i = 7; i >= 0; i--) { // the highest-index word that differs from the target decides
+			if (hash[i] > pTarget[i] && position < i) {
+				position = i;
+				rc = false;
+			}
+			if (hash[i] < pTarget[i] && position < i) {
+				position = i;
+				rc = true;
+			}
+		}
+
+		if (rc) // atomic update: several threads may pass the target in the same launch
+			atomicMin(&resNounce[0], nounce);
+	}
+}
+// Host wrapper: presets the device result word to 0xffffffff, runs gpu_check_hash_64 and returns the nonce it recorded (0xffffffff when no hash passed).
+__host__
+uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
+{
+	uint32_t result = 0xffffffff;
+	const int threadsperblock = 256;
+
+	CUDA_SAFE_CALL(cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t))); // checked like the other CUDA calls in this file
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	size_t shared_size = 0;
+
+	gpu_check_hash_64 <<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
+
+	MyStreamSynchronize(NULL, order, thr_id);
+
+	CUDA_SAFE_CALL(cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost));
+
+	// extra barrier kept from the original code; cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize()
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+	result = *h_resNounce[thr_id];
+
+	return result;
+}
+// Per-thread setup: uploads host_sigma to c_sigma and allocates the one-word result nonce buffers for thr_id (host side via cudaMallocHost, device side via cudaMalloc).
+__host__
+void blake256_cpu_init(int thr_id)
+{
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
+
+	CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)));
+	CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)));
+}
+// Uploads the work to constant memory: 80 bytes of pdata, zero-padded to 32 words, go to c_PaddedMessage80 and 32 bytes of ptarget go to pTarget.
+__host__
+void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget)
+{
+	uint32_t PaddedMessage[32];
+	memcpy(PaddedMessage, pdata, 80);
+	memset(&PaddedMessage[20], 0, 48); // clears words 20..31 (48 bytes) that follow the 80-byte header
+	//for (int i=0; i<20; i++)
+	//	PaddedMessage[i] = cuda_swab32(pdata[i]);
+
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice));
+}
+
+#define NULLTEST 0
+// GPU nonce scan from pdata[19] in batches of `throughput`: returns 1 with pdata[19] set to a nonce confirmed on the CPU, otherwise 0; *hashes_done is set on both paths.
+extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t endiandata[20];
+	const uint32_t first_nonce = pdata[19];
+	const int throughput = 256*256*2; // nonces hashed per kernel launch
+	static bool init[8] = {0,0,0,0,0,0,0,0};
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x00000f; // benchmark mode overwrites the caller's target word
+
+	uint32_t Htarg = ptarget[7];
+
+	if (!init[thr_id]) { // one-time device selection and buffer allocation per thread
+		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+
+		blake256_cpu_init(thr_id);
+
+		init[thr_id] = true;
+	}
+
+#if NULLTEST
+	// dev test with a null buffer 0x00000...
+	for (int k = 0; k < 20; k++)
+		pdata[k] = 0;
+	uint32_t vhash[8];
+	blake32hash(vhash, pdata);
+#endif
+
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	blake256_cpu_setBlock_80(endiandata, (void*)ptarget);
+
+	do {
+		int order = 0;
+		uint32_t foundNonce;
+
+		// GPU
+		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
+#if NULLTEST
+		uint32_t buf[8]; memset(buf, 0, sizeof buf);
+		CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
+		CUDA_SAFE_CALL(cudaThreadSynchronize());
+		//applog_hash((unsigned char*)buf);
+#endif
+		foundNonce = cpu_check_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		if (foundNonce != 0xffffffff)
+		{
+			uint32_t vhashcpu[8];
+			be32enc(&endiandata[19], foundNonce);
+
+			blake32hash(vhashcpu, endiandata); // re-hash on the CPU before accepting the GPU result
+
+			if (opt_debug)
+				applog(LOG_DEBUG, "foundNonce = %08x",foundNonce);
+
+			if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
+			{
+				pdata[19] = foundNonce;
+				*hashes_done = pdata[19] - first_nonce + 1;
+				return 1;
+			} else {
+				applog(LOG_INFO, "GPU #%d: result for nonce %08x does not validate on CPU!", thr_id, foundNonce);
+			}
+		}
+		// a 32-bit wrap would land below max_nonce and keep the loop scanning from low nonces: clamp and stop
+		if ((uint64_t) pdata[19] + throughput > 0xffffffffULL) { pdata[19] = 0xffffffffU; break; }
+		pdata[19] += throughput;
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce + 1;
+	return 0;
+}
+
+//#define DEBUG_ALGO
+
+__host__ // CPU-only nonce scan with blake32hash(): returns true (1) when a nonce passes fulltest(), otherwise 0
+int scanhash_blake256_cpu(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, uint64_t *hashes_done)
+{
+	uint32_t n = pdata[19] - 1; // pre-decremented because the loop increments before hashing
+	const uint32_t first_nonce = pdata[19];
+	const uint32_t Htarg = ptarget[7];
+
+	uint32_t __align__(32) hash64[8];
+	uint32_t endiandata[32];
+
+	uint64_t htmax[] = { // upper bounds for Htarg; the first entry with Htarg <= htmax[m] selects masks[m]
+		0,
+		0xF,
+		0xFF,
+		0xFFF,
+		0xFFFF,
+		0x10000000
+	};
+	uint32_t masks[] = { // bits of hash64[7] that must be zero before fulltest() is called
+		0xFFFFFFFF,
+		0xFFFFFFF0,
+		0xFFFFFF00,
+		0xFFFFF000,
+		0xFFFF0000,
+		0
+	};
+
+	// we need bigendian data...
+	for (int kk=0; kk < 32; kk++) { // converts 32 words of pdata although blake32hash() only reads the first 80 bytes
+		be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
+	};
+#ifdef DEBUG_ALGO
+	if (Htarg != 0)
+		printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
+	for (int m=0; m < 6; m++) { // when Htarg > 0x10000000 no entry matches and nothing is hashed
+		if (Htarg <= htmax[m]) {
+			uint32_t mask = masks[m];
+			do {
+				pdata[19] = ++n;
+				be32enc(&endiandata[19], n);
+				blake32hash(hash64, endiandata);
+#ifndef DEBUG_ALGO
+				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
+					*hashes_done = n - first_nonce + 1;
+					return true;
+				}
+#else
+				if (!(n % 0x1000) && !thr_id) printf(".");
+				if (!(hash64[7] & mask)) {
+					printf("[%d]",thr_id);
+					if (fulltest(hash64, ptarget)) {
+						*hashes_done = n - first_nonce + 1;
+						return true;
+					}
+				}
+#endif
+			} while (n < max_nonce && !work_restart[thr_id].restart);
+			// see blake.c if else to understand the loop on htmax => mask
+			break; // only the first matching htmax entry is scanned
+		}
+	}
+
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+	return 0;
+}
diff --git a/build.sh b/build.sh
index 17935f3..2905734 100755
--- a/build.sh
+++ b/build.sh
@@ -4,7 +4,7 @@
 
 # export PATH="$PATH:/usr/local/cuda/bin/"
 
-make distclean || echo clean
+#make distclean || echo clean
 
 rm -f Makefile.in
 rm -f config.status
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 8bada54..d3ec423 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -397,6 +397,10 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
       <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
       <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
     </CudaCompile>
+    <CudaCompile Include="blake32.cu">
+      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
     <CudaCompile Include="quark\animecoin.cu">
       <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
@@ -556,4 +560,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
   <ImportGroup Label="ExtensionTargets">
     <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index c972707..55c69aa 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -436,5 +436,8 @@
     <CudaCompile Include="x17\x17.cu">
       <Filter>Source Files\CUDA\x17</Filter>
     </CudaCompile>
+    <CudaCompile Include="blake32.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
   </ItemGroup>
 </Project>
diff --git a/configure.sh b/configure.sh
index 134abd1..c0cdd0d 100755
--- a/configure.sh
+++ b/configure.sh
@@ -1 +1 @@
-./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda
+./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda
diff --git a/cpu-miner.c b/cpu-miner.c
index a55f051..98d7daf 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -135,6 +135,7 @@ typedef enum {
 	ALGO_QUARK,
 	ALGO_ANIME,
 	ALGO_FRESH,
+	ALGO_BLAKE,
 	ALGO_NIST5,
 	ALGO_WHC,
 	ALGO_X11,
@@ -155,6 +156,7 @@ static const char *algo_names[] = {
 	"quark",
 	"anime",
 	"fresh",
+	"blake",
 	"nist5",
 	"whirl",
 	"x11",
@@ -235,6 +237,7 @@ Options:\n\
                         jackpot   Jackpot hash\n\
                         quark     Quark hash\n\
                         anime     Animecoin hash\n\
+                        blake     Blake 256 (like NEOS blake)\n\
                         fresh     Freshcoin hash (shavite 80)\n\
                         nist5     NIST5 (TalkCoin) hash\n\
                         whirl     Whirlcoin (old whirlpool)\n\
@@ -842,18 +845,23 @@ static void *miner_thread(void *userdata)
 		int64_t max64;
 		int rc;
 
+		// &work.data[19]
+		int wcmplen = 76;
+		uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
+
 		if (have_stratum) {
 			while (time(NULL) >= g_work_time + 120)
 				sleep(1);
 			pthread_mutex_lock(&g_work_lock);
-			if (work.data[19] >= end_nonce)
+			if ((*nonceptr) >= end_nonce)
 				stratum_gen_work(&stratum, &g_work);
 		} else {
+			int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime;
 			/* obtain new work from internal workio thread */
 			pthread_mutex_lock(&g_work_lock);
-			if (!have_stratum && (!have_longpoll ||
-					time(NULL) >= g_work_time + LP_SCANTIME*3/4 ||
-					work.data[19] >= end_nonce)) {
+			if (!have_stratum &&
+			    (time(NULL) - g_work_time >= min_scantime ||
+			     (*nonceptr) >= end_nonce)) {
 				if (unlikely(!get_work(mythr, &g_work))) {
 					applog(LOG_ERR, "work retrieval failed, exiting "
 						"mining thread %d", mythr->id);
@@ -867,11 +875,11 @@ static void *miner_thread(void *userdata)
 				continue;
 			}
 		}
-		if (memcmp(work.data, g_work.data, 76)) {
+		if (memcmp(work.data, g_work.data, wcmplen)) {
 			memcpy(&work, &g_work, sizeof(struct work));
-			work.data[19] = 0xffffffffU / opt_n_threads * thr_id;
+			(*nonceptr) = 0xffffffffU / opt_n_threads * thr_id;
 		} else
-			work.data[19]++;
+			(*nonceptr)++;
 		pthread_mutex_unlock(&g_work_lock);
 		work_restart[thr_id].restart = 0;
 
@@ -881,13 +889,26 @@ static void *miner_thread(void *userdata)
 		else
 			max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime)
 			      - time(NULL);
+
 		max64 *= (int64_t)thr_hashrates[thr_id];
-		if (max64 <= 0)
-			max64 = (opt_algo == ALGO_JACKPOT) ? 0x1fffLL : 0xfffffLL;
-		if ((int64_t)work.data[19] + max64 > end_nonce)
+
+		if (max64 <= 0) {
+			switch (opt_algo) {
+			case ALGO_JACKPOT:
+				max64 = 0x1fffLL;
+				break;
+			case ALGO_BLAKE:
+				max64 = 0xffffffLL;
+				break;
+			default:
+				max64 = 0xfffffLL;
+				break;
+			}
+		}
+		if ((int64_t)(*nonceptr) + max64 > end_nonce)
 			max_nonce = end_nonce;
 		else
-			max_nonce = (uint32_t)(work.data[19] + max64);
+			max_nonce = (uint32_t)((*nonceptr) + max64);
 
 		hashes_done = 0;
 		gettimeofday(&tv_start, NULL);
@@ -931,6 +952,11 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 
+		case ALGO_BLAKE:
+			rc = scanhash_blake32(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done);
+			break;
+
 		case ALGO_ANIME:
 			rc = scanhash_anime(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
diff --git a/miner.h b/miner.h
index f3d4299..a23df96 100644
--- a/miner.h
+++ b/miner.h
@@ -241,6 +241,10 @@ extern int scanhash_fresh(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 
+extern int scanhash_blake32(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done);
+
 extern int scanhash_nist5(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@@ -402,6 +406,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
 void fresh_hash(void *state, const void *input);
+void blake32hash(void *output, const void *input);
 void nist5hash(void *state, const void *input);
 void quarkhash(void *state, const void *input);
 void wcoinhash(void *state, const void *input);
diff --git a/missing b/missing
deleted file mode 100644
index 1c8ff70..0000000
--- a/missing
+++ /dev/null
@@ -1,367 +0,0 @@
-#! /bin/sh
-# Common stub for a few missing GNU programs while installing.
-
-scriptversion=2006-05-10.23
-
-# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006
-#   Free Software Foundation, Inc.
-# Originally by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-# 02110-1301, USA.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-if test $# -eq 0; then
-  echo 1>&2 "Try \`$0 --help' for more information"
-  exit 1
-fi
-
-run=:
-sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p'
-sed_minuso='s/.* -o \([^ ]*\).*/\1/p'
-
-# In the cases where this matters, `missing' is being run in the
-# srcdir already.
-if test -f configure.ac; then
-  configure_ac=configure.ac
-else
-  configure_ac=configure.in
-fi
-
-msg="missing on your system"
-
-case $1 in
---run)
-  # Try to run requested program, and just exit if it succeeds.
-  run=
-  shift
-  "$@" && exit 0
-  # Exit code 63 means version mismatch.  This often happens
-  # when the user try to use an ancient version of a tool on
-  # a file that requires a minimum version.  In this case we
-  # we should proceed has if the program had been absent, or
-  # if --run hadn't been passed.
-  if test $? = 63; then
-    run=:
-    msg="probably too old"
-  fi
-  ;;
-
-  -h|--h|--he|--hel|--help)
-    echo "\
-$0 [OPTION]... PROGRAM [ARGUMENT]...
-
-Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an
-error status if there is no known handling for PROGRAM.
-
-Options:
-  -h, --help      display this help and exit
-  -v, --version   output version information and exit
-  --run           try to run the given command, and emulate it if it fails
-
-Supported PROGRAM values:
-  aclocal      touch file \`aclocal.m4'
-  autoconf     touch file \`configure'
-  autoheader   touch file \`config.h.in'
-  autom4te     touch the output file, or create a stub one
-  automake     touch all \`Makefile.in' files
-  bison        create \`y.tab.[ch]', if possible, from existing .[ch]
-  flex         create \`lex.yy.c', if possible, from existing .c
-  help2man     touch the output file
-  lex          create \`lex.yy.c', if possible, from existing .c
-  makeinfo     touch the output file
-  tar          try tar, gnutar, gtar, then tar without non-portable flags
-  yacc         create \`y.tab.[ch]', if possible, from existing .[ch]
-
-Send bug reports to <bug-automake@gnu.org>."
-    exit $?
-    ;;
-
-  -v|--v|--ve|--ver|--vers|--versi|--versio|--version)
-    echo "missing $scriptversion (GNU Automake)"
-    exit $?
-    ;;
-
-  -*)
-    echo 1>&2 "$0: Unknown \`$1' option"
-    echo 1>&2 "Try \`$0 --help' for more information"
-    exit 1
-    ;;
-
-esac
-
-# Now exit if we have it, but it failed.  Also exit now if we
-# don't have it and --version was passed (most likely to detect
-# the program).
-case $1 in
-  lex|yacc)
-    # Not GNU programs, they don't have --version.
-    ;;
-
-  tar)
-    if test -n "$run"; then
-       echo 1>&2 "ERROR: \`tar' requires --run"
-       exit 1
-    elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
-       exit 1
-    fi
-    ;;
-
-  *)
-    if test -z "$run" && ($1 --version) > /dev/null 2>&1; then
-       # We have it, but it failed.
-       exit 1
-    elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
-       # Could not run --version or --help.  This is probably someone
-       # running `$TOOL --version' or `$TOOL --help' to check whether
-       # $TOOL exists and not knowing $TOOL uses missing.
-       exit 1
-    fi
-    ;;
-esac
-
-# If it does not exist, or fails to run (possibly an outdated version),
-# try to emulate it.
-case $1 in
-  aclocal*)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`acinclude.m4' or \`${configure_ac}'.  You might want
-         to install the \`Automake' and \`Perl' packages.  Grab them from
-         any GNU archive site."
-    touch aclocal.m4
-    ;;
-
-  autoconf)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`${configure_ac}'.  You might want to install the
-         \`Autoconf' and \`GNU m4' packages.  Grab them from any GNU
-         archive site."
-    touch configure
-    ;;
-
-  autoheader)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`acconfig.h' or \`${configure_ac}'.  You might want
-         to install the \`Autoconf' and \`GNU m4' packages.  Grab them
-         from any GNU archive site."
-    files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}`
-    test -z "$files" && files="config.h"
-    touch_files=
-    for f in $files; do
-      case $f in
-      *:*) touch_files="$touch_files "`echo "$f" |
-				       sed -e 's/^[^:]*://' -e 's/:.*//'`;;
-      *) touch_files="$touch_files $f.in";;
-      esac
-    done
-    touch $touch_files
-    ;;
-
-  automake*)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'.
-         You might want to install the \`Automake' and \`Perl' packages.
-         Grab them from any GNU archive site."
-    find . -type f -name Makefile.am -print |
-	   sed 's/\.am$/.in/' |
-	   while read f; do touch "$f"; done
-    ;;
-
-  autom4te)
-    echo 1>&2 "\
-WARNING: \`$1' is needed, but is $msg.
-         You might have modified some files without having the
-         proper tools for further handling them.
-         You can get \`$1' as part of \`Autoconf' from any GNU
-         archive site."
-
-    file=`echo "$*" | sed -n "$sed_output"`
-    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
-    if test -f "$file"; then
-	touch $file
-    else
-	test -z "$file" || exec >$file
-	echo "#! /bin/sh"
-	echo "# Created by GNU Automake missing as a replacement of"
-	echo "#  $ $@"
-	echo "exit 0"
-	chmod +x $file
-	exit 1
-    fi
-    ;;
-
-  bison|yacc)
-    echo 1>&2 "\
-WARNING: \`$1' $msg.  You should only need it if
-         you modified a \`.y' file.  You may need the \`Bison' package
-         in order for those modifications to take effect.  You can get
-         \`Bison' from any GNU archive site."
-    rm -f y.tab.c y.tab.h
-    if test $# -ne 1; then
-        eval LASTARG="\${$#}"
-	case $LASTARG in
-	*.y)
-	    SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
-	    if test -f "$SRCFILE"; then
-	         cp "$SRCFILE" y.tab.c
-	    fi
-	    SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'`
-	    if test -f "$SRCFILE"; then
-	         cp "$SRCFILE" y.tab.h
-	    fi
-	  ;;
-	esac
-    fi
-    if test ! -f y.tab.h; then
-	echo >y.tab.h
-    fi
-    if test ! -f y.tab.c; then
-	echo 'main() { return 0; }' >y.tab.c
-    fi
-    ;;
-
-  lex|flex)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified a \`.l' file.  You may need the \`Flex' package
-         in order for those modifications to take effect.  You can get
-         \`Flex' from any GNU archive site."
-    rm -f lex.yy.c
-    if test $# -ne 1; then
-        eval LASTARG="\${$#}"
-	case $LASTARG in
-	*.l)
-	    SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
-	    if test -f "$SRCFILE"; then
-	         cp "$SRCFILE" lex.yy.c
-	    fi
-	  ;;
-	esac
-    fi
-    if test ! -f lex.yy.c; then
-	echo 'main() { return 0; }' >lex.yy.c
-    fi
-    ;;
-
-  help2man)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-	 you modified a dependency of a manual page.  You may need the
-	 \`Help2man' package in order for those modifications to take
-	 effect.  You can get \`Help2man' from any GNU archive site."
-
-    file=`echo "$*" | sed -n "$sed_output"`
-    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
-    if test -f "$file"; then
-	touch $file
-    else
-	test -z "$file" || exec >$file
-	echo ".ab help2man is required to generate this page"
-	exit 1
-    fi
-    ;;
-
-  makeinfo)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified a \`.texi' or \`.texinfo' file, or any other file
-         indirectly affecting the aspect of the manual.  The spurious
-         call might also be the consequence of using a buggy \`make' (AIX,
-         DU, IRIX).  You might want to install the \`Texinfo' package or
-         the \`GNU make' package.  Grab either from any GNU archive site."
-    # The file to touch is that specified with -o ...
-    file=`echo "$*" | sed -n "$sed_output"`
-    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
-    if test -z "$file"; then
-      # ... or it is the one specified with @setfilename ...
-      infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'`
-      file=`sed -n '
-	/^@setfilename/{
-	  s/.* \([^ ]*\) *$/\1/
-	  p
-	  q
-	}' $infile`
-      # ... or it is derived from the source name (dir/f.texi becomes f.info)
-      test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info
-    fi
-    # If the file does not exist, the user really needs makeinfo;
-    # let's fail without touching anything.
-    test -f $file || exit 1
-    touch $file
-    ;;
-
-  tar)
-    shift
-
-    # We have already tried tar in the generic part.
-    # Look for gnutar/gtar before invocation to avoid ugly error
-    # messages.
-    if (gnutar --version > /dev/null 2>&1); then
-       gnutar "$@" && exit 0
-    fi
-    if (gtar --version > /dev/null 2>&1); then
-       gtar "$@" && exit 0
-    fi
-    firstarg="$1"
-    if shift; then
-	case $firstarg in
-	*o*)
-	    firstarg=`echo "$firstarg" | sed s/o//`
-	    tar "$firstarg" "$@" && exit 0
-	    ;;
-	esac
-	case $firstarg in
-	*h*)
-	    firstarg=`echo "$firstarg" | sed s/h//`
-	    tar "$firstarg" "$@" && exit 0
-	    ;;
-	esac
-    fi
-
-    echo 1>&2 "\
-WARNING: I can't seem to be able to run \`tar' with the given arguments.
-         You may want to install GNU tar or Free paxutils, or check the
-         command line arguments."
-    exit 1
-    ;;
-
-  *)
-    echo 1>&2 "\
-WARNING: \`$1' is needed, and is $msg.
-         You might have modified some files without having the
-         proper tools for further handling them.  Check the \`README' file,
-         it often tells you about the needed prerequisites for installing
-         this package.  You may also peek at any GNU archive site, in case
-         some other package would contain this missing \`$1' program."
-    exit 1
-    ;;
-esac
-
-exit 0
-
-# Local variables:
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "scriptversion="
-# time-stamp-format: "%:y-%02m-%02d.%02H"
-# time-stamp-end: "$"
-# End:
diff --git a/quark/cuda_checkhash.cu b/quark/cuda_checkhash.cu
index 3c41a02..1ce25ec 100644
--- a/quark/cuda_checkhash.cu
+++ b/quark/cuda_checkhash.cu
@@ -6,8 +6,8 @@
 // Hash Target gegen das wir testen sollen
 __constant__ uint32_t pTarget[8];
 
-uint32_t *d_resNounce[8];
-uint32_t *h_resNounce[8];
+static uint32_t *d_resNounce[8];
+static uint32_t *h_resNounce[8];
 
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
diff --git a/util.c b/util.c
index 79a7a24..d9363dc 100644
--- a/util.c
+++ b/util.c
@@ -1393,6 +1393,10 @@ void print_hash_tests(void)
 	myriadhash(&hash[0], &buf[0]);
 	printpfx("myriad", hash);
 
+	memset(hash, 0, sizeof hash);
+	blake32hash(&hash[0], &buf[0]);
+	printpfx("blake", hash);
+
 	memset(hash, 0, sizeof hash);
 	nist5hash(&hash[0], &buf[0]);
 	printpfx("nist5", hash);

From bfe96c49b0bf321ed0776cb1cf31c4fe8a0a8b8d Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 25 Aug 2014 11:21:06 +0200
Subject: [PATCH 02/44] release 1.4, update README...

---
 README.md    | 32 ++++++++++++++++++++++++++++++--
 README.txt   | 15 +++++++++++++++
 configure.ac |  2 +-
 cpu-miner.c  |  6 +++---
 4 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index af8fd12..715b387 100644
--- a/README.md
+++ b/README.md
@@ -3,5 +3,33 @@ ccminer
 
 Christian Buchner's &amp; Christian H.'s CUDA miner project
 
-Fork by tpruvot@github with X14 support
-   BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
\ No newline at end of file
+Fork by tpruvot@github with X14,X15,X17,WHIRL and M7 support
+   BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
+
+A big part of my recent additions was written by [djm34](https://github.com/djm34).
+You can also donate some beers (or redbulls) with these addresses :
+   XjPqpkCPoYJJYdQRrVByU7ySpVyeqJmSGU
+
+This variant was tested and built on Linux (ubuntu server 14.04)
+and VStudio 2013.
+
+Note that the x86 releases are faster than x64 ones on Windows.
+
+About source code dependencies
+------------------------------
+
+This project requires some libraries to be built :
+
+- OpenSSL
+
+- Curl
+
+- pthreads
+
+- [mpir math library](http://www.mpir.org)
+
+You can download prebuilt .lib and dll on the [bitcointalk forum thread](https://bitcointalk.org/?topic=167229.0)
+
+
+There is also a [Tutorial for windows](http://cudamining.co.uk/url/tutorials/id/3) on [CudaMining](http://cudamining.co.uk) website.
+
diff --git a/README.txt b/README.txt
index 32b2599..e6fe248 100644
--- a/README.txt
+++ b/README.txt
@@ -63,11 +63,14 @@ its command line interface and options.
                           jackpot     use to mine Jackpotcoin
                           quark       use to mine Quarkcoin
                           anime       use to mine Animecoin
+                          blake       use to mine NEOS (Blake 256)
                           nist5       use to mine TalkCoin
                           fresh       use to mine Freshcoin
+                          whirl       use to mine Whirlcoin
                           x11         use to mine DarkCoin
                           x14         use to mine X14Coin
                           x15         use to mine Halcyon
+                          x17         use to mine X17
 
   -d, --devices         gives a comma separated list of CUDA device IDs
                         to operate on. Device IDs start counting from 0!
@@ -98,6 +101,7 @@ its command line interface and options.
       --benchmark       run in offline benchmark mode
       --cputest         debug hashes from cpu algorithms
   -c, --config=FILE     load a JSON-format configuration file
+  -C, --color           display colored output in a linux Terminal
   -V, --version         display version information and exit
   -h, --help            display this help text and exit
 
@@ -148,6 +152,14 @@ features.
 
 >>> RELEASE HISTORY <<<
 
+  Sep.  1st 2014  add X17, optimized x15 and whirl
+                  add blake (256 variant)
+                  color support on Windows,
+                  remove some dll dependencies (pthreads, msvcp)
+
+  Aug. 18th 2014  add X14, X15, Whirl, and Fresh algos,
+                  also add colors and nvprof cmd line support
+
   June 15th 2014  add X13 and Diamond Groestl support.
                   Thanks to tsiv and to Bombadil for the contributions!
 
@@ -214,6 +226,9 @@ Notable contributors to this application are:
 
 Christian Buchner, Christian H. (Germany): CUDA implementation 
 
+Tanguy Pruvot : CUDA, blake, general code cleanup, tuneup for linux (Makefiles)
+                and some vstudio 2013 stuff...
+
 and also many thanks to anyone else who contributed to the original
 cpuminer application (Jeff Garzik, pooler), it's original HVC-fork
 and the HVC-fork available at hvc.1gh.com
diff --git a/configure.ac b/configure.ac
index 14e9468..2f52cdf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.08.12])
+AC_INIT([ccminer], [2014.09.01])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index 98d7daf..69bafe9 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -47,7 +47,7 @@ BOOL WINAPI ConsoleHandler(DWORD);
 #pragma comment(lib, "winmm.lib")
 #endif
 
-#define PROGRAM_NAME		"minerd"
+#define PROGRAM_NAME		"ccminer"
 #define LP_SCANTIME		60
 #define HEAVYCOIN_BLKHDR_SZ		84
 #define MNR_BLKHDR_SZ 80
@@ -1238,7 +1238,7 @@ out:
 	return NULL;
 }
 
-#define PROGRAM_VERSION "1.3"
+#define PROGRAM_VERSION "1.4"
 static void show_version_and_exit(void)
 {
 	printf("%s v%s\n"
@@ -1618,7 +1618,7 @@ int main(int argc, char *argv[])
 	printf("\t    and HVC extension from http://hvc.1gh.com/" "\n\n");
 	printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n");
 	printf("\t  BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n");
-	printf("\tCleaned and optimized by Tanguy Pruvot\n");
+	printf("\tInclude some of djm34 additions, cleaned by Tanguy Pruvot\n");
 	printf("\t  BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n");
 
 	rpc_user = strdup("");

From 1fb9becc1f2b6a15d8ccea4d8314df9ddf0af4ed Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 1 Sep 2014 08:44:19 +0200
Subject: [PATCH 03/44] cpu-miner: sort algos by name, show reject reason

---
 cpu-miner.c        | 54 +++++++++++++++++++++++-----------------------
 miner.h            |  7 +++---
 quark/animecoin.cu |  2 +-
 util.c             | 20 ++++++++++-------
 4 files changed, 44 insertions(+), 39 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index 69bafe9..ef981d1 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -126,16 +126,16 @@ struct workio_cmd {
 };
 
 typedef enum {
-	ALGO_HEAVY,		/* Heavycoin hash */
-	ALGO_MJOLLNIR,		/* Mjollnir hash */
+	ALGO_ANIME,
+	ALGO_BLAKE,
+	ALGO_FRESH,
 	ALGO_FUGUE256,		/* Fugue256 */
 	ALGO_GROESTL,
-	ALGO_MYR_GR,
+	ALGO_HEAVY,		/* Heavycoin hash */
 	ALGO_JACKPOT,
+	ALGO_MJOLLNIR,		/* Mjollnir hash */
+	ALGO_MYR_GR,
 	ALGO_QUARK,
-	ALGO_ANIME,
-	ALGO_FRESH,
-	ALGO_BLAKE,
 	ALGO_NIST5,
 	ALGO_WHC,
 	ALGO_X11,
@@ -147,17 +147,17 @@ typedef enum {
 } sha256_algos;
 
 static const char *algo_names[] = {
-	"heavy",
-	"mjollnir",
+	"anime",
+	"blake",
+	"fresh",
 	"fugue256",
 	"groestl",
-	"myr-gr",
+	"heavy",
 	"jackpot",
-	"quark",
-	"anime",
-	"fresh",
-	"blake",
+	"mjollnir",
+	"myr-gr",
 	"nist5",
+	"quark",
 	"whirl",
 	"x11",
 	"x13",
@@ -229,17 +229,17 @@ static char const usage[] = "\
 Usage: " PROGRAM_NAME " [OPTIONS]\n\
 Options:\n\
   -a, --algo=ALGO       specify the algorithm to use\n\
+                        anime     Animecoin hash\n\
+                        blake     Blake 256 (like NEOS blake)\n\
+                        fresh     Freshcoin hash (shavite 80)\n\
                         fugue256  Fuguecoin hash\n\
+                        groestl   Groestlcoin hash\n\
                         heavy     Heavycoin hash\n\
+                        jackpot   Jackpot hash\n\
                         mjollnir  Mjollnircoin hash\n\
-                        groestl   Groestlcoin hash\n\
                         myr-gr    Myriad-Groestl hash\n\
-                        jackpot   Jackpot hash\n\
-                        quark     Quark hash\n\
-                        anime     Animecoin hash\n\
-                        blake     Blake 256 (like NEOS blake)\n\
-                        fresh     Freshcoin hash (shavite 80)\n\
                         nist5     NIST5 (TalkCoin) hash\n\
+                        quark     Quark hash\n\
                         whirl     Whirlcoin (old whirlpool)\n\
                         x11       X11 (DarkCoin) hash\n\
                         x13       X13 (MaruCoin) hash\n\
@@ -420,11 +420,11 @@ static void share_result(int result, const char *reason)
 			100. * accepted_count / (accepted_count + rejected_count),
 			s,
 			use_colors ?
-				(result ? CL_GRN "(yay!!!)" : CL_RED "(booooo)")
+				(result ? CL_GRN "yay!!!" : CL_RED "booooo")
 			:	(result ? "(yay!!!)" : "(booooo)"));
 
-	if (opt_debug && reason)
-		applog(LOG_DEBUG, "DEBUG: reject reason: %s", reason);
+	if (reason)
+		applog(LOG_WARNING, "reject reason: %s", reason);
 }
 
 static bool submit_upstream_work(CURL *curl, struct work *work)
@@ -856,7 +856,7 @@ static void *miner_thread(void *userdata)
 			if ((*nonceptr) >= end_nonce)
 				stratum_gen_work(&stratum, &g_work);
 		} else {
-			int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime;
+			int min_scantime = have_longpoll ? (LP_SCANTIME*3)/4 : opt_scantime;
 			/* obtain new work from internal workio thread */
 			pthread_mutex_lock(&g_work_lock);
 			if (!have_stratum &&
@@ -952,13 +952,13 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 
-		case ALGO_BLAKE:
-			rc = scanhash_blake32(thr_id, work.data, work.target,
+		case ALGO_ANIME:
+			rc = scanhash_anime(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
 			break;
 
-		case ALGO_ANIME:
-			rc = scanhash_anime(thr_id, work.data, work.target,
+		case ALGO_BLAKE:
+			rc = scanhash_blake32(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
 			break;
 
diff --git a/miner.h b/miner.h
index a23df96..3ce6571 100644
--- a/miner.h
+++ b/miner.h
@@ -400,13 +400,14 @@ extern void tq_thaw(struct thread_q *tq);
 void applog_hash(unsigned char *hash);
 
 void print_hash_tests(void);
-unsigned int jackpothash(void *state, const void *input);
+void animehash(void *state, const void *input);
+void blake32hash(void *output, const void *input);
+void fresh_hash(void *state, const void *input);
 void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
 void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
+unsigned int jackpothash(void *state, const void *input);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
-void fresh_hash(void *state, const void *input);
-void blake32hash(void *output, const void *input);
 void nist5hash(void *state, const void *input);
 void quarkhash(void *state, const void *input);
 void wcoinhash(void *state, const void *input);
diff --git a/quark/animecoin.cu b/quark/animecoin.cu
index c19275d..4b2d097 100644
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@@ -57,7 +57,7 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads,
 											int order);
 
 // Original Quarkhash Funktion aus einem miner Quelltext
-inline void animehash(void *state, const void *input)
+extern "C" void animehash(void *state, const void *input)
 {
     sph_blake512_context ctx_blake;
     sph_bmw512_context ctx_bmw;
diff --git a/util.c b/util.c
index d9363dc..f947d37 100644
--- a/util.c
+++ b/util.c
@@ -1373,6 +1373,18 @@ void print_hash_tests(void)
 
 	printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n");
 
+	memset(hash, 0, sizeof hash);
+	animehash(&hash[0], &buf[0]);
+	printpfx("anime", hash);
+
+	memset(hash, 0, sizeof hash);
+	blake32hash(&hash[0], &buf[0]);
+	printpfx("blake", hash);
+
+	memset(hash, 0, sizeof hash);
+	fresh_hash(&hash[0], &buf[0]);
+	printpfx("fresh", hash);
+
 	memset(hash, 0, sizeof hash);
 	fugue256_hash(&hash[0], &buf[0], 32);
 	printpfx("fugue256", hash);
@@ -1393,10 +1405,6 @@ void print_hash_tests(void)
 	myriadhash(&hash[0], &buf[0]);
 	printpfx("myriad", hash);
 
-	memset(hash, 0, sizeof hash);
-	blake32hash(&hash[0], &buf[0]);
-	printpfx("blake", hash);
-
 	memset(hash, 0, sizeof hash);
 	nist5hash(&hash[0], &buf[0]);
 	printpfx("nist5", hash);
@@ -1405,10 +1413,6 @@ void print_hash_tests(void)
 	quarkhash(&hash[0], &buf[0]);
 	printpfx("quark", hash);
 
-	memset(hash, 0, sizeof hash);
-	fresh_hash(&hash[0], &buf[0]);
-	printpfx("fresh", hash);
-
 	memset(hash, 0, sizeof hash);
 	wcoinhash(&hash[0], &buf[0]);
 	printpfx("whirl", hash);

From 4a52d0553b0076b984be480725fa67689c544647 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 1 Sep 2014 10:22:32 +0200
Subject: [PATCH 04/44] debug: show json methods, hide hash/target if ok

---
 util.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/util.c b/util.c
index f947d37..6fd2b71 100644
--- a/util.c
+++ b/util.c
@@ -559,7 +559,7 @@ bool fulltest(const uint32_t *hash, const uint32_t *target)
 		}
 	}
 
-	if (opt_debug) {
+	if (!rc || opt_debug) {
 		uint32_t hash_be[8], target_be[8];
 		char *hash_str, *target_str;
 		
@@ -572,7 +572,7 @@ bool fulltest(const uint32_t *hash, const uint32_t *target)
 
 		applog(LOG_DEBUG, "DEBUG: %s\nHash:   %s\nTarget: %s",
 			rc ? "hash <= target"
-			   : "hash > target (false positive)",
+			   : CL_YLW "hash > target (false positive)" CL_N,
 			hash_str,
 			target_str);
 
@@ -1205,6 +1205,10 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s)
 	id = json_object_get(val, "id");
 	params = json_object_get(val, "params");
 
+	if (opt_debug) {
+		applog(LOG_DEBUG, "method: %s", s);
+	}
+
 	if (!strcasecmp(method, "mining.notify")) {
 		ret = stratum_notify(sctx, params);
 		goto out;
@@ -1368,7 +1372,8 @@ extern void applog_hash(unsigned char *hash)
 
 void print_hash_tests(void)
 {
-	unsigned char buf[128], hash[128], s[128];
+	char s[128] = {'\0'};
+	unsigned char buf[128], hash[128];
 	memset(buf, 0, sizeof buf);
 
 	printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n");

From 0aeac878ef60840f3123354037cd56a89d2e94e6 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 1 Sep 2014 06:12:55 +0200
Subject: [PATCH 05/44] blake: tune up and cleanup, ~100 MH/s on a normal 750Ti

tested on linux and windows (x86 binary)...

but there is a high number of duplicated shares... weird
---
 blake32.cu  | 197 +++++++++++++++-------------------------------------
 cpu-miner.c |   6 +-
 2 files changed, 59 insertions(+), 144 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index 882594f..4468368 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -25,14 +25,6 @@ extern "C" void blake32hash(void *output, const void *input)
 
 #include "cuda_helper.h"
 
-#if __CUDA_ARCH__ < 350
-	// Kepler (Compute 3.0) + Host
-	#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
-#else
-	// Kepler (Compute 3.5 / 5.0)
-	#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
-#endif
-
 // in cpu-miner.c
 extern bool opt_benchmark;
 extern bool opt_debug;
@@ -47,9 +39,11 @@ __constant__
 static uint32_t pTarget[8];
 
 __constant__
-static uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
+static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes + padding)
+
 static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];
+static bool init_made = false;
 
 __constant__
 static uint8_t c_sigma[16][16];
@@ -120,14 +114,14 @@ static const uint32_t c_u256[16] = {
 
 #define GS(a,b,c,d,e) { \
 	v[a] += (m[sigma[i][e]] ^ u256[sigma[i][e+1]]) + v[b]; \
-	v[d] = ROTR32(v[d] ^ v[a], 16); \
+	v[d] = SPH_ROTR32(v[d] ^ v[a], 16); \
 	v[c] += v[d]; \
-	v[b] = ROTR32(v[b] ^ v[c], 12); \
+	v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
 \
 	v[a] += (m[sigma[i][e+1]] ^ u256[sigma[i][e]]) + v[b]; \
-	v[d] = ROTR32(v[d] ^ v[a], 8); \
+	v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \
 	v[c] += v[d]; \
-	v[b] = ROTR32(v[b] ^ v[c], 7); \
+	v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
 }
 
 __device__ static
@@ -138,11 +132,10 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con
 
 	//#pragma unroll
 	for (int i = 0; i < 16; ++i) {
-		m[i] = cuda_swab32(block[i]);
-		//m[i] = block[i];
+		m[i] = block[i];
 	}
 
-	#pragma unroll
+	#pragma unroll 8
 	for(int i = 0; i < 8; i++)
 		v[i] = h[i];
 
@@ -156,10 +149,6 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con
 	v[14] = u256[6];
 	v[15] = u256[7];
 
-	// on a 80-bytes null buffer :
-	// first : v = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, ...}
-	// second : v = {0xb5bfb2f9, 0x14cfcc63, 0xb85c549c, 0xc9b4184e, ..., 0x299f3350, 0x082efa98, 0xec4e6c89}
-
 	//#pragma unroll
 	for (int i = 0; i < 14; i++) {
 		/* column step */
@@ -177,11 +166,10 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con
 	//#pragma unroll 16
 	for(int i = 0; i < 16; i++)
 		h[i % 8] ^= v[i];
-
-	//second H0 = 0x0c7b1594 ... H7 = 0x9051b305
 }
 
 #if __CUDA_ARCH__ >= 200
+/* memory should be aligned to use __nvvm_memset */
 #if (__NV_POINTER_SIZE == 64)
 # define SZCT uint64_t
 #else
@@ -196,9 +184,9 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		uint32_t /* __align__(16) */ h[8];
-		uint32_t /* __align__(16) */ msg[16];
 		const uint32_t nounce = startNounce + thread;
+		uint32_t /* __align__(8) */ msg[16];
+		uint32_t h[8];
 
 		#pragma unroll
 		for(int i=0; i<8; i++)
@@ -209,9 +197,9 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 		// ------ Close: Bytes 64 to 80 ------ 
 
 #if 0 /* __CUDA_ARCH__ >= 200 */
-		__nvvm_memset((uint8_t*)(&msg[4]), 0, sizeof(msg)-16, 16);
+		__nvvm_memset((uint8_t*)(&msg[4]), 0, sizeof(msg)-16, 8);
 #else
-		msg[5] = 0;
+		msg[5] = 0;  // uchar[17 to 55]
 		msg[6] = 0;
 		msg[7] = 0;
 		msg[8] = 0;
@@ -219,25 +207,22 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 		msg[10] = 0;
 		msg[11] = 0;
 		msg[12] = 0;
+
 		msg[14] = 0;
 #endif
 		msg[0] = c_PaddedMessage80[16];
 		msg[1] = c_PaddedMessage80[17];
 		msg[2] = c_PaddedMessage80[18];
-		msg[3] = cuda_swab32(nounce); // here or at 80 ?
-
-		msg[4] = 0x80; // uchar[16] after buffer
-		msg[13] = 0x01000000; //((uint8_t*)msg)[55] = 1; // uchar[17 to 55]
-		msg[15] = 0x80020000; // 60-63 0x280
+		msg[3] = nounce; /* our tested value */
+		msg[4] = 0x80000000; //cuda_swab32(0x80U);
 
-		//h => {0xb5bfb2f9, 0x14cfcc63, 0xb85c549c, 0xc9b4184e, 0x67dfc6ce, 0x29e9904b, 0xd59ee74e, 0xfaa9c653}
-		//msg  {0, 0, 0, 0, 0x80, 0...}
+		msg[13] = 1;
+		msg[15] = 0x280; // 60-63
 
 		blake256_compress(h, msg, c_sigma, c_u256, 0x280); // or 0x80
-		//h => {0x0c7b1594, 0x52328517, 0x463db487, 0xdf5e39b7, 0x1322afaf, 0x14ed562c, 0xe9d18d7d, 0x9051b305}
 
-		uint32_t *outHash = (uint32_t*) outputHash + 16*thread; // 16 = 4 x sizeof(uint32)
-		//#pragma unroll
+		uint32_t *outHash = (uint32_t*) outputHash + thread;
+		//#pragma unroll 8
 		for (int i=0; i < 8; i++) {
 			outHash[i] = cuda_swab32(h[i]);
 		}
@@ -247,7 +232,7 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 __host__
 void blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
 {
-	const int threadsperblock = 256;
+	const int threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
@@ -265,28 +250,25 @@ void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVecto
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+		const uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
 
 		int hashPosition = nounce - startNounce;
-		uint32_t *inpHash = &g_hash[16 * hashPosition];
+		uint32_t *inpHash = &g_hash[hashPosition];
 		uint32_t hash[8];
 
 		#pragma unroll 8
 		for (int i=0; i < 8; i++)
 			hash[i] = inpHash[i];
 
-		int i, position = -1;
-		bool rc = true;
-
-		#pragma unroll 8
+		/* to enhance ? */
+		int i, rc = 1, position = -1;
 		for (i = 7; i >= 0; i--) {
+			// rc &= (hash[i] <= pTarget[i]);
 			if (hash[i] > pTarget[i] && position < i) {
-				position = i;
-				rc = false;
+				rc = false; position = i;
 			}
 			if (hash[i] < pTarget[i] && position < i) {
-				position = i;
-				rc = true;
+				rc = true; position = i;
 			}
 		}
 
@@ -298,8 +280,8 @@ void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVecto
 __host__
 uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
 {
+	const int threadsperblock = 128;
 	uint32_t result = 0xffffffff;
-	const int threadsperblock = 256;
 
 	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
 
@@ -309,14 +291,12 @@ uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32
 	size_t shared_size = 0;
 
 	gpu_check_hash_64 <<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
-
 	MyStreamSynchronize(NULL, order, thr_id);
 
-	CUDA_SAFE_CALL(cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost));
-
-	// cudaMemcpy() is asynch!
-	cudaThreadSynchronize();
-	result = *h_resNounce[thr_id];
+	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
+		cudaThreadSynchronize();
+		result = *h_resNounce[thr_id];
+	}
 
 	return result;
 }
@@ -325,9 +305,9 @@ __host__
 void blake256_cpu_init(int thr_id)
 {
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
-
-	CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)));
-	CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)));
+	CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t)));
+	CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t)));
+	init_made = true;
 }
 
 __host__
@@ -336,8 +316,6 @@ void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget)
 	uint32_t PaddedMessage[32];
 	memcpy(PaddedMessage, pdata, 80);
 	memset(&PaddedMessage[20], 0, 48);
-	//for (int i=0; i<20; i++)
-	//	PaddedMessage[i] = cuda_swab32(pdata[i]);
 
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice));
@@ -348,19 +326,19 @@ void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget)
 extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
-	uint32_t endiandata[20];
 	const uint32_t first_nonce = pdata[19];
-	const int throughput = 256*256*2;
+	const int throughput = 128 * 2048;
 	static bool init[8] = {0,0,0,0,0,0,0,0};
+	uint32_t endiandata[20];
+	uint32_t Htarg = ptarget[7];
+	int rc = 0;
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x00000f;
-
-	uint32_t Htarg = ptarget[7];
+		((uint32_t*)ptarget)[7] = Htarg = 0x00000f;
 
 	if (!init[thr_id]) {
 		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * throughput));
 
 		blake256_cpu_init(thr_id);
 
@@ -375,11 +353,11 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 	blake32hash(vhash, pdata);
 #endif
 
+	blake256_cpu_setBlock_80(pdata, (void*)ptarget);
+
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
-	blake256_cpu_setBlock_80(endiandata, (void*)ptarget);
-
 	do {
 		int order = 0;
 		uint32_t foundNonce;
@@ -401,14 +379,14 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 
 			blake32hash(vhashcpu, endiandata);
 
-			if (opt_debug)
-				applog(LOG_DEBUG, "foundNonce = %08x",foundNonce);
+			//if (opt_debug)
+			//	applog(LOG_DEBUG, "foundNonce = %08x",foundNonce);
 
 			if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
 			{
 				pdata[19] = foundNonce;
-				*hashes_done = pdata[19] - first_nonce + 1;
-				return 1;
+				rc = 1;
+				goto exit_scan;
 			} else {
 				applog(LOG_INFO, "GPU #%d: result for nonce %08x does not validate on CPU!", thr_id, foundNonce);
 			}
@@ -418,77 +396,12 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 
+exit_scan:
 	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
-}
-
-//#define DEBUG_ALGO
-
-__host__
-int scanhash_blake256_cpu(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
-	uint32_t max_nonce, uint64_t *hashes_done)
-{
-	uint32_t n = pdata[19] - 1;
-	const uint32_t first_nonce = pdata[19];
-	const uint32_t Htarg = ptarget[7];
-
-	uint32_t __align__(32) hash64[8];
-	uint32_t endiandata[32];
-
-	uint64_t htmax[] = {
-		0,
-		0xF,
-		0xFF,
-		0xFFF,
-		0xFFFF,
-		0x10000000
-	};
-	uint32_t masks[] = {
-		0xFFFFFFFF,
-		0xFFFFFFF0,
-		0xFFFFFF00,
-		0xFFFFF000,
-		0xFFFF0000,
-		0
-	};
-
-	// we need bigendian data...
-	for (int kk=0; kk < 32; kk++) {
-		be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
-	};
-#ifdef DEBUG_ALGO
-	if (Htarg != 0)
-		printf("[%d] Htarg=%X\n", thr_id, Htarg);
-#endif
-	for (int m=0; m < 6; m++) {
-		if (Htarg <= htmax[m]) {
-			uint32_t mask = masks[m];
-			do {
-				pdata[19] = ++n;
-				be32enc(&endiandata[19], n);
-				blake32hash(hash64, endiandata);
-#ifndef DEBUG_ALGO
-				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
-					*hashes_done = n - first_nonce + 1;
-					return true;
-				}
-#else
-				if (!(n % 0x1000) && !thr_id) printf(".");
-				if (!(hash64[7] & mask)) {
-					printf("[%d]",thr_id);
-					if (fulltest(hash64, ptarget)) {
-						*hashes_done = n - first_nonce + 1;
-						return true;
-					}
-				}
-#endif
-			} while (n < max_nonce && !work_restart[thr_id].restart);
-			// see blake.c if else to understand the loop on htmax => mask
-			break;
-		}
+	if (init_made && opt_debug && h_resNounce[thr_id]) {
+		// made auto ???
+		//applog(LOG_DEBUG, "%08x", h_resNounce[thr_id]);
+		//cudaFreeHost(h_resNounce[thr_id]);
 	}
-
-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-	return 0;
+	return rc;
 }
diff --git a/cpu-miner.c b/cpu-miner.c
index ef981d1..2b409d5 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -805,6 +805,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
 	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH)
 		diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
+	else if (opt_algo == ALGO_BLAKE)
+		diff_to_target(work->target, sctx->job.diff / (4.0 * opt_difficulty));
 	else
 		diff_to_target(work->target, sctx->job.diff / opt_difficulty);
 }
@@ -898,8 +900,8 @@ static void *miner_thread(void *userdata)
 				max64 = 0x1fffLL;
 				break;
 			case ALGO_BLAKE:
-				max64 = 0xffffffLL;
-				break;
+				//max64 = 0x1000000LL;
+				//break;
 			default:
 				max64 = 0xfffffLL;
 				break;

From 530732458add6c4c3836606d028930f3581c0a5f Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 1 Sep 2014 12:22:51 +0200
Subject: [PATCH 06/44] blake: use a constant for threads, reduce allocated
 d_hash size

and clean a bit more...
---
 blake32.cu        | 37 +++++++++++++++----------------------
 cpuminer-config.h |  6 +++---
 2 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index 4468368..e3d0bf8 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -12,6 +12,9 @@ extern "C" {
 #include <memory.h>
 }
 
+/* threads per block */
+#define TPB 128
+
 /* hash by cpu with blake 256 */
 extern "C" void blake32hash(void *output, const void *input)
 {
@@ -43,7 +46,6 @@ static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes
 
 static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];
-static bool init_made = false;
 
 __constant__
 static uint8_t c_sigma[16][16];
@@ -214,7 +216,7 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 		msg[1] = c_PaddedMessage80[17];
 		msg[2] = c_PaddedMessage80[18];
 		msg[3] = nounce; /* our tested value */
-		msg[4] = 0x80000000; //cuda_swab32(0x80U);
+		msg[4] = 0x80000000UL; //cuda_swab32(0x80U);
 
 		msg[13] = 1;
 		msg[15] = 0x280; // 60-63
@@ -232,7 +234,7 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 __host__
 void blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
 {
-	const int threadsperblock = 128;
+	const int threadsperblock = TPB;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
@@ -280,7 +282,7 @@ void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVecto
 __host__
 uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
 {
-	const int threadsperblock = 128;
+	const int threadsperblock = TPB;
 	uint32_t result = 0xffffffff;
 
 	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
@@ -307,7 +309,6 @@ void blake256_cpu_init(int thr_id)
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t)));
 	CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t)));
-	init_made = true;
 }
 
 __host__
@@ -327,7 +328,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
-	const int throughput = 128 * 2048;
+	const int throughput = TPB * 2048;
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	uint32_t endiandata[20];
 	uint32_t Htarg = ptarget[7];
@@ -338,10 +339,8 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 
 	if (!init[thr_id]) {
 		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * throughput));
-
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 48 * throughput)); // not sure for this size...
 		blake256_cpu_init(thr_id);
-
 		init[thr_id] = true;
 	}
 
@@ -349,8 +348,6 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 	// dev test with a null buffer 0x00000...
 	for (int k = 0; k < 20; k++)
 		pdata[k] = 0;
-	uint32_t vhash[8];
-	blake32hash(vhash, pdata);
 #endif
 
 	blake256_cpu_setBlock_80(pdata, (void*)ptarget);
@@ -362,7 +359,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 		int order = 0;
 		uint32_t foundNonce;
 
-		// GPU
+		// GPU HASH
 		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 
 #if NULLTEST
@@ -379,16 +376,17 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 
 			blake32hash(vhashcpu, endiandata);
 
-			//if (opt_debug)
-			//	applog(LOG_DEBUG, "foundNonce = %08x",foundNonce);
-
 			if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
 			{
 				pdata[19] = foundNonce;
 				rc = 1;
 				goto exit_scan;
-			} else {
-				applog(LOG_INFO, "GPU #%d: result for nonce %08x does not validate on CPU!", thr_id, foundNonce);
+			}
+			else if (vhashcpu[7] > Htarg) {
+				applog(LOG_WARNING, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg);
+			}
+			else {
+				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
 
@@ -398,10 +396,5 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 
 exit_scan:
 	*hashes_done = pdata[19] - first_nonce + 1;
-	if (init_made && opt_debug && h_resNounce[thr_id]) {
-		// made auto ???
-		//applog(LOG_DEBUG, "%08x", h_resNounce[thr_id]);
-		//cudaFreeHost(h_resNounce[thr_id]);
-	}
 	return rc;
 }
diff --git a/cpuminer-config.h b/cpuminer-config.h
index 0d0f042..0fafa85 100644
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@@ -156,7 +156,7 @@
 #define PACKAGE_NAME "ccminer"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.08.12"
+#define PACKAGE_STRING "ccminer 2014.09.01"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "ccminer"
@@ -165,7 +165,7 @@
 #define PACKAGE_URL ""
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.08.12"
+#define PACKAGE_VERSION "2014.09.01"
 
 /* If using the C implementation of alloca, define if you know the
    direction of stack growth for your system; otherwise it will be
@@ -188,7 +188,7 @@
 #define USE_XOP 1
 
 /* Version number of package */
-#define VERSION "2014.08.12"
+#define VERSION "2014.09.01"
 
 /* Define curl_free() as free() if our version of curl lacks curl_free. */
 /* #undef curl_free */

From 1f99aae0ff621f4f85f119d811a3f1a8d2204f60 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 1 Sep 2014 18:49:23 +0200
Subject: [PATCH 07/44] exit on repeated duplicate shares (to enhance)

create a new function proper_exit() to do common stuff on exit...
---
 cpu-miner.c | 47 ++++++++++++++++++++++++++++-------------------
 miner.h     |  1 +
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index 2b409d5..d92c7e0 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -209,7 +209,7 @@ static struct stratum_ctx stratum;
 
 pthread_mutex_t applog_lock;
 static pthread_mutex_t stats_lock;
-
+static uint8_t duplicate_shares = 0;
 static unsigned long accepted_count = 0L;
 static unsigned long rejected_count = 0L;
 static double *thr_hashrates;
@@ -349,6 +349,13 @@ static struct work g_work;
 static time_t g_work_time;
 static pthread_mutex_t g_work_lock;
 
+
+void proper_exit(int reason)
+{
+	cuda_devicereset();
+	exit(reason);
+}
+
 static bool jobj_binary(const json_t *obj, const char *key,
 			void *buf, size_t buflen)
 {
@@ -423,8 +430,17 @@ static void share_result(int result, const char *reason)
 				(result ? CL_GRN "yay!!!" : CL_RED "booooo")
 			:	(result ? "(yay!!!)" : "(booooo)"));
 
-	if (reason)
+	if (reason) {
+		if (!strcmp(reason, "Duplicate share")) {
+			duplicate_shares++;
+			if (duplicate_shares > 3) {
+				// exit from app (until auto restart)
+				applog(LOG_WARNING, "Auto exit to prevent stratum bans: %s", reason);
+				proper_exit(1);
+			}
+		}
 		applog(LOG_WARNING, "reject reason: %s", reason);
+	}
 }
 
 static bool submit_upstream_work(CURL *curl, struct work *work)
@@ -1253,7 +1269,7 @@ static void show_version_and_exit(void)
 		PTW32_VERSION_STRING,
 #endif
 		curl_version());
-	exit(0);
+	proper_exit(0);
 }
 
 static void show_usage_and_exit(int status)
@@ -1262,7 +1278,7 @@ static void show_usage_and_exit(int status)
 		fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n");
 	else
 		printf(usage);
-	exit(status);
+	proper_exit(status);
 }
 
 static void parse_arg (int key, char *arg)
@@ -1297,7 +1313,7 @@ static void parse_arg (int key, char *arg)
 #endif
 		if (!json_is_object(opt_config)) {
 			applog(LOG_ERR, "JSON decode of %s failed", arg);
-			exit(1);
+			proper_exit(1);
 		}
 		break;
 	}
@@ -1440,7 +1456,7 @@ static void parse_arg (int key, char *arg)
 		break;
 	case 1006:
 		print_hash_tests();
-		exit(0);
+		proper_exit(0);
 		break;
 	case 1003:
 		want_longpoll = false;
@@ -1462,7 +1478,7 @@ static void parse_arg (int key, char *arg)
 						device_map[opt_n_threads++] = atoi(pch);
 					else {
 						applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch));
-						exit(1);
+						proper_exit(1);
 					}
 				} else {
 					int device = cuda_finddevice(pch);
@@ -1470,7 +1486,7 @@ static void parse_arg (int key, char *arg)
 						device_map[opt_n_threads++] = device;
 					else {
 						applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch);
-						exit(1);
+						proper_exit(1);
 					}
 				}
 				pch = strtok (NULL, ",");
@@ -1572,13 +1588,11 @@ static void signal_handler(int sig)
 	case SIGINT:
 		signal(sig, SIG_IGN);
 		applog(LOG_INFO, "SIGINT received, exiting");
-		cuda_devicereset();
-		exit(0);
+		proper_exit(0);
 		break;
 	case SIGTERM:
 		applog(LOG_INFO, "SIGTERM received, exiting");
-		cuda_devicereset();
-		exit(0);
+		proper_exit(0);
 		break;
 	}
 }
@@ -1588,13 +1602,11 @@ BOOL WINAPI ConsoleHandler(DWORD dwType)
 	switch (dwType) {
 	case CTRL_C_EVENT:
 		applog(LOG_INFO, "CTRL_C_EVENT received, exiting");
-		cuda_devicereset();
-		exit(0);
+		proper_exit(0);
 		break;
 	case CTRL_BREAK_EVENT:
 		applog(LOG_INFO, "CTRL_BREAK_EVENT received, exiting");
-		cuda_devicereset();
-		exit(0);
+		proper_exit(0);
 		break;
 	default:
 		return false;
@@ -1785,8 +1797,5 @@ int main(int argc, char *argv[])
 
 	applog(LOG_INFO, "workio thread dead, exiting.");
 
-	// nvprof requires this
-	cuda_devicereset();
-
 	return 0;
 }
diff --git a/miner.h b/miner.h
index 3ce6571..e1e2d8d 100644
--- a/miner.h
+++ b/miner.h
@@ -396,6 +396,7 @@ extern void *tq_pop(struct thread_q *tq, const struct timespec *abstime);
 extern void tq_freeze(struct thread_q *tq);
 extern void tq_thaw(struct thread_q *tq);
 
+void proper_exit(int reason);
 
 void applog_hash(unsigned char *hash);
 

From 1b8c3c12fa5bb83afbb02f9d5f60586939f36d86 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 2 Sep 2014 03:38:57 +0200
Subject: [PATCH 08/44] debug: a new boolean to log or not json rpc data

---
 cpu-miner.c | 6 ++++++
 miner.h     | 1 +
 util.c      | 2 +-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index d92c7e0..9ff8375 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -168,6 +168,7 @@ static const char *algo_names[] = {
 };
 
 bool opt_debug = false;
+bool opt_debug_rpc = false;
 bool opt_protocol = false;
 bool opt_benchmark = false;
 bool want_longpoll = true;
@@ -522,6 +523,10 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		json_decref(val);
 	}
 
+	if (opt_debug_rpc) {
+		applog(LOG_DEBUG, "submit: %s", s);
+	}
+
 	rc = true;
 
 out:
@@ -1325,6 +1330,7 @@ static void parse_arg (int key, char *arg)
 		break;
 	case 'D':
 		opt_debug = true;
+		opt_debug_rpc = true;
 		break;
 	case 'p':
 		free(rpc_pass);
diff --git a/miner.h b/miner.h
index e1e2d8d..c9b2e44 100644
--- a/miner.h
+++ b/miner.h
@@ -285,6 +285,7 @@ struct work_restart {
 };
 
 extern bool opt_debug;
+extern bool opt_debug_rpc;
 extern bool opt_protocol;
 extern int opt_timeout;
 extern bool want_longpoll;
diff --git a/util.c b/util.c
index 6fd2b71..a9e0ae2 100644
--- a/util.c
+++ b/util.c
@@ -1205,7 +1205,7 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s)
 	id = json_object_get(val, "id");
 	params = json_object_get(val, "params");
 
-	if (opt_debug) {
+	if (opt_debug_rpc) {
 		applog(LOG_DEBUG, "method: %s", s);
 	}
 

From 2d42ae6de586a6ae8cbfd01806a273fd5cc4b262 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 2 Sep 2014 05:09:31 +0200
Subject: [PATCH 09/44] stratum: handle a small cache of submitted jobs

Prevent sending duplicate shares on some pools like hashharder.

This cache keeps the submitted job/nonce pairs of the last 15 minutes.

So, remove the exit on repeated duplicate shares;
    the submitted cache now handles this problem.

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
---
 Makefile.am             |  1 +
 README.md               |  6 +--
 ccminer.vcxproj         |  3 +-
 ccminer.vcxproj.filters |  5 ++-
 cpu-miner.c             | 29 ++++++++------
 hashlog.cpp             | 84 +++++++++++++++++++++++++++++++++++++++++
 miner.h                 |  6 +++
 7 files changed, 117 insertions(+), 17 deletions(-)
 create mode 100644 hashlog.cpp

diff --git a/Makefile.am b/Makefile.am
index 3935afe..c73d9d2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -17,6 +17,7 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  compat/inttypes.h compat/stdbool.h compat/unistd.h \
 			  compat/sys/time.h compat/getopt/getopt.h \
 			  cpu-miner.c util.c hefty1.c scrypt.c \
+			  hashlog.cpp \
 			  heavy/heavy.cu \
 			  heavy/cuda_blake512.cu heavy/cuda_blake512.h \
 			  heavy/cuda_combine.cu heavy/cuda_combine.h \
diff --git a/README.md b/README.md
index 715b387..2a2485b 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,12 @@ ccminer
 
 Christian Buchner's &amp; Christian H.'s CUDA miner project
 
-Fork by tpruvot@github with X14,X15,X17,WHIRL and M7 support
+Fork by tpruvot@github with X14,X15,X17,WHIRL and Blake256 support
    BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
+   [![tip for next commit](https://tip4commit.com/projects/927.svg)](https://tip4commit.com/github/tpruvot/ccminer)
 
 A big part of my recent additions were wrote by [djm34](https://github.com/djm34),
-You can also donate some beers (or redbulls) with these addresses :
-   XjPqpkCPoYJJYdQRrVByU7ySpVyeqJmSGU
+You can also donate some beers (or redbulls)
 
 This variant was tested and built on Linux (ubuntu server 14.04)
 and VStudio 2013.
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index d3ec423..509715b 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -243,6 +243,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
     </ClCompile>
     <ClCompile Include="fuguecoin.cpp" />
     <ClCompile Include="groestlcoin.cpp" />
+    <ClCompile Include="hashlog.cpp" />
     <ClCompile Include="hefty1.c" />
     <ClCompile Include="myriadgroestl.cpp" />
     <ClCompile Include="scrypt.c">
@@ -560,4 +561,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
   <ImportGroup Label="ExtensionTargets">
     <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
   </ImportGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index 55c69aa..93e331c 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -180,6 +180,9 @@
     <ClCompile Include="compat\winansi.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="hashlog.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="compat.h">
@@ -440,4 +443,4 @@
       <Filter>Source Files\CUDA</Filter>
     </CudaCompile>
   </ItemGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/cpu-miner.c b/cpu-miner.c
index 9ff8375..513d4f8 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -210,7 +210,6 @@ static struct stratum_ctx stratum;
 
 pthread_mutex_t applog_lock;
 static pthread_mutex_t stats_lock;
-static uint8_t duplicate_shares = 0;
 static unsigned long accepted_count = 0L;
 static unsigned long rejected_count = 0L;
 static double *thr_hashrates;
@@ -354,6 +353,7 @@ static pthread_mutex_t g_work_lock;
 void proper_exit(int reason)
 {
 	cuda_devicereset();
+	hashlog_purge_all();
 	exit(reason);
 }
 
@@ -432,14 +432,6 @@ static void share_result(int result, const char *reason)
 			:	(result ? "(yay!!!)" : "(booooo)"));
 
 	if (reason) {
-		if (!strcmp(reason, "Duplicate share")) {
-			duplicate_shares++;
-			if (duplicate_shares > 3) {
-				// exit from app (until auto restart)
-				applog(LOG_WARNING, "Auto exit to prevent stratum bans: %s", reason);
-				proper_exit(1);
-			}
-		}
 		applog(LOG_WARNING, "reject reason: %s", reason);
 	}
 }
@@ -460,6 +452,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 	}
 
 	if (have_stratum) {
+		uint32_t sent;
 		uint32_t ntime, nonce;
 		uint16_t nvote;
 		char *ntimestr, *noncestr, *xnonce2str, *nvotestr;
@@ -472,6 +465,16 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		noncestr = bin2hex((const unsigned char *)(&nonce), 4);
 		xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len);
 		nvotestr = bin2hex((const unsigned char *)(&nvote), 2);
+
+		sent = hashlog_already_submittted(work->job_id, nonce);
+		if (sent > 0) {
+			sent = (uint32_t) time(NULL) - sent;
+			if (!opt_quiet)
+				applog(LOG_WARNING, "skip submit, nonce %s was already sent %u seconds ago", noncestr, sent);
+			rc = true;
+			goto out;
+		}
+
 		if (opt_algo == ALGO_HEAVY) {
 			sprintf(s,
 				"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
@@ -490,6 +493,9 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 			applog(LOG_ERR, "submit_upstream_work stratum_send_line failed");
 			goto out;
 		}
+
+		hashlog_remember_submit(work->job_id, nonce);
+
 	} else {
 
 		/* build hex string */
@@ -826,8 +832,6 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
 	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH)
 		diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
-	else if (opt_algo == ALGO_BLAKE)
-		diff_to_target(work->target, sctx->job.diff / (4.0 * opt_difficulty));
 	else
 		diff_to_target(work->target, sctx->job.diff / opt_difficulty);
 }
@@ -1237,8 +1241,9 @@ static void *stratum_thread(void *userdata)
 			pthread_mutex_unlock(&g_work_lock);
 			if (stratum.job.clean) {
 				if (!opt_quiet)
-					applog(LOG_BLUE, "%s send a new %s block", short_url, algo_names[opt_algo]);
+					applog(LOG_BLUE, "%s send a new %s job", short_url, algo_names[opt_algo]);
 				restart_threads();
+				hashlog_purge_old();
 			}
 		}
 		
diff --git a/hashlog.cpp b/hashlog.cpp
new file mode 100644
index 0000000..ded566e
--- /dev/null
+++ b/hashlog.cpp
@@ -0,0 +1,84 @@
+#include <inttypes.h>
+#include <stdlib.h>
+#include <map>
+
+#include "miner.h"
+
+static std::map<uint64_t, uint32_t> tlastshares; /* key: (job id << 32) + nonce, value: submit time */
+
+/**
+ * Age in seconds (15 minutes) after which hashlog_purge_old() drops an entry
+ */
+#define LOG_PURGE_TIMEOUT (15*60)
+
+/**
+ * Record that (jobid, nounce) was submitted now; jobid is parsed as a hex number
+ */
+extern "C" void hashlog_remember_submit(char* jobid, uint32_t nounce)
+{
+	/* map key: parsed job id shifted left by 32 bits, plus the nonce */
+	const uint64_t jobbits = ((uint64_t) strtoul(jobid, NULL, 16)) << 32;
+	const uint64_t key = jobbits + nounce;
+	tlastshares[key] = (uint32_t) time(NULL); /* value: submit time, epoch seconds */
+}
+
+/**
+ * @return time (epoch seconds) at which (jobid, nounce) was submitted, 0 if unknown
+ */
+extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce)
+{
+	uint32_t ret = 0;
+	/* same key layout as hashlog_remember_submit(): (hex job id << 32) + nonce */
+	uint64_t njobid = (uint64_t) strtoul(jobid, NULL, 16);
+	uint64_t key = (njobid << 32) + nounce;
+	std::map<uint64_t, uint32_t>::iterator i = tlastshares.find(key);
+	if (i != tlastshares.end())
+		ret = i->second; /* reuse the iterator, no second lookup via operator[] */
+	return ret;
+}
+
+/**
+ * Remove all entries of a job (job id = high 32 bits of the key)... not used yet
+ */
+extern "C" void hashlog_purge_job(char* jobid)
+{
+	uint64_t njobid = strtoul(jobid, NULL, 16);
+	uint64_t keypfx = (njobid << 32);
+	std::map<uint64_t, uint32_t>::iterator i = tlastshares.begin();
+	while (i != tlastshares.end()) {
+		if ((i->first >> 32) == (keypfx >> 32)) /* exact job match, not a bitwise AND */
+			tlastshares.erase(i++); /* advance first: erase() invalidates i */
+		else
+			++i;
+	}
+}
+
+/**
+ * Remove entries older than LOG_PURGE_TIMEOUT seconds to reduce memory usage
+ */
+extern "C" void hashlog_purge_old(void)
+{
+	int deleted = 0;
+	uint32_t now = (uint32_t) time(NULL);
+	std::map<uint64_t, uint32_t>::iterator i = tlastshares.begin();
+	while (i != tlastshares.end()) {
+		if ((now - i->second) > LOG_PURGE_TIMEOUT) {
+			deleted++;
+			tlastshares.erase(i++); /* advance first: erase() invalidates i */
+		} else
+			++i;
+	}
+	if (opt_debug && deleted) {
+		applog(LOG_DEBUG, "hashlog: %d/%d purged",
+			deleted, (int) tlastshares.size()); /* size() is size_t, %d needs int */
+	}
+}
+
+/**
+ * Drop every remembered (job, nonce) entry; called from proper_exit()
+ */
+extern "C" void hashlog_purge_all(void)
+{
+	tlastshares.clear();
+}
+
diff --git a/miner.h b/miner.h
index c9b2e44..b986197 100644
--- a/miner.h
+++ b/miner.h
@@ -388,6 +388,12 @@ bool stratum_subscribe(struct stratum_ctx *sctx);
 bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
 bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
 
+void hashlog_remember_submit(char* jobid, uint32_t nounce);
+uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce);
+void hashlog_purge_old(void);
+void hashlog_purge_job(char* jobid);
+void hashlog_purge_all(void);
+
 struct thread_q;
 
 extern struct thread_q *tq_new(void);

From de80c7e9d1448f15541d08c5dbbf372d5bfeba48 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 2 Sep 2014 12:40:44 +0200
Subject: [PATCH 10/44] blake: remove unused parameter and fix index in d_hash

This reduces the speed to 92 MH/s, but the next commit
gives us 30 more.

so, todo: merge the whole checkhash proc in gpu_hash
          and remove this d_hash buffer...
---
 blake32.cu | 62 +++++++++++++++++-------------------------------------
 1 file changed, 19 insertions(+), 43 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index e3d0bf8..814be2d 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -181,10 +181,10 @@ extern __device__ __device_builtin__ void __nvvm_memset(uint8_t *, unsigned char
 #endif
 
 __global__
-void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
+void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t* outputHash)
 {
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < (uint32_t) threads)
 	{
 		const uint32_t nounce = startNounce + thread;
 		uint32_t /* __align__(8) */ msg[16];
@@ -223,7 +223,7 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 
 		blake256_compress(h, msg, c_sigma, c_u256, 0x280); // or 0x80
 
-		uint32_t *outHash = (uint32_t*) outputHash + thread;
+		uint32_t *outHash = &outputHash[thread<<3];
 		//#pragma unroll 8
 		for (int i=0; i < 8; i++) {
 			outHash[i] = cuda_swab32(h[i]);
@@ -247,40 +247,30 @@ void blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_
 }
 
 __global__
-void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
+void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce)
 {
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		const uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
-
-		int hashPosition = nounce - startNounce;
-		uint32_t *inpHash = &g_hash[hashPosition];
-		uint32_t hash[8];
-
-		#pragma unroll 8
-		for (int i=0; i < 8; i++)
-			hash[i] = inpHash[i];
-
-		/* to enhance ? */
-		int i, rc = 1, position = -1;
-		for (i = 7; i >= 0; i--) {
-			// rc &= (hash[i] <= pTarget[i]);
-			if (hash[i] > pTarget[i] && position < i) {
-				rc = false; position = i;
+		uint32_t* pHash = &g_hash[thread<<3];
+		for (int i = 7; i >= 0; i--) {
+			uint32_t hash = pHash[i];
+			if (hash > pTarget[i]) {
+				return;
 			}
-			if (hash[i] < pTarget[i] && position < i) {
-				rc = true; position = i;
+			if (hash < pTarget[i]) {
+				break;
 			}
 		}
 
-		if(rc && resNounce[0] > nounce)
+		uint32_t nounce = startNounce + thread;
+		if(resNounce[0] > nounce)
 			resNounce[0] = nounce;
 	}
 }
 
 __host__
-uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
+uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
 {
 	const int threadsperblock = TPB;
 	uint32_t result = 0xffffffff;
@@ -292,7 +282,7 @@ uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32
 
 	size_t shared_size = 0;
 
-	gpu_check_hash_64 <<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
+	gpu_check_hash_64 <<<grid, block, shared_size>>>(threads, startNounce, d_inputHash, d_resNounce[thr_id]);
 	MyStreamSynchronize(NULL, order, thr_id);
 
 	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
@@ -322,8 +312,6 @@ void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget)
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice));
 }
 
-#define NULLTEST 0
-
 extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
@@ -339,17 +327,11 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 
 	if (!init[thr_id]) {
 		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 48 * throughput)); // not sure for this size...
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * throughput)); /* 32 bytes x 256K Threads (to be removed soon) */
 		blake256_cpu_init(thr_id);
 		init[thr_id] = true;
 	}
 
-#if NULLTEST
-	// dev test with a null buffer 0x00000...
-	for (int k = 0; k < 20; k++)
-		pdata[k] = 0;
-#endif
-
 	blake256_cpu_setBlock_80(pdata, (void*)ptarget);
 
 	for (int k=0; k < 20; k++)
@@ -362,13 +344,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 		// GPU HASH
 		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 
-#if NULLTEST
-		uint32_t buf[8]; memset(buf, 0, sizeof buf);
-		CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
-		CUDA_SAFE_CALL(cudaThreadSynchronize());
-		//applog_hash((unsigned char*)buf);
-#endif
-		foundNonce = cpu_check_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cpu_check_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhashcpu[8];

From 7e595a36ea69027c8a28023399540a761e7686c3 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 2 Sep 2014 21:13:37 +0200
Subject: [PATCH 11/44] blake: cleanup, remove d_hash buf, not in a chain

host: only bencode if gpu hash was found
---
 Makefile.am |   3 ++
 blake32.cu  | 121 +++++++++++++++-------------------------------------
 2 files changed, 38 insertions(+), 86 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index c73d9d2..520dff0 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -60,6 +60,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES)
 .cu.o:
 	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $<
 
+blake32.o: blake32.cu
+	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=64 -o $@ -c $<
+
 # Luffa and Echo are faster with 80 registers than 128
 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
 	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $<
diff --git a/blake32.cu b/blake32.cu
index 814be2d..9755f93 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -35,11 +35,8 @@ extern int device_map[8];
 
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 
-// shared for 8 threads of addresses (cudaMalloc)
-uint32_t* d_hash[8];
-
 __constant__
-static uint32_t pTarget[8];
+static uint32_t c_Target[8];
 
 __constant__
 static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes + padding)
@@ -181,7 +178,7 @@ extern __device__ __device_builtin__ void __nvvm_memset(uint8_t *, unsigned char
 #endif
 
 __global__
-void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t* outputHash)
+void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t *resNounce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < (uint32_t) threads)
@@ -198,9 +195,12 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t* outputHas
 
 		// ------ Close: Bytes 64 to 80 ------ 
 
-#if 0 /* __CUDA_ARCH__ >= 200 */
-		__nvvm_memset((uint8_t*)(&msg[4]), 0, sizeof(msg)-16, 8);
-#else
+		msg[0] = c_PaddedMessage80[16];
+		msg[1] = c_PaddedMessage80[17];
+		msg[2] = c_PaddedMessage80[18];
+		msg[3] = nounce; /* our tested value */
+		msg[4] = 0x80000000UL; //cuda_swab32(0x80U);
+
 		msg[5] = 0;  // uchar[17 to 55]
 		msg[6] = 0;
 		msg[7] = 0;
@@ -210,144 +210,93 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t* outputHas
 		msg[11] = 0;
 		msg[12] = 0;
 
-		msg[14] = 0;
-#endif
-		msg[0] = c_PaddedMessage80[16];
-		msg[1] = c_PaddedMessage80[17];
-		msg[2] = c_PaddedMessage80[18];
-		msg[3] = nounce; /* our tested value */
-		msg[4] = 0x80000000UL; //cuda_swab32(0x80U);
-
 		msg[13] = 1;
-		msg[15] = 0x280; // 60-63
-
-		blake256_compress(h, msg, c_sigma, c_u256, 0x280); // or 0x80
-
-		uint32_t *outHash = &outputHash[thread<<3];
-		//#pragma unroll 8
-		for (int i=0; i < 8; i++) {
-			outHash[i] = cuda_swab32(h[i]);
-		}
-	}
-}
-
-__host__
-void blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
-{
-	const int threadsperblock = TPB;
-
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
-
-	size_t shared_size = 0;
+		msg[14] = 0;
+		msg[15] = 0x280;
 
-	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
+		blake256_compress(h, msg, c_sigma, c_u256, 0x280);
 
-	MyStreamSynchronize(NULL, order, thr_id);
-}
-
-__global__
-void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce)
-{
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		uint32_t* pHash = &g_hash[thread<<3];
 		for (int i = 7; i >= 0; i--) {
-			uint32_t hash = pHash[i];
-			if (hash > pTarget[i]) {
+			uint32_t hash = cuda_swab32(h[i]);
+			if (hash > c_Target[i]) {
 				return;
 			}
-			if (hash < pTarget[i]) {
+			if (hash < c_Target[i]) {
 				break;
 			}
 		}
 
-		uint32_t nounce = startNounce + thread;
+		/* keep the smallest nounce, hmm... */
 		if(resNounce[0] > nounce)
 			resNounce[0] = nounce;
 	}
 }
 
 __host__
-uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
+uint32_t blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce)
 {
 	const int threadsperblock = TPB;
-	uint32_t result = 0xffffffff;
-
-	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-
 	size_t shared_size = 0;
 
-	gpu_check_hash_64 <<<grid, block, shared_size>>>(threads, startNounce, d_inputHash, d_resNounce[thr_id]);
-	MyStreamSynchronize(NULL, order, thr_id);
+	uint32_t result = 0xffffffffU;
+	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
+
+	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id]);
+	MyStreamSynchronize(NULL, 1, thr_id);
 
 	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		cudaThreadSynchronize();
 		result = *h_resNounce[thr_id];
 	}
-
 	return result;
 }
 
-__host__
-void blake256_cpu_init(int thr_id)
-{
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t)));
-	CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t)));
-}
-
 __host__
 void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget)
 {
 	uint32_t PaddedMessage[32];
 	memcpy(PaddedMessage, pdata, 80);
 	memset(&PaddedMessage[20], 0, 48);
-
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice));
 }
 
 extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
-	const int throughput = TPB * 2048;
-	static bool init[8] = {0,0,0,0,0,0,0,0};
-	uint32_t endiandata[20];
-	uint32_t Htarg = ptarget[7];
+	const int throughput = TPB * 2048; /* 2048 threads is the max on a 750Ti */
+	static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 	int rc = 0;
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = Htarg = 0x00000f;
+		((uint32_t*)ptarget)[7] = 0x00000f;
 
 	if (!init[thr_id]) {
 		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * throughput)); /* 32 bytes x 256K Threads (to be removed soon) */
-		blake256_cpu_init(thr_id);
+		CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t)));
+		CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t)));
 		init[thr_id] = true;
 	}
 
 	blake256_cpu_setBlock_80(pdata, (void*)ptarget);
 
-	for (int k=0; k < 20; k++)
-		be32enc(&endiandata[k], pdata[k]);
-
 	do {
-		int order = 0;
-		uint32_t foundNonce;
-
 		// GPU HASH
-		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-
-		foundNonce = cpu_check_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19]);
 		if (foundNonce != 0xffffffff)
 		{
+			uint32_t endiandata[20];
 			uint32_t vhashcpu[8];
+			uint32_t Htarg = ptarget[7];
+
+			for (int k=0; k < 20; k++)
+				be32enc(&endiandata[k], pdata[k]);
+
 			be32enc(&endiandata[19], foundNonce);
 
 			blake32hash(vhashcpu, endiandata);

From 43d3e93e1a97e569ead2437f759c6b8423d30c0a Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Wed, 3 Sep 2014 09:29:51 +0200
Subject: [PATCH 12/44] blake: set a max throughput

---
 blake32.cu | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index 9755f93..68123f8 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -123,6 +123,8 @@ static const uint32_t c_u256[16] = {
 	v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
 }
 
+#define BLAKE256_ROUNDS 14
+
 __device__ static
 void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), const uint32_t *u256, const uint32_t T0, uint8_t nullt = 1)
 {
@@ -134,7 +136,7 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con
 		m[i] = block[i];
 	}
 
-	#pragma unroll 8
+	//#pragma unroll 8
 	for(int i = 0; i < 8; i++)
 		v[i] = h[i];
 
@@ -149,7 +151,7 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con
 	v[15] = u256[7];
 
 	//#pragma unroll
-	for (int i = 0; i < 14; i++) {
+	for (int i = 0; i < BLAKE256_ROUNDS; i++) {
 		/* column step */
 		GS(0, 4, 0x8, 0xC, 0);
 		GS(1, 5, 0x9, 0xD, 2);
@@ -178,10 +180,10 @@ extern __device__ __device_builtin__ void __nvvm_memset(uint8_t *, unsigned char
 #endif
 
 __global__
-void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t *resNounce)
+void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < (uint32_t) threads)
+	if (thread < threads)
 	{
 		const uint32_t nounce = startNounce + thread;
 		uint32_t /* __align__(8) */ msg[16];
@@ -233,7 +235,7 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t *resNounce
 }
 
 __host__
-uint32_t blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce)
+uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce)
 {
 	const int threadsperblock = TPB;
 
@@ -269,8 +271,8 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
-	const int throughput = TPB * 2048; /* 2048 threads is the max on a 750Ti */
 	static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+	uint32_t throughput = min(TPB * 2048, max_nonce - first_nonce);
 	int rc = 0;
 
 	if (opt_benchmark)
@@ -294,6 +296,8 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 			uint32_t vhashcpu[8];
 			uint32_t Htarg = ptarget[7];
 
+			applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce);
+
 			for (int k=0; k < 20; k++)
 				be32enc(&endiandata[k], pdata[k]);
 

From 049e57730116685755bd3ff214f0793cce7c773b Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Wed, 3 Sep 2014 09:49:14 +0200
Subject: [PATCH 13/44] tmp blake log

---
 blake32.cu  | 15 +++++++++++++--
 cpu-miner.c |  2 +-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index 68123f8..a86287e 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -285,6 +285,14 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 		init[thr_id] = true;
 	}
 
+	if (throughput < (TPB * 2048))
+		applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce);
+
+	if (max_nonce < first_nonce) {
+		applog(LOG_ERR, "start=%x > end=%x !", first_nonce, max_nonce);
+		return 0;
+	}
+
 	blake256_cpu_setBlock_80(pdata, (void*)ptarget);
 
 	do {
@@ -312,10 +320,13 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 				goto exit_scan;
 			}
 			else if (vhashcpu[7] > Htarg) {
-				applog(LOG_WARNING, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg);
+				applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg);
+			}
+			else if (vhashcpu[6] > ptarget[6]) {
+				applog(LOG_WARNING, "GPU #%d: hash[6] for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[6], ptarget[6]);
 			}
 			else {
-				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
+				applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
 
diff --git a/cpu-miner.c b/cpu-miner.c
index 513d4f8..a16c3b7 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -842,7 +842,7 @@ static void *miner_thread(void *userdata)
 	int thr_id = mythr->id;
 	struct work work;
 	uint32_t max_nonce;
-	uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
+	uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 2;
 	unsigned char *scratchbuf = NULL;
 	char s[16];
 	int i;

From 1a4391d7ff21397a128abf031f92733a8ac47437 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 2 Sep 2014 12:40:52 +0200
Subject: [PATCH 14/44] hashlog: prevent double computing on jobs already done

---
 blake32.cu  |  2 +-
 cpu-miner.c | 41 +++++++++++++++++++++++--------
 hashlog.cpp | 71 ++++++++++++++++++++++++++++++++++++-----------------
 miner.h     |  4 +++
 util.c      | 26 ++++++++++++++++++++
 5 files changed, 111 insertions(+), 33 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index a86287e..b50a3ca 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -304,7 +304,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 			uint32_t vhashcpu[8];
 			uint32_t Htarg = ptarget[7];
 
-			applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce);
+			applog(LOG_WARNING, "throughput=%u, start=%x, max=%x, pdata=%x", throughput, first_nonce, max_nonce, pdata[0]);
 
 			for (int k=0; k < 20; k++)
 				be32enc(&endiandata[k], pdata[k]);
diff --git a/cpu-miner.c b/cpu-miner.c
index a16c3b7..6da1465 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -822,9 +822,11 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	pthread_mutex_unlock(&sctx->work_lock);
 
 	if (opt_debug) {
+		char *tm = atime2str(swab32(work->data[17]));
 		char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size);
-		applog(LOG_DEBUG, "DEBUG: job_id='%s' extranonce2=%s ntime=%08x",
-		       work->job_id, xnonce2str, swab32(work->data[17]));
+		applog(LOG_DEBUG, "DEBUG: job_id=%s xnonce2=%s time=%s",
+		       work->job_id, xnonce2str, tm);
+		free(tm);
 		free(xnonce2str);
 	}
 
@@ -842,10 +844,9 @@ static void *miner_thread(void *userdata)
 	int thr_id = mythr->id;
 	struct work work;
 	uint32_t max_nonce;
-	uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 2;
+	uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1);
 	unsigned char *scratchbuf = NULL;
 	char s[16];
-	int i;
 
 	memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized
 
@@ -870,6 +871,7 @@ static void *miner_thread(void *userdata)
 		unsigned long hashes_done;
 		struct timeval tv_start, tv_end, diff;
 		int64_t max64;
+		uint64_t umax64;
 		int rc;
 
 		// &work.data[19]
@@ -877,13 +879,17 @@ static void *miner_thread(void *userdata)
 		uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 
 		if (have_stratum) {
-			while (time(NULL) >= g_work_time + 120)
-				sleep(1);
+			while (time(NULL) >= g_work_time + opt_scantime)
+				usleep(500*1000);
 			pthread_mutex_lock(&g_work_lock);
+			nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 			if ((*nonceptr) >= end_nonce)
 				stratum_gen_work(&stratum, &g_work);
 		} else {
-			int min_scantime = have_longpoll ? (LP_SCANTIME*3)/4 : opt_scantime;
+			int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime;
+			if (!opt_quiet)
+			applog(LOG_DEBUG, "have_longpoll=%d, have_stratum=%d, min_scantime=%d, g_work_time=%d",
+				have_longpoll, have_stratum, min_scantime, g_work_time);
 			/* obtain new work from internal workio thread */
 			pthread_mutex_lock(&g_work_lock);
 			if (!have_stratum &&
@@ -904,7 +910,7 @@ static void *miner_thread(void *userdata)
 		}
 		if (memcmp(work.data, g_work.data, wcmplen)) {
 			memcpy(&work, &g_work, sizeof(struct work));
-			(*nonceptr) = 0xffffffffU / opt_n_threads * thr_id;
+			(*nonceptr) = 0xffffffffU / opt_n_threads * thr_id; // 0 if single thr
 		} else
 			(*nonceptr)++;
 		pthread_mutex_unlock(&g_work_lock);
@@ -932,10 +938,24 @@ static void *miner_thread(void *userdata)
 				break;
 			}
 		}
-		if ((int64_t)(*nonceptr) + max64 > end_nonce)
+
+		umax64 = (uint64_t) max64;
+		if (end_nonce < (umax64 + (*nonceptr)))
 			max_nonce = end_nonce;
 		else
-			max_nonce = (uint32_t)((*nonceptr) + max64);
+			max_nonce = umax64 + (*nonceptr);
+
+		/* do not recompute something already scanned (and sent) ! */
+		if (hashlog_already_submittted(work.job_id, 0)) {
+			uint32_t lastnonce = hashlog_get_last_sent(work.job_id);
+			if ((*nonceptr) < lastnonce && lastnonce <= max_nonce) {
+				applog(LOG_WARNING, "rescan of sent job? nonce=%x, last was %x", (*nonceptr), lastnonce);
+				max_nonce = lastnonce - 1;
+			} else if ((*nonceptr) == lastnonce) {
+				applog(LOG_WARNING, "rescan of sent job? start nonce = lastnonce");
+				(*nonceptr) = lastnonce + 1;
+			}
+		}
 
 		hashes_done = 0;
 		gettimeofday(&tv_start, NULL);
@@ -1051,6 +1071,7 @@ static void *miner_thread(void *userdata)
 		}
 		if (opt_benchmark && thr_id == opt_n_threads - 1) {
 			double hashrate = 0.;
+			int i;
 			for (i = 0; i < opt_n_threads && thr_hashrates[i]; i++)
 				hashrate += thr_hashrates[i];
 			if (i == opt_n_threads) {
diff --git a/hashlog.cpp b/hashlog.cpp
index ded566e..3948751 100644
--- a/hashlog.cpp
+++ b/hashlog.cpp
@@ -1,39 +1,68 @@
-#include <inttypes.h>
+//#include <inttypes.h>
 #include <stdlib.h>
 #include <map>
 
 #include "miner.h"
 
+#define HI_DWORD(u64) ((uint32_t) (u64 >> 32))
+#define LO_DWORD(u64) ((uint32_t) u64)
+
 static std::map<uint64_t, uint32_t> tlastshares;
 
-/**
- * Purge entries after 15 minutes
- */
 #define LOG_PURGE_TIMEOUT 15*60
 
 /**
- * Store submitted nounces of a job
+ * str hex to uint32
  */
-extern "C" void hashlog_remember_submit(char* jobid, uint32_t nounce)
+static uint64_t hextouint(char* jobid)
 {
 	char *ptr;
-	uint64_t njobid = (uint64_t) strtoul(jobid, &ptr, 16);
-	uint64_t key = (njobid << 32) + nounce;
+	return strtoull(jobid, &ptr, 16);
+}
+
+/**
+ * Store submitted nonces of a job
+ */
+extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce)
+{
+	uint64_t njobid = hextouint(jobid);
+	uint64_t key = (njobid << 32) + nonce;
 	tlastshares[key] = (uint32_t) time(NULL);
 }
 
 /**
- * @return time of submission
+ * Search last submitted nonce for a job
+ * @return max nonce
  */
-extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce)
+extern "C" uint32_t hashlog_get_last_sent(char* jobid)
 {
-	char *ptr;
 	uint32_t ret = 0;
-	uint64_t njobid = (uint64_t) strtoul(jobid, &ptr, 16);
-	uint64_t key = (njobid << 32) + nounce;
-	std::map<uint64_t, uint32_t>::iterator i = tlastshares.find(key);
-	if (i != tlastshares.end())
+	uint64_t njobid = hextouint(jobid);
+	uint64_t keypfx = (njobid << 32);
+	std::map<uint64_t, uint32_t>::iterator i = tlastshares.begin();
+	while (i != tlastshares.end()) {
+		if ((keypfx & i->first) == keypfx && LO_DWORD(i->first) > ret) {
+			ret = LO_DWORD(i->first);
+		}
+		i++;
+	}
+	return ret;
+}
+
+/**
+ * @return time of a job/nonce submission (or last nonce if nonce is 0)
+ */
+extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce)
+{
+	uint32_t ret = 0;
+	uint64_t njobid = hextouint(jobid);
+	uint64_t key = (njobid << 32) + nonce;
+	if (nonce == 0) {
+		// search last submitted nonce for job
+		ret = hashlog_get_last_sent(jobid);
+	} else if (tlastshares.find(key) != tlastshares.end()) {
 		ret = (uint32_t) tlastshares[key];
+	}
 	return ret;
 }
 
@@ -42,12 +71,11 @@ extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce)
  */
 extern "C" void hashlog_purge_job(char* jobid)
 {
-	char *ptr;
-	uint64_t njobid = strtoul(jobid, &ptr, 16);
+	uint64_t njobid = hextouint(jobid);
 	uint64_t keypfx = (njobid << 32);
 	std::map<uint64_t, uint32_t>::iterator i = tlastshares.begin();
 	while (i != tlastshares.end()) {
-		if ((keypfx & i->first) != 0)
+		if ((keypfx & i->first) == keypfx)
 			tlastshares.erase(i);
 		i++;
 	}
@@ -60,6 +88,7 @@ extern "C" void hashlog_purge_old(void)
 {
 	int deleted = 0;
 	uint32_t now = (uint32_t) time(NULL);
+	uint32_t sz = tlastshares.size();
 	std::map<uint64_t, uint32_t>::iterator i = tlastshares.begin();
 	while (i != tlastshares.end()) {
 		if ((now - i->second) > LOG_PURGE_TIMEOUT) {
@@ -69,16 +98,14 @@ extern "C" void hashlog_purge_old(void)
 		i++;
 	}
 	if (opt_debug && deleted) {
-		applog(LOG_DEBUG, "hashlog: %d/%d purged",
-			deleted, tlastshares.size());
+		applog(LOG_DEBUG, "hashlog: %d/%d purged", deleted, sz);
 	}
 }
 
 /**
- * Reset the submitted nounce cache
+ * Reset the submitted nonces cache
  */
 extern "C" void hashlog_purge_all(void)
 {
 	tlastshares.clear();
 }
-
diff --git a/miner.h b/miner.h
index b986197..3100371 100644
--- a/miner.h
+++ b/miner.h
@@ -390,6 +390,7 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
 
 void hashlog_remember_submit(char* jobid, uint32_t nounce);
 uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce);
+uint32_t hashlog_get_last_sent(char* jobid);
 void hashlog_purge_old(void);
 void hashlog_purge_job(char* jobid);
 void hashlog_purge_all(void);
@@ -405,6 +406,9 @@ extern void tq_thaw(struct thread_q *tq);
 
 void proper_exit(int reason);
 
+size_t time2str(char* buf, time_t timer);
+char* atime2str(time_t timer);
+
 void applog_hash(unsigned char *hash);
 
 void print_hash_tests(void);
diff --git a/util.c b/util.c
index a9e0ae2..275abf7 100644
--- a/util.c
+++ b/util.c
@@ -21,6 +21,7 @@
 #include <unistd.h>
 #include <jansson.h>
 #include <curl/curl.h>
+#include <sys/time.h>
 #include <time.h>
 #ifdef WIN32
 #include "compat/winansi.h"
@@ -1350,6 +1351,31 @@ out:
 	return rval;
 }
 
+/**
+ * @param buf destination buffer, char[9] minimum ("HH:MM:SS" plus the NUL)
+ * @param timer time_t value to convert, rendered through localtime()
+ */
+size_t time2str(char* buf, time_t timer)
+{
+	struct tm* tm_info;
+	tm_info = localtime(&timer);
+	return strftime(buf, 9, "%H:%M:%S", tm_info); /* 8 chars + NUL: never claim more room than documented */
+}
+
+/**
+ * Alloc and returns time string (to be freed)
+ * @param time_t timer to convert
+ */
+char* atime2str(time_t timer)
+{
+	struct tm* tm_info;
+	char* buf = malloc(16);
+	memset(buf, 0, 16);
+	tm_info = localtime(&timer);
+	strftime(buf, 19, "%H:%M:%S", tm_info);
+	return buf;
+}
+
 /* sprintf can be used in applog */
 static char* format_hash(char* buf, unsigned char *hash)
 {

From b1f5df374db13c597cd90fd3f8f4802f6b7b5f61 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Wed, 3 Sep 2014 12:54:13 +0200
Subject: [PATCH 15/44] stratum: store server time offset in context

---
 cpu-miner.c |  9 ++++++---
 miner.h     |  2 ++
 util.c      | 28 ++++++++++++++++++----------
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index 6da1465..20bac21 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -179,7 +179,7 @@ static bool submit_old = false;
 bool use_syslog = false;
 bool use_colors = false;
 static bool opt_background = false;
-static bool opt_quiet = false;
+bool opt_quiet = false;
 static int opt_retries = -1;
 static int opt_fail_pause = 30;
 int opt_timeout = 270;
@@ -789,7 +789,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++);
 
 	/* Assemble block header */
-	memset(work->data, 0, 128);
+	memset(work->data, 0, sizeof(work->data));
 	work->data[0] = le32dec(sctx->job.version);
 	for (i = 0; i < 8; i++)
 		work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i);
@@ -822,7 +822,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	pthread_mutex_unlock(&sctx->work_lock);
 
 	if (opt_debug) {
-		char *tm = atime2str(swab32(work->data[17]));
+		char *tm = atime2str(swab32(work->data[17]) - sctx->srvtime_diff);
 		char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size);
 		applog(LOG_DEBUG, "DEBUG: job_id=%s xnonce2=%s time=%s",
 		       work->job_id, xnonce2str, tm);
@@ -1690,6 +1690,9 @@ int main(int argc, char *argv[])
 		sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
 	}
 
+	/* init stratum data.. */
+	memset(&stratum.url, 0, sizeof(stratum));
+
 	pthread_mutex_init(&stats_lock, NULL);
 	pthread_mutex_init(&g_work_lock, NULL);
 	pthread_mutex_init(&stratum.sock_lock, NULL);
diff --git a/miner.h b/miner.h
index 3100371..9101c61 100644
--- a/miner.h
+++ b/miner.h
@@ -377,6 +377,8 @@ struct stratum_ctx {
 	size_t xnonce2_size;
 	struct stratum_job job;
 	pthread_mutex_t work_lock;
+
+	int srvtime_diff;
 };
 
 bool stratum_socket_full(struct stratum_ctx *sctx, int timeout);
diff --git a/util.c b/util.c
index 275abf7..73a1847 100644
--- a/util.c
+++ b/util.c
@@ -21,7 +21,6 @@
 #include <unistd.h>
 #include <jansson.h>
 #include <curl/curl.h>
-#include <sys/time.h>
 #include <time.h>
 #ifdef WIN32
 #include "compat/winansi.h"
@@ -1012,12 +1011,13 @@ out:
 
 static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 {
-	const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *ntime, *nreward;
+	const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime, *nreward;
 	size_t coinb1_size, coinb2_size;
 	bool clean, ret = false;
 	int merkle_count, i;
 	json_t *merkle_arr;
 	unsigned char **merkle;
+	int ntime;
 
 	job_id = json_string_value(json_array_get(params, 0));
 	prevhash = json_string_value(json_array_get(params, 1));
@@ -1029,16 +1029,26 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	merkle_count = json_array_size(merkle_arr);
 	version = json_string_value(json_array_get(params, 5));
 	nbits = json_string_value(json_array_get(params, 6));
-	ntime = json_string_value(json_array_get(params, 7));
+	stime = json_string_value(json_array_get(params, 7));
 	clean = json_is_true(json_array_get(params, 8));
 	nreward = json_string_value(json_array_get(params, 9));
 
-	if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !ntime ||
+	if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime ||
 	    strlen(prevhash) != 64 || strlen(version) != 8 ||
-	    strlen(nbits) != 8 || strlen(ntime) != 8) {
+	    strlen(nbits) != 8 || strlen(stime) != 8) {
 		applog(LOG_ERR, "Stratum notify: invalid parameters");
 		goto out;
 	}
+
+	/* store stratum server time diff */
+	hex2bin((unsigned char *)&ntime, stime, 4);
+	ntime = swab32(ntime) - time(0);
+	if (ntime > sctx->srvtime_diff) {
+		sctx->srvtime_diff = ntime;
+		if (!opt_quiet)
+			applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime);
+	}
+
 	merkle = (unsigned char**)malloc(merkle_count * sizeof(char *));
 	for (i = 0; i < merkle_count; i++) {
 		const char *s = json_string_value(json_array_get(merkle_arr, i));
@@ -1079,7 +1089,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 
 	hex2bin(sctx->job.version, version, 4);
 	hex2bin(sctx->job.nbits, nbits, 4);
-	hex2bin(sctx->job.ntime, ntime, 4);
+	hex2bin(sctx->job.ntime, stime, 4);
 	if(nreward != NULL)
 	{
 		if(strlen(nreward) == 4)
@@ -1368,11 +1378,9 @@ size_t time2str(char* buf, time_t timer)
  */
 char* atime2str(time_t timer)
 {
-	struct tm* tm_info;
-	char* buf = malloc(16);
+	char* buf = (char*) malloc(16);
 	memset(buf, 0, 16);
-	tm_info = localtime(&timer);
-	strftime(buf, 19, "%H:%M:%S", tm_info);
+	time2str(buf, timer);
 	return buf;
 }
 

From 69616b37ac447ec18d9592f43489196e4c702746 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Wed, 3 Sep 2014 13:53:01 +0200
Subject: [PATCH 16/44] hashlog: prepare store of scanned range

---
 blake32.cu      |  8 ++++++--
 ccminer.vcxproj |  3 ++-
 cpu-miner.c     |  4 +++-
 hashlog.cpp     | 27 +++++++++++++++++++--------
 miner.h         |  3 ++-
 5 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index b50a3ca..2b63ccf 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -304,11 +304,15 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 			uint32_t vhashcpu[8];
 			uint32_t Htarg = ptarget[7];
 
-			applog(LOG_WARNING, "throughput=%u, start=%x, max=%x, pdata=%x", throughput, first_nonce, max_nonce, pdata[0]);
-
 			for (int k=0; k < 20; k++)
 				be32enc(&endiandata[k], pdata[k]);
 
+			if (opt_debug && !opt_quiet) {
+				applog(LOG_DEBUG, "throughput=%u, start=%x, max=%x, pdata=%08x...%08x",
+					throughput, first_nonce, max_nonce, endiandata[0], endiandata[7]);
+				applog_hash((unsigned char *)pdata);
+			}
+
 			be32enc(&endiandata[19], foundNonce);
 
 			blake32hash(vhashcpu, endiandata);
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 509715b..7590d94 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -399,6 +399,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
       <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
     </CudaCompile>
     <CudaCompile Include="blake32.cu">
+      <MaxRegCount>64</MaxRegCount>
       <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
     </CudaCompile>
@@ -561,4 +562,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
   <ImportGroup Label="ExtensionTargets">
     <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/cpu-miner.c b/cpu-miner.c
index 20bac21..e239081 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -494,7 +494,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 			goto out;
 		}
 
-		hashlog_remember_submit(work->job_id, nonce);
+		hashlog_remember_submit(work->job_id, nonce, 0);
 
 	} else {
 
@@ -834,6 +834,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
 	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH)
 		diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
+	else if (opt_algo == ALGO_BLAKE)
+		diff_to_target(work->target, sctx->job.diff / (16.0 * opt_difficulty));
 	else
 		diff_to_target(work->target, sctx->job.diff / opt_difficulty);
 }
diff --git a/hashlog.cpp b/hashlog.cpp
index 3948751..0b8b574 100644
--- a/hashlog.cpp
+++ b/hashlog.cpp
@@ -7,7 +7,13 @@
 #define HI_DWORD(u64) ((uint32_t) (u64 >> 32))
 #define LO_DWORD(u64) ((uint32_t) u64)
 
-static std::map<uint64_t, uint32_t> tlastshares;
+struct hashlog_data {
+	uint32_t ntime;
+	uint32_t scanned_from;
+	uint32_t scanned_to;
+};
+
+static std::map<uint64_t, hashlog_data> tlastshares;
 
 #define LOG_PURGE_TIMEOUT 15*60
 
@@ -23,11 +29,15 @@ static uint64_t hextouint(char* jobid)
 /**
  * Store submitted nonces of a job
  */
-extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce)
+extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce, uint64_t range)
 {
 	uint64_t njobid = hextouint(jobid);
 	uint64_t key = (njobid << 32) + nonce;
-	tlastshares[key] = (uint32_t) time(NULL);
+	struct hashlog_data data;
+	data.ntime = (uint32_t) time(NULL);
+	data.scanned_from = LO_DWORD(range);
+	data.scanned_to   = HI_DWORD(range);
+	tlastshares[key] = data;
 }
 
 /**
@@ -39,7 +49,7 @@ extern "C" uint32_t hashlog_get_last_sent(char* jobid)
 	uint32_t ret = 0;
 	uint64_t njobid = hextouint(jobid);
 	uint64_t keypfx = (njobid << 32);
-	std::map<uint64_t, uint32_t>::iterator i = tlastshares.begin();
+	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
 	while (i != tlastshares.end()) {
 		if ((keypfx & i->first) == keypfx && LO_DWORD(i->first) > ret) {
 			ret = LO_DWORD(i->first);
@@ -61,7 +71,8 @@ extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce)
 		// search last submitted nonce for job
 		ret = hashlog_get_last_sent(jobid);
 	} else if (tlastshares.find(key) != tlastshares.end()) {
-		ret = (uint32_t) tlastshares[key];
+		hashlog_data data = tlastshares[key];
+		ret = data.ntime;
 	}
 	return ret;
 }
@@ -73,7 +84,7 @@ extern "C" void hashlog_purge_job(char* jobid)
 {
 	uint64_t njobid = hextouint(jobid);
 	uint64_t keypfx = (njobid << 32);
-	std::map<uint64_t, uint32_t>::iterator i = tlastshares.begin();
+	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
 	while (i != tlastshares.end()) {
 		if ((keypfx & i->first) == keypfx)
 			tlastshares.erase(i);
@@ -89,9 +100,9 @@ extern "C" void hashlog_purge_old(void)
 	int deleted = 0;
 	uint32_t now = (uint32_t) time(NULL);
 	uint32_t sz = tlastshares.size();
-	std::map<uint64_t, uint32_t>::iterator i = tlastshares.begin();
+	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
 	while (i != tlastshares.end()) {
-		if ((now - i->second) > LOG_PURGE_TIMEOUT) {
+		if ((now - i->second.ntime) > LOG_PURGE_TIMEOUT) {
 			deleted++;
 			tlastshares.erase(i);
 		}
diff --git a/miner.h b/miner.h
index 9101c61..79e3a15 100644
--- a/miner.h
+++ b/miner.h
@@ -286,6 +286,7 @@ struct work_restart {
 
 extern bool opt_debug;
 extern bool opt_debug_rpc;
+extern bool opt_quiet;
 extern bool opt_protocol;
 extern int opt_timeout;
 extern bool want_longpoll;
@@ -390,7 +391,7 @@ bool stratum_subscribe(struct stratum_ctx *sctx);
 bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
 bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
 
-void hashlog_remember_submit(char* jobid, uint32_t nounce);
+void hashlog_remember_submit(char* jobid, uint32_t nounce, uint64_t range);
 uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce);
 uint32_t hashlog_get_last_sent(char* jobid);
 void hashlog_purge_old(void);

From 124ddee2fe804fdac6e67c60965e423a95e8a57a Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Wed, 3 Sep 2014 14:56:51 +0200
Subject: [PATCH 17/44] blake: fix of bad difficulty

---
 cpu-miner.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index e239081..adf3f4d 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -835,7 +835,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH)
 		diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
 	else if (opt_algo == ALGO_BLAKE)
-		diff_to_target(work->target, sctx->job.diff / (16.0 * opt_difficulty));
+		diff_to_target(work->target, sctx->job.diff / (2.0 * opt_difficulty));
 	else
 		diff_to_target(work->target, sctx->job.diff / opt_difficulty);
 }
@@ -945,7 +945,7 @@ static void *miner_thread(void *userdata)
 		if (end_nonce < (umax64 + (*nonceptr)))
 			max_nonce = end_nonce;
 		else
-			max_nonce = umax64 + (*nonceptr);
+			max_nonce = (uint32_t) umax64 + (*nonceptr);
 
 		/* do not recompute something already scanned (and sent) ! */
 		if (hashlog_already_submittted(work.job_id, 0)) {

From a270adc4b6a88b93aaadbb7f9b924f8f0fbca2b0 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Wed, 3 Sep 2014 21:05:15 +0200
Subject: [PATCH 18/44] to test on windows

---
 cpu-miner.c | 122 +++++++++++++++++++++++++++++++++++++---------------
 hashlog.cpp |  96 +++++++++++++++++++++++++++++++----------
 miner.h     |   4 +-
 util.c      |   2 +-
 4 files changed, 164 insertions(+), 60 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index adf3f4d..cb4d365 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -343,6 +343,9 @@ struct work {
 	char job_id[128];
 	size_t xnonce2_len;
 	unsigned char xnonce2[32];
+
+	uint32_t scanned_from;
+	uint32_t scanned_to;
 };
 
 static struct work g_work;
@@ -494,7 +497,8 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 			goto out;
 		}
 
-		hashlog_remember_submit(work->job_id, nonce, 0);
+		hashlog_remember_submit(work->job_id, nonce);
+		hashlog_remember_scan_range(work->job_id, work->scanned_from, work->scanned_to);
 
 	} else {
 
@@ -834,8 +838,6 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
 	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH)
 		diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
-	else if (opt_algo == ALGO_BLAKE)
-		diff_to_target(work->target, sctx->job.diff / (2.0 * opt_difficulty));
 	else
 		diff_to_target(work->target, sctx->job.diff / opt_difficulty);
 }
@@ -848,6 +850,7 @@ static void *miner_thread(void *userdata)
 	uint32_t max_nonce;
 	uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1);
 	unsigned char *scratchbuf = NULL;
+	bool work_done = false;
 	char s[16];
 
 	memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized
@@ -871,6 +874,7 @@ static void *miner_thread(void *userdata)
 
 	while (1) {
 		unsigned long hashes_done;
+		uint32_t start_nonce;
 		struct timeval tv_start, tv_end, diff;
 		int64_t max64;
 		uint64_t umax64;
@@ -880,41 +884,51 @@ static void *miner_thread(void *userdata)
 		int wcmplen = 76;
 		uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 
+		applog(LOG_WARNING, "job %s %08x", g_work.job_id, (*nonceptr));
+
 		if (have_stratum) {
-			while (time(NULL) >= g_work_time + opt_scantime)
+			while (time(NULL) >= (g_work_time + opt_scantime) && !work_done)
 				usleep(500*1000);
+			work_done = false;
 			pthread_mutex_lock(&g_work_lock);
 			nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 			if ((*nonceptr) >= end_nonce)
 				stratum_gen_work(&stratum, &g_work);
 		} else {
 			int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime;
-			if (!opt_quiet)
-			applog(LOG_DEBUG, "have_longpoll=%d, have_stratum=%d, min_scantime=%d, g_work_time=%d",
-				have_longpoll, have_stratum, min_scantime, g_work_time);
 			/* obtain new work from internal workio thread */
 			pthread_mutex_lock(&g_work_lock);
-			if (!have_stratum &&
-			    (time(NULL) - g_work_time >= min_scantime ||
-			     (*nonceptr) >= end_nonce)) {
+			if (time(NULL) - g_work_time >= min_scantime ||
+			     (*nonceptr) >= end_nonce) {
 				if (unlikely(!get_work(mythr, &g_work))) {
 					applog(LOG_ERR, "work retrieval failed, exiting "
 						"mining thread %d", mythr->id);
 					pthread_mutex_unlock(&g_work_lock);
 					goto out;
 				}
-				g_work_time = have_stratum ? 0 : time(NULL);
-			}
-			if (have_stratum) {
-				pthread_mutex_unlock(&g_work_lock);
-				continue;
+				g_work_time = time(NULL);
 			}
 		}
 		if (memcmp(work.data, g_work.data, wcmplen)) {
+	/*
+			applog(LOG_NOTICE, "job %s %08x work change", g_work.job_id, (*nonceptr));
+			for (int n=0; n<wcmplen; n+=8) {
+				if (memcmp(work.data + n, g_work.data + n, 8)) {
+					applog(LOG_ERR, "diff detected at offset %d", n);
+					applog_hash(work.data + n);
+					applog_hash(g_work.data + n);
+				}
+			}
+	*/
 			memcpy(&work, &g_work, sizeof(struct work));
-			(*nonceptr) = 0xffffffffU / opt_n_threads * thr_id; // 0 if single thr
+			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
+	/*	} else if (memcmp(work.target, g_work.target, sizeof(work.target))) {
+			applog(LOG_NOTICE, "job %s %08x target change", g_work.job_id, (*nonceptr));
+			memcpy(work.target, g_work.target, sizeof(work.target));
+			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
+	*/
 		} else
-			(*nonceptr)++;
+			(*nonceptr)++; //??
 		pthread_mutex_unlock(&g_work_lock);
 		work_restart[thr_id].restart = 0;
 
@@ -933,31 +947,56 @@ static void *miner_thread(void *userdata)
 				max64 = 0x1fffLL;
 				break;
 			case ALGO_BLAKE:
-				//max64 = 0x1000000LL;
-				//break;
+				/* based on the 750Ti hashrate */
+				max64 = 0x3ffffffLL;
+				break;
 			default:
 				max64 = 0xfffffLL;
 				break;
 			}
 		}
 
+		start_nonce = *nonceptr;
+
+		/* do not recompute something already scanned */
+		if (opt_algo == ALGO_BLAKE) {
+			union {
+				uint64_t data;
+				uint32_t scanned[2];
+			} range;
+
+			range.data = hashlog_get_scan_range(work.job_id);
+			if (range.data) {
+				if (range.scanned[0] == 1 && range.scanned[1] == 0xFFFFFFFFUL) {
+					applog(LOG_WARNING, "detected a rescan of fully scanned job!");
+				} else if (range.scanned[0] > 0 && range.scanned[1] > 0) {
+					/* continue scan the end */
+					start_nonce = range.scanned[1] + 1;
+					applog(LOG_WARNING, "scan the next part %x + 1", range.scanned[1]);
+				} else if (range.scanned[0] > 1) {
+					/* dont scan the beginning... make loops */
+					//end_nonce = range.scanned[0] - 1;
+					//applog(LOG_WARNING, "scan the missing part 0 -> %x", end_nonce);
+				}
+				if (start_nonce == work.scanned_from) {
+					/* to prevent stales, if last was in the same range */
+					applog(LOG_ERR, "detected a staled job!");
+					//(*nonceptr) = end_nonce + 1;
+					//work_done = true;
+					//continue;
+					start_nonce = range.scanned[1] + 1;
+				}
+			}
+		}
+
 		umax64 = (uint64_t) max64;
-		if (end_nonce < (umax64 + (*nonceptr)))
+		if ((umax64 + start_nonce) >= end_nonce)
 			max_nonce = end_nonce;
 		else
-			max_nonce = (uint32_t) umax64 + (*nonceptr);
-
-		/* do not recompute something already scanned (and sent) ! */
-		if (hashlog_already_submittted(work.job_id, 0)) {
-			uint32_t lastnonce = hashlog_get_last_sent(work.job_id);
-			if ((*nonceptr) < lastnonce && lastnonce <= max_nonce) {
-				applog(LOG_WARNING, "rescan of sent job? nonce=%x, last was %x", (*nonceptr), lastnonce);
-				max_nonce = lastnonce - 1;
-			} else if ((*nonceptr) == lastnonce) {
-				applog(LOG_WARNING, "rescan of sent job? start nonce = lastnonce");
-				(*nonceptr) = lastnonce + 1;
-			}
-		}
+			max_nonce = (uint32_t) umax64 + start_nonce;
+
+		work.scanned_from = start_nonce;
+		(*nonceptr) = start_nonce;
 
 		hashes_done = 0;
 		gettimeofday(&tv_start, NULL);
@@ -1058,6 +1097,10 @@ static void *miner_thread(void *userdata)
 
 		/* record scanhash elapsed time */
 		gettimeofday(&tv_end, NULL);
+
+		if (rc && opt_debug)
+			applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", *nonceptr, swab32(*nonceptr));
+
 		timeval_subtract(&diff, &tv_end, &tv_start);
 		if (diff.tv_usec || diff.tv_sec) {
 			pthread_mutex_lock(&stats_lock);
@@ -1068,7 +1111,7 @@ static void *miner_thread(void *userdata)
 		if (!opt_quiet) {
 			sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.2f",
 				1e-3 * thr_hashrates[thr_id]);
-			applog(LOG_INFO, "GPU #%d: %s, %s khash/s",
+			applog(LOG_INFO, "GPU #%d: %s, %s kH/s",
 				device_map[thr_id], device_name[thr_id], s);
 		}
 		if (opt_benchmark && thr_id == opt_n_threads - 1) {
@@ -1078,10 +1121,19 @@ static void *miner_thread(void *userdata)
 				hashrate += thr_hashrates[i];
 			if (i == opt_n_threads) {
 				sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", hashrate / 1000.);
-				applog(LOG_NOTICE, "Total: %s khash/s", s);
+				applog(LOG_NOTICE, "Total: %s kH/s", s);
 			}
 		}
 
+		if (rc) {
+			work.scanned_to = *nonceptr;
+		} else {
+			work.scanned_to = max_nonce;
+		}
+
+		// could be used to store speeds too..
+		hashlog_remember_scan_range(work.job_id, work.scanned_from, work.scanned_to);
+
 		/* if nonce found, submit work */
 		if (rc && !opt_benchmark && !submit_work(mythr, &work))
 			break;
diff --git a/hashlog.cpp b/hashlog.cpp
index 0b8b574..645fc88 100644
--- a/hashlog.cpp
+++ b/hashlog.cpp
@@ -1,21 +1,23 @@
-//#include <inttypes.h>
 #include <stdlib.h>
+#include <memory.h>
 #include <map>
 
 #include "miner.h"
 
 #define HI_DWORD(u64) ((uint32_t) (u64 >> 32))
 #define LO_DWORD(u64) ((uint32_t) u64)
+#define MK_HI64(u32) (0x100000000ULL * u32)
 
 struct hashlog_data {
 	uint32_t ntime;
 	uint32_t scanned_from;
 	uint32_t scanned_to;
+	uint32_t last_from;
 };
 
 static std::map<uint64_t, hashlog_data> tlastshares;
 
-#define LOG_PURGE_TIMEOUT 15*60
+#define LOG_PURGE_TIMEOUT 5*60
 
 /**
  * str hex to uint32
@@ -27,32 +29,79 @@ static uint64_t hextouint(char* jobid)
 }
 
 /**
- * Store submitted nonces of a job
+ * @return time of a job/nonce submission (or last nonce if nonce is 0)
  */
-extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce, uint64_t range)
+extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce)
 {
+	uint32_t ret = 0;
 	uint64_t njobid = hextouint(jobid);
 	uint64_t key = (njobid << 32) + nonce;
+	if (nonce == 0) {
+		// search last submitted nonce for job
+		ret = hashlog_get_last_sent(jobid);
+	} else if (tlastshares.find(key) != tlastshares.end()) {
+		hashlog_data data = tlastshares[key];
+		ret = data.ntime;
+	}
+	return ret;
+}
+/**
+ * Store submitted nonces of a job
+ */
+extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce)
+{
+	uint64_t njobid = hextouint(jobid);
+	uint64_t keyall = (njobid << 32);
+	uint64_t key = keyall + nonce;
 	struct hashlog_data data;
+
+	data = tlastshares[keyall];
 	data.ntime = (uint32_t) time(NULL);
-	data.scanned_from = LO_DWORD(range);
-	data.scanned_to   = HI_DWORD(range);
 	tlastshares[key] = data;
 }
 
 /**
- * Search last submitted nonce for a job
- * @return max nonce
+ * Update job scanned range
  */
-extern "C" uint32_t hashlog_get_last_sent(char* jobid)
+extern "C" void hashlog_remember_scan_range(char* jobid, uint32_t scanned_from, uint32_t scanned_to)
 {
-	uint32_t ret = 0;
+	uint64_t njobid = hextouint(jobid);
+	uint64_t keyall = (njobid << 32);
+	struct hashlog_data data;
+
+	// global scan range of a job
+	data = tlastshares[keyall];
+	if (hashlog_get_scan_range(jobid) == 0) {
+		memset(&data, 0, sizeof(data));
+	}
+
+	if (data.scanned_from == 0 || scanned_to == (data.scanned_from - 1))
+		data.scanned_from = scanned_from ? scanned_from : 1; // min 1
+	if (data.scanned_to == 0 || scanned_from == data.scanned_to + 1)
+		data.scanned_to = scanned_to;
+
+	data.last_from = scanned_from;
+
+	tlastshares[keyall] = data;
+	applog(LOG_BLUE, "job %s range : %x %x -> %x %x (%x)", jobid,
+		scanned_from, scanned_to, data.scanned_from, data.scanned_to, data.ntime);/* */
+}
+
+/**
+ * Returns the range of a job
+ * @return uint64_t to|from
+ */
+extern "C" uint64_t hashlog_get_scan_range(char* jobid)
+{
+	uint64_t ret = 0;
 	uint64_t njobid = hextouint(jobid);
 	uint64_t keypfx = (njobid << 32);
 	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
 	while (i != tlastshares.end()) {
-		if ((keypfx & i->first) == keypfx && LO_DWORD(i->first) > ret) {
-			ret = LO_DWORD(i->first);
+		if ((keypfx & i->first) == keypfx) {
+			hashlog_data data = i->second;
+			ret = data.scanned_from;
+			ret += MK_HI64(data.scanned_to);
 		}
 		i++;
 	}
@@ -60,21 +109,22 @@ extern "C" uint32_t hashlog_get_last_sent(char* jobid)
 }
 
 /**
- * @return time of a job/nonce submission (or last nonce if nonce is 0)
+ * Search last submitted nonce for a job
+ * @return max nonce
  */
-extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce)
+extern "C" uint32_t hashlog_get_last_sent(char* jobid)
 {
-	uint32_t ret = 0;
+	uint32_t nonce = 0;
 	uint64_t njobid = hextouint(jobid);
-	uint64_t key = (njobid << 32) + nonce;
-	if (nonce == 0) {
-		// search last submitted nonce for job
-		ret = hashlog_get_last_sent(jobid);
-	} else if (tlastshares.find(key) != tlastshares.end()) {
-		hashlog_data data = tlastshares[key];
-		ret = data.ntime;
+	uint64_t keypfx = (njobid << 32);
+	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
+	while (i != tlastshares.end()) {
+		if ((keypfx & i->first) == keypfx && i->second.ntime > 0) {
+			nonce = LO_DWORD(i->first);
+		}
+		i++;
 	}
-	return ret;
+	return nonce;
 }
 
 /**
diff --git a/miner.h b/miner.h
index 79e3a15..5f9e8ac 100644
--- a/miner.h
+++ b/miner.h
@@ -391,9 +391,11 @@ bool stratum_subscribe(struct stratum_ctx *sctx);
 bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
 bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
 
-void hashlog_remember_submit(char* jobid, uint32_t nounce, uint64_t range);
+void hashlog_remember_submit(char* jobid, uint32_t nounce);
+void hashlog_remember_scan_range(char* jobid, uint32_t scanned_from, uint32_t scanned_to);
 uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce);
 uint32_t hashlog_get_last_sent(char* jobid);
+uint64_t hashlog_get_scan_range(char* jobid);
 void hashlog_purge_old(void);
 void hashlog_purge_job(char* jobid);
 void hashlog_purge_all(void);
diff --git a/util.c b/util.c
index 73a1847..5567459 100644
--- a/util.c
+++ b/util.c
@@ -559,7 +559,7 @@ bool fulltest(const uint32_t *hash, const uint32_t *target)
 		}
 	}
 
-	if (!rc || opt_debug) {
+	if (!rc && opt_debug) {
 		uint32_t hash_be[8], target_be[8];
 		char *hash_str, *target_str;
 		

From 806c3e8691215d15fa1dc3f56ba5fcbb6bc21291 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 4 Sep 2014 08:55:00 +0200
Subject: [PATCH 19/44] enhance double scan checks

---
 blake32.cu  | 19 +++++++----
 cpu-miner.c | 93 ++++++++++++++++++++++++++++++++---------------------
 hashlog.cpp | 89 +++++++++++++++++++++++++++++++++++++++-----------
 miner.h     |  1 +
 util.c      |  2 +-
 5 files changed, 141 insertions(+), 63 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index 2b63ccf..ccba68a 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -29,8 +29,9 @@ extern "C" void blake32hash(void *output, const void *input)
 #include "cuda_helper.h"
 
 // in cpu-miner.c
+extern bool opt_n_threads;
 extern bool opt_benchmark;
-extern bool opt_debug;
+//extern bool opt_debug;
 extern int device_map[8];
 
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@@ -279,7 +280,9 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 		((uint32_t*)ptarget)[7] = 0x00000f;
 
 	if (!init[thr_id]) {
-		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
+		if (opt_n_threads > 1) {
+			CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
+		}
 		CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t)));
 		CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t)));
 		init[thr_id] = true;
@@ -288,11 +291,6 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 	if (throughput < (TPB * 2048))
 		applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce);
 
-	if (max_nonce < first_nonce) {
-		applog(LOG_ERR, "start=%x > end=%x !", first_nonce, max_nonce);
-		return 0;
-	}
-
 	blake256_cpu_setBlock_80(pdata, (void*)ptarget);
 
 	do {
@@ -340,5 +338,12 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 
 exit_scan:
 	*hashes_done = pdata[19] - first_nonce + 1;
+	// reset the device to allow multiple instances
+	if (opt_n_threads == 1) {
+		CUDA_SAFE_CALL(cudaDeviceReset());
+		init[thr_id] = false;
+	}
+	// wait proper end of all threads
+	cudaDeviceSynchronize();
 	return rc;
 }
diff --git a/cpu-miner.c b/cpu-miner.c
index cb4d365..3f772c9 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -187,7 +187,7 @@ static int opt_scantime = 5;
 static json_t *opt_config;
 static const bool opt_time = true;
 static sha256_algos opt_algo = ALGO_HEAVY;
-static int opt_n_threads = 0;
+int opt_n_threads = 0;
 static double opt_difficulty = 1; // CH
 bool opt_trust_pool = false;
 uint16_t opt_vote = 9999;
@@ -411,11 +411,11 @@ err_out:
 	return false;
 }
 
-static void share_result(int result, const char *reason)
+static int share_result(int result, const char *reason)
 {
 	char s[345];
 	double hashrate;
-	int i;
+	int i, ret = 0;
 
 	hashrate = 0.;
 	pthread_mutex_lock(&stats_lock);
@@ -434,9 +434,15 @@ static void share_result(int result, const char *reason)
 				(result ? CL_GRN "yay!!!" : CL_RED "booooo")
 			:	(result ? "(yay!!!)" : "(booooo)"));
 
-	if (reason) {
+	if (reason && !opt_quiet) {
 		applog(LOG_WARNING, "reject reason: %s", reason);
+		if (strncmp(reason, "low difficulty share", 20) == 0) {
+			opt_difficulty = (opt_difficulty * 2.0) / 3.0;
+			applog(LOG_WARNING, "factor reduced to : %0.2f", opt_difficulty);
+			return 0;
+		}
 	}
+	return 1;
 }
 
 static bool submit_upstream_work(CURL *curl, struct work *work)
@@ -472,8 +478,10 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		sent = hashlog_already_submittted(work->job_id, nonce);
 		if (sent > 0) {
 			sent = (uint32_t) time(NULL) - sent;
-			if (!opt_quiet)
+			if (!opt_quiet) {
 				applog(LOG_WARNING, "skip submit, nonce %s was already sent %u seconds ago", noncestr, sent);
+				hashlog_dump_job(work->job_id);
+			}
 			rc = true;
 			goto out;
 		}
@@ -481,11 +489,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		if (opt_algo == ALGO_HEAVY) {
 			sprintf(s,
 				"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-				rpc_user, work->job_id, xnonce2str, ntimestr, noncestr, nvotestr);
+				rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr);
 		} else {
 			sprintf(s,
 				"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-				rpc_user, work->job_id, xnonce2str, ntimestr, noncestr);
+				rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr);
 		}
 		free(ntimestr);
 		free(noncestr);
@@ -528,7 +536,8 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 
 		res = json_object_get(val, "result");
 		reason = json_object_get(val, "reject-reason");
-		share_result(json_is_true(res), reason ? json_string_value(reason) : NULL);
+		if (!share_result(json_is_true(res), reason ? json_string_value(reason) : NULL))
+			hashlog_purge_job(work->job_id);
 
 		json_decref(val);
 	}
@@ -768,7 +777,9 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 
 	pthread_mutex_lock(&sctx->work_lock);
 
-	strcpy(work->job_id, sctx->job.job_id);
+	// store the job ntime as high part of jobid
+	snprintf(work->job_id, sizeof(work->job_id), "%07x %s",
+		be32dec(sctx->job.ntime) & 0xfffffff, sctx->job.job_id);
 	work->xnonce2_len = sctx->xnonce2_size;
 	memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size);
 
@@ -884,8 +895,6 @@ static void *miner_thread(void *userdata)
 		int wcmplen = 76;
 		uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 
-		applog(LOG_WARNING, "job %s %08x", g_work.job_id, (*nonceptr));
-
 		if (have_stratum) {
 			while (time(NULL) >= (g_work_time + opt_scantime) && !work_done)
 				usleep(500*1000);
@@ -909,29 +918,34 @@ static void *miner_thread(void *userdata)
 				g_work_time = time(NULL);
 			}
 		}
-		if (memcmp(work.data, g_work.data, wcmplen)) {
-	/*
-			applog(LOG_NOTICE, "job %s %08x work change", g_work.job_id, (*nonceptr));
-			for (int n=0; n<wcmplen; n+=8) {
-				if (memcmp(work.data + n, g_work.data + n, 8)) {
-					applog(LOG_ERR, "diff detected at offset %d", n);
-					applog_hash(work.data + n);
-					applog_hash(g_work.data + n);
+		if (memcmp(work.data, g_work.data, 72)) { // wcmplen)) {
+			if (opt_debug) {
+				applog(LOG_DEBUG, "job %s %08x work updated", g_work.job_id, (*nonceptr));
+				for (int n=0; n<wcmplen; n+=8) {
+					if (memcmp(work.data + n, g_work.data + n, 8)) {
+						applog(LOG_DEBUG, "diff detected at offset %d", n);
+						applog_hash((uint8_t*) work.data + n);
+						applog_hash((uint8_t*) g_work.data + n);
+					}
 				}
 			}
-	*/
 			memcpy(&work, &g_work, sizeof(struct work));
 			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
-	/*	} else if (memcmp(work.target, g_work.target, sizeof(work.target))) {
-			applog(LOG_NOTICE, "job %s %08x target change", g_work.job_id, (*nonceptr));
+		} else if (memcmp(work.target, g_work.target, sizeof(work.target))) {
+			if (opt_debug) {
+				applog(LOG_DEBUG, "job %s %08x target change", g_work.job_id, (*nonceptr));
+				applog_hash((uint8_t*) work.target);
+				applog_hash((uint8_t*) g_work.target);
+			}
 			memcpy(work.target, g_work.target, sizeof(work.target));
 			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
-	*/
 		} else
 			(*nonceptr)++; //??
 		pthread_mutex_unlock(&g_work_lock);
 		work_restart[thr_id].restart = 0;
 
+		applog(LOG_WARNING, "job %s %08x", g_work.job_id, (*nonceptr));
+
 		/* adjust max_nonce to meet target scan time */
 		if (have_stratum)
 			max64 = LP_SCANTIME;
@@ -959,7 +973,7 @@ static void *miner_thread(void *userdata)
 		start_nonce = *nonceptr;
 
 		/* do not recompute something already scanned */
-		if (opt_algo == ALGO_BLAKE) {
+		if (opt_algo == ALGO_BLAKE && opt_n_threads == 1) {
 			union {
 				uint64_t data;
 				uint32_t scanned[2];
@@ -967,24 +981,29 @@ static void *miner_thread(void *userdata)
 
 			range.data = hashlog_get_scan_range(work.job_id);
 			if (range.data) {
+				bool stall = false;
 				if (range.scanned[0] == 1 && range.scanned[1] == 0xFFFFFFFFUL) {
 					applog(LOG_WARNING, "detected a rescan of fully scanned job!");
-				} else if (range.scanned[0] > 0 && range.scanned[1] > 0) {
+				} else if (range.scanned[0] > 0 && range.scanned[1] > 0 && range.scanned[1] < 0xFFFFFFF0UL) {
 					/* continue scan the end */
 					start_nonce = range.scanned[1] + 1;
-					applog(LOG_WARNING, "scan the next part %x + 1", range.scanned[1]);
-				} else if (range.scanned[0] > 1) {
-					/* dont scan the beginning... make loops */
-					//end_nonce = range.scanned[0] - 1;
-					//applog(LOG_WARNING, "scan the missing part 0 -> %x", end_nonce);
+					//applog(LOG_DEBUG, "scan the next part %x + 1 (%x-%x)", range.scanned[1], range.scanned[0], range.scanned[1]);
 				}
-				if (start_nonce == work.scanned_from) {
-					/* to prevent stales, if last was in the same range */
-					applog(LOG_ERR, "detected a staled job!");
-					//(*nonceptr) = end_nonce + 1;
-					//work_done = true;
-					//continue;
-					start_nonce = range.scanned[1] + 1;
+
+				stall = (start_nonce == work.scanned_from && end_nonce == work.scanned_to);
+				stall |= (start_nonce == work.scanned_from && start_nonce == range.scanned[1] + 1);
+				stall |= (start_nonce > range.scanned[0] && start_nonce < range.scanned[1]);
+
+				if (stall) {
+					if (opt_algo)
+						applog(LOG_DEBUG, "job done, wait for a new one...");
+					work_restart[thr_id].restart = 1;
+					hashlog_purge_old();
+					// wait a bit for a new job...
+					usleep(1500*1000);
+					(*nonceptr) = end_nonce + 1;
+					work_done = true;
+					continue;
 				}
 			}
 		}
diff --git a/hashlog.cpp b/hashlog.cpp
index 645fc88..b069d26 100644
--- a/hashlog.cpp
+++ b/hashlog.cpp
@@ -9,10 +9,12 @@
 #define MK_HI64(u32) (0x100000000ULL * u32)
 
 struct hashlog_data {
-	uint32_t ntime;
+	uint32_t tm_sent;
 	uint32_t scanned_from;
 	uint32_t scanned_to;
 	uint32_t last_from;
+	uint32_t tm_add;
+	uint32_t tm_upd;
 };
 
 static std::map<uint64_t, hashlog_data> tlastshares;
@@ -41,7 +43,7 @@ extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce)
 		ret = hashlog_get_last_sent(jobid);
 	} else if (tlastshares.find(key) != tlastshares.end()) {
 		hashlog_data data = tlastshares[key];
-		ret = data.ntime;
+		ret = data.tm_sent;
 	}
 	return ret;
 }
@@ -56,7 +58,9 @@ extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce)
 	struct hashlog_data data;
 
 	data = tlastshares[keyall];
-	data.ntime = (uint32_t) time(NULL);
+	data.tm_upd = data.tm_sent = (uint32_t) time(NULL);
+	if (data.tm_add == 0)
+		data.tm_add = data.tm_upd;
 	tlastshares[key] = data;
 }
 
@@ -67,24 +71,38 @@ extern "C" void hashlog_remember_scan_range(char* jobid, uint32_t scanned_from,
 {
 	uint64_t njobid = hextouint(jobid);
 	uint64_t keyall = (njobid << 32);
+	uint64_t range = hashlog_get_scan_range(jobid);
 	struct hashlog_data data;
 
 	// global scan range of a job
 	data = tlastshares[keyall];
-	if (hashlog_get_scan_range(jobid) == 0) {
+	if (range == 0) {
 		memset(&data, 0, sizeof(data));
+	} else {
+		// get min and max from all sent records
+		data.scanned_from = LO_DWORD(range);
+		data.scanned_to   = HI_DWORD(range);
 	}
 
-	if (data.scanned_from == 0 || scanned_to == (data.scanned_from - 1))
-		data.scanned_from = scanned_from ? scanned_from : 1; // min 1
-	if (data.scanned_to == 0 || scanned_from == data.scanned_to + 1)
-		data.scanned_to = scanned_to;
+	if (data.tm_add == 0)
+		data.tm_add = (uint32_t) time(NULL);
 
 	data.last_from = scanned_from;
 
+	if (scanned_from < scanned_to) {
+		if (data.scanned_from == 0)
+			data.scanned_from = scanned_from ? scanned_from : 1; // min 1
+		else if (scanned_from < data.scanned_from) // || scanned_to == (data.scanned_from - 1)
+			data.scanned_from = scanned_from;
+		if (data.scanned_to == 0 || scanned_from == data.scanned_to + 1)
+			data.scanned_to = scanned_to;
+	}
+
+	data.tm_upd = (uint32_t) time(NULL);
+
 	tlastshares[keyall] = data;
-	applog(LOG_BLUE, "job %s range : %x %x -> %x %x (%x)", jobid,
-		scanned_from, scanned_to, data.scanned_from, data.scanned_to, data.ntime);/* */
+	applog(LOG_BLUE, "job %s range : %x %x -> %x %x", jobid,
+		scanned_from, scanned_to, data.scanned_from, data.scanned_to);/* */
 }
 
 /**
@@ -96,15 +114,21 @@ extern "C" uint64_t hashlog_get_scan_range(char* jobid)
 	uint64_t ret = 0;
 	uint64_t njobid = hextouint(jobid);
 	uint64_t keypfx = (njobid << 32);
+	struct hashlog_data data;
+	data.scanned_from = 0;
+	data.scanned_to = 0;
 	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
 	while (i != tlastshares.end()) {
-		if ((keypfx & i->first) == keypfx) {
-			hashlog_data data = i->second;
-			ret = data.scanned_from;
-			ret += MK_HI64(data.scanned_to);
+		if ((keypfx & i->first) == keypfx && i->second.scanned_to > ret) {
+			if (i->second.scanned_to > data.scanned_to)
+				data.scanned_to = i->second.scanned_to;
+			if (i->second.scanned_from < data.scanned_from || data.scanned_from == 0)
+				data.scanned_from = i->second.scanned_from;
 		}
 		i++;
 	}
+	ret = data.scanned_from;
+	ret += MK_HI64(data.scanned_to);
 	return ret;
 }
 
@@ -119,7 +143,7 @@ extern "C" uint32_t hashlog_get_last_sent(char* jobid)
 	uint64_t keypfx = (njobid << 32);
 	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
 	while (i != tlastshares.end()) {
-		if ((keypfx & i->first) == keypfx && i->second.ntime > 0) {
+		if ((keypfx & i->first) == keypfx && i->second.tm_sent > 0) {
 			nonce = LO_DWORD(i->first);
 		}
 		i++;
@@ -128,18 +152,25 @@ extern "C" uint32_t hashlog_get_last_sent(char* jobid)
 }
 
 /**
- * Remove entries of a job... not used yet
+ * Remove entries of a job...
  */
 extern "C" void hashlog_purge_job(char* jobid)
 {
+	int deleted = 0;
 	uint64_t njobid = hextouint(jobid);
 	uint64_t keypfx = (njobid << 32);
+	uint32_t sz = tlastshares.size();
 	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
 	while (i != tlastshares.end()) {
-		if ((keypfx & i->first) == keypfx)
+		if ((keypfx & i->first) == keypfx) {
+			deleted++;
 			tlastshares.erase(i);
+		}
 		i++;
 	}
+	if (opt_debug && deleted) {
+		applog(LOG_DEBUG, "hashlog: purge job %s, del %d/%d", jobid, deleted, sz);
+	}
 }
 
 /**
@@ -152,7 +183,7 @@ extern "C" void hashlog_purge_old(void)
 	uint32_t sz = tlastshares.size();
 	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
 	while (i != tlastshares.end()) {
-		if ((now - i->second.ntime) > LOG_PURGE_TIMEOUT) {
+		if ((now - i->second.tm_sent) > LOG_PURGE_TIMEOUT) {
 			deleted++;
 			tlastshares.erase(i);
 		}
@@ -170,3 +201,25 @@ extern "C" void hashlog_purge_all(void)
 {
 	tlastshares.clear();
 }
+
+
+/**
+ * Can be used to debug...
+ */
+extern "C" void hashlog_dump_job(char* jobid)
+{
+	int deleted = 0;
+	uint64_t njobid = hextouint(jobid);
+	uint64_t keypfx = (njobid << 32);
+	uint32_t sz = tlastshares.size();
+	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
+	while (i != tlastshares.end()) {
+		if ((keypfx & i->first) == keypfx) {
+			applog(LOG_BLUE, "job %s range : %x %x %s added %x upd %x", jobid,
+				i->second.scanned_from, i->second.scanned_to,
+				i->second.tm_sent ? "sent" : "",
+				i->second.tm_add, i->second.tm_upd);/* */
+		}
+		i++;
+	}
+}
\ No newline at end of file
diff --git a/miner.h b/miner.h
index 5f9e8ac..098b6d5 100644
--- a/miner.h
+++ b/miner.h
@@ -399,6 +399,7 @@ uint64_t hashlog_get_scan_range(char* jobid);
 void hashlog_purge_old(void);
 void hashlog_purge_job(char* jobid);
 void hashlog_purge_all(void);
+void hashlog_dump_job(char* jobid);
 
 struct thread_q;
 
diff --git a/util.c b/util.c
index 5567459..9afc308 100644
--- a/util.c
+++ b/util.c
@@ -115,7 +115,7 @@ void applog(int prio, const char *fmt, ...)
 			case LOG_WARNING: color = CL_YLW; break;
 			case LOG_NOTICE:  color = CL_WHT; break;
 			case LOG_INFO:    color = ""; break;
-			case LOG_DEBUG:   color = ""; break;
+			case LOG_DEBUG:   color = CL_SIL; break;
 
 			case LOG_BLUE:
 				prio = LOG_NOTICE;

From 415945eb201f11ccdd04c576b374c48360ed623c Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 4 Sep 2014 11:34:45 +0200
Subject: [PATCH 20/44] Makefile: use the CUDA_CFLAGS var

---
 Makefile.am  | 18 +++++++++---------
 build.sh     |  2 +-
 configure.ac |  4 ++--
 configure.sh | 10 +++++++++-
 4 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 520dff0..875f8b1 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -53,33 +53,33 @@ nvcc_ARCH  = -gencode=arch=compute_50,code=\"sm_50,compute_50\"
 #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
 #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
 
-nvcc_FLAGS = $(nvcc_ARCH) -I . --ptxas-options=-v --use_fast_math
+nvcc_FLAGS = $(nvcc_ARCH) -I . @CUDA_CFLAGS@
 nvcc_FLAGS += $(JANSSON_INCLUDES)
 
 # we're now targeting all major compute architectures within one binary.
 .cu.o:
-	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=128 -o $@ -c $<
 
 blake32.o: blake32.cu
-	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=64 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
 
 # Luffa and Echo are faster with 80 registers than 128
 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
-	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
 x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
-	$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
 # Shavite compiles faster with 128 regs
 x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
-	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ --maxrregcount=128 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=128 -o $@ -c $<
 
 x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
-	$(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=80 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
 # ABI requiring code modules
 quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
-	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
 
 JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
-	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $<
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
diff --git a/build.sh b/build.sh
index 2905734..17935f3 100755
--- a/build.sh
+++ b/build.sh
@@ -4,7 +4,7 @@
 
 # export PATH="$PATH:/usr/local/cuda/bin/"
 
-#make distclean || echo clean
+make distclean || echo clean
 
 rm -f Makefile.in
 rm -f config.status
diff --git a/configure.ac b/configure.ac
index 2f52cdf..f7924d4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -144,12 +144,12 @@ AC_ARG_WITH([cuda],
 
 if test -n "$with_cuda"
 then
-   CUDA_CFLAGS="-I$with_cuda/include"
+   CUDA_CFLAGS="-I$with_cuda/include $CUDA_CFLAGS"
    CUDA_LIBS="-lcudart"
    CUDA_LDFLAGS="-L$with_cuda/lib$SUFFIX"
    NVCC="$with_cuda/bin/nvcc"
 else
-   CUDA_CFLAGS="-I/usr/local/cuda/include"
+   CUDA_CFLAGS="-I/usr/local/cuda/include $CUDA_CFLAGS"
    CUDA_LIBS="-lcudart -static-libstdc++"
    CUDA_LDFLAGS="-L/usr/local/cuda/lib$SUFFIX"
    NVCC="nvcc"
diff --git a/configure.sh b/configure.sh
index c0cdd0d..142b59e 100755
--- a/configure.sh
+++ b/configure.sh
@@ -1 +1,9 @@
-./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda
+# possible additional CUDA_CFLAGS
+#-gencode=arch=compute_50,code=\"sm_50,compute_50\"
+#-gencode=arch=compute_35,code=\"sm_35,compute_35\"
+#-gencode=arch=compute_30,code=\"sm_30,compute_30\"
+
+#--ptxas-options=\"-v -dlcm=cg\""
+
+CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda
+

From 2ebfb546a60f547b76549323f8babffeb3c6429d Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 4 Sep 2014 12:41:49 +0200
Subject: [PATCH 21/44] clean extra logs, show block height on new jobs

---
 configure.sh |  2 +-
 cpu-miner.c  | 17 +++++++++++------
 hashlog.cpp  |  4 ++--
 miner.h      |  1 +
 util.c       |  3 +++
 5 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/configure.sh b/configure.sh
index 142b59e..9c8b021 100755
--- a/configure.sh
+++ b/configure.sh
@@ -5,5 +5,5 @@
 
 #--ptxas-options=\"-v -dlcm=cg\""
 
-CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda
+CUDA_CFLAGS="-O2" ./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda
 
diff --git a/cpu-miner.c b/cpu-miner.c
index 3f772c9..d0dc418 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -918,9 +918,9 @@ static void *miner_thread(void *userdata)
 				g_work_time = time(NULL);
 			}
 		}
-		if (memcmp(work.data, g_work.data, 72)) { // wcmplen)) {
+		if (memcmp(work.data, g_work.data, wcmplen)) {
 			if (opt_debug) {
-				applog(LOG_DEBUG, "job %s %08x work updated", g_work.job_id, (*nonceptr));
+				applog(LOG_DEBUG, "job %s work updated", g_work.job_id);
 				for (int n=0; n<wcmplen; n+=8) {
 					if (memcmp(work.data + n, g_work.data + n, 8)) {
 						applog(LOG_DEBUG, "diff detected at offset %d", n);
@@ -933,7 +933,7 @@ static void *miner_thread(void *userdata)
 			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
 		} else if (memcmp(work.target, g_work.target, sizeof(work.target))) {
 			if (opt_debug) {
-				applog(LOG_DEBUG, "job %s %08x target change", g_work.job_id, (*nonceptr));
+				applog(LOG_DEBUG, "job %s target change", g_work.job_id);
 				applog_hash((uint8_t*) work.target);
 				applog_hash((uint8_t*) g_work.target);
 			}
@@ -944,7 +944,8 @@ static void *miner_thread(void *userdata)
 		pthread_mutex_unlock(&g_work_lock);
 		work_restart[thr_id].restart = 0;
 
-		applog(LOG_WARNING, "job %s %08x", g_work.job_id, (*nonceptr));
+		if (opt_debug)
+			applog(LOG_WARNING, "job %s %08x", g_work.job_id, (*nonceptr));
 
 		/* adjust max_nonce to meet target scan time */
 		if (have_stratum)
@@ -1328,16 +1329,20 @@ static void *stratum_thread(void *userdata)
 		}
 
 		if (stratum.job.job_id &&
-		    (strcmp(stratum.job.job_id, g_work.job_id) || !g_work_time)) {
+		    (!g_work_time || strncmp(stratum.job.job_id, g_work.job_id + 8, 120))) {
 			pthread_mutex_lock(&g_work_lock);
 			stratum_gen_work(&stratum, &g_work);
 			time(&g_work_time);
 			pthread_mutex_unlock(&g_work_lock);
 			if (stratum.job.clean) {
 				if (!opt_quiet)
-					applog(LOG_BLUE, "%s send a new %s job", short_url, algo_names[opt_algo]);
+					applog(LOG_BLUE, "%s requested %s job %d restart, block %d", short_url, algo_names[opt_algo],
+						strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
 				restart_threads();
 				hashlog_purge_old();
+			} else if (!opt_quiet) {
+					applog(LOG_BLUE, "%s send %s job %d, block %d", short_url, algo_names[opt_algo],
+						strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
 			}
 		}
 		
diff --git a/hashlog.cpp b/hashlog.cpp
index b069d26..06720b0 100644
--- a/hashlog.cpp
+++ b/hashlog.cpp
@@ -101,8 +101,8 @@ extern "C" void hashlog_remember_scan_range(char* jobid, uint32_t scanned_from,
 	data.tm_upd = (uint32_t) time(NULL);
 
 	tlastshares[keyall] = data;
-	applog(LOG_BLUE, "job %s range : %x %x -> %x %x", jobid,
-		scanned_from, scanned_to, data.scanned_from, data.scanned_to);/* */
+/* 	applog(LOG_BLUE, "job %s range : %x %x -> %x %x", jobid,
+		scanned_from, scanned_to, data.scanned_from, data.scanned_to); */
 }
 
 /**
diff --git a/miner.h b/miner.h
index 098b6d5..5e36442 100644
--- a/miner.h
+++ b/miner.h
@@ -380,6 +380,7 @@ struct stratum_ctx {
 	pthread_mutex_t work_lock;
 
 	int srvtime_diff;
+	int bloc_height;
 };
 
 bool stratum_socket_full(struct stratum_ctx *sctx, int timeout);
diff --git a/util.c b/util.c
index 9afc308..7927d18 100644
--- a/util.c
+++ b/util.c
@@ -1069,10 +1069,13 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	coinb2_size = strlen(coinb2) / 2;
 	sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size +
 	                          sctx->xnonce2_size + coinb2_size;
+
 	sctx->job.coinbase = (unsigned char*)realloc(sctx->job.coinbase, sctx->job.coinbase_size);
 	sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size;
 	hex2bin(sctx->job.coinbase, coinb1, coinb1_size);
 	memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size);
+
+	sctx->bloc_height = le16dec((uint8_t*) sctx->job.coinbase + 43);
 	if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id))
 		memset(sctx->job.xnonce2, 0, sctx->xnonce2_size);
 	hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size);

From 3341e0324ff083112c9e4af8d7b7112a7243b9ee Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 4 Sep 2014 16:17:11 +0200
Subject: [PATCH 22/44] blake: speed +10%, no more size conversions

---
 blake32.cu  | 91 +++++++++++++++++++++++++----------------------------
 cpu-miner.c |  2 +-
 2 files changed, 44 insertions(+), 49 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index ccba68a..e0e6814 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -31,44 +31,45 @@ extern "C" void blake32hash(void *output, const void *input)
 // in cpu-miner.c
 extern bool opt_n_threads;
 extern bool opt_benchmark;
-//extern bool opt_debug;
 extern int device_map[8];
 
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 
 __constant__
-static uint32_t c_Target[8];
+static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes + padding)
 
 __constant__
-static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes + padding)
+static uint32_t __align__(32) c_Target[8];
+
+#define MAXU 0xffffffffU
 
 static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];
 
 __constant__
-static uint8_t c_sigma[16][16];
-const uint8_t host_sigma[16][16] =
-{
-  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-  {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-  {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
-  {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
-  {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
-  { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
-  {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
-  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
-  {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
-  {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
-  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
-  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
-  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
+static uint32_t __align__(32) c_sigma[16][16];
+/* prefer uint32_t to prevent size conversions = speed +5/10 % */
+const uint32_t host_sigma[16][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
 };
 
 __device__ __constant__
-static const uint32_t c_IV256[8] = {
+static const uint32_t __align__(32) c_IV256[8] = {
 	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
 	SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
 	SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
@@ -76,8 +77,7 @@ static const uint32_t c_IV256[8] = {
 };
 
 __device__ __constant__
-
-static const uint32_t c_u256[16] = {
+static const uint32_t __align__(32) c_u256[16] = {
 	SPH_C32(0x243F6A88), SPH_C32(0x85A308D3),
 	SPH_C32(0x13198A2E), SPH_C32(0x03707344),
 	SPH_C32(0xA4093822), SPH_C32(0x299F31D0),
@@ -112,13 +112,15 @@ static const uint32_t c_u256[16] = {
 } while (0)
 #endif
 
-#define GS(a,b,c,d,e) { \
-	v[a] += (m[sigma[i][e]] ^ u256[sigma[i][e+1]]) + v[b]; \
-	v[d] = SPH_ROTR32(v[d] ^ v[a], 16); \
+#define GS(a,b,c,d,x) { \
+	const uint32_t idx1 = c_sigma[i][x]; \
+	const uint32_t idx2 = c_sigma[i][x+1]; \
+	v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \
+	v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \
 	v[c] += v[d]; \
 	v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
 \
-	v[a] += (m[sigma[i][e+1]] ^ u256[sigma[i][e]]) + v[b]; \
+	v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \
 	v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \
 	v[c] += v[d]; \
 	v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
@@ -127,11 +129,13 @@ static const uint32_t c_u256[16] = {
 #define BLAKE256_ROUNDS 14
 
 __device__ static
-void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), const uint32_t *u256, const uint32_t T0, uint8_t nullt = 1)
+void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0)
 {
 	uint32_t /* __align__(8) */ v[16];
 	uint32_t /* __align__(8) */ m[16];
 
+	const uint32_t* u256 = c_u256;
+
 	//#pragma unroll
 	for (int i = 0; i < 16; ++i) {
 		m[i] = block[i];
@@ -170,16 +174,6 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con
 		h[i % 8] ^= v[i];
 }
 
-#if __CUDA_ARCH__ >= 200
-/* memory should be aligned to use __nvvm_memset */
-#if (__NV_POINTER_SIZE == 64)
-# define SZCT uint64_t
-#else
-# define SZCT uint32_t
-#endif
-extern __device__ __device_builtin__ void __nvvm_memset(uint8_t *, unsigned char, SZCT, int);
-#endif
-
 __global__
 void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
 {
@@ -194,7 +188,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 		for(int i=0; i<8; i++)
 			h[i] = c_IV256[i];
 
-		blake256_compress(h, c_PaddedMessage80, c_sigma, c_u256, 0x200); /* 512 = 0x200 */
+		blake256_compress(h, c_PaddedMessage80, 0x200); /* 512 = 0x200 */
 
 		// ------ Close: Bytes 64 to 80 ------ 
 
@@ -217,7 +211,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 		msg[14] = 0;
 		msg[15] = 0x280;
 
-		blake256_compress(h, msg, c_sigma, c_u256, 0x280);
+		blake256_compress(h, msg, 0x280);
 
 		for (int i = 7; i >= 0; i--) {
 			uint32_t hash = cuda_swab32(h[i]);
@@ -239,17 +233,18 @@ __host__
 uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce)
 {
 	const int threadsperblock = TPB;
+	uint32_t result = MAXU;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 	size_t shared_size = 0;
 
-	uint32_t result = 0xffffffffU;
-	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
+	/* Check error on Ctrl+C or kill to prevent segfaults on exit */
+	if (cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)) != cudaSuccess)
+		return result;
 
 	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id]);
-	MyStreamSynchronize(NULL, 1, thr_id);
-
+	cudaDeviceSynchronize();
 	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		cudaThreadSynchronize();
 		result = *h_resNounce[thr_id];
@@ -258,7 +253,7 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
 }
 
 __host__
-void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget)
+void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget)
 {
 	uint32_t PaddedMessage[32];
 	memcpy(PaddedMessage, pdata, 80);
@@ -291,7 +286,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 	if (throughput < (TPB * 2048))
 		applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce);
 
-	blake256_cpu_setBlock_80(pdata, (void*)ptarget);
+	blake256_cpu_setBlock_80(pdata, ptarget);
 
 	do {
 		// GPU HASH
diff --git a/cpu-miner.c b/cpu-miner.c
index d0dc418..dc86a3e 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -962,7 +962,7 @@ static void *miner_thread(void *userdata)
 				max64 = 0x1fffLL;
 				break;
 			case ALGO_BLAKE:
-				/* based on the 750Ti hashrate */
+				/* based on the 750Ti hashrate (100kH) */
 				max64 = 0x3ffffffLL;
 				break;
 			default:

From 746398f435a03b96651efa914e1935a2cfa63d34 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 4 Sep 2014 17:34:30 +0200
Subject: [PATCH 23/44] blake: fix reduced speed on Windows (keep uint8_t sigma)

---
 blake32.cu | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index e0e6814..8be5205 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -47,9 +47,16 @@ static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];
 
 __constant__
-static uint32_t __align__(32) c_sigma[16][16];
+#ifdef WIN32
+/* what the fuck ! */
+static uint8_t c_sigma[16][16];
+const uint8_t host_sigma[16][16] =
+#else
 /* prefer uint32_t to prevent size conversions = speed +5/10 % */
-const uint32_t host_sigma[16][16] = {
+static uint32_t __align__(32) c_sigma[16][16];
+const uint32_t host_sigma[16][16]
+#endif
+= {
 	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
 	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
 	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },

From 9bf927a496e5aead1d9669c58887bec67d85ed6d Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 4 Sep 2014 18:18:53 +0200
Subject: [PATCH 24/44] hashlog: fix iterator invalidation when erasing in loop

---
 hashlog.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hashlog.cpp b/hashlog.cpp
index 06720b0..811fed2 100644
--- a/hashlog.cpp
+++ b/hashlog.cpp
@@ -164,9 +164,9 @@ extern "C" void hashlog_purge_job(char* jobid)
 	while (i != tlastshares.end()) {
 		if ((keypfx & i->first) == keypfx) {
 			deleted++;
-			tlastshares.erase(i);
+			tlastshares.erase(i++);
 		}
-		i++;
+		else ++i;
 	}
 	if (opt_debug && deleted) {
 		applog(LOG_DEBUG, "hashlog: purge job %s, del %d/%d", jobid, deleted, sz);
@@ -185,9 +185,9 @@ extern "C" void hashlog_purge_old(void)
 	while (i != tlastshares.end()) {
 		if ((now - i->second.tm_sent) > LOG_PURGE_TIMEOUT) {
 			deleted++;
-			tlastshares.erase(i);
+			tlastshares.erase(i++);
 		}
-		i++;
+		else ++i;
 	}
 	if (opt_debug && deleted) {
 		applog(LOG_DEBUG, "hashlog: %d/%d purged", deleted, sz);

From 033fb5745c3c4d9eae20b10690780d797a70dbcb Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 4 Sep 2014 20:04:55 +0200
Subject: [PATCH 25/44] Release v1.4 with blake

---
 README.md   |  3 +--
 hashlog.cpp | 29 +++++++++++++++--------------
 miner.h     |  4 ++--
 util.c      |  2 +-
 4 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 2a2485b..d3836eb 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@ ccminer
 Christian Buchner's &amp; Christian H.'s CUDA miner project
 
 Fork by tpruvot@github with X14,X15,X17,WHIRL and Blake256 support
+
    BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
    [![tip for next commit](https://tip4commit.com/projects/927.svg)](https://tip4commit.com/github/tpruvot/ccminer)
 
@@ -26,8 +27,6 @@ This project requires some libraries to be built :
 
 - pthreads
 
-- [mpir math library](http://www.mpir.org)
-
 You can download prebuilt .lib and dll on the [bitcointalk forum thread](https://bitcointalk.org/?topic=167229.0)
 
 
diff --git a/hashlog.cpp b/hashlog.cpp
index 811fed2..7a679e8 100644
--- a/hashlog.cpp
+++ b/hashlog.cpp
@@ -202,24 +202,25 @@ extern "C" void hashlog_purge_all(void)
 	tlastshares.clear();
 }
 
-
 /**
- * Can be used to debug...
+ * Used to debug ranges...
  */
 extern "C" void hashlog_dump_job(char* jobid)
 {
-	int deleted = 0;
-	uint64_t njobid = hextouint(jobid);
-	uint64_t keypfx = (njobid << 32);
-	uint32_t sz = tlastshares.size();
-	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
-	while (i != tlastshares.end()) {
-		if ((keypfx & i->first) == keypfx) {
-			applog(LOG_BLUE, "job %s range : %x %x %s added %x upd %x", jobid,
-				i->second.scanned_from, i->second.scanned_to,
-				i->second.tm_sent ? "sent" : "",
-				i->second.tm_add, i->second.tm_upd);/* */
+	if (opt_debug) {
+		int deleted = 0;
+		uint64_t njobid = hextouint(jobid);
+		uint64_t keypfx = (njobid << 32);
+		uint32_t sz = tlastshares.size();
+		std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
+		while (i != tlastshares.end()) {
+			if ((keypfx & i->first) == keypfx) {
+				applog(LOG_BLUE, "job %s range : %x %x %s added %x upd %x", jobid,
+					i->second.scanned_from, i->second.scanned_to,
+					i->second.tm_sent ? "sent" : "",
+					i->second.tm_add, i->second.tm_upd);/* */
+			}
+			i++;
 		}
-		i++;
 	}
 }
\ No newline at end of file
diff --git a/miner.h b/miner.h
index 5e36442..6ce4ca8 100644
--- a/miner.h
+++ b/miner.h
@@ -317,7 +317,7 @@ extern uint16_t opt_vote;
 #define CL_BLK  "\x1B[22;30m" /* black */
 #define CL_RD2  "\x1B[22;31m" /* red */
 #define CL_GR2  "\x1B[22;32m" /* green */
-#define CL_BRW  "\x1B[22;33m" /* brown */
+#define CL_YL2  "\x1B[22;33m" /* dark yellow */
 #define CL_BL2  "\x1B[22;34m" /* blue */
 #define CL_MA2  "\x1B[22;35m" /* magenta */
 #define CL_CY2  "\x1B[22;36m" /* cyan */
@@ -326,7 +326,7 @@ extern uint16_t opt_vote;
 #define CL_GRY  "\x1B[01;30m" /* dark gray */
 #define CL_LRD  "\x1B[01;31m" /* light red */
 #define CL_LGR  "\x1B[01;32m" /* light green */
-#define CL_YL2  "\x1B[01;33m" /* yellow */
+#define CL_LYL  "\x1B[01;33m" /* tooltips */
 #define CL_LBL  "\x1B[01;34m" /* light blue */
 #define CL_LMA  "\x1B[01;35m" /* light magenta */
 #define CL_LCY  "\x1B[01;36m" /* light cyan */
diff --git a/util.c b/util.c
index 7927d18..f451d95 100644
--- a/util.c
+++ b/util.c
@@ -115,7 +115,7 @@ void applog(int prio, const char *fmt, ...)
 			case LOG_WARNING: color = CL_YLW; break;
 			case LOG_NOTICE:  color = CL_WHT; break;
 			case LOG_INFO:    color = ""; break;
-			case LOG_DEBUG:   color = CL_SIL; break;
+			case LOG_DEBUG:   color = CL_GRY; break;
 
 			case LOG_BLUE:
 				prio = LOG_NOTICE;

From e1159629b4e34e3b4e23c15a6acb610fda1c5677 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 5 Sep 2014 09:46:45 +0200
Subject: [PATCH 26/44] blake: fix duplicate '=' in WIN32 sigma declaration

---
 README.txt | 32 +++++++++++++++-----------------
 blake32.cu |  2 +-
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/README.txt b/README.txt
index e6fe248..15c7c72 100644
--- a/README.txt
+++ b/README.txt
@@ -1,27 +1,24 @@
 
-ccMiner release 1.3-tpruvot (Aug 21th 2014) - "X14 X15 Fresh"
--------------------------------------------------------------
+ccMiner release 1.4-tpruvot (Sept 04th 2014) - "X17 Blake NEOS"
+---------------------------------------------------------------
 
 ***************************************************************
 If you find this tool useful and like to support its continued 
           development, then consider a donation.
 
-   LTC donation address: LKS1WDKGED647msBQfLBHV3Ls8sveGncnm
-   BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM
-   YAC donation address: Y87sptDEcpLkLeAuex6qZioDbvy1qXZEj4
-   VTC donation address: VrjeFzMgvteCGarLw85KivBzmsiH9fqp4a
-   MAX donation address: mHrhQP9EFArechWxTFJ97s9D3jvcCvEEnt
-  DOGE donation address: DT9ghsGmez6ojVdEZgvaZbT2Z3TruXG6yP
-   HVC donation address: HNN3PyyTMkDo4RkEjkWSGMwqia1yD8mwJN
-   GRS donation address: FmJKJAhvyHWPeEVeLQHefr2naqgWc9ABTM
-   MYR donation address: MNHM7Q7HVfGpKDJgVJrY8ofwvmeugNewyf
-   JPC donation address: JYFBypVDkk583yKWY4M46TG5vXG8hfgD2U
-   SFR donation address: SR4b87aEnPfTs77bo9NnnaV21fiF6jQpAp
-   MNC donation address: MShgNUSYwybEbXLvJUtdNg1a7rUeiNgooK
-   BTQ donation address: 13GFwLiZL2DaA9XeE733PNrQX5QYLFsonS
-
 tpruvot@github:
-   BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
+  BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
+  DRK  : XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3
+  NEO$ : NaEcVrdzoCWHUYXb7X8QoafoKS9UV69Yk4
+
+DJM34:
+  XCN donation address: CNh6F4h1byX7vvbmfQn4LMtsC4TYb8mgmn
+  BTC donation address: 1NENYmxwZGHsKFmyjTc5WferTn5VTFb7Ze
+  TAC donation address: TuqNvPoQxghHfzwnPpAxSTiYoN6FM8LM5p
+
+cbuchner v1.2:
+  LTC donation address: LKS1WDKGED647msBQfLBHV3Ls8sveGncnm
+  BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM
 
 ***************************************************************
 
@@ -36,6 +33,7 @@ JackpotCoin
 QuarkCoin family & AnimeCoin
 TalkCoin
 DarkCoin and other X11 coins
+NEOS blake (256 14-rounds)
 
 where some of these coins have a VERY NOTABLE nVidia advantage
 over competing AMD (OpenCL) implementations.
diff --git a/blake32.cu b/blake32.cu
index 8be5205..b24c5f7 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -50,7 +50,7 @@ __constant__
 #ifdef WIN32
 /* what the fuck ! */
 static uint8_t c_sigma[16][16];
-const uint8_t host_sigma[16][16] =
+const uint8_t host_sigma[16][16]
 #else
 /* prefer uint32_t to prevent size conversions = speed +5/10 % */
 static uint32_t __align__(32) c_sigma[16][16];

From 416f7f3708ec0d6f46a16f457f8a223c3371bfe4 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 5 Sep 2014 18:16:40 +0200
Subject: [PATCH 27/44] hashlog: keep compat with VS2012

---
 hashlog.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/hashlog.cpp b/hashlog.cpp
index 7a679e8..aad202e 100644
--- a/hashlog.cpp
+++ b/hashlog.cpp
@@ -27,7 +27,8 @@ static std::map<uint64_t, hashlog_data> tlastshares;
 static uint64_t hextouint(char* jobid)
 {
 	char *ptr;
-	return strtoull(jobid, &ptr, 16);
+	/* dont use strtoull(), only since VS2013 */
+	return (uint64_t) strtoul(jobid, &ptr, 16);
 }
 
 /**
@@ -38,6 +39,7 @@ extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce)
 	uint32_t ret = 0;
 	uint64_t njobid = hextouint(jobid);
 	uint64_t key = (njobid << 32) + nonce;
+
 	if (nonce == 0) {
 		// search last submitted nonce for job
 		ret = hashlog_get_last_sent(jobid);
@@ -55,7 +57,7 @@ extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce)
 	uint64_t njobid = hextouint(jobid);
 	uint64_t keyall = (njobid << 32);
 	uint64_t key = keyall + nonce;
-	struct hashlog_data data;
+	hashlog_data data;
 
 	data = tlastshares[keyall];
 	data.tm_upd = data.tm_sent = (uint32_t) time(NULL);
@@ -72,7 +74,7 @@ extern "C" void hashlog_remember_scan_range(char* jobid, uint32_t scanned_from,
 	uint64_t njobid = hextouint(jobid);
 	uint64_t keyall = (njobid << 32);
 	uint64_t range = hashlog_get_scan_range(jobid);
-	struct hashlog_data data;
+	hashlog_data data;
 
 	// global scan range of a job
 	data = tlastshares[keyall];
@@ -114,7 +116,8 @@ extern "C" uint64_t hashlog_get_scan_range(char* jobid)
 	uint64_t ret = 0;
 	uint64_t njobid = hextouint(jobid);
 	uint64_t keypfx = (njobid << 32);
-	struct hashlog_data data;
+	hashlog_data data;
+
 	data.scanned_from = 0;
 	data.scanned_to = 0;
 	std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();

From 5682b7d241a17890273488b4544934ac974e4b20 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 5 Sep 2014 10:52:04 +0200
Subject: [PATCH 28/44] blake: add also blakecoin (8-rounds) variant

---
 blake32.cu      | 33 ++++++++++++++++++---------------
 ccminer.vcxproj |  5 +++--
 cpu-miner.c     | 13 +++++++++++--
 miner.h         |  8 ++++----
 sph/blake.c     |  4 +++-
 sph/sph_blake.h |  5 +++++
 util.c          |  8 ++++++--
 7 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index b24c5f7..877c319 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -15,11 +15,17 @@ extern "C" {
 /* threads per block */
 #define TPB 128
 
+extern "C" int blake256_rounds = 14;
+
 /* hash by cpu with blake 256 */
-extern "C" void blake32hash(void *output, const void *input)
+extern "C" void blake256hash(void *output, const void *input, int rounds = 14)
 {
 	unsigned char hash[64];
 	sph_blake256_context ctx;
+
+	/* in sph_blake.c */
+	blake256_rounds = rounds;
+
 	sph_blake256_init(&ctx);
 	sph_blake256(&ctx, input, 80);
 	sph_blake256_close(&ctx, hash);
@@ -133,10 +139,8 @@ static const uint32_t __align__(32) c_u256[16] = {
 	v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
 }
 
-#define BLAKE256_ROUNDS 14
-
 __device__ static
-void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0)
+void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, int blakerounds)
 {
 	uint32_t /* __align__(8) */ v[16];
 	uint32_t /* __align__(8) */ m[16];
@@ -162,8 +166,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0)
 	v[14] = u256[6];
 	v[15] = u256[7];
 
-	//#pragma unroll
-	for (int i = 0; i < BLAKE256_ROUNDS; i++) {
+	for (int i = 0; i < blakerounds; i++) {
 		/* column step */
 		GS(0, 4, 0x8, 0xC, 0);
 		GS(1, 5, 0x9, 0xD, 2);
@@ -182,7 +185,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0)
 }
 
 __global__
-void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
+void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, int blakerounds)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -195,7 +198,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 		for(int i=0; i<8; i++)
 			h[i] = c_IV256[i];
 
-		blake256_compress(h, c_PaddedMessage80, 0x200); /* 512 = 0x200 */
+		blake256_compress(h, c_PaddedMessage80, 0x200, blakerounds); /* 512 = 0x200 */
 
 		// ------ Close: Bytes 64 to 80 ------ 
 
@@ -218,7 +221,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 		msg[14] = 0;
 		msg[15] = 0x280;
 
-		blake256_compress(h, msg, 0x280);
+		blake256_compress(h, msg, 0x280, blakerounds);
 
 		for (int i = 7; i >= 0; i--) {
 			uint32_t hash = cuda_swab32(h[i]);
@@ -237,7 +240,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 }
 
 __host__
-uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce)
+uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, int blakerounds)
 {
 	const int threadsperblock = TPB;
 	uint32_t result = MAXU;
@@ -250,7 +253,7 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
 	if (cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)) != cudaSuccess)
 		return result;
 
-	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id]);
+	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id], blakerounds);
 	cudaDeviceSynchronize();
 	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		cudaThreadSynchronize();
@@ -270,8 +273,8 @@ void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget)
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice));
 }
 
-extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
-	uint32_t max_nonce, unsigned long *hashes_done)
+extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done, uint32_t blakerounds=14)
 {
 	const uint32_t first_nonce = pdata[19];
 	static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
@@ -297,7 +300,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 
 	do {
 		// GPU HASH
-		uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19]);
+		uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], blakerounds);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t endiandata[20];
@@ -315,7 +318,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta
 
 			be32enc(&endiandata[19], foundNonce);
 
-			blake32hash(vhashcpu, endiandata);
+			blake256hash(vhashcpu, endiandata, blakerounds);
 
 			if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
 			{
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 7590d94..cb633ad 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -400,8 +400,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
     </CudaCompile>
     <CudaCompile Include="blake32.cu">
       <MaxRegCount>64</MaxRegCount>
-      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-O2 -dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
+      <FastMath>true</FastMath>
     </CudaCompile>
     <CudaCompile Include="quark\animecoin.cu">
       <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
@@ -562,4 +563,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
   <ImportGroup Label="ExtensionTargets">
     <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
   </ImportGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/cpu-miner.c b/cpu-miner.c
index dc86a3e..e3b77e8 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -128,6 +128,7 @@ struct workio_cmd {
 typedef enum {
 	ALGO_ANIME,
 	ALGO_BLAKE,
+	ALGO_BLAKECOIN,
 	ALGO_FRESH,
 	ALGO_FUGUE256,		/* Fugue256 */
 	ALGO_GROESTL,
@@ -149,6 +150,7 @@ typedef enum {
 static const char *algo_names[] = {
 	"anime",
 	"blake",
+	"blakecoin",
 	"fresh",
 	"fugue256",
 	"groestl",
@@ -231,6 +233,7 @@ Options:\n\
   -a, --algo=ALGO       specify the algorithm to use\n\
                         anime     Animecoin hash\n\
                         blake     Blake 256 (like NEOS blake)\n\
+                        blakecoin Old Blake 256 (8 rounds)\n\
                         fresh     Freshcoin hash (shavite 80)\n\
                         fugue256  Fuguecoin hash\n\
                         groestl   Groestlcoin hash\n\
@@ -961,6 +964,7 @@ static void *miner_thread(void *userdata)
 			case ALGO_JACKPOT:
 				max64 = 0x1fffLL;
 				break;
+			case ALGO_BLAKECOIN:
 			case ALGO_BLAKE:
 				/* based on the 750Ti hashrate (100kH) */
 				max64 = 0x3ffffffLL;
@@ -1065,9 +1069,14 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 
+		case ALGO_BLAKECOIN:
+			rc = scanhash_blake256(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done, 8);
+			break;
+
 		case ALGO_BLAKE:
-			rc = scanhash_blake32(thr_id, work.data, work.target,
-			                      max_nonce, &hashes_done);
+			rc = scanhash_blake256(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done, 14);
 			break;
 
 		case ALGO_FRESH:
diff --git a/miner.h b/miner.h
index 6ce4ca8..0e281da 100644
--- a/miner.h
+++ b/miner.h
@@ -237,11 +237,11 @@ extern int scanhash_anime(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 
-extern int scanhash_fresh(int thr_id, uint32_t *pdata,
+extern int scanhash_blake256(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
-	unsigned long *hashes_done);
+	unsigned long *hashes_done, uint32_t blakerounds);
 
-extern int scanhash_blake32(int thr_id, uint32_t *pdata,
+extern int scanhash_fresh(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 
@@ -420,7 +420,7 @@ void applog_hash(unsigned char *hash);
 
 void print_hash_tests(void);
 void animehash(void *state, const void *input);
-void blake32hash(void *output, const void *input);
+void blake256hash(void *output, const void *input, int rounds);
 void fresh_hash(void *state, const void *input);
 void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
 void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
diff --git a/sph/blake.c b/sph/blake.c
index 0650b9c..ea829f0 100644
--- a/sph/blake.c
+++ b/sph/blake.c
@@ -548,7 +548,7 @@ static const sph_u64 CB[16] = {
 		M[0xD] = sph_dec32be_aligned(buf + 52); \
 		M[0xE] = sph_dec32be_aligned(buf + 56); \
 		M[0xF] = sph_dec32be_aligned(buf + 60); \
-		for (r = 0; r < 14; r ++) \
+		for (r = 0; r < blake256_rounds; r ++) \
 			ROUND_S(r); \
 		H0 ^= S0 ^ V0 ^ V8; \
 		H1 ^= S1 ^ V1 ^ V9; \
@@ -592,6 +592,7 @@ static const sph_u64 CB[16] = {
 		M6 = sph_dec32be_aligned(buf + 24); \
 		M7 = sph_dec32be_aligned(buf + 28); \
 		M8 = sph_dec32be_aligned(buf + 32); \
+		if (blake256_rounds == 14) { \
 		M9 = sph_dec32be_aligned(buf + 36); \
 		MA = sph_dec32be_aligned(buf + 40); \
 		MB = sph_dec32be_aligned(buf + 44); \
@@ -599,6 +600,7 @@ static const sph_u64 CB[16] = {
 		MD = sph_dec32be_aligned(buf + 52); \
 		ME = sph_dec32be_aligned(buf + 56); \
 		MF = sph_dec32be_aligned(buf + 60); \
+		} \
 		ROUND_S(0); \
 		ROUND_S(1); \
 		ROUND_S(2); \
diff --git a/sph/sph_blake.h b/sph/sph_blake.h
index d8d7943..24aa89d 100644
--- a/sph/sph_blake.h
+++ b/sph/sph_blake.h
@@ -181,6 +181,11 @@ void sph_blake224_close(void *cc, void *dst);
 void sph_blake224_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);
 
+/**
+ * Switch for the number of rounds (old blake was 8)
+ */
+extern int blake256_rounds;
+
 /**
  * Initialize a BLAKE-256 context. This process performs no memory allocation.
  *
diff --git a/util.c b/util.c
index f451d95..fe9168b 100644
--- a/util.c
+++ b/util.c
@@ -1042,7 +1042,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 
 	/* store stratum server time diff */
 	hex2bin((unsigned char *)&ntime, stime, 4);
-	ntime = swab32(ntime) - time(0);
+	ntime = swab32(ntime) - (uint32_t) time(0);
 	if (ntime > sctx->srvtime_diff) {
 		sctx->srvtime_diff = ntime;
 		if (!opt_quiet)
@@ -1420,7 +1420,11 @@ void print_hash_tests(void)
 	printpfx("anime", hash);
 
 	memset(hash, 0, sizeof hash);
-	blake32hash(&hash[0], &buf[0]);
+	blake256hash(&hash[0], &buf[0], 8);
+	printpfx("blakecoin", hash);
+
+	memset(hash, 0, sizeof hash);
+	blake256hash(&hash[0], &buf[0], 14);
 	printpfx("blake", hash);
 
 	memset(hash, 0, sizeof hash);

From 12fefe5de0362b46155627788e3cfbf28e5a8c4a Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 5 Sep 2014 18:17:11 +0200
Subject: [PATCH 29/44] blake: add a few more MH/s, prepare blakecoin

---
 blake32.cu | 97 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 51 insertions(+), 46 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index 877c319..a592b4d 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -42,10 +42,10 @@ extern int device_map[8];
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 
 __constant__
-static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes + padding)
+static uint32_t __align__(32) c_Target[8];
 
 __constant__
-static uint32_t __align__(32) c_Target[8];
+static uint32_t __align__(32) c_data[20];
 
 #define MAXU 0xffffffffU
 
@@ -128,50 +128,70 @@ static const uint32_t __align__(32) c_u256[16] = {
 #define GS(a,b,c,d,x) { \
 	const uint32_t idx1 = c_sigma[i][x]; \
 	const uint32_t idx2 = c_sigma[i][x+1]; \
-	v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \
+	v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \
 	v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \
 	v[c] += v[d]; \
 	v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
 \
-	v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \
+	v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \
 	v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \
 	v[c] += v[d]; \
 	v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
 }
 
+/* Second part (64-80) msg never change, store it */
+__device__ __constant__
+static const uint32_t __align__(32) c_Padding[16] = {
+	0, 0, 0, 0,
+	0x80000000UL, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 1, 0, 640,
+};
+
 __device__ static
 void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, int blakerounds)
 {
-	uint32_t /* __align__(8) */ v[16];
 	uint32_t /* __align__(8) */ m[16];
 
-	const uint32_t* u256 = c_u256;
+	m[0] = block[0];
+	m[1] = block[1];
+	m[2] = block[2];
+	m[3] = block[3];
 
-	//#pragma unroll
-	for (int i = 0; i < 16; ++i) {
-		m[i] = block[i];
+	if (T0 == 0x200) {
+		//#pragma unroll 12
+		for (int i = 4; i < 16; ++i) {
+			m[i] = block[i];
+		}
+	} else {
+		//#pragma unroll 12
+		for (int i = 4; i < 16; ++i) {
+			m[i] = c_Padding[i];
+		}
 	}
 
+	uint32_t /* __align__(8) */ v[16];
+
 	//#pragma unroll 8
 	for(int i = 0; i < 8; i++)
 		v[i] = h[i];
 
-	v[ 8] = u256[0];
-	v[ 9] = u256[1];
-	v[10] = u256[2];
-	v[11] = u256[3];
+	v[ 8] = c_u256[0];
+	v[ 9] = c_u256[1];
+	v[10] = c_u256[2];
+	v[11] = c_u256[3];
 
-	v[12] = u256[4] ^ T0;
-	v[13] = u256[5] ^ T0;
-	v[14] = u256[6];
-	v[15] = u256[7];
+	v[12] = c_u256[4] ^ T0;
+	v[13] = c_u256[5] ^ T0;
+	v[14] = c_u256[6];
+	v[15] = c_u256[7];
 
 	for (int i = 0; i < blakerounds; i++) {
 		/* column step */
-		GS(0, 4, 0x8, 0xC, 0);
-		GS(1, 5, 0x9, 0xD, 2);
-		GS(2, 6, 0xA, 0xE, 4);
-		GS(3, 7, 0xB, 0xF, 6);
+		GS(0, 4, 0x8, 0xC, 0x0);
+		GS(1, 5, 0x9, 0xD, 0x2);
+		GS(2, 6, 0xA, 0xE, 0x4);
+		GS(3, 7, 0xB, 0xF, 0x6);
 		/* diagonal step */
 		GS(0, 5, 0xA, 0xF, 0x8);
 		GS(1, 6, 0xB, 0xC, 0xA);
@@ -191,37 +211,23 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 	if (thread < threads)
 	{
 		const uint32_t nounce = startNounce + thread;
-		uint32_t /* __align__(8) */ msg[16];
 		uint32_t h[8];
 
 		#pragma unroll
 		for(int i=0; i<8; i++)
 			h[i] = c_IV256[i];
 
-		blake256_compress(h, c_PaddedMessage80, 0x200, blakerounds); /* 512 = 0x200 */
+		blake256_compress(h, c_data, 512, blakerounds);
 
 		// ------ Close: Bytes 64 to 80 ------ 
 
-		msg[0] = c_PaddedMessage80[16];
-		msg[1] = c_PaddedMessage80[17];
-		msg[2] = c_PaddedMessage80[18];
-		msg[3] = nounce; /* our tested value */
-		msg[4] = 0x80000000UL; //cuda_swab32(0x80U);
-
-		msg[5] = 0;  // uchar[17 to 55]
-		msg[6] = 0;
-		msg[7] = 0;
-		msg[8] = 0;
-		msg[9] = 0;
-		msg[10] = 0;
-		msg[11] = 0;
-		msg[12] = 0;
-
-		msg[13] = 1;
-		msg[14] = 0;
-		msg[15] = 0x280;
+		uint32_t ending[4];
+		ending[0] = c_data[16];
+		ending[1] = c_data[17];
+		ending[2] = c_data[18];
+		ending[3] = nounce; /* our tested value */
 
-		blake256_compress(h, msg, 0x280, blakerounds);
+		blake256_compress(h, ending, 640, blakerounds);
 
 		for (int i = 7; i >= 0; i--) {
 			uint32_t hash = cuda_swab32(h[i]);
@@ -265,10 +271,9 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
 __host__
 void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget)
 {
-	uint32_t PaddedMessage[32];
-	memcpy(PaddedMessage, pdata, 80);
-	memset(&PaddedMessage[20], 0, 48);
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice));
+	uint32_t data[20];
+	memcpy(data, pdata, 80);
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice));
 }

From 3356e6f8bfba816874d0a897ed9f6374bb069ac4 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 5 Sep 2014 20:00:07 +0200
Subject: [PATCH 30/44] blake: drop debug dump and exit-time device reset (some more KH/s on linux)

---
 blake32.cu | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index a592b4d..a1403c8 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -315,12 +315,6 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 			for (int k=0; k < 20; k++)
 				be32enc(&endiandata[k], pdata[k]);
 
-			if (opt_debug && !opt_quiet) {
-				applog(LOG_DEBUG, "throughput=%u, start=%x, max=%x, pdata=%08x...%08x",
-					throughput, first_nonce, max_nonce, endiandata[0], endiandata[7]);
-				applog_hash((unsigned char *)pdata);
-			}
-
 			be32enc(&endiandata[19], foundNonce);
 
 			blake256hash(vhashcpu, endiandata, blakerounds);
@@ -348,11 +342,14 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 
 exit_scan:
 	*hashes_done = pdata[19] - first_nonce + 1;
-	// reset the device to allow multiple instances
+#if 0
+	/* reset the device to allow multiple instances
+	 * could be made in cpu-miner... check later if required */
 	if (opt_n_threads == 1) {
 		CUDA_SAFE_CALL(cudaDeviceReset());
 		init[thr_id] = false;
 	}
+#endif
 	// wait proper end of all threads
 	cudaDeviceSynchronize();
 	return rc;

From b98239ec2a75108d7c4e71ebff2af50629cbf9f2 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 5 Sep 2014 20:00:48 +0200
Subject: [PATCH 31/44] hashlog: enhance scan range store and debug dump

---
 cpu-miner.c |  3 +--
 hashlog.cpp | 33 +++++++++++++++++++++------------
 miner.h     |  2 +-
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index e3b77e8..9110d74 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -508,8 +508,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 			goto out;
 		}
 
-		hashlog_remember_submit(work->job_id, nonce);
-		hashlog_remember_scan_range(work->job_id, work->scanned_from, work->scanned_to);
+		hashlog_remember_submit(work->job_id, nonce, work->scanned_from);
 
 	} else {
 
diff --git a/hashlog.cpp b/hashlog.cpp
index aad202e..9bcd04b 100644
--- a/hashlog.cpp
+++ b/hashlog.cpp
@@ -1,3 +1,11 @@
+/**
+ * Hash log of submitted job nonces
+ * Prevent duplicate shares and could be used for RPC stats later
+ *
+ * Note: this source is C++ (requires std::map)
+ *
+ * tpruvot@github 2014
+ */
 #include <stdlib.h>
 #include <memory.h>
 #include <map>
@@ -52,17 +60,17 @@ extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce)
 /**
  * Store submitted nonces of a job
  */
-extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce)
+extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce, uint32_t scanned_from)
 {
 	uint64_t njobid = hextouint(jobid);
 	uint64_t keyall = (njobid << 32);
 	uint64_t key = keyall + nonce;
 	hashlog_data data;
 
-	data = tlastshares[keyall];
-	data.tm_upd = data.tm_sent = (uint32_t) time(NULL);
-	if (data.tm_add == 0)
-		data.tm_add = data.tm_upd;
+	memset(&data, 0, sizeof(data));
+	data.scanned_from = scanned_from;
+	data.scanned_to = nonce;
+	data.tm_add = data.tm_upd = data.tm_sent = (uint32_t) time(NULL);
 	tlastshares[key] = data;
 }
 
@@ -92,12 +100,12 @@ extern "C" void hashlog_remember_scan_range(char* jobid, uint32_t scanned_from,
 	data.last_from = scanned_from;
 
 	if (scanned_from < scanned_to) {
+		if (data.scanned_to == 0 || scanned_from == data.scanned_to + 1)
+			data.scanned_to = scanned_to;
 		if (data.scanned_from == 0)
 			data.scanned_from = scanned_from ? scanned_from : 1; // min 1
-		else if (scanned_from < data.scanned_from) // || scanned_to == (data.scanned_from - 1)
+		else if (scanned_from < data.scanned_from || scanned_to == (data.scanned_from - 1))
 			data.scanned_from = scanned_from;
-		if (data.scanned_to == 0 || scanned_from == data.scanned_to + 1)
-			data.scanned_to = scanned_to;
 	}
 
 	data.tm_upd = (uint32_t) time(NULL);
@@ -218,10 +226,11 @@ extern "C" void hashlog_dump_job(char* jobid)
 		std::map<uint64_t, hashlog_data>::iterator i = tlastshares.begin();
 		while (i != tlastshares.end()) {
 			if ((keypfx & i->first) == keypfx) {
-				applog(LOG_BLUE, "job %s range : %x %x %s added %x upd %x", jobid,
-					i->second.scanned_from, i->second.scanned_to,
-					i->second.tm_sent ? "sent" : "",
-					i->second.tm_add, i->second.tm_upd);/* */
+				if (i->first != keypfx)
+					applog(LOG_DEBUG, CL_YLW "job %s, found %08x ", jobid, LO_DWORD(i->first));
+				else
+					applog(LOG_DEBUG, CL_YLW "job %s scanned range : %08x-%08x", jobid,
+						i->second.scanned_from, i->second.scanned_to);
 			}
 			i++;
 		}
diff --git a/miner.h b/miner.h
index 0e281da..d9951e9 100644
--- a/miner.h
+++ b/miner.h
@@ -392,7 +392,7 @@ bool stratum_subscribe(struct stratum_ctx *sctx);
 bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
 bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
 
-void hashlog_remember_submit(char* jobid, uint32_t nounce);
+void hashlog_remember_submit(char* jobid, uint32_t nounce, uint32_t scanned_from);
 void hashlog_remember_scan_range(char* jobid, uint32_t scanned_from, uint32_t scanned_to);
 uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce);
 uint32_t hashlog_get_last_sent(char* jobid);

From ecc86af102d0f8816811184b65566c9eeca32adf Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 5 Sep 2014 21:12:38 +0200
Subject: [PATCH 32/44] blake: use one uint32 sigma table, simplify compress loops (sometimes faster, or not)

---
 blake32.cu | 42 ++++++++++++++----------------------------
 1 file changed, 14 insertions(+), 28 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index a1403c8..638cfbe 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -34,6 +34,8 @@ extern "C" void blake256hash(void *output, const void *input, int rounds = 14)
 
 #include "cuda_helper.h"
 
+#define MAXU 0xffffffffU
+
 // in cpu-miner.c
 extern bool opt_n_threads;
 extern bool opt_benchmark;
@@ -47,22 +49,13 @@ static uint32_t __align__(32) c_Target[8];
 __constant__
 static uint32_t __align__(32) c_data[20];
 
-#define MAXU 0xffffffffU
-
 static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];
 
-__constant__
-#ifdef WIN32
-/* what the fuck ! */
-static uint8_t c_sigma[16][16];
-const uint8_t host_sigma[16][16]
-#else
 /* prefer uint32_t to prevent size conversions = speed +5/10 % */
+__constant__
 static uint32_t __align__(32) c_sigma[16][16];
-const uint32_t host_sigma[16][16]
-#endif
-= {
+const uint32_t host_sigma[16][16] = {
 	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
 	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
 	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
@@ -152,28 +145,19 @@ __device__ static
 void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, int blakerounds)
 {
 	uint32_t /* __align__(8) */ m[16];
+	uint32_t /* __align__(8) */ v[16];
 
 	m[0] = block[0];
 	m[1] = block[1];
 	m[2] = block[2];
 	m[3] = block[3];
 
-	if (T0 == 0x200) {
-		//#pragma unroll 12
-		for (int i = 4; i < 16; ++i) {
-			m[i] = block[i];
-		}
-	} else {
-		//#pragma unroll 12
-		for (int i = 4; i < 16; ++i) {
-			m[i] = c_Padding[i];
-		}
+	for (uint32_t i = 4; i < 16; i++) {
+		m[i] = (T0 == 0x200) ? block[i] : c_Padding[i];
 	}
 
-	uint32_t /* __align__(8) */ v[16];
-
 	//#pragma unroll 8
-	for(int i = 0; i < 8; i++)
+	for(uint32_t i = 0; i < 8; i++)
 		v[i] = h[i];
 
 	v[ 8] = c_u256[0];
@@ -200,8 +184,10 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, in
 	}
 
 	//#pragma unroll 16
-	for(int i = 0; i < 16; i++)
-		h[i % 8] ^= v[i];
+	for (uint32_t i = 0; i < 16; i++) {
+		uint32_t j = i % 8;
+		h[j] ^= v[i];
+	}
 }
 
 __global__
@@ -306,13 +292,13 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 	do {
 		// GPU HASH
 		uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], blakerounds);
-		if (foundNonce != 0xffffffff)
+		if (foundNonce != MAXU)
 		{
 			uint32_t endiandata[20];
 			uint32_t vhashcpu[8];
 			uint32_t Htarg = ptarget[7];
 
-			for (int k=0; k < 20; k++)
+			for (int k=0; k < 19; k++)
 				be32enc(&endiandata[k], pdata[k]);
 
 			be32enc(&endiandata[19], foundNonce);

From 52ec8830b1a9231b35bf44286a8f07d6ac0eb5b1 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 6 Sep 2014 01:21:30 +0200
Subject: [PATCH 33/44] blake: blakecoin variant now works

---
 README.txt   | 1 +
 blake32.cu   | 4 ++--
 configure.ac | 2 +-
 cpu-miner.c  | 5 +++--
 sph/blake.c  | 4 ++--
 5 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/README.txt b/README.txt
index 15c7c72..ea4cd5b 100644
--- a/README.txt
+++ b/README.txt
@@ -62,6 +62,7 @@ its command line interface and options.
                           quark       use to mine Quarkcoin
                           anime       use to mine Animecoin
                           blake       use to mine NEOS (Blake 256)
+                          blakecoin   use to mine Old Blake 256
                           nist5       use to mine TalkCoin
                           fresh       use to mine Freshcoin
                           whirl       use to mine Whirlcoin
diff --git a/blake32.cu b/blake32.cu
index 638cfbe..2ce2acd 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -191,7 +191,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, in
 }
 
 __global__
-void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, int blakerounds)
+void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, const int blakerounds)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -232,7 +232,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 }
 
 __host__
-uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, int blakerounds)
+uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, const int blakerounds)
 {
 	const int threadsperblock = TPB;
 	uint32_t result = MAXU;
diff --git a/configure.ac b/configure.ac
index f7924d4..2a554f1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.09.01])
+AC_INIT([ccminer], [2014.09.06])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index 9110d74..b3a6ba7 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -789,7 +789,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR)
 		heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
 	else
-	if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_WHC)
+	if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_WHC || opt_algo == ALGO_BLAKECOIN)
 		SHA256((unsigned char*)sctx->job.coinbase, sctx->job.coinbase_size, (unsigned char*)merkle_root);
 	else
 		sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
@@ -964,6 +964,7 @@ static void *miner_thread(void *userdata)
 				max64 = 0x1fffLL;
 				break;
 			case ALGO_BLAKECOIN:
+				max64 = 0x3ffffffLL;
 			case ALGO_BLAKE:
 				/* based on the 750Ti hashrate (100kH) */
 				max64 = 0x3ffffffLL;
@@ -1373,7 +1374,7 @@ out:
 	return NULL;
 }
 
-#define PROGRAM_VERSION "1.4"
+#define PROGRAM_VERSION "1.4.1"
 static void show_version_and_exit(void)
 {
 	printf("%s v%s\n"
diff --git a/sph/blake.c b/sph/blake.c
index ea829f0..c89de5e 100644
--- a/sph/blake.c
+++ b/sph/blake.c
@@ -592,7 +592,6 @@ static const sph_u64 CB[16] = {
 		M6 = sph_dec32be_aligned(buf + 24); \
 		M7 = sph_dec32be_aligned(buf + 28); \
 		M8 = sph_dec32be_aligned(buf + 32); \
-		if (blake256_rounds == 14) { \
 		M9 = sph_dec32be_aligned(buf + 36); \
 		MA = sph_dec32be_aligned(buf + 40); \
 		MB = sph_dec32be_aligned(buf + 44); \
@@ -600,7 +599,6 @@ static const sph_u64 CB[16] = {
 		MD = sph_dec32be_aligned(buf + 52); \
 		ME = sph_dec32be_aligned(buf + 56); \
 		MF = sph_dec32be_aligned(buf + 60); \
-		} \
 		ROUND_S(0); \
 		ROUND_S(1); \
 		ROUND_S(2); \
@@ -609,12 +607,14 @@ static const sph_u64 CB[16] = {
 		ROUND_S(5); \
 		ROUND_S(6); \
 		ROUND_S(7); \
+		if (blake256_rounds == 14) { \
 		ROUND_S(8); \
 		ROUND_S(9); \
 		ROUND_S(0); \
 		ROUND_S(1); \
 		ROUND_S(2); \
 		ROUND_S(3); \
+		} \
 		H0 ^= S0 ^ V0 ^ V8; \
 		H1 ^= S1 ^ V1 ^ V9; \
 		H2 ^= S2 ^ V2 ^ VA; \

From 65909ec3b778fdba97c97146ff8900795b972526 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 6 Sep 2014 10:55:44 +0200
Subject: [PATCH 34/44] blake: handle case when 2 hashes are found in a call

---
 blake32.cu        | 50 +++++++++++++++++++++++++++++++++++++----------
 cpu-miner.c       |  2 +-
 cpuminer-config.h |  6 +++---
 util.c            |  6 ++++--
 4 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index 2ce2acd..5013de7 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -52,6 +52,8 @@ static uint32_t __align__(32) c_data[20];
 static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];
 
+static uint32_t extra_results[2] = { MAXU, MAXU };
+
 /* prefer uint32_t to prevent size conversions = speed +5/10 % */
 __constant__
 static uint32_t __align__(32) c_sigma[16][16];
@@ -225,9 +227,13 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 			}
 		}
 
-		/* keep the smallest nounce, hmm... */
-		if(resNounce[0] > nounce)
+		/* keep the smallest nounce, + extra one if found */
+		if (resNounce[0] > nounce) {
+			resNounce[1] = resNounce[0];
 			resNounce[0] = nounce;
+		}
+		else
+			resNounce[1] = nounce;
 	}
 }
 
@@ -242,14 +248,15 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
 	size_t shared_size = 0;
 
 	/* Check error on Ctrl+C or kill to prevent segfaults on exit */
-	if (cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)) != cudaSuccess)
+	if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
 		return result;
 
 	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id], blakerounds);
 	cudaDeviceSynchronize();
-	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
+	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		cudaThreadSynchronize();
-		result = *h_resNounce[thr_id];
+		result = h_resNounce[thr_id][0];
+		extra_results[0] = h_resNounce[thr_id][1];
 	}
 	return result;
 }
@@ -269,9 +276,20 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 {
 	const uint32_t first_nonce = pdata[19];
 	static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-	uint32_t throughput = min(TPB * 2048, max_nonce - first_nonce);
+	uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce);
 	int rc = 0;
 
+	if (extra_results[0] != MAXU) {
+		// possible extra result found in previous call
+		if (first_nonce <= extra_results[0] && max_nonce >= extra_results[0]) {
+			pdata[19] = extra_results[0];
+			*hashes_done = pdata[19] - first_nonce + 1;
+			extra_results[0] = MAXU;
+			rc = 1;
+			goto exit_scan;
+		}
+	}
+
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x00000f;
 
@@ -279,13 +297,13 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 		if (opt_n_threads > 1) {
 			CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
 		}
-		CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t)));
-		CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t)));
+		CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
+		CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
 		init[thr_id] = true;
 	}
 
-	if (throughput < (TPB * 2048))
-		applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce);
+	if (opt_debug && throughput < (TPB * 4096))
+		applog(LOG_DEBUG, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce);
 
 	blake256_cpu_setBlock_80(pdata, ptarget);
 
@@ -309,6 +327,18 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 			{
 				pdata[19] = foundNonce;
 				rc = 1;
+
+				if (extra_results[0] != MAXU) {
+					// Rare but possible if the throughput is big
+					be32enc(&endiandata[19], extra_results[0]);
+					blake256hash(vhashcpu, endiandata, blakerounds);
+					if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
+						applog(LOG_NOTICE, "GPU found more than one result yippee!");
+					} else {
+						extra_results[0] = MAXU;
+					}
+				}
+
 				goto exit_scan;
 			}
 			else if (vhashcpu[7] > Htarg) {
diff --git a/cpu-miner.c b/cpu-miner.c
index b3a6ba7..7f70a6e 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -1005,7 +1005,7 @@ static void *miner_thread(void *userdata)
 					work_restart[thr_id].restart = 1;
 					hashlog_purge_old();
 					// wait a bit for a new job...
-					usleep(1500*1000);
+					sleep(1);
 					(*nonceptr) = end_nonce + 1;
 					work_done = true;
 					continue;
diff --git a/cpuminer-config.h b/cpuminer-config.h
index 0fafa85..11edf82 100644
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@@ -156,7 +156,7 @@
 #define PACKAGE_NAME "ccminer"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.09.01"
+#define PACKAGE_STRING "ccminer 2014.09.06"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "ccminer"
@@ -165,7 +165,7 @@
 #define PACKAGE_URL ""
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.09.01"
+#define PACKAGE_VERSION "2014.09.06"
 
 /* If using the C implementation of alloca, define if you know the
    direction of stack growth for your system; otherwise it will be
@@ -188,7 +188,7 @@
 #define USE_XOP 1
 
 /* Version number of package */
-#define VERSION "2014.09.01"
+#define VERSION "2014.09.06"
 
 /* Define curl_free() as free() if our version of curl lacks curl_free. */
 /* #undef curl_free */
diff --git a/util.c b/util.c
index fe9168b..b2c0b0f 100644
--- a/util.c
+++ b/util.c
@@ -557,6 +557,9 @@ bool fulltest(const uint32_t *hash, const uint32_t *target)
 			rc = true;
 			break;
 		}
+		if (hash[0] == target[0]) {
+			applog(LOG_NOTICE, "We found an exact match!");
+		}
 	}
 
 	if (!rc && opt_debug) {
@@ -1122,8 +1125,7 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params)
 	sctx->next_diff = diff;
 	pthread_mutex_unlock(&sctx->work_lock);
 
-	if (opt_debug)
-		applog(LOG_DEBUG, "Stratum difficulty set to %g", diff);
+	applog(LOG_INFO, "Stratum difficulty set to %g", diff);
 
 	return true;
 }

From 5ccd1669161e174211f916ffd2e17d2117fa6d1c Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 6 Sep 2014 16:26:53 +0200
Subject: [PATCH 35/44] blake: introduce pdata head cache (speed x2)

---
 blake32.cu  | 103 ++++++++++++++++++++++++++++++++++++++++++++++++----
 cpu-miner.c |  10 ++---
 util.c      |   4 +-
 3 files changed, 102 insertions(+), 15 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index 5013de7..a0f502b 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -41,7 +41,7 @@ extern bool opt_n_threads;
 extern bool opt_benchmark;
 extern int device_map[8];
 
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+uint32_t crc32(const uint32_t *buf, size_t size);
 
 __constant__
 static uint32_t __align__(32) c_Target[8];
@@ -51,9 +51,16 @@ static uint32_t __align__(32) c_data[20];
 
 static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];
-
 static uint32_t extra_results[2] = { MAXU, MAXU };
 
+#define USE_CACHE 1
+#if USE_CACHE
+__device__
+static uint32_t cache[8];
+__device__
+static uint32_t prevsum = 0;
+#endif
+
 /* prefer uint32_t to prevent size conversions = speed +5/10 % */
 __constant__
 static uint32_t __align__(32) c_sigma[16][16];
@@ -193,7 +200,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, in
 }
 
 __global__
-void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, const int blakerounds)
+void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, const int blakerounds, const int crcsum)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -202,11 +209,27 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 		uint32_t h[8];
 
 		#pragma unroll
-		for(int i=0; i<8; i++)
+		for(int i=0; i<8; i++) {
 			h[i] = c_IV256[i];
+		}
 
+#if !USE_CACHE
 		blake256_compress(h, c_data, 512, blakerounds);
-
+#else
+		if (crcsum != prevsum) {
+			prevsum = crcsum;
+			blake256_compress(h, c_data, 512, blakerounds);
+			#pragma unroll
+			for(int i=0; i<8; i++) {
+				cache[i] = h[i];
+			}
+		} else {
+			#pragma unroll
+			for(int i=0; i<8; i++) {
+				h[i] = cache[i];
+			}
+		}
+#endif
 		// ------ Close: Bytes 64 to 80 ------ 
 
 		uint32_t ending[4];
@@ -238,7 +261,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 }
 
 __host__
-uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, const int blakerounds)
+uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, const int blakerounds, const uint32_t crcsum)
 {
 	const int threadsperblock = TPB;
 	uint32_t result = MAXU;
@@ -251,7 +274,7 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
 	if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
 		return result;
 
-	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id], blakerounds);
+	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id], blakerounds, crcsum);
 	cudaDeviceSynchronize();
 	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		cudaThreadSynchronize();
@@ -277,6 +300,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 	const uint32_t first_nonce = pdata[19];
 	static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 	uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce);
+	uint32_t crcsum = MAXU;
 	int rc = 0;
 
 	if (extra_results[0] != MAXU) {
@@ -306,10 +330,13 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 		applog(LOG_DEBUG, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce);
 
 	blake256_cpu_setBlock_80(pdata, ptarget);
+#if USE_CACHE
+	crcsum = crc32(pdata, 64);
+#endif
 
 	do {
 		// GPU HASH
-		uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], blakerounds);
+		uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], blakerounds, crcsum);
 		if (foundNonce != MAXU)
 		{
 			uint32_t endiandata[20];
@@ -370,3 +397,63 @@ exit_scan:
 	cudaDeviceSynchronize();
 	return rc;
 }
+
+static uint32_t crc32_tab[] = {
+	0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
+	0xe963a535, 0x9e6495a3,	0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+	0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
+	0xf3b97148, 0x84be41de,	0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+	0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,	0x14015c4f, 0x63066cd9,
+	0xfa0f3d63, 0x8d080df5,	0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+	0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,	0x35b5a8fa, 0x42b2986c,
+	0xdbbbc9d6, 0xacbcf940,	0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+	0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
+	0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+	0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,	0x76dc4190, 0x01db7106,
+	0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+	0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
+	0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+	0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+	0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+	0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
+	0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+	0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
+	0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+	0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
+	0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+	0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
+	0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+	0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+	0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+	0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
+	0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+	0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
+	0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+	0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
+	0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+	0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
+	0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+	0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+	0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+	0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
+	0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+	0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
+	0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+	0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
+	0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+	0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
+};
+
+uint32_t crc32(const uint32_t *buf, size_t size)
+{
+	const uint8_t *p;
+	uint32_t crc = 0;
+
+	p = (uint8_t *) buf;
+	crc = crc ^ ~0U;
+
+	while (size--)
+		crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
+
+	return crc ^ ~0U;
+}
\ No newline at end of file
diff --git a/cpu-miner.c b/cpu-miner.c
index 7f70a6e..6cf40d5 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -582,8 +582,8 @@ static bool get_upstream_work(CURL *curl, struct work *work)
 
 	if (opt_debug && rc) {
 		timeval_subtract(&diff, &tv_end, &tv_start);
-		applog(LOG_DEBUG, "DEBUG: got new work in %d ms",
-		       diff.tv_sec * 1000 + diff.tv_usec / 1000);
+		applog(LOG_DEBUG, "DEBUG: got new work in %u µs",
+		       diff.tv_sec * 1000000 + diff.tv_usec);
 	}
 
 	json_decref(val);
@@ -1345,12 +1345,12 @@ static void *stratum_thread(void *userdata)
 			pthread_mutex_unlock(&g_work_lock);
 			if (stratum.job.clean) {
 				if (!opt_quiet)
-					applog(LOG_BLUE, "%s requested %s job %d restart, block %d", short_url, algo_names[opt_algo],
-						strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
+					applog(LOG_BLUE, "%s send a new %s block %d", short_url, algo_names[opt_algo],
+						stratum.bloc_height);
 				restart_threads();
 				hashlog_purge_old();
 			} else if (!opt_quiet) {
-					applog(LOG_BLUE, "%s send %s job %d, block %d", short_url, algo_names[opt_algo],
+					applog(LOG_BLUE, "%s send job %d for block %d", short_url,
 						strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
 			}
 		}
diff --git a/util.c b/util.c
index b2c0b0f..dfe98ab 100644
--- a/util.c
+++ b/util.c
@@ -557,8 +557,8 @@ bool fulltest(const uint32_t *hash, const uint32_t *target)
 			rc = true;
 			break;
 		}
-		if (hash[0] == target[0]) {
-			applog(LOG_NOTICE, "We found an exact match!");
+		if (hash[1] == target[1]) {
+			applog(LOG_NOTICE, "We found a close match!");
 		}
 	}
 

From 11b04d82ffd7940e8ec2f5324cfcff57b88bbdf3 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 6 Sep 2014 17:16:07 +0200
Subject: [PATCH 36/44] update readme, tag v1.4.1

---
 README.md  | 2 +-
 README.txt | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d3836eb..4ca9fab 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ ccminer
 
 Christian Buchner's &amp; Christian H.'s CUDA miner project
 
-Fork by tpruvot@github with X14,X15,X17,WHIRL and Blake256 support
+Fork by tpruvot@github with X14,X15,X17,WHIRL and Blake256 support (NEOS + BlakeCoin)
 
    BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
    [![tip for next commit](https://tip4commit.com/projects/927.svg)](https://tip4commit.com/github/tpruvot/ccminer)
diff --git a/README.txt b/README.txt
index ea4cd5b..36a3e00 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
 
-ccMiner release 1.4-tpruvot (Sept 04th 2014) - "X17 Blake NEOS"
+ccMiner release 1.4.1-tpruvot (Sep 06th 2014) - "Cached Blake"
 ---------------------------------------------------------------
 
 ***************************************************************
@@ -34,6 +34,7 @@ QuarkCoin family & AnimeCoin
 TalkCoin
 DarkCoin and other X11 coins
 NEOS blake (256 14-rounds)
+BlakeCoin (256 8-rounds)
 
 where some of these coins have a VERY NOTABLE nVidia advantage
 over competing AMD (OpenCL) implementations.

From 42eafcbe85bb4594c9d0d8dacc90aed6312a4c87 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 6 Sep 2014 19:22:35 +0200
Subject: [PATCH 37/44] Put CRC-32 function in a new unit

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
---
 Makefile.am             |   2 +-
 blake32.cu              |  67 ++--------------------
 ccminer.vcxproj         |   1 +
 ccminer.vcxproj.filters |   3 +
 crc32.c                 | 119 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 128 insertions(+), 64 deletions(-)
 create mode 100644 crc32.c

diff --git a/Makefile.am b/Makefile.am
index 875f8b1..bb60063 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -16,7 +16,7 @@ bin_PROGRAMS	= ccminer
 ccminer_SOURCES		= elist.h miner.h compat.h \
 			  compat/inttypes.h compat/stdbool.h compat/unistd.h \
 			  compat/sys/time.h compat/getopt/getopt.h \
-			  cpu-miner.c util.c hefty1.c scrypt.c \
+			  cpu-miner.c util.c crc32.c hefty1.c scrypt.c \
 			  hashlog.cpp \
 			  heavy/heavy.cu \
 			  heavy/cuda_blake512.cu heavy/cuda_blake512.h \
diff --git a/blake32.cu b/blake32.cu
index a0f502b..f5c0f6b 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -15,6 +15,9 @@ extern "C" {
 /* threads per block */
 #define TPB 128
 
+/* crc32.c */
+extern "C" uint32_t crc32_u32t(const uint32_t *buf, size_t size);
+
 extern "C" int blake256_rounds = 14;
 
 /* hash by cpu with blake 256 */
@@ -41,8 +44,6 @@ extern bool opt_n_threads;
 extern bool opt_benchmark;
 extern int device_map[8];
 
-uint32_t crc32(const uint32_t *buf, size_t size);
-
 __constant__
 static uint32_t __align__(32) c_Target[8];
 
@@ -331,7 +332,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 
 	blake256_cpu_setBlock_80(pdata, ptarget);
 #if USE_CACHE
-	crcsum = crc32(pdata, 64);
+	crcsum = crc32_u32t(pdata, 64);
 #endif
 
 	do {
@@ -397,63 +398,3 @@ exit_scan:
 	cudaDeviceSynchronize();
 	return rc;
 }
-
-static uint32_t crc32_tab[] = {
-	0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
-	0xe963a535, 0x9e6495a3,	0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
-	0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
-	0xf3b97148, 0x84be41de,	0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
-	0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,	0x14015c4f, 0x63066cd9,
-	0xfa0f3d63, 0x8d080df5,	0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
-	0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,	0x35b5a8fa, 0x42b2986c,
-	0xdbbbc9d6, 0xacbcf940,	0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
-	0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
-	0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
-	0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,	0x76dc4190, 0x01db7106,
-	0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
-	0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
-	0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
-	0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
-	0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
-	0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
-	0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
-	0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
-	0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
-	0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
-	0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
-	0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
-	0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
-	0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
-	0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
-	0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
-	0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
-	0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
-	0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
-	0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
-	0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
-	0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
-	0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
-	0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
-	0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
-	0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
-	0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
-	0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
-	0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
-	0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
-	0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
-	0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
-};
-
-uint32_t crc32(const uint32_t *buf, size_t size)
-{
-	const uint8_t *p;
-	uint32_t crc = 0;
-
-	p = (uint8_t *) buf;
-	crc = crc ^ ~0U;
-
-	while (size--)
-		crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
-
-	return crc ^ ~0U;
-}
\ No newline at end of file
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index cb633ad..0ab1d60 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -241,6 +241,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
       <TreatWChar_tAsBuiltInType>false</TreatWChar_tAsBuiltInType>
       <Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
     </ClCompile>
+    <ClCompile Include="crc32.c" />
     <ClCompile Include="fuguecoin.cpp" />
     <ClCompile Include="groestlcoin.cpp" />
     <ClCompile Include="hashlog.cpp" />
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index 93e331c..bc990e0 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -96,6 +96,9 @@
     <ClCompile Include="cpu-miner.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="crc32.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="hefty1.c">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/crc32.c b/crc32.c
new file mode 100644
index 0000000..f036bcb
--- /dev/null
+++ b/crc32.c
@@ -0,0 +1,119 @@
+/*-
+ *  COPYRIGHT (C) 1986 Gary S. Brown.  You may use this program, or
+ *  code or tables extracted from it, as desired without restriction.
+ *
+ *  First, the polynomial itself and its table of feedback terms.  The
+ *  polynomial is
+ *  X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0
+ *
+ *  Note that we take it "backwards" and put the highest-order term in
+ *  the lowest-order bit.  The X^32 term is "implied"; the LSB is the
+ *  X^31 term, etc.  The X^0 term (usually shown as "+1") results in
+ *  the MSB being 1.
+ *
+ *  Note that the usual hardware shift register implementation, which
+ *  is what we're using (we're merely optimizing it by doing eight-bit
+ *  chunks at a time) shifts bits into the lowest-order term.  In our
+ *  implementation, that means shifting towards the right.  Why do we
+ *  do it this way?  Because the calculated CRC must be transmitted in
+ *  order from highest-order term to lowest-order term.  UARTs transmit
+ *  characters in order from LSB to MSB.  By storing the CRC this way
+ *  we hand it to the UART in the order low-byte to high-byte; the UART
+ *  sends each low-bit to high-bit; and the result is transmission bit
+ *  by bit from highest- to lowest-order term without requiring any bit
+ *  shuffling on our part.  Reception works similarly.
+ *
+ *  The feedback terms table consists of 256, 32-bit entries.  Notes:
+ *
+ *      The table can be generated at runtime if desired; code to do so
+ *      is shown later.  It might not be obvious, but the feedback
+ *      terms simply represent the results of eight shift/xor opera-
+ *      tions for all combinations of data and CRC register values.
+ *
+ *      The values must be right-shifted by eight bits by the "updcrc"
+ *      logic; the shift must be unsigned (bring in zeroes).  On some
+ *      hardware you could probably optimize the shift in assembler by
+ *      using byte-swap instructions.
+ *      Polynomial: $edb88320 (hexadecimal, bit-reversed form).
+ *
+ *
+ * CRC32 code derived from work by Gary S. Brown.
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+
+static uint32_t crc32_tab[] = {
+	0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
+	0xe963a535, 0x9e6495a3,	0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+	0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
+	0xf3b97148, 0x84be41de,	0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+	0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,	0x14015c4f, 0x63066cd9,
+	0xfa0f3d63, 0x8d080df5,	0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+	0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,	0x35b5a8fa, 0x42b2986c,
+	0xdbbbc9d6, 0xacbcf940,	0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+	0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
+	0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+	0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,	0x76dc4190, 0x01db7106,
+	0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+	0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
+	0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+	0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+	0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+	0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
+	0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+	0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
+	0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+	0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
+	0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+	0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
+	0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+	0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+	0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+	0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
+	0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+	0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
+	0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+	0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
+	0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+	0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
+	0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+	0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+	0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+	0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
+	0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+	0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
+	0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+	0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
+	0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+	0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
+};
+
+/* General CRC32: continues from the running value `crc` over `size` bytes of `buf` */
+extern uint32_t crc32(uint32_t crc, const void *buf, size_t size)
+{
+	const uint8_t *p;
+
+	p = buf;
+	crc = crc ^ ~0U;
+
+	while (size--)
+		crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
+
+	return crc ^ ~0U;
+}
+
+/* CRC32 simplified for ccminer: same loop as crc32(), always starting from crc = 0 */
+extern uint32_t crc32_u32t(const uint32_t *buf, size_t size)
+{
+	const uint8_t *p;
+	uint32_t crc = 0;
+
+	p = (uint8_t *) buf;
+	crc = crc ^ ~0U;
+
+	while (size--)
+		crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
+
+	return crc ^ ~0U;
+}

From 95ac1d0f194a36695f60fe2da627a38baa21f38d Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 6 Sep 2014 20:54:41 +0200
Subject: [PATCH 38/44] x11: adapt some blake 256 opts to 512 one

blake512: for the moment 6.2ms vs 7.12 before (+10%)
---
 cuda_nist5.cu                |   9 +-
 quark/cuda_quark_blake512.cu | 156 ++++++++++++++++-------------------
 quark/quarkcoin.cu           |  31 ++-----
 x11/x11.cu                   |  18 ++--
 x15/x14.cu                   |   4 +-
 x15/x15.cu                   |  31 +------
 x17/x17.cu                   |  18 +---
 7 files changed, 98 insertions(+), 169 deletions(-)

diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index 2feb32e..419c1a5 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -5,9 +5,11 @@ extern "C"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
+}
+
 #include "miner.h"
+
 #include "cuda_helper.h"
-}
 
 // aus cpu-miner.c
 extern int device_map[8];
@@ -74,9 +76,6 @@ extern "C" void nist5hash(void *state, const void *input)
     memcpy(state, hash, 32);
 }
 
-
-extern bool opt_benchmark;
-
 extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
     const uint32_t *ptarget, uint32_t max_nonce,
     unsigned long *hashes_done)
@@ -84,7 +83,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 	const uint32_t first_nonce = pdata[19];
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
+		((uint32_t*)ptarget)[7] = 0x00FF;
 
 	const uint32_t Htarg = ptarget[7];
 
diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu
index e3d299d..787b8a0 100644
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@@ -50,59 +50,60 @@ const uint64_t c_u512[16] =
   0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
 };
 
-#define G(a,b,c,d,e)          \
-    v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
-    v[d] = ROTR( v[d] ^ v[a],32);        \
-    v[c] += v[d];           \
-    v[b] = ROTR( v[b] ^ v[c],25);        \
-    v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b];  \
-    v[d] = ROTR( v[d] ^ v[a],16);        \
-    v[c] += v[d];           \
-    v[b] = ROTR( v[b] ^ v[c],11);
-
+#define G(a,b,c,d,x) { \
+	uint32_t idx1 = sigma[i][x]; \
+	uint32_t idx2 = sigma[i][x+1]; \
+	v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \
+	v[d] = ROTR( v[d] ^ v[a], 32); \
+	v[c] += v[d]; \
+	v[b] = ROTR( v[b] ^ v[c], 25); \
+	v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \
+	v[d] = ROTR( v[d] ^ v[a], 16); \
+	v[c] += v[d]; \
+	v[b] = ROTR( v[b] ^ v[c], 11); \
+}
 
 __device__ static
-void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
+void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
 {
     uint64_t v[16], m[16], i;
 
-#pragma unroll 16
-    for( i = 0; i < 16; ++i ) {
-        m[i] = cuda_swab64(block[i]);
-    }
-
-#pragma unroll 8
-    for( i = 0; i < 8; ++i )  v[i] = h[i];
-
-    v[ 8] = u512[0];
-    v[ 9] = u512[1];
-    v[10] = u512[2];
-    v[11] = u512[3];
-    v[12] = u512[4];
-    v[13] = u512[5];
-    v[14] = u512[6];
-    v[15] = u512[7];
-
-    v[12] ^= bits;
-    v[13] ^= bits;
-
-//#pragma unroll 16
-    for( i = 0; i < 16; ++i )
-    {
-        /* column step */
-        G( 0, 4, 8, 12, 0 );
-        G( 1, 5, 9, 13, 2 );
-        G( 2, 6, 10, 14, 4 );
-        G( 3, 7, 11, 15, 6 );
-        /* diagonal step */
-        G( 0, 5, 10, 15, 8 );
-        G( 1, 6, 11, 12, 10 );
-        G( 2, 7, 8, 13, 12 );
-        G( 3, 4, 9, 14, 14 );
-    }
-
-#pragma unroll 16
-    for( i = 0; i < 16; ++i )  h[i % 8] ^= v[i];
+	#pragma unroll 16
+	for( i = 0; i < 16; i++) {
+		m[i] = cuda_swab64(block[i]);
+	}
+
+	#pragma unroll 8
+	for (i = 0; i < 8; i++)
+		v[i] = h[i];
+
+	v[ 8] = u512[0];
+	v[ 9] = u512[1];
+	v[10] = u512[2];
+	v[11] = u512[3];
+	v[12] = u512[4] ^ T0;
+	v[13] = u512[5] ^ T0;
+	v[14] = u512[6];
+	v[15] = u512[7];
+
+	//#pragma unroll 16
+	for( i = 0; i < 16; ++i )
+	{
+		/* column step */
+		G( 0, 4, 8, 12, 0 );
+		G( 1, 5, 9, 13, 2 );
+		G( 2, 6, 10, 14, 4 );
+		G( 3, 7, 11, 15, 6 );
+		/* diagonal step */
+		G( 0, 5, 10, 15, 8 );
+		G( 1, 6, 11, 12, 10 );
+		G( 2, 7, 8, 13, 12 );
+		G( 3, 4, 9, 14, 14 );
+	}
+
+	#pragma unroll 16
+	for( i = 0; i < 16; ++i )
+		h[i % 8] ^= v[i];
 }
 
 __device__ __constant__
@@ -114,7 +115,8 @@ static const uint64_t d_constMem[8] = {
 	0x510e527fade682d1ULL,
 	0x9b05688c2b3e6c1fULL,
 	0x1f83d9abfb41bd6bULL,
-	0x5be0cd19137e2179ULL };
+	0x5be0cd19137e2179ULL
+};
 
 // Hash-Padding
 __device__ __constant__
@@ -126,7 +128,8 @@ static const uint64_t d_constHashPadding[8] = {
 	0,
 	0x0100000000000000ull,
 	0,
-	0x0002000000000000ull };
+	0x0002000000000000ull
+};
 
 __global__ __launch_bounds__(256, 4)
 void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
@@ -145,48 +148,42 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
 	if (thread < threads)
 #endif
 	{
-		uint8_t i;
-		// bestimme den aktuellen Z�hler
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
 		int hashPosition = nounce - startNounce;
 		uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
 
-		// 128 Byte f�r die Message
+		// 128 Bytes
 		uint64_t buf[16];
 
-		// State vorbereiten
+		// State
 		uint64_t h[8];
 		#pragma unroll 8
-		for (i=0;i<8;i++)
+		for (int i=0;i<8;i++)
 			h[i] = d_constMem[i];
 
-		// Message f�r die erste Runde in Register holen
+		// Message for first round
 		#pragma unroll 8
-		for (i=0; i < 8; ++i)
+		for (int i=0; i < 8; ++i)
 			buf[i] = inpHash[i];
 
 		#pragma unroll 8
-		for (i=0; i < 8; i++)
+		for (int i=0; i < 8; i++)
 			buf[i+8] = d_constHashPadding[i];
 
-		// die einzige Hashing-Runde
+		// Ending round
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
 
-#if __CUDA_ARCH__ >= 130
-		// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verf�gbar sind
+#if __CUDA_ARCH__ <= 350
 		uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
 		#pragma unroll 8
-		for (i=0; i < 8; ++i) {
+		for (int i=0; i < 8; i++) {
 			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
 			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
-		// in dieser Version passieren auch ein paar 64 Bit Shifts
 		uint64_t *outHash = &g_hash[8 * hashPosition];
-		#pragma unroll 8
-		for (i=0; i < 8; ++i)
-		{
+		for (int i=0; i < 8; i++) {
 			outHash[i] = cuda_swab64(h[i]);
 		}
 #endif
@@ -198,45 +195,38 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// State vorbereiten
 		uint64_t h[8];
-		// 128 Byte f�r die Message
 		uint64_t buf[16];
-		uint8_t i;
-		// bestimme den aktuellen Z�hler
 		uint32_t nounce = startNounce + thread;
 
 		#pragma unroll 8
-		for(i=0;i<8;i++)
+		for(int i=0; i<8; i++)
 			h[i] = d_constMem[i];
 
 		// Message f�r die erste Runde in Register holen
 		#pragma unroll 16
-		for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
+		for (int i=0; i < 16; ++i)
+			buf[i] = c_PaddedMessage80[i];
 
-		// die Nounce durch die thread-spezifische ersetzen
-		buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
+		// The test Nonce
+		((uint32_t*)buf)[19] = cuda_swab32(nounce);
 
-		// die einzige Hashing-Runde
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 640 );
 
-		// Hash rauslassen
-#if __CUDA_ARCH__ >= 130
-		// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verf�gbar sind
+#if __CUDA_ARCH__ <= 350
 		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
 		#pragma unroll 8
-		for (i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+		for (uint32_t i=0; i < 8; i++) {
+			outHash[2*i]   = cuda_swab32( _HIWORD(h[i]) );
 			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
-		// in dieser Version passieren auch ein paar 64 Bit Shifts
 		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
-		#pragma unroll 8
-		for (i=0; i < 8; ++i) {
+		for (uint32_t i=0; i < 8; i++) {
 			outHash[i] = cuda_swab64( h[i] );
 		}
 #endif
+
 	}
 }
 
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index be6eda8..a905ec4 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -6,12 +6,12 @@ extern "C"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
+}
+
 #include "miner.h"
 
 #include "cuda_helper.h"
-}
 
-// aus cpu-miner.c
 extern int device_map[8];
 
 // Speicher für Input/Output der verketteten Hashfunktionen
@@ -70,76 +70,64 @@ extern "C" void quarkhash(void *state, const void *input)
     unsigned char hash[64];
 
     sph_blake512_init(&ctx_blake);
-    // ZBLAKE;
     sph_blake512 (&ctx_blake, input, 80);
     sph_blake512_close(&ctx_blake, (void*) hash);
     
     sph_bmw512_init(&ctx_bmw);
-    // ZBMW;
     sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
     sph_bmw512_close(&ctx_bmw, (void*) hash);
 
     if (hash[0] & 0x8)
     {
         sph_groestl512_init(&ctx_groestl);
-        // ZGROESTL;
         sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
         sph_groestl512_close(&ctx_groestl, (void*) hash);
     }
     else
     {
         sph_skein512_init(&ctx_skein);
-        // ZSKEIN;
         sph_skein512 (&ctx_skein, (const void*) hash, 64);
         sph_skein512_close(&ctx_skein, (void*) hash);
     }
     
     sph_groestl512_init(&ctx_groestl);
-    // ZGROESTL;
     sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
     sph_groestl512_close(&ctx_groestl, (void*) hash);
 
     sph_jh512_init(&ctx_jh);
-    // ZJH;
     sph_jh512 (&ctx_jh, (const void*) hash, 64);
     sph_jh512_close(&ctx_jh, (void*) hash);
 
     if (hash[0] & 0x8)
     {
         sph_blake512_init(&ctx_blake);
-        // ZBLAKE;
         sph_blake512 (&ctx_blake, (const void*) hash, 64);
         sph_blake512_close(&ctx_blake, (void*) hash);
     }
     else
     {
         sph_bmw512_init(&ctx_bmw);
-        // ZBMW;
         sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
         sph_bmw512_close(&ctx_bmw, (void*) hash);
     }
 
     sph_keccak512_init(&ctx_keccak);
-    // ZKECCAK;
     sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
     sph_keccak512_close(&ctx_keccak, (void*) hash);
 
     sph_skein512_init(&ctx_skein);
-    // SKEIN;
     sph_skein512 (&ctx_skein, (const void*) hash, 64);
     sph_skein512_close(&ctx_skein, (void*) hash);
 
     if (hash[0] & 0x8)
     {
         sph_keccak512_init(&ctx_keccak);
-        // ZKECCAK;
         sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
         sph_keccak512_close(&ctx_keccak, (void*) hash);
     }
     else
     {
         sph_jh512_init(&ctx_jh);
-        // ZJH;
         sph_jh512 (&ctx_jh, (const void*) hash, 64);
         sph_jh512_close(&ctx_jh, (void*) hash);
     }
@@ -147,23 +135,17 @@ extern "C" void quarkhash(void *state, const void *input)
     memcpy(state, hash, 32);
 }
 
-
-extern bool opt_benchmark;
-
 extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
     const uint32_t *ptarget, uint32_t max_nonce,
     unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
+	const int throughput = 256*4096; // 100;
+	static bool init[8] = {0,0,0,0,0,0,0,0};
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
-
-	const uint32_t Htarg = ptarget[7];
+		((uint32_t*)ptarget)[7] = 0x00FF;
 
-	const int throughput = 256*4096; // 100;
-
-	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
@@ -252,11 +234,12 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			quarkhash(vhash64, endiandata);
 
-			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 
 				pdata[19] = foundNonce;
 				*hashes_done = (foundNonce - first_nonce + 1)/2;
diff --git a/x11/x11.cu b/x11/x11.cu
index 3c18030..dc2f97f 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -21,10 +21,9 @@ extern "C"
 #include <memory.h>
 }
 
-// aus cpu-miner.c
+// in cpu-miner.c
 extern int device_map[8];
 
-// Speicher für Input/Output der verketteten Hashfunktionen
 static uint32_t *d_hash[8];
 
 extern void quark_blake512_cpu_init(int thr_id, int threads);
@@ -140,22 +139,17 @@ extern "C" void x11hash(void *output, const void *input)
 }
 
 
-extern bool opt_benchmark;
-
 extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
     const uint32_t *ptarget, uint32_t max_nonce,
     unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
+	const int throughput = 256*256*8;
+	static bool init[8] = {0,0,0,0,0,0,0,0};
 
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 
-	const uint32_t Htarg = ptarget[7];
-
-	const int throughput = 256*256*8;
-
-	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
 	{
 		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
@@ -186,8 +180,10 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
-		uint32_t foundNonce;
+		const uint32_t Htarg = ptarget[7];
+
 		int order = 0;
+		uint32_t foundNonce;
 
 		// Hash with CUDA
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@@ -204,7 +200,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 
 		// Scan nach Gewinner Hashes auf der GPU
 		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		if  (foundNonce != 0xffffffff)
+		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
diff --git a/x15/x14.cu b/x15/x14.cu
index 0b56584..b3519cd 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -20,11 +20,11 @@ extern "C" {
 #include "sph/sph_hamsi.h"
 #include "sph/sph_fugue.h"
 #include "sph/sph_shabal.h"
+}
 
 #include "miner.h"
 
 #include "cuda_helper.h"
-}
 
 // from cpu-miner.c
 extern int device_map[8];
@@ -167,8 +167,6 @@ extern "C" void x14hash(void *output, const void *input)
 }
 
 
-extern bool opt_benchmark;
-
 extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
diff --git a/x15/x15.cu b/x15/x15.cu
index 50e2080..faea354 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -21,14 +21,11 @@ extern "C" {
 #include "sph/sph_fugue.h"
 #include "sph/sph_shabal.h"
 #include "sph/sph_whirlpool.h"
+}
 
 #include "miner.h"
 
 #include "cuda_helper.h"
-}
-
-// to test gpu hash on a null buffer
-#define NULLTEST 0
 
 // from cpu-miner.c
 extern int device_map[8];
@@ -92,8 +89,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
 											uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
 
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
 // X15 CPU Hash function
 extern "C" void x15hash(void *output, const void *input)
 {
@@ -181,17 +176,6 @@ extern "C" void x15hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
 
-#if NULLTEST
-static void print_hash(unsigned char *hash)
-{
-	for (int i=0; i < 32; i += 4) {
-		printf("%02x%02x%02x%02x ", hash[i], hash[i+1], hash[i+2], hash[i+3]);
-	}
-}
-#endif
-
-extern bool opt_benchmark;
-
 extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
@@ -203,12 +187,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 	uint32_t Htarg = ptarget[7];
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
-
-#if NULLTEST
-	for (int k=0; k < 20; k++)
-		pdata[k] = 0;
-#endif
+		((uint32_t*)ptarget)[7] = Htarg = 0x00FF;
 
 	if (!init[thr_id])
 	{
@@ -259,12 +238,6 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
-#if NULLTEST
-		uint32_t buf[8]; memset(buf, 0, sizeof buf);
-		CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
-		CUDA_SAFE_CALL(cudaThreadSynchronize());
-		print_hash((unsigned char*)buf); printf("\n");
-#endif
 		/* Scan with GPU */
 		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
diff --git a/x17/x17.cu b/x17/x17.cu
index ffcd57a..65d2259 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -26,17 +26,15 @@ extern "C"
 
 #include "sph/sph_sha2.h"
 #include "sph/sph_haval.h"
+}
 
 #include "miner.h"
-}
+#include "cuda_helper.h"
 
 static uint32_t *d_hash[8];
 
-
-// cpu-miner.c
+// in cpu-miner.c
 extern int device_map[8];
-extern bool opt_benchmark;
-
 
 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
@@ -204,20 +202,12 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
-
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
-
 	const int throughput = 256*256*8;
-
-		if (opt_benchmark)
-				((uint32_t*)ptarget)[7] = 0x0000ff;
-
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	uint32_t Htarg = ptarget[7];
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
+		((uint32_t*)ptarget)[7] = Htarg = 0x00FF;
 
 	if (!init[thr_id])
 	{

From 402e4168534e4eaab97d024369ea7516a0153d56 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 6 Sep 2014 21:54:46 +0200
Subject: [PATCH 39/44] Add pentablake algo (-a penta)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
---
 Makefile.am             |   2 +-
 blake32.cu              |  10 +-
 ccminer.vcxproj         |   6 +
 ccminer.vcxproj.filters |   3 +
 cpu-miner.c             |  10 +-
 miner.h                 |   6 +
 pentablake.cu           | 600 ++++++++++++++++++++++++++++++++++++++++
 util.c                  |   4 +
 8 files changed, 637 insertions(+), 4 deletions(-)
 create mode 100644 pentablake.cu

diff --git a/Makefile.am b/Makefile.am
index bb60063..c2fa11d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -33,7 +33,7 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
 			  quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \
 			  quark/cuda_quark_compactionTest.cu \
-			  cuda_nist5.cu blake32.cu \
+			  cuda_nist5.cu blake32.cu pentablake.cu \
 			  sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
diff --git a/blake32.cu b/blake32.cu
index f5c0f6b..96a78a0 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -362,6 +362,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 					blake256hash(vhashcpu, endiandata, blakerounds);
 					if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
 						applog(LOG_NOTICE, "GPU found more than one result yippee!");
+						rc = 2;
 					} else {
 						extra_results[0] = MAXU;
 					}
@@ -380,9 +381,14 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 			}
 		}
 
+		if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
+			pdata[19] = max_nonce - first_nonce + 1;
+			break;
+		}
+
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
 exit_scan:
 	*hashes_done = pdata[19] - first_nonce + 1;
@@ -395,6 +401,6 @@ exit_scan:
 	}
 #endif
 	// wait proper end of all threads
-	cudaDeviceSynchronize();
+	//cudaDeviceSynchronize();
 	return rc;
 }
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 0ab1d60..06ba665 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -405,6 +405,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
       <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
       <FastMath>true</FastMath>
     </CudaCompile>
+    <CudaCompile Include="pentablake.cu">
+      <MaxRegCount>80</MaxRegCount>
+      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-O2 -dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
+      <FastMath>true</FastMath>
+    </CudaCompile>
     <CudaCompile Include="quark\animecoin.cu">
       <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index bc990e0..065e196 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -445,5 +445,8 @@
     <CudaCompile Include="blake32.cu">
       <Filter>Source Files\CUDA</Filter>
     </CudaCompile>
+    <CudaCompile Include="pentablake.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/cpu-miner.c b/cpu-miner.c
index 6cf40d5..85d4d2b 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -136,8 +136,9 @@ typedef enum {
 	ALGO_JACKPOT,
 	ALGO_MJOLLNIR,		/* Mjollnir hash */
 	ALGO_MYR_GR,
-	ALGO_QUARK,
 	ALGO_NIST5,
+	ALGO_PENTABLAKE,
+	ALGO_QUARK,
 	ALGO_WHC,
 	ALGO_X11,
 	ALGO_X13,
@@ -159,6 +160,7 @@ static const char *algo_names[] = {
 	"mjollnir",
 	"myr-gr",
 	"nist5",
+	"penta",
 	"quark",
 	"whirl",
 	"x11",
@@ -242,6 +244,7 @@ Options:\n\
                         mjollnir  Mjollnircoin hash\n\
                         myr-gr    Myriad-Groestl hash\n\
                         nist5     NIST5 (TalkCoin) hash\n\
+                        penta     Pentablake hash (5x Blake 512)\n\
                         quark     Quark hash\n\
                         whirl     Whirlcoin (old whirlpool)\n\
                         x11       X11 (DarkCoin) hash\n\
@@ -1089,6 +1092,11 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 
+		case ALGO_PENTABLAKE:
+			rc = scanhash_pentablake(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done);
+			break;
+
 		case ALGO_WHC:
 			rc = scanhash_whc(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
diff --git a/miner.h b/miner.h
index d9951e9..e33bfff 100644
--- a/miner.h
+++ b/miner.h
@@ -249,6 +249,10 @@ extern int scanhash_nist5(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 
+extern int scanhash_pentablake(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done);
+
 extern int scanhash_whc(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@@ -284,6 +288,7 @@ struct work_restart {
 	char			padding[128 - sizeof(unsigned long)];
 };
 
+extern bool opt_benchmark;
 extern bool opt_debug;
 extern bool opt_debug_rpc;
 extern bool opt_quiet;
@@ -428,6 +433,7 @@ unsigned int jackpothash(void *state, const void *input);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
 void nist5hash(void *state, const void *input);
+void pentablakehash(void *output, const void *input);
 void quarkhash(void *state, const void *input);
 void wcoinhash(void *state, const void *input);
 void x11hash(void *output, const void *input);
diff --git a/pentablake.cu b/pentablake.cu
new file mode 100644
index 0000000..9958e53
--- /dev/null
+++ b/pentablake.cu
@@ -0,0 +1,600 @@
+/**
+ * Penta Blake-512 Cuda Kernel (Tested on SM 5.0)
+ *
+ * Tanguy Pruvot - Aug. 2014
+ */
+
+#include "miner.h"
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include <stdint.h>
+#include <memory.h>
+}
+
+/* threads per block */
+#define TPB 192
+
+/* CPU reference: Blake-512 applied five times to the 80-byte input; output gets the first 32 bytes */
+extern "C" void pentablakehash(void *output, const void *input)
+{
+	unsigned char hash[128];
+	unsigned char *hashB = hash + 64; /* upper half: the passes alternate between hash and hashB */
+	sph_blake512_context ctx;
+
+	sph_blake512_init(&ctx); /* the only explicit init: ctx is reused as-is after every close */
+	sph_blake512(&ctx, input, 80);
+	sph_blake512_close(&ctx, hash);
+
+	sph_blake512(&ctx, hash, 64);
+	sph_blake512_close(&ctx, hashB);
+
+	sph_blake512(&ctx, hashB, 64);
+	sph_blake512_close(&ctx, hash);
+
+	sph_blake512(&ctx, hash, 64);
+	sph_blake512_close(&ctx, hashB);
+
+	sph_blake512(&ctx, hashB, 64);
+	sph_blake512_close(&ctx, hash);
+
+	memcpy(output, hash, 32);
+}
+
+#include "cuda_helper.h"
+
+#define MAXU 0xffffffffU
+
+// in cpu-miner.c
+extern int opt_n_threads; // a thread count (compared with 1 below), so it must not be declared bool
+extern bool opt_benchmark;
+extern int device_map[8];
+
+__constant__
+static uint32_t __align__(32) c_Target[8]; /* 32-byte target, uploaded by pentablake_cpu_setBlock_80() */
+
+__constant__
+static uint64_t __align__(32) c_data[32]; /* input block; pentablake_cpu_setBlock_80() fills the first 128 bytes */
+
+static uint32_t *d_hash[8]; /* per-thr_id device buffer, 64 bytes per hash */
+static uint32_t *d_resNounce[8]; /* per-thr_id device result pair, written by the check kernel */
+static uint32_t *h_resNounce[8]; /* per-thr_id host copy of d_resNounce (cudaMallocHost) */
+static uint32_t extra_results[2] = { MAXU, MAXU }; /* NOTE(review): not indexed by thr_id, and only [0] is used */
+
+/* prefer uint32_t to prevent size conversions = speed +5/10 %; rows 10-15 repeat rows 0-5 */
+__constant__
+static uint32_t __align__(32) c_sigma[16][16]; /* device copy, uploaded from host_sigma by pentablake_cpu_setBlock_80() */
+const uint32_t host_sigma[16][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
+};
+
+__device__ __constant__
+static const uint64_t __align__(32) c_IV512[8] = { /* initial state copied into h[] by both hash kernels */
+	0x6a09e667f3bcc908ULL,
+	0xbb67ae8584caa73bULL,
+	0x3c6ef372fe94f82bULL,
+	0xa54ff53a5f1d36f1ULL,
+	0x510e527fade682d1ULL,
+	0x9b05688c2b3e6c1fULL,
+	0x1f83d9abfb41bd6bULL,
+	0x5be0cd19137e2179ULL
+};
+
+__device__ __constant__
+const uint64_t c_u512[16] = /* constants XORed with message words in G(); the first 8 also seed v[8..15] */
+{
+	0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
+	0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
+	0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
+	0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
+	0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
+	0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
+	0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
+	0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
+};
+
+#define G(a,b,c,d,x) { /* mixes v[a],v[b],v[c],v[d] using message words c_sigma[i][x] and c_sigma[i][x+1]; needs i, m[], v[] in scope */ \
+	uint32_t idx1 = c_sigma[i][x]; \
+	uint32_t idx2 = c_sigma[i][x+1]; \
+	v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \
+	v[d] = ROTR64(v[d] ^ v[a], 32); \
+	v[c] += v[d]; \
+	v[b] = ROTR64(v[b] ^ v[c], 25); \
+	v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \
+	v[d] = ROTR64(v[d] ^ v[a], 16); \
+	v[c] += v[d]; \
+	v[b] = ROTR64(v[b] ^ v[c], 11); \
+}
+
+// Hash-Padding: constant second half (buf[8..15]) of the 128-byte block built in pentablake_gpu_hash_64
+__device__ __constant__
+static const uint64_t d_constHashPadding[8] = {
+	0x0000000000000080ull,
+	0,
+	0,
+	0,
+	0,
+	0x0100000000000000ull,
+	0,
+	0x0002000000000000ull
+};
+
+#if 0 /* not compiled: alternate pentablake_compress / pentablake_gpu_hash_80 (the live versions follow the #endif) */
+
+__device__ __constant__
+static const uint64_t __align__(32) c_Padding[16] = {
+	0, 0, 0, 0,
+	0x80000000ULL, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 1, 0, 640,
+};
+
+__device__ static
+void pentablake_compress(uint64_t *h, const uint64_t *block, const uint32_t T0)
+{
+	uint64_t v[16], m[16];
+
+	m[0] = block[0];
+	m[1] = block[1];
+	m[2] = block[2];
+	m[3] = block[3];
+
+	for (uint32_t i = 4; i < 16; i++) {
+		m[i] = (T0 == 0x200) ? block[i] : c_Padding[i];
+	}
+
+	//#pragma unroll 8
+	for(uint32_t i = 0; i < 8; i++)
+		v[i] = h[i];
+
+	v[ 8] = c_u512[0];
+	v[ 9] = c_u512[1];
+	v[10] = c_u512[2];
+	v[11] = c_u512[3];
+
+	v[12] = xor1(c_u512[4], T0);
+	v[13] = xor1(c_u512[5], T0);
+	v[14] = c_u512[6];
+	v[15] = c_u512[7];
+
+	for (uint32_t i = 0; i < 16; i++) {
+		/* column step */
+		G(0, 4, 0x8, 0xC, 0x0);
+		G(1, 5, 0x9, 0xD, 0x2);
+		G(2, 6, 0xA, 0xE, 0x4);
+		G(3, 7, 0xB, 0xF, 0x6);
+		/* diagonal step */
+		G(0, 5, 0xA, 0xF, 0x8);
+		G(1, 6, 0xB, 0xC, 0xA);
+		G(2, 7, 0x8, 0xD, 0xC);
+		G(3, 4, 0x9, 0xE, 0xE);
+	}
+
+	//#pragma unroll 16
+	for (uint32_t i = 0; i < 16; i++) {
+		uint32_t j = i % 8;
+		h[j] ^= v[i];
+	}
+}
+
+__global__ /* NOTE(review): this disabled kernel computes h[] but never writes to resNounce */
+void pentablake_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t nounce = startNounce + thread;
+		uint64_t h[8];
+
+		#pragma unroll
+		for(int i=0; i<8; i++) {
+			h[i] = c_IV512[i];
+		}
+
+		uint64_t ending[4];
+		ending[0] = c_data[16];
+		ending[1] = c_data[17];
+		ending[2] = c_data[18];
+		ending[3] = nounce; /* our tested value */
+
+		pentablake_compress(h, ending, 640);
+
+		// -----------------------------------
+
+		for (int r = 0; r < 4; r++) {
+			uint64_t data[8];
+			for (int i = 0; i < 7; i++) {
+				data[i] = h[i];
+			}
+			pentablake_compress(h, data, 512); /* todo: use h,h when ok*/
+		}
+	}
+}
+#endif /* disabled alternate implementation */
+
+__device__ static /* compresses one 128-byte block into h[8]; callers pass T0 = 640 (80-byte input) or 512 (64-byte input) */
+void pentablake_compress(uint64_t *h, const uint64_t *block, const uint64_t T0)
+{
+	uint64_t v[16], m[16], i; /* i is the round index that the G() macro reads */
+
+	#pragma unroll 16
+	for(i = 0; i < 16; i++) {
+		m[i] = cuda_swab64(block[i]); /* message words are byte-swapped before use */
+	}
+
+	#pragma unroll 8
+	for (i = 0; i < 8; i++)
+		v[i] = h[i]; /* lower half of the working state = current chaining value */
+
+	v[ 8] = c_u512[0];
+	v[ 9] = c_u512[1];
+	v[10] = c_u512[2];
+	v[11] = c_u512[3];
+	v[12] = c_u512[4] ^ T0; /* T0 is mixed into both v[12] and v[13] */
+	v[13] = c_u512[5] ^ T0;
+	v[14] = c_u512[6];
+	v[15] = c_u512[7];
+
+	//#pragma unroll 16
+	for( i = 0; i < 16; i++) /* 16 rounds, one c_sigma row per round */
+	{
+		/* column step */
+		G(0, 4, 0x8, 0xC, 0x0);
+		G(1, 5, 0x9, 0xD, 0x2);
+		G(2, 6, 0xA, 0xE, 0x4);
+		G(3, 7, 0xB, 0xF, 0x6);
+		/* diagonal step */
+		G(0, 5, 0xA, 0xF, 0x8);
+		G(1, 6, 0xB, 0xC, 0xA);
+		G(2, 7, 0x8, 0xD, 0xC);
+		G(3, 4, 0x9, 0xE, 0xE);
+	}
+
+	//#pragma unroll 16
+	for (i = 0; i < 16; i++) { /* fold: h[j] ^= v[j] ^ v[j+8] */
+		uint32_t idx = i % 8;
+		h[idx] ^= v[i];
+	}
+}
+
+__global__ /* first pass: one thread per nonce hashes the block held in c_data and stores 64 bytes at outputHash */
+void pentablake_gpu_hash_80(int threads, const uint32_t startNounce, void *outputHash)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads) /* the grid is rounded up, so the tail threads do nothing */
+	{
+		uint64_t h[8];
+		uint64_t buf[16];
+		uint32_t nounce = startNounce + thread;
+
+		//#pragma unroll 8
+		for(int i=0; i<8; i++)
+			h[i] = c_IV512[i];
+
+		//#pragma unroll 16
+		for (int i=0; i < 16; i++)
+			buf[i] = c_data[i]; /* private copy of the 128-byte block */
+
+		// The test Nonce: 32-bit word 19 of the block, stored byte-swapped
+		((uint32_t*)buf)[19] = cuda_swab32(nounce);
+
+		pentablake_compress(h, buf, 640ULL);
+
+#if __CUDA_ARCH__ < 300
+		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; /* 16 x 32-bit words per hash */
+		#pragma unroll 8
+		for (uint32_t i=0; i < 8; i++) {
+			outHash[2*i]   = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
+		}
+#else
+		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; /* 8 x 64-bit words per hash */
+		for (uint32_t i=0; i < 8; i++) {
+			outHash[i] = cuda_swab64( h[i] );
+		}
+#endif
+
+	}
+}
+
+__host__
+void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce, uint32_t *d_outputHash, int order)
+{
+	/* First pass: launches the 80-byte kernel for `threads` consecutive nonces.
+	 * thr_id and order are accepted but not used here. */
+	const int tpb = TPB;
+	const int nblocks = (threads + tpb - 1) / tpb; /* round up */
+	const dim3 grid(nblocks), block(tpb);
+
+	pentablake_gpu_hash_80 <<<grid, block, 0>>> (threads, startNounce, d_outputHash);
+
+	/* blocking wait (the MyStreamSynchronize(NULL, order, thr_id) variant is not used) */
+	cudaDeviceSynchronize();
+}
+
+
+__global__ /* follow-up pass: each thread re-hashes its own 64-byte entry of g_hash in place; startNounce is unused */
+void pentablake_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	if (thread < threads)
+	{
+		uint64_t *inpHash = &g_hash[thread<<3]; // hashPosition * 8
+		uint64_t buf[16]; // 128 Bytes
+		uint64_t h[8]; // State
+
+		#pragma unroll 8
+		for (int i=0; i<8; i++)
+			h[i] = c_IV512[i];
+
+		// First half of the block: the previous 64-byte hash
+		#pragma unroll 8
+		for (int i=0; i < 8; ++i)
+			buf[i] = inpHash[i];
+
+		#pragma unroll 8
+		for (int i=0; i < 8; i++)
+			buf[i+8] = d_constHashPadding[i]; /* second half: constant padding words */
+
+		// Single compression of the padded block (T0 = 512)
+		pentablake_compress(h, buf, 512);
+
+#if __CUDA_ARCH__ < 300
+		uint32_t *outHash = (uint32_t*)&g_hash[thread<<3];
+		#pragma unroll 8
+		for (int i=0; i < 8; i++) {
+			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
+		}
+#else
+		uint64_t *outHash = &g_hash[thread<<3]; /* same slot the input was read from */
+		for (int i=0; i < 8; i++) {
+			outHash[i] = cuda_swab64(h[i]);
+		}
+#endif
+	}
+}
+
+__host__
+void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
+{
+	/* Follow-up pass: re-hashes, in place, the `threads` 64-byte entries of d_outputHash.
+	 * thr_id and order are accepted but not used here. */
+	const int tpb = TPB;
+	const int nblocks = (threads + tpb - 1) / tpb; /* round up */
+	const dim3 grid(nblocks), block(tpb);
+	uint64_t *d_hash64 = (uint64_t*) d_outputHash;
+
+	pentablake_gpu_hash_64 <<<grid, block, 0>>> (threads, startNounce, d_hash64);
+	/* blocking wait (the MyStreamSynchronize(NULL, order, thr_id) variant is not used) */
+	cudaDeviceSynchronize();
+}
+
+#if 0 /* not compiled: alternate launcher that reads the result pair back instead of storing hashes */
+
+__host__
+uint32_t pentablake_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce)
+{
+	const int threadsperblock = TPB;
+	uint32_t result = MAXU;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	size_t shared_size = 0;
+
+	/* Check error on Ctrl+C or kill to prevent segfaults on exit */
+	if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
+		return result;
+
+	pentablake_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id]);
+	cudaDeviceSynchronize();
+	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
+		cudaThreadSynchronize();
+		result = h_resNounce[thr_id][0];
+		extra_results[0] = h_resNounce[thr_id][1];
+	}
+	return result;
+}
+#endif /* disabled alternate launcher */
+
+__global__ /* tests each thread's hash (words 0..7) against c_Target; matching nonces are reported in resNounce[0..1] */
+void pentablake_gpu_check_hash(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = startNounce + thread;
+		uint32_t *inpHash = &g_hash[thread<<4]; /* 16 words (64 bytes) per hash */
+		uint32_t h[8];
+
+		#pragma unroll 8
+		for (int i=0; i < 8; i++)
+			h[i] = inpHash[i];
+
+		for (int i = 7; i >= 0; i--) { /* compare from word 7 down to word 0 */
+			uint32_t hash = h[i]; // cuda_swab32(h[i]);
+			if (hash > c_Target[i]) {
+				return; /* above the target: not a result */
+			}
+			if (hash < c_Target[i]) {
+				break; /* strictly below: accepted without looking further */
+			}
+		}
+
+		/* keep the smallest nounce in [0] and the next smallest in [1] (host presets both to MAXU).
+		 * Atomics are required: several threads can match in the same launch, and a plain
+		 * read-modify-write of the pair could drop or duplicate a result.
+		 * While [0] is still MAXU, prev is MAXU, so the second atomicMin changes nothing. */
+		uint32_t prev = atomicMin(&resNounce[0], nounce);
+		uint32_t extra = (prev > nounce) ? prev : nounce;
+		atomicMin(&resNounce[1], extra);
+	}
+}
+
+__host__ static
+uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
+{
+	/* Runs the target check over `threads` hashes and returns the nonce reported in
+	 * slot 0, or MAXU when nothing matched or a CUDA call failed. */
+	const size_t res_bytes = 2*sizeof(uint32_t);
+	const int tpb = TPB;
+	const dim3 grid((threads + tpb-1)/tpb), block(tpb);
+
+	/* Check error on Ctrl+C or kill to prevent segfaults on exit */
+	if (cudaMemset(d_resNounce[thr_id], 0xff, res_bytes) != cudaSuccess)
+		return MAXU;
+
+	pentablake_gpu_check_hash <<<grid, block, 0>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+	if (cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], res_bytes, cudaMemcpyDeviceToHost) != cudaSuccess)
+		return MAXU;
+
+	cudaThreadSynchronize();
+	/* slot 1 is parked in extra_results[0] for the caller to examine */
+	extra_results[0] = h_resNounce[thr_id][1];
+	return h_resNounce[thr_id][0];
+}
+
+
+__host__
+void pentablake_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget)
+{
+	/* Builds one 128-byte block: the 80 bytes of pdata, then zero fill with the marker bytes below */
+	uint8_t block[128] = { 0 };
+	memcpy(block, pdata, 80);
+
+	block[80] = 0x80;
+	block[111] = 1;
+	block[126] = 0x02; /* with block[127]: 0x0280 = 640 = 80 * 8 */
+	block[127] = 0x80;
+
+	/* upload the block, the permutation table and the 32-byte target to constant memory */
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, block, sizeof(block), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice));
+}
+
+extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
+{
+	/* Scans nonces from pdata[19] towards max_nonce; GPU hits are re-hashed on the CPU. Returns 0 (none),
+	 * 1 (nonce left in pdata[19]) or 2 (plus a second one kept in extra_results[0] for the next call). */
+	const uint32_t first_nonce = pdata[19];
+	static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+	const uint32_t max_throughput = 128 * 2560; /* most nonces per launch; d_hash is sized for this */
+	uint32_t throughput = min(max_throughput, max_nonce - first_nonce);
+	uint32_t endiandata[20];
+	int rc = 0;
+
+	if (extra_results[0] != MAXU) {
+		// possible extra result found in previous call
+		if (first_nonce <= extra_results[0] && max_nonce >= extra_results[0]) {
+			pdata[19] = extra_results[0];
+			*hashes_done = pdata[19] - first_nonce + 1;
+			extra_results[0] = MAXU;
+			rc = 1;
+			goto exit_scan;
+		}
+	}
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x000F;
+
+	if (!init[thr_id]) {
+		if (opt_n_threads > 1) {
+			CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
+		}
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * max_throughput)); /* not this call's throughput: it may be smaller */
+		CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
+		CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
+		init[thr_id] = true;
+	}
+
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	pentablake_cpu_setBlock_80(endiandata, ptarget);
+
+	do {
+		int order = 0;
+		// GPU HASH: one 80-byte pass, then four 64-byte passes in place
+		pentablake_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
+		uint32_t foundNonce = pentablake_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		if (foundNonce != MAXU)
+		{
+			uint32_t vhashcpu[8];
+			uint32_t Htarg = ptarget[7];
+			be32enc(&endiandata[19], foundNonce);
+			pentablakehash(vhashcpu, endiandata);
+
+			if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
+			{
+				pdata[19] = foundNonce;
+				rc = 1;
+
+				// Rare but possible if the throughput is big
+				be32enc(&endiandata[19], extra_results[0]);
+				pentablakehash(vhashcpu, endiandata);
+				if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
+					applog(LOG_NOTICE, "GPU found more than one result yippee!");
+					rc = 2;
+				} else {
+					extra_results[0] = MAXU;
+				}
+				goto exit_scan;
+			}
+			else if (vhashcpu[7] > Htarg) {
+				applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg);
+			}
+			else if (vhashcpu[6] > ptarget[6]) {
+				applog(LOG_WARNING, "GPU #%d: hash[6] for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[6], ptarget[6]);
+			}
+			else {
+				applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
+			}
+		}
+
+		/* stop at max_nonce: the 64-bit sum keeps pdata[19] from overshooting the range or wrapping past 2^32 */
+		if ((uint64_t) pdata[19] + throughput >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+		pdata[19] += throughput;
+	} while (!work_restart[thr_id].restart);
+
+exit_scan:
+	*hashes_done = pdata[19] - first_nonce + 1;
+#if 0
+	/* reset the device to allow multiple instances
+	 * could be made in cpu-miner... check later if required */
+	if (opt_n_threads == 1) {
+		CUDA_SAFE_CALL(cudaDeviceReset());
+		init[thr_id] = false;
+	}
+#endif
+
+	cudaDeviceSynchronize();
+	return rc;
+}
diff --git a/util.c b/util.c
index dfe98ab..04209e0 100644
--- a/util.c
+++ b/util.c
@@ -1457,6 +1457,10 @@ void print_hash_tests(void)
 	nist5hash(&hash[0], &buf[0]);
 	printpfx("nist5", hash);
 
+	memset(hash, 0, sizeof hash);
+	pentablakehash(&hash[0], &buf[0]);
+	printpfx("pentablake", hash);
+
 	memset(hash, 0, sizeof hash);
 	quarkhash(&hash[0], &buf[0]);
 	printpfx("quark", hash);

From 3ed36f285b17f07cd758d063178443593a095149 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 8 Sep 2014 09:31:00 +0200
Subject: [PATCH 40/44] try to prevent gpu pauses

---
 cpu-miner.c | 34 ++++++++++++++++++++++++++--------
 util.c      |  7 ++++---
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index 85d4d2b..2b04c5c 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -460,11 +460,14 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 	bool rc = false;
 
 	/* pass if the previous hash is not the current previous hash */
+	pthread_mutex_lock(&g_work_lock);
 	if (memcmp(work->data + 1, g_work.data + 1, 32)) {
+		pthread_mutex_unlock(&g_work_lock);
 		if (opt_debug)
 			applog(LOG_DEBUG, "DEBUG: stale work detected, discarding");
 		return true;
 	}
+	pthread_mutex_unlock(&g_work_lock);
 
 	if (have_stratum) {
 		uint32_t sent;
@@ -894,6 +897,7 @@ static void *miner_thread(void *userdata)
 		struct timeval tv_start, tv_end, diff;
 		int64_t max64;
 		uint64_t umax64;
+		bool extrajob = false;
 		int rc;
 
 		// &work.data[19]
@@ -901,13 +905,24 @@ static void *miner_thread(void *userdata)
 		uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 
 		if (have_stratum) {
-			while (time(NULL) >= (g_work_time + opt_scantime) && !work_done)
-				usleep(500*1000);
-			work_done = false;
-			pthread_mutex_lock(&g_work_lock);
+			uint32_t sleeptime = 0;
+			while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) {
+				sleeptime++;
+				usleep(50*1000);
+				if (sleeptime > 5) {
+					extrajob = true;
+					break;
+				}
+			}
+			if (sleeptime)
+				applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100);
 			nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
-			if ((*nonceptr) >= end_nonce)
+			pthread_mutex_lock(&g_work_lock);
+			extrajob |= work_done;
+			if ((*nonceptr) >= end_nonce || extrajob) {
+				work_done = false;
 				stratum_gen_work(&stratum, &g_work);
+			}
 		} else {
 			int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime;
 			/* obtain new work from internal workio thread */
@@ -946,11 +961,11 @@ static void *miner_thread(void *userdata)
 			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
 		} else
 			(*nonceptr)++; //??
-		pthread_mutex_unlock(&g_work_lock);
 		work_restart[thr_id].restart = 0;
 
 		if (opt_debug)
 			applog(LOG_WARNING, "job %s %08x", g_work.job_id, (*nonceptr));
+		pthread_mutex_unlock(&g_work_lock);
 
 		/* adjust max_nonce to meet target scan time */
 		if (have_stratum)
@@ -962,15 +977,18 @@ static void *miner_thread(void *userdata)
 		max64 *= (int64_t)thr_hashrates[thr_id];
 
 		if (max64 <= 0) {
+			/* should not be set too high,
+			   else you can miss multiple nounces */
 			switch (opt_algo) {
 			case ALGO_JACKPOT:
 				max64 = 0x1fffLL;
 				break;
 			case ALGO_BLAKECOIN:
 				max64 = 0x3ffffffLL;
+				break;
 			case ALGO_BLAKE:
 				/* based on the 750Ti hashrate (100kH) */
-				max64 = 0x3ffffffLL;
+				max64 = 0x1ffffffLL;
 				break;
 			default:
 				max64 = 0xfffffLL;
@@ -1008,7 +1026,7 @@ static void *miner_thread(void *userdata)
 					work_restart[thr_id].restart = 1;
 					hashlog_purge_old();
 					// wait a bit for a new job...
-					sleep(1);
+					usleep(500*1000);
 					(*nonceptr) = end_nonce + 1;
 					work_done = true;
 					continue;
diff --git a/util.c b/util.c
index 04209e0..6ed44c0 100644
--- a/util.c
+++ b/util.c
@@ -1020,7 +1020,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	int merkle_count, i;
 	json_t *merkle_arr;
 	unsigned char **merkle;
-	int ntime;
+	int ntime, hoffset;
 
 	job_id = json_string_value(json_array_get(params, 0));
 	prevhash = json_string_value(json_array_get(params, 1));
@@ -1078,7 +1078,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	hex2bin(sctx->job.coinbase, coinb1, coinb1_size);
 	memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size);
 
-	sctx->bloc_height = le16dec((uint8_t*) sctx->job.coinbase + 43);
+	hoffset = coinb1_size - 15; // 43;
+	sctx->bloc_height = le16dec((uint8_t*) sctx->job.coinbase + hoffset);
 	if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id))
 		memset(sctx->job.xnonce2, 0, sctx->xnonce2_size);
 	hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size);
@@ -1125,7 +1126,7 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params)
 	sctx->next_diff = diff;
 	pthread_mutex_unlock(&sctx->work_lock);
 
-	applog(LOG_INFO, "Stratum difficulty set to %g", diff);
+	applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
 
 	return true;
 }

From cec5baea9527fcdb11c2db4f22cd7028e3e50357 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 8 Sep 2014 10:42:40 +0200
Subject: [PATCH 41/44] enable colors by default, except for syslog

debug: show compared hash diffs in color
---
 blake32.cu  |  2 +-
 cpu-miner.c | 22 +++++++++++-----------
 miner.h     |  1 +
 util.c      | 14 ++++++++++++++
 4 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index 96a78a0..6b51e2e 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -361,7 +361,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 					be32enc(&endiandata[19], extra_results[0]);
 					blake256hash(vhashcpu, endiandata, blakerounds);
 					if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
-						applog(LOG_NOTICE, "GPU found more than one result yippee!");
+						applog(LOG_NOTICE, "GPU found more than one result " CL_GRN "yippee!");
 						rc = 2;
 					} else {
 						extra_results[0] = MAXU;
diff --git a/cpu-miner.c b/cpu-miner.c
index 2b04c5c..77a7526 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -181,7 +181,7 @@ bool want_stratum = true;
 bool have_stratum = false;
 static bool submit_old = false;
 bool use_syslog = false;
-bool use_colors = false;
+bool use_colors = true;
 static bool opt_background = false;
 bool opt_quiet = false;
 static int opt_retries = -1;
@@ -940,12 +940,11 @@ static void *miner_thread(void *userdata)
 		}
 		if (memcmp(work.data, g_work.data, wcmplen)) {
 			if (opt_debug) {
-				applog(LOG_DEBUG, "job %s work updated", g_work.job_id);
-				for (int n=0; n<wcmplen; n+=8) {
+				for (int n=0; n <= (wcmplen-8); n+=8) {
 					if (memcmp(work.data + n, g_work.data + n, 8)) {
-						applog(LOG_DEBUG, "diff detected at offset %d", n);
+						applog(LOG_DEBUG, "job %s work updated at offset %d:", g_work.job_id, n);
 						applog_hash((uint8_t*) work.data + n);
-						applog_hash((uint8_t*) g_work.data + n);
+						applog_compare_hash((uint8_t*) g_work.data + n, (uint8_t*) work.data + n);
 					}
 				}
 			}
@@ -953,9 +952,9 @@ static void *miner_thread(void *userdata)
 			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
 		} else if (memcmp(work.target, g_work.target, sizeof(work.target))) {
 			if (opt_debug) {
-				applog(LOG_DEBUG, "job %s target change", g_work.job_id);
+				applog(LOG_DEBUG, "job %s target change:", g_work.job_id);
 				applog_hash((uint8_t*) work.target);
-				applog_hash((uint8_t*) g_work.target);
+				applog_compare_hash((uint8_t*) g_work.target, (uint8_t*) work.target);
 			}
 			memcpy(work.target, g_work.target, sizeof(work.target));
 			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
@@ -1021,7 +1020,7 @@ static void *miner_thread(void *userdata)
 				stall |= (start_nonce > range.scanned[0] && start_nonce < range.scanned[1]);
 
 				if (stall) {
-					if (opt_algo)
+					if (opt_debug && !opt_quiet)
 						applog(LOG_DEBUG, "job done, wait for a new one...");
 					work_restart[thr_id].restart = 1;
 					hashlog_purge_old();
@@ -1464,13 +1463,14 @@ static void parse_arg (int key, char *arg)
 	case 'C':
 		use_colors = true;
 		break;
-	case 'q':
-		opt_quiet = true;
-		break;
 	case 'D':
 		opt_debug = true;
 		opt_debug_rpc = true;
 		break;
+	case 'q':
+		opt_quiet = true;
+		opt_debug_rpc = false;
+		break;
 	case 'p':
 		free(rpc_pass);
 		rpc_pass = strdup(arg);
diff --git a/miner.h b/miner.h
index e33bfff..6bd0ae5 100644
--- a/miner.h
+++ b/miner.h
@@ -422,6 +422,7 @@ size_t time2str(char* buf, time_t timer);
 char* atime2str(time_t timer);
 
 void applog_hash(unsigned char *hash);
+void applog_compare_hash(unsigned char *hash, unsigned char *hash2);
 
 void print_hash_tests(void);
 void animehash(void *state, const void *input);
diff --git a/util.c b/util.c
index 6ed44c0..eb4af11 100644
--- a/util.c
+++ b/util.c
@@ -1401,6 +1401,20 @@ static char* format_hash(char* buf, unsigned char *hash)
 	return buf;
 }
 
+/* Debug helper: log the first 32 bytes of `hash` as eight 4-byte hex groups on one LOG_DEBUG line; a group is colored CL_RED when its 4 bytes differ from the same offset in `hash2`, CL_GRY otherwise. Only `hash` bytes are printed; `hash2` is used for comparison only. */
+extern void applog_compare_hash(unsigned char *hash, unsigned char *hash2)
+{
+	char s[256] = ""; /* output line, built up group by group */
+	int len = 0; /* number of characters written to s so far */
+	for (int i=0; i < 32; i += 4) { /* one iteration per 4-byte group */
+		char *color = memcmp(hash+i, hash2+i, 4) ? CL_RED : CL_GRY; /* non-zero memcmp => group differs */
+		len += sprintf(s+len, "%s%02x%02x%02x%02x " CL_GRY, color, /* color prefix, 8 hex digits, space, then CL_GRY again */
+			hash[i], hash[i+1], hash[i+2], hash[i+3]);
+		s[len] = '\0'; /* redundant: sprintf already wrote the terminator at s[len] */
+	}
+	applog(LOG_DEBUG, "%s", s); /* pass via "%s" so the built text is not treated as a format string */
+}
+
 extern void applog_hash(unsigned char *hash)
 {
 	char s[128] = {'\0'};

From 9e5ec398b28f56ec88abdbae96538430a88c9ac6 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 8 Sep 2014 11:07:16 +0200
Subject: [PATCH 42/44] Purge anti-dup data on target change

---
 cpu-miner.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index 77a7526..c03786b 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -938,6 +938,19 @@ static void *miner_thread(void *userdata)
 				g_work_time = time(NULL);
 			}
 		}
+		if (memcmp(work.target, g_work.target, sizeof(work.target))) {
+			if (opt_debug) {
+				applog(LOG_DEBUG, "job %s target change:", g_work.job_id);
+				applog_hash((uint8_t*) work.target);
+				applog_compare_hash((uint8_t*) g_work.target, (uint8_t*) work.target);
+			}
+			memcpy(work.target, g_work.target, sizeof(work.target));
+			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
+			/* on new target, ignoring nonce, clear sent data (hashlog) */
+			if (memcmp(work.target, g_work.target, sizeof(work.target) - 4)) {
+				hashlog_purge_job(work.job_id);
+			}
+		}
 		if (memcmp(work.data, g_work.data, wcmplen)) {
 			if (opt_debug) {
 				for (int n=0; n <= (wcmplen-8); n+=8) {
@@ -950,14 +963,6 @@ static void *miner_thread(void *userdata)
 			}
 			memcpy(&work, &g_work, sizeof(struct work));
 			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
-		} else if (memcmp(work.target, g_work.target, sizeof(work.target))) {
-			if (opt_debug) {
-				applog(LOG_DEBUG, "job %s target change:", g_work.job_id);
-				applog_hash((uint8_t*) work.target);
-				applog_compare_hash((uint8_t*) g_work.target, (uint8_t*) work.target);
-			}
-			memcpy(work.target, g_work.target, sizeof(work.target));
-			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
 		} else
 			(*nonceptr)++; //??
 		work_restart[thr_id].restart = 0;

From 13bb9d267ef17434392830c7fbf0c602f7f202fd Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 8 Sep 2014 17:43:45 +0200
Subject: [PATCH 43/44] Remove debug rpc, already exists with -P

---
 cpu-miner.c   | 71 +++++++++++++++++++++++++++++++++------------------
 cuda_helper.h |  2 ++
 miner.h       |  1 -
 util.c        |  4 ---
 4 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index c03786b..0450ac7 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -172,7 +172,6 @@ static const char *algo_names[] = {
 };
 
 bool opt_debug = false;
-bool opt_debug_rpc = false;
 bool opt_protocol = false;
 bool opt_benchmark = false;
 bool want_longpoll = true;
@@ -440,7 +439,7 @@ static int share_result(int result, const char *reason)
 				(result ? CL_GRN "yay!!!" : CL_RED "booooo")
 			:	(result ? "(yay!!!)" : "(booooo)"));
 
-	if (reason && !opt_quiet) {
+	if (reason) {
 		applog(LOG_WARNING, "reject reason: %s", reason);
 		if (strncmp(reason, "low difficulty share", 20) == 0) {
 			opt_difficulty = (opt_difficulty * 2.0) / 3.0;
@@ -550,10 +549,6 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		json_decref(val);
 	}
 
-	if (opt_debug_rpc) {
-		applog(LOG_DEBUG, "submit: %s", s);
-	}
-
 	rc = true;
 
 out:
@@ -792,13 +787,20 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size);
 
 	/* Generate merkle root */
-	if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR)
-		heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
-	else
-	if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_WHC || opt_algo == ALGO_BLAKECOIN)
-		SHA256((unsigned char*)sctx->job.coinbase, sctx->job.coinbase_size, (unsigned char*)merkle_root);
-	else
-		sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
+	switch (opt_algo) {
+		case ALGO_HEAVY:
+		case ALGO_MJOLLNIR:
+			heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
+			break;
+		case ALGO_FUGUE256:
+		case ALGO_GROESTL:
+		case ALGO_BLAKECOIN:
+		case ALGO_WHC:
+			SHA256((uint8_t*)sctx->job.coinbase, sctx->job.coinbase_size, (uint8_t*)merkle_root);
+			break;
+		default:
+			sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
+	}
 
 	for (i = 0; i < sctx->job.merkle_count; i++) {
 		memcpy(merkle_root + 32, sctx->job.merkle[i], 32);
@@ -870,7 +872,9 @@ static void *miner_thread(void *userdata)
 	uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1);
 	unsigned char *scratchbuf = NULL;
 	bool work_done = false;
+	bool extrajob = false;
 	char s[16];
+	int rc = 0;
 
 	memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized
 
@@ -897,8 +901,6 @@ static void *miner_thread(void *userdata)
 		struct timeval tv_start, tv_end, diff;
 		int64_t max64;
 		uint64_t umax64;
-		bool extrajob = false;
-		int rc;
 
 		// &work.data[19]
 		int wcmplen = 76;
@@ -907,20 +909,21 @@ static void *miner_thread(void *userdata)
 		if (have_stratum) {
 			uint32_t sleeptime = 0;
 			while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) {
-				sleeptime++;
-				usleep(50*1000);
-				if (sleeptime > 5) {
+				usleep(100*1000);
+				if (sleeptime > 4) {
 					extrajob = true;
 					break;
 				}
+				sleeptime++;
 			}
-			if (sleeptime)
+			if (sleeptime && opt_debug && !opt_quiet)
 				applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100);
 			nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 			pthread_mutex_lock(&g_work_lock);
 			extrajob |= work_done;
 			if ((*nonceptr) >= end_nonce || extrajob) {
 				work_done = false;
+				extrajob = false;
 				stratum_gen_work(&stratum, &g_work);
 			}
 		} else {
@@ -938,6 +941,22 @@ static void *miner_thread(void *userdata)
 				g_work_time = time(NULL);
 			}
 		}
+#if 0
+		if (!opt_benchmark && g_work.xnonce2_len == 0) {
+			applog(LOG_ERR, "work data not read yet");
+			extrajob = true;
+			work_done = true;
+			sleep(1);
+			continue;
+		}
+#endif
+		if (rc > 1) {
+			/* if we found more than one on last loop */
+			/* todo: handle an array to get them directly */
+			pthread_mutex_unlock(&g_work_lock);
+			goto continue_scan;
+		}
+
 		if (memcmp(work.target, g_work.target, sizeof(work.target))) {
 			if (opt_debug) {
 				applog(LOG_DEBUG, "job %s target change:", g_work.job_id);
@@ -947,7 +966,7 @@ static void *miner_thread(void *userdata)
 			memcpy(work.target, g_work.target, sizeof(work.target));
 			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
 			/* on new target, ignoring nonce, clear sent data (hashlog) */
-			if (memcmp(work.target, g_work.target, sizeof(work.target) - 4)) {
+			if (memcmp(work.target, g_work.target, sizeof(work.target))) {
 				hashlog_purge_job(work.job_id);
 			}
 		}
@@ -1048,6 +1067,7 @@ static void *miner_thread(void *userdata)
 		(*nonceptr) = start_nonce;
 
 		hashes_done = 0;
+continue_scan:
 		gettimeofday(&tv_start, NULL);
 
 		/* scan nonces for a proof-of-work hash */
@@ -1163,8 +1183,11 @@ static void *miner_thread(void *userdata)
 		timeval_subtract(&diff, &tv_end, &tv_start);
 		if (diff.tv_usec || diff.tv_sec) {
 			pthread_mutex_lock(&stats_lock);
-			thr_hashrates[thr_id] =
-				hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec);
+			if (diff.tv_sec + 1e-6 * diff.tv_usec > 0.0) {
+				thr_hashrates[thr_id] = hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec);
+				if (rc > 1)
+					thr_hashrates[thr_id] = (rc * hashes_done) / (diff.tv_sec + 1e-6 * diff.tv_usec);
+			}
 			pthread_mutex_unlock(&stats_lock);
 		}
 		if (!opt_quiet) {
@@ -1372,7 +1395,6 @@ static void *stratum_thread(void *userdata)
 			pthread_mutex_lock(&g_work_lock);
 			stratum_gen_work(&stratum, &g_work);
 			time(&g_work_time);
-			pthread_mutex_unlock(&g_work_lock);
 			if (stratum.job.clean) {
 				if (!opt_quiet)
 					applog(LOG_BLUE, "%s send a new %s block %d", short_url, algo_names[opt_algo],
@@ -1383,6 +1405,7 @@ static void *stratum_thread(void *userdata)
 					applog(LOG_BLUE, "%s send job %d for block %d", short_url,
 						strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
 			}
+			pthread_mutex_unlock(&g_work_lock);
 		}
 		
 		if (!stratum_socket_full(&stratum, 120)) {
@@ -1470,11 +1493,9 @@ static void parse_arg (int key, char *arg)
 		break;
 	case 'D':
 		opt_debug = true;
-		opt_debug_rpc = true;
 		break;
 	case 'q':
 		opt_quiet = true;
-		opt_debug_rpc = false;
 		break;
 	case 'p':
 		free(rpc_pass);
diff --git a/cuda_helper.h b/cuda_helper.h
index fecf531..66c8e7d 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -12,6 +12,8 @@
 
 #include <stdint.h>
 
+extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+
 extern __device__ __device_builtin__ void __syncthreads(void);
 
 #ifndef __CUDA_ARCH__
diff --git a/miner.h b/miner.h
index 6bd0ae5..d9d29a8 100644
--- a/miner.h
+++ b/miner.h
@@ -290,7 +290,6 @@ struct work_restart {
 
 extern bool opt_benchmark;
 extern bool opt_debug;
-extern bool opt_debug_rpc;
 extern bool opt_quiet;
 extern bool opt_protocol;
 extern int opt_timeout;
diff --git a/util.c b/util.c
index eb4af11..fe733c0 100644
--- a/util.c
+++ b/util.c
@@ -1222,10 +1222,6 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s)
 	id = json_object_get(val, "id");
 	params = json_object_get(val, "params");
 
-	if (opt_debug_rpc) {
-		applog(LOG_DEBUG, "method: %s", s);
-	}
-
 	if (!strcasecmp(method, "mining.notify")) {
 		ret = stratum_notify(sctx, params);
 		goto out;

From 429266346ce4a1376688bce56d3131c2d973c721 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 8 Sep 2014 20:48:51 +0200
Subject: [PATCH 44/44] Prepare version 1.4.2

---
 README.txt   | 4 +++-
 configure.ac | 2 +-
 cpu-miner.c  | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.txt b/README.txt
index 36a3e00..001ff3f 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
 
-ccMiner release 1.4.1-tpruvot (Sep 06th 2014) - "Cached Blake"
+ccMiner release 1.4.2-tpruvot (Sep 09th 2014) - "Pentablake"
 ---------------------------------------------------------------
 
 ***************************************************************
@@ -35,6 +35,7 @@ TalkCoin
 DarkCoin and other X11 coins
 NEOS blake (256 14-rounds)
 BlakeCoin (256 8-rounds)
+Pentablake (Blake 512 x5)
 
 where some of these coins have a VERY NOTABLE nVidia advantage
 over competing AMD (OpenCL) implementations.
@@ -65,6 +66,7 @@ its command line interface and options.
                           blake       use to mine NEOS (Blake 256)
                           blakecoin   use to mine Old Blake 256
                           nist5       use to mine TalkCoin
+                          penta       use to mine Joincoin / Pentablake
                           fresh       use to mine Freshcoin
                           whirl       use to mine Whirlcoin
                           x11         use to mine DarkCoin
diff --git a/configure.ac b/configure.ac
index 2a554f1..a4ef290 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.09.06])
+AC_INIT([ccminer], [2014.09.09])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index 0450ac7..cb6e790 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -1427,7 +1427,7 @@ out:
 	return NULL;
 }
 
-#define PROGRAM_VERSION "1.4.1"
+#define PROGRAM_VERSION "1.4.2"
 static void show_version_and_exit(void)
 {
 	printf("%s v%s\n"