From 5e42389b4b018bb62682a22411f0aa22e1845e36 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 9 Mar 2015 02:21:44 +0100
Subject: [PATCH] pluck: finish the algo, add cpu validation

---
 miner.h             |   1 +
 pluck/cuda_pluck.cu |   7 +-
 pluck/pluck.cu      | 180 ++++++++++++++++++--------------------------
 util.cpp            |  15 +++-
 4 files changed, 91 insertions(+), 112 deletions(-)

diff --git a/miner.h b/miner.h
index f3f51e4..0944e59 100644
--- a/miner.h
+++ b/miner.h
@@ -667,6 +667,7 @@ void lyra2_hash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
 void nist5hash(void *state, const void *input);
 void pentablakehash(void *output, const void *input);
+void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N);
 void quarkhash(void *state, const void *input);
 void qubithash(void *state, const void *input);
 void s3hash(void *output, const void *input);
diff --git a/pluck/cuda_pluck.cu b/pluck/cuda_pluck.cu
index 0bbcbba..cbb21cb 100644
--- a/pluck/cuda_pluck.cu
+++ b/pluck/cuda_pluck.cu
@@ -539,8 +539,8 @@ void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t* hash)
 __host__
 uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce,  int order)
 {
-	uint32_t result[8] = {0xffffffff};
-	cudaMemset(d_PlNonce[thr_id], 0xffffffff, sizeof(uint32_t));
+	uint32_t result[8] = { 0xffffffff };
+	cudaMemset(d_PlNonce[thr_id], 0xff, sizeof(uint32_t));
 
 	const uint32_t threadsperblock = 128;
 
@@ -557,7 +557,8 @@ uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce,  int
 		pluck_gpu_hash <<< grid, block >>>(threads, startNounce, d_PlNonce[thr_id]);
 	}
 
-	MyStreamSynchronize(NULL, order, thr_id);
+	//MyStreamSynchronize(NULL, order, thr_id);
+	CUDA_SAFE_CALL(cudaThreadSynchronize());
 	cudaMemcpy(&result[thr_id], d_PlNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 
 	return result[thr_id];
diff --git a/pluck/pluck.cu b/pluck/pluck.cu
index 813a214..a1e7ff0 100644
--- a/pluck/pluck.cu
+++ b/pluck/pluck.cu
@@ -1,10 +1,11 @@
 /* Based on djm code */
 
-extern "C" {
+#include <stdint.h>
+
 #include "miner.h"
-}
+#include "cuda_helper.h"
 
-#include <stdint.h>
+#include <openssl/sha.h>
 
 static uint32_t *d_hash[MAX_GPUS] ;
 
@@ -84,137 +85,102 @@ static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
 #undef ROTL
 }
 
-static void sha256_hash(unsigned char *hash, const unsigned char *data, int len)
+static void sha256_hash(uchar *hash, const uchar *data, int len)
 {
-	uint32_t S[16], T[16];
-	int i, r;
-
-	sha256_init(S);
-	for (r = len; r > -9; r -= 64) {
-		if (r < 64)
-			memset(T, 0, 64);
-		memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
-		if (r >= 0 && r < 64)
-			((unsigned char *)T)[r] = 0x80;
-		for (i = 0; i < 16; i++)
-			T[i] = be32dec(T + i);
-
-		if (r < 56)
-			T[15] = 8 * len;
-		sha256_transform(S, T, 0);
-	}
-	for (i = 0; i < 8; i++)
-		be32enc((uint32_t *)hash + i, S[i]);
+	SHA256_CTX ctx;
+	SHA256_Init(&ctx);
+	SHA256_Update(&ctx, data, len);
+	SHA256_Final(hash, &ctx);
 }
 
-static void sha256_hash512(unsigned char *hash, const unsigned char *data)
+// hash exactly 64 bytes (ie, sha256 block size)
+static void sha256_hash512(uint32_t *hash, const uint32_t *data)
 {
-	uint32_t S[16], T[16];
+	uint32_t _ALIGN(64) S[16];
+	uint32_t _ALIGN(64) T[16];
+	uchar _ALIGN(64) E[64] = { 0 };
 	int i;
 
 	sha256_init(S);
 
-	memcpy(T, data, 64);
 	for (i = 0; i < 16; i++)
-		T[i] = be32dec(T + i);
+		T[i] = be32dec(&data[i]);
 	sha256_transform(S, T, 0);
 
-	memset(T, 0, 64);
-	//memcpy(T, data + 64, 0);
-	((unsigned char *)T)[0] = 0x80;
-	for (i = 0; i < 16; i++)
-		T[i] = be32dec(T + i);
-	T[15] = 8 * 64;
-	sha256_transform(S, T, 0);
+	E[3] = 0x80;
+	E[61] = 0x02; // T[15] = 8 * 64 => 0x200;
+	sha256_transform(S, (uint32_t*)E, 0);
 
 	for (i = 0; i < 8; i++)
-		be32enc((uint32_t *)hash + i, S[i]);
+		be32enc(&hash[i], S[i]);
 }
 
-void pluckhash(uint32_t *hash, uint32_t *input)
+#define BLOCK_HEADER_SIZE 80
+void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N)
 {
+	int size = N * 1024;
+	sha256_hash(hashbuffer, (uchar*)data, BLOCK_HEADER_SIZE);
+	memset(&hashbuffer[32], 0, 32);
 
-	uint32_t data[20];
-	//uint32_t midstate[8];
+	for (int i = 64; i < size - 32; i += 32)
+	{
+		uint32_t _ALIGN(64) randseed[16];
+		uint32_t _ALIGN(64) randbuffer[16];
+		uint32_t _ALIGN(64) joint[16];
+		//i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area
+		//we could use size here, but then it's probable to use 0 as the value in most cases
+		int randmax = i - 4;
 
-	const int HASH_MEMORY = 128 * 1024;
-	uint8_t * scratchbuf = (uint8_t*)malloc(HASH_MEMORY);
+		//setup randbuffer to be an array of random indexes
+		memcpy(randseed, &hashbuffer[i - 64], 64);
 
-	for (int k = 0; k<20; k++) { data[k] = input[k]; }
+		if (i > 128) memcpy(randbuffer, &hashbuffer[i - 128], 64);
+		else memset(randbuffer, 0, 64);
 
-		uint8_t *hashbuffer = scratchbuf; //don't allocate this on stack, since it's huge..
-		int size = HASH_MEMORY;
-		memset(hashbuffer, 0, 64);
+		xor_salsa8((uint32_t*)randbuffer, (uint32_t*)randseed);
+		memcpy(joint, &hashbuffer[i - 32], 32);
 
-		sha256_hash(&hashbuffer[0], (uint8_t*)data, 80);
-		for (int i = 64; i < size - 32; i += 32)
+		//use the last hash value as the seed
+		for (int j = 32; j < 64; j += 4)
 		{
-			//i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area
-			int randmax = i - 4; //we could use size here, but then it's probable to use 0 as the value in most cases
-			uint32_t joint[16];
-			uint32_t randbuffer[16];
-
-			uint32_t randseed[16];
-			memcpy(randseed, &hashbuffer[i - 64], 64);
-			if (i>128)
-			{
-				memcpy(randbuffer, &hashbuffer[i - 128], 64);
-			}
-			else
-			{
-				memset(&randbuffer, 0, 64);
-			}
+			//every other time, change to next random index
+			//randmax - 32 as otherwise we go beyond memory that's already been written to
+			uint32_t rand = randbuffer[(j - 32) >> 2] % (randmax - 32);
+			joint[j >> 2] = *((uint32_t *)&hashbuffer[rand]);
+		}
 
-			xor_salsa8(randbuffer, randseed);
+		sha256_hash512((uint32_t*)&hashbuffer[i], joint);
 
-			memcpy(joint, &hashbuffer[i - 32], 32);
-			//use the last hash value as the seed
-			for (int j = 32; j < 64; j += 4)
-			{
-				uint32_t rand = randbuffer[(j - 32) / 4] % (randmax - 32); //randmax - 32 as otherwise we go beyond memory that's already been written to
-				joint[j / 4] = *((uint32_t*)&hashbuffer[rand]);
-			}
-			sha256_hash512(&hashbuffer[i], (uint8_t*)joint);
-//			for (int k = 0; k<8; k++) { printf("sha hashbuffer %d %08x\n", k, ((uint32_t*)(hashbuffer+i))[k]); }
-			memcpy(randseed, &hashbuffer[i - 32], 64); //use last hash value and previous hash value(post-mixing)
-			if (i>128)
-			{
-				memcpy(randbuffer, &hashbuffer[i - 128], 64);
-			}
-			else
-			{
-				memset(randbuffer, 0, 64);
-			}
-			xor_salsa8(randbuffer, randseed);
-			for (int j = 0; j < 32; j += 2)
-			{
-				uint32_t rand = randbuffer[j / 2] % randmax;
-				*((uint32_t*)&hashbuffer[rand]) = *((uint32_t*)&hashbuffer[j + i - 4]);
-			}
-		}
+		//setup randbuffer to be an array of random indexes
+		//use last hash value and previous hash value(post-mixing)
+		memcpy(randseed, &hashbuffer[i - 32], 64);
 
-//		for (int k = 0; k<8; k++) { printf("cpu final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); }
+		if (i > 128) memcpy(randbuffer, &hashbuffer[i - 128], 64);
+		else memset(randbuffer, 0, 64);
 
-		//note: off-by-one error is likely here...
-/*
-		for (int i = size - 64 - 1; i >= 64; i -= 64)
+		xor_salsa8((uint32_t*)randbuffer, (uint32_t*)randseed);
+
+		//use the last hash value as the seed
+		for (int j = 0; j < 32; j += 2)
 		{
-			sha256_hash512(&hashbuffer[i - 64], &hashbuffer[i]);
+			uint32_t rand = randbuffer[j >> 1] % randmax;
+			*((uint32_t *)(hashbuffer + rand)) = *((uint32_t *)(hashbuffer + j + randmax));
 		}
+	}
 
-		for (int k = 0; k<8; k++) { printf("cpu after of by one final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); }
-*/
-		memcpy((unsigned char*)hash, hashbuffer, 32);
+	memcpy(hash, hashbuffer, 32);
 }
 
 static bool init[MAX_GPUS] = { 0 };
 
+static uchar* scratchbuf = NULL;
+
 extern "C" int scanhash_pluck(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
 	uint32_t endiandata[20];
-
+	int opt_pluck_n = 128;
 	int intensity = 18; /* beware > 20 could work and create diff problems later */
 	uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
 	// divide by 128 for this algo which require a lot of memory
@@ -230,14 +196,15 @@ extern "C" int scanhash_pluck(int thr_id, uint32_t *pdata, const uint32_t *ptarg
 		//cudaDeviceReset();
 		//cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 		//cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+		cudaMalloc(&d_hash[thr_id], opt_pluck_n * 1024 * throughput);
 
-		cudaMalloc(&d_hash[thr_id], 32 * 1024 * sizeof(uint32_t) * throughput);
+		if (!scratchbuf)
+			scratchbuf = (uchar*) calloc(opt_pluck_n, 1024);
 
 		pluck_cpu_init(thr_id, throughput, d_hash[thr_id]);
 		init[thr_id] = true;
 	}
 
-
 	for (int k = 0; k < 20; k++)
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
 
@@ -247,18 +214,17 @@ extern "C" int scanhash_pluck(int thr_id, uint32_t *pdata, const uint32_t *ptarg
 		uint32_t foundNonce = pluck_cpu_hash(thr_id, throughput, pdata[19], 0);
 		if (foundNonce != UINT32_MAX)
 		{
-//			const uint32_t Htarg = ptarget[7];
-//			uint32_t vhash64[8];
-//			be32enc(&endiandata[19], foundNonce);
-//			pluckhash(vhash64,endiandata);
-//			printf("target %08x vhash64 %08x", ptarget[7], vhash64[7]);
-//			if (vhash64[7] <= Htarg) { // && fulltest(vhash64, ptarget)) {
+			const uint32_t Htarg = ptarget[7];
+			uint32_t vhash64[8];
+			be32enc(&endiandata[19], foundNonce);
+			pluckhash(vhash64, endiandata, scratchbuf, opt_pluck_n);
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 				*hashes_done = pdata[19] - first_nonce + throughput;
 				pdata[19] = foundNonce;
 				return 1;
-//			} else {
-//				applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
-//			}
+			} else {
+				applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
+			}
 		}
 
 		pdata[19] += throughput;
diff --git a/util.cpp b/util.cpp
index d0f047a..30200e3 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1646,6 +1646,8 @@ extern void applog_hash(uchar *hash)
 #define printpfx(n,h) \
 	printf("%s%12s%s: %s\n", CL_BLU, n, CL_N, format_hash(s, h))
 
+static uchar scratchbuf[128 * 1024];
+
 void do_gpu_tests(void)
 {
 #ifdef _DEBUG
@@ -1678,8 +1680,11 @@ void do_gpu_tests(void)
 void print_hash_tests(void)
 {
 	char s[128] = {'\0'};
-	uchar buf[128], hash[128];
-	memset(buf, 0, sizeof buf);
+	uchar hash[128];
+	uchar* buf = scratchbuf;
+
+	//scratchbuf = (uchar*)malloc(1, 128*1024);
+	memset(buf, 0, sizeof scratchbuf);
 	// buf[0] = 1; buf[64] = 2; // for endian tests
 
 	printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n");
@@ -1744,6 +1749,10 @@ void print_hash_tests(void)
 	pentablakehash(&hash[0], &buf[0]);
 	printpfx("pentablake", hash);
 
+	memset(hash, 0, sizeof hash);
+	pluckhash((uint32_t*)&hash[0], (uint32_t*)&buf[0], &buf[0], 128);
+	printpfx("pluck", hash);
+
 	memset(hash, 0, sizeof hash);
 	quarkhash(&hash[0], &buf[0]);
 	printpfx("quark", hash);
@@ -1787,4 +1796,6 @@ void print_hash_tests(void)
 	printf("\n");
 
 	do_gpu_tests();
+
+	//free(scratchbuf);
 }