From 91eea0d76b425cd2631b07c8344bdef06cc61d10 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 13 Sep 2014 13:22:14 +0200
Subject: [PATCH] blake: remove int cudaMemcpyToSymbol for MSVC

Use a clz (count leading zeros) asm function for a fast GPU comparison against ptarget[6]:[7].

Also add the missing Windows ctz/clz host functions.

NEOS speed goes from 227MH to 270MH (Gigabyte 750Ti Black Edition)
---
 blake32.cu    | 53 +++++++++++++++++++++++------------------
 cuda_helper.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 93 insertions(+), 26 deletions(-)

diff --git a/blake32.cu b/blake32.cu
index fd2c7c5..5f4043b 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -47,10 +47,6 @@ extern int device_map[8];
 __constant__
 static uint32_t __align__(32) c_data[20];
 
-// only store the 2 high uint32 of the target hash
-__constant__ static uint64_t c_Target;
-__constant__ static int8_t c_BlakeRounds;
-
 /* 8 adapters max (-t threads) */
 static uint32_t *d_resNonce[8];
 static uint32_t *h_resNonce[8];
@@ -132,7 +128,7 @@ static const uint32_t __align__(32) c_Padding[16] = {
 };
 
 __device__ static
-void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0)
+void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, const int rounds)
 {
 	uint32_t /* __align__(8) */ m[16];
 	uint32_t /* __align__(8) */ v[16];
@@ -160,7 +156,6 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0)
 	v[14] = c_u256[6];
 	v[15] = c_u256[7];
 
-	int rounds = c_BlakeRounds;
 	for (int i = 0; i < rounds; i++) {
 		/* column step */
 		GS(0, 4, 0x8, 0xC, 0x0);
@@ -176,18 +171,19 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0)
 
 	//#pragma unroll 16
 	for (uint32_t i = 0; i < 16; i++) {
-		uint32_t j = i % 8;
+		uint32_t j = i % 8U;
 		h[j] ^= v[i];
 	}
 }
 
 __global__
-void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, const int crcsum)
+void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNounce,
+	const uint8_t nClzTarget, const int crcsum, const int rounds)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		const uint32_t nounce = startNounce + thread;
+		const uint32_t nounce = startNonce + thread;
 		uint32_t h[8];
 
 		#pragma unroll
@@ -200,7 +196,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 #else
 		if (crcsum != prevsum) {
 			prevsum = crcsum;
-			blake256_compress(h, c_data, 512);
+			blake256_compress(h, c_data, 512, rounds);
 			#pragma unroll
 			for(int i=0; i<8; i++) {
 				cache[i] = h[i];
@@ -220,10 +216,16 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 		ending[2] = c_data[18];
 		ending[3] = nounce; /* our tested value */
 
-		blake256_compress(h, ending, 640);
+		blake256_compress(h, ending, 640, rounds);
+
+		// not sure why, h[7] is ok
+		h[6] = cuda_swab32(h[6]);
+
+		// compare count of leading zeros h[6] + h[7]
+		uint64_t high64 = ((uint64_t*)h)[3];
+		uint32_t clz = cuda_clz64(high64);
 
-		/* do not test all parts, fulltest() will do it */
-		if (((uint64_t*)h)[3] <= c_Target)
+		if (clz >= nClzTarget)
 #if NBN == 2
 		/* keep the smallest nounce, + extra one if found */
 		if (resNounce[0] > nounce) {
@@ -239,7 +241,8 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
 }
 
 __host__
-uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t crcsum)
+uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint8_t clzTarget,
+	const uint32_t crcsum, const int8_t rounds)
 {
 	const int threadsperblock = TPB;
 	uint32_t result = MAXU;
@@ -252,7 +255,7 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
 	if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
 		return result;
 
-	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNonce[thr_id], crcsum);
+	blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], clzTarget, crcsum, (int) rounds);
 	cudaDeviceSynchronize();
 	if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
 		//cudaThreadSynchronize(); /* seems no more required */
@@ -264,14 +267,12 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
 }
 
 __host__
-void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget, int8_t blakerounds)
+void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) /* uploads the 80-byte header copy and the sigma table; ptarget stays in the signature but is no longer read here */
 {
 	uint32_t data[20];
 	memcpy(data, pdata, 80);
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, &ptarget[6], 2*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_BlakeRounds, &blakerounds, sizeof(int8_t), 0, cudaMemcpyHostToDevice));
 }
 
 extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
@@ -280,6 +281,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 	const uint32_t first_nonce = pdata[19];
 	static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 	uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce);
+	uint64_t targetHigh = ((uint64_t*)ptarget)[3];
+	uint32_t clzTarget = cuda_clz64(targetHigh);
 	uint32_t crcsum = MAXU;
 	int rc = 0;
 
@@ -308,19 +311,19 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 		init[thr_id] = true;
 	}
 
-	blake256_cpu_setBlock_80(pdata, ptarget, blakerounds);
+	blake256_cpu_setBlock_80(pdata, ptarget);
 #if USE_CACHE
 	crcsum = crc32_u32t(pdata, 64);
 #endif
 
 	do {
 		// GPU HASH
-		uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], crcsum);
+		uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], (uint8_t) clzTarget, crcsum, blakerounds);
 		if (foundNonce != MAXU)
 		{
 			uint32_t endiandata[20];
 			uint32_t vhashcpu[8];
-			uint32_t Htarg = ptarget[7];
+			uint32_t Htarg = ptarget[6];
 
 			for (int k=0; k < 19; k++)
 				be32enc(&endiandata[k], pdata[k]);
@@ -329,7 +332,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 
 			blake256hash(vhashcpu, endiandata, blakerounds);
 
-			if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
+			if (vhashcpu[6] <= Htarg || cuda_swab32(vhashcpu[6]) <= Htarg /*&& fulltest(vhashcpu, ptarget)*/)
 			{
 				pdata[19] = foundNonce;
 				rc = 1;
@@ -338,7 +341,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 					// Rare but possible if the throughput is big
 					be32enc(&endiandata[19], extra_results[0]);
 					blake256hash(vhashcpu, endiandata, blakerounds);
-					if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
+					if (vhashcpu[6] <= Htarg /* && fulltest(vhashcpu, ptarget) */) {
 						applog(LOG_NOTICE, "GPU found more than one result " CL_GRN "yippee!");
 						rc = 2;
 					} else {
@@ -346,9 +349,13 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 					}
 				}
 
+				//applog_hash((uint8_t*)ptarget);
+				//applog_compare_hash((uint8_t*)vhashcpu,(uint8_t*)ptarget);
 				goto exit_scan;
 			}
 			else if (opt_debug) {
+				applog_hash((uint8_t*)ptarget);
+				applog_compare_hash((uint8_t*)vhashcpu,(uint8_t*)ptarget);
 				applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
diff --git a/cuda_helper.h b/cuda_helper.h
index 9e10968..56b80a7 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -248,9 +248,12 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
 	return result;
 }
 
+#ifndef USE_ROT_ASM_OPT
+#define USE_ROT_ASM_OPT 1
+#endif
 
 // 64-bit ROTATE RIGHT
-#if __CUDA_ARCH__ >= 350
+#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
 /* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */
 __device__ __forceinline__
 uint64_t ROTR64(const uint64_t value, const int offset) {
@@ -264,7 +267,7 @@ uint64_t ROTR64(const uint64_t value, const int offset) {
 	}
 	return __double_as_longlong(__hiloint2double(result.y, result.x));
 }
-#elif __CUDA_ARCH__ >= 120
+#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
 __device__ __forceinline__
 uint64_t ROTR64(const uint64_t x, const int offset)
 {
@@ -286,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
 #endif
 
 // 64-bit ROTATE LEFT
-#if __CUDA_ARCH__ >= 350
+#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT
 __device__ __forceinline__
 uint64_t ROTL64(const uint64_t value, const int offset) {
 	uint2 result;
@@ -320,4 +323,61 @@ uint64_t ROTL64(const uint64_t x, const int offset)
 #define ROTL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))
 #endif
 
+#ifdef WIN32
+#include <intrin.h>
+static uint32_t __inline __builtin_clz(uint32_t x) { /* Windows stand-in for the GCC builtin of the same name: leading-zero count of x */
+	unsigned long r = 0; /* bit index filled in by the intrinsic; starts at 0 */
+	_BitScanReverse(&r, x); /* return value is ignored; NOTE(review): behaviour for x == 0 is not visible here -- cuda_clz64 only calls this with a non-zero word */
+	return (31-r); /* treats r as the index of the highest set bit (0 = LSB), so 31-r is the number of zero bits above it */
+}
+static uint32_t __inline __builtin_ctz(uint32_t x) { /* Windows stand-in for the GCC builtin of the same name: trailing-zero count of x */
+	unsigned long r = 0; /* bit index filled in by the intrinsic; starts at 0 */
+	_BitScanForward(&r, x); /* return value is ignored, so x == 0 is not detected here; cuda_ctz32 tests for zero before calling */
+	return r; /* treats r as the index of the lowest set bit, which equals the trailing-zero count */
+}
+#endif
+
+/* count leading zeros of a 64bit int */
+#if __CUDA_ARCH__ >= 200
+__device__ /* variant compiled when __CUDA_ARCH__ >= 200: leading-zero count of a 64-bit value */
+static uint32_t cuda_clz64(const uint64_t x)
+{
+	uint32_t result; /* 32-bit destination bound to %0 through the "=r" constraint */
+	asm("clz.b64 %0, %1;\n" /* one PTX instruction: count leading zeros of the 64-bit source %1 */
+		: "=r"(result) : "l"(x)); /* "l" passes x as a 64-bit register operand */
+	return result;
+}
+#else
+/* host fallback: leading-zero count of a 64-bit value, 64 for a zero input */
+static uint32_t cuda_clz64(const uint64_t x)
+{
+	const uint32_t hi = (uint32_t) (x >> 32); /* upper word decides unless it is all zero */
+	const uint32_t lo = (uint32_t) x;
+	if (hi)
+		return __builtin_clz(hi);
+	if (lo)
+		return 32 + __builtin_clz(lo); /* 32 empty upper bits plus the zeros of the lower word */
+	return 64; /* both words are zero */
+}
+#endif
+
+/* count trailing zeros of a 32bit int */
+#if __CUDA_ARCH__ >= 200
+__device__ /* variant compiled when __CUDA_ARCH__ >= 200: trailing-zero count of x, 32 when x == 0 */
+static uint32_t cuda_ctz32(const uint32_t x)
+{
+	uint32_t result; /* doubles as the scratch register inside the asm statement */
+	asm("brev.b32 %0, %1;\n\t" /* bit-reverse x into the OUTPUT operand; %1 is input-only and must not be written */
+		"clz.b32 %0, %0;\n" /* leading zeros of the reversed word are the trailing zeros of x */
+		: "=r"(result) : "r"(x));
+	return result;
+}
+#else
+/* host fallback: trailing-zero count of x, with 32 reported for a zero input */
+static uint32_t cuda_ctz32(const uint32_t x)
+{
+	return (x == 0) ? 32 : __builtin_ctz(x);
+}
+#endif
+
 #endif // #ifndef CUDA_HELPER_H