From 1b31f11252bd1dfdb6f8c5bce0343a07be5788bd Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Sat, 9 May 2015 19:25:40 +0200
Subject: [PATCH] neoscrypt: cleanup...

My SM 3.0 functions are ok, but the djm34 implementation uses too many
registers for this arch...
---
 neoscrypt/cuda_neoscrypt.cu | 43 +++++++++++++++++++++----------------
 neoscrypt/cuda_vectors.h    | 19 ++++++++--------
 neoscrypt/neoscrypt.cpp    | 30 ++++++++------------------
 3 files changed, 43 insertions(+), 49 deletions(-)

diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu
index cd0908f..ab23e55 100644
--- a/neoscrypt/cuda_neoscrypt.cu
+++ b/neoscrypt/cuda_neoscrypt.cu
@@ -4,9 +4,9 @@
 #include "cuda_helper.h"
 #include "cuda_vectors.h"
 /* NOT COMPATIBLE WITH SM 3.0 !!! */
-__device__ uint4* W;
-uint32_t *d_NNonce[MAX_GPUS];
-uint32_t *d_nnounce[MAX_GPUS];
+static uint32_t *d_buffer[MAX_GPUS];
+static uint32_t *d_NNonce[MAX_GPUS];
+__constant__ uint4* W;
 __constant__ uint32_t pTarget[8];
 __constant__ uint32_t key_init[16];
 __constant__ uint32_t input_init[16];
@@ -423,14 +423,14 @@ static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV)
 #define SHIFT 130
 
 __global__ __launch_bounds__(128, 1)
-void neoscrypt_gpu_hash_k0(int stratum, uint32_t threads, uint32_t startNonce)
+void neoscrypt_gpu_hash_k0(uint32_t threads, uint32_t startNonce, bool stratum)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	uint32_t shift = SHIFT * 16 * thread;
 //	if (thread < threads)
 	{
 		uint32_t data[80];
 		uint16 X[4];
+		uint32_t shift = thread * SHIFT * 16;
 		const uint32_t nonce = startNonce + thread;
 
 		for (int i = 0; i<20; i++) {
@@ -451,10 +451,10 @@ __global__ __launch_bounds__(128, 1)
 void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	uint32_t shift = SHIFT * 16 * thread;
 //	if (thread < threads)
 	{
 		uint16 X[4];
+		uint32_t shift = thread * SHIFT * 16;
 
 		((uintx64 *)X)[0]= ldg256(&(W + shift)[0]);
 //#pragma unroll
@@ -471,10 +471,10 @@ __global__ __launch_bounds__(128, 1)
 void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	uint32_t shift = SHIFT * 16 * thread;
 //	if (thread < threads)
 	{
 		uint16 X[4];
+		uint32_t shift = thread * SHIFT * 16;
 
 		((uintx64 *)X)[0] = ldg256(&(W + shift)[2048]);
 		for (int t = 0; t < 128; t++)
@@ -495,7 +495,7 @@ void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce)
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 //	if (thread < threads)
 	{
-		uint32_t shift = SHIFT * 16 * thread;
+		uint32_t shift = thread * SHIFT * 16;
 		uint16 Z[4];
 
 		((uintx64*)Z)[0] = ldg256(&(W + shift)[0]);
@@ -510,14 +510,14 @@ void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce)
 }
 
 __global__ __launch_bounds__(128, 1)
-void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
+void neoscrypt_gpu_hash_k4(uint32_t threads, uint32_t startNonce, uint32_t *nonceRes, bool stratum)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-//	if (thread < threads)
+	if (thread < threads)
 	{
 		const uint32_t nonce = startNonce + thread;
-		uint32_t shift = SHIFT * 16 * thread;
+		uint32_t shift = thread * SHIFT * 16;
 		uint16 Z[4];
 		uint32_t outbuf[8];
 		uint32_t data[80];
@@ -539,33 +539,37 @@ void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, u
 		((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[2064]);
 		fastkdf32(data, (uint32_t*)Z, outbuf);
 		if (outbuf[7] <= pTarget[7]) {
-			uint32_t tmp = atomicExch(&nonceVector[0], nonce);
+			atomicMin(nonceRes, nonce); // init val is UINT32_MAX
 		}
 	}
 }
 
-void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t *hash)
+__host__
+void neoscrypt_cpu_init(int thr_id, uint32_t threads)
 {
-	cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice);
-	cudaMemcpyToSymbol(W, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice);
+	cuda_get_arch(thr_id);
 	cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t));
+	CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], threads * 256 * SHIFT));
+	cudaMemcpyToSymbol(W, &d_buffer[thr_id], sizeof(uint4*), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice);
 }
 
 __host__
-uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order)
+uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, bool have_stratum, int order)
 {
-	uint32_t result[MAX_GPUS] = { 0xffffffff };
+	uint32_t result[MAX_GPUS];
+	memset(result, 0xff, sizeof(result));
 	cudaMemset(d_NNonce[thr_id], 0xff, sizeof(uint32_t));
 
 	const uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	neoscrypt_gpu_hash_k0 <<< grid, block >>>(stratum, threads, startNounce);
+	neoscrypt_gpu_hash_k0 <<< grid, block >>>(threads, startNounce, have_stratum);
 	neoscrypt_gpu_hash_k01 <<< grid, block >>>(threads, startNounce);
 	neoscrypt_gpu_hash_k2 <<< grid, block >>>(threads, startNounce);
 	neoscrypt_gpu_hash_k3 <<< grid, block >>>(threads, startNounce);
-	neoscrypt_gpu_hash_k4 <<< grid, block >>>(stratum, threads, startNounce, d_NNonce[thr_id]);
+	neoscrypt_gpu_hash_k4 <<< grid, block >>>(threads, startNounce, d_NNonce[thr_id], have_stratum);
 
 	MyStreamSynchronize(NULL, order, thr_id);
 	cudaMemcpy(&result[thr_id], d_NNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
@@ -578,6 +582,7 @@ void neoscrypt_setBlockTarget(uint32_t* pdata, const void *target)
 {
 	unsigned char PaddedMessage[80*4]; //bring balance to the force
 	uint32_t input[16], key[16] = { 0 };
+
 	memcpy(PaddedMessage, pdata, 80);
 	memcpy(PaddedMessage + 80, pdata, 80);
 	memcpy(PaddedMessage + 160, pdata, 80);
diff --git a/neoscrypt/cuda_vectors.h b/neoscrypt/cuda_vectors.h
index 67f77d9..08fc0ee 100644
--- a/neoscrypt/cuda_vectors.h
+++ b/neoscrypt/cuda_vectors.h
@@ -478,23 +478,25 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
 
 #if __CUDA_ARCH__ < 320
 
-// right shift a 64 bytes input (256-bits integer) by 0 8 16 24 bits
-static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
+// right shift a 32-byte input (256-bit integer) by 0 8 16 24 bits
+// requires a uint32_t[9] ret array
+// note: the djm34 neoscrypt implementation is near the limits of gpu capabilities
+// and weird behaviors can happen when tuning device function code...
+__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
 {
 	uint8_t *v = (uint8_t*) &vec4.s0;
 	uint8_t *r = (uint8_t*) ret;
 	uint8_t bytes = (uint8_t) (shift >> 3);
-	for (uint8_t i=0; i<bytes; i++) r[i] = 0;
-	for (uint8_t i=0; i<32; i++) r[i+bytes] = v[i];
-	ret[8] = vec4.s7 >> (32 - shift); // shuffled part required ?
- //printf("A %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]); + ret[8] = vec4.s7 >> (32 - shift); // shuffled part required } + #else -// right shift a 32 bytes input (256-bits integer) by 0 8 16 24 bits -static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +// same for SM 3.5+, really faster ? +__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) { uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); @@ -522,7 +524,6 @@ static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec ret[1] = cuda_swab32(truc); asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); ret[0] = cuda_swab32(truc); - //printf("B %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]); } #endif diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp index f1d3290..9238e77 100644 --- a/neoscrypt/neoscrypt.cpp +++ b/neoscrypt/neoscrypt.cpp @@ -2,13 +2,9 @@ #include "miner.h" #include "neoscrypt/neoscrypt.h" -static uint32_t *d_hash[MAX_GPUS] ; extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget); -extern void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t* hash); -extern uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order); -extern int cuda_get_arch(int thr_id); - -#define SHIFT 130 +extern void neoscrypt_cpu_init(int thr_id, uint32_t threads); +extern uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, bool have_stratum, int order); static bool init[MAX_GPUS] = { 0 }; @@ -16,43 +12,35 @@ int scanhash_neoscrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uin { const uint32_t first_nonce = pdata[19]; - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - int intensity = is_windows() ? 
 	uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
 	throughput = throughput / 32; /* set for max intensity ~= 20 */
 	throughput = min(throughput, max_nonce - first_nonce + 1);
 
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x0000ff;
+
 	if (!init[thr_id])
 	{
 		int dev_id = device_map[thr_id];
 		cudaSetDevice(dev_id);
 		cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-		cuda_get_arch(thr_id);
 
 		if (device_sm[dev_id] <= 300) {
 			applog(LOG_ERR, "Sorry neoscrypt is not supported on SM 3.0 devices");
			proper_exit(EXIT_CODE_CUDA_ERROR);
 		}
 
-		cudaMalloc(&d_hash[thr_id], 32 * SHIFT * sizeof(uint64_t) * throughput);
-		neoscrypt_cpu_init(thr_id, throughput, d_hash[thr_id]);
-
 		applog(LOG_INFO, "Using %d cuda threads", throughput);
-		if (cudaGetLastError() != cudaSuccess) {
-			cudaError_t err = cudaGetLastError();
-			fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n",
-				__FUNCTION__, __LINE__, cudaGetErrorString(err) );
-			proper_exit(EXIT_FAILURE);
-		}
+		neoscrypt_cpu_init(thr_id, throughput);
+
 		init[thr_id] = true;
 	}
 
 	uint32_t endiandata[20];
 	if (have_stratum) {
 		for (int k = 0; k < 20; k++)
-			be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+			be32enc(&endiandata[k], pdata[k]);
 	} else {
 		for (int k = 0; k < 20; k++)
 			endiandata[k] = pdata[k];
@@ -61,7 +49,7 @@ int scanhash_neoscrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uin
 	neoscrypt_setBlockTarget(endiandata,ptarget);
 
 	do {
-		uint32_t foundNonce = neoscrypt_cpu_hash_k4((int)have_stratum, thr_id, throughput, pdata[19], 0);
+		uint32_t foundNonce = neoscrypt_cpu_hash_k4(thr_id, throughput, pdata[19], have_stratum, 0);
 		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t _ALIGN(64) vhash64[8];
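--
Two notes on the buffer math and the nonce reporting in this patch:

The old and new scratchpad allocations are the same size: the old
32 * SHIFT * sizeof(uint64_t) per thread and the new 256 * SHIFT bytes
per thread are both 33280 bytes (SHIFT = 130), which is exactly the
SHIFT * 16 uint4 stride walked by the per-thread `shift` index.

The atomicMin() change works because d_NNonce is memset to 0xff bytes
(UINT32_MAX) before each launch: when several threads find a share in
the same launch, the lowest nonce now wins deterministically, where the
old atomicExch() kept whichever thread happened to write last. Below is
a minimal standalone sketch of that pattern (hypothetical names, not
code from this patch):

// nonce_min.cu - illustration only, build with: nvcc nonce_min.cu
#include <cstdio>
#include <cstdint>

__global__ void report_nonce(uint32_t startNonce, uint32_t *nonceRes)
{
	uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
	// pretend every 1000th thread finds a valid share
	if (thread % 1000 == 0)
		atomicMin(nonceRes, startNonce + thread); // lowest nonce wins
}

int main()
{
	uint32_t *d_res, h_res = 0;
	cudaMalloc(&d_res, sizeof(uint32_t));
	cudaMemset(d_res, 0xff, sizeof(uint32_t)); // init val is UINT32_MAX
	report_nonce<<<16, 128>>>(0x1000, d_res);
	cudaMemcpy(&h_res, d_res, sizeof(uint32_t), cudaMemcpyDeviceToHost);
	printf("found nonce: 0x%08x\n", h_res); // UINT32_MAX means "none found"
	cudaFree(d_res);
	return 0;
}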