From 6a9280a04590beaf70b2252101982959796e8a5f Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Wed, 14 Oct 2015 21:55:40 +0200
Subject: [PATCH] lyra2v2: set a better TPB for intensity 20 (sm52)

use sp forced unroll in skein and do some cleanup...
---
 Algo256/cuda_skein256.cu |  31 +++++++----
 lyra2/cuda_lyra2v2.cu    | 108 +++++++++++++++++++--------------------
 lyra2/lyra2REv2.cu       |   4 +-
 util.cpp                 |   2 +-
 4 files changed, 78 insertions(+), 67 deletions(-)

diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu
index 2f1bc82..cbeb660 100644
--- a/Algo256/cuda_skein256.cu
+++ b/Algo256/cuda_skein256.cu
@@ -119,10 +119,16 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp
 		p6 = h[6] + t12[1];
 		p7 = h[7];
 
-		#pragma unroll
-		for (int i = 1; i<19; i+=2) {
-			Round_8_512v35(h,t12,p0,p1,p2,p3,p4,p5,p6,p7,i);
-		}
+		// forced unroll required
+		Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 1);
+		Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 3);
+		Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 5);
+		Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 7);
+		Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 9);
+		Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 11);
+		Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 13);
+		Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 15);
+		Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 17);
 
 		p0 ^= dt0;
 		p1 ^= dt1;
@@ -143,11 +149,17 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp
 		p5 += t12[3];  //p5 already equal h[5]
 		p6 += t12[4];
 
-		#pragma unroll
-		for (int i = 1; i<17; i+=2)	{
-			Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
-		}
+		// forced unroll
+		Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 1);
+		Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 3);
+		Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 5);
+		Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 7);
+		Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 9);
+		Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 11);
+		Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 13);
+		Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 15);
 		Round_8_512v35_final(h, t, p0, p1, p2, p3, p4, p5, p6, p7);
+
 		outputHash[thread]           = devectorize(p0);
 		outputHash[threads+thread]   = devectorize(p1);
 		outputHash[2*threads+thread] = devectorize(p2);
@@ -285,12 +297,13 @@ __host__
 void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
 	const uint32_t threadsperblock = 256;
+	int dev_id = device_map[thr_id];
 
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
 	// only 1kH/s perf change between kernels on a 960...
-	if (device_sm[device_map[thr_id]] > 300 && cuda_arch[device_map[thr_id]] > 300)
+	if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
 		skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
 	else
 		skein256_gpu_hash_32_v30<<<grid, block>>>(threads, startNounce, d_outputHash);
diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu
index 5493c1a..bd2ad25 100644
--- a/lyra2/cuda_lyra2v2.cu
+++ b/lyra2/cuda_lyra2v2.cu
@@ -2,7 +2,7 @@
 #include <stdint.h>
 #include <memory.h>
 
-#define TPB52 10
+#define TPB52 8
 #define TPB50 16
 
 #include "cuda_lyra2v2_sm3.cuh"
@@ -27,8 +27,8 @@ __device__ __forceinline__
 void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
 {
 	a += b; d ^= a; d = SWAPUINT2(d);
-	c += d; b ^= c; b = ROR24(b);
-	a += b; d ^= a; d = ROR16(d);
+	c += d; b ^= c; b = ROR2(b, 24);
+	a += b; d ^= a; d = ROR2(d, 16);
 	c += d; b ^= c; b = ROR2(b, 63);
 }
 
@@ -47,11 +47,11 @@ void round_lyra_v5(uint4x2* s)
 }
 
 __device__ __forceinline__
-void reduceDuplex(uint4x2 state[4], uint32_t thread)
+void reduceDuplex(uint4x2 state[4], const uint32_t thread)
 {
 	uint4x2 state1[3];
-	uint32_t ps1 = (Nrow * Ncol * memshift * thread);
-	uint32_t ps2 = (memshift * (Ncol-1) + memshift * Ncol + Nrow * Ncol * memshift * thread);
+	const uint32_t ps1 = (Nrow * Ncol * memshift * thread);
+	const uint32_t ps2 = (memshift * (Ncol-1) + memshift * Ncol + Nrow * Ncol * memshift * thread);
 
 	#pragma unroll 4
 	for (int i = 0; i < Ncol; i++)
@@ -80,16 +80,16 @@ void reduceDuplex(uint4x2 state[4], uint32_t thread)
 }
 
 __device__ __forceinline__
-void reduceDuplex50(uint4x2 state[4], uint32_t thread)
+void reduceDuplex50(uint4x2 state[4], const uint32_t thread)
 {
-	uint32_t ps1 = (Nrow * Ncol * memshift * thread);
-	uint32_t ps2 = (memshift * (Ncol - 1) + memshift * Ncol + Nrow * Ncol * memshift * thread);
+	const uint32_t ps1 = (Nrow * Ncol * memshift * thread);
+	const uint32_t ps2 = (memshift * (Ncol - 1) + memshift * Ncol + Nrow * Ncol * memshift * thread);
 
 	#pragma unroll 4
 	for (int i = 0; i < Ncol; i++)
 	{
-		uint32_t s1 = ps1 + i*memshift;
-		uint32_t s2 = ps2 - i*memshift;
+		const uint32_t s1 = ps1 + i*memshift;
+		const int32_t s2 = ps2 - i*memshift;
 
 		#pragma unroll
 		for (int j = 0; j < 3; j++)
@@ -104,19 +104,19 @@ void reduceDuplex50(uint4x2 state[4], uint32_t thread)
 }
 
 __device__ __forceinline__
-void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, uint4x2 state[4], uint32_t thread)
+void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, uint4x2 state[4], const uint32_t thread)
 {
 	uint4x2 state2[3], state1[3];
 
-	uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread);
-	uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread);
-	uint32_t ps3 = (memshift * (Ncol-1) + memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread);
+	const uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread);
+	const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread);
+	const uint32_t ps3 = (memshift * (Ncol-1) + memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread);
 
 	for (int i = 0; i < Ncol; i++)
 	{
-		uint32_t s1 = ps1 + i*memshift;
-		uint32_t s2 = ps2 + i*memshift;
-		uint32_t s3 = ps3 - i*memshift;
+		const uint32_t s1 = ps1 + i*memshift;
+		const uint32_t s2 = ps2 + i*memshift;
+		const uint32_t s3 = ps3 - i*memshift;
 
 #if __CUDA_ARCH__ == 500
 
@@ -179,18 +179,18 @@ void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOu
 
 
 __device__ __forceinline__
-void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint4x2* state, uint32_t thread)
+void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint4x2* state, const uint32_t thread)
 {
-	uint4x2 state1[3],state2[3];
-	uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread);
-	uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread);
-	uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread);
+	uint4x2 state1[3], state2[3];
+	const uint32_t ps1 = (memshift * Ncol * rowIn    + Nrow * Ncol * memshift * thread);
+	const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread);
+	const uint32_t ps3 = (memshift * Ncol * rowOut   + Nrow * Ncol * memshift * thread);
 
 	for (int i = 0; i < Ncol; i++)
 	{
-		uint32_t s1 = ps1 + i*memshift;
-		uint32_t s2 = ps2 + i*memshift;
-		uint32_t s3 = ps3 + i*memshift;
+		const uint32_t s1 = ps1 + i*memshift;
+		const uint32_t s2 = ps2 + i*memshift;
+		const uint32_t s3 = ps3 + i*memshift;
 
 		#pragma unroll
 		for (int j = 0; j < 3; j++)
@@ -255,12 +255,10 @@ __global__ __launch_bounds__(TPB50, 1)
 #else
 __global__ __launch_bounds__(TPB52, 1)
 #endif
-void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
+void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_hash)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 
-	uint4x2 state[4];
-
 	uint4x2 blake2b_IV[2];
 
 	if (threadIdx.x == 0) {
@@ -275,10 +273,12 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa
 
 	if (thread < threads)
 	{
-		((uint2*)state)[0] = __ldg(&outputHash[thread]);
-		((uint2*)state)[1] = __ldg(&outputHash[thread + threads]);
-		((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]);
-		((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]);
+		uint4x2 state[4];
+
+		((uint2*)state)[0] = __ldg(&g_hash[thread]);
+		((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);
+		((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]);
+		((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]);
 
 		state[1] = state[0];
 
@@ -286,7 +286,7 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa
 		state[3] = ((blake2b_IV)[1]);
 
 		for (int i = 0; i<12; i++)
-			 round_lyra_v5(state);
+			round_lyra_v5(state);
 
 		((uint2*)state)[0].x ^= 0x20;
 		((uint2*)state)[1].x ^= 0x20;
@@ -298,9 +298,9 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa
 		((uint2*)state)[7].y ^= 0x01000000;
 
 		for (int i = 0; i<12; i++)
-			 round_lyra_v5(state);
+			round_lyra_v5(state);
 
-		uint32_t ps1 = (memshift * (Ncol - 1) + Nrow * Ncol * memshift * thread);
+		const uint32_t ps1 = (memshift * (Ncol - 1) + Nrow * Ncol * memshift * thread);
 
 		for (int i = 0; i < Ncol; i++)
 		{
@@ -323,7 +323,7 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa
 		{
 			rowa = ((uint2*)state)[0].x & 3;
 			reduceDuplexRowtV2(prev, rowa, i, state, thread);
-			prev=i;
+			prev = i;
 		}
 
 		const uint32_t shift = (memshift * Ncol * rowa + Nrow * Ncol * memshift * thread);
@@ -335,10 +335,10 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa
 		for (int i = 0; i < 12; i++)
 			round_lyra_v5(state);
 
-		outputHash[thread] =               ((uint2*)state)[0];
-		outputHash[thread + threads]     = ((uint2*)state)[1];
-		outputHash[thread + 2 * threads] = ((uint2*)state)[2];
-		outputHash[thread + 3 * threads] = ((uint2*)state)[3];
+		g_hash[thread]             = ((uint2*)state)[0];
+		g_hash[thread + threads]   = ((uint2*)state)[1];
+		g_hash[thread + threads*2] = ((uint2*)state)[2];
+		g_hash[thread + threads*3] = ((uint2*)state)[3];
 	}
 }
 #else
@@ -346,36 +346,34 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa
 #if __CUDA_ARCH__ < 300
 __device__ void* DMatrix;
 #endif
-__global__ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
+__global__ void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
 #endif
 
 __host__
 void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
 {
+	cuda_get_arch(thr_id);
 	// just assign the device pointer allocated in main loop
 	cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
 }
 
 __host__
-void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order)
 {
-	uint32_t tpb;
-	if (device_sm[device_map[thr_id]] < 350)
-		tpb = TPB30;
-	else if (device_sm[device_map[thr_id]] == 350)
-		tpb = TPB35;
-	else if (device_sm[device_map[thr_id]] == 500)
-		tpb = TPB50;
-	else
-		tpb = TPB52;
+	int dev_id = device_map[thr_id % MAX_GPUS];
+	uint32_t tpb = TPB52;
+
+	if (device_sm[dev_id] == 500 || cuda_arch[dev_id] == 500) tpb = TPB50;
+	else if (device_sm[dev_id] == 350 || cuda_arch[dev_id] == 350) tpb = TPB35;
+	else if (device_sm[dev_id] < 350 || cuda_arch[dev_id] < 350) tpb = TPB30;
 
 	dim3 grid((threads + tpb - 1) / tpb);
 	dim3 block(tpb);
 
-	if (device_sm[device_map[thr_id]] >= 500)
-		lyra2v2_gpu_hash_32    <<<grid, block>>> (threads, startNounce, (uint2*)d_outputHash);
+	if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
+		lyra2v2_gpu_hash_32    <<<grid, block>>> (threads, startNounce, (uint2*)g_hash);
 	else
-		lyra2v2_gpu_hash_32_v3 <<<grid, block>>> (threads, startNounce, (uint2*)d_outputHash);
+		lyra2v2_gpu_hash_32_v3 <<<grid, block>>> (threads, startNounce, (uint2*)g_hash);
 
 	//MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
index d8b590e..54b8303 100644
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -11,7 +11,7 @@ extern "C" {
 #include "cuda_helper.h"
 
 
-static _ALIGN(64) uint64_t *d_hash[MAX_GPUS];
+static uint64_t *d_hash[MAX_GPUS];
 static uint64_t* d_matrix[MAX_GPUS];
 
 extern void blake256_cpu_init(int thr_id, uint32_t threads);
@@ -86,7 +86,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
-		ptarget[7] = 0x00ff;
+		ptarget[7] = 0x000f;
 
 	if (!init[thr_id])
 	{
diff --git a/util.cpp b/util.cpp
index fb9e12d..97e4b28 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1849,7 +1849,7 @@ static uint32_t zrtest[20] = {
 
 void do_gpu_tests(void)
 {
-#if 1 //def _DEBUG
+#ifdef _DEBUG
 	unsigned long done;
 	char s[128] = { '\0' };
 	uchar buf[160];