From 2b43d57d42590e080f6add5c711d47859c49d80b Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 5 Jun 2015 23:12:17 +0200
Subject: [PATCH] lyra2: simplify skein code (no perf changes)

---
 Algo256/cuda_skein256.cu | 223 ++++++++++++++++++---------------------
 1 file changed, 103 insertions(+), 120 deletions(-)

diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu
index bc9acef..6902509 100644
--- a/Algo256/cuda_skein256.cu
+++ b/Algo256/cuda_skein256.cu
@@ -20,21 +20,17 @@ static __constant__ uint2 vSKEIN_IV512_256[8] = {
 	{ 0x33EDFC13, 0x3EEDBA18 }
 };
 
-static __constant__ int ROT256[8][4] =
-{
-	46,36, 19, 37,
-	33,27, 14, 42,
-	17,49, 36, 39,
-	44, 9, 54, 56,
-	39,30, 34, 24,
-	13,50, 10, 17,
-	25,29, 39, 43,
-	8, 35, 56, 22,
+static __constant__ uint8_t ROT256[8][4] = {
+	46, 36, 19, 37,
+	33, 27, 14, 42,
+	17, 49, 36, 39,
+	44,  9, 54, 56,
+	39, 30, 34, 24,
+	13, 50, 10, 17,
+	25, 29, 39, 43,
+	8,  35, 56, 22,
 };
 
-static __constant__ uint2 skein_ks_parity = { 0xA9FC1A22,0x1BD11BDA};
-static __constant__ uint64_t skein_ks_parity64 = 0x1BD11BDAA9FC1A22ull;
-
 static __constant__ uint2 t12[6] = {
 	{ 0x20,	0 },
 	{ 0,	0xf0000000 },
@@ -44,25 +40,19 @@ static __constant__ uint2 t12[6] = {
 	{ 0x08,	0xff000000 }
 };
 
-static __constant__ uint64_t t12_30[6] = {
-	0x20,
-	0xf000000000000000,
-	0xf000000000000020,
-	0x08,
-	0xff00000000000000,
-	0xff00000000000008
-};
+static __constant__ uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA };
+static __constant__ uint64_t skein_ks_parity64 = 0x1BD11BDAA9FC1A22ull;
+
 
 static __forceinline__ __device__
 void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int ROT)
 {
-	p0 += p1; p1 = ROL2(p1, ROT256[ROT][0]);  p1 ^= p0;
-	p2 += p3; p3 = ROL2(p3, ROT256[ROT][1]);  p3 ^= p2;
-	p4 += p5; p5 = ROL2(p5, ROT256[ROT][2]);  p5 ^= p4;
-	p6 += p7; p7 = ROL2(p7, ROT256[ROT][3]);  p7 ^= p6;
+	p0 += p1; p1 = ROL2(p1, ROT256[ROT][0]) ^ p0;
+	p2 += p3; p3 = ROL2(p3, ROT256[ROT][1]) ^ p2;
+	p4 += p5; p5 = ROL2(p5, ROT256[ROT][2]) ^ p4;
+	p6 += p7; p7 = ROL2(p7, ROT256[ROT][3]) ^ p6;
 }
 
-
 static __forceinline__ __device__
 void Round_8_512v35(uint2 *ks, uint2 *ts,
                     uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3,
@@ -72,39 +62,36 @@ void Round_8_512v35(uint2 *ks, uint2 *ts,
 	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 1);
 	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 2);
 	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 3);
-	p0 += ks[((R)+0) % 9];   /* inject the key schedule value */
-	p1 += ks[((R)+1) % 9];
-	p2 += ks[((R)+2) % 9];
-	p3 += ks[((R)+3) % 9];
-	p4 += ks[((R)+4) % 9];
-	p5 += ks[((R)+5) % 9] + ts[((R)+0) % 3];
-	p6 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
-	p7 += ks[((R)+7) % 9] + make_uint2((R),0);
+	p0 += ks[(R+0) % 9];   /* inject the key schedule value */
+	p1 += ks[(R+1) % 9];
+	p2 += ks[(R+2) % 9];
+	p3 += ks[(R+3) % 9];
+	p4 += ks[(R+4) % 9];
+	p5 += ks[(R+5) % 9] + ts[(R+0) % 3];
+	p6 += ks[(R+6) % 9] + ts[(R+1) % 3];
+	p7 += ks[(R+7) % 9] + make_uint2((R),0);
 	Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 4);
 	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 5);
 	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 6);
 	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 7);
-	p0 += ks[((R)+1) % 9];   /* inject the key schedule value */
-	p1 += ks[((R)+2) % 9];
-	p2 += ks[((R)+3) % 9];
-	p3 += ks[((R)+4) % 9];
-	p4 += ks[((R)+5) % 9];
-	p5 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
-	p6 += ks[((R)+7) % 9] + ts[((R)+2) % 3];
-	p7 += ks[((R)+8) % 9] + make_uint2((R)+1, 0);
+	p0 += ks[(R+1) % 9];   /* inject the key schedule value */
+	p1 += ks[(R+2) % 9];
+	p2 += ks[(R+3) % 9];
+	p3 += ks[(R+4) % 9];
+	p4 += ks[(R+5) % 9];
+	p5 += ks[(R+6) % 9] + ts[(R+1) % 3];
+	p6 += ks[(R+7) % 9] + ts[(R+2) % 3];
+	p7 += ks[(R+8) % 9] + make_uint2(R+1, 0);
 }
 
 
 __global__ __launch_bounds__(256,3)
 void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
 {
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		uint2 h[9];
-		uint2 t[3];
-		uint2 dt0,dt1,dt2,dt3;
-		uint2 p0, p1, p2, p3, p4, p5, p6, p7;
+		uint2 h[9], t[3] = { t12[0], t12[1], t12[2] };
 
 		h[8] = skein_ks_parity;
 		for (int i = 0; i<8; i++) {
@@ -112,27 +99,23 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp
 			h[8] ^= h[i];
 		}
 
-		t[0]=t12[0];
-		t[1]=t12[1];
-		t[2]=t12[2];
-
-		LOHI(dt0.x,dt0.y,outputHash[thread]);
-		LOHI(dt1.x,dt1.y,outputHash[threads+thread]);
-		LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]);
-		LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]);
-
-		p0 = h[0] + dt0;
-		p1 = h[1] + dt1;
-		p2 = h[2] + dt2;
-		p3 = h[3] + dt3;
-		p4 = h[4];
-		p5 = h[5] + t[0];
-		p6 = h[6] + t[1];
-		p7 = h[7];
-
-		#pragma unroll
+		uint2 dt0 = vectorize(outputHash[thread]);
+		uint2 dt1 = vectorize(outputHash[threads   + thread]);
+		uint2 dt2 = vectorize(outputHash[threads*2 + thread]);
+		uint2 dt3 = vectorize(outputHash[threads*3 + thread]);
+
+		uint2 p0 = h[0] + dt0;
+		uint2 p1 = h[1] + dt1;
+		uint2 p2 = h[2] + dt2;
+		uint2 p3 = h[3] + dt3;
+		uint2 p4 = h[4];
+		uint2 p5 = h[5] + t[0];
+		uint2 p6 = h[6] + t[1];
+		uint2 p7 = h[7];
+
+		#pragma unroll 9
 		for (int i = 1; i<19; i+=2) {
-			Round_8_512v35(h,t,p0,p1,p2,p3,p4,p5,p6,p7,i);
+			Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
 		}
 
 		p0 ^= dt0;
@@ -158,18 +141,19 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp
 		t[0] = t12[3];
 		t[1] = t12[4];
 		t[2] = t12[5];
-		p5 += t[0];  //p5 already equal h[5]
+
+		p5 += t[0];
 		p6 += t[1];
 
-		#pragma unroll
+		#pragma unroll 9
 		for (int i = 1; i<19; i+=2) {
 			Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
 		}
 
-		outputHash[thread]           = devectorize(p0);
-		outputHash[threads+thread]   = devectorize(p1);
-		outputHash[2*threads+thread] = devectorize(p2);
-		outputHash[3*threads+thread] = devectorize(p3);
+		outputHash[thread]             = devectorize(p0);
+		outputHash[threads   + thread] = devectorize(p1);
+		outputHash[threads*2 + thread] = devectorize(p2);
+		outputHash[threads*3 + thread] = devectorize(p3);
 	}
 }
 
@@ -177,10 +161,10 @@ static __forceinline__ __device__
 void Round512v30(uint64_t &p0, uint64_t &p1, uint64_t &p2, uint64_t &p3,
 	uint64_t &p4, uint64_t &p5, uint64_t &p6, uint64_t &p7, int ROT)
 {
-	p0 += p1; p1 = ROTL64(p1, ROT256[ROT][0]);  p1 ^= p0;
-	p2 += p3; p3 = ROTL64(p3, ROT256[ROT][1]);  p3 ^= p2;
-	p4 += p5; p5 = ROTL64(p5, ROT256[ROT][2]);  p5 ^= p4;
-	p6 += p7; p7 = ROTL64(p7, ROT256[ROT][3]);  p7 ^= p6;
+	p0 += p1; p1 = ROTL64(p1, ROT256[ROT][0]) ^ p0;
+	p2 += p3; p3 = ROTL64(p3, ROT256[ROT][1]) ^ p2;
+	p4 += p5; p5 = ROTL64(p5, ROT256[ROT][2]) ^ p4;
+	p6 += p7; p7 = ROTL64(p7, ROT256[ROT][3]) ^ p6;
 }
 
 static __forceinline__ __device__
@@ -191,38 +175,36 @@ void Round_8_512v30(uint64_t *ks, uint64_t *ts, uint64_t &p0, uint64_t &p1, uint
 	Round512v30(p2, p1, p4, p7, p6, p5, p0, p3, 1);
 	Round512v30(p4, p1, p6, p3, p0, p5, p2, p7, 2);
 	Round512v30(p6, p1, p0, p7, p2, p5, p4, p3, 3);
-	p0 += ks[((R)+0) % 9];   /* inject the key schedule value */
-	p1 += ks[((R)+1) % 9];
-	p2 += ks[((R)+2) % 9];
-	p3 += ks[((R)+3) % 9];
-	p4 += ks[((R)+4) % 9];
-	p5 += ks[((R)+5) % 9] + ts[((R)+0) % 3];
-	p6 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
-	p7 += ks[((R)+7) % 9] + R;
+	p0 += ks[(R+0) % 9];   /* inject the key schedule value */
+	p1 += ks[(R+1) % 9];
+	p2 += ks[(R+2) % 9];
+	p3 += ks[(R+3) % 9];
+	p4 += ks[(R+4) % 9];
+	p5 += ks[(R+5) % 9] + ts[(R+0) % 3];
+	p6 += ks[(R+6) % 9] + ts[(R+1) % 3];
+	p7 += ks[(R+7) % 9] + R;
 	Round512v30(p0, p1, p2, p3, p4, p5, p6, p7, 4);
 	Round512v30(p2, p1, p4, p7, p6, p5, p0, p3, 5);
 	Round512v30(p4, p1, p6, p3, p0, p5, p2, p7, 6);
 	Round512v30(p6, p1, p0, p7, p2, p5, p4, p3, 7);
-	p0 += ks[((R)+1) % 9];   /* inject the key schedule value */
-	p1 += ks[((R)+2) % 9];
-	p2 += ks[((R)+3) % 9];
-	p3 += ks[((R)+4) % 9];
-	p4 += ks[((R)+5) % 9];
-	p5 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
-	p6 += ks[((R)+7) % 9] + ts[((R)+2) % 3];
-	p7 += ks[((R)+8) % 9] + (R)+1;
+	p0 += ks[(R+1) % 9];   /* inject the key schedule value */
+	p1 += ks[(R+2) % 9];
+	p2 += ks[(R+3) % 9];
+	p3 += ks[(R+4) % 9];
+	p4 += ks[(R+5) % 9];
+	p5 += ks[(R+6) % 9] + ts[(R+1) % 3];
+	p6 += ks[(R+7) % 9] + ts[(R+2) % 3];
+	p7 += ks[(R+8) % 9] + R+1;
 }
 
 __global__  __launch_bounds__(256, 3)
 void skein256_gpu_hash_32_v30(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
 {
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		uint64_t h[9];
-		uint64_t t[3];
-		uint64_t dt0, dt1, dt2, dt3;
-		uint64_t p0, p1, p2, p3, p4, p5, p6, p7;
+		uint64_t h[9], t[3];
+
 		h[8] = skein_ks_parity64;
 		for (int i = 0; i<8; i++) {
 			h[i] = SKEIN_IV512_256[i];
@@ -233,21 +215,21 @@ void skein256_gpu_hash_32_v30(uint32_t threads, uint32_t startNounce, uint64_t *
 		t[1] = devectorize(t12[1]);
 		t[2] = devectorize(t12[2]);
 
-		dt0 = outputHash[thread];
-		dt1 = outputHash[threads+thread];
-		dt2 = outputHash[2*threads+thread];
-		dt3 = outputHash[3*threads+thread];
-
-		p0 = h[0] + dt0;
-		p1 = h[1] + dt1;
-		p2 = h[2] + dt2;
-		p3 = h[3] + dt3;
-		p4 = h[4];
-		p5 = h[5] + t[0];
-		p6 = h[6] + t[1];
-		p7 = h[7];
-
-		#pragma unroll
+		uint64_t dt0 = outputHash[thread];
+		uint64_t dt1 = outputHash[threads   + thread];
+		uint64_t dt2 = outputHash[threads*2 + thread];
+		uint64_t dt3 = outputHash[threads*3 + thread];
+
+		uint64_t p0 = h[0] + dt0;
+		uint64_t p1 = h[1] + dt1;
+		uint64_t p2 = h[2] + dt2;
+		uint64_t p3 = h[3] + dt3;
+		uint64_t p4 = h[4];
+		uint64_t p5 = h[5] + t[0];
+		uint64_t p6 = h[6] + t[1];
+		uint64_t p7 = h[7];
+
+		#pragma unroll 9
 		for (int i = 1; i<19; i += 2) {
 			Round_8_512v30(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
 		}
@@ -272,22 +254,22 @@ void skein256_gpu_hash_32_v30(uint32_t threads, uint32_t startNounce, uint64_t *
 			h[8] ^= h[i];
 		}
 
-		t[0] = t12_30[3];
-		t[1] = t12_30[4];
-		t[2] = t12_30[5];
+		t[0] = devectorize(t12[3]);
+		t[1] = devectorize(t12[4]);
+		t[2] = devectorize(t12[5]);
 
 		p5 += t[0];  //p5 already equal h[5]
 		p6 += t[1];
 
-		#pragma unroll
+		#pragma unroll 9
 		for (int i = 1; i<19; i += 2) {
 			Round_8_512v30(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
 		}
 
 		outputHash[thread] = p0;
-		outputHash[threads + thread] = p1;
-		outputHash[2 * threads + thread] = p2;
-		outputHash[3 * threads + thread] = p3;
+		outputHash[threads   + thread] = p1;
+		outputHash[threads*2 + thread] = p2;
+		outputHash[threads*3 + thread] = p3;
 
 	} //thread
 }
@@ -306,6 +288,7 @@ void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, ui
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
+	// only 1kH/s perf change between kernels on a 960...
 	if (device_sm[device_map[thr_id]] > 300 && cuda_arch[device_map[thr_id]] > 300)
 		skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
 	else