From 63e3387dbbe62fd21690d2f56d58903fe31b09ed Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Tue, 16 Dec 2014 14:18:12 +0100
Subject: [PATCH] lyra2: add sm30 device compat (skein256)

---
 Algo256/cuda_skein256.cu | 130 +++++++++++++++++++++++++++++++++++++--
 lyra2/cuda_lyra2.cu      |  77 +----------------------
 2 files changed, 126 insertions(+), 81 deletions(-)

diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu
index 9e4013f..5cc0904 100644
--- a/Algo256/cuda_skein256.cu
+++ b/Algo256/cuda_skein256.cu
@@ -2,14 +2,12 @@
 
 #include "cuda_helper.h"
 
-#if 0
 static __constant__ uint64_t SKEIN_IV512_256[8] = {
 	0xCCD044A12FDB3E13, 0xE83590301A79A9EB,
 	0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB,
 	0xEC06025E74DD7683, 0xE7A436CDC4746251,
 	0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13
 };
-#endif
 
 static __constant__ uint2 vSKEIN_IV512_256[8] = {
 	{ 0x2FDB3E13, 0xCCD044A1 },
@@ -35,6 +33,8 @@ static __constant__ int ROT256[8][4] =
 };
 
 static __constant__ uint2 skein_ks_parity = { 0xA9FC1A22,0x1BD11BDA};
+static __constant__ uint64_t skein_ks_parity64 = 0x1BD11BDAA9FC1A22ull;
+
 static __constant__ uint2 t12[6] = {
 	{ 0x20, 0 },
 	{ 0, 0xf0000000 },
@@ -44,7 +44,6 @@ static __constant__ uint2 t12[6] = {
 	{ 0x08, 0xff000000 }
 };
 
-#if 0
 static __constant__ uint64_t t12_30[6] = {
 	0x20,
 	0xf000000000000000,
@@ -53,7 +52,6 @@ static __constant__ uint64_t t12_30[6] = {
 	0xff00000000000000,
 	0xff00000000000008
 };
-#endif
 
 static __forceinline__ __device__
 void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int ROT)
@@ -175,6 +173,125 @@ void skein256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHas
 	}
 }
 
+static __forceinline__ __device__
+void Round512v30(uint64_t &p0, uint64_t &p1, uint64_t &p2, uint64_t &p3,
+	uint64_t &p4, uint64_t &p5, uint64_t &p6, uint64_t &p7, int ROT)
+{
+	p0 += p1; p1 = ROTL64(p1, ROT256[ROT][0]); p1 ^= p0;
+	p2 += p3; p3 = ROTL64(p3, ROT256[ROT][1]); p3 ^= p2;
+	p4 += p5; p5 = ROTL64(p5, ROT256[ROT][2]); p5 ^= p4;
+	p6 += p7; p7 = ROTL64(p7, ROT256[ROT][3]); p7 ^= p6;
+}
+
+static __forceinline__ __device__
+void Round_8_512v30(uint64_t *ks, uint64_t *ts, uint64_t &p0, uint64_t &p1, uint64_t &p2, uint64_t &p3,
+	uint64_t &p4, uint64_t &p5, uint64_t &p6, uint64_t &p7, int R)
+{
+	Round512v30(p0, p1, p2, p3, p4, p5, p6, p7, 0);
+	Round512v30(p2, p1, p4, p7, p6, p5, p0, p3, 1);
+	Round512v30(p4, p1, p6, p3, p0, p5, p2, p7, 2);
+	Round512v30(p6, p1, p0, p7, p2, p5, p4, p3, 3);
+	p0 += ks[((R)+0) % 9];   /* inject the key schedule value */
+	p1 += ks[((R)+1) % 9];
+	p2 += ks[((R)+2) % 9];
+	p3 += ks[((R)+3) % 9];
+	p4 += ks[((R)+4) % 9];
+	p5 += ks[((R)+5) % 9] + ts[((R)+0) % 3];
+	p6 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
+	p7 += ks[((R)+7) % 9] + R;
+	Round512v30(p0, p1, p2, p3, p4, p5, p6, p7, 4);
+	Round512v30(p2, p1, p4, p7, p6, p5, p0, p3, 5);
+	Round512v30(p4, p1, p6, p3, p0, p5, p2, p7, 6);
+	Round512v30(p6, p1, p0, p7, p2, p5, p4, p3, 7);
+	p0 += ks[((R)+1) % 9];   /* inject the key schedule value */
+	p1 += ks[((R)+2) % 9];
+	p2 += ks[((R)+3) % 9];
+	p3 += ks[((R)+4) % 9];
+	p4 += ks[((R)+5) % 9];
+	p5 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
+	p6 += ks[((R)+7) % 9] + ts[((R)+2) % 3];
+	p7 += ks[((R)+8) % 9] + (R)+1;
+}
+
+__global__ __launch_bounds__(256, 3)
+void skein256_gpu_hash_32_v30(int threads, uint32_t startNounce, uint64_t *outputHash)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint64_t h[9];
+		uint64_t t[3];
+		uint64_t dt0, dt1, dt2, dt3;
+		uint64_t p0, p1, p2, p3, p4, p5, p6, p7;
+		h[8] = skein_ks_parity64;
+		for (int i = 0; i<8; i++) {
+			h[i] = SKEIN_IV512_256[i];
+			h[8] ^= h[i];
+		}
+
+		t[0] = devectorize(t12[0]);
+		t[1] = devectorize(t12[1]);
+		t[2] = devectorize(t12[2]);
+
+		dt0 = outputHash[thread];
+		dt1 = outputHash[threads+thread];
+		dt2 = outputHash[2*threads+thread];
+		dt3 = outputHash[3*threads+thread];
+
+		p0 = h[0] + dt0;
+		p1 = h[1] + dt1;
+		p2 = h[2] + dt2;
+		p3 = h[3] + dt3;
+		p4 = h[4];
+		p5 = h[5] + t[0];
+		p6 = h[6] + t[1];
+		p7 = h[7];
+
+		#pragma unroll
+		for (int i = 1; i<19; i += 2) {
+			Round_8_512v30(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
+		}
+
+		p0 ^= dt0;
+		p1 ^= dt1;
+		p2 ^= dt2;
+		p3 ^= dt3;
+
+		h[0] = p0;
+		h[1] = p1;
+		h[2] = p2;
+		h[3] = p3;
+		h[4] = p4;
+		h[5] = p5;
+		h[6] = p6;
+		h[7] = p7;
+		h[8] = skein_ks_parity64;
+
+		#pragma unroll 8
+		for (int i = 0; i<8; i++) {
+			h[8] ^= h[i];
+		}
+
+		t[0] = t12_30[3];
+		t[1] = t12_30[4];
+		t[2] = t12_30[5];
+
+		p5 += t[0];  //p5 already equal h[5]
+		p6 += t[1];
+
+		#pragma unroll
+		for (int i = 1; i<19; i += 2) {
+			Round_8_512v30(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
+		}
+
+		outputHash[thread] = p0;
+		outputHash[threads + thread] = p1;
+		outputHash[2 * threads + thread] = p2;
+		outputHash[3 * threads + thread] = p3;
+
+	} //thread
+}
+
 __host__
 void skein256_cpu_init(int thr_id, int threads)
 {
@@ -189,7 +306,10 @@ void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
+	if (device_sm[device_map[thr_id]] >= 320)
+		skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
+	else
+		skein256_gpu_hash_32_v30<<<grid, block>>>(threads, startNounce, d_outputHash);
 
 	MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu
index 95d61f5..3afd225 100644
--- a/lyra2/cuda_lyra2.cu
+++ b/lyra2/cuda_lyra2.cu
@@ -434,81 +434,6 @@ void lyra2_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
 	} //thread
 }
 
-#if 0
-__global__ __launch_bounds__(TPB, 1)
-void lyra2_gpu_hash_32_test(int threads, uint32_t startNounce, uint64_t *outputHash)
-{
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		uint2 state[16];
-		#pragma unroll
-		for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password
-		#pragma unroll
-		for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
-		#pragma unroll
-		for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; }
-
-		// blake2blyra x2
-		#pragma unroll 24
-		for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough
-
-		uint2 Matrix[12][8][8]; // not cool
-
-		// reducedSqueezeRow0
-		#pragma unroll 8
-		for (int i = 0; i < 8; i++) {
-			#pragma unroll 12
-			for (int j = 0; j<12; j++) { Matrix[j][7-i][0] = state[j]; }
-			round_lyra_v35(state);
-		}
-
-		// reducedSqueezeRow1
-		#pragma unroll 8
-		for (int i = 0; i < 8; i++)
-		{
-			#pragma unroll 12
-			for (int j = 0; j<12; j++) { state[j] ^= Matrix[j][i][0]; }
-			round_lyra_v35(state);
-			#pragma unroll 12
-			for (int j = 0; j<12; j++) { Matrix[j][7-i][1] = Matrix[j][i][0] ^ state[j]; }
-		}
-
-		reduceDuplexRowSetup_test(1, 0, 2);
-		reduceDuplexRowSetup_test(2, 1, 3);
-		reduceDuplexRowSetup_test(3, 0, 4);
-		reduceDuplexRowSetup_test(4, 3, 5);
-		reduceDuplexRowSetup_test(5, 2, 6);
-		reduceDuplexRowSetup_test(6, 1, 7);
-
-		uint64_t rowa;
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(7, rowa, 0);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(0, rowa, 3);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(3, rowa, 6);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(6, rowa, 1);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(1, rowa, 4);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(4, rowa, 7);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(7, rowa, 2);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(2, rowa, 5);
-
-		absorbblock_test(rowa);
-
-		#pragma unroll
-		for (int i = 0; i<4; i++) {
-			outputHash[threads*i + thread] = devectorize(state[i]);
-		} //password
-
-	} //thread
-}
-#endif
 
 __host__
 void lyra2_cpu_init(int thr_id, int threads)
@@ -524,7 +449,7 @@ void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	if (device_sm[device_map[thr_id]] >= 350) {
+	if (device_sm[device_map[thr_id]] >= 320) {
 		lyra2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
 	} else {
 		// kernel for compute30 card
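
Note on the dispatch pattern used by this patch: the host wrappers now test the device's compute capability through ccminer's device_sm[] table and launch the uint2-based kernel on SM 3.2 and newer, falling back to the plain uint64_t kernel on SM 3.0. The standalone sketch below illustrates that same pattern outside ccminer; the kernel bodies, the sketch_* and launch_sketch names, and the use of cudaGetDeviceProperties() in place of the cached device_sm[] table are illustrative assumptions, not part of the patch.

#include <cuda_runtime.h>
#include <stdint.h>

// Stand-in for the uint2/funnel-shift kernel used on SM 3.2 and newer.
__global__ void sketch_kernel_v32(uint64_t *data, int threads)
{
	int thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads)
		data[thread] ^= 0x1BD11BDAA9FC1A22ull; // placeholder work, not the real skein256 rounds
}

// Stand-in for the plain 64-bit fallback kernel used on SM 3.0.
__global__ void sketch_kernel_v30(uint64_t *data, int threads)
{
	int thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads)
		data[thread] ^= 0x1BD11BDAA9FC1A22ull;
}

// Host wrapper mirroring the "device_sm[...] >= 320" branch added by the patch,
// but querying device 0 directly instead of using ccminer's cached table.
void launch_sketch(uint64_t *d_data, int threads)
{
	cudaDeviceProp props;
	cudaGetDeviceProperties(&props, 0);
	int sm = props.major * 100 + props.minor * 10; // same scale as device_sm[] (300, 320, 350, ...)

	const int threadsperblock = 256;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	if (sm >= 320)
		sketch_kernel_v32<<<grid, block>>>(d_data, threads);
	else
		sketch_kernel_v30<<<grid, block>>>(d_data, threads);
	cudaDeviceSynchronize();
}

int main()
{
	const int threads = 1 << 20;
	uint64_t *d_data = NULL;
	cudaMalloc((void**)&d_data, threads * sizeof(uint64_t));
	cudaMemset(d_data, 0, threads * sizeof(uint64_t));
	launch_sketch(d_data, threads);
	cudaFree(d_data);
	return 0;
}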