From 940c1b3a2f5c26848d116c029275f82036434bc1 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 16 Jul 2016 12:24:50 +0200
Subject: [PATCH] lbry: small changes for second build

sha512/ripemd swab

this was preview 2
---
 lbry/cuda_ripemd160.cu   | 67 +++++++++-------------------------------
 lbry/cuda_sha256_lbry.cu | 28 +++++++----------
 lbry/cuda_sha512_lbry.cu | 36 ++++++++++-----------
 lbry/lbry.cu             |  2 +-
 4 files changed, 44 insertions(+), 89 deletions(-)

diff --git a/lbry/cuda_ripemd160.cu b/lbry/cuda_ripemd160.cu
index 1ae7f35..bc4406e 100644
--- a/lbry/cuda_ripemd160.cu
+++ b/lbry/cuda_ripemd160.cu
@@ -37,6 +37,18 @@ static __constant__ uint32_t c_IV[5] = {
 	0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u
 };
 
+__device__ __forceinline__
+uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
+	uint32_t result;
+#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
+	asm ("lop3.b32 %0, %1, %2, %3, 0x96; // xor3b"  //0x96 = 0xF0 ^ 0xCC ^ 0xAA
+		: "=r"(result) : "r"(a), "r"(b),"r"(c));
+#else
+	result = a^b^c;
+#endif
+	return result;
+}
+
 //__host__
 //uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) {
 //	return c ^ (a | !b);
@@ -83,7 +95,7 @@ uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
 #define F4(x, y, z)   ((((x) ^ (y)) & (z)) ^ (y))
 #define F5(x, y, z)   ((x) ^ ((y) | ~(z)))
 #else
-#define F1(x, y, z)   xor3(x,y,z)
+#define F1(x, y, z)   xor3b(x,y,z)
 #define F2(x, y, z)   xandx(x,y,z)
 #define F3(x, y, z)   xornot64(x,y,z)
 #define F4(x, y, z)   xandx(z,x,y)
@@ -305,59 +317,7 @@ uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
 	h[0] = tmp; \
 }
 
-#if 0
-__global__
-void lbry_ripemd160_gpu_hash_32(const uint32_t threads, uint64_t *g_hash, const uint32_t byteOffset)
-{
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U + byteOffset/8]);
-
-		uint32_t in[16];
-		for (int i=0; i<8; i++)
-			in[i] = (hash[i]);
-		in[8] = 0x80;
-
-		#pragma unroll
-		for (int i=9;i<16;i++) in[i] = 0;
-
-		in[14] = 0x100; // size in bits
-
-		uint32_t h[5];
-		#pragma unroll
-		for (int i=0; i<5; i++)
-			h[i] = c_IV[i];
-
-		RIPEMD160_ROUND_BODY(in, h);
-
-		#pragma unroll
-		for (int i=0; i<5; i++)
-			hash[i] = h[i];
-
-#ifdef PAD_ZEROS
-		// 20 bytes hash on 32 or 64 bytes output space
-		hash[5] = 0;
-		hash[6] = 0;
-		hash[7] = 0;
-#endif
-	}
-}
-
-__host__
-void lbry_ripemd160_hash_32(int thr_id, uint32_t threads, uint32_t *g_Hash, uint32_t byteOffset, cudaStream_t stream)
-{
-	const uint32_t threadsperblock = 128;
-
-	dim3 grid(threads/threadsperblock);
-	dim3 block(threadsperblock);
-
-	lbry_ripemd160_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash, byteOffset);
-}
-#endif
-
 __global__
-//__launch_bounds__(256,6)
 void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
@@ -366,6 +326,7 @@ void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
 		uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]);
 
 		uint32_t in[16];
+		#pragma unroll
 		for (int i=0; i<8; i++)
 			in[i] = (hash[i]);
 		in[8] = 0x80;
diff --git a/lbry/cuda_sha256_lbry.cu b/lbry/cuda_sha256_lbry.cu
index db0c41c..c75325e 100644
--- a/lbry/cuda_sha256_lbry.cu
+++ b/lbry/cuda_sha256_lbry.cu
@@ -160,20 +160,6 @@ uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
 	return result;
 }
 
-/*
-__device__ __forceinline__
-uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
-	uint32_t result;
-	asm("{  .reg .u32 t1; // xor3b \n\t"
-		"xor.b32 t1, %2, %3;\n\t"
-		"xor.b32 %0, %1, t1;"
-		"}"
-		: "=r"(result) : "r"(a) ,"r"(b),"r"(c));
-	return result;
-}
-#define xor3b(a,b,c) (a ^ b ^ c)
-*/
-
 __device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
 {
 	uint32_t r1 = ROTR32(x,2);
@@ -220,6 +206,13 @@ __device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b,
 	return result;
 }
 
+__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) {
+	uint2 result;
+	asm("mov.b64 {%0,%1},%2; \n\t"
+		: "=r"(result.y), "=r"(result.x) : "l"(v));
+	return result;
+}
+
 __device__
 static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
 	uint32_t in, const uint32_t Kshared)
@@ -393,7 +386,8 @@ void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512)
 		uint2* output = (uint2*) input;
 		#pragma unroll
 		for (int i=0;i<4;i++) {
-			output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
+			//output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
+			output[i] = vectorizeswap(((uint64_t*)buf)[i]);
 		}
 #ifdef PAD_ZEROS
 		#pragma unroll
@@ -447,8 +441,8 @@ void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce
 		uint2* output = (uint2*) (&outputHash[thread * 8U]);
 		#pragma unroll
 		for (int i=0;i<4;i++) {
-			output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
-			//output[i] = vectorize(((uint64_t*)buf)[i]);
+		//	//output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
+			output[i] = vectorizeswap(((uint64_t*)buf)[i]);
 		}
 	}
 }
diff --git a/lbry/cuda_sha512_lbry.cu b/lbry/cuda_sha512_lbry.cu
index 39f549b..79c0906 100644
--- a/lbry/cuda_sha512_lbry.cu
+++ b/lbry/cuda_sha512_lbry.cu
@@ -6,6 +6,7 @@
 #include <stdint.h>
 #include <memory.h>
 
+//#define USE_ROT_ASM_OPT 0
 #include <cuda_helper.h>
 
 static __constant__ uint64_t K_512[80];
@@ -36,8 +37,6 @@ static const uint64_t K512[80] = {
 //#undef xor3
 //#define xor3(a,b,c) (a^b^c)
 
-//#undef 
-
 static __device__ __forceinline__
 uint64_t bsg5_0(const uint64_t x)
 {
@@ -111,26 +110,17 @@ __global__
 void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
+	//if (thread < threads)
 	{
 		uint64_t *pHash = &g_hash[thread * 8U];
 
 		uint64_t W[80];
-		uint64_t r[8];
-
-		uint64_t IV512[8] = {
-		        0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
-		        0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
-		};
-
-		#pragma unroll
-		for (int i = 0; i < 8; i++)
-			r[i] = IV512[i];
 
 		#pragma unroll
 		for (int i = 0; i < 4; i++) {
 			// 32 bytes input
-			W[i] = cuda_swab64(pHash[i]);
+			W[i] = pHash[i];
+			//W[i] = cuda_swab64(pHash[i]); // made in sha256
 		}
 
 		W[4] = 0x8000000000000000; // end tag
@@ -140,13 +130,23 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 
 		W[15] = 0x100; // 256 bits
 
-		#pragma unroll
-		for (int i = 16; i < 80; i++) W[i] = 0;
+		//#pragma unroll
+		//for (int i = 16; i < 78; i++) W[i] = 0;
 
-		#pragma unroll 64
+		#pragma unroll
 		for (int i = 16; i < 80; i++)
 			W[i] = ssg5_1(W[i - 2]) + W[i - 7] + ssg5_0(W[i - 15]) + W[i - 16];
 
+		const uint64_t IV512[8] = {
+			0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+			0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
+		};
+
+		uint64_t r[8];
+		#pragma unroll
+		for (int i = 0; i < 8; i++)
+			r[i] = IV512[i];
+
 		#pragma unroll 10
 		for (int i = 0; i < 10; i++) {
 			#pragma unroll 8
@@ -168,7 +168,7 @@ void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStr
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	size_t shared_size = 80*8;
+	size_t shared_size = 0;
 	lbry_sha512_gpu_hash_32 <<<grid, block, shared_size, stream>>> (threads, (uint64_t*)d_hash);
 }
 
diff --git a/lbry/lbry.cu b/lbry/lbry.cu
index f5da268..b81cc79 100644
--- a/lbry/lbry.cu
+++ b/lbry/lbry.cu
@@ -106,7 +106,7 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark) {
-		ptarget[7] = 0xff;
+		ptarget[7] = 0xf;
 	}
 
 	if (!init[thr_id]){