From d7abd01bf4c27e8b3ff0338fdb8cf970680ea717 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Sun, 17 May 2015 23:52:29 +0200
Subject: [PATCH] scrypt(titan): small kernel code optimisations

---
 scrypt/titan_kernel.cu | 130 +++++++++++++++++++++--------------------
 1 file changed, 67 insertions(+), 63 deletions(-)

diff --git a/scrypt/titan_kernel.cu b/scrypt/titan_kernel.cu
index a172958..13b047c 100644
--- a/scrypt/titan_kernel.cu
+++ b/scrypt/titan_kernel.cu
@@ -90,7 +90,7 @@ void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)
 {
 	uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
 	if (SCHEME == ANDERSEN) {
-		int target_thread = (threadIdx.x + 4)%32;
+		int target_thread = (threadIdx.x + 4)&31;
 		uint4 t=b, t2=__shfl(bx, target_thread);
 		int t2_start = __shfl((int)start, target_thread) + 4;
 		bool c = (threadIdx.x & 0x4);
@@ -107,12 +107,12 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
 	uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
 
 	if (SCHEME == ANDERSEN) {
-		int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4;
+		int t2_start = __shfl((int)start, (threadIdx.x + 4)&31) + 4;
 		bool c = (threadIdx.x & 0x4);
 		b = __ldg((uint4 *)(&scratch[c ? t2_start : start]));
 		bx = __ldg((uint4 *)(&scratch[c ? start : t2_start]));
 		uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx);
-		bx = __shfl(bx, (threadIdx.x + 28)%32);
+		bx = __shfl(bx, (threadIdx.x + 28)&31);
 	} else {
 		b = *((uint4 *)(&scratch[start]));
 		bx = *((uint4 *)(&scratch[start+16]));
@@ -122,9 +122,9 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
 __device__ __forceinline__
 void primary_order_shuffle(uint32_t b[4], uint32_t bx[4]) {
 	/* Inner loop shuffle targets */
-	int x1 = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+1)&0x3);
-	int x2 = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+2)&0x3);
-	int x3 = (threadIdx.x & 0xfc) + (((threadIdx.x & 0x03)+3)&0x3);
+	int x1 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+1)&3);
+	int x2 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+2)&3);
+	int x3 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+3)&3);
 
 	b[3] = __shfl((int)b[3], x1);
 	b[2] = __shfl((int)b[2], x2);
@@ -140,9 +140,9 @@ void primary_order_shuffle(uint32_t b[4], uint32_t bx[4]) {
 __device__ __forceinline__
 void primary_order_shuffle(uint4 &b, uint4 &bx) {
 	/* Inner loop shuffle targets */
-	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+1)&0x3);
-	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
-	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
+	int x1 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+1)&3);
+	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+2)&3);
+	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+3)&3);
 
 	b.w = __shfl((int)b.w, x1);
 	b.z = __shfl((int)b.z, x2);
@@ -161,23 +161,24 @@ void primary_order_shuffle(uint4 &b, uint4 &bx) {
  * After loading, each thread has its four b and four bx keys stored
  * in internal processing order.
  */
-
 __device__ __forceinline__
 void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx)
 {
-	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
-	int key_offset = scrypt_block * 32;
-	uint32_t thread_in_block = threadIdx.x % 4;
+	uint32_t scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
+	uint32_t thread_in_block = threadIdx.x & 3U;
+	uint32_t key_offset = scrypt_block * 32 + (thread_in_block*4);
 
 	// Read in permuted order. Key loads are not our bottleneck right now.
-	b.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4];
-	b.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4];
-	b.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4];
-	b.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4];
-	bx.x = B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16];
-	bx.y = B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16];
-	bx.z = B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16];
-	bx.w = B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16];
+	b.x = B[key_offset + ((thread_in_block+0) & 3U)];
+	b.y = B[key_offset + ((thread_in_block+1) & 3U)];
+	b.z = B[key_offset + ((thread_in_block+2) & 3U)];
+	b.w = B[key_offset + ((thread_in_block+3) & 3U)];
+
+	key_offset += 16;
+	bx.x = B[key_offset + ((thread_in_block+0) & 3U)];
+	bx.y = B[key_offset + ((thread_in_block+1) & 3U)];
+	bx.z = B[key_offset + ((thread_in_block+2) & 3U)];
+	bx.w = B[key_offset + ((thread_in_block+3) & 3U)];
 
 	primary_order_shuffle(b, bx);
 }
@@ -187,24 +188,25 @@ void load_key_salsa(const uint32_t *B, uint4 &b, uint4 &bx)
  * internally-ordered b and bx and storing them into a contiguous
  * region of B in external order.
  */
-
 __device__ __forceinline__
 void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx)
 {
-	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
-	int key_offset = scrypt_block * 32;
-	uint32_t thread_in_block = threadIdx.x % 4;
+	uint32_t scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
+	uint32_t thread_in_block = threadIdx.x & 3U;
+	uint32_t key_offset = scrypt_block * 32 + (thread_in_block*4);
 
 	primary_order_shuffle(b, bx);
 
-	B[key_offset + 4*thread_in_block + (thread_in_block+0)%4] = b.x;
-	B[key_offset + 4*thread_in_block + (thread_in_block+1)%4] = b.y;
-	B[key_offset + 4*thread_in_block + (thread_in_block+2)%4] = b.z;
-	B[key_offset + 4*thread_in_block + (thread_in_block+3)%4] = b.w;
-	B[key_offset + 4*thread_in_block + (thread_in_block+0)%4 + 16] = bx.x;
-	B[key_offset + 4*thread_in_block + (thread_in_block+1)%4 + 16] = bx.y;
-	B[key_offset + 4*thread_in_block + (thread_in_block+2)%4 + 16] = bx.z;
-	B[key_offset + 4*thread_in_block + (thread_in_block+3)%4 + 16] = bx.w;
+	B[key_offset + ((thread_in_block+0) & 3U)] = b.x;
+	B[key_offset + ((thread_in_block+1) & 3U)] = b.y;
+	B[key_offset + ((thread_in_block+2) & 3U)] = b.z;
+	B[key_offset + ((thread_in_block+3) & 3U)] = b.w;
+
+	key_offset += 16;
+	B[key_offset + ((thread_in_block+0) & 3U)] = bx.x;
+	B[key_offset + ((thread_in_block+1) & 3U)] = bx.y;
+	B[key_offset + ((thread_in_block+2) & 3U)] = bx.z;
+	B[key_offset + ((thread_in_block+3) & 3U)] = bx.w;
 }
 
 
@@ -214,23 +216,24 @@ void store_key_salsa(uint32_t *B, uint4 &b, uint4 &bx)
  * After loading, each thread has its four b and four bx keys stored
  * in internal processing order.
  */
-
 __device__ __forceinline__
 void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx)
 {
-	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
-	int key_offset = scrypt_block * 32;
-	uint32_t thread_in_block = threadIdx.x % 4;
+	uint32_t scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
+	uint32_t thread_in_block = threadIdx.x & 3U;
+	uint32_t key_offset = scrypt_block * 32 + thread_in_block;
 
 	// Read in permuted order. Key loads are not our bottleneck right now.
-	b.x = B[key_offset + 4*0 + thread_in_block%4];
-	b.y = B[key_offset + 4*1 + thread_in_block%4];
-	b.z = B[key_offset + 4*2 + thread_in_block%4];
-	b.w = B[key_offset + 4*3 + thread_in_block%4];
-	bx.x = B[key_offset + 4*0 + thread_in_block%4 + 16];
-	bx.y = B[key_offset + 4*1 + thread_in_block%4 + 16];
-	bx.z = B[key_offset + 4*2 + thread_in_block%4 + 16];
-	bx.w = B[key_offset + 4*3 + thread_in_block%4 + 16];
+	b.x = B[key_offset     ];
+	b.y = B[key_offset + 4*1];
+	b.z = B[key_offset + 4*2];
+	b.w = B[key_offset + 4*3];
+
+	key_offset += 16;
+	bx.x = B[key_offset     ];
+	bx.y = B[key_offset + 4  ];
+	bx.z = B[key_offset + 4*2];
+	bx.w = B[key_offset + 4*3];
 }
 
 /*
@@ -238,22 +241,23 @@ void load_key_chacha(const uint32_t *B, uint4 &b, uint4 &bx)
  * internally-ordered b and bx and storing them into a contiguous
  * region of B in external order.
  */
-
 __device__ __forceinline__
 void store_key_chacha(uint32_t *B, const uint4 &b, const uint4 &bx)
 {
-	int scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
-	int key_offset = scrypt_block * 32;
-	uint32_t thread_in_block = threadIdx.x % 4;
-
-	B[key_offset + 4*0 + thread_in_block%4] = b.x;
-	B[key_offset + 4*1 + thread_in_block%4] = b.y;
-	B[key_offset + 4*2 + thread_in_block%4] = b.z;
-	B[key_offset + 4*3 + thread_in_block%4] = b.w;
-	B[key_offset + 4*0 + thread_in_block%4 + 16] = bx.x;
-	B[key_offset + 4*1 + thread_in_block%4 + 16] = bx.y;
-	B[key_offset + 4*2 + thread_in_block%4 + 16] = bx.z;
-	B[key_offset + 4*3 + thread_in_block%4 + 16] = bx.w;
+	uint32_t scrypt_block = (blockIdx.x*blockDim.x + threadIdx.x)/THREADS_PER_WU;
+	uint32_t thread_in_block = threadIdx.x & 3U;
+	uint32_t key_offset = scrypt_block * 32U + thread_in_block;
+
+	B[key_offset     ] = b.x;
+	B[key_offset + 4  ] = b.y;
+	B[key_offset + 4*2] = b.z;
+	B[key_offset + 4*3] = b.w;
+
+	key_offset += 16;
+	B[key_offset     ] = bx.x;
+	B[key_offset + 4  ] = bx.y;
+	B[key_offset + 4*2] = bx.z;
+	B[key_offset + 4*3] = bx.w;
 }
 
 
@@ -399,17 +403,17 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
 __device__ __forceinline__
 void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3)
 {
-	uint4 x;
+	uint4 x = b ^= bx;
 
-	b ^= bx;
-	x = b;
+	//b ^= bx;
+	//x = b;
 
 	// Enter in "column" mode (t0 has 0, 4, 8, 12)
 	//                        (t1 has 1, 5, 9, 13)
 	//                        (t2 has 2, 6, 10, 14)
 	//                        (t3 has 3, 7, 11, 15)
 
-#pragma unroll 4
+	//#pragma unroll
 	for (int j = 0; j < 4; j++) {
 
 		// Column Mixing phase of chacha
@@ -438,7 +442,7 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 	bx ^= b;
 	x = bx;
 
-	#pragma unroll
+	//#pragma unroll
 	for (int j = 0; j < 4; j++) {
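
Side note (illustrative, not part of the patch): the index rewrites above lean on two C/CUDA facts. First, x % N equals x & (N-1) only when N is a power of two and x is unsigned or known non-negative, which holds here because threadIdx.x and the key indices are unsigned. Second, '+' binds tighter than '&', so the permuted index has to be written key_offset + ((thread_in_block + i) & 3U); without the inner parentheses the whole sum is masked. The short standalone host program below checks both points; its variable names are made up for the example and only the arithmetic mirrors the diff.

	// Illustrative sketch, not part of the commit: host-side check of the
	// identities the kernel changes rely on.
	#include <cassert>
	#include <cstdint>

	int main()
	{
		for (uint32_t tid = 0; tid < 256; tid++) {
			// Power-of-two modulus as a bit mask (valid for unsigned operands):
			assert(((tid + 4) % 32) == ((tid + 4) & 31));   // __shfl lane wrap
			assert(((tid + 28) % 32) == ((tid + 28) & 31));
			assert((tid % 4) == (tid & 3));                 // thread_in_block
		}

		// Operator precedence: '+' binds tighter than '&', so the inner
		// parentheses around the masked term are required.
		uint32_t key_offset = 32, thread_in_block = 2;
		uint32_t unparenthesized = key_offset + (thread_in_block + 1) & 3U;   // (32+2+1) & 3 == 3
		uint32_t intended        = key_offset + ((thread_in_block + 1) & 3U); // 32 + 3 == 35
		assert(unparenthesized == 3 && intended == 35);
		return 0;
	}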