From 5be6811dcffbafce70a28e162b6f4609e2e23f41 Mon Sep 17 00:00:00 2001
From: sp-hash <sp-hash@users.noreply.github.com>
Date: Thu, 6 Nov 2014 11:39:53 +0100
Subject: [PATCH] x11: echo and cubehash optimization

echo : 40.056ms -> 39.241ms
cube : 14.490ms -> 13.511ms

cube hash change look like useless (__device__ code in generally inlined)
but the reality proves that cuda documentation is wrong...

tpruvot: fixed dos lines ending in echo,
and used my style for cuda function attributes
---
 x11/cuda_x11_aes.cu         | 104 ++++++++++++------------------------
 x11/cuda_x11_cubehash512.cu |  24 +++++----
 x11/cuda_x11_echo.cu        |  47 ++++++++--------
 3 files changed, 71 insertions(+), 104 deletions(-)

diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu
index fca1b05..1d1ae07 100644
--- a/x11/cuda_x11_aes.cu
+++ b/x11/cuda_x11_aes.cu
@@ -319,49 +319,32 @@ static void aes_round(
 	uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0,
 	uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
 {
-	uint32_t idx0, idx1, idx2, idx3;
 
-	idx0 = __byte_perm(x0, 0, 0x4440);
-	idx1 = __byte_perm(x1, 0, 0x4441) + 256;
-	idx2 = __byte_perm(x2, 0, 0x4442) + 512;
-	idx3 = __byte_perm(x3, 0, 0x4443) + 768;
 	y0 = xor4_32(
-		sharedMemory[idx0],
-		sharedMemory[idx1],
-		sharedMemory[idx2],
-		sharedMemory[idx3]);
-	y0 ^= k0;
-
-	idx0 = __byte_perm(x1, 0, 0x4440);
-	idx1 = __byte_perm(x2, 0, 0x4441) + 256;
-	idx2 = __byte_perm(x3, 0, 0x4442) + 512;
-	idx3 = __byte_perm(x0, 0, 0x4443) + 768;
+		sharedMemory[__byte_perm(x0, 0, 0x4440)],
+		sharedMemory[__byte_perm(x1, 0, 0x4441) + 256],
+		sharedMemory[__byte_perm(x2, 0, 0x4442) + 512],
+		sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]);
 
 	y1 = xor4_32(
-		sharedMemory[idx0],
-		sharedMemory[idx1],
-		sharedMemory[idx2],
-		sharedMemory[idx3]);
+		sharedMemory[__byte_perm(x1, 0, 0x4440)],
+		sharedMemory[__byte_perm(x2, 0, 0x4441) + 256],
+		sharedMemory[__byte_perm(x3, 0, 0x4442) + 512],
+		sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]);
 
-	idx0 = __byte_perm(x2, 0, 0x4440);
-	idx1 = __byte_perm(x3, 0, 0x4441) + 256;
-	idx2 = __byte_perm(x0, 0, 0x4442) + 512;
-	idx3 = __byte_perm(x1, 0, 0x4443) + 768;
 	y2 = xor4_32(
-		sharedMemory[idx0],
-		sharedMemory[idx1],
-		sharedMemory[idx2],
-		sharedMemory[idx3]); // ^k2
+		sharedMemory[__byte_perm(x2, 0, 0x4440)],
+		sharedMemory[__byte_perm(x3, 0, 0x4441) + 256],
+		sharedMemory[__byte_perm(x0, 0, 0x4442) + 512],
+		sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2
+
+	y0 ^= k0;
 
-	idx0 = __byte_perm(x3, 0, 0x4440);
-	idx1 = __byte_perm(x0, 0, 0x4441) + 256;
-	idx2 = __byte_perm(x1, 0, 0x4442) + 512;
-	idx3 = __byte_perm(x2, 0, 0x4443) + 768;
 	y3 = xor4_32(
-		sharedMemory[idx0],
-		sharedMemory[idx1],
-		sharedMemory[idx2],
-		sharedMemory[idx3]); // ^k3
+		sharedMemory[__byte_perm(x3, 0, 0x4440)],
+		sharedMemory[__byte_perm(x0, 0, 0x4441) + 256],
+		sharedMemory[__byte_perm(x1, 0, 0x4442) + 512],
+		sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3
 }
 
 __device__
@@ -370,46 +353,27 @@ static void aes_round(
 	uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, 
 	uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
 {
-	uint32_t idx0, idx1, idx2, idx3;
-
-	idx0 = __byte_perm(x0, 0, 0x4440);
-	idx1 = __byte_perm(x1, 0, 0x4441) + 256;
-	idx2 = __byte_perm(x2, 0, 0x4442) + 512;
-	idx3 = __byte_perm(x3, 0, 0x4443) + 768;
 	y0 = xor4_32(
-		sharedMemory[idx0],
-		sharedMemory[idx1],
-		sharedMemory[idx2],
-		sharedMemory[idx3]);
-
-	idx0 = __byte_perm(x1, 0, 0x4440);
-	idx1 = __byte_perm(x2, 0, 0x4441) + 256;
-	idx2 = __byte_perm(x3, 0, 0x4442) + 512;
-	idx3 = __byte_perm(x0, 0, 0x4443) + 768;
+		sharedMemory[__byte_perm(x0, 0, 0x4440)],
+		sharedMemory[__byte_perm(x1, 0, 0x4441) + 256],
+		sharedMemory[__byte_perm(x2, 0, 0x4442) + 512],
+		sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]);
 
 	y1 = xor4_32(
-		sharedMemory[idx0],
-		sharedMemory[idx1],
-		sharedMemory[idx2],
-		sharedMemory[idx3]);
+		sharedMemory[__byte_perm(x1, 0, 0x4440)],
+		sharedMemory[__byte_perm(x2, 0, 0x4441) + 256],
+		sharedMemory[__byte_perm(x3, 0, 0x4442) + 512],
+		sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]);
 
-	idx0 = __byte_perm(x2, 0, 0x4440);
-	idx1 = __byte_perm(x3, 0, 0x4441) + 256;
-	idx2 = __byte_perm(x0, 0, 0x4442) + 512;
-	idx3 = __byte_perm(x1, 0, 0x4443) + 768;
 	y2 = xor4_32(
-		sharedMemory[idx0],
-		sharedMemory[idx1],
-		sharedMemory[idx2],
-		sharedMemory[idx3]); // ^k2
+		sharedMemory[__byte_perm(x2, 0, 0x4440)],
+		sharedMemory[__byte_perm(x3, 0, 0x4441) + 256],
+		sharedMemory[__byte_perm(x0, 0, 0x4442) + 512],
+		sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2
 
-	idx0 = __byte_perm(x3, 0, 0x4440);
-	idx1 = __byte_perm(x0, 0, 0x4441) + 256;
-	idx2 = __byte_perm(x1, 0, 0x4442) + 512;
-	idx3 = __byte_perm(x2, 0, 0x4443) + 768;
 	y3 = xor4_32(
-		sharedMemory[idx0],
-		sharedMemory[idx1],
-		sharedMemory[idx2],
-		sharedMemory[idx3]); // ^k3
+		sharedMemory[__byte_perm(x3, 0, 0x4440)],
+		sharedMemory[__byte_perm(x0, 0, 0x4441) + 256],
+		sharedMemory[__byte_perm(x1, 0, 0x4442) + 512],
+		sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3
 }
diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu
index 5f77b0b..cd2ccef 100644
--- a/x11/cuda_x11_cubehash512.cu
+++ b/x11/cuda_x11_cubehash512.cu
@@ -34,7 +34,8 @@ static const uint32_t c_IV_512[32] = {
 	0x7795D246, 0xD43E3B44
 };
 
-static __device__ void rrounds(uint32_t x[2][2][2][2][2])
+__device__ __forceinline__
+static void rrounds(uint32_t x[2][2][2][2][2])
 {
     int r;
     int j;
@@ -150,8 +151,8 @@ static __device__ void rrounds(uint32_t x[2][2][2][2][2])
     }
 }
 
-
-static __device__ void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
+__device__ __forceinline__
+static void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
 {
     int k;
     int l;
@@ -167,7 +168,8 @@ static __device__ void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
                 x[0][0][k][l][m] ^= *in++;
 }
 
-static __device__ void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
+__device__ __forceinline__
+static void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
 {
     int j;
     int k;
@@ -186,7 +188,8 @@ static __device__ void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
                     *out++ = x[0][j][k][l][m];
 }
 
-void __device__ Init(uint32_t x[2][2][2][2][2])
+__device__
+void Init(uint32_t x[2][2][2][2][2])
 {
     int i,j,k,l,m;
 #if 0
@@ -227,7 +230,8 @@ void __device__ Init(uint32_t x[2][2][2][2][2])
 #endif
 }
 
-void __device__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
+__device__ __forceinline__
+void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
 {
     /* "xor the block into the first b bytes of the state" */
     /* "and then transform the state invertibly through r identical rounds" */
@@ -235,7 +239,8 @@ void __device__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
     rrounds(x);
 }
 
-void __device__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
+__device__ __forceinline__
+void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
 {
     int i;
 
@@ -252,8 +257,9 @@ void __device__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
 
 
 /***************************************************/
-// Die Hash-Funktion
-__global__ void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+// GPU Hash Function
+__global__
+void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
     int thread = (blockDim.x * blockIdx.x + threadIdx.x);
     if (thread < threads)
diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu
index 5f20754..523316a 100644
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@@ -75,41 +75,38 @@ __device__ __forceinline__ void cuda_echo_round(
 	}
 
 	// Mix Columns
-#pragma unroll 4
-	for(int i=0;i<4;i++) // Schleife über je 2*uint32_t
+	#pragma unroll
+	for (int i = 0; i<4; i++) // Schleife über je 2*uint32_t
 	{
-#pragma unroll 4
-		for(int j=0;j<4;j++) // Schleife über die elemnte
+		#pragma unroll 64
+		for (int idx = 0; idx<64; idx += 16) // Schleife über die elemnte
 		{
-			int idx = j<<2; // j*4
 
-			uint32_t a = W[ ((idx + 0)<<2) + i];
-			uint32_t b = W[ ((idx + 1)<<2) + i];
-			uint32_t c = W[ ((idx + 2)<<2) + i];
-			uint32_t d = W[ ((idx + 3)<<2) + i];
+			uint32_t a = W[idx + i];
+			uint32_t b = W[idx + i + 4];
+			uint32_t c = W[idx + i + 8];
+			uint32_t d = W[idx + i + 12];
 
 			uint32_t ab = a ^ b;
 			uint32_t bc = b ^ c;
 			uint32_t cd = c ^ d;
 
-			uint32_t t;
-			t = ((ab & 0x80808080) >> 7);
-			uint32_t abx = t<<4 ^ t<<3 ^ t<<1 ^ t;
-			t = ((bc & 0x80808080) >> 7);
-			uint32_t bcx = t<<4 ^ t<<3 ^ t<<1 ^ t;
-			t = ((cd & 0x80808080) >> 7);
-			uint32_t cdx = t<<4 ^ t<<3 ^ t<<1 ^ t;
-
-			abx ^= ((ab & 0x7F7F7F7F) << 1);
-			bcx ^= ((bc & 0x7F7F7F7F) << 1);
-			cdx ^= ((cd & 0x7F7F7F7F) << 1);
-
-			W[ ((idx + 0)<<2) + i] = abx ^ bc ^ d;
-			W[ ((idx + 1)<<2) + i] = bcx ^ a ^ cd;
-			W[ ((idx + 2)<<2) + i] = cdx ^ ab ^ d;
-			W[ ((idx + 3)<<2) + i] = abx ^ bcx ^ cdx ^ ab ^ c;
+			uint32_t t, t2, t3;
+			t = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1);
+			uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1);
+			uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1);
+
+			W[idx + i] = abx ^ bc ^ d;
+			W[idx + i + 4] = bcx ^ a ^ cd;
+			W[idx + i + 8] = cdx ^ ab ^ d;
+			W[idx + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
 		}
 	}
+
 }
 
 __global__ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)