From fdd5d290716108904a42e15bf6449e55ea7ad42b Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Sun, 16 Nov 2014 16:40:23 +0100
Subject: [PATCH] x11: shavite and echo from sp (now ok on win32)

The previous echo commit only improved Linux performance and reduced
Windows performance compared to 1.4.9; this one seems to give at least
1.4.9 speed on Windows and the same as before on Linux.

The shavite optimisation seems fine on both (it now uses 64 registers).

The __launch_bounds__ qualifier forces the register count, so the
specific Makefile rules on Linux are removed.

Manual "cherry pick" with fixed line endings and some adaptations.
---
 Makefile.am                |  7 -----
 x11/cuda_x11_aes.cu        |  2 +-
 x11/cuda_x11_echo.cu       | 56 ++++++++++++++++++++++++++++----------
 x11/cuda_x11_shavite512.cu | 36 ++++++++++++++++++------
 4 files changed, 69 insertions(+), 32 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index f2af2d1..79f521d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -89,13 +89,6 @@ qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu
 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
-x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
-	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
-
-# Shavite compiles faster with 128 regs
-x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
-	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=128 -o $@ -c $<
-
 x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu
index 1d1ae07..51f407c 100644
--- a/x11/cuda_x11_aes.cu
+++ b/x11/cuda_x11_aes.cu
@@ -311,7 +311,7 @@ void aes_gpu_init(uint32_t *sharedMemory)
 }
 
 /* tried with 3 xor.b32 asm, not faster */
-#define xor4_32(a,b,c,d) (a ^ b ^ c ^ d);
+#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d));
 
 __device__
 static void aes_round(
diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu
index 29505da..5fc2a0f 100644
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@@ -28,7 +28,11 @@ __device__ __forceinline__ void AES_2ROUND(
 	k0++;
 }
 
-__constant__ uint32_t P[48] = {
+__device__ __forceinline__
+void cuda_echo_round(
+	const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash)
+{
+	const uint32_t P[48] = {
 	0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	//8-12
@@ -45,14 +49,11 @@ __constant__ uint32_t P[48] = {
 	0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
 	//58-61
-};
-
-__device__ __forceinline__
-void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash)
-{
+	};
 	uint32_t k0;
 	uint32_t h[16];
-	#pragma unroll
+
+	#pragma unroll 16
 	for (int i = 0; i < 16; i++)
 	{
 		h[i] = hash[i];
@@ -170,13 +171,21 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 
 		{
 			// Big Sub Words
-			#pragma unroll 16
-			for (int i = 0; i < 16; i++)
+			#pragma unroll 4
+			for (int idx = 0; idx < 64; idx += 16)
 			{
-				int idx = i << 2; // *4
 				AES_2ROUND(sharedMemory,
 					W[idx + 0], W[idx + 1], W[idx + 2], W[idx + 3],
 					k0);
+				AES_2ROUND(sharedMemory,
+					W[idx + 4], W[idx + 5], W[idx + 6], W[idx + 7],
+					k0);
+				AES_2ROUND(sharedMemory,
+					W[idx + 8], W[idx + 9], W[idx + 10], W[idx + 11],
+					k0);
+				AES_2ROUND(sharedMemory,
+					W[idx + 12], W[idx + 13], W[idx + 14], W[idx + 15],
+					k0);
 			}
 
 			// Shift Rows
@@ -241,8 +250,8 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 		}
 	}
 
-	#pragma unroll 8
-	for (int i = 0; i<32; i += 4)
+	#pragma unroll
+	for (int i = 0; i<16; i += 4)
 	{
 		W[i] ^= W[32 + i] ^ 512;
 		W[i + 1] ^= W[32 + i + 1];
@@ -255,12 +264,29 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 		hash[i] ^= W[i];
 }
 
-__global__ /* __launch_bounds__(320, 3) will force 64 registers on the 750 Ti */
+__device__ __forceinline__
+void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
+{
+	/* each thread startup will fill a uint32 */
+	if (threadIdx.x < 128) {
+		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
+		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
+		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
+		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
+
+		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
+	}
+}
+
+__global__ __launch_bounds__(128, 7) /* will force 72 registers */
 void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	aes_gpu_init(sharedMemory);
+	echo_gpu_init(sharedMemory);
 
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -283,7 +309,7 @@ void x11_echo512_cpu_init(int thr_id, int threads)
 __host__
 void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-	const int threadsperblock = 256;
+	const int threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu
index f87562a..dfe6a1e 100644
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@@ -1,9 +1,8 @@
 #include "cuda_helper.h"
-#define TPB 256
+#include 
 
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+#define TPB 128
 
 __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
@@ -1294,12 +1293,30 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u
 	state[0xF] ^= p7;
 }
 
+__device__ __forceinline__
+void shavite_gpu_init(uint32_t *sharedMemory)
+{
+	/* each thread startup will fill a uint32 */
+	if (threadIdx.x < 128) {
+		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
+		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
+		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
+		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
+
+		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
+	}
+}
+
 // GPU Hash
-__global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+__global__ __launch_bounds__(TPB, 8) /* 64 registers if TPB 128 (fast), 80 with 92 (medium), 32 if 256 (slow) */
+void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	aes_gpu_init(sharedMemory);
+	shavite_gpu_init(sharedMemory);
 
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -1344,11 +1361,12 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 	}
 }
 
-__global__ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
+__global__ __launch_bounds__(TPB, 8)
+void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
-	aes_gpu_init(sharedMemory);
+	shavite_gpu_init(sharedMemory);
 
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -1397,9 +1415,9 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	size_t shared_size = 0;
+	cudaFuncSetCacheConfig(x11_shavite512_gpu_hash_64, cudaFuncCachePreferL1);
 
-	x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	x11_shavite512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
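
A note on the __launch_bounds__ qualifier that replaces the removed --maxrregcount Makefile rules: __launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor) tells ptxas to cap per-thread register usage so that the requested number of blocks can stay resident on one SM, which is why the per-file register limits are no longer needed. The standalone sketch below (not part of the patch; the kernel and buffer names are invented) shows the idiom; actual register usage can be checked with nvcc --ptxas-options=-v.

#include <cstdint>

/* Standalone illustration of the launch-bounds idiom used in the patch:
 * 128 threads per block, at least 7 resident blocks per SM, which caps
 * the per-thread register count instead of a global --maxrregcount. */
__global__ __launch_bounds__(128, 7)
void demo_hash_kernel(int threads, uint32_t *g_hash)
{
	int thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads)
		g_hash[thread] ^= 0x9e3779b9u;   /* dummy work */
}

int main()
{
	const int threads = 1 << 20;
	uint32_t *d_hash = NULL;
	cudaMalloc(&d_hash, threads * sizeof(uint32_t));

	const int threadsperblock = 128;   /* must match the first bound above */
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	demo_hash_kernel<<<grid, block>>>(threads, d_hash);
	cudaDeviceSynchronize();
	cudaFree(d_hash);
	return 0;
}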
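
The new echo_gpu_init()/shavite_gpu_init() helpers replace aes_gpu_init() with a copy routine that works with the smaller 128-thread blocks: the first 128 threads of each block load the four 256-entry AES tables (1024 uint32_t, 4 KB) into shared memory, each thread writing two entries per table. Below is a generic sketch of the same pattern with hypothetical table names (d_T0..d_T3 stand in for d_AES0..d_AES3); in the generic case a __syncthreads() is needed before the block reads the tables.

#include <cstdint>

__device__ uint32_t d_T0[256], d_T1[256], d_T2[256], d_T3[256]; /* stand-ins for d_AES0..d_AES3 */

__device__ __forceinline__
void load_lut_shared(uint32_t *sharedMemory)   /* expects __shared__ uint32_t sharedMemory[1024] */
{
	if (threadIdx.x < 128) {
		/* entries 0..127 of each of the four tables */
		sharedMemory[threadIdx.x +   0] = d_T0[threadIdx.x];
		sharedMemory[threadIdx.x + 256] = d_T1[threadIdx.x];
		sharedMemory[threadIdx.x + 512] = d_T2[threadIdx.x];
		sharedMemory[threadIdx.x + 768] = d_T3[threadIdx.x];

		/* entries 128..255, same 128-entry offset the patch writes as 64 * 2 */
		sharedMemory[threadIdx.x + 128 +   0] = d_T0[threadIdx.x + 128];
		sharedMemory[threadIdx.x + 128 + 256] = d_T1[threadIdx.x + 128];
		sharedMemory[threadIdx.x + 128 + 512] = d_T2[threadIdx.x + 128];
		sharedMemory[threadIdx.x + 128 + 768] = d_T3[threadIdx.x + 128];
	}
}

__global__ void demo_lut_kernel(void)
{
	__shared__ uint32_t sharedMemory[1024];
	load_lut_shared(sharedMemory);
	__syncthreads();                 /* make the tables visible to every thread */
	/* ... table lookups would go here ... */
}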