x11: shavite and echo from sp (now ok on win32)

The previous echo commit only increased Linux performance and reduced Windows performance compared to 1.4.9. This one seems to give at least 1.4.9-level performance on Windows, and the same on Linux. The Shavite optimisation seems OK on both (it now uses 64 registers). The launch_bounds will force the number of registers, so the specific Makefile rules on Linux are removed. Manual "cherry pick" with fixed line endings and some adaptations.
10 years ago · fdd5d29071
4 changed files with 69 additions and 32 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -89,13 +89,6 @@ qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu
 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 # Shavite compiles faster with 128 regs
 x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
 	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=128 -o $@ -c $<
 x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
--- a/x11/cuda_x11_aes.cu
+++ b/x11/cuda_x11_aes.cu
@ -311,7 +311,7 @@ void aes_gpu_init(uint32_t *sharedMemory)
 }
 /* tried with 3 xor.b32 asm, not faster */
-#define xor4_32(a,b,c,d) (a ^ b ^ c ^ d);
+#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d));
 __device__
 static void aes_round(
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@ -28,7 +28,11 @@ __device__ __forceinline__ void AES_2ROUND(
 	k0++;
 }
-__constant__ uint32_t P[48] = {
+__device__ __forceinline__
 void cuda_echo_round(
 	const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__  hash)
 {
 	const uint32_t P[48] = {
 	0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 	//8-12
@ -46,13 +50,10 @@ __constant__ uint32_t P[48] = {
 	0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
 	//58-61
 	};
 __device__ __forceinline__
 void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash)
 {
 	uint32_t k0;
 	uint32_t h[16];
-	#pragma unroll
+
 	#pragma unroll 16
 	for (int i = 0; i < 16; i++)
 	{
 		h[i] = hash[i];
@ -170,13 +171,21 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 	{
 		// Big Sub Words
-		#pragma unroll 16
+		#pragma unroll 4
-		for (int i = 0; i < 16; i++)
+		for (int idx = 0; idx < 64; idx += 16)
 		{
 			int idx = i << 2; // *4
 			AES_2ROUND(sharedMemory,
 				W[idx + 0], W[idx + 1], W[idx + 2], W[idx + 3],
 				k0);
 			AES_2ROUND(sharedMemory,
 				W[idx + 4], W[idx + 5], W[idx + 6], W[idx + 7],
 				k0);
 			AES_2ROUND(sharedMemory,
 				W[idx + 8], W[idx + 9], W[idx + 10], W[idx + 11],
 				k0);
 			AES_2ROUND(sharedMemory,
 				W[idx + 12], W[idx + 13], W[idx + 14], W[idx + 15],
 				k0);
 		}
 		// Shift Rows
@ -241,8 +250,8 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 		}
 	}
-	#pragma unroll 8
+	#pragma unroll
-	for (int i = 0; i<32; i += 4)
+	for (int i = 0; i<16; i += 4)
 	{
 		W[i] ^= W[32 + i] ^ 512;
 		W[i + 1] ^= W[32 + i + 1];
@ -255,12 +264,29 @@ void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *
 		hash[i] ^= W[i];
 }
-__global__ /* __launch_bounds__(320, 3) will force 64 registers on the 750 Ti */
+__device__ __forceinline__
 void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
 {
 	/* each thread startup will fill a uint32 */
 	if (threadIdx.x < 128) {
 		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
 		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
 		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
 		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
 		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
 		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
 		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
 		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
 	}
 }
 __global__ __launch_bounds__(128, 7) /* will force 72 registers */
 void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
-	aes_gpu_init(sharedMemory);
+	echo_gpu_init(sharedMemory);
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -283,7 +309,7 @@ void x11_echo512_cpu_init(int thr_id, int threads)
 __host__
 void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-	const int threadsperblock = 256;
+	const int threadsperblock = 128;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@ -1,9 +1,8 @@
 #include "cuda_helper.h"
-#define TPB 256
+#include <memory.h>
-// aus heavy.cu
+#define TPB 128
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
@ -1294,12 +1293,30 @@ static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, u
 	state[0xF] ^= p7;
 }
 __device__ __forceinline__
 void shavite_gpu_init(uint32_t *sharedMemory)
 {
 	/* each thread startup will fill a uint32 */
 	if (threadIdx.x < 128) {
 		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
 		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
 		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
 		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
 		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
 		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
 		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
 		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
 	}
 }
 // GPU Hash
-__global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+__global__ __launch_bounds__(TPB, 8) /* 64 registers if TPB 128 (fast), 80 with 92 (medium), 32 if 256 (slow) */
 void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
-	aes_gpu_init(sharedMemory);
+	shavite_gpu_init(sharedMemory);
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -1344,11 +1361,12 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 	}
 }
-__global__ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
+__global__ __launch_bounds__(TPB, 8)
 void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 {
 	__shared__ uint32_t sharedMemory[1024];
-	aes_gpu_init(sharedMemory);
+	shavite_gpu_init(sharedMemory);
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -1397,9 +1415,9 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	size_t shared_size = 0;
+	cudaFuncSetCacheConfig(x11_shavite512_gpu_hash_64, cudaFuncCachePreferL1);
-	x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	x11_shavite512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	MyStreamSynchronize(NULL, order, thr_id);
 }