From c88750332c9a66ffc1ff9a73118c5ddb178cecba Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sun, 23 Nov 2014 19:05:27 +0100
Subject: [PATCH] simd512: restore SM3/3.5 perfs

Simple change which affect all algos based on SIMD512

fresh, qubit, s3, x11 to x17...
---
 x11/cuda_x11_echo.cu    |  3 ---
 x11/cuda_x11_simd512.cu | 17 +++++++++++------
 x11/x11.cu              | 14 +++++++++++---
 x13/x13.cu              |  2 +-
 4 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu
index cd8a83b..0866f8c 100644
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@@ -3,9 +3,6 @@
 
 #include "cuda_helper.h"
 
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
 #include "cuda_x11_aes.cu"
 
 __device__ __forceinline__ void AES_2ROUND(
diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu
index 4c1e537..fb63c15 100644
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@@ -576,7 +576,7 @@ x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, u
 	}
 }
 
-/*
+
 __global__ void __launch_bounds__(TPB, 4)
 x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
 {
@@ -604,7 +604,7 @@ x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash
 		Compression2(hashPosition, g_fft4, g_state);
 	}
 }
-*/
+
 
 __global__ void __launch_bounds__(TPB, 4)
 x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
@@ -670,13 +670,18 @@ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint
 	const int threadsperblock = TPB;
 
 	dim3 block(threadsperblock);
-	dim3 grid8(((threads + threadsperblock-1)/threadsperblock)*8);
+	dim3 grid((threads + threadsperblock-1) / threadsperblock);
 
-	x11_simd512_gpu_expand_64 <<<grid8, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]);
+	dim3 grid8(((threads + threadsperblock - 1) / threadsperblock) * 8);
 
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	x11_simd512_gpu_expand_64 <<<grid8, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]);
+	if (device_sm[device_map[thr_id]] >= 500) {
+		x11_simd512_gpu_compress_64_maxwell <<< grid8, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
+	} else {
+		x11_simd512_gpu_compress1_64 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
+		x11_simd512_gpu_compress2_64 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
+	}
 
-	x11_simd512_gpu_compress_64_maxwell << <grid, block >> > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
 	x11_simd512_gpu_final_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
 
 	MyStreamSynchronize(NULL, order, thr_id);
diff --git a/x11/x11.cu b/x11/x11.cu
index 4043be4..55d4dc0 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -58,7 +58,11 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
                                           uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
 
-// X11 Hashfunktion
+// to check... new sp method
+//extern void x11_echo512_cpu_setTarget(const void *ptarget);
+//extern uint32_t x11_echo512_cpu_hash_64_final(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+// X11 CPU Hash
 extern "C" void x11hash(void *output, const void *input)
 {
 	// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11
@@ -146,7 +150,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 	throughput = min(throughput, (int)(max_nonce - first_nonce));
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x000f;
+		((uint32_t*)ptarget)[7] = 0x5;
 
 	if (!init[thr_id])
 	{
@@ -176,6 +180,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
 
 	quark_blake512_cpu_setBlock_80((void*)endiandata);
+
+	//x11_echo512_cpu_setTarget(ptarget);
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
@@ -196,8 +202,10 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
-		// Scan nach Gewinner Hashes auf der GPU
+		// todo...
+		//foundNonce = x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		//foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
 		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{
diff --git a/x13/x13.cu b/x13/x13.cu
index 36c6cd7..e9e3cd4 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -67,7 +67,7 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, 
                                           uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
 
-// X13 Hashfunktion
+// X13 CPU Hash
 extern "C" void x13hash(void *output, const void *input)
 {
 	// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13