From d8a23fa970532ec521e48895d4f6b90333ed229e Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 20 Oct 2014 02:18:21 +0200
Subject: [PATCH] Tune quark part of Xn funcs

based on klaus commits, will increase a bit speed of most algos

PS: main increase is due to the register count tuning in Makefile

and for skein512 on linux, its the ROTL64

but almost no changes on X11 : 2648MH/s vs 2630 before
---
 Makefile.am                  | 10 ++++-
 ccminer.vcxproj              |  5 ++-
 configure.ac                 |  2 +-
 configure.sh                 |  2 +-
 cpuminer-config.h            |  6 +--
 cuda_helper.h                | 19 ++++++++++
 quark/cuda_quark_blake512.cu | 43 ++++++++++-----------
 quark/cuda_skein512.cu       | 72 +++++++++++++++++++-----------------
 x11/x11.cu                   |  2 +-
 9 files changed, 96 insertions(+), 65 deletions(-)
diff --git a/Makefile.am b/Makefile.am
index 431c260..1d39ec7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -87,7 +87,15 @@ x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
 x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
-# ABI requiring code modules
+quark/cuda_quark_blake512.o: quark/cuda_quark_blake512.cu
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
+
+quark/cuda_jh512.o: quark/cuda_jh512.cu
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
+
+quark/cuda_quark_keccak512.o: quark/cuda_quark_keccak512.cu
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=88 -o $@ -c $<
+
 quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
 	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
 
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index dcc26f9..f77d7ac 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -405,11 +405,13 @@
       <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
     </CudaCompile>
     <CudaCompile Include="quark\cuda_jh512.cu">
+      <MaxRegCount>80</MaxRegCount>
       <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
       <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
     </CudaCompile>
     <CudaCompile Include="quark\cuda_quark_blake512.cu">
+      <MaxRegCount>80</MaxRegCount>
       <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
       <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
@@ -431,6 +433,7 @@
       <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
     </CudaCompile>
     <CudaCompile Include="quark\cuda_quark_keccak512.cu">
+      <MaxRegCount>88</MaxRegCount>
       <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
       <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
       <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
@@ -579,4 +582,4 @@
   <Target Name="AfterClean">
     <Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
   </Target>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/configure.ac b/configure.ac
index 50f5881..e115d79 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.09.28])
+AC_INIT([ccminer], [1.4.5])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/configure.sh b/configure.sh
index 9c8b021..142b59e 100755
--- a/configure.sh
+++ b/configure.sh
@@ -5,5 +5,5 @@
 
 #--ptxas-options=\"-v -dlcm=cg\""
 
-CUDA_CFLAGS="-O2" ./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda
+CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda
 
diff --git a/cpuminer-config.h b/cpuminer-config.h
index 0602c17..f1d7a31 100644
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@@ -156,7 +156,7 @@
 #define PACKAGE_NAME "ccminer"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.09.28"
+#define PACKAGE_STRING "ccminer 1.4.5"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "ccminer"
@@ -165,7 +165,7 @@
 #define PACKAGE_URL ""
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.09.28"
+#define PACKAGE_VERSION "1.4.5"
 
 /* If using the C implementation of alloca, define if you know the
    direction of stack growth for your system; otherwise it will be
@@ -188,7 +188,7 @@
 #define USE_XOP 1
 
 /* Version number of package */
-#define VERSION "2014.09.28"
+#define VERSION "1.4.5"
 
 /* Define curl_free() as free() if our version of curl lacks curl_free. */
 /* #undef curl_free */
diff --git a/cuda_helper.h b/cuda_helper.h
index 4755d8a..db1dde3 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -318,6 +318,25 @@ uint64_t ROTL64(const uint64_t x, const int offset)
 	: "=l"(result) : "l"(x), "r"(offset));
 	return result;
 }
+#elif __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 3
+__device__
+uint64_t ROTL64(const uint64_t x, const int offset)
+{
+	uint64_t res;
+	asm("{\n\t"
+		".reg .u32 tl,th,vl,vh;\n\t"
+		".reg .pred p;\n\t"
+		"mov.b64 {tl,th}, %1;\n\t"
+		"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
+		"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
+		"setp.lt.u32 p, %2, 32;\n\t"
+		"@!p mov.b64 %0, {vl,vh};\n\t"
+		"@p  mov.b64 %0, {vh,vl};\n\t"
+		"}"
+		: "=l"(res) : "l"(x) , "r"(offset)
+	);
+	return res;
+}
 #else
 /* host */
 #define ROTL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))
diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu
index 787b8a0..39e633b 100644
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@@ -7,9 +7,6 @@
 
 #define USE_SHUFFLE 0
 
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
 // die Message it Padding zur Berechnung auf der GPU
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 
@@ -106,18 +103,6 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t
 		h[i % 8] ^= v[i];
 }
 
-__device__ __constant__
-static const uint64_t d_constMem[8] = {
-	0x6a09e667f3bcc908ULL,
-	0xbb67ae8584caa73bULL,
-	0x3c6ef372fe94f82bULL,
-	0xa54ff53a5f1d36f1ULL,
-	0x510e527fade682d1ULL,
-	0x9b05688c2b3e6c1fULL,
-	0x1f83d9abfb41bd6bULL,
-	0x5be0cd19137e2179ULL
-};
-
 // Hash-Padding
 __device__ __constant__
 static const uint64_t d_constHashPadding[8] = {
@@ -157,10 +142,16 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
 		uint64_t buf[16];
 
 		// State
-		uint64_t h[8];
-		#pragma unroll 8
-		for (int i=0;i<8;i++)
-			h[i] = d_constMem[i];
+		uint64_t h[8] = {
+			0x6a09e667f3bcc908ULL,
+			0xbb67ae8584caa73bULL,
+			0x3c6ef372fe94f82bULL,
+			0xa54ff53a5f1d36f1ULL,
+			0x510e527fade682d1ULL,
+			0x9b05688c2b3e6c1fULL,
+			0x1f83d9abfb41bd6bULL,
+			0x5be0cd19137e2179ULL
+		};
 
 		// Message for first round
 		#pragma unroll 8
@@ -195,13 +186,19 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		uint64_t h[8];
 		uint64_t buf[16];
 		uint32_t nounce = startNounce + thread;
 
-		#pragma unroll 8
-		for(int i=0; i<8; i++)
-			h[i] = d_constMem[i];
+		uint64_t h[8] = {
+			0x6a09e667f3bcc908ULL,
+			0xbb67ae8584caa73bULL,
+			0x3c6ef372fe94f82bULL,
+			0xa54ff53a5f1d36f1ULL,
+			0x510e527fade682d1ULL,
+			0x9b05688c2b3e6c1fULL,
+			0x1f83d9abfb41bd6bULL,
+			0x5be0cd19137e2179ULL
+		};
 
 		// Message für die erste Runde in Register holen
 		#pragma unroll 16
diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu
index 899ed2a..1413809 100644
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@@ -5,19 +5,36 @@
 #include "cuda_helper.h"
 
 // aus cpu-miner.c
-extern "C" extern int device_map[8];
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+extern int device_map[8];
 
 // Take a look at: https://www.schneier.com/skein1.3.pdf
 
 #define SHL(x, n)			((x) << (n))
 #define SHR(x, n)			((x) >> (n))
 
-// Zum testen Hostcode...
-/*	Hier erstmal die Tabelle mit den Konstanten für die Mix-Funktion. Kann später vll.
-	mal direkt in den Code eingesetzt werden
-*/
+__device__
+uint64_t skein_rotl64(const uint64_t x, const int offset)
+{
+	uint64_t res;
+	asm("{\n\t"
+		".reg .u32 tl,th,vl,vh;\n\t"
+		".reg .pred p;\n\t"
+		"mov.b64 {tl,th}, %1;\n\t"
+		"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
+		"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
+		"setp.lt.u32 p, %2, 32;\n\t"
+		"@!p mov.b64 %0, {vl,vh};\n\t"
+		"@p  mov.b64 %0, {vh,vl};\n\t"
+	"}"
+		: "=l"(res) : "l"(x) , "r"(offset)
+	);
+	return res;
+}
+
+#if __CUDA_ARCH__ >= 350
+#undef ROTL64
+#define ROTL64 skein_rotl64
+#endif
 
 /*
  * M9_ ## s ## _ ## i  evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
@@ -288,18 +305,8 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 		TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3],  8, 35, 56, 22); \
 	}
 
-static __constant__ uint64_t d_constMem[8];
-static const uint64_t h_constMem[8] = {
-	SPH_C64(0x4903ADFF749C51CE),
-	SPH_C64(0x0D95DE399746DF03),
-	SPH_C64(0x8FD1934127C79BCE),
-	SPH_C64(0x9A255629FF352CB1),
-	SPH_C64(0x5DB62599DF6CA7B0),
-	SPH_C64(0xEABE394CA9D5C3F4),
-	SPH_C64(0x991112C71A75B523),
-	SPH_C64(0xAE18A40B660FCC33) };
-
-__global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+__global__
+void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -315,14 +322,14 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
 		uint64_t *inpHash = &g_hash[8 * hashPosition];
 
 		// Initialisierung
-		h0 = d_constMem[0];
-		h1 = d_constMem[1];
-		h2 = d_constMem[2];
-		h3 = d_constMem[3];
-		h4 = d_constMem[4];
-		h5 = d_constMem[5];
-		h6 = d_constMem[6];
-		h7 = d_constMem[7];
+		h0 = 0x4903ADFF749C51CEull;
+		h1 = 0x0D95DE399746DF03ull;
+		h2 = 0x8FD1934127C79BCEull;
+		h3 = 0x9A255629FF352CB1ull;
+		h4 = 0x5DB62599DF6CA7B0ull;
+		h5 = 0xEABE394CA9D5C3F4ull;
+		h6 = 0x991112C71A75B523ull;
+		h7 = 0xAE18A40B660FCC33ull;
 
 		// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg		
 #pragma unroll 8
@@ -399,16 +406,13 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
 }
 
 // Setup-Funktionen
-__host__ void quark_skein512_cpu_init(int thr_id, int threads)
+__host__
+void quark_skein512_cpu_init(int thr_id, int threads)
 {
-	// nix zu tun ;-)
-	cudaMemcpyToSymbol( d_constMem,
-                        h_constMem,
-                        sizeof(h_constMem),
-                        0, cudaMemcpyHostToDevice);
 }
 
-__host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+__host__
+void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
 	const int threadsperblock = 256;
 
diff --git a/x11/x11.cu b/x11/x11.cu
index 60ea553..7bb4f24 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -142,7 +142,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
+		((uint32_t*)ptarget)[7] = 0x0000f;
 
 	if (!init[thr_id])
 	{