Tune quark part of Xn funcs

based on klaus commits, will increase a bit speed of most algos PS: main increase is due to the register count tuning in Makefile and for skein512 on linux, its the ROTL64 but almost no changes on X11 : 2648MH/s vs 2630 before
10 years ago · d8a23fa970
9 changed files with 96 additions and 65 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -87,7 +87,15 @@ x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
				@@ -87,7 +87,15 @@ x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
 x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<

-# ABI requiring code modules
+quark/cuda_quark_blake512.o: quark/cuda_quark_blake512.cu
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
+
+quark/cuda_jh512.o: quark/cuda_jh512.cu
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
+
+quark/cuda_quark_keccak512.o: quark/cuda_quark_keccak512.cu
+	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=88 -o $@ -c $<
+
 quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
 	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<

--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -405,11 +405,13 @@
				@@ -405,11 +405,13 @@
      <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
    </CudaCompile>
    <CudaCompile Include="quark\cuda_jh512.cu">
+      <MaxRegCount>80</MaxRegCount>
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
      <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
    </CudaCompile>
    <CudaCompile Include="quark\cuda_quark_blake512.cu">
+      <MaxRegCount>80</MaxRegCount>
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
      <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
@ -431,6 +433,7 @@
				@@ -431,6 +433,7 @@
      <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
    </CudaCompile>
    <CudaCompile Include="quark\cuda_quark_keccak512.cu">
+      <MaxRegCount>88</MaxRegCount>
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
      <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
				@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.09.28])
+AC_INIT([ccminer], [1.4.5])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/configure.sh
+++ b/configure.sh
@ -5,5 +5,5 @@
				@@ -5,5 +5,5 @@

 #--ptxas-options=\"-v -dlcm=cg\""

-CUDA_CFLAGS="-O2" ./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda
+CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda

--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@ -156,7 +156,7 @@
				@@ -156,7 +156,7 @@
 #define PACKAGE_NAME "ccminer"

 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.09.28"
+#define PACKAGE_STRING "ccminer 1.4.5"

 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "ccminer"
@ -165,7 +165,7 @@
				@@ -165,7 +165,7 @@
 #define PACKAGE_URL ""

 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.09.28"
+#define PACKAGE_VERSION "1.4.5"

 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
@ -188,7 +188,7 @@
				@@ -188,7 +188,7 @@
 #define USE_XOP 1

 /* Version number of package */
-#define VERSION "2014.09.28"
+#define VERSION "1.4.5"

 /* Define curl_free() as free() if our version of curl lacks curl_free. */
 /* #undef curl_free */
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -318,6 +318,25 @@ uint64_t ROTL64(const uint64_t x, const int offset)
				@@ -318,6 +318,25 @@ uint64_t ROTL64(const uint64_t x, const int offset)
 	: "=l"(result) : "l"(x), "r"(offset));
 	return result;
 }
+#elif __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 3
+__device__
+uint64_t ROTL64(const uint64_t x, const int offset)
+{
+	uint64_t res;
+	asm("{\n\t"
+		".reg .u32 tl,th,vl,vh;\n\t"
+		".reg .pred p;\n\t"
+		"mov.b64 {tl,th}, %1;\n\t"
+		"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
+		"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
+		"setp.lt.u32 p, %2, 32;\n\t"
+		"@!p mov.b64 %0, {vl,vh};\n\t"
+		"@p  mov.b64 %0, {vh,vl};\n\t"
+		"}"
+		: "=l"(res) : "l"(x) , "r"(offset)
+	);
+	return res;
+}
 #else
 /* host */
 #define ROTL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@ -7,9 +7,6 @@
				@@ -7,9 +7,6 @@

 #define USE_SHUFFLE 0

-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
 // die Message it Padding zur Berechnung auf der GPU
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)

@ -106,18 +103,6 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t
				@@ -106,18 +103,6 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t
 		h[i % 8] ^= v[i];
 }

-__device__ __constant__
-static const uint64_t d_constMem[8] = {
-	0x6a09e667f3bcc908ULL,
-	0xbb67ae8584caa73bULL,
-	0x3c6ef372fe94f82bULL,
-	0xa54ff53a5f1d36f1ULL,
-	0x510e527fade682d1ULL,
-	0x9b05688c2b3e6c1fULL,
-	0x1f83d9abfb41bd6bULL,
-	0x5be0cd19137e2179ULL
-};
-
 // Hash-Padding
 __device__ __constant__
 static const uint64_t d_constHashPadding[8] = {
@ -157,10 +142,16 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
				@@ -157,10 +142,16 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
 		uint64_t buf[16];

 		// State
-		uint64_t h[8];
-		#pragma unroll 8
-		for (int i=0;i<8;i++)
-			h[i] = d_constMem[i];
+		uint64_t h[8] = {
+			0x6a09e667f3bcc908ULL,
+			0xbb67ae8584caa73bULL,
+			0x3c6ef372fe94f82bULL,
+			0xa54ff53a5f1d36f1ULL,
+			0x510e527fade682d1ULL,
+			0x9b05688c2b3e6c1fULL,
+			0x1f83d9abfb41bd6bULL,
+			0x5be0cd19137e2179ULL
+		};

 		// Message for first round
 		#pragma unroll 8
@ -195,13 +186,19 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
				@@ -195,13 +186,19 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		uint64_t h[8];
 		uint64_t buf[16];
 		uint32_t nounce = startNounce + thread;

-		#pragma unroll 8
-		for(int i=0; i<8; i++)
-			h[i] = d_constMem[i];
+		uint64_t h[8] = {
+			0x6a09e667f3bcc908ULL,
+			0xbb67ae8584caa73bULL,
+			0x3c6ef372fe94f82bULL,
+			0xa54ff53a5f1d36f1ULL,
+			0x510e527fade682d1ULL,
+			0x9b05688c2b3e6c1fULL,
+			0x1f83d9abfb41bd6bULL,
+			0x5be0cd19137e2179ULL
+		};

 		// Message für die erste Runde in Register holen
 		#pragma unroll 16
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@ -5,19 +5,36 @@
				@@ -5,19 +5,36 @@
 #include "cuda_helper.h"

 // aus cpu-miner.c
-extern "C" extern int device_map[8];
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+extern int device_map[8];

 // Take a look at: https://www.schneier.com/skein1.3.pdf

 #define SHL(x, n)			((x) << (n))
 #define SHR(x, n)			((x) >> (n))

-// Zum testen Hostcode...
-/*	Hier erstmal die Tabelle mit den Konstanten für die Mix-Funktion. Kann später vll.
-	mal direkt in den Code eingesetzt werden
-*/
+__device__
+uint64_t skein_rotl64(const uint64_t x, const int offset)
+{
+	uint64_t res;
+	asm("{\n\t"
+		".reg .u32 tl,th,vl,vh;\n\t"
+		".reg .pred p;\n\t"
+		"mov.b64 {tl,th}, %1;\n\t"
+		"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
+		"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
+		"setp.lt.u32 p, %2, 32;\n\t"
+		"@!p mov.b64 %0, {vl,vh};\n\t"
+		"@p  mov.b64 %0, {vh,vl};\n\t"
+	"}"
+		: "=l"(res) : "l"(x) , "r"(offset)
+	);
+	return res;
+}
+
+#if __CUDA_ARCH__ >= 350
+#undef ROTL64
+#define ROTL64 skein_rotl64
+#endif

 /*
 * M9_ ## s ## _ ## i  evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
@ -288,18 +305,8 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
				@@ -288,18 +305,8 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 		TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3],  8, 35, 56, 22); \
 	}

-static __constant__ uint64_t d_constMem[8];
-static const uint64_t h_constMem[8] = {
-	SPH_C64(0x4903ADFF749C51CE),
-	SPH_C64(0x0D95DE399746DF03),
-	SPH_C64(0x8FD1934127C79BCE),
-	SPH_C64(0x9A255629FF352CB1),
-	SPH_C64(0x5DB62599DF6CA7B0),
-	SPH_C64(0xEABE394CA9D5C3F4),
-	SPH_C64(0x991112C71A75B523),
-	SPH_C64(0xAE18A40B660FCC33) };
-
-__global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+__global__
+void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -315,14 +322,14 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
				@@ -315,14 +322,14 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
 		uint64_t *inpHash = &g_hash[8 * hashPosition];

 		// Initialisierung
-		h0 = d_constMem[0];
-		h1 = d_constMem[1];
-		h2 = d_constMem[2];
-		h3 = d_constMem[3];
-		h4 = d_constMem[4];
-		h5 = d_constMem[5];
-		h6 = d_constMem[6];
-		h7 = d_constMem[7];
+		h0 = 0x4903ADFF749C51CEull;
+		h1 = 0x0D95DE399746DF03ull;
+		h2 = 0x8FD1934127C79BCEull;
+		h3 = 0x9A255629FF352CB1ull;
+		h4 = 0x5DB62599DF6CA7B0ull;
+		h5 = 0xEABE394CA9D5C3F4ull;
+		h6 = 0x991112C71A75B523ull;
+		h7 = 0xAE18A40B660FCC33ull;

 		// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg		
 #pragma unroll 8
@ -399,16 +406,13 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
				@@ -399,16 +406,13 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
 }

 // Setup-Funktionen
-__host__ void quark_skein512_cpu_init(int thr_id, int threads)
+__host__
+void quark_skein512_cpu_init(int thr_id, int threads)
 {
-	// nix zu tun ;-)
-	cudaMemcpyToSymbol( d_constMem,
-                        h_constMem,
-                        sizeof(h_constMem),
-                        0, cudaMemcpyHostToDevice);
 }

-__host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+__host__
+void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
 	const int threadsperblock = 256;

--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -142,7 +142,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
				@@ -142,7 +142,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 	static bool init[8] = {0,0,0,0,0,0,0,0};

 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
+		((uint32_t*)ptarget)[7] = 0x0000f;

 	if (!init[thr_id])
 	{