Tune quark part of Xn funcs

based on klaus commits, will increase a bit speed of most algos PS: main increase is due to the register count tuning in Makefile and for skein512 on linux, its the ROTL64 but almost no changes on X11 : 2648MH/s vs 2630 before
10 years ago · d8a23fa970
9 changed files with 96 additions and 65 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -87,7 +87,15 @@ x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
 x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
-# ABI requiring code modules
+quark/cuda_quark_blake512.o: quark/cuda_quark_blake512.cu
 	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
 quark/cuda_jh512.o: quark/cuda_jh512.cu
 	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
 quark/cuda_quark_keccak512.o: quark/cuda_quark_keccak512.cu
 	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=88 -o $@ -c $<
 quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
 	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -405,11 +405,13 @@
      <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
    </CudaCompile>
    <CudaCompile Include="quark\cuda_jh512.cu">
      <MaxRegCount>80</MaxRegCount>
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
      <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
    </CudaCompile>
    <CudaCompile Include="quark\cuda_quark_blake512.cu">
      <MaxRegCount>80</MaxRegCount>
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
      <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
@ -431,6 +433,7 @@
      <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
    </CudaCompile>
    <CudaCompile Include="quark\cuda_quark_keccak512.cu">
      <MaxRegCount>88</MaxRegCount>
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
      <TargetMachinePlatform Condition="'$(Platform)'=='x64'">64</TargetMachinePlatform>
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.09.28])
+AC_INIT([ccminer], [1.4.5])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/configure.sh
+++ b/configure.sh
@ -5,5 +5,5 @@
 #--ptxas-options=\"-v -dlcm=cg\""
-CUDA_CFLAGS="-O2" ./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda
+CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@ -156,7 +156,7 @@
 #define PACKAGE_NAME "ccminer"
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.09.28"
+#define PACKAGE_STRING "ccminer 1.4.5"
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "ccminer"
@ -165,7 +165,7 @@
 #define PACKAGE_URL ""
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.09.28"
+#define PACKAGE_VERSION "1.4.5"
 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
@ -188,7 +188,7 @@
 #define USE_XOP 1
 /* Version number of package */
-#define VERSION "2014.09.28"
+#define VERSION "1.4.5"
 /* Define curl_free() as free() if our version of curl lacks curl_free. */
 /* #undef curl_free */
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -318,6 +318,25 @@ uint64_t ROTL64(const uint64_t x, const int offset)
 	: "=l"(result) : "l"(x), "r"(offset));
 	return result;
 }
 #elif __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 3
 __device__
 uint64_t ROTL64(const uint64_t x, const int offset)
 {
 	uint64_t res;
 	asm("{\n\t"
 		".reg .u32 tl,th,vl,vh;\n\t"
 		".reg .pred p;\n\t"
 		"mov.b64 {tl,th}, %1;\n\t"
 		"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
 		"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
 		"setp.lt.u32 p, %2, 32;\n\t"
 		"@!p mov.b64 %0, {vl,vh};\n\t"
 		"@p  mov.b64 %0, {vh,vl};\n\t"
 		"}"
 		: "=l"(res) : "l"(x) , "r"(offset)
 	);
 	return res;
 }
 #else
 /* host */
 #define ROTL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@ -7,9 +7,6 @@
 #define USE_SHUFFLE 0
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 // die Message it Padding zur Berechnung auf der GPU
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
@ -106,18 +103,6 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t
 		h[i % 8] ^= v[i];
 }
 __device__ __constant__
 static const uint64_t d_constMem[8] = {
 	0x6a09e667f3bcc908ULL,
 	0xbb67ae8584caa73bULL,
 	0x3c6ef372fe94f82bULL,
 	0xa54ff53a5f1d36f1ULL,
 	0x510e527fade682d1ULL,
 	0x9b05688c2b3e6c1fULL,
 	0x1f83d9abfb41bd6bULL,
 	0x5be0cd19137e2179ULL
 };
 // Hash-Padding
 __device__ __constant__
 static const uint64_t d_constHashPadding[8] = {
@ -157,10 +142,16 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
 		uint64_t buf[16];
 		// State
-		uint64_t h[8];
+		uint64_t h[8] = {
-		#pragma unroll 8
+			0x6a09e667f3bcc908ULL,
-		for (int i=0;i<8;i++)
+			0xbb67ae8584caa73bULL,
-			h[i] = d_constMem[i];
+			0x3c6ef372fe94f82bULL,
 			0xa54ff53a5f1d36f1ULL,
 			0x510e527fade682d1ULL,
 			0x9b05688c2b3e6c1fULL,
 			0x1f83d9abfb41bd6bULL,
 			0x5be0cd19137e2179ULL
 		};
 		// Message for first round
 		#pragma unroll 8
@ -195,13 +186,19 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint64_t h[8];
 		uint64_t buf[16];
 		uint32_t nounce = startNounce + thread;
-		#pragma unroll 8
+		uint64_t h[8] = {
-		for(int i=0; i<8; i++)
+			0x6a09e667f3bcc908ULL,
-			h[i] = d_constMem[i];
+			0xbb67ae8584caa73bULL,
 			0x3c6ef372fe94f82bULL,
 			0xa54ff53a5f1d36f1ULL,
 			0x510e527fade682d1ULL,
 			0x9b05688c2b3e6c1fULL,
 			0x1f83d9abfb41bd6bULL,
 			0x5be0cd19137e2179ULL
 		};
 		// Message für die erste Runde in Register holen
 		#pragma unroll 16
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@ -5,19 +5,36 @@
 #include "cuda_helper.h"
 // aus cpu-miner.c
-extern "C" extern int device_map[8];
+extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 // Take a look at: https://www.schneier.com/skein1.3.pdf
 #define SHL(x, n)			((x) << (n))
 #define SHR(x, n)			((x) >> (n))
-// Zum testen Hostcode...
+__device__
-/*	Hier erstmal die Tabelle mit den Konstanten für die Mix-Funktion. Kann später vll.
+uint64_t skein_rotl64(const uint64_t x, const int offset)
-	mal direkt in den Code eingesetzt werden
+{
-*/
+	uint64_t res;
 	asm("{\n\t"
 		".reg .u32 tl,th,vl,vh;\n\t"
 		".reg .pred p;\n\t"
 		"mov.b64 {tl,th}, %1;\n\t"
 		"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
 		"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
 		"setp.lt.u32 p, %2, 32;\n\t"
 		"@!p mov.b64 %0, {vl,vh};\n\t"
 		"@p  mov.b64 %0, {vh,vl};\n\t"
 	"}"
 		: "=l"(res) : "l"(x) , "r"(offset)
 	);
 	return res;
 }
 #if __CUDA_ARCH__ >= 350
 #undef ROTL64
 #define ROTL64 skein_rotl64
 #endif
 /*
 * M9_ ## s ## _ ## i  evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
@ -288,18 +305,8 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 		TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3],  8, 35, 56, 22); \
 	}
-static __constant__ uint64_t d_constMem[8];
+__global__
-static const uint64_t h_constMem[8] = {
+void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector)
 	SPH_C64(0x4903ADFF749C51CE),
 	SPH_C64(0x0D95DE399746DF03),
 	SPH_C64(0x8FD1934127C79BCE),
 	SPH_C64(0x9A255629FF352CB1),
 	SPH_C64(0x5DB62599DF6CA7B0),
 	SPH_C64(0xEABE394CA9D5C3F4),
 	SPH_C64(0x991112C71A75B523),
 	SPH_C64(0xAE18A40B660FCC33) };
 __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -315,14 +322,14 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
 		uint64_t *inpHash = &g_hash[8 * hashPosition];
 		// Initialisierung
-		h0 = d_constMem[0];
+		h0 = 0x4903ADFF749C51CEull;
-		h1 = d_constMem[1];
+		h1 = 0x0D95DE399746DF03ull;
-		h2 = d_constMem[2];
+		h2 = 0x8FD1934127C79BCEull;
-		h3 = d_constMem[3];
+		h3 = 0x9A255629FF352CB1ull;
-		h4 = d_constMem[4];
+		h4 = 0x5DB62599DF6CA7B0ull;
-		h5 = d_constMem[5];
+		h5 = 0xEABE394CA9D5C3F4ull;
-		h6 = d_constMem[6];
+		h6 = 0x991112C71A75B523ull;
-		h7 = d_constMem[7];
+		h7 = 0xAE18A40B660FCC33ull;
 		// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg		
 #pragma unroll 8
@ -399,16 +406,13 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui
 }
 // Setup-Funktionen
-__host__ void quark_skein512_cpu_init(int thr_id, int threads)
+__host__
 void quark_skein512_cpu_init(int thr_id, int threads)
 {
 	// nix zu tun ;-)
 	cudaMemcpyToSymbol( d_constMem,
                        h_constMem,
                        sizeof(h_constMem),
                        0, cudaMemcpyHostToDevice);
 }
-__host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+__host__
 void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
 	const int threadsperblock = 256;
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -142,7 +142,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
+		((uint32_t*)ptarget)[7] = 0x0000f;
 	if (!init[thr_id])
 	{
`@ -5,5 +5,5 @@`

	`#--ptxas-options=\"-v -dlcm=cg\""`	`#--ptxas-options=\"-v -dlcm=cg\""`

	`CUDA_CFLAGS="-O2" ./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda`	`CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda`