From d8a23fa970532ec521e48895d4f6b90333ed229e Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 20 Oct 2014 02:18:21 +0200 Subject: [PATCH] Tune quark part of Xn funcs based on klaus commits, will increase a bit speed of most algos PS: main increase is due to the register count tuning in Makefile and for skein512 on linux, its the ROTL64 but almost no changes on X11 : 2648MH/s vs 2630 before --- Makefile.am | 10 ++++- ccminer.vcxproj | 5 ++- configure.ac | 2 +- configure.sh | 2 +- cpuminer-config.h | 6 +-- cuda_helper.h | 19 ++++++++++ quark/cuda_quark_blake512.cu | 43 ++++++++++----------- quark/cuda_skein512.cu | 72 +++++++++++++++++++----------------- x11/x11.cu | 2 +- 9 files changed, 96 insertions(+), 65 deletions(-) diff --git a/Makefile.am b/Makefile.am index 431c260..1d39ec7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -87,7 +87,15 @@ x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< -# ABI requiring code modules +quark/cuda_quark_blake512.o: quark/cuda_quark_blake512.cu + $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $< + +quark/cuda_jh512.o: quark/cuda_jh512.cu + $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $< + +quark/cuda_quark_keccak512.o: quark/cuda_quark_keccak512.cu + $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=88 -o $@ -c $< + quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $< diff --git a/ccminer.vcxproj b/ccminer.vcxproj index dcc26f9..f77d7ac 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -405,11 +405,13 @@ 64 + 80 --ptxas-options=-O2 %(AdditionalOptions) %(AdditionalOptions) 64 + 80 --ptxas-options=-O2 %(AdditionalOptions) %(AdditionalOptions) 64 @@ -431,6 +433,7 @@ 64 + 88 --ptxas-options=-O2 %(AdditionalOptions) %(AdditionalOptions) 64 @@ -579,4 +582,4 @@ - \ No newline at end of file + diff --git a/configure.ac b/configure.ac index 50f5881..e115d79 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2014.09.28]) +AC_INIT([ccminer], [1.4.5]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure.sh b/configure.sh index 9c8b021..142b59e 100755 --- a/configure.sh +++ b/configure.sh @@ -5,5 +5,5 @@ #--ptxas-options=\"-v -dlcm=cg\"" -CUDA_CFLAGS="-O2" ./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda +CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda diff --git a/cpuminer-config.h b/cpuminer-config.h index 0602c17..f1d7a31 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -156,7 +156,7 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 2014.09.28" +#define PACKAGE_STRING "ccminer 1.4.5" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "ccminer" @@ -165,7 +165,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2014.09.28" +#define PACKAGE_VERSION "1.4.5" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be @@ -188,7 +188,7 @@ #define USE_XOP 1 /* Version number of package */ -#define VERSION "2014.09.28" +#define VERSION "1.4.5" /* Define curl_free() as free() if our version of curl lacks curl_free. */ /* #undef curl_free */ diff --git a/cuda_helper.h b/cuda_helper.h index 4755d8a..db1dde3 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -318,6 +318,25 @@ uint64_t ROTL64(const uint64_t x, const int offset) : "=l"(result) : "l"(x), "r"(offset)); return result; } +#elif __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 3 +__device__ +uint64_t ROTL64(const uint64_t x, const int offset) +{ + uint64_t res; + asm("{\n\t" + ".reg .u32 tl,th,vl,vh;\n\t" + ".reg .pred p;\n\t" + "mov.b64 {tl,th}, %1;\n\t" + "shf.l.wrap.b32 vl, tl, th, %2;\n\t" + "shf.l.wrap.b32 vh, th, tl, %2;\n\t" + "setp.lt.u32 p, %2, 32;\n\t" + "@!p mov.b64 %0, {vl,vh};\n\t" + "@p mov.b64 %0, {vh,vl};\n\t" + "}" + : "=l"(res) : "l"(x) , "r"(offset) + ); + return res; +} #else /* host */ #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index 787b8a0..39e633b 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -7,9 +7,6 @@ #define USE_SHUFFLE 0 -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - // die Message it Padding zur Berechnung auf der GPU __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) @@ -106,18 +103,6 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t h[i % 8] ^= v[i]; } -__device__ __constant__ -static const uint64_t d_constMem[8] = { - 0x6a09e667f3bcc908ULL, - 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, - 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, - 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, - 0x5be0cd19137e2179ULL -}; - // Hash-Padding __device__ __constant__ static const uint64_t d_constHashPadding[8] = { @@ -157,10 +142,16 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n uint64_t buf[16]; // State - uint64_t h[8]; - #pragma unroll 8 - for (int i=0;i<8;i++) - h[i] = d_constMem[i]; + uint64_t h[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL + }; // Message for first round #pragma unroll 8 @@ -195,13 +186,19 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint64_t h[8]; uint64_t buf[16]; uint32_t nounce = startNounce + thread; - #pragma unroll 8 - for(int i=0; i<8; i++) - h[i] = d_constMem[i]; + uint64_t h[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL + }; // Message für die erste Runde in Register holen #pragma unroll 16 diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index 899ed2a..1413809 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -5,19 +5,36 @@ #include "cuda_helper.h" // aus cpu-miner.c -extern "C" extern int device_map[8]; -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +extern int device_map[8]; // Take a look at: https://www.schneier.com/skein1.3.pdf #define SHL(x, n) ((x) << (n)) #define SHR(x, n) ((x) >> (n)) -// Zum testen Hostcode... -/* Hier erstmal die Tabelle mit den Konstanten für die Mix-Funktion. Kann später vll. - mal direkt in den Code eingesetzt werden -*/ +__device__ +uint64_t skein_rotl64(const uint64_t x, const int offset) +{ + uint64_t res; + asm("{\n\t" + ".reg .u32 tl,th,vl,vh;\n\t" + ".reg .pred p;\n\t" + "mov.b64 {tl,th}, %1;\n\t" + "shf.l.wrap.b32 vl, tl, th, %2;\n\t" + "shf.l.wrap.b32 vh, th, tl, %2;\n\t" + "setp.lt.u32 p, %2, 32;\n\t" + "@!p mov.b64 %0, {vl,vh};\n\t" + "@p mov.b64 %0, {vh,vl};\n\t" + "}" + : "=l"(res) : "l"(x) , "r"(offset) + ); + return res; +} + +#if __CUDA_ARCH__ >= 350 +#undef ROTL64 +#define ROTL64 skein_rotl64 +#endif /* * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). @@ -288,18 +305,8 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ } -static __constant__ uint64_t d_constMem[8]; -static const uint64_t h_constMem[8] = { - SPH_C64(0x4903ADFF749C51CE), - SPH_C64(0x0D95DE399746DF03), - SPH_C64(0x8FD1934127C79BCE), - SPH_C64(0x9A255629FF352CB1), - SPH_C64(0x5DB62599DF6CA7B0), - SPH_C64(0xEABE394CA9D5C3F4), - SPH_C64(0x991112C71A75B523), - SPH_C64(0xAE18A40B660FCC33) }; - -__global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +__global__ +void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -315,14 +322,14 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui uint64_t *inpHash = &g_hash[8 * hashPosition]; // Initialisierung - h0 = d_constMem[0]; - h1 = d_constMem[1]; - h2 = d_constMem[2]; - h3 = d_constMem[3]; - h4 = d_constMem[4]; - h5 = d_constMem[5]; - h6 = d_constMem[6]; - h7 = d_constMem[7]; + h0 = 0x4903ADFF749C51CEull; + h1 = 0x0D95DE399746DF03ull; + h2 = 0x8FD1934127C79BCEull; + h3 = 0x9A255629FF352CB1ull; + h4 = 0x5DB62599DF6CA7B0ull; + h5 = 0xEABE394CA9D5C3F4ull; + h6 = 0x991112C71A75B523ull; + h7 = 0xAE18A40B660FCC33ull; // 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg #pragma unroll 8 @@ -399,16 +406,13 @@ __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, ui } // Setup-Funktionen -__host__ void quark_skein512_cpu_init(int thr_id, int threads) +__host__ +void quark_skein512_cpu_init(int thr_id, int threads) { - // nix zu tun ;-) - cudaMemcpyToSymbol( d_constMem, - h_constMem, - sizeof(h_constMem), - 0, cudaMemcpyHostToDevice); } -__host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ +void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const int threadsperblock = 256; diff --git a/x11/x11.cu b/x11/x11.cu index 60ea553..7bb4f24 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -142,7 +142,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, static bool init[8] = {0,0,0,0,0,0,0,0}; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ((uint32_t*)ptarget)[7] = 0x0000f; if (!init[thr_id]) {