From 912ef1215dd710cc43bd474283e087c5cfd41fac Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 21 Aug 2014 02:21:39 +0200 Subject: [PATCH] small reg tunes, rename whirlcoin to whirl --- Makefile.am | 5 ++- ccminer.vcxproj | 3 +- config.sh | 4 +- cpu-miner.c | 14 +++---- cuda_helper.h | 21 +++++----- util.c | 2 +- x11/cuda_x11_aes.cu | 93 ++++++++++++++++++++++++++------------------- 7 files changed, 77 insertions(+), 65 deletions(-) diff --git a/Makefile.am b/Makefile.am index 635a530..cb4ba62 100644 --- a/Makefile.am +++ b/Makefile.am @@ -46,15 +46,16 @@ ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME -nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . -Xptxas "-v" --ptxas-options=-v +nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . --ptxas-options=-v --use_fast_math nvcc_FLAGS += $(JANSSON_INCLUDES) # we're now targeting all major compute architectures within one binary. .cu.o: $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $< -# Luffa is faster with 80 registers than 128 +# Luffa and Echo are faster with 80 registers than 128 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu +x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $< # Shavite compiles faster with 128 regs diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 689b017..a0edd97 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -479,6 +479,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" %(AdditionalOptions) + 80 --ptxas-options=-O3 %(AdditionalOptions) %(AdditionalOptions) --ptxas-options=-O3 %(AdditionalOptions) @@ -577,4 +578,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" - \ No newline at end of file + diff --git a/config.sh b/config.sh index 04ef6f9..9955155 100755 --- a/config.sh +++ b/config.sh @@ -3,7 +3,7 @@ # Simple script to create the Makefile # then type 'make' -# export PATH="$PATH:/usr/local/cuda-6.5/bin/" +# export PATH="$PATH:/usr/local/cuda/bin/" make clean || echo clean @@ -11,4 +11,4 @@ rm -f Makefile.in rm -f config.status ./autogen.sh || echo done -CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2 -D_REENTRANT" ./configure +CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2" ./configure diff --git a/cpu-miner.c b/cpu-miner.c index 4dcb71b..8295d1a 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -153,7 +153,7 @@ static const char *algo_names[] = { "anime", "fresh", "nist5", - "whirlcoin", + "whirl", "x11", "x13", "x14", @@ -231,7 +231,7 @@ Options:\n\ anime Animecoin hash\n\ fresh Freshcoin hash (shavite 80)\n\ nist5 NIST5 (TalkCoin) hash\n\ - whirlcoin Whirlcoin hash\n\ + whirl Whirlcoin (old whirlpool)\n\ x11 X11 (DarkCoin) hash\n\ x13 X13 (MaruCoin) hash\n\ x14 X14 hash\n\ @@ -1526,17 +1526,17 @@ int main(int argc, char *argv[]) int i; printf("*** ccMiner for nVidia GPUs by Christian Buchner and Christian H. ***\n"); - printf("\t This is version "PROGRAM_VERSION" (tpruvot@github)\n"); + printf("\t This is the forked version "PROGRAM_VERSION" (tpruvot@github)\n"); #ifdef WIN32 - printf("\t Built with VC++ 2013 and nVidia CUDA SDK 6.5 RC (DC 5.0)\n\n"); + printf("\t Built with VC++ 2013 and nVidia CUDA SDK 6.5\n\n"); #else - printf("\t Built with the nVidia CUDA SDK 6.5 RC\n\n"); + printf("\t Built with the nVidia CUDA SDK 6.5\n\n"); #endif printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n"); - printf("\t based on pooler-cpuminer extension for HVC from http://hvc.1gh.com/" "\n\n"); + printf("\t and HVC extension from http://hvc.1gh.com/" "\n\n"); printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n"); printf("\t BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n"); - printf("\tCuda X14 and X15 added by Tanguy Pruvot (also in cpuminer-multi)\n"); + printf("\tCleaned and optimized by Tanguy Pruvot\n"); printf("\t BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n"); rpc_user = strdup(""); diff --git a/cuda_helper.h b/cuda_helper.h index c2c9f3e..8563737 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -38,12 +38,12 @@ extern const uint3 threadIdx; #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) #endif -__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI) +__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI) { #if __CUDA_ARCH__ >= 130 return __double_as_longlong(__hiloint2double(HI, LO)); #else - return (unsigned long long)LO | (((unsigned long long)HI) << 32); + return (uint64_t)LO | (((uint64_t)HI) << 32); #endif } @@ -94,11 +94,8 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) { // Input: 77665544 33221100 // Output: 00112233 44556677 - uint64_t temp[2]; - temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123); - temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123); - - return temp[0] | (temp[1]<<32); + uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123); + return (result << 32) | __byte_perm(_HIWORD(x), 0, 0x0123); } #else /* host */ @@ -132,7 +129,7 @@ __device__ __forceinline__ uint64_t xor1(uint64_t a, uint64_t b) { uint64_t result; - asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a) ,"l"(b)); + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a),"l"(b)); return result; } @@ -141,10 +138,10 @@ __device__ __forceinline__ uint64_t xor3(uint64_t a, uint64_t b, uint64_t c) { uint64_t result; - asm("{\n\t" - " .reg .u64 t1;\n\t" - "xor.b64 t1, %2, %3;\n\t" - "xor.b64 %0, %1, t1;\n\t" + asm("{" + ".reg .u64 lt;\n\t" + "xor.b64 lt, %2, %3;\n\t" + "xor.b64 %0, %1, lt;\n\t" "}" : "=l"(result) : "l"(a) ,"l"(b),"l"(c)); return result; diff --git a/util.c b/util.c index 81cc85b..9ccbbdf 100644 --- a/util.c +++ b/util.c @@ -1366,7 +1366,7 @@ void print_hash_tests(void) memset(hash, 0, sizeof hash); wcoinhash(&hash[0], &buf[0]); - printf("\nwhirlc: "); print_hash(hash); + printf("\nwhirl: "); print_hash(hash); memset(hash, 0, sizeof hash); x11hash(&hash[0], &buf[0]); diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu index 97cd1dd..fca1b05 100644 --- a/x11/cuda_x11_aes.cu +++ b/x11/cuda_x11_aes.cu @@ -298,10 +298,11 @@ static void aes_cpu_init() 0, cudaMemcpyHostToDevice); } -static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory) +__device__ __forceinline__ +void aes_gpu_init(uint32_t *sharedMemory) { - if(threadIdx.x < 256) - { + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 256) { sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x]; sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x]; @@ -309,10 +310,13 @@ static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory) } } -static __device__ __forceinline__ void aes_round( +/* tried with 3 xor.b32 asm, not faster */ +#define xor4_32(a,b,c,d) (a ^ b ^ c ^ d); + +__device__ +static void aes_round( const uint32_t *sharedMemory, - uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, - uint32_t k0, + uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) { uint32_t idx0, idx1, idx2, idx3; @@ -321,42 +325,47 @@ static __device__ __forceinline__ void aes_round( idx1 = __byte_perm(x1, 0, 0x4441) + 256; idx2 = __byte_perm(x2, 0, 0x4442) + 512; idx3 = __byte_perm(x3, 0, 0x4443) + 768; - y0 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3] ^ - k0; + y0 = xor4_32( + sharedMemory[idx0], + sharedMemory[idx1], + sharedMemory[idx2], + sharedMemory[idx3]); + y0 ^= k0; idx0 = __byte_perm(x1, 0, 0x4440); idx1 = __byte_perm(x2, 0, 0x4441) + 256; idx2 = __byte_perm(x3, 0, 0x4442) + 512; idx3 = __byte_perm(x0, 0, 0x4443) + 768; - y1 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k3 + y1 = xor4_32( + sharedMemory[idx0], + sharedMemory[idx1], + sharedMemory[idx2], + sharedMemory[idx3]); idx0 = __byte_perm(x2, 0, 0x4440); idx1 = __byte_perm(x3, 0, 0x4441) + 256; idx2 = __byte_perm(x0, 0, 0x4442) + 512; idx3 = __byte_perm(x1, 0, 0x4443) + 768; - y2 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k2 + y2 = xor4_32( + sharedMemory[idx0], + sharedMemory[idx1], + sharedMemory[idx2], + sharedMemory[idx3]); // ^k2 idx0 = __byte_perm(x3, 0, 0x4440); idx1 = __byte_perm(x0, 0, 0x4441) + 256; idx2 = __byte_perm(x1, 0, 0x4442) + 512; idx3 = __byte_perm(x2, 0, 0x4443) + 768; - y3 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k3 + y3 = xor4_32( + sharedMemory[idx0], + sharedMemory[idx1], + sharedMemory[idx2], + sharedMemory[idx3]); // ^k3 } -static __device__ __forceinline__ void aes_round( +__device__ +static void aes_round( const uint32_t *sharedMemory, uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) @@ -367,36 +376,40 @@ static __device__ __forceinline__ void aes_round( idx1 = __byte_perm(x1, 0, 0x4441) + 256; idx2 = __byte_perm(x2, 0, 0x4442) + 512; idx3 = __byte_perm(x3, 0, 0x4443) + 768; - y0 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; + y0 = xor4_32( + sharedMemory[idx0], + sharedMemory[idx1], + sharedMemory[idx2], + sharedMemory[idx3]); idx0 = __byte_perm(x1, 0, 0x4440); idx1 = __byte_perm(x2, 0, 0x4441) + 256; idx2 = __byte_perm(x3, 0, 0x4442) + 512; idx3 = __byte_perm(x0, 0, 0x4443) + 768; - y1 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k3 + y1 = xor4_32( + sharedMemory[idx0], + sharedMemory[idx1], + sharedMemory[idx2], + sharedMemory[idx3]); idx0 = __byte_perm(x2, 0, 0x4440); idx1 = __byte_perm(x3, 0, 0x4441) + 256; idx2 = __byte_perm(x0, 0, 0x4442) + 512; idx3 = __byte_perm(x1, 0, 0x4443) + 768; - y2 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k2 + y2 = xor4_32( + sharedMemory[idx0], + sharedMemory[idx1], + sharedMemory[idx2], + sharedMemory[idx3]); // ^k2 idx0 = __byte_perm(x3, 0, 0x4440); idx1 = __byte_perm(x0, 0, 0x4441) + 256; idx2 = __byte_perm(x1, 0, 0x4442) + 512; idx3 = __byte_perm(x2, 0, 0x4443) + 768; - y3 =sharedMemory[idx0] ^ - sharedMemory[idx1] ^ - sharedMemory[idx2] ^ - sharedMemory[idx3]; // ^k3 + y3 = xor4_32( + sharedMemory[idx0], + sharedMemory[idx1], + sharedMemory[idx2], + sharedMemory[idx3]); // ^k3 }