From 10314d844f65ee0db42922b5ebd5f5c484711957 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 24 Aug 2014 04:09:20 +0200 Subject: [PATCH] whirlpool: remove dead code, win 2ms in final hash --- Makefile.am | 2 +- ccminer.vcxproj | 4 +- ccminer.vcxproj.filters | 20 ++++----- x15/cuda_x15_whirlpool.cu | 65 ++++++++++++------------------ x15/{whirlcoin.cu => whirlpool.cu} | 21 +++++----- 5 files changed, 50 insertions(+), 62 deletions(-) rename x15/{whirlcoin.cu => whirlpool.cu} (86%) diff --git a/Makefile.am b/Makefile.am index ee86bf4..5e539a9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -40,7 +40,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ - x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu x15/whirlcoin.cu \ + x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu x15/whirlpool.cu \ x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 97b25fe..389bf5b 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -523,7 +523,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" 64 80 - + --ptxas-options=-O2 %(AdditionalOptions) %(AdditionalOptions) 64 @@ -548,4 +548,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" - \ No newline at end of file + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index ecbecb3..cee7bbd 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -298,6 +298,12 @@ Source Files\CUDA + + Source Files\CUDA + + + Source Files\CUDA + Source Files\CUDA\JHA @@ -406,19 +412,13 @@ Source Files\CUDA\x15 - + Source Files\CUDA\x15 - + Source Files\CUDA\x15 - - Source Files\CUDA - - - Source Files\CUDA - - + Source Files\CUDA\x15 @@ -431,4 +431,4 @@ Source Files\CUDA\x17 - \ No newline at end of file + diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu index d12c851..0bdd4c3 100644 --- a/x15/cuda_x15_whirlpool.cu +++ b/x15/cuda_x15_whirlpool.cu @@ -2214,7 +2214,7 @@ static uint64_t table_skew(uint64_t val, int num) { } __device__ __forceinline__ -static uint64_t ROUND_ELT(const uint64_t* __restrict__ sharedMemory, uint64_t in[8], +static uint64_t ROUND_ELT(const uint64_t* sharedMemory, uint64_t* __restrict__ in, int i0,int i1,int i2,int i3,int i4,int i5,int i6,int i7) { uint32_t idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7; @@ -2242,7 +2242,7 @@ static uint64_t ROUND_ELT(const uint64_t* __restrict__ sharedMemory, uint64_t in #else __device__ __forceinline__ -static uint64_t ROUND_ELT(const uint64_t* __restrict__ sharedMemory, uint64_t in[8], +static uint64_t ROUND_ELT(const uint64_t* sharedMemory, uint64_t* __restrict__ in, int i0,int i1,int i2,int i3,int i4,int i5,int i6,int i7) { uint32_t idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7; @@ -2317,7 +2317,6 @@ void oldwhirlpool_gpu_hash_80(int threads, uint32_t startNounce, void *outputHas for (int i=0; i<8; i++) { n[i] = c_PaddedMessage80[i]; // read data h[i] = 0; // read state - //n[i] = xor1(n[i], h[i]); } #pragma unroll 10 @@ -2330,7 +2329,6 @@ void oldwhirlpool_gpu_hash_80(int threads, uint32_t startNounce, void *outputHas #pragma unroll 8 for (int i=0; i < 8; i++) { state[i] = xor1(n[i],c_PaddedMessage80[i]); - n[i]=0; } /// round 2 /////// @@ -2338,10 +2336,14 @@ void oldwhirlpool_gpu_hash_80(int threads, uint32_t startNounce, void *outputHas n[0] = c_PaddedMessage80[8]; //read data n[1] = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)); //whirlpool n[2] = 0x0000000000000080; //whirlpool + n[3] = 0; + n[4] = 0; + n[5] = 0; + n[6] = 0; n[7] = 0x8002000000000000; #pragma unroll 8 - for (int i=0;i<8;i++) { + for (int i=0; i<8; i++) { h[i] = state[i]; //read state n[i] = xor1(n[i],h[i]); } @@ -2396,15 +2398,14 @@ void x15_whirlpool_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_ha int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); uint32_t hashPosition = (nounce - startNounce) << 3; uint64_t hash[8], state[8], n[8], h[8] = { 0 }; uint8_t i; #pragma unroll 8 - for (i=0; i<8; i++) { + for (i=0; i<8; i++) n[i] = hash[i] = g_hash[hashPosition + i]; - } #pragma unroll 10 for (i=0; i < 10; i++) { @@ -2414,10 +2415,12 @@ void x15_whirlpool_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_ha } #pragma unroll 8 - for (i=0; i<8; i++) { + for (i=0; i<8; i++) state[i] = xor1(n[i], hash[i]); + + #pragma unroll 6 + for (i=1; i<7; i++) n[i]=0; - } n[0] = 0x80; n[7] = 0x2000000000000; @@ -2472,18 +2475,15 @@ void oldwhirlpool_gpu_finalhash_64(int threads, uint32_t startNounce, uint64_t * int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); int hashPosition = nounce - startNounce; - uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; - union { - uint32_t h4[16]; - uint64_t h8[8]; - } hash; + uint64_t *inpHash = (uint64_t*) &g_hash[8 * hashPosition]; + uint64_t h8[8]; - #pragma unroll 16 - for (int i=0; i<16; i++) { - hash.h4[i]= inpHash[i]; + #pragma unroll 8 + for (int i=0; i<8; i++) { + h8[i] = inpHash[i]; } uint64_t state[8]; @@ -2492,9 +2492,8 @@ void oldwhirlpool_gpu_finalhash_64(int threads, uint32_t startNounce, uint64_t * #pragma unroll 8 for (int i=0; i<8; i++) { - n[i] = hash.h8[i]; + n[i] = h8[i]; h[i] = 0; - n[i] = xor1(n[i], h[i]); } #pragma unroll 10 @@ -2506,7 +2505,7 @@ void oldwhirlpool_gpu_finalhash_64(int threads, uint32_t startNounce, uint64_t * #pragma unroll 8 for (int i=0; i<8; i++) { - state[i] = xor1(n[i], hash.h8[i]); + state[i] = xor1(n[i], h8[i]); n[i]=0; } @@ -2516,7 +2515,7 @@ void oldwhirlpool_gpu_finalhash_64(int threads, uint32_t startNounce, uint64_t * #pragma unroll 8 for (int i=0; i<8; i++) { h[i] = state[i]; - n[i] = xor1(n[i],h[i]); + n[i] = xor1(n[i], h[i]); } #pragma unroll 10 @@ -2535,22 +2534,7 @@ void oldwhirlpool_gpu_finalhash_64(int threads, uint32_t startNounce, uint64_t * state[6] = xor1(state[6], n[6]); state[7] = xor3(state[7], n[7], 0x2000000000000); - #pragma unroll 8 - for (unsigned i = 0; i < 8; i++) - hash.h8[i] = state[i]; - - bool rc = true; - for (int i = 7; i >= 0; i--) { - if (hash.h4[i] > pTarget[i]) { - rc = false; - break; - } - if (hash.h4[i] < pTarget[i]) { - rc = true; - break; - } - } - + bool rc = (state[3] <= ((uint64_t*)pTarget)[3]); if (rc && resNounce[0] > nounce) resNounce[0] = nounce; } @@ -2574,7 +2558,7 @@ extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode) #endif break; - case 1: /* old (whirlcoin?) */ + case 1: /* old whirlpool */ cudaMemcpyToSymbol(InitVector_RC, old1_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(mixTob0Tox, old1_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(mixTob1Tox, old1_T1, (256*8), 0, cudaMemcpyHostToDevice); @@ -2653,6 +2637,7 @@ void whirlpool512_setBlock_80(void *pdata, const void *ptarget) unsigned char PaddedMessage[128]; memcpy(PaddedMessage, pdata, 80); memset(PaddedMessage+80, 0, 48); + PaddedMessage[80] = 0x80; /* ending */ cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } diff --git a/x15/whirlcoin.cu b/x15/whirlpool.cu similarity index 86% rename from x15/whirlcoin.cu rename to x15/whirlpool.cu index 975eca8..eeb50e7 100644 --- a/x15/whirlcoin.cu +++ b/x15/whirlpool.cu @@ -29,24 +29,27 @@ extern "C" void wcoinhash(void *state, const void *input) { sph_whirlpool_context ctx_whirlpool; - uint32_t hash[16]; + unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + #define hashB hash+64 + + memset(hash, 0, sizeof hash); // shavite 1 sph_whirlpool1_init(&ctx_whirlpool); sph_whirlpool1(&ctx_whirlpool, input, 80); - sph_whirlpool1_close(&ctx_whirlpool, (void*) hash); + sph_whirlpool1_close(&ctx_whirlpool, hash); sph_whirlpool1_init(&ctx_whirlpool); - sph_whirlpool1(&ctx_whirlpool, (const void*) hash, 64); - sph_whirlpool1_close(&ctx_whirlpool, (void*) hash); + sph_whirlpool1(&ctx_whirlpool, hash, 64); + sph_whirlpool1_close(&ctx_whirlpool, hashB); sph_whirlpool1_init(&ctx_whirlpool); - sph_whirlpool1(&ctx_whirlpool, (const void*) hash, 64); - sph_whirlpool1_close(&ctx_whirlpool, (void*) hash); + sph_whirlpool1(&ctx_whirlpool, hashB, 64); + sph_whirlpool1_close(&ctx_whirlpool, hash); sph_whirlpool1_init(&ctx_whirlpool); - sph_whirlpool1(&ctx_whirlpool, (const void*) hash, 64); - sph_whirlpool1_close(&ctx_whirlpool, (void*) hash); + sph_whirlpool1(&ctx_whirlpool, hash, 64); + sph_whirlpool1_close(&ctx_whirlpool, hash); memcpy(state, hash, 32); } @@ -68,7 +71,7 @@ extern "C" int scanhash_whc(int thr_id, uint32_t *pdata, cudaSetDevice(device_map[thr_id]); // Konstanten kopieren, Speicher belegen cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); - x15_whirlpool_cpu_init(thr_id, throughput,1); + x15_whirlpool_cpu_init(thr_id, throughput, 1 /* old whirlpool */); init[thr_id] = true; }