From ef8a73d6aab4cde96f686a1604550b76c849a3da Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 6 Dec 2014 13:08:16 +0100 Subject: [PATCH] keccak: not compatible with second nonces (was broken) Use djm34 new uint2 method to get a +40% boost (115 to 153MH/s) --- Algo256/cuda_keccak256.cu | 67 +++++++++++++++++++++------------------ Algo256/keccak256.cu | 15 +++------ Makefile.am | 5 +-- ccminer.cpp | 12 +++++-- 4 files changed, 51 insertions(+), 48 deletions(-) diff --git a/Algo256/cuda_keccak256.cu b/Algo256/cuda_keccak256.cu index 8b3546e..7437622 100644 --- a/Algo256/cuda_keccak256.cu +++ b/Algo256/cuda_keccak256.cu @@ -7,6 +7,12 @@ extern "C" { #include "cuda_helper.h" +#ifdef _MSC_VER +#define UINT2(x,y) { x, y } +#else +#define UINT2(x,y) (uint2) { x, y } +#endif + static const uint64_t host_keccak_round_constants[24] = { 0x0000000000000001ull, 0x0000000000008082ull, 0x800000000000808aull, 0x8000000080008000ull, @@ -97,7 +103,7 @@ static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants) s[0] ^= vectorize(keccak_round_constants[i]); } } -#endif +#else __device__ __forceinline__ static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants) @@ -167,43 +173,52 @@ static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants) s[0] ^= keccak_round_constants[i]; } } +#endif -__global__ +__global__ __launch_bounds__(128,5) void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint32_t nounce = startNounce + thread; - uint64_t keccak_gpu_state[25]; - //#pragma unroll 25 +#if __CUDA_ARCH__ >= 350 + uint2 keccak_gpu_state[25]; + #pragma unroll 25 for (int i=0; i<25; i++) { - if (i < 9) - keccak_gpu_state[i] = c_PaddedMessage80[i]; - else - keccak_gpu_state[i] = 0; + if (i<9) keccak_gpu_state[i] = vectorize(c_PaddedMessage80[i]); + else keccak_gpu_state[i] = UINT2(0, 0); + } + + keccak_gpu_state[9]= vectorize(c_PaddedMessage80[9]); + keccak_gpu_state[9].y = cuda_swab32(nounce); + keccak_gpu_state[10] = UINT2(1, 0); + keccak_gpu_state[16] = UINT2(0, 0x80000000); + + keccak_blockv35(keccak_gpu_state,keccak_round_constants); + if (devectorize(keccak_gpu_state[3]) <= ((uint64_t*)pTarget)[3]) {resNounce[0] = nounce;} +#else + uint64_t keccak_gpu_state[25]; + #pragma unroll 25 + for (int i=0; i<25; i++) { + if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i]; + else keccak_gpu_state[i] = 0; } keccak_gpu_state[9] = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)); keccak_gpu_state[10] = 0x0000000000000001; keccak_gpu_state[16] = 0x8000000000000000; keccak_blockv30(keccak_gpu_state, keccak_round_constants); - - bool rc = false; - if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) {rc = true;} - - if (rc == true) { - if(resNounce[0] > nounce) - resNounce[0] = nounce; - } + if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) { resNounce[0] = nounce; } +#endif } } __host__ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) { - uint32_t result = 0xffffffff; + uint32_t result = UINT32_MAX; cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t)); const int threadsperblock = 128; @@ -222,12 +237,6 @@ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, ui return result; } -#ifdef _MSC_VER -#define UINT2(a, b) { a, b } -#else -#define UINT2(a, b) (uint2) { a, b } -#endif - __global__ __launch_bounds__(256,3) void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash) { @@ -238,18 +247,16 @@ void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHa uint2 keccak_gpu_state[25]; #pragma unroll 25 for (int i = 0; i<25; i++) { - if (i < 4) - keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]); - else - keccak_gpu_state[i] = UINT2(0, 0); + if (i<4) keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]); + else keccak_gpu_state[i] = UINT2(0, 0); } keccak_gpu_state[4] = UINT2(1, 0); keccak_gpu_state[16] = UINT2(0, 0x80000000); keccak_blockv35(keccak_gpu_state, keccak_round_constants); #pragma unroll 4 - for (int i=0; i<4;i++) - outputHash[i*threads+thread]=devectorize(keccak_gpu_state[i]); + for (int i=0; i<4; i++) + outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]); #else uint64_t keccak_gpu_state[25]; #pragma unroll 25 @@ -259,7 +266,7 @@ void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHa else keccak_gpu_state[i] = 0; } - keccak_gpu_state[4] = 0x0000000000000001; + keccak_gpu_state[4] = 0x0000000000000001; keccak_gpu_state[16] = 0x8000000000000000; keccak_blockv30(keccak_gpu_state, keccak_round_constants); diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu index f1f1200..ac6f4d0 100644 --- a/Algo256/keccak256.cu +++ b/Algo256/keccak256.cu @@ -41,13 +41,12 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; + int throughput = opt_work_size ? opt_work_size : (1 << 21); // 256*256*8*4 + throughput = min(throughput, (int)(max_nonce - first_nonce)); if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0005; - int throughput = opt_work_size ? opt_work_size : (1 << 21); // 256*256*8*4 - throughput = min(throughput, (int)(max_nonce - first_nonce)); - if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); @@ -75,15 +74,9 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata, keccak256_hash(vhash64, endiandata); if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { - int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); - *hashes_done = pdata[19] - first_nonce + throughput; - if (secNonce != 0) { - pdata[21] = secNonce; - res++; - } + *hashes_done = foundNonce - first_nonce + 1; pdata[19] = foundNonce; - return res; + return 1; } else { applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce); diff --git a/Makefile.am b/Makefile.am index 168ac14..9044fce 100644 --- a/Makefile.am +++ b/Makefile.am @@ -79,15 +79,12 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v" .cu.o: $(NVCC) $(nvcc_FLAGS) --maxrregcount=128 -o $@ -c $< -blake32.o: blake32.cu +Algo256/blake256.o: Algo256/blake256.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< -keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu - $(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $< - qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< diff --git a/ccminer.cpp b/ccminer.cpp index 4afbcb2..4cc0d46 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1100,9 +1100,11 @@ static void *miner_thread(void *userdata) case ALGO_BLAKE: minmax = 0x80000000U; break; + case ALGO_KECCAK: + minmax = 0x40000000U; + break; case ALGO_DOOM: case ALGO_JACKPOT: - case ALGO_KECCAK: case ALGO_LUFFA_DOOM: minmax = 0x2000000; break; @@ -1157,6 +1159,9 @@ static void *miner_thread(void *userdata) } } #endif + if (opt_algo == ALGO_KECCAK && max64 == UINT32_MAX) { + max64 = 0x7FFFFFFFUL; + } /* never let small ranges at end */ if (end_nonce >= UINT32_MAX - 256) end_nonce = UINT32_MAX; @@ -1406,9 +1411,10 @@ out: static void restart_threads(void) { - int i; + if (opt_debug) + applog(LOG_DEBUG,"%s", __FUNCTION__); - for (i = 0; i < opt_n_threads; i++) + for (int i = 0; i < opt_n_threads; i++) work_restart[i].restart = 1; }