From 91eea0d76b425cd2631b07c8344bdef06cc61d10 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 13 Sep 2014 13:22:14 +0200 Subject: [PATCH] blake: remove int cudaMemcpyToSymbol for MSVC use clz (leading zeros) asm func for a fast gpu compare of ptarget[6]:[7] add also missing windows ctz/clz host functions New NEOS speed: 227MH to 270MH (Gigabyte 750Ti Black Edition) --- blake32.cu | 53 +++++++++++++++++++++++------------------ cuda_helper.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 93 insertions(+), 26 deletions(-) diff --git a/blake32.cu b/blake32.cu index fd2c7c5..5f4043b 100644 --- a/blake32.cu +++ b/blake32.cu @@ -47,10 +47,6 @@ extern int device_map[8]; __constant__ static uint32_t __align__(32) c_data[20]; -// only store the 2 high uint32 of the target hash -__constant__ static uint64_t c_Target; -__constant__ static int8_t c_BlakeRounds; - /* 8 adapters max (-t threads) */ static uint32_t *d_resNonce[8]; static uint32_t *h_resNonce[8]; @@ -132,7 +128,7 @@ static const uint32_t __align__(32) c_Padding[16] = { }; __device__ static -void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0) +void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, const int rounds) { uint32_t /* __align__(8) */ m[16]; uint32_t /* __align__(8) */ v[16]; @@ -160,7 +156,6 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0) v[14] = c_u256[6]; v[15] = c_u256[7]; - int rounds = c_BlakeRounds; for (int i = 0; i < rounds; i++) { /* column step */ GS(0, 4, 0x8, 0xC, 0x0); @@ -176,18 +171,19 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0) //#pragma unroll 16 for (uint32_t i = 0; i < 16; i++) { - uint32_t j = i % 8; + uint32_t j = i % 8U; h[j] ^= v[i]; } } __global__ -void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, const int crcsum) +void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, 
uint32_t *resNounce, + const uint8_t nClzTarget, const int crcsum, const int rounds) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - const uint32_t nounce = startNounce + thread; + const uint32_t nounce = startNonce + thread; uint32_t h[8]; #pragma unroll @@ -200,7 +196,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN #else if (crcsum != prevsum) { prevsum = crcsum; - blake256_compress(h, c_data, 512); + blake256_compress(h, c_data, 512, rounds); #pragma unroll for(int i=0; i<8; i++) { cache[i] = h[i]; @@ -220,10 +216,16 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN ending[2] = c_data[18]; ending[3] = nounce; /* our tested value */ - blake256_compress(h, ending, 640); + blake256_compress(h, ending, 640, rounds); + + // not sure why, h[7] is ok + h[6] = cuda_swab32(h[6]); + + // compare count of leading zeros h[6] + h[7] + uint64_t high64 = ((uint64_t*)h)[3]; + uint32_t clz = cuda_clz64(high64); - /* do not test all parts, fulltest() will do it */ - if (((uint64_t*)h)[3] <= c_Target) + if (clz >= nClzTarget) #if NBN == 2 /* keep the smallest nounce, + extra one if found */ if (resNounce[0] > nounce) { @@ -239,7 +241,8 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN } __host__ -uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t crcsum) +uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint8_t clzTarget, + const uint32_t crcsum, const int8_t rounds) { const int threadsperblock = TPB; uint32_t result = MAXU; @@ -252,7 +255,7 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) return result; - blake256_gpu_hash_80<<>>(threads, startNounce, d_resNonce[thr_id], crcsum); + blake256_gpu_hash_80<<>>(threads, 
startNonce, d_resNonce[thr_id], clzTarget, crcsum, (int) rounds); cudaDeviceSynchronize(); if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { //cudaThreadSynchronize(); /* seems no more required */ @@ -264,14 +267,12 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce } __host__ -void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget, int8_t blakerounds) +void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) { uint32_t data[20]; memcpy(data, pdata, 80); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, &ptarget[6], 2*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_BlakeRounds, &blakerounds, sizeof(int8_t), 0, cudaMemcpyHostToDevice)); } extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, @@ -280,6 +281,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt const uint32_t first_nonce = pdata[19]; static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce); + uint64_t targetHigh = ((uint64_t*)ptarget)[3]; + uint32_t clzTarget = cuda_clz64(targetHigh); uint32_t crcsum = MAXU; int rc = 0; @@ -308,19 +311,19 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt init[thr_id] = true; } - blake256_cpu_setBlock_80(pdata, ptarget, blakerounds); + blake256_cpu_setBlock_80(pdata, ptarget); #if USE_CACHE crcsum = crc32_u32t(pdata, 64); #endif do { // GPU HASH - uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], crcsum); + uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], (uint8_t) clzTarget, crcsum, blakerounds); if (foundNonce != 
MAXU) { uint32_t endiandata[20]; uint32_t vhashcpu[8]; - uint32_t Htarg = ptarget[7]; + uint32_t Htarg = ptarget[6]; for (int k=0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); @@ -329,7 +332,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt blake256hash(vhashcpu, endiandata, blakerounds); - if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) + if (vhashcpu[6] <= Htarg || cuda_swab32(vhashcpu[6]) <= Htarg /*&& fulltest(vhashcpu, ptarget)*/) { pdata[19] = foundNonce; rc = 1; @@ -338,7 +341,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt // Rare but possible if the throughput is big be32enc(&endiandata[19], extra_results[0]); blake256hash(vhashcpu, endiandata, blakerounds); - if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { + if (vhashcpu[6] <= Htarg /* && fulltest(vhashcpu, ptarget) */) { applog(LOG_NOTICE, "GPU found more than one result " CL_GRN "yippee!"); rc = 2; } else { @@ -346,9 +349,13 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt } } + //applog_hash((uint8_t*)ptarget); + //applog_compare_hash((uint8_t*)vhashcpu,(uint8_t*)ptarget); goto exit_scan; } else if (opt_debug) { + applog_hash((uint8_t*)ptarget); + applog_compare_hash((uint8_t*)vhashcpu,(uint8_t*)ptarget); applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce); } } diff --git a/cuda_helper.h b/cuda_helper.h index 9e10968..56b80a7 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -248,9 +248,12 @@ uint64_t shl_t64(uint64_t x, uint32_t n) return result; } +#ifndef USE_ROT_ASM_OPT +#define USE_ROT_ASM_OPT 1 +#endif // 64-bit ROTATE RIGHT -#if __CUDA_ARCH__ >= 350 +#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1 /* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ __device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) { @@ -264,7 +267,7 @@ uint64_t ROTR64(const uint64_t value, const 
int offset) {
 	}
 	return __double_as_longlong(__hiloint2double(result.y, result.x));
 }
-#elif __CUDA_ARCH__ >= 120
+#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
 __device__ __forceinline__
 uint64_t ROTR64(const uint64_t x, const int offset)
 {
@@ -286,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
 #endif
 
 // 64-bit ROTATE LEFT
-#if __CUDA_ARCH__ >= 350
+#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT
 __device__ __forceinline__
 uint64_t ROTL64(const uint64_t value, const int offset) {
 	uint2 result;
@@ -320,4 +323,61 @@ uint64_t ROTL64(const uint64_t x, const int offset)
 #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
 #endif
 
+#ifdef WIN32 /* MSVC has no GCC-style clz/ctz builtins: emulate them with the bit-scan intrinsics */
+#include <intrin.h>
+static uint32_t __inline __builtin_clz(uint32_t x) {
+	unsigned long r = 0;
+	_BitScanReverse(&r, x); /* r = index of the highest set bit; only meaningful when x != 0 */
+	return (31-r);
+}
+static uint32_t __inline __builtin_ctz(uint32_t x) {
+	unsigned long r = 0;
+	_BitScanForward(&r, x); /* r = index of the lowest set bit; only meaningful when x != 0 */
+	return r;
+}
+#endif
+
+/* count leading zeros of a 64bit int (yields 64 when x == 0) */
+#if __CUDA_ARCH__ >= 200
+__device__
+static uint32_t cuda_clz64(const uint64_t x)
+{
+	uint32_t result;
+	asm("clz.b64 %0, %1;\n" /* 64-bit source, 32-bit count as destination */
+		: "=r"(result) : "l"(x));
+	return result;
+}
+#else
+/* fallback when __CUDA_ARCH__ is undefined or < 200 (host pass): high 32-bit word first, then the low one */
+static uint32_t cuda_clz64(const uint64_t x)
+{
+	uint32_t u32 = (x >> 32);
+	uint32_t result = u32 ? __builtin_clz(u32) : 32;
+	if (result == 32) {
+		u32 = (uint32_t) x;
+		result += (u32 ? __builtin_clz(u32) : 32);
+	}
+	return result;
+}
+#endif
+
+/* count trailing zeros of a 32bit int (yields 32 when x == 0) */
+#if __CUDA_ARCH__ >= 200
+__device__
+static uint32_t cuda_ctz32(const uint32_t x)
+{
+	uint32_t result;
+	asm("brev.b32 %0, %1;\n\t" /* bit-reverse x into the OUTPUT register; the input operand must never be written */
+		"clz.b32 %0, %0;\n" /* leading zeros of the reversed value == trailing zeros of x */
+		: "=r"(result) : "r"(x));
+	return result;
+}
+#else
+/* fallback when __CUDA_ARCH__ is undefined or < 200 (host pass); the zero check avoids calling the builtin with 0 */
+static uint32_t cuda_ctz32(const uint32_t x)
+{
+	return x ? __builtin_ctz(x) : 32;
+}
+#endif
+
 #endif // #ifndef CUDA_HELPER_H