diff --git a/blake32.cu b/blake32.cu index 5f4043b..025a2af 100644 --- a/blake32.cu +++ b/blake32.cu @@ -178,7 +178,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co __global__ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNounce, - const uint8_t nClzTarget, const int crcsum, const int rounds) + const uint64_t highTarget, const int crcsum, const int rounds) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -223,12 +223,11 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin // compare count of leading zeros h[6] + h[7] uint64_t high64 = ((uint64_t*)h)[3]; - uint32_t clz = cuda_clz64(high64); - - if (clz >= nClzTarget) + if (high64 <= highTarget) #if NBN == 2 /* keep the smallest nounce, + extra one if found */ if (resNounce[0] > nounce) { + // printf("%llx %llx \n", high64, highTarget); resNounce[1] = resNounce[0]; resNounce[0] = nounce; } @@ -241,7 +240,7 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin } __host__ -uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint8_t clzTarget, +uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget, const uint32_t crcsum, const int8_t rounds) { const int threadsperblock = TPB; @@ -255,7 +254,7 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) return result; - blake256_gpu_hash_80<<>>(threads, startNonce, d_resNonce[thr_id], clzTarget, crcsum, (int) rounds); + blake256_gpu_hash_80<<>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds); cudaDeviceSynchronize(); if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { //cudaThreadSynchronize(); /* seems no more required */ @@ -282,7 +281,6 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce); uint64_t targetHigh = ((uint64_t*)ptarget)[3]; - uint32_t clzTarget = cuda_clz64(targetHigh); uint32_t crcsum = MAXU; int rc = 0; @@ -318,7 +316,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt do { // GPU HASH - uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], (uint8_t) clzTarget, crcsum, blakerounds); + uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], targetHigh, crcsum, blakerounds); if (foundNonce != MAXU) { uint32_t endiandata[20]; diff --git a/cuda_helper.h b/cuda_helper.h index 56b80a7..4755d8a 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -289,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset) #endif // 64-bit ROTATE LEFT -#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT +#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1 __device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) { uint2 result; @@ -302,7 +302,7 @@ uint64_t ROTL64(const uint64_t value, const int offset) { } return __double_as_longlong(__hiloint2double(result.y, result.x)); } -#elif __CUDA_ARCH__ >= 120 +#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2 __device__ __forceinline__ uint64_t ROTL64(const uint64_t x, const int offset) { @@ -323,61 +323,4 @@ uint64_t ROTL64(const uint64_t x, const int offset) #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) #endif -#ifdef WIN32 -#include -static uint32_t __inline __builtin_clz(uint32_t x) { - unsigned long r = 0; - _BitScanReverse(&r, x); - return (31-r); -} -static uint32_t __inline __builtin_ctz(uint32_t x) { - unsigned long r = 0; - _BitScanForward(&r, x); - return r; -} -#endif - -/* count leading zeros of a 64bit int */ -#if __CUDA_ARCH__ >= 200 -__device__ -static uint32_t cuda_clz64(const uint64_t x) -{ - uint32_t result; - asm("clz.b64 %0, %1;\n" - : "=r"(result) : "l"(x)); - return result; -} -#else -/* host */ -static uint32_t cuda_clz64(const uint64_t x) -{ - uint32_t u32 = (x >> 32); - uint32_t result = u32 ? __builtin_clz(u32) : 32; - if (result == 32) { - u32 = (uint32_t) x; - result += (u32 ? __builtin_clz(u32) : 32); - } - return result; -} -#endif - -/* count trailing zeros of a 32bit int */ -#if __CUDA_ARCH__ >= 200 -__device__ -static uint32_t cuda_ctz32(const uint32_t x) -{ - uint32_t result; - asm("brev.b32 %1, %1;\n\t" - "clz.b32 %0, %1;\n" - : "=r"(result) : "r"(x)); - return result; -} -#else -/* host */ -static uint32_t cuda_ctz32(const uint32_t x) -{ - return x ? __builtin_ctz(x) : 32; -} -#endif - #endif // #ifndef CUDA_HELPER_H