blake: return to ptarget 6:7 compare
clz can be erroneous, ex 0xE0 vs 0xF0
This commit is contained in:
parent
91eea0d76b
commit
ba33492592
14
blake32.cu
14
blake32.cu
@ -178,7 +178,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co
|
||||
|
||||
__global__
|
||||
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNounce,
|
||||
const uint8_t nClzTarget, const int crcsum, const int rounds)
|
||||
const uint64_t highTarget, const int crcsum, const int rounds)
|
||||
{
|
||||
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
@ -223,12 +223,11 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
|
||||
|
||||
// compare count of leading zeros h[6] + h[7]
|
||||
uint64_t high64 = ((uint64_t*)h)[3];
|
||||
uint32_t clz = cuda_clz64(high64);
|
||||
|
||||
if (clz >= nClzTarget)
|
||||
if (high64 <= highTarget)
|
||||
#if NBN == 2
|
||||
/* keep the smallest nounce, + extra one if found */
|
||||
if (resNounce[0] > nounce) {
|
||||
// printf("%llx %llx \n", high64, highTarget);
|
||||
resNounce[1] = resNounce[0];
|
||||
resNounce[0] = nounce;
|
||||
}
|
||||
@ -241,7 +240,7 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
|
||||
}
|
||||
|
||||
__host__
|
||||
uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint8_t clzTarget,
|
||||
uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget,
|
||||
const uint32_t crcsum, const int8_t rounds)
|
||||
{
|
||||
const int threadsperblock = TPB;
|
||||
@ -255,7 +254,7 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui
|
||||
if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
|
||||
return result;
|
||||
|
||||
blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], clzTarget, crcsum, (int) rounds);
|
||||
blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds);
|
||||
cudaDeviceSynchronize();
|
||||
if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
|
||||
//cudaThreadSynchronize(); /* seems no more required */
|
||||
@ -282,7 +281,6 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
|
||||
static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce);
|
||||
uint64_t targetHigh = ((uint64_t*)ptarget)[3];
|
||||
uint32_t clzTarget = cuda_clz64(targetHigh);
|
||||
uint32_t crcsum = MAXU;
|
||||
int rc = 0;
|
||||
|
||||
@ -318,7 +316,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
|
||||
|
||||
do {
|
||||
// GPU HASH
|
||||
uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], (uint8_t) clzTarget, crcsum, blakerounds);
|
||||
uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], targetHigh, crcsum, blakerounds);
|
||||
if (foundNonce != MAXU)
|
||||
{
|
||||
uint32_t endiandata[20];
|
||||
|
@ -289,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
|
||||
#endif
|
||||
|
||||
// 64-bit ROTATE LEFT
|
||||
#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT
|
||||
#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
|
||||
__device__ __forceinline__
|
||||
uint64_t ROTL64(const uint64_t value, const int offset) {
|
||||
uint2 result;
|
||||
@ -302,7 +302,7 @@ uint64_t ROTL64(const uint64_t value, const int offset) {
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
}
|
||||
#elif __CUDA_ARCH__ >= 120
|
||||
#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
|
||||
__device__ __forceinline__
|
||||
uint64_t ROTL64(const uint64_t x, const int offset)
|
||||
{
|
||||
@ -323,61 +323,4 @@ uint64_t ROTL64(const uint64_t x, const int offset)
|
||||
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||
#endif
|
||||
|
||||
#ifdef WIN32
|
||||
#include <intrin.h>
|
||||
static uint32_t __inline __builtin_clz(uint32_t x) {
|
||||
unsigned long r = 0;
|
||||
_BitScanReverse(&r, x);
|
||||
return (31-r);
|
||||
}
|
||||
static uint32_t __inline __builtin_ctz(uint32_t x) {
|
||||
unsigned long r = 0;
|
||||
_BitScanForward(&r, x);
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* count leading zeros of a 64bit int */
|
||||
#if __CUDA_ARCH__ >= 200
|
||||
__device__
|
||||
static uint32_t cuda_clz64(const uint64_t x)
|
||||
{
|
||||
uint32_t result;
|
||||
asm("clz.b64 %0, %1;\n"
|
||||
: "=r"(result) : "l"(x));
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
/* host */
|
||||
static uint32_t cuda_clz64(const uint64_t x)
|
||||
{
|
||||
uint32_t u32 = (x >> 32);
|
||||
uint32_t result = u32 ? __builtin_clz(u32) : 32;
|
||||
if (result == 32) {
|
||||
u32 = (uint32_t) x;
|
||||
result += (u32 ? __builtin_clz(u32) : 32);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* count trailing zeros of a 32bit int */
|
||||
#if __CUDA_ARCH__ >= 200
|
||||
__device__
|
||||
static uint32_t cuda_ctz32(const uint32_t x)
|
||||
{
|
||||
uint32_t result;
|
||||
asm("brev.b32 %1, %1;\n\t"
|
||||
"clz.b32 %0, %1;\n"
|
||||
: "=r"(result) : "r"(x));
|
||||
return result;
|
||||
}
|
||||
#else
|
||||
/* host */
|
||||
static uint32_t cuda_ctz32(const uint32_t x)
|
||||
{
|
||||
return x ? __builtin_ctz(x) : 32;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // #ifndef CUDA_HELPER_H
|
||||
|
Loading…
Reference in New Issue
Block a user