blake: return to ptarget 6:7 compare
clz can be erroneous, ex 0xE0 vs 0xF0
This commit is contained in:
parent
91eea0d76b
commit
ba33492592
14
blake32.cu
14
blake32.cu
@ -178,7 +178,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co
|
|||||||
|
|
||||||
__global__
|
__global__
|
||||||
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNounce,
|
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNounce,
|
||||||
const uint8_t nClzTarget, const int crcsum, const int rounds)
|
const uint64_t highTarget, const int crcsum, const int rounds)
|
||||||
{
|
{
|
||||||
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
@ -223,12 +223,11 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
|
|||||||
|
|
||||||
// compare count of leading zeros h[6] + h[7]
|
// compare count of leading zeros h[6] + h[7]
|
||||||
uint64_t high64 = ((uint64_t*)h)[3];
|
uint64_t high64 = ((uint64_t*)h)[3];
|
||||||
uint32_t clz = cuda_clz64(high64);
|
if (high64 <= highTarget)
|
||||||
|
|
||||||
if (clz >= nClzTarget)
|
|
||||||
#if NBN == 2
|
#if NBN == 2
|
||||||
/* keep the smallest nounce, + extra one if found */
|
/* keep the smallest nounce, + extra one if found */
|
||||||
if (resNounce[0] > nounce) {
|
if (resNounce[0] > nounce) {
|
||||||
|
// printf("%llx %llx \n", high64, highTarget);
|
||||||
resNounce[1] = resNounce[0];
|
resNounce[1] = resNounce[0];
|
||||||
resNounce[0] = nounce;
|
resNounce[0] = nounce;
|
||||||
}
|
}
|
||||||
@ -241,7 +240,7 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
|
|||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint8_t clzTarget,
|
uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget,
|
||||||
const uint32_t crcsum, const int8_t rounds)
|
const uint32_t crcsum, const int8_t rounds)
|
||||||
{
|
{
|
||||||
const int threadsperblock = TPB;
|
const int threadsperblock = TPB;
|
||||||
@ -255,7 +254,7 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui
|
|||||||
if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
|
if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
|
||||||
return result;
|
return result;
|
||||||
|
|
||||||
blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], clzTarget, crcsum, (int) rounds);
|
blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds);
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
|
if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
|
||||||
//cudaThreadSynchronize(); /* seems no more required */
|
//cudaThreadSynchronize(); /* seems no more required */
|
||||||
@ -282,7 +281,6 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
|
|||||||
static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||||
uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce);
|
uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce);
|
||||||
uint64_t targetHigh = ((uint64_t*)ptarget)[3];
|
uint64_t targetHigh = ((uint64_t*)ptarget)[3];
|
||||||
uint32_t clzTarget = cuda_clz64(targetHigh);
|
|
||||||
uint32_t crcsum = MAXU;
|
uint32_t crcsum = MAXU;
|
||||||
int rc = 0;
|
int rc = 0;
|
||||||
|
|
||||||
@ -318,7 +316,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
|
|||||||
|
|
||||||
do {
|
do {
|
||||||
// GPU HASH
|
// GPU HASH
|
||||||
uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], (uint8_t) clzTarget, crcsum, blakerounds);
|
uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], targetHigh, crcsum, blakerounds);
|
||||||
if (foundNonce != MAXU)
|
if (foundNonce != MAXU)
|
||||||
{
|
{
|
||||||
uint32_t endiandata[20];
|
uint32_t endiandata[20];
|
||||||
|
@ -289,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 64-bit ROTATE LEFT
|
// 64-bit ROTATE LEFT
|
||||||
#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT
|
#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
uint64_t ROTL64(const uint64_t value, const int offset) {
|
uint64_t ROTL64(const uint64_t value, const int offset) {
|
||||||
uint2 result;
|
uint2 result;
|
||||||
@ -302,7 +302,7 @@ uint64_t ROTL64(const uint64_t value, const int offset) {
|
|||||||
}
|
}
|
||||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||||
}
|
}
|
||||||
#elif __CUDA_ARCH__ >= 120
|
#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
uint64_t ROTL64(const uint64_t x, const int offset)
|
uint64_t ROTL64(const uint64_t x, const int offset)
|
||||||
{
|
{
|
||||||
@ -323,61 +323,4 @@ uint64_t ROTL64(const uint64_t x, const int offset)
|
|||||||
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef WIN32
|
|
||||||
#include <intrin.h>
|
|
||||||
static uint32_t __inline __builtin_clz(uint32_t x) {
|
|
||||||
unsigned long r = 0;
|
|
||||||
_BitScanReverse(&r, x);
|
|
||||||
return (31-r);
|
|
||||||
}
|
|
||||||
static uint32_t __inline __builtin_ctz(uint32_t x) {
|
|
||||||
unsigned long r = 0;
|
|
||||||
_BitScanForward(&r, x);
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* count leading zeros of a 64bit int */
|
|
||||||
#if __CUDA_ARCH__ >= 200
|
|
||||||
__device__
|
|
||||||
static uint32_t cuda_clz64(const uint64_t x)
|
|
||||||
{
|
|
||||||
uint32_t result;
|
|
||||||
asm("clz.b64 %0, %1;\n"
|
|
||||||
: "=r"(result) : "l"(x));
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
/* host */
|
|
||||||
static uint32_t cuda_clz64(const uint64_t x)
|
|
||||||
{
|
|
||||||
uint32_t u32 = (x >> 32);
|
|
||||||
uint32_t result = u32 ? __builtin_clz(u32) : 32;
|
|
||||||
if (result == 32) {
|
|
||||||
u32 = (uint32_t) x;
|
|
||||||
result += (u32 ? __builtin_clz(u32) : 32);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* count trailing zeros of a 32bit int */
|
|
||||||
#if __CUDA_ARCH__ >= 200
|
|
||||||
__device__
|
|
||||||
static uint32_t cuda_ctz32(const uint32_t x)
|
|
||||||
{
|
|
||||||
uint32_t result;
|
|
||||||
asm("brev.b32 %1, %1;\n\t"
|
|
||||||
"clz.b32 %0, %1;\n"
|
|
||||||
: "=r"(result) : "r"(x));
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
/* host */
|
|
||||||
static uint32_t cuda_ctz32(const uint32_t x)
|
|
||||||
{
|
|
||||||
return x ? __builtin_ctz(x) : 32;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif // #ifndef CUDA_HELPER_H
|
#endif // #ifndef CUDA_HELPER_H
|
||||||
|
Loading…
x
Reference in New Issue
Block a user