Browse Source

blake: return to ptarget 6:7 compare

a leading-zero-count (clz) comparison can be erroneous, e.g. 0xE0 vs 0xF0 have the same clz yet different values
master
Tanguy Pruvot 10 years ago
parent
commit
ba33492592
  1. 14
      blake32.cu
  2. 61
      cuda_helper.h

14
blake32.cu

@@ -178,7 +178,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, co
__global__
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNounce,
const uint8_t nClzTarget, const int crcsum, const int rounds)
const uint64_t highTarget, const int crcsum, const int rounds)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@@ -223,12 +223,11 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
// compare count of leading zeros h[6] + h[7]
uint64_t high64 = ((uint64_t*)h)[3];
uint32_t clz = cuda_clz64(high64);
if (clz >= nClzTarget)
if (high64 <= highTarget)
#if NBN == 2
/* keep the smallest nounce, + extra one if found */
if (resNounce[0] > nounce) {
// printf("%llx %llx \n", high64, highTarget);
resNounce[1] = resNounce[0];
resNounce[0] = nounce;
}
@@ -241,7 +240,7 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
}
__host__
uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint8_t clzTarget,
uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget,
const uint32_t crcsum, const int8_t rounds)
{
const int threadsperblock = TPB;
@@ -255,7 +254,7 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui
if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
return result;
blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], clzTarget, crcsum, (int) rounds);
blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds);
cudaDeviceSynchronize();
if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
//cudaThreadSynchronize(); /* seems no more required */
@@ -282,7 +281,6 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce);
uint64_t targetHigh = ((uint64_t*)ptarget)[3];
uint32_t clzTarget = cuda_clz64(targetHigh);
uint32_t crcsum = MAXU;
int rc = 0;
@@ -318,7 +316,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
do {
// GPU HASH
uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], (uint8_t) clzTarget, crcsum, blakerounds);
uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], targetHigh, crcsum, blakerounds);
if (foundNonce != MAXU)
{
uint32_t endiandata[20];

61
cuda_helper.h

@@ -289,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
#endif
// 64-bit ROTATE LEFT
#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT
#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
__device__ __forceinline__
uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
@@ -302,7 +302,7 @@ uint64_t ROTL64(const uint64_t value, const int offset) {
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#elif __CUDA_ARCH__ >= 120
#elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
__device__ __forceinline__
uint64_t ROTL64(const uint64_t x, const int offset)
{
@@ -323,61 +323,4 @@ uint64_t ROTL64(const uint64_t x, const int offset)
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#ifdef WIN32
#include <intrin.h>
/* MSVC has no __builtin_clz/__builtin_ctz: emulate them with the
 * _BitScanReverse/_BitScanForward intrinsics.
 * Fix: per the MSVC docs, _BitScan* return 0 and leave the index
 * UNDEFINED when x == 0; the previous code ignored the return value,
 * so x == 0 produced a bogus result (31 / 0). Return 32 ("no set bit")
 * instead, matching the callers' x == 0 convention in this header. */
static uint32_t __inline __builtin_clz(uint32_t x) {
	unsigned long r;
	if (!_BitScanReverse(&r, x))
		return 32; /* x == 0: all 32 bits are "leading zeros" */
	return (31 - r);
}
static uint32_t __inline __builtin_ctz(uint32_t x) {
	unsigned long r;
	if (!_BitScanForward(&r, x))
		return 32; /* x == 0: no trailing set bit */
	return r;
}
#endif
/* count leading zeros of a 64bit int */
#if __CUDA_ARCH__ >= 200
/* device: single PTX clz.b64 instruction */
__device__
static uint32_t cuda_clz64(const uint64_t x)
{
	uint32_t result;
	asm("clz.b64 %0, %1;\n"
		: "=r"(result) : "l"(x));
	return result;
}
#else
/* host fallback: count in the high 32-bit word first, then the low one.
 * Returns 64 when x == 0 (never calls __builtin_clz(0), which is undefined). */
static uint32_t cuda_clz64(const uint64_t x)
{
	const uint32_t hi = (uint32_t)(x >> 32);
	if (hi)
		return __builtin_clz(hi);
	const uint32_t lo = (uint32_t)x;
	return 32U + (lo ? __builtin_clz(lo) : 32U);
}
#endif
/* count trailing zeros of a 32bit int */
#if __CUDA_ARCH__ >= 200
/* device: bit-reverse then count leading zeros (ctz(x) == clz(brev(x))).
 * Returns 32 when x == 0 (brev(0) == 0, clz(0) == 32). */
__device__
static uint32_t cuda_ctz32(const uint32_t x)
{
	uint32_t result;
	uint32_t v = x; /* scratch copy: the asm overwrites operand %1 */
	/* Fix: the previous code wrote to a read-only input operand ("r"(x)),
	 * which is undefined behavior in inline asm; use a "+r" read-write
	 * operand on a local copy instead. */
	asm("brev.b32 %1, %1;\n\t"
		"clz.b32 %0, %1;\n"
		: "=r"(result), "+r"(v));
	return result;
}
#else
/* host fallback; returns 32 when x == 0 (__builtin_ctz(0) is undefined) */
static uint32_t cuda_ctz32(const uint32_t x)
{
	return x ? __builtin_ctz(x) : 32;
}
#endif
#endif // #ifndef CUDA_HELPER_H

Loading…
Cancel
Save