blake: remove int cudaMemcpyToSymbol for MSVC

use the clz (count leading zeros) asm function for a fast GPU comparison against ptarget[6]:[7]

also add the missing Windows ctz/clz host functions

New NEOS speed: 227MH to 270MH (Gigabyte 750Ti Black Edition)
This commit is contained in:
Tanguy Pruvot 2014-09-13 13:22:14 +02:00
parent 9efe0b965d
commit 91eea0d76b
2 changed files with 93 additions and 26 deletions

View File

@ -47,10 +47,6 @@ extern int device_map[8];
__constant__ __constant__
static uint32_t __align__(32) c_data[20]; static uint32_t __align__(32) c_data[20];
// only store the 2 high uint32 of the target hash
__constant__ static uint64_t c_Target;
__constant__ static int8_t c_BlakeRounds;
/* 8 adapters max (-t threads) */ /* 8 adapters max (-t threads) */
static uint32_t *d_resNonce[8]; static uint32_t *d_resNonce[8];
static uint32_t *h_resNonce[8]; static uint32_t *h_resNonce[8];
@ -132,7 +128,7 @@ static const uint32_t __align__(32) c_Padding[16] = {
}; };
__device__ static __device__ static
void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0) void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, const int rounds)
{ {
uint32_t /* __align__(8) */ m[16]; uint32_t /* __align__(8) */ m[16];
uint32_t /* __align__(8) */ v[16]; uint32_t /* __align__(8) */ v[16];
@ -160,7 +156,6 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0)
v[14] = c_u256[6]; v[14] = c_u256[6];
v[15] = c_u256[7]; v[15] = c_u256[7];
int rounds = c_BlakeRounds;
for (int i = 0; i < rounds; i++) { for (int i = 0; i < rounds; i++) {
/* column step */ /* column step */
GS(0, 4, 0x8, 0xC, 0x0); GS(0, 4, 0x8, 0xC, 0x0);
@ -176,18 +171,19 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0)
//#pragma unroll 16 //#pragma unroll 16
for (uint32_t i = 0; i < 16; i++) { for (uint32_t i = 0; i < 16; i++) {
uint32_t j = i % 8; uint32_t j = i % 8U;
h[j] ^= v[i]; h[j] ^= v[i];
} }
} }
__global__ __global__
void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, const int crcsum) void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNounce,
const uint8_t nClzTarget, const int crcsum, const int rounds)
{ {
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
const uint32_t nounce = startNounce + thread; const uint32_t nounce = startNonce + thread;
uint32_t h[8]; uint32_t h[8];
#pragma unroll #pragma unroll
@ -200,7 +196,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
#else #else
if (crcsum != prevsum) { if (crcsum != prevsum) {
prevsum = crcsum; prevsum = crcsum;
blake256_compress(h, c_data, 512); blake256_compress(h, c_data, 512, rounds);
#pragma unroll #pragma unroll
for(int i=0; i<8; i++) { for(int i=0; i<8; i++) {
cache[i] = h[i]; cache[i] = h[i];
@ -220,10 +216,16 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
ending[2] = c_data[18]; ending[2] = c_data[18];
ending[3] = nounce; /* our tested value */ ending[3] = nounce; /* our tested value */
blake256_compress(h, ending, 640); blake256_compress(h, ending, 640, rounds);
/* do not test all parts, fulltest() will do it */ // not sure why, h[7] is ok
if (((uint64_t*)h)[3] <= c_Target) h[6] = cuda_swab32(h[6]);
// compare count of leading zeros h[6] + h[7]
uint64_t high64 = ((uint64_t*)h)[3];
uint32_t clz = cuda_clz64(high64);
if (clz >= nClzTarget)
#if NBN == 2 #if NBN == 2
/* keep the smallest nounce, + extra one if found */ /* keep the smallest nounce, + extra one if found */
if (resNounce[0] > nounce) { if (resNounce[0] > nounce) {
@ -239,7 +241,8 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN
} }
__host__ __host__
uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, const uint32_t crcsum) uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint8_t clzTarget,
const uint32_t crcsum, const int8_t rounds)
{ {
const int threadsperblock = TPB; const int threadsperblock = TPB;
uint32_t result = MAXU; uint32_t result = MAXU;
@ -252,7 +255,7 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
return result; return result;
blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNonce[thr_id], crcsum); blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], clzTarget, crcsum, (int) rounds);
cudaDeviceSynchronize(); cudaDeviceSynchronize();
if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
//cudaThreadSynchronize(); /* seems no more required */ //cudaThreadSynchronize(); /* seems no more required */
@ -264,14 +267,12 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
} }
__host__ __host__
void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget, int8_t blakerounds) void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget)
{ {
uint32_t data[20]; uint32_t data[20];
memcpy(data, pdata, 80); memcpy(data, pdata, 80);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, &ptarget[6], 2*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_BlakeRounds, &blakerounds, sizeof(int8_t), 0, cudaMemcpyHostToDevice));
} }
extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
@ -280,6 +281,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce); uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce);
uint64_t targetHigh = ((uint64_t*)ptarget)[3];
uint32_t clzTarget = cuda_clz64(targetHigh);
uint32_t crcsum = MAXU; uint32_t crcsum = MAXU;
int rc = 0; int rc = 0;
@ -308,19 +311,19 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
init[thr_id] = true; init[thr_id] = true;
} }
blake256_cpu_setBlock_80(pdata, ptarget, blakerounds); blake256_cpu_setBlock_80(pdata, ptarget);
#if USE_CACHE #if USE_CACHE
crcsum = crc32_u32t(pdata, 64); crcsum = crc32_u32t(pdata, 64);
#endif #endif
do { do {
// GPU HASH // GPU HASH
uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], crcsum); uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], (uint8_t) clzTarget, crcsum, blakerounds);
if (foundNonce != MAXU) if (foundNonce != MAXU)
{ {
uint32_t endiandata[20]; uint32_t endiandata[20];
uint32_t vhashcpu[8]; uint32_t vhashcpu[8];
uint32_t Htarg = ptarget[7]; uint32_t Htarg = ptarget[6];
for (int k=0; k < 19; k++) for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]); be32enc(&endiandata[k], pdata[k]);
@ -329,7 +332,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
blake256hash(vhashcpu, endiandata, blakerounds); blake256hash(vhashcpu, endiandata, blakerounds);
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) if (vhashcpu[6] <= Htarg || cuda_swab32(vhashcpu[6]) <= Htarg /*&& fulltest(vhashcpu, ptarget)*/)
{ {
pdata[19] = foundNonce; pdata[19] = foundNonce;
rc = 1; rc = 1;
@ -338,7 +341,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
// Rare but possible if the throughput is big // Rare but possible if the throughput is big
be32enc(&endiandata[19], extra_results[0]); be32enc(&endiandata[19], extra_results[0]);
blake256hash(vhashcpu, endiandata, blakerounds); blake256hash(vhashcpu, endiandata, blakerounds);
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { if (vhashcpu[6] <= Htarg /* && fulltest(vhashcpu, ptarget) */) {
applog(LOG_NOTICE, "GPU found more than one result " CL_GRN "yippee!"); applog(LOG_NOTICE, "GPU found more than one result " CL_GRN "yippee!");
rc = 2; rc = 2;
} else { } else {
@ -346,9 +349,13 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
} }
} }
//applog_hash((uint8_t*)ptarget);
//applog_compare_hash((uint8_t*)vhashcpu,(uint8_t*)ptarget);
goto exit_scan; goto exit_scan;
} }
else if (opt_debug) { else if (opt_debug) {
applog_hash((uint8_t*)ptarget);
applog_compare_hash((uint8_t*)vhashcpu,(uint8_t*)ptarget);
applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce); applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
} }
} }

View File

@ -248,9 +248,12 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
return result; return result;
} }
#ifndef USE_ROT_ASM_OPT
#define USE_ROT_ASM_OPT 1
#endif
// 64-bit ROTATE RIGHT // 64-bit ROTATE RIGHT
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ /* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */
__device__ __forceinline__ __device__ __forceinline__
uint64_t ROTR64(const uint64_t value, const int offset) { uint64_t ROTR64(const uint64_t value, const int offset) {
@ -264,7 +267,7 @@ uint64_t ROTR64(const uint64_t value, const int offset) {
} }
return __double_as_longlong(__hiloint2double(result.y, result.x)); return __double_as_longlong(__hiloint2double(result.y, result.x));
} }
#elif __CUDA_ARCH__ >= 120 #elif __CUDA_ARCH__ >= 120 && USE_ROT_ASM_OPT == 2
__device__ __forceinline__ __device__ __forceinline__
uint64_t ROTR64(const uint64_t x, const int offset) uint64_t ROTR64(const uint64_t x, const int offset)
{ {
@ -286,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
#endif #endif
// 64-bit ROTATE LEFT // 64-bit ROTATE LEFT
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT
__device__ __forceinline__ __device__ __forceinline__
uint64_t ROTL64(const uint64_t value, const int offset) { uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result; uint2 result;
@ -320,4 +323,61 @@ uint64_t ROTL64(const uint64_t x, const int offset)
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif #endif
#ifdef WIN32
#include <intrin.h>
/**
 * MSVC stand-in for the GCC __builtin_clz builtin: counts the leading
 * (most significant) zero bits of a 32-bit value.
 *
 * _BitScanReverse() signals through its return value whether any bit was
 * set; when it returns 0 the index it writes is not meaningful, so a zero
 * input is handled explicitly instead of trusting the index.
 *
 * @param x  value to scan
 * @return   number of leading zero bits (0..31), or 32 when x is 0
 */
static uint32_t __inline __builtin_clz(uint32_t x) {
	unsigned long index = 0;
	if (!_BitScanReverse(&index, x))
		return 32; /* no bit set at all */
	/* index is the position of the highest set bit, counted from bit 0 */
	return (uint32_t) (31 - index);
}
/**
 * MSVC stand-in for the GCC __builtin_ctz builtin: counts the trailing
 * (least significant) zero bits of a 32-bit value.
 *
 * _BitScanForward() signals through its return value whether any bit was
 * set; when it returns 0 the index it writes is not meaningful, so a zero
 * input is handled explicitly instead of trusting the index.
 *
 * @param x  value to scan
 * @return   number of trailing zero bits (0..31), or 32 when x is 0
 */
static uint32_t __inline __builtin_ctz(uint32_t x) {
	unsigned long index = 0;
	if (!_BitScanForward(&index, x))
		return 32; /* no bit set at all */
	/* index is the position of the lowest set bit, counted from bit 0 */
	return (uint32_t) index;
}
#endif
/* count leading zeros of a 64bit int */
#if __CUDA_ARCH__ >= 200
/**
 * Device-side count of the leading zero bits of a 64-bit value.
 *
 * @param x  value to scan
 * @return   number of consecutive high-order zero bits (64 when x is 0)
 */
__device__
static uint32_t cuda_clz64(const uint64_t x)
{
	/* __clzll is the CUDA intrinsic form of the 64-bit clz instruction */
	return (uint32_t) __clzll((long long) x);
}
#else
/* host */
/**
 * Host-side count of the leading zero bits of a 64-bit value, built from
 * two 32-bit __builtin_clz calls.
 *
 * @param x  value to scan
 * @return   number of leading zero bits, 64 when x is 0
 */
static uint32_t cuda_clz64(const uint64_t x)
{
	const uint32_t hi = (uint32_t) (x >> 32);
	if (hi != 0) {
		/* the answer is entirely determined by the upper word */
		return __builtin_clz(hi);
	}

	/* upper word is all zeros: 32 plus the zeros of the lower word */
	const uint32_t lo = (uint32_t) x;
	if (lo == 0) {
		return 64;
	}
	return 32 + __builtin_clz(lo);
}
#endif
/* count trailing zeros of a 32bit int */
#if __CUDA_ARCH__ >= 200
/**
 * Device-side count of the trailing zero bits of a 32-bit value.
 *
 * Implemented as clz(bit-reverse(x)). The previous inline PTX wrote the
 * bit-reversed value back into its input-only operand (%1), which the
 * compiler is entitled to assume is left unmodified; the intrinsics below
 * express the same two instructions without clobbering the input.
 *
 * @param x  value to scan
 * @return   number of trailing zero bits, 32 when x is 0
 */
__device__
static uint32_t cuda_ctz32(const uint32_t x)
{
	return (uint32_t) __clz((int) __brev(x));
}
}
#else
/* host */
/**
 * Host-side count of the trailing zero bits of a 32-bit value.
 *
 * @param x  value to scan
 * @return   number of trailing zero bits, 32 when x is 0
 */
static uint32_t cuda_ctz32(const uint32_t x)
{
	/* keep a zero input away from __builtin_ctz and report 32 directly */
	if (x == 0) {
		return 32;
	}
	return __builtin_ctz(x);
}
#endif
#endif // #ifndef CUDA_HELPER_H #endif // #ifndef CUDA_HELPER_H