Browse Source

lbry cleanup, and proper error on cuda 6.5

both merged and unmerged implementations are broken with CUDA 6.5

No perf changes...
2upstream
Tanguy Pruvot 8 years ago
parent
commit
2152fd102d
  1. 89
      lbry/cuda_lbry_merged.cu
  2. 15
      lbry/cuda_sha256_lbry.cu
  3. 5
      lbry/cuda_sha512_lbry.cu
  4. 54
      lbry/lbry.cu

89
lbry/cuda_lbry_merged.cu

@ -284,11 +284,11 @@ static void sha2_step(const uint32_t a, const uint32_t b,const uint32_t c, uint3 @@ -284,11 +284,11 @@ static void sha2_step(const uint32_t a, const uint32_t b,const uint32_t c, uint3
const uint32_t t1 = h + bsg2_1(e) + Ch(e, f, g) + Kshared + in;
h = t1 + Maj(a, b, c) + bsg2_0(a);
d+= t1;
}
__device__
static void sha256_round_first(uint32_t *in,uint32_t *buf,const uint32_t *state,const uint32_t* __restrict__ Kshared)
static void sha256_round_first(uint32_t *in, uint32_t *buf,
const uint32_t *state, const uint32_t* __restrict__ Kshared)
{
uint32_t a = buf[0] + in[11];
uint32_t b = buf[1];
@ -427,7 +427,7 @@ static void sha256_round_body(uint32_t *in, uint32_t *state,const uint32_t* Ksha @@ -427,7 +427,7 @@ static void sha256_round_body(uint32_t *in, uint32_t *state,const uint32_t* Ksha
sha2_step(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
#pragma unroll 3
for (uint32_t i=0; i<3; i++)
for (int i=0; i<3; i++)
{
#pragma unroll 16
for (uint32_t j = 0; j < 16; j++) {
@ -493,8 +493,8 @@ static void sha256_round_body_final(uint32_t *in, uint32_t *state,const uint32_t @@ -493,8 +493,8 @@ static void sha256_round_body_final(uint32_t *in, uint32_t *state,const uint32_t
sha2_step(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
#pragma unroll 2
for (uint32_t i=0; i<2; i++){
for (int i=0; i<2; i++)
{
#pragma unroll 16
for (uint32_t j = 0; j < 16; j++) {
const uint32_t x2_0 = ssg2_0(in[(j + 1) & 15]);
@ -547,16 +547,26 @@ static void sha256_round_body_final(uint32_t *in, uint32_t *state,const uint32_t @@ -547,16 +547,26 @@ static void sha256_round_body_final(uint32_t *in, uint32_t *state,const uint32_t
//SHA512 MACROS ---------------------------------------------------------------------------
static __constant__ _ALIGN(8) uint64_t K_512[80] = {
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC, 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
#undef xor3
@ -567,7 +577,6 @@ static __constant__ _ALIGN(8) uint64_t K_512[80] = { @@ -567,7 +577,6 @@ static __constant__ _ALIGN(8) uint64_t K_512[80] = {
#define ssg5_0(x) xor3(ROTR64(x, 1),ROTR64(x, 8),x>>7)
#define ssg5_1(x) xor3(ROTR64(x,19),ROTR64(x,61),x>>6)
#define andor64(a,b,c) ((a & (b | c)) | (b & c))
#define xandx64(e,f,g) (g ^ (e & (g ^ f)))
@ -584,7 +593,6 @@ uint64_t cuda_swab64ll(const uint32_t x, const uint32_t y) @@ -584,7 +593,6 @@ uint64_t cuda_swab64ll(const uint32_t x, const uint32_t y)
// RIPEMD MACROS-----------------------------------------------------------------------------
static __constant__ const uint32_t c_IV[5] = { 0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u };
static __constant__ const uint32_t c_K1[5] = { 0, 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xA953FD4E };
static __constant__ const uint32_t c_K2[5] = { 0x50A28BE6, 0x5C4DD124, 0x6D703EF3, 0x7A6D76E9, 0 };
@ -858,26 +866,25 @@ uint32_t F5(const uint32_t x,const uint32_t y,const uint32_t z){ @@ -858,26 +866,25 @@ uint32_t F5(const uint32_t x,const uint32_t y,const uint32_t z){
}
// END OF RIPEMD MACROS----------------------------------------------------------------------
__global__ __launch_bounds__(768,1) /* to force 32 regs */
__global__
__launch_bounds__(768,1) /* will force 64 regs max on SM 3+ */
void gpu_lbry_merged(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces, const uint64_t target64)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t buf[8], state[8];
const uint64_t IV512[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
if (thread < threads)
{
uint64_t r[8];
uint64_t W[16];
uint32_t dat[16];
uint32_t buf[8], state[8];
uint32_t h[5];
if (thread < threads){
//#pragma unroll 11
//for (uint32_t i=0; i<11; i++)
// dat[i] = c_dataEnd112[i];
*(uint2x4*)&dat[0] = *(uint2x4*)&c_dataEnd112[0];
dat[ 8] = c_dataEnd112[ 8];
dat[ 9] = c_dataEnd112[ 9];
@ -899,11 +906,10 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t @@ -899,11 +906,10 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
for(int i=0; i<8; i++){
dat[i] = buf[i];
}
dat[8] = 0x80000000;
#pragma unroll 6
for (uint32_t i=9; i<15; i++) dat[i] = 0;
for(int i=9; i<15; i++) dat[i] = 0;
dat[15] = 0x100;
#pragma unroll 8
@ -913,6 +919,7 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t @@ -913,6 +919,7 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
sha256_round_body(dat, buf, c_K);
// SHA512-------------------------------------------------------------------------------------
#pragma unroll 8
for(int i=0; i<8; i++)
r[i] = IV512[i];
@ -924,24 +931,26 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t @@ -924,24 +931,26 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
W[4] = 0x8000000000000000; // end tag
#pragma unroll 10
for (uint32_t i = 5; i < 15; i++)
for (int i = 5; i < 15; i++)
W[i] = 0;
W[15] = 0x100; // 256 bits
#pragma unroll 16
for (int i = 0; i < 16; i ++){
for (uint32_t i = 0; i < 16; i++)
{
// sha512_step2(r, W[ i], K_512[ i], i&7);
const uint32_t ord = i&7;
const uint64_t T1 = r[(15-ord) & 7] + K_512[ i] + W[ i] + bsg5_1(r[(12-ord) & 7]) + xandx64(r[(12-ord) & 7],r[(13-ord) & 7],r[(14-ord) & 7]);
const uint64_t T1 = r[(15-ord) & 7] + K_512[ i] + W[ i] + bsg5_1(r[(12-ord) & 7]) +
xandx64(r[(12-ord) & 7], r[(13-ord) & 7], r[(14-ord) & 7]);
r[(15-ord)& 7] = andor64(r[( 8-ord) & 7], r[( 9-ord) & 7], r[(10-ord) & 7]) + bsg5_0(r[( 8-ord) & 7]) + T1;
r[(11-ord)& 7] = r[(11-ord)& 7] + T1;
}
#pragma unroll 5
for (uint32_t i = 16; i < 80; i+=16){
for (uint32_t i = 16; i < 80; i+=16)
{
#pragma unroll 16
for (uint32_t j = 0; j<16; j++)
W[(i + j) & 15] = W[((i + j) - 7) & 15] + W[(i + j) & 15] + ssg5_0(W[((i + j) - 15) & 15]) + ssg5_1(W[((i + j) - 2) & 15]);
@ -950,7 +959,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t @@ -950,7 +959,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
for (uint32_t j = 0; j<16; j++) {
const uint32_t ord = (i+j)&7;
const uint64_t T1 = K_512[i+j] + W[ j] + r[(15-ord) & 7] + bsg5_1(r[(12-ord) & 7]) + xandx64(r[(12-ord) & 7],r[(13-ord) & 7],r[(14-ord) & 7]);
const uint64_t T1 = K_512[i+j] + W[ j] + r[(15-ord) & 7] + bsg5_1(r[(12-ord) & 7]) +
xandx64(r[(12-ord) & 7], r[(13-ord) & 7], r[(14-ord) & 7]);
r[(15-ord)& 7] = andor64(r[( 8-ord) & 7], r[( 9-ord) & 7], r[(10-ord) & 7]) + bsg5_0(r[( 8-ord) & 7]) + T1;
r[(11-ord)& 7] = r[(11-ord)& 7] + T1;
@ -959,8 +969,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t @@ -959,8 +969,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
//END OF SHA512------------------------------------------------------------------------------
#pragma unroll 4
for (uint32_t i = 0; i < 4; i++)
*(uint64_t*)&dat[i<<1] = cuda_swab64(r[i] + IV512[i]);
for (int i = 0; i < 4; i++)
*(uint64_t*)&dat[i*2] = cuda_swab64(r[i] + IV512[i]);
dat[8] = 0x80;
#pragma unroll 7
@ -980,8 +990,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t @@ -980,8 +990,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
// second 32 bytes block hash
#pragma unroll 4
for (uint32_t i = 0; i < 4; i++)
*(uint64_t*)&dat[i<<1] = cuda_swab64(r[i+4] + IV512[i+4]);
for (int i=0; i < 4; i++)
*(uint64_t*)&dat[i*2] = cuda_swab64(r[i+4] + IV512[i+4]);
dat[8] = 0x80;
@ -1008,11 +1018,9 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t @@ -1008,11 +1018,9 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
dat[15] = 0x140;
// *(uint2x4*)&buf[0] = *(uint2x4*)&c_H256[0];
#pragma unroll 8
for(int i=0;i<8;i++){
for(int i=0; i<8; i++)
buf[i] = c_H256[i];
}
sha256_round_body(dat, buf, c_K);
@ -1025,9 +1033,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t @@ -1025,9 +1033,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
dat[8] = 0x80000000;
#pragma unroll 8
for(int i=0;i<8;i++){
for(int i=0; i<8; i++)
buf[i] = c_H256[i];
}
#pragma unroll 6
for (int i=9; i<15; i++) dat[i] = 0;

15
lbry/cuda_sha256_lbry.cu

@ -453,7 +453,10 @@ uint64_t cuda_swab64ll(const uint32_t x, const uint32_t y) { @@ -453,7 +453,10 @@ uint64_t cuda_swab64ll(const uint32_t x, const uint32_t y) {
return r;
}
__global__ __launch_bounds__(768,2) /* to force 32 regs */
__global__
#if CUDA_VERSION > 6050
__launch_bounds__(768,2) /* to force 32 regs */
#endif
void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, uint64_t *outputHash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
@ -833,7 +836,10 @@ static uint32_t ROTATE(const uint32_t x,const uint32_t r){ @@ -833,7 +836,10 @@ static uint32_t ROTATE(const uint32_t x,const uint32_t r){
h[0] = tmp; \
}
__global__ __launch_bounds__(1024,2) /* to force 32 regs */
__global__
#if CUDA_VERSION > 6050
__launch_bounds__(1024,2) /* to force 32 regs */
#endif
void lbry_ripemd(const uint32_t threads, uint64_t *Hash512){
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t dat[16];
@ -889,7 +895,10 @@ void lbry_ripemd(const uint32_t threads, uint64_t *Hash512){ @@ -889,7 +895,10 @@ void lbry_ripemd(const uint32_t threads, uint64_t *Hash512){
}
}
__global__ __launch_bounds__(768,2) /* to force 32 regs */
__global__
#if CUDA_VERSION > 6050
__launch_bounds__(768,2) /* to force 32 regs */
#endif
void lbry_sha256d_gpu_hash_final(const uint32_t threads, uint64_t *Hash512, uint32_t *resNonces,const uint64_t target64)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

5
lbry/cuda_sha512_lbry.cu

@ -64,7 +64,10 @@ static void sha512_step2(uint64_t *const r,const uint64_t W,const uint64_t K, co @@ -64,7 +64,10 @@ static void sha512_step2(uint64_t *const r,const uint64_t W,const uint64_t K, co
/**************************************************************************************************/
__global__ __launch_bounds__(512,2)
__global__
#if CUDA_VERSION > 6050
__launch_bounds__(512,2)
#endif
void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

54
lbry/lbry.cu

@ -1,7 +1,7 @@ @@ -1,7 +1,7 @@
/**
* Lbry Algo (sha-256 / sha-512 / ripemd)
*
* tpruvot and Provos Alexis - Jul / Sep 2016
* tpruvot and Provos Alexis - Jan 2017
*
* Sponsored by LBRY.IO team
*/
@ -87,7 +87,6 @@ static uint32_t *d_resNonce[MAX_GPUS]; @@ -87,7 +87,6 @@ static uint32_t *d_resNonce[MAX_GPUS];
extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t _ALIGN(A) vhash[8];
uint32_t _ALIGN(A) endiandata[28];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@ -96,6 +95,8 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, @@ -96,6 +95,8 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
const int swap = 0; // to toggle nonce endian (need kernel change)
const int dev_id = device_map[thr_id];
const bool merged_kernel = (device_sm[dev_id] > 500);
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 22 : 20;
if (device_sm[dev_id] >= 600) intensity = 23;
if (device_sm[dev_id] < 350) intensity = 18;
@ -118,8 +119,14 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, @@ -118,8 +119,14 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
if(device_sm[dev_id] <= 500)
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 8 * sizeof(uint64_t) * throughput));
if (CUDART_VERSION == 6050) {
applog(LOG_ERR, "This lbry kernel is not compatible with CUDA 6.5!");
proper_exit(EXIT_FAILURE);
}
if (!merged_kernel)
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput));
CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
CUDA_LOG_ERROR();
@ -131,44 +138,48 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, @@ -131,44 +138,48 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
be32enc(&endiandata[i], pdata[i]);
}
if(device_sm[dev_id] <= 500)
lbry_sha256_setBlock_112(endiandata);
else
if (merged_kernel)
lbry_sha256_setBlock_112_merged(endiandata);
else
lbry_sha256_setBlock_112(endiandata);
cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
do {
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
// Hash with CUDA
if(device_sm[dev_id] <= 500){
if (merged_kernel) {
lbry_merged(thr_id, pdata[LBC_NONCE_OFT32], throughput, d_resNonce[thr_id], AS_U64(&ptarget[6]));
} else {
lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]);
lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
}else{
lbry_merged(thr_id,pdata[LBC_NONCE_OFT32], throughput, d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6]));
}
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
cudaMemcpy(resNonces, d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;
cudaMemcpy(resNonces, d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
if (resNonces[0] != UINT32_MAX)
{
uint32_t _ALIGN(A) vhash[8];
const uint32_t Htarg = ptarget[7];
const uint32_t startNonce = pdata[LBC_NONCE_OFT32];
resNonces[0] += startNonce;
endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[0], !swap);
lbry_hash(vhash, endiandata);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
work->nonces[0] = swab32_if(resNonces[0], swap);
work_set_target_ratio(work, vhash);
work->valid_nonces = 1;
if (resNonces[1] != UINT32_MAX) {
if (resNonces[1] != UINT32_MAX)
{
resNonces[1] += startNonce;
gpulog(LOG_DEBUG, thr_id, "second nonce %08x", swab32(resNonces[1]));
endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[1], !swap);
lbry_hash(vhash, endiandata);
work->nonces[1] = swab32_if(resNonces[1], swap);
@ -179,19 +190,18 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, @@ -179,19 +190,18 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
work->sharediff[1] = work->sharediff[0];
work->shareratio[1] = work->shareratio[0];
work_set_target_ratio(work, vhash);
work->valid_nonces++;
} else {
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
}
work->valid_nonces++;
}
pdata[LBC_NONCE_OFT32] = max(work->nonces[0], work->nonces[1]); // next scan start
return work->valid_nonces;
} else if (vhash[7] > ptarget[7]) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU %08x > %08x!", resNonces[0], vhash[7], ptarget[7]);
}
else if (vhash[7] > Htarg) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", resNonces[0]);
cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
}
}

Loading…
Cancel
Save