Browse Source

lbry cleanup, and proper error on cuda 6.5

both merged and unmerged implementations are broken with CUDA 6.5

No perf changes...
2upstream
Tanguy Pruvot 8 years ago
parent
commit
2152fd102d
  1. 89
      lbry/cuda_lbry_merged.cu
  2. 15
      lbry/cuda_sha256_lbry.cu
  3. 5
      lbry/cuda_sha512_lbry.cu
  4. 54
      lbry/lbry.cu

89
lbry/cuda_lbry_merged.cu

@ -284,11 +284,11 @@ static void sha2_step(const uint32_t a, const uint32_t b,const uint32_t c, uint3
const uint32_t t1 = h + bsg2_1(e) + Ch(e, f, g) + Kshared + in; const uint32_t t1 = h + bsg2_1(e) + Ch(e, f, g) + Kshared + in;
h = t1 + Maj(a, b, c) + bsg2_0(a); h = t1 + Maj(a, b, c) + bsg2_0(a);
d+= t1; d+= t1;
} }
__device__ __device__
static void sha256_round_first(uint32_t *in,uint32_t *buf,const uint32_t *state,const uint32_t* __restrict__ Kshared) static void sha256_round_first(uint32_t *in, uint32_t *buf,
const uint32_t *state, const uint32_t* __restrict__ Kshared)
{ {
uint32_t a = buf[0] + in[11]; uint32_t a = buf[0] + in[11];
uint32_t b = buf[1]; uint32_t b = buf[1];
@ -427,7 +427,7 @@ static void sha256_round_body(uint32_t *in, uint32_t *state,const uint32_t* Ksha
sha2_step(b,c,d,e,f,g,h,a,in[15], Kshared[15]); sha2_step(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
#pragma unroll 3 #pragma unroll 3
for (uint32_t i=0; i<3; i++) for (int i=0; i<3; i++)
{ {
#pragma unroll 16 #pragma unroll 16
for (uint32_t j = 0; j < 16; j++) { for (uint32_t j = 0; j < 16; j++) {
@ -493,8 +493,8 @@ static void sha256_round_body_final(uint32_t *in, uint32_t *state,const uint32_t
sha2_step(b,c,d,e,f,g,h,a,in[15], Kshared[15]); sha2_step(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
#pragma unroll 2 #pragma unroll 2
for (uint32_t i=0; i<2; i++){ for (int i=0; i<2; i++)
{
#pragma unroll 16 #pragma unroll 16
for (uint32_t j = 0; j < 16; j++) { for (uint32_t j = 0; j < 16; j++) {
const uint32_t x2_0 = ssg2_0(in[(j + 1) & 15]); const uint32_t x2_0 = ssg2_0(in[(j + 1) & 15]);
@ -547,16 +547,26 @@ static void sha256_round_body_final(uint32_t *in, uint32_t *state,const uint32_t
//SHA512 MACROS --------------------------------------------------------------------------- //SHA512 MACROS ---------------------------------------------------------------------------
static __constant__ _ALIGN(8) uint64_t K_512[80] = { static __constant__ _ALIGN(8) uint64_t K_512[80] = {
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694, 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70, 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B, 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8, 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC, 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B, 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
}; };
#undef xor3 #undef xor3
@ -567,7 +577,6 @@ static __constant__ _ALIGN(8) uint64_t K_512[80] = {
#define ssg5_0(x) xor3(ROTR64(x, 1),ROTR64(x, 8),x>>7) #define ssg5_0(x) xor3(ROTR64(x, 1),ROTR64(x, 8),x>>7)
#define ssg5_1(x) xor3(ROTR64(x,19),ROTR64(x,61),x>>6) #define ssg5_1(x) xor3(ROTR64(x,19),ROTR64(x,61),x>>6)
#define andor64(a,b,c) ((a & (b | c)) | (b & c)) #define andor64(a,b,c) ((a & (b | c)) | (b & c))
#define xandx64(e,f,g) (g ^ (e & (g ^ f))) #define xandx64(e,f,g) (g ^ (e & (g ^ f)))
@ -584,7 +593,6 @@ uint64_t cuda_swab64ll(const uint32_t x, const uint32_t y)
// RIPEMD MACROS----------------------------------------------------------------------------- // RIPEMD MACROS-----------------------------------------------------------------------------
static __constant__ const uint32_t c_IV[5] = { 0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u }; static __constant__ const uint32_t c_IV[5] = { 0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u };
static __constant__ const uint32_t c_K1[5] = { 0, 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xA953FD4E }; static __constant__ const uint32_t c_K1[5] = { 0, 0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xA953FD4E };
static __constant__ const uint32_t c_K2[5] = { 0x50A28BE6, 0x5C4DD124, 0x6D703EF3, 0x7A6D76E9, 0 }; static __constant__ const uint32_t c_K2[5] = { 0x50A28BE6, 0x5C4DD124, 0x6D703EF3, 0x7A6D76E9, 0 };
@ -858,26 +866,25 @@ uint32_t F5(const uint32_t x,const uint32_t y,const uint32_t z){
} }
// END OF RIPEMD MACROS---------------------------------------------------------------------- // END OF RIPEMD MACROS----------------------------------------------------------------------
__global__ __launch_bounds__(768,1) /* to force 32 regs */ __global__
__launch_bounds__(768,1) /* will force 64 regs max on SM 3+ */
void gpu_lbry_merged(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces, const uint64_t target64) void gpu_lbry_merged(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces, const uint64_t target64)
{ {
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t buf[8], state[8];
const uint64_t IV512[8] = { const uint64_t IV512[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
}; };
if (thread < threads)
{
uint64_t r[8]; uint64_t r[8];
uint64_t W[16]; uint64_t W[16];
uint32_t dat[16]; uint32_t dat[16];
uint32_t buf[8], state[8];
uint32_t h[5]; uint32_t h[5];
if (thread < threads){
//#pragma unroll 11
//for (uint32_t i=0; i<11; i++)
// dat[i] = c_dataEnd112[i];
*(uint2x4*)&dat[0] = *(uint2x4*)&c_dataEnd112[0]; *(uint2x4*)&dat[0] = *(uint2x4*)&c_dataEnd112[0];
dat[ 8] = c_dataEnd112[ 8]; dat[ 8] = c_dataEnd112[ 8];
dat[ 9] = c_dataEnd112[ 9]; dat[ 9] = c_dataEnd112[ 9];
@ -899,11 +906,10 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
for(int i=0; i<8; i++){ for(int i=0; i<8; i++){
dat[i] = buf[i]; dat[i] = buf[i];
} }
dat[8] = 0x80000000; dat[8] = 0x80000000;
#pragma unroll 6 #pragma unroll 6
for (uint32_t i=9; i<15; i++) dat[i] = 0; for(int i=9; i<15; i++) dat[i] = 0;
dat[15] = 0x100; dat[15] = 0x100;
#pragma unroll 8 #pragma unroll 8
@ -913,6 +919,7 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
sha256_round_body(dat, buf, c_K); sha256_round_body(dat, buf, c_K);
// SHA512------------------------------------------------------------------------------------- // SHA512-------------------------------------------------------------------------------------
#pragma unroll 8 #pragma unroll 8
for(int i=0; i<8; i++) for(int i=0; i<8; i++)
r[i] = IV512[i]; r[i] = IV512[i];
@ -924,24 +931,26 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
W[4] = 0x8000000000000000; // end tag W[4] = 0x8000000000000000; // end tag
#pragma unroll 10 #pragma unroll 10
for (uint32_t i = 5; i < 15; i++) for (int i = 5; i < 15; i++)
W[i] = 0; W[i] = 0;
W[15] = 0x100; // 256 bits W[15] = 0x100; // 256 bits
#pragma unroll 16 #pragma unroll 16
for (int i = 0; i < 16; i ++){ for (uint32_t i = 0; i < 16; i++)
{
// sha512_step2(r, W[ i], K_512[ i], i&7); // sha512_step2(r, W[ i], K_512[ i], i&7);
const uint32_t ord = i&7; const uint32_t ord = i&7;
const uint64_t T1 = r[(15-ord) & 7] + K_512[ i] + W[ i] + bsg5_1(r[(12-ord) & 7]) +
const uint64_t T1 = r[(15-ord) & 7] + K_512[ i] + W[ i] + bsg5_1(r[(12-ord) & 7]) + xandx64(r[(12-ord) & 7],r[(13-ord) & 7],r[(14-ord) & 7]); xandx64(r[(12-ord) & 7], r[(13-ord) & 7], r[(14-ord) & 7]);
r[(15-ord)& 7] = andor64(r[( 8-ord) & 7], r[( 9-ord) & 7], r[(10-ord) & 7]) + bsg5_0(r[( 8-ord) & 7]) + T1; r[(15-ord)& 7] = andor64(r[( 8-ord) & 7], r[( 9-ord) & 7], r[(10-ord) & 7]) + bsg5_0(r[( 8-ord) & 7]) + T1;
r[(11-ord)& 7] = r[(11-ord)& 7] + T1; r[(11-ord)& 7] = r[(11-ord)& 7] + T1;
} }
#pragma unroll 5 #pragma unroll 5
for (uint32_t i = 16; i < 80; i+=16){ for (uint32_t i = 16; i < 80; i+=16)
{
#pragma unroll 16 #pragma unroll 16
for (uint32_t j = 0; j<16; j++) for (uint32_t j = 0; j<16; j++)
W[(i + j) & 15] = W[((i + j) - 7) & 15] + W[(i + j) & 15] + ssg5_0(W[((i + j) - 15) & 15]) + ssg5_1(W[((i + j) - 2) & 15]); W[(i + j) & 15] = W[((i + j) - 7) & 15] + W[(i + j) & 15] + ssg5_0(W[((i + j) - 15) & 15]) + ssg5_1(W[((i + j) - 2) & 15]);
@ -950,7 +959,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
for (uint32_t j = 0; j<16; j++) { for (uint32_t j = 0; j<16; j++) {
const uint32_t ord = (i+j)&7; const uint32_t ord = (i+j)&7;
const uint64_t T1 = K_512[i+j] + W[ j] + r[(15-ord) & 7] + bsg5_1(r[(12-ord) & 7]) + xandx64(r[(12-ord) & 7],r[(13-ord) & 7],r[(14-ord) & 7]); const uint64_t T1 = K_512[i+j] + W[ j] + r[(15-ord) & 7] + bsg5_1(r[(12-ord) & 7]) +
xandx64(r[(12-ord) & 7], r[(13-ord) & 7], r[(14-ord) & 7]);
r[(15-ord)& 7] = andor64(r[( 8-ord) & 7], r[( 9-ord) & 7], r[(10-ord) & 7]) + bsg5_0(r[( 8-ord) & 7]) + T1; r[(15-ord)& 7] = andor64(r[( 8-ord) & 7], r[( 9-ord) & 7], r[(10-ord) & 7]) + bsg5_0(r[( 8-ord) & 7]) + T1;
r[(11-ord)& 7] = r[(11-ord)& 7] + T1; r[(11-ord)& 7] = r[(11-ord)& 7] + T1;
@ -959,8 +969,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
//END OF SHA512------------------------------------------------------------------------------ //END OF SHA512------------------------------------------------------------------------------
#pragma unroll 4 #pragma unroll 4
for (uint32_t i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
*(uint64_t*)&dat[i<<1] = cuda_swab64(r[i] + IV512[i]); *(uint64_t*)&dat[i*2] = cuda_swab64(r[i] + IV512[i]);
dat[8] = 0x80; dat[8] = 0x80;
#pragma unroll 7 #pragma unroll 7
@ -980,8 +990,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
// second 32 bytes block hash // second 32 bytes block hash
#pragma unroll 4 #pragma unroll 4
for (uint32_t i = 0; i < 4; i++) for (int i=0; i < 4; i++)
*(uint64_t*)&dat[i<<1] = cuda_swab64(r[i+4] + IV512[i+4]); *(uint64_t*)&dat[i*2] = cuda_swab64(r[i+4] + IV512[i+4]);
dat[8] = 0x80; dat[8] = 0x80;
@ -1008,11 +1018,9 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
dat[15] = 0x140; dat[15] = 0x140;
// *(uint2x4*)&buf[0] = *(uint2x4*)&c_H256[0];
#pragma unroll 8 #pragma unroll 8
for(int i=0;i<8;i++){ for(int i=0; i<8; i++)
buf[i] = c_H256[i]; buf[i] = c_H256[i];
}
sha256_round_body(dat, buf, c_K); sha256_round_body(dat, buf, c_K);
@ -1025,9 +1033,8 @@ void gpu_lbry_merged(const uint32_t threads,const uint32_t startNonce, uint32_t
dat[8] = 0x80000000; dat[8] = 0x80000000;
#pragma unroll 8 #pragma unroll 8
for(int i=0;i<8;i++){ for(int i=0; i<8; i++)
buf[i] = c_H256[i]; buf[i] = c_H256[i];
}
#pragma unroll 6 #pragma unroll 6
for (int i=9; i<15; i++) dat[i] = 0; for (int i=9; i<15; i++) dat[i] = 0;

15
lbry/cuda_sha256_lbry.cu

@ -453,7 +453,10 @@ uint64_t cuda_swab64ll(const uint32_t x, const uint32_t y) {
return r; return r;
} }
__global__ __launch_bounds__(768,2) /* to force 32 regs */ __global__
#if CUDA_VERSION > 6050
__launch_bounds__(768,2) /* to force 32 regs */
#endif
void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, uint64_t *outputHash) void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, uint64_t *outputHash)
{ {
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
@ -833,7 +836,10 @@ static uint32_t ROTATE(const uint32_t x,const uint32_t r){
h[0] = tmp; \ h[0] = tmp; \
} }
__global__ __launch_bounds__(1024,2) /* to force 32 regs */ __global__
#if CUDA_VERSION > 6050
__launch_bounds__(1024,2) /* to force 32 regs */
#endif
void lbry_ripemd(const uint32_t threads, uint64_t *Hash512){ void lbry_ripemd(const uint32_t threads, uint64_t *Hash512){
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t dat[16]; uint32_t dat[16];
@ -889,7 +895,10 @@ void lbry_ripemd(const uint32_t threads, uint64_t *Hash512){
} }
} }
__global__ __launch_bounds__(768,2) /* to force 32 regs */ __global__
#if CUDA_VERSION > 6050
__launch_bounds__(768,2) /* to force 32 regs */
#endif
void lbry_sha256d_gpu_hash_final(const uint32_t threads, uint64_t *Hash512, uint32_t *resNonces,const uint64_t target64) void lbry_sha256d_gpu_hash_final(const uint32_t threads, uint64_t *Hash512, uint32_t *resNonces,const uint64_t target64)
{ {
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

5
lbry/cuda_sha512_lbry.cu

@ -64,7 +64,10 @@ static void sha512_step2(uint64_t *const r,const uint64_t W,const uint64_t K, co
/**************************************************************************************************/ /**************************************************************************************************/
__global__ __launch_bounds__(512,2) __global__
#if CUDA_VERSION > 6050
__launch_bounds__(512,2)
#endif
void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash) void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
{ {
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

54
lbry/lbry.cu

@ -1,7 +1,7 @@
/** /**
* Lbry Algo (sha-256 / sha-512 / ripemd) * Lbry Algo (sha-256 / sha-512 / ripemd)
* *
* tpruvot and Provos Alexis - Jul / Sep 2016 * tpruvot and Provos Alexis - Jan 2017
* *
* Sponsored by LBRY.IO team * Sponsored by LBRY.IO team
*/ */
@ -87,7 +87,6 @@ static uint32_t *d_resNonce[MAX_GPUS];
extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{ {
uint32_t _ALIGN(A) vhash[8];
uint32_t _ALIGN(A) endiandata[28]; uint32_t _ALIGN(A) endiandata[28];
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
@ -96,6 +95,8 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
const int swap = 0; // to toggle nonce endian (need kernel change) const int swap = 0; // to toggle nonce endian (need kernel change)
const int dev_id = device_map[thr_id]; const int dev_id = device_map[thr_id];
const bool merged_kernel = (device_sm[dev_id] > 500);
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 22 : 20; int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 22 : 20;
if (device_sm[dev_id] >= 600) intensity = 23; if (device_sm[dev_id] >= 600) intensity = 23;
if (device_sm[dev_id] < 350) intensity = 18; if (device_sm[dev_id] < 350) intensity = 18;
@ -118,8 +119,14 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
} }
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
if(device_sm[dev_id] <= 500)
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 8 * sizeof(uint64_t) * throughput)); if (CUDART_VERSION == 6050) {
applog(LOG_ERR, "This lbry kernel is not compatible with CUDA 6.5!");
proper_exit(EXIT_FAILURE);
}
if (!merged_kernel)
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput));
CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
CUDA_LOG_ERROR(); CUDA_LOG_ERROR();
@ -131,44 +138,48 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
be32enc(&endiandata[i], pdata[i]); be32enc(&endiandata[i], pdata[i]);
} }
if(device_sm[dev_id] <= 500) if (merged_kernel)
lbry_sha256_setBlock_112(endiandata);
else
lbry_sha256_setBlock_112_merged(endiandata); lbry_sha256_setBlock_112_merged(endiandata);
else
lbry_sha256_setBlock_112(endiandata);
cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
do { do {
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
// Hash with CUDA // Hash with CUDA
if(device_sm[dev_id] <= 500){ if (merged_kernel) {
lbry_merged(thr_id, pdata[LBC_NONCE_OFT32], throughput, d_resNonce[thr_id], AS_U64(&ptarget[6]));
} else {
lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]); lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]); lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]);
lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]); lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6]));
}else{
lbry_merged(thr_id,pdata[LBC_NONCE_OFT32], throughput, d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
} }
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
cudaMemcpy(resNonces, d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput; *hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;
cudaMemcpy(resNonces, d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
if (resNonces[0] != UINT32_MAX) if (resNonces[0] != UINT32_MAX)
{ {
uint32_t _ALIGN(A) vhash[8];
const uint32_t Htarg = ptarget[7];
const uint32_t startNonce = pdata[LBC_NONCE_OFT32]; const uint32_t startNonce = pdata[LBC_NONCE_OFT32];
resNonces[0] += startNonce; resNonces[0] += startNonce;
endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[0], !swap); endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[0], !swap);
lbry_hash(vhash, endiandata); lbry_hash(vhash, endiandata);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
work->nonces[0] = swab32_if(resNonces[0], swap); work->nonces[0] = swab32_if(resNonces[0], swap);
work_set_target_ratio(work, vhash); work_set_target_ratio(work, vhash);
work->valid_nonces = 1; work->valid_nonces = 1;
if (resNonces[1] != UINT32_MAX) { if (resNonces[1] != UINT32_MAX)
{
resNonces[1] += startNonce; resNonces[1] += startNonce;
gpulog(LOG_DEBUG, thr_id, "second nonce %08x", swab32(resNonces[1]));
endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[1], !swap); endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[1], !swap);
lbry_hash(vhash, endiandata); lbry_hash(vhash, endiandata);
work->nonces[1] = swab32_if(resNonces[1], swap); work->nonces[1] = swab32_if(resNonces[1], swap);
@ -179,19 +190,18 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
work->sharediff[1] = work->sharediff[0]; work->sharediff[1] = work->sharediff[0];
work->shareratio[1] = work->shareratio[0]; work->shareratio[1] = work->shareratio[0];
work_set_target_ratio(work, vhash); work_set_target_ratio(work, vhash);
work->valid_nonces++;
} else { } else {
bn_set_target_ratio(work, vhash, 1); bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
} }
work->valid_nonces++;
} }
pdata[LBC_NONCE_OFT32] = max(work->nonces[0], work->nonces[1]); // next scan start pdata[LBC_NONCE_OFT32] = max(work->nonces[0], work->nonces[1]); // next scan start
return work->valid_nonces; return work->valid_nonces;
}
} else if (vhash[7] > ptarget[7]) { else if (vhash[7] > Htarg) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU %08x > %08x!", resNonces[0], vhash[7], ptarget[7]); gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", resNonces[0]);
cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
} }
} }

Loading…
Cancel
Save