|
|
|
@ -1,8 +1,9 @@
@@ -1,8 +1,9 @@
|
|
|
|
|
/** |
|
|
|
|
* Lbry CUDA Implementation |
|
|
|
|
* Lbry Algo (sha-256 / sha-512 / ripemd) |
|
|
|
|
* |
|
|
|
|
* by tpruvot@github - July 2016 |
|
|
|
|
* tpruvot and Provos Alexis - Jul / Sep 2016 |
|
|
|
|
* |
|
|
|
|
* Sponsored by LBRY.IO team |
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
#include <string.h> |
|
|
|
@ -64,11 +65,14 @@ extern "C" void lbry_hash(void* output, const void* input)
@@ -64,11 +65,14 @@ extern "C" void lbry_hash(void* output, const void* input)
|
|
|
|
|
|
|
|
|
|
extern void lbry_sha256_init(int thr_id); |
|
|
|
|
extern void lbry_sha256_free(int thr_id); |
|
|
|
|
extern void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget); |
|
|
|
|
extern void lbry_sha256_setBlock_112(uint32_t *pdata); |
|
|
|
|
extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash); |
|
|
|
|
extern void lbry_sha512_init(int thr_id); |
|
|
|
|
extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash); |
|
|
|
|
extern int lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t *d_inputHash, uint32_t *d_resNonce); |
|
|
|
|
extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t *d_inputHash, uint32_t *d_resNonce, const uint64_t target64); |
|
|
|
|
|
|
|
|
|
extern void lbry_sha256_setBlock_112_merged(uint32_t *pdata); |
|
|
|
|
extern void lbry_merged(int thr_id,uint32_t startNonce, uint32_t threads, uint32_t *d_resNonce, const uint64_t target64); |
|
|
|
|
|
|
|
|
|
static __inline uint32_t swab32_if(uint32_t val, bool iftrue) { |
|
|
|
|
return iftrue ? swab32(val) : val; |
|
|
|
@ -97,7 +101,7 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
@@ -97,7 +101,7 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
|
|
|
|
|
if (device_sm[dev_id] < 350) intensity = 18; |
|
|
|
|
|
|
|
|
|
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); |
|
|
|
|
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); |
|
|
|
|
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); |
|
|
|
|
|
|
|
|
|
if (opt_benchmark) { |
|
|
|
|
ptarget[7] = 0xf; |
|
|
|
@ -107,17 +111,16 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
@@ -107,17 +111,16 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
|
|
|
|
|
cudaSetDevice(dev_id); |
|
|
|
|
if (opt_cudaschedule == -1 && gpu_threads == 1) { |
|
|
|
|
cudaDeviceReset(); |
|
|
|
|
// reduce cpu usage (linux) |
|
|
|
|
// reduce cpu usage |
|
|
|
|
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); |
|
|
|
|
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); |
|
|
|
|
CUDA_LOG_ERROR(); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); |
|
|
|
|
CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); |
|
|
|
|
if(device_sm[dev_id] <= 500) |
|
|
|
|
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 8 * sizeof(uint64_t) * throughput)); |
|
|
|
|
|
|
|
|
|
lbry_sha256_init(thr_id); |
|
|
|
|
lbry_sha512_init(thr_id); |
|
|
|
|
CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); |
|
|
|
|
CUDA_LOG_ERROR(); |
|
|
|
|
|
|
|
|
|
init[thr_id] = true; |
|
|
|
@ -127,29 +130,29 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
@@ -127,29 +130,29 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
|
|
|
|
|
be32enc(&endiandata[i], pdata[i]); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
lbry_sha256_setBlock_112(endiandata, ptarget); |
|
|
|
|
if(device_sm[dev_id] <= 500) |
|
|
|
|
lbry_sha256_setBlock_112(endiandata); |
|
|
|
|
else |
|
|
|
|
lbry_sha256_setBlock_112_merged(endiandata); |
|
|
|
|
|
|
|
|
|
cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); |
|
|
|
|
|
|
|
|
|
do { |
|
|
|
|
// Hash with CUDA |
|
|
|
|
lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]); |
|
|
|
|
lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]); |
|
|
|
|
|
|
|
|
|
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; |
|
|
|
|
int err = lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); |
|
|
|
|
if (err) { |
|
|
|
|
// reinit |
|
|
|
|
free_lbry(thr_id); |
|
|
|
|
return -1; |
|
|
|
|
if(device_sm[dev_id] <= 500){ |
|
|
|
|
lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]); |
|
|
|
|
lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]); |
|
|
|
|
lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]); |
|
|
|
|
}else{ |
|
|
|
|
lbry_merged(thr_id,pdata[LBC_NONCE_OFT32], throughput, d_resNonce[thr_id], *(uint64_t*)&ptarget[6]); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; |
|
|
|
|
cudaMemcpy(resNonces, d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); |
|
|
|
|
|
|
|
|
|
*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput; |
|
|
|
|
|
|
|
|
|
if (resNonces[0] != UINT32_MAX) |
|
|
|
|
{ |
|
|
|
|
const uint32_t startNonce = pdata[LBC_NONCE_OFT32]; |
|
|
|
|
|
|
|
|
|
resNonces[0] += startNonce; |
|
|
|
|
|
|
|
|
|
endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[0], !swap); |
|
|
|
@ -157,12 +160,13 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
@@ -157,12 +160,13 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
|
|
|
|
|
|
|
|
|
|
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { |
|
|
|
|
int res = 1; |
|
|
|
|
*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput; |
|
|
|
|
work->nonces[0] = swab32_if(resNonces[0], swap); |
|
|
|
|
work_set_target_ratio(work, vhash); |
|
|
|
|
if (resNonces[1] != UINT32_MAX) { |
|
|
|
|
resNonces[1] += startNonce; |
|
|
|
|
if (opt_debug) |
|
|
|
|
gpulog(LOG_BLUE, thr_id, "found second nonce %08x", resNonces[1]); |
|
|
|
|
gpulog(LOG_BLUE, thr_id, "Found second nonce %08x", swab32(resNonces[1])); |
|
|
|
|
endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[1], !swap); |
|
|
|
|
lbry_hash(vhash, endiandata); |
|
|
|
|
work->nonces[1] = swab32_if(resNonces[1], swap); |
|
|
|
@ -189,22 +193,23 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
@@ -189,22 +193,23 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
|
|
|
|
|
|
|
|
|
|
} while (!work_restart[thr_id].restart); |
|
|
|
|
|
|
|
|
|
*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + 1; |
|
|
|
|
*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce; |
|
|
|
|
|
|
|
|
|
return 0; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// cleanup |
|
|
|
|
extern "C" void free_lbry(int thr_id) |
|
|
|
|
void free_lbry(int thr_id) |
|
|
|
|
{ |
|
|
|
|
if (!init[thr_id]) |
|
|
|
|
return; |
|
|
|
|
|
|
|
|
|
cudaThreadSynchronize(); |
|
|
|
|
|
|
|
|
|
cudaFree(d_hash[thr_id]); |
|
|
|
|
if(device_sm[device_map[thr_id]]<=500) |
|
|
|
|
cudaFree(d_hash[thr_id]); |
|
|
|
|
|
|
|
|
|
cudaFree(d_resNonce[thr_id]); |
|
|
|
|
lbry_sha256_free(thr_id); |
|
|
|
|
|
|
|
|
|
init[thr_id] = false; |
|
|
|
|
|
|
|
|
|