
decred: multiple nonces code cleanup

The double loop is not useful; prefer the __thread attribute to improve code readability (removes the 2D host arrays).

squashed: return to the host 2D array so the buffer can still be freed.
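
For context, a minimal sketch of the two storage patterns the message contrasts; MAX_GPUS, the initializers and the free_decred_example name are assumptions, not code from this commit. A __thread pointer is per miner thread and reads more simply inside scanhash_decred(), while a plain array indexed by thr_id keeps the cudaMallocHost buffer reachable from a separate free routine, which is why the squashed fixup returns to it:

#include <stdint.h>
#include <cuda_runtime.h>

#define MAX_GPUS 16  // assumption: ccminer defines its own limit elsewhere

// Pattern A: per-thread pointer (__thread), no 2D array needed in scanhash
static __thread uint32_t *h_resNonce_tls = NULL;

// Pattern B: arrays indexed by the miner thread id, reachable from a
// separate free routine as well as from scanhash_decred()
static uint32_t *h_resNonce[MAX_GPUS] = { NULL };
static uint32_t *d_resNonce[MAX_GPUS] = { NULL };

// Sketch of the free routine that pattern B makes possible
// (hypothetical name; the real algo free function may differ)
extern "C" void free_decred_example(int thr_id)
{
	if (!h_resNonce[thr_id])
		return;
	cudaDeviceSynchronize();
	cudaFreeHost(h_resNonce[thr_id]); // pinned buffer from cudaMallocHost
	cudaFree(d_resNonce[thr_id]);
	h_resNonce[thr_id] = NULL;
	d_resNonce[thr_id] = NULL;
}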
Tanguy Pruvot, 8 years ago · branch master · commit a43205a84f
1 changed file: Algo256/decred.cu (74 lines changed)

@@ -1,11 +1,7 @@
 /**
- * Blake-256 Decred 180-Bytes input Cuda Kernel (Tested on SM 5/5.2/6.1)
+ * Blake-256 Decred 180-Bytes input Cuda Kernel
  *
- * Tanguy Pruvot - Feb 2016
- *
- * Merged 8-round blake (XVC) tweaks
- * Further improved by: ~2.72%
- * Alexis Provos - Jun 2016
+ * Tanguy Pruvot, Alexis Provos - Feb/Sep 2016
  */
 
 #include <stdint.h>
@@ -20,7 +16,7 @@ extern "C" {
 #define TPB 640
 
 /* max count of found nonces in one call (like sgminer) */
-#define maxResults 4
+#define MAX_RESULTS 4
 
 /* hash by cpu with blake 256 */
 extern "C" void decred_hash(void *output, const void *input)
@@ -378,60 +374,74 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1);
-		CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], MAX_RESULTS*sizeof(uint32_t)), -1);
+		CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], MAX_RESULTS*sizeof(uint32_t)), -1);
 		init[thr_id] = true;
 	}
 
 	memcpy(endiandata, pdata, 180);
 	decred_cpu_setBlock_52(endiandata);
 
-	h_resNonce[thr_id][0] = 1;
+	cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t));
+
 	do {
-		if (h_resNonce[thr_id][0])
-			cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t));
+		uint32_t* resNonces = h_resNonce[thr_id];
+
+		if (resNonces[0]) cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t));
 
 		// GPU HASH
 		decred_gpu_hash_nonce <<<grid, block>>> (throughput, (*pnonce), d_resNonce[thr_id], targetHigh);
 
-		cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-		if (h_resNonce[thr_id][0])
-		{
-			cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], (h_resNonce[thr_id][0]+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
-			for(uint32_t i=1; i <= h_resNonce[thr_id][0]; i++)
-			{
-				uint32_t _ALIGN(64) vhash[8];
-				be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][i]);
-				decred_hash(vhash, endiandata);
-				if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget))
-				{
-					int rc = 1;
-					work_set_target_ratio(work, vhash);
-					*hashes_done = (*pnonce) - first_nonce + throughput;
-					work->nonces[0] = swab32(h_resNonce[thr_id][i]);
-					*pnonce = work->nonces[0];
-					// search for another nonce
-					for(uint32_t j=i+1; j <= h_resNonce[thr_id][0]; j++)
-					{
-						be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][j]);
-						decred_hash(vhash, endiandata);
-						if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)) {
-							work->nonces[1] = swab32(h_resNonce[thr_id][j]);
-							if(!opt_quiet)
-								gpulog(LOG_NOTICE, thr_id, "second nonce found %u / %08x - %u / %08x", i, work->nonces[0], j, work->nonces[1]);
-							if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
-								work_set_target_ratio(work, vhash);
-								xchg(work->nonces[1], work->nonces[0]);
-							}
-							rc = 2;
-							break;
-						}
-					}
-					return rc;
-				} else {
-					gpulog(LOG_WARNING, thr_id, "result %u for %08x does not validate on CPU!", i, h_resNonce[thr_id][i]);
-				}
-			}
-		}
+		// first cell contains the valid nonces count
+		cudaMemcpy(resNonces, d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+		if (resNonces[0])
+		{
+			uint32_t _ALIGN(64) vhash[8];
+			cudaMemcpy(resNonces, d_resNonce[thr_id], (resNonces[0]+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+			be32enc(&endiandata[DCR_NONCE_OFT32], resNonces[1]);
+			decred_hash(vhash, endiandata);
+			if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget))
+			{
+				int rc = work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				*hashes_done = (*pnonce) - first_nonce + throughput;
+				work->nonces[0] = swab32(resNonces[1]);
+				// search for another nonce
+				for(uint32_t n=2; n <= resNonces[0]; n++)
+				{
+					be32enc(&endiandata[DCR_NONCE_OFT32], resNonces[n]);
+					decred_hash(vhash, endiandata);
+					if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)) {
+						work->nonces[1] = swab32(resNonces[n]);
+						if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
+							// we really want the best first ? depends...
+							work->shareratio[1] = work->shareratio[0];
+							work->sharediff[1] = work->sharediff[0];
+							xchg(work->nonces[1], work->nonces[0]);
+							work_set_target_ratio(work, vhash);
+							work->valid_nonces++;
+						} else if (work->valid_nonces == 1) {
+							bn_set_target_ratio(work, vhash, 1);
+							work->valid_nonces++;
+						}
+						rc = 2; // MAX_NONCES submit limited to 2
+						gpulog(LOG_DEBUG, thr_id, "multiple nonces 1:%08x (%g) %u:%08x (%g)",
+							work->nonces[0], work->sharediff[0], n, work->nonces[1], work->sharediff[1]);
+					} else if (vhash[6] > ptarget[6]) {
+						gpulog(LOG_WARNING, thr_id, "result %u for %08x does not validate on CPU!", n, resNonces[n]);
+					}
+				}
+				*pnonce = work->nonces[0];
+				return rc;
+			} else if (vhash[6] > ptarget[6]) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", resNonces[1]);
+			}
+		}
 
 		*pnonce += throughput;
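
The scanhash loop above relies on a small convention for the result buffer: cell 0 holds the count of candidate nonces written by the kernel, cells 1..MAX_RESULTS-1 hold the nonces themselves, so the host copies one word first and fetches the rest only when something was found. The decred kernel is not shown in this hunk; the following is only a sketch of how such a buffer is typically filled and drained (the kernel body, the clamping and the drain_results helper are assumptions, not code from decred.cu):

#include <stdint.h>
#include <cuda_runtime.h>

#define MAX_RESULTS 4

// Hypothetical device side: every thread whose hash meets the target
// reserves a slot with atomicAdd on resNonce[0] and stores its nonce.
__global__ void result_buffer_sketch(uint32_t threads, uint32_t startNonce,
                                     uint32_t *resNonce, uint64_t targetHigh)
{
	uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread >= threads) return;

	uint32_t nonce = startNonce + thread;
	uint64_t high = (uint64_t)nonce; // stand-in for the real blake-256 high word
	if (high <= targetHigh) {
		uint32_t slot = atomicAdd(resNonce, 1) + 1; // cell 0 is the counter
		if (slot < MAX_RESULTS)
			resNonce[slot] = nonce;
	}
}

// Host side: read the counter first, then only the slots actually used.
static void drain_results(uint32_t *d_resNonce, uint32_t *resNonces)
{
	cudaMemcpy(resNonces, d_resNonce, sizeof(uint32_t), cudaMemcpyDeviceToHost);
	if (resNonces[0]) {
		uint32_t count = resNonces[0];
		if (count > MAX_RESULTS - 1)
			count = MAX_RESULTS - 1; // the counter can exceed the slot count
		cudaMemcpy(resNonces, d_resNonce, (count + 1) * sizeof(uint32_t),
		           cudaMemcpyDeviceToHost);
		// resNonces[1..count] are the candidates to re-hash on the CPU
	}
}

In the real loop every reported candidate is re-hashed on the CPU, but at most two nonces are kept for submission (rc = 2, with the better-ratio one swapped into work->nonces[0]).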
