Prepare support for multiple nonces found in one loop
Tested on x11, which sometimes finds 3 nonces in one call. Currently the extra ones are ignored because only the biggest is kept. This commit doesn't fix that, but it will allow the share rate to be improved later.
This commit is contained in:
parent
a186a4df8b
commit
f387898ead
@ -4,18 +4,21 @@
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include "miner.h"
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
__constant__ uint32_t pTarget[8];
|
||||
__constant__ uint32_t pTarget[8]; // 32 bytes
|
||||
|
||||
static uint32_t *d_resNounce[8];
|
||||
static uint32_t *h_resNounce[8];
|
||||
// store 8 device arrays of 8 nonces
|
||||
static uint32_t* h_resNonces[8];
|
||||
static uint32_t* d_resNonces[8];
|
||||
|
||||
__host__
|
||||
void cuda_check_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)));
|
||||
CUDA_CALL_OR_RET(cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)));
|
||||
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t)));
|
||||
CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 8*sizeof(uint32_t)));
|
||||
}
|
||||
|
||||
// Target Difficulty
|
||||
@ -68,17 +71,18 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(512, 4)
|
||||
void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNounce)
|
||||
void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
|
||||
{
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
// shl 4 = *16 x 4 (uint32) = 64 bytes
|
||||
// todo: use only 32 bytes * threads if possible
|
||||
uint32_t *inpHash = &hash[thread << 4];
|
||||
|
||||
if (hashbelowtarget(inpHash, pTarget)) {
|
||||
uint32_t nounce = (startNounce + thread);
|
||||
resNounce[0] = nounce;
|
||||
if (resNonces[0] == UINT32_MAX) {
|
||||
if (hashbelowtarget(inpHash, pTarget))
|
||||
resNonces[0] = (startNounce + thread);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -86,20 +90,61 @@ void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32
|
||||
__host__
|
||||
uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash)
|
||||
{
|
||||
uint32_t result = 0xffffffff;
|
||||
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
|
||||
cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
|
||||
|
||||
const int threadsperblock = 512;
|
||||
|
||||
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
|
||||
|
||||
cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
result = *h_resNounce[thr_id];
|
||||
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
return h_resNonces[thr_id][0];
|
||||
}
|
||||
|
||||
/* --------------------------------------------------------------------------------------------- */
|
||||
|
||||
/**
 * Supplementary scan: record every nonce whose hash is below pTarget.
 *
 * Layout of resNonces (8 x uint32):
 *   [0]     number of threads whose hash passed the target test
 *           (the host wrapper zeroes it before the launch)
 *   [1..7]  the nonces of the first seven threads that claimed a slot;
 *           slots are filled in arrival order, not in nonce order.
 * The counter keeps growing past 7, so [0] can be larger than the number
 * of nonces actually stored.
 *
 * @param startNounce  nonce corresponding to thread 0
 * @param hash         device hashes, 16 x uint32 (64 bytes) per thread
 * @param resNonces    device result buffer described above
 *
 * NOTE(review): there is no `thread < threads` guard here (the signature
 * carries no thread count), so the launch grid must not cover more threads
 * than `hash` has entries - confirm that callers always pass a thread count
 * that is a multiple of the block size.
 */
__global__ __launch_bounds__(512, 4)
void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);

	// shl 4 = 16 x uint32 per thread
	uint32_t *inpHash = &hash[thread << 4];

	if (hashbelowtarget(inpHash, pTarget)) {
		// Reserve a slot atomically. A plain `++resNonces[0]` is a racy
		// read-modify-write: two threads could obtain the same index and
		// one nonce would silently overwrite the other.
		uint32_t resNum = atomicAdd(&resNonces[0], 1u) + 1;
		if (resNum < 8)
			resNonces[resNum] = (startNounce + thread);
	}
}
|
||||
|
||||
/**
 * Re-scan the hashes of the last batch and return one of the additional
 * nonces that are below the target.
 *
 * Launches cuda_checkhash_64_suppl, which stores the match count in
 * word 0 of the per-thread result buffer and up to 7 nonces in words 1..7.
 *
 * @param thr_id       index selecting the h_resNonces / d_resNonces buffers
 * @param threads      number of hashes to check (used to size the grid)
 * @param startNounce  nonce corresponding to the first hash
 * @param d_inputHash  device buffer holding the hashes
 * @param numNonce     zero-based index of the stored nonce to return
 *                     (0 = first stored, 1 = second stored, ...)
 * @return the requested nonce, or 0 when fewer than numNonce+1 nonces were
 *         stored or when a CUDA call failed.
 */
__host__
uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce)
{
	// The buffer holds 8 words: word 0 is the counter, words 1..7 are nonces.
	const uint32_t maxStored = 7;
	uint32_t rescnt, result = 0;
	cudaError_t err;

	const int threadsperblock = 512;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	// first element stores the count of found nonces
	err = cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t));

	if (err == cudaSuccess) {
		cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
		// launch-configuration errors are only visible through cudaGetLastError()
		err = cudaGetLastError();
	}
	if (err == cudaSuccess)
		err = cudaDeviceSynchronize(); // replaces the deprecated cudaThreadSynchronize()
	if (err == cudaSuccess)
		err = cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);

	if (err != cudaSuccess) {
		// Do not interpret the host buffer: it may still hold stale data.
		applog(LOG_WARNING, "cuda_check_hash_suppl: %s", cudaGetErrorString(err));
		return 0;
	}

	rescnt = h_resNonces[thr_id][0];
	if (rescnt > numNonce) {
		// The kernel counts every match but only stores the first 7, so
		// clamp before indexing: numNonce+1 must stay inside words 1..7.
		const uint32_t stored = (rescnt < maxStored) ? rescnt : maxStored;
		if (numNonce < stored) {
			result = h_resNonces[thr_id][numNonce+1];
		}
		if (opt_debug)
			applog(LOG_WARNING, "Found %u nonces: %x + %x", rescnt, h_resNonces[thr_id][1], result);
	}

	return result;
}
|
||||
@ -115,11 +160,6 @@ void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_no
|
||||
uint32_t nounce = g_nonceVector[thread];
|
||||
uint32_t hashPosition = (nounce - startNounce) << 4;
|
||||
uint32_t *inpHash = &g_hash[hashPosition];
|
||||
//uint32_t hash[8];
|
||||
|
||||
//#pragma unroll 8
|
||||
//for (int i=0; i < 8; i++)
|
||||
// hash[i] = inpHash[i];
|
||||
|
||||
for (int i = 7; i >= 0; i--) {
|
||||
if (inpHash[i] > pTarget[i]) {
|
||||
@ -138,21 +178,21 @@ __host__
|
||||
uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
|
||||
{
|
||||
uint32_t result = 0xffffffff;
|
||||
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
|
||||
cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
|
||||
|
||||
const int threadsperblock = 256;
|
||||
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
|
||||
cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]);
|
||||
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
|
||||
cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
|
||||
cudaThreadSynchronize();
|
||||
result = *h_resNounce[thr_id];
|
||||
result = *h_resNonces[thr_id];
|
||||
|
||||
return result;
|
||||
}
|
@ -20,6 +20,7 @@ extern "C" long device_sm[8];
|
||||
extern void cuda_check_cpu_init(int thr_id, int threads);
|
||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||
extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash);
|
||||
extern uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce);
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
|
||||
|
||||
|
@ -155,7 +155,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
||||
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
|
||||
return 0;
|
||||
}
|
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
|
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ?
|
||||
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
|
||||
@ -195,9 +195,12 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
||||
be32enc(&endiandata[19], foundNonce);
|
||||
x11hash(vhash64, endiandata);
|
||||
|
||||
if ((vhash64[7] <= Htarg) && fulltest(vhash64, ptarget)) {
|
||||
/* uint32_t secNonce = */ cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
|
||||
|
||||
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
|
||||
// just check if there was some other ones...
|
||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||
pdata[19] = foundNonce;
|
||||
*hashes_done = foundNonce - first_nonce + 1;
|
||||
return 1;
|
||||
}
|
||||
else if (vhash64[7] > Htarg) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user