Browse Source

Prepare support for multiple nonces found in one loop

Tested on x11, which sometimes finds 3 nonces in one call;
currently the extra ones are ignored because only the biggest is kept...

This commit doesn't fix that, but it will allow improving the share rate later...
master
Tanguy Pruvot 10 years ago
parent
commit
f387898ead
  1. 86
      cuda_checkhash.cu
  2. 1
      cuda_helper.h
  3. 9
      x11/x11.cu

86
cuda_checkhash.cu

@ -4,18 +4,21 @@ @@ -4,18 +4,21 @@
#include <stdio.h>
#include <memory.h>
#include "miner.h"
#include "cuda_helper.h"
// Target the candidate hashes are compared against (8 x uint32_t = 32 bytes).
// Defined exactly once: a second definition of the same symbol is a
// redefinition error.
__constant__ uint32_t pTarget[8];

// Single-nonce result buffers: one uint32_t per entry, indexed by thr_id.
static uint32_t *d_resNounce[8];
static uint32_t *h_resNounce[8];

// Multi-nonce result buffers: up to 8 thr_id entries, each pointing at an
// array of 8 uint32_t (host copy and device copy).
static uint32_t* h_resNonces[8];
static uint32_t* d_resNonces[8];
// Allocate the result buffers used by the hash-check functions for one mining
// thread.
//
// thr_id  : index into the per-thread buffer tables; those tables have 8
//           entries, so thr_id must be in [0, 8).
// threads : not used by this function.
//
// Each allocation goes through CUDA_CALL_OR_RET; what happens on failure is
// defined by that macro (not visible here).
__host__
void cuda_check_cpu_init(int thr_id, int threads)
{
// single uint32_t result slot: host copy (cudaMallocHost) + device copy
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)));
CUDA_CALL_OR_RET(cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)));
// 8 x uint32_t result array: host copy (cudaMallocHost) + device copy
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t)));
CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 8*sizeof(uint32_t)));
}
// Target Difficulty
@ -68,17 +71,18 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3 @@ -68,17 +71,18 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3
}
/**
 * Compare `threads` hashes against pTarget and publish one matching nonce.
 *
 * Launch layout: 1D grid of 1D blocks (at most 512 threads per block, see
 * __launch_bounds__); one thread per hash. Threads with an index >= `threads`
 * do nothing, so the grid may be rounded up.
 *
 * @param threads      number of hashes to test
 * @param startNounce  nonce corresponding to hash index 0
 * @param hash         input hashes, 16 uint32_t (64 bytes) per thread
 * @param resNonces    result buffer; resNonces[0] must be preset to UINT32_MAX
 *                     by the caller and receives the nonce of one hash that
 *                     passes hashbelowtarget(), if any
 */
__global__ __launch_bounds__(512, 4)
void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// shl 4 = *16 x 4 (uint32) = 64 bytes
		// todo: use only 32 bytes * threads if possible
		uint32_t *inpHash = &hash[thread << 4];

		// Cheap early-out: skip the comparison once a nonce has been published.
		if (resNonces[0] == UINT32_MAX) {
			if (hashbelowtarget(inpHash, pTarget)) {
				// Several threads can pass the check above at the same time;
				// the compare-and-swap lets exactly one of them store its
				// nonce instead of racing on a plain store.
				atomicCAS(&resNonces[0], UINT32_MAX, startNounce + thread);
			}
		}
	}
}
@ -86,20 +90,61 @@ void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32 @@ -86,20 +90,61 @@ void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32
/**
 * Run cuda_checkhash_64 over `threads` hashes and return the nonce it found.
 *
 * @param thr_id       index of the per-thread result buffers
 * @param threads      number of hashes in d_inputHash
 * @param startNounce  nonce corresponding to the first hash
 * @param d_inputHash  device buffer of hashes (64 bytes each)
 * @return the nonce written by the kernel, or 0xffffffff when no thread
 *         wrote a result
 */
__host__
uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash)
{
	// 0xff in every byte => slot 0 reads back as 0xffffffff ("nothing found")
	cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));

	const int threadsperblock = 512;

	// round the grid up so every hash gets a thread
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	// Single launch into the multi-nonce buffer; the extra launch into the
	// single-nonce buffer was dropped because its result was never read.
	cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]);
	cudaThreadSynchronize();

	// only slot 0 is meaningful for this kernel
	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
	return h_resNonces[thr_id][0];
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Collect every nonce whose hash passes hashbelowtarget() against pTarget.
 *
 * Launch layout: 1D grid of 1D blocks (at most 512 threads per block, see
 * __launch_bounds__); one thread per hash.
 *
 * Result layout: resNonces[0] is the number of matching hashes (the caller
 * must zero it before the launch); resNonces[1..7] receive the first seven
 * matching nonces in arrival order. Further matches are counted but not
 * stored.
 *
 * NOTE(review): there is no `thread < threads` guard because the kernel does
 * not receive the hash count. Every launched thread reads hash[thread*16], so
 * the launch must not cover more threads than the hash buffer holds.
 *
 * @param startNounce  nonce corresponding to hash index 0
 * @param hash         input hashes, 16 uint32_t (64 bytes) per thread
 * @param resNonces    device array of 8 uint32_t laid out as described above
 */
__global__ __launch_bounds__(512, 4)
void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);

	uint32_t *inpHash = &hash[thread << 4];

	if (hashbelowtarget(inpHash, pTarget)) {
		// Reserve a unique 1-based slot. A plain ++resNonces[0] is a
		// non-atomic read-modify-write: concurrent matches could lose
		// counts or be handed the same slot.
		uint32_t resNum = atomicAdd(&resNonces[0], 1u) + 1;
		if (resNum < 8)
			resNonces[resNum] = (startNounce + thread);
	}
}
/**
 * Re-scan the hashes with cuda_checkhash_64_suppl and return an additional
 * nonce, if one exists.
 *
 * @param thr_id       index of the per-thread result buffers
 * @param threads      number of hashes in d_inputHash
 * @param startNounce  nonce corresponding to the first hash
 * @param d_inputHash  device buffer of hashes (64 bytes each)
 * @param numNonce     number of nonces the caller already has; the nonce
 *                     stored in result slot numNonce+1 is returned
 * @return the nonce in slot numNonce+1 when more than numNonce matches were
 *         counted and that slot exists (slots 1..7), otherwise 0
 */
__host__
uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce)
{
	uint32_t rescnt, result = 0;

	const int threadsperblock = 512;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	// first element stores the count of found nonces
	cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t));

	cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
	cudaThreadSynchronize();

	// The result comes from the multi-nonce buffer only. The single-nonce
	// buffer is not written by this kernel, so seeding `result` from it
	// would return stale data whenever no extra nonce is found.
	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);
	rescnt = h_resNonces[thr_id][0];
	if (rescnt > numNonce) {
		// The count can exceed the 7 stored nonces (slots 1..7), so make
		// sure the requested slot lies inside the 8-element array.
		if (numNonce + 1 < 8) {
			result = h_resNonces[thr_id][numNonce+1];
		}
		if (opt_debug)
			applog(LOG_WARNING, "Found %d nonces: %x + %x", rescnt, h_resNonces[thr_id][1], result);
	}

	return result;
}
@ -115,11 +160,6 @@ void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_no @@ -115,11 +160,6 @@ void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_no
uint32_t nounce = g_nonceVector[thread];
uint32_t hashPosition = (nounce - startNounce) << 4;
uint32_t *inpHash = &g_hash[hashPosition];
//uint32_t hash[8];
//#pragma unroll 8
//for (int i=0; i < 8; i++)
// hash[i] = inpHash[i];
for (int i = 7; i >= 0; i--) {
if (inpHash[i] > pTarget[i]) {
@ -138,21 +178,21 @@ __host__ @@ -138,21 +178,21 @@ __host__
/**
 * Run cuda_check_hash_branch_64 over the nonces listed in d_nonceVector and
 * return the value it leaves in result slot 0.
 *
 * @param thr_id         index of the per-thread result buffers
 * @param threads        number of entries in d_nonceVector
 * @param startNounce    base nonce, forwarded to the kernel
 * @param d_nonceVector  device array of candidate nonces
 * @param d_inputHash    device buffer of hashes
 * @param order          forwarded to MyStreamSynchronize
 * @return the value read back from result slot 0; 0xffffffff when the kernel
 *         wrote nothing
 */
uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{
	uint32_t result = 0xffffffff;

	// 0xff in every byte => slot 0 reads back as 0xffffffff ("nothing found")
	cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));

	const int threadsperblock = 256;

	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);

	// One launch and one read-back through the multi-nonce buffer. The
	// duplicate pass through the single-nonce buffer was dropped because its
	// value was overwritten before being returned.
	cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]);

	MyStreamSynchronize(NULL, order, thr_id);

	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);

	cudaThreadSynchronize();
	result = *h_resNonces[thr_id];

	return result;
}

1
cuda_helper.h

@ -20,6 +20,7 @@ extern "C" long device_sm[8]; @@ -20,6 +20,7 @@ extern "C" long device_sm[8];
// Allocates the host/device result buffers for mining thread thr_id.
extern void cuda_check_cpu_init(int thr_id, int threads);
// Presumably uploads the target used by the hash checks (definition not shown here).
extern void cuda_check_cpu_setTarget(const void *ptarget);
// Checks `threads` hashes against the target; returns a matching nonce, or 0xffffffff if none was written.
extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash);
// Re-checks the same hashes, counting all matches; yields an additional nonce when more than numNonce were found.
extern uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce);
// Stream synchronization helper; `situation` and `thr_id` semantics are defined by its implementation (not shown here).
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Presumably reports a CUDA error raised in `func` for thread thr_id — confirm against the definition.
extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);

9
x11/x11.cu

@ -155,7 +155,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, @@ -155,7 +155,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
return 0;
}
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ?
cuda_check_cpu_init(thr_id, throughput);
@ -195,9 +195,12 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, @@ -195,9 +195,12 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
be32enc(&endiandata[19], foundNonce);
x11hash(vhash64, endiandata);
if ((vhash64[7] <= Htarg) && fulltest(vhash64, ptarget)) {
/* uint32_t secNonce = */ cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
// just check if there was some other ones...
*hashes_done = pdata[19] - first_nonce + throughput;
pdata[19] = foundNonce;
*hashes_done = foundNonce - first_nonce + 1;
return 1;
}
else if (vhash64[7] > Htarg) {

Loading…
Cancel
Save