@ -4,18 +4,21 @@
@@ -4,18 +4,21 @@
#include <stdio.h>
#include <memory.h>
#include "miner.h"
#include "cuda_helper.h"
__constant__ uint32_t pTarget[8];
__constant__ uint32_t pTarget[8]; // 32 bytes
static uint32_t *d_resNounce[8];
static uint32_t *h_resNounce[8];
// store 8 device arrays of 8 nonces
static uint32_t* h_resNonces[8];
static uint32_t* d_resNonces[8];
__host__
void cuda_check_cpu_init(int thr_id, int threads)
{
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNounce[thr_id], 1 *sizeof(uint32_t)));
CUDA_CALL_OR_RET(cudaMalloc(&d_resNounce[thr_id], 1 *sizeof(uint32_t)));
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 8 *sizeof(uint32_t)));
CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 8 *sizeof(uint32_t)));
}
// Target Difficulty
@ -68,17 +71,18 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3
@@ -68,17 +71,18 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3
}
__global__ __launch_bounds__(512, 4)
void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNou nce)
void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces )
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// shl 4 = *16 x 4 (uint32) = 64 bytes
// todo: use only 32 bytes * threads if possible
uint32_t *inpHash = &hash[thread << 4];
if (hashbelowtarget(inpHash, pTarget) ) {
uint32_t nounce = (startNounce + thread);
resNounce[0] = nounce ;
if (resNonces[0] == UINT32_MAX ) {
if (hashbelowtarget(inpHash, pTarget))
resNonces[0] = (startNounce + thread) ;
}
}
}
@ -86,20 +90,61 @@ void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32
@@ -86,20 +90,61 @@ void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32
__host__
uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash)
{
uint32_t result = 0xffffffff;
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
const int threadsperblock = 512;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]);
cudaThreadSynchronize();
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
return h_resNonces[thr_id][0];
}
/* --------------------------------------------------------------------------------------------- */
__global__ __launch_bounds__(512, 4)
void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t *inpHash = &hash[thread << 4];
if (hashbelowtarget(inpHash, pTarget)) {
int resNum = ++resNonces[0];
__threadfence();
if (resNum < 8)
resNonces[resNum] = (startNounce + thread);
}
}
__host__
uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce)
{
uint32_t rescnt, result = 0;
const int threadsperblock = 512;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
// first element stores the count of found nonces
cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t));
cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
cudaThreadSynchronize();
cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
result = *h_resNounce[thr_id];
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);
rescnt = h_resNonces[thr_id][0];
if (rescnt > numNonce) {
if (numNonce <= rescnt) {
result = h_resNonces[thr_id][numNonce+1];
}
if (opt_debug)
applog(LOG_WARNING, "Found %d nonces: %x + %x", rescnt, h_resNonces[thr_id][1], result);
}
return result;
}
@ -115,11 +160,6 @@ void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_no
@@ -115,11 +160,6 @@ void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_no
uint32_t nounce = g_nonceVector[thread];
uint32_t hashPosition = (nounce - startNounce) << 4;
uint32_t *inpHash = &g_hash[hashPosition];
//uint32_t hash[8];
//#pragma unroll 8
//for (int i=0; i < 8; i++)
// hash[i] = inpHash[i];
for (int i = 7; i >= 0; i--) {
if (inpHash[i] > pTarget[i]) {
@ -138,21 +178,21 @@ __host__
@@ -138,21 +178,21 @@ __host__
uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{
uint32_t result = 0xffffffff;
cudaMemset(d_resNou nce[thr_id], 0xff, sizeof(uint32_t));
cudaMemset(d_resNonces [thr_id], 0xff, sizeof(uint32_t));
const int threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNou nce[thr_id]);
cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces [thr_id]);
MyStreamSynchronize(NULL, order, thr_id);
cudaMemcpy(h_resNou nce[thr_id], d_resNou nce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaMemcpy(h_resNonces [thr_id], d_resNonces [thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
result = *h_resNou nce[thr_id];
result = *h_resNonces [thr_id];
return result;
}