ccminer-gostd-lite/cuda_checkhash.cu
Tanguy Pruvot e50556b637 various changes, cleanup for the release
small fixes to handle better the multi thread per gpu

explicitly report than quark is not compatible with SM 2.1 (compact shuffle)
2015-11-04 14:59:59 +01:00

299 lines
7.7 KiB
Plaintext

/**
* This code compares final hash against target
*/
#include <stdio.h>
#include <memory.h>
#include "miner.h"
#include "cuda_helper.h"
__constant__ uint32_t pTarget[8]; // 32 bytes
// store MAX_GPUS device arrays of 8 nonces
static uint32_t* h_resNonces[MAX_GPUS] = { NULL };
static uint32_t* d_resNonces[MAX_GPUS] = { NULL };
static __thread bool init_done = false;
__host__
void cuda_check_cpu_init(int thr_id, uint32_t threads)
{
CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 32));
CUDA_SAFE_CALL(cudaMallocHost(&h_resNonces[thr_id], 32));
init_done = true;
}
__host__
void cuda_check_cpu_free(int thr_id)
{
if (!init_done) return;
cudaFree(d_resNonces[thr_id]);
cudaFreeHost(h_resNonces[thr_id]);
d_resNonces[thr_id] = NULL;
h_resNonces[thr_id] = NULL;
init_done = false;
}
// Target Difficulty
__host__
void cuda_check_cpu_setTarget(const void *ptarget)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice));
}
/* --------------------------------------------------------------------------------------------- */
__device__ __forceinline__
static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target)
{
if (hash[7] > target[7])
return false;
if (hash[7] < target[7])
return true;
if (hash[6] > target[6])
return false;
if (hash[6] < target[6])
return true;
if (hash[5] > target[5])
return false;
if (hash[5] < target[5])
return true;
if (hash[4] > target[4])
return false;
if (hash[4] < target[4])
return true;
if (hash[3] > target[3])
return false;
if (hash[3] < target[3])
return true;
if (hash[2] > target[2])
return false;
if (hash[2] < target[2])
return true;
if (hash[1] > target[1])
return false;
if (hash[1] < target[1])
return true;
if (hash[0] > target[0])
return false;
return true;
}
__global__ __launch_bounds__(512, 4)
void cuda_checkhash_64(uint32_t threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// shl 4 = *16 x 4 (uint32) = 64 bytes
// todo: use only 32 bytes * threads if possible
uint32_t *inpHash = &hash[thread << 4];
if (resNonces[0] == UINT32_MAX) {
if (hashbelowtarget(inpHash, pTarget))
resNonces[0] = (startNounce + thread);
}
}
}
__global__ __launch_bounds__(512, 4)
void cuda_checkhash_32(uint32_t threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t *inpHash = &hash[thread << 3];
if (resNonces[0] == UINT32_MAX) {
if (hashbelowtarget(inpHash, pTarget))
resNonces[0] = (startNounce + thread);
}
}
}
__host__
uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash)
{
cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
const uint32_t threadsperblock = 512;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
if (bench_algo >= 0) // dont interrupt the global benchmark
return UINT32_MAX;
if (!init_done) {
applog(LOG_ERR, "missing call to cuda_check_cpu_init");
return UINT32_MAX;
}
cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]);
cudaThreadSynchronize();
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
return h_resNonces[thr_id][0];
}
__host__
uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash)
{
cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
const uint32_t threadsperblock = 512;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
if (bench_algo >= 0) // dont interrupt the global benchmark
return UINT32_MAX;
if (!init_done) {
applog(LOG_ERR, "missing call to cuda_check_cpu_init");
return UINT32_MAX;
}
cuda_checkhash_32 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]);
cudaThreadSynchronize();
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
return h_resNonces[thr_id][0];
}
/* --------------------------------------------------------------------------------------------- */
__global__ __launch_bounds__(512, 4)
void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t *inpHash = &hash[thread << 4];
if (hashbelowtarget(inpHash, pTarget)) {
int resNum = ++resNonces[0];
__threadfence();
if (resNum < 8)
resNonces[resNum] = (startNounce + thread);
}
}
__host__
uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce)
{
uint32_t rescnt, result = 0;
const uint32_t threadsperblock = 512;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
if (!init_done) {
applog(LOG_ERR, "missing call to cuda_check_cpu_init");
return 0;
}
// first element stores the count of found nonces
cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t));
cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
cudaThreadSynchronize();
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 32, cudaMemcpyDeviceToHost);
rescnt = h_resNonces[thr_id][0];
if (rescnt > numNonce) {
if (numNonce <= rescnt) {
result = h_resNonces[thr_id][numNonce+1];
}
if (opt_debug)
applog(LOG_WARNING, "Found %d nonces: %x + %x", rescnt, h_resNonces[thr_id][1], result);
}
return result;
}
/* --------------------------------------------------------------------------------------------- */
__global__
void cuda_check_hash_branch_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = g_nonceVector[thread];
uint32_t hashPosition = (nounce - startNounce) << 4;
uint32_t *inpHash = &g_hash[hashPosition];
for (int i = 7; i >= 0; i--) {
if (inpHash[i] > pTarget[i]) {
return;
}
if (inpHash[i] < pTarget[i]) {
break;
}
}
if (resNounce[0] > nounce)
resNounce[0] = nounce;
}
}
__host__
uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{
const uint32_t threadsperblock = 256;
uint32_t result = UINT32_MAX;
if (bench_algo >= 0) // dont interrupt the global benchmark
return result;
if (!init_done) {
applog(LOG_ERR, "missing call to cuda_check_cpu_init");
return result;
}
cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]);
MyStreamSynchronize(NULL, order, thr_id);
cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
result = *h_resNonces[thr_id];
return result;
}
/* Function to get the compiled Shader Model version */
int cuda_arch[MAX_GPUS] = { 0 };
__global__ void nvcc_get_arch(int *d_version)
{
*d_version = 0;
#ifdef __CUDA_ARCH__
*d_version = __CUDA_ARCH__;
#endif
}
__host__
int cuda_get_arch(int thr_id)
{
int *d_version;
int dev_id = device_map[thr_id];
if (cuda_arch[dev_id] == 0) {
// only do it once...
cudaMalloc(&d_version, sizeof(int));
nvcc_get_arch <<< 1, 1 >>> (d_version);
cudaMemcpy(&cuda_arch[dev_id], d_version, sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(d_version);
}
return cuda_arch[dev_id];
}