Browse Source

checkhash: some work on a faster variant (wip)

This should not be used for all algos... not enabled yet

todo: support multiple nonces, or a blake32-style checkup
2upstream
Tanguy Pruvot 10 years ago
parent
commit
11dbbcc12d
  1. 60
      cuda_checkhash.cu
  2. 1
      cuda_helper.h
  3. 1
      x11/x11.cu

60
cuda_checkhash.cu

@ -18,9 +18,8 @@ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonce
// bestimme den aktuellen Zähler // bestimme den aktuellen Zähler
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce; uint32_t hashPosition = (nounce - startNounce) << 4;
uint32_t *inpHash = &g_hash[hashPosition<<4]; uint32_t *inpHash = &g_hash[hashPosition];
uint32_t hash[8]; uint32_t hash[8];
#pragma unroll 8 #pragma unroll 8
@ -31,11 +30,10 @@ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonce
if (hash[i] > pTarget[i]) { if (hash[i] > pTarget[i]) {
return; return;
} }
if (hash[i] < pTarget[i]) { if (hash[i] <= pTarget[i]) {
break; break;
} }
} }
if (resNounce[0] > nounce) if (resNounce[0] > nounce)
resNounce[0] = nounce; resNounce[0] = nounce;
} }
@ -53,8 +51,7 @@ void cuda_check_cpu_init(int thr_id, int threads)
__host__ __host__
void cuda_check_cpu_setTarget(const void *ptarget) void cuda_check_cpu_setTarget(const void *ptarget)
{ {
// die Message zur Berechnung auf der GPU CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
} }
__host__ __host__
@ -65,14 +62,10 @@ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, u
const int threadsperblock = 256; const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs cuda_check_gpu_hash_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
size_t shared_size = 0;
cuda_check_gpu_hash_64 <<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
@ -86,3 +79,46 @@ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, u
return result; return result;
} }
__global__
void cuda_check_gpu_hash_fast(int threads, uint32_t startNounce, uint32_t *hashEnd, uint32_t *resNounce)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		/* Fast check: compare only the two most significant dwords of the
		 * hash against the target -- ok for most algos, but approximate:
		 * a candidate passing here may still exceed the full 256-bit
		 * target, so the caller must re-validate on the CPU. */
		int hashPos = thread << 4; // 16 dwords (64 bytes) per hash
		uint32_t *inpHash = &hashEnd[hashPos];
		/* Lexicographic compare of the top two words: accept when word 7 is
		 * strictly below the target, or equal with word 6 not above it.
		 * (Testing word 6 unconditionally would reject hashes that are
		 * already below the target in word 7 alone.) */
		if (inpHash[7] < pTarget[7] ||
		    (inpHash[7] == pTarget[7] && inpHash[6] <= pTarget[6])) {
			uint32_t nounce = (startNounce + thread);
			/* atomicMin: several threads may find a candidate in the same
			 * launch; a plain read-then-write here would race. */
			atomicMin(&resNounce[0], nounce);
		}
	}
}
__host__
/**
 * Fast host-side hash check: launches cuda_check_gpu_hash_fast over
 * `threads` hashes starting at nonce `startNounce` and returns the lowest
 * candidate nonce found, or 0xffffffff when none matched.
 * d_inputHash points to 16 dwords (64 bytes) per thread in device memory.
 * Runs on the default stream; `order` is currently unused (the
 * MyStreamSynchronize call is disabled for this variant).
 */
uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
{
	uint32_t result = 0xffffffff;

	/* 0xff bytes -> resNounce starts at UINT32_MAX ("nothing found") */
	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));

	const int threadsperblock = 256;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	cuda_check_gpu_hash_fast <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);

	// MyStreamSynchronize(NULL, order, thr_id);
	cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated

	/* Blocking cudaMemcpy on the default stream: it has fully synchronized
	 * with the device by the time it returns, so the extra sync below the
	 * copy is not needed. */
	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);

	result = *h_resNounce[thr_id];
	return result;
}

1
cuda_helper.h

@ -20,6 +20,7 @@ extern int device_sm[8];
extern void cuda_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget); extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order);
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
extern __device__ __device_builtin__ void __syncthreads(void); extern __device__ __device_builtin__ void __syncthreads(void);

1
x11/x11.cu

@ -191,6 +191,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU // Scan nach Gewinner Hashes auf der GPU
//foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff) if (foundNonce != 0xffffffff)
{ {

Loading…
Cancel
Save