Browse Source

checkhash: simplify the common function

Use Klaus's trivial function; the old code has always been a bit weird.

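Split cuda_check_cpu_hash_64 into two functions: cuda_check_hash for the common case, and cuda_check_hash_branch, which keeps the old nonce-vector logic for the branched algorithms.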
2upstream
Tanguy Pruvot 10 years ago
parent
commit
118a6be361
  1. 5
      JHA/jackpotcoin.cu
  2. 150
      cuda_checkhash.cu
  3. 3
      cuda_helper.h
  4. 4
      cuda_nist5.cu
  5. 6
      quark/animecoin.cu
  6. 4
      quark/quarkcoin.cu
  7. 3
      qubit/deep.cu
  8. 2
      qubit/qubit.cu
  9. 2
      x11/fresh.cu
  10. 2
      x11/s3.cu
  11. 11
      x11/x11.cu
  12. 3
      x13/x13.cu
  13. 2
      x15/x14.cu
  14. 2
      x15/x15.cu
  15. 2
      x17/x17.cu

5
JHA/jackpotcoin.cu

@ -34,6 +34,8 @@ extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t st @@ -34,6 +34,8 @@ extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t st
uint32_t *d_nonces2, size_t *nrm2,
int order);
extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
static uint32_t *d_jackpotNonces[8];
static uint32_t *d_branch1Nonces[8];
@ -205,8 +207,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, @@ -205,8 +207,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
}
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
unsigned int rounds;

150
cuda_checkhash.cu

@ -1,45 +1,16 @@ @@ -1,45 +1,16 @@
/**
* This code compares final hash against target
*/
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
// Hash Target gegen das wir testen sollen
__constant__ uint32_t pTarget[8];
static uint32_t *d_resNounce[8];
static uint32_t *h_resNounce[8];
/**
 * Compare one hash per thread against the target stored in pTarget and
 * record the lowest nonce whose hash does not exceed it.
 *
 * threads        number of hashes to test; threads beyond it do nothing
 * startNounce    first nonce of the scanned range
 * g_nonceVector  optional per-thread nonce list; when NULL the nonce is
 *                startNounce + thread
 * g_hash         hash buffer, 16 uint32_t per nonce, indexed by
 *                (nonce - startNounce)
 * resNounce      resNounce[0] receives the smallest matching nonce; the
 *                caller is expected to preset it to 0xffffffff
 */
__global__
void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// determine the nonce handled by this thread
		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

		// shl 4 = 16 uint32_t per hash slot
		uint32_t hashPosition = (nounce - startNounce) << 4;
		const uint32_t *inpHash = &g_hash[hashPosition];

		// Words are compared from index 7 down to 0. A strictly smaller word
		// settles the comparison; an equal word must fall through to the
		// next one (the previous "<=" stopped after word 7 in every case).
		for (int i = 7; i >= 0; i--) {
			const uint32_t word = inpHash[i];
			if (word > pTarget[i])
				return;
			if (word < pTarget[i])
				break;
		}

		// atomicMin keeps the lowest nonce without the read/compare/write
		// race between threads that match in the same launch.
		atomicMin(&resNounce[0], nounce);
	}
}
// Setup-Funktionen
__host__
void cuda_check_cpu_init(int thr_id, int threads)
{
@ -54,71 +25,134 @@ void cuda_check_cpu_setTarget(const void *ptarget) @@ -54,71 +25,134 @@ void cuda_check_cpu_setTarget(const void *ptarget)
CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
}
/* --------------------------------------------------------------------------------------------- */
/**
 * Tell whether an 8-word hash is less than or equal to an 8-word target.
 *
 * Words are examined from index 7 down to index 0; the first pair that
 * differs decides the outcome. If every word is equal the hash is accepted.
 *
 * hash    8 uint32_t words of the hash to test
 * target  8 uint32_t words of the target
 * returns true when hash <= target, false otherwise
 */
__device__ __forceinline__
static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target)
{
	// words 7..1: the first difference decides
	for (int word = 7; word > 0; word--) {
		if (hash[word] != target[word])
			return hash[word] < target[word];
	}
	// all upper words are equal: word 0 may be equal or smaller
	return hash[0] <= target[0];
}
/**
 * Test one hash per thread against the target in pTarget and report the
 * lowest matching nonce.
 *
 * threads      number of hashes to test; threads beyond it do nothing
 * startNounce  nonce belonging to thread 0; thread i tests startNounce + i
 * hash         hash buffer, 16 uint32_t (64 bytes) per thread
 * resNounce    resNounce[0] receives the smallest matching nonce; the
 *              caller is expected to preset it to 0xffffffff
 *
 * Compiled with __launch_bounds__(512, 4), so launch with at most 512
 * threads per block.
 */
__global__ __launch_bounds__(512, 4)
void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNounce)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// shl 4 = *16 x 4 (uint32) = 64 bytes; widen first so the shift
		// cannot overflow int for very large launches
		uint32_t *inpHash = &hash[(size_t)thread << 4];

		if (hashbelowtarget(inpHash, pTarget)) {
			// Several threads may match in one launch. atomicMin makes the
			// reported nonce the lowest one rather than whichever store
			// happened to land last.
			atomicMin(&resNounce[0], startNounce + thread);
		}
	}
}
__host__
uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash)
{
uint32_t result = 0xffffffff;
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
const int threadsperblock = 256;
const int threadsperblock = 512;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
cuda_check_gpu_hash_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, order, thr_id);
cudaThreadSynchronize();
// Ergebnis zum Host kopieren (in page locked memory, damits schneller geht)
cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
// cudaMemcpy() ist asynchron!
cudaThreadSynchronize();
result = *h_resNounce[thr_id];
return result;
}
/* --------------------------------------------------------------------------------------------- */
__global__
void cuda_check_gpu_hash_fast(int threads, uint32_t startNounce, uint32_t *hashEnd, uint32_t *resNounce)
void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
/* only test the last 2 dwords, ok for most algos */
int hashPos = thread << 4;
uint32_t *inpHash = &hashEnd[hashPos];
uint32_t nounce = g_nonceVector[thread];
uint32_t hashPosition = (nounce - startNounce) << 4;
uint32_t *inpHash = &g_hash[hashPosition];
//uint32_t hash[8];
if (inpHash[7] <= pTarget[7] && inpHash[6] <= pTarget[6]) {
uint32_t nounce = (startNounce + thread);
if (resNounce[0] > nounce)
resNounce[0] = nounce;
//#pragma unroll 8
//for (int i=0; i < 8; i++)
// hash[i] = inpHash[i];
for (int i = 7; i >= 0; i--) {
if (inpHash[i] > pTarget[i]) {
return;
}
if (inpHash[i] < pTarget[i]) {
break;
}
}
if (resNounce[0] > nounce)
resNounce[0] = nounce;
}
}
__host__
uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{
uint32_t result = 0xffffffff;
cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
const int threadsperblock = 256;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
cuda_check_gpu_hash_fast <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
// MyStreamSynchronize(NULL, order, thr_id);
cudaThreadSynchronize();
MyStreamSynchronize(NULL, order, thr_id);
cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
// cudaMemcpy() was asynchron ?
// cudaThreadSynchronize();
cudaThreadSynchronize();
result = *h_resNounce[thr_id];
return result;
}
}

3
cuda_helper.h

@ -19,8 +19,7 @@ extern "C" long device_sm[8]; @@ -19,8 +19,7 @@ extern "C" long device_sm[8];
// common functions
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order);
extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash);
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);

4
cuda_nist5.cu

@ -11,7 +11,6 @@ extern "C" @@ -11,7 +11,6 @@ extern "C"
#include "cuda_helper.h"
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
@ -113,8 +112,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, @@ -113,8 +112,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

6
quark/animecoin.cu

@ -48,7 +48,9 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, @@ -48,7 +48,9 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads,
uint32_t *d_nonces1, size_t *nrm1,
int order);
// Original Quarkhash Funktion aus einem miner Quelltext
extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
/* CPU Hash */
extern "C" void animehash(void *state, const void *input)
{
sph_blake512_context ctx_blake;
@ -255,7 +257,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, @@ -255,7 +257,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

4
quark/quarkcoin.cu

@ -50,6 +50,8 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, @@ -50,6 +50,8 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads,
uint32_t *d_nonces1, size_t *nrm1,
int order);
extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
// Original Quarkhash Funktion aus einem miner Quelltext
extern "C" void quarkhash(void *state, const void *input)
{
@ -227,7 +229,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, @@ -227,7 +229,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
const uint32_t Htarg = ptarget[7];

3
qubit/deep.cu

@ -93,7 +93,8 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, @@ -93,7 +93,8 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

2
qubit/qubit.cu

@ -120,7 +120,7 @@ extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata, @@ -120,7 +120,7 @@ extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata,
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

2
x11/fresh.cu

@ -123,7 +123,7 @@ extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata, @@ -123,7 +123,7 @@ extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
print_hash((unsigned char*)buf); printf("\n");
#endif
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

2
x11/s3.cu

@ -99,7 +99,7 @@ extern "C" int scanhash_s3(int thr_id, uint32_t *pdata, @@ -99,7 +99,7 @@ extern "C" int scanhash_s3(int thr_id, uint32_t *pdata,
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{

11
x11/x11.cu

@ -58,10 +58,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads); @@ -58,10 +58,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
// to check... new sp method
//extern void x11_echo512_cpu_setTarget(const void *ptarget);
//extern uint32_t x11_echo512_cpu_hash_64_final(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
// X11 CPU Hash
extern "C" void x11hash(void *output, const void *input)
{
@ -172,7 +168,6 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, @@ -172,7 +168,6 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
quark_blake512_cpu_setBlock_80((void*)endiandata);
//x11_echo512_cpu_setTarget(ptarget);
cuda_check_cpu_setTarget(ptarget);
do {
@ -193,11 +188,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, @@ -193,11 +188,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// todo...
//foundNonce = x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
//foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

3
x13/x13.cu

@ -208,8 +208,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, @@ -208,8 +208,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
const uint32_t Htarg = ptarget[7];

2
x15/x14.cu

@ -215,7 +215,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, @@ -215,7 +215,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
/* check now with the CPU to confirm */

2
x15/x15.cu

@ -229,7 +229,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, @@ -229,7 +229,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
/* Scan with GPU */
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{

2
x17/x17.cu

@ -253,7 +253,7 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata, @@ -253,7 +253,7 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];

Loading…
Cancel
Save