diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index 4e726af..d66751a 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -34,6 +34,8 @@ extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t st uint32_t *d_nonces2, size_t *nrm2, int order); +extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); + // Speicher zur Generierung der Noncevektoren für die bedingten Hashes static uint32_t *d_jackpotNonces[8]; static uint32_t *d_branch1Nonces[8]; @@ -205,8 +207,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); } - // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { unsigned int rounds; diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index e20f81d..aeafcd8 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -1,45 +1,16 @@ +/** + * This code compares final hash against target + */ #include #include #include "cuda_helper.h" -// Hash Target gegen das wir testen sollen __constant__ uint32_t pTarget[8]; static uint32_t *d_resNounce[8]; static uint32_t *h_resNounce[8]; -__global__ -void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) -{ - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // bestimme den aktuellen Zähler - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - uint32_t hashPosition = (nounce - startNounce) << 4; - uint32_t *inpHash = &g_hash[hashPosition]; - uint32_t hash[8]; - - #pragma unroll 8 - for (int i=0; i < 8; i++) - hash[i] = inpHash[i]; - - for (int i = 7; i >= 0; i--) { - if (hash[i] > pTarget[i]) { - return; - } - if (hash[i] <= pTarget[i]) { - break; - } - } - if (resNounce[0] > nounce) - resNounce[0] = nounce; - } -} - -// Setup-Funktionen __host__ void cuda_check_cpu_init(int thr_id, int threads) { @@ -54,71 +25,134 @@ void cuda_check_cpu_setTarget(const void *ptarget) CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); } +/* --------------------------------------------------------------------------------------------- */ + +__device__ __forceinline__ +static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target) +{ + if (hash[7] > target[7]) + return false; + if (hash[7] < target[7]) + return true; + if (hash[6] > target[6]) + return false; + if (hash[6] < target[6]) + return true; + + if (hash[5] > target[5]) + return false; + if (hash[5] < target[5]) + return true; + if (hash[4] > target[4]) + return false; + if (hash[4] < target[4]) + return true; + + if (hash[3] > target[3]) + return false; + if (hash[3] < target[3]) + return true; + if (hash[2] > target[2]) + return false; + if (hash[2] < target[2]) + return true; + + if (hash[1] > target[1]) + return false; + if (hash[1] < target[1]) + return true; + if (hash[0] > target[0]) + return false; + + return true; +} + +__global__ __launch_bounds__(512, 4) +void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNounce) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + // shl 4 = *16 x 4 (uint32) = 64 bytes + uint32_t *inpHash = &hash[thread << 4]; + + if (hashbelowtarget(inpHash, pTarget)) { + uint32_t nounce = (startNounce + thread); + resNounce[0] = nounce; + } + } +} + __host__ -uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) +uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash) { uint32_t result = 0xffffffff; cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); - const int threadsperblock = 256; + const int threadsperblock = 512; - dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cuda_check_gpu_hash_64 <<>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); + cuda_checkhash_64 <<>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]); - // Strategisches Sleep Kommando zur Senkung der CPU Last - MyStreamSynchronize(NULL, order, thr_id); + cudaThreadSynchronize(); - // Ergebnis zum Host kopieren (in page locked memory, damits schneller geht) cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - - // cudaMemcpy() ist asynchron! - cudaThreadSynchronize(); result = *h_resNounce[thr_id]; return result; } +/* --------------------------------------------------------------------------------------------- */ + __global__ -void cuda_check_gpu_hash_fast(int threads, uint32_t startNounce, uint32_t *hashEnd, uint32_t *resNounce) +void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - /* only test the last 2 dwords, ok for most algos */ - int hashPos = thread << 4; - uint32_t *inpHash = &hashEnd[hashPos]; + uint32_t nounce = g_nonceVector[thread]; + uint32_t hashPosition = (nounce - startNounce) << 4; + uint32_t *inpHash = &g_hash[hashPosition]; + //uint32_t hash[8]; - if (inpHash[7] <= pTarget[7] && inpHash[6] <= pTarget[6]) { - uint32_t nounce = (startNounce + thread); - if (resNounce[0] > nounce) - resNounce[0] = nounce; + //#pragma unroll 8 + //for (int i=0; i < 8; i++) + // hash[i] = inpHash[i]; + + for (int i = 7; i >= 0; i--) { + if (inpHash[i] > pTarget[i]) { + return; + } + if (inpHash[i] < pTarget[i]) { + break; + } } + if (resNounce[0] > nounce) + resNounce[0] = nounce; } } __host__ -uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order) +uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) { uint32_t result = 0xffffffff; cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); const int threadsperblock = 256; - dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - cuda_check_gpu_hash_fast <<>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]); + cuda_check_hash_branch_64 <<>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); - // MyStreamSynchronize(NULL, order, thr_id); - cudaThreadSynchronize(); + MyStreamSynchronize(NULL, order, thr_id); cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - // cudaMemcpy() was asynchron ? - // cudaThreadSynchronize(); + cudaThreadSynchronize(); result = *h_resNounce[thr_id]; return result; -} +} \ No newline at end of file diff --git a/cuda_helper.h b/cuda_helper.h index f22100e..255ce45 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -19,8 +19,7 @@ extern "C" long device_sm[8]; // common functions extern void cuda_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_setTarget(const void *ptarget); -extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); -extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order); +extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); diff --git a/cuda_nist5.cu b/cuda_nist5.cu index d0c85ad..ad4a21f 100644 --- a/cuda_nist5.cu +++ b/cuda_nist5.cu @@ -11,7 +11,6 @@ extern "C" #include "cuda_helper.h" -// Speicher für Input/Output der verketteten Hashfunktionen static uint32_t *d_hash[8]; extern void quark_blake512_cpu_init(int thr_id, int threads); @@ -113,8 +112,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != 0xffffffff) { uint32_t vhash64[8]; diff --git a/quark/animecoin.cu b/quark/animecoin.cu index e526f4c..e3205ad 100644 --- a/quark/animecoin.cu +++ b/quark/animecoin.cu @@ -48,7 +48,9 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t *d_nonces1, size_t *nrm1, int order); -// Original Quarkhash Funktion aus einem miner Quelltext +extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); + +/* CPU Hash */ extern "C" void animehash(void *state, const void *input) { sph_blake512_context ctx_blake; @@ -255,7 +257,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { uint32_t vhash64[8]; diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index 337d14e..f52337f 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -50,6 +50,8 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t *d_nonces1, size_t *nrm1, int order); +extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); + // Original Quarkhash Funktion aus einem miner Quelltext extern "C" void quarkhash(void *state, const void *input) { @@ -227,7 +229,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); // Scan nach Gewinner Hashes auf der GPU - uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { const uint32_t Htarg = ptarget[7]; diff --git a/qubit/deep.cu b/qubit/deep.cu index e99818a..407ce25 100644 --- a/qubit/deep.cu +++ b/qubit/deep.cu @@ -93,7 +93,8 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != 0xffffffff) { uint32_t vhash64[8]; diff --git a/qubit/qubit.cu b/qubit/qubit.cu index 205e6f2..92375bf 100644 --- a/qubit/qubit.cu +++ b/qubit/qubit.cu @@ -120,7 +120,7 @@ extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata, x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != 0xffffffff) { uint32_t vhash64[8]; diff --git a/x11/fresh.cu b/x11/fresh.cu index a6cbb8c..40c2455 100644 --- a/x11/fresh.cu +++ b/x11/fresh.cu @@ -123,7 +123,7 @@ extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata, print_hash((unsigned char*)buf); printf("\n"); #endif - foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != 0xffffffff) { uint32_t vhash64[8]; diff --git a/x11/s3.cu b/x11/s3.cu index ce608bf..4997f2d 100644 --- a/x11/s3.cu +++ b/x11/s3.cu @@ -99,7 +99,7 @@ extern "C" int scanhash_s3(int thr_id, uint32_t *pdata, x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != 0xffffffff) { diff --git a/x11/x11.cu b/x11/x11.cu index 68b71e4..6c77c34 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -58,10 +58,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order); -// to check... new sp method -//extern void x11_echo512_cpu_setTarget(const void *ptarget); -//extern uint32_t x11_echo512_cpu_hash_64_final(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - // X11 CPU Hash extern "C" void x11hash(void *output, const void *input) { @@ -172,7 +168,6 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, quark_blake512_cpu_setBlock_80((void*)endiandata); - //x11_echo512_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget); do { @@ -193,11 +188,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - // todo... - //foundNonce = x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - //foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != 0xffffffff) { uint32_t vhash64[8]; diff --git a/x13/x13.cu b/x13/x13.cu index 5a73bd1..fb204f1 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -208,8 +208,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - // Scan nach Gewinner Hashes auf der GPU - foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != 0xffffffff) { const uint32_t Htarg = ptarget[7]; diff --git a/x15/x14.cu b/x15/x14.cu index ec50c42..39c6004 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -215,7 +215,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != 0xffffffff) { /* check now with the CPU to confirm */ diff --git a/x15/x15.cu b/x15/x15.cu index 00ff0a5..c9bff02 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -229,7 +229,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); /* Scan with GPU */ - uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != 0xffffffff) { diff --git a/x17/x17.cu b/x17/x17.cu index 17cb73e..1809a70 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -253,7 +253,7 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata, x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != 0xffffffff) { uint32_t vhash64[8];