From 03c3b7d341d4b5e63c59650427a47548ad7659c9 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 10 May 2015 18:14:06 +0200 Subject: [PATCH] Various algos cleanup + lyra2 sec nonce fix --- Algo256/cuda_fugue256.cu | 19 ++++++++----------- Algo256/cuda_groestl256.cu | 7 +++++-- Algo256/keccak256.cu | 18 ++++++++---------- cuda_groestlcoin.cu | 22 +++++++++++----------- cuda_myriadgroestl.cu | 15 ++++++++------- fuguecoin.cpp | 9 ++++++--- groestlcoin.cpp | 21 ++++++++++----------- lyra2/lyra2RE.cu | 10 ++++------ myriadgroestl.cpp | 26 +++++++++++--------------- 9 files changed, 71 insertions(+), 76 deletions(-) diff --git a/Algo256/cuda_fugue256.cu b/Algo256/cuda_fugue256.cu index bb72752..7bcbfcf 100644 --- a/Algo256/cuda_fugue256.cu +++ b/Algo256/cuda_fugue256.cu @@ -9,7 +9,7 @@ #define USE_SHARED 1 uint32_t *d_fugue256_hashoutput[MAX_GPUS]; -uint32_t *d_resultNonce[MAX_GPUS]; +static uint32_t *d_resultNonce[MAX_GPUS]; __constant__ uint32_t GPUstate[30]; // Single GPU __constant__ uint32_t pTarget[8]; // Single GPU @@ -718,10 +718,9 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } +__host__ void fugue256_cpu_init(int thr_id, uint32_t threads) { - cudaSetDevice(device_map[thr_id]); - // Kopiere die Hash-Tabellen in den GPU-Speicher texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); @@ -733,25 +732,23 @@ void fugue256_cpu_init(int thr_id, uint32_t threads) cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); } -__host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) +__host__ +void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) { // CPU-Vorbereitungen treffen sph_fugue256_context ctx_fugue_const; sph_fugue256_init(&ctx_fugue_const); sph_fugue256 (&ctx_fugue_const, data, 80); // State speichern - cudaMemcpyToSymbol( GPUstate, - ctx_fugue_const.S, - sizeof(uint32_t) * 30 ); + cudaMemcpyToSymbol(GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30); - cudaMemcpyToSymbol( pTarget, - pTargetIn, - sizeof(uint32_t) * 8 ); + cudaMemcpyToSymbol(pTarget, pTargetIn, sizeof(uint32_t) * 8); cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); } -__host__ void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce) +__host__ +void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce) { #if USE_SHARED const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu index 865767e..1e8bf61 100644 --- a/Algo256/cuda_groestl256.cu +++ b/Algo256/cuda_groestl256.cu @@ -280,8 +280,8 @@ void groestl256_cpu_init(int thr_id, uint32_t threads) __host__ uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) { - uint32_t result = 0xffffffff; - cudaMemset(d_GNonces[thr_id], 0xff, sizeof(uint32_t)); + uint32_t result = UINT32_MAX; + cudaMemset(d_GNonces[thr_id], 0xff, 2*sizeof(uint32_t)); const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen @@ -308,7 +308,10 @@ __host__ uint32_t groestl256_getSecNonce(int thr_id, int num) { uint32_t results[2]; + memset(results, 0xFF, sizeof(results)); cudaMemcpy(results, d_GNonces[thr_id], sizeof(results), cudaMemcpyDeviceToHost); + if (results[1] == results[0]) + return UINT32_MAX; return results[num]; } diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu index 17c57c9..ecb181e 100644 --- a/Algo256/keccak256.cu +++ b/Algo256/keccak256.cu @@ -23,10 +23,9 @@ extern uint32_t keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t sta // CPU Hash extern "C" void keccak256_hash(void *state, const void *input) { + uint32_t _ALIGN(64) hash[16]; sph_keccak_context ctx_keccak; - uint32_t hash[16]; - sph_keccak256_init(&ctx_keccak); sph_keccak256 (&ctx_keccak, input, 80); sph_keccak256_close(&ctx_keccak, (void*) hash); @@ -50,8 +49,8 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata, if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); - keccak256_cpu_init(thr_id, (int) throughput); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64)); + keccak256_cpu_init(thr_id, throughput); init[thr_id] = true; } @@ -65,16 +64,16 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata, do { int order = 0; - uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++); + *hashes_done = pdata[19] - first_nonce + throughput; + + uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); if (foundNonce != UINT32_MAX) { - uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; + uint32_t _ALIGN(64) vhash64[8]; be32enc(&endiandata[19], foundNonce); keccak256_hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { - *hashes_done = foundNonce - first_nonce + 1; + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { pdata[19] = foundNonce; return 1; } @@ -91,6 +90,5 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata, } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index e580f55..6d57fe8 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -8,10 +8,10 @@ // globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU -extern uint32_t *d_resultNonce[MAX_GPUS]; - __constant__ uint32_t groestlcoin_gpu_msg[32]; +static uint32_t *d_resultNonce[MAX_GPUS]; + #if __CUDA_ARCH__ >= 300 // 64 Registers Variant for Compute 3.0+ #include "quark/groestl_functions_quad.h" @@ -30,7 +30,8 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t { // GROESTL uint32_t paddedInput[8]; -#pragma unroll 8 + + #pragma unroll 8 for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4]; uint32_t nounce = startNounce + thread; @@ -68,7 +69,7 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t int i, position = -1; bool rc = true; - #pragma unroll 8 + #pragma unroll 8 for (i = 7; i >= 0; i--) { if (out_state[i] > pTarget[i]) { if(position < i) { @@ -92,16 +93,14 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t #endif } -// Setup-Funktionen -__host__ void groestlcoin_cpu_init(int thr_id, uint32_t threads) +__host__ +void groestlcoin_cpu_init(int thr_id, uint32_t threads) { - cudaSetDevice(device_map[thr_id]); - - // Speicher für Gewinner-Nonce belegen cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); } -__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) +__host__ +void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) { // Nachricht expandieren und setzen uint32_t msgBlock[32]; @@ -128,7 +127,8 @@ __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) sizeof(uint32_t) * 8 ); } -__host__ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) +__host__ +void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) { uint32_t threadsperblock = 256; diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu index 0c9acd7..28ea94c 100644 --- a/cuda_myriadgroestl.cu +++ b/cuda_myriadgroestl.cu @@ -14,7 +14,7 @@ // globaler Speicher für alle HeftyHashes aller Threads __constant__ uint32_t pTarget[8]; // Single GPU uint32_t *d_outputHashes[MAX_GPUS]; -extern uint32_t *d_resultNonce[MAX_GPUS]; +static uint32_t *d_resultNonce[MAX_GPUS]; __constant__ uint32_t myriadgroestl_gpu_msg[32]; @@ -299,11 +299,10 @@ __global__ void #endif } -// Setup-Funktionen -__host__ void myriadgroestl_cpu_init(int thr_id, uint32_t threads) +// Setup Function +__host__ +void myriadgroestl_cpu_init(int thr_id, uint32_t threads) { - cudaSetDevice(device_map[thr_id]); - cudaMemcpyToSymbol( myr_sha256_gpu_hashTable, myr_sha256_cpu_hashTable, sizeof(uint32_t) * 8 ); @@ -328,7 +327,8 @@ __host__ void myriadgroestl_cpu_init(int thr_id, uint32_t threads) cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads); } -__host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn) +__host__ +void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn) { // Nachricht expandieren und setzen uint32_t msgBlock[32]; @@ -355,7 +355,8 @@ __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn sizeof(uint32_t) * 8 ); } -__host__ void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) +__host__ +void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) { uint32_t threadsperblock = 256; diff --git a/fuguecoin.cpp b/fuguecoin.cpp index fdd28c7..d2f8dd9 100644 --- a/fuguecoin.cpp +++ b/fuguecoin.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "uint256.h" #include "sph/sph_fugue.h" @@ -22,7 +23,7 @@ extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, static bool init[MAX_GPUS] = { 0 }; -extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, +int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t start_nonce = pdata[19]++; @@ -36,6 +37,8 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt // init if(!init[thr_id]) { + cudaSetDevice(device_map[thr_id]); + fugue256_cpu_init(thr_id, throughput); init[thr_id] = true; } @@ -50,10 +53,10 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt do { // GPU - uint32_t foundNounce = 0xFFFFFFFF; + uint32_t foundNounce = UINT32_MAX; fugue256_cpu_hash(thr_id, throughput, pdata[19], NULL, &foundNounce); - if(foundNounce < 0xffffffff) + if (foundNounce < UINT32_MAX) { uint32_t hash[8]; const uint32_t Htarg = ptarget[7]; diff --git a/groestlcoin.cpp b/groestlcoin.cpp index db74549..b501284 100644 --- a/groestlcoin.cpp +++ b/groestlcoin.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include "uint256.h" @@ -36,11 +37,11 @@ static bool init[MAX_GPUS] = { 0 }; extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { - uint32_t start_nonce = pdata[19]++; + uint32_t start_nonce = pdata[19]; uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8 throughput = min(throughput, max_nonce - start_nonce); - uint32_t *outputHash = (uint32_t*)malloc(throughput * 16 * sizeof(uint32_t)); + uint32_t *outputHash = (uint32_t*)malloc(throughput * 64); if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x000000ff; @@ -48,6 +49,7 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t // init if(!init[thr_id]) { + cudaSetDevice(device_map[thr_id]); groestlcoin_cpu_init(thr_id, throughput); init[thr_id] = true; } @@ -62,27 +64,25 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t do { // GPU - uint32_t foundNounce = 0xFFFFFFFF; - const uint32_t Htarg = ptarget[7]; + uint32_t foundNounce = UINT32_MAX; + + *hashes_done = pdata[19] - start_nonce + throughput; groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce); - if(foundNounce < 0xffffffff) + if(foundNounce < UINT32_MAX) { - uint32_t tmpHash[8]; + uint32_t _ALIGN(64) tmpHash[8]; endiandata[19] = SWAP32(foundNounce); groestlhash(tmpHash, endiandata); - if (tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) { + if (tmpHash[7] <= ptarget[7] && fulltest(tmpHash, ptarget)) { pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce + 1; free(outputHash); return true; } else { applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNounce); } - - foundNounce = 0xffffffff; } if (pdata[19] + throughput < pdata[19]) @@ -91,7 +91,6 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - *hashes_done = pdata[19] - start_nonce + 1; free(outputHash); return 0; } diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index bbc5131..e5fe1fe 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -79,7 +79,7 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata, skein256_cpu_init(thr_id, throughput); groestl256_cpu_init(thr_id, throughput); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64)); init[thr_id] = true; } @@ -95,23 +95,22 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata, int order = 0; uint32_t foundNonce; + *hashes_done = pdata[19] - first_nonce + throughput; + blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - *hashes_done = pdata[19] - first_nonce + throughput; - foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); if (foundNonce != UINT32_MAX) { uint32_t _ALIGN(64) vhash64[8]; - const uint32_t Htarg = ptarget[7]; be32enc(&endiandata[19], foundNonce); lyra2_hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { int res = 1; uint32_t secNonce = groestl256_getSecNonce(thr_id, 1); if (secNonce != UINT32_MAX) @@ -136,6 +135,5 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata, } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; return 0; } diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp index 41b3d83..a3e0340 100644 --- a/myriadgroestl.cpp +++ b/myriadgroestl.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include "uint256.h" @@ -41,7 +42,7 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar uint32_t throughput = device_intensity(thr_id, __func__, 1 << 17); throughput = min(throughput, max_nonce - start_nonce); - uint32_t *outputHash = (uint32_t*)malloc(throughput * 16 * sizeof(uint32_t)); + uint32_t *outputHash = (uint32_t*)malloc(throughput * 64); if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0000ff; @@ -49,14 +50,13 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar // init if(!init[thr_id]) { -#if BIG_DEBUG -#else + cudaSetDevice(device_map[thr_id]); + myriadgroestl_cpu_init(thr_id, throughput); -#endif init[thr_id] = true; } - uint32_t endiandata[32]; + uint32_t _ALIGN(64) endiandata[32]; for (int kk=0; kk < 32; kk++) be32enc(&endiandata[kk], pdata[kk]); @@ -66,26 +66,23 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar do { // GPU uint32_t foundNounce = UINT32_MAX; - const uint32_t Htarg = ptarget[7]; + + *hashes_done = pdata[19] - start_nonce + throughput; myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce); if (foundNounce < UINT32_MAX) { - uint32_t tmpHash[8]; + uint32_t _ALIGN(64) tmpHash[8]; endiandata[19] = SWAP32(foundNounce); myriadhash(tmpHash, endiandata); - if (tmpHash[7] <= Htarg && - fulltest(tmpHash, ptarget)) { - pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce + 1; - free(outputHash); + if (tmpHash[7] <= ptarget[7] && fulltest(tmpHash, ptarget)) { + pdata[19] = foundNounce; + free(outputHash); return true; } else { applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNounce); } - - foundNounce = 0xffffffff; } if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) { @@ -96,7 +93,6 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - start_nonce + 1; free(outputHash); return 0; }