From 9dfa757dc7c14c64554bb5e14fb9d1673a3ac823 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 12 Oct 2015 04:49:12 +0200 Subject: [PATCH] warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case --- JHA/jackpotcoin.cu | 132 ++++++++++++++++++++------------------ README.txt | 2 +- api.cpp | 4 ++ bench.cpp | 56 ++++++++++++++-- ccminer.cpp | 59 ++++------------- cuda.cpp | 11 +++- lyra2/lyra2REv2.cu | 3 +- miner.h | 5 +- neoscrypt/neoscrypt.cpp | 2 + qubit/luffa.cu | 10 +-- scrypt-jane.cpp | 3 + x13/x13.cu | 10 ++- x15/cuda_x15_whirlpool.cu | 12 ++-- x15/x14.cu | 21 ++++-- x15/x15.cu | 8 +-- x17/x17.cu | 8 ++- 16 files changed, 198 insertions(+), 148 deletions(-) diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index 4e811df..6d605e4 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -10,7 +10,13 @@ extern "C" #include "miner.h" #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_hash[MAX_GPUS] = { 0 }; + +// Speicher zur Generierung der Noncevektoren für die bedingten Hashes +static uint32_t *d_jackpotNonces[MAX_GPUS] = { 0 }; +static uint32_t *d_branch1Nonces[MAX_GPUS] = { 0 }; +static uint32_t *d_branch2Nonces[MAX_GPUS] = { 0 }; +static uint32_t *d_branch3Nonces[MAX_GPUS] = { 0 }; extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads); extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen); @@ -31,60 +37,54 @@ extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads); extern void jackpot_compactTest_cpu_free(int thr_id); -extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, - uint32_t *d_nonces1, uint32_t *nrm1, - uint32_t *d_nonces2, uint32_t *nrm2, - int order); +extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, + uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order); extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); -// Speicher zur Generierung der Noncevektoren für die bedingten Hashes -static uint32_t *d_jackpotNonces[MAX_GPUS]; -static uint32_t *d_branch1Nonces[MAX_GPUS]; -static uint32_t *d_branch2Nonces[MAX_GPUS]; -static uint32_t *d_branch3Nonces[MAX_GPUS]; - // Original jackpothash Funktion aus einem miner Quelltext extern "C" unsigned int jackpothash(void *state, const void *input) { - sph_blake512_context ctx_blake; - sph_groestl512_context ctx_groestl; - sph_jh512_context ctx_jh; - sph_keccak512_context ctx_keccak; - sph_skein512_context ctx_skein; - - uint32_t hash[16]; - - sph_keccak512_init(&ctx_keccak); - sph_keccak512 (&ctx_keccak, input, 80); - sph_keccak512_close(&ctx_keccak, hash); - - unsigned int round; - for (round = 0; round < 3; round++) { - if (hash[0] & 0x01) { - sph_groestl512_init(&ctx_groestl); - sph_groestl512 (&ctx_groestl, (&hash), 64); - sph_groestl512_close(&ctx_groestl, (&hash)); - } - else { - sph_skein512_init(&ctx_skein); - sph_skein512 (&ctx_skein, (&hash), 64); - sph_skein512_close(&ctx_skein, (&hash)); - } - if (hash[0] & 0x01) { - sph_blake512_init(&ctx_blake); - sph_blake512 (&ctx_blake, (&hash), 64); - sph_blake512_close(&ctx_blake, (&hash)); - } - else { - sph_jh512_init(&ctx_jh); - sph_jh512 (&ctx_jh, (&hash), 64); - sph_jh512_close(&ctx_jh, (&hash)); - } - } - memcpy(state, hash, 32); - - return round; + uint32_t hash[16]; + unsigned int rnd; + + sph_blake512_context ctx_blake; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + + sph_keccak512_init(&ctx_keccak); + sph_keccak512 (&ctx_keccak, input, 80); + sph_keccak512_close(&ctx_keccak, hash); + + for (rnd = 0; rnd < 3; rnd++) + { + if (hash[0] & 0x01) { + sph_groestl512_init(&ctx_groestl); + sph_groestl512 (&ctx_groestl, (&hash), 64); + sph_groestl512_close(&ctx_groestl, (&hash)); + } + else { + sph_skein512_init(&ctx_skein); + sph_skein512 (&ctx_skein, (&hash), 64); + sph_skein512_close(&ctx_skein, (&hash)); + } + + if (hash[0] & 0x01) { + sph_blake512_init(&ctx_blake); + sph_blake512 (&ctx_blake, (&hash), 64); + sph_blake512_close(&ctx_blake, (&hash)); + } + else { + sph_jh512_init(&ctx_jh); + sph_jh512 (&ctx_jh, (&hash), 64); + sph_jh512_close(&ctx_jh, (&hash)); + } + } + memcpy(state, hash, 32); + + return rnd; } static bool init[MAX_GPUS] = { 0 }; @@ -106,7 +106,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc { cudaSetDevice(device_map[thr_id]); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); jackpot_keccak512_cpu_init(thr_id, throughput); jackpot_compactTest_cpu_init(thr_id, throughput); @@ -117,11 +117,11 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc cuda_check_cpu_init(thr_id, throughput); - cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2); - cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2); - cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput*2); + cudaMalloc(&d_branch1Nonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2); + cudaMalloc(&d_branch2Nonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2); + cudaMalloc(&d_branch3Nonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2); - CUDA_SAFE_CALL(cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2)); + CUDA_SAFE_CALL(cudaMalloc(&d_jackpotNonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2)); init[thr_id] = true; } @@ -211,19 +211,22 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc *hashes_done = pdata[19] - first_nonce + throughput; + CUDA_LOG_ERROR(); + uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) + if (foundNonce != UINT32_MAX) { uint32_t vhash64[8]; be32enc(&endiandata[19], foundNonce); - // diese jackpothash Funktion gibt die Zahl der Runden zurück + // jackpothash function gibt die Zahl der Runden zurück jackpothash(vhash64, endiandata); if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { int res = 1; - uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); work_set_target_ratio(work, vhash64); +#if 0 + uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); if (secNonce != 0) { be32enc(&endiandata[19], secNonce); nist5hash(vhash64, endiandata); @@ -232,6 +235,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc pdata[21] = secNonce; res++; } +#endif pdata[19] = foundNonce; return res; } else { @@ -250,6 +254,8 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc } while (!work_restart[thr_id].restart); + CUDA_LOG_ERROR(); + return 0; } @@ -259,9 +265,7 @@ extern "C" void free_jackpot(int thr_id) if (!init[thr_id]) return; - cudaSetDevice(device_map[thr_id]); - - cudaFree(d_hash[thr_id]); + cudaThreadSynchronize(); cudaFree(d_branch1Nonces[thr_id]); cudaFree(d_branch2Nonces[thr_id]); @@ -271,8 +275,12 @@ extern "C" void free_jackpot(int thr_id) quark_groestl512_cpu_free(thr_id); jackpot_compactTest_cpu_free(thr_id); + cudaFree(d_hash[thr_id]); + cuda_check_cpu_free(thr_id); - init[thr_id] = false; + CUDA_LOG_ERROR(); cudaDeviceSynchronize(); -} \ No newline at end of file + + init[thr_id] = false; +} diff --git a/README.txt b/README.txt index a2a2a1a..8eac428 100644 --- a/README.txt +++ b/README.txt @@ -231,7 +231,7 @@ features. Under Dev... v1.7 Restore whirlpool algo (and whirlcoin variant) Prepare algo switch ability - Add --benchmark -a auto to run a multi algo benchmark + Add --benchmark -a all to run a benchmark for all algos Add --cuda-schedule parameter Add --show-diff parameter, which display shares diff, and is able to detect real solved blocks on pools. diff --git a/api.cpp b/api.cpp index c9e28ba..9ad0c71 100644 --- a/api.cpp +++ b/api.cpp @@ -1002,5 +1002,9 @@ void api_set_throughput(int thr_id, uint32_t throughput) if (i && (1U << i) < throughput) { cgpu->intensity += ((float) (throughput-(1U << i)) / (1U << i)); } + + // to display in bench results + if (opt_benchmark) + bench_set_throughput(thr_id, throughput); } } diff --git a/bench.cpp b/bench.cpp index 87712bd..98330f4 100644 --- a/bench.cpp +++ b/bench.cpp @@ -21,7 +21,7 @@ static pthread_barrier_t algo_barr; static pthread_mutex_t bench_lock = PTHREAD_MUTEX_INITIALIZER; extern double thr_hashrates[MAX_GPUS]; -extern enum sha_algos opt_algo; +extern volatile enum sha_algos opt_algo; void bench_init(int threads) { @@ -41,6 +41,48 @@ void bench_free() pthread_barrier_destroy(&algo_barr); } +// required to switch algos +void algo_free_all(int thr_id) +{ + // only initialized algos will be freed + free_blake256(thr_id); + free_bmw(thr_id); + free_c11(thr_id); + free_deep(thr_id); + free_keccak256(thr_id); + free_fresh(thr_id); + free_fugue256(thr_id); + free_groestlcoin(thr_id); + free_heavy(thr_id); + free_jackpot(thr_id); + free_luffa(thr_id); + free_lyra2(thr_id); + free_lyra2v2(thr_id); + free_myriad(thr_id); + free_neoscrypt(thr_id); + free_nist5(thr_id); + free_pentablake(thr_id); + free_quark(thr_id); + free_qubit(thr_id); + free_skeincoin(thr_id); + free_skein2(thr_id); + free_s3(thr_id); + free_whirl(thr_id); + free_whirlx(thr_id); + free_x11(thr_id); + free_x13(thr_id); + free_x14(thr_id); + free_x15(thr_id); + free_x17(thr_id); + free_zr5(thr_id); + //free_sha256d(thr_id); + free_scrypt(thr_id); + free_scrypt_jane(thr_id); + + // warn on cuda error + CUDA_LOG_ERROR(); +} + // benchmark all algos (called once per mining thread) bool bench_algo_switch_next(int thr_id) { @@ -60,6 +102,11 @@ bool bench_algo_switch_next(int thr_id) if (algo == ALGO_SCRYPT) algo++; if (algo == ALGO_SCRYPT_JANE) algo++; + // free current algo memory and track mem usage + mused = cuda_available_memory(thr_id); + algo_free_all(thr_id); + mfree = cuda_available_memory(thr_id); + // we need to wait completion on all cards before the switch if (opt_n_threads > 1) { pthread_barrier_wait(&miner_barr); @@ -70,15 +117,12 @@ bool bench_algo_switch_next(int thr_id) format_hashrate(hashrate, rate); gpulog(LOG_NOTICE, thr_id, "%s hashrate = %s", algo_names[prev_algo], rate); - // free current algo memory and track mem usage - mused = cuda_available_memory(thr_id); - miner_free_device(thr_id); - mfree = cuda_available_memory(thr_id); - // check if there is memory leak if (device_mem_free[thr_id] > mfree) { gpulog(LOG_WARNING, thr_id, "memory leak detected in %s ! %d MB free", algo_names[prev_algo], mfree); + cuda_reset_device(thr_id, NULL); // force to free the leak + mfree = cuda_available_memory(thr_id); } // store used memory per algo algo_mem_used[thr_id][opt_algo] = device_mem_free[thr_id] - mused; diff --git a/ccminer.cpp b/ccminer.cpp index 7ff4d24..b0df5f3 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -108,7 +108,7 @@ int opt_timeout = 300; // curl int opt_scantime = 10; static json_t *opt_config; static const bool opt_time = true; -enum sha_algos opt_algo = ALGO_X11; +volatile enum sha_algos opt_algo = ALGO_AUTO; int opt_n_threads = 0; int gpu_threads = 1; int64_t opt_affinity = -1L; @@ -1435,49 +1435,6 @@ static bool wanna_mine(int thr_id) return state; } -// required to switch algos -void miner_free_device(int thr_id) -{ - // todo: some kind of algo "registration" - // to call automatically if needed - free_blake256(thr_id); - free_bmw(thr_id); - free_c11(thr_id); - free_deep(thr_id); - free_keccak256(thr_id); - free_fresh(thr_id); - free_fugue256(thr_id); - free_groestlcoin(thr_id); - free_heavy(thr_id); - free_jackpot(thr_id); - free_luffa(thr_id); - free_lyra2(thr_id); - free_lyra2v2(thr_id); - free_myriad(thr_id); - free_neoscrypt(thr_id); - free_nist5(thr_id); - free_pentablake(thr_id); - free_quark(thr_id); - free_qubit(thr_id); - free_skeincoin(thr_id); - free_skein2(thr_id); - free_s3(thr_id); - free_whirl(thr_id); - free_whirlx(thr_id); - free_x11(thr_id); - free_x13(thr_id); - free_x14(thr_id); - free_x15(thr_id); - free_x17(thr_id); - free_zr5(thr_id); - //free_sha256d(thr_id); - free_scrypt(thr_id); - free_scrypt_jane(thr_id); - - // reset remains of error.. - cudaGetLastError(); -} - static void *miner_thread(void *userdata) { struct thr_info *mythr = (struct thr_info *)userdata; @@ -1634,7 +1591,7 @@ static void *miner_thread(void *userdata) pthread_mutex_unlock(&g_work_lock); - // --benchmark [-a auto] + // --benchmark [-a all] if (opt_benchmark && bench_algo >= 0) { //gpulog(LOG_DEBUG, thr_id, "loop %d", loopcnt); if (loopcnt >= 3) { @@ -1756,6 +1713,7 @@ static void *miner_thread(void *userdata) break; case ALGO_KECCAK: case ALGO_JACKPOT: + case ALGO_X14: case ALGO_X15: minmax = 0x300000; break; @@ -1800,8 +1758,10 @@ static void *miner_thread(void *userdata) hashes_done = 0; gettimeofday(&tv_start, NULL); - - cudaGetLastError(); // reset previous errors + // check (and reset) previous errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess && !opt_quiet) + gpulog(LOG_WARNING, thr_id, "%s", cudaGetErrorString(err)); /* scan nonces for a proof-of-work hash */ switch (opt_algo) { @@ -1904,6 +1864,7 @@ static void *miner_thread(void *userdata) break; case ALGO_X14: rc = scanhash_x14(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_X15: rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done); break; @@ -2388,7 +2349,9 @@ void parse_arg(int key, char *arg) } if (i == ALGO_COUNT) { // some aliases... - if (!strcasecmp("flax", arg)) + if (!strcasecmp("all", arg)) + i = opt_algo = ALGO_AUTO; + else if (!strcasecmp("flax", arg)) i = opt_algo = ALGO_C11; else if (!strcasecmp("diamond", arg)) i = opt_algo = ALGO_DMD_GR; diff --git a/cuda.cpp b/cuda.cpp index 86f479d..b912cad 100644 --- a/cuda.cpp +++ b/cuda.cpp @@ -156,8 +156,7 @@ uint32_t cuda_default_throughput(int thr_id, uint32_t defcount) uint32_t throughput = gpus_intensity[thr_id] ? gpus_intensity[thr_id] : defcount; if (gpu_threads > 1 && throughput == defcount) throughput /= (gpu_threads-1); api_set_throughput(thr_id, throughput); - bench_set_throughput(thr_id, throughput); - //if (opt_debug) applog(LOG_DEBUG, "GPU %d-%d: throughput %u", dev_id, thr_id, throughput); + //gpulog(LOG_INFO, thr_id, "throughput %u", throughput); return throughput; } @@ -196,6 +195,14 @@ int cuda_available_memory(int thr_id) return (int) (mfree / (1024 * 1024)); } +// Check (and reset) last cuda error, and report it in logs +void cuda_log_lasterror(int thr_id, const char* func, int line) +{ + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess && !opt_quiet) + gpulog(LOG_WARNING, thr_id, "%s:%d %s", func, line, cudaGetErrorString(err)); +} + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu index 98a0291..ed601f5 100644 --- a/lyra2/lyra2REv2.cu +++ b/lyra2/lyra2REv2.cu @@ -147,6 +147,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc { int res = 1; work_set_target_ratio(work, vhash64); + pdata[19] = foundNonces[0]; // check if there was another one... if (foundNonces[1] != 0) { @@ -155,9 +156,9 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio) work_set_target_ratio(work, vhash64); pdata[21] = foundNonces[1]; + //xchg(pdata[19], pdata[21]); res++; } - pdata[19] = foundNonces[0]; MyStreamSynchronize(NULL, 0, device_map[thr_id]); return res; } diff --git a/miner.h b/miner.h index f65a9c3..3096500 100644 --- a/miner.h +++ b/miner.h @@ -493,6 +493,9 @@ int cuda_available_memory(int thr_id); uint32_t cuda_default_throughput(int thr_id, uint32_t defcount); #define device_intensity(t,f,d) cuda_default_throughput(t,d) +void cuda_log_lasterror(int thr_id, const char* func, int line); +#define CUDA_LOG_ERROR() cuda_log_lasterror(thr_id, __func__, __LINE__) + #define CL_N "\x1B[0m" #define CL_RED "\x1B[31m" #define CL_GRN "\x1B[32m" @@ -553,7 +556,7 @@ void bench_free(); bool bench_algo_switch_next(int thr_id); void bench_set_throughput(int thr_id, uint32_t throughput); void bench_display_results(); - +void algo_free_all(int thr_id); struct stratum_job { char *job_id; diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp index 43681d9..c111cfc 100644 --- a/neoscrypt/neoscrypt.cpp +++ b/neoscrypt/neoscrypt.cpp @@ -20,6 +20,8 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign int intensity = is_windows() ? 18 : 19; uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); throughput = throughput / 32; /* set for max intensity ~= 20 */ + api_set_throughput(thr_id, throughput); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce + 1); if (opt_benchmark) diff --git a/qubit/luffa.cu b/qubit/luffa.cu index 5d278d6..2ea6b20 100644 --- a/qubit/luffa.cu +++ b/qubit/luffa.cu @@ -48,7 +48,7 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, if (opt_cudaschedule == -1) // to reduce cpu usage... cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); qubit_luffa512_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput); @@ -103,12 +103,12 @@ extern "C" void free_luffa(int thr_id) if (!init[thr_id]) return; - cudaSetDevice(device_map[thr_id]); + cudaDeviceSynchronize(); cudaFree(d_hash[thr_id]); cuda_check_cpu_free(thr_id); - init[thr_id] = false; - cudaDeviceSynchronize(); -} \ No newline at end of file + cudaThreadSynchronize(); + init[thr_id] = false; +} diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp index 4620fdf..f49e950 100644 --- a/scrypt-jane.cpp +++ b/scrypt-jane.cpp @@ -395,6 +395,9 @@ unsigned char GetNfactor(unsigned int nTimestamp) } else if (!strcmp(jane_params, "RAD") || !strcasecmp(jane_params, "RadioactiveCoin")) { // InternetCoin:1389196388, minN: 4, maxN: 30 Ntimestamp = 1389196388; minN= 4; maxN= 30; + } else if (!strcmp(jane_params, "LEO") || !strcasecmp(jane_params, "LEOCoin")) { + // LEOCoin:1402845776, minN: 4, maxN: 30 + Ntimestamp = 1402845776; minN= 4; maxN= 30; } else { if (sscanf(jane_params, "%u,%u,%u", &Ntimestamp, &minN, &maxN) != 3) if (sscanf(jane_params, "%u", &Nfactor) == 1) return Nfactor; // skip bounding against minN, maxN diff --git a/x13/x13.cu b/x13/x13.cu index e7fc923..cfdc3fb 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -208,6 +208,8 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u *hashes_done = pdata[19] - first_nonce + throughput; + CUDA_LOG_ERROR(); + foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != UINT32_MAX) { @@ -245,6 +247,9 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u } while (!work_restart[thr_id].restart); *hashes_done = pdata[19] - first_nonce + 1; + + CUDA_LOG_ERROR(); + return 0; } @@ -254,7 +259,7 @@ extern "C" void free_x13(int thr_id) if (!init[thr_id]) return; - cudaSetDevice(device_map[thr_id]); + cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); @@ -263,7 +268,8 @@ extern "C" void free_x13(int thr_id) x13_fugue512_cpu_free(thr_id); cuda_check_cpu_free(thr_id); - init[thr_id] = false; + CUDA_LOG_ERROR(); cudaDeviceSynchronize(); + init[thr_id] = false; } \ No newline at end of file diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu index 49f6867..810f51d 100644 --- a/x15/cuda_x15_whirlpool.cu +++ b/x15/cuda_x15_whirlpool.cu @@ -14,8 +14,8 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint32_t pTarget[8]; -static uint32_t *d_wnounce[MAX_GPUS]; -static uint32_t *d_WNonce[MAX_GPUS]; +static uint32_t *h_wnounce[MAX_GPUS] = { 0 }; +static uint32_t *d_WNonce[MAX_GPUS] = { 0 }; #define USE_ALL_TABLES 1 @@ -2575,14 +2575,14 @@ extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode) } cudaMalloc(&d_WNonce[thr_id], sizeof(uint32_t)); - cudaMallocHost(&d_wnounce[thr_id], sizeof(uint32_t)); + cudaMallocHost(&h_wnounce[thr_id], sizeof(uint32_t)); } __host__ extern void x15_whirlpool_cpu_free(int thr_id) { cudaFree(d_WNonce[thr_id]); - cudaFreeHost(d_wnounce[thr_id]); + cudaFreeHost(h_wnounce[thr_id]); } __host__ @@ -2613,9 +2613,9 @@ extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint oldwhirlpool_gpu_finalhash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector,d_WNonce[thr_id]); MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(d_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(h_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - result = *d_wnounce[thr_id]; + result = *h_wnounce[thr_id]; return result; } diff --git a/x15/x14.cu b/x15/x14.cu index 817ec43..559d03f 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -27,7 +27,7 @@ extern "C" { #include "cuda_helper.h" // Memory for the hash functions -static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_hash[MAX_GPUS] = { 0 }; extern void quark_blake512_cpu_init(int thr_id, uint32_t threads); extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata); @@ -190,7 +190,9 @@ extern "C" int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, cuda_check_cpu_init(thr_id, throughput); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput); + + CUDA_LOG_ERROR(); init[thr_id] = true; } @@ -217,9 +219,12 @@ extern "C" int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + CUDA_LOG_ERROR(); + *hashes_done = pdata[19] - first_nonce + throughput; uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (foundNonce != UINT32_MAX) { const uint32_t Htarg = ptarget[7]; @@ -250,6 +255,8 @@ extern "C" int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + CUDA_LOG_ERROR(); + *hashes_done = pdata[19] - first_nonce + 1; return 0; } @@ -260,17 +267,17 @@ extern "C" void free_x14(int thr_id) if (!init[thr_id]) return; - cudaSetDevice(device_map[thr_id]); - cudaDeviceSynchronize(); - - cudaFree(d_hash[thr_id]); + cudaThreadSynchronize(); quark_groestl512_cpu_free(thr_id); x11_simd512_cpu_free(thr_id); x13_fugue512_cpu_free(thr_id); + cudaFree(d_hash[thr_id]); + d_hash[thr_id] = NULL; + cuda_check_cpu_free(thr_id); - init[thr_id] = false; cudaDeviceSynchronize(); + init[thr_id] = false; } diff --git a/x15/x15.cu b/x15/x15.cu index a7c224a..ba72208 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -28,7 +28,7 @@ extern "C" { #include "cuda_helper.h" // Memory for the hash functions -static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_hash[MAX_GPUS] = { 0 }; extern void quark_blake512_cpu_init(int thr_id, uint32_t threads); extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata); @@ -264,7 +264,6 @@ extern "C" int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, *hashes_done = pdata[19] - first_nonce + 1; - x15_whirlpool_cpu_free(thr_id); return 0; } @@ -274,16 +273,17 @@ extern "C" void free_x15(int thr_id) if (!init[thr_id]) return; - cudaSetDevice(device_map[thr_id]); + cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); quark_groestl512_cpu_free(thr_id); x11_simd512_cpu_free(thr_id); x13_fugue512_cpu_free(thr_id); + x15_whirlpool_cpu_free(thr_id); cuda_check_cpu_free(thr_id); - init[thr_id] = false; cudaDeviceSynchronize(); + init[thr_id] = false; } diff --git a/x17/x17.cu b/x17/x17.cu index d72f319..9bb40bd 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -78,6 +78,7 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t sta extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -296,16 +297,17 @@ extern "C" void free_x17(int thr_id) if (!init[thr_id]) return; - cudaSetDevice(device_map[thr_id]); + cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); quark_groestl512_cpu_free(thr_id); x11_simd512_cpu_free(thr_id); x13_fugue512_cpu_free(thr_id); + x15_whirlpool_cpu_free(thr_id); cuda_check_cpu_free(thr_id); - init[thr_id] = false; cudaDeviceSynchronize(); -} \ No newline at end of file + init[thr_id] = false; +}