From 4e1e03b8916fbc1cb48662990fb379b7696403c2 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Thu, 8 Oct 2015 23:55:32 +0200
Subject: [PATCH] benchmark: store all algos results + cuda fixes

Note: lyra2, lyra2v2 and scrypt seem to have problems coexisting with
other algos (when run after some of them), so lyra2 was moved first and
scrypt/scrypt-jane are skipped for the moment.

Results are only stored in memory for now, to display a table after the
benchmark:

  ccminer -a auto --benchmark

They may be exported to a json file later.
---
 Algo256/bmw.cu               |   3 +-
 ccminer.cpp                  | 148 ++++++++++++++++++++++++-----------
 cuda_nist5.cu                |   3 +-
 lyra2/lyra2RE.cu             |  12 +--
 lyra2/lyra2REv2.cu           |  10 ++-
 neoscrypt/neoscrypt.cpp      |  12 +--
 quark/cuda_quark_blake512.cu |   2 +-
 scrypt-jane.cpp              |   2 +
 scrypt.cpp                   |   2 +
 x15/x14.cu                   |   2 +-
 x15/x15.cu                   |   6 +-
 11 files changed, 135 insertions(+), 67 deletions(-)

diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu
index 9b91252..d9f9e72 100644
--- a/Algo256/bmw.cu
+++ b/Algo256/bmw.cu
@@ -49,7 +49,7 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u
     throughput = min(throughput, max_nonce - first_nonce);
 
     if (opt_benchmark)
-        ((uint32_t*)ptarget)[7] = 0x0005;
+        ptarget[7] = 0x0005;
 
     if (!init[thr_id]) {
         cudaSetDevice(device_map[thr_id]);
@@ -66,6 +66,7 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u
         be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
     }
 
+    cudaGetLastError();
     bmw256_setBlock_80(thr_id, (void*)endiandata);
     cuda_check_cpu_setTarget(ptarget);
 
diff --git a/ccminer.cpp b/ccminer.cpp
index 8e23d6c..eda3a35 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -85,7 +85,9 @@ struct workio_cmd {
 };
 
 enum sha_algos {
-    ALGO_BLAKE,
+    ALGO_BLAKE = 0,
+    ALGO_LYRA2, /* moved first for benchs */
+    ALGO_LYRA2v2,
     ALGO_BLAKECOIN,
     ALGO_BMW,
     ALGO_C11,
@@ -98,8 +100,6 @@ enum sha_algos {
     ALGO_KECCAK,
     ALGO_JACKPOT,
     ALGO_LUFFA,
-    ALGO_LYRA2,
-    ALGO_LYRA2v2,
     ALGO_MJOLLNIR, /* Hefty hash */
     ALGO_MYR_GR,
     ALGO_NEOSCRYPT,
@@ -127,6 +127,8 @@ enum sha_algos {
 
 static const char *algo_names[] = {
     "blake",
+    "lyra2",
+    "lyra2v2",
     "blakecoin",
     "bmw",
     "c11",
@@ -139,8 +141,6 @@ static const char *algo_names[] = {
     "keccak",
     "jackpot",
     "luffa",
-    "lyra2",
-    "lyra2v2",
     "mjollnir",
     "myr-gr",
     "neoscrypt",
@@ -171,7 +171,6 @@ bool opt_debug_diff = false;
 bool opt_debug_threads = false;
 bool opt_protocol = false;
 bool opt_benchmark = false;
-int algo_benchmark = -1;
 bool opt_showdiff = false;
 
 // todo: limit use of these flags,
@@ -266,6 +265,7 @@ volatile bool abort_flag = false;
 struct work_restart *work_restart = NULL;
 static int app_exit_code = EXIT_CODE_OK;
 
+pthread_mutex_t algo_lock;
 pthread_mutex_t applog_lock;
 static pthread_mutex_t stats_lock;
 static double thr_hashrates[MAX_GPUS] = { 0 };
@@ -279,8 +279,12 @@ uint8_t conditional_state[MAX_GPUS] = { 0 };
 double opt_max_temp = 0.0;
 double opt_max_diff = -1.;
 double opt_max_rate = -1.;
-
+// algos benchmark
+int algo_benchmark = -1;
+double * algo_hashrates[MAX_GPUS] = { 0 };
+int device_mem_free[MAX_GPUS] = { 0 };
 int opt_statsavg = 30;
+
 // strdup on char* to allow a common free() if used
 static char* opt_syslog_pfx = strdup(PROGRAM_NAME);
 char *opt_api_allow = strdup("127.0.0.1"); /* 0.0.0.0 for all ips */
@@ -1562,6 +1566,9 @@ void miner_free_device(int thr_id)
     //free_sha256d(thr_id);
     free_scrypt(thr_id);
     free_scrypt_jane(thr_id);
+
+    // reset remains of error..
+    cudaGetLastError();
 }
 
 // to benchmark all algos
@@ -1577,40 +1584,78 @@ bool algo_switch_next(int thr_id)
     miner_free_device(thr_id);
     mfree = cuda_available_memory(thr_id);
 
-    work_restart[thr_id].restart = 1;
-    algo++;
-    if (algo == ALGO_AUTO)
-        return false;
+
+    // skip some duplicated algos
+    if (algo == ALGO_C11) algo++; // same as x11
+    if (algo == ALGO_DMD_GR) algo++; // same as groestl
+    if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool
+    // and unwanted ones...
+    if (algo == ALGO_SCRYPT) algo++;
+    if (algo == ALGO_SCRYPT_JANE) algo++;
+
+    work_restart[thr_id].restart = 1;
 
     // we need to wait completion on all cards before the switch
     if (opt_n_threads > 1) {
-        pthread_mutex_lock(&stratum_sock_lock); // unused in benchmark
-        for (int n=0; n < opt_n_threads; n++)
-            if (!work_restart[thr_id].restart) {
-                applog(LOG_DEBUG, "GPU #%d: waiting GPU %d", dev_id, device_map[n]);
-                usleep(100*1000);
+        pthread_mutex_lock(&algo_lock); // wait work_restart for all
+        for (int n=0; n < opt_n_threads; n++) {
+            int timeout = 1000;
+            while (!work_restart[n].restart && --timeout) {
+                //applog(LOG_DEBUG, "GPU #%d: waiting GPU %d", dev_id, device_map[n]);
+                usleep(200*1000);
             }
-        sleep(1);
-        pthread_mutex_unlock(&stratum_sock_lock);
+        }
+        pthread_mutex_unlock(&algo_lock);
     }
 
     double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]);
     format_hashrate(hashrate, rate);
-    applog(LOG_NOTICE, "GPU #%d: %s rate: %s - %d MB free", dev_id, algo_names[prev_algo], rate, mfree);
+    applog(LOG_NOTICE, "GPU #%d: %s hashrate = %s", dev_id, algo_names[prev_algo], rate);
+
+    // check if there are memory leaks
+    if (device_mem_free[thr_id] > mfree)
+        applog(LOG_WARNING, "GPU #%d, memory leak detected! %d MB free", dev_id, mfree);
+    device_mem_free[thr_id] = mfree;
+
+    // store to dump a table per gpu later
+    algo_hashrates[thr_id][prev_algo] = hashrate;
+
+    if (algo == ALGO_AUTO)
+        return false;
+
+    // wait other threads before algo switch
+    pthread_mutex_lock(&algo_lock);
+
+    opt_algo = (enum sha_algos) algo;
+    work_restart[thr_id].restart = 0;
     stats_purge_all();
     global_hashrate = 0;
 
-    opt_algo = (enum sha_algos) algo;
+    if (thr_id == 0)
+        applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]);
 
-    applog(LOG_BLUE, "GPU #%d: Benchmark for algo %s...", dev_id, algo_names[algo]);
-    sleep(1);
-    work_restart[thr_id].restart = 0;
+    //applog(LOG_BLUE, "GPU #%d: Benchmark algo %s...", dev_id, algo_names[algo]);
+    pthread_mutex_unlock(&algo_lock);
 
     return true;
 }
 
+static void display_benchmark_results()
+{
+    for (int n=0; n < opt_n_threads; n++)
+    {
+        int dev_id = device_map[n];
+        applog(LOG_BLUE, "Benchmark results for GPU #%d - %s:", dev_id, device_name[dev_id]);
+        for (int i=0; i < ALGO_COUNT-1; i++) {
+            double rate = algo_hashrates[n][i];
+            if (rate == 0.0) continue;
+            applog(LOG_INFO, "%12s : %15.0f H/s", algo_names[i], rate);
+        }
+    }
+}
+
 static void *miner_thread(void *userdata)
 {
     struct thr_info *mythr = (struct thr_info *)userdata;
@@ -1729,19 +1774,6 @@ static void *miner_thread(void *userdata)
             }
         }
 
-        if (opt_benchmark && algo_benchmark >= 0) {
-            if (loopcnt > 3) {
-                if (!algo_switch_next(thr_id)) {
-                    proper_exit(0);
-                    break;
-                }
-                algo_benchmark = (int) opt_algo;
-                // for scrypt...
-                opt_autotune = false;
-                loopcnt = 0;
-            }
-        }
-
         if (!opt_benchmark && (g_work.height != work.height || memcmp(work.target, g_work.target, sizeof(work.target))))
         {
             if (opt_debug) {
@@ -1779,6 +1811,24 @@ static void *miner_thread(void *userdata)
 
         pthread_mutex_unlock(&g_work_lock);
 
+        // -a auto --benchmark
+        if (opt_benchmark && algo_benchmark >= 0) {
+            //applog(LOG_DEBUG, "GPU #%d: loop %d", device_map[thr_id], loopcnt);
+            if (loopcnt >= 3) {
+                if (!algo_switch_next(thr_id) && thr_id == 0)
+                {
+                    display_benchmark_results();
+                    proper_exit(0);
+                    break;
+                }
+                algo_benchmark = (int) opt_algo;
+                // for scrypt...
+                opt_autotune = false;
+                loopcnt = 0;
+            }
+        }
+        loopcnt++;
+
         /* prevent gpu scans before a job is received */
         if (have_stratum && work.data[0] == 0 && !opt_benchmark) {
             sleep(1);
@@ -1877,6 +1927,7 @@ static void *miner_thread(void *userdata)
                 minmax = 0x2000000;
                 break;
             case ALGO_C11:
+            case ALGO_DEEP:
             case ALGO_LYRA2v2:
             case ALGO_S3:
             case ALGO_X11:
@@ -1885,11 +1936,11 @@
             case ALGO_WHIRLPOOL:
                 minmax = 0x400000;
                 break;
-            case ALGO_LYRA2:
             case ALGO_NEOSCRYPT:
             case ALGO_X15:
                 minmax = 0x300000;
                 break;
+            case ALGO_LYRA2:
             case ALGO_SCRYPT:
                 minmax = 0x80000;
                 break;
@@ -1914,7 +1965,7 @@
         else
             max_nonce = (uint32_t) (max64 + start_nonce);
 
-        // todo: keep it rounded for gpu threads ?
+        // todo: keep it rounded to a multiple of 256 ?
 
         if (unlikely(start_nonce > max_nonce)) {
             // should not happen but seen in skein2 benchmark with 2 gpus
@@ -1930,6 +1981,9 @@
         hashes_done = 0;
         gettimeofday(&tv_start, NULL);
+
+        cudaGetLastError(); // reset previous errors
+
         /* scan nonces for a proof-of-work hash */
         switch (opt_algo) {
 
@@ -2080,7 +2134,7 @@
             pthread_mutex_lock(&stats_lock);
             thr_hashrates[thr_id] = hashes_done / dtime;
             thr_hashrates[thr_id] *= rate_factor;
-            if (loopcnt) // ignore first (init time)
+            if (loopcnt > 1) // ignore first (init time)
                 stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height);
             pthread_mutex_unlock(&stats_lock);
         }
@@ -2116,7 +2170,7 @@
             for (int i = 0; i < opt_n_threads && thr_hashrates[i]; i++)
                 hashrate += stats_get_speed(i, thr_hashrates[i]);
             pthread_mutex_unlock(&stats_lock);
-            if (opt_benchmark) {
+            if (opt_benchmark && algo_benchmark == -1) {
                 format_hashrate(hashrate, s);
                 applog(LOG_NOTICE, "Total: %s", s);
             }
@@ -2159,7 +2213,6 @@
                 break;
             }
         }
-        loopcnt++;
     }
 
 out:
@@ -3084,25 +3137,27 @@ static void parse_cmdline(int argc, char *argv[])
         parse_arg(key, optarg);
     }
     if (optind < argc) {
-        fprintf(stderr, "%s: unsupported non-option argument '%s'\n",
+        fprintf(stderr, "%s: unsupported non-option argument '%s' (see --help)\n",
             argv[0], argv[optind]);
-        show_usage_and_exit(1);
+        //show_usage_and_exit(1);
     }
 
     parse_config(opt_config);
 
-    if (opt_algo == ALGO_HEAVY && opt_vote == 9999) {
+    if (opt_algo == ALGO_HEAVY && opt_vote == 9999 && !opt_benchmark) {
         fprintf(stderr, "%s: Heavycoin hash requires block reward vote parameter (see --vote)\n",
             argv[0]);
         show_usage_and_exit(1);
     }
 
     if (opt_algo == ALGO_AUTO) {
-        for (int n=0; n < MAX_GPUS; n++)
+        for (int n=0; n < MAX_GPUS; n++) {
             gpus_intensity[n] = 0; // use default
+            algo_hashrates[n] = (double*) calloc(1, ALGO_COUNT * sizeof(double));
+        }
         if (opt_benchmark) {
             opt_autotune = false;
-            algo_benchmark = opt_algo = ALGO_BLAKE; /* first */
+            algo_benchmark = opt_algo = (enum sha_algos) 0; /* first */
             applog(LOG_BLUE, "Starting benchmark mode");
         }
     }
@@ -3177,6 +3232,7 @@ int main(int argc, char *argv[])
     jane_params = strdup("");
 
     pthread_mutex_init(&applog_lock, NULL);
+    pthread_mutex_init(&algo_lock, NULL);
 
     // number of cpus for thread affinity
 #if defined(WIN32)
diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index 44bfe60..9f92987 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -83,6 +83,7 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,
 
     if (!init[thr_id])
     {
+        cudaDeviceSynchronize();
         cudaSetDevice(device_map[thr_id]);
 
         // Constants copy/init (no device alloc in these algos)
@@ -179,4 +180,4 @@ extern "C" void free_nist5(int thr_id)
     init[thr_id] = false;
 
     cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
index 5a9cb71..daa6dd7 100644
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@@ -10,7 +10,7 @@ extern "C" {
 #include "cuda_helper.h"
 
 static uint64_t* d_hash[MAX_GPUS];
-//static uint64_t* d_hash2[MAX_GPUS];
+//static uint64_t* d_matrix[MAX_GPUS];
 
 extern void blake256_cpu_init(int thr_id, uint32_t threads);
 extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
@@ -87,11 +87,12 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
     throughput = min(throughput, max_nonce - first_nonce);
 
     if (opt_benchmark)
-        ((uint32_t*)ptarget)[7] = 0x00ff;
+        ptarget[7] = 0x00ff;
 
     if (!init[thr_id])
     {
         cudaSetDevice(device_map[thr_id]);
+        cudaGetLastError(); // reset last error
 
         blake256_cpu_init(thr_id, throughput);
         keccak256_cpu_init(thr_id,throughput);
@@ -99,8 +100,8 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
         groestl256_cpu_init(thr_id, throughput);
 
         // DMatrix
-//      cudaMalloc(&d_hash2[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput);
-//      lyra2_cpu_init(thr_id, throughput, d_hash2[thr_id]);
+//      cudaMalloc(&d_matrix[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput);
+//      lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
 
         CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
 
@@ -174,6 +175,7 @@ extern "C" void free_lyra2(int thr_id)
     cudaSetDevice(device_map[thr_id]);
 
     cudaFree(d_hash[thr_id]);
+    //cudaFree(d_matrix[thr_id]);
 
     keccak256_cpu_free(thr_id);
     groestl256_cpu_free(thr_id);
@@ -181,4 +183,4 @@ extern "C" void free_lyra2(int thr_id)
     init[thr_id] = false;
 
     cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
index 15327ec..b84441e 100644
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -84,14 +84,16 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
     uint32_t throughput = device_intensity(dev_id, __func__, defthr);
 
     if (opt_benchmark)
-        ((uint32_t*)ptarget)[7] = 0x00ff;
+        ptarget[7] = 0x00ff;
 
     if (!init[thr_id])
     {
-        cudaSetDevice(device_map[thr_id]);
+        cudaSetDevice(dev_id);
         //cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
         //if (opt_n_gputhreads == 1)
         //  cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+        cudaGetLastError();
+
         blake256_cpu_init(thr_id, throughput);
         keccak256_cpu_init(thr_id,throughput);
         skein256_cpu_init(thr_id, throughput);
@@ -103,8 +105,8 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
             return -1;
         }
 
-        // DMatrix
-        CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], (size_t)16 * sizeof(uint64_t) * 4 * 3 * throughput));
+        // DMatrix (780Ti may prefer 16 instead of 12, cf djm34)
+        CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], (size_t)12 * sizeof(uint64_t) * 4 * 4 * throughput));
         lyra2v2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
 
         CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp
index 27551ae..92d7d1f 100644
--- a/neoscrypt/neoscrypt.cpp
+++ b/neoscrypt/neoscrypt.cpp
@@ -16,26 +16,28 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
     uint32_t *ptarget = work->target;
     const uint32_t first_nonce = pdata[19];
 
+    int dev_id = device_map[thr_id];
     int intensity = is_windows() ? 18 : 19;
     uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
     throughput = throughput / 32; /* set for max intensity ~= 20 */
     throughput = min(throughput, max_nonce - first_nonce + 1);
 
     if (opt_benchmark)
-        ((uint32_t*)ptarget)[7] = 0x0000ff;
+        ptarget[7] = 0x00ff;
 
     if (!init[thr_id])
     {
-        int dev_id = device_map[thr_id];
+        cudaDeviceSynchronize();
         cudaSetDevice(dev_id);
         cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+        cudaGetLastError(); // reset errors if device is not "reset"
 
         if (device_sm[dev_id] <= 300) {
             applog(LOG_ERR, "Sorry neoscrypt is not supported on SM 3.0 devices");
             proper_exit(EXIT_CODE_CUDA_ERROR);
         }
 
-        applog(LOG_INFO, "Using %d cuda threads", throughput);
+        applog(LOG_INFO, "GPU #%d: Using %d cuda threads", dev_id, throughput);
         neoscrypt_cpu_init(thr_id, throughput);
 
         init[thr_id] = true;
@@ -71,7 +73,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
             pdata[19] = foundNonce;
             return 1;
         } else {
-            applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+            applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", dev_id, foundNonce);
         }
     }
 
@@ -95,4 +97,4 @@ void free_neoscrypt(int thr_id)
     init[thr_id] = false;
 
     cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu
index 5ac6cdb..1cb88ae 100644
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@@ -225,7 +225,7 @@ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
 __host__
 void quark_blake512_cpu_init(int thr_id, uint32_t threads)
 {
-    CUDA_SAFE_CALL(cudaGetLastError());
+//  CUDA_SAFE_CALL(cudaGetLastError());
 }
 
 __host__
diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp
index 099501a..8023fe9 100644
--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@@ -434,6 +434,7 @@ void free_scrypt_jane(int thr_id)
     int dev_id = device_map[thr_id];
 
     cudaSetDevice(dev_id);
+    cudaDeviceSynchronize();
     cudaDeviceReset(); // well, simple way to free ;)
 
     init[thr_id] = false;
@@ -479,6 +480,7 @@ int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsi
         int dev_id = device_map[thr_id];
         cudaSetDevice(dev_id);
+        cudaDeviceSynchronize();
         cudaDeviceReset();
         cudaSetDevice(dev_id);
         throughput = cuda_throughput(thr_id);
diff --git a/scrypt.cpp b/scrypt.cpp
index 62c5a58..e447583 100644
--- a/scrypt.cpp
+++ b/scrypt.cpp
@@ -694,6 +694,7 @@ void free_scrypt(int thr_id)
     // trivial way to free all...
     cudaSetDevice(dev_id);
+    cudaDeviceSynchronize();
     cudaDeviceReset();
 
     init[thr_id] = false;
@@ -714,6 +715,7 @@ int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned
     if (!init[thr_id]) {
         int dev_id = device_map[thr_id];
         cudaSetDevice(dev_id);
+        cudaDeviceSynchronize();
         cudaDeviceReset();
         cudaSetDevice(dev_id);
         throughput = cuda_throughput(thr_id);
diff --git a/x15/x14.cu b/x15/x14.cu
index 195ace9..990f1d4 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -168,7 +168,7 @@ extern "C" int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce,
     throughput = min(throughput, max_nonce - first_nonce);
 
     if (opt_benchmark)
-        ((uint32_t*)ptarget)[7] = 0x000f;
+        ptarget[7] = 0x000f;
 
     if (!init[thr_id])
     {
diff --git a/x15/x15.cu b/x15/x15.cu
index 5808770..3b21345 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -178,7 +178,7 @@ extern "C" int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce,
     throughput = min(throughput, max_nonce - first_nonce);
 
     if (opt_benchmark)
-        ((uint32_t*)ptarget)[7] = 0x00FF;
+        ptarget[7] = 0x00FF;
 
     if (!init[thr_id])
     {
@@ -199,7 +199,7 @@ extern "C" int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce,
         x14_shabal512_cpu_init(thr_id, throughput);
         x15_whirlpool_cpu_init(thr_id, throughput, 0);
 
-        CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+        CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
         cuda_check_cpu_init(thr_id, throughput);
 
         init[thr_id] = true;
@@ -286,4 +286,4 @@ extern "C" void free_x15(int thr_id)
     init[thr_id] = false;
 
     cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
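
Possible follow-up (not part of this patch): the message above says the stored
results "may be exported later to a json file". A minimal sketch of such an
export, reusing the algo_hashrates[] table that this patch fills in
algo_switch_next(), could look like the code below. The function name
bench_export_json() and the extern declarations are assumptions only; several
of these symbols are file-static in ccminer.cpp, so a real implementation would
live next to display_benchmark_results() rather than use extern.

/* hypothetical helper, not in this patch */
#include <stdio.h>

extern int opt_n_threads;
extern int device_map[];          /* assumed to match miner.h */
extern char *device_name[];       /* assumed to match miner.h */
extern const char *algo_names[];
extern double *algo_hashrates[];

static void bench_export_json(const char *path, int algo_count)
{
    FILE *f = fopen(path, "w");
    if (!f) return;
    fprintf(f, "[\n");
    for (int n = 0; n < opt_n_threads; n++) {
        int dev_id = device_map[n];
        /* one object per gpu thread, with one rate per benched algo */
        fprintf(f, "  { \"gpu\": %d, \"device\": \"%s\", \"rates\": {", dev_id, device_name[dev_id]);
        int first = 1;
        for (int i = 0; i < algo_count; i++) {
            if (algo_hashrates[n][i] == 0.0) continue; /* skipped or duplicated algos */
            fprintf(f, "%s \"%s\": %.0f", first ? "" : ",", algo_names[i], algo_hashrates[n][i]);
            first = 0;
        }
        fprintf(f, " } }%s\n", (n < opt_n_threads - 1) ? "," : "");
    }
    fprintf(f, "]\n");
    fclose(f);
}

/* would be called after display_benchmark_results(), e.g.
 *   bench_export_json("benchmark.json", ALGO_COUNT - 1); */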