From c6dcc5e5cfa8f32365cf9bcffd8153cdf4de92b2 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 11 Oct 2015 02:33:03 +0200 Subject: [PATCH] benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... --- Algo256/blake256.cu | 3 ++- README.txt | 4 ++++ api.cpp | 2 +- bench.cpp | 37 +++++++++++++++++++++---------------- ccminer.cpp | 25 ++++++++++++++----------- cuda.cpp | 23 ++++++++++++++++++++--- miner.h | 7 ++++++- skein.cu | 2 +- 8 files changed, 69 insertions(+), 34 deletions(-) diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu index 4fe7f14..065f1f6 100644 --- a/Algo256/blake256.cu +++ b/Algo256/blake256.cu @@ -388,7 +388,8 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non const uint32_t first_nonce = pdata[19]; uint64_t targetHigh = ((uint64_t*)ptarget)[3]; int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20; - uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity, max_nonce - first_nonce); + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + throughput = min(throughput, max_nonce - first_nonce); int rc = 0; diff --git a/README.txt b/README.txt index f53ffa4..a2a2a1a 100644 --- a/README.txt +++ b/README.txt @@ -96,6 +96,7 @@ its command line interface and options. x14 use to mine X14Coin x15 use to mine Halcyon x17 use to mine X17 + whirlpool use to mine Joincoin whirlpoolx use to mine Vanilla zr5 use to mine ZiftrCoin @@ -228,6 +229,9 @@ features. >>> RELEASE HISTORY <<< Under Dev... v1.7 + Restore whirlpool algo (and whirlcoin variant) + Prepare algo switch ability + Add --benchmark -a auto to run a multi algo benchmark Add --cuda-schedule parameter Add --show-diff parameter, which display shares diff, and is able to detect real solved blocks on pools. 
diff --git a/api.cpp b/api.cpp index f15f117..c9e28ba 100644 --- a/api.cpp +++ b/api.cpp @@ -990,7 +990,7 @@ void *api_thread(void *userdata) /* to be able to report the default value set in each algo */ void api_set_throughput(int thr_id, uint32_t throughput) { - if (&thr_info[thr_id]) { + if (thr_id < MAX_GPUS) { struct cgpu_info *cgpu = &thr_info[thr_id].gpu; uint32_t ws = throughput; uint8_t i = 0; diff --git a/bench.cpp b/bench.cpp index f820cc2..016e120 100644 --- a/bench.cpp +++ b/bench.cpp @@ -11,7 +11,9 @@ int bench_algo = -1; -static double * algo_hashrates[MAX_GPUS] = { 0 }; +static double algo_hashrates[MAX_GPUS][ALGO_COUNT] = { 0 }; +static uint32_t algo_throughput[MAX_GPUS][ALGO_COUNT] = { 0 }; +static int algo_mem_used[MAX_GPUS][ALGO_COUNT] = { 0 }; static int device_mem_free[MAX_GPUS] = { 0 }; static pthread_barrier_t miner_barr; @@ -25,18 +27,12 @@ void bench_init(int threads) { bench_algo = opt_algo = (enum sha_algos) 0; /* first */ applog(LOG_BLUE, "Starting benchmark mode with %s", algo_names[opt_algo]); - for (int n=0; n < MAX_GPUS; n++) { - algo_hashrates[n] = (double*) calloc(1, ALGO_COUNT * sizeof(double)); - } pthread_barrier_init(&miner_barr, NULL, threads); pthread_barrier_init(&algo_barr, NULL, threads); } void bench_free() { - for (int n=0; n < MAX_GPUS; n++) { - free(algo_hashrates[n]); - } pthread_barrier_destroy(&miner_barr); pthread_barrier_destroy(&algo_barr); } @@ -47,12 +43,7 @@ bool bench_algo_switch_next(int thr_id) int algo = (int) opt_algo; int prev_algo = algo; int dev_id = device_map[thr_id % MAX_GPUS]; - int mfree; - char rate[32] = { 0 }; - - // free current algo memory and track mem usage - miner_free_device(thr_id); - mfree = cuda_available_memory(thr_id); + int mfree, mused; algo++; @@ -70,16 +61,23 @@ bool bench_algo_switch_next(int thr_id) pthread_barrier_wait(&miner_barr); } - + char rate[32] = { 0 }; double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]); format_hashrate(hashrate, rate); 
applog(LOG_NOTICE, "GPU #%d: %s hashrate = %s", dev_id, algo_names[prev_algo], rate); + // free current algo memory and track mem usage + mused = cuda_available_memory(thr_id); + miner_free_device(thr_id); + mfree = cuda_available_memory(thr_id); + // check if there is memory leak if (device_mem_free[thr_id] > mfree) { applog(LOG_WARNING, "GPU #%d, memory leak detected in %s ! %d MB free", - dev_id, algo_names[prev_algo], mfree); + dev_id, algo_names[prev_algo], mfree); } + // store used memory per algo + algo_mem_used[thr_id][opt_algo] = device_mem_free[thr_id] - mused; device_mem_free[thr_id] = mfree; // store to dump a table per gpu later @@ -109,6 +107,11 @@ bool bench_algo_switch_next(int thr_id) return true; } +void bench_set_throughput(int thr_id, uint32_t throughput) +{ + algo_throughput[thr_id][opt_algo] = throughput; +} + void bench_display_results() { for (int n=0; n < opt_n_threads; n++) @@ -118,7 +121,9 @@ void bench_display_results() for (int i=0; i < ALGO_COUNT-1; i++) { double rate = algo_hashrates[n][i]; if (rate == 0.0) continue; - applog(LOG_INFO, "%12s : %12.1f kH/s", algo_names[i], rate / 1024.); + applog(LOG_INFO, "%12s : %12.1f kH/s, %5d MB, %8u thr.", algo_names[i], + rate / 1024., algo_mem_used[n][i], algo_throughput[n][i]); } } } + diff --git a/ccminer.cpp b/ccminer.cpp index d618208..8baa966 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -110,6 +110,7 @@ static json_t *opt_config; static const bool opt_time = true; enum sha_algos opt_algo = ALGO_X11; int opt_n_threads = 0; +int gpu_threads = 1; int64_t opt_affinity = -1L; int opt_priority = 0; static double opt_difficulty = 1.; @@ -1483,6 +1484,7 @@ static void *miner_thread(void *userdata) struct thr_info *mythr = (struct thr_info *)userdata; int switchn = pool_switch_count; int thr_id = mythr->id; + int dev_id = device_map[thr_id % MAX_GPUS]; struct work work; uint64_t loopcnt = 0; uint32_t max_nonce; @@ -1635,7 +1637,7 @@ static void *miner_thread(void *userdata) // --benchmark [-a 
auto] if (opt_benchmark && bench_algo >= 0) { - //applog(LOG_DEBUG, "GPU #%d: loop %d", device_map[thr_id], loopcnt); + //applog(LOG_DEBUG, "GPU #%d: loop %d", dev_id, loopcnt); if (loopcnt >= 3) { if (!bench_algo_switch_next(thr_id) && thr_id == 0) { @@ -1755,11 +1757,11 @@ static void *miner_thread(void *userdata) break; case ALGO_KECCAK: case ALGO_JACKPOT: - case ALGO_NEOSCRYPT: case ALGO_X15: minmax = 0x300000; break; case ALGO_LYRA2: + case ALGO_NEOSCRYPT: case ALGO_SCRYPT: minmax = 0x80000; break; @@ -1795,7 +1797,7 @@ static void *miner_thread(void *userdata) if (opt_debug) applog(LOG_DEBUG, "GPU #%d: start=%08x end=%08x range=%08x", - device_map[thr_id], start_nonce, max_nonce, (max_nonce-start_nonce)); + dev_id, start_nonce, max_nonce, (max_nonce-start_nonce)); hashes_done = 0; gettimeofday(&tv_start, NULL); @@ -1967,7 +1969,7 @@ static void *miner_thread(void *userdata) work.scanned_to = max_nonce; if (opt_debug && opt_benchmark) { // to debug nonce ranges - applog(LOG_DEBUG, "GPU #%d: ends=%08x range=%08x", device_map[thr_id], + applog(LOG_DEBUG, "GPU #%d: ends=%08x range=%08x", dev_id, nonceptr[0], (nonceptr[0] - start_nonce)); } } @@ -1978,8 +1980,7 @@ static void *miner_thread(void *userdata) /* output */ if (!opt_quiet && firstwork_time) { format_hashrate(thr_hashrates[thr_id], s); - applog(LOG_INFO, "GPU #%d: %s, %s", - device_map[thr_id], device_name[device_map[thr_id]], s); + applog(LOG_INFO, "GPU #%d: %s, %s", dev_id, device_name[dev_id], s); } /* ignore first loop hashrate */ @@ -2835,8 +2836,6 @@ void parse_arg(int key, char *arg) proper_exit(EXIT_CODE_CUDA_NODEVICE); } } - // set number of active gpus - active_gpus = opt_n_threads; pch = strtok (NULL, ","); } } @@ -3057,8 +3056,11 @@ int main(int argc, char *argv[]) if (num_cpus < 1) num_cpus = 1; + // number of gpus + active_gpus = cuda_num_devices(); + for (i = 0; i < MAX_GPUS; i++) { - device_map[i] = i; + device_map[i] = i % active_gpus; device_name[i] = NULL; device_config[i] = NULL; 
device_backoff[i] = is_windows() ? 12 : 2; @@ -3070,8 +3072,6 @@ int main(int argc, char *argv[]) device_pstate[i] = -1; } - // number of gpus - active_gpus = cuda_num_devices(); cuda_devicenames(); /* parse command line */ @@ -3192,6 +3192,9 @@ int main(int argc, char *argv[]) if (!opt_n_threads) opt_n_threads = active_gpus; + // generally doesn't work... let 1 + gpu_threads = opt_n_threads / active_gpus; + if (opt_benchmark && opt_algo == ALGO_AUTO) { bench_init(opt_n_threads); for (int n=0; n < MAX_GPUS; n++) { diff --git a/cuda.cpp b/cuda.cpp index 4c8d748..08c0a57 100644 --- a/cuda.cpp +++ b/cuda.cpp @@ -67,7 +67,8 @@ void cuda_devicenames() exit(1); } - GPU_N = min(MAX_GPUS, GPU_N); + if (opt_n_threads) + GPU_N = min(MAX_GPUS, opt_n_threads); for (int i=0; i < GPU_N; i++) { char vendorname[32] = { 0 }; @@ -98,7 +99,7 @@ void cuda_print_devices() int ngpus = cuda_num_devices(); cuda_devicenames(); for (int n=0; n < ngpus; n++) { - int m = device_map[n]; + int m = device_map[n % MAX_GPUS]; cudaDeviceProp props; cudaGetDeviceProperties(&props, m); if (!opt_n_threads || n < opt_n_threads) { @@ -148,10 +149,25 @@ int cuda_finddevice(char *name) return -1; } +// deprecated since 1.7 uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount) { uint32_t throughput = gpus_intensity[thr_id] ? gpus_intensity[thr_id] : defcount; + if (gpu_threads > 1) throughput >>= (gpu_threads-1); /* halve once per extra thread sharing the GPU */ api_set_throughput(thr_id, throughput); + bench_set_throughput(thr_id, throughput); + return throughput; +} + +// since 1.7 +uint32_t cuda_default_throughput(int thr_id, uint32_t defcount) +{ + //int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t throughput = gpus_intensity[thr_id] ? 
gpus_intensity[thr_id] : defcount; + if (gpu_threads > 1) throughput >>= (gpu_threads-1); /* halve once per extra thread sharing the GPU */ + api_set_throughput(thr_id, throughput); + bench_set_throughput(thr_id, throughput); + //if (opt_debug) applog(LOG_DEBUG, "GPU %d-%d: throughput %u", dev_id, thr_id, throughput); return throughput; } @@ -240,7 +256,8 @@ cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id) void cudaReportHardwareFailure(int thr_id, cudaError_t err, const char* func) { struct cgpu_info *gpu = &thr_info[thr_id].gpu; + int dev_id = device_map[thr_id % MAX_GPUS]; gpu->hw_errors++; - applog(LOG_ERR, "GPU #%d: %s %s", device_map[thr_id], func, cudaGetErrorString(err)); + applog(LOG_ERR, "GPU #%d: %s %s", dev_id, func, cudaGetErrorString(err)); sleep(1); } diff --git a/miner.h b/miner.h index a4dc10b..8441e00 100644 --- a/miner.h +++ b/miner.h @@ -447,6 +447,7 @@ extern bool opt_showdiff; extern bool opt_tracegpu; extern int opt_n_threads; extern int active_gpus; +extern int gpu_threads; extern int opt_timeout; extern bool want_longpoll; extern bool have_longpoll; @@ -489,6 +490,9 @@ int cuda_finddevice(char *name); void cuda_print_devices(); int cuda_available_memory(int thr_id); +uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount); +uint32_t cuda_default_throughput(int thr_id, uint32_t defcount); + #define CL_N "\x1B[0m" #define CL_RED "\x1B[31m" #define CL_GRN "\x1B[32m" @@ -522,6 +526,7 @@ int cuda_available_memory(int thr_id); extern void format_hashrate(double hashrate, char *output); extern void applog(int prio, const char *fmt, ...); +#define gpulog(prio, fmt, thr_id, ...) 
applog(prio, fmt, thr_id, __VA_ARGS__) void get_defconfig_path(char *out, size_t bufsize, char *argv0); extern void cbin2hex(char *out, const char *in, size_t len); extern char *bin2hex(const unsigned char *in, size_t len); @@ -533,7 +538,6 @@ void diff_to_target(uint32_t* target, double diff); void work_set_target(struct work* work, double diff); double target_to_diff(uint32_t* target); extern void get_currentalgo(char* buf, int sz); -extern uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount); // bignum double bn_convert_nbits(const uint32_t nbits); @@ -547,6 +551,7 @@ extern int bench_algo; void bench_init(int threads); void bench_free(); bool bench_algo_switch_next(int thr_id); +void bench_set_throughput(int thr_id, uint32_t throughput); void bench_display_results(); diff --git a/skein.cu b/skein.cu index b7b46ec..da88227 100644 --- a/skein.cu +++ b/skein.cu @@ -364,7 +364,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no throughput = min(throughput, (max_nonce - first_nonce)); uint32_t foundNonce, secNonce = 0; - uint64_t target64; + uint64_t target64 = 0; if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x03;