// ccminer-gostd-lite/bench.cpp

/**
 * Made to benchmark and test algo switch
 *
 * 2015 - tpruvot@github
 */

#include <unistd.h>

#include "miner.h"
#include "algos.h"
#include <cuda_runtime.h>

#ifdef __APPLE__
#include "compat/pthreads/pthread_barrier.hpp"
#endif

// -1 by default; bench_init() sets it to the first algo (0).
int bench_algo = -1;

// Per-thread result tables, indexed [thr_id][algo] and dumped by
// bench_display_results():
//  - algo_hashrates : hashrate stored by bench_algo_switch_next()
//  - algo_throughput: value passed to bench_set_throughput()
//  - algo_mem_used  : memory difference stored by bench_algo_switch_next()
static double algo_hashrates[MAX_GPUS][ALGO_COUNT] = { 0 };
static uint32_t algo_throughput[MAX_GPUS][ALGO_COUNT] = { 0 };
static int algo_mem_used[MAX_GPUS][ALGO_COUNT] = { 0 };
// Last cuda_available_memory() value kept per thread, used as the reference
// for the memory leak check (logged in MB).
static int device_mem_free[MAX_GPUS] = { 0 };

// miner_barr: waited on after the current algo was freed, before the switch.
static pthread_barrier_t miner_barr;
// algo_barr: waited on once the results were logged and stored.
static pthread_barrier_t algo_barr;
// Held around the stats purge and the update of opt_algo / hashrates.
static pthread_mutex_t bench_lock = PTHREAD_MUTEX_INITIALIZER;

// Defined outside this file; read and reset here per thread.
extern double thr_hashrates[MAX_GPUS];

void bench_init(int threads)
{
	bench_algo = opt_algo = (enum sha_algos) 0; /* first */
	applog(LOG_BLUE, "Starting benchmark mode with %s", algo_names[opt_algo]);
	pthread_barrier_init(&miner_barr, NULL, threads);
	pthread_barrier_init(&algo_barr, NULL, threads);
	// required for usage of first algo.
	for (int n=0; n < opt_n_threads; n++) {
		device_mem_free[n] = cuda_available_memory(n);
	}
}

/**
 * Destroy the two barriers created by bench_init(),
 * in the same order: miner_barr first, then algo_barr.
 */
void bench_free()
{
	pthread_barrier_t *const barriers[] = { &miner_barr, &algo_barr };
	const unsigned int count = sizeof(barriers) / sizeof(barriers[0]);

	for (unsigned int i = 0; i < count; i++)
		pthread_barrier_destroy(barriers[i]);
}

/**
 * Free the algo resources of a mining thread.
 * This is required before switching to another algo.
 *
 * @param thr_id mining thread id, forwarded to the algo free function
 */
void algo_free_all(int thr_id)
{
	// gostd is the only algo released here
	// (only initialized algos will be freed)
	free_gostd(thr_id);
}

/**
 * Benchmark all algos: finish the current algo and switch to the next one.
 * Called once per mining thread.
 *
 * Frees the memory of the current algo, waits for the other mining threads,
 * logs and stores the hashrate and memory usage of this thread, warns about
 * (and resets the device on) a possible memory leak, then selects the next
 * algo.
 *
 * @param thr_id mining thread id, used to index the per-thread tables
 * @return false when the algo following the current one is ALGO_AUTO
 *         (all algos done), true once the next algo has been selected
 */
bool bench_algo_switch_next(int thr_id)
{
	int algo = (int) opt_algo;
	int prev_algo = algo; // snapshot of the algo which just finished
	int mfree, mused;
	// does not seem to be enough to prevent the device from slowing down
	// after some algo switches
	bool need_reset = (gpu_threads == 1);

	algo++;

	// free current algo memory and track mem usage
	// (mused is sampled while the algo buffers are still allocated)
	mused = cuda_available_memory(thr_id);
	algo_free_all(thr_id);
	CUDA_LOG_ERROR();

	// device can take some time to free
	mfree = cuda_available_memory(thr_id);
	if (device_mem_free[thr_id] > mfree) {
		sleep(1);
		mfree = cuda_available_memory(thr_id);
	}

	// we need to wait completion on all cards before the switch
	if (opt_n_threads > 1) {
		pthread_barrier_wait(&miner_barr);
	}

	char rate[32] = { 0 };
	double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]);
	format_hashrate(hashrate, rate);
	gpulog(LOG_NOTICE, thr_id, "%s hashrate = %s", algo_names[prev_algo], rate);

	// ensure memory leak is still real after the barrier
	if (device_mem_free[thr_id] > mfree) {
		mfree = cuda_available_memory(thr_id);
	}

	// check if there is memory leak (differences <= 1 MB are ignored)
	if (device_mem_free[thr_id] - mfree > 1) {
		gpulog(LOG_WARNING, thr_id, "possible %d MB memory leak in %s! %d MB free",
			(device_mem_free[thr_id] - mfree), algo_names[prev_algo], mfree);
		cuda_reset_device(thr_id, NULL); // force to free the leak
		need_reset = false; // already done, do not reset twice
		mfree = cuda_available_memory(thr_id);
	}
	// store used memory per algo, indexed with the prev_algo snapshot like
	// algo_hashrates below, instead of reading the shared opt_algo again
	algo_mem_used[thr_id][prev_algo] = device_mem_free[thr_id] - mused;
	device_mem_free[thr_id] = mfree;

	// store to dump a table per gpu later
	algo_hashrates[thr_id][prev_algo] = hashrate;

	// wait the other threads to display logs correctly
	if (opt_n_threads > 1) {
		pthread_barrier_wait(&algo_barr);
	}

	if (algo == ALGO_AUTO)
		return false; // all algos done

	// mutex primary used for the stats purge
	pthread_mutex_lock(&bench_lock);
	stats_purge_all();

	opt_algo = (enum sha_algos) algo;
	global_hashrate = 0;
	thr_hashrates[thr_id] = 0; // reset for minmax64
	pthread_mutex_unlock(&bench_lock);

	if (need_reset)
		cuda_reset_device(thr_id, NULL);

	// only the first thread announces the new algo
	if (thr_id == 0)
		applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]);

	return true;
}

/**
 * Remember the throughput of a mining thread for the current algo (opt_algo),
 * to show it later in the benchmark results.
 *
 * @param thr_id     mining thread id
 * @param throughput value to store
 */
void bench_set_throughput(int thr_id, uint32_t throughput)
{
	uint32_t &slot = algo_throughput[thr_id][opt_algo];
	slot = throughput;
}

/**
 * Log the benchmark table: a header per mining thread, then one line per
 * algo with a stored (non-zero) hashrate, showing the hashrate, the memory
 * used and the throughput.
 *
 * NOTE(review): the hashrate is divided by 1024 but labelled "kH/s" -
 * confirm which unit is intended.
 */
void bench_display_results()
{
	int thr = 0;
	while (thr < opt_n_threads)
	{
		const int gpu = device_map[thr];
		applog(LOG_BLUE, "Benchmark results for GPU #%d - %s:", gpu, device_name[gpu]);

		// the last algo index (ALGO_COUNT-1) is not listed
		for (int a = 0; a + 1 < ALGO_COUNT; a++) {
			const double hr = algo_hashrates[thr][a];
			// algos without a stored hashrate are skipped
			if (hr != 0.0) {
				applog(LOG_INFO, "%12s : %12.1f kH/s, %5d MB, %8u thr.", algo_names[a],
					hr / 1024., algo_mem_used[thr][a], algo_throughput[thr][a]);
			}
		}
		thr++;
	}
}
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`/**`
			`* Made to benchmark and test algo switch`
			`*`
			`* 2015 - tpruvot@github`
			`*/`

benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`#include <unistd.h>`

refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`#include "miner.h"`
			`#include "algos.h"`
fix various memory leaks on algo switch 8 years ago			`#include <cuda_runtime.h>`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago
Improve project build compatibility (mac/vs2015) This is incomplete, but is a first step... 8 years ago			`#ifdef __APPLE__`
			`#include "compat/pthreads/pthread_barrier.hpp"`
			`#endif`

refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`int bench_algo = -1;`

benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`static double algo_hashrates[MAX_GPUS][ALGO_COUNT] = { 0 };`
			`static uint32_t algo_throughput[MAX_GPUS][ALGO_COUNT] = { 0 };`
			`static int algo_mem_used[MAX_GPUS][ALGO_COUNT] = { 0 };`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`static int device_mem_free[MAX_GPUS] = { 0 };`

			`static pthread_barrier_t miner_barr;`
			`static pthread_barrier_t algo_barr;`
			`static pthread_mutex_t bench_lock = PTHREAD_MUTEX_INITIALIZER;`

			`extern double thr_hashrates[MAX_GPUS];`

			`void bench_init(int threads)`
			`{`
			`bench_algo = opt_algo = (enum sha_algos) 0; /* first */`
			`applog(LOG_BLUE, "Starting benchmark mode with %s", algo_names[opt_algo]);`
			`pthread_barrier_init(&miner_barr, NULL, threads);`
			`pthread_barrier_init(&algo_barr, NULL, threads);`
intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`// required for usage of first algo.`
			`for (int n=0; n < opt_n_threads; n++) {`
			`device_mem_free[n] = cuda_available_memory(n);`
			`}`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`}`

			`void bench_free()`
			`{`
			`pthread_barrier_destroy(&miner_barr);`
			`pthread_barrier_destroy(&algo_barr);`
			`}`

warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`// required to switch algos`
			`void algo_free_all(int thr_id)`
			`{`
			`// only initialized algos will be freed`
add gostd algo 7 years ago			`free_gostd(thr_id);`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`}`

refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`// benchmark all algos (called once per mining thread)`
			`bool bench_algo_switch_next(int thr_id)`
			`{`
			`int algo = (int) opt_algo;`
			`int prev_algo = algo;`
			`int dev_id = device_map[thr_id % MAX_GPUS];`
benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`int mfree, mused;`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`// doesnt seems enough to prevent device slow down`
			`// after some algo switchs`
			`bool need_reset = (gpu_threads == 1);`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago
			`algo++;`

warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`// free current algo memory and track mem usage`
			`mused = cuda_available_memory(thr_id);`
			`algo_free_all(thr_id);`
prepare the 1.7 release 9 years ago			`CUDA_LOG_ERROR();`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago
			`// device can take some time to free`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`mfree = cuda_available_memory(thr_id);`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`if (device_mem_free[thr_id] > mfree) {`
			`sleep(1);`
			`mfree = cuda_available_memory(thr_id);`
			`}`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`// we need to wait completion on all cards before the switch`
			`if (opt_n_threads > 1) {`
			`pthread_barrier_wait(&miner_barr);`
			`}`

benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`char rate[32] = { 0 };`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]);`
			`format_hashrate(hashrate, rate);`
add gpulog() function helper, simple and multi-threads when using multiple cpu threads per gpu, use the T prefix, ex: [2015-10-11 09:52:49] GPU #0: app clocks set to P0 (3600/1228) vs [2015-10-11 09:52:51] GPU T0: MSI GTX 960, 5953.35 kH/s Only thr_id is required, the function take care of the dev id 9 years ago			`gpulog(LOG_NOTICE, thr_id, "%s hashrate = %s", algo_names[prev_algo], rate);`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`// ensure memory leak is still real after the barrier`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`if (device_mem_free[thr_id] > mfree) {`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`mfree = cuda_available_memory(thr_id);`
			`}`

			`// check if there is memory leak`
			`if (device_mem_free[thr_id] - mfree > 1) {`
			`gpulog(LOG_WARNING, thr_id, "possible %d MB memory leak in %s! %d MB free",`
			`(device_mem_free[thr_id] - mfree), algo_names[prev_algo], mfree);`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`cuda_reset_device(thr_id, NULL); // force to free the leak`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`need_reset = false;`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`mfree = cuda_available_memory(thr_id);`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`}`
benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`// store used memory per algo`
			`algo_mem_used[thr_id][opt_algo] = device_mem_free[thr_id] - mused;`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`device_mem_free[thr_id] = mfree;`

			`// store to dump a table per gpu later`
			`algo_hashrates[thr_id][prev_algo] = hashrate;`

			`// wait the other threads to display logs correctly`
			`if (opt_n_threads > 1) {`
			`pthread_barrier_wait(&algo_barr);`
			`}`

			`if (algo == ALGO_AUTO)`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`return false; // all algos done`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago
			`// mutex primary used for the stats purge`
			`pthread_mutex_lock(&bench_lock);`
			`stats_purge_all();`

			`opt_algo = (enum sha_algos) algo;`
			`global_hashrate = 0;`
			`thr_hashrates[thr_id] = 0; // reset for minmax64`
			`pthread_mutex_unlock(&bench_lock);`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if (need_reset)`
			`cuda_reset_device(thr_id, NULL);`

refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`if (thr_id == 0)`
			`applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]);`

			`return true;`
			`}`

benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`void bench_set_throughput(int thr_id, uint32_t throughput)`
			`{`
			`algo_throughput[thr_id][opt_algo] = throughput;`
			`}`

refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`void bench_display_results()`
			`{`
			`for (int n=0; n < opt_n_threads; n++)`
			`{`
			`int dev_id = device_map[n];`
			`applog(LOG_BLUE, "Benchmark results for GPU #%d - %s:", dev_id, device_name[dev_id]);`
			`for (int i=0; i < ALGO_COUNT-1; i++) {`
			`double rate = algo_hashrates[n][i];`
			`if (rate == 0.0) continue;`
benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`applog(LOG_INFO, "%12s : %12.1f kH/s, %5d MB, %8u thr.", algo_names[i],`
			`rate / 1024., algo_mem_used[n][i], algo_throughput[n][i]);`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`}`
			`}`
			`}`