ccminer/bench.cpp

/**
 * Made to benchmark and test algo switch
 *
 * 2015 - tpruvot@github
 */

#include <unistd.h>

#include "miner.h"
#include "algos.h"

// Algo selected when the benchmark started; stays at -1 until bench_init() runs.
int bench_algo = -1;

// Per-thread, per-algo result tables, filled while the benchmark walks the algo list
// and printed by bench_display_results().
static double algo_hashrates[MAX_GPUS][ALGO_COUNT] = { 0 };   // written by bench_algo_switch_next()
static uint32_t algo_throughput[MAX_GPUS][ALGO_COUNT] = { 0 }; // written by bench_set_throughput()
static int algo_mem_used[MAX_GPUS][ALGO_COUNT] = { 0 };        // reported as MB in the results table
// Last value returned by cuda_available_memory() per thread (logged as MB),
// used as the baseline for the leak detection.
static int device_mem_free[MAX_GPUS] = { 0 };

// Barriers used to keep the mining threads in step during an algo switch.
static pthread_barrier_t miner_barr; // waited on after the current algo was freed
static pthread_barrier_t algo_barr;  // waited on after the per-thread results were logged
// Serializes the stats purge and the update of the shared algo/hashrate state.
static pthread_mutex_t bench_lock = PTHREAD_MUTEX_INITIALIZER;

// Per-thread hashrates, defined outside this file.
extern double thr_hashrates[MAX_GPUS];

// Set up the benchmark mode: select the first algo of the list, create the two
// synchronization barriers for `threads` participants and record the memory
// currently reported as available for each mining thread.
void bench_init(int threads)
{
	// the benchmark always starts with the first entry of the algo list
	opt_algo = (enum sha_algos) 0;
	bench_algo = opt_algo;
	applog(LOG_BLUE, "Starting benchmark mode with %s", algo_names[opt_algo]);

	pthread_barrier_init(&miner_barr, NULL, threads);
	pthread_barrier_init(&algo_barr, NULL, threads);

	// baseline needed before the first algo allocates anything
	int thr = 0;
	while (thr < opt_n_threads) {
		device_mem_free[thr] = cuda_available_memory(thr);
		thr++;
	}
}

// Destroy the two barriers created by bench_init().
void bench_free()
{
	pthread_barrier_t *barriers[] = { &miner_barr, &algo_barr };
	const unsigned count = sizeof(barriers) / sizeof(barriers[0]);

	for (unsigned b = 0; b < count; b++)
		pthread_barrier_destroy(barriers[b]);
}

// Required to switch algos: calls the free_*() routine of every supported algo
// for the given mining thread, whatever algo was actually running.
//   thr_id: index of the mining thread whose resources are released
void algo_free_all(int thr_id)
{
	// only initialized algos will be freed
	// NOTE(review): this relies on each free_*() being a no-op when its algo was
	// never initialized for this thread - confirm in the individual algo sources.
	free_blake256(thr_id);
	free_bmw(thr_id);
	free_c11(thr_id);
	free_deep(thr_id);
	free_keccak256(thr_id);
	free_fresh(thr_id);
	free_fugue256(thr_id);
	free_groestlcoin(thr_id);
	free_heavy(thr_id);
	free_jackpot(thr_id);
	free_luffa(thr_id);
	free_lyra2(thr_id);
	free_lyra2v2(thr_id);
	free_myriad(thr_id);
	free_neoscrypt(thr_id);
	free_nist5(thr_id);
	free_pentablake(thr_id);
	free_quark(thr_id);
	free_qubit(thr_id);
	free_skeincoin(thr_id);
	free_skein2(thr_id);
	free_s3(thr_id);
	free_whirl(thr_id);
	free_whirlx(thr_id);
	free_x11(thr_id);
	free_x13(thr_id);
	free_x14(thr_id);
	free_x15(thr_id);
	free_x17(thr_id);
	free_zr5(thr_id);
	// sha256d is intentionally left out here (call disabled)
	//free_sha256d(thr_id);
	free_scrypt(thr_id);
	free_scrypt_jane(thr_id);
}

// Tell whether the benchmark must skip `algo` on the device `dev_id`:
// duplicates of another algo, kernels flagged as incompatible with SM < 3.0
// devices, and algos deliberately left out of the benchmark.
static bool bench_algo_is_skipped(int algo, int dev_id)
{
	// duplicated algos
	if (algo == ALGO_C11) return true; // same as x11
	if (algo == ALGO_DMD_GR) return true; // same as groestl
	if (algo == ALGO_MJOLLNIR) return true; // same as heavy
	if (algo == ALGO_WHIRLCOIN) return true; // same as whirlpool

	if (device_sm[dev_id] && device_sm[dev_id] < 300) {
		// incompatible SM 2.1 kernels...
		if (algo == ALGO_GROESTL) return true;
		if (algo == ALGO_MYR_GR) return true;
		if (algo == ALGO_JACKPOT) return true; // compact shuffle
		if (algo == ALGO_LYRA2v2) return true;
		if (algo == ALGO_NEOSCRYPT) return true;
		if (algo == ALGO_QUARK) return true; // todo
		if (algo == ALGO_WHIRLPOOLX) return true;
	}

	// and unwanted ones...
	return (algo == ALGO_SCRYPT || algo == ALGO_SCRYPT_JANE);
}

// Benchmark all algos (called once per mining thread, after each algo run).
//
// Frees the resources of the algo that just ran, logs and stores its hashrate
// and memory usage for `thr_id`, checks for a device memory leak, then selects
// the next algo to benchmark.
//
//   thr_id: index of the calling mining thread
//   returns true when a next algo was selected, false when all algos are done
bool bench_algo_switch_next(int thr_id)
{
	int algo = (int) opt_algo;
	int prev_algo = algo;
	int dev_id = device_map[thr_id % MAX_GPUS];
	int mfree, mused;
	// a device reset between algos does not seem to be enough to prevent
	// the device slow down seen after some algo switches
	bool need_reset = (gpu_threads == 1);

	// Pick the next algo to run. The filter is applied in a loop: a single pass
	// of checks could land on an algo whose own check was already evaluated
	// (e.g. skipping lyra2v2 on SM 2.1 and ending up on the duplicated mjollnir).
	algo++;
	while (algo < ALGO_AUTO && bench_algo_is_skipped(algo, dev_id))
		algo++;

	// free current algo memory and track mem usage
	mused = cuda_available_memory(thr_id);
	algo_free_all(thr_id);
	CUDA_LOG_ERROR();

	// device can take some time to free
	mfree = cuda_available_memory(thr_id);
	if (device_mem_free[thr_id] > mfree) {
		sleep(1);
		mfree = cuda_available_memory(thr_id);
	}

	// we need to wait completion on all cards before the switch
	if (opt_n_threads > 1) {
		pthread_barrier_wait(&miner_barr);
	}

	char rate[32] = { 0 };
	double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]);
	format_hashrate(hashrate, rate);
	gpulog(LOG_NOTICE, thr_id, "%s hashrate = %s", algo_names[prev_algo], rate);

	// ensure memory leak is still real after the barrier
	if (device_mem_free[thr_id] > mfree) {
		mfree = cuda_available_memory(thr_id);
	}

	// check if there is memory leak (differences <= 1 MB are ignored)
	if (device_mem_free[thr_id] - mfree > 1) {
		gpulog(LOG_WARNING, thr_id, "possible %d MB memory leak in %s! %d MB free",
			(device_mem_free[thr_id] - mfree), algo_names[prev_algo], mfree);
		cuda_reset_device(thr_id, NULL); // force to free the leak
		need_reset = false; // already done, no second reset below
		mfree = cuda_available_memory(thr_id);
	}

	// store used memory per algo, under the same index as the hashrate below
	// (prev_algo is this thread's snapshot, the shared opt_algo is not re-read)
	algo_mem_used[thr_id][prev_algo] = device_mem_free[thr_id] - mused;
	device_mem_free[thr_id] = mfree;

	// store to dump a table per gpu later
	algo_hashrates[thr_id][prev_algo] = hashrate;

	// wait the other threads to display logs correctly
	if (opt_n_threads > 1) {
		pthread_barrier_wait(&algo_barr);
	}

	// ALGO_AUTO is used as the end-of-list marker; ">=" also guards the
	// algo_names[] lookup below against any overshoot
	if (algo >= ALGO_AUTO)
		return false; // all algos done

	// mutex primary used for the stats purge
	pthread_mutex_lock(&bench_lock);
	stats_purge_all();

	opt_algo = (enum sha_algos) algo;
	global_hashrate = 0;
	thr_hashrates[thr_id] = 0; // reset for minmax64
	pthread_mutex_unlock(&bench_lock);

	if (need_reset)
		cuda_reset_device(thr_id, NULL);

	if (thr_id == 0)
		applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]);

	return true;
}

// Record the throughput used by mining thread `thr_id` for the algo currently
// selected in opt_algo, so it can be listed in the results table.
void bench_set_throughput(int thr_id, uint32_t throughput)
{
	uint32_t *slot = &algo_throughput[thr_id][opt_algo];
	*slot = throughput;
}

void bench_display_results()
{
	for (int n=0; n < opt_n_threads; n++)
	{
		int dev_id = device_map[n];
		applog(LOG_BLUE, "Benchmark results for GPU #%d - %s:", dev_id, device_name[dev_id]);
		for (int i=0; i < ALGO_COUNT-1; i++) {
			double rate = algo_hashrates[n][i];
			if (rate == 0.0) continue;
			applog(LOG_INFO, "%12s : %12.1f kH/s, %5d MB, %8u thr.", algo_names[i],
				rate / 1024., algo_mem_used[n][i], algo_throughput[n][i]);
		}
	}
}
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`/**`
			`* Made to benchmark and test algo switch`
			`*`
			`* 2015 - tpruvot@github`
			`*/`

benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`#include <unistd.h>`

refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`#include "miner.h"`
			`#include "algos.h"`

			`int bench_algo = -1;`

benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`static double algo_hashrates[MAX_GPUS][ALGO_COUNT] = { 0 };`
			`static uint32_t algo_throughput[MAX_GPUS][ALGO_COUNT] = { 0 };`
			`static int algo_mem_used[MAX_GPUS][ALGO_COUNT] = { 0 };`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`static int device_mem_free[MAX_GPUS] = { 0 };`

			`static pthread_barrier_t miner_barr;`
			`static pthread_barrier_t algo_barr;`
			`static pthread_mutex_t bench_lock = PTHREAD_MUTEX_INITIALIZER;`

			`extern double thr_hashrates[MAX_GPUS];`

			`void bench_init(int threads)`
			`{`
			`bench_algo = opt_algo = (enum sha_algos) 0; /* first */`
			`applog(LOG_BLUE, "Starting benchmark mode with %s", algo_names[opt_algo]);`
			`pthread_barrier_init(&miner_barr, NULL, threads);`
			`pthread_barrier_init(&algo_barr, NULL, threads);`
intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`// required for usage of first algo.`
			`for (int n=0; n < opt_n_threads; n++) {`
			`device_mem_free[n] = cuda_available_memory(n);`
			`}`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`}`

			`void bench_free()`
			`{`
			`pthread_barrier_destroy(&miner_barr);`
			`pthread_barrier_destroy(&algo_barr);`
			`}`

warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`// required to switch algos`
			`void algo_free_all(int thr_id)`
			`{`
			`// only initialized algos will be freed`
			`free_blake256(thr_id);`
			`free_bmw(thr_id);`
			`free_c11(thr_id);`
			`free_deep(thr_id);`
			`free_keccak256(thr_id);`
			`free_fresh(thr_id);`
			`free_fugue256(thr_id);`
			`free_groestlcoin(thr_id);`
			`free_heavy(thr_id);`
			`free_jackpot(thr_id);`
			`free_luffa(thr_id);`
			`free_lyra2(thr_id);`
			`free_lyra2v2(thr_id);`
			`free_myriad(thr_id);`
			`free_neoscrypt(thr_id);`
			`free_nist5(thr_id);`
			`free_pentablake(thr_id);`
			`free_quark(thr_id);`
			`free_qubit(thr_id);`
			`free_skeincoin(thr_id);`
			`free_skein2(thr_id);`
			`free_s3(thr_id);`
			`free_whirl(thr_id);`
			`free_whirlx(thr_id);`
			`free_x11(thr_id);`
			`free_x13(thr_id);`
			`free_x14(thr_id);`
			`free_x15(thr_id);`
			`free_x17(thr_id);`
			`free_zr5(thr_id);`
			`//free_sha256d(thr_id);`
			`free_scrypt(thr_id);`
			`free_scrypt_jane(thr_id);`
			`}`

refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`// benchmark all algos (called once per mining thread)`
			`bool bench_algo_switch_next(int thr_id)`
			`{`
			`int algo = (int) opt_algo;`
			`int prev_algo = algo;`
			`int dev_id = device_map[thr_id % MAX_GPUS];`
benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`int mfree, mused;`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`// doesnt seems enough to prevent device slow down`
			`// after some algo switchs`
			`bool need_reset = (gpu_threads == 1);`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago
			`algo++;`

			`// skip some duplicated algos`
			`if (algo == ALGO_C11) algo++; // same as x11`
			`if (algo == ALGO_DMD_GR) algo++; // same as groestl`
windows: add support for SM 2.1, drop SM 3.5 (x86) Mostly to do compatibilty tests, SM 2.1 support is very limited SM 3.0 code should run on SM 3.5 (only a few cards use this arch) As i can't test SM 3.5, its best to let users do their own tests... 9 years ago			`if (algo == ALGO_MJOLLNIR) algo++; // same as heavy`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool`
various fixes for SM 2.1 and the benchmark X11+ algos and quark are not compatible for the moment but these ones are : Benchmark results for Gigabyte GTX 460 (SM 2.1 / 1 GB): blakecoin : 159090.5 kH/s, 1 MB, 1048576 thr. blake : 70208.9 kH/s, 1 MB, 1048576 thr. bmw : 122802.6 kH/s, 65 MB, 2097152 thr. deep : 3533.6 kH/s, 33 MB, 524288 thr. fugue256 : 43177.9 kH/s, 17 MB, 524288 thr. heavy : 4118.2 kH/s, 147 MB, 524032 thr. keccak : 18673.1 kH/s, 129 MB, 2097152 thr. luffa : 28816.0 kH/s, 257 MB, 4194304 thr. lyra2 : 213.7 kH/s, 570 MB, 65536 thr. mjollnir : 3895.6 kH/s, 147 MB, 524032 thr. nist5 : 1101.4 kH/s, 67 MB, 1048576 thr. penta : 501.6 kH/s, 21 MB, 327680 thr. skein : 5432.4 kH/s, 65 MB, 1048576 thr. skein2 : 6788.9 kH/s, 33 MB, 524288 thr. whirlpool : 688.5 kH/s, 33 MB, 524288 thr. zr5 : 122.5 kH/s, 86 MB, 262144 thr. 9 years ago
			`if (device_sm[dev_id] && device_sm[dev_id] < 300) {`
			`// incompatible SM 2.1 kernels...`
			`if (algo == ALGO_GROESTL) algo++;`
			`if (algo == ALGO_MYR_GR) algo++;`
simd: add support for SM 2.1 devices Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`if (algo == ALGO_JACKPOT) algo++; // compact shuffle`
various fixes for SM 2.1 and the benchmark X11+ algos and quark are not compatible for the moment but these ones are : Benchmark results for Gigabyte GTX 460 (SM 2.1 / 1 GB): blakecoin : 159090.5 kH/s, 1 MB, 1048576 thr. blake : 70208.9 kH/s, 1 MB, 1048576 thr. bmw : 122802.6 kH/s, 65 MB, 2097152 thr. deep : 3533.6 kH/s, 33 MB, 524288 thr. fugue256 : 43177.9 kH/s, 17 MB, 524288 thr. heavy : 4118.2 kH/s, 147 MB, 524032 thr. keccak : 18673.1 kH/s, 129 MB, 2097152 thr. luffa : 28816.0 kH/s, 257 MB, 4194304 thr. lyra2 : 213.7 kH/s, 570 MB, 65536 thr. mjollnir : 3895.6 kH/s, 147 MB, 524032 thr. nist5 : 1101.4 kH/s, 67 MB, 1048576 thr. penta : 501.6 kH/s, 21 MB, 327680 thr. skein : 5432.4 kH/s, 65 MB, 1048576 thr. skein2 : 6788.9 kH/s, 33 MB, 524288 thr. whirlpool : 688.5 kH/s, 33 MB, 524288 thr. zr5 : 122.5 kH/s, 86 MB, 262144 thr. 9 years ago			`if (algo == ALGO_LYRA2v2) algo++;`
			`if (algo == ALGO_NEOSCRYPT) algo++;`
simd: add support for SM 2.1 devices Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`if (algo == ALGO_QUARK) algo++; // todo`
various fixes for SM 2.1 and the benchmark X11+ algos and quark are not compatible for the moment but these ones are : Benchmark results for Gigabyte GTX 460 (SM 2.1 / 1 GB): blakecoin : 159090.5 kH/s, 1 MB, 1048576 thr. blake : 70208.9 kH/s, 1 MB, 1048576 thr. bmw : 122802.6 kH/s, 65 MB, 2097152 thr. deep : 3533.6 kH/s, 33 MB, 524288 thr. fugue256 : 43177.9 kH/s, 17 MB, 524288 thr. heavy : 4118.2 kH/s, 147 MB, 524032 thr. keccak : 18673.1 kH/s, 129 MB, 2097152 thr. luffa : 28816.0 kH/s, 257 MB, 4194304 thr. lyra2 : 213.7 kH/s, 570 MB, 65536 thr. mjollnir : 3895.6 kH/s, 147 MB, 524032 thr. nist5 : 1101.4 kH/s, 67 MB, 1048576 thr. penta : 501.6 kH/s, 21 MB, 327680 thr. skein : 5432.4 kH/s, 65 MB, 1048576 thr. skein2 : 6788.9 kH/s, 33 MB, 524288 thr. whirlpool : 688.5 kH/s, 33 MB, 524288 thr. zr5 : 122.5 kH/s, 86 MB, 262144 thr. 9 years ago			`if (algo == ALGO_WHIRLPOOLX) algo++;`
			`}`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`// and unwanted ones...`
			`if (algo == ALGO_SCRYPT) algo++;`
			`if (algo == ALGO_SCRYPT_JANE) algo++;`

warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`// free current algo memory and track mem usage`
			`mused = cuda_available_memory(thr_id);`
			`algo_free_all(thr_id);`
prepare the 1.7 release 9 years ago			`CUDA_LOG_ERROR();`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago
			`// device can take some time to free`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`mfree = cuda_available_memory(thr_id);`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`if (device_mem_free[thr_id] > mfree) {`
			`sleep(1);`
			`mfree = cuda_available_memory(thr_id);`
			`}`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`// we need to wait completion on all cards before the switch`
			`if (opt_n_threads > 1) {`
			`pthread_barrier_wait(&miner_barr);`
			`}`

benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`char rate[32] = { 0 };`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]);`
			`format_hashrate(hashrate, rate);`
add gpulog() function helper, simple and multi-threads when using multiple cpu threads per gpu, use the T prefix, ex: [2015-10-11 09:52:49] GPU #0: app clocks set to P0 (3600/1228) vs [2015-10-11 09:52:51] GPU T0: MSI GTX 960, 5953.35 kH/s Only thr_id is required, the function take care of the dev id 9 years ago			`gpulog(LOG_NOTICE, thr_id, "%s hashrate = %s", algo_names[prev_algo], rate);`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`// ensure memory leak is still real after the barrier`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`if (device_mem_free[thr_id] > mfree) {`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`mfree = cuda_available_memory(thr_id);`
			`}`

			`// check if there is memory leak`
			`if (device_mem_free[thr_id] - mfree > 1) {`
			`gpulog(LOG_WARNING, thr_id, "possible %d MB memory leak in %s! %d MB free",`
			`(device_mem_free[thr_id] - mfree), algo_names[prev_algo], mfree);`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`cuda_reset_device(thr_id, NULL); // force to free the leak`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`need_reset = false;`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`mfree = cuda_available_memory(thr_id);`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`}`
benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`// store used memory per algo`
			`algo_mem_used[thr_id][opt_algo] = device_mem_free[thr_id] - mused;`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`device_mem_free[thr_id] = mfree;`

			`// store to dump a table per gpu later`
			`algo_hashrates[thr_id][prev_algo] = hashrate;`

			`// wait the other threads to display logs correctly`
			`if (opt_n_threads > 1) {`
			`pthread_barrier_wait(&algo_barr);`
			`}`

			`if (algo == ALGO_AUTO)`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`return false; // all algos done`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago
			`// mutex primary used for the stats purge`
			`pthread_mutex_lock(&bench_lock);`
			`stats_purge_all();`

			`opt_algo = (enum sha_algos) algo;`
			`global_hashrate = 0;`
			`thr_hashrates[thr_id] = 0; // reset for minmax64`
			`pthread_mutex_unlock(&bench_lock);`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if (need_reset)`
			`cuda_reset_device(thr_id, NULL);`

refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`if (thr_id == 0)`
			`applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]);`

			`return true;`
			`}`

benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`void bench_set_throughput(int thr_id, uint32_t throughput)`
			`{`
			`algo_throughput[thr_id][opt_algo] = throughput;`
			`}`

refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`void bench_display_results()`
			`{`
			`for (int n=0; n < opt_n_threads; n++)`
			`{`
			`int dev_id = device_map[n];`
			`applog(LOG_BLUE, "Benchmark results for GPU #%d - %s:", dev_id, device_name[dev_id]);`
			`for (int i=0; i < ALGO_COUNT-1; i++) {`
			`double rate = algo_hashrates[n][i];`
			`if (rate == 0.0) continue;`
benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago			`applog(LOG_INFO, "%12s : %12.1f kH/s, %5d MB, %8u thr.", algo_names[i],`
			`rate / 1024., algo_mem_used[n][i], algo_throughput[n][i]);`
refactor: create bench.cpp and algos.h Also enhance multi-thread benchmark synchro. with pthread barriers 9 years ago			`}`
			`}`
			`}`
benchmark: show mem and default throughput in results and prepare a new function to get the default intensity also, take care of multiple threads per gpu... 9 years ago