From 934555994dc330634e084dbb1ca306c2fecf6a93 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 8 Oct 2015 21:41:20 +0200
Subject: [PATCH] benchmark: allow -a auto to bench all algos at once

---
 ccminer.cpp | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 cuda.cpp    | 12 +++++++-
 2 files changed, 95 insertions(+), 5 deletions(-)

diff --git a/ccminer.cpp b/ccminer.cpp
index 1b4ac47..8e23d6c 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -55,13 +55,14 @@ BOOL WINAPI ConsoleHandler(DWORD);
 #define HEAVYCOIN_BLKHDR_SZ		84
 #define MNR_BLKHDR_SZ 80
 
-// from cuda.cpp
+// declarations from cuda.cpp (to be moved to miner.h)
 int cuda_num_devices();
 void cuda_devicenames();
 void cuda_reset_device(int thr_id, bool *init);
 void cuda_shutdown();
 int cuda_finddevice(char *name);
 void cuda_print_devices();
+int cuda_available_memory(int thr_id);
 
 #include "nvml.h"
 #ifdef USE_WRAPNVML
@@ -120,6 +121,7 @@ enum sha_algos {
 	ALGO_WHIRLPOOL,
 	ALGO_WHIRLPOOLX,
 	ALGO_ZR5,
+	ALGO_AUTO,
 	ALGO_COUNT
 };
 
@@ -160,6 +162,7 @@ static const char *algo_names[] = {
 	"whirlpool",
 	"whirlpoolx",
 	"zr5",
+	"auto", /* reserved for multi algo */
 	""
 };
 
@@ -168,6 +171,7 @@ bool opt_debug_diff = false;
 bool opt_debug_threads = false;
 bool opt_protocol = false;
 bool opt_benchmark = false;
+int algo_benchmark = -1;
 bool opt_showdiff = false;
 
 // todo: limit use of these flags,
@@ -319,7 +323,8 @@ Options:\n\
 			x14         X14\n\
 			x15         X15\n\
 			x17         X17\n\
-			whirlpool   Old Whirlcoin algo\n\
+			whirlcoin   Old Whirlcoin (Whirlpool algo)\n\
+			whirlpool   Whirlpool algo\n\
 			whirlpoolx  WhirlpoolX (VNL)\n\
 			zr5         ZR5 (ZiftrCoin)\n\
   -d, --devices         Comma separated list of CUDA devices to use.\n\
@@ -1559,12 +1564,60 @@ void miner_free_device(int thr_id)
 	free_scrypt_jane(thr_id);
 }
 
+// Benchmark-all helper: frees the current algo, logs its rate, then selects the next algo. Returns false once ALGO_AUTO is reached.
+bool algo_switch_next(int thr_id)
+{
+	int algo = (int) opt_algo;
+	int prev_algo = algo;
+	int dev_id = device_map[thr_id % MAX_GPUS];
+	int mfree;
+	char rate[32] = { 0 };
+
+	// free current algo memory and track mem usage
+	miner_free_device(thr_id);
+	mfree = cuda_available_memory(thr_id);
+	// raise our own restart flag; it is cleared again at the end, once the switch is done
+	work_restart[thr_id].restart = 1;
+
+	algo++;
+	if (algo == ALGO_AUTO)
+		return false;
+	// NOTE(review): the early return above means the rate of the last algo is never logged
+	// we need to wait completion on all cards before the switch
+	if (opt_n_threads > 1) {
+		pthread_mutex_lock(&stratum_sock_lock); // unused in benchmark
+		for (int n=0; n < opt_n_threads; n++)
+			if (!work_restart[n].restart) { // test thread n: our own flag is always set at this point
+				applog(LOG_DEBUG, "GPU #%d: waiting GPU %d", dev_id, device_map[n % MAX_GPUS]);
+				usleep(100*1000);
+			}
+		sleep(1);
+		pthread_mutex_unlock(&stratum_sock_lock);
+	}
+
+	double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]);
+	format_hashrate(hashrate, rate);
+	applog(LOG_NOTICE, "GPU #%d: %s rate: %s - %d MB free", dev_id, algo_names[prev_algo], rate, mfree);
+
+	stats_purge_all();
+	global_hashrate = 0;
+
+	opt_algo = (enum sha_algos) algo;
+
+	applog(LOG_BLUE, "GPU #%d: Benchmark for algo %s...", dev_id, algo_names[algo]);
+	sleep(1);
+	work_restart[thr_id].restart = 0;
+
+	return true;
+}
+
 static void *miner_thread(void *userdata)
 {
 	struct thr_info *mythr = (struct thr_info *)userdata;
 	int switchn = pool_switch_count;
 	int thr_id = mythr->id;
 	struct work work;
+	uint64_t loopcnt = 0;
 	uint32_t max_nonce;
 	uint32_t end_nonce = UINT32_MAX / opt_n_threads * (thr_id + 1) - (thr_id + 1);
 	bool work_done = false;
@@ -1676,6 +1729,19 @@ static void *miner_thread(void *userdata)
 			}
 		}
 
+		if (opt_benchmark && algo_benchmark >= 0) {
+			if (loopcnt > 3) {
+				if (!algo_switch_next(thr_id)) {
+					proper_exit(0);
+					break;
+				}
+				algo_benchmark = (int) opt_algo;
+				// for scrypt...
+				opt_autotune = false;
+				loopcnt = 0;
+			}
+		}
+
 		if (!opt_benchmark && (g_work.height != work.height || memcmp(work.target, g_work.target, sizeof(work.target))))
 		{
 			if (opt_debug) {
@@ -1825,8 +1891,10 @@ static void *miner_thread(void *userdata)
 				minmax = 0x300000;
 				break;
 			case ALGO_SCRYPT:
+				minmax = 0x80000;
+				break;
 			case ALGO_SCRYPT_JANE:
-				minmax = 0x100000;
+				minmax = 0x1000;
 				break;
 			}
 			max64 = max(minmax-1, max64);
@@ -2012,7 +2080,8 @@ static void *miner_thread(void *userdata)
 				pthread_mutex_lock(&stats_lock);
 				thr_hashrates[thr_id] = hashes_done / dtime;
 				thr_hashrates[thr_id] *= rate_factor;
-				stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height);
+				if (loopcnt) // ignore first (init time)
+					stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height);
 				pthread_mutex_unlock(&stats_lock);
 			}
 		}
@@ -2090,6 +2159,7 @@ static void *miner_thread(void *userdata)
 					break;
 			}
 		}
+		loopcnt++;
 	}
 
 out:
@@ -3026,6 +3096,16 @@ static void parse_cmdline(int argc, char *argv[])
 			argv[0]);
 		show_usage_and_exit(1);
 	}
+
+	if (opt_algo == ALGO_AUTO) {
+		for (int n=0; n < MAX_GPUS; n++)
+			gpus_intensity[n] = 0; // use default
+		if (opt_benchmark) {
+			opt_autotune = false;
+			algo_benchmark = opt_algo = ALGO_BLAKE; /* first */
+			applog(LOG_BLUE, "Starting benchmark mode");
+		}
+	}
 }
 
 #ifndef WIN32
diff --git a/cuda.cpp b/cuda.cpp
index 679e77f..5a41b49 100644
--- a/cuda.cpp
+++ b/cuda.cpp
@@ -196,7 +196,7 @@ int cuda_gpu_clocks(struct cgpu_info *gpu)
 // if we use 2 threads on the same gpu, we need to reinit the threads
 void cuda_reset_device(int thr_id, bool *init)
 {
-	int dev_id = device_map[thr_id];
+	int dev_id = device_map[thr_id % MAX_GPUS];
 	cudaSetDevice(dev_id);
 	if (init != NULL) {
 		// with init array, its meant to be used in algo's scan code...
@@ -216,6 +216,16 @@ void cuda_reset_device(int thr_id, bool *init)
 		cudaSetDeviceFlags((unsigned)(opt_cudaschedule & cudaDeviceScheduleMask));
 }
 
+// Free memory, in whole megabytes, of the device mapped to thr_id (stays 0 if the query writes nothing).
+int cuda_available_memory(int thr_id)
+{
+	size_t free_bytes = 0, total_bytes;
+	const int gpu = device_map[thr_id % MAX_GPUS];
+	cudaSetDevice(gpu);
+	cudaMemGetInfo(&free_bytes, &total_bytes);
+	return (int) (free_bytes >> 20); // bytes -> MB, same as dividing by 1024 * 1024
+}
+
 void cudaReportHardwareFailure(int thr_id, cudaError_t err, const char* func)
 {
 	struct cgpu_info *gpu = &thr_info[thr_id].gpu;