diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu
index 4fe7f14..065f1f6 100644
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@@ -388,7 +388,8 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
 	const uint32_t first_nonce = pdata[19];
 	uint64_t targetHigh = ((uint64_t*)ptarget)[3];
 	int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20;
-	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity, max_nonce - first_nonce);
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+	throughput = min(throughput, max_nonce - first_nonce);
 
 	int rc = 0;
 
diff --git a/README.txt b/README.txt
index f53ffa4..a2a2a1a 100644
--- a/README.txt
+++ b/README.txt
@@ -96,6 +96,7 @@ its command line interface and options.
                           x14         use to mine X14Coin
                           x15         use to mine Halcyon
                           x17         use to mine X17
+                          whirlpool   use to mine Joincoin
                           whirlpoolx  use to mine Vanilla
                           zr5         use to mine ZiftrCoin
 
@@ -228,6 +229,9 @@ features.
 >>> RELEASE HISTORY <<<
 
   Under Dev...    v1.7
+                  Restore whirlpool algo (and whirlcoin variant)
+                  Prepare algo switch ability
+                  Add --benchmark -a auto to run a multi algo benchmark
                   Add --cuda-schedule parameter
                   Add --show-diff parameter, which display shares diff,
                     and is able to detect real solved blocks on pools.
diff --git a/api.cpp b/api.cpp
index f15f117..c9e28ba 100644
--- a/api.cpp
+++ b/api.cpp
@@ -990,7 +990,7 @@ void *api_thread(void *userdata)
 /* to be able to report the default value set in each algo */
 void api_set_throughput(int thr_id, uint32_t throughput)
 {
-	if (&thr_info[thr_id]) {
+	if (thr_id < MAX_GPUS) {
 		struct cgpu_info *cgpu = &thr_info[thr_id].gpu;
 		uint32_t ws = throughput;
 		uint8_t i = 0;
diff --git a/bench.cpp b/bench.cpp
index f820cc2..016e120 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -11,7 +11,9 @@
 
 int bench_algo = -1;
 
-static double * algo_hashrates[MAX_GPUS] = { 0 };
+static double algo_hashrates[MAX_GPUS][ALGO_COUNT] = { 0 };
+static uint32_t algo_throughput[MAX_GPUS][ALGO_COUNT] = { 0 };
+static int algo_mem_used[MAX_GPUS][ALGO_COUNT] = { 0 };
 static int device_mem_free[MAX_GPUS] = { 0 };
 
 static pthread_barrier_t miner_barr;
@@ -25,18 +27,12 @@ void bench_init(int threads)
 {
 	bench_algo = opt_algo = (enum sha_algos) 0; /* first */
 	applog(LOG_BLUE, "Starting benchmark mode with %s", algo_names[opt_algo]);
-	for (int n=0; n < MAX_GPUS; n++) {
-		algo_hashrates[n] = (double*) calloc(1, ALGO_COUNT * sizeof(double));
-	}
 	pthread_barrier_init(&miner_barr, NULL, threads);
 	pthread_barrier_init(&algo_barr, NULL, threads);
 }
 
 void bench_free()
 {
-	for (int n=0; n < MAX_GPUS; n++) {
-		free(algo_hashrates[n]);
-	}
 	pthread_barrier_destroy(&miner_barr);
 	pthread_barrier_destroy(&algo_barr);
 }
@@ -47,12 +43,7 @@ bool bench_algo_switch_next(int thr_id)
 	int algo = (int) opt_algo;
 	int prev_algo = algo;
 	int dev_id = device_map[thr_id % MAX_GPUS];
-	int mfree;
-	char rate[32] = { 0 };
-
-	// free current algo memory and track mem usage
-	miner_free_device(thr_id);
-	mfree = cuda_available_memory(thr_id);
+	int mfree, mused;
 
 	algo++;
 
@@ -70,16 +61,23 @@ bool bench_algo_switch_next(int thr_id)
 		pthread_barrier_wait(&miner_barr);
 	}
 
-
+	char rate[32] = { 0 };
 	double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]);
 	format_hashrate(hashrate, rate);
 	applog(LOG_NOTICE, "GPU #%d: %s hashrate = %s", dev_id, algo_names[prev_algo], rate);
 
+	// free current algo memory and track mem usage
+	mused = cuda_available_memory(thr_id);
+	miner_free_device(thr_id);
+	mfree = cuda_available_memory(thr_id);
+
 	// check if there is memory leak
 	if (device_mem_free[thr_id] > mfree) {
 		applog(LOG_WARNING, "GPU #%d, memory leak detected in %s ! %d MB free",
-			dev_id,	algo_names[prev_algo], mfree);
+			dev_id, algo_names[prev_algo], mfree);
 	}
+	// store used memory per algo
+	algo_mem_used[thr_id][opt_algo] = device_mem_free[thr_id] - mused;
 	device_mem_free[thr_id] = mfree;
 
 	// store to dump a table per gpu later
@@ -109,6 +107,11 @@ bool bench_algo_switch_next(int thr_id)
 	return true;
 }
 
+void bench_set_throughput(int thr_id, uint32_t throughput) // remember the throughput this thread uses for the algo currently selected in opt_algo
+{
+	algo_throughput[thr_id][opt_algo] = throughput; // table is read back by bench_display_results() ("thr." column)
+}
+
 void bench_display_results()
 {
 	for (int n=0; n < opt_n_threads; n++)
@@ -118,7 +121,9 @@ void bench_display_results()
 		for (int i=0; i < ALGO_COUNT-1; i++) {
 			double rate = algo_hashrates[n][i];
 			if (rate == 0.0) continue;
-			applog(LOG_INFO, "%12s : %12.1f kH/s", algo_names[i], rate / 1024.);
+			applog(LOG_INFO, "%12s : %12.1f kH/s, %5d MB, %8u thr.", algo_names[i],
+				rate / 1024., algo_mem_used[n][i], algo_throughput[n][i]);
 		}
 	}
 }
+
diff --git a/ccminer.cpp b/ccminer.cpp
index d618208..8baa966 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -110,6 +110,7 @@ static json_t *opt_config;
 static const bool opt_time = true;
 enum sha_algos opt_algo = ALGO_X11;
 int opt_n_threads = 0;
+int gpu_threads = 1;
 int64_t opt_affinity = -1L;
 int opt_priority = 0;
 static double opt_difficulty = 1.;
@@ -1483,6 +1484,7 @@ static void *miner_thread(void *userdata)
 	struct thr_info *mythr = (struct thr_info *)userdata;
 	int switchn = pool_switch_count;
 	int thr_id = mythr->id;
+	int dev_id = device_map[thr_id % MAX_GPUS];
 	struct work work;
 	uint64_t loopcnt = 0;
 	uint32_t max_nonce;
@@ -1635,7 +1637,7 @@ static void *miner_thread(void *userdata)
 
 		// --benchmark [-a auto]
 		if (opt_benchmark && bench_algo >= 0) {
-			//applog(LOG_DEBUG, "GPU #%d: loop %d", device_map[thr_id], loopcnt);
+			//applog(LOG_DEBUG, "GPU #%d: loop %d", dev_id, loopcnt);
 			if (loopcnt >= 3) {
 				if (!bench_algo_switch_next(thr_id) && thr_id == 0)
 				{
@@ -1755,11 +1757,11 @@ static void *miner_thread(void *userdata)
 				break;
 			case ALGO_KECCAK:
 			case ALGO_JACKPOT:
-			case ALGO_NEOSCRYPT:
 			case ALGO_X15:
 				minmax = 0x300000;
 				break;
 			case ALGO_LYRA2:
+			case ALGO_NEOSCRYPT:
 			case ALGO_SCRYPT:
 				minmax = 0x80000;
 				break;
@@ -1795,7 +1797,7 @@ static void *miner_thread(void *userdata)
 
 		if (opt_debug)
 			applog(LOG_DEBUG, "GPU #%d: start=%08x end=%08x range=%08x",
-				device_map[thr_id], start_nonce, max_nonce, (max_nonce-start_nonce));
+				dev_id, start_nonce, max_nonce, (max_nonce-start_nonce));
 
 		hashes_done = 0;
 		gettimeofday(&tv_start, NULL);
@@ -1967,7 +1969,7 @@ static void *miner_thread(void *userdata)
 			work.scanned_to = max_nonce;
 			if (opt_debug && opt_benchmark) {
 				// to debug nonce ranges
-				applog(LOG_DEBUG, "GPU #%d:  ends=%08x range=%08x", device_map[thr_id],
+				applog(LOG_DEBUG, "GPU #%d:  ends=%08x range=%08x", dev_id,
 					nonceptr[0], (nonceptr[0] - start_nonce));
 			}
 		}
@@ -1978,8 +1980,7 @@ static void *miner_thread(void *userdata)
 		/* output */
 		if (!opt_quiet && firstwork_time) {
 			format_hashrate(thr_hashrates[thr_id], s);
-			applog(LOG_INFO, "GPU #%d: %s, %s",
-				device_map[thr_id], device_name[device_map[thr_id]], s);
+			applog(LOG_INFO, "GPU #%d: %s, %s", dev_id, device_name[dev_id], s);
 		}
 
 		/* ignore first loop hashrate */
@@ -2835,8 +2836,6 @@ void parse_arg(int key, char *arg)
 						proper_exit(EXIT_CODE_CUDA_NODEVICE);
 					}
 				}
-				// set number of active gpus
-				active_gpus = opt_n_threads;
 				pch = strtok (NULL, ",");
 			}
 		}
@@ -3057,8 +3056,11 @@ int main(int argc, char *argv[])
 	if (num_cpus < 1)
 		num_cpus = 1;
 
+	// number of gpus
+	active_gpus = cuda_num_devices();
+
 	for (i = 0; i < MAX_GPUS; i++) {
-		device_map[i] = i;
+		device_map[i] = i % active_gpus;
 		device_name[i] = NULL;
 		device_config[i] = NULL;
 		device_backoff[i] = is_windows() ? 12 : 2;
@@ -3070,8 +3072,6 @@ int main(int argc, char *argv[])
 		device_pstate[i] = -1;
 	}
 
-	// number of gpus
-	active_gpus = cuda_num_devices();
 	cuda_devicenames();
 
 	/* parse command line */
@@ -3192,6 +3192,9 @@ int main(int argc, char *argv[])
 	if (!opt_n_threads)
 		opt_n_threads = active_gpus;
 
+	// generally doesn't work... let 1
+	gpu_threads = opt_n_threads / active_gpus;
+
 	if (opt_benchmark && opt_algo == ALGO_AUTO) {
 		bench_init(opt_n_threads);
 		for (int n=0; n < MAX_GPUS; n++) {
diff --git a/cuda.cpp b/cuda.cpp
index 4c8d748..08c0a57 100644
--- a/cuda.cpp
+++ b/cuda.cpp
@@ -67,7 +67,8 @@ void cuda_devicenames()
 		exit(1);
 	}
 
-	GPU_N = min(MAX_GPUS, GPU_N);
+	if (opt_n_threads)
+		GPU_N = min(MAX_GPUS, opt_n_threads);
 	for (int i=0; i < GPU_N; i++)
 	{
 		char vendorname[32] = { 0 };
@@ -98,7 +99,7 @@ void cuda_print_devices()
 	int ngpus = cuda_num_devices();
 	cuda_devicenames();
 	for (int n=0; n < ngpus; n++) {
-		int m = device_map[n];
+		int m = device_map[n % MAX_GPUS];
 		cudaDeviceProp props;
 		cudaGetDeviceProperties(&props, m);
 		if (!opt_n_threads || n < opt_n_threads) {
@@ -148,10 +149,25 @@ int cuda_finddevice(char *name)
 	return -1;
 }
 
+// deprecated since 1.7: returns gpus_intensity[thr_id] if non-zero, else defcount, halved once per extra thread when gpu_threads > 1
 uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount)
 {
 	uint32_t throughput = gpus_intensity[thr_id] ? gpus_intensity[thr_id] : defcount;
+	if (gpu_threads > 1) throughput >>= (gpu_threads-1); // compound shift: a bare '>>' would discard its result
 	api_set_throughput(thr_id, throughput);
+	bench_set_throughput(thr_id, throughput);
+	return throughput;
+}
+
+// since 1.7: returns gpus_intensity[thr_id] if non-zero, else defcount, halved once per extra thread when gpu_threads > 1; the value is reported to the api and bench tables
+uint32_t cuda_default_throughput(int thr_id, uint32_t defcount)
+{
+	//int dev_id = device_map[thr_id % MAX_GPUS];
+	uint32_t throughput = gpus_intensity[thr_id] ? gpus_intensity[thr_id] : defcount;
+	if (gpu_threads > 1) throughput >>= (gpu_threads-1); // compound shift: a bare '>>' would discard its result
+	api_set_throughput(thr_id, throughput);
+	bench_set_throughput(thr_id, throughput);
+	//if (opt_debug) applog(LOG_DEBUG, "GPU %d-%d: throughput %u", dev_id, thr_id, throughput);
 	return throughput;
 }
 
@@ -240,7 +256,8 @@ cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
 void cudaReportHardwareFailure(int thr_id, cudaError_t err, const char* func)
 {
 	struct cgpu_info *gpu = &thr_info[thr_id].gpu;
+	int dev_id = device_map[thr_id % MAX_GPUS]; // device index used in the log line; thr_id is wrapped so the lookup index stays below MAX_GPUS
 	gpu->hw_errors++;
-	applog(LOG_ERR, "GPU #%d: %s %s", device_map[thr_id], func, cudaGetErrorString(err));
+	applog(LOG_ERR, "GPU #%d: %s %s", dev_id, func, cudaGetErrorString(err));
 	sleep(1);
 }
diff --git a/miner.h b/miner.h
index a4dc10b..8441e00 100644
--- a/miner.h
+++ b/miner.h
@@ -447,6 +447,7 @@ extern bool opt_showdiff;
 extern bool opt_tracegpu;
 extern int opt_n_threads;
 extern int active_gpus;
+extern int gpu_threads;
 extern int opt_timeout;
 extern bool want_longpoll;
 extern bool have_longpoll;
@@ -489,6 +490,9 @@ int cuda_finddevice(char *name);
 void cuda_print_devices();
 int cuda_available_memory(int thr_id);
 
+uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount);
+uint32_t cuda_default_throughput(int thr_id, uint32_t defcount);
+
 #define CL_N    "\x1B[0m"
 #define CL_RED  "\x1B[31m"
 #define CL_GRN  "\x1B[32m"
@@ -522,6 +526,7 @@ int cuda_available_memory(int thr_id);
 
 extern void format_hashrate(double hashrate, char *output);
 extern void applog(int prio, const char *fmt, ...);
+#define gpulog(prio, fmt, thr_id, ...) applog(prio, fmt, thr_id, ##__VA_ARGS__) /* forwards to applog; ## drops the trailing comma when no extra arguments are given */
 void get_defconfig_path(char *out, size_t bufsize, char *argv0);
 extern void cbin2hex(char *out, const char *in, size_t len);
 extern char *bin2hex(const unsigned char *in, size_t len);
@@ -533,7 +538,6 @@ void diff_to_target(uint32_t* target, double diff);
 void work_set_target(struct work* work, double diff);
 double target_to_diff(uint32_t* target);
 extern void get_currentalgo(char* buf, int sz);
-extern uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount);
 
 // bignum
 double bn_convert_nbits(const uint32_t nbits);
@@ -547,6 +551,7 @@ extern int bench_algo;
 void bench_init(int threads);
 void bench_free();
 bool bench_algo_switch_next(int thr_id);
+void bench_set_throughput(int thr_id, uint32_t throughput);
 void bench_display_results();
 
 
diff --git a/skein.cu b/skein.cu
index b7b46ec..da88227 100644
--- a/skein.cu
+++ b/skein.cu
@@ -364,7 +364,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
 	throughput = min(throughput, (max_nonce - first_nonce));
 
 	uint32_t foundNonce, secNonce = 0;
-	uint64_t target64;
+	uint64_t target64 = 0;
 
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x03;