diff --git a/api.cpp b/api.cpp
index 917ef10..c209b4e 100644
--- a/api.cpp
+++ b/api.cpp
@@ -8,7 +8,7 @@
  * Software Foundation; either version 2 of the License, or (at your option)
  * any later version.  See COPYING for more details.
  */
-#define APIVERSION "1.8"
+#define APIVERSION "1.9"
 
 #ifdef WIN32
 # define  _WINSOCK_DEPRECATED_NO_WARNINGS
@@ -112,6 +112,7 @@ static void gpustatus(int thr_id)
 
 	if (thr_id >= 0 && thr_id < opt_n_threads) {
 		struct cgpu_info *cgpu = &thr_info[thr_id].gpu;
+		double khashes_per_watt = 0;
 		int gpuid = cgpu->gpu_id;
 		char buf[512]; *buf = '\0';
 		char* card;
@@ -131,14 +132,24 @@ static void gpustatus(int thr_id)
 		cgpu->rejected = p->rejected_count;
 
 		cgpu->khashes = stats_get_speed(thr_id, 0.0) / 1000.0;
+		if (cgpu->monitor.gpu_power) {
+			cgpu->gpu_power = cgpu->monitor.gpu_power;
+			khashes_per_watt = (double)cgpu->khashes / cgpu->monitor.gpu_power;
+			khashes_per_watt *= 1000; // power in mW
+			//gpulog(LOG_BLUE, thr_id, "KHW: %g", khashes_per_watt);
+		}
 
 		card = device_name[gpuid];
 
 		snprintf(buf, sizeof(buf), "GPU=%d;BUS=%hd;CARD=%s;TEMP=%.1f;"
-			"POWER=%u;FAN=%hu;RPM=%hu;FREQ=%d;KHS=%.2f;HWF=%d;I=%.1f;THR=%u|",
+			"POWER=%u;FAN=%hu;RPM=%hu;"
+			"FREQ=%u;CORE=%u;MEM=%u;"
+			"KHS=%.2f;KHW=%.5f;"
+			"HWF=%d;I=%.1f;THR=%u|",
 			gpuid, cgpu->gpu_bus, card, cgpu->gpu_temp,
 			cgpu->gpu_power, cgpu->gpu_fan, cgpu->gpu_fan_rpm,
-			cgpu->gpu_clock, cgpu->khashes,
+			cgpu->gpu_clock, cgpu->monitor.gpu_clock, cgpu->monitor.gpu_memclock,
+			cgpu->khashes, khashes_per_watt,
 			cgpu->hw_errors, cgpu->intensity, cgpu->throughput);
 
 		// append to buffer for multi gpus
@@ -349,7 +360,7 @@ static char *gethistory(char *params)
 	*buffer = '\0';
 	for (int i = 0; i < records; i++) {
 		time_t ts = data[i].tm_stat;
-		p += sprintf(p, "GPU=%d;H=%u;KHS=%.2f;DIFF=%.6f;"
+		p += sprintf(p, "GPU=%d;H=%u;KHS=%.2f;DIFF=%g;"
 				"COUNT=%u;FOUND=%u;ID=%u;TS=%u|",
 			data[i].gpu_id, data[i].height, data[i].hashrate, data[i].difficulty,
 			data[i].hashcount, data[i].hashfound, data[i].uid, (uint32_t)ts);
@@ -358,7 +369,7 @@ static char *gethistory(char *params)
 }
 
 /**
- * Returns the job scans ranges (debug purpose)
+ * Returns the job scans ranges (debug purpose, only with -D)
  */
 static char *getscanlog(char *params)
 {
@@ -368,9 +379,11 @@ static char *getscanlog(char *params)
 	*buffer = '\0';
 	for (int i = 0; i < records; i++) {
 		time_t ts = data[i].tm_upd;
-		p += sprintf(p, "H=%u;P=%u;JOB=%u;N=%u;FROM=0x%x;SCANTO=0x%x;"
+		p += sprintf(p, "H=%u;P=%u;JOB=%u;ID=%d;DIFF=%g;"
+				"N=0x%x;FROM=0x%x;SCANTO=0x%x;"
 				"COUNT=0x%x;FOUND=%u;TS=%u|",
-			data[i].height, data[i].npool, data[i].njobid, data[i].nonce, data[i].scanned_from, data[i].scanned_to,
+			data[i].height, data[i].npool, data[i].njobid, (int)data[i].job_nonce_id, data[i].sharediff,
+			data[i].nonce, data[i].scanned_from, data[i].scanned_to,
 			(data[i].scanned_to - data[i].scanned_from), data[i].tm_sent ? 1 : 0, (uint32_t)ts);
 	}
 	return buffer;
diff --git a/ccminer.cpp b/ccminer.cpp
index b85fe86..4721a45 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -85,6 +85,7 @@ bool opt_debug_threads = false;
 bool opt_protocol = false;
 bool opt_benchmark = false;
 bool opt_showdiff = true;
+bool opt_hwmonitor = true;
 
 // todo: limit use of these flags,
 // prefer the pools[] attributes
@@ -183,6 +184,7 @@ struct thr_api *thr_api;
 int longpoll_thr_id = -1;
 int stratum_thr_id = -1;
 int api_thr_id = -1;
+int monitor_thr_id = -1;
 bool stratum_need_reset = false;
 volatile bool abort_flag = false;
 struct work_restart *work_restart = NULL;
@@ -1684,6 +1686,7 @@ static void *miner_thread(void *userdata)
 	int switchn = pool_switch_count;
 	int thr_id = mythr->id;
 	int dev_id = device_map[thr_id % MAX_GPUS];
+	struct cgpu_info * cgpu = &thr_info[thr_id].gpu;
 	struct work work;
 	uint64_t loopcnt = 0;
 	uint32_t max_nonce;
@@ -2142,6 +2145,11 @@ static void *miner_thread(void *userdata)
 		if (opt_led_mode == LED_MODE_MINING)
 			gpu_led_on(dev_id);
 
+		if (cgpu && loopcnt > 1) {
+			cgpu->monitor.sampling_flag = true;
+			pthread_cond_signal(&cgpu->monitor.sampling_signal);
+		}
+
 		hashes_done = 0;
 		gettimeofday(&tv_start, NULL);
 
@@ -2350,6 +2358,10 @@ static void *miner_thread(void *userdata)
 
 		timeval_subtract(&diff, &tv_end, &tv_start);
 
+		if (cgpu && diff.tv_sec) { // stop monitoring
+			cgpu->monitor.sampling_flag = false;
+		}
+
 		if (diff.tv_usec || diff.tv_sec) {
 			double dtime = (double) diff.tv_sec + 1e-6 * diff.tv_usec;
 
@@ -3805,7 +3817,7 @@ int main(int argc, char *argv[])
 	if (!work_restart)
 		return EXIT_CODE_SW_INIT_ERROR;
 
-	thr_info = (struct thr_info *)calloc(opt_n_threads + 4, sizeof(*thr));
+	thr_info = (struct thr_info *)calloc(opt_n_threads + 5, sizeof(*thr));
 	if (!thr_info)
 		return EXIT_CODE_SW_INIT_ERROR;
 
@@ -3914,6 +3926,22 @@ int main(int argc, char *argv[])
 		}
 	}
 
+#ifdef USE_WRAPNVML
+	// to monitor gpu activity during work, a dedicated thread is required
+	if (1) { // NOTE(review): started before the per-thread monitor lock/cond are initialized below - confirm ordering
+		monitor_thr_id = opt_n_threads + 4;
+		thr = &thr_info[monitor_thr_id];
+		thr->id = monitor_thr_id;
+		thr->q = tq_new();
+		if (!thr->q)
+			return EXIT_CODE_SW_INIT_ERROR;
+		if (unlikely(pthread_create(&thr->pth, NULL, monitor_thread, thr))) {
+			applog(LOG_ERR, "Monitoring thread %d create failed", monitor_thr_id); // report the monitor slot, not the stale loop index
+			return EXIT_CODE_SW_INIT_ERROR;
+		}
+	}
+#endif
+
 	/* start mining threads */
 	for (i = 0; i < opt_n_threads; i++) {
 		thr = &thr_info[i];
@@ -3926,6 +3954,9 @@ int main(int argc, char *argv[])
 		if (!thr->q)
 			return EXIT_CODE_SW_INIT_ERROR;
 
+		pthread_mutex_init(&thr->gpu.monitor.lock, NULL);
+		pthread_cond_init(&thr->gpu.monitor.sampling_signal, NULL);
+
 		if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) {
 			applog(LOG_ERR, "thread %d create failed", i);
 			return EXIT_CODE_SW_INIT_ERROR;
@@ -3944,9 +3975,21 @@ int main(int argc, char *argv[])
 	/* main loop - simply wait for workio thread to exit */
 	pthread_join(thr_info[work_thr_id].pth, NULL);
 
+	abort_flag = true;
+
 	/* wait for mining threads */
-	for (i = 0; i < opt_n_threads; i++)
+	for (i = 0; i < opt_n_threads; i++) {
+		struct cgpu_info *cgpu = &thr_info[i].gpu;
+		if (monitor_thr_id != -1 && cgpu) {
+			pthread_cond_signal(&cgpu->monitor.sampling_signal);
+		}
 		pthread_join(thr_info[i].pth, NULL);
+	}
+
+	if (monitor_thr_id != -1) {
+		pthread_join(thr_info[monitor_thr_id].pth, NULL);
+		//tq_free(thr_info[monitor_thr_id].q);
+	}
 
 	if (opt_debug)
 		applog(LOG_DEBUG, "workio thread dead, exiting.");
diff --git a/miner.h b/miner.h
index 0ff92f3..05cd476 100644
--- a/miner.h
+++ b/miner.h
@@ -361,6 +361,19 @@ extern void free_scrypt_jane(int thr_id);
 void *api_thread(void *userdata);
 void api_set_throughput(int thr_id, uint32_t throughput);
 
+struct monitor_info { // per-GPU readings averaged by monitor_thread() over one sampling window
+	uint32_t gpu_temp; // averaged temperature, logged with a "C" suffix
+	uint32_t gpu_fan; // averaged fan speed, logged as a percentage
+	uint32_t gpu_clock; // averaged core (SM) clock, logged as MHz
+	uint32_t gpu_memclock; // averaged memory clock
+	uint32_t gpu_power; // averaged power draw in milliwatts (divided by 1000 when logged as W)
+
+	pthread_mutex_t lock; // mutex passed to pthread_cond_wait() together with sampling_signal
+	pthread_cond_t sampling_signal; // signalled by the miner thread (and by main on shutdown) to wake the monitor
+	volatile bool sampling_flag; // set by the miner thread while samples are wanted; cleared to end the window
+	uint32_t tm_displayed; // time() of the last monitor log line, used to rate-limit the output
+};
+
 struct cgpu_info {
 	uint8_t gpu_id;
 	uint8_t thr_id;
@@ -391,6 +404,8 @@ struct cgpu_info {
 	char gpu_desc[64];
 	double intensity;
 	uint32_t throughput;
+
+	struct monitor_info monitor;
 };
 
 struct thr_api {
diff --git a/nvml.cpp b/nvml.cpp
index 140eb1e..eb02802 100644
--- a/nvml.cpp
+++ b/nvml.cpp
@@ -418,6 +418,25 @@ int nvml_reset_clocks(nvml_handle *nvmlh, int dev_id)
 	return ret;
 }
 
+/* Read the current SM and memory clocks: 1 = both read, 0 = unavailable, -ENODEV = unknown device */
+int nvml_get_clocks(nvml_handle *nvmlh, int dev_id, unsigned int *core, unsigned int *mem)
+{
+	nvmlReturn_t rc;
+	uint32_t gpu_clk = 0, mem_clk = 0;
+	int n = nvmlh->cuda_nvml_device_id[dev_id];
+	if (n < 0 || n >= nvmlh->nvml_gpucount)
+		return -ENODEV;
+	if (!nvmlh->nvmlDeviceGetClockInfo) // symbol was not resolved in the loaded library
+		return 0;
+	rc = nvmlh->nvmlDeviceGetClockInfo(nvmlh->devs[n], NVML_CLOCK_SM, &gpu_clk);
+	if (rc != NVML_SUCCESS) // check now: the memory query below must not mask this failure
+		return 0;
+	rc = nvmlh->nvmlDeviceGetClockInfo(nvmlh->devs[n], NVML_CLOCK_MEM, &mem_clk);
+	if (rc != NVML_SUCCESS)
+		return 0;
+	*core = gpu_clk; *mem = mem_clk; // outputs are only written when both queries succeeded
+	return 1;
+}
 
 /**
  * Set power state of a device (9xx)
@@ -639,6 +658,21 @@ int nvml_get_fanpcnt(nvml_handle *nvmlh, int cudaindex, unsigned int *fanpcnt)
 	return 0;
 }
 
+
+/* Query the current SM and memory clocks of a CUDA device index; returns 0 on success, -1 on failure. */
+int nvml_get_current_clocks(int cudaindex, uint32_t *graphics_clock, uint32_t *mem_clock)
+{
+	nvmlReturn_t rc;
+	if (!hnvml || !hnvml->nvmlDeviceGetClockInfo) return -1; // NVML not loaded or symbol unresolved
+	int gpuindex = hnvml->cuda_nvml_device_id[cudaindex];
+	if (gpuindex < 0 || gpuindex >= hnvml->nvml_gpucount) return -1;
+	rc = hnvml->nvmlDeviceGetClockInfo(hnvml->devs[gpuindex], NVML_CLOCK_SM, graphics_clock);
+	if (rc != NVML_SUCCESS) return -1;
+	rc = hnvml->nvmlDeviceGetClockInfo(hnvml->devs[gpuindex], NVML_CLOCK_MEM, mem_clock);
+	if (rc != NVML_SUCCESS) return -1;
+	return 0;
+}
+
 /* Not Supported on 750Ti 340.23 */
 int nvml_get_power_usage(nvml_handle *nvmlh, int cudaindex, unsigned int *milliwatts)
 {
@@ -2051,3 +2085,79 @@ void gpu_led_off(int dev_id)
 	}
 #endif
 }
+
+#ifdef USE_WRAPNVML
+extern double thr_hashrates[MAX_GPUS];
+extern bool opt_debug_threads;
+extern bool opt_hwmonitor;
+extern int num_cpus;
+
+// Hardware monitor thread: cycles over the mining threads and, for each one, waits for its
+// sampling signal, then averages NVML readings until sampling_flag clears. Returns NULL.
+void *monitor_thread(void *userdata)
+{
+	int thr_id = -1;
+
+	while (!abort_flag && !opt_quiet)
+	{
+		// This thread monitors card's power lazily during scans, one at a time...
+		thr_id = (thr_id + 1) % opt_n_threads;
+		struct cgpu_info *cgpu = &thr_info[thr_id].gpu;
+		int dev_id = cgpu->gpu_id; cudaSetDevice(dev_id);
+		if (hnvml != NULL && cgpu)
+		{
+			uint64_t clock = 0, mem_clock = 0;
+			uint32_t fanpercent = 0, power = 0;
+			double tempC = 0, khs_per_watt = 0;
+			uint32_t counter = 0;
+			int max_loops = 2000;
+
+			pthread_mutex_lock(&cgpu->monitor.lock); // pthread_cond_wait() needs the mutex held
+			pthread_cond_wait(&cgpu->monitor.sampling_signal, &cgpu->monitor.lock);
+
+			do {
+				uint32_t tmp_clock = 0, tmp_memclock = 0; // zeroed: a failed query must not add garbage
+				nvml_get_current_clocks(device_map[thr_id], &tmp_clock, &tmp_memclock);
+				clock += tmp_clock;
+				mem_clock += tmp_memclock;
+				tempC += gpu_temp(cgpu);
+				fanpercent += gpu_fanpercent(cgpu);
+				power += gpu_power(cgpu);
+				counter++;
+				usleep(50000);
+				if (abort_flag) {
+					pthread_mutex_unlock(&cgpu->monitor.lock); // never exit holding the lock
+					goto abort;
+				}
+			} while (cgpu->monitor.sampling_flag && (--max_loops));
+
+			cgpu->monitor.gpu_temp = (uint32_t) (tempC/counter);
+			cgpu->monitor.gpu_fan = fanpercent/counter;
+			cgpu->monitor.gpu_power = power/counter;
+			cgpu->monitor.gpu_clock = (uint32_t) (clock/counter);
+			cgpu->monitor.gpu_memclock = (uint32_t) (mem_clock/counter);
+
+			if (power) {
+				// todo: handle units
+				khs_per_watt = stats_get_speed(thr_id, thr_hashrates[thr_id]) / ((double)power / counter);
+			}
+
+			// todo: not shown on decred
+			if (opt_hwmonitor && (time(NULL) - cgpu->monitor.tm_displayed) > 60) {
+				gpulog(LOG_INFO, thr_id, "%u MHz %.3f kH/W %uW %uC FAN %u%%",
+					cgpu->monitor.gpu_clock/*, cgpu->monitor.gpu_memclock*/,
+					khs_per_watt, cgpu->monitor.gpu_power / 1000,
+					cgpu->monitor.gpu_temp, cgpu->monitor.gpu_fan
+				);
+				cgpu->monitor.tm_displayed = (uint32_t)time(NULL);
+			}
+			pthread_mutex_unlock(&cgpu->monitor.lock);
+		}
+		usleep(500); // safety
+	}
+abort:
+	if (opt_debug_threads)
+		applog(LOG_DEBUG, "%s() died", __func__);
+	return NULL;
+}
+#endif
diff --git a/nvml.h b/nvml.h
index 75016db..1298a3a 100644
--- a/nvml.h
+++ b/nvml.h
@@ -17,6 +17,8 @@
 
 #include "miner.h"
 
+void *monitor_thread(void *userdata);
+
 typedef void * nvmlDevice_t;
 
 /* our own version of the PCI info struct */
@@ -212,6 +214,8 @@ unsigned int gpu_power(struct cgpu_info *gpu);
 int gpu_pstate(struct cgpu_info *gpu);
 int gpu_busid(struct cgpu_info *gpu);
 
+void gpu_current_clocks(struct cgpu_info *gpu);
+
 // pid/vid, sn and bios rev
 int gpu_info(struct cgpu_info *gpu);