diff --git a/ccminer.cpp b/ccminer.cpp
index 010886a..3d7cf3b 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -198,6 +198,8 @@ long device_sm[MAX_GPUS] = { 0 };
 uint32_t gpus_intensity[MAX_GPUS] = { 0 };
 uint32_t device_gpu_clocks[MAX_GPUS] = { 0 };
 uint32_t device_mem_clocks[MAX_GPUS] = { 0 };
+uint32_t device_plimit[MAX_GPUS] = { 0 };
+int8_t device_pstate[MAX_GPUS] = { -1 }; // only [0] is -1 here; all entries are re-initialized in main()
 
 // un-linked to cmdline scrypt options (useless)
 int device_batchsize[MAX_GPUS] = { 0 };
@@ -343,10 +345,12 @@ Options:\n\
   --max-temp=N      Only mine if gpu temp is less than specified value\n\
   --max-rate=N[KMG] Only mine if net hashrate is less than specified value\n\
   --max-diff=N      Only mine if net difficulty is less than specified value\n"
-#if defined(USE_WRAPNVML) && (defined(__linux) || defined(_WIN64))
+#if defined(USE_WRAPNVML) && (defined(__linux) || defined(_WIN64)) /* via nvml */
 "\
-  --gpu-clock=1150  Set device application clock\n\
-  --mem-clock=3505  Set the gpu memory clock (require 346.72+ driver)\n"
+  --mem-clock=3505  Set the gpu memory max clock (346.72+ driver)\n\
+  --gpu-clock=1150  Set the gpu engine max clock (346.72+ driver)\n\
+  --pstate=0[,2]    Set the gpu power state (352.21+ driver)\n\
+  --plimit=100W     Set the gpu power limit (352.21+ driver)\n"
 #endif
 #ifdef HAVE_SYSLOG_H
 "\
@@ -410,6 +414,8 @@ struct option options[] = {
 	{ "statsavg", 1, NULL, 'N' },
 	{ "gpu-clock", 1, NULL, 1070 },
 	{ "mem-clock", 1, NULL, 1071 },
+	{ "pstate", 1, NULL, 1072 },
+	{ "plimit", 1, NULL, 1073 },
 #ifdef HAVE_SYSLOG_H
 	{ "syslog", 0, NULL, 'S' },
 	{ "syslog-prefix", 1, NULL, 1018 },
@@ -2731,6 +2737,28 @@ void parse_arg(int key, char *arg)
 		}
 	}
 	break;
+	case 1072: /* --pstate */
+	{
+		char *pch = strtok(arg,",");
+		int n = 0;
+		while (pch != NULL && n < MAX_GPUS) {
+			int dev_id = device_map[n++];
+			device_pstate[dev_id] = (int8_t) atoi(pch);
+			pch = strtok(NULL, ",");
+		}
+	}
+	break;
+	case 1073: /* --plimit */
+	{
+		char *pch = strtok(arg,",");
+		int n = 0;
+		while (pch != NULL && n < MAX_GPUS) {
+			int dev_id = device_map[n++];
+			device_plimit[dev_id] = atoi(pch);
+			pch = strtok(NULL, ",");
+		}
+	}
+	break;
 	case 1005:
 		opt_benchmark = true;
 		want_longpoll = false;
@@ -3049,6 +3077,7 @@ int main(int argc, char *argv[])
 		device_interactive[i] = -1;
 		device_texturecache[i] = -1;
 		device_singlememory[i] = -1;
+		device_pstate[i] = -1;
 	}
 
 	// number of gpus
@@ -3231,10 +3260,17 @@ int main(int argc, char *argv[])
 	/* nvml is currently not the best choice on Windows (only in x64) */
 	hnvml = nvml_create();
 	if (hnvml) {
 		cuda_devicenames(); // refresh gpu vendor name
 		applog(LOG_INFO, "NVML GPU monitoring enabled.");
 		for (int n=0; n < opt_n_threads; n++) {
+			bool gpu_reinit = false; // reset per device, so only changed gpus are re-initialized
+			if (nvml_set_pstate(hnvml, device_map[n]) == 1)
+				gpu_reinit = true;
+			if (nvml_set_plimit(hnvml, device_map[n]) == 1)
+				gpu_reinit = true;
 			if (nvml_set_clocks(hnvml, device_map[n]) == 1)
+				gpu_reinit = true;
+			if (gpu_reinit)
 				cuda_reset_device(n, NULL);
 		}
 	}
diff --git a/nvml.cpp b/nvml.cpp
index daa56b4..becc4c5 100644
--- a/nvml.cpp
+++ b/nvml.cpp
@@ -37,6 +37,8 @@ static uint32_t device_bus_ids[MAX_GPUS] = { 0 };
 
 extern uint32_t device_gpu_clocks[MAX_GPUS];
 extern uint32_t device_mem_clocks[MAX_GPUS];
+extern uint32_t device_plimit[MAX_GPUS];
+extern int8_t device_pstate[MAX_GPUS];
 
 uint8_t gpu_clocks_changed[MAX_GPUS] = { 0 };
 
@@ -326,12 +328,12 @@ int nvml_set_clocks(nvml_handle *nvmlh, int dev_id)
 	if (device_gpu_clocks[dev_id]) gpu_clk = device_gpu_clocks[dev_id];
 
 	// these functions works for the 960 and the 970 (346.72+), not for the 750 Ti
-	uint32_t nclocks = 0, clocks[128] = { 0 };
+	uint32_t nclocks = 0, clocks[127] = { 0 };
 	nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, NULL);
-	nclocks = min(nclocks, 128);
+	nclocks = min(nclocks, 127);
 	nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, clocks);
-	for (uint8_t u=0; u < nclocks; u++) {
-		// ordered desc, so get first
+	for (int8_t u=0; u < nclocks; u++) {
+		// ordered by pstate (so highest is first memory clock - P0)
 		if (clocks[u] <= mem_clk) {
 			mem_clk = clocks[u];
 			break;
@@ -340,7 +342,7 @@ int nvml_set_clocks(nvml_handle *nvmlh, int dev_id)
 
 	nclocks = 0;
 	nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, NULL);
-	nclocks = min(nclocks, 128);
+	nclocks = min(nclocks, 127);
 	nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, clocks);
 	for (uint8_t u=0; u < nclocks; u++) {
 		// ordered desc, so get first
@@ -354,7 +356,7 @@ int nvml_set_clocks(nvml_handle *nvmlh, int dev_id)
 	if (rc == NVML_SUCCESS)
 		applog(LOG_INFO, "GPU #%d: application clocks set to %u/%u", dev_id, mem_clk, gpu_clk);
 	else {
-		applog(LOG_ERR, "GPU #%d: %u/%u - %s", dev_id, mem_clk, gpu_clk, nvmlh->nvmlErrorString(rc));
+		applog(LOG_WARNING, "GPU #%d: %u/%u - %s", dev_id, mem_clk, gpu_clk, nvmlh->nvmlErrorString(rc));
 		return -1;
 	}
 
@@ -383,6 +385,122 @@ int nvml_reset_clocks(nvml_handle *nvmlh, int dev_id)
 	return 1;
 }
 
+
+/**
+ * Set power state of a device (9xx)
+ * Code is similar to the clocks one, which allows the pstate to be changed
+ */
+int nvml_set_pstate(nvml_handle *nvmlh, int dev_id)
+{
+	nvmlReturn_t rc;
+	uint32_t gpu_clk = 0, mem_clk = 0;
+	int n = nvmlh->cuda_nvml_device_id[dev_id];
+	if (n < 0 || n >= nvmlh->nvml_gpucount)
+		return -ENODEV;
+
+	if (device_pstate[dev_id] < 0)
+		return 0;
+
+	// prevent double operations on the same gpu... to enhance
+	if (gpu_clocks_changed[dev_id])
+		return 0;
+
+	if (nvmlh->app_clocks[n] != NVML_FEATURE_ENABLED) {
+		applog(LOG_WARNING, "GPU #%d: NVML app. clock feature is not allowed!", dev_id);
+		return -EPERM;
+	}
+
+	nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_MEM, &mem_clk);
+	rc = nvmlh->nvmlDeviceGetDefaultApplicationsClock(nvmlh->devs[n], NVML_CLOCK_GRAPHICS, &gpu_clk);
+	if (rc != NVML_SUCCESS) {
+		applog(LOG_WARNING, "GPU #%d: unable to query application clocks", dev_id);
+		return -EINVAL;
+	}
+
+	// get application config values
+	if (device_mem_clocks[dev_id]) mem_clk = device_mem_clocks[dev_id];
+	if (device_gpu_clocks[dev_id]) gpu_clk = device_gpu_clocks[dev_id];
+
+	// these functions work for the 960 and the 970 (346.72+), not for the 750 Ti
+	uint32_t nclocks = 0, clocks[127] = { 0 };
+	int8_t wanted_pstate = device_pstate[dev_id];
+	nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, NULL);
+	nclocks = min(nclocks, 127);
+	nvmlh->nvmlDeviceGetSupportedMemoryClocks(nvmlh->devs[n], &nclocks, clocks);
+	for (uint8_t u=0; u < nclocks; u++) {
+		// ordered by pstate (so high first)
+		if (u == wanted_pstate) {
+			mem_clk = clocks[u];
+			break;
+		}
+	}
+
+	nclocks = 0;
+	nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, NULL);
+	nclocks = min(nclocks, 127);
+	nvmlh->nvmlDeviceGetSupportedGraphicsClocks(nvmlh->devs[n], mem_clk, &nclocks, clocks);
+	for (uint8_t u=0; u < nclocks; u++) {
+		// ordered desc, so get first
+		if (clocks[u] <= gpu_clk) {
+			gpu_clk = clocks[u];
+			break;
+		}
+	}
+
+	rc = nvmlh->nvmlDeviceSetApplicationsClocks(nvmlh->devs[n], mem_clk, gpu_clk);
+	if (rc != NVML_SUCCESS) {
+		applog(LOG_WARNING, "GPU #%d: pstate %s", dev_id, nvmlh->nvmlErrorString(rc));
+		return -1;
+	}
+
+	if (opt_debug)
+		applog(LOG_INFO, "GPU #%d: app clocks set to P%d (%u/%u)", dev_id, (int) wanted_pstate, mem_clk, gpu_clk);
+
+	gpu_clocks_changed[dev_id] = 1;
+	return 1;
+}
+
+int nvml_set_plimit(nvml_handle *nvmlh, int dev_id)
+{
+	nvmlReturn_t rc = NVML_ERROR_UNKNOWN;
+	int n = nvmlh->cuda_nvml_device_id[dev_id];
+	if (n < 0 || n >= nvmlh->nvml_gpucount)
+		return -ENODEV;
+
+	if (!device_plimit[dev_id])
+		return 0; // nothing to do
+
+	if (!nvmlh->nvmlDeviceSetPowerManagementLimit)
+		return -ENOSYS;
+
+	uint32_t plimit = device_plimit[dev_id] * 1000U;
+	uint32_t pmin = 1000, pmax = 0;
+	if (nvmlh->nvmlDeviceGetPowerManagementLimitConstraints)
+		rc = nvmlh->nvmlDeviceGetPowerManagementLimitConstraints(nvmlh->devs[n], &pmin, &pmax);
+
+	if (rc != NVML_SUCCESS) {
+		if (!nvmlh->nvmlDeviceGetPowerManagementLimit)
+			return -ENOSYS;
+		pmax = 100 * 1000; // should not happen...
+		nvmlh->nvmlDeviceGetPowerManagementLimit(nvmlh->devs[n], &pmax);
+	}
+
+	plimit = min(plimit, pmax);
+	plimit = max(plimit, pmin);
+	rc = nvmlh->nvmlDeviceSetPowerManagementLimit(nvmlh->devs[n], plimit);
+	if (rc != NVML_SUCCESS) {
+		applog(LOG_WARNING, "GPU #%d: plimit %s", dev_id, nvmlh->nvmlErrorString(rc));
+		return -1;
+	}
+
+	if (opt_debug) {
+		applog(LOG_INFO, "GPU #%d: power limit set to %uW (allowed range is %u-%u)",
+			dev_id, plimit/1000U, pmin/1000U, pmax/1000U);
+	}
+
+	return 0;
+}
+
 int nvml_get_gpucount(nvml_handle *nvmlh, int *gpucount)
 {
 	*gpucount = nvmlh->nvml_gpucount;
diff --git a/nvml.h b/nvml.h
index 1397b78..4e1df9f 100644
--- a/nvml.h
+++ b/nvml.h
@@ -153,43 +153,8 @@ int nvml_destroy(nvml_handle *nvmlh);
  */
 int nvml_get_gpucount(nvml_handle *nvmlh, int *gpucount);
 
-/*
- * Query the number of GPUs seen by CUDA
- */
-int cuda_get_gpucount(nvml_handle *nvmlh, int *gpucount);
-
-
-/*
- * query the name of the GPU model from the CUDA device ID
- *
- */
-int nvml_get_gpu_name(nvml_handle *nvmlh,
-                      int gpuindex,
-                      char *namebuf,
-                      int bufsize);
-
-/*
- * Query the current GPU temperature (Celsius), from the CUDA device ID
- */
-int nvml_get_tempC(nvml_handle *nvmlh,
-                   int gpuindex, unsigned int *tempC);
-
-/*
- * Query the current GPU fan speed (percent) from the CUDA device ID
- */
-int nvml_get_fanpcnt(nvml_handle *nvmlh,
-                     int gpuindex, unsigned int *fanpcnt);
-
-/*
- * Query the current GPU power usage in millwatts from the CUDA device ID
- *
- * This feature is only available on recent GPU generations and may be
- * limited in some cases only to Tesla series GPUs.
- * If the query is run on an unsupported GPU, this routine will return -1.
- */
-int nvml_get_power_usage(nvml_handle *nvmlh,
-                         int gpuindex,
-                         unsigned int *milliwatts);
+int nvml_set_plimit(nvml_handle *nvmlh, int dev_id);
+int nvml_set_pstate(nvml_handle *nvmlh, int dev_id);
 
 int nvml_set_clocks(nvml_handle *nvmlh, int dev_id);
 int nvml_reset_clocks(nvml_handle *nvmlh, int dev_id);