diff --git a/Makefile.am b/Makefile.am index 26897ef..2f8c418 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,8 +1,9 @@ +# allow to use Host cuda functions in C/C++ +DEF_INCLUDES = @CUDA_INCLUDES@ +JANSSON_INCLUDES= if WANT_JANSSON JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson -else -JANSSON_INCLUDES= endif EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ @@ -17,7 +18,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ compat/inttypes.h compat/stdbool.h compat/unistd.h \ compat/sys/time.h compat/getopt/getopt.h \ cpu-miner.c util.c crc32.c hefty1.c scrypt.c \ - api.c hashlog.cpp stats.cpp cuda.cu \ + api.cpp hashlog.cpp stats.cpp cuda.cpp \ heavy/heavy.cu \ heavy/cuda_blake512.cu heavy/cuda_blake512.h \ heavy/cuda_combine.cu heavy/cuda_combine.h \ @@ -49,19 +50,25 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \ x11/s3.cu +if HAVE_NVML +ccminer_SOURCES += nvml.cpp +nvml_defs = -DUSE_WRAPNVML +nvml_libs = -ldl +endif + if HAVE_WINDOWS ccminer_SOURCES += compat/winansi.c endif -ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ -ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ -ccminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME +ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ +ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ $(nvml_libs) +ccminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) $(DEF_INCLUDES) $(nvml_defs) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\" #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\" -nvcc_FLAGS = $(nvcc_ARCH) 
-I . @CUDA_CFLAGS@ +nvcc_FLAGS = $(nvcc_ARCH) @CUDA_INCLUDES@ -I. @CUDA_CFLAGS@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v" # we're now targeting all major compute architectures within one binary. diff --git a/README.txt b/README.txt index 416fb5c..62a7e3c 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner release 1.4.8-tpruvot (12 Nov 2014) - "API Stats" +ccMiner release 1.4.9-tpruvot (Nov 2014) - "GPU Monitoring" --------------------------------------------------------------- *************************************************************** @@ -155,9 +155,13 @@ features. >>> RELEASE HISTORY <<< + Nov. 13th 2014 v1.4.9 + Add nvml unit to monitor nvidia cards (api) + API: small changes, bump v1.1 + Nov. 12th 2014 v1.4.8 Add a basic API and sample php json wrapper - Add statsavg (def 20) and api-bind parameters + Add statsavg (def 20) and api-bind parameters Fix displayed hashrate for multi gpus systems Nov. 11th 2014 v1.4.7 diff --git a/api.c b/api.cpp similarity index 91% rename from api.c rename to api.cpp index 8d65616..fa716ba 100644 --- a/api.c +++ b/api.cpp @@ -8,7 +8,7 @@ * Software Foundation; either version 2 of the License, or (at your option) * any later version. See COPYING for more details. 
*/ -#define APIVERSION "1.0" +#define APIVERSION "1.1" #ifdef _MSC_VER # define _WINSOCK_DEPRECATED_NO_WARNINGS @@ -35,6 +35,10 @@ #include "compat.h" #include "miner.h" +#ifdef USE_WRAPNVML +#include "nvml.h" +#endif + #ifndef _MSC_VER # include # include @@ -105,25 +109,26 @@ extern uint32_t rejected_count; #define gpu_threads opt_n_threads -extern void get_currentalgo(char* buf, int sz); - /***************************************************************/ static void gpustatus(int thr_id) { char buf[MYBUFSIZ]; float gt; - int gf, gp; + int gp, gf; if (thr_id >= 0 && thr_id < gpu_threads) { struct cgpu_info *cgpu = &thr_info[thr_id].gpu; -#ifdef HAVE_HWMONITORING + cgpu->thr_id = thr_id; + +#ifdef USE_WRAPNVML // todo - if (gpu->has_monitoring) { - gt = gpu_temp(gpu); - gf = gpu_fanspeed(gpu); - gp = gpu_fanpercent(gpu); + if (1 || cgpu->has_monitoring) { + gf = gpu_fanpercent(cgpu); + gt = gpu_temp(cgpu); + gp = gpu_power(cgpu); + // gpu_clock(cgpu); } else #endif @@ -148,7 +153,7 @@ static void gpustatus(int thr_id) cgpu->khashes = stats_get_speed(thr_id) / 1000.0; - sprintf(buf, "GPU=%d;TEMP=%.1f;FAN=%d;FANP=%d;KHS=%.2f;" + sprintf(buf, "GPU=%d;TEMP=%.1f;FAN=%d;POWER=%d;KHS=%.2f;" "HWF=%d;I=%d|", thr_id, gt, gf, gp, cgpu->khashes, cgpu->hw_errors, cgpu->intensity); @@ -162,14 +167,14 @@ static void gpustatus(int thr_id) static char *getsummary(char *params) { char algo[64] = ""; - time_t uptime = (time(NULL) - startup); - double accps = (60.0 * accepted_count) / (uptime ? (uint32_t) uptime : 1.0); + double uptime = difftime(time(NULL), startup); + double accps = (60.0 * accepted_count) / (uptime ? 
uptime : 1.0); get_currentalgo(algo, sizeof(algo)); *buffer = '\0'; sprintf(buffer, "NAME=%s;VER=%s;API=%s;" - "ALGO=%s;KHS=%.2f;ACC=%d;REJ=%d;ACCMN=%.3f;UPTIME=%d|", + "ALGO=%s;KHS=%.2f;ACC=%d;REJ=%d;ACCMN=%.3f;UPTIME=%.1f|", PACKAGE_NAME, PACKAGE_VERSION, APIVERSION, algo, (double)global_hashrate / 1000.0, accepted_count, rejected_count, @@ -186,7 +191,7 @@ static char *getstats(char *params) } struct CMDS { - char *name; + const char *name; char *(*func)(char *); } cmds[] = { { "summary", getsummary }, @@ -195,15 +200,18 @@ struct CMDS { #define CMDMAX 2 -static void send_result(SOCKETTYPE c, char *result) +static int send_result(SOCKETTYPE c, char *result) { int n; - if (result == NULL) - result = ""; + if (!result) { + n = send(c, "", 1, 0); + } else { + // ignore failure - it's closed immediately anyway + n = send(c, result, strlen(result) + 1, 0); + } - // ignore failure - it's closed immediately anyway - n = send(c, result, strlen(result) + 1, 0); + return n; } /* @@ -400,7 +408,8 @@ static void api() if ((time(NULL) - bindstart) > 61) break; else { - applog(LOG_ERR, "API bind to port %d failed - trying again in 15sec", port); + if (!opt_quiet || opt_debug) + applog(LOG_WARNING, "API bind to port %d failed - trying again in 15sec", port); sleep(15); } } @@ -409,7 +418,7 @@ static void api() } if (bound == 0) { - applog(LOG_ERR, "API bind to port %d failed (%s)%s", port, binderror, UNAVAILABLE); + applog(LOG_WARNING, "API bind to port %d failed (%s)%s", port, binderror, UNAVAILABLE); free(apisock); return; } diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 9d5603b..f14e5bc 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -87,7 +87,7 @@ Disabled MultiThreadedDebugDLL true - WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + 
WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) true @@ -114,7 +114,7 @@ Disabled MultiThreadedDebugDLL true - WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + WIN32;_DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) 8Bytes true @@ -150,7 +150,7 @@ false true true - WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) .;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) true SyncCThrow @@ -193,7 +193,7 @@ false true true - WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME;%(PreprocessorDefinitions) 
.;compat;compat\curl-for-windows\curl\include;compat\jansson;compat\getopt;compat\pthreads;compat\curl-for-windows\openssl\openssl\include;compat\curl-for-windows\zlib;%(AdditionalIncludeDirectories);$(CudaToolkitIncludeDir) @@ -240,9 +240,8 @@ - - /Tp %(AdditionalOptions) - + + @@ -321,7 +320,7 @@ - + true diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 6d1ff7c..51cb1de 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -192,7 +192,10 @@ Source Files - + + Source Files + + Source Files @@ -313,7 +316,7 @@ - + Source Files\CUDA @@ -482,4 +485,4 @@ Source Files\CUDA\x11 - \ No newline at end of file + diff --git a/configure.ac b/configure.ac index 674b2b6..e6418ed 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [1.4.8]) +AC_INIT([ccminer], [1.4.9]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM @@ -142,19 +142,32 @@ dnl Setup CUDA paths AC_ARG_WITH([cuda], [ --with-cuda=PATH prefix where cuda is installed [default=/usr/local/cuda]]) +AC_ARG_WITH([nvml], + [ --with-nvml=PATH prefix where libnvml is installed [default=/usr/lib]]) + +AM_CONDITIONAL([HAVE_NVML], [test -n "$with_nvml"]) + if test -n "$with_cuda" then - CUDA_CFLAGS="-I$with_cuda/include $CUDA_CFLAGS" - CUDA_LIBS="-lcudart" - CUDA_LDFLAGS="-L$with_cuda/lib$SUFFIX" - NVCC="$with_cuda/bin/nvcc" + CUDA_INCLUDES="-I$with_cuda/include" + CUDA_LIBS="-lcudart" + CUDA_LDFLAGS="-L$with_cuda/lib$SUFFIX" + NVCC="$with_cuda/bin/nvcc" else - CUDA_CFLAGS="-I/usr/local/cuda/include $CUDA_CFLAGS" - CUDA_LIBS="-lcudart -static-libstdc++" - CUDA_LDFLAGS="-L/usr/local/cuda/lib$SUFFIX" - NVCC="nvcc" + CUDA_INCLUDES="-I/usr/local/cuda/include" + CUDA_LIBS="-lcudart -static-libstdc++" + CUDA_LDFLAGS="-L/usr/local/cuda/lib$SUFFIX" + NVCC="nvcc" fi + +if test -n "$with_nvml" ; then + NVML_LIBPATH=$with_nvml + CUDA_LDFLAGS="$CUDA_LDFLAGS -ldl" +fi +AC_SUBST(NVML_LIBPATH) + AC_SUBST(CUDA_CFLAGS) +AC_SUBST(CUDA_INCLUDES) AC_SUBST(CUDA_LIBS) AC_SUBST(CUDA_LDFLAGS) 
AC_SUBST(NVCC) diff --git a/configure.sh b/configure.sh index 7e277f7..1084ba7 100755 --- a/configure.sh +++ b/configure.sh @@ -7,5 +7,5 @@ extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16" -CUDA_CFLAGS="-O3 -Xcompiler -Wall" ./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda +CUDA_CFLAGS="-O3 -Xcompiler -Wall" ./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda --with-nvml=libnvidia-ml.so diff --git a/cpu-miner.c b/cpu-miner.c index 9cae3e3..f2ff544 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -56,7 +56,7 @@ BOOL WINAPI ConsoleHandler(DWORD); #define HEAVYCOIN_BLKHDR_SZ 84 #define MNR_BLKHDR_SZ 80 -// from heavy.cu +// from cuda.cu #ifdef __cplusplus extern "C" { @@ -69,6 +69,9 @@ int cuda_finddevice(char *name); } #endif +#ifdef USE_WRAPNVML +#include "nvml.h" +#endif #ifdef __linux /* Linux specific policy and affinity management */ #include @@ -244,6 +247,10 @@ uint32_t opt_work_size = 0; /* default */ char *opt_api_allow = "127.0.0.1"; /* 0.0.0.0 for all ips */ int opt_api_listen = 4068; /* 0 to disable */ +#ifdef USE_WRAPNVML +wrap_nvml_handle *nvmlh = NULL; +#endif + #ifdef HAVE_GETOPT_LONG #include #else @@ -421,7 +428,10 @@ void proper_exit(int reason) #ifdef WIN32 timeEndPeriod(1); // else never executed #endif - +#ifdef USE_WRAPNVML + if (nvmlh) + wrap_nvml_destroy(nvmlh); +#endif exit(reason); } @@ -2129,6 +2139,16 @@ int main(int argc, char *argv[]) tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); } +#ifdef USE_WRAPNVML + nvmlh = wrap_nvml_create(); + if (nvmlh) { + // todo: link threads info gpu + applog(LOG_INFO, "NVML GPU monitoring enabled."); + } else { + applog(LOG_INFO, "NVML GPU monitoring is not available."); + } +#endif + if (opt_api_listen) { /* api thread */ api_thr_id = opt_n_threads + 3; diff --git a/cpuminer-config.h b/cpuminer-config.h index 1767071..469a029 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -156,7 +156,7 @@ #define PACKAGE_NAME 
"ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 1.4.8" +#define PACKAGE_STRING "ccminer 1.4.9" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "ccminer" @@ -165,7 +165,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.4.8" +#define PACKAGE_VERSION "1.4.9" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be @@ -188,7 +188,7 @@ #define USE_XOP 1 /* Version number of package */ -#define VERSION "1.4.8" +#define VERSION "1.4.9" /* Define curl_free() as free() if our version of curl lacks curl_free. */ /* #undef curl_free */ diff --git a/cuda.cu b/cuda.cpp similarity index 96% rename from cuda.cu rename to cuda.cpp index eb76a7e..6d03f66 100644 --- a/cuda.cu +++ b/cuda.cpp @@ -9,16 +9,22 @@ #endif // include thrust +#ifndef __cplusplus #include #include #include #include +#else +#include +#endif #include "miner.h" -#include "cuda_helper.h" +#include "cuda_runtime.h" extern char *device_name[8]; +extern int device_map[8]; +extern int device_sm[8]; // CUDA Devices on the System extern "C" int cuda_num_devices() diff --git a/miner.h b/miner.h index 1de9bf6..4140df5 100644 --- a/miner.h +++ b/miner.h @@ -356,21 +356,20 @@ extern int scanhash_x17(int thr_id, uint32_t *pdata, void *api_thread(void *userdata); struct cgpu_info { + int thr_id; int accepted; int rejected; int hw_errors; double khashes; int intensity; -#ifdef HAVE_HWMONITORING +#ifdef USE_WRAPNVML bool has_monitoring; - int gpu_engine; - int min_engine; - int gpu_fan; - int min_fan; - int gpu_memclock; - int gpu_memdiff; - int gpu_powertune; - float gpu_vddc; + float gpu_temp; + unsigned int gpu_fan; + unsigned int gpu_power; + unsigned int gpu_clock; + unsigned int gpu_memclock; + double gpu_vddc; #endif }; @@ -456,6 +455,7 @@ extern int timeval_subtract(struct timeval *result, struct timeval *x, 
struct timeval *y); extern bool fulltest(const uint32_t *hash, const uint32_t *target); extern void diff_to_target(uint32_t *target, double diff); +extern void get_currentalgo(char* buf, int sz); struct stratum_job { char *job_id; diff --git a/nvml.cpp b/nvml.cpp new file mode 100644 index 0000000..6700567 --- /dev/null +++ b/nvml.cpp @@ -0,0 +1,479 @@ +/* + * A trivial little dlopen()-based wrapper library for the + * NVIDIA NVML library, to allow runtime discovery of NVML on an + * arbitrary system. This is all very hackish and simple-minded, but + * it serves my immediate needs in the short term until NVIDIA provides + * a static NVML wrapper library themselves, hopefully in + * CUDA 6.5 or maybe sometime shortly after. + * + * This trivial code is made available under the "new" 3-clause BSD license, + * and/or any of the GPL licenses you prefer. + * Feel free to use the code and modify as you see fit. + * + * John E. Stone - john.stone@gmail.com + * Tanguy Pruvot - tpruvot@github + * + */ + +#ifdef USE_WRAPNVML + +#include +#include +#include +#ifndef _MSC_VER +#include +#endif + +#include "miner.h" +#include "cuda_runtime.h" +#include "nvml.h" + +/* + * Wrappers to emulate dlopen() on other systems like Windows + */ +#if defined(_MSC_VER) || defined(_WIN32) || defined(_WIN64) + #include + static void *wrap_dlopen(const char *filename) { + return (void *)LoadLibrary(filename); + } + static void *wrap_dlsym(void *h, const char *sym) { + return (void *)GetProcAddress((HINSTANCE)h, sym); + } + static int wrap_dlclose(void *h) { + /* FreeLibrary returns nonzero on success */ + return (!FreeLibrary((HINSTANCE)h)); + } +#else + /* assume we can use dlopen itself... 
*/ + #include + static void *wrap_dlopen(const char *filename) { + return dlopen(filename, RTLD_NOW); + } + static void *wrap_dlsym(void *h, const char *sym) { + return dlsym(h, sym); + } + static int wrap_dlclose(void *h) { + return dlclose(h); + } +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +wrap_nvml_handle * wrap_nvml_create() +{ + int i=0; + wrap_nvml_handle *nvmlh = NULL; + + /* + * We use hard-coded library installation locations for the time being... + * No idea where or if libnvidia-ml.so is installed on MacOS X, a + * deep scouring of the filesystem on one of the Mac CUDA build boxes + * I used turned up nothing, so for now it's not going to work on OSX. + */ +#if defined(_WIN64) + /* 64-bit Windows */ +#define libnvidia_ml "%PROGRAMFILES%/NVIDIA Corporation/NVSMI/nvml.dll" +#elif defined(_WIN32) || defined(_MSC_VER) + /* 32-bit Windows */ +#define libnvidia_ml "%PROGRAMFILES%/NVIDIA Corporation/NVSMI/nvml.dll" +#elif defined(__linux) && (defined(__i386__) || defined(__ARM_ARCH_7A__)) + /* 32-bit linux assumed */ +#define libnvidia_ml "/usr/lib32/libnvidia-ml.so" +#elif defined(__linux) + /* 64-bit linux assumed */ +#define libnvidia_ml "/usr/lib/libnvidia-ml.so" +#else +#error "Unrecognized platform: need NVML DLL path for this platform..." 
+#endif + +#if WIN32 + char tmp[512]; + ExpandEnvironmentStringsA(libnvidia_ml, tmp, sizeof(tmp)); +#else + char tmp[512] = libnvidia_ml; +#endif + + void *nvml_dll = wrap_dlopen(tmp); + if (nvml_dll == NULL) { +#ifdef WIN32 + char lib[] = "nvml.dll"; +#else + char lib[64] = { '\0' }; + snprintf(lib, sizeof(lib), "%s", basename(tmp)); + /* try dlopen without path, here /usr/lib/nvidia-340/libnvidia-ml.so */ +#endif + nvml_dll = wrap_dlopen(lib); + if (opt_debug) + applog(LOG_DEBUG, "dlopen: %s=%p", lib, nvml_dll); + } + if (nvml_dll == NULL) { + if (opt_debug) + applog(LOG_DEBUG, "dlopen(%d): failed to load %s", errno, tmp); + return NULL; + } + + nvmlh = (wrap_nvml_handle *) calloc(1, sizeof(wrap_nvml_handle)); + + nvmlh->nvml_dll = nvml_dll; + + nvmlh->nvmlInit = (wrap_nvmlReturn_t (*)(void)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlInit_v2"); + if (!nvmlh->nvmlInit) + nvmlh->nvmlInit = (wrap_nvmlReturn_t (*)(void)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlInit"); + nvmlh->nvmlDeviceGetCount = (wrap_nvmlReturn_t (*)(int *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetCount_v2"); + nvmlh->nvmlDeviceGetHandleByIndex = (wrap_nvmlReturn_t (*)(int, wrap_nvmlDevice_t *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetHandleByIndex_v2"); + nvmlh->nvmlDeviceGetClockInfo = (wrap_nvmlReturn_t (*)(wrap_nvmlDevice_t, wrap_nvmlClockType_t, unsigned int *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetClockInfo"); + nvmlh->nvmlDeviceGetPciInfo = (wrap_nvmlReturn_t (*)(wrap_nvmlDevice_t, wrap_nvmlPciInfo_t *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPciInfo"); + nvmlh->nvmlDeviceGetName = (wrap_nvmlReturn_t (*)(wrap_nvmlDevice_t, char *, int)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetName"); + nvmlh->nvmlDeviceGetTemperature = (wrap_nvmlReturn_t (*)(wrap_nvmlDevice_t, int, unsigned int *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetTemperature"); + nvmlh->nvmlDeviceGetFanSpeed = (wrap_nvmlReturn_t (*)(wrap_nvmlDevice_t, unsigned int *)) + wrap_dlsym(nvmlh->nvml_dll, 
"nvmlDeviceGetFanSpeed"); + nvmlh->nvmlDeviceGetPerformanceState = (wrap_nvmlReturn_t (*)(wrap_nvmlDevice_t, int *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPerformanceState"); + nvmlh->nvmlDeviceGetPowerUsage = (wrap_nvmlReturn_t (*)(wrap_nvmlDevice_t, unsigned int *)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlDeviceGetPowerUsage"); + nvmlh->nvmlErrorString = (char* (*)(wrap_nvmlReturn_t)) + wrap_dlsym(nvmlh->nvml_dll, "nvmlErrorString"); + nvmlh->nvmlShutdown = (wrap_nvmlReturn_t (*)()) + wrap_dlsym(nvmlh->nvml_dll, "nvmlShutdown"); + + if (nvmlh->nvmlInit == NULL || + nvmlh->nvmlShutdown == NULL || + nvmlh->nvmlDeviceGetCount == NULL || + nvmlh->nvmlDeviceGetHandleByIndex == NULL || + nvmlh->nvmlDeviceGetPciInfo == NULL || + nvmlh->nvmlDeviceGetName == NULL || + nvmlh->nvmlDeviceGetTemperature == NULL || + nvmlh->nvmlDeviceGetFanSpeed == NULL || + nvmlh->nvmlDeviceGetPowerUsage == NULL) + { + if (opt_debug) + applog(LOG_DEBUG, "Failed to obtain all required NVML function pointers"); + wrap_dlclose(nvmlh->nvml_dll); + free(nvmlh); + return NULL; + } + + nvmlh->nvmlInit(); + nvmlh->nvmlDeviceGetCount(&nvmlh->nvml_gpucount); + + /* Query CUDA device count, in case it doesn't agree with NVML, since */ + /* CUDA will only report GPUs with compute capability greater than 1.0 */ + if (cudaGetDeviceCount(&nvmlh->cuda_gpucount) != cudaSuccess) { + if (opt_debug) + applog(LOG_DEBUG, "Failed to query CUDA device count!"); + wrap_dlclose(nvmlh->nvml_dll); + free(nvmlh); + return NULL; + } + + nvmlh->devs = (wrap_nvmlDevice_t *) calloc(nvmlh->nvml_gpucount, sizeof(wrap_nvmlDevice_t)); + nvmlh->nvml_pci_domain_id = (unsigned int*) calloc(nvmlh->nvml_gpucount, sizeof(unsigned int)); + nvmlh->nvml_pci_bus_id = (unsigned int*) calloc(nvmlh->nvml_gpucount, sizeof(unsigned int)); + nvmlh->nvml_pci_device_id = (unsigned int*) calloc(nvmlh->nvml_gpucount, sizeof(unsigned int)); + nvmlh->nvml_cuda_device_id = (int*) calloc(nvmlh->nvml_gpucount, sizeof(int)); + nvmlh->cuda_nvml_device_id = 
(int*) calloc(nvmlh->cuda_gpucount, sizeof(int)); + + /* Obtain GPU device handles we're going to need repeatedly... */ + for (i=0; i<nvmlh->nvml_gpucount; i++) { + nvmlh->nvmlDeviceGetHandleByIndex(i, &nvmlh->devs[i]); + } + + /* Query PCI info for each NVML device, and build table for mapping of */ + /* CUDA device IDs to NVML device IDs and vice versa */ + for (i=0; i<nvmlh->nvml_gpucount; i++) { + wrap_nvmlPciInfo_t pciinfo; + nvmlh->nvmlDeviceGetPciInfo(nvmlh->devs[i], &pciinfo); + nvmlh->nvml_pci_domain_id[i] = pciinfo.domain; + nvmlh->nvml_pci_bus_id[i] = pciinfo.bus; + nvmlh->nvml_pci_device_id[i] = pciinfo.device; + } + + /* build mapping of NVML device IDs to CUDA IDs */ + for (i=0; i<nvmlh->nvml_gpucount; i++) { + nvmlh->nvml_cuda_device_id[i] = -1; + } + for (i=0; i<nvmlh->cuda_gpucount; i++) { + cudaDeviceProp props; + nvmlh->cuda_nvml_device_id[i] = -1; + + if (cudaGetDeviceProperties(&props, i) == cudaSuccess) { + int j; + for (j=0; j<nvmlh->nvml_gpucount; j++) { + if ((nvmlh->nvml_pci_domain_id[j] == (uint32_t) props.pciDomainID) && + (nvmlh->nvml_pci_bus_id[j] == (uint32_t) props.pciBusID) && + (nvmlh->nvml_pci_device_id[j] == (uint32_t) props.pciDeviceID)) { + if (opt_debug) + applog(LOG_DEBUG, "CUDA GPU[%d] matches NVML GPU[%d]", i, j); + nvmlh->nvml_cuda_device_id[j] = i; + nvmlh->cuda_nvml_device_id[i] = j; + } + } + } + } + + return nvmlh; +} + +int wrap_nvml_get_gpucount(wrap_nvml_handle *nvmlh, int *gpucount) +{ + *gpucount = nvmlh->nvml_gpucount; + return 0; +} + +int wrap_cuda_get_gpucount(wrap_nvml_handle *nvmlh, int *gpucount) +{ + *gpucount = nvmlh->cuda_gpucount; + return 0; +} + +int wrap_nvml_get_gpu_name(wrap_nvml_handle *nvmlh, int cudaindex, char *namebuf, int bufsize) +{ + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -1; + + if (nvmlh->nvmlDeviceGetName(nvmlh->devs[gpuindex], namebuf, bufsize) != WRAPNVML_SUCCESS) + return -1; + + return 0; +} + + +int wrap_nvml_get_tempC(wrap_nvml_handle *nvmlh, 
int cudaindex, unsigned int *tempC) +{ + wrap_nvmlReturn_t rc; + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -1; + + rc = nvmlh->nvmlDeviceGetTemperature(nvmlh->devs[gpuindex], 0u /* NVML_TEMPERATURE_GPU */, tempC); + if (rc != WRAPNVML_SUCCESS) { + return -1; + } + + return 0; +} + + +int wrap_nvml_get_fanpcnt(wrap_nvml_handle *nvmlh, int cudaindex, unsigned int *fanpcnt) +{ + wrap_nvmlReturn_t rc; + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -1; + + rc = nvmlh->nvmlDeviceGetFanSpeed(nvmlh->devs[gpuindex], fanpcnt); + if (rc != WRAPNVML_SUCCESS) { + return -1; + } + + return 0; +} + +/* Not Supported on 750Ti 340.23 */ +int wrap_nvml_get_clock(wrap_nvml_handle *nvmlh, int cudaindex, int type, unsigned int *freq) +{ + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -1; + + wrap_nvmlReturn_t res = nvmlh->nvmlDeviceGetClockInfo(nvmlh->devs[gpuindex], (wrap_nvmlClockType_t) type, freq); + if (res != WRAPNVML_SUCCESS) { + if (opt_debug) + applog(LOG_DEBUG, "nvmlDeviceGetClockInfo: %s", nvmlh->nvmlErrorString(res)); + return -1; + } + + return 0; +} + +/* Not Supported on 750Ti 340.23 */ +int wrap_nvml_get_power_usage(wrap_nvml_handle *nvmlh, int cudaindex, unsigned int *milliwatts) +{ + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex < 0 || gpuindex >= nvmlh->nvml_gpucount) + return -1; + + wrap_nvmlReturn_t res = nvmlh->nvmlDeviceGetPowerUsage(nvmlh->devs[gpuindex], milliwatts); + if (res != WRAPNVML_SUCCESS) { + if (opt_debug) + applog(LOG_DEBUG, "nvmlDeviceGetPowerUsage: %s", nvmlh->nvmlErrorString(res)); + return -1; + } + + return 0; +} + +/* Not Supported on 750Ti 340.23 */ +int wrap_nvml_get_pstate(wrap_nvml_handle *nvmlh, int cudaindex, int *pstate) +{ + int gpuindex = nvmlh->cuda_nvml_device_id[cudaindex]; + if (gpuindex 
< 0 || gpuindex >= nvmlh->nvml_gpucount) + return -1; + + wrap_nvmlReturn_t res = nvmlh->nvmlDeviceGetPerformanceState(nvmlh->devs[gpuindex], pstate); + if (res != WRAPNVML_SUCCESS) { + if (opt_debug) + applog(LOG_DEBUG, "nvmlDeviceGetPerformanceState: %s", nvmlh->nvmlErrorString(res)); + return -1; + } + + return 0; +} + +int wrap_nvml_destroy(wrap_nvml_handle *nvmlh) +{ + nvmlh->nvmlShutdown(); + + wrap_dlclose(nvmlh->nvml_dll); + free(nvmlh); + return 0; +} + +/* api functions */ + +extern wrap_nvml_handle *nvmlh; +extern int device_map[8]; + +unsigned int gpu_fanpercent(struct cgpu_info *gpu) +{ + unsigned int pct = 0; + if (nvmlh) { + wrap_nvml_get_fanpcnt(nvmlh, device_map[gpu->thr_id], &pct); + } + return pct; +} + +double gpu_temp(struct cgpu_info *gpu) +{ + double tc = 0.0; + if (nvmlh) { + unsigned int tmp = 0; + wrap_nvml_get_tempC(nvmlh, device_map[gpu->thr_id], &tmp); + tc = (double) tmp; + } + return tc; +} + +unsigned int gpu_clock(struct cgpu_info *gpu) +{ + unsigned int freq = 0; + if (nvmlh) { + wrap_nvml_get_clock(nvmlh, device_map[gpu->thr_id], NVML_CLOCK_SM, &freq); + } + return freq; +} + +unsigned int gpu_power(struct cgpu_info *gpu) +{ + unsigned int mw = 0; + if (nvmlh) { + wrap_nvml_get_power_usage(nvmlh, device_map[gpu->thr_id], &mw); + } + return mw; +} + +int gpu_pstate(struct cgpu_info *gpu) +{ + int pstate = 0; + if (nvmlh) { + wrap_nvml_get_pstate(nvmlh, device_map[gpu->thr_id], &pstate); + //gpu->gpu_pstate = pstate; + } + return pstate; +} + +#if defined(__cplusplus) +} +#endif + +#endif /* USE_WRAPNVML */ + +/* strings /usr/lib/nvidia-340/libnvidia-ml.so | grep nvmlDeviceGet | grep -v : | sort | uniq + + nvmlDeviceGetAccountingBufferSize + nvmlDeviceGetAccountingMode + nvmlDeviceGetAccountingPids + nvmlDeviceGetAccountingStats + nvmlDeviceGetAPIRestriction + nvmlDeviceGetApplicationsClock + nvmlDeviceGetAutoBoostedClocksEnabled + nvmlDeviceGetBAR1MemoryInfo + nvmlDeviceGetBoardId + nvmlDeviceGetBrand + nvmlDeviceGetBridgeChipInfo 
+* nvmlDeviceGetClockInfo + nvmlDeviceGetComputeMode + nvmlDeviceGetComputeRunningProcesses + nvmlDeviceGetCount + nvmlDeviceGetCount_v2 + nvmlDeviceGetCpuAffinity + nvmlDeviceGetCurrentClocksThrottleReasons + nvmlDeviceGetCurrPcieLinkGeneration + nvmlDeviceGetCurrPcieLinkWidth + nvmlDeviceGetDecoderUtilization + nvmlDeviceGetDefaultApplicationsClock + nvmlDeviceGetDetailedEccErrors + nvmlDeviceGetDisplayActive + nvmlDeviceGetDisplayMode + nvmlDeviceGetDriverModel + nvmlDeviceGetEccMode + nvmlDeviceGetEncoderUtilization + nvmlDeviceGetEnforcedPowerLimit +* nvmlDeviceGetFanSpeed + nvmlDeviceGetGpuOperationMode + nvmlDeviceGetHandleByIndex + nvmlDeviceGetHandleByIndex_v2 + nvmlDeviceGetHandleByPciBusId + nvmlDeviceGetHandleByPciBusId_v2 + nvmlDeviceGetHandleBySerial + nvmlDeviceGetHandleByUUID + nvmlDeviceGetIndex + nvmlDeviceGetInforomConfigurationChecksum + nvmlDeviceGetInforomImageVersion + nvmlDeviceGetInforomVersion + nvmlDeviceGetMaxClockInfo + nvmlDeviceGetMaxPcieLinkGeneration + nvmlDeviceGetMaxPcieLinkWidth + nvmlDeviceGetMemoryErrorCounter + nvmlDeviceGetMemoryInfo + nvmlDeviceGetMinorNumber + nvmlDeviceGetMultiGpuBoard + nvmlDeviceGetName + nvmlDeviceGetPciInfo + nvmlDeviceGetPciInfo_v2 +* nvmlDeviceGetPerformanceState + nvmlDeviceGetPersistenceMode + nvmlDeviceGetPowerManagementDefaultLimit + nvmlDeviceGetPowerManagementLimit + nvmlDeviceGetPowerManagementLimitConstraints + nvmlDeviceGetPowerManagementMode + nvmlDeviceGetPowerState (deprecated) +* nvmlDeviceGetPowerUsage + nvmlDeviceGetRetiredPages + nvmlDeviceGetRetiredPagesPendingStatus + nvmlDeviceGetSamples + nvmlDeviceGetSerial + nvmlDeviceGetSupportedClocksThrottleReasons + nvmlDeviceGetSupportedEventTypes + nvmlDeviceGetSupportedGraphicsClocks + nvmlDeviceGetSupportedMemoryClocks + nvmlDeviceGetTemperature + nvmlDeviceGetTemperatureThreshold + nvmlDeviceGetTotalEccErrors + nvmlDeviceGetUtilizationRates + nvmlDeviceGetUUID + nvmlDeviceGetVbiosVersion + nvmlDeviceGetViolationStatus + +*/ \ No newline 
at end of file diff --git a/nvml.h b/nvml.h new file mode 100644 index 0000000..7f200df --- /dev/null +++ b/nvml.h @@ -0,0 +1,146 @@ +/* + * A trivial little dlopen()-based wrapper library for the + * NVIDIA NVML library, to allow runtime discovery of NVML on an + * arbitrary system. This is all very hackish and simple-minded, but + * it serves my immediate needs in the short term until NVIDIA provides + * a static NVML wrapper library themselves, hopefully in + * CUDA 6.5 or maybe sometime shortly after. + * + * This trivial code is made available under the "new" 3-clause BSD license, + * and/or any of the GPL licenses you prefer. + * Feel free to use the code and modify as you see fit. + * + * John E. Stone - john.stone@gmail.com + * + */ + +#if defined(__cplusplus) +extern "C" { +#endif + +/* + * Ugly hacks to avoid dependencies on the real nvml.h until it starts + * getting included with the CUDA toolkit or a GDK that's got a known + * install location, etc. + */ +typedef enum wrap_nvmlReturn_enum { + WRAPNVML_SUCCESS = 0 +} wrap_nvmlReturn_t; + +typedef void * wrap_nvmlDevice_t; + +/* our own version of the PCI info struct */ +typedef struct { + char bus_id_str[16]; /* string form of bus info */ + unsigned int domain; + unsigned int bus; + unsigned int device; + unsigned int pci_device_id; /* combined device and vendor id */ + unsigned int pci_subsystem_id; + unsigned int res0; /* NVML internal use only */ + unsigned int res1; + unsigned int res2; + unsigned int res3; +} wrap_nvmlPciInfo_t; + +typedef enum nvmlClockType_t { +NVML_CLOCK_GRAPHICS = 0, +NVML_CLOCK_SM = 1, +NVML_CLOCK_MEM = 2 +} wrap_nvmlClockType_t; + +/* + * Handle to hold the function pointers for the entry points we need, + * and the shared library itself. 
+ */ +typedef struct { + void *nvml_dll; + int nvml_gpucount; + int cuda_gpucount; + unsigned int *nvml_pci_domain_id; + unsigned int *nvml_pci_bus_id; + unsigned int *nvml_pci_device_id; + int *nvml_cuda_device_id; /* map NVML dev to CUDA dev */ + int *cuda_nvml_device_id; /* map CUDA dev to NVML dev */ + wrap_nvmlDevice_t *devs; + wrap_nvmlReturn_t (*nvmlInit)(void); + wrap_nvmlReturn_t (*nvmlDeviceGetCount)(int *); + wrap_nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(int, wrap_nvmlDevice_t *); + wrap_nvmlReturn_t (*nvmlDeviceGetClockInfo)(wrap_nvmlDevice_t, wrap_nvmlClockType_t, unsigned int *); + wrap_nvmlReturn_t (*nvmlDeviceGetPciInfo)(wrap_nvmlDevice_t, wrap_nvmlPciInfo_t *); + wrap_nvmlReturn_t (*nvmlDeviceGetName)(wrap_nvmlDevice_t, char *, int); + wrap_nvmlReturn_t (*nvmlDeviceGetTemperature)(wrap_nvmlDevice_t, int, unsigned int *); + wrap_nvmlReturn_t (*nvmlDeviceGetFanSpeed)(wrap_nvmlDevice_t, unsigned int *); + wrap_nvmlReturn_t (*nvmlDeviceGetPerformanceState)(wrap_nvmlDevice_t, int *); /* enum */ + wrap_nvmlReturn_t (*nvmlDeviceGetPowerUsage)(wrap_nvmlDevice_t, unsigned int *); + char* (*nvmlErrorString)(wrap_nvmlReturn_t); + wrap_nvmlReturn_t (*nvmlShutdown)(void); +} wrap_nvml_handle; + + +wrap_nvml_handle * wrap_nvml_create(); +int wrap_nvml_destroy(wrap_nvml_handle *nvmlh); + +/* + * Query the number of GPUs seen by NVML + */ +int wrap_nvml_get_gpucount(wrap_nvml_handle *nvmlh, int *gpucount); + +/* + * Query the number of GPUs seen by CUDA + */ +int wrap_cuda_get_gpucount(wrap_nvml_handle *nvmlh, int *gpucount); + + +/* + * query the name of the GPU model from the CUDA device ID + * + */ +int wrap_nvml_get_gpu_name(wrap_nvml_handle *nvmlh, + int gpuindex, + char *namebuf, + int bufsize); + +/* + * Query the current GPU temperature (Celsius), from the CUDA device ID + */ +int wrap_nvml_get_tempC(wrap_nvml_handle *nvmlh, + int gpuindex, unsigned int *tempC); + +/* + * Query the current GPU fan speed (percent) from the CUDA device ID + */ +int 
wrap_nvml_get_fanpcnt(wrap_nvml_handle *nvmlh, + int gpuindex, unsigned int *fanpcnt); + +/* + * Query the current GPU speed from the CUDA device ID + */ +int wrap_nvml_get_clock(wrap_nvml_handle *nvmlh, + int gpuindex, int clktype, unsigned int *freq); + +/* + * Query the current GPU power usage in milliwatts from the CUDA device ID + * + * This feature is only available on recent GPU generations and may be + * limited in some cases only to Tesla series GPUs. + * If the query is run on an unsupported GPU, this routine will return -1. + */ +int wrap_nvml_get_power_usage(wrap_nvml_handle *nvmlh, + int gpuindex, + unsigned int *milliwatts); + +/* api functions */ + +#include "miner.h" + +unsigned int gpu_fanpercent(struct cgpu_info *gpu); +double gpu_temp(struct cgpu_info *gpu); +unsigned int gpu_clock(struct cgpu_info *gpu); +unsigned int gpu_power(struct cgpu_info *gpu); +int gpu_pstate(struct cgpu_info *gpu); + +#if defined(__cplusplus) +} +#endif