diff --git a/api.cpp b/api.cpp
index 44320de..f15f117 100644
--- a/api.cpp
+++ b/api.cpp
@@ -101,8 +101,7 @@ extern int num_cpus;
 extern float cpu_temp(int);
 extern uint32_t cpu_clock(int);
 // cuda.cpp
-int cuda_num_devices();
-int cuda_gpu_clocks(struct cgpu_info *gpu);
+extern int cuda_gpu_clocks(struct cgpu_info *gpu);
 
 char driver_version[32] = { 0 };
 
diff --git a/cuda.cpp b/cuda.cpp
index 64dd2ac..4c8d748 100644
--- a/cuda.cpp
+++ b/cuda.cpp
@@ -23,6 +23,11 @@
 
 #include "cuda_runtime.h"
 
+#ifdef __cplusplus
+/* miner.h functions are declared in C type, not C++ */
+extern "C" {
+#endif
+
 // CUDA Devices on the System
 int cuda_num_devices()
 {
@@ -150,49 +155,6 @@ uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount)
 	return throughput;
 }
 
-// Zeitsynchronisations-Routine von cudaminer mit CPU sleep
-// Note: if you disable all of these calls, CPU usage will hit 100%
-typedef struct { double value[8]; } tsumarray;
-cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
-{
-	cudaError_t result = cudaSuccess;
-	if (situation >= 0)
-	{
-		static std::map<int, tsumarray> tsum;
-
-		double a = 0.95, b = 0.05;
-		if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence
-
-		double tsync = 0.0;
-		double tsleep = 0.95 * tsum[situation].value[thr_id];
-		if (cudaStreamQuery(stream) == cudaErrorNotReady)
-		{
-			usleep((useconds_t)(1e6*tsleep));
-			struct timeval tv_start, tv_end;
-			gettimeofday(&tv_start, NULL);
-			result = cudaStreamSynchronize(stream);
-			gettimeofday(&tv_end, NULL);
-			tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec);
-		}
-		if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync);
-	}
-	else
-		result = cudaStreamSynchronize(stream);
-	return result;
-}
-
-int cuda_gpu_clocks(struct cgpu_info *gpu)
-{
-	cudaDeviceProp props;
-	if (cudaGetDeviceProperties(&props, gpu->gpu_id) == cudaSuccess) {
-		gpu->gpu_clock = props.clockRate;
-		gpu->gpu_memclock = props.memoryClockRate;
-		gpu->gpu_mem = props.totalGlobalMem;
-		return 0;
-	}
-	return -1;
-}
-
 // if we use 2 threads on the same gpu, we need to reinit the threads
 void cuda_reset_device(int thr_id, bool *init)
 {
@@ -228,6 +190,53 @@ int cuda_available_memory(int thr_id)
 	return (int) (mfree / (1024 * 1024));
 }
 
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+int cuda_gpu_clocks(struct cgpu_info *gpu)
+{
+	cudaDeviceProp props;
+	if (cudaGetDeviceProperties(&props, gpu->gpu_id) == cudaSuccess) {
+		gpu->gpu_clock = props.clockRate;
+		gpu->gpu_memclock = props.memoryClockRate;
+		gpu->gpu_mem = props.totalGlobalMem;
+		return 0;
+	}
+	return -1;
+}
+
+// Zeitsynchronisations-Routine von cudaminer mit CPU sleep
+// Note: if you disable all of these calls, CPU usage will hit 100%
+typedef struct { double value[8]; } tsumarray;
+cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
+{
+	cudaError_t result = cudaSuccess;
+	if (situation >= 0)
+	{
+		static std::map<int, tsumarray> tsum;
+
+		double a = 0.95, b = 0.05;
+		if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence
+
+		double tsync = 0.0;
+		double tsleep = 0.95 * tsum[situation].value[thr_id];
+		if (cudaStreamQuery(stream) == cudaErrorNotReady)
+		{
+			usleep((useconds_t)(1e6*tsleep));
+			struct timeval tv_start, tv_end;
+			gettimeofday(&tv_start, NULL);
+			result = cudaStreamSynchronize(stream);
+			gettimeofday(&tv_end, NULL);
+			tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec);
+		}
+		if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync);
+	}
+	else
+		result = cudaStreamSynchronize(stream);
+	return result;
+}
+
 void cudaReportHardwareFailure(int thr_id, cudaError_t err, const char* func)
 {
 	struct cgpu_info *gpu = &thr_info[thr_id].gpu;
diff --git a/miner.h b/miner.h
index 4058b83..a4dc10b 100644
--- a/miner.h
+++ b/miner.h
@@ -39,9 +39,6 @@ extern "C" {
 #  include <malloc.h>
 #  define alloca _alloca
 # elif !defined HAVE_ALLOCA
-#  ifdef  __cplusplus
-extern "C"
-#  endif
 void *alloca (size_t);
 # endif
 #endif
@@ -483,7 +480,7 @@ extern long  device_sm[MAX_GPUS];
 extern uint32_t gpus_intensity[MAX_GPUS];
 extern int opt_cudaschedule;
 
-// decl. from cuda.cpp
+// cuda.cpp
 int cuda_num_devices();
 void cuda_devicenames();
 void cuda_reset_device(int thr_id, bool *init);
diff --git a/nvml.cpp b/nvml.cpp
index 417a86a..2691057 100644
--- a/nvml.cpp
+++ b/nvml.cpp
@@ -25,9 +25,6 @@
 #include "nvml.h"
 #include "cuda_runtime.h"
 
-// cuda.cpp
-int cuda_num_devices();
-
 #ifdef USE_WRAPNVML
 
 extern nvml_handle *hnvml;