diff --git a/ccminer.cpp b/ccminer.cpp
index 6654c3e..051f70e 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -206,6 +206,7 @@ int opt_n_threads = 0;
 static double opt_difficulty = 1; // CH
 bool opt_trust_pool = false;
 uint16_t opt_vote = 9999;
+int num_cpus;
 int num_processors;
 int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
 char *device_name[8]; // CB
@@ -986,13 +987,12 @@ static void *miner_thread(void *userdata)
 		drop_policy();
 	}
 
-	/* Cpu affinity only makes sense if the number of threads is a multiple
-	 * of the number of CPUs */
-	if (num_processors > 1 && opt_n_threads % num_processors == 0) {
+	/* CPU thread affinity: bind each miner thread to a cpu, round-robin (thr_id % num_cpus) */
+	if (num_cpus > 1) {
 		if (!opt_quiet)
-			applog(LOG_DEBUG, "Binding thread %d to gpu %d", thr_id,
-					thr_id % num_processors);
-		affine_to_cpu(thr_id, thr_id % num_processors);
+			applog(LOG_DEBUG, "Binding thread %d to cpu %d", thr_id,
+					thr_id % num_cpus);
+		affine_to_cpu(thr_id, thr_id % num_cpus);
 	}
 
 	while (1) {
@@ -1846,12 +1846,13 @@ static void parse_arg(int key, char *arg)
 		break;
 	case 'd': // CB
 		{
+			int ngpus = cuda_num_devices();
 			char * pch = strtok (arg,",");
 			opt_n_threads = 0;
 			while (pch != NULL) {
 				if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
 				{
-					if (atoi(pch) < num_processors)
+					if (atoi(pch) < ngpus)
 						device_map[opt_n_threads++] = atoi(pch);
 					else {
 						applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch));
@@ -1859,13 +1860,15 @@ static void parse_arg(int key, char *arg)
 					}
 				} else {
 					int device = cuda_finddevice(pch);
-					if (device >= 0 && device < num_processors)
+					if (device >= 0 && device < ngpus)
 						device_map[opt_n_threads++] = device;
 					else {
 						applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch);
 						proper_exit(1);
 					}
 				}
+				// number of active gpus = count of devices accepted from the -d list so far
+				num_processors = opt_n_threads;
 				pch = strtok (NULL, ",");
 			}
 		}
@@ -2026,6 +2029,25 @@ int main(int argc, char *argv[])
 	rpc_pass = strdup("");
 
 	pthread_mutex_init(&applog_lock, NULL);
+
+	// number of cpus for thread affinity
+#if defined(WIN32)
+	SYSTEM_INFO sysinfo;
+	GetSystemInfo(&sysinfo);
+	num_cpus = sysinfo.dwNumberOfProcessors;
+#elif defined(_SC_NPROCESSORS_CONF)
+	num_cpus = sysconf(_SC_NPROCESSORS_CONF);
+#elif defined(CTL_HW) && defined(HW_NCPU)
+	int req[] = { CTL_HW, HW_NCPU };
+	size_t len = sizeof(num_cpus);
+	sysctl(req, 2, &num_cpus, &len, NULL, 0);
+#else
+	num_cpus = 1;
+#endif
+	if (num_cpus < 1)
+		num_cpus = 1;
+
+	// number of gpus
 	num_processors = cuda_num_devices();
 	cuda_devicenames();
 
diff --git a/nvml.cpp b/nvml.cpp
index 33263c9..e13f8f2 100644
--- a/nvml.cpp
+++ b/nvml.cpp
@@ -30,7 +30,6 @@
 #include "nvml.h"
 
 extern wrap_nvml_handle *hnvml;
-extern int num_processors; // gpus
 
 static uint32_t device_bus_ids[8] = { 0 };
 
@@ -531,6 +530,7 @@ int nvapi_getbusid(unsigned int devNum, int *busid)
 
 int wrap_nvapi_init()
 {
+	int num_gpus = cuda_num_devices();
 	NvAPI_Status ret = NvAPI_Initialize();
 	if (!ret == NVAPI_OK){
 		NvAPI_ShortString string;
@@ -549,7 +549,7 @@ int wrap_nvapi_init()
 		return -1;
 	}
 
-	for (int g = 0; g < num_processors; g++) {
+	for (int g = 0; g < num_gpus; g++) {
 		cudaDeviceProp props;
 		if (cudaGetDeviceProperties(&props, g) == cudaSuccess) {
 			device_bus_ids[g] = props.pciBusID;
@@ -561,7 +561,7 @@ int wrap_nvapi_init()
 		NvAPI_ShortString name;
 		ret = NvAPI_GPU_GetFullName(phys[i], name);
 		if (ret == NVAPI_OK) {
-			for (int g = 0; g < num_processors; g++) {
+			for (int g = 0; g < num_gpus; g++) {
 				NvU32 busId;
 				ret = NvAPI_GPU_GetBusId(phys[i], &busId);
 				if (ret == NVAPI_OK && busId == device_bus_ids[g]) {