nvapi: fix mapping of devices

10 years ago · 124dc6ea57
3 changed files with 41 additions and 27 deletions
--- a/api.cpp
+++ b/api.cpp
@@ -118,8 +118,8 @@ static void gpustatus(int thr_id)
 {
 	if (thr_id >= 0 && thr_id < gpu_threads) {
 		struct cgpu_info *cgpu = &thr_info[thr_id].gpu;
-		char buf[512];
-		char pstate[4];
+		char buf[512]; *buf = '\0';
+		char pstate[8]; *pstate = '\0';

 		cgpu->thr_id = thr_id;

@@ -148,12 +148,12 @@ static void gpustatus(int thr_id)

 		cgpu->khashes = stats_get_speed(thr_id, 0.0) / 1000.0;

-		sprintf(pstate, "P%u", cgpu->gpu_pstate);
+		snprintf(pstate, sizeof(pstate), "P%u", cgpu->gpu_pstate);
 		if (cgpu->gpu_pstate == -1)
-			sprintf(pstate, "");
+			*pstate= '\0';

-		sprintf(buf, "GPU=%d;TEMP=%.1f;FAN=%d;FREQ=%d;PST=%s;KHS=%.2f;"
-			"HWF=%d;I=%d|",
+		snprintf(buf, sizeof(buf), "GPU=%d;TEMP=%.1f;FAN=%d;FREQ=%d;"
+			"PST=%s;KHS=%.2f;HWF=%d;I=%d|",
 			thr_id, cgpu->gpu_temp, cgpu->gpu_fan,
 			cgpu->gpu_clock, pstate, cgpu->khashes,
 			cgpu->hw_errors, cgpu->intensity);
@@ -170,7 +170,7 @@ static void gpustatus(int thr_id)
 */
 static char *getsummary(char *params)
 {
-	char algo[64] = "";
+	char algo[64]; *algo = '\0';
 	time_t ts = time(NULL);
 	double uptime = difftime(ts, startup);
 	double accps = (60.0 * accepted_count) / (uptime ? uptime : 1.0);
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -997,7 +997,7 @@ static void *miner_thread(void *userdata)
 	 * of the number of CPUs */
 	if (num_processors > 1 && opt_n_threads % num_processors == 0) {
 		if (!opt_quiet)
-			applog(LOG_DEBUG, "Binding thread %d to cpu %d", thr_id,
+			applog(LOG_DEBUG, "Binding thread %d to gpu %d", thr_id,
 					thr_id % num_processors);
 		affine_to_cpu(thr_id, thr_id % num_processors);
 	}
@@ -2064,8 +2064,7 @@ int main(int argc, char *argv[])
 	SetConsoleCtrlHandler((PHANDLER_ROUTINE)ConsoleHandler, TRUE);
 #endif

-	if (num_processors == 0)
-	{
+	if (num_processors == 0) {
 		applog(LOG_ERR, "No CUDA devices found! terminating.");
 		exit(1);
 	}
--- a/nvml.cpp
+++ b/nvml.cpp
@@ -29,7 +29,11 @@
 #include "nvml.h"

 extern wrap_nvml_handle *hnvml;
+extern int num_processors; // gpus
 extern int device_map[8];
+extern char* device_name[8];
+
+static uint32_t device_bus_ids[8] = { 0 };

 /*
 * Wrappers to emulate dlopen() on other systems like Windows
@@ -199,6 +203,7 @@ wrap_nvml_handle * wrap_nvml_create()

 		if (cudaGetDeviceProperties(&props, i) == cudaSuccess) {
 			int j;
+			device_bus_ids[i] = props.pciBusID;
 			for (j = 0; j<nvmlh->nvml_gpucount; j++) {
 				if ((nvmlh->nvml_pci_domain_id[j] == (uint32_t) props.pciDomainID) &&
 				    (nvmlh->nvml_pci_bus_id[j]    == (uint32_t) props.pciBusID) &&
@@ -297,8 +302,8 @@ int wrap_nvml_get_power_usage(wrap_nvml_handle *nvmlh, int cudaindex, unsigned i

 	wrap_nvmlReturn_t res = nvmlh->nvmlDeviceGetPowerUsage(nvmlh->devs[gpuindex], milliwatts);
 	if (res != WRAPNVML_SUCCESS) {
-		//if (opt_debug)
-		//	applog(LOG_DEBUG, "nvmlDeviceGetPowerUsage: %s", nvmlh->nvmlErrorString(res));
+		if (opt_debug)
+			applog(LOG_DEBUG, "nvmlDeviceGetPowerUsage: %s", nvmlh->nvmlErrorString(res));
 		return -1;
 	}

@@ -314,8 +319,8 @@ int wrap_nvml_get_pstate(wrap_nvml_handle *nvmlh, int cudaindex, int *pstate)

 	wrap_nvmlReturn_t res = nvmlh->nvmlDeviceGetPerformanceState(nvmlh->devs[gpuindex], pstate);
 	if (res != WRAPNVML_SUCCESS) {
-		if (opt_debug)
-			applog(LOG_DEBUG, "nvmlDeviceGetPerformanceState: %s", nvmlh->nvmlErrorString(res));
+		//if (opt_debug)
+		//	applog(LOG_DEBUG, "nvmlDeviceGetPerformanceState: %s", nvmlh->nvmlErrorString(res));
 		return -1;
 	}

@@ -457,22 +462,32 @@ int wrap_nvapi_init()
 		return -1;
 	}

-	for (int i = 0; i < 8; i++) {
-		// to fix
-		nvapi_dev_map[i] = i;
-	}
-#if 0
-	NvAPI_ShortString ver;
-	NvAPI_GetInterfaceVersionString(ver);
-	applog(LOG_DEBUG, "NVAPI Version: %s", ver);
-
+	for (NvU8 i = 0; i < nvapi_dev_cnt; i++) {
 		NvAPI_ShortString name;
-	ret = NvAPI_GPU_GetFullName(phys[devNum], name);
-	if (ret != NVAPI_OK){
+		nvapi_dev_map[i] = i; // default mapping
+		ret = NvAPI_GPU_GetFullName(phys[i], name);
+		if (ret == NVAPI_OK) {
+			for (int g = 0; g < num_processors; g++) {
+				//todo : device_bus_ids, could be wrong on rigs
+				if (strcmp(device_name[g], name) == 0 && nvapi_dev_map[i] == i) {
+					nvapi_dev_map[i] = g;
+					break;
+				}
+			}
+			if (opt_debug)
+				applog(LOG_DEBUG, "NVAPI dev %d: %s - mapped to CUDA device %d",
+					i, name, nvapi_dev_map[i]);
+		} else {
 			NvAPI_ShortString string;
 			NvAPI_GetErrorMessage(ret, string);
 			applog(LOG_DEBUG, "NVAPI NvAPI_GPU_GetFullName: %s", string);
 		}
+	}
+
+#if 0
+	NvAPI_ShortString ver;
+	NvAPI_GetInterfaceVersionString(ver);
+	applog(LOG_DEBUG, "NVAPI Version: %s", ver);
 #endif

 	return 0;