From dbb9507d2b2d93f8a4e43c35c2d51571918338ba Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sun, 10 Jul 2016 12:59:04 +0200
Subject: [PATCH] api: change unit of device mem to MB

Without this change, 32-bit (x86) binaries cannot report device memory
sizes above 4GB, because size_t is too small to hold the byte count.
---
 api.cpp                     | 9 ++++-----
 cuda.cpp                    | 9 +++++++--
 miner.h                     | 4 +++-
 neoscrypt/cuda_neoscrypt.cu | 2 +-
 nvml.cpp                    | 2 +-
 util.cpp                    | 2 +-
 6 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/api.cpp b/api.cpp
index 457155b..b103df5 100644
--- a/api.cpp
+++ b/api.cpp
@@ -116,6 +116,7 @@ static void gpustatus(int thr_id)
 		char buf[512]; *buf = '\0';
 		char* card;
 
+		cuda_gpu_info(cgpu);
 #ifdef USE_WRAPNVML
 		cgpu->has_monitoring = true;
 		cgpu->gpu_bus = gpu_busid(cgpu);
@@ -124,7 +125,6 @@ static void gpustatus(int thr_id)
 		cgpu->gpu_fan_rpm = (uint16_t) gpu_fanrpm(cgpu);
 		cgpu->gpu_power = gpu_power(cgpu); // mWatts
 #endif
-		cuda_gpu_clocks(cgpu);
 
 		// todo: per gpu
 		cgpu->accepted = p->accepted_count;
@@ -254,6 +254,7 @@ static void gpuhwinfos(int gpu_id)
 	if (cgpu == NULL)
 		return;
 
+	cuda_gpu_info(cgpu);
 #ifdef USE_WRAPNVML
 	cgpu->has_monitoring = true;
 	cgpu->gpu_bus = gpu_busid(cgpu);
@@ -268,18 +269,16 @@ static void gpuhwinfos(int gpu_id)
 #endif
 #endif
 
-	cuda_gpu_clocks(cgpu);
-
 	memset(pstate, 0, sizeof(pstate));
 	if (cgpu->gpu_pstate != -1)
 		snprintf(pstate, sizeof(pstate), "P%d", (int) cgpu->gpu_pstate);
 
 	card = device_name[gpu_id];
 
-	snprintf(buf, sizeof(buf), "GPU=%d;BUS=%hd;CARD=%s;SM=%hu;MEM=%lu;"
+	snprintf(buf, sizeof(buf), "GPU=%d;BUS=%hd;CARD=%s;SM=%hu;MEM=%u;"
 		"TEMP=%.1f;FAN=%hu;RPM=%hu;FREQ=%d;MEMFREQ=%d;PST=%s;POWER=%u;"
 		"VID=%hx;PID=%hx;NVML=%d;NVAPI=%d;SN=%s;BIOS=%s|",
-		gpu_id, cgpu->gpu_bus, card, cgpu->gpu_arch, cgpu->gpu_mem,
+		gpu_id, cgpu->gpu_bus, card, cgpu->gpu_arch, (uint32_t) cgpu->gpu_mem,
 		cgpu->gpu_temp, cgpu->gpu_fan, cgpu->gpu_fan_rpm,
 		cgpu->gpu_clock, cgpu->gpu_memclock,
 		pstate, cgpu->gpu_power,
diff --git a/cuda.cpp b/cuda.cpp
index 79be292..4575214 100644
--- a/cuda.cpp
+++ b/cuda.cpp
@@ -233,13 +233,18 @@ void cuda_clear_lasterror()
 } /* extern "C" */
 #endif
 
-int cuda_gpu_clocks(struct cgpu_info *gpu)
+int cuda_gpu_info(struct cgpu_info *gpu)
 {
 	cudaDeviceProp props;
 	if (cudaGetDeviceProperties(&props, gpu->gpu_id) == cudaSuccess) {
 		gpu->gpu_clock = props.clockRate;
 		gpu->gpu_memclock = props.memoryClockRate;
-		gpu->gpu_mem = props.totalGlobalMem;
+		gpu->gpu_mem = (props.totalGlobalMem / 1024); // kB
+#if defined(_WIN32) && defined(USE_WRAPNVML)
+		// required to get mem size > 4GB (size_t too small for bytes on 32bit)
+		nvapiMemGetInfo(gpu->gpu_id, &gpu->gpu_memfree, &gpu->gpu_mem); // kB
+#endif
+		gpu->gpu_mem = gpu->gpu_mem / 1024; // MB
 		return 0;
 	}
 	return -1;
diff --git a/miner.h b/miner.h
index 2d30aa0..6bbb3a8 100644
--- a/miner.h
+++ b/miner.h
@@ -365,6 +365,7 @@ struct cgpu_info {
 	int gpu_clock;
 	int gpu_memclock;
 	size_t gpu_mem;
+	size_t gpu_memfree;
 	uint32_t gpu_power;
 	double gpu_vddc;
 	int16_t gpu_pstate;
@@ -486,6 +487,7 @@ extern double net_diff;
 extern double stratum_diff;
 
 #define MAX_GPUS 16
+//#define MAX_THREADS 32 todo
 extern char* device_name[MAX_GPUS];
 extern short device_map[MAX_GPUS];
 extern long  device_sm[MAX_GPUS];
@@ -500,7 +502,7 @@ void cuda_shutdown();
 int cuda_finddevice(char *name);
 int cuda_version();
 void cuda_print_devices();
-int cuda_gpu_clocks(struct cgpu_info *gpu);
+int cuda_gpu_info(struct cgpu_info *gpu);
 int cuda_available_memory(int thr_id);
 
 uint32_t cuda_default_throughput(int thr_id, uint32_t defcount);
diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu
index 1fa45e8..42b3382 100644
--- a/neoscrypt/cuda_neoscrypt.cu
+++ b/neoscrypt/cuda_neoscrypt.cu
@@ -752,8 +752,8 @@ void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const
 	rbuf = idx & 3;
 	bitbuf = rbuf << 3;
 
-	for(int i = 0; i<64; i++)
 #if __CUDA_ARCH__ >= 320
+	for(int i = 0; i<64; i++)
 		asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[(qbuf + i) & 0x3f]), "r"(B[(qbuf + i + 1) & 0x3f4]), "r"(bitbuf));
 #endif
 
diff --git a/nvml.cpp b/nvml.cpp
index 2581874..0b32967 100644
--- a/nvml.cpp
+++ b/nvml.cpp
@@ -1483,7 +1483,7 @@ int nvapiMemGetInfo(int dev_id, size_t *free, size_t *total)
 	mem.version = NV_DISPLAY_DRIVER_MEMORY_INFO_VER;
 	unsigned int devNum = nvapi_dev_map[dev_id % MAX_GPUS];
 	if ((ret = NvAPI_GPU_GetMemoryInfo(phys[devNum], &mem)) == NVAPI_OK) {
-		*total = mem.availableDedicatedVideoMemory;
+		*total = mem.dedicatedVideoMemory;// mem.availableDedicatedVideoMemory;
 		*free  = mem.curAvailableDedicatedVideoMemory;
 	}
 	return (int) ret;
diff --git a/util.cpp b/util.cpp
index 1b107ca..e7b305a 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1638,13 +1638,13 @@ static bool stratum_benchdata(json_t *result, json_t *params, int thr_id)
 	strcpy(os, is_windows() ? "win32" : "linux");
 #endif
 
+	cuda_gpu_info(cgpu);
 #ifdef USE_WRAPNVML
 	cgpu->has_monitoring = true;
 	cgpu->gpu_power = gpu_power(cgpu); // mWatts
 	watts = (cgpu->gpu_power >= 1000) ? cgpu->gpu_power / 1000 : 0; // ignore nvapi %
 	gpu_info(cgpu);
 #endif
-	cuda_gpu_clocks(cgpu);
 	get_currentalgo(algo, sizeof(algo));
 
 	card = device_name[dev_id];