ccminer/cuda.cpp

#include <stdio.h>
#include <memory.h>
#include <string.h>
#include <unistd.h>
#include <map>

// include thrust
#ifndef __cplusplus
#include <thrust/version.h>
#include <thrust/remove.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#else
#include <ctype.h>
#endif

#include "miner.h"
#include "nvml.h"

#include "cuda_runtime.h"

#ifdef __cplusplus
/* miner.h functions are declared in C type, not C++ */
extern "C" {
#endif

// Count the CUDA devices on the system.
//
// Terminates the process with exit(1) when the driver version cannot be
// queried, when the driver is older than CUDA 5.5, or when the device count
// cannot be obtained. Otherwise returns the count reported by
// cudaGetDeviceCount().
int cuda_num_devices()
{
	int version;
	cudaError_t err = cudaDriverGetVersion(&version);
	if (err != cudaSuccess)
	{
		applog(LOG_ERR, "Unable to query CUDA driver version! Is an nVidia driver installed?");
		exit(1);
	}

	// version is encoded as 1000 * major + 10 * minor, decode it the same way
	// as the deviceQuery sample (the minor part needs the division by 10)
	const int drv_major = version / 1000;
	const int drv_minor = (version % 100) / 10;
	if (drv_major < 5 || (drv_major == 5 && drv_minor < 5))
	{
		applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", 5, 5);
		exit(1);
	}

	int GPU_N;
	err = cudaGetDeviceCount(&GPU_N);
	if (err != cudaSuccess)
	{
		applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?");
		exit(1);
	}
	return GPU_N;
}

// Report the CUDA runtime version this binary was built with (CUDART_VERSION).
int cuda_version()
{
	const int runtime_version = (int) CUDART_VERSION;
	return runtime_version;
}

void cuda_devicenames()
{
	cudaError_t err;
	int GPU_N;
	err = cudaGetDeviceCount(&GPU_N);
	if (err != cudaSuccess)
	{
		applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?");
		exit(1);
	}

	if (opt_n_threads)
		GPU_N = min(MAX_GPUS, opt_n_threads);
	for (int i=0; i < GPU_N; i++)
	{
		char vendorname[32] = { 0 };
		int dev_id = device_map[i];
		cudaDeviceProp props;
		cudaGetDeviceProperties(&props, dev_id);

		device_sm[dev_id] = (props.major * 100 + props.minor * 10);

		if (device_name[dev_id]) {
			free(device_name[dev_id]);
			device_name[dev_id] = NULL;
		}
#ifdef USE_WRAPNVML
		if (gpu_vendor((uint8_t)props.pciBusID, vendorname) > 0 && strlen(vendorname)) {
			device_name[dev_id] = (char*) calloc(1, strlen(vendorname) + strlen(props.name) + 2);
			if (!strncmp(props.name, "GeForce ", 8))
				sprintf(device_name[dev_id], "%s %s", vendorname, &props.name[8]);
			else
				sprintf(device_name[dev_id], "%s %s", vendorname, props.name);
		} else
#endif
			device_name[dev_id] = strdup(props.name);
	}
}

void cuda_print_devices()
{
	int ngpus = cuda_num_devices();
	cuda_devicenames();
	for (int n=0; n < ngpus; n++) {
		int dev_id = device_map[n % MAX_GPUS];
		cudaDeviceProp props;
		cudaGetDeviceProperties(&props, dev_id);
		if (!opt_n_threads || n < opt_n_threads) {
			fprintf(stderr, "GPU #%d: SM %d.%d %s @ %.0f MHz (MEM %.0f)\n", dev_id, props.major, props.minor,
				device_name[dev_id], (double) props.clockRate/1000, (double) props.memoryClockRate/1000);
#ifdef USE_WRAPNVML
			if (opt_debug) nvml_print_device_info(dev_id);
#ifdef WIN32
			if (opt_debug) {
				unsigned int devNum = nvapi_devnum(dev_id);
				nvapi_pstateinfo(devNum);
			}
#endif
#endif
		}
	}
}

// Reset the current CUDA device; pending work is waited for first, but only
// when thr_info has been set up (the synchronize call requires gpu init).
void cuda_shutdown()
{
	const bool gpu_initialized = (thr_info != NULL);
	if (gpu_initialized) {
		cudaDeviceSynchronize();
	}
	cudaDeviceReset();
}

// Case-insensitive search of needle inside haystack, ignoring spaces in both.
//
// A '#' in needle ends the comparison: once the text before it has matched,
// match is incremented and the result is whether the new count equals the
// single digit following the '#' (e.g. "gtx #2" accepts the second name that
// matches "gtx" across successive calls sharing the same counter).
//
// haystack / needle : NUL-terminated strings
// match             : running counter, only modified when a '#' is reached
// Returns true on a match, false otherwise.
static bool substringsearch(const char *haystack, const char *needle, int &match)
{
	const int hlen = (int) strlen(haystack);
	const int nlen = (int) strlen(needle);
	for (int i=0; i < hlen; ++i)
	{
		// a match never starts on a space
		if (haystack[i] == ' ') continue;
		int j=0, x = 0;
		while(j < nlen)
		{
			// haystack[i+x] never runs past the terminating NUL: a NUL is not
			// a space and cannot equal a needle character, so x stops there
			if (haystack[i+x] == ' ') {++x; continue;}
			if (needle[j] == ' ') {++j; continue;}
			if (needle[j] == '#') return ++match == needle[j+1]-'0';
			// tolower() is undefined for negative values, so convert the
			// (possibly signed) chars through unsigned char first
			if (tolower((unsigned char) haystack[i+x]) != tolower((unsigned char) needle[j])) break;
			++j; ++x;
		}
		if (j == nlen) return true;
	}
	return false;
}

// Find a CUDA device by name (returns the device index, or -1 if none matches).
// Devices whose properties cannot be queried are skipped.
int cuda_finddevice(char *name)
{
	const int device_count = cuda_num_devices();
	int match = 0; // shared across devices for the "#n" selector of substringsearch()
	int dev = 0;
	while (dev < device_count)
	{
		cudaDeviceProp props;
		const bool queried = (cudaGetDeviceProperties(&props, dev) == cudaSuccess);
		if (queried && substringsearch(props.name, name, match))
			return dev;
		++dev;
	}
	return -1;
}

// since 1.7
// Select the throughput for a thread: the configured gpus_intensity[thr_id]
// when it is non-zero, defcount otherwise. When more than one gpu thread is
// used and the value equals defcount, it is divided by (gpu_threads - 1).
// The result is also forwarded to the api when api_thr_id is set.
uint32_t cuda_default_throughput(int thr_id, uint32_t defcount)
{
	uint32_t throughput = defcount;
	if (gpus_intensity[thr_id])
		throughput = gpus_intensity[thr_id];

	const bool several_threads = (gpu_threads > 1);
	if (several_threads && throughput == defcount)
		throughput /= (gpu_threads-1);

	if (api_thr_id != -1)
		api_set_throughput(thr_id, throughput);

	return throughput;
}

// since 1.8.3
// Convert a throughput into an intensity value: the integer part is the
// position of the highest set bit (floor(log2)), the fractional part is the
// remainder relative to that power of two. 0 and 1 both give 0.
double throughput2intensity(uint32_t throughput)
{
	// count how many times the value can be halved before reaching 1
	uint8_t exponent = 0;
	for (uint32_t rest = throughput; rest > 1 && exponent < 32; rest >>= 1)
		++exponent;

	if (exponent == 0)
		return 0.;

	const uint32_t base = 1U << exponent;
	double intensity = (double) exponent;
	if (base < throughput)
		intensity += (double) (throughput - base) / base;
	return intensity;
}

// Reset the device mapped to thr_id and re-apply the scheduling flags.
// If two threads share the same gpu, all of them need to be re-initialized:
// when an init array is given, every entry mapped to that device is cleared.
void cuda_reset_device(int thr_id, bool *init)
{
	const int dev_id = device_map[thr_id % MAX_GPUS];
	cudaSetDevice(dev_id);

	if (init) {
		// the init array is the one used by the algo's scan code
		for (int slot = 0; slot < MAX_GPUS; ++slot) {
			if (device_map[slot] != dev_id)
				continue;
			init[slot] = false;
		}
		// make the algo scan loops/functions return
		restart_threads();
		cudaDeviceSynchronize();
		// poll the default stream until nothing is pending anymore
		while (cudaStreamQuery(NULL) == cudaErrorNotReady) {
			usleep(1000);
		}
	}

	cudaDeviceReset();

	// user selected schedule mode when set, blocking sync otherwise
	const unsigned sched_flags = (opt_cudaschedule >= 0)
		? (unsigned)(opt_cudaschedule & cudaDeviceScheduleMask)
		: (unsigned) cudaDeviceScheduleBlockingSync;
	cudaSetDeviceFlags(sched_flags);
	cudaDeviceSynchronize();
}

// return free memory in megabytes
// The device is the one mapped to thr_id. 0 is returned when the query fails
// to fill the free size (the counters start at 0).
int cuda_available_memory(int thr_id)
{
	int dev_id = device_map[thr_id % MAX_GPUS];
#if defined(_WIN32) && defined(USE_WRAPNVML)
	uint64_t tot64 = 0, free64 = 0;
	// cuda (6.5) one can crash on pascal and dont handle 8GB
	nvapiMemGetInfo(dev_id, &free64, &tot64);
	// NOTE(review): the sizes are taken as kB here, like cuda_gpu_info() does
	// with the same call - confirm against the nvapiMemGetInfo wrapper
	return (int) (free64 / 1024);
#else
	size_t mtotal = 0, mfree = 0;
	cudaSetDevice(dev_id);
	cudaDeviceSynchronize();
	// sizes are reported in bytes by the cuda runtime
	cudaMemGetInfo(&mfree, &mtotal);
	return (int) (mfree / (1024 * 1024));
#endif
}

// Check (and reset) last cuda error, and report it in logs
void cuda_log_lasterror(int thr_id, const char* func, int line)
{
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess && !opt_quiet)
		gpulog(LOG_WARNING, thr_id, "%s:%d %s", func, line, cudaGetErrorString(err));
}

// Discard any pending cuda error; callable from non-cuda units (.c/.cpp)
void cuda_clear_lasterror()
{
	// reading the last error resets it, the value itself is not needed
	(void) cudaGetLastError();
}

#ifdef __cplusplus
} /* extern "C" */
#endif

// Fill the clock and memory fields of gpu from the properties of the device
// gpu->gpu_id. Returns 0 on success, -1 when the properties cannot be read
// (gpu is left untouched in that case).
int cuda_gpu_info(struct cgpu_info *gpu)
{
	cudaDeviceProp props;
	if (cudaGetDeviceProperties(&props, gpu->gpu_id) != cudaSuccess)
		return -1;

	gpu->gpu_clock = (uint32_t) props.clockRate;
	gpu->gpu_memclock = (uint32_t) props.memoryClockRate;
	gpu->gpu_mem = (uint64_t) (props.totalGlobalMem / 1024); // kB
#if defined(_WIN32) && defined(USE_WRAPNVML)
	// required to get mem size > 4GB (size_t too small for bytes on 32bit)
	nvapiMemGetInfo(gpu->gpu_id, &gpu->gpu_memfree, &gpu->gpu_mem); // kB
#endif
	gpu->gpu_mem /= 1024; // MB
	return 0;
}

// Time synchronization routine from cudaminer, using CPU sleep.
// Note: if you disable all of these calls, CPU usage will hit 100%
//
// Waits for stream to complete. With situation >= 0, the thread first sleeps
// for 95% of the averaged duration recorded for that (situation, thr_id) pair
// before the blocking synchronize, then updates the average with the measured
// time. With situation < 0, or a thr_id without a slot in tsumarray, it is a
// plain cudaStreamSynchronize(). Returns cudaSuccess right away when
// abort_flag is set.
//
// NOTE(review): the static map is not protected by a lock - confirm how the
// callers serialize their access.
typedef struct { double value[8]; } tsumarray;
cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
{
	cudaError_t result = cudaSuccess;
	if (abort_flag)
		return result;

	// tsumarray only holds a fixed number of per-thread averages: never index
	// outside of it, use the plain synchronization for the other threads
	const int slots = (int) (sizeof(((tsumarray *) 0)->value) / sizeof(double));
	if (situation < 0 || thr_id < 0 || thr_id >= slots)
		return cudaStreamSynchronize(stream);

	static std::map<int, tsumarray> tsum;

	// must be checked before operator[] below creates the entry
	double a = 0.95, b = 0.05;
	if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence

	// a new entry is value-initialized, so the first average is 0
	double &average = tsum[situation].value[thr_id];

	double tsync = 0.0;
	const double tsleep = 0.95 * average;
	if (cudaStreamQuery(stream) == cudaErrorNotReady)
	{
		// sleep through most of the expected wait, then measure the rest
		usleep((useconds_t)(1e6*tsleep));
		struct timeval tv_start, tv_end;
		gettimeofday(&tv_start, NULL);
		result = cudaStreamSynchronize(stream);
		gettimeofday(&tv_end, NULL);
		tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec);
	}
	// exponential moving average of the total wait (sleep + sync)
	if (tsync >= 0) average = a * average + b * (tsleep+tsync);

	return result;
}

// Account a hardware error on the gpu of thread thr_id, log the cuda error
// text prefixed with func, then pause for one second.
void cudaReportHardwareFailure(int thr_id, cudaError_t err, const char* func)
{
	thr_info[thr_id].gpu.hw_errors++;
	gpulog(LOG_ERR, thr_id, "%s %s", func, cudaGetErrorString(err));
	sleep(1);
}
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								#include <stdio.h>
 								#include <memory.h>
 								#include <string.h>
-												cuda: reduce possible segfaults on exit

not perfect but helps...

											
										
										
											10 years ago
+								#include <unistd.h>
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								#include <map>
 								// include thrust
-												Add nvml for GPU monitoring (squashed)

  Based on mwhite73 <marvin.white@gmail.com> implementation

  Linked to the api system

  Also fix Makefile to support standard c++ files
  This prevent nvcc use without device code

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								#ifndef __cplusplus
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								#include <thrust/version.h>
 								#include <thrust/remove.h>
 								#include <thrust/device_vector.h>
 								#include <thrust/iterator/constant_iterator.h>
-												Add nvml for GPU monitoring (squashed)

  Based on mwhite73 <marvin.white@gmail.com> implementation

  Linked to the api system

  Also fix Makefile to support standard c++ files
  This prevent nvcc use without device code

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								#else
 								#include <ctype.h>
 								#endif
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
 								#include "miner.h"
-												nvml: get devices vendor names with libpci

made for linux and require libpci-dev (optional)

if libpci is not installed, card's vendor names are not handled...

Note: only a few vendor names were added, common GeForce vendors.

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								#include "nvml.h"
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
-												Add nvml for GPU monitoring (squashed)

  Based on mwhite73 <marvin.white@gmail.com> implementation

  Linked to the api system

  Also fix Makefile to support standard c++ files
  This prevent nvcc use without device code

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								#include "cuda_runtime.h"
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
-												Fix windows linkage, C/C++ mismatch

											
										
										
											9 years ago
+								#ifdef __cplusplus
 								/* miner.h functions are declared in C type, not C++ */
 								extern "C" {
 								#endif
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								// CUDA Devices on the System
-												ccminer: rename main file and switch to C++

There was a different behavior on linux and visual studio

That was making it hard to link functions correctly

That remove some ifdef / extern "C" requirements

note about x86 releases, x86 nvml.dll is not installed on Windows x64!

											
										
										
											10 years ago
+								int cuda_num_devices()
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								{
 									int version;
 									cudaError_t err = cudaDriverGetVersion(&version);
 									if (err != cudaSuccess)
 									{
 										applog(LOG_ERR, "Unable to query CUDA driver version! Is an nVidia driver installed?");
 										exit(1);
 									}
 									int maj = version / 1000, min = version % 100; // same as in deviceQuery sample
 									if (maj < 5 || (maj == 5 && min < 5))
 									{
 										applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", 5, 5);
 										exit(1);
 									}
 									int GPU_N;
 									err = cudaGetDeviceCount(&GPU_N);
 									if (err != cudaSuccess)
 									{
 										applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?");
 										exit(1);
 									}
 									return GPU_N;
 								}
-												stratum: improve and add new methods for pool benchmark/stats

will allow to submit some device benchmark data to compute algo
power usage and profitability.

Introduce two new methods mining.get_algo and mining_get_stats
These methods will be used with yiimp stratum with a special benchmark option
Note: only the first card is handled for the moment.

also add stratum mining.ping support (like cgminer 4.7.1+) and prevent disconnect
on unknown methods, reply it is unsupported.

											
										
										
											9 years ago
+								int cuda_version()
 								{
 									return (int) CUDART_VERSION;
 								}
-												ccminer: rename main file and switch to C++

There was a different behavior on linux and visual studio

That was making it hard to link functions correctly

That remove some ifdef / extern "C" requirements

note about x86 releases, x86 nvml.dll is not installed on Windows x64!

											
										
										
											10 years ago
+								void cuda_devicenames()
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								{
 									cudaError_t err;
 									int GPU_N;
 									err = cudaGetDeviceCount(&GPU_N);
 									if (err != cudaSuccess)
 									{
 										applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?");
 										exit(1);
 									}
-												benchmark: show mem and default throughput in results

and prepare a new function to get the default intensity

also, take care of multiple threads per gpu...

											
										
										
											9 years ago
+									if (opt_n_threads)
 										GPU_N = min(MAX_GPUS, opt_n_threads);
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+									for (int i=0; i < GPU_N; i++)
 									{
-												nvml: get devices vendor names with libpci

made for linux and require libpci-dev (optional)

if libpci is not installed, card's vendor names are not handled...

Note: only a few vendor names were added, common GeForce vendors.

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+										char vendorname[32] = { 0 };
-												improve the device mapping and a possible segfault

when using -n, work_restart array was not allocated

											
										
										
											9 years ago
+										int dev_id = device_map[i];
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+										cudaDeviceProp props;
-												improve the device mapping and a possible segfault

when using -n, work_restart array was not allocated

											
										
										
											9 years ago
+										cudaGetDeviceProperties(&props, dev_id);
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
-												improve the device mapping and a possible segfault

when using -n, work_restart array was not allocated

											
										
										
											9 years ago
+										device_sm[dev_id] = (props.major * 100 + props.minor * 10);
-												nvml: get devices vendor names with libpci

made for linux and require libpci-dev (optional)

if libpci is not installed, card's vendor names are not handled...

Note: only a few vendor names were added, common GeForce vendors.

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
-												improve the device mapping and a possible segfault

when using -n, work_restart array was not allocated

											
										
										
											9 years ago
+										if (device_name[dev_id]) {
 											free(device_name[dev_id]);
 											device_name[dev_id] = NULL;
-												nvml: get devices vendor names with libpci

made for linux and require libpci-dev (optional)

if libpci is not installed, card's vendor names are not handled...

Note: only a few vendor names were added, common GeForce vendors.

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+										}
-												nvml: add missing ifdef for vendors

											
										
										
											10 years ago
+								#ifdef USE_WRAPNVML
 										if (gpu_vendor((uint8_t)props.pciBusID, vendorname) > 0 && strlen(vendorname)) {
-												improve the device mapping and a possible segfault

when using -n, work_restart array was not allocated

											
										
										
											9 years ago
+											device_name[dev_id] = (char*) calloc(1, strlen(vendorname) + strlen(props.name) + 2);
-												nvml: get devices vendor names with libpci

made for linux and require libpci-dev (optional)

if libpci is not installed, card's vendor names are not handled...

Note: only a few vendor names were added, common GeForce vendors.

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+											if (!strncmp(props.name, "GeForce ", 8))
-												improve the device mapping and a possible segfault

when using -n, work_restart array was not allocated

											
										
										
											9 years ago
+												sprintf(device_name[dev_id], "%s %s", vendorname, &props.name[8]);
-												nvml: get devices vendor names with libpci

made for linux and require libpci-dev (optional)

if libpci is not installed, card's vendor names are not handled...

Note: only a few vendor names were added, common GeForce vendors.

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+											else
-												improve the device mapping and a possible segfault

when using -n, work_restart array was not allocated

											
										
										
											9 years ago
+												sprintf(device_name[dev_id], "%s %s", vendorname, props.name);
-												nvml: get devices vendor names with libpci

made for linux and require libpci-dev (optional)

if libpci is not installed, card's vendor names are not handled...

Note: only a few vendor names were added, common GeForce vendors.

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+										} else
-												nvml: add missing ifdef for vendors

											
										
										
											10 years ago
+								#endif
-												improve the device mapping and a possible segfault

when using -n, work_restart array was not allocated

											
										
										
											9 years ago
+											device_name[dev_id] = strdup(props.name);
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+									}
 								}
-												Add the -n (--ndevs) option like cgminer

											
										
										
											10 years ago
+								void cuda_print_devices()
 								{
 									int ngpus = cuda_num_devices();
-												nvml: get devices vendor names with libpci

made for linux and require libpci-dev (optional)

if libpci is not installed, card's vendor names are not handled...

Note: only a few vendor names were added, common GeForce vendors.

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+									cuda_devicenames();
-												Add the -n (--ndevs) option like cgminer

											
										
										
											10 years ago
+									for (int n=0; n < ngpus; n++) {
-												improve the device mapping and a possible segfault

when using -n, work_restart array was not allocated

											
										
										
											9 years ago
+										int dev_id = device_map[n % MAX_GPUS];
-												Add the -n (--ndevs) option like cgminer

											
										
										
											10 years ago
+										cudaDeviceProp props;
-												improve the device mapping and a possible segfault

when using -n, work_restart array was not allocated

											
										
										
											9 years ago
+										cudaGetDeviceProperties(&props, dev_id);
-												nvml: get devices vendor names with libpci

made for linux and require libpci-dev (optional)

if libpci is not installed, card's vendor names are not handled...

Note: only a few vendor names were added, common GeForce vendors.

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+										if (!opt_n_threads || n < opt_n_threads) {
-												nvapi: pascal core voltage boost + meminfo fix x86

cuda 6.5 one seems to crash on pascal or report invalid mem sizes

											
										
										
											9 years ago
+											fprintf(stderr, "GPU #%d: SM %d.%d %s @ %.0f MHz (MEM %.0f)\n", dev_id, props.major, props.minor,
 												device_name[dev_id], (double) props.clockRate/1000, (double) props.memoryClockRate/1000);
-												nvml: add new getclock api (v8)

+ some new functions to test on pascal (if supported)

											
										
										
											9 years ago
+								#ifdef USE_WRAPNVML
 											if (opt_debug) nvml_print_device_info(dev_id);
-												nvapi: link some more apis, pascal boost table

these informations are shown with ccminer -D -n

											
										
										
											9 years ago
+								#ifdef WIN32
-												nvapi: fix device mapping in -D -n

and rename RVB to RGB, french typo...

											
										
										
											8 years ago
+											if (opt_debug) {
 												unsigned int devNum = nvapi_devnum(dev_id);
 												nvapi_pstateinfo(devNum);
 											}
-												nvapi: link some more apis, pascal boost table

these informations are shown with ccminer -D -n

											
										
										
											9 years ago
+								#endif
-												nvml: add new getclock api (v8)

+ some new functions to test on pascal (if supported)

											
										
										
											9 years ago
+								#endif
-												nvml: get devices vendor names with libpci

made for linux and require libpci-dev (optional)

if libpci is not installed, card's vendor names are not handled...

Note: only a few vendor names were added, common GeForce vendors.

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+										}
-												Add the -n (--ndevs) option like cgminer

											
										
										
											10 years ago
+									}
 								}
-												reset: take care of multi-threaded gpus (-d 0,0)

to be tested... could create problems when reset in a chain like x11...

											
										
										
											10 years ago
+								void cuda_shutdown()
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								{
-												cuda: prevent ptxas crash with -n

											
										
										
											8 years ago
+									// require gpu init first
 									if (thr_info != NULL)
 										cudaDeviceSynchronize();
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+									cudaDeviceReset();
 								}
 								static bool substringsearch(const char *haystack, const char *needle, int &match)
 								{
 									int hlen = (int) strlen(haystack);
 									int nlen = (int) strlen(needle);
 									for (int i=0; i < hlen; ++i)
 									{
 										if (haystack[i] == ' ') continue;
 										int j=0, x = 0;
 										while(j < nlen)
 										{
 											if (haystack[i+x] == ' ') {++x; continue;}
 											if (needle[j] == ' ') {++j; continue;}
 											if (needle[j] == '#') return ++match == needle[j+1]-'0';
 											if (tolower(haystack[i+x]) != tolower(needle[j])) break;
 											++j; ++x;
 										}
 										if (j == nlen) return true;
 									}
 									return false;
 								}
 								// CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1)
-												ccminer: rename main file and switch to C++

There was a different behavior on linux and visual studio

That was making it hard to link functions correctly

That remove some ifdef / extern "C" requirements

note about x86 releases, x86 nvml.dll is not installed on Windows x64!

											
										
										
											10 years ago
+								int cuda_finddevice(char *name)
-												Store and display average hashrate (benchmark + on share)

Displayed data is the average of the last 50 scans in the 5 last minutes

Also move cuda common functions in a new file (cuda.cu)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								{
 									int num = cuda_num_devices();
 									int match = 0;
 									for (int i=0; i < num; ++i)
 									{
 										cudaDeviceProp props;
 										if (cudaGetDeviceProperties(&props, i) == cudaSuccess)
 											if (substringsearch(props.name, name, match)) return i;
 									}
 									return -1;
 								}
-												benchmark: show mem and default throughput in results

and prepare a new function to get the default intensity

also, take care of multiple threads per gpu...

											
										
										
											9 years ago
+								// since 1.7
 								uint32_t cuda_default_throughput(int thr_id, uint32_t defcount)
 								{
 									//int dev_id = device_map[thr_id % MAX_GPUS];
 									uint32_t throughput = gpus_intensity[thr_id] ? gpus_intensity[thr_id] : defcount;
-												intensity: fix typo and drop old function

											
										
										
											9 years ago
+									if (gpu_threads > 1 && throughput == defcount) throughput /= (gpu_threads-1);
-												pool switch: add thr_id param to handle a future barrier

Switching to a pool with a different algo will require a barrier
to free ressources, like what was made in the global benchmark.

add also the algo in pool structure...

											
										
										
											9 years ago
+									if (api_thr_id != -1) api_set_throughput(thr_id, throughput);
-												warn on cuda errors + various small changes

The full benchmark can now be launched with "ccminer --benchmark"

add a new helper function which log a warning with last cuda error
(not shown with the quiet option) : CUDA_LOG_ERROR();
it can be used where miner.h is included (.c/.cpp/.cu)

fix x14 (in ccminer.cpp), a break was missing in switch..case

											
										
										
											9 years ago
+									//gpulog(LOG_INFO, thr_id, "throughput %u", throughput);
-												Allow different intensity per device

and clean the old variables, no more required

											
										
										
											10 years ago
+									return throughput;
 								}
-												cuda: throughput2intensity function to show default

											
										
										
											8 years ago
+								// since 1.8.3
 								double throughput2intensity(uint32_t throughput)
 								{
 									double intensity = 0.;
 									uint32_t ws = throughput;
 									uint8_t i = 0;
 									while (ws > 1 && i++ < 32)
 										ws = ws >> 1;
 									intensity = (double) i;
 									if (i && ((1U << i) < throughput)) {
 										intensity += ((double) (throughput-(1U << i)) / (1U << i));
 									}
 									return intensity;
 								}
-												reset: take care of multi-threaded gpus (-d 0,0)

to be tested... could create problems when reset in a chain like x11...

											
										
										
											10 years ago
+								// if we use 2 threads on the same gpu, we need to reinit the threads
 								void cuda_reset_device(int thr_id, bool *init)
 								{
-												benchmark: allow -a auto to bench all algos at once

											
										
										
											9 years ago
+									int dev_id = device_map[thr_id % MAX_GPUS];
-												cuda: reduce possible segfaults on exit

not perfect but helps...

											
										
										
											10 years ago
+									cudaSetDevice(dev_id);
 									if (init != NULL) {
 										// with init array, its meant to be used in algo's scan code...
 										for (int i=0; i < MAX_GPUS; i++) {
 											if (device_map[i] == dev_id) {
 												init[i] = false;
 											}
-												reset: take care of multi-threaded gpus (-d 0,0)

to be tested... could create problems when reset in a chain like x11...

											
										
										
											10 years ago
+										}
-												cuda: reduce possible segfaults on exit

not perfect but helps...

											
										
										
											10 years ago
+										// force exit from algo's scan loops/function
 										restart_threads();
 										cudaDeviceSynchronize();
 										while (cudaStreamQuery(NULL) == cudaErrorNotReady)
 											usleep(1000);
-												reset: take care of multi-threaded gpus (-d 0,0)

to be tested... could create problems when reset in a chain like x11...

											
										
										
											10 years ago
+									}
 									cudaDeviceReset();
-												refactor: create bench.cpp and algos.h

Also enhance multi-thread benchmark synchro. with pthread barriers

											
										
										
											9 years ago
+									if (opt_cudaschedule >= 0) {
-												Add a new cuda-schedule parameter

0: cudaDeviceScheduleAuto
1: cudaDeviceScheduleSpin
2: cudaDeviceScheduleYield
4: cudaDeviceScheduleBlockingSync

Also set the best one (4) for luffa algo by default...

											
										
										
											9 years ago
+										cudaSetDeviceFlags((unsigned)(opt_cudaschedule & cudaDeviceScheduleMask));
-												never interrupt global benchmark with found nonces

fix some algo weird hashrates (like blake)
and reset device between algos, for better accuracy

but this reset doesnt seems enough to bench all algos correctly...

to test on linux, could be a driver issue...

heavy: fix first alloc and indent with tabs...

											
										
										
											9 years ago
+									} else {
 										cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
-												refactor: create bench.cpp and algos.h

Also enhance multi-thread benchmark synchro. with pthread barriers

											
										
										
											9 years ago
+									}
-												never interrupt global benchmark with found nonces

fix some algo weird hashrates (like blake)
and reset device between algos, for better accuracy

but this reset doesnt seems enough to bench all algos correctly...

to test on linux, could be a driver issue...

heavy: fix first alloc and indent with tabs...

											
										
										
											9 years ago
+									cudaDeviceSynchronize();
-												reset: take care of multi-threaded gpus (-d 0,0)

to be tested... could create problems when reset in a chain like x11...

											
										
										
											10 years ago
+								}
-												benchmark: allow -a auto to bench all algos at once

											
										
										
											9 years ago
+								// return free memory in megabytes
 								int cuda_available_memory(int thr_id)
 								{
 									int dev_id = device_map[thr_id % MAX_GPUS];
-												nvapi: pascal core voltage boost + meminfo fix x86

cuda 6.5 one seems to crash on pascal or report invalid mem sizes

											
										
										
											9 years ago
+								#if defined(_WIN32) && defined(USE_WRAPNVML)
-												nvml: force 64bits types for mem sizes

size_t can be a bit... imprevisible on x86

											
										
										
											8 years ago
+									uint64_t tot64 = 0, free64 = 0;
-												nvapi: pascal core voltage boost + meminfo fix x86

cuda 6.5 one seems to crash on pascal or report invalid mem sizes

											
										
										
											9 years ago
+									// cuda (6.5) one can crash on pascal and dont handle 8GB
-												nvml: force 64bits types for mem sizes

size_t can be a bit... imprevisible on x86

											
										
										
											8 years ago
+									nvapiMemGetInfo(dev_id, &free64, &tot64);
 									return (int) (free64 / (1024 * 1024));
-												nvapi: pascal core voltage boost + meminfo fix x86

cuda 6.5 one seems to crash on pascal or report invalid mem sizes

											
										
										
											9 years ago
+								#else
-												nvml: force 64bits types for mem sizes

size_t can be a bit... imprevisible on x86

											
										
										
											8 years ago
+									size_t mtotal = 0, mfree = 0;
-												benchmark: allow -a auto to bench all algos at once

											
										
										
											9 years ago
+									cudaSetDevice(dev_id);
-												nvapi: link clocks and tlimit to command line

boost clocks and the thermal limit are shared with afterburner
beware with your settings, not as safe as application clocks!

Note: both nvapi and nvml are now used on windows x64
Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											9 years ago
+									cudaDeviceSynchronize();
-												nvapi: pascal core voltage boost + meminfo fix x86

cuda 6.5 one seems to crash on pascal or report invalid mem sizes

											
										
										
											9 years ago
+									cudaMemGetInfo(&mfree, &mtotal);
-												benchmark: allow -a auto to bench all algos at once

											
										
										
											9 years ago
+									return (int) (mfree / (1024 * 1024));
-												nvml: force 64bits types for mem sizes

size_t can be a bit... imprevisible on x86

											
										
										
											8 years ago
+								#endif
-												benchmark: allow -a auto to bench all algos at once

											
										
										
											9 years ago
+								}
-												warn on cuda errors + various small changes

The full benchmark can now be launched with "ccminer --benchmark"

add a new helper function which log a warning with last cuda error
(not shown with the quiet option) : CUDA_LOG_ERROR();
it can be used where miner.h is included (.c/.cpp/.cu)

fix x14 (in ccminer.cpp), a break was missing in switch..case

											
										
										
											9 years ago
+								// Check (and reset) last cuda error, and report it in logs
 								void cuda_log_lasterror(int thr_id, const char* func, int line)
 								{
 									cudaError_t err = cudaGetLastError();
 									if (err != cudaSuccess && !opt_quiet)
 										gpulog(LOG_WARNING, thr_id, "%s:%d %s", func, line, cudaGetErrorString(err));
 								}
-												prepare the 1.7 release

											
										
										
											9 years ago
+								// Clear any cuda error in non-cuda unit (.c/.cpp)
 								// Calls cudaGetLastError() and discards the returned code; intended for
 								// units that cannot call the CUDA runtime themselves (.c/.cpp).
 								void cuda_clear_lasterror()
 								{
 									(void) cudaGetLastError();
 								}
-												Fix windows linkage, C/C++ mismatch

											
										
										
											9 years ago
+								#ifdef __cplusplus
 								} /* extern "C" */
 								#endif
-												api: change unit of device mem to MB

without that, no way to read sizes > 4GB on x86 binaries

											
										
										
											8 years ago
+								int cuda_gpu_info(struct cgpu_info *gpu)
-												Fix windows linkage, C/C++ mismatch

											
										
										
											9 years ago
+								{
 									cudaDeviceProp props;
 									if (cudaGetDeviceProperties(&props, gpu->gpu_id) == cudaSuccess) {
-												nvml: force 64bits types for mem sizes

size_t can be a bit... imprevisible on x86

											
										
										
											8 years ago
+										gpu->gpu_clock = (uint32_t) props.clockRate;
 										gpu->gpu_memclock = (uint32_t) props.memoryClockRate;
 										gpu->gpu_mem = (uint64_t) (props.totalGlobalMem / 1024); // kB
-												api: change unit of device mem to MB

without that, no way to read sizes > 4GB on x86 binaries

											
										
										
											8 years ago
+								#if defined(_WIN32) && defined(USE_WRAPNVML)
 										// required to get mem size > 4GB (size_t too small for bytes on 32bit)
 										nvapiMemGetInfo(gpu->gpu_id, &gpu->gpu_memfree, &gpu->gpu_mem); // kB
 								#endif
 										gpu->gpu_mem = gpu->gpu_mem / 1024; // MB
-												Fix windows linkage, C/C++ mismatch

											
										
										
											9 years ago
+										return 0;
 									}
 									return -1;
 								}
 								// Time-synchronisation routine from cudaminer, with CPU sleep
 								// Note: if you disable all of these calls, CPU usage will hit 100%
 								typedef struct { double value[8]; } tsumarray;

 								// Wait for `stream` to finish, sleeping for most of the expected duration
 								// before the blocking synchronize call.
 								//   stream    : stream to wait on
 								//   situation : key of the running average to use; a negative value skips
 								//               the timing logic and synchronizes directly
 								//   thr_id    : slot of the calling thread inside the per-situation average;
 								//               ids without a slot in tsumarray also synchronize directly
 								// Returns cudaSuccess without waiting when abort_flag is set; otherwise the
 								// result of cudaStreamSynchronize(), or cudaSuccess if the stream was
 								// already idle.
 								// NOTE(review): tsum is a function-local static with no locking visible
 								// here - confirm concurrent callers are acceptable.
 								cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
 								{
 									cudaError_t result = cudaSuccess;
 									if (abort_flag)
 										return result;

 									// number of per-thread slots available in a tsumarray
 									const int slots = (int) (sizeof(((tsumarray*) 0)->value) / sizeof(double));

 									// indexing value[] with a thr_id outside [0, slots) would read and write
 									// out of bounds, so such callers get the plain synchronize path
 									if (situation < 0 || thr_id < 0 || thr_id >= slots)
 										return cudaStreamSynchronize(stream);

 									static std::map<int, tsumarray> tsum;
 									double a = 0.95, b = 0.05;
 									if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence

 									double tsync = 0.0;
 									// operator[] creates a zero-filled entry the first time a situation is seen
 									double tsleep = 0.95 * tsum[situation].value[thr_id];
 									if (cudaStreamQuery(stream) == cudaErrorNotReady)
 									{
 										// sleep through most of the averaged duration, then time the remaining wait
 										usleep((useconds_t)(1e6*tsleep));
 										struct timeval tv_start, tv_end;
 										gettimeofday(&tv_start, NULL);
 										result = cudaStreamSynchronize(stream);
 										gettimeofday(&tv_end, NULL);
 										tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec);
 									}
 									// weighted running average of (sleep + sync) time; skipped if the measured
 									// interval came out negative
 									if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync);
 									return result;
 								}
-												Prepare trap of hardware/mem failures

											
										
										
											10 years ago
+								void cudaReportHardwareFailure(int thr_id, cudaError_t err, const char* func)
 								{
 									struct cgpu_info *gpu = &thr_info[thr_id].gpu;
 									gpu->hw_errors++;
-												add gpulog() function helper, simple and multi-threads

when using multiple cpu threads per gpu, use the T prefix, ex:

[2015-10-11 09:52:49] GPU #0: app clocks set to P0 (3600/1228)
 vs
[2015-10-11 09:52:51] GPU T0: MSI GTX 960, 5953.35 kH/s

Only thr_id is required, the function take care of the dev id

											
										
										
											9 years ago
+									gpulog(LOG_ERR, thr_id, "%s %s", func, cudaGetErrorString(err));
-												Prepare trap of hardware/mem failures

											
										
										
											10 years ago
+									sleep(1);
 								}