diff --git a/ccminer.cpp b/ccminer.cpp index 8db1685..fe99248 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -128,6 +128,7 @@ int active_gpus; char * device_name[MAX_GPUS]; short device_map[MAX_GPUS] = { 0 }; long device_sm[MAX_GPUS] = { 0 }; +short device_mpcount[MAX_GPUS] = { 0 }; uint32_t gpus_intensity[MAX_GPUS] = { 0 }; uint32_t device_gpu_clocks[MAX_GPUS] = { 0 }; uint32_t device_mem_clocks[MAX_GPUS] = { 0 }; diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu index 5d5b1c6..5a78639 100644 --- a/crypto/cryptonight.cu +++ b/crypto/cryptonight.cu @@ -3,8 +3,13 @@ extern char *device_config[MAX_GPUS]; // -l 32x16 -static __thread uint32_t cn_blocks = 32; -static __thread uint32_t cn_threads = 16; +static __thread uint32_t cn_blocks; +static __thread uint32_t cn_threads; + +// used for gpu intensity on algo init +static __thread bool gpu_init_shown = false; +#define gpulog_init(p,thr,fmt, ...) if (!gpu_init_shown) \ + gpulog(p, thr, fmt, ##__VA_ARGS__) static uint32_t *d_long_state[MAX_GPUS]; static uint64_t *d_ctx_state[MAX_GPUS]; @@ -26,6 +31,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ uint32_t *nonceptr = (uint32_t*) (&pdata[39]); const uint32_t first_nonce = *nonceptr; uint32_t nonce = first_nonce; + int dev_id = device_map[thr_id]; if(opt_benchmark) { ptarget[7] = 0x00ff; @@ -33,19 +39,29 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ if(!init[thr_id]) { + int mem = cuda_available_memory(thr_id); + int mul = device_sm[dev_id] >= 300 ? 4 : 1; // see cryptonight-core.cu + cn_threads = device_sm[dev_id] >= 600 ? 16 : 8; // real TPB is x4 on SM3+ + cn_blocks = device_mpcount[dev_id] * 4; + if (cn_blocks*cn_threads*2.2 > mem) cn_blocks = device_mpcount[dev_id] * 2; + + if (!opt_quiet) + gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id], + mem, device_mpcount[dev_id]); + if (device_config[thr_id]) { - sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads); + int res = sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads); throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); - gpulog(LOG_INFO, thr_id, "Using %u x %u kernel launch config, %u threads", - cn_blocks, cn_threads, throughput); + gpulog_init(LOG_INFO, thr_id, "Using %ux%u(x%d) kernel launch config, %u threads", + cn_blocks, cn_threads, mul, throughput); } else { throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); if (throughput != cn_blocks*cn_threads && cn_threads) { cn_blocks = throughput / cn_threads; throughput = cn_threads * cn_blocks; } - gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u threads (%ux%u)", - throughput2intensity(throughput), throughput, cn_blocks, cn_threads); + gpulog_init(LOG_INFO, thr_id, "%u threads (%g) with %u blocks",// of %ux%d", + throughput, throughput2intensity(throughput), cn_blocks);//, cn_threads, mul); } if(sizeof(size_t) == 4 && throughput > UINT32_MAX / MEMORY) { @@ -67,7 +83,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ cudaMalloc(&d_long_state[thr_id], alloc); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 200 is aligned 8, not 16 + cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 52*4 (200 is not aligned 16) exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); @@ -80,6 +96,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + gpu_init_shown = true; init[thr_id] = true; } diff --git a/cuda.cpp b/cuda.cpp index a2d19c4..9b65adc 100644 --- a/cuda.cpp +++ b/cuda.cpp @@ -78,6 +78,7 @@ void cuda_devicenames() cudaGetDeviceProperties(&props, dev_id); device_sm[dev_id] = (props.major * 100 + props.minor * 10); + device_mpcount[dev_id] = (short) props.multiProcessorCount; if (device_name[dev_id]) { free(device_name[dev_id]); @@ -105,8 +106,10 @@ void cuda_print_devices() cudaDeviceProp props; cudaGetDeviceProperties(&props, dev_id); if (!opt_n_threads || n < opt_n_threads) { - fprintf(stderr, "GPU #%d: SM %d.%d %s @ %.0f MHz (MEM %.0f)\n", dev_id, props.major, props.minor, - device_name[dev_id], (double) props.clockRate/1000, (double) props.memoryClockRate/1000); + fprintf(stderr, "GPU #%d: SM %d.%d %s @ %.0f MHz (MEM %.0f)\n", dev_id, + props.major, props.minor, device_name[dev_id], + (double) props.clockRate/1000, + (double) props.memoryClockRate/1000); #ifdef USE_WRAPNVML if (opt_debug) nvml_print_device_info(dev_id); #ifdef WIN32 @@ -224,7 +227,7 @@ int cuda_available_memory(int thr_id) uint64_t tot64 = 0, free64 = 0; // cuda (6.5) one can crash on pascal and dont handle 8GB nvapiMemGetInfo(dev_id, &free64, &tot64); - return (int) (free64 / (1024 * 1024)); + return (int) (free64 / (1024)); #else size_t mtotal = 0, mfree = 0; cudaSetDevice(dev_id); diff --git a/cuda_helper.h b/cuda_helper.h index da0fc2a..5d6b6cd 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -25,7 +25,7 @@ extern "C" short device_map[MAX_GPUS]; extern "C" long device_sm[MAX_GPUS]; - +extern short device_mpcount[MAX_GPUS]; extern int cuda_arch[MAX_GPUS]; // common functions diff --git a/miner.h b/miner.h index cd1ae92..9446760 100644 --- a/miner.h +++ b/miner.h @@ -519,6 +519,7 @@ extern double stratum_diff; //#define MAX_THREADS 32 todo extern char* device_name[MAX_GPUS]; extern short device_map[MAX_GPUS]; +extern short device_mpcount[MAX_GPUS]; extern long device_sm[MAX_GPUS]; extern uint32_t gpus_intensity[MAX_GPUS]; extern int opt_cudaschedule; @@ -576,6 +577,7 @@ void cuda_clear_lasterror(); extern void format_hashrate(double hashrate, char *output); extern void applog(int prio, const char *fmt, ...); extern void gpulog(int prio, int thr_id, const char *fmt, ...); + void get_defconfig_path(char *out, size_t bufsize, char *argv0); extern void cbin2hex(char *out, const char *in, size_t len); extern char *bin2hex(const unsigned char *in, size_t len);