
benchmark: show mem and default throughput in results

and prepare a new function to get the default intensity

also, take care of multiple threads per gpu...
Tanguy Pruvot, 9 years ago · branch: master · commit c6dcc5e5cf
  1. Algo256/blake256.cu (3)
  2. README.txt (4)
  3. api.cpp (2)
  4. bench.cpp (37)
  5. ccminer.cpp (25)
  6. cuda.cpp (23)
  7. miner.h (7)
  8. skein.cu (2)

Algo256/blake256.cu (3)

@@ -388,7 +388,8 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
const uint32_t first_nonce = pdata[19];
uint64_t targetHigh = ((uint64_t*)ptarget)[3];
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity, max_nonce - first_nonce);
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
throughput = min(throughput, max_nonce - first_nonce);
int rc = 0;
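The two added lines above are the call pattern the new helper expects from every scanhash: ask cuda_default_throughput() for the configured or default launch size, then clamp to the remaining nonce range in the caller. Since the helper records its return value via api_set_throughput()/bench_set_throughput() (see cuda.cpp below), doing the clamp outside presumably keeps the reported figure tied to the intensity setting instead of a shrinking nonce window. A minimal sketch of the same pattern for a hypothetical algo with a default intensity of 20, not taken from this commit:

uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); /* records the default for api/bench */
throughput = min(throughput, max_nonce - first_nonce);           /* clamp only for this scan call */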

README.txt (4)

@@ -96,6 +96,7 @@ its command line interface and options.
x14 use to mine X14Coin
x15 use to mine Halcyon
x17 use to mine X17
whirlpool use to mine Joincoin
whirlpoolx use to mine Vanilla
zr5 use to mine ZiftrCoin
@@ -228,6 +229,9 @@ features.
>>> RELEASE HISTORY <<<
Under Dev... v1.7
Restore whirlpool algo (and whirlcoin variant)
Prepare algo switch ability
Add --benchmark -a auto to run a multi algo benchmark
Add --cuda-schedule parameter
Add --show-diff parameter, which display shares diff,
and is able to detect real solved blocks on pools.

api.cpp (2)

@@ -990,7 +990,7 @@ void *api_thread(void *userdata)
/* to be able to report the default value set in each algo */
void api_set_throughput(int thr_id, uint32_t throughput)
{
if (&thr_info[thr_id]) {
if (thr_id < MAX_GPUS) {
struct cgpu_info *cgpu = &thr_info[thr_id].gpu;
uint32_t ws = throughput;
uint8_t i = 0;
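A brief note on this hunk: the replaced guard (&thr_info[thr_id]) takes the address of an array element, which is never NULL, so the old test always passed; the bounds check against MAX_GPUS is what actually protects the cgpu lookup that follows.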

bench.cpp (37)

@@ -11,7 +11,9 @@
int bench_algo = -1;
static double * algo_hashrates[MAX_GPUS] = { 0 };
static double algo_hashrates[MAX_GPUS][ALGO_COUNT] = { 0 };
static uint32_t algo_throughput[MAX_GPUS][ALGO_COUNT] = { 0 };
static int algo_mem_used[MAX_GPUS][ALGO_COUNT] = { 0 };
static int device_mem_free[MAX_GPUS] = { 0 };
static pthread_barrier_t miner_barr;
@@ -25,18 +27,12 @@ void bench_init(int threads)
{
bench_algo = opt_algo = (enum sha_algos) 0; /* first */
applog(LOG_BLUE, "Starting benchmark mode with %s", algo_names[opt_algo]);
for (int n=0; n < MAX_GPUS; n++) {
algo_hashrates[n] = (double*) calloc(1, ALGO_COUNT * sizeof(double));
}
pthread_barrier_init(&miner_barr, NULL, threads);
pthread_barrier_init(&algo_barr, NULL, threads);
}
void bench_free()
{
for (int n=0; n < MAX_GPUS; n++) {
free(algo_hashrates[n]);
}
pthread_barrier_destroy(&miner_barr);
pthread_barrier_destroy(&algo_barr);
}
@@ -47,12 +43,7 @@ bool bench_algo_switch_next(int thr_id)
int algo = (int) opt_algo;
int prev_algo = algo;
int dev_id = device_map[thr_id % MAX_GPUS];
int mfree;
char rate[32] = { 0 };
// free current algo memory and track mem usage
miner_free_device(thr_id);
mfree = cuda_available_memory(thr_id);
int mfree, mused;
algo++;
@@ -70,16 +61,23 @@ bool bench_algo_switch_next(int thr_id)
pthread_barrier_wait(&miner_barr);
}
char rate[32] = { 0 };
double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]);
format_hashrate(hashrate, rate);
applog(LOG_NOTICE, "GPU #%d: %s hashrate = %s", dev_id, algo_names[prev_algo], rate);
// free current algo memory and track mem usage
mused = cuda_available_memory(thr_id);
miner_free_device(thr_id);
mfree = cuda_available_memory(thr_id);
// check if there is memory leak
if (device_mem_free[thr_id] > mfree) {
applog(LOG_WARNING, "GPU #%d, memory leak detected in %s ! %d MB free",
dev_id, algo_names[prev_algo], mfree);
dev_id, algo_names[prev_algo], mfree);
}
// store used memory per algo
algo_mem_used[thr_id][opt_algo] = device_mem_free[thr_id] - mused;
device_mem_free[thr_id] = mfree;
// store to dump a table per gpu later
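To make the bookkeeping above concrete, with made-up figures: if device_mem_free[thr_id] held 2040 MB from the previous round, cuda_available_memory() reports 1540 MB while the algo's buffers are still allocated (mused) and 2030 MB after miner_free_device() (mfree), then the algo is charged 2040 - 1540 = 500 MB in algo_mem_used, and because 2030 is below the stored 2040 the memory-leak warning fires for the 10 MB that were not released.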
@@ -109,6 +107,11 @@ bool bench_algo_switch_next(int thr_id)
return true;
}
void bench_set_throughput(int thr_id, uint32_t throughput)
{
algo_throughput[thr_id][opt_algo] = throughput;
}
void bench_display_results()
{
for (int n=0; n < opt_n_threads; n++)
@@ -118,7 +121,9 @@ void bench_display_results()
for (int i=0; i < ALGO_COUNT-1; i++) {
double rate = algo_hashrates[n][i];
if (rate == 0.0) continue;
applog(LOG_INFO, "%12s : %12.1f kH/s", algo_names[i], rate / 1024.);
applog(LOG_INFO, "%12s : %12.1f kH/s, %5d MB, %8u thr.", algo_names[i],
rate / 1024., algo_mem_used[n][i], algo_throughput[n][i]);
}
}
}
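With the two extra columns, each per-GPU line of the final table now shows the hashrate, the memory the algo held, and the launch throughput it ran with; an illustrative line (values invented, not measured) would look like:

   <algo name> :     245678.9 kH/s,   500 MB,  4194304 thr.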

ccminer.cpp (25)

@@ -110,6 +110,7 @@ static json_t *opt_config;
static const bool opt_time = true;
enum sha_algos opt_algo = ALGO_X11;
int opt_n_threads = 0;
int gpu_threads = 1;
int64_t opt_affinity = -1L;
int opt_priority = 0;
static double opt_difficulty = 1.;
@@ -1483,6 +1484,7 @@ static void *miner_thread(void *userdata)
struct thr_info *mythr = (struct thr_info *)userdata;
int switchn = pool_switch_count;
int thr_id = mythr->id;
int dev_id = device_map[thr_id % MAX_GPUS];
struct work work;
uint64_t loopcnt = 0;
uint32_t max_nonce;
@@ -1635,7 +1637,7 @@ static void *miner_thread(void *userdata)
// --benchmark [-a auto]
if (opt_benchmark && bench_algo >= 0) {
//applog(LOG_DEBUG, "GPU #%d: loop %d", device_map[thr_id], loopcnt);
//applog(LOG_DEBUG, "GPU #%d: loop %d", dev_id, loopcnt);
if (loopcnt >= 3) {
if (!bench_algo_switch_next(thr_id) && thr_id == 0)
{
@@ -1755,11 +1757,11 @@ static void *miner_thread(void *userdata)
break;
case ALGO_KECCAK:
case ALGO_JACKPOT:
case ALGO_NEOSCRYPT:
case ALGO_X15:
minmax = 0x300000;
break;
case ALGO_LYRA2:
case ALGO_NEOSCRYPT:
case ALGO_SCRYPT:
minmax = 0x80000;
break;
@@ -1795,7 +1797,7 @@ static void *miner_thread(void *userdata)
if (opt_debug)
applog(LOG_DEBUG, "GPU #%d: start=%08x end=%08x range=%08x",
device_map[thr_id], start_nonce, max_nonce, (max_nonce-start_nonce));
dev_id, start_nonce, max_nonce, (max_nonce-start_nonce));
hashes_done = 0;
gettimeofday(&tv_start, NULL);
@@ -1967,7 +1969,7 @@ static void *miner_thread(void *userdata)
work.scanned_to = max_nonce;
if (opt_debug && opt_benchmark) {
// to debug nonce ranges
applog(LOG_DEBUG, "GPU #%d: ends=%08x range=%08x", device_map[thr_id],
applog(LOG_DEBUG, "GPU #%d: ends=%08x range=%08x", dev_id,
nonceptr[0], (nonceptr[0] - start_nonce));
}
}
@@ -1978,8 +1980,7 @@ static void *miner_thread(void *userdata)
/* output */
if (!opt_quiet && firstwork_time) {
format_hashrate(thr_hashrates[thr_id], s);
applog(LOG_INFO, "GPU #%d: %s, %s",
device_map[thr_id], device_name[device_map[thr_id]], s);
applog(LOG_INFO, "GPU #%d: %s, %s", dev_id, device_name[dev_id], s);
}
/* ignore first loop hashrate */
@@ -2835,8 +2836,6 @@ void parse_arg(int key, char *arg)
proper_exit(EXIT_CODE_CUDA_NODEVICE);
}
}
// set number of active gpus
active_gpus = opt_n_threads;
pch = strtok (NULL, ",");
}
}
@@ -3057,8 +3056,11 @@ int main(int argc, char *argv[])
if (num_cpus < 1)
num_cpus = 1;
// number of gpus
active_gpus = cuda_num_devices();
for (i = 0; i < MAX_GPUS; i++) {
device_map[i] = i;
device_map[i] = i % active_gpus;
device_name[i] = NULL;
device_config[i] = NULL;
device_backoff[i] = is_windows() ? 12 : 2;
@@ -3070,8 +3072,6 @@ int main(int argc, char *argv[])
device_pstate[i] = -1;
}
// number of gpus
active_gpus = cuda_num_devices();
cuda_devicenames();
/* parse command line */
@@ -3192,6 +3192,9 @@ int main(int argc, char *argv[])
if (!opt_n_threads)
opt_n_threads = active_gpus;
// generally doesn't work... let 1
gpu_threads = opt_n_threads / active_gpus;
if (opt_benchmark && opt_algo == ALGO_AUTO) {
bench_init(opt_n_threads);
for (int n=0; n < MAX_GPUS; n++) {
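A quick illustration of the new thread/GPU bookkeeping, assuming a single-GPU rig driven with two mining threads (for instance via -d 0,0): cuda_num_devices() yields active_gpus = 1 and opt_n_threads = 2, so gpu_threads = 2 / 1 = 2 and device_map[i] = i % active_gpus folds every thread index back onto GPU 0.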

cuda.cpp (23)

@@ -67,7 +67,8 @@ void cuda_devicenames()
exit(1);
}
GPU_N = min(MAX_GPUS, GPU_N);
if (opt_n_threads)
GPU_N = min(MAX_GPUS, opt_n_threads);
for (int i=0; i < GPU_N; i++)
{
char vendorname[32] = { 0 };
@@ -98,7 +99,7 @@ void cuda_print_devices()
int ngpus = cuda_num_devices();
cuda_devicenames();
for (int n=0; n < ngpus; n++) {
int m = device_map[n];
int m = device_map[n % MAX_GPUS];
cudaDeviceProp props;
cudaGetDeviceProperties(&props, m);
if (!opt_n_threads || n < opt_n_threads) {
@@ -148,10 +149,25 @@ int cuda_finddevice(char *name)
return -1;
}
// deprecated since 1.7
uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount)
{
uint32_t throughput = gpus_intensity[thr_id] ? gpus_intensity[thr_id] : defcount;
if (gpu_threads > 1) throughput >> (gpu_threads-1);
api_set_throughput(thr_id, throughput);
bench_set_throughput(thr_id, throughput);
return throughput;
}
// since 1.7
uint32_t cuda_default_throughput(int thr_id, uint32_t defcount)
{
//int dev_id = device_map[thr_id % MAX_GPUS];
uint32_t throughput = gpus_intensity[thr_id] ? gpus_intensity[thr_id] : defcount;
if (gpu_threads > 1) throughput >> (gpu_threads-1);
api_set_throughput(thr_id, throughput);
bench_set_throughput(thr_id, throughput);
//if (opt_debug) applog(LOG_DEBUG, "GPU %d-%d: throughput %u", dev_id, thr_id, throughput);
return throughput;
}
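One caveat in both helpers above: the statement throughput >> (gpu_threads-1) computes the shifted value but never stores it, so as committed it has no effect on the launch size. If the intent is to split the default between threads sharing a GPU, halving it for each extra thread, the line would presumably need an assignment, along these lines:

if (gpu_threads > 1)
	throughput >>= (gpu_threads - 1); /* assumption: e.g. two threads per GPU each get half the default */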
@@ -240,7 +256,8 @@ cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
void cudaReportHardwareFailure(int thr_id, cudaError_t err, const char* func)
{
struct cgpu_info *gpu = &thr_info[thr_id].gpu;
int dev_id = device_map[thr_id % MAX_GPUS];
gpu->hw_errors++;
applog(LOG_ERR, "GPU #%d: %s %s", device_map[thr_id], func, cudaGetErrorString(err));
applog(LOG_ERR, "GPU #%d: %s %s", dev_id, func, cudaGetErrorString(err));
sleep(1);
}

miner.h (7)

@@ -447,6 +447,7 @@ extern bool opt_showdiff;
extern bool opt_tracegpu;
extern int opt_n_threads;
extern int active_gpus;
extern int gpu_threads;
extern int opt_timeout;
extern bool want_longpoll;
extern bool have_longpoll;
@@ -489,6 +490,9 @@ int cuda_finddevice(char *name);
void cuda_print_devices();
int cuda_available_memory(int thr_id);
uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount);
uint32_t cuda_default_throughput(int thr_id, uint32_t defcount);
#define CL_N "\x1B[0m"
#define CL_RED "\x1B[31m"
#define CL_GRN "\x1B[32m"
@@ -522,6 +526,7 @@ int cuda_available_memory(int thr_id);
extern void format_hashrate(double hashrate, char *output);
extern void applog(int prio, const char *fmt, ...);
#define gpulog(prio, fmt, thr_id, ...) applog(prio, fmt, thr_id, __VA_ARGS__)
void get_defconfig_path(char *out, size_t bufsize, char *argv0);
extern void cbin2hex(char *out, const char *in, size_t len);
extern char *bin2hex(const unsigned char *in, size_t len);
@@ -533,7 +538,6 @@ void diff_to_target(uint32_t* target, double diff);
void work_set_target(struct work* work, double diff);
double target_to_diff(uint32_t* target);
extern void get_currentalgo(char* buf, int sz);
extern uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount);
// bignum
double bn_convert_nbits(const uint32_t nbits);
@@ -547,6 +551,7 @@ extern int bench_algo;
void bench_init(int threads);
void bench_free();
bool bench_algo_switch_next(int thr_id);
void bench_set_throughput(int thr_id, uint32_t throughput);
void bench_display_results();

skein.cu (2)

@@ -364,7 +364,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
throughput = min(throughput, (max_nonce - first_nonce));
uint32_t foundNonce, secNonce = 0;
uint64_t target64;
uint64_t target64 = 0;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x03;
