Browse Source

benchmark: show mem and default throughput in results

and prepare a new function to get the default intensity

also, take care of multiple threads per gpu...
upstream
Tanguy Pruvot 9 years ago
parent
commit
c6dcc5e5cf
  1. 3
      Algo256/blake256.cu
  2. 4
      README.txt
  3. 2
      api.cpp
  4. 37
      bench.cpp
  5. 25
      ccminer.cpp
  6. 23
      cuda.cpp
  7. 7
      miner.h
  8. 2
      skein.cu

3
Algo256/blake256.cu

@@ -388,7 +388,8 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint64_t targetHigh = ((uint64_t*)ptarget)[3]; uint64_t targetHigh = ((uint64_t*)ptarget)[3];
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20; int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 20;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity, max_nonce - first_nonce); uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
throughput = min(throughput, max_nonce - first_nonce);
int rc = 0; int rc = 0;

4
README.txt

@@ -96,6 +96,7 @@ its command line interface and options.
x14 use to mine X14Coin x14 use to mine X14Coin
x15 use to mine Halcyon x15 use to mine Halcyon
x17 use to mine X17 x17 use to mine X17
whirlpool use to mine Joincoin
whirlpoolx use to mine Vanilla whirlpoolx use to mine Vanilla
zr5 use to mine ZiftrCoin zr5 use to mine ZiftrCoin
@@ -228,6 +229,9 @@ features.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
Under Dev... v1.7 Under Dev... v1.7
Restore whirlpool algo (and whirlcoin variant)
Prepare algo switch ability
Add --benchmark -a auto to run a multi algo benchmark
Add --cuda-schedule parameter Add --cuda-schedule parameter
Add --show-diff parameter, which display shares diff, Add --show-diff parameter, which display shares diff,
and is able to detect real solved blocks on pools. and is able to detect real solved blocks on pools.

2
api.cpp

@@ -990,7 +990,7 @@ void *api_thread(void *userdata)
/* to be able to report the default value set in each algo */ /* to be able to report the default value set in each algo */
void api_set_throughput(int thr_id, uint32_t throughput) void api_set_throughput(int thr_id, uint32_t throughput)
{ {
if (&thr_info[thr_id]) { if (thr_id < MAX_GPUS) {
struct cgpu_info *cgpu = &thr_info[thr_id].gpu; struct cgpu_info *cgpu = &thr_info[thr_id].gpu;
uint32_t ws = throughput; uint32_t ws = throughput;
uint8_t i = 0; uint8_t i = 0;

37
bench.cpp

@@ -11,7 +11,9 @@
int bench_algo = -1; int bench_algo = -1;
static double * algo_hashrates[MAX_GPUS] = { 0 }; static double algo_hashrates[MAX_GPUS][ALGO_COUNT] = { 0 };
static uint32_t algo_throughput[MAX_GPUS][ALGO_COUNT] = { 0 };
static int algo_mem_used[MAX_GPUS][ALGO_COUNT] = { 0 };
static int device_mem_free[MAX_GPUS] = { 0 }; static int device_mem_free[MAX_GPUS] = { 0 };
static pthread_barrier_t miner_barr; static pthread_barrier_t miner_barr;
@@ -25,18 +27,12 @@ void bench_init(int threads)
{ {
bench_algo = opt_algo = (enum sha_algos) 0; /* first */ bench_algo = opt_algo = (enum sha_algos) 0; /* first */
applog(LOG_BLUE, "Starting benchmark mode with %s", algo_names[opt_algo]); applog(LOG_BLUE, "Starting benchmark mode with %s", algo_names[opt_algo]);
for (int n=0; n < MAX_GPUS; n++) {
algo_hashrates[n] = (double*) calloc(1, ALGO_COUNT * sizeof(double));
}
pthread_barrier_init(&miner_barr, NULL, threads); pthread_barrier_init(&miner_barr, NULL, threads);
pthread_barrier_init(&algo_barr, NULL, threads); pthread_barrier_init(&algo_barr, NULL, threads);
} }
void bench_free() void bench_free()
{ {
for (int n=0; n < MAX_GPUS; n++) {
free(algo_hashrates[n]);
}
pthread_barrier_destroy(&miner_barr); pthread_barrier_destroy(&miner_barr);
pthread_barrier_destroy(&algo_barr); pthread_barrier_destroy(&algo_barr);
} }
@@ -47,12 +43,7 @@ bool bench_algo_switch_next(int thr_id)
int algo = (int) opt_algo; int algo = (int) opt_algo;
int prev_algo = algo; int prev_algo = algo;
int dev_id = device_map[thr_id % MAX_GPUS]; int dev_id = device_map[thr_id % MAX_GPUS];
int mfree; int mfree, mused;
char rate[32] = { 0 };
// free current algo memory and track mem usage
miner_free_device(thr_id);
mfree = cuda_available_memory(thr_id);
algo++; algo++;
@@ -70,16 +61,23 @@ bool bench_algo_switch_next(int thr_id)
pthread_barrier_wait(&miner_barr); pthread_barrier_wait(&miner_barr);
} }
char rate[32] = { 0 };
double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]); double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]);
format_hashrate(hashrate, rate); format_hashrate(hashrate, rate);
applog(LOG_NOTICE, "GPU #%d: %s hashrate = %s", dev_id, algo_names[prev_algo], rate); applog(LOG_NOTICE, "GPU #%d: %s hashrate = %s", dev_id, algo_names[prev_algo], rate);
// free current algo memory and track mem usage
mused = cuda_available_memory(thr_id);
miner_free_device(thr_id);
mfree = cuda_available_memory(thr_id);
// check if there is memory leak // check if there is memory leak
if (device_mem_free[thr_id] > mfree) { if (device_mem_free[thr_id] > mfree) {
applog(LOG_WARNING, "GPU #%d, memory leak detected in %s ! %d MB free", applog(LOG_WARNING, "GPU #%d, memory leak detected in %s ! %d MB free",
dev_id, algo_names[prev_algo], mfree); dev_id, algo_names[prev_algo], mfree);
} }
// store used memory per algo
algo_mem_used[thr_id][opt_algo] = device_mem_free[thr_id] - mused;
device_mem_free[thr_id] = mfree; device_mem_free[thr_id] = mfree;
// store to dump a table per gpu later // store to dump a table per gpu later
@@ -109,6 +107,11 @@ bool bench_algo_switch_next(int thr_id)
return true; return true;
} }
// Record the throughput used by this thread for the algo currently being
// benchmarked (opt_algo), so bench_display_results() can print it next to
// the hashrate and memory columns.
void bench_set_throughput(int thr_id, uint32_t throughput)
{
algo_throughput[thr_id][opt_algo] = throughput;
}
void bench_display_results() void bench_display_results()
{ {
for (int n=0; n < opt_n_threads; n++) for (int n=0; n < opt_n_threads; n++)
@@ -118,7 +121,9 @@ void bench_display_results()
for (int i=0; i < ALGO_COUNT-1; i++) { for (int i=0; i < ALGO_COUNT-1; i++) {
double rate = algo_hashrates[n][i]; double rate = algo_hashrates[n][i];
if (rate == 0.0) continue; if (rate == 0.0) continue;
applog(LOG_INFO, "%12s : %12.1f kH/s", algo_names[i], rate / 1024.); applog(LOG_INFO, "%12s : %12.1f kH/s, %5d MB, %8u thr.", algo_names[i],
rate / 1024., algo_mem_used[n][i], algo_throughput[n][i]);
} }
} }
} }

25
ccminer.cpp

@@ -110,6 +110,7 @@ static json_t *opt_config;
static const bool opt_time = true; static const bool opt_time = true;
enum sha_algos opt_algo = ALGO_X11; enum sha_algos opt_algo = ALGO_X11;
int opt_n_threads = 0; int opt_n_threads = 0;
int gpu_threads = 1;
int64_t opt_affinity = -1L; int64_t opt_affinity = -1L;
int opt_priority = 0; int opt_priority = 0;
static double opt_difficulty = 1.; static double opt_difficulty = 1.;
@@ -1483,6 +1484,7 @@ static void *miner_thread(void *userdata)
struct thr_info *mythr = (struct thr_info *)userdata; struct thr_info *mythr = (struct thr_info *)userdata;
int switchn = pool_switch_count; int switchn = pool_switch_count;
int thr_id = mythr->id; int thr_id = mythr->id;
int dev_id = device_map[thr_id % MAX_GPUS];
struct work work; struct work work;
uint64_t loopcnt = 0; uint64_t loopcnt = 0;
uint32_t max_nonce; uint32_t max_nonce;
@@ -1635,7 +1637,7 @@ static void *miner_thread(void *userdata)
// --benchmark [-a auto] // --benchmark [-a auto]
if (opt_benchmark && bench_algo >= 0) { if (opt_benchmark && bench_algo >= 0) {
//applog(LOG_DEBUG, "GPU #%d: loop %d", device_map[thr_id], loopcnt); //applog(LOG_DEBUG, "GPU #%d: loop %d", dev_id, loopcnt);
if (loopcnt >= 3) { if (loopcnt >= 3) {
if (!bench_algo_switch_next(thr_id) && thr_id == 0) if (!bench_algo_switch_next(thr_id) && thr_id == 0)
{ {
@@ -1755,11 +1757,11 @@ static void *miner_thread(void *userdata)
break; break;
case ALGO_KECCAK: case ALGO_KECCAK:
case ALGO_JACKPOT: case ALGO_JACKPOT:
case ALGO_NEOSCRYPT:
case ALGO_X15: case ALGO_X15:
minmax = 0x300000; minmax = 0x300000;
break; break;
case ALGO_LYRA2: case ALGO_LYRA2:
case ALGO_NEOSCRYPT:
case ALGO_SCRYPT: case ALGO_SCRYPT:
minmax = 0x80000; minmax = 0x80000;
break; break;
@@ -1795,7 +1797,7 @@ static void *miner_thread(void *userdata)
if (opt_debug) if (opt_debug)
applog(LOG_DEBUG, "GPU #%d: start=%08x end=%08x range=%08x", applog(LOG_DEBUG, "GPU #%d: start=%08x end=%08x range=%08x",
device_map[thr_id], start_nonce, max_nonce, (max_nonce-start_nonce)); dev_id, start_nonce, max_nonce, (max_nonce-start_nonce));
hashes_done = 0; hashes_done = 0;
gettimeofday(&tv_start, NULL); gettimeofday(&tv_start, NULL);
@@ -1967,7 +1969,7 @@ static void *miner_thread(void *userdata)
work.scanned_to = max_nonce; work.scanned_to = max_nonce;
if (opt_debug && opt_benchmark) { if (opt_debug && opt_benchmark) {
// to debug nonce ranges // to debug nonce ranges
applog(LOG_DEBUG, "GPU #%d: ends=%08x range=%08x", device_map[thr_id], applog(LOG_DEBUG, "GPU #%d: ends=%08x range=%08x", dev_id,
nonceptr[0], (nonceptr[0] - start_nonce)); nonceptr[0], (nonceptr[0] - start_nonce));
} }
} }
@@ -1978,8 +1980,7 @@ static void *miner_thread(void *userdata)
/* output */ /* output */
if (!opt_quiet && firstwork_time) { if (!opt_quiet && firstwork_time) {
format_hashrate(thr_hashrates[thr_id], s); format_hashrate(thr_hashrates[thr_id], s);
applog(LOG_INFO, "GPU #%d: %s, %s", applog(LOG_INFO, "GPU #%d: %s, %s", dev_id, device_name[dev_id], s);
device_map[thr_id], device_name[device_map[thr_id]], s);
} }
/* ignore first loop hashrate */ /* ignore first loop hashrate */
@@ -2835,8 +2836,6 @@ void parse_arg(int key, char *arg)
proper_exit(EXIT_CODE_CUDA_NODEVICE); proper_exit(EXIT_CODE_CUDA_NODEVICE);
} }
} }
// set number of active gpus
active_gpus = opt_n_threads;
pch = strtok (NULL, ","); pch = strtok (NULL, ",");
} }
} }
@@ -3057,8 +3056,11 @@ int main(int argc, char *argv[])
if (num_cpus < 1) if (num_cpus < 1)
num_cpus = 1; num_cpus = 1;
// number of gpus
active_gpus = cuda_num_devices();
for (i = 0; i < MAX_GPUS; i++) { for (i = 0; i < MAX_GPUS; i++) {
device_map[i] = i; device_map[i] = i % active_gpus;
device_name[i] = NULL; device_name[i] = NULL;
device_config[i] = NULL; device_config[i] = NULL;
device_backoff[i] = is_windows() ? 12 : 2; device_backoff[i] = is_windows() ? 12 : 2;
@@ -3070,8 +3072,6 @@ int main(int argc, char *argv[])
device_pstate[i] = -1; device_pstate[i] = -1;
} }
// number of gpus
active_gpus = cuda_num_devices();
cuda_devicenames(); cuda_devicenames();
/* parse command line */ /* parse command line */
@@ -3192,6 +3192,9 @@ int main(int argc, char *argv[])
if (!opt_n_threads) if (!opt_n_threads)
opt_n_threads = active_gpus; opt_n_threads = active_gpus;
// generally doesn't work... let 1
gpu_threads = opt_n_threads / active_gpus;
if (opt_benchmark && opt_algo == ALGO_AUTO) { if (opt_benchmark && opt_algo == ALGO_AUTO) {
bench_init(opt_n_threads); bench_init(opt_n_threads);
for (int n=0; n < MAX_GPUS; n++) { for (int n=0; n < MAX_GPUS; n++) {

23
cuda.cpp

@@ -67,7 +67,8 @@ void cuda_devicenames()
exit(1); exit(1);
} }
GPU_N = min(MAX_GPUS, GPU_N); if (opt_n_threads)
GPU_N = min(MAX_GPUS, opt_n_threads);
for (int i=0; i < GPU_N; i++) for (int i=0; i < GPU_N; i++)
{ {
char vendorname[32] = { 0 }; char vendorname[32] = { 0 };
@@ -98,7 +99,7 @@ void cuda_print_devices()
int ngpus = cuda_num_devices(); int ngpus = cuda_num_devices();
cuda_devicenames(); cuda_devicenames();
for (int n=0; n < ngpus; n++) { for (int n=0; n < ngpus; n++) {
int m = device_map[n]; int m = device_map[n % MAX_GPUS];
cudaDeviceProp props; cudaDeviceProp props;
cudaGetDeviceProperties(&props, m); cudaGetDeviceProperties(&props, m);
if (!opt_n_threads || n < opt_n_threads) { if (!opt_n_threads || n < opt_n_threads) {
@@ -148,10 +149,25 @@ int cuda_finddevice(char *name)
return -1; return -1;
} }
// deprecated since 1.7
// Resolve the throughput (nonce count per launch) for a scan thread:
// the user-set intensity (-i) wins, else the algo's default (defcount).
// The value is reported to the API and benchmark tables.
uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount)
{
	uint32_t throughput = gpus_intensity[thr_id] ? gpus_intensity[thr_id] : defcount;
	// BUG FIX: "throughput >> (gpu_threads-1);" was a statement with no
	// effect (the shift result was discarded). Use ">>=" so the work size
	// is actually halved for each extra thread sharing the same gpu.
	if (gpu_threads > 1) throughput >>= (gpu_threads - 1);
	api_set_throughput(thr_id, throughput);
	bench_set_throughput(thr_id, throughput);
	return throughput;
}
// since 1.7
// Replacement for device_intensity(): same resolution of the per-thread
// throughput (user -i intensity, else the algo default), reported to the
// API and to the benchmark results table.
uint32_t cuda_default_throughput(int thr_id, uint32_t defcount)
{
	//int dev_id = device_map[thr_id % MAX_GPUS];
	uint32_t throughput = gpus_intensity[thr_id] ? gpus_intensity[thr_id] : defcount;
	// BUG FIX: must be ">>=" — the original "throughput >> (gpu_threads-1);"
	// discarded the shifted value, so multiple threads per gpu each kept the
	// full work size instead of sharing it.
	if (gpu_threads > 1) throughput >>= (gpu_threads - 1);
	api_set_throughput(thr_id, throughput);
	bench_set_throughput(thr_id, throughput);
	//if (opt_debug) applog(LOG_DEBUG, "GPU %d-%d: throughput %u", dev_id, thr_id, throughput);
	return throughput;
}
@@ -240,7 +256,8 @@ cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
void cudaReportHardwareFailure(int thr_id, cudaError_t err, const char* func) void cudaReportHardwareFailure(int thr_id, cudaError_t err, const char* func)
{ {
struct cgpu_info *gpu = &thr_info[thr_id].gpu; struct cgpu_info *gpu = &thr_info[thr_id].gpu;
int dev_id = device_map[thr_id % MAX_GPUS];
gpu->hw_errors++; gpu->hw_errors++;
applog(LOG_ERR, "GPU #%d: %s %s", device_map[thr_id], func, cudaGetErrorString(err)); applog(LOG_ERR, "GPU #%d: %s %s", dev_id, func, cudaGetErrorString(err));
sleep(1); sleep(1);
} }

7
miner.h

@@ -447,6 +447,7 @@ extern bool opt_showdiff;
extern bool opt_tracegpu; extern bool opt_tracegpu;
extern int opt_n_threads; extern int opt_n_threads;
extern int active_gpus; extern int active_gpus;
extern int gpu_threads;
extern int opt_timeout; extern int opt_timeout;
extern bool want_longpoll; extern bool want_longpoll;
extern bool have_longpoll; extern bool have_longpoll;
@@ -489,6 +490,9 @@ int cuda_finddevice(char *name);
void cuda_print_devices(); void cuda_print_devices();
int cuda_available_memory(int thr_id); int cuda_available_memory(int thr_id);
uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount);
uint32_t cuda_default_throughput(int thr_id, uint32_t defcount);
#define CL_N "\x1B[0m" #define CL_N "\x1B[0m"
#define CL_RED "\x1B[31m" #define CL_RED "\x1B[31m"
#define CL_GRN "\x1B[32m" #define CL_GRN "\x1B[32m"
@@ -522,6 +526,7 @@ int cuda_available_memory(int thr_id);
extern void format_hashrate(double hashrate, char *output); extern void format_hashrate(double hashrate, char *output);
extern void applog(int prio, const char *fmt, ...); extern void applog(int prio, const char *fmt, ...);
#define gpulog(prio, fmt, thr_id, ...) applog(prio, fmt, thr_id, __VA_ARGS__)
void get_defconfig_path(char *out, size_t bufsize, char *argv0); void get_defconfig_path(char *out, size_t bufsize, char *argv0);
extern void cbin2hex(char *out, const char *in, size_t len); extern void cbin2hex(char *out, const char *in, size_t len);
extern char *bin2hex(const unsigned char *in, size_t len); extern char *bin2hex(const unsigned char *in, size_t len);
@@ -533,7 +538,6 @@ void diff_to_target(uint32_t* target, double diff);
void work_set_target(struct work* work, double diff); void work_set_target(struct work* work, double diff);
double target_to_diff(uint32_t* target); double target_to_diff(uint32_t* target);
extern void get_currentalgo(char* buf, int sz); extern void get_currentalgo(char* buf, int sz);
extern uint32_t device_intensity(int thr_id, const char *func, uint32_t defcount);
// bignum // bignum
double bn_convert_nbits(const uint32_t nbits); double bn_convert_nbits(const uint32_t nbits);
@@ -547,6 +551,7 @@ extern int bench_algo;
void bench_init(int threads); void bench_init(int threads);
void bench_free(); void bench_free();
bool bench_algo_switch_next(int thr_id); bool bench_algo_switch_next(int thr_id);
void bench_set_throughput(int thr_id, uint32_t throughput);
void bench_display_results(); void bench_display_results();

2
skein.cu

@@ -364,7 +364,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
throughput = min(throughput, (max_nonce - first_nonce)); throughput = min(throughput, (max_nonce - first_nonce));
uint32_t foundNonce, secNonce = 0; uint32_t foundNonce, secNonce = 0;
uint64_t target64; uint64_t target64 = 0;
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x03; ((uint32_t*)ptarget)[7] = 0x03;

Loading…
Cancel
Save