|
|
|
/**
|
|
|
|
* Benchmark mode: runs the algorithms one after another to measure them and to test algo switching
|
|
|
|
*
|
|
|
|
* 2015 - tpruvot@github
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <unistd.h>
|
|
|
|
|
|
|
|
#include "miner.h"
|
|
|
|
#include "algos.h"
|
|
|
|
#include <cuda_runtime.h>
|
|
|
|
|
|
|
|
#ifdef __APPLE__
|
|
|
|
#include "compat/pthreads/pthread_barrier.hpp"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
int bench_algo = -1;
|
|
|
|
|
|
|
|
static double algo_hashrates[MAX_GPUS][ALGO_COUNT] = { 0 };
|
|
|
|
static uint32_t algo_throughput[MAX_GPUS][ALGO_COUNT] = { 0 };
|
|
|
|
static int algo_mem_used[MAX_GPUS][ALGO_COUNT] = { 0 };
|
|
|
|
static int device_mem_free[MAX_GPUS] = { 0 };
|
|
|
|
|
|
|
|
static pthread_barrier_t miner_barr;
|
|
|
|
static pthread_barrier_t algo_barr;
|
|
|
|
static pthread_mutex_t bench_lock = PTHREAD_MUTEX_INITIALIZER;
|
|
|
|
|
|
|
|
extern double thr_hashrates[MAX_GPUS];
|
|
|
|
|
|
|
|
void bench_init(int threads)
|
|
|
|
{
|
|
|
|
bench_algo = opt_algo = (enum sha_algos) 0; /* first */
|
|
|
|
applog(LOG_BLUE, "Starting benchmark mode with %s", algo_names[opt_algo]);
|
|
|
|
pthread_barrier_init(&miner_barr, NULL, threads);
|
|
|
|
pthread_barrier_init(&algo_barr, NULL, threads);
|
|
|
|
// required for usage of first algo.
|
|
|
|
for (int n=0; n < opt_n_threads; n++) {
|
|
|
|
device_mem_free[n] = cuda_available_memory(n);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void bench_free()
|
|
|
|
{
|
|
|
|
pthread_barrier_destroy(&miner_barr);
|
|
|
|
pthread_barrier_destroy(&algo_barr);
|
|
|
|
}
|
|
|
|
|
|
|
|
// required to switch algos
|
|
|
|
void algo_free_all(int thr_id)
|
|
|
|
{
|
|
|
|
// only initialized algos will be freed
|
|
|
|
free_bastion(thr_id);
|
|
|
|
free_bitcore(thr_id);
|
|
|
|
free_blake256(thr_id);
|
|
|
|
free_blake2s(thr_id);
|
|
|
|
free_bmw(thr_id);
|
|
|
|
free_c11(thr_id);
|
|
|
|
free_cryptolight(thr_id);
|
|
|
|
free_cryptonight(thr_id);
|
|
|
|
free_decred(thr_id);
|
|
|
|
free_deep(thr_id);
|
|
|
|
free_equihash(thr_id);
|
|
|
|
free_keccak256(thr_id);
|
|
|
|
free_fresh(thr_id);
|
|
|
|
free_fugue256(thr_id);
|
|
|
|
free_groestlcoin(thr_id);
|
|
|
|
free_heavy(thr_id);
|
|
|
|
free_hmq17(thr_id);
|
|
|
|
free_hsr(thr_id);
|
|
|
|
free_jackpot(thr_id);
|
|
|
|
free_jha(thr_id);
|
|
|
|
free_lbry(thr_id);
|
|
|
|
free_luffa(thr_id);
|
|
|
|
free_lyra2(thr_id);
|
|
|
|
free_lyra2v2(thr_id);
|
|
|
|
free_lyra2Z(thr_id);
|
|
|
|
free_myriad(thr_id);
|
|
|
|
free_neoscrypt(thr_id);
|
|
|
|
free_nist5(thr_id);
|
|
|
|
free_pentablake(thr_id);
|
|
|
|
free_phi(thr_id);
|
|
|
|
free_polytimos(thr_id);
|
|
|
|
free_quark(thr_id);
|
|
|
|
free_qubit(thr_id);
|
|
|
|
free_skeincoin(thr_id);
|
|
|
|
free_skein2(thr_id);
|
|
|
|
free_skunk(thr_id);
|
|
|
|
free_sha256d(thr_id);
|
|
|
|
free_sha256t(thr_id);
|
|
|
|
free_sia(thr_id);
|
|
|
|
free_sib(thr_id);
|
|
|
|
free_s3(thr_id);
|
|
|
|
free_vanilla(thr_id);
|
|
|
|
free_veltor(thr_id);
|
|
|
|
free_whirl(thr_id);
|
|
|
|
//free_whirlx(thr_id);
|
|
|
|
free_wildkeccak(thr_id);
|
|
|
|
free_x11evo(thr_id);
|
|
|
|
free_x11(thr_id);
|
|
|
|
free_x13(thr_id);
|
|
|
|
free_x14(thr_id);
|
|
|
|
free_x15(thr_id);
|
|
|
|
free_x17(thr_id);
|
|
|
|
free_zr5(thr_id);
|
|
|
|
free_scrypt(thr_id);
|
|
|
|
free_scrypt_jane(thr_id);
|
|
|
|
free_timetravel(thr_id);
|
|
|
|
free_tribus(thr_id);
|
|
|
|
free_bitcore(thr_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
// benchmark all algos (called once per mining thread)
|
|
|
|
bool bench_algo_switch_next(int thr_id)
|
|
|
|
{
|
|
|
|
int algo = (int) opt_algo;
|
|
|
|
int prev_algo = algo;
|
|
|
|
int dev_id = device_map[thr_id % MAX_GPUS];
|
|
|
|
int mfree, mused;
|
|
|
|
// doesnt seems enough to prevent device slow down
|
|
|
|
// after some algo switchs
|
|
|
|
bool need_reset = (gpu_threads == 1);
|
|
|
|
|
|
|
|
algo++;
|
|
|
|
|
|
|
|
// skip some duplicated algos
|
|
|
|
if (algo == ALGO_C11) algo++; // same as x11
|
|
|
|
if (algo == ALGO_DMD_GR) algo++; // same as groestl
|
|
|
|
if (algo == ALGO_MJOLLNIR) algo++; // same as heavy
|
|
|
|
if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool
|
|
|
|
if (algo == ALGO_WHIRLPOOLX) algo++; // disabled
|
|
|
|
// todo: algo switch from RPC 2.0
|
|
|
|
if (algo == ALGO_CRYPTOLIGHT) algo++;
|
|
|
|
if (algo == ALGO_CRYPTONIGHT) algo++;
|
|
|
|
if (algo == ALGO_WILDKECCAK) algo++;
|
|
|
|
//if (algo == ALGO_JACKPOT) algo++; // to fix
|
|
|
|
if (algo == ALGO_QUARK) algo++; // to fix
|
|
|
|
if (algo == ALGO_LBRY && CUDART_VERSION < 7000) algo++;
|
various fixes for SM 2.1 and the benchmark
X11+ algos and quark are not compatible for the moment
but these ones are :
Benchmark results for Gigabyte GTX 460 (SM 2.1 / 1 GB):
blakecoin : 159090.5 kH/s, 1 MB, 1048576 thr.
blake : 70208.9 kH/s, 1 MB, 1048576 thr.
bmw : 122802.6 kH/s, 65 MB, 2097152 thr.
deep : 3533.6 kH/s, 33 MB, 524288 thr.
fugue256 : 43177.9 kH/s, 17 MB, 524288 thr.
heavy : 4118.2 kH/s, 147 MB, 524032 thr.
keccak : 18673.1 kH/s, 129 MB, 2097152 thr.
luffa : 28816.0 kH/s, 257 MB, 4194304 thr.
lyra2 : 213.7 kH/s, 570 MB, 65536 thr.
mjollnir : 3895.6 kH/s, 147 MB, 524032 thr.
nist5 : 1101.4 kH/s, 67 MB, 1048576 thr.
penta : 501.6 kH/s, 21 MB, 327680 thr.
skein : 5432.4 kH/s, 65 MB, 1048576 thr.
skein2 : 6788.9 kH/s, 33 MB, 524288 thr.
whirlpool : 688.5 kH/s, 33 MB, 524288 thr.
zr5 : 122.5 kH/s, 86 MB, 262144 thr.
9 years ago
|
|
|
|
|
|
|
if (device_sm[dev_id] && device_sm[dev_id] < 300) {
|
|
|
|
// incompatible SM 2.1 kernels...
|
|
|
|
if (algo == ALGO_GROESTL) algo++;
|
|
|
|
if (algo == ALGO_MYR_GR) algo++;
|
|
|
|
//if (algo == ALGO_JACKPOT) algo++; // compact shuffle
|
various fixes for SM 2.1 and the benchmark
X11+ algos and quark are not compatible for the moment
but these ones are :
Benchmark results for Gigabyte GTX 460 (SM 2.1 / 1 GB):
blakecoin : 159090.5 kH/s, 1 MB, 1048576 thr.
blake : 70208.9 kH/s, 1 MB, 1048576 thr.
bmw : 122802.6 kH/s, 65 MB, 2097152 thr.
deep : 3533.6 kH/s, 33 MB, 524288 thr.
fugue256 : 43177.9 kH/s, 17 MB, 524288 thr.
heavy : 4118.2 kH/s, 147 MB, 524032 thr.
keccak : 18673.1 kH/s, 129 MB, 2097152 thr.
luffa : 28816.0 kH/s, 257 MB, 4194304 thr.
lyra2 : 213.7 kH/s, 570 MB, 65536 thr.
mjollnir : 3895.6 kH/s, 147 MB, 524032 thr.
nist5 : 1101.4 kH/s, 67 MB, 1048576 thr.
penta : 501.6 kH/s, 21 MB, 327680 thr.
skein : 5432.4 kH/s, 65 MB, 1048576 thr.
skein2 : 6788.9 kH/s, 33 MB, 524288 thr.
whirlpool : 688.5 kH/s, 33 MB, 524288 thr.
zr5 : 122.5 kH/s, 86 MB, 262144 thr.
9 years ago
|
|
|
if (algo == ALGO_NEOSCRYPT) algo++;
|
|
|
|
if (algo == ALGO_WHIRLPOOLX) algo++;
|
|
|
|
}
|
|
|
|
// and unwanted ones...
|
|
|
|
if (algo == ALGO_SCRYPT) algo++;
|
|
|
|
if (algo == ALGO_SCRYPT_JANE) algo++;
|
|
|
|
|
|
|
|
// free current algo memory and track mem usage
|
|
|
|
mused = cuda_available_memory(thr_id);
|
|
|
|
algo_free_all(thr_id);
|
|
|
|
CUDA_LOG_ERROR();
|
|
|
|
|
|
|
|
// device can take some time to free
|
|
|
|
mfree = cuda_available_memory(thr_id);
|
|
|
|
if (device_mem_free[thr_id] > mfree) {
|
|
|
|
sleep(1);
|
|
|
|
mfree = cuda_available_memory(thr_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
// we need to wait completion on all cards before the switch
|
|
|
|
if (opt_n_threads > 1) {
|
|
|
|
pthread_barrier_wait(&miner_barr);
|
|
|
|
}
|
|
|
|
|
|
|
|
char rate[32] = { 0 };
|
|
|
|
double hashrate = stats_get_speed(thr_id, thr_hashrates[thr_id]);
|
|
|
|
format_hashrate(hashrate, rate);
|
|
|
|
gpulog(LOG_NOTICE, thr_id, "%s hashrate = %s", algo_names[prev_algo], rate);
|
|
|
|
|
|
|
|
// ensure memory leak is still real after the barrier
|
|
|
|
if (device_mem_free[thr_id] > mfree) {
|
|
|
|
mfree = cuda_available_memory(thr_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
// check if there is memory leak
|
|
|
|
if (device_mem_free[thr_id] - mfree > 1) {
|
|
|
|
gpulog(LOG_WARNING, thr_id, "possible %d MB memory leak in %s! %d MB free",
|
|
|
|
(device_mem_free[thr_id] - mfree), algo_names[prev_algo], mfree);
|
|
|
|
cuda_reset_device(thr_id, NULL); // force to free the leak
|
|
|
|
need_reset = false;
|
|
|
|
mfree = cuda_available_memory(thr_id);
|
|
|
|
}
|
|
|
|
// store used memory per algo
|
|
|
|
algo_mem_used[thr_id][opt_algo] = device_mem_free[thr_id] - mused;
|
|
|
|
device_mem_free[thr_id] = mfree;
|
|
|
|
|
|
|
|
// store to dump a table per gpu later
|
|
|
|
algo_hashrates[thr_id][prev_algo] = hashrate;
|
|
|
|
|
|
|
|
// wait the other threads to display logs correctly
|
|
|
|
if (opt_n_threads > 1) {
|
|
|
|
pthread_barrier_wait(&algo_barr);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (algo == ALGO_AUTO)
|
|
|
|
return false; // all algos done
|
|
|
|
|
|
|
|
// mutex primary used for the stats purge
|
|
|
|
pthread_mutex_lock(&bench_lock);
|
|
|
|
stats_purge_all();
|
|
|
|
|
|
|
|
opt_algo = (enum sha_algos) algo;
|
|
|
|
global_hashrate = 0;
|
|
|
|
thr_hashrates[thr_id] = 0; // reset for minmax64
|
|
|
|
pthread_mutex_unlock(&bench_lock);
|
|
|
|
|
|
|
|
if (need_reset)
|
|
|
|
cuda_reset_device(thr_id, NULL);
|
|
|
|
|
|
|
|
if (thr_id == 0)
|
|
|
|
applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void bench_set_throughput(int thr_id, uint32_t throughput)
|
|
|
|
{
|
|
|
|
algo_throughput[thr_id][opt_algo] = throughput;
|
|
|
|
}
|
|
|
|
|
|
|
|
void bench_display_results()
|
|
|
|
{
|
|
|
|
for (int n=0; n < opt_n_threads; n++)
|
|
|
|
{
|
|
|
|
int dev_id = device_map[n];
|
|
|
|
applog(LOG_BLUE, "Benchmark results for GPU #%d - %s:", dev_id, device_name[dev_id]);
|
|
|
|
for (int i=0; i < ALGO_COUNT-1; i++) {
|
|
|
|
double rate = algo_hashrates[n][i];
|
|
|
|
if (rate == 0.0) continue;
|
|
|
|
applog(LOG_INFO, "%12s : %12.1f kH/s, %5d MB, %8u thr.", algo_names[i],
|
|
|
|
rate / 1024., algo_mem_used[n][i], algo_throughput[n][i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|