From 438308b3a2a4f50d02db47255d6fb7410c15abfc Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 16 Nov 2014 22:39:00 +0100 Subject: [PATCH] Rework benchmark mode and min/max range Was maybe my fault, but the benchmark mode was always recomputing from nonce 0. Also fix blake if -d 1 is used (one thread but second gpu) stats: do not use thread id as key, prefer gpu id... --- blake32.cu | 13 ++++---- ccminer.cpp | 86 +++++++++++++++++++++++++++---------------------- ccminer.vcxproj | 4 +-- stats.cpp | 16 ++++----- x11/x11.cu | 4 +-- x13/x13.cu | 22 +++++++------ x15/x14.cu | 5 ++- x15/x15.cu | 2 +- 8 files changed, 82 insertions(+), 70 deletions(-) diff --git a/blake32.cu b/blake32.cu index 2dc7235..e7aae76 100644 --- a/blake32.cu +++ b/blake32.cu @@ -17,7 +17,7 @@ extern "C" { /* threads per block and throughput (intensity) */ #define TPB 128 -extern int opt_n_threads; +extern int num_processors; /* added in sph_blake.c */ extern "C" int blake256_rounds = 14; @@ -416,8 +416,10 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt } #endif - if (opt_benchmark) + if (opt_benchmark) { targetHigh = 0x1ULL << 32; + ((uint32_t*)ptarget)[6] = swab32(0xff); + } if (opt_tracegpu) { /* test call from util.c */ @@ -427,9 +429,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt } if (!init[thr_id]) { - if (opt_n_threads > 1) { + if (num_processors > 1) CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); - } CUDA_SAFE_CALL(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t))); CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t))); init[thr_id] = true; @@ -489,8 +490,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt goto exit_scan; } else if (opt_debug) { - applog_hash((uint8_t*)ptarget); - applog_compare_hash((uint8_t*)vhashcpu,(uint8_t*)ptarget); + applog_hash((uchar*)ptarget); + applog_compare_hash((uchar*)vhashcpu, (uchar*)ptarget); applog(LOG_DEBUG, "GPU #%d: result for nonce %08x does not validate on CPU!", thr_id, foundNonce); } } diff --git a/ccminer.cpp b/ccminer.cpp index cb2bd4a..68c6973 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -816,7 +816,7 @@ static bool get_work(struct thr_info *thr, struct work *work) if (opt_benchmark) { memset(work->data, 0x55, 76); - work->data[17] = swab32((uint32_t)time(NULL)); + //work->data[17] = swab32((uint32_t)time(NULL)); memset(work->data + 19, 0x00, 52); work->data[20] = 0x80000000; work->data[31] = 0x00000280; @@ -1004,11 +1004,11 @@ static void *miner_thread(void *userdata) } while (1) { + struct timeval tv_start, tv_end, diff; unsigned long hashes_done; uint32_t start_nonce; - struct timeval tv_start, tv_end, diff; - int64_t max64; - uint64_t umax64; + uint32_t scan_time = have_longpoll ? LP_SCANTIME : opt_scantime; + uint64_t max64, minmax = 0x100000; // &work.data[19] int wcmplen = 76; @@ -1035,7 +1035,7 @@ static void *miner_thread(void *userdata) stratum_gen_work(&stratum, &g_work); } } else { - int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime; + int min_scantime = scan_time; /* obtain new work from internal workio thread */ pthread_mutex_lock(&g_work_lock); if (time(NULL) - g_work_time >= min_scantime || @@ -1065,7 +1065,7 @@ static void *miner_thread(void *userdata) goto continue_scan; } - if (memcmp(work.target, g_work.target, sizeof(work.target))) { + if (!opt_benchmark && memcmp(work.target, g_work.target, sizeof(work.target))) { calc_diff(&g_work, 0); if (opt_debug) { uint64_t target64 = g_work.target[7] * 0x100000000ULL + g_work.target[6]; @@ -1080,56 +1080,59 @@ static void *miner_thread(void *userdata) } } if (memcmp(work.data, g_work.data, wcmplen)) { + #if 0 if (opt_debug) { -#if 0 for (int n=0; n <= (wcmplen-8); n+=8) { if (memcmp(work.data + n, g_work.data + n, 8)) { applog(LOG_DEBUG, "job %s work updated at offset %d:", g_work.job_id, n); - applog_hash((uint8_t*) work.data + n); - applog_compare_hash((uint8_t*) g_work.data + n, (uint8_t*) work.data + n); + applog_hash((uchar*) &work.data[n]); + applog_compare_hash((uchar*) &g_work.data[n], (uchar*) &work.data[n]); } } -#endif } + #endif memcpy(&work, &g_work, sizeof(struct work)); (*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr } else (*nonceptr)++; //?? - work_restart[thr_id].restart = 0; - if (opt_debug) - applog(LOG_DEBUG, "job %s %08x", g_work.job_id, (*nonceptr)); + work_restart[thr_id].restart = 0; pthread_mutex_unlock(&g_work_lock); /* adjust max_nonce to meet target scan time */ if (have_stratum) max64 = LP_SCANTIME; else - max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime) - - time(NULL); + max64 = max(1, scan_time + g_work_time - time(NULL)); - max64 *= (int64_t)thr_hashrates[thr_id]; + max64 *= (uint32_t)thr_hashrates[thr_id]; - if (max64 <= 0) { - /* should not be set too high, - else you can miss multiple nounces */ + /* on start, max64 should not be 0, + * before hashrate is computed */ + if (max64 < minmax) { switch (opt_algo) { case ALGO_BLAKECOIN: - max64 = 0x3ffffffLL; + minmax = 0x4000000; break; case ALGO_BLAKE: case ALGO_DOOM: case ALGO_JACKPOT: case ALGO_KECCAK: case ALGO_LUFFA_DOOM: - max64 = 0x1ffffffLL; + minmax = 0x2000000; break; - default: - max64 = 0xfffffLL; + case ALGO_S3: + case ALGO_X11: + case ALGO_X13: + minmax = 0x400000; break; } + max64 = max(minmax-1, max64); } + if (opt_debug) + applog(LOG_DEBUG, "GPU #%d: start=%08x range=%llx", device_map[thr_id], *nonceptr, max64); + start_nonce = *nonceptr; /* do not recompute something already scanned */ @@ -1140,7 +1143,7 @@ static void *miner_thread(void *userdata) } range; range.data = hashlog_get_scan_range(work.job_id); - if (range.data) { + if (range.data && !opt_benchmark) { bool stall = false; if (range.scanned[0] == 1 && range.scanned[1] == 0xFFFFFFFFUL) { applog(LOG_WARNING, "detected a rescan of fully scanned job!"); @@ -1168,11 +1171,14 @@ static void *miner_thread(void *userdata) } } - umax64 = (uint64_t) max64; - if ((umax64 + start_nonce) >= end_nonce) + if ((max64 + start_nonce) >= end_nonce) max_nonce = end_nonce; else - max_nonce = (uint32_t) umax64 + start_nonce; + max_nonce = (uint32_t) (max64 + start_nonce); + + /* never let small ranges at end */ + if (max_nonce >= UINT32_MAX - 256) + max_nonce = UINT32_MAX; work.scanned_from = start_nonce; (*nonceptr) = start_nonce; @@ -1343,6 +1349,19 @@ continue_scan: pthread_mutex_unlock(&stats_lock); } + if (rc) + work.scanned_to = *nonceptr; + else { + work.scanned_to = max_nonce; + if (opt_debug && opt_benchmark) { + // to debug nonce ranges + applog(LOG_DEBUG, "GPU #%d: ends=%08x range=%llx", device_map[thr_id], + *nonceptr, ((*nonceptr) - start_nonce)); + } + } + + hashlog_remember_scan_range(work.job_id, work.scanned_from, work.scanned_to); + /* output */ if (!opt_quiet && loopcnt) { sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.2f", @@ -1365,18 +1384,9 @@ continue_scan: global_hashrate = llround(hashrate); } - if (rc) { - work.scanned_to = *nonceptr; - } else { - work.scanned_to = max_nonce; - } - - // could be used to store speeds too.. - hashlog_remember_scan_range(work.job_id, work.scanned_from, work.scanned_to); - /* if nonce found, submit work */ - if (rc) { - if (!opt_benchmark && !submit_work(mythr, &work)) + if (rc && !opt_benchmark) { + if (!submit_work(mythr, &work)) break; } diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 4844f38..3a8ae58 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -172,7 +172,7 @@ 80 true false - compute_50,sm_50 + compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_52,sm_52 --ptxas-options="-O2" %(AdditionalOptions) @@ -473,4 +473,4 @@ - + \ No newline at end of file diff --git a/stats.cpp b/stats.cpp index 771f73b..5ae0135 100644 --- a/stats.cpp +++ b/stats.cpp @@ -27,8 +27,8 @@ extern int device_map[8]; */ extern "C" void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate, uint8_t found) { - uint64_t thr = (0xff & thr_id); - uint64_t key = (thr << 56) + (uid++ % UINT_MAX); + uint64_t gpu = device_map[thr_id]; + uint64_t key = (gpu << 56) + (uid++ % UINT_MAX); stats_data data; // to enough hashes to give right stats if (hashcount < 1000 || hashrate < 0.01) @@ -39,8 +39,8 @@ extern "C" void stats_remember_speed(int thr_id, uint32_t hashcount, double hash return; memset(&data, 0, sizeof(data)); - data.gpu_id = device_map[thr_id]; - data.thr_id = (uint8_t)thr; + data.gpu_id = (uint8_t)gpu; + data.thr_id = (uint8_t)thr_id; data.tm_stat = (uint32_t) time(NULL); data.hashcount = hashcount; data.hashfound = found; @@ -61,8 +61,8 @@ extern "C" void stats_remember_speed(int thr_id, uint32_t hashcount, double hash */ extern "C" double stats_get_speed(int thr_id, double def_speed) { - uint64_t thr = (0xff & thr_id); - uint64_t keypfx = (thr << 56); + uint64_t gpu = device_map[thr_id]; + uint64_t keypfx = (gpu << 56); uint64_t keymsk = (0xffULL << 56); double speed = 0.0; int records = 0; @@ -93,8 +93,8 @@ extern "C" double stats_get_speed(int thr_id, double def_speed) extern "C" int stats_get_history(int thr_id, struct stats_data *data, int max_records) { - uint64_t thr = (0xff & thr_id); - uint64_t keypfx = (thr << 56); + uint64_t gpu = device_map[thr_id]; + uint64_t keypfx = (gpu << 56); uint64_t keymsk = (0xffULL << 56); double speed = 0.0; int records = 0; diff --git a/x11/x11.cu b/x11/x11.cu index b48dde6..9baed10 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -135,13 +135,13 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; - static bool init[8] = {0,0,0,0,0,0,0,0}; + static bool init[8] = { 0 }; int intensity = (device_sm[device_map[thr_id]] >= 500) ? 20 : 19; int throughput = opt_work_size ? opt_work_size : (1 << intensity); // 20=256*256*16; throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000f; + ((uint32_t*)ptarget)[7] = 0x000f; if (!init[thr_id]) { diff --git a/x13/x13.cu b/x13/x13.cu index 4918e3a..31a55a5 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -154,20 +154,17 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - const uint32_t Htarg = ptarget[7]; - + static bool init[8] = { 0 }; int throughput = opt_work_size ? opt_work_size : (1 << 19); // 256*256*8; throughput = min(throughput, max_nonce - first_nonce); - static bool init[8] = {0,0,0,0,0,0,0,0}; + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x000f; + if (!init[thr_id]) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 2 * 32 * throughput)); quark_blake512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); @@ -216,11 +213,12 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { + const uint32_t Htarg = ptarget[7]; uint32_t vhash64[8]; be32enc(&endiandata[19], foundNonce); x13hash(vhash64, endiandata); - if( (vhash64[7]<=Htarg) && fulltest(vhash64, ptarget) ) { + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget) ) { pdata[19] = foundNonce; *hashes_done = foundNonce - first_nonce + 1; return 1; @@ -233,9 +231,13 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, } } + if ((uint64_t)pdata[19] + throughput > (uint64_t)max_nonce) { + pdata[19] = max_nonce; + break; + } pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); *hashes_done = pdata[19] - first_nonce + 1; return 0; diff --git a/x15/x14.cu b/x15/x14.cu index a0adc2f..99c62fd 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -165,14 +165,13 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; - static bool init[8] = {0,0,0,0,0,0,0,0}; + static bool init[8] = { 0 }; uint32_t endiandata[20]; - int throughput = opt_work_size ? opt_work_size : (1 << 19); // 256*256*8; throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0xff; + ((uint32_t*)ptarget)[7] = 0x000f; if (!init[thr_id]) { diff --git a/x15/x15.cu b/x15/x15.cu index a424834..23f6aa8 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -174,7 +174,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; - static bool init[8] = {0,0,0,0,0,0,0,0}; + static bool init[8] = { 0 }; uint32_t endiandata[20]; int throughput = opt_work_size ? opt_work_size : (1 << 19); // 256*256*8;