From f54d2cc0edc1491f24d5d42ae42a08d047ee224c Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 22 Jun 2011 23:07:30 +1000 Subject: [PATCH] Make poclbm use 4 vectors and decrease worksize to keep pipelines fullish. Make it possible to have 0 CPU threads and update docs. Fix counter with no cpu threads. --- cpu-miner.c | 96 ++++++++++++++++++++++++++++++++--------------------- poclbm.cl | 46 ++++++++++++++++++++++--- 2 files changed, 99 insertions(+), 43 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index 13c92bd3..ff428d57 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -207,7 +207,7 @@ static struct option_help options_help[] = { #endif { "threads N", - "(-t N) Number of miner threads (default: 1)" }, + "(-t N) Number of miner CPU threads (default: number of processors)" }, { "url URL", "URL for bitcoin JSON-RPC server " @@ -500,22 +500,21 @@ static void hashmeter(int thr_id, struct timeval *diff, { struct timeval temp_tv_end, total_diff; double khashes, secs; + double total_mhashes, total_secs; /* Don't bother calculating anything if we're not displaying it */ if (opt_quiet) return; khashes = hashes_done / 1000.0; secs = (double)diff->tv_sec + ((double)diff->tv_usec / 1000000.0); + if (opt_debug) + applog(LOG_DEBUG, "[thread %d: %lu hashes, %.0f khash/sec]", + thr_id, hashes_done, hashes_done / secs); + gettimeofday(&temp_tv_end, NULL); + timeval_subtract(&total_diff, &temp_tv_end, &total_tv_end); - if (opt_n_threads + nDevs > 1) { - double total_mhashes, total_secs; - - if (opt_debug) - applog(LOG_DEBUG, "[thread %d: %lu hashes, %.0f khash/sec]", - thr_id, hashes_done, hashes_done / secs); - gettimeofday(&temp_tv_end, NULL); - timeval_subtract(&total_diff, &temp_tv_end, &total_tv_end); + if (opt_n_threads + nDevs > 1) { /* Totals are updated by all threads so can race without locking */ pthread_mutex_lock(&hash_lock); total_hashes_done += hashes_done; @@ -533,10 +532,19 @@ static void hashmeter(int thr_id, struct timeval *diff, applog(LOG_INFO, "[%.2f Mhash/sec] [%d Accepted] [%d Rejected]", total_mhashes / total_secs, accepted, rejected); } else { - if (opt_debug) - applog(LOG_DEBUG, "[%lu hashes]", hashes_done); - applog(LOG_INFO, "%.0f khash/sec] [%d Accepted] [%d Rejected]", - khashes / secs, accepted, rejected); + total_hashes_done += hashes_done; + if (total_diff.tv_sec < 5) { + /* Only update the total every 5 seconds */ + pthread_mutex_unlock(&hash_lock); + return; + } + gettimeofday(&total_tv_end, NULL); + timeval_subtract(&total_diff, &total_tv_end, &total_tv_start); + total_mhashes = total_hashes_done / 1000000.0; + total_secs = (double)total_diff.tv_sec + + ((double)total_diff.tv_usec / 1000000.0); + applog(LOG_INFO, "[%.2f Mhash/sec] [%d Accepted] [%d Rejected]", + total_mhashes / total_secs, accepted, rejected); } } @@ -608,6 +616,11 @@ bool submit_nonce(struct thr_info *thr, struct work *work, uint32_t nonce) return submit_work(thr, work); } +static inline int cpu_from_thr_id(int thr_id) +{ + return (thr_id - nDevs) % num_processors; +} + static void *miner_thread(void *userdata) { struct thr_info *mythr = userdata; @@ -718,7 +731,7 @@ static void *miner_thread(void *userdata) /* if nonce found, submit work */ if (unlikely(rc)) { - applog(LOG_INFO, "CPU found something?"); + applog(LOG_INFO, "CPU %d found something?", cpu_from_thr_id(thr_id)); if (!submit_work(mythr, &work)) break; } @@ -772,6 +785,11 @@ static inline cl_int queue_kernel_parameters(dev_blk_ctx *blk, cl_kernel *kernel return status; } +static inline int gpu_from_thr_id(int thr_id) +{ + return thr_id; +} + static void *gpuminer_thread(void *userdata) { struct thr_info *mythr = userdata; @@ -799,15 +817,19 @@ static void *gpuminer_thread(void *userdata) struct work *work = malloc(sizeof(struct work)); bool need_work = true; - unsigned int threads = 1 << 22; + unsigned int threads = 1 << 21; + unsigned int vectors = 4; + unsigned int hashes_done = threads * vectors; gettimeofday(&tv_start, NULL); globalThreads[0] = threads; - localThreads[0] = 128; + localThreads[0] = 64; while (1) { struct timeval tv_end, diff; - int i; + unsigned int i; + + clFinish(clState->commandQueue); if (need_work) { /* obtain new work from internal workio thread */ @@ -821,7 +843,7 @@ static void *gpuminer_thread(void *userdata) work->blk.nonce = 0; status = queue_kernel_parameters(&work->blk, kernel, clState->outputBuffer); if (unlikely(status != CL_SUCCESS)) - { applog(LOG_ERR, "Error: clSetKernelArg failed."); exit (1); } + { applog(LOG_ERR, "Error: clSetKernelArg of all params failed."); exit (1); } work_restart[thr_id].restart = 0; need_work = false; @@ -829,8 +851,11 @@ static void *gpuminer_thread(void *userdata) if (opt_debug) applog(LOG_DEBUG, "getwork"); + } else { + status = clSetKernelArg(*kernel, 14, sizeof(uint), (void *)&work->blk.nonce); + if (unlikely(status != CL_SUCCESS)) + { applog(LOG_ERR, "Error: clSetKernelArg of nonce failed."); goto out; } } - clFinish(clState->commandQueue); status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); @@ -846,7 +871,7 @@ static void *gpuminer_thread(void *userdata) { applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); goto out; } for (i = 0; i < 127; i++) { if (res[i]) { - applog(LOG_INFO, "GPU Found something?"); + applog(LOG_INFO, "GPU %d found something?", gpu_from_thr_id(thr_id)); postcalc_hash(mythr, &work->blk, work, res[i]); } else break; @@ -861,19 +886,14 @@ static void *gpuminer_thread(void *userdata) gettimeofday(&tv_end, NULL); timeval_subtract(&diff, &tv_end, &tv_start); - hashmeter(thr_id, &diff, threads); + hashmeter(thr_id, &diff, hashes_done); gettimeofday(&tv_start, NULL); - work->blk.nonce += threads; + work->blk.nonce += hashes_done; - if (unlikely(work->blk.nonce > MAXTHREADS - threads) || + if (unlikely(work->blk.nonce > MAXTHREADS - hashes_done) || (work_restart[thr_id].restart)) need_work = true; - - clFinish(clState->commandQueue); - status = clSetKernelArg(*kernel, 14, sizeof(uint), (void *)&work->blk.nonce); - if (unlikely(status != CL_SUCCESS)) - { applog(LOG_ERR, "Error: clSetKernelArg failed."); goto out; } } out: tq_freeze(mythr->q); @@ -982,6 +1002,15 @@ static void parse_arg (int key, char *arg) { int v, i; +#ifdef WIN32 + if (!opt_n_threads) + opt_n_threads = 1; +#else + num_processors = sysconf(_SC_NPROCESSORS_ONLN); + if (!opt_n_threads) + opt_n_threads = num_processors; +#endif /* !WIN32 */ + switch(key) { case 'a': for (i = 0; i < ARRAY_SIZE(algo_names); i++) { @@ -1041,7 +1070,7 @@ static void parse_arg (int key, char *arg) break; case 't': v = atoi(arg); - if (v < 1 || v > 9999) /* sanity check */ + if (v < 0 || v > 9999) /* sanity check */ show_usage(); opt_n_threads = v; @@ -1074,15 +1103,6 @@ static void parse_arg (int key, char *arg) default: show_usage(); } - -#ifdef WIN32 - if (!opt_n_threads) - opt_n_threads = 1; -#else - num_processors = sysconf(_SC_NPROCESSORS_ONLN); - if (!opt_n_threads) - opt_n_threads = num_processors; -#endif /* !WIN32 */ } static void parse_config(void) diff --git a/poclbm.cl b/poclbm.cl index baba753e..a310f557 100644 --- a/poclbm.cl +++ b/poclbm.cl @@ -1,8 +1,10 @@ // This file is taken and modified from the public-domain poclbm project, and // we have therefore decided to keep it public-domain in Phoenix. +#define VECTORS + #ifdef VECTORS - typedef uint2 u; + typedef uint4 u; #else typedef uint u; #endif @@ -35,6 +37,8 @@ __constant uint K[64] = { // detected, use it for Ch. Otherwise, construct Ch out of simpler logical // primitives. +#define BFI_INT + #ifdef BFI_INT // Well, slight problem... It turns out BFI_INT isn't actually exposed to // OpenCL (or CAL IL for that matter) in any way. However, there is @@ -72,7 +76,7 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c uint it; #ifdef VECTORS - nonce = ((base + get_global_id(0))<<1) + (uint2)(0, 1); + nonce = ((base >> 2) + (get_global_id(0))<<2) + (uint4)(0, 1, 2, 3); #else nonce = base + get_global_id(0); #endif @@ -302,11 +306,43 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c #ifdef VECTORS if (H.x == 0) { - output[OUTPUT_SIZE] = output[nonce.x & OUTPUT_MASK] = nonce.x; + for (it = 0; it != 127; it++) { + if (!output[it]) { + output[it] = nonce.x; + output[127] = 1; + break; + } + } + } + if (H.y == 0) + { + for (it = 0; it != 127; it++) { + if (!output[it]) { + output[it] = nonce.y; + output[127] = 1; + break; + } + } + } + if (H.z == 0) + { + for (it = 0; it != 127; it++) { + if (!output[it]) { + output[it] = nonce.z; + output[127] = 1; + break; + } + } } - else if (H.y == 0) + if (H.w == 0) { - output[OUTPUT_SIZE] = output[nonce.y & OUTPUT_MASK] = nonce.y; + for (it = 0; it != 127; it++) { + if (!output[it]) { + output[it] = nonce.w; + output[127] = 1; + break; + } + } } #else if (H == 0)