Make poclbm use 4 vectors and decrease worksize to keep pipelines fullish.

Make it possible to have 0 CPU threads and update docs. Fix counter with no cpu threads.
2025-08-26 22:02:09 +00:00 · 2011-06-22 23:07:30 +10:00 · 2011-06-22 23:07:30 +10:00 · f54d2cc0ed
commit f54d2cc0ed
parent b4d2733cfc
2 changed files with 99 additions and 43 deletions
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -207,7 +207,7 @@ static struct option_help options_help[] = {
 #endif
 	{ "threads N",
-	  "(-t N) Number of miner threads (default: 1)" },
+	  "(-t N) Number of miner CPU threads (default: number of processors)" },
 	{ "url URL",
 	  "URL for bitcoin JSON-RPC server "
@ -500,22 +500,21 @@ static void hashmeter(int thr_id, struct timeval *diff,
 {
 	struct timeval temp_tv_end, total_diff;
 	double khashes, secs;
 	double total_mhashes, total_secs;
 	/* Don't bother calculating anything if we're not displaying it */
 	if (opt_quiet)
 		return;
 	khashes = hashes_done / 1000.0;
 	secs = (double)diff->tv_sec + ((double)diff->tv_usec / 1000000.0);
 	if (opt_debug)
 		applog(LOG_DEBUG, "[thread %d: %lu hashes, %.0f khash/sec]",
 			thr_id, hashes_done, hashes_done / secs);
 	gettimeofday(&temp_tv_end, NULL);
 	timeval_subtract(&total_diff, &temp_tv_end, &total_tv_end);
 	if (opt_n_threads + nDevs > 1) {
 		double total_mhashes, total_secs;
 		if (opt_debug)
 			applog(LOG_DEBUG, "[thread %d: %lu hashes, %.0f khash/sec]",
 			       thr_id, hashes_done, hashes_done / secs);
 		gettimeofday(&temp_tv_end, NULL);
 		timeval_subtract(&total_diff, &temp_tv_end, &total_tv_end);
 		/* Totals are updated by all threads so can race without locking */
 		pthread_mutex_lock(&hash_lock);
 		total_hashes_done += hashes_done;
@ -533,10 +532,19 @@ static void hashmeter(int thr_id, struct timeval *diff,
 		applog(LOG_INFO, "[%.2f Mhash/sec] [%d Accepted] [%d Rejected]",
 		       total_mhashes / total_secs, accepted, rejected);
 	} else {
-		if (opt_debug)
+		total_hashes_done += hashes_done;
-			applog(LOG_DEBUG, "[%lu hashes]", hashes_done);
+		if (total_diff.tv_sec < 5) {
-		applog(LOG_INFO, "%.0f khash/sec] [%d Accepted] [%d Rejected]",
+			/* Only update the total every 5 seconds */
-				khashes / secs, accepted, rejected);
+			pthread_mutex_unlock(&hash_lock);
 			return;
 		}
 		gettimeofday(&total_tv_end, NULL);
 		timeval_subtract(&total_diff, &total_tv_end, &total_tv_start);
 		total_mhashes = total_hashes_done / 1000000.0;
 		total_secs = (double)total_diff.tv_sec +
 			((double)total_diff.tv_usec / 1000000.0);
 		applog(LOG_INFO, "[%.2f Mhash/sec] [%d Accepted] [%d Rejected]",
 		       total_mhashes / total_secs, accepted, rejected);
 	}
 }
@ -608,6 +616,11 @@ bool submit_nonce(struct thr_info *thr, struct work *work, uint32_t nonce)
 	return submit_work(thr, work);
 }
 static inline int cpu_from_thr_id(int thr_id)
 {
 	return (thr_id - nDevs) % num_processors;
 }
 static void *miner_thread(void *userdata)
 {
 	struct thr_info *mythr = userdata;
@ -718,7 +731,7 @@ static void *miner_thread(void *userdata)
 		/* if nonce found, submit work */
 		if (unlikely(rc)) {
-			applog(LOG_INFO, "CPU found something?");
+			applog(LOG_INFO, "CPU %d found something?", cpu_from_thr_id(thr_id));
 			if (!submit_work(mythr, &work))
 				break;
 		}
@ -772,6 +785,11 @@ static inline cl_int queue_kernel_parameters(dev_blk_ctx *blk, cl_kernel *kernel
 	return status;
 }
 static inline int gpu_from_thr_id(int thr_id)
 {
 	return thr_id;
 }
 static void *gpuminer_thread(void *userdata)
 {
 	struct thr_info *mythr = userdata;
@ -799,15 +817,19 @@ static void *gpuminer_thread(void *userdata)
 	struct work *work = malloc(sizeof(struct work));
 	bool need_work = true;
-	unsigned int threads = 1 << 22;
+	unsigned int threads = 1 << 21;
 	unsigned int vectors = 4;
 	unsigned int hashes_done = threads * vectors;
 	gettimeofday(&tv_start, NULL);
 	globalThreads[0] = threads;
-	localThreads[0] = 128;
+	localThreads[0] = 64;
 	while (1) {
 		struct timeval tv_end, diff;
-		int i;
+		unsigned int i;
 		clFinish(clState->commandQueue);
 		if (need_work) {
 			/* obtain new work from internal workio thread */
@ -821,7 +843,7 @@ static void *gpuminer_thread(void *userdata)
 			work->blk.nonce = 0;
 			status = queue_kernel_parameters(&work->blk, kernel, clState->outputBuffer);
 			if (unlikely(status != CL_SUCCESS))
-				{ applog(LOG_ERR, "Error: clSetKernelArg failed."); exit (1); }
+				{ applog(LOG_ERR, "Error: clSetKernelArg of all params failed."); exit (1); }
 			work_restart[thr_id].restart = 0;
 			need_work = false;
@ -829,8 +851,11 @@ static void *gpuminer_thread(void *userdata)
 			if (opt_debug)
 				applog(LOG_DEBUG, "getwork");
 		} else {
 			status = clSetKernelArg(*kernel, 14, sizeof(uint), (void *)&work->blk.nonce);
 			if (unlikely(status != CL_SUCCESS))
 				{ applog(LOG_ERR, "Error: clSetKernelArg of nonce failed."); goto out; }
 		}
 		clFinish(clState->commandQueue);
 		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL,
 				globalThreads, localThreads, 0,  NULL, NULL);
@ -846,7 +871,7 @@ static void *gpuminer_thread(void *userdata)
 				{ applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); goto out; }
 			for (i = 0; i < 127; i++) {
 				if (res[i]) {
-					applog(LOG_INFO, "GPU Found something?");
+					applog(LOG_INFO, "GPU %d found something?", gpu_from_thr_id(thr_id));
 					postcalc_hash(mythr, &work->blk, work, res[i]);
 				} else
 					break;
@ -861,19 +886,14 @@ static void *gpuminer_thread(void *userdata)
 		gettimeofday(&tv_end, NULL);
 		timeval_subtract(&diff, &tv_end, &tv_start);
-		hashmeter(thr_id, &diff, threads);
+		hashmeter(thr_id, &diff, hashes_done);
 		gettimeofday(&tv_start, NULL);
-		work->blk.nonce += threads;
+		work->blk.nonce += hashes_done;
-		if (unlikely(work->blk.nonce > MAXTHREADS - threads) ||
+		if (unlikely(work->blk.nonce > MAXTHREADS - hashes_done) ||
 			(work_restart[thr_id].restart))
 				need_work = true;
 		clFinish(clState->commandQueue);
 		status = clSetKernelArg(*kernel, 14, sizeof(uint), (void *)&work->blk.nonce);
 		if (unlikely(status != CL_SUCCESS))
 			{ applog(LOG_ERR, "Error: clSetKernelArg failed."); goto out; }
 	}
 out:
 	tq_freeze(mythr->q);
@ -982,6 +1002,15 @@ static void parse_arg (int key, char *arg)
 {
 	int v, i;
 #ifdef WIN32
 	if (!opt_n_threads)
 		opt_n_threads = 1;
 #else
 	num_processors = sysconf(_SC_NPROCESSORS_ONLN);
 	if (!opt_n_threads)
 		opt_n_threads = num_processors;
 #endif /* !WIN32 */
 	switch(key) {
 	case 'a':
 		for (i = 0; i < ARRAY_SIZE(algo_names); i++) {
@ -1041,7 +1070,7 @@ static void parse_arg (int key, char *arg)
 		break;
 	case 't':
 		v = atoi(arg);
-		if (v < 1 || v > 9999)	/* sanity check */
+		if (v < 0 || v > 9999)	/* sanity check */
 			show_usage();
 		opt_n_threads = v;
@ -1074,15 +1103,6 @@ static void parse_arg (int key, char *arg)
 	default:
 		show_usage();
 	}
 #ifdef WIN32
 	if (!opt_n_threads)
 		opt_n_threads = 1;
 #else
 	num_processors = sysconf(_SC_NPROCESSORS_ONLN);
 	if (!opt_n_threads)
 		opt_n_threads = num_processors;
 #endif /* !WIN32 */
 }
 static void parse_config(void)
--- a/poclbm.cl
+++ b/poclbm.cl
@ -1,8 +1,10 @@
 // This file is taken and modified from the public-domain poclbm project, and
 // we have therefore decided to keep it public-domain in Phoenix.
 #define VECTORS
 #ifdef VECTORS
-	typedef uint2 u;
+	typedef uint4 u;
 #else
 	typedef uint u;
 #endif
@ -35,6 +37,8 @@ __constant uint K[64] = {
 // detected, use it for Ch. Otherwise, construct Ch out of simpler logical
 // primitives.
 #define BFI_INT
 #ifdef BFI_INT
 	// Well, slight problem... It turns out BFI_INT isn't actually exposed to
 	// OpenCL (or CAL IL for that matter) in any way. However, there is 
@ -72,7 +76,7 @@ __kernel void search(	const uint state0, const uint state1, const uint state2, c
 	uint it;
 #ifdef VECTORS 
-	nonce = ((base + get_global_id(0))<<1) + (uint2)(0, 1);
+	nonce = ((base >> 2) + (get_global_id(0))<<2) + (uint4)(0, 1, 2, 3);
 #else
 	nonce = base + get_global_id(0);
 #endif
@ -302,11 +306,43 @@ __kernel void search(	const uint state0, const uint state1, const uint state2, c
 #ifdef VECTORS
 	if (H.x == 0)
 	{
-		output[OUTPUT_SIZE] = output[nonce.x & OUTPUT_MASK] = nonce.x;
+		for (it = 0; it != 127; it++) {
 			if (!output[it]) {
 				output[it] = nonce.x;
 				output[127] = 1;
 				break;
 			}
 		}
 	}
-	else if (H.y == 0)
+	if (H.y == 0)
 	{
-		output[OUTPUT_SIZE] = output[nonce.y & OUTPUT_MASK] = nonce.y;
+		for (it = 0; it != 127; it++) {
 			if (!output[it]) {
 				output[it] = nonce.y;
 				output[127] = 1;
 				break;
 			}
 		}
 	}
 	if (H.z == 0)
 	{
 		for (it = 0; it != 127; it++) {
 			if (!output[it]) {
 				output[it] = nonce.z;
 				output[127] = 1;
 				break;
 			}
 		}
 	}
 	if (H.w == 0)
 	{
 		for (it = 0; it != 127; it++) {
 			if (!output[it]) {
 				output[it] = nonce.w;
 				output[127] = 1;
 				break;
 			}
 		}
 	}
 #else
 	if (H == 0)