From f54d2cc0edc1491f24d5d42ae42a08d047ee224c Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Wed, 22 Jun 2011 23:07:30 +1000
Subject: [PATCH] Make poclbm use 4 vectors and decrease worksize to keep
 pipelines fullish. Make it possible to have 0 CPU threads and update docs.
 Fix counter with no cpu threads.

---
 cpu-miner.c | 96 ++++++++++++++++++++++++++++++++---------------------
 poclbm.cl   | 46 ++++++++++++++++++++++---
 2 files changed, 99 insertions(+), 43 deletions(-)

diff --git a/cpu-miner.c b/cpu-miner.c
index 13c92bd3..ff428d57 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -207,7 +207,7 @@ static struct option_help options_help[] = {
 #endif
 
 	{ "threads N",
-	  "(-t N) Number of miner threads (default: 1)" },
+	  "(-t N) Number of miner CPU threads (default: number of processors)" },
 
 	{ "url URL",
 	  "URL for bitcoin JSON-RPC server "
@@ -500,22 +500,21 @@ static void hashmeter(int thr_id, struct timeval *diff,
 {
 	struct timeval temp_tv_end, total_diff;
 	double khashes, secs;
+	double total_mhashes, total_secs;
 
 	/* Don't bother calculating anything if we're not displaying it */
 	if (opt_quiet)
 		return;
 	khashes = hashes_done / 1000.0;
 	secs = (double)diff->tv_sec + ((double)diff->tv_usec / 1000000.0);
+	if (opt_debug)
+		applog(LOG_DEBUG, "[thread %d: %lu hashes, %.0f khash/sec]",
+			thr_id, hashes_done, hashes_done / secs);
+	gettimeofday(&temp_tv_end, NULL);
+	timeval_subtract(&total_diff, &temp_tv_end, &total_tv_end);
 
-	if (opt_n_threads + nDevs > 1) {
-		double total_mhashes, total_secs;
-
-		if (opt_debug)
-			applog(LOG_DEBUG, "[thread %d: %lu hashes, %.0f khash/sec]",
-			       thr_id, hashes_done, hashes_done / secs);
-		gettimeofday(&temp_tv_end, NULL);
-		timeval_subtract(&total_diff, &temp_tv_end, &total_tv_end);
 
+	if (opt_n_threads + nDevs > 1) {
 		/* Totals are updated by all threads so can race without locking */
 		pthread_mutex_lock(&hash_lock);
 		total_hashes_done += hashes_done;
@@ -533,10 +532,19 @@ static void hashmeter(int thr_id, struct timeval *diff,
 		applog(LOG_INFO, "[%.2f Mhash/sec] [%d Accepted] [%d Rejected]",
 		       total_mhashes / total_secs, accepted, rejected);
 	} else {
-		if (opt_debug)
-			applog(LOG_DEBUG, "[%lu hashes]", hashes_done);
-		applog(LOG_INFO, "%.0f khash/sec] [%d Accepted] [%d Rejected]",
-				khashes / secs, accepted, rejected);
+		total_hashes_done += hashes_done;
+		if (total_diff.tv_sec < 5) {
+			/* Only update the total every 5 seconds */
+			pthread_mutex_unlock(&hash_lock);
+			return;
+		}
+		gettimeofday(&total_tv_end, NULL);
+		timeval_subtract(&total_diff, &total_tv_end, &total_tv_start);
+		total_mhashes = total_hashes_done / 1000000.0;
+		total_secs = (double)total_diff.tv_sec +
+			((double)total_diff.tv_usec / 1000000.0);
+		applog(LOG_INFO, "[%.2f Mhash/sec] [%d Accepted] [%d Rejected]",
+		       total_mhashes / total_secs, accepted, rejected);
 	}
 }
 
@@ -608,6 +616,11 @@ bool submit_nonce(struct thr_info *thr, struct work *work, uint32_t nonce)
 	return submit_work(thr, work);
 }
 
+static inline int cpu_from_thr_id(int thr_id)
+{
+	return (thr_id - nDevs) % num_processors;
+}
+
 static void *miner_thread(void *userdata)
 {
 	struct thr_info *mythr = userdata;
@@ -718,7 +731,7 @@ static void *miner_thread(void *userdata)
 
 		/* if nonce found, submit work */
 		if (unlikely(rc)) {
-			applog(LOG_INFO, "CPU found something?");
+			applog(LOG_INFO, "CPU %d found something?", cpu_from_thr_id(thr_id));
 			if (!submit_work(mythr, &work))
 				break;
 		}
@@ -772,6 +785,11 @@ static inline cl_int queue_kernel_parameters(dev_blk_ctx *blk, cl_kernel *kernel
 	return status;
 }
 
+static inline int gpu_from_thr_id(int thr_id)
+{
+	return thr_id;
+}
+
 static void *gpuminer_thread(void *userdata)
 {
 	struct thr_info *mythr = userdata;
@@ -799,15 +817,19 @@ static void *gpuminer_thread(void *userdata)
 
 	struct work *work = malloc(sizeof(struct work));
 	bool need_work = true;
-	unsigned int threads = 1 << 22;
+	unsigned int threads = 1 << 21;
+	unsigned int vectors = 4;
+	unsigned int hashes_done = threads * vectors;
 
 	gettimeofday(&tv_start, NULL);
 	globalThreads[0] = threads;
-	localThreads[0] = 128;
+	localThreads[0] = 64;
 
 	while (1) {
 		struct timeval tv_end, diff;
-		int i;
+		unsigned int i;
+
+		clFinish(clState->commandQueue);
 
 		if (need_work) {
 			/* obtain new work from internal workio thread */
@@ -821,7 +843,7 @@ static void *gpuminer_thread(void *userdata)
 			work->blk.nonce = 0;
 			status = queue_kernel_parameters(&work->blk, kernel, clState->outputBuffer);
 			if (unlikely(status != CL_SUCCESS))
-				{ applog(LOG_ERR, "Error: clSetKernelArg failed."); exit (1); }
+				{ applog(LOG_ERR, "Error: clSetKernelArg of all params failed."); exit (1); }
 
 			work_restart[thr_id].restart = 0;
 			need_work = false;
@@ -829,8 +851,11 @@ static void *gpuminer_thread(void *userdata)
 			if (opt_debug)
 				applog(LOG_DEBUG, "getwork");
 
+		} else {
+			status = clSetKernelArg(*kernel, 14, sizeof(uint), (void *)&work->blk.nonce);
+			if (unlikely(status != CL_SUCCESS))
+				{ applog(LOG_ERR, "Error: clSetKernelArg of nonce failed."); goto out; }
 		}
-		clFinish(clState->commandQueue);
 
 		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL,
 				globalThreads, localThreads, 0,  NULL, NULL);
@@ -846,7 +871,7 @@ static void *gpuminer_thread(void *userdata)
 				{ applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); goto out; }
 			for (i = 0; i < 127; i++) {
 				if (res[i]) {
-					applog(LOG_INFO, "GPU Found something?");
+					applog(LOG_INFO, "GPU %d found something?", gpu_from_thr_id(thr_id));
 					postcalc_hash(mythr, &work->blk, work, res[i]);
 				} else
 					break;
@@ -861,19 +886,14 @@ static void *gpuminer_thread(void *userdata)
 
 		gettimeofday(&tv_end, NULL);
 		timeval_subtract(&diff, &tv_end, &tv_start);
-		hashmeter(thr_id, &diff, threads);
+		hashmeter(thr_id, &diff, hashes_done);
 		gettimeofday(&tv_start, NULL);
 
-		work->blk.nonce += threads;
+		work->blk.nonce += hashes_done;
 
-		if (unlikely(work->blk.nonce > MAXTHREADS - threads) ||
+		if (unlikely(work->blk.nonce > MAXTHREADS - hashes_done) ||
 			(work_restart[thr_id].restart))
 				need_work = true;
-
-		clFinish(clState->commandQueue);
-		status = clSetKernelArg(*kernel, 14, sizeof(uint), (void *)&work->blk.nonce);
-		if (unlikely(status != CL_SUCCESS))
-			{ applog(LOG_ERR, "Error: clSetKernelArg failed."); goto out; }
 	}
 out:
 	tq_freeze(mythr->q);
@@ -982,6 +1002,15 @@ static void parse_arg (int key, char *arg)
 {
 	int v, i;
 
+#ifdef WIN32
+	if (!opt_n_threads)
+		opt_n_threads = 1;
+#else
+	num_processors = sysconf(_SC_NPROCESSORS_ONLN);
+	if (!opt_n_threads)
+		opt_n_threads = num_processors;
+#endif /* !WIN32 */
+
 	switch(key) {
 	case 'a':
 		for (i = 0; i < ARRAY_SIZE(algo_names); i++) {
@@ -1041,7 +1070,7 @@ static void parse_arg (int key, char *arg)
 		break;
 	case 't':
 		v = atoi(arg);
-		if (v < 1 || v > 9999)	/* sanity check */
+		if (v < 0 || v > 9999)	/* sanity check */
 			show_usage();
 
 		opt_n_threads = v;
@@ -1074,15 +1103,6 @@ static void parse_arg (int key, char *arg)
 	default:
 		show_usage();
 	}
-
-#ifdef WIN32
-	if (!opt_n_threads)
-		opt_n_threads = 1;
-#else
-	num_processors = sysconf(_SC_NPROCESSORS_ONLN);
-	if (!opt_n_threads)
-		opt_n_threads = num_processors;
-#endif /* !WIN32 */
 }
 
 static void parse_config(void)
diff --git a/poclbm.cl b/poclbm.cl
index baba753e..a310f557 100644
--- a/poclbm.cl
+++ b/poclbm.cl
@@ -1,8 +1,10 @@
 // This file is taken and modified from the public-domain poclbm project, and
 // we have therefore decided to keep it public-domain in Phoenix.
 
+#define VECTORS
+
 #ifdef VECTORS
-	typedef uint2 u;
+	typedef uint4 u;
 #else
 	typedef uint u;
 #endif
@@ -35,6 +37,8 @@ __constant uint K[64] = {
 // detected, use it for Ch. Otherwise, construct Ch out of simpler logical
 // primitives.
 
+#define BFI_INT
+
 #ifdef BFI_INT
 	// Well, slight problem... It turns out BFI_INT isn't actually exposed to
 	// OpenCL (or CAL IL for that matter) in any way. However, there is 
@@ -72,7 +76,7 @@ __kernel void search(	const uint state0, const uint state1, const uint state2, c
 	uint it;
 
 #ifdef VECTORS 
-	nonce = ((base + get_global_id(0))<<1) + (uint2)(0, 1);
+	nonce = ((base >> 2) + (get_global_id(0))<<2) + (uint4)(0, 1, 2, 3);
 #else
 	nonce = base + get_global_id(0);
 #endif
@@ -302,11 +306,43 @@ __kernel void search(	const uint state0, const uint state1, const uint state2, c
 #ifdef VECTORS
 	if (H.x == 0)
 	{
-		output[OUTPUT_SIZE] = output[nonce.x & OUTPUT_MASK] = nonce.x;
+		for (it = 0; it != 127; it++) {
+			if (!output[it]) {
+				output[it] = nonce.x;
+				output[127] = 1;
+				break;
+			}
+		}
+	}
+	if (H.y == 0)
+	{
+		for (it = 0; it != 127; it++) {
+			if (!output[it]) {
+				output[it] = nonce.y;
+				output[127] = 1;
+				break;
+			}
+		}
+	}
+	if (H.z == 0)
+	{
+		for (it = 0; it != 127; it++) {
+			if (!output[it]) {
+				output[it] = nonce.z;
+				output[127] = 1;
+				break;
+			}
+		}
 	}
-	else if (H.y == 0)
+	if (H.w == 0)
 	{
-		output[OUTPUT_SIZE] = output[nonce.y & OUTPUT_MASK] = nonce.y;
+		for (it = 0; it != 127; it++) {
+			if (!output[it]) {
+				output[it] = nonce.w;
+				output[127] = 1;
+				break;
+			}
+		}
 	}
 #else
 	if (H == 0)