
Cope with delays in retrieving work from the server by knowing when we have run out of staged work.

Once we are out of staged work, roll the ntime forwards in the work to generate work locally without a getwork.
Display a message explaining when we switch between local generation and server retrieval.
Make sure we don't think we've run out of work transiently after a longpoll by flagging a buffer of fake staged work.
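
For reference, the "generate work locally" step described above amounts to bumping the big-endian ntime field at offset 68 of the block header, as the get_work hunk below does. Here is a minimal standalone sketch of that idea, assuming a raw 80-byte header buffer and using the portable ntohl()/htonl() pair instead of the be32toh()/htobe32() fallbacks this commit adds to miner.h; the roll_ntime name is illustrative only, not part of the patch.

#include <stdint.h>
#include <arpa/inet.h>	/* ntohl()/htonl() double as 32-bit big-endian conversions */

/* Hypothetical helper: advance the timestamp in a raw 80-byte block header by
 * one second so the same getwork result yields a fresh search space. */
static void roll_ntime(uint8_t *header)
{
	uint32_t *work_ntime = (uint32_t *)(header + 68);	/* ntime occupies bytes 68..71 */
	uint32_t ntime = ntohl(*work_ntime);			/* header stores it big-endian */

	ntime++;
	*work_ntime = htonl(ntime);
}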
Con Kolivas committed 13 years ago
commit c9bbaec3b2

Changed files:
  1. main.c (67 changes)
  2. miner.h (13 changes)

main.c

@@ -150,11 +150,13 @@ struct work_restart *work_restart = NULL;
 pthread_mutex_t time_lock;
 static pthread_mutex_t hash_lock;
 static pthread_mutex_t qd_lock;
+static pthread_mutex_t stgd_lock;
 static double total_mhashes_done;
 static struct timeval total_tv_start, total_tv_end;
 static int accepted, rejected;
 int hw_errors;
-static int total_queued;
+static int total_queued, total_staged, lp_staged;
+static bool localgen = false;
 static unsigned int getwork_requested = 0;
 static char current_block[37];
 static char longpoll_block[37];
@@ -592,7 +594,7 @@ static bool get_upstream_work(struct work *work)
 	val = json_rpc_call(curl, rpc_url, rpc_userpass, rpc_req,
 			    want_longpoll, false);
 	if (unlikely(!val)) {
-		applog(LOG_ERR, "Failed json_rpc_call in get_upstream_work");
+		applog(LOG_DEBUG, "Failed json_rpc_call in get_upstream_work");
 		goto out;
 	}
@@ -667,7 +669,7 @@ static void *get_work_thread(void *userdata)
 		}

 		/* pause, then restart work-request loop */
-		applog(LOG_ERR, "json_rpc_call failed on get work, retry after %d seconds",
+		applog(LOG_DEBUG, "json_rpc_call failed on get work, retry after %d seconds",
 			opt_fail_pause);
 		sleep(opt_fail_pause);
 	}
@@ -748,6 +750,34 @@ static bool workio_submit_work(struct workio_cmd *wc)
 	return true;
 }

+static void inc_staged(int inc, bool lp)
+{
+	pthread_mutex_lock(&stgd_lock);
+	total_staged += inc;
+	if (lp)
+		lp_staged += inc;
+	pthread_mutex_unlock(&stgd_lock);
+}
+
+static void dec_staged(int inc)
+{
+	pthread_mutex_lock(&stgd_lock);
+	if (lp_staged)
+		lp_staged -= inc;
+	total_staged -= inc;
+	pthread_mutex_unlock(&stgd_lock);
+}
+
+static int requests_staged(void)
+{
+	int ret;
+
+	pthread_mutex_lock(&stgd_lock);
+	ret = total_staged;
+	pthread_mutex_unlock(&stgd_lock);
+	return ret;
+}
+
 static void *stage_thread(void *userdata)
 {
 	struct thr_info *mythr = userdata;
@@ -798,6 +828,7 @@ static void *stage_thread(void *userdata)
 			ok = false;
 			break;
 		}
+		inc_staged(1, false);
 	}

 	tq_freeze(mythr->q);
@@ -919,6 +950,7 @@ static void dec_queued(void)
 	pthread_mutex_lock(&qd_lock);
 	total_queued--;
 	pthread_mutex_unlock(&qd_lock);
+	dec_staged(1);
 }

 static int requests_queued(void)
@@ -992,6 +1024,10 @@ static void flush_requests(bool longpoll)
 	else
 		extra--;

+	/* Temporarily increase the staged count so that get_work thinks there
+	 * is work available instead of making threads reuse existing work */
+	inc_staged(extra, true);
+
 	for (i = 0; i < extra; i++) {
 		/* Queue a whole batch of new requests */
 		if (unlikely(!queue_request())) {
@@ -1022,6 +1058,25 @@ retry:
 		goto out;
 	}

+	if (!requests_staged()) {
+		uint32_t *work_ntime;
+		uint32_t ntime;
+
+		/* Only print this message once each time we shift to localgen */
+		if (!localgen)
+			applog(LOG_WARNING, "Server not providing work fast enough, generating work locally");
+		localgen = true;
+		work_ntime = (uint32_t *)(work->data + 68);
+		ntime = be32toh(*work_ntime);
+		ntime++;
+		*work_ntime = htobe32(ntime);
+		ret = true;
+		goto out;
+	} else if (localgen) {
+		localgen = false;
+		applog(LOG_WARNING, "Resumed retrieving work from server");
+	}
+
 	/* wait for 1st response, or get cached response */
 	work_heap = tq_pop(thr->q, NULL);
 	if (unlikely(!work_heap)) {
@@ -1040,7 +1095,7 @@ out:
 		applog(LOG_ERR, "Failed %d times to get_work");
 		return ret;
 	}
-	applog(LOG_WARNING, "Retrying after %d seconds", opt_fail_pause);
+	applog(LOG_DEBUG, "Retrying after %d seconds", opt_fail_pause);
 	sleep(opt_fail_pause);
 	goto retry;
 }
@@ -1519,7 +1574,7 @@ static void *longpoll_thread(void *userdata)
 	} else {
 		if (failures++ < 10) {
 			sleep(30);
-			applog(LOG_ERR,
+			applog(LOG_WARNING,
 				"longpoll failed, sleeping for 30s");
 		} else {
 			applog(LOG_ERR,
@@ -1622,6 +1677,8 @@ int main (int argc, char *argv[])
 		return 1;
 	if (unlikely(pthread_mutex_init(&qd_lock, NULL)))
 		return 1;
+	if (unlikely(pthread_mutex_init(&stgd_lock, NULL)))
+		return 1;
 	if (unlikely(curl_global_init(CURL_GLOBAL_ALL)))
 		return 1;

miner.h

@@ -84,6 +84,19 @@ void *alloca (size_t);
 #endif
 #endif /* !defined(__GLXBYTEORDER_H__) */

+/* This assumes htobe32 is a macro in endian.h */
+#ifndef htobe32
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+# define be32toh(x) bswap_32(x)
+# define htobe32(x) bswap_32(x)
+# elif __BYTE_ORDER == __BIG_ENDIAN
+# define be32toh(x) (x)
+# define htobe32(x) (x)
+#else
+#error UNKNOWN BYTE ORDER
+#endif
+#endif
+
 #ifdef HAVE_SYSLOG_H
 #include <syslog.h>
 #else

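Not part of the commit, but a quick way to sanity-check the byte-order fallback above: on a glibc-style system <endian.h> already provides be32toh()/htobe32(), and whichever definition is in effect should round-trip and agree with network byte order. A small sketch under that assumption:

#include <assert.h>
#include <stdint.h>
#include <arpa/inet.h>	/* htonl() as an independent big-endian reference */
#include <endian.h>	/* be32toh()/htobe32(); the miner.h hunk supplies a fallback when these are absent */

int main(void)
{
	uint32_t ntime = 0x4dd7f5c7;	/* arbitrary sample timestamp */

	assert(be32toh(htobe32(ntime)) == ntime);	/* conversions must round-trip */
	assert(htobe32(ntime) == htonl(ntime));		/* big-endian equals network order */
	return 0;
}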