MMQ overheat: remove clockdown (doesn't help) + ensure no lost shares

12 years ago · 2653c906c1
1 changed files with 48 additions and 30 deletions
--- a/driver-modminer.c
+++ b/driver-modminer.c
@@ -45,9 +45,11 @@
 #define MODMINER_DEF_CLOCK 200
 #define MODMINER_MIN_CLOCK 160

-#define MODMINER_CLOCK_DOWN -2
-#define MODMINER_CLOCK_SET 0
 #define MODMINER_CLOCK_UP 2
+#define MODMINER_CLOCK_SET 0
+#define MODMINER_CLOCK_DOWN -2
+// = 0 means OVERHEAT doesn't affect the clock
+#define MODMINER_CLOCK_OVERHEAT 0
 #define MODMINER_CLOCK_DEAD -6
 #define MODMINER_CLOCK_CUTOFF -10

@@ -578,12 +580,14 @@ static bool modminer_fpga_prepare(struct thr_info *thr)
 *	If device exceeds cutoff or overheat temp - stop sending work until it cools
 *		decrease the clock by MODMINER_CLOCK_CUTOFF/MODMINER_CLOCK_OVERHEAT
 *		for when it restarts
+ *		MODMINER_CLOCK_OVERHEAT=0 means that temp shouldn't
+ *		affect the clock unless we reach CUTOFF
 *
- * When to clock down:
 *	If device overheats
- *		also halve shares_to_good
- *		(so multiple temp drops can recover faster)
- *	 or
+ *		set shares_to_good back to MODMINER_MIN_BACK
+ *		to speed up clock recovery if temp drop doesn't help
+ *
+ * When to clock down:
 *	If device gets MODMINER_HW_ERROR_PERCENT errors since last clock up or down
 *		if clock is <= default it requires 2 HW to do this test
 *		if clock is > default it only requires 1 HW to do this test
@@ -603,7 +607,6 @@ static bool modminer_delta_clock(struct thr_info *thr, int delta, bool temp)
 	int err, amount;

 	// Only do once if multiple shares per work or multiple reasons
-	// Since the temperature down clock test is first in the code this is OK
 	if (!state->new_work)
 		return false;

@@ -613,18 +616,17 @@ static bool modminer_delta_clock(struct thr_info *thr, int delta, bool temp)
 	state->shares_last_hw = 0;
 	state->hw_errors = 0;

-	// If drop requested due to temperature, clock drop is always allowed
-	if (!temp && delta < 0 && modminer->clock <= MODMINER_MIN_CLOCK)
+	// FYI clock drop has little effect on temp
+	if (delta < 0 && modminer->clock <= MODMINER_MIN_CLOCK)
 		return false;

 	if (delta > 0 && modminer->clock >= MODMINER_MAX_CLOCK)
 		return false;

 	if (delta < 0) {
-		if (temp) {
-			if (state->shares_to_good > MODMINER_MIN_BACK)
-				state->shares_to_good /= 2;
-		} else {
+		if (temp)
+			state->shares_to_good = MODMINER_MIN_BACK;
+		else {
 			if ((state->shares_to_good * 2) < MODMINER_TRY_UP)
 				state->shares_to_good *= 2;
 			else
@@ -759,7 +761,8 @@ static bool modminer_start_work(struct thr_info *thr)
 	mutex_lock(modminer->modminer_mutex);

 	if ((err = usb_write(modminer, (char *)(state->next_work_cmd), 46, &amount, C_SENDWORK)) < 0 || amount != 46) {
-// TODO: err = -4 means the MMQ disappeared - need to delete it and rescan for it? (after a delay?)
+// TODO: err = LIBUSB_ERROR_NO_DEVICE means the MMQ disappeared
+// - need to delete it and rescan for it? (after a delay?)
 // but check all (4) disappeared
 		mutex_unlock(modminer->modminer_mutex);

@@ -837,7 +840,9 @@ static void check_temperature(struct thr_info *thr)
 					modminer->api->name, modminer->device_id,
 					MODMINER_OVERHEAT_TEMP, modminer->temp);

-				modminer_delta_clock(thr, MODMINER_CLOCK_DOWN, true);
+				// If it's defined to be 0 then don't call modminer_delta_clock()
+				if (MODMINER_CLOCK_OVERHEAT != 0)
+					modminer_delta_clock(thr, MODMINER_CLOCK_OVERHEAT, true);
 				state->overheated = true;
 				dev_error(modminer, REASON_DEV_OVER_HEAT);
 			}
@@ -854,6 +859,11 @@ static void check_temperature(struct thr_info *thr)

 #define work_restart(thr)  thr->work_restart

+// 250MHz is 17.17s - ensure we don't go idle
+static const double processtime = 17.0;
+// 160MHz is 26.84s - when overheated ensure we don't throw away shares
+static const double overheattime = 26.9;
+
 static uint64_t modminer_process_results(struct thr_info *thr)
 {
 	struct cgpu_info *modminer = thr->cgpu;
@@ -865,7 +875,7 @@ static uint64_t modminer_process_results(struct thr_info *thr)
 	uint32_t curr_hw_errors;
 	int err, amount;
 	int timeoutloop;
-	double processtime;
+	double timeout;
 	int temploop;

 	// If we are overheated it will just keep checking for results
@@ -876,20 +886,19 @@ static uint64_t modminer_process_results(struct thr_info *thr)
 	cmd[0] = MODMINER_CHECK_WORK;
 	cmd[1] = modminer->fpgaid;

-	// 250Mhz is 17.17s
-	processtime = 17.0;
 	timeoutloop = 0;
 	temploop = 0;
 	while (1) {
 		mutex_lock(modminer->modminer_mutex);
 		if ((err = usb_write(modminer, cmd, 2, &amount, C_REQUESTWORKSTATUS)) < 0 || amount != 2) {
-// TODO: err = -4 means the MMQ disappeared - need to delete it and rescan for it? (after a delay?)
+// TODO: err = LIBUSB_ERROR_NO_DEVICE means the MMQ disappeared
+// - need to delete it and rescan for it? (after a delay?)
 // but check all (4) disappeared
 			mutex_unlock(modminer->modminer_mutex);

 			// timeoutloop never resets so the timeouts can't
 			// accumulate much during a single item of work
-			if (err == -7 && ++timeoutloop < 10)
+			if (err == LIBUSB_ERROR_TIMEOUT && ++timeoutloop < 10)
 				goto tryagain;

 			applog(LOG_ERR, "%s%u: Error sending (get nonce) (%d:%d)",
@@ -901,11 +910,10 @@ static uint64_t modminer_process_results(struct thr_info *thr)
 		err = usb_read(modminer, (char *)(&nonce), 4, &amount, C_GETWORKSTATUS);
 		mutex_unlock(modminer->modminer_mutex);

-		if (err < 0 || amount != 4) {
-
+		if (err < 0 || amount < 4) {
 			// timeoutloop never resets so the timeouts can't
 			// accumulate much during a single item of work
-			if (err == -7 && ++timeoutloop < 10)
+			if (err == LIBUSB_ERROR_TIMEOUT && ++timeoutloop < 10)
 				goto tryagain;

 			applog(LOG_ERR, "%s%u: Error reading (get nonce) (%d:%d)",
@@ -970,16 +978,25 @@ tryagain:
 		if (work_restart(thr))
 			break;

-		gettimeofday(&now, NULL);
-		if (tdiff(&now, &state->tv_workstart) > processtime)
-			break;
-
+		if (state->overheated == true)
+		{
 			// don't check every time
-		if (state->overheated == true && ++temploop > 30) {
+			if (++temploop > 30) {
 				check_temperature(thr);
 				temploop = 0;
 			}

+		}
+
+		if (state->overheated == true)
+			timeout = overheattime;
+		else
+			timeout = processtime;
+
+		gettimeofday(&now, NULL);
+		if (tdiff(&now, &state->tv_workstart) > timeout)
+			break;
+
 		nmsleep(10);
 		if (work_restart(thr))
 			break;
@@ -991,6 +1008,7 @@ tryagain:

 	// Not exact since the clock may have changed ... but close enough I guess
 	uint64_t hashes = (uint64_t)modminer->clock * (((uint64_t)elapsed.tv_sec * 1000000) + elapsed.tv_usec);
+	// Overheat will complete the nonce range
 	if (hashes > 0xffffffff)
 		hashes = 0xffffffff;
 	else