MMQ overheat: remove clockdown (doesn't help) + ensure no lost shares

12 years ago · 2653c906c1
1 changed files with 48 additions and 30 deletions
--- a/driver-modminer.c
+++ b/driver-modminer.c
@ -45,9 +45,11 @@
 #define MODMINER_DEF_CLOCK 200
 #define MODMINER_MIN_CLOCK 160
 #define MODMINER_CLOCK_DOWN -2
 #define MODMINER_CLOCK_SET 0
 #define MODMINER_CLOCK_UP 2
 #define MODMINER_CLOCK_SET 0
 #define MODMINER_CLOCK_DOWN -2
 // = 0 means OVERHEAT doesn't affect the clock
 #define MODMINER_CLOCK_OVERHEAT 0
 #define MODMINER_CLOCK_DEAD -6
 #define MODMINER_CLOCK_CUTOFF -10
@ -578,12 +580,14 @@ static bool modminer_fpga_prepare(struct thr_info *thr)
 *	If device exceeds cutoff or overheat temp - stop sending work until it cools
 *		decrease the clock by MODMINER_CLOCK_CUTOFF/MODMINER_CLOCK_OVERHEAT
 *		for when it restarts
 *		MODMINER_CLOCK_OVERHEAT=0 means that temperature doesn't
 *		affect the clock unless we reach CUTOFF
 *
 * When to clock down:
 *	If device overheats
- *		also halve shares_to_good
+ *		set shares_to_good back to MODMINER_MIN_BACK
- *		(so multiple temp drops can recover faster)
+ *		to speed up clock recovery if temp drop doesn't help
- *	 or
+ *
 * When to clock down:
 *	If device gets MODMINER_HW_ERROR_PERCENT errors since last clock up or down
 *		if clock is <= default it requires 2 HW to do this test
 *		if clock is > default it only requires 1 HW to do this test
@ -603,7 +607,6 @@ static bool modminer_delta_clock(struct thr_info *thr, int delta, bool temp)
 	int err, amount;
 	// Only do once if multiple shares per work or multiple reasons
 	// Since the temperature down clock test is first in the code this is OK
 	if (!state->new_work)
 		return false;
@ -613,18 +616,17 @@ static bool modminer_delta_clock(struct thr_info *thr, int delta, bool temp)
 	state->shares_last_hw = 0;
 	state->hw_errors = 0;
-	// If drop requested due to temperature, clock drop is always allowed
+	// FYI clock drop has little effect on temp
-	if (!temp && delta < 0 && modminer->clock <= MODMINER_MIN_CLOCK)
+	if (delta < 0 && modminer->clock <= MODMINER_MIN_CLOCK)
 		return false;
 	if (delta > 0 && modminer->clock >= MODMINER_MAX_CLOCK)
 		return false;
 	if (delta < 0) {
-		if (temp) {
+		if (temp)
-			if (state->shares_to_good > MODMINER_MIN_BACK)
+			state->shares_to_good = MODMINER_MIN_BACK;
-				state->shares_to_good /= 2;
+		else {
 		} else {
 			if ((state->shares_to_good * 2) < MODMINER_TRY_UP)
 				state->shares_to_good *= 2;
 			else
@ -759,7 +761,8 @@ static bool modminer_start_work(struct thr_info *thr)
 	mutex_lock(modminer->modminer_mutex);
 	if ((err = usb_write(modminer, (char *)(state->next_work_cmd), 46, &amount, C_SENDWORK)) < 0 || amount != 46) {
-// TODO: err = -4 means the MMQ disappeared - need to delete it and rescan for it? (after a delay?)
+// TODO: err = LIBUSB_ERROR_NO_DEVICE means the MMQ disappeared
 // - need to delete it and rescan for it? (after a delay?)
 // but check all (4) disappeared
 		mutex_unlock(modminer->modminer_mutex);
@ -837,7 +840,9 @@ static void check_temperature(struct thr_info *thr)
 					modminer->api->name, modminer->device_id,
 					MODMINER_OVERHEAT_TEMP, modminer->temp);
-				modminer_delta_clock(thr, MODMINER_CLOCK_DOWN, true);
+				// If it's defined to be 0 then don't call modminer_delta_clock()
 				if (MODMINER_CLOCK_OVERHEAT != 0)
 					modminer_delta_clock(thr, MODMINER_CLOCK_OVERHEAT, true);
 				state->overheated = true;
 				dev_error(modminer, REASON_DEV_OVER_HEAT);
 			}
@ -854,6 +859,11 @@ static void check_temperature(struct thr_info *thr)
 #define work_restart(thr)  thr->work_restart
 // 250MHz is 17.17s - ensure we don't go idle
 static const double processtime = 17.0;
 // 160MHz is 26.84s - when overheated ensure we don't throw away shares
 static const double overheattime = 26.9;
 static uint64_t modminer_process_results(struct thr_info *thr)
 {
 	struct cgpu_info *modminer = thr->cgpu;
@ -865,7 +875,7 @@ static uint64_t modminer_process_results(struct thr_info *thr)
 	uint32_t curr_hw_errors;
 	int err, amount;
 	int timeoutloop;
-	double processtime;
+	double timeout;
 	int temploop;
 	// If we are overheated it will just keep checking for results
@ -876,20 +886,19 @@ static uint64_t modminer_process_results(struct thr_info *thr)
 	cmd[0] = MODMINER_CHECK_WORK;
 	cmd[1] = modminer->fpgaid;
 	// 250Mhz is 17.17s
 	processtime = 17.0;
 	timeoutloop = 0;
 	temploop = 0;
 	while (1) {
 		mutex_lock(modminer->modminer_mutex);
 		if ((err = usb_write(modminer, cmd, 2, &amount, C_REQUESTWORKSTATUS)) < 0 || amount != 2) {
-// TODO: err = -4 means the MMQ disappeared - need to delete it and rescan for it? (after a delay?)
+// TODO: err = LIBUSB_ERROR_NO_DEVICE means the MMQ disappeared
 // - need to delete it and rescan for it? (after a delay?)
 // but check all (4) disappeared
 			mutex_unlock(modminer->modminer_mutex);
 			// timeoutloop never resets so the timeouts can't
 			// accumulate much during a single item of work
-			if (err == -7 && ++timeoutloop < 10)
+			if (err == LIBUSB_ERROR_TIMEOUT && ++timeoutloop < 10)
 				goto tryagain;
 			applog(LOG_ERR, "%s%u: Error sending (get nonce) (%d:%d)",
@ -901,11 +910,10 @@ static uint64_t modminer_process_results(struct thr_info *thr)
 		err = usb_read(modminer, (char *)(&nonce), 4, &amount, C_GETWORKSTATUS);
 		mutex_unlock(modminer->modminer_mutex);
-		if (err < 0 || amount != 4) {
+		if (err < 0 || amount < 4) {
 			// timeoutloop never resets so the timeouts can't
 			// accumulate much during a single item of work
-			if (err == -7 && ++timeoutloop < 10)
+			if (err == LIBUSB_ERROR_TIMEOUT && ++timeoutloop < 10)
 				goto tryagain;
 			applog(LOG_ERR, "%s%u: Error reading (get nonce) (%d:%d)",
@ -970,16 +978,25 @@ tryagain:
 		if (work_restart(thr))
 			break;
-		gettimeofday(&now, NULL);
+		if (state->overheated == true)
-		if (tdiff(&now, &state->tv_workstart) > processtime)
+		{
-			break;
+			// don't check every time
 			if (++temploop > 30) {
 				check_temperature(thr);
 				temploop = 0;
 			}
 		// don't check every time
 		if (state->overheated == true && ++temploop > 30) {
 			check_temperature(thr);
 			temploop = 0;
 		}
 		if (state->overheated == true)
 			timeout = overheattime;
 		else
 			timeout = processtime;
 		gettimeofday(&now, NULL);
 		if (tdiff(&now, &state->tv_workstart) > timeout)
 			break;
 		nmsleep(10);
 		if (work_restart(thr))
 			break;
@ -991,6 +1008,7 @@ tryagain:
 	// Not exact since the clock may have changed ... but close enough I guess
 	uint64_t hashes = (uint64_t)modminer->clock * (((uint64_t)elapsed.tv_sec * 1000000) + elapsed.tv_usec);
 	// Overheat will complete the nonce range
 	if (hashes > 0xffffffff)
 		hashes = 0xffffffff;
 	else