Browse Source

Merge pull request #369 from kanoi/mmq

MMQ overheat: remove clockdown (doesn't help) + ensure no lost shares + allow partial work replies and count them + count work check timeout failures
nfactor-troky
Con Kolivas 12 years ago
parent
commit
368f503f5f
  1. 98
      driver-modminer.c
  2. 2
      miner.h

98
driver-modminer.c

@ -45,9 +45,11 @@
#define MODMINER_DEF_CLOCK 200 #define MODMINER_DEF_CLOCK 200
#define MODMINER_MIN_CLOCK 160 #define MODMINER_MIN_CLOCK 160
#define MODMINER_CLOCK_DOWN -2
#define MODMINER_CLOCK_SET 0
#define MODMINER_CLOCK_UP 2 #define MODMINER_CLOCK_UP 2
#define MODMINER_CLOCK_SET 0
#define MODMINER_CLOCK_DOWN -2
// = 0 means OVERHEAT doesn't affect the clock
#define MODMINER_CLOCK_OVERHEAT 0
#define MODMINER_CLOCK_DEAD -6 #define MODMINER_CLOCK_DEAD -6
#define MODMINER_CLOCK_CUTOFF -10 #define MODMINER_CLOCK_CUTOFF -10
@ -578,12 +580,14 @@ static bool modminer_fpga_prepare(struct thr_info *thr)
* If device exceeds cutoff or overheat temp - stop sending work until it cools * If device exceeds cutoff or overheat temp - stop sending work until it cools
* decrease the clock by MODMINER_CLOCK_CUTOFF/MODMINER_CLOCK_OVERHEAT * decrease the clock by MODMINER_CLOCK_CUTOFF/MODMINER_CLOCK_OVERHEAT
* for when it restarts * for when it restarts
* with MODMINER_CLOCK_OVERHEAT=0 basically says that temp shouldn't
* affect the clock unless we reach CUTOFF
* *
* When to clock down:
* If device overheats * If device overheats
* also halve shares_to_good * set shares_to_good back to MODMINER_MIN_BACK
* (so multiple temp drops can recover faster) * to speed up clock recovery if temp drop doesnt help
* or *
* When to clock down:
* If device gets MODMINER_HW_ERROR_PERCENT errors since last clock up or down * If device gets MODMINER_HW_ERROR_PERCENT errors since last clock up or down
* if clock is <= default it requires 2 HW to do this test * if clock is <= default it requires 2 HW to do this test
* if clock is > default it only requires 1 HW to do this test * if clock is > default it only requires 1 HW to do this test
@ -603,7 +607,6 @@ static bool modminer_delta_clock(struct thr_info *thr, int delta, bool temp)
int err, amount; int err, amount;
// Only do once if multiple shares per work or multiple reasons // Only do once if multiple shares per work or multiple reasons
// Since the temperature down clock test is first in the code this is OK
if (!state->new_work) if (!state->new_work)
return false; return false;
@ -613,18 +616,17 @@ static bool modminer_delta_clock(struct thr_info *thr, int delta, bool temp)
state->shares_last_hw = 0; state->shares_last_hw = 0;
state->hw_errors = 0; state->hw_errors = 0;
// If drop requested due to temperature, clock drop is always allowed // FYI clock drop has little effect on temp
if (!temp && delta < 0 && modminer->clock <= MODMINER_MIN_CLOCK) if (delta < 0 && modminer->clock <= MODMINER_MIN_CLOCK)
return false; return false;
if (delta > 0 && modminer->clock >= MODMINER_MAX_CLOCK) if (delta > 0 && modminer->clock >= MODMINER_MAX_CLOCK)
return false; return false;
if (delta < 0) { if (delta < 0) {
if (temp) { if (temp)
if (state->shares_to_good > MODMINER_MIN_BACK) state->shares_to_good = MODMINER_MIN_BACK;
state->shares_to_good /= 2; else {
} else {
if ((state->shares_to_good * 2) < MODMINER_TRY_UP) if ((state->shares_to_good * 2) < MODMINER_TRY_UP)
state->shares_to_good *= 2; state->shares_to_good *= 2;
else else
@ -759,7 +761,8 @@ static bool modminer_start_work(struct thr_info *thr)
mutex_lock(modminer->modminer_mutex); mutex_lock(modminer->modminer_mutex);
if ((err = usb_write(modminer, (char *)(state->next_work_cmd), 46, &amount, C_SENDWORK)) < 0 || amount != 46) { if ((err = usb_write(modminer, (char *)(state->next_work_cmd), 46, &amount, C_SENDWORK)) < 0 || amount != 46) {
// TODO: err = -4 means the MMQ disappeared - need to delete it and rescan for it? (after a delay?) // TODO: err = LIBUSB_ERROR_NO_DEVICE means the MMQ disappeared
// - need to delete it and rescan for it? (after a delay?)
// but check all (4) disappeared // but check all (4) disappeared
mutex_unlock(modminer->modminer_mutex); mutex_unlock(modminer->modminer_mutex);
@ -802,8 +805,7 @@ static void check_temperature(struct thr_info *thr)
mutex_lock(modminer->modminer_mutex); mutex_lock(modminer->modminer_mutex);
if (usb_write(modminer, (char *)cmd, 2, &amount, C_REQUESTTEMPERATURE) == 0 && amount == 2 && if (usb_write(modminer, (char *)cmd, 2, &amount, C_REQUESTTEMPERATURE) == 0 && amount == 2 &&
usb_read(modminer, (char *)(&temperature), tbytes, &tamount, C_GETTEMPERATURE) == 0 && tamount == tbytes) usb_read(modminer, (char *)(&temperature), tbytes, &tamount, C_GETTEMPERATURE) == 0 && tamount == tbytes) {
{
mutex_unlock(modminer->modminer_mutex); mutex_unlock(modminer->modminer_mutex);
if (state->one_byte_temp) if (state->one_byte_temp)
modminer->temp = temperature[0]; modminer->temp = temperature[0];
@ -837,7 +839,9 @@ static void check_temperature(struct thr_info *thr)
modminer->api->name, modminer->device_id, modminer->api->name, modminer->device_id,
MODMINER_OVERHEAT_TEMP, modminer->temp); MODMINER_OVERHEAT_TEMP, modminer->temp);
modminer_delta_clock(thr, MODMINER_CLOCK_DOWN, true); // If it's defined to be 0 then don't call modminer_delta_clock()
if (MODMINER_CLOCK_OVERHEAT != 0)
modminer_delta_clock(thr, MODMINER_CLOCK_OVERHEAT, true);
state->overheated = true; state->overheated = true;
dev_error(modminer, REASON_DEV_OVER_HEAT); dev_error(modminer, REASON_DEV_OVER_HEAT);
} }
@ -854,6 +858,11 @@ static void check_temperature(struct thr_info *thr)
#define work_restart(thr) thr->work_restart #define work_restart(thr) thr->work_restart
// 250Mhz is 17.17s - ensure we don't go idle
static const double processtime = 17.0;
// 160Mhz is 26.84 - when overheated ensure we don't throw away shares
static const double overheattime = 26.9;
static uint64_t modminer_process_results(struct thr_info *thr) static uint64_t modminer_process_results(struct thr_info *thr)
{ {
struct cgpu_info *modminer = thr->cgpu; struct cgpu_info *modminer = thr->cgpu;
@ -863,9 +872,9 @@ static uint64_t modminer_process_results(struct thr_info *thr)
char cmd[2]; char cmd[2];
uint32_t nonce; uint32_t nonce;
uint32_t curr_hw_errors; uint32_t curr_hw_errors;
int err, amount; int err, amount, amount2;
int timeoutloop; int timeoutloop;
double processtime; double timeout;
int temploop; int temploop;
// If we are overheated it will just keep checking for results // If we are overheated it will just keep checking for results
@ -876,21 +885,22 @@ static uint64_t modminer_process_results(struct thr_info *thr)
cmd[0] = MODMINER_CHECK_WORK; cmd[0] = MODMINER_CHECK_WORK;
cmd[1] = modminer->fpgaid; cmd[1] = modminer->fpgaid;
// 250Mhz is 17.17s
processtime = 17.0;
timeoutloop = 0; timeoutloop = 0;
temploop = 0; temploop = 0;
while (1) { while (1) {
mutex_lock(modminer->modminer_mutex); mutex_lock(modminer->modminer_mutex);
if ((err = usb_write(modminer, cmd, 2, &amount, C_REQUESTWORKSTATUS)) < 0 || amount != 2) { if ((err = usb_write(modminer, cmd, 2, &amount, C_REQUESTWORKSTATUS)) < 0 || amount != 2) {
// TODO: err = -4 means the MMQ disappeared - need to delete it and rescan for it? (after a delay?) // TODO: err = LIBUSB_ERROR_NO_DEVICE means the MMQ disappeared
// - need to delete it and rescan for it? (after a delay?)
// but check all (4) disappeared // but check all (4) disappeared
mutex_unlock(modminer->modminer_mutex); mutex_unlock(modminer->modminer_mutex);
// timeoutloop never resets so the timeouts can't // timeoutloop never resets so the timeouts can't
// accumulate much during a single item of work // accumulate much during a single item of work
if (err == -7 && ++timeoutloop < 10) if (err == LIBUSB_ERROR_TIMEOUT && ++timeoutloop < 10) {
state->timeout_fail++;
goto tryagain; goto tryagain;
}
applog(LOG_ERR, "%s%u: Error sending (get nonce) (%d:%d)", applog(LOG_ERR, "%s%u: Error sending (get nonce) (%d:%d)",
modminer->api->name, modminer->device_id, amount, err); modminer->api->name, modminer->device_id, amount, err);
@ -899,17 +909,28 @@ static uint64_t modminer_process_results(struct thr_info *thr)
} }
err = usb_read(modminer, (char *)(&nonce), 4, &amount, C_GETWORKSTATUS); err = usb_read(modminer, (char *)(&nonce), 4, &amount, C_GETWORKSTATUS);
mutex_unlock(modminer->modminer_mutex); while (err == LIBUSB_SUCCESS && amount < 4) {
size_t remain = 4 - amount;
char *pos = ((char *)(&nonce)) + amount;
state->success_more++;
if (err < 0 || amount != 4) { err = usb_read(modminer, pos, remain, &amount2, C_GETWORKSTATUS);
amount += amount2;
}
mutex_unlock(modminer->modminer_mutex);
if (err < 0 || amount < 4) {
// timeoutloop never resets so the timeouts can't // timeoutloop never resets so the timeouts can't
// accumulate much during a single item of work // accumulate much during a single item of work
if (err == -7 && ++timeoutloop < 10) if (err == LIBUSB_ERROR_TIMEOUT && ++timeoutloop < 10) {
state->timeout_fail++;
goto tryagain; goto tryagain;
}
applog(LOG_ERR, "%s%u: Error reading (get nonce) (%d:%d)", applog(LOG_ERR, "%s%u: Error reading (get nonce) (%d:%d)",
modminer->api->name, modminer->device_id, amount, err); modminer->api->name, modminer->device_id, amount+amount2, err);
} }
if (memcmp(&nonce, "\xff\xff\xff\xff", 4)) { if (memcmp(&nonce, "\xff\xff\xff\xff", 4)) {
@ -970,16 +991,24 @@ tryagain:
if (work_restart(thr)) if (work_restart(thr))
break; break;
gettimeofday(&now, NULL); if (state->overheated == true) {
if (tdiff(&now, &state->tv_workstart) > processtime) // don't check every time
break; if (++temploop > 30) {
check_temperature(thr);
temploop = 0;
}
// don't check every time
if (state->overheated == true && ++temploop > 30) {
check_temperature(thr);
temploop = 0;
} }
if (state->overheated == true)
timeout = overheattime;
else
timeout = processtime;
gettimeofday(&now, NULL);
if (tdiff(&now, &state->tv_workstart) > timeout)
break;
nmsleep(10); nmsleep(10);
if (work_restart(thr)) if (work_restart(thr))
break; break;
@ -991,6 +1020,7 @@ tryagain:
// Not exact since the clock may have changed ... but close enough I guess // Not exact since the clock may have changed ... but close enough I guess
uint64_t hashes = (uint64_t)modminer->clock * (((uint64_t)elapsed.tv_sec * 1000000) + elapsed.tv_usec); uint64_t hashes = (uint64_t)modminer->clock * (((uint64_t)elapsed.tv_sec * 1000000) + elapsed.tv_usec);
// Overheat will complete the nonce range
if (hashes > 0xffffffff) if (hashes > 0xffffffff)
hashes = 0xffffffff; hashes = 0xffffffff;
else else

2
miner.h

@ -1033,6 +1033,8 @@ struct modminer_fpga_state {
uint32_t shares_last_hw; uint32_t shares_last_hw;
uint32_t hw_errors; uint32_t hw_errors;
uint32_t shares_to_good; uint32_t shares_to_good;
uint32_t timeout_fail;
uint32_t success_more;
struct timeval last_changed; struct timeval last_changed;
struct timeval last_nonce; struct timeval last_nonce;
struct timeval first_work; struct timeval first_work;

Loading…
Cancel
Save