Browse Source

klondike - downgrade 'late update' but add an idle detect - and correct error levels

nfactor-troky
Kano 11 years ago
parent
commit
b4eb870af0
  1. 54
      driver-klondike.c

54
driver-klondike.c

@ -208,6 +208,7 @@ typedef struct jobque {
int workqc; int workqc;
struct timeval last_update; struct timeval last_update;
bool overheat; bool overheat;
bool flushed;
int late_update_count; int late_update_count;
int late_update_sequential; int late_update_sequential;
} JOBQUE; } JOBQUE;
@ -315,7 +316,7 @@ static KLIST *allocate_kitem(struct cgpu_info *klncgpu)
cg_wunlock(&klninfo->klist_lock); cg_wunlock(&klninfo->klist_lock);
if (ran_out > 0) if (ran_out > 0)
applog(LOG_ERR, "%s", errbuf); applog(LOG_WARNING, "%s", errbuf);
return kitem; return kitem;
} }
@ -985,7 +986,9 @@ static void *klondike_get_replies(void *userdata)
} }
if (!err && recd == REPLY_SIZE) { if (!err && recd == REPLY_SIZE) {
cgtime(&(kitem->tv_when)); cgtime(&(kitem->tv_when));
rd_lock(&(klninfo->stat_lock));
kitem->block_seq = klninfo->block_seq; kitem->block_seq = klninfo->block_seq;
rd_unlock(&(klninfo->stat_lock));
if (opt_log_level <= READ_DEBUG) { if (opt_log_level <= READ_DEBUG) {
hexdata = bin2hex((unsigned char *)&(kitem->kline.hd.dev), recd-1); hexdata = bin2hex((unsigned char *)&(kitem->kline.hd.dev), recd-1);
applog(READ_DEBUG, "%s%i:%d reply [%c:%s]", applog(READ_DEBUG, "%s%i:%d reply [%c:%s]",
@ -1022,8 +1025,27 @@ static void *klondike_get_replies(void *userdata)
klondike_check_nonce(klncgpu, kitem); klondike_check_nonce(klncgpu, kitem);
display_kline(klncgpu, &kitem->kline, msg_reply); display_kline(klncgpu, &kitem->kline, msg_reply);
break; break;
case KLN_CMD_STATUS:
case KLN_CMD_WORK: case KLN_CMD_WORK:
// We can't do/check this until it's initialised
if (klninfo->initialised) {
if (kitem->kline.ws.workqc == 0) {
bool idle = false;
rd_lock(&(klninfo->stat_lock));
if (klninfo->jobque[dev].flushed == false)
idle = true;
slaves = klninfo->status[0].kline.ws.slavecount;
rd_unlock(&(klninfo->stat_lock));
if (idle)
applog(LOG_WARNING, "%s%i:%d went idle before work was sent",
klncgpu->drv->name,
klncgpu->device_id,
dev);
}
wr_lock(&(klninfo->stat_lock));
klninfo->jobque[dev].flushed = false;
wr_unlock(&(klninfo->stat_lock));
}
case KLN_CMD_STATUS:
case KLN_CMD_ABORT: case KLN_CMD_ABORT:
// We can't do/check this until it's initialised // We can't do/check this until it's initialised
if (klninfo->initialised) { if (klninfo->initialised) {
@ -1067,9 +1089,10 @@ static void *klondike_get_replies(void *userdata)
klninfo->jobque[dev].overheat = true; klninfo->jobque[dev].overheat = true;
wr_unlock(&(klninfo->stat_lock)); wr_unlock(&(klninfo->stat_lock));
applog(LOG_ERR, "%s%i:%d Critical overheat (%.0fC)", applog(LOG_WARNING, "%s%i:%d Critical overheat (%.0fC)",
klncgpu->drv->name, klncgpu->device_id, klncgpu->drv->name,
dev, temp); klncgpu->device_id,
dev, temp);
zero_kline(&kline); zero_kline(&kline);
kline.hd.cmd = KLN_CMD_ABORT; kline.hd.cmd = KLN_CMD_ABORT;
@ -1122,13 +1145,13 @@ static void klondike_flush_work(struct cgpu_info *klncgpu)
KLINE kline; KLINE kline;
int slaves, dev; int slaves, dev;
wr_lock(&(klninfo->stat_lock));
klninfo->block_seq++; klninfo->block_seq++;
slaves = klninfo->status[0].kline.ws.slavecount;
wr_unlock(&(klninfo->stat_lock));
applog(LOG_DEBUG, "%s%i: flushing work", applog(LOG_DEBUG, "%s%i: flushing work",
klncgpu->drv->name, klncgpu->device_id); klncgpu->drv->name, klncgpu->device_id);
rd_lock(&(klninfo->stat_lock));
slaves = klninfo->status[0].kline.ws.slavecount;
rd_unlock(&(klninfo->stat_lock));
zero_kline(&kline); zero_kline(&kline);
kline.hd.cmd = KLN_CMD_ABORT; kline.hd.cmd = KLN_CMD_ABORT;
for (dev = 0; dev <= slaves; dev++) { for (dev = 0; dev <= slaves; dev++) {
@ -1139,6 +1162,7 @@ static void klondike_flush_work(struct cgpu_info *klncgpu)
memcpy((void *)&(klninfo->status[dev]), memcpy((void *)&(klninfo->status[dev]),
kitem, kitem,
sizeof(klninfo->status[dev])); sizeof(klninfo->status[dev]));
klninfo->jobque[dev].flushed = true;
wr_unlock(&(klninfo->stat_lock)); wr_unlock(&(klninfo->stat_lock));
kitem = release_kitem(klncgpu, kitem); kitem = release_kitem(klncgpu, kitem);
} }
@ -1286,14 +1310,14 @@ static bool klondike_queue_full(struct cgpu_info *klncgpu)
seq = ++klninfo->jobque[dev].late_update_sequential; seq = ++klninfo->jobque[dev].late_update_sequential;
rd_unlock(&(klninfo->stat_lock)); rd_unlock(&(klninfo->stat_lock));
if (seq < LATE_UPDATE_LIMIT) { if (seq < LATE_UPDATE_LIMIT) {
applog(LOG_ERR, "%s%i:%d late update", applog(LOG_DEBUG, "%s%i:%d late update",
klncgpu->drv->name, klncgpu->device_id, dev); klncgpu->drv->name, klncgpu->device_id, dev);
klondike_get_stats(klncgpu); klondike_get_stats(klncgpu);
goto que; goto que;
} else { } else {
applog(LOG_ERR, "%s%i:%d late update (%d) reached - attempting reset", applog(LOG_WARNING, "%s%i:%d late update (%d) reached - attempting reset",
klncgpu->drv->name, klncgpu->device_id, klncgpu->drv->name, klncgpu->device_id,
dev, LATE_UPDATE_LIMIT); dev, LATE_UPDATE_LIMIT);
control_init(klncgpu); control_init(klncgpu);
kln_enable(klncgpu); kln_enable(klncgpu);
klondike_get_stats(klncgpu); klondike_get_stats(klncgpu);
@ -1333,9 +1357,9 @@ tryagain:
if (temp <= KLN_COOLED_DOWN) { if (temp <= KLN_COOLED_DOWN) {
klninfo->jobque[dev].overheat = false; klninfo->jobque[dev].overheat = false;
rd_unlock(&(klninfo->stat_lock)); rd_unlock(&(klninfo->stat_lock));
applog(LOG_ERR, "%s%i:%d Overheat recovered (%.0fC)", applog(LOG_WARNING, "%s%i:%d Overheat recovered (%.0fC)",
klncgpu->drv->name, klncgpu->device_id, klncgpu->drv->name, klncgpu->device_id,
dev, temp); dev, temp);
kln_enable(klncgpu); kln_enable(klncgpu);
goto tryagain; goto tryagain;
} else { } else {

Loading…
Cancel
Save