From 8437805f5c52a00dd4eb213c5bcb17546452ef19 Mon Sep 17 00:00:00 2001 From: Kano Date: Thu, 24 Oct 2013 03:44:04 +1100 Subject: [PATCH] klondike - error condition handling --- driver-klondike.c | 484 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 356 insertions(+), 128 deletions(-) diff --git a/driver-klondike.c b/driver-klondike.c index 4d99f431..19965a40 100644 --- a/driver-klondike.c +++ b/driver-klondike.c @@ -33,6 +33,22 @@ #define K16 "K16" #define K64 "K64" +static const char *msg_detect_send = "DSend"; +static const char *msg_detect_reply = "DReply"; +static const char *msg_send = "Send"; +static const char *msg_reply = "Reply"; + +#define KLN_CMD_ABORT 'A' +#define KLN_CMD_CONFIG 'C' +#define KLN_CMD_ENABLE 'E' +#define KLN_CMD_IDENT 'I' +#define KLN_CMD_NONCE '=' +#define KLN_CMD_STATUS 'S' +#define KLN_CMD_WORK 'W' + +#define KLN_CMD_ENABLE_OFF '0' +#define KLN_CMD_ENABLE_ON '1' + #define MIDSTATE_BYTES 32 #define MERKLE_OFFSET 64 #define MERKLE_BYTES 12 @@ -44,6 +60,12 @@ #define MAX_WORK_COUNT 4 // for now, must be binary multiple and match firmware #define TACH_FACTOR 87890 // fan rpm divisor +#define KLN_KILLWORK_TEMP 53.5 +#define KLN_COOLED_DOWN 45.5 + +// If 5 late updates in a row, try to reset the device +#define KLN_LATE_UPDATE_LIMIT 5 + /* * Work older than 5s will already be completed * FYI it must not be possible to complete 256 work @@ -53,10 +75,10 @@ #define OLD_WORK_MS ((int)(5 * 1000)) /* - * If the queue status hasn't been updated for this long - * then do it now + * If the queue status hasn't been updated for this long then do it now + * 5GH/s = 859ms per full nonce range */ -#define LATE_UPDATE_MS ((int)(4 * 1000)) +#define LATE_UPDATE_MS ((int)(2.5 * 1000)) struct device_drv klondike_drv; @@ -85,7 +107,7 @@ typedef struct klondike_header { (_hashclock)[1] = (uint8_t)(((_value) >> 8) & 0xff); \ } while(0) -#define KSENDHD(_add) (sizeof(char) + sizeof(uint8_t) + _add) +#define KSENDHD(_add) (sizeof(uint8_t) + 
sizeof(uint8_t) + _add) typedef struct klondike_id { uint8_t cmd; @@ -147,6 +169,8 @@ typedef struct kline { }; } KLINE; +#define zero_kline(_kline) memset((void *)(_kline), 0, sizeof(KLINE)); + typedef struct device_info { uint32_t noncecount; uint32_t nextworkid; @@ -169,6 +193,9 @@ typedef struct klist { typedef struct jobque { int workqc; struct timeval last_update; + bool overheat; + int late_update_count; + int late_update_sequential; } JOBQUE; struct klondike_info { @@ -360,33 +387,32 @@ static int cvtCToKln(double deg) // Change this to LOG_WARNING if you wish to always see the replies #define READ_DEBUG LOG_DEBUG -//#define READ_DEBUG LOG_ERR -static void display_kline(struct cgpu_info *klncgpu, KLINE *kline) +static void display_kline(struct cgpu_info *klncgpu, KLINE *kline, const char *msg) { char *hexdata; switch (kline->hd.cmd) { - case '=': + case KLN_CMD_NONCE: applog(READ_DEBUG, - "%s (%s) work [%c] dev=%d workid=%d" + "%s%i:%d %s work [%c] dev=%d workid=%d" " nonce=0x%08x", - klncgpu->drv->dname, klncgpu->device_path, - kline->wr.cmd, + klncgpu->drv->name, klncgpu->device_id, + (int)(kline->wr.dev), msg, kline->wr.cmd, (int)(kline->wr.dev), (int)(kline->wr.workid), - (unsigned int)K_NONCE(kline->wr.nonce)); + (unsigned int)K_NONCE(kline->wr.nonce) - 0xC0); break; - case 'S': - case 'W': - case 'A': - case 'E': + case KLN_CMD_STATUS: + case KLN_CMD_WORK: + case KLN_CMD_ENABLE: + case KLN_CMD_ABORT: applog(READ_DEBUG, - "%s (%s) status [%c] dev=%d chips=%d" + "%s%i:%d %s status [%c] dev=%d chips=%d" " slaves=%d workcq=%d workid=%d temp=%d fan=%d" " errors=%d hashes=%d max=%d noise=%d", - klncgpu->drv->dname, klncgpu->device_path, - kline->ws.cmd, + klncgpu->drv->name, klncgpu->device_id, + (int)(kline->ws.dev), msg, kline->ws.cmd, (int)(kline->ws.dev), (int)(kline->ws.chipcount), (int)(kline->ws.slavecount), @@ -399,24 +425,24 @@ static void display_kline(struct cgpu_info *klncgpu, KLINE *kline) K_MAXCOUNT(kline->ws.maxcount), 
(int)(kline->ws.noise)); break; - case 'C': + case KLN_CMD_CONFIG: applog(READ_DEBUG, - "%s (%s) config [%c] dev=%d clock=%d" + "%s%i:%d %s config [%c] dev=%d clock=%d" " temptarget=%d tempcrit=%d fan=%d", - klncgpu->drv->dname, klncgpu->device_path, - kline->cfg.cmd, + klncgpu->drv->name, klncgpu->device_id, + (int)(kline->cfg.dev), msg, kline->cfg.cmd, (int)(kline->cfg.dev), K_HASHCLOCK(kline->cfg.hashclock), (int)(kline->cfg.temptarget), (int)(kline->cfg.tempcritical), (int)(kline->cfg.fantarget)); break; - case 'I': + case KLN_CMD_IDENT: applog(READ_DEBUG, - "%s (%s) info [%c] version=0x%02x prod=%.7s" + "%s%i:%d %s info [%c] version=0x%02x prod=%.7s" " serial=0x%08x", - klncgpu->drv->dname, klncgpu->device_path, - kline->hd.cmd, + klncgpu->drv->name, klncgpu->device_id, + (int)(kline->hd.dev), msg, kline->hd.cmd, (int)(kline->id.version), kline->id.product, (unsigned int)K_SERIAL(kline->id.serial)); @@ -424,40 +450,103 @@ static void display_kline(struct cgpu_info *klncgpu, KLINE *kline) default: hexdata = bin2hex((unsigned char *)&(kline->hd.dev), REPLY_SIZE - 1); applog(LOG_ERR, - "%s (%s) [%c:%s] unknown and ignored", - klncgpu->drv->dname, klncgpu->device_path, - kline->hd.cmd, hexdata); + "%s%i:%d %s [%c:%s] unknown and ignored", + klncgpu->drv->name, klncgpu->device_id, + (int)(kline->hd.dev), msg, kline->hd.cmd, + hexdata); free(hexdata); break; } } -static KLIST *SendCmdGetReply(struct cgpu_info *klncgpu, KLINE *kline, int datalen) +static void display_send_kline(struct cgpu_info *klncgpu, KLINE *kline, const char *msg) +{ + char *hexdata; + + switch (kline->hd.cmd) { + case KLN_CMD_WORK: + applog(READ_DEBUG, + "%s%i:%d %s work [%c] dev=%d workid=0x%02x ...", + klncgpu->drv->name, klncgpu->device_id, + (int)(kline->wt.dev), msg, kline->ws.cmd, + (int)(kline->wt.dev), + (int)(kline->wt.workid)); + break; + case KLN_CMD_CONFIG: + applog(READ_DEBUG, + "%s%i:%d %s config [%c] dev=%d clock=%d" + " temptarget=%d tempcrit=%d fan=%d", + klncgpu->drv->name, 
klncgpu->device_id, + (int)(kline->cfg.dev), msg, kline->cfg.cmd, + (int)(kline->cfg.dev), + K_HASHCLOCK(kline->cfg.hashclock), + (int)(kline->cfg.temptarget), + (int)(kline->cfg.tempcritical), + (int)(kline->cfg.fantarget)); + break; + case KLN_CMD_IDENT: + case KLN_CMD_STATUS: + case KLN_CMD_ABORT: + applog(READ_DEBUG, + "%s%i:%d %s cmd [%c]", + klncgpu->drv->name, klncgpu->device_id, + (int)(kline->hd.dev), msg, kline->hd.cmd); + break; + case KLN_CMD_ENABLE: + applog(READ_DEBUG, + "%s%i:%d %s enable [%c] enable=%c", + klncgpu->drv->name, klncgpu->device_id, + (int)(kline->hd.dev), msg, kline->hd.cmd, + (char)(kline->hd.buf[0])); + break; + case KLN_CMD_NONCE: + default: + hexdata = bin2hex((unsigned char *)&(kline->hd.dev), REPLY_SIZE - 1); + applog(LOG_ERR, + "%s%i:%d %s [%c:%s] unknown/unexpected and ignored", + klncgpu->drv->name, klncgpu->device_id, + (int)(kline->hd.dev), msg, kline->hd.cmd, + hexdata); + free(hexdata); + break; + } +} + +static bool SendCmd(struct cgpu_info *klncgpu, KLINE *kline, int datalen) { - struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data); - KLIST *kitem; - int retries = CMD_REPLY_RETRIES; int err, amt, writ; if (klncgpu->usbinfo.nodev) - return NULL; + return false; + display_send_kline(klncgpu, kline, msg_send); writ = KSENDHD(datalen); err = usb_write(klncgpu, (char *)kline, writ, &amt, C_REQUESTRESULTS); if (err < 0 || amt != writ) { - applog(LOG_ERR, "%s (%s) Cmd:%c Dev:%d, write failed (%d:%d:%d)", - klncgpu->drv->dname, klncgpu->device_path, - kline->hd.cmd, (int)kline->hd.dev, + applog(LOG_ERR, "%s%i:%d Cmd:%c Dev:%d, write failed (%d:%d:%d)", + klncgpu->drv->name, klncgpu->device_id, + (int)(kline->hd.dev), + kline->hd.cmd, (int)(kline->hd.dev), writ, amt, err); + return false; } + return true; +} + +static KLIST *GetReply(struct cgpu_info *klncgpu, uint8_t cmd, uint8_t dev) +{ + struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data); + KLIST *kitem; + int retries = 
CMD_REPLY_RETRIES; + while (retries-- > 0 && klninfo->shutdown == false) { cgsleep_ms(REPLY_WAIT_TIME); cg_rlock(&klninfo->klist_lock); kitem = klninfo->used; while (kitem) { - if (kitem->kline.hd.cmd == kline->hd.cmd && - kitem->kline.hd.dev == kline->hd.dev && + if (kitem->kline.hd.cmd == cmd && + kitem->kline.hd.dev == dev && kitem->ready == true && kitem->working == false) { kitem->working = true; cg_runlock(&klninfo->klist_lock); @@ -470,6 +559,14 @@ static KLIST *SendCmdGetReply(struct cgpu_info *klncgpu, KLINE *kline, int datal return NULL; } +static KLIST *SendCmdGetReply(struct cgpu_info *klncgpu, KLINE *kline, int datalen) +{ + if (!SendCmd(klncgpu, kline, datalen)) + return NULL; + + return GetReply(klncgpu, kline->hd.cmd, kline->hd.dev); +} + static bool klondike_get_stats(struct cgpu_info *klncgpu) { struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data); @@ -480,7 +577,8 @@ static bool klondike_get_stats(struct cgpu_info *klncgpu) if (klncgpu->usbinfo.nodev || klninfo->status == NULL) return false; - applog(LOG_DEBUG, "Klondike getting status"); + applog(LOG_DEBUG, "%s%i: getting status", + klncgpu->drv->name, klncgpu->device_id); rd_lock(&(klninfo->stat_lock)); slaves = klninfo->status[0].kline.ws.slavecount; @@ -488,7 +586,8 @@ static bool klondike_get_stats(struct cgpu_info *klncgpu) // loop thru devices and get status for each for (dev = 0; dev <= slaves; dev++) { - kline.hd.cmd = 'S'; + zero_kline(&kline); + kline.hd.cmd = KLN_CMD_STATUS; kline.hd.dev = dev; kitem = SendCmdGetReply(klncgpu, &kline, 0); if (kitem != NULL) { @@ -498,12 +597,55 @@ static bool klondike_get_stats(struct cgpu_info *klncgpu) sizeof(klninfo->status[dev])); wr_unlock(&(klninfo->stat_lock)); kitem = release_kitem(klncgpu, kitem); + } else { + applog(LOG_ERR, "%s%i:%d failed to update stats", + klncgpu->drv->name, klncgpu->device_id, dev); } } + return true; +} - // todo: detect slavecount change and realloc space +// TODO: this only enables the 
master (no slaves) +static bool kln_enable(struct cgpu_info *klncgpu) +{ + KLIST *kitem; + KLINE kline; + int tries = 2; + bool ok = false; - return true; + zero_kline(&kline); + kline.hd.cmd = KLN_CMD_ENABLE; + kline.hd.dev = 0; + kline.hd.buf[0] = KLN_CMD_ENABLE_ON; + + while (tries-- > 0) { + kitem = SendCmdGetReply(klncgpu, &kline, 1); + if (kitem) { + kitem = release_kitem(klncgpu, kitem); + ok = true; + break; + } + cgsleep_ms(50); + } + + if (ok) + cgsleep_ms(50); + + return ok; +} + +static void kln_disable(struct cgpu_info *klncgpu, int dev, bool all) +{ + KLINE kline; + int i; + + zero_kline(&kline); + kline.hd.cmd = KLN_CMD_ENABLE; + kline.hd.buf[0] = KLN_CMD_ENABLE_OFF; + for (i = (all ? 0 : dev); i <= dev; i++) { + kline.hd.dev = i; + SendCmd(klncgpu, &kline, KSENDHD(1)); + } } static bool klondike_init(struct cgpu_info *klncgpu) @@ -515,7 +657,8 @@ static bool klondike_init(struct cgpu_info *klncgpu) klninfo->initialised = false; - kline.hd.cmd = 'S'; + zero_kline(&kline); + kline.hd.cmd = KLN_CMD_STATUS; kline.hd.dev = 0; kitem = SendCmdGetReply(klncgpu, &kline, 0); if (kitem == NULL) @@ -523,7 +666,8 @@ static bool klondike_init(struct cgpu_info *klncgpu) slaves = kitem->kline.ws.slavecount; if (klninfo->status == NULL) { - applog(LOG_DEBUG, "Klondike initializing data"); + applog(LOG_DEBUG, "%s%i: initializing data", + klncgpu->drv->name, klncgpu->device_id); // alloc space for status, devinfo, cfg and jobque for master and slaves klninfo->status = calloc(slaves+1, sizeof(*(klninfo->status))); @@ -544,8 +688,8 @@ static bool klondike_init(struct cgpu_info *klncgpu) kitem = release_kitem(klncgpu, kitem); // zero init triggers read back only - memset(&(kline.cfg), 0, sizeof(kline.cfg)); - kline.cfg.cmd = 'C'; + zero_kline(&kline); + kline.cfg.cmd = KLN_CMD_CONFIG; int size = 2; @@ -570,7 +714,8 @@ static bool klondike_init(struct cgpu_info *klncgpu) kitem = SendCmdGetReply(klncgpu, &kline, size); if (kitem != NULL) { memcpy((void 
*)&(klninfo->cfg[dev]), kitem, sizeof(klninfo->cfg[dev])); - applog(LOG_WARNING, "Klondike config (%d: Clk: %d, T:%.0lf, C:%.0lf, F:%d)", + applog(LOG_WARNING, "%s%i:%d config (%d: Clk: %d, T:%.0lf, C:%.0lf, F:%d)", + klncgpu->drv->name, klncgpu->device_id, dev, dev, K_HASHCLOCK(klninfo->cfg[dev].kline.cfg.hashclock), cvtKlnToC(klninfo->cfg[dev].kline.cfg.temptarget), cvtKlnToC(klninfo->cfg[dev].kline.cfg.tempcritical), @@ -585,23 +730,7 @@ static bool klondike_init(struct cgpu_info *klncgpu) klninfo->devinfo[dev].chipstats = calloc(klninfo->status[dev].kline.ws.chipcount*2 , sizeof(uint32_t)); } - int tries = 2; - bool ok = false; - - kline.hd.cmd = 'E'; - kline.hd.dev = 0; - kline.hd.buf[0] = '1'; - - while (tries-- > 0) { - kitem = SendCmdGetReply(klncgpu, &kline, 1); - if (kitem) { - kitem = release_kitem(klncgpu, kitem); - ok = true; - break; - } - cgsleep_ms(50); - } - cgsleep_ms(50); + bool ok = kln_enable(klncgpu); if (!ok) applog(LOG_ERR, "%s%i: failed to enable", klncgpu->drv->name, klncgpu->device_id); @@ -628,6 +757,7 @@ static bool klondike_detect_one(struct libusb_device *dev, struct usb_find_devic { struct cgpu_info *klncgpu = usb_alloc_cgpu(&klondike_drv, 1); struct klondike_info *klninfo = NULL; + KLINE kline; if (unlikely(!klncgpu)) quit(1, "Failed to calloc klncgpu in klondike_detect_one"); @@ -647,7 +777,10 @@ static bool klondike_detect_one(struct libusb_device *dev, struct usb_find_devic control_init(klncgpu); while (attempts++ < 3) { - err = usb_write(klncgpu, "I", 2, &sent, C_REQUESTRESULTS); + kline.hd.cmd = KLN_CMD_IDENT; + kline.hd.dev = 0; + display_send_kline(klncgpu, &kline, msg_detect_send); + err = usb_write(klncgpu, (char *)&(kline.hd), 2, &sent, C_REQUESTRESULTS); if (err < 0 || sent != 2) { applog(LOG_ERR, "%s (%s) detect write failed (%d:%d)", klncgpu->drv->dname, @@ -666,8 +799,8 @@ static bool klondike_detect_one(struct libusb_device *dev, struct usb_find_devic klncgpu->drv->dname, klncgpu->device_path, recd); - } else if 
(kitem.kline.hd.cmd == 'I' && kitem.kline.hd.dev == 0) { - display_kline(klncgpu, &kitem.kline); + } else if (kitem.kline.hd.cmd == KLN_CMD_IDENT && kitem.kline.hd.dev == 0) { + display_kline(klncgpu, &kitem.kline, msg_detect_reply); applog(LOG_DEBUG, "%s (%s) detect successful (%d attempt%s)", klncgpu->drv->dname, klncgpu->device_path, @@ -698,7 +831,8 @@ static void klondike_identify(__maybe_unused struct cgpu_info *klncgpu) /* KLINE kline; - kline.hd.cmd = 'I'; + zero_kline(&kline); + kline.hd.cmd = KLN_CMD_IDENT; kline.hd.dev = 0; SendCmdGetReply(klncgpu, &kline, KSENDHD(0)); */ @@ -713,7 +847,8 @@ static void klondike_check_nonce(struct cgpu_info *klncgpu, KLIST *kitem) double us_diff; uint32_t nonce = K_NONCE(kline->wr.nonce) - 0xC0; - applog(LOG_DEBUG, "Klondike FOUND NONCE (%02x:%08x)", + applog(LOG_DEBUG, "%s%i:%d FOUND NONCE (%02x:%08x)", + klncgpu->drv->name, klncgpu->device_id, (int)(kline->wr.dev), kline->wr.workid, (unsigned int)nonce); work = NULL; @@ -734,14 +869,15 @@ static void klondike_check_nonce(struct cgpu_info *klncgpu, KLIST *kitem) klninfo->noncecount++; wr_unlock(&(klninfo->stat_lock)); -// kline->wr.nonce = le32toh(kline->wr.nonce - 0xC0); - applog(LOG_DEBUG, "Klondike SUBMIT NONCE (%02x:%08x)", + applog(LOG_DEBUG, "%s%i:%d SUBMIT NONCE (%02x:%08x)", + klncgpu->drv->name, klncgpu->device_id, (int)(kline->wr.dev), kline->wr.workid, (unsigned int)nonce); cgtime(&tv_now); bool ok = submit_nonce(klncgpu->thr[0], work, nonce); - applog(LOG_DEBUG, "Klondike chip stats %d, %08x, %d, %d", + applog(LOG_DEBUG, "%s%i:%d chip stats %d, %08x, %d, %d", + klncgpu->drv->name, klncgpu->device_id, (int)(kline->wr.dev), kline->wr.dev, (unsigned int)nonce, klninfo->devinfo[kline->wr.dev].rangesize, klninfo->status[kline->wr.dev].kline.ws.chipcount); @@ -783,8 +919,8 @@ static void klondike_check_nonce(struct cgpu_info *klncgpu, KLIST *kitem) } applog(LOG_ERR, "%s%i:%d unknown work (%02x:%08x) - ignored", - klncgpu->drv->name, klncgpu->device_id, - 
kline->wr.dev, kline->wr.workid, (unsigned int)nonce); + klncgpu->drv->name, klncgpu->device_id, (int)(kline->wr.dev), + kline->wr.workid, (unsigned int)nonce); //inc_hw_errors(klncgpu->thr[0]); } @@ -796,9 +932,11 @@ static void *klondike_get_replies(void *userdata) struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data); KLIST *kitem = NULL; char *hexdata; - int err, recd, slaves; + int err, recd, slaves, dev; + bool overheat; - applog(LOG_DEBUG, "Klondike listening for replies"); + applog(LOG_DEBUG, "%s%i: listening for replies", + klncgpu->drv->name, klncgpu->device_id); while (klninfo->shutdown == false) { if (klncgpu->usbinfo.nodev) @@ -810,22 +948,30 @@ static void *klondike_get_replies(void *userdata) memset((void *)&(kitem->kline), 0, sizeof(kitem->kline)); err = usb_read(klncgpu, (char *)&(kitem->kline), REPLY_SIZE, &recd, C_GETRESULTS); + if (err || recd != REPLY_SIZE) { + if (err != -7) + applog(LOG_ERR, "%s%i: reply err=%d amt=%d", + klncgpu->drv->name, klncgpu->device_id, + err, recd); + } if (!err && recd == REPLY_SIZE) { cgtime(&(kitem->tv_when)); kitem->block_seq = klninfo->block_seq; if (opt_log_level <= READ_DEBUG) { hexdata = bin2hex((unsigned char *)&(kitem->kline.hd.dev), recd-1); - applog(READ_DEBUG, "%s (%s) reply [%c:%s]", - klncgpu->drv->dname, klncgpu->device_path, - kitem->kline.hd.cmd, hexdata); + applog(READ_DEBUG, "%s%i:%d reply [%c:%s]", + klncgpu->drv->name, klncgpu->device_id, + (int)(kitem->kline.hd.dev), + kitem->kline.hd.cmd, hexdata); free(hexdata); } // We can't check this until it's initialised if (klninfo->initialised) { - rd_lock(&(klninfo->stat_lock)); + wr_lock(&(klninfo->stat_lock)); slaves = klninfo->status[0].kline.ws.slavecount; - rd_unlock(&(klninfo->stat_lock)); + if (kitem->kline.hd.dev <= slaves) klninfo->jobque[kitem->kline.hd.dev].late_update_sequential = 0; // index by the reply's own device: local 'dev' is not assigned yet at this point + wr_unlock(&(klninfo->stat_lock)); if (kitem->kline.hd.dev > slaves) { applog(LOG_ERR, "%s%i: reply [%c] has invalid dev=%d (max=%d) using 0", @@ -838,53 +984,83 @@ static void 
*klondike_get_replies(void *userdata) } switch (kitem->kline.hd.cmd) { - case '=': + case KLN_CMD_NONCE: klondike_check_nonce(klncgpu, kitem); - display_kline(klncgpu, &kitem->kline); + display_kline(klncgpu, &kitem->kline, msg_reply); break; - case 'S': - case 'W': - case 'A': + case KLN_CMD_STATUS: + case KLN_CMD_WORK: + case KLN_CMD_ABORT: // We can't do/check this until it's initialised if (klninfo->initialised) { + dev = kitem->kline.ws.dev; wr_lock(&(klninfo->stat_lock)); - klninfo->jobque[kitem->kline.ws.dev].workqc = - (int)(kitem->kline.ws.workqc); - cgtime(&(klninfo->jobque[kitem->kline.ws.dev].last_update)); + klninfo->jobque[dev].workqc = (int)(kitem->kline.ws.workqc); + cgtime(&(klninfo->jobque[dev].last_update)); slaves = klninfo->status[0].kline.ws.slavecount; + overheat = klninfo->jobque[dev].overheat; wr_unlock(&(klninfo->stat_lock)); if (kitem->kline.ws.slavecount != slaves) { - applog(LOG_ERR, "%s%i: reply [%c] has a diff # of slaves=%d (curr=%d) dropping device to hotplug", + applog(LOG_ERR, "%s%i:%d reply [%c] has a diff # of slaves=%d" + " (curr=%d) dropping device to hotplug", klncgpu->drv->name, klncgpu->device_id, - (char)(kitem->kline.ws.cmd), + dev, (char)(kitem->kline.ws.cmd), (int)(kitem->kline.ws.slavecount), slaves); klninfo->shutdown = true; break; } + + if (!overheat) { + double temp = cvtKlnToC(kitem->kline.ws.temp); + if (temp >= KLN_KILLWORK_TEMP) { + KLINE kline; + + wr_lock(&(klninfo->stat_lock)); + klninfo->jobque[dev].overheat = true; + wr_unlock(&(klninfo->stat_lock)); + + applog(LOG_ERR, "%s%i:%d Critical overheat (%.0fC)", + klncgpu->drv->name, klncgpu->device_id, + dev, temp); + + zero_kline(&kline); + kline.hd.cmd = KLN_CMD_ABORT; + kline.hd.dev = dev; + if (!SendCmd(klncgpu, &kline, KSENDHD(0))) { + applog(LOG_ERR, "%s%i:%d failed to abort work" + " - dropping device to hotplug", + klncgpu->drv->name, + klncgpu->device_id, + dev); + klninfo->shutdown = true; + } + kln_disable(klncgpu, dev, false); + } + } } - case 'E': 
+ case KLN_CMD_ENABLE: wr_lock(&(klninfo->stat_lock)); klninfo->errorcount += kitem->kline.ws.errorcount; klninfo->noisecount += kitem->kline.ws.noise; wr_unlock(&(klninfo->stat_lock)); - display_kline(klncgpu, &kitem->kline); + display_kline(klncgpu, &kitem->kline, msg_reply); kitem->ready = true; kitem = NULL; break; - case 'C': - display_kline(klncgpu, &kitem->kline); + case KLN_CMD_CONFIG: + display_kline(klncgpu, &kitem->kline, msg_reply); kitem->ready = true; kitem = NULL; break; - case 'I': - display_kline(klncgpu, &kitem->kline); + case KLN_CMD_IDENT: + display_kline(klncgpu, &kitem->kline, msg_reply); kitem->ready = true; kitem = NULL; break; default: - display_kline(klncgpu, &kitem->kline); + display_kline(klncgpu, &kitem->kline, msg_reply); break; } } @@ -901,11 +1077,13 @@ static void klondike_flush_work(struct cgpu_info *klncgpu) klninfo->block_seq++; - applog(LOG_DEBUG, "Klondike flushing work"); + applog(LOG_DEBUG, "%s%i: flushing work", + klncgpu->drv->name, klncgpu->device_id); rd_lock(&(klninfo->stat_lock)); slaves = klninfo->status[0].kline.ws.slavecount; rd_unlock(&(klninfo->stat_lock)); - kline.hd.cmd = 'A'; + zero_kline(&kline); + kline.hd.cmd = KLN_CMD_ABORT; for (dev = 0; dev <= slaves; dev++) { kline.hd.dev = dev; kitem = SendCmdGetReply(klncgpu, &kline, KSENDHD(0)); @@ -953,19 +1131,12 @@ static void klondike_shutdown(struct thr_info *thr) { struct cgpu_info *klncgpu = thr->cgpu; struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data); - KLIST *kitem; - KLINE kline; - int dev; - applog(LOG_DEBUG, "Klondike shutting down work"); - kline.hd.cmd = 'E'; - for (dev = 0; dev <= klninfo->status[0].kline.ws.slavecount; dev++) { - kline.hd.dev = dev; - kline.hd.buf[0] = '0'; - kitem = SendCmdGetReply(klncgpu, &kline, KSENDHD(1)); - if (kitem) - kitem = release_kitem(klncgpu, kitem); - } + applog(LOG_DEBUG, "%s%i: shutting down work", + klncgpu->drv->name, klncgpu->device_id); + + kln_disable(klncgpu, 
klninfo->status[0].kline.ws.slavecount, true); + klncgpu->shutdown = klninfo->shutdown = true; } @@ -979,9 +1150,10 @@ static void klondike_thread_enable(struct thr_info *thr) /* KLINE kline; - kline.hd.cmd = 'E'; + zero_kline(&kline); + kline.hd.cmd = KLN_CMD_ENABLE; kline.hd.dev = dev; - kline.hd.buf[0] = '0'; + kline.hd.buf[0] = KLN_CMD_ENABLE_OFF; kitem = SendCmdGetReply(klncgpu, &kline, KSENDHD(1)); */ @@ -998,7 +1170,8 @@ static bool klondike_send_work(struct cgpu_info *klncgpu, int dev, struct work * if (klncgpu->usbinfo.nodev) return false; - kline.wt.cmd = 'W'; + zero_kline(&kline); + kline.wt.cmd = KLN_CMD_WORK; kline.wt.dev = dev; memcpy(kline.wt.midstate, work->midstate, MIDSTATE_BYTES); memcpy(kline.wt.merkle, work->data + MERKLE_OFFSET, MERKLE_BYTES); @@ -1012,7 +1185,9 @@ static bool klondike_send_work(struct cgpu_info *klncgpu, int dev, struct work * free(hexdata); } - applog(LOG_DEBUG, "Klondike sending work (%d:%02x)", dev, kline.wt.workid); + applog(LOG_DEBUG, "%s%i:%d sending work (%d:%02x)", + klncgpu->drv->name, klncgpu->device_id, dev, + dev, kline.wt.workid); KLIST *kitem = SendCmdGetReply(klncgpu, &kline, sizeof(kline.wt)); if (kitem != NULL) { wr_lock(&(klninfo->stat_lock)); @@ -1029,6 +1204,7 @@ static bool klondike_send_work(struct cgpu_info *klncgpu, int dev, struct work * if (ms_tdiff(&tv_old, &(look->tv_stamp)) > OLD_WORK_MS) { __work_completed(klncgpu, look); free_work(look); + wque_cleared++; } else wque_size++; } @@ -1047,40 +1223,88 @@ static bool klondike_queue_full(struct cgpu_info *klncgpu) { struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data); struct work *work = NULL; - int dev, queued, slaves; + int dev, queued, slaves, seq; struct timeval now; - + bool nowork; cgtime(&now); rd_lock(&(klninfo->stat_lock)); slaves = klninfo->status[0].kline.ws.slavecount; for (dev = 0; dev <= slaves; dev++) if (ms_tdiff(&now, &(klninfo->jobque[dev].last_update)) > LATE_UPDATE_MS) { + 
klninfo->jobque[dev].late_update_count++; + seq = ++klninfo->jobque[dev].late_update_sequential; rd_unlock(&(klninfo->stat_lock)); - applog(LOG_ERR, "%s%i: late update", - klncgpu->drv->name, klncgpu->device_id); - klondike_get_stats(klncgpu); - goto que; + if (seq < KLN_LATE_UPDATE_LIMIT) { + applog(LOG_ERR, "%s%i:%d late update", + klncgpu->drv->name, klncgpu->device_id, dev); + klondike_get_stats(klncgpu); + goto que; + } else { + applog(LOG_ERR, "%s%i:%d late update (%d) reached - attempting reset", + klncgpu->drv->name, klncgpu->device_id, + dev, KLN_LATE_UPDATE_LIMIT); + control_init(klncgpu); + kln_enable(klncgpu); + klondike_get_stats(klncgpu); + rd_lock(&(klninfo->stat_lock)); + if (ms_tdiff(&now, &(klninfo->jobque[dev].last_update)) > LATE_UPDATE_MS) { + rd_unlock(&(klninfo->stat_lock)); + applog(LOG_ERR, "%s%i:%d reset failed - dropping device", + klncgpu->drv->name, klncgpu->device_id, dev); + klninfo->shutdown = true; + return false; + } + break; + } } rd_unlock(&(klninfo->stat_lock)); que: + nowork = true; for (queued = 0; queued < MAX_WORK_COUNT-1; queued++) for (dev = 0; dev <= slaves; dev++) { +tryagain: rd_lock(&(klninfo->stat_lock)); + if (klninfo->jobque[dev].overheat) { + double temp = cvtKlnToC(klninfo->status[0].kline.ws.temp); + if ((queued == MAX_WORK_COUNT-2) && + ms_tdiff(&now, &(klninfo->jobque[dev].last_update)) > (LATE_UPDATE_MS/2)) { + rd_unlock(&(klninfo->stat_lock)); + klondike_get_stats(klncgpu); + goto tryagain; + } + if (temp <= KLN_COOLED_DOWN) { + klninfo->jobque[dev].overheat = false; + rd_unlock(&(klninfo->stat_lock)); + applog(LOG_ERR, "%s%i:%d Overheat recovered (%.0fC)", + klncgpu->drv->name, klncgpu->device_id, + dev, temp); + kln_enable(klncgpu); + goto tryagain; + } else { + rd_unlock(&(klninfo->stat_lock)); + continue; + } + } + if (klninfo->jobque[dev].workqc <= queued) { rd_unlock(&(klninfo->stat_lock)); if (!work) work = get_queued(klncgpu); if (unlikely(!work)) return false; + nowork = false; if 
(klondike_send_work(klncgpu, dev, work)) return false; } else rd_unlock(&(klninfo->stat_lock)); } + if (nowork) + cgsleep_ms(10); // avoid a hard loop in case we have nothing to do + return true; } @@ -1104,14 +1328,13 @@ static int64_t klondike_scanwork(struct thr_info *thr) hashcount = K_HASHCOUNT(klninfo->status[dev].kline.ws.hashcount); maxcount = K_MAXCOUNT(klninfo->status[dev].kline.ws.maxcount); - if (klninfo->devinfo[dev].lasthashcount > hashcount) // todo: chg this to check workid for wrapped instead + // todo: chg this to check workid for wrapped instead + if (klninfo->devinfo[dev].lasthashcount > hashcount) newhashdev += maxcount; // hash counter wrapped newhashdev += hashcount - klninfo->devinfo[dev].lasthashcount; klninfo->devinfo[dev].lasthashcount = hashcount; if (maxcount != 0) klninfo->hashcount += (newhashdev << 32) / maxcount; - - // todo: check stats for critical conditions } newhashcount += 0xffffffffull * (uint64_t)klninfo->noncecount; klninfo->noncecount = 0; @@ -1143,15 +1366,20 @@ static void get_klondike_statline_before(char *buf, size_t siz, struct cgpu_info fan += klninfo->cfg[dev].kline.cfg.fantarget; clock += (uint16_t)K_HASHCLOCK(klninfo->cfg[dev].kline.cfg.hashclock); } + rd_unlock(&(klninfo->stat_lock)); fan /= slaves + 1; + fan = fan * 100 / 255; // multiply before dividing: integer 100/255 is 0 and would zero the percentage + if (fan > 99) // short on screen space + fan = 99; clock /= slaves + 1; - rd_unlock(&(klninfo->stat_lock)); + if (clock > 999) // error - so truncate it + clock = 999; snprintf(tmp, sizeof(tmp), "%2.0fC", cvtKlnToC(temp)); if (strlen(tmp) < 4) strcat(tmp, " "); - tailsprintf(buf, siz, "%3dMHz %3d%% %s| ", (int)clock, fan*100/255, tmp); + tailsprintf(buf, siz, "%3dMHz %2d%% %s| ", (int)clock, fan, tmp); } static struct api_data *klondike_api_stats(struct cgpu_info *klncgpu)