Browse Source

Merge pull request #511 from kanoi/master

klondike - error condition handling
nfactor-troky
kanoi 11 years ago
parent
commit
3c2647d4b5
  1. 478
      driver-klondike.c

478
driver-klondike.c

@ -33,6 +33,22 @@
#define K16 "K16" #define K16 "K16"
#define K64 "K64" #define K64 "K64"
/*
 * Direction tags handed to display_send_kline()/display_kline() as the
 * msg argument so each logged packet says whether it was sent or received.
 * The "D" variants are the ones used by klondike_detect_one().
 */
static const char *msg_detect_send = "DSend";
static const char *msg_detect_reply = "DReply";
static const char *msg_send = "Send";
static const char *msg_reply = "Reply";
/*
 * Command bytes stored in the cmd field of a KLINE header (kline.hd.cmd).
 * The same byte is compared against the cmd field of replies.
 */
#define KLN_CMD_ABORT 'A'
#define KLN_CMD_CONFIG 'C'
#define KLN_CMD_ENABLE 'E'
#define KLN_CMD_IDENT 'I'
// Replies with this cmd carry a nonce and are passed to klondike_check_nonce()
#define KLN_CMD_NONCE '='
#define KLN_CMD_STATUS 'S'
#define KLN_CMD_WORK 'W'
// Argument byte placed in hd.buf[0] of a KLN_CMD_ENABLE packet
#define KLN_CMD_ENABLE_OFF '0'
#define KLN_CMD_ENABLE_ON '1'
#define MIDSTATE_BYTES 32 #define MIDSTATE_BYTES 32
#define MERKLE_OFFSET 64 #define MERKLE_OFFSET 64
#define MERKLE_BYTES 12 #define MERKLE_BYTES 12
@ -44,6 +60,12 @@
#define MAX_WORK_COUNT 4 // for now, must be binary multiple and match firmware #define MAX_WORK_COUNT 4 // for now, must be binary multiple and match firmware
#define TACH_FACTOR 87890 // fan rpm divisor #define TACH_FACTOR 87890 // fan rpm divisor
// A status reply whose temperature (after cvtKlnToC) is at or above this
// marks the device as overheated: its work is aborted and it is disabled
#define KLN_KILLWORK_TEMP 53.5
// An overheated device is enabled again once the temperature checked by
// klondike_queue_full() is at or below this
#define KLN_COOLED_DOWN 45.5
// If 5 late updates in a row, try to reset the device
#define KLN_LATE_UPDATE_LIMIT 5
/* /*
* Work older than 5s will already be completed * Work older than 5s will already be completed
* FYI it must not be possible to complete 256 work * FYI it must not be possible to complete 256 work
@ -53,10 +75,10 @@
#define OLD_WORK_MS ((int)(5 * 1000)) #define OLD_WORK_MS ((int)(5 * 1000))
/* /*
* If the queue status hasn't been updated for this long * If the queue status hasn't been updated for this long then do it now
* then do it now * 5GH/s = 859ms per full nonce range
*/ */
#define LATE_UPDATE_MS ((int)(4 * 1000)) #define LATE_UPDATE_MS ((int)(2.5 * 1000))
struct device_drv klondike_drv; struct device_drv klondike_drv;
@ -85,7 +107,7 @@ typedef struct klondike_header {
(_hashclock)[1] = (uint8_t)(((_value) >> 8) & 0xff); \ (_hashclock)[1] = (uint8_t)(((_value) >> 8) & 0xff); \
} while(0) } while(0)
#define KSENDHD(_add) (sizeof(char) + sizeof(uint8_t) + _add) #define KSENDHD(_add) (sizeof(uint8_t) + sizeof(uint8_t) + _add)
typedef struct klondike_id { typedef struct klondike_id {
uint8_t cmd; uint8_t cmd;
@ -147,6 +169,8 @@ typedef struct kline {
}; };
} KLINE; } KLINE;
/*
 * Clear every byte of the KLINE that _kline points to.
 * The expansion deliberately has no trailing semicolon: the caller's own
 * `zero_kline(&k);` then forms exactly one statement, so the macro is safe
 * as the body of an unbraced if/else.
 */
#define zero_kline(_kline) memset((void *)(_kline), 0, sizeof(KLINE))
typedef struct device_info { typedef struct device_info {
uint32_t noncecount; uint32_t noncecount;
uint32_t nextworkid; uint32_t nextworkid;
@ -169,6 +193,9 @@ typedef struct klist {
typedef struct jobque { typedef struct jobque {
int workqc; int workqc;
struct timeval last_update; struct timeval last_update;
bool overheat;
int late_update_count;
int late_update_sequential;
} JOBQUE; } JOBQUE;
struct klondike_info { struct klondike_info {
@ -360,33 +387,32 @@ static int cvtCToKln(double deg)
// Change this to LOG_WARNING if you wish to always see the replies // Change this to LOG_WARNING if you wish to always see the replies
#define READ_DEBUG LOG_DEBUG #define READ_DEBUG LOG_DEBUG
//#define READ_DEBUG LOG_ERR
static void display_kline(struct cgpu_info *klncgpu, KLINE *kline) static void display_kline(struct cgpu_info *klncgpu, KLINE *kline, const char *msg)
{ {
char *hexdata; char *hexdata;
switch (kline->hd.cmd) { switch (kline->hd.cmd) {
case '=': case KLN_CMD_NONCE:
applog(READ_DEBUG, applog(READ_DEBUG,
"%s (%s) work [%c] dev=%d workid=%d" "%s%i:%d %s work [%c] dev=%d workid=%d"
" nonce=0x%08x", " nonce=0x%08x",
klncgpu->drv->dname, klncgpu->device_path, klncgpu->drv->name, klncgpu->device_id,
kline->wr.cmd, (int)(kline->wr.dev), msg, kline->wr.cmd,
(int)(kline->wr.dev), (int)(kline->wr.dev),
(int)(kline->wr.workid), (int)(kline->wr.workid),
(unsigned int)K_NONCE(kline->wr.nonce)); (unsigned int)K_NONCE(kline->wr.nonce) - 0xC0);
break; break;
case 'S': case KLN_CMD_STATUS:
case 'W': case KLN_CMD_WORK:
case 'A': case KLN_CMD_ENABLE:
case 'E': case KLN_CMD_ABORT:
applog(READ_DEBUG, applog(READ_DEBUG,
"%s (%s) status [%c] dev=%d chips=%d" "%s%i:%d %s status [%c] dev=%d chips=%d"
" slaves=%d workcq=%d workid=%d temp=%d fan=%d" " slaves=%d workcq=%d workid=%d temp=%d fan=%d"
" errors=%d hashes=%d max=%d noise=%d", " errors=%d hashes=%d max=%d noise=%d",
klncgpu->drv->dname, klncgpu->device_path, klncgpu->drv->name, klncgpu->device_id,
kline->ws.cmd, (int)(kline->ws.dev), msg, kline->ws.cmd,
(int)(kline->ws.dev), (int)(kline->ws.dev),
(int)(kline->ws.chipcount), (int)(kline->ws.chipcount),
(int)(kline->ws.slavecount), (int)(kline->ws.slavecount),
@ -399,24 +425,24 @@ static void display_kline(struct cgpu_info *klncgpu, KLINE *kline)
K_MAXCOUNT(kline->ws.maxcount), K_MAXCOUNT(kline->ws.maxcount),
(int)(kline->ws.noise)); (int)(kline->ws.noise));
break; break;
case 'C': case KLN_CMD_CONFIG:
applog(READ_DEBUG, applog(READ_DEBUG,
"%s (%s) config [%c] dev=%d clock=%d" "%s%i:%d %s config [%c] dev=%d clock=%d"
" temptarget=%d tempcrit=%d fan=%d", " temptarget=%d tempcrit=%d fan=%d",
klncgpu->drv->dname, klncgpu->device_path, klncgpu->drv->name, klncgpu->device_id,
kline->cfg.cmd, (int)(kline->cfg.dev), msg, kline->cfg.cmd,
(int)(kline->cfg.dev), (int)(kline->cfg.dev),
K_HASHCLOCK(kline->cfg.hashclock), K_HASHCLOCK(kline->cfg.hashclock),
(int)(kline->cfg.temptarget), (int)(kline->cfg.temptarget),
(int)(kline->cfg.tempcritical), (int)(kline->cfg.tempcritical),
(int)(kline->cfg.fantarget)); (int)(kline->cfg.fantarget));
break; break;
case 'I': case KLN_CMD_IDENT:
applog(READ_DEBUG, applog(READ_DEBUG,
"%s (%s) info [%c] version=0x%02x prod=%.7s" "%s%i:%d %s info [%c] version=0x%02x prod=%.7s"
" serial=0x%08x", " serial=0x%08x",
klncgpu->drv->dname, klncgpu->device_path, klncgpu->drv->name, klncgpu->device_id,
kline->hd.cmd, (int)(kline->hd.dev), msg, kline->hd.cmd,
(int)(kline->id.version), (int)(kline->id.version),
kline->id.product, kline->id.product,
(unsigned int)K_SERIAL(kline->id.serial)); (unsigned int)K_SERIAL(kline->id.serial));
@ -424,40 +450,103 @@ static void display_kline(struct cgpu_info *klncgpu, KLINE *kline)
default: default:
hexdata = bin2hex((unsigned char *)&(kline->hd.dev), REPLY_SIZE - 1); hexdata = bin2hex((unsigned char *)&(kline->hd.dev), REPLY_SIZE - 1);
applog(LOG_ERR, applog(LOG_ERR,
"%s (%s) [%c:%s] unknown and ignored", "%s%i:%d %s [%c:%s] unknown and ignored",
klncgpu->drv->dname, klncgpu->device_path, klncgpu->drv->name, klncgpu->device_id,
kline->hd.cmd, hexdata); (int)(kline->hd.dev), msg, kline->hd.cmd,
hexdata);
free(hexdata); free(hexdata);
break; break;
} }
} }
static KLIST *SendCmdGetReply(struct cgpu_info *klncgpu, KLINE *kline, int datalen) static void display_send_kline(struct cgpu_info *klncgpu, KLINE *kline, const char *msg)
{
char *hexdata;
switch (kline->hd.cmd) {
case KLN_CMD_WORK:
applog(READ_DEBUG,
"%s%i:%d %s work [%c] dev=%d workid=0x%02x ...",
klncgpu->drv->name, klncgpu->device_id,
(int)(kline->wt.dev), msg, kline->ws.cmd,
(int)(kline->wt.dev),
(int)(kline->wt.workid));
break;
case KLN_CMD_CONFIG:
applog(READ_DEBUG,
"%s%i:%d %s config [%c] dev=%d clock=%d"
" temptarget=%d tempcrit=%d fan=%d",
klncgpu->drv->name, klncgpu->device_id,
(int)(kline->cfg.dev), msg, kline->cfg.cmd,
(int)(kline->cfg.dev),
K_HASHCLOCK(kline->cfg.hashclock),
(int)(kline->cfg.temptarget),
(int)(kline->cfg.tempcritical),
(int)(kline->cfg.fantarget));
break;
case KLN_CMD_IDENT:
case KLN_CMD_STATUS:
case KLN_CMD_ABORT:
applog(READ_DEBUG,
"%s%i:%d %s cmd [%c]",
klncgpu->drv->name, klncgpu->device_id,
(int)(kline->hd.dev), msg, kline->hd.cmd);
break;
case KLN_CMD_ENABLE:
applog(READ_DEBUG,
"%s%i:%d %s enable [%c] enable=%c",
klncgpu->drv->name, klncgpu->device_id,
(int)(kline->hd.dev), msg, kline->hd.cmd,
(char)(kline->hd.buf[0]));
break;
case KLN_CMD_NONCE:
default:
hexdata = bin2hex((unsigned char *)&(kline->hd.dev), REPLY_SIZE - 1);
applog(LOG_ERR,
"%s%i:%d %s [%c:%s] unknown/unexpected and ignored",
klncgpu->drv->name, klncgpu->device_id,
(int)(kline->hd.dev), msg, kline->hd.cmd,
hexdata);
free(hexdata);
break;
}
}
static bool SendCmd(struct cgpu_info *klncgpu, KLINE *kline, int datalen)
{ {
struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
KLIST *kitem;
int retries = CMD_REPLY_RETRIES;
int err, amt, writ; int err, amt, writ;
if (klncgpu->usbinfo.nodev) if (klncgpu->usbinfo.nodev)
return NULL; return false;
display_send_kline(klncgpu, kline, msg_send);
writ = KSENDHD(datalen); writ = KSENDHD(datalen);
err = usb_write(klncgpu, (char *)kline, writ, &amt, C_REQUESTRESULTS); err = usb_write(klncgpu, (char *)kline, writ, &amt, C_REQUESTRESULTS);
if (err < 0 || amt != writ) { if (err < 0 || amt != writ) {
applog(LOG_ERR, "%s (%s) Cmd:%c Dev:%d, write failed (%d:%d:%d)", applog(LOG_ERR, "%s%i:%d Cmd:%c Dev:%d, write failed (%d:%d:%d)",
klncgpu->drv->dname, klncgpu->device_path, klncgpu->drv->name, klncgpu->device_id,
kline->hd.cmd, (int)kline->hd.dev, (int)(kline->hd.dev),
kline->hd.cmd, (int)(kline->hd.dev),
writ, amt, err); writ, amt, err);
return false;
} }
return true;
}
static KLIST *GetReply(struct cgpu_info *klncgpu, uint8_t cmd, uint8_t dev)
{
struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
KLIST *kitem;
int retries = CMD_REPLY_RETRIES;
while (retries-- > 0 && klninfo->shutdown == false) { while (retries-- > 0 && klninfo->shutdown == false) {
cgsleep_ms(REPLY_WAIT_TIME); cgsleep_ms(REPLY_WAIT_TIME);
cg_rlock(&klninfo->klist_lock); cg_rlock(&klninfo->klist_lock);
kitem = klninfo->used; kitem = klninfo->used;
while (kitem) { while (kitem) {
if (kitem->kline.hd.cmd == kline->hd.cmd && if (kitem->kline.hd.cmd == cmd &&
kitem->kline.hd.dev == kline->hd.dev && kitem->kline.hd.dev == dev &&
kitem->ready == true && kitem->working == false) { kitem->ready == true && kitem->working == false) {
kitem->working = true; kitem->working = true;
cg_runlock(&klninfo->klist_lock); cg_runlock(&klninfo->klist_lock);
@ -470,6 +559,14 @@ static KLIST *SendCmdGetReply(struct cgpu_info *klncgpu, KLINE *kline, int datal
return NULL; return NULL;
} }
/*
 * Send one command to the device and wait for the matching reply.
 *
 * klncgpu: device to talk to
 * kline:   command packet; its hd.cmd and hd.dev are also used to
 *          pick out the reply
 * datalen: forwarded unchanged to SendCmd()
 *
 * Returns NULL if SendCmd() reports failure, otherwise whatever
 * GetReply() returns (which may itself be NULL).
 */
static KLIST *SendCmdGetReply(struct cgpu_info *klncgpu, KLINE *kline, int datalen)
{
	KLIST *reply = NULL;

	// Only wait for a reply when the command actually went out
	if (SendCmd(klncgpu, kline, datalen))
		reply = GetReply(klncgpu, kline->hd.cmd, kline->hd.dev);

	return reply;
}
static bool klondike_get_stats(struct cgpu_info *klncgpu) static bool klondike_get_stats(struct cgpu_info *klncgpu)
{ {
struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data); struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
@ -480,7 +577,8 @@ static bool klondike_get_stats(struct cgpu_info *klncgpu)
if (klncgpu->usbinfo.nodev || klninfo->status == NULL) if (klncgpu->usbinfo.nodev || klninfo->status == NULL)
return false; return false;
applog(LOG_DEBUG, "Klondike getting status"); applog(LOG_DEBUG, "%s%i: getting status",
klncgpu->drv->name, klncgpu->device_id);
rd_lock(&(klninfo->stat_lock)); rd_lock(&(klninfo->stat_lock));
slaves = klninfo->status[0].kline.ws.slavecount; slaves = klninfo->status[0].kline.ws.slavecount;
@ -488,7 +586,8 @@ static bool klondike_get_stats(struct cgpu_info *klncgpu)
// loop thru devices and get status for each // loop thru devices and get status for each
for (dev = 0; dev <= slaves; dev++) { for (dev = 0; dev <= slaves; dev++) {
kline.hd.cmd = 'S'; zero_kline(&kline);
kline.hd.cmd = KLN_CMD_STATUS;
kline.hd.dev = dev; kline.hd.dev = dev;
kitem = SendCmdGetReply(klncgpu, &kline, 0); kitem = SendCmdGetReply(klncgpu, &kline, 0);
if (kitem != NULL) { if (kitem != NULL) {
@ -498,12 +597,55 @@ static bool klondike_get_stats(struct cgpu_info *klncgpu)
sizeof(klninfo->status[dev])); sizeof(klninfo->status[dev]));
wr_unlock(&(klninfo->stat_lock)); wr_unlock(&(klninfo->stat_lock));
kitem = release_kitem(klncgpu, kitem); kitem = release_kitem(klncgpu, kitem);
} else {
applog(LOG_ERR, "%s%i:%d failed to update stats",
klncgpu->drv->name, klncgpu->device_id, dev);
} }
} }
return true;
}
// todo: detect slavecount change and realloc space // TODO: this only enables the master (no slaves)
static bool kln_enable(struct cgpu_info *klncgpu)
{
KLIST *kitem;
KLINE kline;
int tries = 2;
bool ok = false;
return true; zero_kline(&kline);
kline.hd.cmd = KLN_CMD_ENABLE;
kline.hd.dev = 0;
kline.hd.buf[0] = KLN_CMD_ENABLE_ON;
while (tries-- > 0) {
kitem = SendCmdGetReply(klncgpu, &kline, 1);
if (kitem) {
kitem = release_kitem(klncgpu, kitem);
ok = true;
break;
}
cgsleep_ms(50);
}
if (ok)
cgsleep_ms(50);
return ok;
}
static void kln_disable(struct cgpu_info *klncgpu, int dev, bool all)
{
KLINE kline;
int i;
zero_kline(&kline);
kline.hd.cmd = KLN_CMD_ENABLE;
kline.hd.buf[0] = KLN_CMD_ENABLE_OFF;
for (i = (all ? 0 : dev); i <= dev; i++) {
kline.hd.dev = i;
SendCmd(klncgpu, &kline, KSENDHD(1));
}
} }
static bool klondike_init(struct cgpu_info *klncgpu) static bool klondike_init(struct cgpu_info *klncgpu)
@ -515,7 +657,8 @@ static bool klondike_init(struct cgpu_info *klncgpu)
klninfo->initialised = false; klninfo->initialised = false;
kline.hd.cmd = 'S'; zero_kline(&kline);
kline.hd.cmd = KLN_CMD_STATUS;
kline.hd.dev = 0; kline.hd.dev = 0;
kitem = SendCmdGetReply(klncgpu, &kline, 0); kitem = SendCmdGetReply(klncgpu, &kline, 0);
if (kitem == NULL) if (kitem == NULL)
@ -523,7 +666,8 @@ static bool klondike_init(struct cgpu_info *klncgpu)
slaves = kitem->kline.ws.slavecount; slaves = kitem->kline.ws.slavecount;
if (klninfo->status == NULL) { if (klninfo->status == NULL) {
applog(LOG_DEBUG, "Klondike initializing data"); applog(LOG_DEBUG, "%s%i: initializing data",
klncgpu->drv->name, klncgpu->device_id);
// alloc space for status, devinfo, cfg and jobque for master and slaves // alloc space for status, devinfo, cfg and jobque for master and slaves
klninfo->status = calloc(slaves+1, sizeof(*(klninfo->status))); klninfo->status = calloc(slaves+1, sizeof(*(klninfo->status)));
@ -544,8 +688,8 @@ static bool klondike_init(struct cgpu_info *klncgpu)
kitem = release_kitem(klncgpu, kitem); kitem = release_kitem(klncgpu, kitem);
// zero init triggers read back only // zero init triggers read back only
memset(&(kline.cfg), 0, sizeof(kline.cfg)); zero_kline(&kline);
kline.cfg.cmd = 'C'; kline.cfg.cmd = KLN_CMD_CONFIG;
int size = 2; int size = 2;
@ -570,7 +714,8 @@ static bool klondike_init(struct cgpu_info *klncgpu)
kitem = SendCmdGetReply(klncgpu, &kline, size); kitem = SendCmdGetReply(klncgpu, &kline, size);
if (kitem != NULL) { if (kitem != NULL) {
memcpy((void *)&(klninfo->cfg[dev]), kitem, sizeof(klninfo->cfg[dev])); memcpy((void *)&(klninfo->cfg[dev]), kitem, sizeof(klninfo->cfg[dev]));
applog(LOG_WARNING, "Klondike config (%d: Clk: %d, T:%.0lf, C:%.0lf, F:%d)", applog(LOG_WARNING, "%s%i:%d config (%d: Clk: %d, T:%.0lf, C:%.0lf, F:%d)",
klncgpu->drv->name, klncgpu->device_id, dev,
dev, K_HASHCLOCK(klninfo->cfg[dev].kline.cfg.hashclock), dev, K_HASHCLOCK(klninfo->cfg[dev].kline.cfg.hashclock),
cvtKlnToC(klninfo->cfg[dev].kline.cfg.temptarget), cvtKlnToC(klninfo->cfg[dev].kline.cfg.temptarget),
cvtKlnToC(klninfo->cfg[dev].kline.cfg.tempcritical), cvtKlnToC(klninfo->cfg[dev].kline.cfg.tempcritical),
@ -585,23 +730,7 @@ static bool klondike_init(struct cgpu_info *klncgpu)
klninfo->devinfo[dev].chipstats = calloc(klninfo->status[dev].kline.ws.chipcount*2 , sizeof(uint32_t)); klninfo->devinfo[dev].chipstats = calloc(klninfo->status[dev].kline.ws.chipcount*2 , sizeof(uint32_t));
} }
int tries = 2; bool ok = kln_enable(klncgpu);
bool ok = false;
kline.hd.cmd = 'E';
kline.hd.dev = 0;
kline.hd.buf[0] = '1';
while (tries-- > 0) {
kitem = SendCmdGetReply(klncgpu, &kline, 1);
if (kitem) {
kitem = release_kitem(klncgpu, kitem);
ok = true;
break;
}
cgsleep_ms(50);
}
cgsleep_ms(50);
if (!ok) if (!ok)
applog(LOG_ERR, "%s%i: failed to enable", klncgpu->drv->name, klncgpu->device_id); applog(LOG_ERR, "%s%i: failed to enable", klncgpu->drv->name, klncgpu->device_id);
@ -628,6 +757,7 @@ static bool klondike_detect_one(struct libusb_device *dev, struct usb_find_devic
{ {
struct cgpu_info *klncgpu = usb_alloc_cgpu(&klondike_drv, 1); struct cgpu_info *klncgpu = usb_alloc_cgpu(&klondike_drv, 1);
struct klondike_info *klninfo = NULL; struct klondike_info *klninfo = NULL;
KLINE kline;
if (unlikely(!klncgpu)) if (unlikely(!klncgpu))
quit(1, "Failed to calloc klncgpu in klondike_detect_one"); quit(1, "Failed to calloc klncgpu in klondike_detect_one");
@ -647,7 +777,10 @@ static bool klondike_detect_one(struct libusb_device *dev, struct usb_find_devic
control_init(klncgpu); control_init(klncgpu);
while (attempts++ < 3) { while (attempts++ < 3) {
err = usb_write(klncgpu, "I", 2, &sent, C_REQUESTRESULTS); kline.hd.cmd = KLN_CMD_IDENT;
kline.hd.dev = 0;
display_send_kline(klncgpu, &kline, msg_detect_send);
err = usb_write(klncgpu, (char *)&(kline.hd), 2, &sent, C_REQUESTRESULTS);
if (err < 0 || sent != 2) { if (err < 0 || sent != 2) {
applog(LOG_ERR, "%s (%s) detect write failed (%d:%d)", applog(LOG_ERR, "%s (%s) detect write failed (%d:%d)",
klncgpu->drv->dname, klncgpu->drv->dname,
@ -666,8 +799,8 @@ static bool klondike_detect_one(struct libusb_device *dev, struct usb_find_devic
klncgpu->drv->dname, klncgpu->drv->dname,
klncgpu->device_path, klncgpu->device_path,
recd); recd);
} else if (kitem.kline.hd.cmd == 'I' && kitem.kline.hd.dev == 0) { } else if (kitem.kline.hd.cmd == KLN_CMD_IDENT && kitem.kline.hd.dev == 0) {
display_kline(klncgpu, &kitem.kline); display_kline(klncgpu, &kitem.kline, msg_detect_reply);
applog(LOG_DEBUG, "%s (%s) detect successful (%d attempt%s)", applog(LOG_DEBUG, "%s (%s) detect successful (%d attempt%s)",
klncgpu->drv->dname, klncgpu->drv->dname,
klncgpu->device_path, klncgpu->device_path,
@ -698,7 +831,8 @@ static void klondike_identify(__maybe_unused struct cgpu_info *klncgpu)
/* /*
KLINE kline; KLINE kline;
kline.hd.cmd = 'I'; zero_kline(&kline);
kline.hd.cmd = KLN_CMD_IDENT;
kline.hd.dev = 0; kline.hd.dev = 0;
SendCmdGetReply(klncgpu, &kline, KSENDHD(0)); SendCmdGetReply(klncgpu, &kline, KSENDHD(0));
*/ */
@ -713,7 +847,8 @@ static void klondike_check_nonce(struct cgpu_info *klncgpu, KLIST *kitem)
double us_diff; double us_diff;
uint32_t nonce = K_NONCE(kline->wr.nonce) - 0xC0; uint32_t nonce = K_NONCE(kline->wr.nonce) - 0xC0;
applog(LOG_DEBUG, "Klondike FOUND NONCE (%02x:%08x)", applog(LOG_DEBUG, "%s%i:%d FOUND NONCE (%02x:%08x)",
klncgpu->drv->name, klncgpu->device_id, (int)(kline->wr.dev),
kline->wr.workid, (unsigned int)nonce); kline->wr.workid, (unsigned int)nonce);
work = NULL; work = NULL;
@ -734,14 +869,15 @@ static void klondike_check_nonce(struct cgpu_info *klncgpu, KLIST *kitem)
klninfo->noncecount++; klninfo->noncecount++;
wr_unlock(&(klninfo->stat_lock)); wr_unlock(&(klninfo->stat_lock));
// kline->wr.nonce = le32toh(kline->wr.nonce - 0xC0); applog(LOG_DEBUG, "%s%i:%d SUBMIT NONCE (%02x:%08x)",
applog(LOG_DEBUG, "Klondike SUBMIT NONCE (%02x:%08x)", klncgpu->drv->name, klncgpu->device_id, (int)(kline->wr.dev),
kline->wr.workid, (unsigned int)nonce); kline->wr.workid, (unsigned int)nonce);
cgtime(&tv_now); cgtime(&tv_now);
bool ok = submit_nonce(klncgpu->thr[0], work, nonce); bool ok = submit_nonce(klncgpu->thr[0], work, nonce);
applog(LOG_DEBUG, "Klondike chip stats %d, %08x, %d, %d", applog(LOG_DEBUG, "%s%i:%d chip stats %d, %08x, %d, %d",
klncgpu->drv->name, klncgpu->device_id, (int)(kline->wr.dev),
kline->wr.dev, (unsigned int)nonce, kline->wr.dev, (unsigned int)nonce,
klninfo->devinfo[kline->wr.dev].rangesize, klninfo->devinfo[kline->wr.dev].rangesize,
klninfo->status[kline->wr.dev].kline.ws.chipcount); klninfo->status[kline->wr.dev].kline.ws.chipcount);
@ -783,8 +919,8 @@ static void klondike_check_nonce(struct cgpu_info *klncgpu, KLIST *kitem)
} }
applog(LOG_ERR, "%s%i:%d unknown work (%02x:%08x) - ignored", applog(LOG_ERR, "%s%i:%d unknown work (%02x:%08x) - ignored",
klncgpu->drv->name, klncgpu->device_id, klncgpu->drv->name, klncgpu->device_id, (int)(kline->wr.dev),
kline->wr.dev, kline->wr.workid, (unsigned int)nonce); kline->wr.workid, (unsigned int)nonce);
//inc_hw_errors(klncgpu->thr[0]); //inc_hw_errors(klncgpu->thr[0]);
} }
@ -796,9 +932,11 @@ static void *klondike_get_replies(void *userdata)
struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data); struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
KLIST *kitem = NULL; KLIST *kitem = NULL;
char *hexdata; char *hexdata;
int err, recd, slaves; int err, recd, slaves, dev;
bool overheat;
applog(LOG_DEBUG, "Klondike listening for replies"); applog(LOG_DEBUG, "%s%i: listening for replies",
klncgpu->drv->name, klncgpu->device_id);
while (klninfo->shutdown == false) { while (klninfo->shutdown == false) {
if (klncgpu->usbinfo.nodev) if (klncgpu->usbinfo.nodev)
@ -810,22 +948,30 @@ static void *klondike_get_replies(void *userdata)
memset((void *)&(kitem->kline), 0, sizeof(kitem->kline)); memset((void *)&(kitem->kline), 0, sizeof(kitem->kline));
err = usb_read(klncgpu, (char *)&(kitem->kline), REPLY_SIZE, &recd, C_GETRESULTS); err = usb_read(klncgpu, (char *)&(kitem->kline), REPLY_SIZE, &recd, C_GETRESULTS);
if (err || recd != REPLY_SIZE) {
if (err != -7)
applog(LOG_ERR, "%s%i: reply err=%d amt=%d",
klncgpu->drv->name, klncgpu->device_id,
err, recd);
}
if (!err && recd == REPLY_SIZE) { if (!err && recd == REPLY_SIZE) {
cgtime(&(kitem->tv_when)); cgtime(&(kitem->tv_when));
kitem->block_seq = klninfo->block_seq; kitem->block_seq = klninfo->block_seq;
if (opt_log_level <= READ_DEBUG) { if (opt_log_level <= READ_DEBUG) {
hexdata = bin2hex((unsigned char *)&(kitem->kline.hd.dev), recd-1); hexdata = bin2hex((unsigned char *)&(kitem->kline.hd.dev), recd-1);
applog(READ_DEBUG, "%s (%s) reply [%c:%s]", applog(READ_DEBUG, "%s%i:%d reply [%c:%s]",
klncgpu->drv->dname, klncgpu->device_path, klncgpu->drv->name, klncgpu->device_id,
(int)(kitem->kline.hd.dev),
kitem->kline.hd.cmd, hexdata); kitem->kline.hd.cmd, hexdata);
free(hexdata); free(hexdata);
} }
// We can't check this until it's initialised // We can't check this until it's initialised
if (klninfo->initialised) { if (klninfo->initialised) {
rd_lock(&(klninfo->stat_lock)); wr_lock(&(klninfo->stat_lock));
slaves = klninfo->status[0].kline.ws.slavecount; slaves = klninfo->status[0].kline.ws.slavecount;
rd_unlock(&(klninfo->stat_lock)); klninfo->jobque[dev].late_update_sequential = 0;
wr_unlock(&(klninfo->stat_lock));
if (kitem->kline.hd.dev > slaves) { if (kitem->kline.hd.dev > slaves) {
applog(LOG_ERR, "%s%i: reply [%c] has invalid dev=%d (max=%d) using 0", applog(LOG_ERR, "%s%i: reply [%c] has invalid dev=%d (max=%d) using 0",
@ -838,53 +984,83 @@ static void *klondike_get_replies(void *userdata)
} }
switch (kitem->kline.hd.cmd) { switch (kitem->kline.hd.cmd) {
case '=': case KLN_CMD_NONCE:
klondike_check_nonce(klncgpu, kitem); klondike_check_nonce(klncgpu, kitem);
display_kline(klncgpu, &kitem->kline); display_kline(klncgpu, &kitem->kline, msg_reply);
break; break;
case 'S': case KLN_CMD_STATUS:
case 'W': case KLN_CMD_WORK:
case 'A': case KLN_CMD_ABORT:
// We can't do/check this until it's initialised // We can't do/check this until it's initialised
if (klninfo->initialised) { if (klninfo->initialised) {
dev = kitem->kline.ws.dev;
wr_lock(&(klninfo->stat_lock)); wr_lock(&(klninfo->stat_lock));
klninfo->jobque[kitem->kline.ws.dev].workqc = klninfo->jobque[dev].workqc = (int)(kitem->kline.ws.workqc);
(int)(kitem->kline.ws.workqc); cgtime(&(klninfo->jobque[dev].last_update));
cgtime(&(klninfo->jobque[kitem->kline.ws.dev].last_update));
slaves = klninfo->status[0].kline.ws.slavecount; slaves = klninfo->status[0].kline.ws.slavecount;
overheat = klninfo->jobque[dev].overheat;
wr_unlock(&(klninfo->stat_lock)); wr_unlock(&(klninfo->stat_lock));
if (kitem->kline.ws.slavecount != slaves) { if (kitem->kline.ws.slavecount != slaves) {
applog(LOG_ERR, "%s%i: reply [%c] has a diff # of slaves=%d (curr=%d) dropping device to hotplug", applog(LOG_ERR, "%s%i:%d reply [%c] has a diff # of slaves=%d"
" (curr=%d) dropping device to hotplug",
klncgpu->drv->name, klncgpu->device_id, klncgpu->drv->name, klncgpu->device_id,
(char)(kitem->kline.ws.cmd), dev, (char)(kitem->kline.ws.cmd),
(int)(kitem->kline.ws.slavecount), (int)(kitem->kline.ws.slavecount),
slaves); slaves);
klninfo->shutdown = true; klninfo->shutdown = true;
break; break;
} }
if (!overheat) {
double temp = cvtKlnToC(kitem->kline.ws.temp);
if (temp >= KLN_KILLWORK_TEMP) {
KLINE kline;
wr_lock(&(klninfo->stat_lock));
klninfo->jobque[dev].overheat = true;
wr_unlock(&(klninfo->stat_lock));
applog(LOG_ERR, "%s%i:%d Critical overheat (%.0fC)",
klncgpu->drv->name, klncgpu->device_id,
dev, temp);
zero_kline(&kline);
kline.hd.cmd = KLN_CMD_ABORT;
kline.hd.dev = dev;
if (!SendCmd(klncgpu, &kline, KSENDHD(0))) {
applog(LOG_ERR, "%s%i:%d failed to abort work"
" - dropping device to hotplug",
klncgpu->drv->name,
klncgpu->device_id,
dev);
klninfo->shutdown = true;
}
kln_disable(klncgpu, dev, false);
} }
case 'E': }
}
case KLN_CMD_ENABLE:
wr_lock(&(klninfo->stat_lock)); wr_lock(&(klninfo->stat_lock));
klninfo->errorcount += kitem->kline.ws.errorcount; klninfo->errorcount += kitem->kline.ws.errorcount;
klninfo->noisecount += kitem->kline.ws.noise; klninfo->noisecount += kitem->kline.ws.noise;
wr_unlock(&(klninfo->stat_lock)); wr_unlock(&(klninfo->stat_lock));
display_kline(klncgpu, &kitem->kline); display_kline(klncgpu, &kitem->kline, msg_reply);
kitem->ready = true; kitem->ready = true;
kitem = NULL; kitem = NULL;
break; break;
case 'C': case KLN_CMD_CONFIG:
display_kline(klncgpu, &kitem->kline); display_kline(klncgpu, &kitem->kline, msg_reply);
kitem->ready = true; kitem->ready = true;
kitem = NULL; kitem = NULL;
break; break;
case 'I': case KLN_CMD_IDENT:
display_kline(klncgpu, &kitem->kline); display_kline(klncgpu, &kitem->kline, msg_reply);
kitem->ready = true; kitem->ready = true;
kitem = NULL; kitem = NULL;
break; break;
default: default:
display_kline(klncgpu, &kitem->kline); display_kline(klncgpu, &kitem->kline, msg_reply);
break; break;
} }
} }
@ -901,11 +1077,13 @@ static void klondike_flush_work(struct cgpu_info *klncgpu)
klninfo->block_seq++; klninfo->block_seq++;
applog(LOG_DEBUG, "Klondike flushing work"); applog(LOG_DEBUG, "%s%i: flushing work",
klncgpu->drv->name, klncgpu->device_id);
rd_lock(&(klninfo->stat_lock)); rd_lock(&(klninfo->stat_lock));
slaves = klninfo->status[0].kline.ws.slavecount; slaves = klninfo->status[0].kline.ws.slavecount;
rd_unlock(&(klninfo->stat_lock)); rd_unlock(&(klninfo->stat_lock));
kline.hd.cmd = 'A'; zero_kline(&kline);
kline.hd.cmd = KLN_CMD_ABORT;
for (dev = 0; dev <= slaves; dev++) { for (dev = 0; dev <= slaves; dev++) {
kline.hd.dev = dev; kline.hd.dev = dev;
kitem = SendCmdGetReply(klncgpu, &kline, KSENDHD(0)); kitem = SendCmdGetReply(klncgpu, &kline, KSENDHD(0));
@ -953,19 +1131,12 @@ static void klondike_shutdown(struct thr_info *thr)
{ {
struct cgpu_info *klncgpu = thr->cgpu; struct cgpu_info *klncgpu = thr->cgpu;
struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data); struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
KLIST *kitem;
KLINE kline;
int dev;
applog(LOG_DEBUG, "Klondike shutting down work"); applog(LOG_DEBUG, "%s%i: shutting down work",
kline.hd.cmd = 'E'; klncgpu->drv->name, klncgpu->device_id);
for (dev = 0; dev <= klninfo->status[0].kline.ws.slavecount; dev++) {
kline.hd.dev = dev; kln_disable(klncgpu, klninfo->status[0].kline.ws.slavecount, true);
kline.hd.buf[0] = '0';
kitem = SendCmdGetReply(klncgpu, &kline, KSENDHD(1));
if (kitem)
kitem = release_kitem(klncgpu, kitem);
}
klncgpu->shutdown = klninfo->shutdown = true; klncgpu->shutdown = klninfo->shutdown = true;
} }
@ -979,9 +1150,10 @@ static void klondike_thread_enable(struct thr_info *thr)
/* /*
KLINE kline; KLINE kline;
kline.hd.cmd = 'E'; zero_kline(&kline);
kline.hd.cmd = KLN_CMD_ENABLE;
kline.hd.dev = dev; kline.hd.dev = dev;
kline.hd.buf[0] = '0'; kline.hd.buf[0] = KLN_CMD_ENABLE_OFF;
kitem = SendCmdGetReply(klncgpu, &kline, KSENDHD(1)); kitem = SendCmdGetReply(klncgpu, &kline, KSENDHD(1));
*/ */
@ -998,7 +1170,8 @@ static bool klondike_send_work(struct cgpu_info *klncgpu, int dev, struct work *
if (klncgpu->usbinfo.nodev) if (klncgpu->usbinfo.nodev)
return false; return false;
kline.wt.cmd = 'W'; zero_kline(&kline);
kline.wt.cmd = KLN_CMD_WORK;
kline.wt.dev = dev; kline.wt.dev = dev;
memcpy(kline.wt.midstate, work->midstate, MIDSTATE_BYTES); memcpy(kline.wt.midstate, work->midstate, MIDSTATE_BYTES);
memcpy(kline.wt.merkle, work->data + MERKLE_OFFSET, MERKLE_BYTES); memcpy(kline.wt.merkle, work->data + MERKLE_OFFSET, MERKLE_BYTES);
@ -1012,7 +1185,9 @@ static bool klondike_send_work(struct cgpu_info *klncgpu, int dev, struct work *
free(hexdata); free(hexdata);
} }
applog(LOG_DEBUG, "Klondike sending work (%d:%02x)", dev, kline.wt.workid); applog(LOG_DEBUG, "%s%i:%d sending work (%d:%02x)",
klncgpu->drv->name, klncgpu->device_id, dev,
dev, kline.wt.workid);
KLIST *kitem = SendCmdGetReply(klncgpu, &kline, sizeof(kline.wt)); KLIST *kitem = SendCmdGetReply(klncgpu, &kline, sizeof(kline.wt));
if (kitem != NULL) { if (kitem != NULL) {
wr_lock(&(klninfo->stat_lock)); wr_lock(&(klninfo->stat_lock));
@ -1029,6 +1204,7 @@ static bool klondike_send_work(struct cgpu_info *klncgpu, int dev, struct work *
if (ms_tdiff(&tv_old, &(look->tv_stamp)) > OLD_WORK_MS) { if (ms_tdiff(&tv_old, &(look->tv_stamp)) > OLD_WORK_MS) {
__work_completed(klncgpu, look); __work_completed(klncgpu, look);
free_work(look); free_work(look);
wque_cleared++;
} else } else
wque_size++; wque_size++;
} }
@ -1047,40 +1223,88 @@ static bool klondike_queue_full(struct cgpu_info *klncgpu)
{ {
struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data); struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
struct work *work = NULL; struct work *work = NULL;
int dev, queued, slaves; int dev, queued, slaves, seq;
struct timeval now; struct timeval now;
bool nowork;
cgtime(&now); cgtime(&now);
rd_lock(&(klninfo->stat_lock)); rd_lock(&(klninfo->stat_lock));
slaves = klninfo->status[0].kline.ws.slavecount; slaves = klninfo->status[0].kline.ws.slavecount;
for (dev = 0; dev <= slaves; dev++) for (dev = 0; dev <= slaves; dev++)
if (ms_tdiff(&now, &(klninfo->jobque[dev].last_update)) > LATE_UPDATE_MS) { if (ms_tdiff(&now, &(klninfo->jobque[dev].last_update)) > LATE_UPDATE_MS) {
klninfo->jobque[dev].late_update_count++;
seq = ++klninfo->jobque[dev].late_update_sequential;
rd_unlock(&(klninfo->stat_lock)); rd_unlock(&(klninfo->stat_lock));
applog(LOG_ERR, "%s%i: late update", if (seq < KLN_LATE_UPDATE_LIMIT) {
klncgpu->drv->name, klncgpu->device_id); applog(LOG_ERR, "%s%i:%d late update",
klncgpu->drv->name, klncgpu->device_id, dev);
klondike_get_stats(klncgpu); klondike_get_stats(klncgpu);
goto que; goto que;
} else {
applog(LOG_ERR, "%s%i:%d late update (%d) reached - attempting reset",
klncgpu->drv->name, klncgpu->device_id,
dev, KLN_LATE_UPDATE_LIMIT);
control_init(klncgpu);
kln_enable(klncgpu);
klondike_get_stats(klncgpu);
rd_lock(&(klninfo->stat_lock));
if (ms_tdiff(&now, &(klninfo->jobque[dev].last_update)) > LATE_UPDATE_MS) {
rd_unlock(&(klninfo->stat_lock));
applog(LOG_ERR, "%s%i:%d reset failed - dropping device",
klncgpu->drv->name, klncgpu->device_id, dev);
klninfo->shutdown = true;
return false;
}
break;
}
} }
rd_unlock(&(klninfo->stat_lock)); rd_unlock(&(klninfo->stat_lock));
que: que:
nowork = true;
for (queued = 0; queued < MAX_WORK_COUNT-1; queued++) for (queued = 0; queued < MAX_WORK_COUNT-1; queued++)
for (dev = 0; dev <= slaves; dev++) { for (dev = 0; dev <= slaves; dev++) {
tryagain:
rd_lock(&(klninfo->stat_lock)); rd_lock(&(klninfo->stat_lock));
if (klninfo->jobque[dev].overheat) {
double temp = cvtKlnToC(klninfo->status[0].kline.ws.temp);
if ((queued == MAX_WORK_COUNT-2) &&
ms_tdiff(&now, &(klninfo->jobque[dev].last_update)) > (LATE_UPDATE_MS/2)) {
rd_unlock(&(klninfo->stat_lock));
klondike_get_stats(klncgpu);
goto tryagain;
}
if (temp <= KLN_COOLED_DOWN) {
klninfo->jobque[dev].overheat = false;
rd_unlock(&(klninfo->stat_lock));
applog(LOG_ERR, "%s%i:%d Overheat recovered (%.0fC)",
klncgpu->drv->name, klncgpu->device_id,
dev, temp);
kln_enable(klncgpu);
goto tryagain;
} else {
rd_unlock(&(klninfo->stat_lock));
continue;
}
}
if (klninfo->jobque[dev].workqc <= queued) { if (klninfo->jobque[dev].workqc <= queued) {
rd_unlock(&(klninfo->stat_lock)); rd_unlock(&(klninfo->stat_lock));
if (!work) if (!work)
work = get_queued(klncgpu); work = get_queued(klncgpu);
if (unlikely(!work)) if (unlikely(!work))
return false; return false;
nowork = false;
if (klondike_send_work(klncgpu, dev, work)) if (klondike_send_work(klncgpu, dev, work))
return false; return false;
} else } else
rd_unlock(&(klninfo->stat_lock)); rd_unlock(&(klninfo->stat_lock));
} }
if (nowork)
cgsleep_ms(10); // avoid a hard loop in case we have nothing to do
return true; return true;
} }
@ -1104,14 +1328,13 @@ static int64_t klondike_scanwork(struct thr_info *thr)
hashcount = K_HASHCOUNT(klninfo->status[dev].kline.ws.hashcount); hashcount = K_HASHCOUNT(klninfo->status[dev].kline.ws.hashcount);
maxcount = K_MAXCOUNT(klninfo->status[dev].kline.ws.maxcount); maxcount = K_MAXCOUNT(klninfo->status[dev].kline.ws.maxcount);
if (klninfo->devinfo[dev].lasthashcount > hashcount) // todo: chg this to check workid for wrapped instead // todo: chg this to check workid for wrapped instead
if (klninfo->devinfo[dev].lasthashcount > hashcount)
newhashdev += maxcount; // hash counter wrapped newhashdev += maxcount; // hash counter wrapped
newhashdev += hashcount - klninfo->devinfo[dev].lasthashcount; newhashdev += hashcount - klninfo->devinfo[dev].lasthashcount;
klninfo->devinfo[dev].lasthashcount = hashcount; klninfo->devinfo[dev].lasthashcount = hashcount;
if (maxcount != 0) if (maxcount != 0)
klninfo->hashcount += (newhashdev << 32) / maxcount; klninfo->hashcount += (newhashdev << 32) / maxcount;
// todo: check stats for critical conditions
} }
newhashcount += 0xffffffffull * (uint64_t)klninfo->noncecount; newhashcount += 0xffffffffull * (uint64_t)klninfo->noncecount;
klninfo->noncecount = 0; klninfo->noncecount = 0;
@ -1143,15 +1366,20 @@ static void get_klondike_statline_before(char *buf, size_t siz, struct cgpu_info
fan += klninfo->cfg[dev].kline.cfg.fantarget; fan += klninfo->cfg[dev].kline.cfg.fantarget;
clock += (uint16_t)K_HASHCLOCK(klninfo->cfg[dev].kline.cfg.hashclock); clock += (uint16_t)K_HASHCLOCK(klninfo->cfg[dev].kline.cfg.hashclock);
} }
rd_unlock(&(klninfo->stat_lock));
fan /= slaves + 1; fan /= slaves + 1;
fan *= 100/255;
if (fan > 99) // short on screen space
fan = 99;
clock /= slaves + 1; clock /= slaves + 1;
rd_unlock(&(klninfo->stat_lock)); if (clock > 999) // error - so truncate it
clock = 999;
snprintf(tmp, sizeof(tmp), "%2.0fC", cvtKlnToC(temp)); snprintf(tmp, sizeof(tmp), "%2.0fC", cvtKlnToC(temp));
if (strlen(tmp) < 4) if (strlen(tmp) < 4)
strcat(tmp, " "); strcat(tmp, " ");
tailsprintf(buf, siz, "%3dMHz %3d%% %s| ", (int)clock, fan*100/255, tmp); tailsprintf(buf, siz, "%3dMHz %2d%% %s| ", (int)clock, fan, tmp);
} }
static struct api_data *klondike_api_stats(struct cgpu_info *klncgpu) static struct api_data *klondike_api_stats(struct cgpu_info *klncgpu)

Loading…
Cancel
Save