mirror of
https://github.com/GOSTSec/sgminer
synced 2025-09-11 05:32:16 +00:00
klondike - drop the device for hotplug if it's unresponsive
This commit is contained in:
parent
5bd1b560ac
commit
037f430bb0
@ -63,9 +63,6 @@ static const char *msg_reply = "Reply";
|
|||||||
#define KLN_KILLWORK_TEMP 53.5
|
#define KLN_KILLWORK_TEMP 53.5
|
||||||
#define KLN_COOLED_DOWN 45.5
|
#define KLN_COOLED_DOWN 45.5
|
||||||
|
|
||||||
// If 5 late updates in a row, try to reset the device
|
|
||||||
#define KLN_LATE_UPDATE_LIMIT 5
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Work older than 5s will already be completed
|
* Work older than 5s will already be completed
|
||||||
* FYI it must not be possible to complete 256 work
|
* FYI it must not be possible to complete 256 work
|
||||||
@ -74,12 +71,29 @@ static const char *msg_reply = "Reply";
|
|||||||
*/
|
*/
|
||||||
#define OLD_WORK_MS ((int)(5 * 1000))
|
#define OLD_WORK_MS ((int)(5 * 1000))
|
||||||
|
|
||||||
|
/*
|
||||||
|
* How many incorrect slave counts to ignore in a row
|
||||||
|
* 2 means it allows random grabage returned twice
|
||||||
|
* Until slaves are implemented, this should never occur
|
||||||
|
* so allowing 2 in a row should ignore random errros
|
||||||
|
*/
|
||||||
|
#define KLN_ISS_IGNORE 2
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the queue status hasn't been updated for this long then do it now
|
* If the queue status hasn't been updated for this long then do it now
|
||||||
* 5GH/s = 859ms per full nonce range
|
* 5GH/s = 859ms per full nonce range
|
||||||
*/
|
*/
|
||||||
#define LATE_UPDATE_MS ((int)(2.5 * 1000))
|
#define LATE_UPDATE_MS ((int)(2.5 * 1000))
|
||||||
|
|
||||||
|
// If 5 late updates in a row, try to reset the device
|
||||||
|
#define LATE_UPDATE_LIMIT 5
|
||||||
|
|
||||||
|
// If the reset fails sleep for 1s
|
||||||
|
#define LATE_UPDATE_SLEEP_MS 1000
|
||||||
|
|
||||||
|
// However give up after 8s
|
||||||
|
#define LATE_UPDATE_NODEV_MS ((int)(8.0 * 1000))
|
||||||
|
|
||||||
struct device_drv klondike_drv;
|
struct device_drv klondike_drv;
|
||||||
|
|
||||||
typedef struct klondike_header {
|
typedef struct klondike_header {
|
||||||
@ -215,6 +229,7 @@ struct klondike_info {
|
|||||||
uint64_t hashcount;
|
uint64_t hashcount;
|
||||||
uint64_t errorcount;
|
uint64_t errorcount;
|
||||||
uint64_t noisecount;
|
uint64_t noisecount;
|
||||||
|
int incorrect_slave_sequential;
|
||||||
|
|
||||||
// us Delay from USB reply to being processed
|
// us Delay from USB reply to being processed
|
||||||
double delay_count;
|
double delay_count;
|
||||||
@ -946,8 +961,8 @@ static void *klondike_get_replies(void *userdata)
|
|||||||
struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
|
struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
|
||||||
KLIST *kitem = NULL;
|
KLIST *kitem = NULL;
|
||||||
char *hexdata;
|
char *hexdata;
|
||||||
int err, recd, slaves, dev;
|
int err, recd, slaves, dev, isc;
|
||||||
bool overheat;
|
bool overheat, sent;
|
||||||
|
|
||||||
applog(LOG_DEBUG, "%s%i: listening for replies",
|
applog(LOG_DEBUG, "%s%i: listening for replies",
|
||||||
klncgpu->drv->name, klncgpu->device_id);
|
klncgpu->drv->name, klncgpu->device_id);
|
||||||
@ -1018,16 +1033,27 @@ static void *klondike_get_replies(void *userdata)
|
|||||||
cgtime(&(klninfo->jobque[dev].last_update));
|
cgtime(&(klninfo->jobque[dev].last_update));
|
||||||
slaves = klninfo->status[0].kline.ws.slavecount;
|
slaves = klninfo->status[0].kline.ws.slavecount;
|
||||||
overheat = klninfo->jobque[dev].overheat;
|
overheat = klninfo->jobque[dev].overheat;
|
||||||
|
if (dev == 0) {
|
||||||
|
if (kitem->kline.ws.slavecount != slaves)
|
||||||
|
isc = ++klninfo->incorrect_slave_sequential;
|
||||||
|
else
|
||||||
|
isc = klninfo->incorrect_slave_sequential = 0;
|
||||||
|
}
|
||||||
wr_unlock(&(klninfo->stat_lock));
|
wr_unlock(&(klninfo->stat_lock));
|
||||||
|
|
||||||
if (kitem->kline.ws.slavecount != slaves) {
|
if (isc) {
|
||||||
applog(LOG_ERR, "%s%i:%d reply [%c] has a diff # of slaves=%d"
|
applog(LOG_ERR, "%s%i:%d reply [%c] has a diff"
|
||||||
" (curr=%d) dropping device to hotplug",
|
" # of slaves=%d (curr=%d)%s",
|
||||||
klncgpu->drv->name, klncgpu->device_id,
|
klncgpu->drv->name,
|
||||||
dev, (char)(kitem->kline.ws.cmd),
|
klncgpu->device_id,
|
||||||
|
dev,
|
||||||
|
(char)(kitem->kline.ws.cmd),
|
||||||
(int)(kitem->kline.ws.slavecount),
|
(int)(kitem->kline.ws.slavecount),
|
||||||
slaves);
|
slaves,
|
||||||
klncgpu->shutdown = true;
|
isc <= KLN_ISS_IGNORE ? "" :
|
||||||
|
" disabling device");
|
||||||
|
if (isc > KLN_ISS_IGNORE)
|
||||||
|
usb_nodev(klncgpu);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1047,15 +1073,16 @@ static void *klondike_get_replies(void *userdata)
|
|||||||
zero_kline(&kline);
|
zero_kline(&kline);
|
||||||
kline.hd.cmd = KLN_CMD_ABORT;
|
kline.hd.cmd = KLN_CMD_ABORT;
|
||||||
kline.hd.dev = dev;
|
kline.hd.dev = dev;
|
||||||
if (!SendCmd(klncgpu, &kline, KSENDHD(0))) {
|
sent = SendCmd(klncgpu, &kline, KSENDHD(0));
|
||||||
applog(LOG_ERR, "%s%i:%d failed to abort work"
|
kln_disable(klncgpu, dev, false);
|
||||||
" - dropping device to hotplug",
|
if (!sent) {
|
||||||
|
applog(LOG_ERR, "%s%i:%d overheat failed to"
|
||||||
|
" abort work - disabling device",
|
||||||
klncgpu->drv->name,
|
klncgpu->drv->name,
|
||||||
klncgpu->device_id,
|
klncgpu->device_id,
|
||||||
dev);
|
dev);
|
||||||
klncgpu->shutdown = true;
|
usb_nodev(klncgpu);
|
||||||
}
|
}
|
||||||
kln_disable(klncgpu, dev, false);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1242,10 +1269,13 @@ static bool klondike_queue_full(struct cgpu_info *klncgpu)
|
|||||||
{
|
{
|
||||||
struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
|
struct klondike_info *klninfo = (struct klondike_info *)(klncgpu->device_data);
|
||||||
struct work *work = NULL;
|
struct work *work = NULL;
|
||||||
int dev, queued, slaves, seq;
|
int dev, queued, slaves, seq, howlong;
|
||||||
struct timeval now;
|
struct timeval now;
|
||||||
bool nowork;
|
bool nowork;
|
||||||
|
|
||||||
|
if (klncgpu->shutdown == true)
|
||||||
|
return true;
|
||||||
|
|
||||||
cgtime(&now);
|
cgtime(&now);
|
||||||
rd_lock(&(klninfo->stat_lock));
|
rd_lock(&(klninfo->stat_lock));
|
||||||
slaves = klninfo->status[0].kline.ws.slavecount;
|
slaves = klninfo->status[0].kline.ws.slavecount;
|
||||||
@ -1254,7 +1284,7 @@ static bool klondike_queue_full(struct cgpu_info *klncgpu)
|
|||||||
klninfo->jobque[dev].late_update_count++;
|
klninfo->jobque[dev].late_update_count++;
|
||||||
seq = ++klninfo->jobque[dev].late_update_sequential;
|
seq = ++klninfo->jobque[dev].late_update_sequential;
|
||||||
rd_unlock(&(klninfo->stat_lock));
|
rd_unlock(&(klninfo->stat_lock));
|
||||||
if (seq < KLN_LATE_UPDATE_LIMIT) {
|
if (seq < LATE_UPDATE_LIMIT) {
|
||||||
applog(LOG_ERR, "%s%i:%d late update",
|
applog(LOG_ERR, "%s%i:%d late update",
|
||||||
klncgpu->drv->name, klncgpu->device_id, dev);
|
klncgpu->drv->name, klncgpu->device_id, dev);
|
||||||
klondike_get_stats(klncgpu);
|
klondike_get_stats(klncgpu);
|
||||||
@ -1262,17 +1292,22 @@ static bool klondike_queue_full(struct cgpu_info *klncgpu)
|
|||||||
} else {
|
} else {
|
||||||
applog(LOG_ERR, "%s%i:%d late update (%d) reached - attempting reset",
|
applog(LOG_ERR, "%s%i:%d late update (%d) reached - attempting reset",
|
||||||
klncgpu->drv->name, klncgpu->device_id,
|
klncgpu->drv->name, klncgpu->device_id,
|
||||||
dev, KLN_LATE_UPDATE_LIMIT);
|
dev, LATE_UPDATE_LIMIT);
|
||||||
control_init(klncgpu);
|
control_init(klncgpu);
|
||||||
kln_enable(klncgpu);
|
kln_enable(klncgpu);
|
||||||
klondike_get_stats(klncgpu);
|
klondike_get_stats(klncgpu);
|
||||||
rd_lock(&(klninfo->stat_lock));
|
rd_lock(&(klninfo->stat_lock));
|
||||||
if (ms_tdiff(&now, &(klninfo->jobque[dev].last_update)) > LATE_UPDATE_MS) {
|
howlong = ms_tdiff(&now, &(klninfo->jobque[dev].last_update));
|
||||||
|
if (howlong > LATE_UPDATE_MS) {
|
||||||
rd_unlock(&(klninfo->stat_lock));
|
rd_unlock(&(klninfo->stat_lock));
|
||||||
applog(LOG_ERR, "%s%i:%d reset failed - dropping device",
|
if (howlong > LATE_UPDATE_NODEV_MS) {
|
||||||
klncgpu->drv->name, klncgpu->device_id, dev);
|
applog(LOG_ERR, "%s%i:%d reset failed - dropping device",
|
||||||
klncgpu->shutdown = true;
|
klncgpu->drv->name, klncgpu->device_id, dev);
|
||||||
return false;
|
usb_nodev(klncgpu);
|
||||||
|
} else
|
||||||
|
cgsleep_ms(LATE_UPDATE_SLEEP_MS);
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1360,10 +1395,7 @@ static int64_t klondike_scanwork(struct thr_info *thr)
|
|||||||
rd_unlock(&(klninfo->stat_lock));
|
rd_unlock(&(klninfo->stat_lock));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (klncgpu->shutdown)
|
return newhashcount;
|
||||||
return -1;
|
|
||||||
else
|
|
||||||
return newhashcount;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user