mirror of
https://github.com/GOSTSec/sgminer
synced 2025-01-25 14:04:25 +00:00
commit
f14bf5b16c
13
adl.c
13
adl.c
@ -1070,9 +1070,7 @@ static bool fan_autotune(int gpu, int temp, int fanpercent, int lasttemp, bool *
|
|||||||
applog(LOG_WARNING, "Overheat detected on GPU %d, increasing fan to 100%", gpu);
|
applog(LOG_WARNING, "Overheat detected on GPU %d, increasing fan to 100%", gpu);
|
||||||
newpercent = iMax;
|
newpercent = iMax;
|
||||||
|
|
||||||
cgpu->device_last_not_well = time(NULL);
|
dev_error(cgpu, REASON_DEV_OVER_HEAT);
|
||||||
cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
|
|
||||||
cgpu->dev_over_heat_count++;
|
|
||||||
} else if (temp > ga->targettemp && fanpercent < top && tdiff >= 0) {
|
} else if (temp > ga->targettemp && fanpercent < top && tdiff >= 0) {
|
||||||
applog(LOG_DEBUG, "Temperature over target, increasing fanspeed");
|
applog(LOG_DEBUG, "Temperature over target, increasing fanspeed");
|
||||||
if (temp > ga->targettemp + opt_hysteresis)
|
if (temp > ga->targettemp + opt_hysteresis)
|
||||||
@ -1176,17 +1174,12 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
|
|||||||
applog(LOG_WARNING, "Hit thermal cutoff limit on GPU %d, disabling!", gpu);
|
applog(LOG_WARNING, "Hit thermal cutoff limit on GPU %d, disabling!", gpu);
|
||||||
*denable = DEV_RECOVER;
|
*denable = DEV_RECOVER;
|
||||||
newengine = ga->minspeed;
|
newengine = ga->minspeed;
|
||||||
|
dev_error(cgpu, REASON_DEV_THERMAL_CUTOFF);
|
||||||
cgpu->device_last_not_well = time(NULL);
|
|
||||||
cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
|
|
||||||
cgpu->dev_thermal_cutoff_count++;
|
|
||||||
} else if (temp > ga->overtemp && engine > ga->minspeed) {
|
} else if (temp > ga->overtemp && engine > ga->minspeed) {
|
||||||
applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu);
|
applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu);
|
||||||
newengine = ga->minspeed;
|
newengine = ga->minspeed;
|
||||||
|
|
||||||
cgpu->device_last_not_well = time(NULL);
|
dev_error(cgpu, REASON_DEV_OVER_HEAT);
|
||||||
cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
|
|
||||||
cgpu->dev_over_heat_count++;
|
|
||||||
} else if (temp > ga->targettemp + opt_hysteresis && engine > ga->minspeed && fan_optimal) {
|
} else if (temp > ga->targettemp + opt_hysteresis && engine > ga->minspeed && fan_optimal) {
|
||||||
applog(LOG_DEBUG, "Temperature %d degrees over target, decreasing clock speed", opt_hysteresis);
|
applog(LOG_DEBUG, "Temperature %d degrees over target, decreasing clock speed", opt_hysteresis);
|
||||||
newengine = engine - ga->lpOdParameters.sEngineClock.iStep;
|
newengine = engine - ga->lpOdParameters.sEngineClock.iStep;
|
||||||
|
19
cgminer.c
19
cgminer.c
@ -5487,10 +5487,7 @@ void *miner_thread(void *userdata)
|
|||||||
gettimeofday(&getwork_start, NULL);
|
gettimeofday(&getwork_start, NULL);
|
||||||
|
|
||||||
if (api->thread_init && !api->thread_init(mythr)) {
|
if (api->thread_init && !api->thread_init(mythr)) {
|
||||||
cgpu->device_last_not_well = time(NULL);
|
dev_error(cgpu, REASON_THREAD_FAIL_INIT);
|
||||||
cgpu->device_not_well_reason = REASON_THREAD_FAIL_INIT;
|
|
||||||
cgpu->thread_fail_init_count++;
|
|
||||||
|
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -5561,11 +5558,7 @@ void *miner_thread(void *userdata)
|
|||||||
if (unlikely(hashes == -1)) {
|
if (unlikely(hashes == -1)) {
|
||||||
applog(LOG_ERR, "%s %d failure, disabling!", api->name, cgpu->device_id);
|
applog(LOG_ERR, "%s %d failure, disabling!", api->name, cgpu->device_id);
|
||||||
cgpu->deven = DEV_DISABLED;
|
cgpu->deven = DEV_DISABLED;
|
||||||
|
dev_error(cgpu, REASON_THREAD_ZERO_HASH);
|
||||||
cgpu->device_last_not_well = time(NULL);
|
|
||||||
cgpu->device_not_well_reason = REASON_THREAD_ZERO_HASH;
|
|
||||||
cgpu->thread_zero_hash_count++;
|
|
||||||
|
|
||||||
mt_disable(mythr, thr_id, api);
|
mt_disable(mythr, thr_id, api);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -6130,9 +6123,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
|
|||||||
applog(LOG_ERR, "%s: Idle for more than 60 seconds, declaring SICK!", dev_str);
|
applog(LOG_ERR, "%s: Idle for more than 60 seconds, declaring SICK!", dev_str);
|
||||||
gettimeofday(&thr->sick, NULL);
|
gettimeofday(&thr->sick, NULL);
|
||||||
|
|
||||||
cgpu->device_last_not_well = time(NULL);
|
dev_error(cgpu, REASON_DEV_SICK_IDLE_60);
|
||||||
cgpu->device_not_well_reason = REASON_DEV_SICK_IDLE_60;
|
|
||||||
cgpu->dev_sick_idle_60_count++;
|
|
||||||
#ifdef HAVE_ADL
|
#ifdef HAVE_ADL
|
||||||
if (adl_active && cgpu->has_adl && gpu_activity(gpu) > 50) {
|
if (adl_active && cgpu->has_adl && gpu_activity(gpu) > 50) {
|
||||||
applog(LOG_ERR, "GPU still showing activity suggesting a hard hang.");
|
applog(LOG_ERR, "GPU still showing activity suggesting a hard hang.");
|
||||||
@ -6148,9 +6139,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
|
|||||||
applog(LOG_ERR, "%s: Not responded for more than 10 minutes, declaring DEAD!", dev_str);
|
applog(LOG_ERR, "%s: Not responded for more than 10 minutes, declaring DEAD!", dev_str);
|
||||||
gettimeofday(&thr->sick, NULL);
|
gettimeofday(&thr->sick, NULL);
|
||||||
|
|
||||||
cgpu->device_last_not_well = time(NULL);
|
dev_error(cgpu, REASON_DEV_DEAD_IDLE_600);
|
||||||
cgpu->device_not_well_reason = REASON_DEV_DEAD_IDLE_600;
|
|
||||||
cgpu->dev_dead_idle_600_count++;
|
|
||||||
} else if (now.tv_sec - thr->sick.tv_sec > 60 &&
|
} else if (now.tv_sec - thr->sick.tv_sec > 60 &&
|
||||||
(cgpu->status == LIFE_SICK || cgpu->status == LIFE_DEAD)) {
|
(cgpu->status == LIFE_SICK || cgpu->status == LIFE_DEAD)) {
|
||||||
/* Attempt to restart a GPU that's sick or dead once every minute */
|
/* Attempt to restart a GPU that's sick or dead once every minute */
|
||||||
|
@ -420,10 +420,7 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce)
|
|||||||
if (unlikely(bitforce->cutofftemp > 0 && temp > bitforce->cutofftemp)) {
|
if (unlikely(bitforce->cutofftemp > 0 && temp > bitforce->cutofftemp)) {
|
||||||
applog(LOG_WARNING, "BFL%i: Hit thermal cutoff limit, disabling!", bitforce->device_id);
|
applog(LOG_WARNING, "BFL%i: Hit thermal cutoff limit, disabling!", bitforce->device_id);
|
||||||
bitforce->deven = DEV_RECOVER;
|
bitforce->deven = DEV_RECOVER;
|
||||||
|
dev_error(bitforce, REASON_DEV_THERMAL_CUTOFF);
|
||||||
bitforce->device_last_not_well = time(NULL);
|
|
||||||
bitforce->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
|
|
||||||
bitforce->dev_thermal_cutoff_count++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -431,9 +428,7 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce)
|
|||||||
* our responses are out of sync and flush the buffer to
|
* our responses are out of sync and flush the buffer to
|
||||||
* hopefully recover */
|
* hopefully recover */
|
||||||
applog(LOG_WARNING, "BFL%i: Garbled response probably throttling, clearing buffer", bitforce->device_id);
|
applog(LOG_WARNING, "BFL%i: Garbled response probably throttling, clearing buffer", bitforce->device_id);
|
||||||
bitforce->device_last_not_well = time(NULL);
|
dev_error(bitforce, REASON_DEV_THROTTLE);
|
||||||
bitforce->device_not_well_reason = REASON_DEV_THROTTLE;
|
|
||||||
bitforce->dev_throttle_count++;
|
|
||||||
/* Count throttling episodes as hardware errors */
|
/* Count throttling episodes as hardware errors */
|
||||||
bitforce->hw_errors++;
|
bitforce->hw_errors++;
|
||||||
bitforce_clear_buffer(bitforce);
|
bitforce_clear_buffer(bitforce);
|
||||||
@ -568,9 +563,7 @@ static int64_t bitforce_get_result(struct thr_info *thr, struct work *work)
|
|||||||
if (elapsed.tv_sec > BITFORCE_TIMEOUT_S) {
|
if (elapsed.tv_sec > BITFORCE_TIMEOUT_S) {
|
||||||
applog(LOG_ERR, "BFL%i: took %dms - longer than %dms", bitforce->device_id,
|
applog(LOG_ERR, "BFL%i: took %dms - longer than %dms", bitforce->device_id,
|
||||||
tv_to_ms(elapsed), BITFORCE_TIMEOUT_MS);
|
tv_to_ms(elapsed), BITFORCE_TIMEOUT_MS);
|
||||||
bitforce->device_last_not_well = time(NULL);
|
dev_error(bitforce, REASON_DEV_OVER_HEAT);
|
||||||
bitforce->device_not_well_reason = REASON_DEV_OVER_HEAT;
|
|
||||||
bitforce->dev_over_heat_count++;
|
|
||||||
|
|
||||||
if (!pdevbuf[0]) /* Only return if we got nothing after timeout - there still may be results */
|
if (!pdevbuf[0]) /* Only return if we got nothing after timeout - there still may be results */
|
||||||
return 0;
|
return 0;
|
||||||
@ -673,9 +666,7 @@ static int64_t bitforce_scanhash(struct thr_info *thr, struct work *work, int64_
|
|||||||
if (ret == -1) {
|
if (ret == -1) {
|
||||||
ret = 0;
|
ret = 0;
|
||||||
applog(LOG_ERR, "BFL%i: Comms error", bitforce->device_id);
|
applog(LOG_ERR, "BFL%i: Comms error", bitforce->device_id);
|
||||||
bitforce->device_last_not_well = time(NULL);
|
dev_error(bitforce, REASON_DEV_COMMS_ERROR);
|
||||||
bitforce->device_not_well_reason = REASON_DEV_COMMS_ERROR;
|
|
||||||
bitforce->dev_comms_error_count++;
|
|
||||||
bitforce->hw_errors++;
|
bitforce->hw_errors++;
|
||||||
/* empty read buffer */
|
/* empty read buffer */
|
||||||
bitforce_clear_buffer(bitforce);
|
bitforce_clear_buffer(bitforce);
|
||||||
|
@ -669,9 +669,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
|
|||||||
if (icarus->device_fd == -1)
|
if (icarus->device_fd == -1)
|
||||||
if (!icarus_prepare(thr)) {
|
if (!icarus_prepare(thr)) {
|
||||||
applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
|
applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
|
||||||
icarus->device_last_not_well = time(NULL);
|
dev_error(icarus, REASON_DEV_COMMS_ERROR);
|
||||||
icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR;
|
|
||||||
icarus->dev_comms_error_count++;
|
|
||||||
|
|
||||||
// fail the device if the reopen attempt fails
|
// fail the device if the reopen attempt fails
|
||||||
return -1;
|
return -1;
|
||||||
@ -691,9 +689,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
|
|||||||
if (ret) {
|
if (ret) {
|
||||||
do_icarus_close(thr);
|
do_icarus_close(thr);
|
||||||
applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
|
applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
|
||||||
icarus->device_last_not_well = time(NULL);
|
dev_error(icarus, REASON_DEV_COMMS_ERROR);
|
||||||
icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR;
|
|
||||||
icarus->dev_comms_error_count++;
|
|
||||||
return 0; /* This should never happen */
|
return 0; /* This should never happen */
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -713,9 +709,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
|
|||||||
if (ret == ICA_GETS_ERROR) {
|
if (ret == ICA_GETS_ERROR) {
|
||||||
do_icarus_close(thr);
|
do_icarus_close(thr);
|
||||||
applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
|
applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
|
||||||
icarus->device_last_not_well = time(NULL);
|
dev_error(icarus, REASON_DEV_COMMS_ERROR);
|
||||||
icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR;
|
|
||||||
icarus->dev_comms_error_count++;
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -560,17 +560,13 @@ static uint64_t modminer_process_results(struct thr_info *thr)
|
|||||||
applog(LOG_WARNING, "%s%u.%u: Hit thermal cutoff limit (%f) at %f, disabling device!", modminer->api->name, modminer->device_id, fpgaid, MODMINER_CUTOFF_TEMP, state->temp);
|
applog(LOG_WARNING, "%s%u.%u: Hit thermal cutoff limit (%f) at %f, disabling device!", modminer->api->name, modminer->device_id, fpgaid, MODMINER_CUTOFF_TEMP, state->temp);
|
||||||
modminer_delta_clock(thr, true, MODMINER_OVERHEAT_CLOCK, true);
|
modminer_delta_clock(thr, true, MODMINER_OVERHEAT_CLOCK, true);
|
||||||
|
|
||||||
|
dev_error(modminer, REASON_DEV_THERMAL_CUTOFF);
|
||||||
modminer->deven = DEV_RECOVER;
|
modminer->deven = DEV_RECOVER;
|
||||||
modminer->device_last_not_well = time(NULL);
|
|
||||||
modminer->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
|
|
||||||
modminer->dev_thermal_cutoff_count++;
|
|
||||||
} else {
|
} else {
|
||||||
applog(LOG_WARNING, "%s%u.%u Overheat limit (%f) reached %f", modminer->api->name, modminer->device_id, fpgaid, MODMINER_OVERHEAT_TEMP, state->temp);
|
applog(LOG_WARNING, "%s%u.%u Overheat limit (%f) reached %f", modminer->api->name, modminer->device_id, fpgaid, MODMINER_OVERHEAT_TEMP, state->temp);
|
||||||
modminer_delta_clock(thr, true, MODMINER_CLOCK_DOWN, true);
|
modminer_delta_clock(thr, true, MODMINER_CLOCK_DOWN, true);
|
||||||
|
|
||||||
modminer->device_last_not_well = time(NULL);
|
dev_error(modminer, REASON_DEV_OVER_HEAT);
|
||||||
modminer->device_not_well_reason = REASON_DEV_OVER_HEAT;
|
|
||||||
modminer->dev_over_heat_count++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
43
util.c
43
util.c
@ -1423,3 +1423,46 @@ out:
|
|||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void dev_error(struct cgpu_info *dev, enum dev_reason reason)
|
||||||
|
{
|
||||||
|
dev->device_last_not_well = time(NULL);
|
||||||
|
dev->device_not_well_reason = reason;
|
||||||
|
|
||||||
|
|
||||||
|
switch (reason)
|
||||||
|
{
|
||||||
|
case REASON_THREAD_FAIL_INIT:
|
||||||
|
dev->thread_fail_init_count++;
|
||||||
|
break;
|
||||||
|
case REASON_THREAD_ZERO_HASH:
|
||||||
|
dev->thread_zero_hash_count++;
|
||||||
|
break;
|
||||||
|
case REASON_THREAD_FAIL_QUEUE:
|
||||||
|
dev->thread_fail_queue_count++;
|
||||||
|
break;
|
||||||
|
case REASON_DEV_SICK_IDLE_60:
|
||||||
|
dev->dev_sick_idle_60_count++;
|
||||||
|
break;
|
||||||
|
case REASON_DEV_DEAD_IDLE_600:
|
||||||
|
dev->dev_dead_idle_600_count++;
|
||||||
|
break;
|
||||||
|
case REASON_DEV_NOSTART:
|
||||||
|
dev->dev_nostart_count++;
|
||||||
|
break;
|
||||||
|
case REASON_DEV_OVER_HEAT:
|
||||||
|
dev->dev_over_heat_count++;
|
||||||
|
break;
|
||||||
|
case REASON_DEV_THERMAL_CUTOFF:
|
||||||
|
dev->dev_thermal_cutoff_count++;
|
||||||
|
break;
|
||||||
|
case REASON_DEV_COMMS_ERROR:
|
||||||
|
dev->dev_comms_error_count++;
|
||||||
|
break;
|
||||||
|
case REASON_DEV_THROTTLE:
|
||||||
|
dev->dev_throttle_count++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
3
util.h
3
util.h
@ -43,11 +43,14 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
struct pool;
|
struct pool;
|
||||||
|
enum dev_reason;
|
||||||
|
struct cgpu_info;
|
||||||
bool stratum_send(struct pool *pool, char *s, ssize_t len);
|
bool stratum_send(struct pool *pool, char *s, ssize_t len);
|
||||||
char *recv_line(struct pool *pool);
|
char *recv_line(struct pool *pool);
|
||||||
bool parse_method(struct pool *pool, char *s);
|
bool parse_method(struct pool *pool, char *s);
|
||||||
bool extract_sockaddr(struct pool *pool, char *url);
|
bool extract_sockaddr(struct pool *pool, char *url);
|
||||||
bool auth_stratum(struct pool *pool);
|
bool auth_stratum(struct pool *pool);
|
||||||
bool initiate_stratum(struct pool *pool);
|
bool initiate_stratum(struct pool *pool);
|
||||||
|
void dev_error(struct cgpu_info *dev, enum dev_reason reason);
|
||||||
|
|
||||||
#endif /* __UTIL_H__ */
|
#endif /* __UTIL_H__ */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user