diff --git a/adl.c b/adl.c index 9a0c9b2a..d5e83a90 100644 --- a/adl.c +++ b/adl.c @@ -1070,9 +1070,7 @@ static bool fan_autotune(int gpu, int temp, int fanpercent, int lasttemp, bool * applog(LOG_WARNING, "Overheat detected on GPU %d, increasing fan to 100%", gpu); newpercent = iMax; - cgpu->device_last_not_well = time(NULL); - cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT; - cgpu->dev_over_heat_count++; + dev_error(cgpu, REASON_DEV_OVER_HEAT); } else if (temp > ga->targettemp && fanpercent < top && tdiff >= 0) { applog(LOG_DEBUG, "Temperature over target, increasing fanspeed"); if (temp > ga->targettemp + opt_hysteresis) @@ -1176,17 +1174,12 @@ void gpu_autotune(int gpu, enum dev_enable *denable) applog(LOG_WARNING, "Hit thermal cutoff limit on GPU %d, disabling!", gpu); *denable = DEV_RECOVER; newengine = ga->minspeed; - - cgpu->device_last_not_well = time(NULL); - cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF; - cgpu->dev_thermal_cutoff_count++; + dev_error(cgpu, REASON_DEV_THERMAL_CUTOFF); } else if (temp > ga->overtemp && engine > ga->minspeed) { applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu); newengine = ga->minspeed; - cgpu->device_last_not_well = time(NULL); - cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT; - cgpu->dev_over_heat_count++; + dev_error(cgpu, REASON_DEV_OVER_HEAT); } else if (temp > ga->targettemp + opt_hysteresis && engine > ga->minspeed && fan_optimal) { applog(LOG_DEBUG, "Temperature %d degrees over target, decreasing clock speed", opt_hysteresis); newengine = engine - ga->lpOdParameters.sEngineClock.iStep; diff --git a/cgminer.c b/cgminer.c index 134212c6..83159b21 100644 --- a/cgminer.c +++ b/cgminer.c @@ -5487,10 +5487,7 @@ void *miner_thread(void *userdata) gettimeofday(&getwork_start, NULL); if (api->thread_init && !api->thread_init(mythr)) { - cgpu->device_last_not_well = time(NULL); - cgpu->device_not_well_reason = REASON_THREAD_FAIL_INIT; - cgpu->thread_fail_init_count++; - + 
dev_error(cgpu, REASON_THREAD_FAIL_INIT); goto out; } @@ -5561,11 +5558,7 @@ void *miner_thread(void *userdata) if (unlikely(hashes == -1)) { applog(LOG_ERR, "%s %d failure, disabling!", api->name, cgpu->device_id); cgpu->deven = DEV_DISABLED; - - cgpu->device_last_not_well = time(NULL); - cgpu->device_not_well_reason = REASON_THREAD_ZERO_HASH; - cgpu->thread_zero_hash_count++; - + dev_error(cgpu, REASON_THREAD_ZERO_HASH); mt_disable(mythr, thr_id, api); } @@ -6130,9 +6123,7 @@ static void *watchdog_thread(void __maybe_unused *userdata) applog(LOG_ERR, "%s: Idle for more than 60 seconds, declaring SICK!", dev_str); gettimeofday(&thr->sick, NULL); - cgpu->device_last_not_well = time(NULL); - cgpu->device_not_well_reason = REASON_DEV_SICK_IDLE_60; - cgpu->dev_sick_idle_60_count++; + dev_error(cgpu, REASON_DEV_SICK_IDLE_60); #ifdef HAVE_ADL if (adl_active && cgpu->has_adl && gpu_activity(gpu) > 50) { applog(LOG_ERR, "GPU still showing activity suggesting a hard hang."); @@ -6148,9 +6139,7 @@ static void *watchdog_thread(void __maybe_unused *userdata) applog(LOG_ERR, "%s: Not responded for more than 10 minutes, declaring DEAD!", dev_str); gettimeofday(&thr->sick, NULL); - cgpu->device_last_not_well = time(NULL); - cgpu->device_not_well_reason = REASON_DEV_DEAD_IDLE_600; - cgpu->dev_dead_idle_600_count++; + dev_error(cgpu, REASON_DEV_DEAD_IDLE_600); } else if (now.tv_sec - thr->sick.tv_sec > 60 && (cgpu->status == LIFE_SICK || cgpu->status == LIFE_DEAD)) { /* Attempt to restart a GPU that's sick or dead once every minute */ diff --git a/driver-bitforce.c b/driver-bitforce.c index ae9fa01c..4cb1721d 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -420,10 +420,7 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce) if (unlikely(bitforce->cutofftemp > 0 && temp > bitforce->cutofftemp)) { applog(LOG_WARNING, "BFL%i: Hit thermal cutoff limit, disabling!", bitforce->device_id); bitforce->deven = DEV_RECOVER; - - bitforce->device_last_not_well = time(NULL); - 
bitforce->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF; - bitforce->dev_thermal_cutoff_count++; + dev_error(bitforce, REASON_DEV_THERMAL_CUTOFF); } } } else { @@ -431,9 +428,7 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce) * our responses are out of sync and flush the buffer to * hopefully recover */ applog(LOG_WARNING, "BFL%i: Garbled response probably throttling, clearing buffer", bitforce->device_id); - bitforce->device_last_not_well = time(NULL); - bitforce->device_not_well_reason = REASON_DEV_THROTTLE; - bitforce->dev_throttle_count++; + dev_error(bitforce, REASON_DEV_THROTTLE); /* Count throttling episodes as hardware errors */ bitforce->hw_errors++; bitforce_clear_buffer(bitforce); @@ -568,9 +563,7 @@ static int64_t bitforce_get_result(struct thr_info *thr, struct work *work) if (elapsed.tv_sec > BITFORCE_TIMEOUT_S) { applog(LOG_ERR, "BFL%i: took %dms - longer than %dms", bitforce->device_id, tv_to_ms(elapsed), BITFORCE_TIMEOUT_MS); - bitforce->device_last_not_well = time(NULL); - bitforce->device_not_well_reason = REASON_DEV_OVER_HEAT; - bitforce->dev_over_heat_count++; + dev_error(bitforce, REASON_DEV_OVER_HEAT); if (!pdevbuf[0]) /* Only return if we got nothing after timeout - there still may be results */ return 0; @@ -673,9 +666,7 @@ static int64_t bitforce_scanhash(struct thr_info *thr, struct work *work, int64_ if (ret == -1) { ret = 0; applog(LOG_ERR, "BFL%i: Comms error", bitforce->device_id); - bitforce->device_last_not_well = time(NULL); - bitforce->device_not_well_reason = REASON_DEV_COMMS_ERROR; - bitforce->dev_comms_error_count++; + dev_error(bitforce, REASON_DEV_COMMS_ERROR); bitforce->hw_errors++; /* empty read buffer */ bitforce_clear_buffer(bitforce); diff --git a/driver-icarus.c b/driver-icarus.c index c3adafa6..c979a681 100644 --- a/driver-icarus.c +++ b/driver-icarus.c @@ -669,9 +669,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work, if (icarus->device_fd == -1) if (!icarus_prepare(thr)) { 
applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id); - icarus->device_last_not_well = time(NULL); - icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR; - icarus->dev_comms_error_count++; + dev_error(icarus, REASON_DEV_COMMS_ERROR); // fail the device if the reopen attempt fails return -1; @@ -691,9 +689,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work, if (ret) { do_icarus_close(thr); applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id); - icarus->device_last_not_well = time(NULL); - icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR; - icarus->dev_comms_error_count++; + dev_error(icarus, REASON_DEV_COMMS_ERROR); return 0; /* This should never happen */ } @@ -713,9 +709,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work, if (ret == ICA_GETS_ERROR) { do_icarus_close(thr); applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id); - icarus->device_last_not_well = time(NULL); - icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR; - icarus->dev_comms_error_count++; + dev_error(icarus, REASON_DEV_COMMS_ERROR); return 0; } diff --git a/driver-modminer.c b/driver-modminer.c index 9c4c45be..afdd68b1 100644 --- a/driver-modminer.c +++ b/driver-modminer.c @@ -560,17 +560,13 @@ static uint64_t modminer_process_results(struct thr_info *thr) applog(LOG_WARNING, "%s%u.%u: Hit thermal cutoff limit (%f) at %f, disabling device!", modminer->api->name, modminer->device_id, fpgaid, MODMINER_CUTOFF_TEMP, state->temp); modminer_delta_clock(thr, true, MODMINER_OVERHEAT_CLOCK, true); + dev_error(modminer, REASON_DEV_THERMAL_CUTOFF); modminer->deven = DEV_RECOVER; - modminer->device_last_not_well = time(NULL); - modminer->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF; - modminer->dev_thermal_cutoff_count++; } else { applog(LOG_WARNING, "%s%u.%u Overheat limit (%f) reached %f", modminer->api->name, modminer->device_id, fpgaid, MODMINER_OVERHEAT_TEMP, state->temp); modminer_delta_clock(thr, true, 
MODMINER_CLOCK_DOWN, true); - modminer->device_last_not_well = time(NULL); - modminer->device_not_well_reason = REASON_DEV_OVER_HEAT; - modminer->dev_over_heat_count++; + dev_error(modminer, REASON_DEV_OVER_HEAT); } } } diff --git a/util.c b/util.c index d5c2e542..d9d1d48f 100644 --- a/util.c +++ b/util.c @@ -1423,3 +1423,46 @@ out: return ret; } + + /* dev_error: mark device `dev` as not well. Stores time(NULL) in dev->device_last_not_well and `reason` in dev->device_not_well_reason, then increments the one counter field that corresponds to `reason`. The switch has no default case, so a `reason` value not listed below updates only the timestamp and the stored reason. `dev` is dereferenced without a NULL check. */ +void dev_error(struct cgpu_info *dev, enum dev_reason reason) +{ + dev->device_last_not_well = time(NULL); + dev->device_not_well_reason = reason; + + + switch (reason) + { + case REASON_THREAD_FAIL_INIT: + dev->thread_fail_init_count++; + break; + case REASON_THREAD_ZERO_HASH: + dev->thread_zero_hash_count++; + break; + case REASON_THREAD_FAIL_QUEUE: + dev->thread_fail_queue_count++; + break; + case REASON_DEV_SICK_IDLE_60: + dev->dev_sick_idle_60_count++; + break; + case REASON_DEV_DEAD_IDLE_600: + dev->dev_dead_idle_600_count++; + break; + case REASON_DEV_NOSTART: + dev->dev_nostart_count++; + break; + case REASON_DEV_OVER_HEAT: + dev->dev_over_heat_count++; + break; + case REASON_DEV_THERMAL_CUTOFF: + dev->dev_thermal_cutoff_count++; + break; + case REASON_DEV_COMMS_ERROR: + dev->dev_comms_error_count++; + break; + case REASON_DEV_THROTTLE: + dev->dev_throttle_count++; + break; + } + +} diff --git a/util.h b/util.h index d5ac54c3..61dc6689 100644 --- a/util.h +++ b/util.h @@ -43,11 +43,14 @@ #endif struct pool; /* Tag-only declarations so the dev_error prototype below can name these types. NOTE(review): ISO C does not permit declaring an enum tag before its definition (GCC accepts it as an extension) - confirm the extension is intended or that the definition is always visible first. */ +enum dev_reason; +struct cgpu_info; bool stratum_send(struct pool *pool, char *s, ssize_t len); char *recv_line(struct pool *pool); bool parse_method(struct pool *pool, char *s); bool extract_sockaddr(struct pool *pool, char *url); bool auth_stratum(struct pool *pool); bool initiate_stratum(struct pool *pool); /* Records a not-well event of kind `reason` on `dev`; defined in util.c. */ +void dev_error(struct cgpu_info *dev, enum dev_reason reason); #endif /* __UTIL_H__ */