Skip to content

Commit 7f96d91

Browse files
committed
hw-mgmt: thermal: Fan state validation after re-insertion
A customer (case #4486661) reported fan start issues after re-insertion. After being re-inserted, fans in some systems may fail to start rotating. A stuck fan can be made to start rotating by either of the following actions: 1) Pulling out and reinserting an additional fan. 2) Lowering the PWM of all fans for a few seconds. Commit ff627 introduced FR #4521169 in Thermal control 2.5. This patch enables the feature for Thermal control 2.0. Bugs: #4360106 NVBug: #5317658 Signed-off-by: Ciju Rajan K <[email protected]> Signed-off-by: Oleksandr Shamray <[email protected]>
1 parent 468b2a1 commit 7f96d91

File tree

1 file changed

+108
-0
lines changed

1 file changed

+108
-0
lines changed

usr/usr/bin/hw_management_thermal_control.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@ class CONST(object):
9595
SYS_CONF_SENSOR_LIST_PARAM = "sensor_list"
9696
SYS_CONF_ERR_MASK = "error_mask"
9797
SYS_CONF_REDUNDANCY_PARAM = "redundancy"
98+
SYS_CONF_GENERAL_CONFIG_PARAM = "general_config"
99+
SYS_CONF_FAN_STEADY_STATE_DELAY = "fan_steady_state_delay"
100+
SYS_CONF_FAN_STEADY_STATE_PWM = "fan_steady_state_pwm"
101+
SYS_CONF_FAN_STEADY_ATTENTION_ITEMS = "attention_fans"
98102

99103
# *************************
100104
# Folders definition
@@ -169,6 +173,10 @@ class CONST(object):
169173
# FAN RPM tolerance in percent
170174
FAN_RPM_TOLERANCE = 30
171175

176+
# attention fan insertion recovery defaults
177+
FAN_STEADY_STATE_DELAY_DEF = 0
178+
FAN_STEADY_STATE_PWM_DEF = 50
179+
172180
# default system devices
173181
PSU_COUNT_DEF = 2
174182
FAN_DRWR_COUNT_DEF = 6
@@ -2035,6 +2043,11 @@ def __init__(self, cmd_arg, sys_config, name, tc_logger):
20352043

20362044
self.rpm_valid_state = True
20372045

2046+
self.insert_status = 0
2047+
self.insert_event_ts = 0
2048+
self.insert_failed = False
2049+
self.insert_event = False
2050+
20382051
# ----------------------------------------------------------------------
20392052
def sensor_configure(self):
20402053
"""
@@ -2053,6 +2066,11 @@ def sensor_configure(self):
20532066
self.fan_shutdown(False)
20542067
self.pwm_set = self.read_pwm(CONST.PWM_MIN)
20552068

2069+
self.insert_status = 0
2070+
self.insert_event_ts = 0
2071+
self.insert_failed = False
2072+
self.insert_event = False
2073+
20562074
# ----------------------------------------------------------------------
20572075
def refresh_attr(self):
20582076
"""
@@ -2116,6 +2134,42 @@ def _get_status(self):
21162134
self.log.error("Value reading from file: {}".format(status_filename))
21172135
return status
21182136

2137+
# ----------------------------------------------------------------------
2138+
def update_insert_state(self):
    """
    @summary: Refresh the insertion tracking state of the fan.
    Reads the current status through _get_status(). A change from a
    falsy previous status to a truthy one is treated as an insertion:
    the event timestamp is recorded and insert_event is raised.
    A falsy status clears both the pending insertion event and the
    insert_failed flag. The status read is always remembered in
    insert_status for the next call.
    """
    status = self._get_status()
    if status:
        if not self.insert_status:
            # Transition "no status" -> "status": remember when it happened
            self.insert_event_ts = self.get_timestump()
            self.insert_event = True
    else:
        # Keep the flag boolean, matching every other assignment to it
        self.insert_event = False
        self.insert_failed = False
    self.insert_status = status
2151+
2152+
# ----------------------------------------------------------------------
2153+
def is_insert_failed(self):
    """
    @summary: Evaluate a pending insertion event once the relax time
    (CONST.FAN_RELAX_TIME * 1000, added to the event timestamp) has elapsed.
    If any entry returned by _get_fault() equals 1, insert_failed is set.
    @return: current value of insert_failed
    """
    if not self.insert_event:
        return self.insert_failed

    relax_deadline = self.insert_event_ts + CONST.FAN_RELAX_TIME * 1000
    if relax_deadline > self.get_timestump():
        # Relax time has not elapsed yet, nothing to evaluate
        return self.insert_failed

    for fault in self._get_fault():
        if fault == 1:
            self.insert_failed = True
            break

    # NOTE(review): nesting reconstructed from a whitespace-stripped diff;
    # confirm the event is meant to be consumed whether or not a fault was seen.
    self.insert_event = False
    return self.insert_failed
2165+
2166+
# ----------------------------------------------------------------------
2167+
def reset_insert_failed_state(self):
    """
    @summary: Clear the insert_failed flag of this fan
    """
    # Only the failure flag is touched; the other insert_* fields are left as is
    self.insert_failed = False
2172+
21192173
# ----------------------------------------------------------------------
21202174
def _get_fault(self):
21212175
"""
@@ -2687,6 +2741,18 @@ def __init__(self, cmd_arg, tc_logger):
26872741
self.exit.wait(10)
26882742
self.log.notice("PWM control activated", 1)
26892743

2744+
# Optional fan insertion recovery settings, read from the general_config section
self.attention_fans_lst = get_dict_val_by_path(self.sys_config, [CONST.SYS_CONF_GENERAL_CONFIG_PARAM, CONST.SYS_CONF_FAN_STEADY_ATTENTION_ITEMS])
if self.attention_fans_lst:
    self.fan_steady_state_delay = get_dict_val_by_path(self.sys_config, [CONST.SYS_CONF_GENERAL_CONFIG_PARAM, CONST.SYS_CONF_FAN_STEADY_STATE_DELAY])
    if not self.fan_steady_state_delay:
        self.fan_steady_state_delay = CONST.FAN_STEADY_STATE_DELAY_DEF
    self.fan_steady_state_pwm = get_dict_val_by_path(self.sys_config, [CONST.SYS_CONF_GENERAL_CONFIG_PARAM, CONST.SYS_CONF_FAN_STEADY_STATE_PWM])
    if not self.fan_steady_state_pwm:
        # The default PWM belongs in fan_steady_state_pwm; storing it in
        # fan_steady_state_delay left the PWM unset and overwrote the delay.
        self.fan_steady_state_pwm = CONST.FAN_STEADY_STATE_PWM_DEF
    self.log.info("Fan {} insertion recovery enabled: delay {}s, pwm {}%".format(self.attention_fans_lst,
                                                                             self.fan_steady_state_delay,
                                                                             self.fan_steady_state_pwm))
2755+
26902756
# Set PWM to the default state while we are waiting for system configuration
26912757
self.log.notice("Set FAN PWM {}".format(self.pwm_target), 1)
26922758
if not self.write_pwm(self.pwm_target, validate=True):
@@ -2936,6 +3002,30 @@ def _get_chassis_fan_dir(self):
29363002

29373003
return pref_dir
29383004

3005+
# ---------------------------------------------------------------------
3006+
def _is_attention_fan_insertion_fail(self):
    """
    @summary: Poll the attention fans for a failed start after insertion.
    Each fan gets its insert state refreshed and is then checked. The scan
    stops at the first fan reporting a failure: the failure is logged, the
    fan's failure flag is reset, and the remaining fans are not visited
    during this call.
    @return: True if a fan reported a failed insertion, otherwise False
    """
    for fan in self.attention_fans:
        fan.update_insert_state()
        if not fan.is_insert_failed():
            continue
        self.log.notice("{} fan not started after insertion".format(fan.name))
        fan.reset_insert_failed_state()
        return True
    return False
3016+
3017+
# ---------------------------------------------------------------------
3018+
def _attention_fan_insertion_recovery(self):
    """
    @summary: Try to get a fan that did not start after insertion rotating.
    Reads the current PWM, forces the chassis fans to fan_steady_state_pwm,
    waits fan_steady_state_delay seconds and then forces the previously
    read PWM back. The wait ends early if the exit event is set; the
    previous PWM is still written back in that case.
    """
    # presumably 100 is the fallback value used when the PWM cannot be read - TODO confirm
    pwm = self.read_pwm(100)
    self.log.notice("Attention fan not started after insertion: Setting pwm to {}% from {}%".format(self.fan_steady_state_pwm, pwm), 1)
    self._update_chassis_fan_speed(self.fan_steady_state_pwm, force=True)
    self.log.info("Waiting {}s for newly inserted fan to stabilize".format(self.fan_steady_state_delay))
    timeout = current_milli_time() + 1000 * self.fan_steady_state_delay
    while timeout > current_milli_time():
        # Assumes self.exit is a threading.Event: wait() returns True once the
        # event is set. Without this check a set event makes wait() return
        # immediately and the loop spins until the timeout expires.
        if self.exit.wait(1):
            break
    self.log.info("Resuming normal operation: Setting pwm back to {}%".format(pwm))
    self._update_chassis_fan_speed(pwm, force=True)
3028+
29393029
# ----------------------------------------------------------------------
29403030
def _update_psu_fan_speed(self, pwm):
29413031
"""
@@ -3313,6 +3403,9 @@ def load_configuration(self):
33133403
if CONST.SYS_CONF_REDUNDANCY_PARAM not in sys_config:
33143404
sys_config[CONST.SYS_CONF_REDUNDANCY_PARAM] = {}
33153405

3406+
if CONST.SYS_CONF_GENERAL_CONFIG_PARAM not in sys_config:
3407+
sys_config[CONST.SYS_CONF_GENERAL_CONFIG_PARAM] = {}
3408+
33163409
self.sys_config = sys_config
33173410

33183411
# ----------------------------------------------------------------------
@@ -3522,6 +3615,16 @@ def init(self):
35223615
self.dev_obj_list.sort(key=lambda x: x.name)
35233616
self.write_file(CONST.PERIODIC_REPORT_FILE, self.periodic_report_time)
35243617

3618+
self.attention_fans = []
3619+
if self.attention_fans_lst:
3620+
for fan_drwr_name in self.attention_fans_lst:
3621+
fan_drwr_obj = self._get_dev_obj(fan_drwr_name)
3622+
if not fan_drwr_obj:
3623+
self.log.warn("Dev name {} missing in system_config".format(fan_drwr_name))
3624+
continue
3625+
self.attention_fans.append(fan_drwr_obj)
3626+
self.log.info("{} added to attention_fans".format(fan_drwr_name))
3627+
35253628
# ----------------------------------------------------------------------
35263629
def start(self, reason=""):
35273630
"""
@@ -3627,6 +3730,11 @@ def run(self):
36273730
self.exit.wait(30)
36283731
continue
36293732

3733+
if self._is_attention_fan_insertion_fail():
3734+
self.log.info("Attention fan insertion failed, trying to recover")
3735+
self._attention_fan_insertion_recovery()
3736+
continue
3737+
36303738
if self._is_suspend():
36313739
self.stop(reason="suspend")
36323740
self.exit.wait(5)

0 commit comments

Comments
 (0)