Skip to content

Commit a577317

Browse files
committed
hw-mgmt: thermal: TC add attention_fans config for SN4280
TC add attention_fans config for SN4280. FANs: ["drwr1", "drwr2", "drwr3", "drwr4"]; fan_steady_state_delay = 10 sec; fan_steady_state_pwm = 50%. Bugs: #4360106; NVBug: #5317658. Signed-off-by: Ciju Rajan K <[email protected]>
1 parent 7f96d91 commit a577317

File tree

2 files changed

+97
-0
lines changed

2 files changed

+97
-0
lines changed
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
root@r-bobcat-02:~# dmidecode -t1 -t2
2+
# dmidecode 3.4
3+
Getting SMBIOS data from sysfs.
4+
SMBIOS 3.2.1 present.
5+
6+
Handle 0x0001, DMI type 1, 27 bytes
7+
System Information
8+
Manufacturer: Nvidia
9+
Product Name: SN4280
10+
Version: V0-C*GeGdFdRiRaEg-S*GbGbFcRaRaRaRaA0RaTcEiFcEi-D*TfGeGdFaRaRa-F*Tc
11+
Serial Number: MT2428XZ0JXV
12+
UUID: b908e90a-a121-11ef-8000-b0cf0e209200
13+
Wake-up Type: Power Switch
14+
SKU Number: HI160
15+
Family: Not Specified
16+
17+
Handle 0x0002, DMI type 2, 15 bytes
18+
Base Board Information
19+
Manufacturer: Nvidia
20+
Product Name: VMOD0019
21+
Version: A2
22+
Serial Number: MT2428XZ0HXA
23+
Asset Tag: Not Specified
24+
Features:
25+
Board is a hosting board
26+
Board is removable
27+
Board is replaceable
28+
Location In Chassis: Not Specified
29+
Chassis Handle: 0x0003
30+
Type: Motherboard
31+
Contained Object Handles: 0
32+
33+
root@r-bobcat-02:~# systemctl status hw-management-tc
34+
● hw-management-tc.service - Thermal control service (ver 2.0) of Mellanox systems
35+
Loaded: loaded (/lib/systemd/system/hw-management-tc.service; enabled; preset: enabled)
36+
Active: active (running) since Fri 2025-07-11 08:05:32 UTC; 42s ago
37+
Docs: man:hw-management-tc.service(8)
38+
Main PID: 19259 (hw_management_t)
39+
Tasks: 1 (limit: 153549)
40+
Memory: 17.4M
41+
CGroup: /system.slice/hw-management-tc.service
42+
└─19259 /usr/bin/python /usr/bin/hw_management_thermal_control.py
43+
44+
Jul 11 08:05:32 r-bobcat-02 systemd[1]: Started hw-management-tc.service - Thermal control service (ver 2.0) of Mellanox systems.
45+
Jul 11 08:05:33 r-bobcat-02 hw-management-tc[19259]: NOTICE - Preinit thermal control ver 2.1.0
46+
Jul 11 08:05:33 r-bobcat-02 hw-management-tc[19259]: NOTICE - Set FAN PWM 100
47+
Jul 11 08:05:33 r-bobcat-02 hw-management-tc[19259]: NOTICE - Additional delay defined in ./config/thermal_delay (35 sec).
48+
Jul 11 08:05:33 r-bobcat-02 hw-management-tc[19259]: NOTICE - Mellanox thermal control is waiting for configuration (60 sec).
49+
50+
root@r-bobcat-02:~# cat /var/run/hw-management/thermal/pwm1
51+
255
52+
53+
root@r-gaur-01:/var/run/hw-management/thermal# dvs_start.sh --sdk_bridge_mode=HYBRID
54+
55+
root@r-bobcat-02:/var/run/hw-management/thermal# cat asic
56+
64000
57+
58+
root@r-bobcat-02:/var/run/hw-management/thermal# cat pwm1
59+
76
60+
61+
# TC: Simulate fan insertion and fault condition for Fan drawer 1
62+
root@r-bobcat-02:/var/run/hw-management/thermal# unlink fan1_fault
63+
root@r-bobcat-02:/var/run/hw-management/thermal# unlink fan1_status
64+
root@r-bobcat-02:/var/run/hw-management/thermal# unlink fan1_speed_get
65+
root@r-bobcat-02:/var/run/hw-management/thermal# echo 0 > fan1_speed_get; echo 1 > fan1_fault; echo 0 > fan1_status
66+
# Wait for some time
67+
root@r-bobcat-02:/var/run/hw-management/thermal# echo 1 > fan1_status
68+
root@r-bobcat-02:/var/run/hw-management/thermal# cat pwm1
69+
127
70+
71+
72+
=======================Snippet from /var/log/tc_log======================================
73+
2025-07-11 08:20:34,765 - INFO - drwr1:[1] tacho1=0 out of RPM range 3100:11000
74+
2025-07-11 08:20:34,767 - WARNING - drwr1:[1] status 0. Set PWM 20
75+
2025-07-11 08:20:34,767 - WARNING - drwr1:[1] incorrect rpm [0]. Set PWM 20
76+
2025-07-11 08:20:39,804 - INFO - drwr1:[1] tacho1=0 out of RPM range 3100:11000
77+
2025-07-11 08:20:39,805 - WARNING - drwr1:[1] status 0. Set PWM 20
78+
2025-07-11 08:20:39,805 - WARNING - drwr1:[1] incorrect rpm [0]. Set PWM 20
79+
2025-07-11 08:20:44,805 - INFO - drwr1:[1] tacho1=0 out of RPM range 3100:11000
80+
2025-07-11 08:20:44,806 - WARNING - drwr1:[1] status 0. Set PWM 20
81+
2025-07-11 08:20:44,807 - WARNING - drwr1:[1] incorrect rpm [0]. Set PWM 20
82+
2025-07-11 08:20:49,808 - INFO - drwr1:[1] tacho1=0 out of RPM range 3100:11000
83+
2025-07-11 08:20:49,809 - WARNING - drwr1:[1] incorrect rpm [0]. Set PWM 20
84+
2025-07-11 08:20:54,823 - INFO - drwr1:[1] tacho1=0 out of RPM range 3100:11000
85+
2025-07-11 08:20:54,824 - WARNING - drwr1:[1] incorrect rpm [0]. Set PWM 20
86+
2025-07-11 08:20:55,825 - NOTICE - drwr1:[1] fan not started after insertion
87+
2025-07-11 08:20:55,825 - INFO - Attention fan insertion failed, trying to recover
88+
2025-07-11 08:20:55,825 - NOTICE - @syslog Attention fan not started after insertion: Setting pwm to 50% from 30%
89+
2025-07-11 08:20:55,825 - INFO - Update chassis FAN PWM 50
90+
2025-07-11 08:20:55,825 - INFO - Write drwr1:[1] PWM 50
91+
2025-07-11 08:20:55,825 - INFO - Write drwr2:[2] PWM 50
92+
2025-07-11 08:20:55,826 - INFO - Write drwr3:[3] PWM 50
93+
2025-07-11 08:20:55,826 - INFO - Write drwr4:[4] PWM 50
94+
2025-07-11 08:20:55,826 - INFO - Waiting 10s for newly inserted fan to stabilize
95+
2025-07-11 08:21:05,827 - INFO - Resuming normal operation: Setting pwm back to 30%
96+
2025-07-11 08:21:05,827 - INFO - Update chassis FAN PWM 30

usr/etc/hw-management-thermal/tc_config_sn4280.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
"1" : {"rpm_min":3100, "rpm_max":11000, "slope": 98.8, "pwm_min" : 20, "pwm_max_reduction" : 10, "rpm_tolerance" : 30}
3737
}
3838
},
39+
"general_config" : {"attention_fans" : ["drwr1", "drwr2", "drwr3", "drwr4"], "fan_steady_state_delay" : 10, "fan_steady_state_pwm" : 50},
3940
"dev_parameters" : {
4041
"asic\\d*": {"pwm_min": 30, "pwm_max" : 100, "val_min":"!70000", "val_max":"!105000", "poll_time": 3, "sensor_read_error":100},
4142
"(cpu_pack|cpu_core\\d+)": {"pwm_min": 30, "pwm_max" : 100, "val_min": "!70000", "val_max": "!105000", "poll_time": 3, "sensor_read_error":100},

0 commit comments

Comments
 (0)