# See the License for the specific language governing permissions and
# limitations under the License.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
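# Relies on the log, error, and get_boot_id helpers defined or sourced by this
# script.

# Wait for a node to complete remediation: first for its boot ID to change
# (proof that it actually rebooted), then for it to be uncordoned again.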
wait_for_boot_id_change() {
    local node=$1
    local original_boot_id=$2
    local timeout=600
    local elapsed=0

    log "Waiting for node $node to reboot (boot ID to change)..."

    while [[ $elapsed -lt $timeout ]]; do
        local current_boot_id
        current_boot_id=$(get_boot_id "$node" 2>/dev/null || echo "")

        if [[ -n "$current_boot_id" && "$current_boot_id" != "$original_boot_id" ]]; then
            log "Node $node rebooted successfully (boot ID changed)"
            elapsed=0 # reset so the uncordon wait below gets a fresh timeout window
            break
        fi

        sleep 5
        elapsed=$((elapsed + 5))
    done

    if [[ $elapsed -ge $timeout ]]; then
        error "Timeout waiting for node $node to reboot"
    fi

    log "Waiting for node $node to be uncordoned..."
    while [[ $elapsed -lt $timeout ]]; do
        local is_cordoned
        is_cordoned=$(kubectl get node "$node" -o jsonpath='{.spec.unschedulable}')

        if [[ "$is_cordoned" != "true" ]]; then
            log "Node $node is uncordoned and ready ✓"
            return 0
        fi

        sleep 5
        elapsed=$((elapsed + 5))
    done

    error "Timeout waiting for node $node to be uncordoned"
}

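# Test 1: inject DCGM health-watch errors (infoROM, PCIe, power) and verify
# they surface as node conditions and events before the node is quarantined
# and rebooted.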
test_gpu_monitoring_dcgm() {
    log "========================================="
    log "Test 1: GPU monitoring via DCGM"
    log "========================================="

    local gpu_node
    gpu_node=$(kubectl get nodes -l workload-type=gpu -o jsonpath='{.items[0].metadata.name}')

    if [[ -z "$gpu_node" ]]; then
        error "No GPU nodes found"
    fi

    log "Selected GPU node: $gpu_node"

    local original_boot_id
    original_boot_id=$(get_boot_id "$gpu_node")
    log "Original boot ID: $original_boot_id"

    local dcgm_pod
    dcgm_pod=$(kubectl get pods -n gpu-operator -l app=nvidia-dcgm -o jsonpath="{.items[?(@.spec.nodeName=='$gpu_node')].metadata.name}" | head -1)

    if [[ -z "$dcgm_pod" ]]; then
        error "No DCGM pod found on node $gpu_node"
    fi

    log "Injecting infoROM, PCIe, and power watch errors via DCGM on pod: $dcgm_pod"
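    # dcgmi test --inject writes a synthetic value (-v) for a DCGM field ID (-f)
    # on GPU 0. The infoROM and PCIe injections are expected to surface as node
    # conditions; the power injection is non-fatal and should only show up as a
    # node event (both are verified below).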
    kubectl exec -n gpu-operator "$dcgm_pod" -- dcgmi test --inject --gpuid 0 -f 84 -v 0      # infoROM watch error
    kubectl exec -n gpu-operator "$dcgm_pod" -- dcgmi test --inject --gpuid 0 -f 240 -v 1000  # PCIe watch error
    kubectl exec -n gpu-operator "$dcgm_pod" -- dcgmi test --inject --gpuid 0 -f 202 -v 99999 # power watch error

    log "Waiting for node conditions to appear..."
    local max_wait=30
    local waited=0
    while [[ $waited -lt $max_wait ]]; do
        conditions_count=$(kubectl get node "$gpu_node" -o json | jq '[.status.conditions[] | select(.type == "GpuInforomWatch" or .type == "GpuPcieWatch")] | length')
        if [[ "$conditions_count" -ge 2 ]]; then
            log "Found $conditions_count node conditions"
            break
        fi
        sleep 2
        waited=$((waited + 2))
    done

    log "Verifying node conditions are populated"
    kubectl get node "$gpu_node" -o json | jq -r '.status.conditions[] | select(.type == "GpuInforomWatch" or .type == "GpuPcieWatch") | "\(.type) Status=\(.status) Reason=\(.reason)"'

    inforom_condition=$(kubectl get node "$gpu_node" -o json | jq -r '.status.conditions[] | select(.type == "GpuInforomWatch" and .status == "True") | .type')
    pcie_condition=$(kubectl get node "$gpu_node" -o json | jq -r '.status.conditions[] | select(.type == "GpuPcieWatch" and .status == "True") | .type')

    if [[ -z "$inforom_condition" ]] || [[ -z "$pcie_condition" ]]; then
        error "Expected node conditions not found: GpuInforomWatch=$inforom_condition, GpuPcieWatch=$pcie_condition"
    fi
    log "Node conditions verified ✓"

    log "Verifying node events are populated (non-fatal errors appear here)"
    kubectl get events --field-selector involvedObject.name="$gpu_node" -o json | jq -r '.items[] | select(.reason | contains("IsNotHealthy")) | "\(.reason) Message=\(.message)"' | head -5

    power_event=$(kubectl get events --field-selector involvedObject.name="$gpu_node" -o json | jq -r '.items[] | select(.reason == "GpuPowerWatchIsNotHealthy") | .reason')
    if [[ -z "$power_event" ]]; then
        error "GpuPowerWatch event not found (non-fatal errors should create events)"
    fi
    log "Node event verified: GpuPowerWatch is non-fatal, appears in events ✓"

    log "Waiting for node to be quarantined and rebooted..."
    wait_for_boot_id_change "$gpu_node" "$original_boot_id"

    log "Test 1 PASSED ✓"
}

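# Test 2: write a fake NVRM Xid line into syslog via logger and verify the
# node is quarantined and rebooted.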
test_xid_monitoring_syslog() {
    log "========================================="
    log "Test 2: XID monitoring via syslog"
    log "========================================="

    local gpu_node
    gpu_node=$(kubectl get nodes -l workload-type=gpu -o jsonpath='{.items[0].metadata.name}')

    if [[ -z "$gpu_node" ]]; then
        error "No GPU nodes found"
    fi

    log "Selected GPU node: $gpu_node"

    local original_boot_id
    original_boot_id=$(get_boot_id "$gpu_node")
    log "Original boot ID: $original_boot_id"

    local driver_pod
    driver_pod=$(kubectl get pods -n gpu-operator -l app=nvidia-driver-daemonset -o jsonpath="{.items[?(@.spec.nodeName=='$gpu_node')].metadata.name}" | head -1)

    if [[ -z "$driver_pod" ]]; then
        error "No driver pod found on node $gpu_node"
    fi

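    # The logger call mimics the NVRM Xid line format that the syslog-based
    # monitor parses; XID 119 reports a GSP RPC timeout and should trigger
    # node remediation.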
    log "Injecting XID 119 message via logger on pod: $driver_pod"
    kubectl exec -n gpu-operator "$driver_pod" -- logger -p daemon.err "[6085126.134786] NVRM: Xid (PCI:0002:00:00): 119, pid=1582259, name=nvc:[driver], Timeout after 6s of waiting for RPC response from GPU1 GSP! Expected function 76 (GSP_RM_CONTROL) (0x20802a02 0x8)."

    log "Waiting for node to be quarantined and rebooted..."
    wait_for_boot_id_change "$gpu_node" "$original_boot_id"

    log "Test 2 PASSED ✓"
}

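# Test 3: inject NVSwitch SXid messages into syslog; the fatal SXID should
# surface as a node condition, the non-fatal one as a node event, and the
# node should then be quarantined and rebooted.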
test_sxid_monitoring_syslog() {
    log "========================================="
    log "Test 3: SXID monitoring (NVSwitch errors)"
    log "========================================="

    local gpu_node
    gpu_node=$(kubectl get nodes -l workload-type=gpu -o jsonpath='{.items[0].metadata.name}')

    if [[ -z "$gpu_node" ]]; then
        error "No GPU nodes found"
    fi

    log "Selected GPU node: $gpu_node"

    local original_boot_id
    original_boot_id=$(get_boot_id "$gpu_node")
    log "Original boot ID: $original_boot_id"

    local dcgm_pod
    dcgm_pod=$(kubectl get pods -n gpu-operator -l app=nvidia-dcgm -o jsonpath="{.items[?(@.spec.nodeName=='$gpu_node')].metadata.name}" | head -1)

    if [[ -z "$dcgm_pod" ]]; then
        error "No DCGM pod found on node $gpu_node"
    fi

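    # Parse a real NVSwitch PCI address and link number from `nvidia-smi nvlink -R`
    # so the injected SXid messages reference an existing link; fall back to
    # hard-coded values when the query is unavailable.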
    log "Getting NVLink topology from DCGM pod: $dcgm_pod"
    local nvlink_output
    nvlink_output=$(kubectl exec -n gpu-operator "$dcgm_pod" -- nvidia-smi nvlink -R 2>/dev/null)

    if [[ -z "$nvlink_output" ]]; then
        log "Warning: nvidia-smi nvlink not available, using fallback PCI/Link values"
        local pci_id="0005:00:00.0"
        local link_number="29"
    else
        log "Parsing NVLink topology to extract PCI and Link"
        local link_line
        link_line=$(echo "$nvlink_output" | grep -E "Link [0-9]+: Remote Device" | head -1)

        if [[ -z "$link_line" ]]; then
            log "Warning: No link information found, using fallback values"
            local pci_id="0005:00:00.0"
            local link_number="29"
        else
            local pci_id
            pci_id=$(echo "$link_line" | grep -oE '[0-9A-Fa-f]{8}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}\.[0-9]' | head -1)
            local link_number
            link_number=$(echo "$link_line" | grep -oE 'Link [0-9]+$' | grep -oE '[0-9]+$')

            log "Extracted from topology: PCI=$pci_id, Link=$link_number"
        fi
    fi

    local driver_pod
    driver_pod=$(kubectl get pods -n gpu-operator -l app=nvidia-driver-daemonset -o jsonpath="{.items[?(@.spec.nodeName=='$gpu_node')].metadata.name}" | head -1)

    if [[ -z "$driver_pod" ]]; then
        error "No driver pod found on node $gpu_node"
    fi

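    # The logger calls mimic the nvidia-nvswitch SXid syslog format. The fatal
    # SXID (20034) should surface as a node condition and the non-fatal SXID
    # (28002) as a node event; both are verified below.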
    log "Injecting SXID error messages via logger on pod: $driver_pod"
    log "- SXID 20034 (Fatal): LTSSM Fault Up on Link $link_number"
    kubectl exec -n gpu-operator "$driver_pod" -- logger -p daemon.err "nvidia-nvswitch3: SXid (PCI:${pci_id}): 20034, Fatal, Link ${link_number} LTSSM Fault Up"

    log "- SXID 28002 (Non-fatal): Therm Warn Deactivated on Link $link_number"
    kubectl exec -n gpu-operator "$driver_pod" -- logger -p daemon.err "nvidia-nvswitch0: SXid (PCI:${pci_id}): 28002, Non-fatal, Link ${link_number} Therm Warn Deactivated"

    log "Waiting for node conditions to appear..."
    sleep 15

    log "Verifying SXID node condition is populated (fatal SXID 20034)"
    sxid_condition=$(kubectl get node "$gpu_node" -o json | jq -r '.status.conditions[] | select(.type == "SysLogsSXIDError" and .status == "True") | .type')

    if [[ -z "$sxid_condition" ]]; then
        error "SysLogsSXIDError condition not found (fatal SXID should create condition)"
    fi
    log "Node condition verified: SysLogsSXIDError ✓"

    log "Verifying SXID node event is populated (non-fatal SXID 28002)"
    sxid_event=$(kubectl get events --field-selector involvedObject.name="$gpu_node" -o json | jq -r '.items[] | select(.reason == "SysLogsSXIDErrorIsNotHealthy") | .reason')

    if [[ -z "$sxid_event" ]]; then
        error "SysLogsSXIDErrorIsNotHealthy event not found (non-fatal SXID should create a node event)"
    fi
    log "Node event verified: SysLogsSXIDError ✓"

    log "Waiting for node to be quarantined and rebooted..."
    wait_for_boot_id_change "$gpu_node" "$original_boot_id"

    log "Test 3 PASSED ✓"
}

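# Entry point: run the DCGM, XID, and SXID scenarios in sequence; each test
# picks the first GPU node and waits for it to be remediated (rebooted and
# uncordoned).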
main() {
    log "Starting NVSentinel UAT tests..."

    test_gpu_monitoring_dcgm
    test_xid_monitoring_syslog
    test_sxid_monitoring_syslog

    log "========================================="
    log "All tests PASSED ✓"
    log "========================================="