@@ -119,6 +119,8 @@ test_gpu_monitoring_dcgm() {
119119 fi
120120 log " Node event verified: GpuPowerWatch is non-fatal, appears in events ✓"
121121
122+ kubectl exec -n gpu-operator " $dcgm_pod " -- dcgmi test --inject --gpuid 0 -f 84 -v 0 # infoROM watch error
123+
122124 log " Waiting for node conditions to appear..."
123125 local max_wait=30
124126 local waited=0
@@ -131,9 +133,6 @@ test_gpu_monitoring_dcgm() {
131133 sleep 2
132134 waited=$(( waited + 2 ))
133135 done
134-
135-
136- kubectl exec -n gpu-operator " $dcgm_pod " -- dcgmi test --inject --gpuid 0 -f 84 -v 0 # infoROM watch error
137136
138137 log " Verifying node conditions are populated"
139138 kubectl get node " $gpu_node " -o json | jq -r ' .status.conditions[] | select(.type == "GpuInforomWatch") | "\(.type) Status=\(.status) Reason=\(.reason)"'
@@ -242,52 +241,53 @@ test_sxid_monitoring_syslog() {
242241 fi
243242
244243 log " Injecting SXID error messages via logger on pod: $driver_pod "
245- log " - SXID 20034 (Fatal): LTSSM Fault Up on Link $link_number "
246- kubectl exec -n gpu-operator " $driver_pod " -- logger -p daemon.err " nvidia-nvswitch3: SXid (PCI:${pci_id} ): 20034, Fatal, Link ${link_number} LTSSM Fault Up"
247244
248245 log " - SXID 28002 (Non-fatal): Therm Warn Deactivated on Link $link_number "
249246 kubectl exec -n gpu-operator " $driver_pod " -- logger -p daemon.err " nvidia-nvswitch0: SXid (PCI:${pci_id} ): 28002, Non-fatal, Link ${link_number} Therm Warn Deactivated"
250247
251- log " Waiting for node conditions to appear..."
252248 local max_wait=30
253249 local waited=0
254250 while [[ $waited -lt $max_wait ]]; do
255- conditions_count =$( kubectl get node " $gpu_node " -o json | jq ' [.status.conditions [] | select(.type == "SysLogsSXIDError" and .status == "True")] | length ' )
256- if [[ " $conditions_count " -ge 1 ]]; then
257- log " Found $conditions_count node conditions "
251+ power_event =$( kubectl get events --field-selector involvedObject.name= " $gpu_node " -o json | jq -r ' .items [] | select(.reason == "SysLogsSXIDErrorIsNotHealthy") | .reason ' )
252+ if [[ -n " $power_event " ]]; then
253+ log " Found sxid event "
258254 break
259255 fi
260256 sleep 2
261257 waited=$(( waited + 2 ))
262258 done
263259
264- log " Verifying SXID node condition is populated (fatal SXID 20034 )"
265- sxid_condition =$( kubectl get node " $gpu_node " -o json | jq -r ' .status.conditions [] | select(.type == "SysLogsSXIDError" and .status == "True" ) | .type ' )
260+ log " Verifying SXID node event is populated (non- fatal SXID 28002 )"
261+ sxid_event =$( kubectl get events --field-selector involvedObject.name= " $gpu_node " -o json | jq -r ' .items [] | select(.reason == "SysLogsSXIDErrorIsNotHealthy" ) | .reason ' )
266262
267- if [[ -z " $sxid_condition " ]]; then
268- error " SysLogsSXIDError condition not found (fatal SXID should create condition )"
263+ if [[ -z " $sxid_event " ]]; then
264+ error " SysLogsSXIDError event not found (non- fatal SXID may not create separate event )"
269265 fi
270- log " Node condition verified: SysLogsSXIDError ✓"
266+ log " Node event verified: SysLogsSXIDError ✓"
267+
268+ log " - SXID 20034 (Fatal): LTSSM Fault Up on Link $link_number "
269+ kubectl exec -n gpu-operator " $driver_pod " -- logger -p daemon.err " nvidia-nvswitch3: SXid (PCI:${pci_id} ): 20034, Fatal, Link ${link_number} LTSSM Fault Up"
271270
271+ log " Waiting for node conditions to appear..."
272272 local max_wait=30
273273 local waited=0
274274 while [[ $waited -lt $max_wait ]]; do
275- power_event =$( kubectl get events --field-selector involvedObject.name= " $gpu_node " -o json | jq -r ' .items [] | select(.reason == "SysLogsSXIDErrorIsNotHealthy") | .reason ' )
276- if [[ -n " $power_event " ]]; then
277- log " Found sxid event "
275+ conditions_count =$( kubectl get node " $gpu_node " -o json | jq ' [.status.conditions [] | select(.type == "SysLogsSXIDError" and .status == "True")] | length ' )
276+ if [[ " $conditions_count " -ge 1 ]]; then
277+ log " Found $conditions_count node conditions "
278278 break
279279 fi
280280 sleep 2
281281 waited=$(( waited + 2 ))
282282 done
283283
284- log " Verifying SXID node event is populated (non- fatal SXID 28002 )"
285- sxid_event =$( kubectl get events --field-selector involvedObject.name= " $gpu_node " -o json | jq -r ' .items [] | select(.reason == "SysLogsSXIDErrorIsNotHealthy" ) | .reason ' )
284+ log " Verifying SXID node condition is populated (fatal SXID 20034 )"
285+ sxid_condition =$( kubectl get node " $gpu_node " -o json | jq -r ' .status.conditions [] | select(.type == "SysLogsSXIDError" and .status == "True" ) | .type ' )
286286
287- if [[ -z " $sxid_event " ]]; then
288- error " SysLogsSXIDError event not found (non- fatal SXID may not create separate event )"
287+ if [[ -z " $sxid_condition " ]]; then
288+ error " SysLogsSXIDError condition not found (fatal SXID should create condition )"
289289 fi
290- log " Node event verified: SysLogsSXIDError ✓"
290+ log " Node condition verified: SysLogsSXIDError ✓"
291291
292292 log " Waiting for node to be quarantined and rebooted..."
293293 wait_for_boot_id_change " $gpu_node " " $original_boot_id "
0 commit comments