Skip to content

Commit b9f3c95

Browse files
committed
fix: test
1 parent 801afe6 commit b9f3c95

File tree

1 file changed

+22
-22
lines changed

1 file changed

+22
-22
lines changed

tests/uat/tests.sh

Lines changed: 22 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -119,6 +119,8 @@ test_gpu_monitoring_dcgm() {
119119
fi
120120
log "Node event verified: GpuPowerWatch is non-fatal, appears in events ✓"
121121

122+
kubectl exec -n gpu-operator "$dcgm_pod" -- dcgmi test --inject --gpuid 0 -f 84 -v 0 # infoROM watch error
123+
122124
log "Waiting for node conditions to appear..."
123125
local max_wait=30
124126
local waited=0
@@ -131,9 +133,6 @@ test_gpu_monitoring_dcgm() {
131133
sleep 2
132134
waited=$((waited + 2))
133135
done
134-
135-
136-
kubectl exec -n gpu-operator "$dcgm_pod" -- dcgmi test --inject --gpuid 0 -f 84 -v 0 # infoROM watch error
137136

138137
log "Verifying node conditions are populated"
139138
kubectl get node "$gpu_node" -o json | jq -r '.status.conditions[] | select(.type == "GpuInforomWatch") | "\(.type) Status=\(.status) Reason=\(.reason)"'
@@ -242,52 +241,53 @@ test_sxid_monitoring_syslog() {
242241
fi
243242

244243
log "Injecting SXID error messages via logger on pod: $driver_pod"
245-
log " - SXID 20034 (Fatal): LTSSM Fault Up on Link $link_number"
246-
kubectl exec -n gpu-operator "$driver_pod" -- logger -p daemon.err "nvidia-nvswitch3: SXid (PCI:${pci_id}): 20034, Fatal, Link ${link_number} LTSSM Fault Up"
247244

248245
log " - SXID 28002 (Non-fatal): Therm Warn Deactivated on Link $link_number"
249246
kubectl exec -n gpu-operator "$driver_pod" -- logger -p daemon.err "nvidia-nvswitch0: SXid (PCI:${pci_id}): 28002, Non-fatal, Link ${link_number} Therm Warn Deactivated"
250247

251-
log "Waiting for node conditions to appear..."
252248
local max_wait=30
253249
local waited=0
254250
while [[ $waited -lt $max_wait ]]; do
255-
conditions_count=$(kubectl get node "$gpu_node" -o json | jq '[.status.conditions[] | select(.type == "SysLogsSXIDError" and .status == "True")] | length')
256-
if [[ "$conditions_count" -ge 1 ]]; then
257-
log "Found $conditions_count node conditions"
251+
power_event=$(kubectl get events --field-selector involvedObject.name="$gpu_node" -o json | jq -r '.items[] | select(.reason == "SysLogsSXIDErrorIsNotHealthy") | .reason')
252+
if [[ -n "$power_event" ]]; then
253+
log "Found sxid event"
258254
break
259255
fi
260256
sleep 2
261257
waited=$((waited + 2))
262258
done
263259

264-
log "Verifying SXID node condition is populated (fatal SXID 20034)"
265-
sxid_condition=$(kubectl get node "$gpu_node" -o json | jq -r '.status.conditions[] | select(.type == "SysLogsSXIDError" and .status == "True") | .type')
260+
log "Verifying SXID node event is populated (non-fatal SXID 28002)"
261+
sxid_event=$(kubectl get events --field-selector involvedObject.name="$gpu_node" -o json | jq -r '.items[] | select(.reason == "SysLogsSXIDErrorIsNotHealthy") | .reason')
266262

267-
if [[ -z "$sxid_condition" ]]; then
268-
error "SysLogsSXIDError condition not found (fatal SXID should create condition)"
263+
if [[ -z "$sxid_event" ]]; then
264+
error "SysLogsSXIDError event not found (non-fatal SXID may not create separate event)"
269265
fi
270-
log "Node condition verified: SysLogsSXIDError ✓"
266+
log "Node event verified: SysLogsSXIDError ✓"
267+
268+
log " - SXID 20034 (Fatal): LTSSM Fault Up on Link $link_number"
269+
kubectl exec -n gpu-operator "$driver_pod" -- logger -p daemon.err "nvidia-nvswitch3: SXid (PCI:${pci_id}): 20034, Fatal, Link ${link_number} LTSSM Fault Up"
271270

271+
log "Waiting for node conditions to appear..."
272272
local max_wait=30
273273
local waited=0
274274
while [[ $waited -lt $max_wait ]]; do
275-
power_event=$(kubectl get events --field-selector involvedObject.name="$gpu_node" -o json | jq -r '.items[] | select(.reason == "SysLogsSXIDErrorIsNotHealthy") | .reason')
276-
if [[ -n "$power_event" ]]; then
277-
log "Found sxid event"
275+
conditions_count=$(kubectl get node "$gpu_node" -o json | jq '[.status.conditions[] | select(.type == "SysLogsSXIDError" and .status == "True")] | length')
276+
if [[ "$conditions_count" -ge 1 ]]; then
277+
log "Found $conditions_count node conditions"
278278
break
279279
fi
280280
sleep 2
281281
waited=$((waited + 2))
282282
done
283283

284-
log "Verifying SXID node event is populated (non-fatal SXID 28002)"
285-
sxid_event=$(kubectl get events --field-selector involvedObject.name="$gpu_node" -o json | jq -r '.items[] | select(.reason == "SysLogsSXIDErrorIsNotHealthy") | .reason')
284+
log "Verifying SXID node condition is populated (fatal SXID 20034)"
285+
sxid_condition=$(kubectl get node "$gpu_node" -o json | jq -r '.status.conditions[] | select(.type == "SysLogsSXIDError" and .status == "True") | .type')
286286

287-
if [[ -z "$sxid_event" ]]; then
288-
error "SysLogsSXIDError event not found (non-fatal SXID may not create separate event)"
287+
if [[ -z "$sxid_condition" ]]; then
288+
error "SysLogsSXIDError condition not found (fatal SXID should create condition)"
289289
fi
290-
log "Node event verified: SysLogsSXIDError ✓"
290+
log "Node condition verified: SysLogsSXIDError ✓"
291291

292292
log "Waiting for node to be quarantined and rebooted..."
293293
wait_for_boot_id_change "$gpu_node" "$original_boot_id"

0 commit comments

Comments
 (0)