Commit 2f8d714

chore: add tests in uat
Signed-off-by: Ajay Mishra <[email protected]>
1 parent 5f3c25e commit 2f8d714

2 files changed: +154, -32 lines changed


node-drainer-module/pkg/queue/queue.go (1 addition, 1 deletion)
@@ -29,7 +29,7 @@ import (
 func NewEventQueueManager() EventQueueManager {
 	mgr := &eventQueueManager{
 		queue: workqueue.NewTypedRateLimitingQueue(
-			workqueue.NewTypedItemExponentialFailureRateLimiter[NodeEvent](10*time.Second, 5*time.Minute),
+			workqueue.NewTypedItemExponentialFailureRateLimiter[NodeEvent](10*time.Second, 2*time.Minute),
 		),
 		shutdown: make(chan struct{}),
 	}
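
The only non-test change above tightens the per-item backoff cap on the node-event workqueue from 5 minutes to 2 minutes. As a rough illustration (not part of the commit; assumes k8s.io/client-go is available and uses a stand-in string item instead of NodeEvent), the exponential failure rate limiter doubles the requeue delay from the 10-second base on each failure and now saturates at the 2-minute cap by the fifth retry:

// backoff_sketch.go: standalone illustration of the rate limiter configured above.
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/workqueue"
)

func main() {
	// Same constructor as in queue.go, but parameterized with a string item for simplicity.
	rl := workqueue.NewTypedItemExponentialFailureRateLimiter[string](10*time.Second, 2*time.Minute)
	for attempt := 1; attempt <= 6; attempt++ {
		// When() records another failure for the item and returns the next requeue delay.
		fmt.Printf("failure %d: requeue after %v\n", attempt, rl.When("node-event"))
	}
	// Prints 10s, 20s, 40s, 1m20s, then 2m0s for every later failure (the new cap);
	// with the old 5m cap the delay would have kept growing through 2m40s to 5m0s.
}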

tests/uat/tests.sh (153 additions, 31 deletions)
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -29,118 +28,241 @@ wait_for_boot_id_change() {
     local original_boot_id=$2
     local timeout=600
     local elapsed=0
-
+
     log "Waiting for node $node to reboot (boot ID to change)..."
-
+
     while [[ $elapsed -lt $timeout ]]; do
         local current_boot_id
         current_boot_id=$(get_boot_id "$node" 2>/dev/null || echo "")
-
+
         if [[ -n "$current_boot_id" && "$current_boot_id" != "$original_boot_id" ]]; then
             log "Node $node rebooted successfully (boot ID changed)"
             elapsed=0
             break
         fi
-
+
         sleep 5
         elapsed=$((elapsed + 5))
     done
-
+
     if [[ $elapsed -ge $timeout ]]; then
         error "Timeout waiting for node $node to reboot"
     fi
-
+
     log "Waiting for node $node to be uncordoned..."
     while [[ $elapsed -lt $timeout ]]; do
         local is_cordoned
         is_cordoned=$(kubectl get node "$node" -o jsonpath='{.spec.unschedulable}')
-
+
         if [[ "$is_cordoned" != "true" ]]; then
             log "Node $node is uncordoned and ready ✓"
             return 0
         fi
-
+
         sleep 5
         elapsed=$((elapsed + 5))
     done
-
+
     error "Timeout waiting for node $node to be uncordoned"
 }
 
 test_gpu_monitoring_dcgm() {
     log "========================================="
     log "Test 1: GPU monitoring via DCGM"
     log "========================================="
-
+
     local gpu_node
     gpu_node=$(kubectl get nodes -l workload-type=gpu -o jsonpath='{.items[0].metadata.name}')
-
+
     if [[ -z "$gpu_node" ]]; then
         error "No GPU nodes found"
     fi
-
+
     log "Selected GPU node: $gpu_node"
-
+
     local original_boot_id
     original_boot_id=$(get_boot_id "$gpu_node")
     log "Original boot ID: $original_boot_id"
-
+
     local dcgm_pod
     dcgm_pod=$(kubectl get pods -n gpu-operator -l app=nvidia-dcgm -o jsonpath="{.items[?(@.spec.nodeName=='$gpu_node')].metadata.name}" | head -1)
-
+
     if [[ -z "$dcgm_pod" ]]; then
         error "No DCGM pod found on node $gpu_node"
     fi
-
+
     log "Injecting Inforom error via DCGM on pod: $dcgm_pod"
-    kubectl exec -n gpu-operator "$dcgm_pod" -- dcgmi test --inject --gpuid 0 -f 84 -v 0
-
+    kubectl exec -n gpu-operator "$dcgm_pod" -- dcgmi test --inject --gpuid 0 -f 84 -v 0      # infoROM watch error
+    kubectl exec -n gpu-operator "$dcgm_pod" -- dcgmi test --inject --gpuid 0 -f 240 -v 1000  # PCIE watch error
+    kubectl exec -n gpu-operator "$dcgm_pod" -- dcgmi test --inject --gpuid 0 -f 202 -v 99999 # power watch error
+
+    log "Waiting for node conditions to appear..."
+    local max_wait=30
+    local waited=0
+    while [[ $waited -lt $max_wait ]]; do
+        conditions_count=$(kubectl get node "$gpu_node" -o json | jq '[.status.conditions[] | select(.type == "GpuInforomWatch" or .type == "GpuPcieWatch")] | length')
+        if [[ "$conditions_count" -ge 2 ]]; then
+            log "Found $conditions_count node conditions"
+            break
+        fi
+        sleep 2
+        waited=$((waited + 2))
+    done
+
+    log "Verifying node conditions are populated"
+    kubectl get node "$gpu_node" -o json | jq -r '.status.conditions[] | select(.type == "GpuInforomWatch" or .type == "GpuPcieWatch") | "\(.type) Status=\(.status) Reason=\(.reason)"'
+
+    inforom_condition=$(kubectl get node "$gpu_node" -o json | jq -r '.status.conditions[] | select(.type == "GpuInforomWatch" and .status == "True") | .type')
+    pcie_condition=$(kubectl get node "$gpu_node" -o json | jq -r '.status.conditions[] | select(.type == "GpuPcieWatch" and .status == "True") | .type')
+
+    if [[ -z "$inforom_condition" ]] || [[ -z "$pcie_condition" ]]; then
+        error "Expected node conditions not found: GpuInforomWatch=$inforom_condition, GpuPcieWatch=$pcie_condition"
+    fi
+    log "Node conditions verified ✓"
+
+    log "Verifying node events are populated (non-fatal errors appear here)"
+    kubectl get events --field-selector involvedObject.name="$gpu_node" -o json | jq -r '.items[] | select(.reason | contains("IsNotHealthy")) | "\(.reason) Message=\(.message)"' | head -5
+
+    power_event=$(kubectl get events --field-selector involvedObject.name="$gpu_node" -o json | jq -r '.items[] | select(.reason == "GpuPowerWatchIsNotHealthy") | .reason')
+    if [[ -z "$power_event" ]]; then
+        error "GpuPowerWatch event not found (non-fatal errors should create events)"
+    fi
+    log "Node event verified: GpuPowerWatch is non-fatal, appears in events ✓"
+
     log "Waiting for node to be quarantined and rebooted..."
     wait_for_boot_id_change "$gpu_node" "$original_boot_id"
-
+
     log "Test 1 PASSED ✓"
 }
 
 test_xid_monitoring_syslog() {
     log "========================================="
     log "Test 2: XID monitoring via syslog"
     log "========================================="
-
+
     local gpu_node
     gpu_node=$(kubectl get nodes -l workload-type=gpu -o jsonpath='{.items[0].metadata.name}')
-
+
     if [[ -z "$gpu_node" ]]; then
         error "No GPU nodes found"
     fi
-
+
     log "Selected GPU node: $gpu_node"
-
+
     local original_boot_id
     original_boot_id=$(get_boot_id "$gpu_node")
     log "Original boot ID: $original_boot_id"
-
+
     local driver_pod
     driver_pod=$(kubectl get pods -n gpu-operator -l app=nvidia-driver-daemonset -o jsonpath="{.items[?(@.spec.nodeName=='$gpu_node')].metadata.name}" | head -1)
-
+
     if [[ -z "$driver_pod" ]]; then
         error "No driver pod found on node $gpu_node"
     fi
-
+
     log "Injecting XID 119 message via logger on pod: $driver_pod"
     kubectl exec -n gpu-operator "$driver_pod" -- logger -p daemon.err "[6085126.134786] NVRM: Xid (PCI:0002:00:00): 119, pid=1582259, name=nvc:[driver], Timeout after 6s of waiting for RPC response from GPU1 GSP! Expected function 76 (GSP_RM_CONTROL) (0x20802a02 0x8)."
-
+
     log "Waiting for node to be quarantined and rebooted..."
     wait_for_boot_id_change "$gpu_node" "$original_boot_id"
-
+
     log "Test 2 PASSED ✓"
 }
 
+test_sxid_monitoring_syslog() {
+    log "========================================="
+    log "Test 3: SXID monitoring (NVSwitch errors)"
+    log "========================================="
+
+    local gpu_node
+    gpu_node=$(kubectl get nodes -l workload-type=gpu -o jsonpath='{.items[0].metadata.name}')
+
+    if [[ -z "$gpu_node" ]]; then
+        error "No GPU nodes found"
+    fi
+
+    log "Selected GPU node: $gpu_node"
+
+    local original_boot_id
+    original_boot_id=$(get_boot_id "$gpu_node")
+    log "Original boot ID: $original_boot_id"
+
+    local dcgm_pod
+    dcgm_pod=$(kubectl get pods -n gpu-operator -l app=nvidia-dcgm -o jsonpath="{.items[?(@.spec.nodeName=='$gpu_node')].metadata.name}" | head -1)
+
+    if [[ -z "$dcgm_pod" ]]; then
+        error "No DCGM pod found on node $gpu_node"
+    fi
+
+    log "Getting NVLink topology from DCGM pod: $dcgm_pod"
+    local nvlink_output
+    nvlink_output=$(kubectl exec -n gpu-operator "$dcgm_pod" -- nvidia-smi nvlink -R 2>/dev/null)
+
+    if [[ -z "$nvlink_output" ]]; then
+        log "Warning: nvidia-smi nvlink not available, using fallback PCI/Link values"
+        local pci_id="0005:00:00.0"
+        local link_number="29"
+    else
+        log "Parsing NVLink topology to extract PCI and Link"
+        local link_line
+        link_line=$(echo "$nvlink_output" | grep -E "Link [0-9]+: Remote Device" | head -1)
+
+        if [[ -z "$link_line" ]]; then
+            log "Warning: No link information found, using fallback values"
+            local pci_id="0005:00:00.0"
+            local link_number="29"
+        else
+            local pci_id
+            pci_id=$(echo "$link_line" | grep -oE '[0-9A-Fa-f]{8}:[0-9A-Fa-f]{2}:[0-9A-Fa-f]{2}\.[0-9]' | head -1)
+            local link_number
+            link_number=$(echo "$link_line" | grep -oE 'Link [0-9]+$' | grep -oE '[0-9]+$')
+
+            log "Extracted from topology: PCI=$pci_id, Link=$link_number"
+        fi
+    fi
+
+    local driver_pod
+    driver_pod=$(kubectl get pods -n gpu-operator -l app=nvidia-driver-daemonset -o jsonpath="{.items[?(@.spec.nodeName=='$gpu_node')].metadata.name}" | head -1)
+
+    if [[ -z "$driver_pod" ]]; then
+        error "No driver pod found on node $gpu_node"
+    fi
+
+    log "Injecting SXID error messages via logger on pod: $driver_pod"
+    log " - SXID 20034 (Fatal): LTSSM Fault Up on Link $link_number"
+    kubectl exec -n gpu-operator "$driver_pod" -- logger -p daemon.err "nvidia-nvswitch3: SXid (PCI:${pci_id}): 20034, Fatal, Link ${link_number} LTSSM Fault Up"
+
+    log " - SXID 28002 (Non-fatal): Therm Warn Deactivated on Link $link_number"
+    kubectl exec -n gpu-operator "$driver_pod" -- logger -p daemon.err "nvidia-nvswitch0: SXid (PCI:${pci_id}): 28002, Non-fatal, Link ${link_number} Therm Warn Deactivated"
+
+    log "Waiting for node conditions to appear..."
+    sleep 15
+
+    log "Verifying SXID node condition is populated (fatal SXID 20034)"
+    sxid_condition=$(kubectl get node "$gpu_node" -o json | jq -r '.status.conditions[] | select(.type == "SysLogsSXIDError" and .status == "True") | .type')
+
+    if [[ -z "$sxid_condition" ]]; then
+        error "SysLogsSXIDError condition not found (fatal SXID should create condition)"
+    fi
+    log "Node condition verified: SysLogsSXIDError ✓"
+
+    log "Verifying SXID node event is populated (non-fatal SXID 28002)"
+    sxid_event=$(kubectl get events --field-selector involvedObject.name="$gpu_node" -o json | jq -r '.items[] | select(.reason == "SysLogsSXIDErrorIsNotHealthy") | .reason')
+
+    if [[ -z "$sxid_event" ]]; then
+        error "SysLogsSXIDError event not found (non-fatal SXID may not create separate event)"
+    fi
+    log "Node event verified: SysLogsSXIDError ✓"
+
+    log "Waiting for node to be quarantined and rebooted..."
+    wait_for_boot_id_change "$gpu_node" "$original_boot_id"
+
+    log "Test 3 PASSED ✓"
+}
+
 main() {
     log "Starting NVSentinel UAT tests..."
-
+
     test_gpu_monitoring_dcgm
     test_xid_monitoring_syslog
-
+    test_sxid_monitoring_syslog
+
     log "========================================="
     log "All tests PASSED ✓"
     log "========================================="
