Skip to content

Commit a68050c

Browse files
authored
Merge pull request #598 from jgehrcke/jp/test-strict-decode-on-prepare
Test for strict opaque config-decoding in prepare path
2 parents adf0b8e + ea75b8f commit a68050c

File tree

6 files changed

+133
-12
lines changed

6 files changed

+133
-12
lines changed

cmd/compute-domain-kubelet-plugin/driver.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,11 @@ func (d *driver) nodePrepareResource(ctx context.Context, claim *resourceapi.Res
247247
res := kubeletplugin.PrepareResult{
248248
Err: fmt.Errorf("error preparing devices for claim %v: %w", claim.UID, err),
249249
}
250-
return isPermanentError(err), res
250+
if isPermanentError(err) {
251+
klog.V(6).Infof("Permanent error preparing devices for claim %v: %v", claim.UID, err)
252+
return true, res
253+
}
254+
return false, res
251255
}
252256

253257
klog.Infof("Returning newly prepared devices for claim '%v': %v", claim.UID, devs)

tests/bats/Makefile

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,7 @@ tests: image
9090
$(BATS_IMAGE) \
9191
-c "cd /cwd; \
9292
echo 'Running k8s cluster cleanup (invasive)... '; \
93-
bash tests/bats/cleanup-from-previous-run.sh &> $${_RUNDIR}/cleanup.outerr || \
94-
(echo 'Cleanup failed:'; cat $${_RUNDIR}/cleanup.outerr); \
93+
bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
9594
TMPDIR=/cwd/$${_RUNDIR} bats \
9695
--print-output-on-failure \
9796
--no-tempdir-cleanup \

tests/bats/cleanup-from-previous-run.sh

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,16 +36,18 @@ kubectl apply -f "${CRD_URL}"
3636
# interrupted run. TODO: try to affect all-at-once, maybe with a special label.
3737
# Note: the following commands are OK to fail -- the `errexit` shell option is
3838
# deliberately not set here.
39-
timeout -v 5 kubectl delete -f demo/specs/imex/channel-injection.yaml
40-
timeout -v 5 kubectl delete -f demo/specs/imex/channel-injection-all.yaml
41-
timeout -v 5 kubectl delete jobs nickelpie-test
42-
timeout -v 5 kubectl delete computedomain nickelpie-test-compute-domain
43-
timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-1.yaml
39+
timeout -v 5 kubectl delete -f demo/specs/imex/channel-injection.yaml 2> /dev/null
40+
timeout -v 5 kubectl delete -f demo/specs/imex/channel-injection-all.yaml 2> /dev/null
41+
timeout -v 5 kubectl delete jobs nickelpie-test 2> /dev/null
42+
timeout -v 5 kubectl delete computedomain nickelpie-test-compute-domain 2> /dev/null
43+
timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-1.yaml 2> /dev/null
44+
timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
45+
timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
4446

4547
# Delete any previous remainder of `clean-state-dirs-all-nodes.sh` invocation.
46-
kubectl delete pods privpod-rm-plugindirs
48+
kubectl delete pods privpod-rm-plugindirs 2> /dev/null
4749

48-
helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu
50+
timeout -v 5 helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu
4951

5052
kubectl wait \
5153
--for=delete pods -A \
@@ -58,7 +60,8 @@ kubectl wait \
5860
# properly.
5961
timeout -v 10 kubectl delete crds computedomains.resource.nvidia.com || echo "CRD deletion failed"
6062

63+
# Remove kubelet plugin state directories from all nodes (critical part of
64+
# cleanup, fail hard if this does not succeed).
6165
set -e
62-
# Remove kubelet plugin state directories from all nodes.
6366
bash tests/bats/clean-state-dirs-all-nodes.sh
6467
set +x

tests/bats/helpers.sh

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/bin/bash
2+
#
3+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
# SPDX-License-Identifier: Apache-2.0
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
# Poll `kubectl events` until an event line for a pod contains a given needle.
#
# Events accumulate over time, so for certainty it's best to use a unique pod
# name. Right now, this inspects an entire output line which includes REASON,
# MESSAGE, and OBJECT, so choose the needle (grepped for) precisely enough.
#
# Arguments:
#   $1 - pod reference, expected to carry the `pod/` prefix
#   $2 - needle (grep basic regular expression), typically the event reason
#   $3 - timeout in whole seconds (non-negative integer, no unit suffix)
# Outputs:
#   Success message on stdout; usage and timeout diagnostics on stderr.
# Returns:
#   0 once the needle was seen, 1 on timeout, 2 on invalid arguments.
#
# Example: wait_for_pod_event pod/testpod-ls09x FailedPrepareDynamicResources 60
wait_for_pod_event() {
  local pod_ref="${1:-}"
  local needle="${2:-}"
  local timeout_s="${3:-}"

  # An empty needle would match any line; an empty pod reference is never
  # meaningful. Fail fast instead of polling.
  if [[ -z "${pod_ref}" || -z "${needle}" ]]; then
    echo "usage: wait_for_pod_event <pod/NAME> <REASON> <TIMEOUT_SECONDS>" >&2
    return 2
  fi

  # Reject values such as `10s`: inside (( ... )) they raise an arithmetic
  # error, the timeout check then never evaluates to true, and the loop below
  # would poll forever.
  if [[ ! "${timeout_s}" =~ ^[0-9]+$ ]]; then
    echo "wait_for_pod_event: timeout must be a whole number of seconds, got '${timeout_s}'" >&2
    return 2
  fi
  # Force base 10 so that a value with leading zeros is not parsed as octal.
  timeout_s=$((10#${timeout_s}))

  local started=${SECONDS}
  local events
  while true; do
    # Capture the listing first instead of piping into `grep -q`: grep exits
    # on the first match, which may kill kubectl with SIGPIPE and, with
    # `pipefail` enabled, turn a successful match into a failed pipeline.
    # A failing kubectl call is tolerated (treated like "no match yet").
    events=$(kubectl events --for "${pod_ref}") || true
    if grep -q -- "${needle}" <<<"${events}"; then
      echo "Event detected: ${needle} (for ${pod_ref})"
      return 0
    fi
    if (( SECONDS - started > timeout_s )); then
      echo "Timeout (${timeout_s} s) waiting for '${needle}' in events for ${pod_ref}" >&2
      return 1
    fi
    sleep 2
  done
}
tests/bats/specs/rc-opaque-cfg-unknown-field.yaml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
apiVersion: resource.k8s.io/v1beta1
2+
kind: ResourceClaim
3+
metadata:
4+
name: batssuite-rc-bad-opaque-config
5+
spec:
6+
devices:
7+
requests:
8+
- deviceClassName: compute-domain-default-channel.nvidia.com
9+
name: chan
10+
config:
11+
- opaque:
12+
driver: compute-domain.nvidia.com
13+
parameters:
14+
apiVersion: resource.nvidia.com/v1beta1
15+
domainID: not-needed-for-test
16+
kind: ComputeDomainChannelConfig
17+
unexpectedField: foo
18+
requests:
19+
- chan
20+
---
21+
apiVersion: v1
22+
kind: Pod
23+
metadata:
24+
generateName: batssuite-pod-boc-
25+
labels:
26+
env: batssuite
27+
spec:
28+
containers:
29+
- name: app
30+
image: busybox
31+
resources:
32+
claims:
33+
- name: batssuite-claim-boc
34+
resourceClaims:
35+
- name: batssuite-claim-boc
36+
resourceClaimName: batssuite-rc-bad-opaque-config

tests/bats/tests.bats

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ setup() {
44
load '/bats-libraries/bats-support/load.bash'
55
load '/bats-libraries/bats-assert/load.bash'
66
load '/bats-libraries/bats-file/load.bash'
7+
load 'helpers.sh'
78
}
89

910
# Currently, the tests defined in this file deliberately depend on each other
@@ -162,6 +163,44 @@ apply_check_delete_workload_imex_chan_inject() {
162163
kubectl delete -f demo/specs/imex/channel-injection-all.yaml
163164
}
164165

166+
@test "NodePrepareResources: catch unknown field in opaque cfg in ResourceClaim" {
  local manifest="tests/bats/specs/rc-opaque-cfg-unknown-field.yaml"
  local event_reason="FailedPrepareDynamicResources"
  local decode_err='strict decoding error: unknown field "unexpectedField"'

  # The manifest uses `generateName`, so the pod name gets a random suffix.
  # Pick the pod reference (of the form `pod/batssuite-pod-boc-brs2l`) out of
  # the creation output.
  local pod_ref
  pod_ref=$(kubectl create -f "${manifest}" | awk '/pod/ {print $1}')

  # First, the pod is expected to report ContainerCreating (no failure is
  # visible at this point yet).
  kubectl wait \
    --timeout=10s \
    --for=jsonpath='{.status.containerStatuses[0].state.waiting.reason}'=ContainerCreating \
    "${pod_ref}"

  # Shortly afterwards an event with reason `FailedPrepareDynamicResources`
  # is expected. Users typically do not discover the error this way.
  wait_for_pod_event "${pod_ref}" "${event_reason}" 10

  # This is where users probably see the error first: `kubectl describe`
  # output must show the event, the prepare error, and the decoding root
  # cause on one line.
  kubectl describe "${pod_ref}" \
    | grep "${event_reason}" \
    | grep "error preparing devices" \
    | grep "${decode_err}"

  # The precise root cause must also be discoverable in the CD kubelet
  # plugin logs.
  kubectl logs \
    -n nvidia-dra-driver-gpu \
    -l nvidia-dra-driver-gpu-component=kubelet-plugin \
    --prefix --tail=-1 \
    | grep 'Permanent error' \
    | grep "${decode_err}"

  # Remove the objects created by this test.
  kubectl delete "${pod_ref}"
  kubectl delete resourceclaim batssuite-rc-bad-opaque-config
}
203+
165204
@test "nickelpie (NCCL send/recv/broadcast, 2 pods, 2 nodes, small payload)" {
166205
# Do not run in checkout dir (to not pollute that).
167206
cd "${BATS_TEST_TMPDIR}"
@@ -188,7 +227,7 @@ apply_check_delete_workload_imex_chan_inject() {
188227
echo "${output}" | grep -E '^.*SUM multinode_device_to_device_memcpy_read_ce [0-9]+\.[0-9]+.*$'
189228
}
190229

191-
@test "downgrade: current -> ${TEST_CHART_LASTSTABLE_REPO}/${TEST_CHART_LASTSTABLE_VERSION}" {
230+
@test "downgrade: current-dev -> last-stable" {
192231
# Stage 1: apply workload, but do not delete.
193232
kubectl apply -f demo/specs/imex/channel-injection.yaml
194233
kubectl wait --for=condition=READY pods imex-channel-injection --timeout=60s

0 commit comments

Comments
 (0)