@@ -4,6 +4,7 @@ setup() {
44 load ' /bats-libraries/bats-support/load.bash'
55 load ' /bats-libraries/bats-assert/load.bash'
66 load ' /bats-libraries/bats-file/load.bash'
7+ load ' helpers.sh'
78}
89
910# Currently, the tests defined in this file deliberately depend on each other
@@ -162,6 +163,44 @@ apply_check_delete_workload_imex_chan_inject() {
162163 kubectl delete -f demo/specs/imex/channel-injection-all.yaml
163164}
164165
166+ @test " NodePrepareResources: catch unknown field in opaque cfg in ResourceClaim" {
167+ local SPEC=" tests/bats/specs/rc-opaque-cfg-unknown-field.yaml"
168+
169+ # Create pod with random name suffix.
170+ # Store ref of the form `pod/batssuite-pod-boc-brs2l`.
171+ local POD
172+ POD=$( kubectl create -f " ${SPEC} " | grep pod | awk ' {print $1;}' )
173+
174+ # Confirm ContainerCreating state (no failure yet though).
175+ kubectl wait \
176+ --for=jsonpath=' {.status.containerStatuses[0].state.waiting.reason}' =ContainerCreating \
177+ --timeout=10s \
178+ " ${POD} "
179+
180+ # Rather quickly, we expect an event with reason
181+ # `FailedPrepareDynamicResources`. That's not typically the method users
182+ # discover the error.
183+ wait_for_pod_event " ${POD} " FailedPrepareDynamicResources 10
184+
185+ # This is how users probably see this error first.
186+ kubectl describe " ${POD} " | grep FailedPrepareDynamicResources | \
187+ grep " error preparing devices" | \
188+ grep ' strict decoding error: unknown field "unexpectedField"'
189+
190+ # Confirm that precise root cause can also be inferred from
191+ # CD kubelet plugin logs.
192+ kubectl logs \
193+ -l nvidia-dra-driver-gpu-component=kubelet-plugin \
194+ -n nvidia-dra-driver-gpu \
195+ --prefix --tail=-1 | \
196+ grep ' Permanent error' | \
197+ grep ' strict decoding error: unknown field "unexpectedField"'
198+
199+ # Clean up.
200+ kubectl delete " ${POD} "
201+ kubectl delete resourceclaim batssuite-rc-bad-opaque-config
202+ }
203+
165204@test " nickelpie (NCCL send/recv/broadcast, 2 pods, 2 nodes, small payload)" {
166205 # Do not run in checkout dir (to not pollute that).
167206 cd " ${BATS_TEST_TMPDIR} "
@@ -188,7 +227,7 @@ apply_check_delete_workload_imex_chan_inject() {
188227 echo " ${output} " | grep -E ' ^.*SUM multinode_device_to_device_memcpy_read_ce [0-9]+\.[0-9]+.*$'
189228}
190229
191- @test " downgrade: current -> ${TEST_CHART_LASTSTABLE_REPO} / ${TEST_CHART_LASTSTABLE_VERSION} " {
230+ @test " downgrade: current-dev -> last-stable " {
192231 # Stage 1: apply workload, but do not delete.
193232 kubectl apply -f demo/specs/imex/channel-injection.yaml
194233 kubectl wait --for=condition=READY pods imex-channel-injection --timeout=60s
0 commit comments