@@ -193,6 +193,7 @@ log_objects() {
 
 @test "NodePrepareResources: catch unknown field in opaque cfg in ResourceClaim" {
   log_objects
+  iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
 
   envsubst < tests/bats/specs/rc-opaque-cfg-unknown-field.yaml.tmpl > \
     "${BATS_TEST_TMPDIR}"/rc-opaque-cfg-unknown-field.yaml
@@ -236,6 +237,75 @@ log_objects() {
   kubectl wait --for=delete "${POD}" --timeout=10s
 }
 
+# bats test_tags=bats:focus
+@test "Self-initiated unprepare of stale RCs in PrepareStarted" {
+  log_objects
+  iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
+
+  # Stage 1: provoke a partially prepared claim.
+  #
+  # Based on the "catch unknown field in opaque cfg in ResourceClaim" test
+  # above: provoke a permanent Prepare() error, leaving behind a partially
+  # prepared claim in the checkpoint.
+  envsubst < tests/bats/specs/rc-opaque-cfg-unknown-field.yaml.tmpl > \
+    "${BATS_TEST_TMPDIR}"/rc-opaque-cfg-unknown-field.yaml
+  local SPEC="${BATS_TEST_TMPDIR}/rc-opaque-cfg-unknown-field.yaml"
+  local POD
+  POD=$(kubectl create -f "${SPEC}" | grep pod | awk '{print $1;}')
+  kubectl wait \
+    --for=jsonpath='{.status.containerStatuses[0].state.waiting.reason}'=ContainerCreating \
+    --timeout=10s \
+    "${POD}"
+  wait_for_pod_event "${POD}" FailedPrepareDynamicResources 10
+  run kubectl logs \
+    -l nvidia-dra-driver-gpu-component=kubelet-plugin \
+    -n nvidia-dra-driver-gpu \
+    --prefix --tail=-1
+  assert_output --partial 'strict decoding error: unknown field "unexpectedField"'
+
+  # Stage 2: test that the cleanup routine leaves this claim alone ('not stale').
+  #
+  # Re-install, flipping log verbosity just to force a container restart. This
+  # ensures that the cleanup runs immediately (it runs upon startup, and then
+  # again only every N minutes).
+  local _iargs=("--set" "logVerbosity=5")
+  iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
+  sleep 1  # give the on-startup cleanup a chance to run.
+  run kubectl logs \
+    -l nvidia-dra-driver-gpu-component=kubelet-plugin \
+    -n nvidia-dra-driver-gpu \
+    --prefix --tail=-1
+  assert_output --partial "partially prepared claim not stale: default/batssuite-rc-bad-opaque-config"
+
+  # Stage 3: simulate a stale claim and test its cleanup.
+  #
+  # To that end, uninstall the driver and remove both pod and RC from the API server.
+  # Then re-install the DRA driver and confirm detection and removal of the stale claim.
+  helm uninstall -n nvidia-dra-driver-gpu nvidia-dra-driver-gpu-batssuite --wait
+  kubectl delete "${POD}" --force
+  kubectl delete resourceclaim batssuite-rc-bad-opaque-config
+  local _iargs=("--set" "logVerbosity=6")
+  iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
+  sleep 1  # give the on-startup cleanup a chance to run.
+
+  run kubectl logs \
+    -l nvidia-dra-driver-gpu-component=kubelet-plugin \
+    -n nvidia-dra-driver-gpu \
+    --prefix --tail=-1
+  assert_output --partial "Deleted claim from checkpoint: default/batssuite-rc-bad-opaque-config"
+  assert_output --partial "Checkpointed RC cleanup: unprepared stale claim: default/batssuite-rc-bad-opaque-config"
+
+  # Stage 4: appendix -- shortly thereafter we still get an
+  # UnprepareResourceClaims() call for this claim. It is a noop because the
+  # cleanup above was faster.
+  sleep 4
+  run kubectl logs \
+    -l nvidia-dra-driver-gpu-component=kubelet-plugin \
+    -n nvidia-dra-driver-gpu \
+    --prefix --tail=-1
+  assert_output --partial "Unprepare noop: claim not found in checkpoint data"
+}
+
 @test "nickelpie (NCCL send/recv/broadcast, 2 pods, 2 nodes, small payload)" {
   log_objects
 