Skip to content

Commit 81fa0b2

Browse files
committed
tests: cover cleanup for stale partially prepared claims
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent 4c09fce commit 81fa0b2

File tree

1 file changed

+70
-0
lines changed

1 file changed

+70
-0
lines changed

tests/bats/tests.bats

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ log_objects() {
193193

194194
@test "NodePrepareResources: catch unknown field in opaque cfg in ResourceClaim" {
195195
log_objects
196+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
196197

197198
envsubst < tests/bats/specs/rc-opaque-cfg-unknown-field.yaml.tmpl > \
198199
"${BATS_TEST_TMPDIR}"/rc-opaque-cfg-unknown-field.yaml
@@ -236,6 +237,75 @@ log_objects() {
236237
kubectl wait --for=delete "${POD}" --timeout=10s
237238
}
238239

240+
# Fetch the complete logs of all kubelet-plugin pods (via bats' `run`) and
# assert that they contain the given substring. The first attempt happens
# immediately; after that the logs are re-fetched once per second until the
# substring shows up or the timeout expires. This avoids racing a fixed
# `sleep` against driver startup.
#
# Arguments: $1 - literal substring expected somewhere in the logs
#            $2 - max seconds to keep polling (optional, default: 10)
# Returns:   result of `assert_output --partial` on the last log snapshot
_assert_kubelet_plugin_log() {
  local needle="$1"
  local timeout_s="${2:-10}"
  local deadline=$((SECONDS + timeout_s))

  while true; do
    run kubectl logs \
      -l nvidia-dra-driver-gpu-component=kubelet-plugin \
      -n nvidia-dra-driver-gpu \
      --prefix --tail=-1
    if [[ "${output}" == *"${needle}"* ]] || ((SECONDS >= deadline)); then
      break
    fi
    sleep 1
  done

  # Let assert_output produce the pass/fail verdict (and the failure report)
  # based on the most recent snapshot.
  assert_output --partial "${needle}"
}

@test "Self-initiated unprepare of stale RCs in PrepareStarted" {
  log_objects
  iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS

  # Stage 1: provoke a partially prepared claim.
  #
  # Based on the "catch unknown field in opaque cfg in ResourceClaim" test
  # above: provoke a permanent Prepare() error, leaving behind a partially
  # prepared claim in the checkpoint.
  local SPEC="${BATS_TEST_TMPDIR}/rc-opaque-cfg-unknown-field.yaml"
  envsubst < tests/bats/specs/rc-opaque-cfg-unknown-field.yaml.tmpl > "${SPEC}"

  # `kubectl create` prints one line per created object; keep the first
  # column of the line(s) mentioning "pod".
  local POD
  POD=$(kubectl create -f "${SPEC}" | awk '/pod/ {print $1}')

  kubectl wait \
    --for=jsonpath='{.status.containerStatuses[0].state.waiting.reason}'=ContainerCreating \
    --timeout=10s \
    "${POD}"
  wait_for_pod_event "${POD}" FailedPrepareDynamicResources 10
  _assert_kubelet_plugin_log 'strict decoding error: unknown field "unexpectedField"'

  # Stage 2: test that the cleanup routine leaves this claim alone ('not
  # stale').
  #
  # Re-install and flip the log verbosity just to enforce a container
  # restart. This ensures that the cleanup runs immediately (it runs upon
  # startup, and then only N minutes later again).
  local _iargs=("--set" "logVerbosity=5")
  iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
  _assert_kubelet_plugin_log \
    "partially prepared claim not stale: default/batssuite-rc-bad-opaque-config"

  # Stage 3: simulate a stale claim, test cleanup.
  #
  # To that end, uninstall the driver and then remove both pod and RC from
  # the API server. Then re-install the DRA driver and confirm detection and
  # removal of the stale claim.
  helm uninstall -n nvidia-dra-driver-gpu nvidia-dra-driver-gpu-batssuite --wait
  kubectl delete "${POD}" --force
  kubectl delete resourceclaim batssuite-rc-bad-opaque-config
  _iargs=("--set" "logVerbosity=6")
  iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs

  _assert_kubelet_plugin_log \
    "Deleted claim from checkpoint: default/batssuite-rc-bad-opaque-config"
  _assert_kubelet_plugin_log \
    "Checkpointed RC cleanup: unprepared stale claim: default/batssuite-rc-bad-opaque-config"

  # Stage 4: appendix -- shortly thereafter we do get an
  # UnprepareResourceClaims() call for this claim (open question: why?).
  # It is a noop because the cleanup above was faster.
  _assert_kubelet_plugin_log "Unprepare noop: claim not found in checkpoint data"
}
308+
239309
@test "nickelpie (NCCL send/recv/broadcast, 2 pods, 2 nodes, small payload)" {
240310
log_objects
241311

0 commit comments

Comments
 (0)