Commit afbb033

Merge pull request #637 from jgehrcke/jp/test-suite-patches
tests: fixes, improved cleanup & stability, better debuggability
2 parents (40834bb + b57a854), commit afbb033

6 files changed (+133, -54 lines)

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -7,3 +7,7 @@
 [._]*.sw[a-p]
 coverage.out
 tests-out
+*.log
+*.tar
+*.tgz
+

tests/bats/Makefile

Lines changed: 12 additions & 4 deletions
@@ -56,6 +56,13 @@ endif
 BATS_IMAGE = batstests:$(GIT_COMMIT_SHORT)
 KUBECONFIG ?= $(HOME)/.kube/config
 
+# Add `docker run` arguments when not running
+# in Github Actions / GitLab CI.
+DOCKER_RUN_FLAGS :=
+ifeq ($(CI),)
+DOCKER_RUN_FLAGS += -it
+endif
+
 default: tests
 
 .PHONY: image
@@ -71,9 +78,10 @@ image:
 .PHONY: tests
 tests: image
 	mkdir -p tests-out && \
-	export _RUNDIR=$(shell mktemp -p tests-out -d -t bats-tests-$$(date +%s)-XXXXX) && \
-	time docker run \
-	-it \
+	export _RUNDIR=$$(mktemp -p tests-out -d -t bats-tests-$$(date +%s)-XXXXX) && \
+	docker run \
+	--rm \
+	$(DOCKER_RUN_FLAGS) \
 	-v /tmp:/tmp \
 	-v $(CURDIR):/cwd \
 	-v $(HOME)/.kube/:$(HOME)/.kube \
@@ -88,7 +96,7 @@ tests: image
 	-u $(shell id -u ${USER}):$(shell id -g ${USER}) \
 	--entrypoint "/bin/bash"\
 	$(BATS_IMAGE) \
-	-c "cd /cwd; \
+	-c "set -ex; cd /cwd; \
 	echo 'Running k8s cluster cleanup (invasive)... '; \
 	bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
 	TMPDIR=/cwd/$${_RUNDIR} bats \
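
The `_RUNDIR` fix above swaps make-time command substitution (`$(shell mktemp ...)`) for recipe-time substitution: `$$` escapes the dollar sign, so `mktemp` now runs in the recipe's shell on each `make tests` invocation rather than when make expands the variable. A minimal sketch of what the shell effectively executes once make has turned `$$` into `$` (same `mktemp` arguments as in the recipe; the echoed path is only an illustration):

    mkdir -p tests-out
    # Create a unique per-run output directory, e.g. tests-out/bats-tests-<epoch>-Ab3dE.
    _RUNDIR=$(mktemp -p tests-out -d -t "bats-tests-$(date +%s)-XXXXX")
    echo "per-run output directory: ${_RUNDIR}"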

tests/bats/cleanup-from-previous-run.sh

Lines changed: 18 additions & 3 deletions
@@ -21,6 +21,10 @@ set -o pipefail
 
 CRD_URL="https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver-gpu/main/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml"
 
+
+THIS_DIR_PATH=$(dirname "$(realpath $0)")
+source "${THIS_DIR_PATH}/helpers.sh"
+
 # For debugging: state of the world
 kubectl get computedomains.resource.nvidia.com
 kubectl get pods -n nvidia-dra-driver-gpu
@@ -32,6 +36,12 @@ set -x
 # likely helps deletion.
 kubectl apply -f "${CRD_URL}"
 
+# Workload deletion below requires a DRA driver to be present, to actually clean
+# up. Install _a_ version temporarily, towards best-effort. Install
+# to-be-tested-version for now, latest-on-GHCR might be smarter though. Again,
+# this command may fail and in best-effort fashion this cleanup script proceeds.
+iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
+
 # Some effort to delete workloads potentially left-over from a previous
 # interrupted run. TODO: try to affect all-at-once, maybe with a special label.
 # Note: the following commands are OK to fail -- the `errexit` shell option is
@@ -44,6 +54,10 @@ timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-1.yaml 2> /d
 timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
 timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
 
+# TODO: maybe more brute-forcing/best-effort: it might make sense to submit all
+# workload in this test suite into a special namespace (not `default`), and to
+# then use `kubectl delete pods -n <testnamespace]> --all`.
+
 # Delete any previous remainder of `clean-state-dirs-all-nodes.sh` invocation.
 kubectl delete pods privpod-rm-plugindirs 2> /dev/null
 
@@ -55,9 +69,10 @@ kubectl wait \
 --timeout=10s \
 || echo "wait-for-delete failed"
 
-# The next `helm install` must freshly install CRDs, and this is one way to try
-# to achieve that. This might time out in case workload wasn't cleaned up
-# properly.
+# The next `helm install` should freshly install CRDs, and this is one way to
+# try to achieve that. This might time out in case workload wasn't cleaned up
+# properly. If that happens, the next test suite invocation will have failures
+# like "create not allowed while custom resource definition is terminating".
 timeout -v 10 kubectl delete crds computedomains.resource.nvidia.com || echo "CRD deletion failed"
 
 # Remove kubelet plugin state directories from all nodes (critical part of
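
The script now resolves its own directory and sources `helpers.sh` from there, which is what makes `iupgrade_wait` (defined in that file) available during cleanup. A minimal standalone sketch of that pattern; the explicit `|| true` only spells out the best-effort intent from the comment above (the committed script relies on its own error handling), and `TEST_CHART_REPO` / `TEST_CHART_VERSION` are assumed to be set in the environment, as in the test suite:

    #!/usr/bin/env bash
    # Resolve the directory containing this script and pull in the shared helpers.
    THIS_DIR_PATH=$(dirname "$(realpath "$0")")
    source "${THIS_DIR_PATH}/helpers.sh"

    # Best-effort: make sure _some_ DRA driver is installed so that the workload
    # deletions that follow can actually be processed; never abort cleanup here.
    iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS || true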

tests/bats/helpers.sh

Lines changed: 59 additions & 1 deletion
@@ -15,6 +15,64 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Use a name that upon cluster inspection reveals that this
+# Helm chart release was installed/managed by this test suite.
+export TEST_HELM_RELEASE_NAME="nvidia-dra-driver-gpu-batssuite"
+
+
+_common_setup() {
+load '/bats-libraries/bats-support/load.bash'
+load '/bats-libraries/bats-assert/load.bash'
+load '/bats-libraries/bats-file/load.bash'
+}
+
+
+# A helper arg for `iupgrade_wait` w/o additional install args.
+export NOARGS=()
+
+# Install or upgrade, and wait for pods to be READY.
+# 1st arg: helm chart repo
+# 2nd arg: helm chart version
+# 3rd arg: array with additional args (provide `NOARGS` if none)
+iupgrade_wait() {
+# E.g. `nvidia/nvidia-dra-driver-gpu` or
+# `oci://ghcr.io/nvidia/k8s-dra-driver-gpu`
+local REPO="$1"
+
+# E.g. `25.3.1` or `25.8.0-dev-f2eaddd6-chart`
+local VERSION="$2"
+
+# Expect array as third argument.
+local -n ADDITIONAL_INSTALL_ARGS=$3
+
+timeout -v 120 helm upgrade --install "${TEST_HELM_RELEASE_NAME}" \
+"${REPO}" \
+--version="${VERSION}" \
+--wait \
+--timeout=1m5s \
+--create-namespace \
+--namespace nvidia-dra-driver-gpu \
+--set resources.gpus.enabled=false \
+--set nvidiaDriverRoot="${TEST_NVIDIA_DRIVER_ROOT}" "${ADDITIONAL_INSTALL_ARGS[@]}"
+
+# Valueable output to have in the logs in case things went pearshaped.
+kubectl get pods -n nvidia-dra-driver-gpu
+
+# Some part of this waiting work is done by helm as of `--wait` with
+# `--timeout`. Note that the below in itself would not be sufficient: in case
+# of an upgrade we need to isolate the _new_ pods and not accidentally observe
+# the currently disappearing pods. Also note that despite the `--wait` above,
+# the kubelet plugins may still be in `PodInitializing` after the Helm command
+# returned. My conclusion is that helm waits for the controller to be READY,
+# but not for the plugin pods to be READY.
+kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=kubelet-plugin --timeout=15s
+
+# Again, log current state.
+kubectl get pods -n nvidia-dra-driver-gpu
+
+# maybe: check version on labels (to confirm that we set labels correctly)
+}
+
 # Events accumulate over time, so for certainty it's best to use a unique pod
 # name. Right now, this inspects an entire line which includes REASON, MESSAGE,
 # and OBJECT, so choose the needle (grepped for) precisely enough.
@@ -37,4 +95,4 @@ wait_for_pod_event() {
 fi
 sleep 2
 done
-}
+}
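
`iupgrade_wait()` receives its third argument by name: `local -n ADDITIONAL_INSTALL_ARGS=$3` creates a bash nameref, so callers pass the name of an array variable rather than its expanded contents, and `NOARGS` (an empty array) is the placeholder when nothing extra is needed. Hypothetical call sites as a sketch, with repos and versions taken from the examples in the comments above and the feature-gate flag from the lines removed in tests.bats:

    # No additional install args: pass the NOARGS placeholder by name.
    iupgrade_wait "nvidia/nvidia-dra-driver-gpu" "25.3.1" NOARGS

    # Additional install args: define an array and pass its name, not its value.
    extra_args=("--set" "featureGates.IMEXDaemonsWithDNSNames=true")
    iupgrade_wait "oci://ghcr.io/nvidia/k8s-dra-driver-gpu" "25.8.0-dev-f2eaddd6-chart" extra_args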

tests/bats/setup_suite.bash

Lines changed: 5 additions & 0 deletions
@@ -25,6 +25,11 @@ setup_suite () {
 # Probe: kubectl configured against a k8s cluster.
 kubectl cluster-info | grep "control plane is running at"
 
+# Fail fast in case there seems to be a DRA driver Helm chart installed at
+# this point (maybe one _not_ managed by this test suite).
+helm list -A
+helm list -A | grep "nvidia-dra-driver-gpu" && { echo "error: helm list not clean"; return 1; }
+
 # Show, for debugging.
 kubectl api-resources --api-group=resource.k8s.io
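
The new guard makes setup fail fast when a `nvidia-dra-driver-gpu` Helm release is already installed, possibly one not managed by this suite. A sketch of the same check written as an explicit `if`, assuming it runs inside `setup_suite` where a non-zero return marks the suite setup as failed:

    # Log the full release list first, for debugging.
    helm list -A

    # Then refuse to run against a cluster that already has a matching release.
    if helm list -A | grep -q "nvidia-dra-driver-gpu"; then
        echo "error: helm list not clean"
        return 1
    fi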

tests/bats/tests.bats

Lines changed: 35 additions & 46 deletions
@@ -1,12 +1,12 @@
 # shellcheck disable=SC2148
 # shellcheck disable=SC2329
 setup() {
-load '/bats-libraries/bats-support/load.bash'
-load '/bats-libraries/bats-assert/load.bash'
-load '/bats-libraries/bats-file/load.bash'
-load 'helpers.sh'
+# Executed before entering each test in this file.
+load 'helpers.sh'
+_common_setup
 }
 
+
 # Currently, the tests defined in this file deliberately depend on each other
 # and are expected to execute in the order defined. In the future, we want to
 # build test dependency injection (with fixtures), and work towards clean
@@ -15,10 +15,6 @@ setup() {
 # happening. Tools like `etcdctl` will be helpful.
 
 
-# Use a name that upon cluster inspection reveals that this
-# Helm chart release was installed/managed by this test suite.
-export TEST_HELM_RELEASE_NAME="nvidia-dra-driver-gpu-batssuite"
-
 # Note(JP): bats swallows output of setup upon success (regardless of cmdline
 # args such as `--show-output-of-passing-tests`). Ref:
 # https://github.com/bats-core/bats-core/issues/540#issuecomment-1013521656 --
@@ -41,45 +37,28 @@ setup_file() {
 # Prepare for installing releases from NGC (that merely mutates local
 # filesystem state).
 helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
-
-# A helper arg for `iupgrade_wait` w/o additional install args.
-export NOARGS=()
-}
-
-# Install or upgrade, and wait for pods to be READY.
-# 1st arg: helm chart repo
-# 2nd arg: helm chart version
-# 3rd arg: array with additional args (provide `NOARGS` if none)
-iupgrade_wait() {
-# E.g. `nvidia/nvidia-dra-driver-gpu` or
-# `oci://ghcr.io/nvidia/k8s-dra-driver-gpu`
-local REPO="$1"
-
-# E.g. `25.3.1` or `25.8.0-dev-f2eaddd6-chart`
-local VERSION="$2"
-
-# Expect array as third argument.
-local -n ADDITIONAL_INSTALL_ARGS=$3
-
-timeout -v 10 helm upgrade --install "${TEST_HELM_RELEASE_NAME}" \
-"${REPO}" \
---version="${VERSION}" \
---create-namespace \
---namespace nvidia-dra-driver-gpu \
---set resources.gpus.enabled=false \
---set nvidiaDriverRoot="${TEST_NVIDIA_DRIVER_ROOT}" "${ADDITIONAL_INSTALL_ARGS[@]}"
-
-kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=kubelet-plugin --timeout=10s
-kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=controller --timeout=10s
-# maybe: check version on labels (to confirm that we set labels correctly)
 }
 
 apply_check_delete_workload_imex_chan_inject() {
 kubectl apply -f demo/specs/imex/channel-injection.yaml
-kubectl wait --for=condition=READY pods imex-channel-injection --timeout=70s
+kubectl wait --for=condition=READY pods imex-channel-injection --timeout=100s
 run kubectl logs imex-channel-injection
-assert_output --partial "channel0"
 kubectl delete -f demo/specs/imex/channel-injection.yaml
+# Check output after attempted deletion.
+assert_output --partial "channel0"
+
+# Wait for deletion to complete; this is critical before moving on to the next
+# test (as long as we don't wipe state entirely between tests).
+kubectl wait --for=delete pods imex-channel-injection --timeout=10s
+}
+
+log_objects() {
+# Never fail, but show output in case a test fails, to facilitate debugging.
+# Could this be part of setup()? If setup succeeds and when a test fails:
+# does this show the output of setup? Then we could do this.
+kubectl get resourceclaims || true
+kubectl get computedomain || true
+kubectl get pods -o wide || true
 }
 
 # A test that covers local dev tooling, we don't want to
@@ -100,8 +79,7 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "helm-install ${TEST_CHART_REPO}/${TEST_CHART_VERSION}" {
-local _iargs=("--set" "featureGates.IMEXDaemonsWithDNSNames=true")
-iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
+iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
 }
 
 @test "helm list: validate output" {
@@ -146,10 +124,12 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "IMEX channel injection (single)" {
+log_objects
 apply_check_delete_workload_imex_chan_inject
 }
 
 @test "IMEX channel injection (all)" {
+log_objects
 # Example: with TEST_CHART_VERSION="v25.3.2-12390-chart"
 # the command below returns 0 (true: the tested version is smaller)
 if dpkg --compare-versions "${TEST_CHART_VERSION#v}" lt "25.8.0"; then
@@ -164,6 +144,8 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "NodePrepareResources: catch unknown field in opaque cfg in ResourceClaim" {
+log_objects
+
 envsubst < tests/bats/specs/rc-opaque-cfg-unknown-field.yaml.tmpl > \
 "${BATS_TEST_TMPDIR}"/rc-opaque-cfg-unknown-field.yaml
 cd "${BATS_TEST_TMPDIR}"
@@ -206,6 +188,8 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "nickelpie (NCCL send/recv/broadcast, 2 pods, 2 nodes, small payload)" {
+log_objects
+
 # Do not run in checkout dir (to not pollute that).
 cd "${BATS_TEST_TMPDIR}"
 git clone https://github.com/jgehrcke/jpsnips-nv
@@ -220,6 +204,8 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "nvbandwidth (2 nodes, 2 GPUs each)" {
+log_objects
+
 kubectl create -f https://github.com/kubeflow/mpi-operator/releases/download/v0.6.0/mpi-operator.yaml || echo "ignore"
 kubectl apply -f demo/specs/imex/nvbandwidth-test-job-1.yaml
 # The canonical k8s job interface works even for MPIJob (the MPIJob has an
@@ -232,6 +218,8 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "downgrade: current-dev -> last-stable" {
+log_objects
+
 # Stage 1: apply workload, but do not delete.
 kubectl apply -f demo/specs/imex/channel-injection.yaml
 kubectl wait --for=condition=READY pods imex-channel-injection --timeout=60s
@@ -250,8 +238,10 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "upgrade: wipe-state, install-last-stable, upgrade-to-current-dev" {
+log_objects
+
 # Stage 1: clean slate
-helm uninstall "${TEST_HELM_RELEASE_NAME}" -n nvidia-dra-driver-gpu
+helm uninstall "${TEST_HELM_RELEASE_NAME}" -n nvidia-dra-driver-gpu --wait --timeout=30s
 kubectl wait --for=delete pods -A -l app.kubernetes.io/name=nvidia-dra-driver-gpu --timeout=10s
 bash tests/bats/clean-state-dirs-all-nodes.sh
 kubectl get crd computedomains.resource.nvidia.com
@@ -271,8 +261,7 @@ apply_check_delete_workload_imex_chan_inject() {
 kubectl apply -f "${CRD_UPGRADE_URL}"
 
 # Stage 5: install target version (as users would do).
-local _iargs=("--set" "featureGates.IMEXDaemonsWithDNSNames=true")
-iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
+iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
 
 # Stage 6: confirm deleting old workload works (critical, see above).
 timeout -v 60 kubectl delete -f demo/specs/imex/channel-injection.yaml
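
The "IMEX channel injection (all)" test gates its behavior on the chart version via `dpkg --compare-versions`, which exits 0 when the stated relation holds. A small sketch of that gate, reusing the example version from the comment in the diff:

    # Example from the test's comment: a pre-25.8.0 chart version.
    TEST_CHART_VERSION="v25.3.2-12390-chart"

    # Strip the leading "v" and compare; exit code 0 means the tested
    # version is smaller than 25.8.0.
    if dpkg --compare-versions "${TEST_CHART_VERSION#v}" lt "25.8.0"; then
        echo "pre-25.8.0 chart: take the older code path"
    fi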
