Skip to content

Commit 6334996

Browse files
authored
Merge pull request #572 from jgehrcke/jp/bats-test-suite
Add bats-based test suite (focus on ComputeDomains)
2 parents 13a7359 + ea0517b commit 6334996

File tree

8 files changed

+509
-0
lines changed

8 files changed

+509
-0
lines changed

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,3 +220,8 @@ PHONY: .shell
220220
-w /work \
221221
--user $$(id -u):$$(id -g) \
222222
$(BUILDIMAGE)
223+
224+
.PHONY: bats
# Run the bats-based test suite by delegating to the 'tests' target of
# tests/bats/Makefile (which builds the test image first).
#
# $(MAKE) is used instead of a literal 'make' so that the sub-make is the same
# make binary and correctly honors flags such as -n and the jobserver.
bats:
	$(MAKE) -f tests/bats/Makefile tests
227+

tests/bats/Dockerfile

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
FROM debian:trixie

# GNU parallel: bats may want to use that
# gettext-base: provides envsubst, used by nickelpie
RUN apt-get update && apt-get install -y -q --no-install-recommends \
    parallel git ca-certificates curl make gettext-base && \
    rm -rf /var/lib/apt/lists/*

# Set by BuildKit, of the form amd64/arm64.
ARG BUILDARCH

# Install bats for running cmdline tests (pinned to a specific commit).
# This is the image used when invoking `make bats`.
RUN git clone https://github.com/bats-core/bats-core.git && cd bats-core && \
    git checkout 855844b8344e67d60dc0f43fa39817ed7787f141 && ./install.sh /usr/local

# Bats helper libraries.
# NOTE(review): unlike bats-core above, these are cloned at whatever the
# default branch currently points to -- consider pinning them, too.
RUN mkdir -p /bats-libraries
RUN git clone https://github.com/bats-core/bats-support /bats-libraries/bats-support
RUN git clone https://github.com/bats-core/bats-assert /bats-libraries/bats-assert
RUN git clone https://github.com/bats-core/bats-file /bats-libraries/bats-file

# Install helm. Extract exactly the archive that was downloaded (no glob), and
# remove the archive and the extracted directory in the same layer so that they
# do not end up in the image.
RUN curl -sSfLO --retry 8 --retry-all-errors --connect-timeout 10 --retry-delay 5 \
    https://get.helm.sh/helm-v3.18.6-linux-${BUILDARCH}.tar.gz && \
    tar -zxvf helm-v3.18.6-linux-${BUILDARCH}.tar.gz && \
    mv linux-${BUILDARCH}/helm /usr/bin && \
    rm -rf helm-v3.18.6-linux-${BUILDARCH}.tar.gz linux-${BUILDARCH}

# Install kubectl.
RUN curl -sSfLO --retry 8 --retry-all-errors --connect-timeout 10 --retry-delay 5 \
    https://dl.k8s.io/release/v1.34.0/bin/linux/${BUILDARCH}/kubectl && \
    chmod ugo+x kubectl && mv kubectl /usr/bin

tests/bats/Makefile

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Both files are included relative to $(CURDIR), i.e. relative to the directory
# make is running in -- not relative to the location of this Makefile.
include $(CURDIR)/versions.mk
17+
include $(CURDIR)/common.mk
18+
19+
# Name:tag of the container image built by the 'image' target and used by the
# 'tests' target. GIT_COMMIT_SHORT is not defined in this file (presumably it
# comes from one of the includes above -- TODO confirm).
BATS_IMAGE = batstests:$(GIT_COMMIT_SHORT)
20+
21+
# Test parameters. Each one is only assigned if not already set (?=), and each
# one is forwarded into the test container via `docker run --env` by the 'tests'
# target. The double quotes are part of the make variable values.
# VERSION_GHCR_CHART is not defined in this file (presumably it comes from one
# of the includes above -- TODO confirm).
TEST_CHART_REPO ?= "oci://ghcr.io/nvidia/k8s-dra-driver-gpu"
22+
TEST_CHART_VERSION ?= $(VERSION_GHCR_CHART)
23+
TEST_CHART_LASTSTABLE_REPO ?= "oci://ghcr.io/nvidia/k8s-dra-driver-gpu"
24+
TEST_CHART_LASTSTABLE_VERSION ?= "25.3.2-2c250af3-chart"
25+
TEST_NVIDIA_DRIVER_ROOT ?= "/run/nvidia/driver"
26+
27+
# Currently consumed in upgrade test via
28+
# kubectl apply -f <URL> (can be a branch, tag, or commit)
29+
TEST_CRD_UPGRADE_TARGET_GIT_REF ?= $(GIT_COMMIT_SHORT)
30+
31+
# Alias for the 'tests' target. NOTE(review): this is only the default goal if
# none of the included files defines a target first -- confirm.
default: tests
32+
33+
.PHONY: image
# Build the container image that the bats suite runs in, tagged $(BATS_IMAGE).
# The current directory is used as the build context.
image:
	docker buildx build \
		--file tests/bats/Dockerfile \
		--tag $(BATS_IMAGE) \
		.
36+
37+
# Warning: destructive against currently configured k8s cluster.
#
# Explicit invocation of 'cleanup-from-previous-run.sh' (could also be done as
# suite/file 'setup' in bats, but we'd lose output on success). During dev, you
# may want to add --show-output-of-passing-tests (and read bats docs for other
# cmdline args).
#
# The per-run output directory is created by the recipe's shell (command
# substitution) and not by make's 'shell' function: make expands all lines of a
# recipe before it runs the first one, so a 'shell' function call would run
# mktemp before 'mkdir -p tests-out' has created the parent directory. The
# assignment is kept separate from 'export' so that a failing mktemp aborts the
# '&&' chain instead of continuing with an empty _RUNDIR.
#
# _RUNDIR is a path relative to the current directory, which is mounted at /cwd
# inside the container; bats uses it (via TMPDIR) for its temporary files, and
# the output of the cleanup script is stored there as cleanup.outerr. A failing
# cleanup is reported but does not prevent the test run.
.PHONY: tests
tests: image
	mkdir -p tests-out
	_RUNDIR=$$(mktemp -p tests-out -d -t bats-tests-$$(date +%s)-XXXXX) && \
	export _RUNDIR && \
	echo "output directory: $${_RUNDIR}" && \
	time docker run \
		-it \
		-v /tmp:/tmp \
		-v $(CURDIR):/cwd \
		-v ~/.kube/config:/kubeconfig \
		--env KUBECONFIG=/kubeconfig \
		--env TEST_CHART_REPO=$(TEST_CHART_REPO) \
		--env TEST_CHART_VERSION=$(TEST_CHART_VERSION) \
		--env TEST_CHART_LASTSTABLE_REPO=$(TEST_CHART_LASTSTABLE_REPO) \
		--env TEST_CHART_LASTSTABLE_VERSION=$(TEST_CHART_LASTSTABLE_VERSION) \
		--env TEST_CRD_UPGRADE_TARGET_GIT_REF=$(TEST_CRD_UPGRADE_TARGET_GIT_REF) \
		--env TEST_NVIDIA_DRIVER_ROOT=$(TEST_NVIDIA_DRIVER_ROOT) \
		-u $(shell id -u ${USER}):$(shell id -g ${USER}) \
		--entrypoint "/bin/bash" \
		$(BATS_IMAGE) \
		-c "cd /cwd; \
			echo 'Running k8s cluster cleanup (invasive)... '; \
			bash tests/bats/cleanup-from-previous-run.sh &> $${_RUNDIR}/cleanup.outerr || \
			(echo 'Cleanup failed:'; cat $${_RUNDIR}/cleanup.outerr); \
			TMPDIR=/cwd/$${_RUNDIR} bats \
			--print-output-on-failure \
			--no-tempdir-cleanup \
			--timing \
			tests/bats/tests.bats \
			"

tests/bats/README.md

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
**Warning:** the test suite runs _invasively_ against the Kubernetes cluster that your local `kubectl` is currently configured against.
2+
3+
## Usage
4+
5+
Invoke `make bats` in the root of this repository.
6+
7+
8+
### Test local dev state (artifacts not pushed)
9+
10+
Not yet supported.
11+
Let's change this ASAP.
12+
13+
This test suite for now assumes public availability of a Helm chart on GHCR or NGC, pointing to a container image publicly available on GHCR or NGC.
14+
15+
### Test Helm chart from registry
16+
17+
#### Default versions
18+
19+
Say, this is the current local git revision:
20+
21+
```console
22+
$ git rev-parse --short=8 HEAD
23+
e6e1dde4
24+
```
25+
26+
Then the test suite runs with the default configuration, for example:
27+
```console
28+
$ make bats
29+
...
30+
--env TEST_CHART_REPO="oci://ghcr.io/nvidia/k8s-dra-driver-gpu" \
31+
--env TEST_CHART_VERSION=25.8.0-dev-e6e1dde4-chart \
32+
--env TEST_CHART_LASTSTABLE_REPO="oci://ghcr.io/nvidia/k8s-dra-driver-gpu" \
33+
--env TEST_CHART_LASTSTABLE_VERSION="25.3.2-7020737a-chart" \
34+
--env TEST_CRD_UPGRADE_TARGET_GIT_REF=e6e1dde4 \
35+
...
36+
12 tests, 0 failures in 166 seconds
37+
```
38+
39+
As you can see, this currently requires a Helm chart corresponding to the local revision to be available on GHCR.
40+
41+
#### Test specific versions
42+
43+
Set the corresponding `TEST_*` environment variables before invoking the Makefile target.
44+
45+
For example:
46+
47+
```console
48+
$ export TEST_CHART_VERSION="25.8.0-dev-b823882b-chart"
49+
$ export TEST_CRD_UPGRADE_TARGET_GIT_REF="main"
50+
$ make bats
51+
...
52+
12 tests, 0 failures in 166 seconds
53+
```
54+
55+
56+
## Development
57+
58+
Bats is a workable solution.
59+
Developing new tests might however probe your patience.
60+
Make wise use of
61+
62+
* [skipping tests](https://bats-core.readthedocs.io/en/stable/writing-tests.html#skip-easily-skip-tests)
63+
* [tagging tests with `bats:focus`](https://bats-core.readthedocs.io/en/stable/writing-tests.html#special-tags)
64+
* [CLI args](https://bats-core.readthedocs.io/en/stable/usage.html) such as `--verbose-run`, `--show-output-of-passing-tests`.
65+
66+
67+
Also, familiarize yourself with Bats' [`run`](https://bats-core.readthedocs.io/en/stable/writing-tests.html#run-test-other-commands) command.
68+
69+
Don't skip the section about when [not to use `run`](https://bats-core.readthedocs.io/en/stable/writing-tests.html#when-not-to-use-run).
70+
71+
Take inspiration from [cri-o tests](https://github.com/cri-o/cri-o/tree/81e69a58c7e6ec8699b3bdd8696b1d0e25e32bfb/test).
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#!/bin/bash
2+
#
3+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
# SPDX-License-Identifier: Apache-2.0
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
set -o errexit
19+
set -o nounset
20+
set -o pipefail
21+
22+
# Run a one-off privileged busybox pod on the given node. The pod mounts the
# node's root filesystem at /host and removes everything below
# /host/var/lib/kubelet/plugins/. kubectl stays attached to the pod and removes
# it again afterwards (--rm).
#
# Arguments:
#   $1: name of the node to run the pod on
rm_kubelet_plugin_dirs_from_node () {
    local target_node="$1"
    local pod_overrides

    # Pod spec override: pins the pod to the node and sets up the privileged
    # container with the host root mounted.
    pod_overrides='{
        "spec": {
            "nodeName": "'"${target_node}"'",
            "containers": [{
                "name": "privpod-rm-plugindirs",
                "image": "busybox",
                "securityContext": { "privileged": true },
                "volumeMounts": [{
                    "mountPath": "/host",
                    "name": "host-root"
                }],
                "command": ["/bin/sh", "-c", "rm -rfv /host/var/lib/kubelet/plugins/*"]
            }],
            "volumes": [{
                "name": "host-root",
                "hostPath": { "path": "/" }
            }]
        }
    }'

    echo "Run privileged pod to remove kubelet plugin directories on node ${target_node}"
    kubectl run privpod-rm-plugindirs \
        --image=busybox \
        --restart=Never \
        --rm \
        --attach \
        --wait \
        --overrides="${pod_overrides}"
}
51+
52+
# Would be faster when using a daemonset. However, the output is more readable
# when running sequentially.
#
# The node list is fetched into a variable first: a failing command substitution
# in a plain assignment aborts the script under `errexit`, whereas a failure
# inside the word list of `for ... in $(...)` would be ignored and the loop
# would silently run zero times.
node_names="$(kubectl get nodes -o jsonpath='{.items[*].metadata.name}')"

# Unquoted on purpose: the jsonpath output is a whitespace-separated list of
# node names that must be split into words.
for node in ${node_names}; do
    rm_kubelet_plugin_dirs_from_node "${node}"
done
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/bin/bash
2+
#
3+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
# SPDX-License-Identifier: Apache-2.0
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
19+
# Best-effort cleanup of the currently configured Kubernetes cluster before a
# test run: delete left-over workloads, uninstall the Helm release, delete the
# ComputeDomain CRD, and finally wipe the kubelet plugin directories on all
# nodes. Most steps are allowed to fail; only the last step runs under `set -e`.
set -o nounset
20+
set -o pipefail
21+
22+
# CRD manifest taken from the `main` branch of the upstream repository.
CRD_URL="https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver-gpu/main/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml"
23+
24+
# For debugging: state of the world
25+
kubectl get computedomains.resource.nvidia.com
26+
kubectl get pods -n nvidia-dra-driver-gpu
27+
helm list -A
28+
29+
# From here on, trace every command (switched off again at the very end).
set -x
30+
# When a partially performed test has left the CRD behind in a deleted
31+
# state, the deletions below cannot succeed. Apply a CRD version that
32+
# likely helps deletion.
33+
kubectl apply -f "${CRD_URL}"
34+
35+
# Some effort to delete workloads potentially left-over from a previous
36+
# interrupted run. TODO: try to affect all-at-once, maybe with a special label.
37+
# Note: the following commands are OK to fail -- the `errexit` shell option is
38+
# deliberately not set here. Each deletion is capped at 5 seconds via timeout.
39+
timeout -v 5 kubectl delete -f demo/specs/imex/channel-injection.yaml
40+
timeout -v 5 kubectl delete -f demo/specs/imex/channel-injection-all.yaml
41+
timeout -v 5 kubectl delete jobs nickelpie-test
42+
timeout -v 5 kubectl delete computedomain nickelpie-test-compute-domain
43+
timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-1.yaml
44+
45+
# Delete any previous remainder of `clean-state-dirs-all-nodes.sh` invocation.
46+
kubectl delete pods privpod-rm-plugindirs
47+
48+
# Uninstall the Helm release (OK to fail, e.g. when it is not installed).
helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu
49+
50+
# Wait (for at most 10 seconds) for the driver pods to be gone, in any
# namespace. A failure is only reported, it does not stop the cleanup.
kubectl wait \
51+
--for=delete pods -A \
52+
-l app.kubernetes.io/name=nvidia-dra-driver-gpu \
53+
--timeout=10s \
54+
|| echo "wait-for-delete failed"
55+
56+
# The next `helm install` must freshly install CRDs, and this is one way to try
57+
# to achieve that. This might time out in case workload wasn't cleaned up
58+
# properly.
59+
timeout -v 10 kubectl delete crds computedomains.resource.nvidia.com || echo "CRD deletion failed"
60+
61+
# From here on, a failing command aborts this script with a non-zero exit code.
set -e
62+
# Remove kubelet plugin state directories from all nodes.
63+
bash tests/bats/clean-state-dirs-all-nodes.sh
64+
set +x

0 commit comments

Comments
 (0)