# shellcheck disable=SC2148
# shellcheck disable=SC2329
setup() {
-  load '/bats-libraries/bats-support/load.bash'
-  load '/bats-libraries/bats-assert/load.bash'
-  load '/bats-libraries/bats-file/load.bash'
-  load 'helpers.sh'
+  # Executed before entering each test in this file.
+  load 'helpers.sh'
+  _common_setup
}

+
# Currently, the tests defined in this file deliberately depend on each other
# and are expected to execute in the order defined. In the future, we want to
# build test dependency injection (with fixtures), and work towards clean
@@ -15,10 +15,6 @@ setup() {
# happening. Tools like `etcdctl` will be helpful.
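#
# A possible shape for such a check (a hypothetical sketch, nothing this
# suite does today; the etcd pod name, certificate paths, and the
# `/registry/...` key prefix are cluster-specific assumptions):
#
#   kubectl -n kube-system exec etcd-controlplane -- sh -c \
#     'ETCDCTL_API=3 etcdctl \
#        --cacert=/etc/kubernetes/pki/etcd/ca.crt \
#        --cert=/etc/kubernetes/pki/etcd/server.crt \
#        --key=/etc/kubernetes/pki/etcd/server.key \
#        get --keys-only --prefix /registry/resource.nvidia.com/computedomains'
#
# An empty result after a test's cleanup would confirm that no ComputeDomain
# state lingers in etcd.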


-# Use a name that upon cluster inspection reveals that this
-# Helm chart release was installed/managed by this test suite.
-export TEST_HELM_RELEASE_NAME="nvidia-dra-driver-gpu-batssuite"
-
# Note(JP): bats swallows output of setup upon success (regardless of cmdline
# args such as `--show-output-of-passing-tests`). Ref:
# https://github.com/bats-core/bats-core/issues/540#issuecomment-1013521656 --
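#
# For reference, bats forwards file descriptor 3 straight to the terminal, so
# the usual workaround (a sketch, assuming one wants diagnostics visible even
# for passing runs) is to write to fd 3:
#
#   echo "# setup_file: nvidiaDriverRoot=${TEST_NVIDIA_DRIVER_ROOT}" >&3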
@@ -41,45 +37,28 @@ setup_file() {
  # Prepare for installing releases from NGC (that merely mutates local
  # filesystem state).
  helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
-
-  # A helper arg for `iupgrade_wait` w/o additional install args.
-  export NOARGS=()
-}
-
-# Install or upgrade, and wait for pods to be READY.
-# 1st arg: helm chart repo
-# 2nd arg: helm chart version
-# 3rd arg: array with additional args (provide `NOARGS` if none)
-iupgrade_wait() {
-  # E.g. `nvidia/nvidia-dra-driver-gpu` or
-  # `oci://ghcr.io/nvidia/k8s-dra-driver-gpu`
-  local REPO="$1"
-
-  # E.g. `25.3.1` or `25.8.0-dev-f2eaddd6-chart`
-  local VERSION="$2"
-
-  # Expect an array, passed by name (bash nameref), as third argument.
-  local -n ADDITIONAL_INSTALL_ARGS=$3
-
-  timeout -v 10 helm upgrade --install "${TEST_HELM_RELEASE_NAME}" \
-    "${REPO}" \
-    --version="${VERSION}" \
-    --create-namespace \
-    --namespace nvidia-dra-driver-gpu \
-    --set resources.gpus.enabled=false \
-    --set nvidiaDriverRoot="${TEST_NVIDIA_DRIVER_ROOT}" "${ADDITIONAL_INSTALL_ARGS[@]}"
-
-  kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=kubelet-plugin --timeout=10s
-  kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=controller --timeout=10s
-  # maybe: check version on labels (to confirm that we set labels correctly)
}

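# Note on the calling convention for `iupgrade_wait` (presumably now provided
# via the shared helpers): its third argument is an array passed *by name*
# and bound with `local -n` (a bash >= 4.3 nameref), which is why call sites
# below pass `NOARGS` rather than an expansion. A minimal sketch of the
# pattern, illustration only:
#
#   print_all() { local -n arr=$1; printf '%s\n' "${arr[@]}"; }
#   opts=("--set" "a=b")
#   print_all opts   # pass the array name, not "${opts[@]}"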
apply_check_delete_workload_imex_chan_inject() {
  kubectl apply -f demo/specs/imex/channel-injection.yaml
-  kubectl wait --for=condition=READY pods imex-channel-injection --timeout=70s
+  kubectl wait --for=condition=READY pods imex-channel-injection --timeout=100s
  run kubectl logs imex-channel-injection
-  assert_output --partial "channel0"
  kubectl delete -f demo/specs/imex/channel-injection.yaml
+  # Check output after the deletion has been initiated.
+  assert_output --partial "channel0"
+
+  # Wait for deletion to complete; this is critical before moving on to the
+  # next test (as long as we don't wipe state entirely between tests).
+  kubectl wait --for=delete pods imex-channel-injection --timeout=10s
+}
+
+log_objects() {
+  # Never fail, but emit output that facilitates debugging in case a test
+  # fails. Could this be part of setup()? If setup() succeeds and the test
+  # then fails, does bats show setup()'s output? If so, we could do that.
+  kubectl get resourceclaims || true
+  kubectl get computedomain || true
+  kubectl get pods -o wide || true
}
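#
# (bats prints what a failing test itself emitted, so calling `log_objects`
# at the start of each test below should surface cluster state right where a
# failure needs debugging.)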

# A test that covers local dev tooling, we don't want to
@@ -100,8 +79,7 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "helm-install ${TEST_CHART_REPO}/${TEST_CHART_VERSION}" {
-  local _iargs=("--set" "featureGates.IMEXDaemonsWithDNSNames=true")
-  iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
+  iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
}

@test "helm list: validate output" {
@@ -146,10 +124,12 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "IMEX channel injection (single)" {
+  log_objects
  apply_check_delete_workload_imex_chan_inject
}

@test "IMEX channel injection (all)" {
+  log_objects
  # Example: with TEST_CHART_VERSION="v25.3.2-12390-chart"
  # the command below returns 0 (true: the tested version is smaller).
  if dpkg --compare-versions "${TEST_CHART_VERSION#v}" lt "25.8.0"; then
@@ -164,6 +144,8 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "NodePrepareResources: catch unknown field in opaque cfg in ResourceClaim" {
+  log_objects
+
  envsubst < tests/bats/specs/rc-opaque-cfg-unknown-field.yaml.tmpl > \
    "${BATS_TEST_TMPDIR}"/rc-opaque-cfg-unknown-field.yaml
  cd "${BATS_TEST_TMPDIR}"
@@ -206,6 +188,8 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "nickelpie (NCCL send/recv/broadcast, 2 pods, 2 nodes, small payload)" {
+  log_objects
+
  # Do not run in checkout dir (to not pollute that).
  cd "${BATS_TEST_TMPDIR}"
  git clone https://github.com/jgehrcke/jpsnips-nv
@@ -220,6 +204,8 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "nvbandwidth (2 nodes, 2 GPUs each)" {
+  log_objects
+
  kubectl create -f https://github.com/kubeflow/mpi-operator/releases/download/v0.6.0/mpi-operator.yaml || echo "ignore"
  kubectl apply -f demo/specs/imex/nvbandwidth-test-job-1.yaml
  # The canonical k8s job interface works even for MPIJob (the MPIJob has an
@@ -232,6 +218,8 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "downgrade: current-dev -> last-stable" {
+  log_objects
+
  # Stage 1: apply workload, but do not delete.
  kubectl apply -f demo/specs/imex/channel-injection.yaml
  kubectl wait --for=condition=READY pods imex-channel-injection --timeout=60s
@@ -250,8 +238,10 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "upgrade: wipe-state, install-last-stable, upgrade-to-current-dev" {
+  log_objects
+
  # Stage 1: clean slate.
-  helm uninstall "${TEST_HELM_RELEASE_NAME}" -n nvidia-dra-driver-gpu
+  helm uninstall "${TEST_HELM_RELEASE_NAME}" -n nvidia-dra-driver-gpu --wait --timeout=30s
  kubectl wait --for=delete pods -A -l app.kubernetes.io/name=nvidia-dra-driver-gpu --timeout=10s
  bash tests/bats/clean-state-dirs-all-nodes.sh
  kubectl get crd computedomains.resource.nvidia.com
@@ -271,8 +261,7 @@ apply_check_delete_workload_imex_chan_inject() {
  kubectl apply -f "${CRD_UPGRADE_URL}"

  # Stage 5: install target version (as users would do).
-  local _iargs=("--set" "featureGates.IMEXDaemonsWithDNSNames=true")
-  iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
+  iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS

  # Stage 6: confirm deleting old workload works (critical, see above).
  timeout -v 60 kubectl delete -f demo/specs/imex/channel-injection.yaml