Commit afbb033

Merge pull request #637 from jgehrcke/jp/test-suite-patches
tests: fixes, improved cleanup & stability, better debuggability
2 parents (40834bb + b57a854), commit afbb033

6 files changed (+133, -54 lines)

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -7,3 +7,7 @@
 [._]*.sw[a-p]
 coverage.out
 tests-out
+*.log
+*.tar
+*.tgz
+

tests/bats/Makefile

Lines changed: 12 additions & 4 deletions
@@ -56,6 +56,13 @@ endif
 BATS_IMAGE = batstests:$(GIT_COMMIT_SHORT)
 KUBECONFIG ?= $(HOME)/.kube/config
 
+# Add `docker run` arguments when not running
+# in Github Actions / GitLab CI.
+DOCKER_RUN_FLAGS :=
+ifeq ($(CI),)
+DOCKER_RUN_FLAGS += -it
+endif
+
 default: tests
 
 .PHONY: image
@@ -71,9 +78,10 @@ image:
 .PHONY: tests
 tests: image
 	mkdir -p tests-out && \
-	export _RUNDIR=$(shell mktemp -p tests-out -d -t bats-tests-$$(date +%s)-XXXXX) && \
-	time docker run \
-	-it \
+	export _RUNDIR=$$(mktemp -p tests-out -d -t bats-tests-$$(date +%s)-XXXXX) && \
+	docker run \
+	--rm \
+	$(DOCKER_RUN_FLAGS) \
 	-v /tmp:/tmp \
 	-v $(CURDIR):/cwd \
 	-v $(HOME)/.kube/:$(HOME)/.kube \
@@ -88,7 +96,7 @@ tests: image
 	-u $(shell id -u ${USER}):$(shell id -g ${USER}) \
 	--entrypoint "/bin/bash"\
 	$(BATS_IMAGE) \
-	-c "cd /cwd; \
+	-c "set -ex; cd /cwd; \
 	echo 'Running k8s cluster cleanup (invasive)... '; \
 	bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
 	TMPDIR=/cwd/$${_RUNDIR} bats \
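
The `_RUNDIR` fix above swaps make-time command substitution (`$(shell mktemp ...)`) for recipe-time substitution: `$$` escapes the dollar sign, so `mktemp` now runs in the recipe's shell on each `make tests` invocation rather than when make expands the variable. A minimal sketch of what the shell effectively executes once make has turned `$$` into `$` (same `mktemp` arguments as in the recipe; the echoed path is only an illustration):

    mkdir -p tests-out
    # Create a unique per-run output directory, e.g. tests-out/bats-tests-<epoch>-Ab3dE.
    _RUNDIR=$(mktemp -p tests-out -d -t "bats-tests-$(date +%s)-XXXXX")
    echo "per-run output directory: ${_RUNDIR}"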

tests/bats/cleanup-from-previous-run.sh

Lines changed: 18 additions & 3 deletions
@@ -21,6 +21,10 @@ set -o pipefail
 
 CRD_URL="https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver-gpu/main/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml"
 
+
+THIS_DIR_PATH=$(dirname "$(realpath $0)")
+source "${THIS_DIR_PATH}/helpers.sh"
+
 # For debugging: state of the world
 kubectl get computedomains.resource.nvidia.com
 kubectl get pods -n nvidia-dra-driver-gpu
@@ -32,6 +36,12 @@ set -x
 # likely helps deletion.
 kubectl apply -f "${CRD_URL}"
 
+# Workload deletion below requires a DRA driver to be present, to actually clean
+# up. Install _a_ version temporarily, towards best-effort. Install
+# to-be-tested-version for now, latest-on-GHCR might be smarter though. Again,
+# this command may fail and in best-effort fashion this cleanup script proceeds.
+iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
+
 # Some effort to delete workloads potentially left-over from a previous
 # interrupted run. TODO: try to affect all-at-once, maybe with a special label.
 # Note: the following commands are OK to fail -- the `errexit` shell option is
@@ -44,6 +54,10 @@ timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-1.yaml 2> /d
 timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
 timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
 
+# TODO: maybe more brute-forcing/best-effort: it might make sense to submit all
+# workload in this test suite into a special namespace (not `default`), and to
+# then use `kubectl delete pods -n <testnamespace]> --all`.
+
 # Delete any previous remainder of `clean-state-dirs-all-nodes.sh` invocation.
 kubectl delete pods privpod-rm-plugindirs 2> /dev/null
 
@@ -55,9 +69,10 @@ kubectl wait \
 --timeout=10s \
 || echo "wait-for-delete failed"
 
-# The next `helm install` must freshly install CRDs, and this is one way to try
-# to achieve that. This might time out in case workload wasn't cleaned up
-# properly.
+# The next `helm install` should freshly install CRDs, and this is one way to
+# try to achieve that. This might time out in case workload wasn't cleaned up
+# properly. If that happens, the next test suite invocation will have failures
+# like "create not allowed while custom resource definition is terminating".
 timeout -v 10 kubectl delete crds computedomains.resource.nvidia.com || echo "CRD deletion failed"
 
 # Remove kubelet plugin state directories from all nodes (critical part of
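
The script now resolves its own directory and sources `helpers.sh` from there, which is what makes `iupgrade_wait` (defined in that file) available during cleanup. A minimal standalone sketch of that pattern; the explicit `|| true` only spells out the best-effort intent from the comment above (the committed script relies on its own error handling), and `TEST_CHART_REPO` / `TEST_CHART_VERSION` are assumed to be set in the environment, as in the test suite:

    #!/usr/bin/env bash
    # Resolve the directory containing this script and pull in the shared helpers.
    THIS_DIR_PATH=$(dirname "$(realpath "$0")")
    source "${THIS_DIR_PATH}/helpers.sh"

    # Best-effort: make sure _some_ DRA driver is installed so that the workload
    # deletions that follow can actually be processed; never abort cleanup here.
    iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS || true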

tests/bats/helpers.sh

Lines changed: 59 additions & 1 deletion
@@ -15,6 +15,64 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Use a name that upon cluster inspection reveals that this
+# Helm chart release was installed/managed by this test suite.
+export TEST_HELM_RELEASE_NAME="nvidia-dra-driver-gpu-batssuite"
+
+
+_common_setup() {
+load '/bats-libraries/bats-support/load.bash'
+load '/bats-libraries/bats-assert/load.bash'
+load '/bats-libraries/bats-file/load.bash'
+}
+
+
+# A helper arg for `iupgrade_wait` w/o additional install args.
+export NOARGS=()
+
+# Install or upgrade, and wait for pods to be READY.
+# 1st arg: helm chart repo
+# 2nd arg: helm chart version
+# 3rd arg: array with additional args (provide `NOARGS` if none)
+iupgrade_wait() {
+# E.g. `nvidia/nvidia-dra-driver-gpu` or
+# `oci://ghcr.io/nvidia/k8s-dra-driver-gpu`
+local REPO="$1"
+
+# E.g. `25.3.1` or `25.8.0-dev-f2eaddd6-chart`
+local VERSION="$2"
+
+# Expect array as third argument.
+local -n ADDITIONAL_INSTALL_ARGS=$3
+
+timeout -v 120 helm upgrade --install "${TEST_HELM_RELEASE_NAME}" \
+"${REPO}" \
+--version="${VERSION}" \
+--wait \
+--timeout=1m5s \
+--create-namespace \
+--namespace nvidia-dra-driver-gpu \
+--set resources.gpus.enabled=false \
+--set nvidiaDriverRoot="${TEST_NVIDIA_DRIVER_ROOT}" "${ADDITIONAL_INSTALL_ARGS[@]}"
+
+# Valueable output to have in the logs in case things went pearshaped.
+kubectl get pods -n nvidia-dra-driver-gpu
+
+# Some part of this waiting work is done by helm as of `--wait` with
+# `--timeout`. Note that the below in itself would not be sufficient: in case
+# of an upgrade we need to isolate the _new_ pods and not accidentally observe
+# the currently disappearing pods. Also note that despite the `--wait` above,
+# the kubelet plugins may still be in `PodInitializing` after the Helm command
+# returned. My conclusion is that helm waits for the controller to be READY,
+# but not for the plugin pods to be READY.
+kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=kubelet-plugin --timeout=15s
+
+# Again, log current state.
+kubectl get pods -n nvidia-dra-driver-gpu
+
+# maybe: check version on labels (to confirm that we set labels correctly)
+}
+
 # Events accumulate over time, so for certainty it's best to use a unique pod
 # name. Right now, this inspects an entire line which includes REASON, MESSAGE,
 # and OBJECT, so choose the needle (grepped for) precisely enough.
@@ -37,4 +95,4 @@ wait_for_pod_event() {
 fi
 sleep 2
 done
-}
+}
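
`iupgrade_wait()` receives its third argument by name: `local -n ADDITIONAL_INSTALL_ARGS=$3` creates a bash nameref, so callers pass the name of an array variable rather than its expanded contents, and `NOARGS` (an empty array) is the placeholder when nothing extra is needed. Hypothetical call sites as a sketch, with repos and versions taken from the examples in the comments above and the feature-gate flag from the lines removed in tests.bats:

    # No additional install args: pass the NOARGS placeholder by name.
    iupgrade_wait "nvidia/nvidia-dra-driver-gpu" "25.3.1" NOARGS

    # Additional install args: define an array and pass its name, not its value.
    extra_args=("--set" "featureGates.IMEXDaemonsWithDNSNames=true")
    iupgrade_wait "oci://ghcr.io/nvidia/k8s-dra-driver-gpu" "25.8.0-dev-f2eaddd6-chart" extra_args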

tests/bats/setup_suite.bash

Lines changed: 5 additions & 0 deletions
@@ -25,6 +25,11 @@ setup_suite () {
 # Probe: kubectl configured against a k8s cluster.
 kubectl cluster-info | grep "control plane is running at"
 
+# Fail fast in case there seems to be a DRA driver Helm chart installed at
+# this point (maybe one _not_ managed by this test suite).
+helm list -A
+helm list -A | grep "nvidia-dra-driver-gpu" && { echo "error: helm list not clean"; return 1; }
+
 # Show, for debugging.
 kubectl api-resources --api-group=resource.k8s.io
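
The new guard makes setup fail fast when a `nvidia-dra-driver-gpu` Helm release is already installed, possibly one not managed by this suite. A sketch of the same check written as an explicit `if`, assuming it runs inside `setup_suite` where a non-zero return marks the suite setup as failed:

    # Log the full release list first, for debugging.
    helm list -A

    # Then refuse to run against a cluster that already has a matching release.
    if helm list -A | grep -q "nvidia-dra-driver-gpu"; then
        echo "error: helm list not clean"
        return 1
    fi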

tests/bats/tests.bats

Lines changed: 35 additions & 46 deletions
@@ -1,12 +1,12 @@
 # shellcheck disable=SC2148
 # shellcheck disable=SC2329
 setup() {
-load '/bats-libraries/bats-support/load.bash'
-load '/bats-libraries/bats-assert/load.bash'
-load '/bats-libraries/bats-file/load.bash'
-load 'helpers.sh'
+# Executed before entering each test in this file.
+load 'helpers.sh'
+_common_setup
 }
 
+
 # Currently, the tests defined in this file deliberately depend on each other
 # and are expected to execute in the order defined. In the future, we want to
 # build test dependency injection (with fixtures), and work towards clean
@@ -15,10 +15,6 @@ setup() {
 # happening. Tools like `etcdctl` will be helpful.
 
 
-# Use a name that upon cluster inspection reveals that this
-# Helm chart release was installed/managed by this test suite.
-export TEST_HELM_RELEASE_NAME="nvidia-dra-driver-gpu-batssuite"
-
 # Note(JP): bats swallows output of setup upon success (regardless of cmdline
 # args such as `--show-output-of-passing-tests`). Ref:
 # https://github.com/bats-core/bats-core/issues/540#issuecomment-1013521656 --
@@ -41,45 +37,28 @@ setup_file() {
 # Prepare for installing releases from NGC (that merely mutates local
 # filesystem state).
 helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
-
-# A helper arg for `iupgrade_wait` w/o additional install args.
-export NOARGS=()
-}
-
-# Install or upgrade, and wait for pods to be READY.
-# 1st arg: helm chart repo
-# 2nd arg: helm chart version
-# 3rd arg: array with additional args (provide `NOARGS` if none)
-iupgrade_wait() {
-# E.g. `nvidia/nvidia-dra-driver-gpu` or
-# `oci://ghcr.io/nvidia/k8s-dra-driver-gpu`
-local REPO="$1"
-
-# E.g. `25.3.1` or `25.8.0-dev-f2eaddd6-chart`
-local VERSION="$2"
-
-# Expect array as third argument.
-local -n ADDITIONAL_INSTALL_ARGS=$3
-
-timeout -v 10 helm upgrade --install "${TEST_HELM_RELEASE_NAME}" \
-"${REPO}" \
---version="${VERSION}" \
---create-namespace \
---namespace nvidia-dra-driver-gpu \
---set resources.gpus.enabled=false \
---set nvidiaDriverRoot="${TEST_NVIDIA_DRIVER_ROOT}" "${ADDITIONAL_INSTALL_ARGS[@]}"
-
-kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=kubelet-plugin --timeout=10s
-kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=controller --timeout=10s
-# maybe: check version on labels (to confirm that we set labels correctly)
 }
 
 apply_check_delete_workload_imex_chan_inject() {
 kubectl apply -f demo/specs/imex/channel-injection.yaml
-kubectl wait --for=condition=READY pods imex-channel-injection --timeout=70s
+kubectl wait --for=condition=READY pods imex-channel-injection --timeout=100s
 run kubectl logs imex-channel-injection
-assert_output --partial "channel0"
 kubectl delete -f demo/specs/imex/channel-injection.yaml
+# Check output after attempted deletion.
+assert_output --partial "channel0"
+
+# Wait for deletion to complete; this is critical before moving on to the next
+# test (as long as we don't wipe state entirely between tests).
+kubectl wait --for=delete pods imex-channel-injection --timeout=10s
+}
+
+log_objects() {
+# Never fail, but show output in case a test fails, to facilitate debugging.
+# Could this be part of setup()? If setup succeeds and when a test fails:
+# does this show the output of setup? Then we could do this.
+kubectl get resourceclaims || true
+kubectl get computedomain || true
+kubectl get pods -o wide || true
 }
 
 # A test that covers local dev tooling, we don't want to
@@ -100,8 +79,7 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "helm-install ${TEST_CHART_REPO}/${TEST_CHART_VERSION}" {
-local _iargs=("--set" "featureGates.IMEXDaemonsWithDNSNames=true")
-iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
+iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
 }
 
 @test "helm list: validate output" {
@@ -146,10 +124,12 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "IMEX channel injection (single)" {
+log_objects
 apply_check_delete_workload_imex_chan_inject
 }
 
 @test "IMEX channel injection (all)" {
+log_objects
 # Example: with TEST_CHART_VERSION="v25.3.2-12390-chart"
 # the command below returns 0 (true: the tested version is smaller)
 if dpkg --compare-versions "${TEST_CHART_VERSION#v}" lt "25.8.0"; then
@@ -164,6 +144,8 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "NodePrepareResources: catch unknown field in opaque cfg in ResourceClaim" {
+log_objects
+
 envsubst < tests/bats/specs/rc-opaque-cfg-unknown-field.yaml.tmpl > \
 "${BATS_TEST_TMPDIR}"/rc-opaque-cfg-unknown-field.yaml
 cd "${BATS_TEST_TMPDIR}"
@@ -206,6 +188,8 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "nickelpie (NCCL send/recv/broadcast, 2 pods, 2 nodes, small payload)" {
+log_objects
+
 # Do not run in checkout dir (to not pollute that).
 cd "${BATS_TEST_TMPDIR}"
 git clone https://github.com/jgehrcke/jpsnips-nv
@@ -220,6 +204,8 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "nvbandwidth (2 nodes, 2 GPUs each)" {
+log_objects
+
 kubectl create -f https://github.com/kubeflow/mpi-operator/releases/download/v0.6.0/mpi-operator.yaml || echo "ignore"
 kubectl apply -f demo/specs/imex/nvbandwidth-test-job-1.yaml
 # The canonical k8s job interface works even for MPIJob (the MPIJob has an
@@ -232,6 +218,8 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "downgrade: current-dev -> last-stable" {
+log_objects
+
 # Stage 1: apply workload, but do not delete.
 kubectl apply -f demo/specs/imex/channel-injection.yaml
 kubectl wait --for=condition=READY pods imex-channel-injection --timeout=60s
@@ -250,8 +238,10 @@ apply_check_delete_workload_imex_chan_inject() {
 }
 
 @test "upgrade: wipe-state, install-last-stable, upgrade-to-current-dev" {
+log_objects
+
 # Stage 1: clean slate
-helm uninstall "${TEST_HELM_RELEASE_NAME}" -n nvidia-dra-driver-gpu
+helm uninstall "${TEST_HELM_RELEASE_NAME}" -n nvidia-dra-driver-gpu --wait --timeout=30s
 kubectl wait --for=delete pods -A -l app.kubernetes.io/name=nvidia-dra-driver-gpu --timeout=10s
 bash tests/bats/clean-state-dirs-all-nodes.sh
 kubectl get crd computedomains.resource.nvidia.com
@@ -271,8 +261,7 @@ apply_check_delete_workload_imex_chan_inject() {
 kubectl apply -f "${CRD_UPGRADE_URL}"
 
 # Stage 5: install target version (as users would do).
-local _iargs=("--set" "featureGates.IMEXDaemonsWithDNSNames=true")
-iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
+iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
 
 # Stage 6: confirm deleting old workload works (critical, see above).
 timeout -v 60 kubectl delete -f demo/specs/imex/channel-injection.yaml
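
The "IMEX channel injection (all)" test gates its behavior on the chart version via `dpkg --compare-versions`, which exits 0 when the stated relation holds. A small sketch of that gate, reusing the example version from the comment in the diff:

    # Example from the test's comment: a pre-25.8.0 chart version.
    TEST_CHART_VERSION="v25.3.2-12390-chart"

    # Strip the leading "v" and compare; exit code 0 means the tested
    # version is smaller than 25.8.0.
    if dpkg --compare-versions "${TEST_CHART_VERSION#v}" lt "25.8.0"; then
        echo "pre-25.8.0 chart: take the older code path"
    fi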
