
Commit 5443e0f

Merge pull request #711 from shivamerla/add_gpu_stress_tests
tests: Add separate targets for GPU plugin tests + add stress tests
2 parents (59d775b + 3babfe5), commit 5443e0f

File tree (6 files changed: +202, -44 lines):

  Makefile
  tests/bats/Makefile
  tests/bats/cleanup-from-previous-run.sh
  tests/bats/specs/pods-shared-gpu.yaml
  tests/bats/specs/rc-shared-gpu.yaml
  tests/bats/test_gpu_stress.bats


Makefile

Lines changed: 8 additions & 1 deletion
@@ -221,7 +221,14 @@ PHONY: .shell
 		--user $$(id -u):$$(id -g) \
 		$(BUILDIMAGE)
 
-.PHONY: bats
+.PHONY: bats bats-cd bats-gpu
 bats:
 	make -f tests/bats/Makefile tests
 
+# Run compute domain specific tests
+bats-cd:
+	make -f tests/bats/Makefile tests-cd
+
+# Run GPU plugin specific tests
+bats-gpu:
+	make -f tests/bats/Makefile tests-gpu
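
With these targets in place, the split suites can be invoked from the repository root in the same way as the existing bats target. A minimal usage sketch (assuming a test cluster is reachable via the default kubeconfig):

	# full suite (unchanged behavior)
	make bats

	# Compute Domain tests only
	make bats-cd

	# GPU plugin tests only, including the new stress test
	make bats-gpu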

tests/bats/Makefile

Lines changed: 70 additions & 41 deletions
@@ -16,7 +16,6 @@
 include $(CURDIR)/versions.mk
 include $(CURDIR)/common.mk
 
-
 # The to-be-tested Helm chart. Ignored when setting TEST_CHART_LOCAL.
 TEST_CHART_REPO ?= "oci://ghcr.io/nvidia/k8s-dra-driver-gpu"
 TEST_CHART_VERSION ?= "$(VERSION_GHCR_CHART)"
@@ -53,8 +52,10 @@ TEST_CHART_REPO = "deployments/helm/nvidia-dra-driver-gpu"
 TEST_CHART_VERSION = $(VERSION:v%=%)
 endif
 
-BATS_IMAGE = batstests:$(GIT_COMMIT_SHORT)
+BATS_IMAGE ?= batstests:$(GIT_COMMIT_SHORT)
+BATS_ARGS ?= --print-output-on-failure --no-tempdir-cleanup --timing --abort
 KUBECONFIG ?= $(HOME)/.kube/config
+RUNDIR_PARENT ?= /tmp/k8s-dra-driver-gpu-tests-out-$(USER)
 
 # Add `docker run` arguments when not running
 # in Github Actions / GitLab CI.
@@ -63,6 +64,42 @@ ifeq ($(CI),)
 DOCKER_RUN_FLAGS += -it
 endif
 
+DOCKER_ENVS = \
+	--env KUBECONFIG=$(KUBECONFIG) \
+	--env TEST_CHART_REPO=$(TEST_CHART_REPO) \
+	--env TEST_CHART_VERSION=$(TEST_CHART_VERSION) \
+	--env TEST_CHART_LASTSTABLE_REPO=$(TEST_CHART_LASTSTABLE_REPO) \
+	--env TEST_CHART_LASTSTABLE_VERSION=$(TEST_CHART_LASTSTABLE_VERSION) \
+	--env TEST_CRD_UPGRADE_TARGET_GIT_REF=$(TEST_CRD_UPGRADE_TARGET_GIT_REF) \
+	--env TEST_NVIDIA_DRIVER_ROOT=$(TEST_NVIDIA_DRIVER_ROOT) \
+	--env TEST_EXPECTED_IMAGE_SPEC_SUBSTRING=$(TEST_EXPECTED_IMAGE_SPEC_SUBSTRING)
+
+DOCKER_UID  := $(shell id -u)
+DOCKER_GID  := $(shell id -g)
+DOCKER_USER := $(DOCKER_UID):$(DOCKER_GID)
+
+DOCKER_MOUNTS = \
+	-v /tmp:/tmp \
+	-v $(CURDIR):/cwd \
+	-v $(dir $(abspath $(KUBECONFIG))):$(dir $(abspath $(KUBECONFIG)))
+
+# Helper to run bats inside container
+# $(1) = whitespace-separated list of test files
+define RUN_BATS
+	@export _RUNDIR_PARENT="$(RUNDIR_PARENT)"; \
+	mkdir -p "$${_RUNDIR_PARENT}"; \
+	export _RUNDIR="$$(mktemp -p "$${_RUNDIR_PARENT}" -d -t bats-tests-$$(date +%s)-XXXXX)"; \
+	echo "Run dir: $${_RUNDIR}"; \
+	docker run --rm $(DOCKER_RUN_FLAGS) $(DOCKER_MOUNTS) $(DOCKER_ENVS) \
+		-u $(DOCKER_USER) --entrypoint /bin/bash $(BATS_IMAGE) \
+		-c "set -ex; cd /cwd; \
+			echo 'Running k8s cluster cleanup (invasive)...'; \
+			bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
+			set +x; echo '--- STARTING TEST SUITE ---'; set -x; \
+			TMPDIR="$${_RUNDIR}" bats $(BATS_ARGS) $(1) \
+		"
+endef
+
 default: tests
 
 .PHONY: image
@@ -75,43 +112,35 @@ image:
 # suite/file 'setup' in bats, but we'd lose output on success). During dev, you
 # may want to add --show-output-of-passing-tests (and read bats docs for other
 # cmdline args).
-.PHONY: tests
+.PHONY: tests-gpu tests-cd
+
+# Run GPU plugin specific tests
+tests-gpu: image
+	$(call RUN_BATS, \
+		tests/bats/test_basics.bats \
+		tests/bats/test_gpu_basic.bats \
+		tests/bats/test_gpu_stress.bats)
+
+# Run Compute Domain specific tests
+tests-cd: image
+	$(call RUN_BATS, \
+		tests/bats/test_basics.bats \
+		tests/bats/test_cd_imex_chan_inject.bats \
+		tests/bats/test_cd_mnnvl_workload.bats \
+		tests/bats/test_cd_misc.bats \
+		tests/bats/test_cd_logging.bats \
+		tests/bats/test_cd_failover.bats \
+		tests/bats/test_cd_updowngrade.bats)
+
+# Run complete tests
 tests: image
-	export _RUNDIR_PARENT=/tmp/k8s-dra-driver-gpu-tests-out-$${USER} && \
-	mkdir -p "$${_RUNDIR_PARENT}" && \
-	export _RUNDIR=$$(mktemp -p $${_RUNDIR_PARENT} -d -t bats-tests-$$(date +%s)-XXXXX) && \
-	docker run \
-		--rm \
-		$(DOCKER_RUN_FLAGS) \
-		-v /tmp:/tmp \
-		-v $(CURDIR):/cwd \
-		-v $(HOME)/.kube/:$(HOME)/.kube \
-		--env KUBECONFIG=$(KUBECONFIG) \
-		--env TEST_CHART_REPO=$(TEST_CHART_REPO) \
-		--env TEST_CHART_VERSION=$(TEST_CHART_VERSION) \
-		--env TEST_CHART_LASTSTABLE_REPO=$(TEST_CHART_LASTSTABLE_REPO) \
-		--env TEST_CHART_LASTSTABLE_VERSION=$(TEST_CHART_LASTSTABLE_VERSION) \
-		--env TEST_CRD_UPGRADE_TARGET_GIT_REF=$(TEST_CRD_UPGRADE_TARGET_GIT_REF) \
-		--env TEST_NVIDIA_DRIVER_ROOT=$(TEST_NVIDIA_DRIVER_ROOT) \
-		--env TEST_EXPECTED_IMAGE_SPEC_SUBSTRING=$(TEST_EXPECTED_IMAGE_SPEC_SUBSTRING) \
-		-u $(shell id -u ${USER}):$(shell id -g ${USER}) \
-		--entrypoint "/bin/bash" \
-		$(BATS_IMAGE) \
-		-c "set -ex; cd /cwd; \
-			echo 'Running k8s cluster cleanup (invasive)... '; \
-			bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
-			set +x; echo '--- STARTING TEST SUITE ---'; set -x; \
-			TMPDIR=$${_RUNDIR} bats \
-				--print-output-on-failure \
-				--no-tempdir-cleanup \
-				--timing \
-				--abort \
-				tests/bats/test_basics.bats \
-				tests/bats/test_gpu_basic.bats \
-				tests/bats/test_cd_imex_chan_inject.bats \
-				tests/bats/test_cd_mnnvl_workload.bats \
-				tests/bats/test_cd_misc.bats \
-				tests/bats/test_cd_logging.bats \
-				tests/bats/test_cd_failover.bats \
-				tests/bats/test_cd_updowngrade.bats \
-			"
+	$(call RUN_BATS, \
+		tests/bats/test_basics.bats \
+		tests/bats/test_gpu_basic.bats \
+		tests/bats/test_cd_imex_chan_inject.bats \
+		tests/bats/test_cd_mnnvl_workload.bats \
+		tests/bats/test_cd_misc.bats \
+		tests/bats/test_cd_logging.bats \
+		tests/bats/test_cd_failover.bats \
+		tests/bats/test_cd_updowngrade.bats \
+		tests/bats/test_gpu_stress.bats)
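
Since BATS_IMAGE, BATS_ARGS, and RUNDIR_PARENT are now ?= assignments, they can be overridden on the make command line without editing this Makefile. A sketch run from the repository root (the extra bats flag is the one suggested in the comment above; the run-directory name is just an example):

	make -f tests/bats/Makefile tests-gpu \
		RUNDIR_PARENT=/tmp/my-dra-test-runs \
		BATS_ARGS="--print-output-on-failure --timing --show-output-of-passing-tests"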

tests/bats/cleanup-from-previous-run.sh

Lines changed: 11 additions & 2 deletions
@@ -34,7 +34,7 @@ set -x
 
 # If a previous run leaves e.g. the controller behind in CrashLoopBackOff then
 # the next installation with --wait won't succeed.
-timeout -v 5 helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu
+timeout -v 15 helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu
 
 # When the CRD has been left behind deleted by a partially performed
 # test then the deletions below cannot succeed. Apply a CRD version that
@@ -62,15 +62,24 @@ timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
 timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
 timeout -v 2 kubectl delete -f demo/specs/imex/simple-mig-test 2> /dev/null
 
+# Cleanup any GPU stress test pods left behind
+timeout -v 30 kubectl delete pods -l 'env=batssuite,test=stress-shared' 2> /dev/null
+timeout -v 5 kubectl delete -f tests/bats/specs/rc-shared-gpu.yaml 2> /dev/null
+kubectl wait --for=delete pods -l 'env=batssuite,test=stress-shared' \
+    --timeout=60s \
+    || echo "wait-for-delete failed"
+
 # TODO: maybe more brute-forcing/best-effort: it might make sense to submit all
 # workload in this test suite into a special namespace (not `default`), and to
 # then use `kubectl delete pods -n <testnamespace]> --all`.
 
 # Delete any previous remainder of `clean-state-dirs-all-nodes.sh` invocation.
 kubectl delete pods privpod-rm-plugindirs 2> /dev/null
 
-timeout -v 5 helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu
+# Make sure to wait till the chart is completely removed
+helm uninstall nvidia-dra-driver-gpu-batssuite --wait -n nvidia-dra-driver-gpu
 
+# Double check that the pods are deleted
 kubectl wait \
     --for=delete pods -A \
     -l app.kubernetes.io/name=nvidia-dra-driver-gpu \
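
The cleanup script is run automatically by RUN_BATS before each suite, but it can also be invoked by hand against a cluster left in a bad state by an aborted run. A sketch (invasive; assumes kubectl and helm point at the test cluster and the command is run from the repository root):

	KUBECONFIG=$HOME/.kube/config bash tests/bats/cleanup-from-previous-run.sh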

tests/bats/specs/pods-shared-gpu.yaml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+# Pod referencing the shared resource claim from rc-shared-gpu.yaml
+# Test will create multiple pods using the spec below and updated INDEX.
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: stress-pod-__INDEX__
+  labels:
+    env: batssuite
+    test: stress-shared
+spec:
+  restartPolicy: Never
+  containers:
+  - name: ctr
+    image: ubuntu:24.04
+    command: ["bash","-lc"]
+    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
+    resources:
+      claims:
+      - name: gpu
+  resourceClaims:
+  - name: gpu
+    resourceClaimName: rc-shared-gpu

tests/bats/specs/rc-shared-gpu.yaml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# Shared GPU resource claim
+apiVersion: resource.k8s.io/v1
+kind: ResourceClaim
+metadata:
+  name: rc-shared-gpu
+  labels:
+    env: batssuite
+    test: stress-shared
+spec:
+  devices:
+    requests:
+    - name: gpu
+      exactly:
+        deviceClassName: gpu.nvidia.com
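
For a quick manual sanity check outside of bats, the claim and a single pod expanded from the template can be applied directly, mirroring what the stress test does in each loop. A sketch (assumes the repository root as working directory and a cluster with the DRA driver installed):

	kubectl apply -f tests/bats/specs/rc-shared-gpu.yaml
	sed 's/__INDEX__/1/g' tests/bats/specs/pods-shared-gpu.yaml | kubectl apply -f -
	kubectl wait --for=jsonpath='{.status.allocation}' resourceclaim rc-shared-gpu --timeout=120s
	kubectl wait --for=condition=Ready pods -l 'env=batssuite,test=stress-shared' --timeout=180s
	kubectl logs stress-pod-1   # should show the shared GPU via 'nvidia-smi -L'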

tests/bats/test_gpu_stress.bats

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+# shellcheck disable=SC2148
+# shellcheck disable=SC2329
+
+: "${STRESS_PODS_N:=15}"
+: "${STRESS_LOOPS:=5}"
+: "${STRESS_DELAY:=30}"
+
+setup_file () {
+    load 'helpers.sh'
+    _common_setup
+    local _iargs=("--set" "logVerbosity=6")
+    iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
+}
+
+setup() {
+    load 'helpers.sh'
+    _common_setup
+    log_objects
+}
+
+bats::on_failure() {
+    echo -e "\n\nFAILURE HOOK START"
+    log_objects
+    show_kubelet_plugin_error_logs
+    echo -e "FAILURE HOOK END\n\n"
+}
+
+# Expand pod YAML with indexes
+_generate_pods_manifest() {
+    local out="$1"
+    local template="tests/bats/specs/pods-shared-gpu.yaml"
+    : > "$out"
+    for i in $(seq 1 "${STRESS_PODS_N}"); do
+        sed "s/__INDEX__/${i}/g" "${template}" >> "$out"
+        echo "---" >> "$out"
+    done
+}
+
+@test "Stress: shared ResourceClaim across ${STRESS_PODS_N} pods x ${STRESS_LOOPS} loops" {
+    for loop in $(seq 1 "${STRESS_LOOPS}"); do
+        echo "=== Loop $loop/${STRESS_LOOPS} ==="
+
+        # Apply ResourceClaim
+        kubectl apply -f tests/bats/specs/rc-shared-gpu.yaml
+
+        # Generate and apply pods spec
+        manifest="${BATS_TEST_TMPDIR:-/tmp}/pods-shared-${loop}.yaml"
+        _generate_pods_manifest "$manifest"
+        kubectl apply -f "$manifest"
+
+        # Wait for ResourceClaim allocation
+        kubectl wait --for=jsonpath='{.status.allocation}' resourceclaim rc-shared-gpu --timeout=120s
+
+        # Wait for all pods to be Ready
+        kubectl wait --for=condition=Ready pods -l 'env=batssuite,test=stress-shared' --timeout=180s
+
+        # Verify pod phases
+        phases=$(kubectl get pods -l 'env=batssuite,test=stress-shared' -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.phase}{"\n"}{end}')
+        echo "$phases"
+        echo "$phases" | awk '$2!="Running"{exit 1}'
+
+        # Spot-check GPU allocation logs
+        run kubectl logs stress-pod-1
+        assert_output --partial "UUID: GPU-"
+
+        # Cleanup
+        kubectl delete pods -l 'env=batssuite,test=stress-shared' --timeout=90s
+        kubectl delete -f tests/bats/specs/rc-shared-gpu.yaml --timeout=90s
+        kubectl wait --for=delete pods -l 'env=batssuite,test=stress-shared' --timeout=60s
+
+        if [[ "$loop" -lt "$STRESS_LOOPS" ]]; then
+            echo "Sleeping ${STRESS_DELAY}s before next loop..."
+            sleep "${STRESS_DELAY}"
+        fi
+    done
+}
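
The pod count, loop count, and inter-loop delay are read from the environment with defaults (15 pods, 5 loops, 30 s), so the stress profile can be tuned without editing the file. A sketch for running the file directly (assumes bats and the suite's helpers are available locally; note that make bats-gpu does not currently forward these variables into the test container):

	# Heavier profile: 30 pods sharing the claim, 10 loops, 10 s between loops
	STRESS_PODS_N=30 STRESS_LOOPS=10 STRESS_DELAY=10 \
		bats --print-output-on-failure --timing tests/bats/test_gpu_stress.bats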
