
Commit 5443e0f

Merge pull request #711 from shivamerla/add_gpu_stress_tests
tests: Add separate targets for GPU plugin tests + add stress tests
2 parents (59d775b + 3babfe5), commit 5443e0f

File tree (6 files changed: +202, -44 lines):

  Makefile
  tests/bats/Makefile
  tests/bats/cleanup-from-previous-run.sh
  tests/bats/specs/pods-shared-gpu.yaml
  tests/bats/specs/rc-shared-gpu.yaml
  tests/bats/test_gpu_stress.bats


Makefile

Lines changed: 8 additions & 1 deletion
@@ -221,7 +221,14 @@ PHONY: .shell
 		--user $$(id -u):$$(id -g) \
 		$(BUILDIMAGE)
 
-.PHONY: bats
+.PHONY: bats bats-cd bats-gpu
 bats:
 	make -f tests/bats/Makefile tests
 
+# Run compute domain specific tests
+bats-cd:
+	make -f tests/bats/Makefile tests-cd
+
+# Run GPU plugin specific tests
+bats-gpu:
+	make -f tests/bats/Makefile tests-gpu
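
With these targets in place, the split suites can be invoked from the repository root in the same way as the existing bats target. A minimal usage sketch (assuming a test cluster is reachable via the default kubeconfig):

	# full suite (unchanged behavior)
	make bats

	# Compute Domain tests only
	make bats-cd

	# GPU plugin tests only, including the new stress test
	make bats-gpu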

tests/bats/Makefile

Lines changed: 70 additions & 41 deletions
@@ -16,7 +16,6 @@
 include $(CURDIR)/versions.mk
 include $(CURDIR)/common.mk
 
-
 # The to-be-tested Helm chart. Ignored when setting TEST_CHART_LOCAL.
 TEST_CHART_REPO ?= "oci://ghcr.io/nvidia/k8s-dra-driver-gpu"
 TEST_CHART_VERSION ?= "$(VERSION_GHCR_CHART)"
@@ -53,8 +52,10 @@ TEST_CHART_REPO = "deployments/helm/nvidia-dra-driver-gpu"
 TEST_CHART_VERSION = $(VERSION:v%=%)
 endif
 
-BATS_IMAGE = batstests:$(GIT_COMMIT_SHORT)
+BATS_IMAGE ?= batstests:$(GIT_COMMIT_SHORT)
+BATS_ARGS ?= --print-output-on-failure --no-tempdir-cleanup --timing --abort
 KUBECONFIG ?= $(HOME)/.kube/config
+RUNDIR_PARENT ?= /tmp/k8s-dra-driver-gpu-tests-out-$(USER)
 
 # Add `docker run` arguments when not running
 # in Github Actions / GitLab CI.
@@ -63,6 +64,42 @@ ifeq ($(CI),)
 DOCKER_RUN_FLAGS += -it
 endif
 
+DOCKER_ENVS = \
+	--env KUBECONFIG=$(KUBECONFIG) \
+	--env TEST_CHART_REPO=$(TEST_CHART_REPO) \
+	--env TEST_CHART_VERSION=$(TEST_CHART_VERSION) \
+	--env TEST_CHART_LASTSTABLE_REPO=$(TEST_CHART_LASTSTABLE_REPO) \
+	--env TEST_CHART_LASTSTABLE_VERSION=$(TEST_CHART_LASTSTABLE_VERSION) \
+	--env TEST_CRD_UPGRADE_TARGET_GIT_REF=$(TEST_CRD_UPGRADE_TARGET_GIT_REF) \
+	--env TEST_NVIDIA_DRIVER_ROOT=$(TEST_NVIDIA_DRIVER_ROOT) \
+	--env TEST_EXPECTED_IMAGE_SPEC_SUBSTRING=$(TEST_EXPECTED_IMAGE_SPEC_SUBSTRING)
+
+DOCKER_UID  := $(shell id -u)
+DOCKER_GID  := $(shell id -g)
+DOCKER_USER := $(DOCKER_UID):$(DOCKER_GID)
+
+DOCKER_MOUNTS = \
+	-v /tmp:/tmp \
+	-v $(CURDIR):/cwd \
+	-v $(dir $(abspath $(KUBECONFIG))):$(dir $(abspath $(KUBECONFIG)))
+
+# Helper to run bats inside container
+# $(1) = whitespace-separated list of test files
+define RUN_BATS
+	@export _RUNDIR_PARENT="$(RUNDIR_PARENT)"; \
+	mkdir -p "$${_RUNDIR_PARENT}"; \
+	export _RUNDIR="$$(mktemp -p "$${_RUNDIR_PARENT}" -d -t bats-tests-$$(date +%s)-XXXXX)"; \
+	echo "Run dir: $${_RUNDIR}"; \
+	docker run --rm $(DOCKER_RUN_FLAGS) $(DOCKER_MOUNTS) $(DOCKER_ENVS) \
+		-u $(DOCKER_USER) --entrypoint /bin/bash $(BATS_IMAGE) \
+		-c "set -ex; cd /cwd; \
+			echo 'Running k8s cluster cleanup (invasive)...'; \
+			bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
+			set +x; echo '--- STARTING TEST SUITE ---'; set -x; \
+			TMPDIR="$${_RUNDIR}" bats $(BATS_ARGS) $(1) \
+		"
+endef
+
 default: tests
 
 .PHONY: image
@@ -75,43 +112,35 @@ image:
 # suite/file 'setup' in bats, but we'd lose output on success). During dev, you
 # may want to add --show-output-of-passing-tests (and read bats docs for other
 # cmdline args).
-.PHONY: tests
+.PHONY: tests-gpu tests-cd
+
+# Run GPU plugin specific tests
+tests-gpu: image
+	$(call RUN_BATS, \
+		tests/bats/test_basics.bats \
+		tests/bats/test_gpu_basic.bats \
+		tests/bats/test_gpu_stress.bats)
+
+# Run Compute Domain specific tests
+tests-cd: image
+	$(call RUN_BATS, \
+		tests/bats/test_basics.bats \
+		tests/bats/test_cd_imex_chan_inject.bats \
+		tests/bats/test_cd_mnnvl_workload.bats \
+		tests/bats/test_cd_misc.bats \
+		tests/bats/test_cd_logging.bats \
+		tests/bats/test_cd_failover.bats \
+		tests/bats/test_cd_updowngrade.bats)
+
+# Run complete tests
 tests: image
-	export _RUNDIR_PARENT=/tmp/k8s-dra-driver-gpu-tests-out-$${USER} && \
-	mkdir -p "$${_RUNDIR_PARENT}" && \
-	export _RUNDIR=$$(mktemp -p $${_RUNDIR_PARENT} -d -t bats-tests-$$(date +%s)-XXXXX) && \
-	docker run \
-		--rm \
-		$(DOCKER_RUN_FLAGS) \
-		-v /tmp:/tmp \
-		-v $(CURDIR):/cwd \
-		-v $(HOME)/.kube/:$(HOME)/.kube \
-		--env KUBECONFIG=$(KUBECONFIG) \
-		--env TEST_CHART_REPO=$(TEST_CHART_REPO) \
-		--env TEST_CHART_VERSION=$(TEST_CHART_VERSION) \
-		--env TEST_CHART_LASTSTABLE_REPO=$(TEST_CHART_LASTSTABLE_REPO) \
-		--env TEST_CHART_LASTSTABLE_VERSION=$(TEST_CHART_LASTSTABLE_VERSION) \
-		--env TEST_CRD_UPGRADE_TARGET_GIT_REF=$(TEST_CRD_UPGRADE_TARGET_GIT_REF) \
-		--env TEST_NVIDIA_DRIVER_ROOT=$(TEST_NVIDIA_DRIVER_ROOT) \
-		--env TEST_EXPECTED_IMAGE_SPEC_SUBSTRING=$(TEST_EXPECTED_IMAGE_SPEC_SUBSTRING) \
-		-u $(shell id -u ${USER}):$(shell id -g ${USER}) \
-		--entrypoint "/bin/bash" \
-		$(BATS_IMAGE) \
-		-c "set -ex; cd /cwd; \
-			echo 'Running k8s cluster cleanup (invasive)... '; \
-			bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
-			set +x; echo '--- STARTING TEST SUITE ---'; set -x; \
-			TMPDIR=$${_RUNDIR} bats \
-				--print-output-on-failure \
-				--no-tempdir-cleanup \
-				--timing \
-				--abort \
-				tests/bats/test_basics.bats \
-				tests/bats/test_gpu_basic.bats \
-				tests/bats/test_cd_imex_chan_inject.bats \
-				tests/bats/test_cd_mnnvl_workload.bats \
-				tests/bats/test_cd_misc.bats \
-				tests/bats/test_cd_logging.bats \
-				tests/bats/test_cd_failover.bats \
-				tests/bats/test_cd_updowngrade.bats \
-			"
+	$(call RUN_BATS, \
+		tests/bats/test_basics.bats \
+		tests/bats/test_gpu_basic.bats \
+		tests/bats/test_cd_imex_chan_inject.bats \
+		tests/bats/test_cd_mnnvl_workload.bats \
+		tests/bats/test_cd_misc.bats \
+		tests/bats/test_cd_logging.bats \
+		tests/bats/test_cd_failover.bats \
+		tests/bats/test_cd_updowngrade.bats \
+		tests/bats/test_gpu_stress.bats)
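
Since BATS_IMAGE, BATS_ARGS, and RUNDIR_PARENT are now ?= assignments, they can be overridden on the make command line without editing this Makefile. A sketch run from the repository root (the extra bats flag is the one suggested in the comment above; the run-directory name is just an example):

	make -f tests/bats/Makefile tests-gpu \
		RUNDIR_PARENT=/tmp/my-dra-test-runs \
		BATS_ARGS="--print-output-on-failure --timing --show-output-of-passing-tests"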

tests/bats/cleanup-from-previous-run.sh

Lines changed: 11 additions & 2 deletions
@@ -34,7 +34,7 @@ set -x
 
 # If a previous run leaves e.g. the controller behind in CrashLoopBackOff then
 # the next installation with --wait won't succeed.
-timeout -v 5 helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu
+timeout -v 15 helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu
 
 # When the CRD has been left behind deleted by a partially performed
 # test then the deletions below cannot succeed. Apply a CRD version that
@@ -62,15 +62,24 @@ timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
 timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
 timeout -v 2 kubectl delete -f demo/specs/imex/simple-mig-test 2> /dev/null
 
+# Cleanup any GPU stress test pods left behind
+timeout -v 30 kubectl delete pods -l 'env=batssuite,test=stress-shared' 2> /dev/null
+timeout -v 5 kubectl delete -f tests/bats/specs/rc-shared-gpu.yaml 2> /dev/null
+kubectl wait --for=delete pods -l 'env=batssuite,test=stress-shared' \
+    --timeout=60s \
+    || echo "wait-for-delete failed"
+
 # TODO: maybe more brute-forcing/best-effort: it might make sense to submit all
 # workload in this test suite into a special namespace (not `default`), and to
 # then use `kubectl delete pods -n <testnamespace]> --all`.
 
 # Delete any previous remainder of `clean-state-dirs-all-nodes.sh` invocation.
 kubectl delete pods privpod-rm-plugindirs 2> /dev/null
 
-timeout -v 5 helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu
+# Make sure to wait till the chart is completely removed
+helm uninstall nvidia-dra-driver-gpu-batssuite --wait -n nvidia-dra-driver-gpu
 
+# Double check that the pods are deleted
 kubectl wait \
     --for=delete pods -A \
     -l app.kubernetes.io/name=nvidia-dra-driver-gpu \
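
The cleanup script is run automatically by RUN_BATS before each suite, but it can also be invoked by hand against a cluster left in a bad state by an aborted run. A sketch (invasive; assumes kubectl and helm point at the test cluster and the command is run from the repository root):

	KUBECONFIG=$HOME/.kube/config bash tests/bats/cleanup-from-previous-run.sh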

tests/bats/specs/pods-shared-gpu.yaml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+# Pod referencing the shared resource claim from rc-shared-gpu.yaml
+# Test will create multiple pods using the spec below and updated INDEX.
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: stress-pod-__INDEX__
+  labels:
+    env: batssuite
+    test: stress-shared
+spec:
+  restartPolicy: Never
+  containers:
+  - name: ctr
+    image: ubuntu:24.04
+    command: ["bash","-lc"]
+    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
+    resources:
+      claims:
+      - name: gpu
+  resourceClaims:
+  - name: gpu
+    resourceClaimName: rc-shared-gpu

tests/bats/specs/rc-shared-gpu.yaml

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# Shared GPU resource claim
+apiVersion: resource.k8s.io/v1
+kind: ResourceClaim
+metadata:
+  name: rc-shared-gpu
+  labels:
+    env: batssuite
+    test: stress-shared
+spec:
+  devices:
+    requests:
+    - name: gpu
+      exactly:
+        deviceClassName: gpu.nvidia.com
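
For a quick manual sanity check outside of bats, the claim and a single pod expanded from the template can be applied directly, mirroring what the stress test does in each loop. A sketch (assumes the repository root as working directory and a cluster with the DRA driver installed):

	kubectl apply -f tests/bats/specs/rc-shared-gpu.yaml
	sed 's/__INDEX__/1/g' tests/bats/specs/pods-shared-gpu.yaml | kubectl apply -f -
	kubectl wait --for=jsonpath='{.status.allocation}' resourceclaim rc-shared-gpu --timeout=120s
	kubectl wait --for=condition=Ready pods -l 'env=batssuite,test=stress-shared' --timeout=180s
	kubectl logs stress-pod-1   # should show the shared GPU via 'nvidia-smi -L'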

tests/bats/test_gpu_stress.bats

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+# shellcheck disable=SC2148
+# shellcheck disable=SC2329
+
+: "${STRESS_PODS_N:=15}"
+: "${STRESS_LOOPS:=5}"
+: "${STRESS_DELAY:=30}"
+
+setup_file () {
+    load 'helpers.sh'
+    _common_setup
+    local _iargs=("--set" "logVerbosity=6")
+    iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
+}
+
+setup() {
+    load 'helpers.sh'
+    _common_setup
+    log_objects
+}
+
+bats::on_failure() {
+    echo -e "\n\nFAILURE HOOK START"
+    log_objects
+    show_kubelet_plugin_error_logs
+    echo -e "FAILURE HOOK END\n\n"
+}
+
+# Expand pod YAML with indexes
+_generate_pods_manifest() {
+    local out="$1"
+    local template="tests/bats/specs/pods-shared-gpu.yaml"
+    : > "$out"
+    for i in $(seq 1 "${STRESS_PODS_N}"); do
+        sed "s/__INDEX__/${i}/g" "${template}" >> "$out"
+        echo "---" >> "$out"
+    done
+}
+
+@test "Stress: shared ResourceClaim across ${STRESS_PODS_N} pods x ${STRESS_LOOPS} loops" {
+    for loop in $(seq 1 "${STRESS_LOOPS}"); do
+        echo "=== Loop $loop/${STRESS_LOOPS} ==="
+
+        # Apply ResourceClaim
+        kubectl apply -f tests/bats/specs/rc-shared-gpu.yaml
+
+        # Generate and apply pods spec
+        manifest="${BATS_TEST_TMPDIR:-/tmp}/pods-shared-${loop}.yaml"
+        _generate_pods_manifest "$manifest"
+        kubectl apply -f "$manifest"
+
+        # Wait for ResourceClaim allocation
+        kubectl wait --for=jsonpath='{.status.allocation}' resourceclaim rc-shared-gpu --timeout=120s
+
+        # Wait for all pods to be Ready
+        kubectl wait --for=condition=Ready pods -l 'env=batssuite,test=stress-shared' --timeout=180s
+
+        # Verify pod phases
+        phases=$(kubectl get pods -l 'env=batssuite,test=stress-shared' -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.phase}{"\n"}{end}')
+        echo "$phases"
+        echo "$phases" | awk '$2!="Running"{exit 1}'
+
+        # Spot-check GPU allocation logs
+        run kubectl logs stress-pod-1
+        assert_output --partial "UUID: GPU-"
+
+        # Cleanup
+        kubectl delete pods -l 'env=batssuite,test=stress-shared' --timeout=90s
+        kubectl delete -f tests/bats/specs/rc-shared-gpu.yaml --timeout=90s
+        kubectl wait --for=delete pods -l 'env=batssuite,test=stress-shared' --timeout=60s
+
+        if [[ "$loop" -lt "$STRESS_LOOPS" ]]; then
+            echo "Sleeping ${STRESS_DELAY}s before next loop..."
+            sleep "${STRESS_DELAY}"
+        fi
+    done
+}
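
The pod count, loop count, and inter-loop delay are read from the environment with defaults (15 pods, 5 loops, 30 s), so the stress profile can be tuned without editing the file. A sketch for running the file directly (assumes bats and the suite's helpers are available locally; note that make bats-gpu does not currently forward these variables into the test container):

	# Heavier profile: 30 pods sharing the claim, 10 loops, 10 s between loops
	STRESS_PODS_N=30 STRESS_LOOPS=10 STRESS_DELAY=10 \
		bats --print-output-on-failure --timing tests/bats/test_gpu_stress.bats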
