9 changes: 8 additions & 1 deletion Makefile
@@ -221,7 +221,14 @@ PHONY: .shell
		--user $$(id -u):$$(id -g) \
		$(BUILDIMAGE)

.PHONY: bats
.PHONY: bats bats-cd bats-gpu
bats:
	make -f tests/bats/Makefile tests

# Run compute domain specific tests
bats-cd:
	make -f tests/bats/Makefile tests-cd

# Run GPU plugin specific tests
bats-gpu:
	make -f tests/bats/Makefile tests-gpu
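
With these wrappers in place, the test suites can be selected from the repository root. A minimal invocation sketch (assuming a reachable cluster and the prerequisites handled by tests/bats/Makefile):

```bash
# GPU plugin tests only (basics, GPU basic, GPU stress):
make bats-gpu

# ComputeDomain tests only:
make bats-cd

# Full suite (the union of the two above):
make bats
```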
111 changes: 70 additions & 41 deletions tests/bats/Makefile
@@ -16,7 +16,6 @@
include $(CURDIR)/versions.mk
include $(CURDIR)/common.mk


# The to-be-tested Helm chart. Ignored when setting TEST_CHART_LOCAL.
TEST_CHART_REPO ?= "oci://ghcr.io/nvidia/k8s-dra-driver-gpu"
TEST_CHART_VERSION ?= "$(VERSION_GHCR_CHART)"
@@ -53,8 +52,10 @@ TEST_CHART_REPO = "deployments/helm/nvidia-dra-driver-gpu"
TEST_CHART_VERSION = $(VERSION:v%=%)
endif

BATS_IMAGE = batstests:$(GIT_COMMIT_SHORT)
BATS_IMAGE ?= batstests:$(GIT_COMMIT_SHORT)
BATS_ARGS ?= --print-output-on-failure --no-tempdir-cleanup --timing --abort
KUBECONFIG ?= $(HOME)/.kube/config
RUNDIR_PARENT ?= /tmp/k8s-dra-driver-gpu-tests-out-$(USER)
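
Since these variables use `?=`, they can be overridden per invocation; a sketch with hypothetical paths:

```bash
make -f tests/bats/Makefile tests \
    KUBECONFIG=/path/to/other/kubeconfig \
    RUNDIR_PARENT=/var/tmp/dra-bats-out
```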

# Add `docker run` arguments when not running
# in Github Actions / GitLab CI.
@@ -63,6 +64,42 @@ ifeq ($(CI),)
DOCKER_RUN_FLAGS += -it
endif

DOCKER_ENVS = \
	--env KUBECONFIG=$(KUBECONFIG) \
	--env TEST_CHART_REPO=$(TEST_CHART_REPO) \
	--env TEST_CHART_VERSION=$(TEST_CHART_VERSION) \
	--env TEST_CHART_LASTSTABLE_REPO=$(TEST_CHART_LASTSTABLE_REPO) \
	--env TEST_CHART_LASTSTABLE_VERSION=$(TEST_CHART_LASTSTABLE_VERSION) \
	--env TEST_CRD_UPGRADE_TARGET_GIT_REF=$(TEST_CRD_UPGRADE_TARGET_GIT_REF) \
	--env TEST_NVIDIA_DRIVER_ROOT=$(TEST_NVIDIA_DRIVER_ROOT) \
	--env TEST_EXPECTED_IMAGE_SPEC_SUBSTRING=$(TEST_EXPECTED_IMAGE_SPEC_SUBSTRING)

DOCKER_UID := $(shell id -u)
DOCKER_GID := $(shell id -g)
DOCKER_USER := $(DOCKER_UID):$(DOCKER_GID)

DOCKER_MOUNTS = \
	-v /tmp:/tmp \
	-v $(CURDIR):/cwd \
	-v $(dir $(abspath $(KUBECONFIG))):$(dir $(abspath $(KUBECONFIG)))

# Helper to run bats inside container
# $(1) = whitespace-separated list of test files
define RUN_BATS
	@export _RUNDIR_PARENT="$(RUNDIR_PARENT)"; \
	mkdir -p "$${_RUNDIR_PARENT}"; \
	export _RUNDIR="$$(mktemp -p "$${_RUNDIR_PARENT}" -d -t bats-tests-$$(date +%s)-XXXXX)"; \
	echo "Run dir: $${_RUNDIR}"; \
	docker run --rm $(DOCKER_RUN_FLAGS) $(DOCKER_MOUNTS) $(DOCKER_ENVS) \
		-u $(DOCKER_USER) --entrypoint /bin/bash $(BATS_IMAGE) \
		-c "set -ex; cd /cwd; \
			echo 'Running k8s cluster cleanup (invasive)...'; \
			bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
			set +x; echo '--- STARTING TEST SUITE ---'; set -x; \
			TMPDIR="$${_RUNDIR}" bats $(BATS_ARGS) $(1) \
		"
endef

default: tests

.PHONY: image
@@ -75,43 +112,35 @@ image:
# suite/file 'setup' in bats, but we'd lose output on success). During dev, you
# may want to add --show-output-of-passing-tests (and read bats docs for other
# cmdline args).
.PHONY: tests
.PHONY: tests-gpu tests-cd

# Run GPU plugin specific tests
tests-gpu: image
	$(call RUN_BATS, \
		tests/bats/test_basics.bats \
		tests/bats/test_gpu_basic.bats \
		tests/bats/test_gpu_stress.bats)

# Run Compute Domain specific tests
tests-cd: image
	$(call RUN_BATS, \
		tests/bats/test_basics.bats \
		tests/bats/test_cd_imex_chan_inject.bats \
		tests/bats/test_cd_mnnvl_workload.bats \
		tests/bats/test_cd_misc.bats \
		tests/bats/test_cd_logging.bats \
		tests/bats/test_cd_failover.bats \
		tests/bats/test_cd_updowngrade.bats)

# Run complete tests
tests: image
	export _RUNDIR_PARENT=/tmp/k8s-dra-driver-gpu-tests-out-$${USER} && \
	mkdir -p "$${_RUNDIR_PARENT}" && \
	export _RUNDIR=$$(mktemp -p $${_RUNDIR_PARENT} -d -t bats-tests-$$(date +%s)-XXXXX) && \
	docker run \
		--rm \
		$(DOCKER_RUN_FLAGS) \
		-v /tmp:/tmp \
		-v $(CURDIR):/cwd \
		-v $(HOME)/.kube/:$(HOME)/.kube \
		--env KUBECONFIG=$(KUBECONFIG) \
		--env TEST_CHART_REPO=$(TEST_CHART_REPO) \
		--env TEST_CHART_VERSION=$(TEST_CHART_VERSION) \
		--env TEST_CHART_LASTSTABLE_REPO=$(TEST_CHART_LASTSTABLE_REPO) \
		--env TEST_CHART_LASTSTABLE_VERSION=$(TEST_CHART_LASTSTABLE_VERSION) \
		--env TEST_CRD_UPGRADE_TARGET_GIT_REF=$(TEST_CRD_UPGRADE_TARGET_GIT_REF) \
		--env TEST_NVIDIA_DRIVER_ROOT=$(TEST_NVIDIA_DRIVER_ROOT) \
		--env TEST_EXPECTED_IMAGE_SPEC_SUBSTRING=$(TEST_EXPECTED_IMAGE_SPEC_SUBSTRING) \
		-u $(shell id -u ${USER}):$(shell id -g ${USER}) \
		--entrypoint "/bin/bash" \
		$(BATS_IMAGE) \
		-c "set -ex; cd /cwd; \
			echo 'Running k8s cluster cleanup (invasive)... '; \
			bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
			set +x; echo '--- STARTING TEST SUITE ---'; set -x; \
			TMPDIR=$${_RUNDIR} bats \
				--print-output-on-failure \
				--no-tempdir-cleanup \
				--timing \
				--abort \
				tests/bats/test_basics.bats \
				tests/bats/test_gpu_basic.bats \
				tests/bats/test_cd_imex_chan_inject.bats \
				tests/bats/test_cd_mnnvl_workload.bats \
				tests/bats/test_cd_misc.bats \
				tests/bats/test_cd_logging.bats \
				tests/bats/test_cd_failover.bats \
				tests/bats/test_cd_updowngrade.bats \
			"
	$(call RUN_BATS, \
		tests/bats/test_basics.bats \
		tests/bats/test_gpu_basic.bats \
		tests/bats/test_cd_imex_chan_inject.bats \
		tests/bats/test_cd_mnnvl_workload.bats \
		tests/bats/test_cd_misc.bats \
		tests/bats/test_cd_logging.bats \
		tests/bats/test_cd_failover.bats \
		tests/bats/test_cd_updowngrade.bats \
		tests/bats/test_gpu_stress.bats)
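
For orientation, a `$(call RUN_BATS, <files>)` from one of the targets above boils down to roughly the following manual sequence (a sketch only; the real flags come from DOCKER_RUN_FLAGS, DOCKER_MOUNTS, DOCKER_ENVS, and BATS_ARGS, and `<git-sha>`/`<files>` are placeholders):

```bash
RUNDIR_PARENT="/tmp/k8s-dra-driver-gpu-tests-out-${USER}"
mkdir -p "${RUNDIR_PARENT}"
RUNDIR="$(mktemp -p "${RUNDIR_PARENT}" -d -t bats-tests-$(date +%s)-XXXXX)"

docker run --rm -it \
    -v /tmp:/tmp \
    -v "$(pwd)":/cwd \
    -v "${HOME}/.kube/":"${HOME}/.kube/" \
    --env KUBECONFIG="${HOME}/.kube/config" \
    -u "$(id -u):$(id -g)" \
    --entrypoint /bin/bash "batstests:<git-sha>" \
    -c "set -ex; cd /cwd; \
        bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a ${RUNDIR}/cleanup.outerr; \
        TMPDIR=${RUNDIR} bats --print-output-on-failure --no-tempdir-cleanup --timing --abort <files>"
```

Note that ${RUNDIR} is expanded by the host shell before the container starts; that works because /tmp is bind-mounted into the container.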
13 changes: 11 additions & 2 deletions tests/bats/cleanup-from-previous-run.sh
@@ -34,7 +34,7 @@ set -x

# If a previous run leaves e.g. the controller behind in CrashLoopBackOff then
# the next installation with --wait won't succeed.
timeout -v 5 helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu
timeout -v 15 helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu

# When the CRD has been left behind deleted by a partially performed
# test then the deletions below cannot succeed. Apply a CRD version that
@@ -62,15 +62,24 @@ timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
timeout -v 2 kubectl delete -f demo/specs/imex/simple-mig-test 2> /dev/null

# Clean up any GPU stress test pods left behind
timeout -v 30 kubectl delete pods -l 'env=batssuite,test=stress-shared' 2> /dev/null
timeout -v 5 kubectl delete -f tests/bats/specs/rc-shared-gpu.yaml 2> /dev/null
kubectl wait --for=delete pods -l 'env=batssuite,test=stress-shared' \
    --timeout=60s \
    || echo "wait-for-delete failed"

# TODO: maybe more brute-forcing/best-effort: it might make sense to submit all
# workloads in this test suite into a special namespace (not `default`), and to
# then use `kubectl delete pods -n <testnamespace> --all`.

# Delete any previous remainder of `clean-state-dirs-all-nodes.sh` invocation.
kubectl delete pods privpod-rm-plugindirs 2> /dev/null

timeout -v 5 helm uninstall nvidia-dra-driver-gpu-batssuite -n nvidia-dra-driver-gpu
# Make sure to wait till the chart is completely removed
helm uninstall nvidia-dra-driver-gpu-batssuite --wait -n nvidia-dra-driver-gpu
Review comment (Collaborator):
👍
That one is interesting. I'll see how it does in practice for me!
I've found Helm's --wait surprisingly surprising at times.

# Double check that the pods are deleted
kubectl wait \
    --for=delete pods -A \
    -l app.kubernetes.io/name=nvidia-dra-driver-gpu \
23 changes: 23 additions & 0 deletions tests/bats/specs/pods-shared-gpu.yaml
@@ -0,0 +1,23 @@
# Pod referencing the shared resource claim from rc-shared-gpu.yaml.
# The test creates multiple pods from this spec, substituting __INDEX__ for each one.
---
apiVersion: v1
kind: Pod
metadata:
  name: stress-pod-__INDEX__
  labels:
    env: batssuite
    test: stress-shared
spec:
  restartPolicy: Never
  containers:
  - name: ctr
    image: ubuntu:24.04
    command: ["bash", "-lc"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimName: rc-shared-gpu
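
The __INDEX__ placeholder makes this file a template rather than a directly applicable manifest; the suite's _generate_pods_manifest helper expands it, roughly like this (sketch with a hypothetical count of 3):

```bash
out="$(mktemp)"
for i in $(seq 1 3); do
    # Stamp out one concrete pod manifest per index, separated by YAML document markers.
    sed "s/__INDEX__/${i}/g" tests/bats/specs/pods-shared-gpu.yaml >> "$out"
    echo "---" >> "$out"
done
kubectl apply -f "$out"
```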
14 changes: 14 additions & 0 deletions tests/bats/specs/rc-shared-gpu.yaml
@@ -0,0 +1,14 @@
# Shared GPU resource claim
apiVersion: resource.k8s.io/v1
kind: ResourceClaim
metadata:
  name: rc-shared-gpu
  labels:
    env: batssuite
    test: stress-shared
spec:
  devices:
    requests:
    - name: gpu
      exactly:
        deviceClassName: gpu.nvidia.com
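
All stress pods reference this one claim by name, so they share whatever device is allocated to it. Once the scheduler has resolved the claim, the allocation the test waits for can also be inspected by hand (a sketch, not part of the suite):

```bash
# .status.allocation is the same field the stress test polls via `kubectl wait`.
kubectl get resourceclaim rc-shared-gpu -o json | jq '.status.allocation'
```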
76 changes: 76 additions & 0 deletions tests/bats/test_gpu_stress.bats
@@ -0,0 +1,76 @@
# shellcheck disable=SC2148
# shellcheck disable=SC2329

: "${STRESS_PODS_N:=15}"
: "${STRESS_LOOPS:=5}"
: "${STRESS_DELAY:=30}"

setup_file () {
    load 'helpers.sh'
    _common_setup
    local _iargs=("--set" "logVerbosity=6")
    iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
}

setup() {
    load 'helpers.sh'
    _common_setup
    log_objects
}

bats::on_failure() {
    echo -e "\n\nFAILURE HOOK START"
    log_objects
    show_kubelet_plugin_error_logs
    echo -e "FAILURE HOOK END\n\n"
}

# Expand pod YAML with indexes
_generate_pods_manifest() {
    local out="$1"
    local template="tests/bats/specs/pods-shared-gpu.yaml"
    : > "$out"
    for i in $(seq 1 "${STRESS_PODS_N}"); do
        sed "s/__INDEX__/${i}/g" "${template}" >> "$out"
        echo "---" >> "$out"
    done
}

@test "Stress: shared ResourceClaim across ${STRESS_PODS_N} pods x ${STRESS_LOOPS} loops" {
  for loop in $(seq 1 "${STRESS_LOOPS}"); do
    echo "=== Loop $loop/${STRESS_LOOPS} ==="

    # Apply ResourceClaim
    kubectl apply -f tests/bats/specs/rc-shared-gpu.yaml

    # Generate and apply pods spec
    manifest="${BATS_TEST_TMPDIR:-/tmp}/pods-shared-${loop}.yaml"
    _generate_pods_manifest "$manifest"
    kubectl apply -f "$manifest"

    # Wait for ResourceClaim allocation
    kubectl wait --for=jsonpath='{.status.allocation}' resourceclaim rc-shared-gpu --timeout=120s

    # Wait for all pods to be Ready
    kubectl wait --for=condition=Ready pods -l 'env=batssuite,test=stress-shared' --timeout=180s

    # Verify pod phases
    phases=$(kubectl get pods -l 'env=batssuite,test=stress-shared' -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.status.phase}{"\n"}{end}')
    echo "$phases"
    echo "$phases" | awk '$2!="Running"{exit 1}'

    # Spot-check GPU allocation logs
    run kubectl logs stress-pod-1
    assert_output --partial "UUID: GPU-"

    # Cleanup
    kubectl delete pods -l 'env=batssuite,test=stress-shared' --timeout=90s
    kubectl delete -f tests/bats/specs/rc-shared-gpu.yaml --timeout=90s
    kubectl wait --for=delete pods -l 'env=batssuite,test=stress-shared' --timeout=60s

    if [[ "$loop" -lt "$STRESS_LOOPS" ]]; then
      echo "Sleeping ${STRESS_DELAY}s before next loop..."
      sleep "${STRESS_DELAY}"
    fi
  done
}
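
STRESS_PODS_N, STRESS_LOOPS, and STRESS_DELAY are plain environment variables with defaults, so the stress test can be dialed down for quick local iteration. One way to do that, assuming the environment the Makefile targets normally set up (cluster access and the bats helpers on the load path):

```bash
# Smaller, faster variant of the stress test for local runs.
STRESS_PODS_N=3 STRESS_LOOPS=2 STRESS_DELAY=5 \
    bats --print-output-on-failure --timing tests/bats/test_gpu_stress.bats
```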