4 changes: 4 additions & 0 deletions .gitignore
@@ -7,3 +7,7 @@
[._]*.sw[a-p]
coverage.out
tests-out
*.log
*.tar
*.tgz

16 changes: 12 additions & 4 deletions tests/bats/Makefile
@@ -56,6 +56,13 @@ endif
BATS_IMAGE = batstests:$(GIT_COMMIT_SHORT)
KUBECONFIG ?= $(HOME)/.kube/config

# Add `docker run` arguments when not running
# in GitHub Actions / GitLab CI.
DOCKER_RUN_FLAGS :=
ifeq ($(CI),)
DOCKER_RUN_FLAGS += -it
endif
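
For context: GitHub Actions and GitLab CI both export `CI` in job environments, and `docker run -it` fails there with "the input device is not a TTY". A minimal shell sketch of the same gating logic (illustrative only, not part of this diff):

```bash
#!/usr/bin/env bash
# Sketch: add interactive docker-run flags only outside CI, mirroring the
# Makefile's `ifeq ($(CI),)` gate. Assumes CI systems export CI (both
# GitHub Actions and GitLab CI set CI=true).
docker_run_flags=()
if [ -z "${CI:-}" ] && [ -t 0 ]; then
  # Local interactive run: allocate a TTY and keep stdin open.
  docker_run_flags+=(-it)
fi
docker run --rm "${docker_run_flags[@]}" alpine:3.20 echo "ok"
```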

default: tests

.PHONY: image
@@ -71,9 +78,10 @@ image:
.PHONY: tests
tests: image
mkdir -p tests-out && \
export _RUNDIR=$(shell mktemp -p tests-out -d -t bats-tests-$$(date +%s)-XXXXX) && \
time docker run \
-it \
export _RUNDIR=$$(mktemp -p tests-out -d -t bats-tests-$$(date +%s)-XXXXX) && \
docker run \
--rm \
$(DOCKER_RUN_FLAGS) \
-v /tmp:/tmp \
-v $(CURDIR):/cwd \
-v $(HOME)/.kube/:$(HOME)/.kube \
@@ -88,7 +96,7 @@ tests: image
-u $(shell id -u ${USER}):$(shell id -g ${USER}) \
--entrypoint "/bin/bash" \
$(BATS_IMAGE) \
-c "cd /cwd; \
-c "set -ex; cd /cwd; \
echo 'Running k8s cluster cleanup (invasive)... '; \
bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
TMPDIR=/cwd/$${_RUNDIR} bats \
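One subtlety in the `_RUNDIR` change above: `$(shell mktemp ...)` is evaluated by make during recipe expansion, while the escaped `$$(mktemp ...)` reaches the recipe shell as a plain command substitution, so the directory is created by the shell at execution time. What the recipe line executes, as the shell sees it (a sketch; paths as in the Makefile):

```bash
mkdir -p tests-out
# `$$(...)` in the Makefile arrives here as `$(...)`:
_RUNDIR=$(mktemp -p tests-out -d -t "bats-tests-$(date +%s)-XXXXX")
export _RUNDIR
echo "test run directory: ${_RUNDIR}"
```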
21 changes: 18 additions & 3 deletions tests/bats/cleanup-from-previous-run.sh
@@ -21,6 +21,10 @@ set -o pipefail

CRD_URL="https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver-gpu/main/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml"


THIS_DIR_PATH=$(dirname "$(realpath "$0")")
source "${THIS_DIR_PATH}/helpers.sh"

# For debugging: state of the world
kubectl get computedomains.resource.nvidia.com
kubectl get pods -n nvidia-dra-driver-gpu
@@ -32,6 +36,12 @@ set -x
# likely helps deletion.
kubectl apply -f "${CRD_URL}"

# Workload deletion below requires a DRA driver to be present to actually
# clean up. Install _a_ version temporarily, as a best effort. For now, install
# the to-be-tested version; using latest-on-GHCR might be smarter. Again, this
# command may fail; this cleanup script proceeds in best-effort fashion.
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
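
Since this install is explicitly best-effort, a hypothetical variant (not in this diff) would make that intent self-documenting regardless of the script's errexit state:

```bash
# Hypothetical: tolerate failure of the temporary driver install explicitly.
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS \
  || echo "temporary driver install failed; continuing best-effort cleanup"
```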

# Some effort to delete workloads potentially left over from a previous
# interrupted run. TODO: try to delete all at once, maybe with a special label.
# Note: the following commands are OK to fail -- the `errexit` shell option is
@@ -44,6 +54,10 @@ timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-1.yaml 2> /dev/null
timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null

# TODO: maybe more brute-forcing/best-effort: it might make sense to submit all
# workloads in this test suite into a special namespace (not `default`), and to
# then use `kubectl delete pods -n <testnamespace> --all`.

# Delete any previous remainder of `clean-state-dirs-all-nodes.sh` invocation.
kubectl delete pods privpod-rm-plugindirs 2> /dev/null

@@ -55,9 +69,10 @@ kubectl wait \
--timeout=10s \
|| echo "wait-for-delete failed"

# The next `helm install` must freshly install CRDs, and this is one way to try
# to achieve that. This might time out in case workload wasn't cleaned up
# properly.
# The next `helm install` should freshly install CRDs, and this is one way to
# try to achieve that. This might time out in case workload wasn't cleaned up
# properly. If that happens, the next test suite invocation will have failures
# like "create not allowed while custom resource definition is terminating".
timeout -v 10 kubectl delete crds computedomains.resource.nvidia.com || echo "CRD deletion failed"
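
If that deletion does hang, a probe like this (hypothetical, not part of this diff) confirms the terminating state described above:

```bash
# A non-empty deletionTimestamp means the CRD is stuck terminating,
# i.e. finalizers are waiting on leftover custom resources.
kubectl get crd computedomains.resource.nvidia.com \
  -o jsonpath='{.metadata.deletionTimestamp}{"\n"}' 2> /dev/null \
  || echo "CRD already gone"
```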

# Remove kubelet plugin state directories from all nodes (critical part of
60 changes: 59 additions & 1 deletion tests/bats/helpers.sh
@@ -15,6 +15,64 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Use a name that upon cluster inspection reveals that this
# Helm chart release was installed/managed by this test suite.
export TEST_HELM_RELEASE_NAME="nvidia-dra-driver-gpu-batssuite"


_common_setup() {
load '/bats-libraries/bats-support/load.bash'
load '/bats-libraries/bats-assert/load.bash'
load '/bats-libraries/bats-file/load.bash'
}


# A helper arg for `iupgrade_wait` w/o additional install args.
export NOARGS=()

# Install or upgrade, and wait for pods to be READY.
# 1st arg: helm chart repo
# 2nd arg: helm chart version
# 3rd arg: array with additional args (provide `NOARGS` if none)
iupgrade_wait() {
# E.g. `nvidia/nvidia-dra-driver-gpu` or
# `oci://ghcr.io/nvidia/k8s-dra-driver-gpu`
local REPO="$1"

# E.g. `25.3.1` or `25.8.0-dev-f2eaddd6-chart`
local VERSION="$2"

# Expect array as third argument.
local -n ADDITIONAL_INSTALL_ARGS=$3

timeout -v 120 helm upgrade --install "${TEST_HELM_RELEASE_NAME}" \
"${REPO}" \
--version="${VERSION}" \
--wait \
--timeout=1m5s \
--create-namespace \
--namespace nvidia-dra-driver-gpu \
--set resources.gpus.enabled=false \
--set nvidiaDriverRoot="${TEST_NVIDIA_DRIVER_ROOT}" "${ADDITIONAL_INSTALL_ARGS[@]}"

# Valuable output to have in the logs in case things go pear-shaped.
kubectl get pods -n nvidia-dra-driver-gpu

# Some of this waiting work is already done by helm via `--wait` with
# `--timeout`. Note that the below in itself would not be sufficient: in case
# of an upgrade we need to isolate the _new_ pods and not accidentally observe
# the currently disappearing pods. Also note that despite the `--wait` above,
# the kubelet plugins may still be in `PodInitializing` after the Helm command
# returned. My conclusion is that helm waits for the controller to be READY,
# but not for the plugin pods to be READY.
kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=kubelet-plugin --timeout=15s

# Again, log current state.
kubectl get pods -n nvidia-dra-driver-gpu

# maybe: check version on labels (to confirm that we set labels correctly)
}
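
The third-argument convention above relies on bash namerefs (`local -n`): callers pass the *name* of an array variable, which is why the empty `NOARGS` array exists. A self-contained sketch of that convention (hypothetical `demo` function):

```bash
#!/usr/bin/env bash
# Sketch of the nameref calling convention used by iupgrade_wait.
demo() {
  local repo="$1" version="$2"
  local -n extra_args=$3  # $3 is the *name* of an array in the caller's scope
  echo "repo=${repo} version=${version} extra: ${extra_args[*]}"
}

NOARGS=()
flags=(--set featureGates.IMEXDaemonsWithDNSNames=true)
demo nvidia/nvidia-dra-driver-gpu 25.3.1 NOARGS   # no extra install args
demo nvidia/nvidia-dra-driver-gpu 25.3.1 flags    # with extra install args
```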

# Events accumulate over time, so for certainty it's best to use a unique pod
# name. Right now, this inspects an entire line which includes REASON, MESSAGE,
# and OBJECT, so choose the needle (grepped for) precisely enough.
@@ -37,4 +95,4 @@ wait_for_pod_event() {
fi
sleep 2
done
}
}
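
The needle-precision advice matters because the match target is a whole rendered event line. A sketch of the underlying polling pattern (the helper's full body is elided in this diff; pod name and reason are hypothetical):

```bash
# Poll events for a line matching pod name plus reason; a unique pod
# name keeps stale events from earlier runs out of the match.
needle='imex-channel-injection.*FailedScheduling'
for _ in $(seq 1 30); do
  kubectl get events | grep -E "${needle}" && break
  sleep 2
done
```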
5 changes: 5 additions & 0 deletions tests/bats/setup_suite.bash
@@ -25,6 +25,11 @@ setup_suite () {
# Probe: kubectl configured against a k8s cluster.
kubectl cluster-info | grep "control plane is running at"

# Fail fast in case there seems to be a DRA driver Helm chart installed at
# this point (maybe one _not_ managed by this test suite).
helm list -A
helm list -A | grep "nvidia-dra-driver-gpu" && { echo "error: helm list not clean"; return 1; }
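
An equivalent, arguably more explicit spelling of this fail-fast check (a sketch, not part of this diff):

```bash
# A match (grep success) is the error case: some DRA driver release is
# already installed, possibly one not managed by this test suite.
if helm list -A | grep -q "nvidia-dra-driver-gpu"; then
  echo "error: helm list not clean"
  return 1
fi
```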

# Show, for debugging.
kubectl api-resources --api-group=resource.k8s.io

81 changes: 35 additions & 46 deletions tests/bats/tests.bats
@@ -1,12 +1,12 @@
# shellcheck disable=SC2148
# shellcheck disable=SC2329
setup() {
load '/bats-libraries/bats-support/load.bash'
load '/bats-libraries/bats-assert/load.bash'
load '/bats-libraries/bats-file/load.bash'
load 'helpers.sh'
# Executed before entering each test in this file.
load 'helpers.sh'
_common_setup
}


# Currently, the tests defined in this file deliberately depend on each other
# and are expected to execute in the order defined. In the future, we want to
# build test dependency injection (with fixtures), and work towards clean
@@ -15,10 +15,6 @@ setup() {
# happening. Tools like `etcdctl` will be helpful.


# Use a name that upon cluster inspection reveals that this
# Helm chart release was installed/managed by this test suite.
export TEST_HELM_RELEASE_NAME="nvidia-dra-driver-gpu-batssuite"

# Note(JP): bats swallows output of setup upon success (regardless of cmdline
# args such as `--show-output-of-passing-tests`). Ref:
# https://github.com/bats-core/bats-core/issues/540#issuecomment-1013521656 --
@@ -41,45 +37,28 @@ setup_file() {
# Prepare for installing releases from NGC (that merely mutates local
# filesystem state).
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update

# A helper arg for `iupgrade_wait` w/o additional install args.
export NOARGS=()
}

# Install or upgrade, and wait for pods to be READY.
# 1st arg: helm chart repo
# 2nd arg: helm chart version
# 3rd arg: array with additional args (provide `NOARGS` if none)
iupgrade_wait() {
# E.g. `nvidia/nvidia-dra-driver-gpu` or
# `oci://ghcr.io/nvidia/k8s-dra-driver-gpu`
local REPO="$1"

# E.g. `25.3.1` or `25.8.0-dev-f2eaddd6-chart`
local VERSION="$2"

# Expect array as third argument.
local -n ADDITIONAL_INSTALL_ARGS=$3

timeout -v 10 helm upgrade --install "${TEST_HELM_RELEASE_NAME}" \
"${REPO}" \
--version="${VERSION}" \
--create-namespace \
--namespace nvidia-dra-driver-gpu \
--set resources.gpus.enabled=false \
--set nvidiaDriverRoot="${TEST_NVIDIA_DRIVER_ROOT}" "${ADDITIONAL_INSTALL_ARGS[@]}"

kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=kubelet-plugin --timeout=10s
kubectl wait --for=condition=READY pods -A -l nvidia-dra-driver-gpu-component=controller --timeout=10s
# maybe: check version on labels (to confirm that we set labels correctly)
}
Collaborator Author: moved to helpers.sh


apply_check_delete_workload_imex_chan_inject() {
kubectl apply -f demo/specs/imex/channel-injection.yaml
kubectl wait --for=condition=READY pods imex-channel-injection --timeout=70s
kubectl wait --for=condition=READY pods imex-channel-injection --timeout=100s
run kubectl logs imex-channel-injection
assert_output --partial "channel0"
kubectl delete -f demo/specs/imex/channel-injection.yaml
# Check output after attempted deletion.
assert_output --partial "channel0"

# Wait for deletion to complete; this is critical before moving on to the next
# test (as long as we don't wipe state entirely between tests).
kubectl wait --for=delete pods imex-channel-injection --timeout=10s
}

log_objects() {
# Never fail; just show output to facilitate debugging when a test fails.
# Could this be part of setup()? If setup succeeds but the test then fails,
# does bats show setup's output? If so, we could move this there.
kubectl get resourceclaims || true
kubectl get computedomain || true
kubectl get pods -o wide || true
Collaborator Author: I am sure we'll find a better way -- the main question here is if a bats setup primitive can be useful. I will explore more deeply another time.

}
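
On the open question above, one possible direction (hypothetical; relies on bats-core setting `BATS_TEST_COMPLETED` only for tests that passed, and on bats showing teardown output for failing tests):

```bash
# Hypothetical: dump cluster state from teardown() only when a test failed.
teardown() {
  [ -n "${BATS_TEST_COMPLETED:-}" ] && return 0  # test passed: stay quiet
  kubectl get resourceclaims || true
  kubectl get computedomain || true
  kubectl get pods -o wide || true
}
```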

# A test that covers local dev tooling, we don't want to
@@ -100,8 +79,7 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "helm-install ${TEST_CHART_REPO}/${TEST_CHART_VERSION}" {
local _iargs=("--set" "featureGates.IMEXDaemonsWithDNSNames=true")
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
}

@test "helm list: validate output" {
Expand Down Expand Up @@ -146,10 +124,12 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "IMEX channel injection (single)" {
log_objects
apply_check_delete_workload_imex_chan_inject
}

@test "IMEX channel injection (all)" {
log_objects
# Example: with TEST_CHART_VERSION="v25.3.2-12390-chart"
# the command below returns 0 (true: the tested version is smaller)
if dpkg --compare-versions "${TEST_CHART_VERSION#v}" lt "25.8.0"; then
@@ -164,6 +144,8 @@ apply_check_delete_workload_imex_chan_inject() {
}
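
`dpkg --compare-versions` orders dev/pre-release chart suffixes sensibly once the leading `v` is stripped. A standalone sketch of the check (version string illustrative):

```bash
v="v25.3.2-12390-chart"
# Exit status 0 (true) when the left version sorts below the right one.
if dpkg --compare-versions "${v#v}" lt "25.8.0"; then
  echo "${v} predates 25.8.0"
fi
```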

@test "NodePrepareResources: catch unknown field in opaque cfg in ResourceClaim" {
log_objects

envsubst < tests/bats/specs/rc-opaque-cfg-unknown-field.yaml.tmpl > \
"${BATS_TEST_TMPDIR}"/rc-opaque-cfg-unknown-field.yaml
cd "${BATS_TEST_TMPDIR}"
@@ -206,6 +188,8 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "nickelpie (NCCL send/recv/broadcast, 2 pods, 2 nodes, small payload)" {
log_objects

# Do not run in checkout dir (to not pollute that).
cd "${BATS_TEST_TMPDIR}"
git clone https://github.com/jgehrcke/jpsnips-nv
@@ -220,6 +204,8 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "nvbandwidth (2 nodes, 2 GPUs each)" {
log_objects

kubectl create -f https://github.com/kubeflow/mpi-operator/releases/download/v0.6.0/mpi-operator.yaml || echo "ignore"
kubectl apply -f demo/specs/imex/nvbandwidth-test-job-1.yaml
# The canonical k8s job interface works even for MPIJob (the MPIJob has an
@@ -232,6 +218,8 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "downgrade: current-dev -> last-stable" {
log_objects

# Stage 1: apply workload, but do not delete.
kubectl apply -f demo/specs/imex/channel-injection.yaml
kubectl wait --for=condition=READY pods imex-channel-injection --timeout=60s
@@ -250,8 +238,10 @@ apply_check_delete_workload_imex_chan_inject() {
}

@test "upgrade: wipe-state, install-last-stable, upgrade-to-current-dev" {
log_objects

# Stage 1: clean slate
helm uninstall "${TEST_HELM_RELEASE_NAME}" -n nvidia-dra-driver-gpu
helm uninstall "${TEST_HELM_RELEASE_NAME}" -n nvidia-dra-driver-gpu --wait --timeout=30s
kubectl wait --for=delete pods -A -l app.kubernetes.io/name=nvidia-dra-driver-gpu --timeout=10s
bash tests/bats/clean-state-dirs-all-nodes.sh
kubectl get crd computedomains.resource.nvidia.com
@@ -271,8 +261,7 @@ apply_check_delete_workload_imex_chan_inject() {
kubectl apply -f "${CRD_UPGRADE_URL}"

# Stage 5: install target version (as users would do).
local _iargs=("--set" "featureGates.IMEXDaemonsWithDNSNames=true")
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS

# Stage 6: confirm deleting old workload works (critical, see above).
timeout -v 60 kubectl delete -f demo/specs/imex/channel-injection.yaml