Skip to content

Commit 7c0b313

Browse files
committed
tests: split tests.bats into modules
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent 65cd2c5 commit 7c0b313

File tree

9 files changed

+440
-354
lines changed

9 files changed

+440
-354
lines changed

tests/bats/Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,5 +105,10 @@ tests: image
105105
--no-tempdir-cleanup \
106106
--timing \
107107
--abort \
108+
tests/bats/test_basics.bats \
108109
tests/bats/tests.bats \
110+
tests/bats/test_cd_imex_chan_inject.bats \
111+
tests/bats/test_cd_mnnvl_workload.bats \
112+
tests/bats/test_cd_logging.bats \
113+
tests/bats/test_cd_updowngrade.bats \
109114
"

tests/bats/helpers.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,18 @@ iupgrade_wait() {
8484
# maybe: check version on labels (to confirm that we set labels correctly)
8585
}
8686

87+
88+
log_objects() {
  # Debugging aid: print the current state of the objects these tests deal
  # with, so that the output is at hand in case a test fails. Every query is
  # best-effort (`|| true`); a failing `kubectl get` must never fail the
  # caller.
  #
  # Open question: could this be called from setup()? That is only useful if
  # the output of a *succeeding* setup() is shown when the test itself fails.
  local kind
  for kind in resourceclaims computedomain; do
    kubectl get "${kind}" || true
  done

  # Pods: first without a namespace flag, then in the driver's namespace.
  local ns
  for ns in "" nvidia-dra-driver-gpu; do
    kubectl get pods -o wide ${ns:+-n "${ns}"} || true
  done
}
97+
98+
8799
# Events accumulate over time, so for certainty it's best to use a unique pod
88100
# name. Right now, this inspects an entire line which includes REASON, MESSAGE,
89101
# and OBJECT, so choose the needle (grepped for) precisely enough.
@@ -147,3 +159,16 @@ get_current_controller_pod_name() {
147159
| grep -iv 'terminating' \
148160
| awk '{print $1}'
149161
}
162+
163+
164+
apply_check_delete_workload_imex_chan_inject() {
  # Full lifecycle of the single-channel IMEX injection demo workload:
  # apply the spec, wait for the pod to become READY, check that its log
  # mentions "channel0", then delete the workload again.
  local _spec_path="demo/specs/imex/channel-injection.yaml"
  local _pod_name="imex-channel-injection"

  kubectl apply -f "${_spec_path}"
  kubectl wait --for=condition=READY pods "${_pod_name}" --timeout=100s

  run kubectl logs "${_pod_name}"
  assert_output --partial "channel0"

  # Block until the pod is really gone. This is critical before moving on to
  # the next test (as long as state is not wiped entirely between tests).
  kubectl delete -f "${_spec_path}"
  kubectl wait --for=delete pods "${_pod_name}" --timeout=10s
}

tests/bats/setup_suite.bash

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
# https://bats-core.readthedocs.io/en/latest/writing-tests.html#setup-and-teardown-pre-and-post-test-hooks
2020

2121
# Validate that some prerequisites are met, and inspect environment for
22-
# characteristics, such as DRA API group version. A failing setup_suit()
22+
# characteristics, such as DRA API group version. A failing setup_suite()
2323
# function aborts the suite (fail fast).
24-
setup_suite () {
24+
validate_prerequisites() {
2525
# Probe: kubectl configured against a k8s cluster.
2626
kubectl cluster-info | grep "control plane is running at"
2727

@@ -47,4 +47,27 @@ setup_suite () {
4747

4848
# Examples: v1, or v1beta1
4949
export TEST_K8S_RESOURCE_API_VERSION
50-
}
50+
}
51+
52+
53+
setup_suite () {
  # Suite-level hook. Checks prerequisites first, then exports the environment
  # shared by the test files (CRD upgrade URL, helm cache/config locations) and
  # registers the NVIDIA helm repository.
  validate_prerequisites

  # Pieces of the CRD upgrade URL; the git ref is supplied by the environment
  # via TEST_CRD_UPGRADE_TARGET_GIT_REF.
  export CRD_URL_PFX="https://raw.githubusercontent.com/NVIDIA/k8s-dra-driver-gpu/"
  export CRD_URL_SFX="/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml"
  export CRD_UPGRADE_URL="${CRD_URL_PFX}${TEST_CRD_UPGRADE_TARGET_GIT_REF}${CRD_URL_SFX}"

  # Point `helm` at a freshly created repo cache dir; otherwise it fails with
  # `Error: INSTALLATION FAILED: mkdir /.cache: permission denied`.
  local helm_cache_dir
  helm_cache_dir=$(mktemp -d -t helm-XXXXX)
  export HELM_REPOSITORY_CACHE="${helm_cache_dir}"

  # Consumed by the helm CLI.
  export HELM_REPOSITORY_CONFIG="${HELM_REPOSITORY_CACHE}/repo.cfg"

  # Prepare for installing releases from NGC (this merely mutates local
  # filesystem state).
  helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
    && helm repo update
}
72+
73+

tests/bats/test_basics.bats

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# shellcheck disable=SC2148
2+
# shellcheck disable=SC2329
3+
4+
setup() {
5+
load 'helpers.sh'
6+
_common_setup
7+
}
8+
9+
10+
# A test that covers local dev tooling; we don't want to
11+
# unintentionally change/break these targets.
12+
@test "test VERSION_W_COMMIT, VERSION_GHCR_CHART, VERSION" {
13+
run make print-VERSION
14+
assert_output --regexp '^v[0-9]+\.[0-9]+\.[0-9]+-dev$'
15+
run make print-VERSION_W_COMMIT
16+
assert_output --regexp '^v[0-9]+\.[0-9]+\.[0-9]+-dev-[0-9a-f]{8}$'
17+
run make print-VERSION_GHCR_CHART
18+
assert_output --regexp '^[0-9]+\.[0-9]+\.[0-9]+-dev-[0-9a-f]{8}-chart$'
19+
}
20+
21+
22+
@test "confirm no kubelet plugin pods running" {
23+
run kubectl get pods -A -l nvidia-dra-driver-gpu-component=kubelet-plugin
24+
[ "$status" -eq 0 ]
25+
refute_output --partial 'Running'
26+
}
27+
28+
29+
@test "helm-install ${TEST_CHART_REPO}/${TEST_CHART_VERSION}" {
30+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
31+
}
32+
33+
34+
@test "helm list: validate output" {
  # Sanity check: exactly one chart installed. `-e` makes jq's exit status
  # reflect the result of the expression; without it jq prints `false` but
  # still exits 0, and this check could never fail the test.
  helm list -n nvidia-dra-driver-gpu -o json | jq -e 'length == 1'

  # Confirm consistency between the various version-related parameters. Note
  # that the --version arg provided to `helm install/upgrade` does not directly
  # set app_version; it is just a version constraint. `app_version` tested here
  # is AFAIU defined solely by the chart's appVersion YAML spec.
  helm list -n nvidia-dra-driver-gpu -o json | jq '.[].app_version' | grep "${TEST_CHART_VERSION}"
}
44+
45+
46+
@test "get crd computedomains.resource.nvidia.com" {
47+
kubectl get crd computedomains.resource.nvidia.com
48+
}
49+
50+
51+
@test "wait for plugin & controller pods READY" {
52+
kubectl wait --for=condition=READY pods -A \
53+
-l nvidia-dra-driver-gpu-component=kubelet-plugin --timeout=10s
54+
kubectl wait --for=condition=READY pods -A \
55+
-l nvidia-dra-driver-gpu-component=controller --timeout=10s
56+
}
57+
58+
59+
@test "validate CD controller container image spec" {
  # Read the image of the `compute-domain` container from the controller
  # pod(s) and check it against the expected image spec substring.
  local ACTUAL_IMAGE_SPEC
  ACTUAL_IMAGE_SPEC=$(kubectl get pod \
    -n nvidia-dra-driver-gpu \
    -l nvidia-dra-driver-gpu-component=controller \
    -o json | \
    jq -r '.items[].spec.containers[] | select(.name=="compute-domain") | .image')

  # Emit once, unfiltered, for debuggability
  echo "$ACTUAL_IMAGE_SPEC"

  # Confirm substring; TODO: make tighter with precise
  # TEST_EXPECTED_IMAGE_SPEC_SUBSTRING. `-F` compares it as a literal string
  # (an image spec contains dots, which a regex would match against any
  # character); `--` guards against a value starting with a dash.
  echo "$ACTUAL_IMAGE_SPEC" | grep -F -- "${TEST_EXPECTED_IMAGE_SPEC_SUBSTRING}"
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# shellcheck disable=SC2148
2+
# shellcheck disable=SC2329
3+
4+
setup_file() {
5+
load 'helpers.sh'
6+
local _iargs=("--set" "logVerbosity=6")
7+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
8+
}
9+
10+
11+
setup () {
12+
load 'helpers.sh'
13+
_common_setup
14+
log_objects
15+
}
16+
17+
18+
@test "IMEX channel injection (single)" {
19+
log_objects
20+
apply_check_delete_workload_imex_chan_inject
21+
}
22+
23+
24+
@test "IMEX channel injection (all)" {
25+
log_objects
26+
# Example: with TEST_CHART_VERSION="v25.3.2-12390-chart"
27+
# the command below returns 0 (true: the tested version is smaller)
28+
if dpkg --compare-versions "${TEST_CHART_VERSION#v}" lt "25.8.0"; then
29+
skip "tested chart version smaller than 25.8.0"
30+
fi
31+
kubectl apply -f demo/specs/imex/channel-injection-all.yaml
32+
kubectl wait --for=condition=READY pods imex-channel-injection-all --timeout=80s
33+
run kubectl logs imex-channel-injection-all
34+
assert_output --partial "channel2047"
35+
assert_output --partial "channel222"
36+
kubectl delete -f demo/specs/imex/channel-injection-all.yaml
37+
kubectl wait --for=delete pods imex-channel-injection-all --timeout=10s
38+
}

tests/bats/test_cd_logging.bats

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
# shellcheck disable=SC2148
2+
# shellcheck disable=SC2329
3+
4+
#setup_file() {
5+
# load 'helpers.sh'
6+
# local _iargs=("--set" "logVerbosity=6")
7+
# iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
8+
#}
9+
10+
11+
setup () {
12+
load 'helpers.sh'
13+
_common_setup
14+
}
15+
16+
17+
@test "CD controller/plugin: startup config / detail in logs on level 0" {
18+
local _iargs=("--set" "logVerbosity=0")
19+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
20+
21+
run kubectl logs -l nvidia-dra-driver-gpu-component=controller -n nvidia-dra-driver-gpu --tail=-1
22+
assert_output --partial "Verbosity:"
23+
assert_output --partial '"MPSSupport":false'
24+
assert_output --partial 'additionalNamespaces:'
25+
26+
run kubectl logs -l nvidia-dra-driver-gpu-component=kubelet-plugin -n nvidia-dra-driver-gpu --tail=-1
27+
assert_output --partial "Verbosity"
28+
assert_output --partial "nodeName"
29+
assert_output --partial "identified fabric clique"
30+
assert_output --partial "driver version validation"
31+
}
32+
33+
@test "CD controller: test log verbosity levels" {
34+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
35+
36+
# Deploy workload: give the controller something to log about.
37+
log_objects
38+
kubectl apply -f demo/specs/imex/channel-injection.yaml
39+
kubectl wait --for=condition=READY pods imex-channel-injection --timeout=100s
40+
run kubectl logs imex-channel-injection
41+
assert_output --partial "channel0"
42+
43+
echo "test level 0"
44+
local _iargs=("--set" "logVerbosity=0")
45+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
46+
kubectl logs -l nvidia-dra-driver-gpu-component=controller -n nvidia-dra-driver-gpu --tail=-1
47+
run kubectl logs -l nvidia-dra-driver-gpu-component=controller -n nvidia-dra-driver-gpu --tail=-1
48+
assert_output --partial 'controller manager config'
49+
assert_output --partial 'maxNodesPerIMEXDomain'
50+
51+
echo "test level 1"
52+
local _iargs=("--set" "logVerbosity=1")
53+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
54+
run kubectl logs -l nvidia-dra-driver-gpu-component=controller -n nvidia-dra-driver-gpu --tail=-1
55+
refute_output --partial 'Processing added or updated ComputeDomain'
56+
refute_output --partial 'reflector.go'
57+
refute_output --partial 'Caches populated'
58+
refute_output --partial 'round_trippers.go'
59+
refute_output --partial 'Listing and watching'
60+
61+
echo "test level 2"
62+
local _iargs=("--set" "logVerbosity=2")
63+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
64+
run kubectl logs -l nvidia-dra-driver-gpu-component=controller -n nvidia-dra-driver-gpu --tail=-1
65+
assert_output --partial 'Processing added or updated ComputeDomain'
66+
assert_output --partial 'reflector.go'
67+
assert_output --partial 'Caches populated'
68+
refute_output --partial 'Listing and watching'
69+
refute_output --partial 'round_trippers.go'
70+
71+
echo "test level 3"
72+
local _iargs=("--set" "logVerbosity=3")
73+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
74+
run kubectl logs -l nvidia-dra-driver-gpu-component=controller -n nvidia-dra-driver-gpu --tail=-1
75+
assert_output --partial 'reflector.go'
76+
assert_output --partial 'Caches populated'
77+
assert_output --partial 'Listing and watching'
78+
refute_output --partial 'round_trippers.go'
79+
80+
echo "test level 4"
81+
local _iargs=("--set" "logVerbosity=4")
82+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
83+
run kubectl logs -l nvidia-dra-driver-gpu-component=controller -n nvidia-dra-driver-gpu --tail=-1
84+
refute_output --partial 'round_trippers.go'
85+
86+
echo "test level 5"
87+
local _iargs=("--set" "logVerbosity=5")
88+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
89+
run kubectl logs -l nvidia-dra-driver-gpu-component=controller -n nvidia-dra-driver-gpu --tail=-1
90+
refute_output --partial 'round_trippers.go'
91+
92+
echo "test level 6"
93+
local _iargs=("--set" "logVerbosity=6")
94+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
95+
run kubectl logs -l nvidia-dra-driver-gpu-component=controller -n nvidia-dra-driver-gpu --tail=-1
96+
assert_output --partial 'Cleanup: perform for'
97+
assert_output --partial 'round_trippers.go'
98+
assert_output --partial '"Response" verb="GET"'
99+
100+
# Delete workload and hence CD daemon
101+
kubectl delete -f demo/specs/imex/channel-injection.yaml
102+
kubectl wait --for=delete pods imex-channel-injection --timeout=10s
103+
}
104+
105+
@test "CD daemon: test log verbosity levels" {
106+
log_objects
107+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" NOARGS
108+
109+
echo "test level 6"
110+
local _iargs=("--set" "logVerbosity=6")
111+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
112+
113+
kubectl apply -f demo/specs/imex/channel-injection.yaml
114+
kubectl wait --for=condition=READY pods imex-channel-injection --timeout=100s
115+
116+
# Confirm that the CD daemon logs on level six
117+
run get_all_cd_daemon_logs_for_cd_name "imex-channel-injection"
118+
assert_output --partial 'wait for nodes update' # level 1 msg
119+
assert_output --partial 'round_trippers.go' # level 6 msg
120+
121+
# Delete workload and hence CD daemon
122+
kubectl delete -f demo/specs/imex/channel-injection.yaml
123+
kubectl wait --for=delete pods imex-channel-injection --timeout=10s
124+
125+
echo "test level 0"
126+
log_objects
127+
# Reconfigure (only the) log verbosity for CD daemons spawned in the future.
128+
# The deployment mutation via `set env` below is expected to restart the
129+
# controller. Wait for the old controller pod to go away (to be sure that the
130+
# new LOG_VERBOSITY_CD_DAEMON setting applies), and make sure controller
131+
# deployment is still READY before moving on (make sure 1/1 READY).
132+
CPOD_OLD="$(get_current_controller_pod_name)"
133+
kubectl set env deployment nvidia-dra-driver-gpu-controller -n nvidia-dra-driver-gpu LOG_VERBOSITY_CD_DAEMON=0
134+
echo "wait --for=delete: $CPOD_OLD"
135+
kubectl wait --for=delete pods "$CPOD_OLD" -n nvidia-dra-driver-gpu --timeout=10s
136+
echo "returned: wait --for=delete"
137+
CPOD_NEW="$(get_current_controller_pod_name)"
138+
kubectl wait --for=condition=READY pods "$CPOD_NEW" -n nvidia-dra-driver-gpu --timeout=10s
139+
echo "new controller pod is in effect"
140+
141+
# Spawn new workload
142+
kubectl apply -f demo/specs/imex/channel-injection.yaml
143+
kubectl wait --for=condition=READY pods imex-channel-injection --timeout=100s
144+
145+
# Confirm that the CD daemon now does not contain a level 1 msg
146+
run get_all_cd_daemon_logs_for_cd_name "imex-channel-injection"
147+
refute_output --partial 'wait for nodes update' # expected level 1 msg
148+
149+
# Delete workload
150+
kubectl delete -f demo/specs/imex/channel-injection.yaml
151+
kubectl wait --for=delete pods imex-channel-injection --timeout=10s
152+
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# shellcheck disable=SC2148
2+
# shellcheck disable=SC2329
3+
4+
setup_file() {
5+
load 'helpers.sh'
6+
local _iargs=("--set" "logVerbosity=6")
7+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
8+
}
9+
10+
11+
setup () {
12+
load 'helpers.sh'
13+
_common_setup
14+
log_objects
15+
}
16+
17+
18+
@test "nickelpie (NCCL send/recv/broadcast, 2 pods, 2 nodes, small payload)" {
19+
# Do not run in checkout dir (to not pollute that).
20+
cd "${BATS_TEST_TMPDIR}"
21+
git clone https://github.com/jgehrcke/jpsnips-nv
22+
cd jpsnips-nv && git checkout fb46298fc7aa5fc1322b4672e8847da5321baeb7
23+
cd nickelpie/one-pod-per-node/
24+
bash teardown-start-evaluate-npie-job.sh --gb-per-benchmark 5 --matrix-scale 2 --n-ranks 2
25+
run kubectl logs --prefix -l job-name=nickelpie-test --tail=-1
26+
kubectl wait --for=condition=complete --timeout=60s job/nickelpie-test
27+
kubectl delete -f npie-job.yaml.rendered
28+
kubectl wait --for=delete --timeout=60s job/nickelpie-test
29+
echo "${output}" | grep -E '^.*broadcast-.*RESULT bandwidth: [0-9]+\.[0-9]+ GB/s.*$'
30+
}
31+
32+
33+
@test "nvbandwidth (2 nodes, 2 GPUs each)" {
34+
kubectl create -f https://github.com/kubeflow/mpi-operator/releases/download/v0.6.0/mpi-operator.yaml || echo "ignore"
35+
kubectl apply -f demo/specs/imex/nvbandwidth-test-job-1.yaml
36+
# The canonical k8s job interface works even for MPIJob (the MPIJob has an
37+
# underlying k8s job).
38+
kubectl wait --for=create job/nvbandwidth-test-1-launcher --timeout=20s
39+
kubectl wait --for=condition=complete job/nvbandwidth-test-1-launcher --timeout=60s
40+
run kubectl logs --tail=-1 --prefix -l job-name=nvbandwidth-test-1-launcher
41+
kubectl delete -f demo/specs/imex/nvbandwidth-test-job-1.yaml
42+
echo "${output}" | grep -E '^.*SUM multinode_device_to_device_memcpy_read_ce [0-9]+\.[0-9]+.*$'
43+
}

0 commit comments

Comments
 (0)