Skip to content

Commit 59d775b

Browse files
authored
Merge pull request #709 from jgehrcke/jp/basic-gpu-tests
tests: cover basic GPU allocation, misc improvements
2 parents 852b56f + 1e79179 commit 59d775b

File tree

10 files changed

+375
-8
lines changed

10 files changed

+375
-8
lines changed

tests/bats/Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ image:
7777
# cmdline args).
7878
.PHONY: tests
7979
tests: image
80-
export _RUNDIR_PARENT=/tmp/k8s-dra-driver-gpu-tests-out && \
80+
export _RUNDIR_PARENT=/tmp/k8s-dra-driver-gpu-tests-out-$${USER} && \
8181
mkdir -p "$${_RUNDIR_PARENT}" && \
8282
export _RUNDIR=$$(mktemp -p $${_RUNDIR_PARENT} -d -t bats-tests-$$(date +%s)-XXXXX) && \
8383
docker run \
@@ -100,12 +100,14 @@ tests: image
100100
-c "set -ex; cd /cwd; \
101101
echo 'Running k8s cluster cleanup (invasive)... '; \
102102
bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
103+
set +x; echo '--- STARTING TEST SUITE ---'; set -x; \
103104
TMPDIR=$${_RUNDIR} bats \
104105
--print-output-on-failure \
105106
--no-tempdir-cleanup \
106107
--timing \
107108
--abort \
108109
tests/bats/test_basics.bats \
110+
tests/bats/test_gpu_basic.bats \
109111
tests/bats/test_cd_imex_chan_inject.bats \
110112
tests/bats/test_cd_mnnvl_workload.bats \
111113
tests/bats/test_cd_misc.bats \

tests/bats/README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ That's CI-oriented.
5454
We may want to change that.
5555

5656

57-
## Development
57+
## Resources for development
5858

5959
Bats is a workable solution.
6060
Developing new tests might however probe your patience.
@@ -65,6 +65,11 @@ Make wise usage of
6565
* [tagging tests with `bats:focus`](https://bats-core.readthedocs.io/en/stable/writing-tests.html#special-tags)
6666
* [CLI args](https://bats-core.readthedocs.io/en/stable/usage.html) such as `--verbose-run`, `--show-output-of-passing-tests`.
6767

68+
Other references:
69+
* https://github.com/bats-core/bats-assert
70+
* https://github.com/bats-core/bats-file
71+
72+
6873
Misc notes:
6974

7075
* The test suite stops on first failure (using the [new](https://github.com/bats-core/bats-core/issues/209) `--abort` flag for bats).

tests/bats/clean-state-dirs-all-nodes.sh

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ set -o pipefail
2121

2222
rm_kubelet_plugin_dirs_from_node () {
2323
local NODE_NAME="$1"
24-
echo "Run privileged pod to remove kubelet plugin directories on node ${NODE_NAME}"
25-
kubectl run privpod-rm-plugindirs \
24+
echo "Run privileged pod to remove /run/cdi/* and kubelet plugin directories on node ${NODE_NAME}"
25+
kubectl run "privpod-rm-plugindirs-${NODE_NAME}" \
2626
--rm \
2727
--image=busybox \
2828
--attach \
@@ -32,7 +32,8 @@ rm_kubelet_plugin_dirs_from_node () {
3232
"spec": {
3333
"nodeName": "'"${NODE_NAME}"'",
3434
"containers": [{
35-
"name": "privpod-rm-plugindirs",
35+
"name": "privpod-rm-plugindirs-'"${NODE_NAME}"'",
36+
"metadata": {"labels": {"env": "batssuite"}},
3637
"image": "busybox",
3738
"securityContext": { "privileged": true },
3839
"volumeMounts": [{
@@ -52,5 +53,7 @@ rm_kubelet_plugin_dirs_from_node () {
5253
# Would be faster when using a daemonset. However, the output is more readable
5354
# when running sequentially.
5455
for node in $(kubectl get nodes -o jsonpath='{.items[*].metadata.name}'); do
55-
rm_kubelet_plugin_dirs_from_node $node
56+
rm_kubelet_plugin_dirs_from_node "$node" &
5657
done
58+
wait
59+
echo "state dir cleanup: done"

tests/bats/cleanup-from-previous-run.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-2.yaml 2> /d
6060
timeout -v 5 kubectl delete -f tests/bats/specs/nvb2.yaml 2> /dev/null
6161
timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
6262
timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
63+
timeout -v 2 kubectl delete -f demo/specs/imex/simple-mig-test 2> /dev/null
6364

6465
# TODO: maybe more brute-forcing/best-effort: it might make sense to submit all
6566
# workload in this test suite into a special namespace (not `default`), and to
@@ -86,4 +87,6 @@ timeout -v 10 kubectl delete crds computedomains.resource.nvidia.com || echo "CR
8687
# cleanup, fail hard if this does not succeed).
8788
set -e
8889
bash tests/bats/clean-state-dirs-all-nodes.sh
90+
8991
set +x
92+
echo "cleanup: done"

tests/bats/helpers.sh

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ iupgrade_wait() {
5252
--timeout=1m5s \
5353
--create-namespace \
5454
--namespace nvidia-dra-driver-gpu \
55-
--set resources.gpus.enabled=false \
55+
--set gpuResourcesEnabledOverride=true \
5656
--set nvidiaDriverRoot="${TEST_NVIDIA_DRIVER_ROOT}" "${ADDITIONAL_INSTALL_ARGS[@]}"
5757

5858
# Valuable output to have in the logs in case things went pear-shaped.
@@ -139,7 +139,8 @@ show_kubelet_plugin_error_logs() {
139139
kubectl logs \
140140
-l nvidia-dra-driver-gpu-component=kubelet-plugin \
141141
-n nvidia-dra-driver-gpu \
142-
--prefix --tail=-1 | grep -E "^(E|W)[0-9]{4}"
142+
--all-containers \
143+
--prefix --tail=-1 | grep -iE "^(E|W)[0-9]{4}|error"
143144
) || true
144145
echo -e "KUBELET PLUGIN ERROR LOGS END\n\n"
145146
}
@@ -172,3 +173,70 @@ apply_check_delete_workload_imex_chan_inject() {
172173
kubectl delete -f demo/specs/imex/channel-injection.yaml
173174
kubectl wait --for=delete pods imex-channel-injection --timeout=10s
174175
}
176+
177+
178+
# Run a command inside the nvidia-mig-manager pod on a given node (that pod
# runs with high privileges). Example: `nvmm gb-nvl-027-compute06 nvidia-smi`
# Returns non-zero if no matching pod is found on the node.
nvmm() {
  if [ -z "$1" ]; then
    # Fixed: usage previously advertised the wrong name ("nvcnt").
    echo "Usage: nvmm <node-name> [command...]"
    return 1
  fi
  local node="$1"
  shift # Remove first argument, leaving remaining args in $@

  # Look up the mig-manager pod scheduled on this specific node.
  local pod
  pod=$(kubectl get pod -n gpu-operator -l app=nvidia-mig-manager \
    --field-selector spec.nodeName="$node" \
    --no-headers -o custom-columns=":metadata.name")

  if [ -z "$pod" ]; then
    echo "get pod -n gpu-operator -l app=nvidia-mig-manager: no pod found on node $node"
    return 1
  fi

  echo "Executing on pod $pod (node: $node)..."
  kubectl -n gpu-operator exec -it "$pod" -- "$@"
}
201+
202+
# Restart the kubelet systemd unit on one node, addressed by IP, via SSH.
restart_kubelet_on_node() {
  local NODEIP="$1"
  # Fixed typo in log message: "sytemctl" -> "systemctl".
  echo "systemctl restart kubelet.service on ${NODEIP}"
  # Assume that current user has password-less sudo privileges
  ssh "${USER}@${NODEIP}" 'sudo systemctl restart kubelet.service'
}
208+
209+
# Restart the kubelet on every cluster node, one node at a time (sequential
# on purpose: interleaved SSH output would be hard to read).
restart_kubelet_all_nodes() {
  local jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}'
  local ip
  for ip in $(kubectl get nodes -o jsonpath="$jsonpath"); do
    restart_kubelet_on_node "$ip"
  done
  echo "restart kubelets: done"
}
216+
217+
# Print kubelet-plugin container logs for one node.
#
# $1: container name ("gpus" or "compute-domains")
# $2: substring used to grep the node name from `kubectl get nodes`
# remaining args: forwarded to `kubectl logs`
# Returns non-zero on bad usage or when no plugin pod is found on the node.
kplog () {
  if [[ -z "$1" || -z "$2" ]]; then
    echo "Usage: kplog [gpus|compute-domains] <node-hint-for-grep> [args]"
    return 1
  fi
  local cont="$1"
  local nodehint="$2"
  shift 2 # Remove both consumed arguments, leaving remaining args in $@

  # Resolve the hint to a full node name (first grep match wins).
  # Split declaration and assignment so the command's exit status
  # is not masked by `local` (ShellCheck SC2155).
  local node
  node=$(kubectl get nodes | grep "$nodehint" | awk '{print $1}')
  echo "identified node: $node"

  local pod
  pod=$(kubectl get pod -n nvidia-dra-driver-gpu -l nvidia-dra-driver-gpu-component=kubelet-plugin \
    --field-selector spec.nodeName="$node" \
    --no-headers -o custom-columns=":metadata.name")

  if [ -z "$pod" ]; then
    echo " get pod -n nvidia-dra-driver-gpu -l nvidia-dra-driver-gpu-component=kubelet-plugin: no pod found on node $node"
    return 1
  fi

  echo "Executing on pod $pod (node: $node)..."
  kubectl logs -n nvidia-dra-driver-gpu "$pod" -c "$cont" "$@"
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# One pod, two containers.
2+
# Each asking for shared access to a single, full GPU.
3+
apiVersion: resource.k8s.io/v1
4+
kind: ResourceClaimTemplate
5+
metadata:
6+
name: rct-single-gpu-full
7+
spec:
8+
spec:
9+
devices:
10+
requests:
11+
- name: gpu
12+
exactly:
13+
deviceClassName: gpu.nvidia.com
14+
---
15+
apiVersion: v1
16+
kind: Pod
17+
metadata:
18+
name: pod1
19+
labels:
20+
env: batssuite
21+
spec:
22+
containers:
23+
- name: ctr0
24+
image: ubuntu:24.04
25+
command: ["bash", "-c"]
26+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
27+
resources:
28+
claims:
29+
- name: shared-gpu
30+
- name: ctr1
31+
image: ubuntu:24.04
32+
command: ["bash", "-c"]
33+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
34+
resources:
35+
claims:
36+
- name: shared-gpu
37+
resourceClaims:
38+
- name: shared-gpu
39+
resourceClaimTemplateName: rct-single-gpu-full
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Simple GPU sharing scenario:
2+
# Two pods, one container each, each getting access to the same, shared GPU
3+
apiVersion: resource.k8s.io/v1
4+
kind: ResourceClaim
5+
metadata:
6+
name: rc-single-gpu-full
7+
spec:
8+
devices:
9+
requests:
10+
- name: gpu
11+
exactly:
12+
deviceClassName: gpu.nvidia.com
13+
---
14+
apiVersion: v1
15+
kind: Pod
16+
metadata:
17+
name: pod1
18+
labels:
19+
env: batssuite
20+
spec:
21+
containers:
22+
- name: ctr
23+
image: ubuntu:24.04
24+
command: ["bash", "-c"]
25+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
26+
resources:
27+
claims:
28+
- name: gpu
29+
resourceClaims:
30+
- name: gpu
31+
resourceClaimName: rc-single-gpu-full
32+
---
33+
apiVersion: v1
34+
kind: Pod
35+
metadata:
36+
name: pod2
37+
labels:
38+
env: batssuite
39+
spec:
40+
containers:
41+
- name: ctr
42+
image: ubuntu:24.04
43+
command: ["bash", "-c"]
44+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
45+
resources:
46+
claims:
47+
- name: gpu
48+
resourceClaims:
49+
- name: gpu
50+
resourceClaimName: rc-single-gpu-full
51+
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Two pods, one container each, each container gets 1 distinct full GPU
2+
apiVersion: resource.k8s.io/v1
3+
kind: ResourceClaimTemplate
4+
metadata:
5+
name: rct-single-gpu-full
6+
spec:
7+
spec:
8+
devices:
9+
requests:
10+
- name: gpu
11+
exactly:
12+
deviceClassName: gpu.nvidia.com
13+
---
14+
apiVersion: v1
15+
kind: Pod
16+
metadata:
17+
name: pod1
18+
labels:
19+
env: batssuite
20+
spec:
21+
containers:
22+
- name: ctr
23+
image: ubuntu:24.04
24+
command: ["bash", "-c"]
25+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
26+
resources:
27+
claims:
28+
- name: gpu
29+
resourceClaims:
30+
- name: gpu
31+
resourceClaimTemplateName: rct-single-gpu-full
32+
---
33+
apiVersion: v1
34+
kind: Pod
35+
metadata:
36+
name: pod2
37+
labels:
38+
env: batssuite
39+
spec:
40+
containers:
41+
- name: ctr
42+
image: ubuntu:24.04
43+
command: ["bash", "-c"]
44+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
45+
resources:
46+
claims:
47+
- name: gpu
48+
resourceClaims:
49+
- name: gpu
50+
resourceClaimTemplateName: rct-single-gpu-full
51+
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Scenario: single pod, requesting any full GPU
2+
apiVersion: resource.k8s.io/v1
3+
kind: ResourceClaimTemplate
4+
metadata:
5+
name: rct-single-gpu-full
6+
spec:
7+
spec:
8+
devices:
9+
requests:
10+
- name: gpu
11+
exactly:
12+
deviceClassName: gpu.nvidia.com
13+
---
14+
apiVersion: v1
15+
kind: Pod
16+
metadata:
17+
name: pod-full-gpu
18+
labels:
19+
env: batssuite
20+
spec:
21+
containers:
22+
- name: ctr
23+
image: ubuntu:24.04
24+
command: ["bash", "-c"]
25+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
26+
resources:
27+
claims:
28+
- name: gpu
29+
resourceClaims:
30+
- name: gpu
31+
resourceClaimTemplateName: rct-single-gpu-full

0 commit comments

Comments
 (0)