Commit 11014c1

tests: cover basic GPU allocation

Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>

misc fixes

Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>

1 parent fcd74d1, commit 11014c1

File tree

10 files changed: +345 -8 lines changed

tests/bats/Makefile

Lines changed: 1 addition & 0 deletions

@@ -107,6 +107,7 @@ tests: image
 	--timing \
 	--abort \
 	tests/bats/test_basics.bats \
+	tests/bats/test_gpu_basic.bats \
 	tests/bats/test_cd_imex_chan_inject.bats \
 	tests/bats/test_cd_mnnvl_workload.bats \
 	tests/bats/test_cd_misc.bats \
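
The newly wired-in tests/bats/test_gpu_basic.bats is not shown in this excerpt. As a rough sketch only (the spec path and the exact assertions below are assumptions, not the actual file contents), a basic GPU allocation test in that file could look like this:

    # Hypothetical sketch of a test in tests/bats/test_gpu_basic.bats.
    # Assumes the single-pod spec from this commit is available at the
    # (made-up) path tests/bats/specs/gpu-single-pod.yaml.
    @test "basic GPU allocation: single pod, one full GPU" {
        kubectl apply -f tests/bats/specs/gpu-single-pod.yaml
        kubectl wait --for=condition=Ready pod/pod-full-gpu --timeout=120s
        # The pod runs `nvidia-smi -L`, so its log should list at least one GPU.
        run kubectl logs pod-full-gpu
        [ "$status" -eq 0 ]
        [[ "$output" == *"GPU 0"* ]]
        kubectl delete -f tests/bats/specs/gpu-single-pod.yaml --wait=false
    }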

tests/bats/README.md

Lines changed: 6 additions & 1 deletion

@@ -54,7 +54,7 @@ That's CI-oriented.
 We may want to change that.
 
 
-## Development
+## Resources for development
 
 Bats is a workable solution.
 Developing new tests might however probe your patience.
@@ -65,6 +65,11 @@ Make wise usage of
 * [tagging tests with `bats:focus`](https://bats-core.readthedocs.io/en/stable/writing-tests.html#special-tags)
 * [CLI args](https://bats-core.readthedocs.io/en/stable/usage.html) such as `--verbose-run`, `--show-output-of-passing-tests`.
 
+Other references:
+
+* https://github.com/bats-core/bats-assert
+* https://github.com/bats-core/bats-file
+
 Misc notes:
 
 * The test suite stops on first failure (using the [new](https://github.com/bats-core/bats-core/issues/209) `--abort` flag for bats).
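
For illustration of the `bats:focus` mechanism referenced above (a sketch based on the linked bats-core docs, not part of this commit): tagging a single test restricts a run to just that test, which helps while iterating locally.

    # Place this tag comment directly above a test during development;
    # remove it again before committing.
    # bats test_tags=bats:focus
    @test "only this test runs while the focus tag is present" {
        run echo "hello"
        [ "$status" -eq 0 ]
    }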

tests/bats/clean-state-dirs-all-nodes.sh

Lines changed: 3 additions & 4 deletions

@@ -19,11 +19,10 @@ set -o errexit
 set -o nounset
 set -o pipefail
 
-# TODO: think about wiping /var/run/cdi, too
-
+# Note: assume that /var/run on host is a link to /run
 rm_kubelet_plugin_dirs_from_node () {
     local NODE_NAME="$1"
-    echo "Run privileged pod to remove kubelet plugin directories on node ${NODE_NAME}"
+    echo "Run privileged pod to remove /run/cdi/* and kubelet plugin directories on node ${NODE_NAME}"
     kubectl run "privpod-rm-plugindirs-${NODE_NAME}" \
         --rm \
         --image=busybox \
@@ -42,7 +41,7 @@ rm_kubelet_plugin_dirs_from_node () {
             "mountPath": "/host",
             "name": "host-root"
         }],
-        "command": ["/bin/sh", "-c", "rm -rfv /host/var/lib/kubelet/plugins/*"]
+        "command": ["/bin/sh", "-c", "rm -rfv /host/run/cdi/* ; rm -rfv /host/var/lib/kubelet/plugins/*"]
         }],
         "volumes": [{
             "name": "host-root",

tests/bats/cleanup-from-previous-run.sh

Lines changed: 3 additions & 0 deletions

@@ -60,6 +60,7 @@ timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-2.yaml 2> /d
 timeout -v 5 kubectl delete -f tests/bats/specs/nvb2.yaml 2> /dev/null
 timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
 timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
+timeout -v 2 kubectl delete -f demo/specs/imex/simple-mig-test 2> /dev/null
 
 # TODO: maybe more brute-forcing/best-effort: it might make sense to submit all
 # workload in this test suite into a special namespace (not `default`), and to
@@ -86,4 +87,6 @@ timeout -v 10 kubectl delete crds computedomains.resource.nvidia.com || echo "CR
 # cleanup, fail hard if this does not succeed).
 set -e
 bash tests/bats/clean-state-dirs-all-nodes.sh
+
 set +x
+echo "cleanup: done"

tests/bats/helpers.sh

Lines changed: 46 additions & 3 deletions

@@ -52,7 +52,7 @@ iupgrade_wait() {
     --timeout=1m5s \
     --create-namespace \
     --namespace nvidia-dra-driver-gpu \
-    --set resources.gpus.enabled=false \
+    --set gpuResourcesEnabledOverride=true \
     --set nvidiaDriverRoot="${TEST_NVIDIA_DRIVER_ROOT}" "${ADDITIONAL_INSTALL_ARGS[@]}"
 
   # Valueable output to have in the logs in case things went pearshaped.
@@ -139,7 +139,8 @@ show_kubelet_plugin_error_logs() {
     kubectl logs \
       -l nvidia-dra-driver-gpu-component=kubelet-plugin \
       -n nvidia-dra-driver-gpu \
-      --prefix --tail=-1 | grep -E "^(E|W)[0-9]{4}"
+      --all-containers \
+      --prefix --tail=-1 | grep -E "^(E|W)[0-9]{4}" -iE "error"
   ) || true
   echo -e "KUBELET PLUGIN ERROR LOGS END\n\n"
 }
@@ -190,10 +191,52 @@ nvmm() {
     --no-headers -o custom-columns=":metadata.name")
 
   if [ -z "$pod" ]; then
-    echo "No pod found on node $node"
+    echo "get pod -n gpu-operator -l app=nvidia-mig-manager: no pod found on node $node"
     return 1
   fi
 
   echo "Executing on pod $pod (node: $node)..."
   kubectl -n gpu-operator exec -it "$pod" -- "$@"
+}
+
+restart_kubelet_on_node() {
+  local NODEIP="$1"
+  echo "sytemctl restart kubelet.service on ${NODEIP}"
+  # Assume that current user has password-less sudo privileges
+  ssh "${USER}@${NODEIP}" 'sudo systemctl restart kubelet.service'
+}
+
+restart_kubelet_all_nodes() {
+  for nodeip in $(kubectl get nodes -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}'); do
+    restart_kubelet_on_node "$nodeip"
+  done
+  #wait
+  echo "restart kubelets: done"
+}
+
+kplog () {
+  if [[ -z "$1" || -z "$2" ]]; then
+    echo "Usage: kplog [gpus|compute-domains] <node-hint-for-grep> [args]"
+    return 1
+  fi
+  local nodehint="$2"
+  local cont="$1"
+  shift
+  shift # Remove first argument, leaving remaining args in $@
+
+  local node=$(kubectl get nodes | grep "$nodehint" | awk '{print $1}')
+  echo "identified node: $node"
+
+  local pod
+  pod=$(kubectl get pod -n nvidia-dra-driver-gpu -l nvidia-dra-driver-gpu-component=kubelet-plugin \
+    --field-selector spec.nodeName="$node" \
+    --no-headers -o custom-columns=":metadata.name")
+
+  if [ -z "$pod" ]; then
+    echo "get pod -n nvidia-dra-driver-gpu -l nvidia-dra-driver-gpu-component=kubelet-plugin: no pod found on node $node"
+    return 1
+  fi
+
+  echo "Executing on pod $pod (node: $node)..."
+  kubectl logs -n nvidia-dra-driver-gpu "$pod" -c "$cont" "$@"
 }
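
The new `kplog` helper tails the kubelet-plugin container logs for one node, and `restart_kubelet_all_nodes` bounces kubelet across the cluster. A usage sketch, assuming helpers.sh can be sourced into an interactive shell and with a made-up node hint "worker-1":

    # Follow the 'gpus' container logs of the kubelet plugin pod running on
    # the node whose name matches "worker-1"; extra args go to `kubectl logs`.
    source tests/bats/helpers.sh
    kplog gpus worker-1 --tail=100 -f

    # Restart kubelet on every node (requires password-less sudo over SSH,
    # as noted in the helper itself).
    restart_kubelet_all_nodes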
Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
# One pod, two containers.
# Each asking for shared access to a single, full GPU.
apiVersion: resource.k8s.io/v1
kind: ResourceClaimTemplate
metadata:
  name: rct-single-gpu-full
spec:
  spec:
    devices:
      requests:
      - name: gpu
        exactly:
          deviceClassName: gpu.nvidia.com
---
apiVersion: v1
kind: Pod
metadata:
  name: pod1
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr0
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: shared-gpu
  - name: ctr1
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: shared-gpu
  resourceClaims:
  - name: shared-gpu
    resourceClaimTemplateName: rct-single-gpu-full
Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
# Simple GPU sharing scenario:
# Two pods, one container each, each getting access to the same, shared GPU
apiVersion: resource.k8s.io/v1
kind: ResourceClaim
metadata:
  name: rc-single-gpu-full
spec:
  devices:
    requests:
    - name: gpu
      exactly:
        deviceClassName: gpu.nvidia.com
---
apiVersion: v1
kind: Pod
metadata:
  name: pod1
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimName: rc-single-gpu-full
---
apiVersion: v1
kind: Pod
metadata:
  name: pod2
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimName: rc-single-gpu-full
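
To sanity-check this sharing scenario by hand, one can compare the GPU UUIDs that `nvidia-smi -L` prints in both pods; since both pods reference the same ResourceClaim, they should report the same device. A minimal sketch, assuming the spec above has been saved to a path of your choosing (here a made-up gpu-shared.yaml):

    # Apply the two-pod spec, wait for both pods, then confirm that
    # pod1 and pod2 report the same GPU UUID.
    kubectl apply -f gpu-shared.yaml   # hypothetical path for this spec
    kubectl wait --for=condition=Ready pod/pod1 pod/pod2 --timeout=120s
    kubectl logs pod1 | grep -o 'UUID: GPU-[a-f0-9-]*'
    kubectl logs pod2 | grep -o 'UUID: GPU-[a-f0-9-]*'
    # The two UUIDs should be identical in the shared-claim scenario.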
Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
# Two pods, one container each, each container gets 1 distinct full GPU
apiVersion: resource.k8s.io/v1
kind: ResourceClaimTemplate
metadata:
  name: rct-single-gpu-full
spec:
  spec:
    devices:
      requests:
      - name: gpu
        exactly:
          deviceClassName: gpu.nvidia.com
---
apiVersion: v1
kind: Pod
metadata:
  name: pod1
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimTemplateName: rct-single-gpu-full
---
apiVersion: v1
kind: Pod
metadata:
  name: pod2
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimTemplateName: rct-single-gpu-full
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
# Scenario: single pod, requesting any full GPU
apiVersion: resource.k8s.io/v1
kind: ResourceClaimTemplate
metadata:
  name: rct-single-gpu-full
spec:
  spec:
    devices:
      requests:
      - name: gpu
        exactly:
          deviceClassName: gpu.nvidia.com
---
apiVersion: v1
kind: Pod
metadata:
  name: pod-full-gpu
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimTemplateName: rct-single-gpu-full
