Commit 11014c1

tests: cover basic GPU allocation

Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>

misc fixes

Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>

1 parent fcd74d1, commit 11014c1

File tree

10 files changed: +345 -8 lines changed

tests/bats/Makefile

Lines changed: 1 addition & 0 deletions

@@ -107,6 +107,7 @@ tests: image
 	--timing \
 	--abort \
 	tests/bats/test_basics.bats \
+	tests/bats/test_gpu_basic.bats \
 	tests/bats/test_cd_imex_chan_inject.bats \
 	tests/bats/test_cd_mnnvl_workload.bats \
 	tests/bats/test_cd_misc.bats \
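
The newly wired-in tests/bats/test_gpu_basic.bats is not shown in this excerpt. As a rough sketch only (the spec path and the exact assertions below are assumptions, not the actual file contents), a basic GPU allocation test in that file could look like this:

    # Hypothetical sketch of a test in tests/bats/test_gpu_basic.bats.
    # Assumes the single-pod spec from this commit is available at the
    # (made-up) path tests/bats/specs/gpu-single-pod.yaml.
    @test "basic GPU allocation: single pod, one full GPU" {
        kubectl apply -f tests/bats/specs/gpu-single-pod.yaml
        kubectl wait --for=condition=Ready pod/pod-full-gpu --timeout=120s
        # The pod runs `nvidia-smi -L`, so its log should list at least one GPU.
        run kubectl logs pod-full-gpu
        [ "$status" -eq 0 ]
        [[ "$output" == *"GPU 0"* ]]
        kubectl delete -f tests/bats/specs/gpu-single-pod.yaml --wait=false
    }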

tests/bats/README.md

Lines changed: 6 additions & 1 deletion

@@ -54,7 +54,7 @@ That's CI-oriented.
 We may want to change that.
 
 
-## Development
+## Resources for development
 
 Bats is a workable solution.
 Developing new tests might however probe your patience.
@@ -65,6 +65,11 @@ Make wise usage of
 * [tagging tests with `bats:focus`](https://bats-core.readthedocs.io/en/stable/writing-tests.html#special-tags)
 * [CLI args](https://bats-core.readthedocs.io/en/stable/usage.html) such as `--verbose-run`, `--show-output-of-passing-tests`.
 
+Other references:
+
+* https://github.com/bats-core/bats-assert
+* https://github.com/bats-core/bats-file
+
 Misc notes:
 
 * The test suite stops on first failure (using the [new](https://github.com/bats-core/bats-core/issues/209) `--abort` flag for bats).
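
For illustration of the `bats:focus` mechanism referenced above (a sketch based on the linked bats-core docs, not part of this commit): tagging a single test restricts a run to just that test, which helps while iterating locally.

    # Place this tag comment directly above a test during development;
    # remove it again before committing.
    # bats test_tags=bats:focus
    @test "only this test runs while the focus tag is present" {
        run echo "hello"
        [ "$status" -eq 0 ]
    }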

tests/bats/clean-state-dirs-all-nodes.sh

Lines changed: 3 additions & 4 deletions

@@ -19,11 +19,10 @@ set -o errexit
 set -o nounset
 set -o pipefail
 
-# TODO: think about wiping /var/run/cdi, too
-
+# Note: assume that /var/run on host is a link to /run
 rm_kubelet_plugin_dirs_from_node () {
     local NODE_NAME="$1"
-    echo "Run privileged pod to remove kubelet plugin directories on node ${NODE_NAME}"
+    echo "Run privileged pod to remove /run/cdi/* and kubelet plugin directories on node ${NODE_NAME}"
     kubectl run "privpod-rm-plugindirs-${NODE_NAME}" \
         --rm \
         --image=busybox \
@@ -42,7 +41,7 @@ rm_kubelet_plugin_dirs_from_node () {
             "mountPath": "/host",
             "name": "host-root"
         }],
-        "command": ["/bin/sh", "-c", "rm -rfv /host/var/lib/kubelet/plugins/*"]
+        "command": ["/bin/sh", "-c", "rm -rfv /host/run/cdi/* ; rm -rfv /host/var/lib/kubelet/plugins/*"]
         }],
         "volumes": [{
             "name": "host-root",

tests/bats/cleanup-from-previous-run.sh

Lines changed: 3 additions & 0 deletions

@@ -60,6 +60,7 @@ timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-2.yaml 2> /d
 timeout -v 5 kubectl delete -f tests/bats/specs/nvb2.yaml 2> /dev/null
 timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
 timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
+timeout -v 2 kubectl delete -f demo/specs/imex/simple-mig-test 2> /dev/null
 
 # TODO: maybe more brute-forcing/best-effort: it might make sense to submit all
 # workload in this test suite into a special namespace (not `default`), and to
@@ -86,4 +87,6 @@ timeout -v 10 kubectl delete crds computedomains.resource.nvidia.com || echo "CR
 # cleanup, fail hard if this does not succeed).
 set -e
 bash tests/bats/clean-state-dirs-all-nodes.sh
+
 set +x
+echo "cleanup: done"

tests/bats/helpers.sh

Lines changed: 46 additions & 3 deletions

@@ -52,7 +52,7 @@ iupgrade_wait() {
     --timeout=1m5s \
     --create-namespace \
     --namespace nvidia-dra-driver-gpu \
-    --set resources.gpus.enabled=false \
+    --set gpuResourcesEnabledOverride=true \
     --set nvidiaDriverRoot="${TEST_NVIDIA_DRIVER_ROOT}" "${ADDITIONAL_INSTALL_ARGS[@]}"
 
   # Valueable output to have in the logs in case things went pearshaped.
@@ -139,7 +139,8 @@ show_kubelet_plugin_error_logs() {
     kubectl logs \
       -l nvidia-dra-driver-gpu-component=kubelet-plugin \
       -n nvidia-dra-driver-gpu \
-      --prefix --tail=-1 | grep -E "^(E|W)[0-9]{4}"
+      --all-containers \
+      --prefix --tail=-1 | grep -E "^(E|W)[0-9]{4}" -iE "error"
   ) || true
   echo -e "KUBELET PLUGIN ERROR LOGS END\n\n"
 }
@@ -190,10 +191,52 @@ nvmm() {
     --no-headers -o custom-columns=":metadata.name")
 
   if [ -z "$pod" ]; then
-    echo "No pod found on node $node"
+    echo "get pod -n gpu-operator -l app=nvidia-mig-manager: no pod found on node $node"
     return 1
   fi
 
   echo "Executing on pod $pod (node: $node)..."
   kubectl -n gpu-operator exec -it "$pod" -- "$@"
+}
+
+restart_kubelet_on_node() {
+  local NODEIP="$1"
+  echo "sytemctl restart kubelet.service on ${NODEIP}"
+  # Assume that current user has password-less sudo privileges
+  ssh "${USER}@${NODEIP}" 'sudo systemctl restart kubelet.service'
+}
+
+restart_kubelet_all_nodes() {
+  for nodeip in $(kubectl get nodes -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}'); do
+    restart_kubelet_on_node "$nodeip"
+  done
+  #wait
+  echo "restart kubelets: done"
+}
+
+kplog () {
+  if [[ -z "$1" || -z "$2" ]]; then
+    echo "Usage: kplog [gpus|compute-domains] <node-hint-for-grep> [args]"
+    return 1
+  fi
+  local nodehint="$2"
+  local cont="$1"
+  shift
+  shift # Remove first argument, leaving remaining args in $@
+
+  local node=$(kubectl get nodes | grep "$nodehint" | awk '{print $1}')
+  echo "identified node: $node"
+
+  local pod
+  pod=$(kubectl get pod -n nvidia-dra-driver-gpu -l nvidia-dra-driver-gpu-component=kubelet-plugin \
+    --field-selector spec.nodeName="$node" \
+    --no-headers -o custom-columns=":metadata.name")
+
+  if [ -z "$pod" ]; then
+    echo "get pod -n nvidia-dra-driver-gpu -l nvidia-dra-driver-gpu-component=kubelet-plugin: no pod found on node $node"
+    return 1
+  fi
+
+  echo "Executing on pod $pod (node: $node)..."
+  kubectl logs -n nvidia-dra-driver-gpu "$pod" -c "$cont" "$@"
 }
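
The new `kplog` helper tails the kubelet-plugin container logs for one node, and `restart_kubelet_all_nodes` bounces kubelet across the cluster. A usage sketch, assuming helpers.sh can be sourced into an interactive shell and with a made-up node hint "worker-1":

    # Follow the 'gpus' container logs of the kubelet plugin pod running on
    # the node whose name matches "worker-1"; extra args go to `kubectl logs`.
    source tests/bats/helpers.sh
    kplog gpus worker-1 --tail=100 -f

    # Restart kubelet on every node (requires password-less sudo over SSH,
    # as noted in the helper itself).
    restart_kubelet_all_nodes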
Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
# One pod, two containers.
# Each asking for shared access to a single, full GPU.
apiVersion: resource.k8s.io/v1
kind: ResourceClaimTemplate
metadata:
  name: rct-single-gpu-full
spec:
  spec:
    devices:
      requests:
      - name: gpu
        exactly:
          deviceClassName: gpu.nvidia.com
---
apiVersion: v1
kind: Pod
metadata:
  name: pod1
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr0
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: shared-gpu
  - name: ctr1
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: shared-gpu
  resourceClaims:
  - name: shared-gpu
    resourceClaimTemplateName: rct-single-gpu-full
Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
# Simple GPU sharing scenario:
# Two pods, one container each, each getting access to the same, shared GPU
apiVersion: resource.k8s.io/v1
kind: ResourceClaim
metadata:
  name: rc-single-gpu-full
spec:
  devices:
    requests:
    - name: gpu
      exactly:
        deviceClassName: gpu.nvidia.com
---
apiVersion: v1
kind: Pod
metadata:
  name: pod1
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimName: rc-single-gpu-full
---
apiVersion: v1
kind: Pod
metadata:
  name: pod2
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimName: rc-single-gpu-full
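
To sanity-check this sharing scenario by hand, one can compare the GPU UUIDs that `nvidia-smi -L` prints in both pods; since both pods reference the same ResourceClaim, they should report the same device. A minimal sketch, assuming the spec above has been saved to a path of your choosing (here a made-up gpu-shared.yaml):

    # Apply the two-pod spec, wait for both pods, then confirm that
    # pod1 and pod2 report the same GPU UUID.
    kubectl apply -f gpu-shared.yaml   # hypothetical path for this spec
    kubectl wait --for=condition=Ready pod/pod1 pod/pod2 --timeout=120s
    kubectl logs pod1 | grep -o 'UUID: GPU-[a-f0-9-]*'
    kubectl logs pod2 | grep -o 'UUID: GPU-[a-f0-9-]*'
    # The two UUIDs should be identical in the shared-claim scenario.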
Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
# Two pods, one container each, each container gets 1 distinct full GPU
apiVersion: resource.k8s.io/v1
kind: ResourceClaimTemplate
metadata:
  name: rct-single-gpu-full
spec:
  spec:
    devices:
      requests:
      - name: gpu
        exactly:
          deviceClassName: gpu.nvidia.com
---
apiVersion: v1
kind: Pod
metadata:
  name: pod1
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimTemplateName: rct-single-gpu-full
---
apiVersion: v1
kind: Pod
metadata:
  name: pod2
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimTemplateName: rct-single-gpu-full
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
# Scenario: single pod, requesting any full GPU
apiVersion: resource.k8s.io/v1
kind: ResourceClaimTemplate
metadata:
  name: rct-single-gpu-full
spec:
  spec:
    devices:
      requests:
      - name: gpu
        exactly:
          deviceClassName: gpu.nvidia.com
---
apiVersion: v1
kind: Pod
metadata:
  name: pod-full-gpu
  labels:
    env: batssuite
spec:
  containers:
  - name: ctr
    image: ubuntu:24.04
    command: ["bash", "-c"]
    args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
    resources:
      claims:
      - name: gpu
  resourceClaims:
  - name: gpu
    resourceClaimTemplateName: rct-single-gpu-full
