Skip to content

Commit 59d775b

Browse files
authored
Merge pull request #709 from jgehrcke/jp/basic-gpu-tests
tests: cover basic GPU allocation, misc improvements
2 parents 852b56f + 1e79179 commit 59d775b

File tree

10 files changed

+375
-8
lines changed

10 files changed

+375
-8
lines changed

tests/bats/Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ image:
7777
# cmdline args).
7878
.PHONY: tests
7979
tests: image
80-
export _RUNDIR_PARENT=/tmp/k8s-dra-driver-gpu-tests-out && \
80+
export _RUNDIR_PARENT=/tmp/k8s-dra-driver-gpu-tests-out-$${USER} && \
8181
mkdir -p "$${_RUNDIR_PARENT}" && \
8282
export _RUNDIR=$$(mktemp -p $${_RUNDIR_PARENT} -d -t bats-tests-$$(date +%s)-XXXXX) && \
8383
docker run \
@@ -100,12 +100,14 @@ tests: image
100100
-c "set -ex; cd /cwd; \
101101
echo 'Running k8s cluster cleanup (invasive)... '; \
102102
bash tests/bats/cleanup-from-previous-run.sh 2>&1 | tee -a $${_RUNDIR}/cleanup.outerr; \
103+
set +x; echo '--- STARTING TEST SUITE ---'; set -x; \
103104
TMPDIR=$${_RUNDIR} bats \
104105
--print-output-on-failure \
105106
--no-tempdir-cleanup \
106107
--timing \
107108
--abort \
108109
tests/bats/test_basics.bats \
110+
tests/bats/test_gpu_basic.bats \
109111
tests/bats/test_cd_imex_chan_inject.bats \
110112
tests/bats/test_cd_mnnvl_workload.bats \
111113
tests/bats/test_cd_misc.bats \

tests/bats/README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ That's CI-oriented.
5454
We may want to change that.
5555

5656

57-
## Development
57+
## Resources for development
5858

5959
Bats is a workable solution.
6060
Developing new tests might however probe your patience.
@@ -65,6 +65,11 @@ Make wise usage of
6565
* [tagging tests with `bats:focus`](https://bats-core.readthedocs.io/en/stable/writing-tests.html#special-tags)
6666
* [CLI args](https://bats-core.readthedocs.io/en/stable/usage.html) such as `--verbose-run`, `--show-output-of-passing-tests`.
6767

68+
Other references:
69+
* https://github.com/bats-core/bats-assert
70+
* https://github.com/bats-core/bats-file
71+
72+
6873
Misc notes:
6974

7075
* The test suite stops on first failure (using the [new](https://github.com/bats-core/bats-core/issues/209) `--abort` flag for bats).

tests/bats/clean-state-dirs-all-nodes.sh

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ set -o pipefail
2121

2222
rm_kubelet_plugin_dirs_from_node () {
2323
local NODE_NAME="$1"
24-
echo "Run privileged pod to remove kubelet plugin directories on node ${NODE_NAME}"
25-
kubectl run privpod-rm-plugindirs \
24+
echo "Run privileged pod to remove /run/cdi/* and kubelet plugin directories on node ${NODE_NAME}"
25+
kubectl run "privpod-rm-plugindirs-${NODE_NAME}" \
2626
--rm \
2727
--image=busybox \
2828
--attach \
@@ -32,7 +32,8 @@ rm_kubelet_plugin_dirs_from_node () {
3232
"spec": {
3333
"nodeName": "'"${NODE_NAME}"'",
3434
"containers": [{
35-
"name": "privpod-rm-plugindirs",
35+
"name": "privpod-rm-plugindirs-'"${NODE_NAME}"'",
36+
"metadata": {"labels": {"env": "batssuite"}},
3637
"image": "busybox",
3738
"securityContext": { "privileged": true },
3839
"volumeMounts": [{
@@ -52,5 +53,7 @@ rm_kubelet_plugin_dirs_from_node () {
5253
# Would be faster when using a daemonset. However, the output is more readable
5354
# when running sequentially.
5455
for node in $(kubectl get nodes -o jsonpath='{.items[*].metadata.name}'); do
55-
rm_kubelet_plugin_dirs_from_node $node
56+
rm_kubelet_plugin_dirs_from_node "$node" &
5657
done
58+
wait
59+
echo "state dir cleanup: done"

tests/bats/cleanup-from-previous-run.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-2.yaml 2> /d
6060
timeout -v 5 kubectl delete -f tests/bats/specs/nvb2.yaml 2> /dev/null
6161
timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
6262
timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
63+
timeout -v 2 kubectl delete -f demo/specs/imex/simple-mig-test 2> /dev/null
6364

6465
# TODO: maybe more brute-forcing/best-effort: it might make sense to submit all
6566
# workload in this test suite into a special namespace (not `default`), and to
@@ -86,4 +87,6 @@ timeout -v 10 kubectl delete crds computedomains.resource.nvidia.com || echo "CR
8687
# cleanup, fail hard if this does not succeed).
8788
set -e
8889
bash tests/bats/clean-state-dirs-all-nodes.sh
90+
8991
set +x
92+
echo "cleanup: done"

tests/bats/helpers.sh

Lines changed: 70 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ iupgrade_wait() {
5252
--timeout=1m5s \
5353
--create-namespace \
5454
--namespace nvidia-dra-driver-gpu \
55-
--set resources.gpus.enabled=false \
55+
--set gpuResourcesEnabledOverride=true \
5656
--set nvidiaDriverRoot="${TEST_NVIDIA_DRIVER_ROOT}" "${ADDITIONAL_INSTALL_ARGS[@]}"
5757

5858
# Valuable output to have in the logs in case things went pear-shaped.
@@ -139,7 +139,8 @@ show_kubelet_plugin_error_logs() {
139139
kubectl logs \
140140
-l nvidia-dra-driver-gpu-component=kubelet-plugin \
141141
-n nvidia-dra-driver-gpu \
142-
--prefix --tail=-1 | grep -E "^(E|W)[0-9]{4}"
142+
--all-containers \
143+
--prefix --tail=-1 | grep -iE "^(E|W)[0-9]{4}|error"
143144
) || true
144145
echo -e "KUBELET PLUGIN ERROR LOGS END\n\n"
145146
}
@@ -172,3 +173,70 @@ apply_check_delete_workload_imex_chan_inject() {
172173
kubectl delete -f demo/specs/imex/channel-injection.yaml
173174
kubectl wait --for=delete pods imex-channel-injection --timeout=10s
174175
}
176+
177+
178+
# Run a command inside the nvidia-mig-manager pod on a given node (that pod
# runs with high privileges). Example: `nvmm gb-nvl-027-compute06 nvidia-smi`
# Returns non-zero if no matching pod is found on the node.
nvmm() {
  if [ -z "$1" ]; then
    # Fixed: usage previously advertised the wrong name ("nvcnt").
    echo "Usage: nvmm <node-name> [command...]"
    return 1
  fi
  local node="$1"
  shift # Remove first argument, leaving remaining args in $@

  # Look up the mig-manager pod scheduled on this specific node.
  local pod
  pod=$(kubectl get pod -n gpu-operator -l app=nvidia-mig-manager \
    --field-selector spec.nodeName="$node" \
    --no-headers -o custom-columns=":metadata.name")

  if [ -z "$pod" ]; then
    echo "get pod -n gpu-operator -l app=nvidia-mig-manager: no pod found on node $node"
    return 1
  fi

  echo "Executing on pod $pod (node: $node)..."
  kubectl -n gpu-operator exec -it "$pod" -- "$@"
}
201+
202+
# Restart the kubelet systemd unit on one node, addressed by IP, via SSH.
restart_kubelet_on_node() {
  local NODEIP="$1"
  # Fixed typo in log message: "sytemctl" -> "systemctl".
  echo "systemctl restart kubelet.service on ${NODEIP}"
  # Assume that current user has password-less sudo privileges
  ssh "${USER}@${NODEIP}" 'sudo systemctl restart kubelet.service'
}
208+
209+
# Restart the kubelet on every cluster node, one node at a time (sequential
# on purpose: interleaved SSH output would be hard to read).
restart_kubelet_all_nodes() {
  local jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}'
  local ip
  for ip in $(kubectl get nodes -o jsonpath="$jsonpath"); do
    restart_kubelet_on_node "$ip"
  done
  echo "restart kubelets: done"
}
216+
217+
# Print kubelet-plugin container logs for one node.
#
# $1: container name ("gpus" or "compute-domains")
# $2: substring used to grep the node name from `kubectl get nodes`
# remaining args: forwarded to `kubectl logs`
# Returns non-zero on bad usage or when no plugin pod is found on the node.
kplog () {
  if [[ -z "$1" || -z "$2" ]]; then
    echo "Usage: kplog [gpus|compute-domains] <node-hint-for-grep> [args]"
    return 1
  fi
  local cont="$1"
  local nodehint="$2"
  shift 2 # Remove both consumed arguments, leaving remaining args in $@

  # Resolve the hint to a full node name (first grep match wins).
  # Split declaration and assignment so the command's exit status
  # is not masked by `local` (ShellCheck SC2155).
  local node
  node=$(kubectl get nodes | grep "$nodehint" | awk '{print $1}')
  echo "identified node: $node"

  local pod
  pod=$(kubectl get pod -n nvidia-dra-driver-gpu -l nvidia-dra-driver-gpu-component=kubelet-plugin \
    --field-selector spec.nodeName="$node" \
    --no-headers -o custom-columns=":metadata.name")

  if [ -z "$pod" ]; then
    echo " get pod -n nvidia-dra-driver-gpu -l nvidia-dra-driver-gpu-component=kubelet-plugin: no pod found on node $node"
    return 1
  fi

  echo "Executing on pod $pod (node: $node)..."
  kubectl logs -n nvidia-dra-driver-gpu "$pod" -c "$cont" "$@"
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# One pod, two containers.
2+
# Each asking for shared access to a single, full GPU.
3+
apiVersion: resource.k8s.io/v1
4+
kind: ResourceClaimTemplate
5+
metadata:
6+
name: rct-single-gpu-full
7+
spec:
8+
spec:
9+
devices:
10+
requests:
11+
- name: gpu
12+
exactly:
13+
deviceClassName: gpu.nvidia.com
14+
---
15+
apiVersion: v1
16+
kind: Pod
17+
metadata:
18+
name: pod1
19+
labels:
20+
env: batssuite
21+
spec:
22+
containers:
23+
- name: ctr0
24+
image: ubuntu:24.04
25+
command: ["bash", "-c"]
26+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
27+
resources:
28+
claims:
29+
- name: shared-gpu
30+
- name: ctr1
31+
image: ubuntu:24.04
32+
command: ["bash", "-c"]
33+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
34+
resources:
35+
claims:
36+
- name: shared-gpu
37+
resourceClaims:
38+
- name: shared-gpu
39+
resourceClaimTemplateName: rct-single-gpu-full
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Simple GPU sharing scenario:
2+
# Two pods, one container each, each getting access to the same, shared GPU
3+
apiVersion: resource.k8s.io/v1
4+
kind: ResourceClaim
5+
metadata:
6+
name: rc-single-gpu-full
7+
spec:
8+
devices:
9+
requests:
10+
- name: gpu
11+
exactly:
12+
deviceClassName: gpu.nvidia.com
13+
---
14+
apiVersion: v1
15+
kind: Pod
16+
metadata:
17+
name: pod1
18+
labels:
19+
env: batssuite
20+
spec:
21+
containers:
22+
- name: ctr
23+
image: ubuntu:24.04
24+
command: ["bash", "-c"]
25+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
26+
resources:
27+
claims:
28+
- name: gpu
29+
resourceClaims:
30+
- name: gpu
31+
resourceClaimName: rc-single-gpu-full
32+
---
33+
apiVersion: v1
34+
kind: Pod
35+
metadata:
36+
name: pod2
37+
labels:
38+
env: batssuite
39+
spec:
40+
containers:
41+
- name: ctr
42+
image: ubuntu:24.04
43+
command: ["bash", "-c"]
44+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
45+
resources:
46+
claims:
47+
- name: gpu
48+
resourceClaims:
49+
- name: gpu
50+
resourceClaimName: rc-single-gpu-full
51+
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Two pods, one container each, each container gets 1 distinct full GPU
2+
apiVersion: resource.k8s.io/v1
3+
kind: ResourceClaimTemplate
4+
metadata:
5+
name: rct-single-gpu-full
6+
spec:
7+
spec:
8+
devices:
9+
requests:
10+
- name: gpu
11+
exactly:
12+
deviceClassName: gpu.nvidia.com
13+
---
14+
apiVersion: v1
15+
kind: Pod
16+
metadata:
17+
name: pod1
18+
labels:
19+
env: batssuite
20+
spec:
21+
containers:
22+
- name: ctr
23+
image: ubuntu:24.04
24+
command: ["bash", "-c"]
25+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
26+
resources:
27+
claims:
28+
- name: gpu
29+
resourceClaims:
30+
- name: gpu
31+
resourceClaimTemplateName: rct-single-gpu-full
32+
---
33+
apiVersion: v1
34+
kind: Pod
35+
metadata:
36+
name: pod2
37+
labels:
38+
env: batssuite
39+
spec:
40+
containers:
41+
- name: ctr
42+
image: ubuntu:24.04
43+
command: ["bash", "-c"]
44+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
45+
resources:
46+
claims:
47+
- name: gpu
48+
resourceClaims:
49+
- name: gpu
50+
resourceClaimTemplateName: rct-single-gpu-full
51+
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Scenario: single pod, requesting any full GPU
2+
apiVersion: resource.k8s.io/v1
3+
kind: ResourceClaimTemplate
4+
metadata:
5+
name: rct-single-gpu-full
6+
spec:
7+
spec:
8+
devices:
9+
requests:
10+
- name: gpu
11+
exactly:
12+
deviceClassName: gpu.nvidia.com
13+
---
14+
apiVersion: v1
15+
kind: Pod
16+
metadata:
17+
name: pod-full-gpu
18+
labels:
19+
env: batssuite
20+
spec:
21+
containers:
22+
- name: ctr
23+
image: ubuntu:24.04
24+
command: ["bash", "-c"]
25+
args: ["nvidia-smi -L; trap 'exit 0' TERM; sleep 9999 & wait"]
26+
resources:
27+
claims:
28+
- name: gpu
29+
resourceClaims:
30+
- name: gpu
31+
resourceClaimTemplateName: rct-single-gpu-full

0 commit comments

Comments
 (0)