#!/bin/bash

set -o nounset
set -o errexit

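# Arguments: (1) path to the Kubernetes spec (YAML) defining the MPI launcher
# job and the ComputeDomain, (2) the fault type to inject (1, 2, or 3; see the
# injection branches below).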
SPECPATH="$1"
FAULT_TYPE="$2"

BASE_NAME="test-failover-job"
CD_NAME="test-failover-cd"
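# These names are assumed to match the job / ComputeDomain metadata names in
# the spec at SPECPATH; worker pods are addressed as ${BASE_NAME}-worker-N
# further below.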

# An external supervisor can inject a run ID (for many-repetition tests); it is
# used mainly in output file names.
RUNID="${RUNID:-no_runid}"

JOB_NAME="${BASE_NAME}-launcher"

# For measuring duration with sub-second precision.
_T0=$(awk '{print $1}' /proc/uptime)

# For measuring duration with O(1 s) precision.
SECONDS=0
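# (SECONDS is a bash builtin: it counts wall-clock seconds since it was last
# assigned, so it can be compared directly against TIMEOUT below.)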

# Common arguments for `kubectl logs`; the shared timestamp format allows a
# proper chronological sort during dedup/post-processing.
KLOGS_ARGS="--tail=-1 --prefix --all-containers --timestamps"

# Wait for the workload to heal after fault injection (i.e., for the MPI
# launcher pod to succeed); otherwise fail the test TIMEOUT seconds after
# startup.
TIMEOUT=300

log_ts_no_newline () {
  echo -n "$(date -u +'%Y-%m-%dT%H:%M:%S.%3NZ')"
}

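# Log a message prefixed with a UTC timestamp and the wall-clock time elapsed
# since script start (sub-second resolution, via /proc/uptime and bc).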
log () {
  _TNOW=$(awk '{print $1}' /proc/uptime)
  _DUR=$(echo "$_TNOW - $_T0" | bc)
  log_ts_no_newline
  printf " [%6.1fs] $1\n" "$_DUR"
}

log "RUNID $RUNID | fault type $FAULT_TYPE | $SPECPATH | $BASE_NAME | $JOB_NAME | $CD_NAME"
log "do: delete -f ${SPECPATH} (and wait)"
kubectl delete -f "${SPECPATH}" --ignore-not-found > /dev/null
kubectl wait --for=delete job/"${JOB_NAME}" --timeout=20s > /dev/null
log "done"
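# Deploy the workload and wait for the launcher job object to exist before
# querying its pods and the ComputeDomain.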

log "do: apply -f ${SPECPATH}"
kubectl apply -f "${SPECPATH}" > /dev/null
log "done"
log "do: wait --for=create"
kubectl wait --for=create job/"${JOB_NAME}" --timeout=40s > /dev/null
log "done"
CDUID=$(kubectl describe computedomains.resource.nvidia.com "${CD_NAME}" | grep UID | awk '{print $2}')
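# Note: `kubectl get computedomains.resource.nvidia.com "${CD_NAME}" \
#   -o jsonpath='{.metadata.uid}'` would be a grep-free way to get the UID.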

log "CD uid: ${CDUID}"
log "resource claims:"
kubectl get resourceclaim
log "workload pods:"
kubectl get pods -o wide

LAUNCHER_LOG_PATH="${RUNID}_launcher_logs.log"
LAUNCHER_ERRORS_LOG_PATH="${RUNID}_launcher_errors.log"
echo "" > "${LAUNCHER_LOG_PATH}"
echo "" > "${LAUNCHER_LOG_PATH}".dup

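# Control-loop state: whether the fault was already injected, whether the
# benchmark's communication phase has started, the last-seen launcher container
# restart count(s), and the last-seen launcher pod phase.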
FAULT_INJECTED=0
NVB_COMMS_STARTED=0
LAST_LAUNCHER_RESTART_OUTPUT=""
STATUS="nil"

while true; do

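  # Track launcher container restart counts (one value per container in the
  # matching pod(s)); log only when the observed value changes.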
  _llro=$(kubectl get pod -l job-name="${JOB_NAME}" -o json | \
    /usr/bin/jq -r '.items[].status.containerStatuses[].restartCount'
  )

  if [[ "$LAST_LAUNCHER_RESTART_OUTPUT" != "$_llro" ]]; then
    log "launcher container restarts seen: $_llro"
    LAST_LAUNCHER_RESTART_OUTPUT="$_llro"
  fi

  # Start log-follower child processes for every newly appearing CD daemon pod
  # (once it is Running). I added this very late in the game because I think
  # we're missing CD daemon logs around container shutdown; I want to be extra
  # sure. Use process substitution (rather than a pipe into `while`) so that
  # the loop runs in the current shell and the backgrounded followers show up
  # in this shell's job table for the `kill $(jobs -p)` cleanup below.
  while read -r pname; do
    _logfname="${RUNID}_cddaemon_follow_${pname}.log"
    if [ -f "$_logfname" ]; then
      continue
    fi
    log "new CD daemon pod: $pname -- follow log, save to ${_logfname}"
    kubectl logs -n nvidia-dra-driver-gpu "$pname" \
      --tail=-1 --timestamps --prefix --all-containers --follow \
      > "${_logfname}" &
    # Note: if we lose track of the log followers spawned, we can and should
    # terminate them all with `kill $(jobs -p)`.
  done < <(kubectl get pods -n nvidia-dra-driver-gpu | grep "${CD_NAME}" | grep Running | awk '{print $1}')

  # Note that the launcher _pod_ is not expected to restart. The container in
  # the pod may restart several times in the context of this failover.
  # `kubectl logs --follow` does not automatically follow container restarts.
  # To catch all container instances despite quick restarts, we frequently
  # call a pair of `kubectl logs` commands (once with and once without
  # --previous). Even that does not reliably obtain _all_ container logs; the
  # correct solution for this type of problem is a proper log streaming
  # pipeline. Collect heavily duplicated logs here (deduplicate later).
  kubectl logs -l job-name="${JOB_NAME}" $KLOGS_ARGS >> "${LAUNCHER_LOG_PATH}".dup 2>&1 || true
  kubectl logs -l job-name="${JOB_NAME}" $KLOGS_ARGS --previous >> "${LAUNCHER_LOG_PATH}".dup 2>&1 || true

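  # Append a timestamped snapshot of driver-namespace and workload pods, for
  # reconstructing the pod lifecycle after the run.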
  date -u +'%Y-%m-%dT%H:%M:%S.%3NZ' >> "${RUNID}_pods_over_time"
  kubectl get pods -n nvidia-dra-driver-gpu -o wide >> "${RUNID}_pods_over_time"
  kubectl get pods -o wide >> "${RUNID}_pods_over_time"

  STATUS=$(kubectl get pod -l job-name="${JOB_NAME}" -o jsonpath="{.items[0].status.phase}" 2> /dev/null)
  if [ "$STATUS" == "Succeeded" ]; then
    log "nvb completed"
    break
  fi

  # The launcher pod handles many failures internally by restarting the
  # launcher container (the MPI launcher process). Treat it as a permanent
  # failure when the pod as a whole has failed.
  if [ "$STATUS" == "Failed" ]; then
    log "nvb launcher pod failed"
    break
  fi

  # Keep rather precise track of when the actual communication part of the
  # benchmark has started. Assume that the benchmark takes at least 20 seconds
  # overall. Inject the fault shortly after the benchmark has started, with a
  # randomly picked delay (below 20 seconds).
  if (( NVB_COMMS_STARTED == 1 )); then
    if (( FAULT_INJECTED == 0 )); then
      log "NVB_COMMS_STARTED"

      _jitter_seconds=$(awk -v min=1 -v max=5 'BEGIN {srand(); print min+rand()*(max-min)}')
      log "sleep, pre-injection jitter: $_jitter_seconds s"
      sleep "$_jitter_seconds"

      # A failing CUDA mem import/export API call in a worker process is
      # propagated to the launcher, in which case the launcher container
      # terminates (without logging). The launcher pod then restarts the
      # container. After that, the MPI workload (the benchmark) is
      # reinitialized by the new launcher and starts again from scratch
      # (while the MPI SSH-type worker processes stay alive, they actually
      # start new workload child processes). Another type of failure that
      # the launcher handles by restarting itself is a clean TCP connection
      # shutdown initiated by (clean) worker pod deletion. Any type of
      # failure that is propagated to the launcher is met with the launcher
      # container crashing and worker processes being restarted. This type
      # of error handling, after all, facilitates "healing" the workload.
      #
      # Here, however, the workload _never_ proceeds from where it left
      # off before fault injection. When this test passes, it implies that
      # the workload restarted from scratch internally after fault
      # injection, and then completed.

      if (( FAULT_TYPE == 1 )); then
        log "inject fault type 1: force-delete worker pod 0"
        set -x
        kubectl delete pod "${BASE_NAME}-worker-0" --grace-period=0 --force
        set +x
      elif (( FAULT_TYPE == 2 )); then
        log "inject fault type 2: force-delete all IMEX daemons"
        set -x
        kubectl delete pod -n nvidia-dra-driver-gpu -l resource.nvidia.com/computeDomain --grace-period=0 --force
        set +x
      elif (( FAULT_TYPE == 3 )); then
        log "inject fault type 3: regular-delete worker pod 1"
        set -x
        kubectl delete pod "${BASE_NAME}-worker-1"
        set +x
      else
        log "unknown fault type $FAULT_TYPE"
        exit 1
      fi
      FAULT_INJECTED=1
    fi
    # Fault already injected, do not inject again.
  else
    # Did the benchmark start? Consult the current launcher container log.
    if kubectl logs -l job-name="${JOB_NAME}" --tail=-1 2>&1 | grep "Running multinode_"; then
      NVB_COMMS_STARTED=1
    fi
  fi

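  # Global deadline: if the launcher has neither succeeded nor failed within
  # TIMEOUT seconds, collect debug data, tear the workload down, and leave the
  # control loop (the test then fails below).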
  if [ "$SECONDS" -ge $TIMEOUT ]; then
    log "global deadline reached ($TIMEOUT seconds), collect debug data -- and leave control loop"
    kubectl get pods -A -o wide
    kubectl get computedomain
    kubectl get computedomains.resource.nvidia.com "${CD_NAME}" -o yaml

    # Run this log follower in the background, then delete the workload --
    # this helps with getting all logs (but also disrupts post-run
    # debuggability).
    kubectl logs -l "training.kubeflow.org/job-name=${BASE_NAME}" \
      --tail=-1 --prefix --all-containers --timestamps --follow \
      > "${RUNID}_on_timeout_workload.log" &
    log "on-timeout do: delete -f ${SPECPATH} (and wait)"
    kubectl delete -f "${SPECPATH}" --ignore-not-found > /dev/null
    kubectl wait --for=delete job/"${JOB_NAME}" --timeout=20s > /dev/null

    # Log something if this looks like a segmentation fault on shutdown
    # (not our bug).
    set +e
    grep PMIx_Finalize "${RUNID}_on_timeout_workload.log"
    set -e

    log "done"
    break
  fi

  sleep 1
done

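# Terminate all background log followers spawned above; otherwise `wait` would
# block on the `--follow` processes.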
log "terminate children, wait"
jobs -p
kill $(jobs -p) || true
wait

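# Collapse the heavily duplicated launcher log capture. Since every line
# carries a kubectl-provided timestamp prefix, sorting also restores
# chronological order.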
log "dedup launcher logs"
sort "${LAUNCHER_LOG_PATH}".dup | uniq > "${LAUNCHER_LOG_PATH}"
rm "${LAUNCHER_LOG_PATH}".dup

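# Extract known error signatures from the launcher log (CUDA_* errors,
# "closed by remote host", "Could not resolve") for quick inspection.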
set +e
log "errors in / reported by launcher:"
grep -e CUDA_ -e "closed by remote host" -e "Could not resolve" \
  "${LAUNCHER_LOG_PATH}" > "${LAUNCHER_ERRORS_LOG_PATH}"
cat "${LAUNCHER_ERRORS_LOG_PATH}"

if [ "$STATUS" != "Succeeded" ]; then
  log "last launcher pod status is not 'Succeeded': $STATUS"
  log "finished: failure (fault type $FAULT_TYPE)"
  log "exit with code 1"
  exit 1
fi

log "finished: success (fault type $FAULT_TYPE)"