
Commit 7094891

tests: add test_cd_failover.bats and support
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent eec2564 commit 7094891

File tree: 6 files changed (+366 / -1 lines changed)


tests/bats/Dockerfile

Lines changed: 3 additions & 1 deletion
@@ -2,8 +2,9 @@ FROM debian:trixie
 
 # GNU parallel: bats may want to use that
 # gettext-base: provides envsubst, used by nickelpie
+# bc: used by bash wrappers, for calculations
 RUN apt-get update && apt-get install -y -q --no-install-recommends \
-    parallel git ca-certificates curl make gettext-base jq && \
+    parallel git ca-certificates curl make gettext-base jq bc yq && \
     rm -rf /var/lib/apt/lists/*
 
 # Set by BuildKit, of the form amd64/arm64.
@@ -15,6 +16,7 @@ RUN git clone https://github.com/bats-core/bats-core.git && cd bats-core && \
     git checkout 658f442f0fcdd6f9e2ea01625999217e8f7bfe7d && ./install.sh /usr/local
 
 RUN mkdir -p /bats-libraries
+# These are conservatively patched -- so far; maybe pin them in the future
 RUN git clone https://github.com/bats-core/bats-support /bats-libraries/bats-support
 RUN git clone https://github.com/bats-core/bats-assert /bats-libraries/bats-assert
 RUN git clone https://github.com/bats-core/bats-file /bats-libraries/bats-file
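
The bc dependency backs the sub-second timing done in the new bash helper further down. A minimal sketch of that pattern (variable names here are illustrative, not from the repo):

    t0=$(awk '{print $1}' /proc/uptime)    # uptime in seconds, with fraction
    sleep 0.3
    tnow=$(awk '{print $1}' /proc/uptime)
    echo "elapsed: $(echo "$tnow - $t0" | bc) s"    # bc handles the non-integer arithmetic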

tests/bats/Makefile

Lines changed: 1 addition & 0 deletions
@@ -110,5 +110,6 @@ tests: image
     tests/bats/test_cd_imex_chan_inject.bats \
     tests/bats/test_cd_mnnvl_workload.bats \
     tests/bats/test_cd_logging.bats \
+    tests/bats/test_cd_failover.bats \
     tests/bats/test_cd_updowngrade.bats \
     "

tests/bats/cleanup-from-previous-run.sh

Lines changed: 2 additions & 0 deletions
@@ -56,6 +56,8 @@ timeout -v 5 kubectl delete -f demo/specs/imex/channel-injection-all.yaml 2> /de
 timeout -v 5 kubectl delete jobs nickelpie-test 2> /dev/null
 timeout -v 5 kubectl delete computedomain nickelpie-test-compute-domain 2> /dev/null
 timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-1.yaml 2> /dev/null
+timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-2.yaml 2> /dev/null
+timeout -v 5 kubectl delete -f tests/bats/specs/nvb2.yaml 2> /dev/null
 timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
 timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null

tests/bats/lib/test_cd_nvb_failover.sh

Lines changed: 242 additions & 0 deletions
@@ -0,0 +1,242 @@
+#!/bin/bash
+
+set -o nounset
+set -o errexit
+
+SPECPATH="$1"
+FAULT_TYPE="$2"
+
+BASE_NAME="test-failover-job"
+CD_NAME="test-failover-cd"
+
+# An external supervisor can inject a run ID (for many-repetition tests), used
+# mainly in output file names.
+RUNID="${RUNID:-no_runid}"
+
+JOB_NAME="${BASE_NAME}-launcher"
+
+# For measuring duration with sub-second precision.
+_T0=$(awk '{print $1}' /proc/uptime)
+
+# For measuring duration with O(1 s) precision.
+SECONDS=0
+
+# Common arguments for `kubectl logs`, with common ts for proper chronological
+# sort upon dedup/post-processing.
+KLOGS_ARGS="--tail=-1 --prefix --all-containers --timestamps"
+
+# Wait for the workload to heal after fault injection (for the MPI launcher pod
+# to succeed); otherwise fail the test TIMEOUT seconds after startup.
+TIMEOUT=300
+
+log_ts_no_newline() {
+    echo -n "$(date -u +'%Y-%m-%dT%H:%M:%S.%3NZ ')"
+}
+
+log() {
+    _TNOW=$(awk '{print $1}' /proc/uptime)
+    _DUR=$(echo "$_TNOW - $_T0" | bc)
+    log_ts_no_newline
+    printf "[%6.1fs] $1\n" "$_DUR"
+}
+
+log "RUNID $RUNID | fault type $FAULT_TYPE | $SPECPATH | $BASE_NAME | $JOB_NAME | $CD_NAME"
+log "do: delete -f ${SPECPATH} (and wait)"
+kubectl delete -f "${SPECPATH}" --ignore-not-found > /dev/null
+kubectl wait --for=delete job/"${JOB_NAME}" --timeout=20s > /dev/null
+log "done"
+
+log "do: apply -f ${SPECPATH}"
+kubectl apply -f "${SPECPATH}" > /dev/null
+log "done"
+log "do: wait --for=create"
+kubectl wait --for=create job/"${JOB_NAME}" --timeout=40s > /dev/null
+log "done"
+CDUID=$(kubectl describe computedomains.resource.nvidia.com "${CD_NAME}" | grep UID | awk '{print $2}')
+
+log "CD uid: ${CDUID}"
+log "resource claims:"
+kubectl get resourceclaim
+log "workload pods:"
+kubectl get pods -o wide
+
+
+LAUNCHER_LOG_PATH="${RUNID}_launcher_logs.log"
+LAUNCHER_ERRORS_LOG_PATH="${RUNID}_launcher_errors.log"
+echo "" > "${LAUNCHER_LOG_PATH}"
+echo "" > "${LAUNCHER_LOG_PATH}".dup
+
+FAULT_INJECTED=0
+NVB_COMMS_STARTED=0
+LAST_LAUNCHER_RESTART_OUTPUT=""
+STATUS="nil"
+
+while true; do
+
+    _llro=$(kubectl get pod -l job-name="${JOB_NAME}" -o json | \
+        /usr/bin/jq -r '.items[].status.containerStatuses[].restartCount'
+    )
+
+    if [[ "$LAST_LAUNCHER_RESTART_OUTPUT" != "$_llro" ]]; then
+        log "launcher container restarts seen: $_llro"
+        LAST_LAUNCHER_RESTART_OUTPUT="$_llro"
+    fi
+
+    # Start log-follower child processes for all newly popping up CD daemon pods
+    # (when they are Running). I have added this very late in the game because I
+    # think we're missing CD daemon logs around container shutdown; I want to be
+    # extra sure.
+    kubectl get pods -n nvidia-dra-driver-gpu | grep "${CD_NAME}" | grep Running | awk '{print $1}' | while read pname; do
+        _logfname="${RUNID}_cddaemon_follow_${pname}.log"
+        if [ -f "$_logfname" ]; then
+            continue
+        fi
+        log "new CD daemon pod: $pname -- follow log, save to ${_logfname}"
+        kubectl logs -n nvidia-dra-driver-gpu "$pname" \
+            --tail=-1 --timestamps --prefix --all-containers --follow \
+            > "${_logfname}" &
+        # Note: if we lose track of the log followers spawned, we can and should
+        # terminate them all with `kill $(jobs -p)`.
+    done
+
+    # Note that the launcher _pod_ is not expected to restart. The container in
+    # the pod may restart various times in the context of this failover.
+    # `kubectl logs --follow` does not automatically follow container restarts.
+    # To catch all container instances in view of quick restarts, we need to
+    # frequently call a pair of `kubectl logs` commands (once with, and once
+    # without --previous). Even that does not reliably obtain _all_ container
+    # logs. The correct solution for this type of problem is to have a proper
+    # log streaming pipeline. Collect heavily duplicated logs (dedup later).
+    kubectl logs -l job-name="${JOB_NAME}" $KLOGS_ARGS >> "${LAUNCHER_LOG_PATH}".dup 2>&1 || true
+    kubectl logs -l job-name="${JOB_NAME}" $KLOGS_ARGS --previous >> "${LAUNCHER_LOG_PATH}".dup 2>&1 || true
+
+
+    date -u +'%Y-%m-%dT%H:%M:%S.%3NZ ' >> "${RUNID}_pods_over_time"
+    kubectl get pods -n nvidia-dra-driver-gpu -o wide >> "${RUNID}_pods_over_time"
+    kubectl get pods -o wide >> "${RUNID}_pods_over_time"
+
+    STATUS=$(kubectl get pod -l job-name="${JOB_NAME}" -o jsonpath="{.items[0].status.phase}" 2>/dev/null)
+    if [ "$STATUS" == "Succeeded" ]; then
+        log "nvb completed"
+        break
+    fi
+
+    # The launcher pod handles many failures internally by restarting the
+    # launcher container (the MPI launcher process). Treat it as a permanent
+    # failure when this pod failed overall.
+    if [ "$STATUS" == "Failed" ]; then
+        log "nvb launcher pod failed"
+        break
+    fi
+
+    # Keep rather precise track of when the actual communication part of the
+    # benchmark has started. Assume that the benchmark takes at least 20 seconds
+    # overall. Inject the fault shortly after the benchmark has started. Pick
+    # that delay to be random (but below 20 seconds).
+    if (( NVB_COMMS_STARTED == 1 )); then
+        if (( FAULT_INJECTED == 0 )); then
+            log "NVB_COMMS_STARTED"
+
+            _jitter_seconds=$(awk -v min=1 -v max=5 'BEGIN {srand(); print min+rand()*(max-min)}')
+            log "sleep, pre-injection jitter: $_jitter_seconds s"
+            sleep "$_jitter_seconds"
+
+            # A failing CUDA mem import/export API call in a worker process is
+            # propagated to the launcher, in which case the launcher container
+            # terminates (without logging). The launcher pod then restarts the
+            # container. After that, the MPI workload (the benchmark) is
+            # reinitialized by the new launcher. It starts again from scratch
+            # (while the MPI SSH-type worker processes stay alive, they actually
+            # start new workload child processes). Another type of failure that
+            # the launcher handles by restarting itself is a clean TCP
+            # connection shutdown initiated by (clean) worker pod deletion. Any
+            # type of failure that is propagated to the launcher is met with the
+            # launcher container crashing, and restarting worker processes. This
+            # type of error handling is, after all, what facilitates "healing"
+            # the workload.
+            #
+            # Here, however, the workload _never_ proceeds from where it left
+            # off before fault injection. When this test passes, it implies that
+            # the workload restarted from scratch internally after fault
+            # injection, and then completed.
+
+            if (( FAULT_TYPE == 1 )); then
+                log "inject fault type 1: force-delete worker pod 0"
+                set -x
+                kubectl delete pod "${BASE_NAME}-worker-0" --grace-period=0 --force
+                set +x
+            elif (( FAULT_TYPE == 2 )); then
+                log "inject fault type 2: force-delete all IMEX daemons"
+                set -x
+                kubectl delete pod -n nvidia-dra-driver-gpu -l resource.nvidia.com/computeDomain --grace-period=0 --force
+                set +x
+            elif (( FAULT_TYPE == 3 )); then
+                log "inject fault type 3: regular-delete worker pod 1"
+                set -x
+                kubectl delete pod "${BASE_NAME}-worker-1"
+                set +x
+            else
+                log "unknown fault type $FAULT_TYPE"
+                exit 1
+            fi
+            FAULT_INJECTED=1
+        fi
+        # Fault already injected, do not inject again.
+    else
+        # Did the benchmark start? Consult the current launcher container log.
+        if kubectl logs -l job-name="${JOB_NAME}" --tail=-1 2>&1 | grep "Running multinode_"; then
+            NVB_COMMS_STARTED=1
+        fi
+    fi
+
+    if [ "$SECONDS" -ge $TIMEOUT ]; then
+        log "global deadline reached ($TIMEOUT seconds), collect debug data -- and leave control loop"
+        kubectl get pods -A -o wide
+        kubectl get computedomain
+        kubectl get computedomains.resource.nvidia.com "${CD_NAME}" -o yaml
+
+        # Run this in the background, then delete the workload -- this helps to
+        # get all logs (but also disrupts post-run debuggability).
+        kubectl logs -l "training.kubeflow.org/job-name=${BASE_NAME}" \
+            --tail=-1 --prefix --all-containers --timestamps --follow &> "${RUNID}_on_timeout_workload.log" &
+        log "on-timeout do: delete -f ${SPECPATH} (and wait)"
+        kubectl delete -f "${SPECPATH}" --ignore-not-found > /dev/null
+        kubectl wait --for=delete job/"${JOB_NAME}" --timeout=20s > /dev/null
+
+        # Log something if this looks like a segmentation fault on shutdown
+        # (not our bug).
+        set +e
+        cat "${RUNID}_on_timeout_workload.log" | grep PMIx_Finalize
+        set -e
+
+        log "done"
+        break
+    fi
+
+    sleep 1
+done
+
+
+log "terminate children, wait"
+jobs -p
+kill $(jobs -p) || true
+wait
+
+log "dedup launcher logs"
+cat "${LAUNCHER_LOG_PATH}".dup | sort | uniq > "${LAUNCHER_LOG_PATH}"
+rm "${LAUNCHER_LOG_PATH}".dup
+
+set +e
+log "errors in / reported by launcher:"
+cat "${LAUNCHER_LOG_PATH}" | \
+    grep -e CUDA_ -e "closed by remote host" -e "Could not resolve" > "${LAUNCHER_ERRORS_LOG_PATH}"
+cat "${LAUNCHER_ERRORS_LOG_PATH}"
+
+if [ "$STATUS" != "Succeeded" ]; then
+    log "last launcher pod status is not 'Succeeded': $STATUS"
+    log "finished: failure (fault type $FAULT_TYPE)"
+    log "exit with code 1"
+    exit 1
+fi
+
+log "finished: success (fault type $FAULT_TYPE)"

tests/bats/specs/nvb2.yaml

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
+---
+apiVersion: resource.nvidia.com/v1beta1
+kind: ComputeDomain
+metadata:
+  name: test-failover-cd
+spec:
+  numNodes: 2
+  channel:
+    resourceClaimTemplate:
+      name: test-failover-rct-channel
+---
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: test-failover-job
+spec:
+  slotsPerWorker: 2
+  launcherCreationPolicy: WaitForWorkersReady
+  runPolicy:
+    cleanPodPolicy: Running
+  sshAuthMountPath: /home/mpiuser/.ssh
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        metadata:
+          labels:
+            mpi-memcpy-dra-test-replica: mpi-launcher
+        spec:
+          containers:
+          - image: ghcr.io/nvidia/k8s-samples:nvbandwidth-6dc12f17
+            name: mpi-launcher
+            securityContext:
+              runAsUser: 1000
+            command:
+            - mpirun
+            args:
+            - --show-progress
+            - --bind-to
+            - core
+            - --map-by
+            - ppr:2:node
+            - -np
+            - "4"
+            - --report-bindings
+            - -q
+            - nvbandwidth
+            - --verbose
+            - --testSamples
+            - "20"
+            - --bufferSize
+            - "2048"
+            - -t
+            - multinode_device_to_device_memcpy_read_ce
+          affinity:
+            nodeAffinity:
+              requiredDuringSchedulingIgnoredDuringExecution:
+                nodeSelectorTerms:
+                - matchExpressions:
+                  - key: node-role.kubernetes.io/control-plane
+                    operator: Exists
+    Worker:
+      replicas: 2
+      template:
+        metadata:
+          labels:
+            mpi-memcpy-dra-test-replica: mpi-worker
+        spec:
+          containers:
+          - image: ghcr.io/nvidia/k8s-samples:nvbandwidth-6dc12f17
+            name: mpi-worker
+            securityContext:
+              runAsUser: 1000
+            env:
+            command:
+            - /usr/sbin/sshd
+            args:
+            - -De
+            - -f
+            - /home/mpiuser/.sshd_config
+            resources:
+              limits:
+                nvidia.com/gpu: 2
+              claims:
+              - name: test-failover-rc-channel
+          resourceClaims:
+          - name: test-failover-rc-channel
+            resourceClaimTemplateName: test-failover-rct-channel
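
The spec can also be exercised by hand, roughly following the steps the helper script automates (a sketch; names taken from the spec and script above):

    kubectl apply -f tests/bats/specs/nvb2.yaml
    kubectl wait --for=create job/test-failover-job-launcher --timeout=40s
    # poll until the launcher pod reports Succeeded
    kubectl get pod -l job-name=test-failover-job-launcher -o jsonpath='{.items[0].status.phase}'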

tests/bats/test_cd_failover.bats

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+# shellcheck disable=SC2148
+# shellcheck disable=SC2329
+
+setup_file() {
+    load 'helpers.sh'
+    _common_setup
+
+    local _iargs=("--set" "logVerbosity=6")
+    iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
+}
+
+teardown_file() {
+    kubectl delete -f tests/bats/specs/nvb2.yaml --ignore-not-found
+    kubectl wait --for=delete job/test-failover-job-launcher --timeout=20s || true
+}
+
+
+@test "CD failover nvb2: force-delete worker pod 0" {
+    bash tests/bats/lib/test_cd_nvb_failover.sh tests/bats/specs/nvb2.yaml 1
+}
+
+
+@test "CD failover nvb2: force-delete all IMEX daemons" {
+    bash tests/bats/lib/test_cd_nvb_failover.sh tests/bats/specs/nvb2.yaml 2
+}
+
+
+@test "CD failover nvb2: regular-delete worker pod 1" {
+    bash tests/bats/lib/test_cd_nvb_failover.sh tests/bats/specs/nvb2.yaml 3
+}
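
Outside the Makefile-driven container flow, this file can presumably also be run directly with the bats-core install from the test image (this direct invocation is an assumption; it requires kubectl access to a prepared cluster and the helpers loaded by _common_setup):

    bats tests/bats/test_cd_failover.bats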
