#!/bin/bash

set -o nounset
set -o errexit

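# Arguments: (1) path to the Kubernetes spec (YAML) defining the MPI launcher
# job and the ComputeDomain, (2) the fault type to inject (1, 2, or 3; see the
# injection branches below).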
SPECPATH="$1"
FAULT_TYPE="$2"

BASE_NAME="test-failover-job"
CD_NAME="test-failover-cd"
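# These names are assumed to match the job / ComputeDomain metadata names in
# the spec at SPECPATH; worker pods are addressed as ${BASE_NAME}-worker-N
# further below.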

# An external supervisor can inject a run ID (for many-repetition tests); it is
# used mainly in output file names.
RUNID="${RUNID:-no_runid}"

JOB_NAME="${BASE_NAME}-launcher"

# For measuring duration with sub-second precision.
_T0=$(awk '{print $1}' /proc/uptime)

# For measuring duration with O(1 s) precision.
SECONDS=0
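# (SECONDS is a bash builtin: it counts wall-clock seconds since it was last
# assigned, so it can be compared directly against TIMEOUT below.)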

# Common arguments for `kubectl logs`; the shared timestamp format allows a
# proper chronological sort during dedup/post-processing.
KLOGS_ARGS="--tail=-1 --prefix --all-containers --timestamps"

# Wait for the workload to heal after fault injection (i.e., for the MPI
# launcher pod to succeed); otherwise fail the test TIMEOUT seconds after
# startup.
TIMEOUT=300

log_ts_no_newline () {
  echo -n "$(date -u +'%Y-%m-%dT%H:%M:%S.%3NZ')"
}

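# Log a message prefixed with a UTC timestamp and the wall-clock time elapsed
# since script start (sub-second resolution, via /proc/uptime and bc).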
log () {
  _TNOW=$(awk '{print $1}' /proc/uptime)
  _DUR=$(echo "$_TNOW - $_T0" | bc)
  log_ts_no_newline
  printf " [%6.1fs] $1\n" "$_DUR"
}

log "RUNID $RUNID | fault type $FAULT_TYPE | $SPECPATH | $BASE_NAME | $JOB_NAME | $CD_NAME"
log "do: delete -f ${SPECPATH} (and wait)"
kubectl delete -f "${SPECPATH}" --ignore-not-found > /dev/null
kubectl wait --for=delete job/"${JOB_NAME}" --timeout=20s > /dev/null
log "done"
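# Deploy the workload and wait for the launcher job object to exist before
# querying its pods and the ComputeDomain.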

log "do: apply -f ${SPECPATH}"
kubectl apply -f "${SPECPATH}" > /dev/null
log "done"
log "do: wait --for=create"
kubectl wait --for=create job/"${JOB_NAME}" --timeout=40s > /dev/null
log "done"
CDUID=$(kubectl describe computedomains.resource.nvidia.com "${CD_NAME}" | grep UID | awk '{print $2}')
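# Note: `kubectl get computedomains.resource.nvidia.com "${CD_NAME}" \
#   -o jsonpath='{.metadata.uid}'` would be a grep-free way to get the UID.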

log "CD uid: ${CDUID}"
log "resource claims:"
kubectl get resourceclaim
log "workload pods:"
kubectl get pods -o wide

LAUNCHER_LOG_PATH="${RUNID}_launcher_logs.log"
LAUNCHER_ERRORS_LOG_PATH="${RUNID}_launcher_errors.log"
echo "" > "${LAUNCHER_LOG_PATH}"
echo "" > "${LAUNCHER_LOG_PATH}".dup

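# Control-loop state: whether the fault was already injected, whether the
# benchmark's communication phase has started, the last-seen launcher container
# restart count(s), and the last-seen launcher pod phase.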
FAULT_INJECTED=0
NVB_COMMS_STARTED=0
LAST_LAUNCHER_RESTART_OUTPUT=""
STATUS="nil"

while true; do

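  # Track launcher container restart counts (one value per container in the
  # matching pod(s)); log only when the observed value changes.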
  _llro=$(kubectl get pod -l job-name="${JOB_NAME}" -o json | \
    /usr/bin/jq -r '.items[].status.containerStatuses[].restartCount'
  )

  if [[ "$LAST_LAUNCHER_RESTART_OUTPUT" != "$_llro" ]]; then
    log "launcher container restarts seen: $_llro"
    LAST_LAUNCHER_RESTART_OUTPUT="$_llro"
  fi

  # Start log-follower child processes for every newly appearing CD daemon pod
  # (once it is Running). I added this very late in the game because I think
  # we're missing CD daemon logs around container shutdown; I want to be extra
  # sure. Use process substitution (rather than a pipe into `while`) so that
  # the loop runs in the current shell and the backgrounded followers show up
  # in this shell's job table for the `kill $(jobs -p)` cleanup below.
  while read -r pname; do
    _logfname="${RUNID}_cddaemon_follow_${pname}.log"
    if [ -f "$_logfname" ]; then
      continue
    fi
    log "new CD daemon pod: $pname -- follow log, save to ${_logfname}"
    kubectl logs -n nvidia-dra-driver-gpu "$pname" \
      --tail=-1 --timestamps --prefix --all-containers --follow \
      > "${_logfname}" &
    # Note: if we lose track of the log followers spawned, we can and should
    # terminate them all with `kill $(jobs -p)`.
  done < <(kubectl get pods -n nvidia-dra-driver-gpu | grep "${CD_NAME}" | grep Running | awk '{print $1}')

  # Note that the launcher _pod_ is not expected to restart. The container in
  # the pod may restart several times in the context of this failover.
  # `kubectl logs --follow` does not automatically follow container restarts.
  # To catch all container instances despite quick restarts, we frequently
  # call a pair of `kubectl logs` commands (once with and once without
  # --previous). Even that does not reliably obtain _all_ container logs; the
  # correct solution for this type of problem is a proper log streaming
  # pipeline. Collect heavily duplicated logs here (deduplicate later).
  kubectl logs -l job-name="${JOB_NAME}" $KLOGS_ARGS >> "${LAUNCHER_LOG_PATH}".dup 2>&1 || true
  kubectl logs -l job-name="${JOB_NAME}" $KLOGS_ARGS --previous >> "${LAUNCHER_LOG_PATH}".dup 2>&1 || true

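  # Append a timestamped snapshot of driver-namespace and workload pods, for
  # reconstructing the pod lifecycle after the run.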
  date -u +'%Y-%m-%dT%H:%M:%S.%3NZ' >> "${RUNID}_pods_over_time"
  kubectl get pods -n nvidia-dra-driver-gpu -o wide >> "${RUNID}_pods_over_time"
  kubectl get pods -o wide >> "${RUNID}_pods_over_time"

  STATUS=$(kubectl get pod -l job-name="${JOB_NAME}" -o jsonpath="{.items[0].status.phase}" 2> /dev/null)
  if [ "$STATUS" == "Succeeded" ]; then
    log "nvb completed"
    break
  fi

  # The launcher pod handles many failures internally by restarting the
  # launcher container (the MPI launcher process). Treat it as a permanent
  # failure when the pod as a whole has failed.
  if [ "$STATUS" == "Failed" ]; then
    log "nvb launcher pod failed"
    break
  fi

  # Keep rather precise track of when the actual communication part of the
  # benchmark has started. Assume that the benchmark takes at least 20 seconds
  # overall. Inject the fault shortly after the benchmark has started, with a
  # randomly picked delay (below 20 seconds).
  if (( NVB_COMMS_STARTED == 1 )); then
    if (( FAULT_INJECTED == 0 )); then
      log "NVB_COMMS_STARTED"

      _jitter_seconds=$(awk -v min=1 -v max=5 'BEGIN {srand(); print min+rand()*(max-min)}')
      log "sleep, pre-injection jitter: $_jitter_seconds s"
      sleep "$_jitter_seconds"

      # A failing CUDA mem import/export API call in a worker process is
      # propagated to the launcher, in which case the launcher container
      # terminates (without logging). The launcher pod then restarts the
      # container. After that, the MPI workload (the benchmark) is
      # reinitialized by the new launcher and starts again from scratch
      # (while the MPI SSH-type worker processes stay alive, they actually
      # start new workload child processes). Another type of failure that
      # the launcher handles by restarting itself is a clean TCP connection
      # shutdown initiated by (clean) worker pod deletion. Any type of
      # failure that is propagated to the launcher is met with the launcher
      # container crashing and worker processes being restarted. This type
      # of error handling, after all, facilitates "healing" the workload.
      #
      # Here, however, the workload _never_ proceeds from where it left
      # off before fault injection. When this test passes, it implies that
      # the workload restarted from scratch internally after fault
      # injection, and then completed.

      if (( FAULT_TYPE == 1 )); then
        log "inject fault type 1: force-delete worker pod 0"
        set -x
        kubectl delete pod "${BASE_NAME}-worker-0" --grace-period=0 --force
        set +x
      elif (( FAULT_TYPE == 2 )); then
        log "inject fault type 2: force-delete all IMEX daemons"
        set -x
        kubectl delete pod -n nvidia-dra-driver-gpu -l resource.nvidia.com/computeDomain --grace-period=0 --force
        set +x
      elif (( FAULT_TYPE == 3 )); then
        log "inject fault type 3: regular-delete worker pod 1"
        set -x
        kubectl delete pod "${BASE_NAME}-worker-1"
        set +x
      else
        log "unknown fault type $FAULT_TYPE"
        exit 1
      fi
      FAULT_INJECTED=1
    fi
    # Fault already injected, do not inject again.
  else
    # Did the benchmark start? Consult the current launcher container log.
    if kubectl logs -l job-name="${JOB_NAME}" --tail=-1 2>&1 | grep "Running multinode_"; then
      NVB_COMMS_STARTED=1
    fi
  fi

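  # Global deadline: if the launcher has neither succeeded nor failed within
  # TIMEOUT seconds, collect debug data, tear the workload down, and leave the
  # control loop (the test then fails below).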
  if [ "$SECONDS" -ge $TIMEOUT ]; then
    log "global deadline reached ($TIMEOUT seconds), collect debug data -- and leave control loop"
    kubectl get pods -A -o wide
    kubectl get computedomain
    kubectl get computedomains.resource.nvidia.com "${CD_NAME}" -o yaml

    # Run this log follower in the background, then delete the workload --
    # this helps with getting all logs (but also disrupts post-run
    # debuggability).
    kubectl logs -l "training.kubeflow.org/job-name=${BASE_NAME}" \
      --tail=-1 --prefix --all-containers --timestamps --follow \
      > "${RUNID}_on_timeout_workload.log" &
    log "on-timeout do: delete -f ${SPECPATH} (and wait)"
    kubectl delete -f "${SPECPATH}" --ignore-not-found > /dev/null
    kubectl wait --for=delete job/"${JOB_NAME}" --timeout=20s > /dev/null

    # Log something if this looks like a segmentation fault on shutdown
    # (not our bug).
    set +e
    grep PMIx_Finalize "${RUNID}_on_timeout_workload.log"
    set -e

    log "done"
    break
  fi

  sleep 1
done

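# Terminate all background log followers spawned above; otherwise `wait` would
# block on the `--follow` processes.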
log "terminate children, wait"
jobs -p
kill $(jobs -p) || true
wait

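# Collapse the heavily duplicated launcher log capture. Since every line
# carries a kubectl-provided timestamp prefix, sorting also restores
# chronological order.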
log "dedup launcher logs"
sort "${LAUNCHER_LOG_PATH}".dup | uniq > "${LAUNCHER_LOG_PATH}"
rm "${LAUNCHER_LOG_PATH}".dup

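# Extract known error signatures from the launcher log (CUDA_* errors,
# "closed by remote host", "Could not resolve") for quick inspection.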
set +e
log "errors in / reported by launcher:"
grep -e CUDA_ -e "closed by remote host" -e "Could not resolve" \
  "${LAUNCHER_LOG_PATH}" > "${LAUNCHER_ERRORS_LOG_PATH}"
cat "${LAUNCHER_ERRORS_LOG_PATH}"

if [ "$STATUS" != "Succeeded" ]; then
  log "last launcher pod status is not 'Succeeded': $STATUS"
  log "finished: failure (fault type $FAULT_TYPE)"
  log "exit with code 1"
  exit 1
fi

log "finished: success (fault type $FAULT_TYPE)"