Skip to content

Commit e14c8fc

Browse files
Steboss and olupton authored
Improve error handling, s3 mounting, distributed tests for axlearn (#1332)
Co-authored-by: Olli Lupton <[email protected]>
1 parent bc4dc15 commit e14c8fc

File tree

12 files changed

+586
-116
lines changed

12 files changed

+586
-116
lines changed

.github/actions/submit-delete-k8s-job/action.yml

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ runs:
1313
using: "composite"
1414
steps:
1515
- name: Submit and Delete Kubernetes job
16-
uses: ./.github/actions/with-post-step
17-
with:
16+
uses: ./.github/actions/with-post-step
17+
with:
1818
main: |
1919
set -x
2020
TIMEOUT_JOB_CREATION=60s
@@ -35,9 +35,51 @@ runs:
3535
kubectl wait --for=condition=Ready \
3636
--selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} \
3737
--timeout=$TIMEOUT_JOB_START pod
38-
38+
3939
# Stream logs
4040
kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }}
41-
42-
post: |
41+
42+
# Look up how many pods the job runs in parallel (.spec.parallelism).
parallelism="$(kubectl get job/"${{ inputs.job-name }}" -o jsonpath='{.spec.parallelism}')"
echo "Parallelism ${parallelism}"
# An empty result means the field is not set on the job: fall back to 1.
[ -n "${parallelism}" ] || {
  echo "No parallelism specified, defaulting to 1"
  parallelism=1
}
50+
51+
# Wait until the job's finished pods (failed + succeeded) equal the expected
# parallelism, then leave `failures` / `successes` set for the code below.
#
# The status has to be re-queried on every iteration. Feeding the loop from a
# here-document does not work: the command substitution inside it is expanded
# exactly once, so the loop would see a single stale snapshot and end at EOF
# after the first pass, whether or not the job had finished.
#
# NOTE(review): this loop has no deadline of its own; it presumably relies on
# the timeout of the surrounding workflow/job - confirm.
query_errors=0
while true; do
  if status=$(kubectl get job/"${{ inputs.job-name }}" -o 'jsonpath={.status.failed}:{.status.succeeded}'); then
    query_errors=0
  else
    # Tolerate a few consecutive API errors, but do not treat an unreadable
    # status as "0 failures, 0 successes" forever.
    query_errors=$((query_errors + 1))
    if [ "${query_errors}" -ge 5 ]; then
      echo "Unable to query the status of job ${{ inputs.job-name }}"
      exit 1
    fi
    sleep 1
    continue
  fi

  # Output has the form "<failed>:<succeeded>"; either side is empty when the
  # corresponding status field is unset, which is treated as 0.
  failures="${status%%:*}"
  successes="${status#*:}"
  failures="${failures:-0}"
  successes="${successes:-0}"
  total=$((failures + successes))

  if [ "${total}" -lt "${parallelism}" ]; then
    # neither "failed" nor "succeeded" for every pod yet, so wait and re-check
    sleep 1
  elif [ "${total}" -eq "${parallelism}" ]; then
    # every expected pod has finished (successfully or not): the job is done
    break
  else
    echo "Unexpected number of completed pods ${total} with parallelism ${parallelism}"
    exit 255
  fi
done
71+
72+
# When the job reported failed pods, dump their descriptions and fail the step.
failed_count="${failures:-0}"
if [ "${failed_count}" -gt 0 ]; then
  echo "Job ${{ inputs.job-name }} has $failures failures"
  # Pods are located through the batch job-name label, so this only finds
  # pods created by a batch job.
  pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o name)
  # ${pods} is left unquoted on purpose: each pod name must be its own argument.
  [ -z "${pods}" ] || kubectl describe ${pods}
  exit 1
fi
82+
83+
post: |
84+
echo "Deleting K8s job: ${{ inputs.job-name }}"
4385
kubectl delete -f "${{ inputs.job-config-file }}"

.github/container/Dockerfile.axlearn

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# syntax=docker/dockerfile:1-labs
22
ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax
3-
ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git#main
3+
ARG URLREF_AXLEARN=https://github.com/Steboss/axlearn.git#main
44
ARG SRC_PATH_AXLEARN=/opt/axlearn
55

66
###############################################################################
@@ -22,8 +22,9 @@ einops==0.8.0
2222
nltk==3.7
2323
portpicker==1.6.0
2424
seqio==0.0.18
25-
protobuf==3.20.3
25+
protobuf==3.20.3
2626
pytest>=7.4.3
27+
tensorflow==2.18.1
2728
REQUIREMENTS
2829
EOF
2930

@@ -32,7 +33,7 @@ EOF
3233
## Add test script to the path
3334
###############################################################################
3435

35-
ADD test-axlearn.sh /usr/local/bin/
36+
ADD test-axlearn.sh fuji-train-perf.py /usr/local/bin/
3637

3738
###############################################################################
3839
## Install accumulated packages from the base image and the previous stage

0 commit comments

Comments
 (0)