@@ -20,142 +20,147 @@ jobs:
2020 JAX_IMAGE : ${{ inputs.CONTAINER }}
2121 secrets : inherit
2222
# Builds the `mpi-operator-compatible-base` container on top of the container
# passed in as `inputs.CONTAINER`, using
# .github/container/Dockerfile.mpi-operator-compatible-base, and exposes the
# resulting image tags as job outputs for downstream jobs.
build-mpi-operator-compatible-base:
  runs-on: [self-hosted, "amd64", "large"]
  outputs:
    DOCKER_TAG_MEALKIT: ${{ steps.build.outputs.DOCKER_TAG_MEALKIT }}
    DOCKER_TAG_FINAL: ${{ steps.build.outputs.DOCKER_TAG_FINAL }}
  steps:
    - name: Login to nvcr.io Container Registry
      uses: docker/login-action@v3
      with:
        registry: nvcr.io
        # Literal string, quoted to make clear that nothing expands it here.
        username: '$oauthtoken'
        password: ${{ secrets.NVCR_TOKEN }}
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Build MPI operator compatible base container
      id: build
      uses: ./.github/actions/build-container
      with:
        ARCHITECTURE: amd64
        ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
        BADGE_FILENAME: badge-mpi-operator-compatible-base-build
        # Not important; this image is never published. Quoted so that no YAML
        # loader tries to interpret the placeholder as a (invalid) timestamp.
        BUILD_DATE: "0000-00-00"
        BASE_IMAGE: ${{ inputs.CONTAINER }}
        CONTAINER_NAME: mpi-operator-compatible-base
        DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
        RUNNER_SIZE: small
        ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
        ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
        github-token: ${{ secrets.GITHUB_TOKEN }}
        bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
53-
# Runs one NCCL perf test per matrix entry on the `eks` runner: patches
# .github/eks-workflow-files/mpi-nccl-test.yml with per-run names and the image
# built by `build-mpi-operator-compatible-base`, submits it with kubectl, waits
# for the launcher job, streams its logs, and turns the launcher's
# failed/succeeded counters into the step exit code. Submission is skipped
# when any node carries a taint matching "maintenance".
nccl-test:
  needs: build-mpi-operator-compatible-base
  strategy:
    matrix:
      test:
        - all_gather_perf_mpi
        - all_reduce_perf_mpi
        - broadcast_perf_mpi
        - reduce_scatter_perf_mpi
  runs-on: eks
  env:
    BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
    TEST_NAME: ${{ matrix.test }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}
    - name: Create env vars
      id: var
      shell: bash
      run: |
        # Underscores in the test name are replaced by dashes (presumably because
        # Kubernetes object names may not contain underscores -- confirm).
        JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
        LAUNCHER_NAME="${JOB_NAME}-launcher"
        # Make these available to later steps
        echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
        echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
    - name: K8s GHCR store and delete token
      id: store-token
      uses: ./.github/actions/store-delete-k8s-ghcr
    - name: Configure Kubernetes job
      run: |
        # WORKER_NAME must be exported so that yq's strenv() can read it;
        # JOB_NAME, LAUNCHER_NAME, BASE_IMAGE and TEST_NAME come from the environment.
        export WORKER_NAME="${JOB_NAME}-worker"
        yq -i '.metadata.name = strenv(JOB_NAME)
          | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
          | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
          | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
          | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
          | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
          | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
          | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
          .github/eks-workflow-files/mpi-nccl-test.yml
        # Show the patched manifest in the job log.
        git diff .github/eks-workflow-files/mpi-nccl-test.yml
    - name: Submit Kubernetes job
      id: submit_job
      run: |
        echo "Check whether the cluster is under maintenance"
        if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
          echo "Cluster is under maintenance, skipping job submission"
          echo "continue-run=false" >> "$GITHUB_OUTPUT"
        else
          kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
          echo "continue-run=true" >> "$GITHUB_OUTPUT"
        fi
    - name: Wait for Kubernetes job to start
      if: steps.submit_job.outputs.continue-run == 'true'
      # Note that this is *not* using JOB_NAME
      run: |
        # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
        # resources are available, but that is where there can be a long wait if the
        # cluster is busy executing other jobs.
        kubectl wait --for=create "job/${LAUNCHER_NAME}"
        kubectl wait --for=jsonpath='{.spec.suspend}=false' "job/${LAUNCHER_NAME}" --timeout=14400s
    - name: Stream Kubernetes job output
      if: steps.submit_job.outputs.continue-run == 'true'
      # Note that this is *not* JOB_NAME
      run: |
        # Streaming logs will fail if the container/pod is still pending
        while [[ -n $(kubectl get pods --selector="batch.kubernetes.io/job-name=${LAUNCHER_NAME}" --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
          sleep 1
        done
        # TODO: --all-containers=true --all-pods=true could make sense here, but it
        # prefixes lines with a rather verbose tag
        kubectl logs --follow "job/${LAUNCHER_NAME}"
    - name: Retrieve Kubernetes job status
      if: steps.submit_job.outputs.continue-run == 'true'
      shell: bash -exo pipefail {0}
      run: |
        # Poll the launcher job's "failed:succeeded" counters until exactly one
        # of them has been reported; unset counters are treated as 0.
        while readarray -d : -t status < <(kubectl get "job/${LAUNCHER_NAME}" -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
          failure=${status[0]:-0}
          success=${status[1]:-0}
          total=$((failure+success))
          # Arithmetic comparison; `[[ a < b ]]` would compare as strings.
          if (( total < 1 )); then
            sleep 1
          elif (( total == 1 )); then
            break
          else
            # Shouldn't happen, maybe a sign the job being monitored does not have a
            # single launcher pod?
            exit 255
          fi
        done
        # 0 when the launcher succeeded, 1 when it failed.
        exit ${failure}
    # Provide more debug output in case of failure; note that some kinds of launch
    # failure do not produce any log output.
    - name: Debug failed Kubernetes job
      if: failure() && steps.submit_job.outputs.continue-run == 'true'
      run: |
        # Provide better debug in case of launch failures that will not produce log output
        pods=$(kubectl get pods --selector="batch.kubernetes.io/job-name=${LAUNCHER_NAME}" -o name)
        if [[ -n "${pods}" ]]; then
          # Deliberately unquoted: one argument per pod name.
          kubectl describe ${pods}
        fi
    # Clean up in case of errors as well as success
    - name: Delete Kubernetes job
      if: always()
      # --ignore-not-found: the job may never have been applied (maintenance skip
      # or an earlier step failing), and cleanup must not fail the run then.
      run: kubectl delete --ignore-not-found -f .github/eks-workflow-files/mpi-nccl-test.yml
23+ # The EKS cluster is offline for maintenance, so this job is disabled; uncomment it to run it manually.
24+ #
25+ # build-mpi-operator-compatible-base:
26+ # runs-on: [self-hosted, "amd64", "large"]
27+ # steps:
28+ # - name: Login to nvcr.io Container Registry
29+ # uses: docker/login-action@v3
30+ # with:
31+ # registry: nvcr.io
32+ # username: $oauthtoken
33+ # password: ${{ secrets.NVCR_TOKEN }}
34+ # - name: Checkout repository
35+ # uses: actions/checkout@v4
36+ # - name: Build MPI operator compatible base container
37+ # id: build
38+ # uses: ./.github/actions/build-container
39+ # with:
40+ # ARCHITECTURE: amd64
41+ # ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
42+ # BADGE_FILENAME: badge-mpi-operator-compatible-base-build
43+ # BUILD_DATE: 0000-00-00 # not important; this image is never published
44+ # BASE_IMAGE: ${{ inputs.CONTAINER }}
45+ # CONTAINER_NAME: mpi-operator-compatible-base
46+ # DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
47+ # RUNNER_SIZE: small
48+ # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
49+ # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
50+ # github-token: ${{ secrets.GITHUB_TOKEN }}
51+ # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
52+ # outputs:
53+ # DOCKER_TAG_MEALKIT: ${{ steps.build.outputs.DOCKER_TAG_MEALKIT }}
54+ # DOCKER_TAG_FINAL: ${{ steps.build.outputs.DOCKER_TAG_FINAL }}
55+ #
56+ #
57+ # The EKS cluster is offline for maintenance, so this job is disabled; uncomment it to run it manually.
58+ #
59+ # nccl-test:
60+ # needs: build-mpi-operator-compatible-base
61+ # strategy:
62+ # matrix:
63+ # test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
64+ # runs-on: eks
65+ # env:
66+ # BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
67+ # TEST_NAME: ${{ matrix.test }}
68+ # steps:
69+ # - name: Checkout repository
70+ # uses: actions/checkout@v4
71+ # - name: Login to GitHub Container Registry
72+ # uses: docker/login-action@v3
73+ # with:
74+ # registry: ghcr.io
75+ # username: ${{ github.repository_owner }}
76+ # password: ${{ secrets.GITHUB_TOKEN }}
77+ # - name: Create env vars
78+ # id: var
79+ # shell: bash
80+ # run: |
81+ # JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
82+ # LAUNCHER_NAME="${JOB_NAME}-launcher"
83+ # TOKEN_NAME="${JOB_NAME}-token"
84+ # # Make these available to later steps
85+ # echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
86+ # echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
87+ # - name: K8s GHCR store and delete token
88+ # id: store-token
89+ # uses: ./.github/actions/store-delete-k8s-ghcr
90+ # - name: Configure Kubernetes job
91+ # run: |
92+ # export WORKER_NAME="${JOB_NAME}-worker"
93+ # yq -i '.metadata.name = strenv(JOB_NAME)
94+ # | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
95+ # | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
96+ # | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
97+ # | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
98+ # | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
99+ # | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
100+ # | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
101+ # .github/eks-workflow-files/mpi-nccl-test.yml
102+ # git diff .github/eks-workflow-files/mpi-nccl-test.yml
103+ # - name: Submit Kubernetes job
104+ # id: submit_job
105+ # run: |
106+ # echo "Check whether the cluster is under maintenance"
107+ # if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
108+ # echo "Cluster is under maintenance, skipping job submission"
109+ # echo "continue-run=false" >> "$GITHUB_OUTPUT"
110+ # else
111+ # kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
112+ # echo "continue-run=true" >> "$GITHUB_OUTPUT"
113+ # fi
114+ # - name: Wait for Kubernetes job to start
115+ # if: steps.submit_job.outputs.continue-run == 'true'
116+ # # Note that this is *not* using JOB_NAME
117+ # run: |
118+ # # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
119+ # # resources are available, but that is where there can be a long wait if the
120+ # # cluster is busy executing other jobs.
121+ # kubectl wait --for=create job/${LAUNCHER_NAME}
122+ # kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=14400s
123+ # - name: Stream Kubernetes job output
124+ # if: steps.submit_job.outputs.continue-run == 'true'
125+ # # Note that this is *not* JOB_NAME
126+ # run: |
127+ # # Streaming logs will fail if the container/pod is still pending
128+ # while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
129+ # sleep 1
130+ # done
131+ # # TODO: --all-containers=true --all-pods=true could make sense here, but it
132+ # # prefixes lines with a rather verbose tag
133+ # kubectl logs --follow job/${LAUNCHER_NAME}
134+ # - name: Retrieve Kubernetes job status
135+ # if: steps.submit_job.outputs.continue-run == 'true'
136+ # shell: bash -exo pipefail {0}
137+ # run: |
138+ # while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
139+ # failure=${status[0]:-0}
140+ # success=${status[1]:-0}
141+ # total=$((failure+success))
142+ # if [[ ${total} < 1 ]]; then
143+ # sleep 1
144+ # elif [[ ${total} == 1 ]]; then
145+ # break
146+ # else
147+ # # Shouldn't happen, maybe a sign the job being monitored does not have a
148+ # # single launcher pod?
149+ # exit 255
150+ # fi
151+ # done
152+ # exit ${failure}
153+ # # Provide more debug output in case of failure; note that some kinds of launch
154+ # # failure do not produce any log output.
155+ # - name: Debug failed Kubernetes job
156+ # if: failure() && steps.submit_job.outputs.continue-run == 'true'
157+ # run: |
158+ # # Provide better debug in case of launch failures that will not produce log output
159+ # pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
160+ # if [[ -n "${pods}" ]]; then
161+ # kubectl describe ${pods}
162+ # fi
163+ # # Clean up in case of errors as well as success
164+ # - name: Delete Kubernetes job
165+ # if: always()
166+ # run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml
0 commit comments