Skip to content

Commit c0e4af6

Browse files
committed
Disable EKS workflows
1 parent 1caecc9 commit c0e4af6

File tree

2 files changed

+153
-146
lines changed

2 files changed

+153
-146
lines changed

.github/workflows/_test_nccl.yaml

Lines changed: 144 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -20,142 +20,147 @@ jobs:
2020
JAX_IMAGE: ${{ inputs.CONTAINER }}
2121
secrets: inherit
2222

23-
build-mpi-operator-compatible-base:
24-
runs-on: [self-hosted, "amd64", "large"]
25-
steps:
26-
- name: Login to nvcr.io Container Registry
27-
uses: docker/login-action@v3
28-
with:
29-
registry: nvcr.io
30-
username: $oauthtoken
31-
password: ${{ secrets.NVCR_TOKEN }}
32-
- name: Checkout repository
33-
uses: actions/checkout@v4
34-
- name: Build MPI operator compatible base container
35-
id: build
36-
uses: ./.github/actions/build-container
37-
with:
38-
ARCHITECTURE: amd64
39-
ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
40-
BADGE_FILENAME: badge-mpi-operator-compatible-base-build
41-
BUILD_DATE: 0000-00-00 # not important; this image is never published
42-
BASE_IMAGE: ${{ inputs.CONTAINER }}
43-
CONTAINER_NAME: mpi-operator-compatible-base
44-
DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
45-
RUNNER_SIZE: small
46-
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
47-
ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
48-
github-token: ${{ secrets.GITHUB_TOKEN }}
49-
bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
50-
outputs:
51-
DOCKER_TAG_MEALKIT: ${{ steps.build.outputs.DOCKER_TAG_MEALKIT }}
52-
DOCKER_TAG_FINAL: ${{ steps.build.outputs.DOCKER_TAG_FINAL }}
53-
54-
nccl-test:
55-
needs: build-mpi-operator-compatible-base
56-
strategy:
57-
matrix:
58-
test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
59-
runs-on: eks
60-
env:
61-
BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
62-
TEST_NAME: ${{ matrix.test }}
63-
steps:
64-
- name: Checkout repository
65-
uses: actions/checkout@v4
66-
- name: Login to GitHub Container Registry
67-
uses: docker/login-action@v3
68-
with:
69-
registry: ghcr.io
70-
username: ${{ github.repository_owner }}
71-
password: ${{ secrets.GITHUB_TOKEN }}
72-
- name: Create env vars
73-
id: var
74-
shell: bash
75-
run: |
76-
JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
77-
LAUNCHER_NAME="${JOB_NAME}-launcher"
78-
TOKEN_NAME="${JOB_NAME}-token"
79-
# Make these available to later steps
80-
echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
81-
echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
82-
- name: K8s GHCR store and delete token
83-
id: store-token
84-
uses: ./.github/actions/store-delete-k8s-ghcr
85-
- name: Configure Kubernetes job
86-
run: |
87-
export WORKER_NAME="${JOB_NAME}-worker"
88-
yq -i '.metadata.name = strenv(JOB_NAME)
89-
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
90-
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
91-
| .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
92-
| .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
93-
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
94-
| .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
95-
| .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
96-
.github/eks-workflow-files/mpi-nccl-test.yml
97-
git diff .github/eks-workflow-files/mpi-nccl-test.yml
98-
- name: Submit Kubernetes job
99-
id: submit_job
100-
run: |
101-
echo "Check whether the cluster is under maintenance"
102-
if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
103-
echo "Cluster is under maintenance, skipping job submission"
104-
echo "continue-run=false" >> "$GITHUB_OUTPUT"
105-
else
106-
kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
107-
echo "continue-run=true" >> "$GITHUB_OUTPUT"
108-
fi
109-
- name: Wait for Kubernetes job to start
110-
if: steps.submit_job.outputs.continue-run == 'true'
111-
# Note that this is *not* using JOB_NAME
112-
run: |
113-
# Launcher job is created eagerly, but suspended. Kueue un-suspends it when
114-
# resources are available, but that is where there can be a long wait if the
115-
# cluster is busy executing other jobs.
116-
kubectl wait --for=create job/${LAUNCHER_NAME}
117-
kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=14400s
118-
- name: Stream Kubernetes job output
119-
if: steps.submit_job.outputs.continue-run == 'true'
120-
# Note that this is *not* JOB_NAME
121-
run: |
122-
# Streaming logs will fail if the container/pod is still pending
123-
while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
124-
sleep 1
125-
done
126-
# TODO: --all-containers=true --all-pods=true could make sense here, but it
127-
# prefixes lines with a rather verbose tag
128-
kubectl logs --follow job/${LAUNCHER_NAME}
129-
- name: Retrieve Kubernetes job status
130-
if: steps.submit_job.outputs.continue-run == 'true'
131-
shell: bash -exo pipefail {0}
132-
run: |
133-
while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
134-
failure=${status[0]:-0}
135-
success=${status[1]:-0}
136-
total=$((failure+success))
137-
if [[ ${total} < 1 ]]; then
138-
sleep 1
139-
elif [[ ${total} == 1 ]]; then
140-
break
141-
else
142-
# Shouldn't happen, maybe a sign the job being monitored does not have a
143-
# single launcher pod?
144-
exit 255
145-
fi
146-
done
147-
exit ${failure}
148-
# Provide more debug output in case of failure; note that some kinds of launch
149-
# failure do not produce any log output.
150-
- name: Debug failed Kubernetes job
151-
if: failure() && steps.submit_job.outputs.continue-run == 'true'
152-
run: |
153-
# Provide better debug in case of launch failures that will not produce log output
154-
pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
155-
if [[ -n "${pods}" ]]; then
156-
kubectl describe ${pods}
157-
fi
158-
# Clean up in case of errors as well as success
159-
- name: Delete Kubernetes job
160-
if: always()
161-
run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml
23+
# Disabled while the EKS cluster is offline for maintenance; run manually if needed.
24+
#
25+
# build-mpi-operator-compatible-base:
26+
# runs-on: [self-hosted, "amd64", "large"]
27+
# steps:
28+
# - name: Login to nvcr.io Container Registry
29+
# uses: docker/login-action@v3
30+
# with:
31+
# registry: nvcr.io
32+
# username: $oauthtoken
33+
# password: ${{ secrets.NVCR_TOKEN }}
34+
# - name: Checkout repository
35+
# uses: actions/checkout@v4
36+
# - name: Build MPI operator compatible base container
37+
# id: build
38+
# uses: ./.github/actions/build-container
39+
# with:
40+
# ARCHITECTURE: amd64
41+
# ARTIFACT_NAME: artifact-mpi-operator-compatible-base-build
42+
# BADGE_FILENAME: badge-mpi-operator-compatible-base-build
43+
# BUILD_DATE: 0000-00-00 # not important; this image is never published
44+
# BASE_IMAGE: ${{ inputs.CONTAINER }}
45+
# CONTAINER_NAME: mpi-operator-compatible-base
46+
# DOCKERFILE: .github/container/Dockerfile.mpi-operator-compatible-base
47+
# RUNNER_SIZE: small
48+
# ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
49+
# ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
50+
# github-token: ${{ secrets.GITHUB_TOKEN }}
51+
# bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
52+
# outputs:
53+
# DOCKER_TAG_MEALKIT: ${{ steps.build.outputs.DOCKER_TAG_MEALKIT }}
54+
# DOCKER_TAG_FINAL: ${{ steps.build.outputs.DOCKER_TAG_FINAL }}
55+
#
56+
#
57+
# Disabled while the EKS cluster is offline for maintenance; run manually if needed.
58+
#
59+
# nccl-test:
60+
# needs: build-mpi-operator-compatible-base
61+
# strategy:
62+
# matrix:
63+
# test: [all_gather_perf_mpi, all_reduce_perf_mpi, broadcast_perf_mpi, reduce_scatter_perf_mpi]
64+
# runs-on: eks
65+
# env:
66+
# BASE_IMAGE: ${{ needs.build-mpi-operator-compatible-base.outputs.DOCKER_TAG_FINAL }}
67+
# TEST_NAME: ${{ matrix.test }}
68+
# steps:
69+
# - name: Checkout repository
70+
# uses: actions/checkout@v4
71+
# - name: Login to GitHub Container Registry
72+
# uses: docker/login-action@v3
73+
# with:
74+
# registry: ghcr.io
75+
# username: ${{ github.repository_owner }}
76+
# password: ${{ secrets.GITHUB_TOKEN }}
77+
# - name: Create env vars
78+
# id: var
79+
# shell: bash
80+
# run: |
81+
# JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}"
82+
# LAUNCHER_NAME="${JOB_NAME}-launcher"
83+
# TOKEN_NAME="${JOB_NAME}-token"
84+
# # Make these available to later steps
85+
# echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV"
86+
# echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV"
87+
# - name: K8s GHCR store and delete token
88+
# id: store-token
89+
# uses: ./.github/actions/store-delete-k8s-ghcr
90+
# - name: Configure Kubernetes job
91+
# run: |
92+
# export WORKER_NAME="${JOB_NAME}-worker"
93+
# yq -i '.metadata.name = strenv(JOB_NAME)
94+
# | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE)
95+
# | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME)
96+
# | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
97+
# | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME)
98+
# | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE)
99+
# | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = strenv(WORKER_NAME)
100+
# | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
101+
# .github/eks-workflow-files/mpi-nccl-test.yml
102+
# git diff .github/eks-workflow-files/mpi-nccl-test.yml
103+
# - name: Submit Kubernetes job
104+
# id: submit_job
105+
# run: |
106+
# echo "Check whether the cluster is under maintenance"
107+
# if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
108+
# echo "Cluster is under maintenance, skipping job submission"
109+
# echo "continue-run=false" >> "$GITHUB_OUTPUT"
110+
# else
111+
# kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
112+
# echo "continue-run=true" >> "$GITHUB_OUTPUT"
113+
# fi
114+
# - name: Wait for Kubernetes job to start
115+
# if: steps.submit_job.outputs.continue-run == 'true'
116+
# # Note that this is *not* using JOB_NAME
117+
# run: |
118+
# # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
119+
# # resources are available, but that is where there can be a long wait if the
120+
# # cluster is busy executing other jobs.
121+
# kubectl wait --for=create job/${LAUNCHER_NAME}
122+
# kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=14400s
123+
# - name: Stream Kubernetes job output
124+
# if: steps.submit_job.outputs.continue-run == 'true'
125+
# # Note that this is *not* JOB_NAME
126+
# run: |
127+
# # Streaming logs will fail if the container/pod is still pending
128+
# while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
129+
# sleep 1
130+
# done
131+
# # TODO: --all-containers=true --all-pods=true could make sense here, but it
132+
# # prefixes lines with a rather verbose tag
133+
# kubectl logs --follow job/${LAUNCHER_NAME}
134+
# - name: Retrieve Kubernetes job status
135+
# if: steps.submit_job.outputs.continue-run == 'true'
136+
# shell: bash -exo pipefail {0}
137+
# run: |
138+
# while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
139+
# failure=${status[0]:-0}
140+
# success=${status[1]:-0}
141+
# total=$((failure+success))
142+
# if [[ ${total} < 1 ]]; then
143+
# sleep 1
144+
# elif [[ ${total} == 1 ]]; then
145+
# break
146+
# else
147+
# # Shouldn't happen, maybe a sign the job being monitored does not have a
148+
# # single launcher pod?
149+
# exit 255
150+
# fi
151+
# done
152+
# exit ${failure}
153+
# # Provide more debug output in case of failure; note that some kinds of launch
154+
# # failure do not produce any log output.
155+
# - name: Debug failed Kubernetes job
156+
# if: failure() && steps.submit_job.outputs.continue-run == 'true'
157+
# run: |
158+
# # Provide better debug in case of launch failures that will not produce log output
159+
# pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
160+
# if [[ -n "${pods}" ]]; then
161+
# kubectl describe ${pods}
162+
# fi
163+
# # Clean up in case of errors as well as success
164+
# - name: Delete Kubernetes job
165+
# if: always()
166+
# run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml

.github/workflows/ngc-release-testing.yaml

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,14 @@ jobs:
3131
CONTAINER: ${{ inputs.JAX_IMAGE }}
3232
secrets: inherit
3333

34-
test-maxtext-eks:
35-
if: inputs.MAXTEXT_IMAGE != ''
36-
uses: ./.github/workflows/_test_maxtext_k8s.yaml
37-
with:
38-
MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
39-
secrets: inherit
34+
# Disabled while the EKS cluster is offline for maintenance; run manually if needed.
35+
#
36+
# test-maxtext-eks:
37+
# if: inputs.MAXTEXT_IMAGE != ''
38+
# uses: ./.github/workflows/_test_maxtext_k8s.yaml
39+
# with:
40+
# MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
41+
# secrets: inherit
4042

4143
test-maxtext-gke:
4244
if: inputs.MAXTEXT_IMAGE != ''
@@ -46,7 +48,7 @@ jobs:
4648
secrets: inherit
4749

4850
finalize:
49-
needs: [ test-nccl, test-maxtext-eks, test-maxtext-gke ]
51+
needs: [ test-nccl, test-maxtext-gke] # test-maxtext-eks omitted while the EKS cluster is offline
5052
if: "!cancelled()"
5153
uses: ./.github/workflows/_finalize.yaml
5254
secrets: inherit

0 commit comments

Comments
 (0)