Skip to content

Commit 18b8fd1

Browse files
authored
Check whether the EKS cluster is under maintenance and skip steps (#1658)
Small modification to the CI to skip running jobs on EKS for the moment, as the cluster is under maintenance.
1 parent 25de46b commit 18b8fd1

File tree

2 files changed

+20
-2
lines changed

2 files changed

+20
-2
lines changed

.github/actions/submit-delete-k8s-job/action.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ runs:
1616
uses: ./.github/actions/with-post-step
1717
with:
1818
main: |
19+
echo "Checking for cluster maintenance taint..."
20+
if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
21+
echo "Cluster is under maintenance, skipping job submission."
22+
exit 0 # Exit successfully without running the rest of the script
23+
fi
24+
1925
set -x
2026
TIMEOUT_JOB_CREATION=60s
2127
TIMEOUT_JOB_WAIT=14400s

.github/workflows/_test_nccl.yaml

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,18 @@ jobs:
9090
.github/eks-workflow-files/mpi-nccl-test.yml
9191
git diff .github/eks-workflow-files/mpi-nccl-test.yml
9292
- name: Submit Kubernetes job
93-
run: kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
93+
id: submit_job
94+
run: |
95+
echo "Check whether the cluster is under maintenance"
96+
if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
97+
echo "Cluster is under maintenance, skipping job submission"
98+
echo "continue-run=false" >> "$GITHUB_OUTPUT"
99+
else
100+
kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
101+
echo "continue-run=true" >> "$GITHUB_OUTPUT"
102+
fi
94103
- name: Wait for Kubernetes job to start
104+
if: steps.submit_job.outputs.continue-run == 'true'
95105
# Note that this is *not* using JOB_NAME
96106
run: |
97107
# Launcher job is created eagerly, but suspended. Kueue un-suspends it when
@@ -100,6 +110,7 @@ jobs:
100110
kubectl wait --for=create job/${LAUNCHER_NAME}
101111
kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=14400s
102112
- name: Stream Kubernetes job output
113+
if: steps.submit_job.outputs.continue-run == 'true'
103114
# Note that this is *not* JOB_NAME
104115
run: |
105116
# Streaming logs will fail if the container/pod is still pending
@@ -110,6 +121,7 @@ jobs:
110121
# prefixes lines with a rather verbose tag
111122
kubectl logs --follow job/${LAUNCHER_NAME}
112123
- name: Retrieve Kubernetes job status
124+
if: steps.submit_job.outputs.continue-run == 'true'
113125
shell: bash -exo pipefail {0}
114126
run: |
115127
while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
@@ -130,7 +142,7 @@ jobs:
130142
# Provide more debug output in case of failure; note that some kinds of launch
131143
# failure do not produce any log output.
132144
- name: Debug failed Kubernetes job
133-
if: failure()
145+
if: failure() && steps.submit_job.outputs.continue-run == 'true'
134146
run: |
135147
# Provide better debug in case of launch failures that will not produce log output
136148
pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)

0 commit comments

Comments
 (0)