9090 .github/eks-workflow-files/mpi-nccl-test.yml
9191 git diff .github/eks-workflow-files/mpi-nccl-test.yml
9292 - name : Submit Kubernetes job
93- run : kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
93+ id : submit_job
94+ run : |
95+ echo "Check whether the cluster is under maintenance"
96+ if kubectl get nodes -o custom-columns=TAINTS:.spec.taints | grep "maintenance"; then
97+ echo "Cluster is under maintenance, skipping job submission"
98+ echo "continue-run=false" >> "$GITHUB_OUTPUT"
99+ else
100+ kubectl apply -f .github/eks-workflow-files/mpi-nccl-test.yml
101+ echo "continue-run=true" >> "$GITHUB_OUTPUT"
102+ fi
94103 - name : Wait for Kubernetes job to start
104+ if : steps.submit_job.outputs.continue-run == 'true'
95105 # Note that this is *not* using JOB_NAME
96106 run : |
97107 # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
@@ -100,6 +110,7 @@ jobs:
100110 kubectl wait --for=create job/${LAUNCHER_NAME}
101111 kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${LAUNCHER_NAME} --timeout=14400s
102112 - name : Stream Kubernetes job output
113+ if : steps.submit_job.outputs.continue-run == 'true'
103114 # Note that this is *not* JOB_NAME
104115 run : |
105116 # Streaming logs will fail if the container/pod is still pending
@@ -110,6 +121,7 @@ jobs:
110121 # prefixes lines with a rather verbose tag
111122 kubectl logs --follow job/${LAUNCHER_NAME}
112123 - name : Retrieve Kubernetes job status
124+ if : steps.submit_job.outputs.continue-run == 'true'
113125 shell : bash -exo pipefail {0}
114126 run : |
115127 while readarray -d : -t status < <(kubectl get job/${LAUNCHER_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
@@ -130,7 +142,7 @@ jobs:
130142 # Provide more debug output in case of failure; note that some kinds of launch
131143 # failure do not produce any log output.
132144 - name : Debug failed Kubernetes job
133- if : failure()
145+ if : failure() && steps.submit_job.outputs.continue-run == 'true'
134146 run : |
135147 # Provide better debug in case of launch failures that will not produce log output
136148 pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${LAUNCHER_NAME} -o name)
0 commit comments