Skip to content

Commit b601395

Browse files
authored
K8s job monitoring bugfix (#1507)
1 parent 77d5161 commit b601395

File tree

1 file changed: +26 -27 lines changed

.github/actions/submit-delete-k8s-job/action.yml

Lines changed: 26 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -20,59 +20,58 @@ runs:
 TIMEOUT_JOB_CREATION=60s
 TIMEOUT_JOB_WAIT=14400s
 TIMEOUT_JOB_START=600s
+INPUT_JOB_NAME=${{ inputs.job-name }}
+INPUT_JOB_CONFIG_FILE=${{ inputs.job-config-file }}
 
 echo "Submit K8s job"
-kubectl apply -f "${{ inputs.job-config-file }}"
-kubectl get event | grep ${{ inputs.job-name }}
+kubectl apply -f "${INPUT_JOB_CONFIG_FILE}"
+kubectl get event | grep ${INPUT_JOB_NAME}
 # Wait for job to be created
-kubectl wait --for=create job/${{ inputs.job-name }} --timeout=$TIMEOUT_JOB_CREATION
-
+kubectl wait --for=create job/${INPUT_JOB_NAME} --timeout=$TIMEOUT_JOB_CREATION
 # Wait for job to be unsuspended
-kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=$TIMEOUT_JOB_WAIT
-
+kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${INPUT_JOB_NAME} --timeout=$TIMEOUT_JOB_WAIT
 # Wait for pods to be running
 kubectl wait --for=condition=Ready \
-  --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} \
+  --selector=batch.kubernetes.io/job-name=${INPUT_JOB_NAME} \
   --timeout=$TIMEOUT_JOB_START pod
 
 # Stream logs
-kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }}
+kubectl logs --all-containers=true --all-pods=true --follow job/${INPUT_JOB_NAME}
 
 # Detect job parallelism
-parallelism=$(kubectl get job/"${{ inputs.job-name }}" -o jsonpath='{.spec.parallelism}')
+parallelism=$(kubectl get job/${INPUT_JOB_NAME} -o jsonpath='{.spec.parallelism}')
 # if parallelism is not set, use default value of 1
-echo "Parallelism ${parallelism}"
 if [ -z "${parallelism}" ]; then
   echo "No parallelism specified, defaulting to 1"
   parallelism=1
 fi
 
-while IFS=: read -r failures successes; do
-  failures="${failures:-0}"
-  successes="${successes:-0}"
+while true; do
+  job_status_counts=$(kubectl get job/${INPUT_JOB_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}')
+
+  IFS=:
+  set -- $job_status_counts
+  failures=${1:-0}
+  successes=${2:-0}
+
   total=$((failures + successes))
 
+  echo "status: failures=${failures}, successes=${successes}, total=${total}, parallelism=${parallelism}"
+
   if [ $total -lt $parallelism ]; then
     # neither "failed" nor "succeeded", so wait
-    sleep 1
-  elif [ $total -eq $parallelism ]; then
-    # we have total=parallelism => either X successes or X failures
-    # In any case, the job is done
-    break
-  else
-    # Log here
-    echo "Unexpected number of completed pods ${total} with parallelism ${parallelism}"
-    exit 255
+    sleep 2
+    continue
   fi
-done <<EOF
-$(kubectl get job/"${{ inputs.job-name }}" -o 'jsonpath={.status.failed}:{.status.succeeded}')
-EOF
+  break
+done
+
 
 # If job indicates a failure try to print out the info
 if [ "${failures:-0}" -gt 0 ]; then
-  echo "Job ${{ inputs.job-name }} has $failures failures"
+  echo "Job ${INPUT_JOB_NAME} has $failures failures"
   # this is for batch jobs only
-  pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o name)
+  pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${INPUT_JOB_NAME} -o name)
   if [ -n "${pods}" ]; then
     kubectl describe ${pods}
   fi

0 commit comments

Comments
 (0)