Skip to content

Commit 7f4299e

Browse files
committed
Add transformerengine EKS job submission to CI workflow
1 parent 7433d00 commit 7f4299e

File tree

1 file changed

+131
-32
lines changed

1 file changed

+131
-32
lines changed

.github/workflows/_ci.yaml

Lines changed: 131 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -488,43 +488,142 @@ jobs:
488488
# ARTIFACTS: |
489489
# test-equinox.log
490490
# secrets: inherit
491-
492-
te-unittests:
493-
secrets: inherit
491+
test-transformerengine-eks:
494492
needs: build-jax
495-
if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
496-
uses: ./.github/workflows/_test_unit.yaml
497-
with:
498-
TEST_NAME: te
499-
EXECUTE: |
500-
docker run -i --gpus all --shm-size=1g -v $PWD:/log \
501-
${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
502-
bash <<"EOF" |& tee test-te.log
503-
pip install pytest-reportlog pytest-xdist
504-
# Start MPS daemon
505-
nvidia-cuda-mps-control -d
506-
# TE's default is slightly different, without the hyphen
507-
export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
508-
# 1 GPU per worker, 6 workers per GPU
509-
pytest-xdist.sh 1 6 pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
510-
EOF
493+
if: inputs.ARCHITECTURE == 'amd64'
494+
runs-on: eks
495+
env:
496+
JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
497+
JOB_NAME: transformerengine-${{ github.run_id }}
498+
steps:
499+
# The checkout is needed because later steps reference repository-local
# actions and manifest files under .github/.
- name: Check out the repository
500+
uses: actions/checkout@v4
501+
# Authenticate to ghcr.io as the repository owner using the workflow's
# GITHUB_TOKEN.
- name: Login to GitHub Container Registry
502+
uses: docker/login-action@v3
503+
with:
504+
registry: ghcr.io
505+
username: ${{ github.repository_owner }}
506+
password: ${{ secrets.GITHUB_TOKEN }}
507+
# Repository-local action. Its `token-name` output is written into the job
# manifest's imagePullSecrets by the "Configure job manifest" step.
# NOTE(review): presumably it creates the pull secret in the cluster and
# removes it again afterwards - confirm against the action's definition.
- name: K8s GHCR store and delete token
508+
id: store-token
509+
uses: ./.github/actions/store-delete-k8s-ghcr
510+
# Patch the checked-in Kubernetes manifest for this run: job name, container
# image, run id and image pull secret name.
- name: Configure job manifest
511+
run: |
512+
# `yq -i ea` rewrites the file in place with all documents loaded;
# `select(di == 0)` restricts every assignment to the first document.
# NOTE(review): containers[0], containers[1] and env[0] are positional -
# confirm they match the container/env order in unit-tests.yml.
yq -i ea '
513+
select(di == 0).metadata.name = strenv(JOB_NAME)
514+
| select(di == 0).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
515+
| select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
516+
| select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
517+
.github/eks-workflow-files/transformer-engine/unit-tests.yml
518+
# Show the resulting manifest changes in the step log.
git diff .github/eks-workflow-files/transformer-engine/unit-tests.yml
519+
520+
# Hand the patched manifest to the repository-local submit action. job-name is
# the same JOB_NAME value that was written into the manifest's metadata.name.
# NOTE(review): the step name suggests the action also deletes the job once it
# has finished - confirm in the action's definition.
- name: Submit & delete transformer engine unit test job
521+
uses: ./.github/actions/submit-delete-k8s-job
522+
with:
523+
job-config-file: .github/eks-workflow-files/transformer-engine/unit-tests.yml
524+
job-name: ${{ env.JOB_NAME }}
511525

512-
STATISTICS_SCRIPT: |
513-
summary_line=$(tail -n1 test-te.log)
514-
errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
515-
passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
516-
failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
526+
# Fetch this run's results from S3 and publish the pass/fail counts as step
# outputs (PASSED_TESTS, FAILED_TESTS, TOTAL_TESTS).
- name: Download logs from S3
527+
id: log-s3
528+
run: |
529+
NAME=transformer-engine
530+
LOCAL_DIR=$NAME-output
531+
532+
mkdir -p $LOCAL_DIR
533+
# summary.txt is copied on its own; the recursive copy below then pulls only
# *.log files (everything excluded, *.log re-included).
aws s3 cp s3://jax-toolbox-eks-output/$NAME/${{ github.run_id }}/summary.txt $LOCAL_DIR/
534+
aws s3 cp s3://jax-toolbox-eks-output/$NAME/${{ github.run_id }}/ $LOCAL_DIR/ --recursive --exclude "*" --include "*.log"
535+
536+
# Count result lines in summary.txt. grep -c exits non-zero when nothing
# matches, so `|| true` keeps a zero count from aborting the step when the
# shell runs with errexit (GitHub's default for run steps).
passed_tests=$(grep -c ": PASSED" $LOCAL_DIR/summary.txt || true)
537+
failed_tests=$(grep -c ": FAILED" $LOCAL_DIR/summary.txt || true)
517538
total_tests=$((failed_tests + passed_tests))
518-
echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
519-
echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
520-
echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
521-
echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
522539
523-
TIMEOUT_MINUTES: 120
524-
ARTIFACTS: |
525-
test-te.log
526-
pytest-report.jsonl
540+
echo "Passed tests: $passed_tests"
541+
echo "Failed tests: $failed_tests"
542+
echo "Total tests: $total_tests"
543+
echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
544+
echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
545+
echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
546+
547+
# Turn the counts published by the log-s3 step into sitrep.json and a badge
# JSON file. The step runs unless the workflow was cancelled, so it can
# execute when log-s3 produced no outputs; missing counts default to 0.
- name: Generate sitrep
548+
id: sitrep
549+
if: ${{ !cancelled() }}
550+
shell: bash -x -e {0}
551+
run: |
552+
# bring in utility functions
553+
source .github/workflows/scripts/to_json.sh
554+
555+
badge_label='TransformerEngine EKS Unit'
527556
557+
# Plain, separate assignments: later values reference earlier ones, and an
# empty step output falls back to 0 so the numeric test below is well-formed.
total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS || 0 }}
558+
failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS || 0 }}
559+
passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS || 0 }}
560+
errors="0"
561+
summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests."
562+
badge_message="Passed $passed_tests out of $total_tests."
563+
badge_color="brightgreen"
564+
if [ "$failed_tests" -gt 0 ]; then
565+
badge_color="red"
566+
# No line continuation after `fi`: the next command must start on its own line.
fi
567+
568+
# to_json (from to_json.sh) is given variable names, not values.
to_json \
569+
summary \
570+
errors total_tests passed_tests failed_tests \
571+
badge_label badge_color badge_message \
572+
> sitrep.json
573+
574+
# These assignments prefix the to_json call on one continued command line,
# so they are scoped to that call only.
schemaVersion=1 \
575+
label="${badge_label}" \
576+
message="Passed $passed_tests out of $total_tests." \
577+
color=$badge_color \
578+
to_json schemaVersion label message color \
579+
> badge-transformer-engine-test.json
580+
581+
# Publish the sitrep, the badge and everything the log-s3 step downloaded into
# transformer-engine-output/, even if an earlier step failed.
- name: Upload artifacts
582+
if: ${{ !cancelled() }}
583+
uses: actions/upload-artifact@v4
584+
with:
585+
name: "artifact-transformer-engine-test"
586+
path: |
587+
sitrep.json
588+
badge-transformer-engine-test.json
589+
# Must match LOCAL_DIR ($NAME-output with NAME=transformer-engine) used by the
# log-s3 step; the previous spelling "trasformer-" matched no files.
transformer-engine-output/*
590+
591+
# te-unittests:
592+
# secrets: inherit
593+
# needs: build-jax
594+
# if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
595+
# uses: ./.github/workflows/_test_unit.yaml
596+
# with:
597+
# TEST_NAME: te
598+
# EXECUTE: |
599+
# docker run -i --gpus all --shm-size=1g -v $PWD:/log \
600+
# ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
601+
# bash <<"EOF" |& tee test-te.log
602+
# pip install pytest-reportlog pytest-xdist
603+
# # Start MPS daemon
604+
# nvidia-cuda-mps-control -d
605+
# # TE's default is slightly different, without the hyphen
606+
# export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
607+
# # 1 GPU per worker, 6 workers per GPU
608+
# pytest-xdist.sh 1 6 pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
609+
# EOF
610+
#
611+
# STATISTICS_SCRIPT: |
612+
# summary_line=$(tail -n1 test-te.log)
613+
# errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
614+
# passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
615+
# failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
616+
# total_tests=$((failed_tests + passed_tests))
617+
# echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
618+
# echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
619+
# echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
620+
# echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
621+
#
622+
# TIMEOUT_MINUTES: 120
623+
# ARTIFACTS: |
624+
# test-te.log
625+
# pytest-report.jsonl
626+
#
528627
# test-upstream-t5x:
529628
# needs: build-upstream-t5x
530629
# if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a

0 commit comments

Comments
 (0)