@@ -488,43 +488,142 @@ jobs:
488488 # ARTIFACTS: |
489489 # test-equinox.log
490490 # secrets: inherit
491-
492- te-unittests :
493- secrets : inherit
491+ test-transformerengine-eks :
494492 needs : build-jax
495- if : inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
496- uses : ./.github/workflows/_test_unit.yaml
497- with :
498- TEST_NAME : te
499- EXECUTE : |
500- docker run -i --gpus all --shm-size=1g -v $PWD:/log \
501- ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
502- bash <<"EOF" |& tee test-te.log
503- pip install pytest-reportlog pytest-xdist
504- # Start MPS daemon
505- nvidia-cuda-mps-control -d
506- # TE's default is slightly different, without the hyphen
507- export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
508- # 1 GPU per worker, 6 workers per GPU
509- pytest-xdist.sh 1 6 pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
510- EOF
493+ if : inputs.ARCHITECTURE == 'amd64'  # arm64 runners n/a
494+ runs-on : eks  # runner label; the job body drives a Kubernetes Job rather than running tests locally
495+ env :
496+ JAX_DOCKER_IMAGE : ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}  # image produced by the build-jax job; written into the manifest below
497+ JOB_NAME : transformerengine-${{ github.run_id }}  # run_id suffix keeps the job name distinct per workflow run
498+ steps :
499+ - name : Check out the repository  # required for the repo-local ./.github actions and the job manifest used below
500+ uses : actions/checkout@v4
501+ - name : Login to GitHub Container Registry
502+ uses : docker/login-action@v3
503+ with :
504+ registry : ghcr.io
505+ username : ${{ github.repository_owner }}
506+ password : ${{ secrets.GITHUB_TOKEN }}  # workflow-provided token, no extra registry secret
507+ - name : K8s GHCR store and delete token
508+ id : store-token  # its token-name output becomes the manifest's imagePullSecrets name
509+ uses : ./.github/actions/store-delete-k8s-ghcr
510+ - name : Configure job manifest  # patches the checked-in manifest in place (yq -i) for this run
511+ run : |  # only document 0 (di == 0) is edited: job name, containers[0] image, containers[1] env[0] value (run id), imagePullSecrets name
512+ yq -i ea '
513+ select(di == 0).metadata.name = strenv(JOB_NAME)
514+ | select(di == 0).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
515+ | select(di == 0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}"
516+ | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
517+ .github/eks-workflow-files/transformer-engine/unit-tests.yml
518+ git diff .github/eks-workflow-files/transformer-engine/unit-tests.yml  # show the applied substitutions in the job log
519+
520+ - name : Submit & delete transformer engine unit test job
521+ uses : ./.github/actions/submit-delete-k8s-job  # repo-local action; NOTE(review): presumably waits for the job to finish before deleting it - confirm
522+ with :
523+ job-config-file : .github/eks-workflow-files/transformer-engine/unit-tests.yml  # the manifest patched by the previous step
524+ job-name : ${{ env.JOB_NAME }}  # same value that was written to metadata.name
511525
512- STATISTICS_SCRIPT : |
513- summary_line=$(tail -n1 test-te.log)
514- errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
515- passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
516- failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
526+ - name : Download logs from S3  # also derives pass/fail counts from summary.txt
527+ id : log-s3  # the sitrep step reads steps.log-s3.outputs.{TOTAL,PASSED,FAILED}_TESTS
528+ run : |
529+ NAME=transformer-engine
530+ LOCAL_DIR=$NAME-output  # i.e. transformer-engine-output
531+
532+ mkdir -p $LOCAL_DIR
533+ aws s3 cp s3://jax-toolbox-eks-output/$NAME/${{ github.run_id }}/summary.txt $LOCAL_DIR/  # results are stored under this workflow run's id
534+ aws s3 cp s3://jax-toolbox-eks-output/$NAME/${{ github.run_id }}/ $LOCAL_DIR/ --recursive --exclude "*" --include "*.log"  # exclude everything, then re-include only *.log
535+
536+ passed_tests=$(grep -c ": PASSED" $LOCAL_DIR/summary.txt || true)  # grep -c exits non-zero on zero matches; "|| true" keeps that from failing the step
537+ failed_tests=$(grep -c ": FAILED" $LOCAL_DIR/summary.txt || true)
517538 total_tests=$((failed_tests + passed_tests))
518- echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
519- echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
520- echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
521- echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
522539
540+ echo "Passed tests: $passed_tests"  # human-readable copy for the job log
541+ echo "Failed tests: $failed_tests"
542+ echo "Total tests: $total_tests"
543+ echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT  # publish as step outputs
544+ echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
545+ echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
546+
547+ - name : Generate sitrep  # writes sitrep.json and the badge JSON from the log-s3 step outputs
548+ id : sitrep
549+ if : ${{ !cancelled() }}  # also runs after a failed earlier step, so the log-s3 outputs may be empty
550+ shell : bash -x -e {0}
551+ run : |
552+ # bring in utility functions (to_json, used below, is expected to come from this script)
553+ source .github/workflows/scripts/to_json.sh
554+
555+ badge_label='TransformerEngine EKS Unit'
527556
557+ total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS || 0 }} \
558+ failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS || 0 }} \
559+ passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS || 0 }} \
560+ errors="0" \
561+ summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
562+ badge_message="Passed $passed_tests out of $total_tests." \
563+ badge_color="brightgreen"  # counters above fall back to 0 when the log-s3 step produced no outputs
564+ if [ "$failed_tests" -gt 0 ] || [ "$total_tests" -eq 0 ]; then  # no reported tests must not yield a green badge
565+ badge_color="red"
566+ fi
567+
568+ to_json \
569+ summary \
570+ errors total_tests passed_tests failed_tests \
571+ badge_label badge_color badge_message \
572+ > sitrep.json
573+
574+ schemaVersion=1 \
575+ label="${badge_label}" \
576+ message="${badge_message}" \
577+ color=$badge_color \
578+ to_json schemaVersion label message color \
579+ > badge-transformer-engine-test.json
580+
581+ - name : Upload artifacts  # publishes the sitrep, the badge JSON and the logs fetched from S3
582+ if : ${{ !cancelled() }}  # upload whatever exists even when an earlier step failed
583+ uses : actions/upload-artifact@v4
584+ with :
585+ name : "artifact-transformer-engine-test"
586+ path : |  # last entry must match LOCAL_DIR of the download step (transformer-engine-output)
587+ sitrep.json
588+ badge-transformer-engine-test.json
589+ transformer-engine-output/*
590+
591+ # te-unittests:
592+ # secrets: inherit
593+ # needs: build-jax
594+ # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
595+ # uses: ./.github/workflows/_test_unit.yaml
596+ # with:
597+ # TEST_NAME: te
598+ # EXECUTE: |
599+ # docker run -i --gpus all --shm-size=1g -v $PWD:/log \
600+ # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
601+ # bash <<"EOF" |& tee test-te.log
602+ # pip install pytest-reportlog pytest-xdist
603+ # # Start MPS daemon
604+ # nvidia-cuda-mps-control -d
605+ # # TE's default is slightly different, without the hyphen
606+ # export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
607+ # # 1 GPU per worker, 6 workers per GPU
608+ # pytest-xdist.sh 1 6 pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
609+ # EOF
610+ #
611+ # STATISTICS_SCRIPT: |
612+ # summary_line=$(tail -n1 test-te.log)
613+ # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
614+ # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
615+ # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
616+ # total_tests=$((failed_tests + passed_tests))
617+ # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
618+ # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
619+ # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
620+ # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
621+ #
622+ # TIMEOUT_MINUTES: 120
623+ # ARTIFACTS: |
624+ # test-te.log
625+ # pytest-report.jsonl
626+ #
528627# test-upstream-t5x:
529628# needs: build-upstream-t5x
530629# if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a
0 commit comments