Linux Transformers Test #318
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Linux Transformers Test

# Triggers: weekly schedule, pull requests against main that touch the
# Transformers test scripts/workflow, and manual dispatch with overridable
# component versions.
on:
  schedule:
    # 16:00 UTC on Saturday, i.e. 0:00 Sunday in GMT+8
    - cron: '0 16 * * 6'
  pull_request:
    branches:
      - main
    paths:
      - '.github/scripts/check-transformers.py'
      - '.github/scripts/spec.py'
      - '.github/workflows/_linux_transformers.yml'
  workflow_dispatch:
    inputs:
      python:
        required: false
        type: string
        default: '3.10'
        description: Python version
      runner:
        required: true
        type: string
        default: 'pvc_rolling'
        description: Runner label
      # NOTE(review): `driver` is not referenced anywhere in the visible
      # workflow - confirm whether it is still needed.
      driver:
        required: false
        type: string
        default: 'lts'
        description: Driver lts/rolling
      # NOTE(review): `nightly_whl` is not referenced anywhere in the visible
      # workflow - confirm whether it is still needed.
      nightly_whl:
        required: false
        type: string
        default: ''
        description: Pytorch nightly wheel version
      accelerate:
        required: false
        type: string
        default: 'v1.7.0'
        description: Accelerate version
      datasets:
        required: false
        type: string
        default: 'v3.6.0'
        description: Datasets version
      transformers:
        required: false
        type: string
        default: 'v4.51.3'
        description: Transformers version
# The workflow token gets read-only access to every permission scope.
permissions: read-all

# Runs are grouped per workflow and pull request number (falling back to the
# git ref for non-PR triggers); starting a new run in a group cancels the one
# that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
# Environment shared by every job in the workflow.
env:
  HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
  DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }}
  HF_HUB_ETAG_TIMEOUT: 120
  HF_HUB_DOWNLOAD_TIMEOUT: 120
  # Component versions: take the workflow_dispatch input when it is non-empty,
  # otherwise fall back to the pinned default (inputs are empty for schedule
  # and pull_request triggers).
  python: ${{ inputs.python != '' && inputs.python || '3.10' }}
  accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.7.0' }}
  datasets: ${{ inputs.datasets != '' && inputs.datasets || 'v3.6.0' }}
  transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.51.3' }}
  # System packages installed with apt-get by the tests job.
  PACKAGES: |
    espeak-ng
    git-lfs
    pkg-config
    libavcodec-dev
    libavdevice-dev
    libavfilter-dev
    libavformat-dev
    libavutil-dev
    libswresample-dev
    libswscale-dev
    pciutils
  # pip arguments selecting the PyTorch nightly XPU wheel index.
  TORCH_INDEX: '--pre --index-url https://download.pytorch.org/whl/nightly/xpu'
  AGENT_TOOLSDIRECTORY: /tmp/xpu-tool

# Every `run` step is executed as `bash <script>`. This custom shell template
# carries no `-e`/`-o pipefail` flags, so a script only stops on an error
# where it asks for that itself.
defaults:
  run:
    shell: bash {0}
jobs:
  # Reads the pull request view and publishes every line containing a
  # `disable_` marker as the `disabled_tests` output, which the prepare job
  # uses to decide whether to run at all.
  conditions-filter:
    name: conditions-filter
    if: ${{ github.event.pull_request.draft == false }}
    runs-on: ubuntu-22.04
    timeout-minutes: 10
    env:
      GH_TOKEN: ${{ github.token }}
    outputs:
      disabled_tests: ${{ steps.check-pr-desc.outputs.disabled_tests }}
    steps:
      - name: Check PR infos
        id: check-pr-desc
        run: |
          set -x -e -o pipefail
          sudo apt update && sudo apt install -y dos2unix
          # A failing `gh pr view` is tolerated: its exit code is echoed into
          # the captured text instead of failing the step.
          (gh --repo ${GITHUB_REPOSITORY} pr view ${{ github.event.pull_request.number }} || echo $?) 2>&1 |tee pr-info.txt
          dos2unix pr-info.txt
          disabled_tests="$(awk '/disable_/{printf("%s ", $0)}' pr-info.txt)"
          # Append (-a) rather than truncate the step output file, as every
          # other writer of GITHUB_OUTPUT in this workflow does.
          echo "disabled_tests=${disabled_tests}" |tee -a "${GITHUB_OUTPUT}"

  # Resolves the PyTorch package versions to install and collects runner
  # details from the repository's get-runner action. Skipped when the PR asked
  # for `disable_all` or `disable_transformers`.
  prepare:
    runs-on: ${{ inputs.runner != '' && inputs.runner || 'pvc_rolling' }}
    needs: conditions-filter
    if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_transformers')) }}
    outputs:
      torch: ${{ steps.getver.outputs.torch }}
      torchvision: ${{ steps.getver.outputs.torchvision }}
      torchaudio: ${{ steps.getver.outputs.torchaudio }}
      triton: ${{ steps.getver.outputs.triton }}
      runner_id: ${{ steps.runner-info.outputs.runner_id }}
      user_id: ${{ steps.runner-info.outputs.user_id }}
      render_id: ${{ steps.runner-info.outputs.render_id }}
      hostname: ${{ steps.runner-info.outputs.hostname }}
      pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }}
    steps:
      - id: getver
        run: |
          # We can't just `pip index version...` and get the last available
          # version as pytorch packages may have tricky dependencies. Instead
          # we dry run install packages and get versions which would be installed.
          # See: https://github.com/pytorch/pytorch/issues/154687
          pip install --dry-run --ignore-installed $TORCH_INDEX \
            torch torchvision torchaudio pytorch-triton-xpu >_log.txt
          # The "Would install" line is a space separated list of name-version
          # items. Each pattern is anchored on the preceding space so that
          # `torch-` can never be matched inside `pytorch-triton-xpu-`.
          torch=$(grep "Would install" _log.txt | sed -E "s/.* torch-([^ ]*).*/\1/")
          torchvision=$(grep "Would install" _log.txt | sed -E "s/.* torchvision-([^ ]*).*/\1/")
          torchaudio=$(grep "Would install" _log.txt | sed -E "s/.* torchaudio-([^ ]*).*/\1/")
          triton=$(grep "Would install" _log.txt | sed -E "s/.* pytorch-triton-xpu-([^ ]*).*/\1/")
          echo "torch=$torch" | tee -a "$GITHUB_OUTPUT"
          echo "torchvision=$torchvision" | tee -a "$GITHUB_OUTPUT"
          echo "torchaudio=$torchaudio" | tee -a "$GITHUB_OUTPUT"
          echo "triton=$triton" | tee -a "$GITHUB_OUTPUT"
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
      - name: Get runner
        id: runner-info
        uses: ./.github/actions/get-runner

  # Runs the Hugging Face Transformers test suite, one matrix entry at a time
  # (max-parallel: 1), inside the Intel GPU container on the runner selected
  # by the prepare job.
  tests:
    needs: prepare
    runs-on: ${{ needs.prepare.outputs.runner_id }}
    container:
      image: intelgpu/ubuntu-22.04-lts2:2523.31
      volumes:
        - ${{ github.workspace }}:${{ github.workspace }}
      options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.prepare.outputs.render_id }}
        --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
        -u ${{ needs.prepare.outputs.user_id }}
        -e ZE_AFFINITY_MASK
    env:
      PYTORCH_DEBUG_XPU_FALLBACK: '1'
      TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py'
      # enable pytest parallel run, and continue others if meets crash case such as segmentation fault
      PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }}
    strategy:
      fail-fast: false
      max-parallel: 1
      matrix:
        test:
          # Excluding tests due to:
          # * https://github.com/huggingface/transformers/issues/36267 (marian tests)
          - test_case: 'tests_backbone'
            cmd: '--ignore=tests/models/marian/test_modeling_marian.py -k backbone tests'
          - test_case: "tests_py"
            cmd: "tests/*.py"
          # Excluding tests due to:
          # * torch.distributed.* not yet supported by XPU
          - test_case: 'tests_generation'
            cmd: 'tests/generation'
            filter: 'not TestFSDPGeneration'
          # breaking for each shard to take ~15-30 minutes to complete
          # Excluding tests due to:
          # * https://github.com/pytorch/pytorch/issues/140965 (aten::_linalg_eigvals)
          # * https://github.com/huggingface/transformers/issues/36267 (marian tests)
          - test_case: 'tests_models_0'
            cmd: 'tests/models --num-shards 4 --shard-id 0 --ignore=tests/models/marian/test_modeling_marian.py'
            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
          - test_case: 'tests_models_1'
            cmd: 'tests/models --num-shards 4 --shard-id 1 --ignore=tests/models/marian/test_modeling_marian.py'
            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
          - test_case: 'tests_models_2'
            cmd: 'tests/models --num-shards 4 --shard-id 2 --ignore=tests/models/marian/test_modeling_marian.py'
            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
          - test_case: 'tests_models_3'
            cmd: 'tests/models --num-shards 4 --shard-id 3 --ignore=tests/models/marian/test_modeling_marian.py'
            filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
          # Excluding tests due to:
          # * Some ray tests hang, reason unknown
          # * torch.distributed.* not yet supported by XPU
          - test_case: 'tests_trainer'
            cmd: 'tests/trainer'
            filter: 'not ray and not TestTrainerDistributed and not TestTrainerDistributedXPU and not TestFSDPTrainer'
          # Excluding tests due to:
          # * Network proxy connection issue, reason unknown
          # * 'tests/utils/test_import_utils.py' invalidates state of the test engine causing
          #   next tests to fail. See: https://github.com/huggingface/transformers/issues/36267
          - test_case: 'tests_utils'
            cmd: '--ignore=tests/utils/test_import_utils.py tests/utils'
            filter: 'not test_load_img_url_timeout'
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
        with:
          path: torch-xpu-ops
      - name: Checkout Transformers
        uses: actions/checkout@v4
        with:
          repository: huggingface/transformers
          ref: ${{ env.transformers }}
          path: transformers
      - name: Prepare test vars
        run: |
          echo "HF_HOME=$HOME/.hf_home_of_transformers_test" >> $GITHUB_ENV
          echo "TEST_CASE=${{matrix.test.test_case}}" >> $GITHUB_ENV
      - name: Report HF cache directory
        run: |
          if [ -d "$HF_HOME" ]; then
            ls -al ${{ env.HF_HOME }}
            du -sh ${{ env.HF_HOME }}
          fi
      - name: Prepare OS environment
        run: |
          # as jobs might run in parallel on the same system, apt-get might
          # step into the lock held by other job, so every command is retried.
          # The three loops share one 60 second budget (start_time is taken
          # once). The workflow default shell is `bash {0}`, i.e. without
          # errexit, so the timeout has to `exit 1` explicitly: a bare `false`
          # would be ignored and the loop would retry forever.
          start_time=$SECONDS
          while ! sudo apt-get update; do
            sleep 1;
            if (( $SECONDS - start_time > 60 )); then
              echo "apt-get update did not succeed within 60 seconds" >&2
              exit 1
            fi
          done
          while ! sudo apt-get install -y $PACKAGES; do
            sleep 1;
            if (( $SECONDS - start_time > 60 )); then
              echo "apt-get install did not succeed within 60 seconds" >&2
              exit 1
            fi
          done
          while ! git lfs install; do
            sleep 1;
            if (( $SECONDS - start_time > 60 )); then
              echo "git lfs install did not succeed within 60 seconds" >&2
              exit 1
            fi
          done
      - name: Setup python-${{ env.python }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.python }}
      - name: Check python
        run: |
          which python && python -V
          which pip && pip list
          pip install -U pip wheel setuptools
      - name: Prepare pytorch and deps
        run: |
          pip install junitparser
          # Install exactly the versions resolved by the prepare job.
          pip install $TORCH_INDEX \
            torch==${{ needs.prepare.outputs.torch }} \
            torchvision==${{ needs.prepare.outputs.torchvision }} \
            torchaudio==${{ needs.prepare.outputs.torchaudio }} \
            pytorch-triton-xpu==${{needs.prepare.outputs.triton }}
      - name: Prepare Transformers
        run: |
          pwd
          cd transformers
          pip install \
            accelerate==${{ env.accelerate }} \
            datasets==${{ env.datasets }}
          pip install -e .
          pip install -e ".[dev-torch,testing,video]"
          rm -rf logs && mkdir -p logs
          rm -rf reports
          # spec.py is the file named by TRANSFORMERS_TEST_DEVICE_SPEC.
          cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./
      - name: Report installed versions
        run: |
          LOGS_DIR="${{ github.workspace }}/transformers/logs"
          echo "pip installed packages:"
          pip list | tee "$LOGS_DIR/pip_list-$TEST_CASE.txt"
          echo "lspci gpu devices:"
          lspci -d ::0380 | tee "$LOGS_DIR/lspci_0380-$TEST_CASE.txt"
          echo "GPU render nodes:"
          cat /sys/class/drm/render*/device/device | tee "$LOGS_DIR/device_IDs-$TEST_CASE.txt"
          echo "xpu-smi output:"
          xpu-smi discovery -y --json --dump -1
      - name: Sanity check installed packages
        run: |
          # Use latest pytest
          pip install -U pytest pytest-timeout pytest-xdist pytest-shard
          # These checks are to exit earlier if for any reason Transformers
          # reinstalled torch packages back to CUDA versions (not expected).
          pip show torch | grep Version | grep xpu
          pip show torchaudio | grep Version | grep xpu
          pip show torchvision | grep Version | grep xpu
          python -c 'import torch; exit(not torch.xpu.is_available())'
      - name: Run tests on ${{ needs.prepare.outputs.hostname }}
        run: |
          cd transformers
          # The pytest exit status is deliberately ignored (`|| true`); the
          # next step evaluates the generated JUnit reports instead.
          python -m pytest --make-reports=${TEST_CASE} --junit-xml=reports/${TEST_CASE}.xml \
            -k "${{ matrix.test.filter}}" ${{ matrix.test.cmd }} || true
      - name: Check for errors in tests
        run: |
          python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml
      - name: Print environment
        if: ${{ ! cancelled() }}
        uses: ./torch-xpu-ops/.github/actions/print-environment
        with:
          pip_packages: 'accelerate datasets transformers'
          to: 'transformers/logs/environment-$TEST_CASE.md'
      - name: Clean up
        if: ${{ always() }}
        run: |
          if [ -d "$HF_HOME" ]; then
            ls -al ${{ env.HF_HOME }}
            du -sh ${{ env.HF_HOME }}
            rm -rf ${{ env.HF_HOME }}
          fi
      - name: Upload reports
        if: ${{ ! cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: reports-${{ matrix.test.test_case }}-${{ github.event.pull_request.number || github.sha }}
          path: ${{ github.workspace }}/transformers/reports
      - name: Upload logs
        if: ${{ ! cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: logs-${{ matrix.test.test_case }}-${{ github.event.pull_request.number || github.sha }}
          path: ${{ github.workspace }}/transformers/logs

  # Downloads the artifacts of all test jobs and renders result tables into
  # the job summary. Runs whether the tests job succeeded or failed.
  report:
    needs: tests
    if: ${{ success() || failure() }}
    runs-on: ubuntu-24.04
    steps:
      - name: Download reports
        uses: actions/download-artifact@v4
        with:
          pattern: 'reports-*'
          path: 'transformers/reports/'
          merge-multiple: true
      - name: Download logs
        if: ${{ ! cancelled() }}
        uses: actions/download-artifact@v4
        with:
          pattern: 'logs-*'
          path: 'transformers/logs/'
          merge-multiple: true
      - name: Checkout torch-xpu-ops
        if: ${{ ! cancelled() }}
        uses: actions/checkout@v4
        with:
          path: torch-xpu-ops
      - name: Setup python-${{ env.python }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.python }}
      - name: Install pip deps
        run: |
          pip install junitparser
      - name: Print results table
        if: ${{ ! cancelled() }}
        run: |
          # Helper function to print the number preceding a given counter name
          # in a pytest summary line stored in a file, i.e:
          #   === 25 failed, 11 warnings, 0 errors ===
          # Call as follows:
          #   parse_stat <stats file> "failed"
          # The second argument is an extended regular expression. The match
          # requires "<space><digits><space><name>", so "failed" does not pick
          # up "xfailed" and "passed" does not pick up "xpassed". Prints 0
          # when the counter is absent.
          function parse_stat() {
            stat=$(sed -nE "s/.* ([0-9]+) $2.*/\1/p" "$1")
            if [ -n "$stat" ]; then echo $stat; else echo "0"; fi
          }
          cd transformers
          {
            echo "### Results"
            echo "| Test group | Errors | Failed | Deselected | Passed | Skipped |"
            echo "| --- | --- | --- | --- | --- | --- |"
            for stat in $(find reports -name stats.txt); do
              # Each stat.txt is located in: reports/$test_group/stats.txt
              test_group=$(echo $stat | cut -f 2 -d/)
              # Get failed, passed, skipped, etc. counters
              failed=$(parse_stat $stat failed)
              passed=$(parse_stat $stat passed)
              deselected=$(parse_stat $stat deselected)
              skipped=$(parse_stat $stat skipped)
              # pytest prints "1 error" but "2 errors", so accept both forms
              errors=$(parse_stat $stat "errors?")
              echo "| $test_group | $errors | $failed | $deselected | $passed | $skipped |"
            done
          } >> $GITHUB_STEP_SUMMARY
      - name: Print baseline difference
        if: ${{ ! cancelled() }}
        run: |
          python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true
      - name: Print failure lines
        if: ${{ ! cancelled() }}
        run: |
          cd transformers
          {
            echo "### Failure lines"
            echo "| Test group |File | Error | Comment |"
            echo "| --- | --- | --- | --- |"
            # Start from an empty file so that `sort` below has an input even
            # when no test group reported failures.
            rm -rf _failures.txt && touch _failures.txt
            for failure in $(find reports -name failures_line.txt); do
              # Each failure_line.txt is located in: reports/$test_group/failure_line.txt
              test_group=$(echo $failure | cut -f2 -d/)
              tail -n +2 $failure | sed "s/^/$test_group /" >> _failures.txt
            done
            # failures_line.txt file does not have test case information,
            # so we can just sort the output and report uniq values
            sort _failures.txt | uniq > _failures_uniq.txt
            # Let `read` split each line into its first three fields and keep
            # the remainder as the comment. `-r` keeps backslashes intact and,
            # as the line is never expanded unquoted, characters such as `*`
            # in a comment are not glob-expanded by the shell.
            while read -r test_group file error comment; do
              # Drop a single trailing colon from each field.
              file=${file%:}
              error=${error%:}
              # Failure comments often contain special characters, so the
              # comment is wrapped in <pre></pre> to avoid collisions with
              # Markdown formatting.
              comment="<pre>${comment%:}</pre>"
              echo "| $test_group | $file | $error | $comment |"
            done <_failures_uniq.txt
          } >> $GITHUB_STEP_SUMMARY
      - name: Print not implemented XPU backend ops
        if: ${{ ! cancelled() }}
        run: |
          cd transformers
          {
            echo "### Not implemented ops"
            echo "| Test group | Operator | Status |"
            echo "| --- | --- | --- |"
            rm -rf _ops.txt && touch _ops.txt
            for log in $(find reports -name failures_line.txt); do
              # Each failure_line.txt is located in: reports/$test_group/failure_line.txt
              test_group=$(echo $log | cut -f2 -d/)
              ops=$(grep NotImplementedError $log | grep "for the XPU device" | sed "s/.*The operator '\(.*\)' is not.*/\1/")
              for op in $ops; do
                echo "| $test_group | <pre>$op</pre> | not implemented |" >> _ops.txt
              done
            done
            for log in $(find reports -name warnings.txt); do
              # Each warnings.txt is located in: reports/$test_group/warnings.txt
              test_group=$(echo $log | cut -f2 -d/)
              ops=$(grep UserWarning $log | grep "on the XPU backend" | sed "s/.*The operator '\(.*\) on the XPU.*/\1/")
              for op in $ops; do
                echo "| $test_group | <pre>$op</pre> | fallback to CPU happens |" >> _ops.txt
              done
            done
            sort _ops.txt | uniq
          } >> $GITHUB_STEP_SUMMARY
      - name: Print environment
        if: ${{ ! cancelled() }}
        run: |
          first_md=$(find transformers/logs -name "environment-*.md" | head -1)
          # Nothing to report when no test job uploaded an environment file.
          # Without this guard `cat` would run with no file argument and
          # `sed -i` with no input files.
          if [ -z "$first_md" ]; then
            echo "No environment-*.md files found in transformers/logs"
            exit 0
          fi
          cat "$first_md" >> $GITHUB_STEP_SUMMARY
          # we expect environments to be identical except for the ZE_AFFINITY_MASK line
          find transformers/logs -name "environment-*.md" | xargs sed -i '/ZE_AFFINITY_MASK/d'
          for f in $(find transformers/logs -name "environment-*.md"); do
            diff $f $first_md
          done