# Linux Transformers Test — GitHub Actions workflow (captured from run #318).
# Display name of this workflow.
name: Linux Transformers Test
# Triggers: a weekly schedule, pull requests against main that touch the
# Transformers test scripts or this workflow, and manual dispatch with
# overridable tool/package versions.
on:
  schedule:
    # GMT+8 0:00 Sunday
    - cron: '0 16 * * 6'
  pull_request:
    branches:
      - main
    paths:
      - '.github/scripts/check-transformers.py'
      - '.github/scripts/spec.py'
      - '.github/workflows/_linux_transformers.yml'
  workflow_dispatch:
    inputs:
      python:
        required: false
        type: string
        default: '3.10'
        description: Python version
      runner:
        required: true
        type: string
        default: 'pvc_rolling'
        description: Runner label
      driver:
        required: false
        type: string
        default: 'lts'
        description: Driver lts/rolling
      nightly_whl:
        required: false
        type: string
        default: ''
        description: Pytorch nightly wheel version
      accelerate:
        required: false
        type: string
        default: 'v1.7.0'
        description: Accelerate version
      datasets:
        required: false
        type: string
        default: 'v3.6.0'
        description: Datasets version
      transformers:
        required: false
        type: string
        default: 'v4.51.3'
        description: Transformers version
# The workflow token gets read-only access to every scope.
permissions: read-all
# Runs are grouped per workflow and pull request (or per ref for non-PR
# events); starting a new run cancels the one already in progress.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
# Environment shared by every job in this workflow.
env:
  # Credentials taken from repository secrets.
  HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
  DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }}
  # Hugging Face Hub timeouts (presumably seconds — confirm against the
  # huggingface_hub documentation).
  HF_HUB_ETAG_TIMEOUT: '120'
  HF_HUB_DOWNLOAD_TIMEOUT: '120'
  # Versions: use the workflow_dispatch input when one is given, otherwise
  # fall back to the pinned default.
  python: ${{ inputs.python != '' && inputs.python || '3.10' }}
  accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.7.0' }}
  datasets: ${{ inputs.datasets != '' && inputs.datasets || 'v3.6.0' }}
  transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.51.3' }}
  # OS packages installed with apt-get by the tests job.
  PACKAGES: |
    espeak-ng
    git-lfs
    pkg-config
    libavcodec-dev
    libavdevice-dev
    libavfilter-dev
    libavformat-dev
    libavutil-dev
    libswresample-dev
    libswscale-dev
    pciutils
  # pip arguments that select the PyTorch nightly XPU wheel index.
  TORCH_INDEX: '--pre --index-url https://download.pytorch.org/whl/nightly/xpu'
  AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
# Every `run:` step uses a custom bash invocation. Unlike the built-in
# `bash` shell, this template does not add `-e` / `-o pipefail`, so a failing
# command does not abort a script unless the script opts in itself.
defaults:
  run:
    shell: 'bash {0}'
jobs:
# Collects the `disable_*` markers found in the pull request info so that
# later jobs can be skipped on request.
conditions-filter:
  name: conditions-filter
  # Do not run for draft pull requests.
  if: ${{ github.event.pull_request.draft == false }}
  runs-on: ubuntu-22.04
  timeout-minutes: 10
  env:
    GH_TOKEN: ${{ github.token }}
  outputs:
    disabled_tests: ${{ steps.check-pr-desc.outputs.disabled_tests }}
  steps:
    - name: Check PR infos
      id: check-pr-desc
      run: |
        set -x -e -o pipefail
        sudo apt update && sudo apt install -y dos2unix
        # A failing `gh pr view` (e.g. no PR for this event) must not fail the
        # step: its exit code is recorded in the file instead.
        (gh --repo ${GITHUB_REPOSITORY} pr view ${{ github.event.pull_request.number }} || echo $?) 2>&1 |tee pr-info.txt
        dos2unix pr-info.txt
        # Join every line mentioning `disable_` into one space-separated value.
        disabled_tests="$(awk '/disable_/{printf("%s ", $0)}' pr-info.txt)"
        # Append to the step output file; never truncate it.
        echo "disabled_tests=${disabled_tests}" |tee -a "${GITHUB_OUTPUT}"
# Resolves the PyTorch package versions to test and gathers runner details
# for the tests job.
prepare:
  needs: conditions-filter
  # Skipped when the PR info contains disable_all or disable_transformers.
  if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_transformers')) }}
  runs-on: ${{ inputs.runner != '' && inputs.runner || 'pvc_rolling' }}
  outputs:
    torch: ${{ steps.getver.outputs.torch }}
    torchvision: ${{ steps.getver.outputs.torchvision }}
    torchaudio: ${{ steps.getver.outputs.torchaudio }}
    triton: ${{ steps.getver.outputs.triton }}
    runner_id: ${{ steps.runner-info.outputs.runner_id }}
    user_id: ${{ steps.runner-info.outputs.user_id }}
    render_id: ${{ steps.runner-info.outputs.render_id }}
    hostname: ${{ steps.runner-info.outputs.hostname }}
    pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }}
  steps:
    - id: getver
      run: |
        # `pip index version...` is not enough to pick the newest version:
        # pytorch packages may have tricky dependencies. A dry-run install
        # reports the versions pip would actually select.
        # See: https://github.com/pytorch/pytorch/issues/154687
        pip install --dry-run --ignore-installed $TORCH_INDEX \
          torch torchvision torchaudio pytorch-triton-xpu >_log.txt
        # Print the version that follows "<package>-" on pip's
        # "Would install" line.
        planned_version() {
          grep "Would install" _log.txt | sed -E "s/.*$1-([^ ]*).*/\1/"
        }
        echo "torch=$(planned_version torch)" | tee -a "$GITHUB_OUTPUT"
        echo "torchvision=$(planned_version torchvision)" | tee -a "$GITHUB_OUTPUT"
        echo "torchaudio=$(planned_version torchaudio)" | tee -a "$GITHUB_OUTPUT"
        echo "triton=$(planned_version pytorch-triton-xpu)" | tee -a "$GITHUB_OUTPUT"
    - name: Checkout torch-xpu-ops
      uses: actions/checkout@v4
    - name: Get runner
      id: runner-info
      uses: ./.github/actions/get-runner
# Runs the Hugging Face Transformers test groups one at a time inside the
# Intel GPU container, then uploads reports and logs per test group.
tests:
  needs: prepare
  runs-on: ${{ needs.prepare.outputs.runner_id }}
  container:
    image: intelgpu/ubuntu-22.04-lts2:2523.31
    volumes:
      - ${{ github.workspace }}:${{ github.workspace }}
    options: --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.prepare.outputs.render_id }}
      --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
      -u ${{ needs.prepare.outputs.user_id }}
      -e ZE_AFFINITY_MASK
  env:
    PYTORCH_DEBUG_XPU_FALLBACK: '1'
    TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py'
    # enable pytest parallel run, and continue others if meets crash case such as segmentation fault
    PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }}
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      test:
        # Excluding tests due to:
        # * https://github.com/huggingface/transformers/issues/36267 (marian tests)
        - test_case: 'tests_backbone'
          cmd: '--ignore=tests/models/marian/test_modeling_marian.py -k backbone tests'
        - test_case: "tests_py"
          cmd: "tests/*.py"
        # Excluding tests due to:
        # * torch.distributed.* not yet supported by XPU
        - test_case: 'tests_generation'
          cmd: 'tests/generation'
          filter: 'not TestFSDPGeneration'
        # Split into shards so that each shard takes ~15-30 minutes to complete
        # Excluding tests due to:
        # * https://github.com/pytorch/pytorch/issues/140965 (aten::_linalg_eigvals)
        # * https://github.com/huggingface/transformers/issues/36267 (marian tests)
        - test_case: 'tests_models_0'
          cmd: 'tests/models --num-shards 4 --shard-id 0 --ignore=tests/models/marian/test_modeling_marian.py'
          filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
        - test_case: 'tests_models_1'
          cmd: 'tests/models --num-shards 4 --shard-id 1 --ignore=tests/models/marian/test_modeling_marian.py'
          filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
        - test_case: 'tests_models_2'
          cmd: 'tests/models --num-shards 4 --shard-id 2 --ignore=tests/models/marian/test_modeling_marian.py'
          filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
        - test_case: 'tests_models_3'
          cmd: 'tests/models --num-shards 4 --shard-id 3 --ignore=tests/models/marian/test_modeling_marian.py'
          filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
        # Excluding tests due to:
        # * Some ray tests hang, reason unknown
        # * torch.distributed.* not yet supported by XPU
        - test_case: 'tests_trainer'
          cmd: 'tests/trainer'
          filter: 'not ray and not TestTrainerDistributed and not TestTrainerDistributedXPU and not TestFSDPTrainer'
        # Excluding tests due to:
        # * Network proxy connection issue, reason unknown
        # * 'tests/utils/test_import_utils.py' invalidates state of the test engine causing
        #   next tests to fail. See: https://github.com/huggingface/transformers/issues/36267
        - test_case: 'tests_utils'
          cmd: '--ignore=tests/utils/test_import_utils.py tests/utils'
          filter: 'not test_load_img_url_timeout'
  steps:
    - name: Checkout torch-xpu-ops
      uses: actions/checkout@v4
      with:
        path: torch-xpu-ops
    - name: Checkout Transformers
      uses: actions/checkout@v4
      with:
        repository: huggingface/transformers
        ref: ${{ env.transformers }}
        path: transformers
    - name: Prepare test vars
      run: |
        echo "HF_HOME=$HOME/.hf_home_of_transformers_test" >> $GITHUB_ENV
        echo "TEST_CASE=${{matrix.test.test_case}}" >> $GITHUB_ENV
    - name: Report HF cache directory
      run: |
        if [ -d "$HF_HOME" ]; then
          ls -al ${{ env.HF_HOME }}
          du -sh ${{ env.HF_HOME }}
        fi
    - name: Prepare OS environment
      run: |
        # as jobs might run in parallel on the same system, apt-get might
        # step into the lock held by another job, so each command is retried.
        # The 60 second budget is shared by all three commands. The workflow
        # shell is `bash {0}` (no `-e`), so the timeout must exit explicitly:
        # a bare `false` would be ignored and the loop would never end.
        start_time=$SECONDS
        retry() {
          until "$@"; do
            sleep 1
            if (( SECONDS - start_time > 60 )); then
              echo "Timed out retrying: $*" >&2
              exit 1
            fi
          done
        }
        retry sudo apt-get update
        retry sudo apt-get install -y $PACKAGES
        retry git lfs install
    - name: Setup python-${{ env.python }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.python }}
    - name: Check python
      run: |
        which python && python -V
        which pip && pip list
        pip install -U pip wheel setuptools
    - name: Prepare pytorch and deps
      run: |
        pip install junitparser
        pip install $TORCH_INDEX \
          torch==${{ needs.prepare.outputs.torch }} \
          torchvision==${{ needs.prepare.outputs.torchvision }} \
          torchaudio==${{ needs.prepare.outputs.torchaudio }} \
          pytorch-triton-xpu==${{needs.prepare.outputs.triton }}
    - name: Prepare Transformers
      run: |
        pwd
        cd transformers
        pip install \
          accelerate==${{ env.accelerate }} \
          datasets==${{ env.datasets }}
        pip install -e .
        pip install -e ".[dev-torch,testing,video]"
        rm -rf logs && mkdir -p logs
        rm -rf reports
        cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./
    - name: Report installed versions
      run: |
        LOGS_DIR="${{ github.workspace }}/transformers/logs"
        echo "pip installed packages:"
        pip list | tee "$LOGS_DIR/pip_list-$TEST_CASE.txt"
        echo "lspci gpu devices:"
        lspci -d ::0380 | tee "$LOGS_DIR/lspci_0380-$TEST_CASE.txt"
        echo "GPU render nodes:"
        cat /sys/class/drm/render*/device/device | tee "$LOGS_DIR/device_IDs-$TEST_CASE.txt"
        echo "xpu-smi output:"
        xpu-smi discovery -y --json --dump -1
    - name: Sanity check installed packages
      run: |
        # Use latest pytest
        pip install -U pytest pytest-timeout pytest-xdist pytest-shard
        # These checks are to exit earlier if for any reason Transformers
        # reinstalled torch packages back to CUDA versions (not expected).
        pip show torch | grep Version | grep xpu
        pip show torchaudio | grep Version | grep xpu
        pip show torchvision | grep Version | grep xpu
        python -c 'import torch; exit(not torch.xpu.is_available())'
    - name: Run tests on ${{ needs.prepare.outputs.hostname }}
      run: |
        cd transformers
        # pytest's own exit code is ignored here; the next step evaluates the
        # junit reports instead.
        python -m pytest --make-reports=${TEST_CASE} --junit-xml=reports/${TEST_CASE}.xml \
          -k "${{ matrix.test.filter}}" ${{ matrix.test.cmd }} || true
    - name: Check for errors in tests
      run: |
        python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml
    - name: Print environment
      if: ${{ ! cancelled() }}
      uses: ./torch-xpu-ops/.github/actions/print-environment
      with:
        pip_packages: 'accelerate datasets transformers'
        to: 'transformers/logs/environment-$TEST_CASE.md'
    - name: Clean up
      if: ${{ always() }}
      run: |
        if [ -d "$HF_HOME" ]; then
          ls -al ${{ env.HF_HOME }}
          du -sh ${{ env.HF_HOME }}
          rm -rf ${{ env.HF_HOME }}
        fi
    - name: Upload reports
      if: ${{ ! cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: reports-${{ matrix.test.test_case }}-${{ github.event.pull_request.number || github.sha }}
        path: ${{ github.workspace }}/transformers/reports
    - name: Upload logs
      if: ${{ ! cancelled() }}
      uses: actions/upload-artifact@v4
      with:
        name: logs-${{ matrix.test.test_case }}-${{ github.event.pull_request.number || github.sha }}
        path: ${{ github.workspace }}/transformers/logs
# Merges the per-group reports and logs uploaded by the tests job and writes
# summary tables to the job summary.
report:
  needs: tests
  # Run whether the tests passed or failed.
  if: ${{ success() || failure() }}
  runs-on: ubuntu-24.04
  steps:
    - name: Download reports
      uses: actions/download-artifact@v4
      with:
        pattern: 'reports-*'
        path: 'transformers/reports/'
        merge-multiple: true
    - name: Download logs
      if: ${{ ! cancelled() }}
      uses: actions/download-artifact@v4
      with:
        pattern: 'logs-*'
        path: 'transformers/logs/'
        merge-multiple: true
    - name: Checkout torch-xpu-ops
      if: ${{ ! cancelled() }}
      uses: actions/checkout@v4
      with:
        path: torch-xpu-ops
    - name: Setup python-${{ env.python }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ env.python }}
    - name: Install pip deps
      run: |
        pip install junitparser
    - name: Print results table
      if: ${{ ! cancelled() }}
      run: |
        # Helper function to return the number preceding a given pattern in a
        # stats file, i.e. for a line such as:
        #   === 25 failed, 11 warnings, 0 errors ===
        # Call as follows:
        #   parse_stat $stats_file "failed"
        # Prints 0 when the pattern does not occur.
        function parse_stat() {
          local count
          count=$(grep "$2" "$1" | sed "s/.* \([0-9]*\) $2.*/\1/")
          if [ -n "$count" ]; then echo $count; else echo "0"; fi
        }
        cd transformers
        {
          echo "### Results"
          echo "| Test group | Errors | Failed | Deselected | Passed | Skipped |"
          echo "| --- | --- | --- | --- | --- | --- |"
          for stat in $(find reports -name stats.txt); do
            # Each stat.txt is located in: reports/$test_group/stats.txt
            test_group=$(echo $stat | cut -f 2 -d/)
            # Get failed, passed, skipped, etc. counters
            failed=$(parse_stat $stat failed)
            passed=$(parse_stat $stat passed)
            deselected=$(parse_stat $stat deselected)
            skipped=$(parse_stat $stat skipped)
            # pytest prints "1 error" but "2 errors", so match on the
            # singular stem to cover both spellings.
            errors=$(parse_stat $stat error)
            echo "| $test_group | $errors | $failed | $deselected | $passed | $skipped |"
          done
        } >> $GITHUB_STEP_SUMMARY
    - name: Print baseline difference
      if: ${{ ! cancelled() }}
      run: |
        python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true
    - name: Print failure lines
      if: ${{ ! cancelled() }}
      run: |
        cd transformers
        {
          echo "### Failure lines"
          echo "| Test group |File | Error | Comment |"
          echo "| --- | --- | --- | --- |"
          # Start from an empty file so that `sort` below still has an input
          # when no failures_line.txt exists.
          rm -rf _failures.txt && touch _failures.txt
          for failure in $(find reports -name failures_line.txt); do
            # Each failure_line.txt is located in: reports/$test_group/failure_line.txt
            test_group=$(echo $failure | cut -f2 -d/)
            tail -n +2 $failure | sed "s/^/$test_group /" >> _failures.txt
          done
          # failures_line.txt file does not have test case information,
          # so we can just sort the output and report uniq values
          sort _failures.txt | uniq > _failures_uniq.txt
          # -r keeps backslashes in failure messages intact.
          while read -r line; do
            test_group=$(echo $line | cut -f1 -d" ")
            file=$(echo $line | cut -f2 -d" " | sed "s/\(.*\):$/\1/")
            error=$(echo $line | cut -f3 -d" " | sed "s/\(.*\):$/\1/")
            # Failure comments often contain special characters which complicate
            # parsing failure lines. But fortunately we know for sure where comments
            # start. So we just output all contents starting from this position and
            # wrap everything in <pre></pre> to avoid collisions with Markdown formatting.
            comment="<pre>$(echo $line | cut -f4- -d' ' | sed 's/\(.*\):$/\1/')</pre>"
            echo "| $test_group | $file | $error | $comment |"
          done <_failures_uniq.txt
        } >> $GITHUB_STEP_SUMMARY
    - name: Print not implemented XPU backend ops
      if: ${{ ! cancelled() }}
      run: |
        cd transformers
        {
          echo "### Not implemented ops"
          echo "| Test group | Operator | Status |"
          echo "| --- | --- | --- |"
          rm -rf _ops.txt && touch _ops.txt
          for log in $(find reports -name failures_line.txt); do
            # Each failure_line.txt is located in: reports/$test_group/failure_line.txt
            test_group=$(echo $log | cut -f2 -d/)
            ops=$(grep NotImplementedError $log | grep "for the XPU device" | sed "s/.*The operator '\(.*\)' is not.*/\1/")
            for op in $ops; do
              echo "| $test_group | <pre>$op</pre> | not implemented |" >> _ops.txt
            done
          done
          for log in $(find reports -name warnings.txt); do
            # Each warnings.txt is located in: reports/$test_group/warnings.txt
            test_group=$(echo $log | cut -f2 -d/)
            ops=$(grep UserWarning $log | grep "on the XPU backend" | sed "s/.*The operator '\(.*\) on the XPU.*/\1/")
            for op in $ops; do
              echo "| $test_group | <pre>$op</pre> | fallback to CPU happens |" >> _ops.txt
            done
          done
          sort _ops.txt | uniq
        } >> $GITHUB_STEP_SUMMARY
    - name: Print environment
      if: ${{ ! cancelled() }}
      run: |
        first_md=$(find transformers/logs -name "environment-*.md" | head -1)
        # Nothing to print when no test group produced an environment file.
        if [ -z "$first_md" ]; then
          echo "No environment files found"
          exit 0
        fi
        cat "$first_md" >> $GITHUB_STEP_SUMMARY
        # we expect environments to be identical except for the ZE_AFFINITY_MASK line
        find transformers/logs -name "environment-*.md" | xargs sed -i '/ZE_AFFINITY_MASK/d'
        for f in $(find transformers/logs -name "environment-*.md"); do
          diff $f $first_md
        done