Linux Transformers Test #318

Workflow file for this run

.github/workflows/_linux_transformers.yml at 56851c9

	# Workflow identity and triggers: weekly schedule, pull requests that touch
	# the Transformers test scripts or this workflow, and manual dispatch.
	name: Linux Transformers Test

	on:
	  schedule:
	    # GMT+8 0:00 Sunday
	    - cron: '0 16 * * 6'
	  pull_request:
	    branches:
	      - main
	    paths:
	      - '.github/scripts/check-transformers.py'
	      - '.github/scripts/spec.py'
	      - '.github/workflows/_linux_transformers.yml'
	  workflow_dispatch:
	    inputs:
	      python:
	        required: false
	        type: string
	        default: '3.10'
	        description: Python version
	      runner:
	        required: true
	        type: string
	        default: 'pvc_rolling'
	        description: Runner label
	      # NOTE(review): `driver` and `nightly_whl` are declared here but do not
	      # appear to be referenced by any job in this workflow - confirm.
	      driver:
	        required: false
	        type: string
	        default: 'lts'
	        description: Driver lts/rolling
	      nightly_whl:
	        required: false
	        type: string
	        default: ''
	        description: Pytorch nightly wheel version
	      accelerate:
	        required: false
	        type: string
	        default: 'v1.7.0'
	        description: Accelerate version
	      datasets:
	        required: false
	        type: string
	        default: 'v3.6.0'
	        description: Datasets version
	      transformers:
	        required: false
	        type: string
	        default: 'v4.51.3'
	        description: Transformers version

	# Workflow-wide permissions, concurrency, environment and shell defaults.
	permissions: read-all

	# One active run per pull request (or per ref when there is no pull request);
	# a newer run cancels the one still in progress.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
	  cancel-in-progress: true
	env:
	  HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
	  DOCKER_REGISTRY_AUTH_TOKEN: ${{ secrets.DOCKER_HUB_TOKEN }}
	  HF_HUB_ETAG_TIMEOUT: 120
	  HF_HUB_DOWNLOAD_TIMEOUT: 120
	  # `inputs.*` is empty for schedule and pull_request events, so every
	  # version falls back to the same default the dispatch form declares.
	  python: ${{ inputs.python != '' && inputs.python || '3.10' }}
	  accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.7.0' }}
	  datasets: ${{ inputs.datasets != '' && inputs.datasets || 'v3.6.0' }}
	  transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.51.3' }}
	  # OS packages installed with apt-get by the `tests` job (one per line;
	  # the value is word-split by the shell when it is used).
	  PACKAGES: |
	    espeak-ng
	    git-lfs
	    pkg-config
	    libavcodec-dev
	    libavdevice-dev
	    libavfilter-dev
	    libavformat-dev
	    libavutil-dev
	    libswresample-dev
	    libswscale-dev
	    pciutils
	  # pip arguments selecting the nightly XPU wheel index.
	  TORCH_INDEX: '--pre --index-url https://download.pytorch.org/whl/nightly/xpu'
	  AGENT_TOOLSDIRECTORY: /tmp/xpu-tool

	defaults:
	  run:
	    # Custom shell template: scripts run as plain `bash <script>`, so a step
	    # that must stop on the first error has to call `set -e` itself.
	    shell: bash {0}

	jobs:
	# Job (listed under `jobs:`): skipped for draft pull requests; otherwise
	# collects every line of the `gh pr view` output that contains `disable_`
	# and publishes them as the `disabled_tests` output.
	conditions-filter:
	  name: conditions-filter
	  if: ${{ github.event.pull_request.draft == false }}
	  runs-on: ubuntu-22.04
	  timeout-minutes: 10
	  env:
	    GH_TOKEN: ${{ github.token }}
	  outputs:
	    disabled_tests: ${{ steps.check-pr-desc.outputs.disabled_tests }}
	  steps:
	    - name: Check PR infos
	      id: check-pr-desc
	      run: |
	        set -x -e -o pipefail
	        sudo apt update && sudo apt install -y dos2unix
	        # `|| echo $?` keeps the step alive when there is no pull request to
	        # view (for example on scheduled or manually dispatched runs).
	        (gh --repo ${GITHUB_REPOSITORY} pr view ${{ github.event.pull_request.number }} || echo $?) 2>&1 | tee pr-info.txt
	        dos2unix pr-info.txt
	        disabled_tests="$(awk '/disable_/{printf("%s ", $0)}' pr-info.txt)"
	        # Append (-a) rather than truncate the step output file.
	        echo "disabled_tests=${disabled_tests}" | tee -a "${GITHUB_OUTPUT}"

	# Job (listed under `jobs:`): resolves the nightly package versions every
	# test shard must install and gathers runner details for the `tests` job.
	# Skipped when `conditions-filter` reported `disable_all` or
	# `disable_transformers`.
	prepare:
	  runs-on: ${{ inputs.runner != '' && inputs.runner || 'pvc_rolling' }}
	  needs: conditions-filter
	  if: ${{ !(contains(needs.conditions-filter.outputs.disabled_tests, 'disable_all') || contains(needs.conditions-filter.outputs.disabled_tests, 'disable_transformers')) }}
	  outputs:
	    torch: ${{ steps.getver.outputs.torch }}
	    torchvision: ${{ steps.getver.outputs.torchvision }}
	    torchaudio: ${{ steps.getver.outputs.torchaudio }}
	    triton: ${{ steps.getver.outputs.triton }}
	    runner_id: ${{ steps.runner-info.outputs.runner_id }}
	    user_id: ${{ steps.runner-info.outputs.user_id }}
	    render_id: ${{ steps.runner-info.outputs.render_id }}
	    hostname: ${{ steps.runner-info.outputs.hostname }}
	    pytest_extra_args: ${{ steps.runner-info.outputs.pytest_extra_args }}
	  steps:
	    - id: getver
	      run: |
	        # We can't just `pip index version...` and get the last available
	        # version as pytorch packages may have tricky dependencies. Instead
	        # we dry run install packages and get versions which would be installed.
	        # See: https://github.com/pytorch/pytorch/issues/154687
	        pip install --dry-run --ignore-installed $TORCH_INDEX \
	          torch torchvision torchaudio pytorch-triton-xpu >_log.txt

	        # Print the version of package $1 taken from pip's
	        # "Would install name-version name-version ..." line. The leading
	        # space in the pattern keeps `torch-` from matching inside
	        # `pytorch-triton-xpu-`.
	        would_install_version() {
	          grep "Would install" _log.txt | sed -E "s/.* $1-([^ ]*).*/\1/"
	        }

	        torch=$(would_install_version torch)
	        torchvision=$(would_install_version torchvision)
	        torchaudio=$(would_install_version torchaudio)
	        triton=$(would_install_version pytorch-triton-xpu)
	        echo "torch=$torch" | tee -a "$GITHUB_OUTPUT"
	        echo "torchvision=$torchvision" | tee -a "$GITHUB_OUTPUT"
	        echo "torchaudio=$torchaudio" | tee -a "$GITHUB_OUTPUT"
	        echo "triton=$triton" | tee -a "$GITHUB_OUTPUT"
	    - name: Checkout torch-xpu-ops
	      uses: actions/checkout@v4
	    - name: Get runner
	      id: runner-info
	      uses: ./.github/actions/get-runner

	# Job (listed under `jobs:`): runs every Transformers test group from the
	# matrix, one at a time, inside the Intel GPU container on the runner that
	# `prepare` selected, then uploads the reports and logs as artifacts.
	tests:
	  needs: prepare
	  runs-on: ${{ needs.prepare.outputs.runner_id }}
	  container:
	    image: intelgpu/ubuntu-22.04-lts2:2523.31
	    volumes:
	      - ${{ github.workspace }}:${{ github.workspace }}
	    # Folded scalar: the lines below are joined with single spaces.
	    options: >-
	      --device=/dev/mem --device=/dev/dri --group-add video --group-add ${{ needs.prepare.outputs.render_id }}
	      --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=8g
	      -u ${{ needs.prepare.outputs.user_id }}
	      -e ZE_AFFINITY_MASK
	  env:
	    PYTORCH_DEBUG_XPU_FALLBACK: '1'
	    TRANSFORMERS_TEST_DEVICE_SPEC: 'spec.py'
	    # Enable parallel pytest runs and keep the remaining tests going when one
	    # of them crashes (for example with a segmentation fault).
	    PYTEST_ADDOPTS: -rsf --timeout 600 --timeout_method=thread --dist worksteal ${{ needs.prepare.outputs.pytest_extra_args }}
	  strategy:
	    fail-fast: false
	    max-parallel: 1
	    matrix:
	      # Each entry: `test_case` names the report, `cmd` is handed to pytest,
	      # and the optional `filter` becomes pytest's `-k` expression.
	      test:
	        # Excluding tests due to:
	        # * https://github.com/huggingface/transformers/issues/36267 (marian tests)
	        - test_case: 'tests_backbone'
	          cmd: '--ignore=tests/models/marian/test_modeling_marian.py -k backbone tests'
	        - test_case: "tests_py"
	          cmd: "tests/*.py"
	        # Excluding tests due to:
	        # * torch.distributed.* not yet supported by XPU
	        - test_case: 'tests_generation'
	          cmd: 'tests/generation'
	          filter: 'not TestFSDPGeneration'
	        # breaking for each shard to take ~15-30 minutes to complete
	        # Excluding tests due to:
	        # * https://github.com/pytorch/pytorch/issues/140965 (aten::_linalg_eigvals)
	        # * https://github.com/huggingface/transformers/issues/36267 (marian tests)
	        - test_case: 'tests_models_0'
	          cmd: 'tests/models --num-shards 4 --shard-id 0 --ignore=tests/models/marian/test_modeling_marian.py'
	          filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
	        - test_case: 'tests_models_1'
	          cmd: 'tests/models --num-shards 4 --shard-id 1 --ignore=tests/models/marian/test_modeling_marian.py'
	          filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
	        - test_case: 'tests_models_2'
	          cmd: 'tests/models --num-shards 4 --shard-id 2 --ignore=tests/models/marian/test_modeling_marian.py'
	          filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
	        - test_case: 'tests_models_3'
	          cmd: 'tests/models --num-shards 4 --shard-id 3 --ignore=tests/models/marian/test_modeling_marian.py'
	          filter: 'not test_resize_embeddings_untied and not test_resize_tokens_embeddings'
	        # Excluding tests due to:
	        # * Some ray tests hang, reason unknown
	        # * torch.distributed.* not yet supported by XPU
	        - test_case: 'tests_trainer'
	          cmd: 'tests/trainer'
	          filter: 'not ray and not TestTrainerDistributed and not TestTrainerDistributedXPU and not TestFSDPTrainer'
	        # Excluding tests due to:
	        # * Network proxy connection issue, reason unknown
	        # * 'tests/utils/test_import_utils.py' invalidates state of the test engine causing
	        #   next tests to fail. See: https://github.com/huggingface/transformers/issues/36267
	        - test_case: 'tests_utils'
	          cmd: '--ignore=tests/utils/test_import_utils.py tests/utils'
	          filter: 'not test_load_img_url_timeout'
	  steps:
	    - name: Checkout torch-xpu-ops
	      uses: actions/checkout@v4
	      with:
	        path: torch-xpu-ops
	    - name: Checkout Transformers
	      uses: actions/checkout@v4
	      with:
	        repository: huggingface/transformers
	        ref: ${{ env.transformers }}
	        path: transformers
	    - name: Prepare test vars
	      run: |
	        echo "HF_HOME=$HOME/.hf_home_of_transformers_test" >> $GITHUB_ENV
	        echo "TEST_CASE=${{matrix.test.test_case}}" >> $GITHUB_ENV
	    - name: Report HF cache directory
	      run: |
	        if [ -d "$HF_HOME" ]; then
	          ls -al ${{ env.HF_HOME }}
	          du -sh ${{ env.HF_HOME }}
	        fi
	    - name: Prepare OS environment
	      run: |
	        # as jobs might run in parallel on the same system, apt-get might
	        # step into the lock hold by other job, so every command is retried
	        # once per second. Each command gets its own 60 second budget; when
	        # it is exhausted the step fails. A bare `false` would not stop the
	        # loop here, because this script does not run under `set -e`.
	        retry() {
	          local start_time=$SECONDS
	          until "$@"; do
	            if (( SECONDS - start_time > 60 )); then
	              echo "Giving up on '$*' after 60 seconds of retries" >&2
	              exit 1
	            fi
	            sleep 1
	          done
	        }
	        retry sudo apt-get update
	        # $PACKAGES is intentionally unquoted: it holds one package per line.
	        retry sudo apt-get install -y $PACKAGES
	        retry git lfs install
	    - name: Setup python-${{ env.python }}
	      uses: actions/setup-python@v5
	      with:
	        python-version: ${{ env.python }}
	    - name: Check python
	      run: |
	        which python && python -V
	        which pip && pip list
	        pip install -U pip wheel setuptools
	    - name: Prepare pytorch and deps
	      run: |
	        pip install junitparser
	        # Pin exactly the versions resolved once by the `prepare` job so that
	        # all test groups install the same nightly build.
	        pip install $TORCH_INDEX \
	          torch==${{ needs.prepare.outputs.torch }} \
	          torchvision==${{ needs.prepare.outputs.torchvision }} \
	          torchaudio==${{ needs.prepare.outputs.torchaudio }} \
	          pytorch-triton-xpu==${{needs.prepare.outputs.triton }}
	    - name: Prepare Transformers
	      run: |
	        pwd
	        cd transformers
	        pip install \
	          accelerate==${{ env.accelerate }} \
	          datasets==${{ env.datasets }}
	        pip install -e .
	        pip install -e ".[dev-torch,testing,video]"
	        rm -rf logs && mkdir -p logs
	        rm -rf reports
	        cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./
	    - name: Report installed versions
	      run: |
	        LOGS_DIR="${{ github.workspace }}/transformers/logs"
	        echo "pip installed packages:"
	        pip list | tee "$LOGS_DIR/pip_list-$TEST_CASE.txt"
	        echo "lspci gpu devices:"
	        lspci -d ::0380 | tee "$LOGS_DIR/lspci_0380-$TEST_CASE.txt"
	        echo "GPU render nodes:"
	        cat /sys/class/drm/render*/device/device | tee "$LOGS_DIR/device_IDs-$TEST_CASE.txt"
	        echo "xpu-smi output:"
	        xpu-smi discovery -y --json --dump -1
	    - name: Sanity check installed packages
	      run: |
	        # Use latest pytest
	        pip install -U pytest pytest-timeout pytest-xdist pytest-shard
	        # These checks are to exit earlier if for any reason Transformers
	        # reinstalled torch packages back to CUDA versions (not expected).
	        pip show torch | grep Version | grep xpu
	        pip show torchaudio | grep Version | grep xpu
	        pip show torchvision | grep Version | grep xpu
	        python -c 'import torch; exit(not torch.xpu.is_available())'
	    - name: Run tests on ${{ needs.prepare.outputs.hostname }}
	      run: |
	        cd transformers
	        # `|| true`: pytest's exit code is ignored on purpose; the next step
	        # judges the run from the JUnit reports.
	        python -m pytest --make-reports=${TEST_CASE} --junit-xml=reports/${TEST_CASE}.xml \
	          -k "${{ matrix.test.filter}}" ${{ matrix.test.cmd }} || true
	    - name: Check for errors in tests
	      run: |
	        python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml
	    - name: Print environment
	      if: ${{ ! cancelled() }}
	      uses: ./torch-xpu-ops/.github/actions/print-environment
	      with:
	        pip_packages: 'accelerate datasets transformers'
	        to: 'transformers/logs/environment-$TEST_CASE.md'
	    - name: Clean up
	      if: ${{ always() }}
	      run: |
	        if [ -d "$HF_HOME" ]; then
	          ls -al ${{ env.HF_HOME }}
	          du -sh ${{ env.HF_HOME }}
	          rm -rf ${{ env.HF_HOME }}
	        fi
	    - name: Upload reports
	      if: ${{ ! cancelled() }}
	      uses: actions/upload-artifact@v4
	      with:
	        name: reports-${{ matrix.test.test_case }}-${{ github.event.pull_request.number || github.sha }}
	        path: ${{ github.workspace }}/transformers/reports
	    - name: Upload logs
	      if: ${{ ! cancelled() }}
	      uses: actions/upload-artifact@v4
	      with:
	        name: logs-${{ matrix.test.test_case }}-${{ github.event.pull_request.number || github.sha }}
	        path: ${{ github.workspace }}/transformers/logs

	# Job (listed under `jobs:`): downloads the reports and logs uploaded by the
	# `tests` job and renders result tables into the step summary. Runs whether
	# the tests succeeded or failed.
	report:
	  needs: tests
	  if: ${{ success() || failure() }}
	  runs-on: ubuntu-24.04
	  steps:
	    - name: Download reports
	      uses: actions/download-artifact@v4
	      with:
	        pattern: 'reports-*'
	        path: 'transformers/reports/'
	        merge-multiple: true
	    - name: Download logs
	      if: ${{ ! cancelled() }}
	      uses: actions/download-artifact@v4
	      with:
	        pattern: 'logs-*'
	        path: 'transformers/logs/'
	        merge-multiple: true
	    - name: Checkout torch-xpu-ops
	      if: ${{ ! cancelled() }}
	      uses: actions/checkout@v4
	      with:
	        path: torch-xpu-ops
	    - name: Setup python-${{ env.python }}
	      uses: actions/setup-python@v5
	      with:
	        python-version: ${{ env.python }}
	    - name: Install pip deps
	      run: |
	        pip install junitparser
	    - name: Print results table
	      if: ${{ ! cancelled() }}
	      run: |
	        # Helper function to return the number preceding a given word in a
	        # stats file, i.e. for a line such as:
	        #   === 25 failed, 11 warnings, 0 errors ===
	        # Call as follows:
	        #   parse_stat $file "failed"
	        # Prints "0" when the word does not occur in the file.
	        function parse_stat() {
	          local count
	          count=$(grep "$2" "$1" | sed "s/.* \([0-9]*\) $2.*/\1/")
	          if [ -n "$count" ]; then echo $count; else echo "0"; fi
	        }
	        cd transformers
	        {
	          echo "### Results"
	          echo "| Test group | Errors | Failed | Deselected | Passed | Skipped |"
	          echo "| --- | --- | --- | --- | --- | --- |"
	          for stat in $(find reports -name stats.txt); do
	            # Each stat.txt is located in: reports/$test_group/stats.txt
	            test_group=$(echo $stat | cut -f 2 -d/)
	            # Get failed, passed, skipped, etc. counters
	            failed=$(parse_stat $stat failed)
	            passed=$(parse_stat $stat passed)
	            deselected=$(parse_stat $stat deselected)
	            skipped=$(parse_stat $stat skipped)
	            warnings=$(parse_stat $stat warnings)
	            errors=$(parse_stat $stat errors)
	            echo "| $test_group | $errors | $failed | $deselected | $passed | $skipped |"
	          done
	        } >> $GITHUB_STEP_SUMMARY
	    - name: Print baseline difference
	      if: ${{ ! cancelled() }}
	      run: |
	        python torch-xpu-ops/.github/scripts/check-transformers.py transformers/reports/*.xml >> $GITHUB_STEP_SUMMARY || true
	    - name: Print failure lines
	      if: ${{ ! cancelled() }}
	      run: |
	        cd transformers
	        {
	          echo "### Failure lines"
	          echo "| Test group |File | Error | Comment |"
	          echo "| --- | --- | --- | --- |"
	          # Start from an empty file so that `sort` below still has an input
	          # when no failures_line.txt was produced.
	          rm -rf _failures.txt && touch _failures.txt
	          for failure in $(find reports -name failures_line.txt); do
	            # Each failure_line.txt is located in: reports/$test_group/failure_line.txt
	            test_group=$(echo $failure | cut -f2 -d/)
	            # Drop the first line and prefix every other line with the group.
	            tail -n +2 $failure | sed "s/^/$test_group /" >> _failures.txt
	          done
	          # failures_line.txt file does not have test case information,
	          # so we can just sort the output and report uniq values
	          sort _failures.txt | uniq > _failures_uniq.txt
	          while read line; do
	            test_group=$(echo $line | cut -f1 -d" ")
	            # The sed calls strip one trailing ':' from the field.
	            file=$(echo $line | cut -f2 -d" " | sed "s/\(.*\):$/\1/")
	            error=$(echo $line | cut -f3 -d" " | sed "s/\(.*\):$/\1/")
	            # Failure comments often contain special characters which complicate
	            # parsing failure lines. But fortunately we know for sure where comments
	            # start. So we just output all contents starting from this position and
	            # wrap everything in <pre></pre> to avoid collisions with Markdown formatting.
	            comment="<pre>$(echo $line | cut -f4- -d' ' | sed 's/\(.*\):$/\1/')</pre>"
	            echo "| $test_group | $file | $error | $comment |"
	          done <_failures_uniq.txt
	        } >> $GITHUB_STEP_SUMMARY
	    - name: Print not implemented XPU backend ops
	      if: ${{ ! cancelled() }}
	      run: |
	        cd transformers
	        {
	          echo "### Not implemented ops"
	          echo "| Test group | Operator | Status |"
	          echo "| --- | --- | --- |"
	          rm -rf _ops.txt && touch _ops.txt
	          for log in $(find reports -name failures_line.txt); do
	            # Each failure_line.txt is located in: reports/$test_group/failure_line.txt
	            test_group=$(echo $log | cut -f2 -d/)
	            # Keep only the operator name quoted after "The operator".
	            ops=$(grep NotImplementedError $log | grep "for the XPU device" | sed "s/.*The operator '\(.*\)' is not.*/\1/")
	            for op in $ops; do
	              echo "| $test_group | <pre>$op</pre> | not implemented |" >> _ops.txt
	            done
	          done
	          for log in $(find reports -name warnings.txt); do
	            # Each warnings.txt is located in: reports/$test_group/warnings.txt
	            test_group=$(echo $log | cut -f2 -d/)
	            # NOTE(review): this pattern keeps everything between the opening
	            # quote and " on the XPU", and the loop below emits one row per
	            # whitespace-separated word of it - confirm this is intended.
	            ops=$(grep UserWarning $log | grep "on the XPU backend" | sed "s/.*The operator '\(.*\) on the XPU.*/\1/")
	            for op in $ops; do
	              echo "| $test_group | <pre>$op</pre> | fallback to CPU happens |" >> _ops.txt
	            done
	          done
	          sort _ops.txt | uniq
	        } >> $GITHUB_STEP_SUMMARY
	    - name: Print environment
	      if: ${{ ! cancelled() }}
	      run: |
	        first_md=$(find transformers/logs -name "environment-*.md" | head -1)
	        # Without any environment file there is nothing to print or compare;
	        # stop here instead of running cat/sed/diff with empty arguments.
	        if [ -z "$first_md" ]; then
	          echo "No environment-*.md files found in transformers/logs"
	          exit 0
	        fi
	        cat $first_md >> $GITHUB_STEP_SUMMARY
	        # we expect environments to be identical except for the ZE_AFFINITY_MASK line
	        find transformers/logs -name "environment-*.md" | xargs sed -i '/ZE_AFFINITY_MASK/d'
	        for f in $(find transformers/logs -name "environment-*.md"); do
	          diff $f $first_md
	        done

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Linux Transformers Test #318

Workflow file

Linux Transformers Test #318

Uh oh!

Jobs

Run details

Workflow file for this run