Support JetVLM #46241

Workflow file for this run

.github/workflows/pr-test.yml at 08072bb

	# PR test workflow: decides what changed, optionally builds the sgl-kernel
	# wheels, then runs the test jobs defined under `jobs:`.
	name: PR Test

	# Triggers:
	#   - push to main
	#   - pull requests targeting main, only on new commits (synchronize) or
	#     when a label is added (labeled)
	#   - manual dispatch
	on:
	  push:
	    branches: [main]
	  pull_request:
	    branches: [main]
	    types: [synchronize, labeled]
	  workflow_dispatch:
	    inputs:
	      # NOTE(review): no job visible in this file reads `inputs.version`;
	      # confirm it is consumed somewhere or remove it.
	      version:
	        description: "FlashInfer version"
	        required: true
	        type: choice
	        default: "release"
	        options:
	          - "release"
	          - "nightly"

	# One active run per ref: starting a new run cancels the one in progress.
	concurrency:
	  group: pr-test-${{ github.ref }}
	  cancel-in-progress: true

	jobs:
	  # =============================================== check changes ====================================================
	  # Gate job. For pull requests it fails unless the PR carries the 'run-ci'
	  # label and is not a draft, rate-limits low-permission users, and finally
	  # publishes which parts of the tree changed (main_package / sgl_kernel /
	  # multimodal_gen) as job outputs for the downstream jobs' `if:` conditions.
	  check-changes:
	    runs-on: ubuntu-latest
	    outputs:
	      main_package: ${{ steps.filter.outputs.main_package }}
	      sgl_kernel: ${{ steps.filter.outputs.sgl_kernel }}
	      multimodal_gen: ${{ steps.filter.outputs.multimodal_gen }}
	    steps:
	      - name: Checkout code
	        uses: actions/checkout@v4

	      - name: Fail if the PR does not have the 'run-ci' label
	        if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'run-ci')
	        run: |
	          echo "This pull request does not have the 'run-ci' label. Failing the workflow."
	          exit 1

	      - name: Fail if the PR is a draft
	        if: github.event_name == 'pull_request' && github.event.pull_request.draft == true
	        run: |
	          echo "This pull request is a draft. Failing the workflow."
	          exit 1

	      # Users with write/maintain/admin permission are exempt. Everyone else
	      # may trigger this workflow once per HOURS window for a given event type.
	      - name: Enforce rate limit for low-permission actors
	        if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch'
	        uses: actions/github-script@v7
	        with:
	          github-token: ${{ secrets.GITHUB_TOKEN }}
	          script: |
	            // Length of the look-back window, in hours.
	            const HOURS = 2;
	            const owner = context.repo.owner;
	            const repo = context.repo.repo;
	            const eventName = context.eventName;
	            const curRun = await github.rest.actions.getWorkflowRun({
	              owner, repo, run_id: context.runId
	            });
	            // Prefer the run's triggering actor; fall back to context.actor.
	            const triggeringActor = curRun.data.triggering_actor?.login || context.actor;

	            // True when `username` has write, maintain or admin permission.
	            // A 404 or 403 from the API is treated as "no permission".
	            async function hasHighPermission(username) {
	              try {
	                const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username });
	                const perm = data.permission || 'none';
	                return perm === 'write' || perm === 'maintain' || perm === 'admin';
	              } catch (e) {
	                if (e.status === 404 || e.status === 403) return false;
	                throw e;
	              }
	            }

	            if (await hasHighPermission(triggeringActor)) {
	              core.info(`Triggering user '${triggeringActor}' has high permission. No rate limit applied.`);
	              return;
	            }

	            const cutoff = new Date(Date.now() - HOURS * 60 * 60 * 1000);
	            core.info(`Checking for workflow runs since ${cutoff.toISOString()} (last ${HOURS} hours) for event '${eventName}'.`);

	            // NOTE(review): only a single page of up to 100 runs is inspected.
	            // If more than 100 runs of this event type were created inside the
	            // window, an earlier run by the same user will not be seen.
	            const { data } = await github.rest.actions.listWorkflowRuns({
	              owner,
	              repo,
	              workflow_id: 'pr-test.yml',
	              event: eventName,
	              per_page: 100,
	            });

	            const runs = data.workflow_runs || [];
	            // First run, other than this one, created inside the window by the same user.
	            const recentFound = runs.find((run) => {
	              if (String(run.id) === String(context.runId)) return false;
	              if (new Date(run.created_at) < cutoff) return false;
	              return (run.actor?.login === triggeringActor) || (run.triggering_actor?.login === triggeringActor);
	            });

	            if (recentFound) {
	              core.setFailed(
	                `User '${triggeringActor}' already triggered '${context.workflow}' via '${eventName}' at ${recentFound.created_at}. ` +
	                `Please wait ${HOURS} hours before triggering again.`
	              );
	            } else {
	              core.info(`No recent runs detected within the last ${HOURS} hours; proceeding.`);
	            }

	      # Each filter name becomes a 'true'/'false' output of this step.
	      - name: Detect file changes
	        id: filter
	        uses: dorny/paths-filter@v3
	        with:
	          filters: |
	            main_package:
	              - "python/sglang/!(multimodal_gen)/**"
	              - "python/*.toml"
	              - "scripts/ci/**"
	              - "test/**"
	              - ".github/workflows/pr-test.yml"
	            sgl_kernel:
	              - "sgl-kernel/**"
	            multimodal_gen:
	              - "python/sglang/multimodal_gen/**"

	# =============================================== sgl-kernel ====================================================

	sgl-kernel-build-wheels:
	needs: [check-changes]
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	runs-on: x64-kernel-build-node
	strategy:
	matrix:
	include:
	- python-version: "3.10"
	cuda-version: "12.9"
	# Add back when CUDA 13.0 is supported on CI
	# - python-version: "3.10"
	# cuda-version: "13.0"
	name: Build Wheel
	steps:
	- name: Cleanup
	run: \|
	sudo rm -rf $GITHUB_WORKSPACE/* \|\| true

	- uses: actions/checkout@v4
	with:
	submodules: "recursive"

	- name: Set up Python ${{ matrix.python-version }}
	uses: actions/setup-python@v5
	with:
	python-version: ${{ matrix.python-version }}

	- name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
	run: \|
	cd sgl-kernel
	./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"

	- name: Upload artifacts
	uses: actions/upload-artifact@v4
	with:
	name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
	path: sgl-kernel/dist/*

	sgl-kernel-build-wheels-arm:
	needs: [check-changes]
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	runs-on: arm-kernel-build-node
	strategy:
	matrix:
	include:
	- python-version: "3.10"
	cuda-version: "12.9"
	name: Build Wheel Arm
	steps:
	- name: Cleanup
	run: \|
	if [ -d "$GITHUB_WORKSPACE" ]; then
	sudo rm -rf "$GITHUB_WORKSPACE"/* \|\| true
	else
	echo "$GITHUB_WORKSPACE does not exist, nothing to clean"
	fi

	- uses: actions/checkout@v4
	with:
	submodules: "recursive"

	- name: Set up Python ${{ matrix.python-version }}
	uses: actions/setup-python@v5
	with:
	python-version: ${{ matrix.python-version }}

	- name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
	run: \|
	cd sgl-kernel
	./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"

	- name: Upload artifacts
	uses: actions/upload-artifact@v4
	with:
	name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}-aarch64
	path: sgl-kernel/dist/*

	sgl-kernel-unit-test:
	needs: [check-changes, sgl-kernel-build-wheels]
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	runs-on: 1-gpu-runner
	env:
	RUNNER_LABELS: 1-gpu-runner
	steps:
	- uses: actions/checkout@v4

	- name: Cleanup
	run: \|
	ls -alh sgl-kernel/dist \|\| true
	rm -rf sgl-kernel/dist/* \|\| true

	- name: Download artifacts
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Run test
	timeout-minutes: 30
	run: \|
	cd sgl-kernel
	pytest tests/

	sgl-kernel-mla-test:
	needs: [check-changes, sgl-kernel-build-wheels]
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	runs-on: 1-gpu-runner
	env:
	RUNNER_LABELS: 1-gpu-runner
	steps:
	- uses: actions/checkout@v4

	- name: Cleanup
	run: \|
	ls -alh sgl-kernel/dist \|\| true
	rm -rf sgl-kernel/dist/* \|\| true

	- name: Download artifacts
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Run test
	timeout-minutes: 30
	run: \|
	cd test/srt
	python3 test_mla_deepseek_v3.py

	sgl-kernel-benchmark-test:
	needs: [check-changes, sgl-kernel-build-wheels]
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	runs-on: 1-gpu-runner
	env:
	CI: true
	RUNNER_LABELS: 1-gpu-runner
	steps:
	- uses: actions/checkout@v4

	- name: Cleanup
	run: \|
	ls -alh sgl-kernel/dist \|\| true
	rm -rf sgl-kernel/dist/* \|\| true

	- name: Download artifacts
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Run benchmark tests
	timeout-minutes: 45
	run: \|
	cd sgl-kernel/benchmark
	echo "Running sgl-kernel benchmark tests in CI mode..."

	echo "CI environment variable: $CI"
	echo "GITHUB_ACTIONS environment variable: $GITHUB_ACTIONS"

	for bench_file in bench_*.py; do
	echo "Testing $bench_file..."
	timeout 60 python3 "$bench_file" \|\| echo "Warning: $bench_file timed out or failed, continuing..."
	echo "Completed $bench_file"
	echo "---"
	done

	echo "All benchmark tests completed!"

	# =============================================== multimodal_gen ====================================================
	multimodal-gen-test:
	needs: [check-changes]
	if: needs.check-changes.outputs.multimodal_gen == 'true'
	runs-on: ubuntu-latest
	steps:
	- name: Checkout code
	uses: actions/checkout@v4
	- name: Run placeholder test
	run: \|
	echo "Running multimodal_gen tests..."
	echo "This is a placeholder for future tests."

	  # Adding a single CUDA13 smoke test to verify that the kernel builds and runs
	  # TODO: Add back this test when it can pass on CI
	  # cuda13-kernel-smoke-test:
	  #   needs: [check-changes, sgl-kernel-build-wheels]
	  #   if: needs.check-changes.outputs.sgl_kernel == 'true'
	  #   runs-on: x64-cu13-kernel-tests
	  #   steps:
	  #     - uses: actions/checkout@v4

	  #     - name: Cleanup
	  #       run: |
	  #         ls -alh sgl-kernel/dist || true
	  #         rm -rf sgl-kernel/dist/* || true

	  #     - name: Download CUDA 13.0 artifacts
	  #       uses: actions/download-artifact@v4
	  #       with:
	  #         path: sgl-kernel/dist/
	  #         merge-multiple: true
	  #         pattern: wheel-python3.10-cuda13.0

	  #     - name: Install dependencies
	  #       run: |
	  #         CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	  #     - name: Run kernel unit tests
	  #       timeout-minutes: 30
	  #       run: |
	  #         cd sgl-kernel
	  #         pytest tests/

	# =============================================== primary ====================================================

	unit-test-frontend:
	needs: [check-changes, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 1-gpu-runner
	env:
	RUNNER_LABELS: 1-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Run test
	timeout-minutes: 10
	run: \|
	cd test/lang
	python3 run_suite.py --suite per-commit

	unit-test-backend-1-gpu:
	needs: [check-changes, unit-test-frontend, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 1-gpu-runner
	env:
	RUNNER_LABELS: 1-gpu-runner
	strategy:
	fail-fast: false
	max-parallel: 5
	matrix:
	part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Run test
	timeout-minutes: 30
	run: \|
	cd test/srt
	python3 run_suite.py --suite per-commit-1-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 15

	unit-test-backend-2-gpu:
	needs: [check-changes, unit-test-backend-1-gpu, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 2-gpu-runner
	env:
	RUNNER_LABELS: 2-gpu-runner
	strategy:
	fail-fast: false
	matrix:
	part: [0, 1]
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Run test
	timeout-minutes: 30
	run: \|
	cd test/srt
	python3 run_suite.py --suite per-commit-2-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2

	unit-test-backend-4-gpu:
	needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 4-gpu-h100
	env:
	RUNNER_LABELS: 4-gpu-h100
	strategy:
	fail-fast: false
	matrix:
	part: [0, 1]
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Run test
	timeout-minutes: 20
	run: \|
	cd test/srt
	python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2

	unit-test-backend-8-gpu-h200:
	needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 8-gpu-h200
	env:
	RUNNER_LABELS: 8-gpu-h200
	strategy:
	fail-fast: false
	matrix:
	part: [0, 1, 2]
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Run test
	timeout-minutes: 20
	run: \|
	cd test/srt
	python3 run_suite.py --suite per-commit-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 3

	unit-test-backend-8-gpu-h20:
	needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 8-gpu-h20
	env:
	SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4"
	RUNNER_LABELS: 8-gpu-h20
	strategy:
	fail-fast: false
	matrix:
	part: [0, 1]
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Run test
	timeout-minutes: 20
	run: \|
	cd test/srt
	python3 run_suite.py --suite per-commit-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2

	performance-test-1-gpu-part-1:
	needs: [check-changes, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 1-gpu-runner
	env:
	RUNNER_LABELS: 1-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Benchmark single latency
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
	python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default

	- name: Benchmark online latency
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default

	- name: Benchmark offline throughput
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default

	- name: Benchmark offline throughput (Non-streaming, small batch size)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size

	- name: Benchmark online latency (EAGLE)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle

	- name: Benchmark online latency (LoRA)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
	python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates

	performance-test-1-gpu-part-2:
	needs: [check-changes, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 1-gpu-runner
	env:
	RUNNER_LABELS: 1-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Benchmark offline throughput (w/o RadixAttention)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache

	- name: Benchmark offline throughput (w/ Triton)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend

	- name: Benchmark offline throughput (w/ FP8)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8

	- name: Benchmark VLM offline throughput
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput

	- name: Benchmark VLM online latency
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency

	performance-test-1-gpu-part-3:
	needs: [check-changes, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 1-gpu-runner
	env:
	RUNNER_LABELS: 1-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Benchmark Scores online latency and throughput
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_latency_throughput

	- name: Benchmark Scores online latency and throughput (batch size scaling)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_batch_scaling

	performance-test-2-gpu:
	needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 2-gpu-runner
	env:
	RUNNER_LABELS: 2-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

	- name: Benchmark single latency (TP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1

	- name: Benchmark single latency + torch.compile (TP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1

	- name: Benchmark offline throughput (TP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default

	- name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

	- name: Benchmark offline PP decode throughput (PP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode

	- name: Benchmark offline PP prefill throughput (PP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill

	accuracy-test-1-gpu:
	needs: [check-changes, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 1-gpu-runner
	env:
	RUNNER_LABELS: 1-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
	git clone https://github.com/merrymercy/human-eval.git
	cd human-eval
	pip install -e .

	- name: Evaluate accuracy
	timeout-minutes: 20
	run: \|
	cd test/srt
	python3 test_eval_accuracy_large.py

	accuracy-test-2-gpu:
	needs: [check-changes, accuracy-test-1-gpu, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 2-gpu-runner
	env:
	RUNNER_LABELS: 2-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
	git clone https://github.com/merrymercy/human-eval.git
	cd human-eval
	pip install -e .

	- name: Evaluate accuracy (TP=2)
	timeout-minutes: 20
	run: \|
	cd test/srt
	python3 test_moe_eval_accuracy_large.py

	unit-test-deepep-4-gpu:
	needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 4-gpu-h100
	env:
	RUNNER_LABELS: 4-gpu-h100
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh

	- name: Run test
	timeout-minutes: 20
	run: \|
	cd test/srt
	python3 run_suite.py --suite per-commit-4-gpu-deepep

	unit-test-deepep-8-gpu:
	needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 8-gpu-h200
	env:
	RUNNER_LABELS: 8-gpu-h200
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh

	- name: Run test
	timeout-minutes: 20
	run: \|
	cd test/srt
	python3 run_suite.py --suite per-commit-8-gpu-h200-deepep

	unit-test-backend-4-gpu-b200:
	needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 4-gpu-b200
	env:
	RUNNER_LABELS: 4-gpu-b200
	strategy:
	fail-fast: false
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v6
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh

	- name: Run test
	timeout-minutes: 45
	run: \|
	cd test/srt
	python3 run_suite.py --suite per-commit-4-gpu-b200 --auto-partition-id 0 --auto-partition-size 1 --timeout-per-file 3600

	unit-test-backend-4-gpu-gb200:
	needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels-arm]
	if: always() && !failure() && !cancelled() &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	runs-on: 4-gpu-gb200
	env:
	RUNNER_LABELS: 4-gpu-gb200
	strategy:
	fail-fast: false
	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9-aarch64

	- name: Install dependencies
	run: \|
	CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 GRACE_BLACKWELL=1 bash scripts/ci/ci_install_deepep.sh

	- name: Run test
	timeout-minutes: 45
	run: \|
	cd test/srt
	python3 run_suite.py --suite per-commit-4-gpu-gb200 --auto-partition-id 0 --auto-partition-size 1 --timeout-per-file 3600

	pr-test-finish:
	needs:
	[
	check-changes,

	sgl-kernel-build-wheels,
	sgl-kernel-unit-test,
	sgl-kernel-mla-test,
	sgl-kernel-benchmark-test,

	multimodal-gen-test,

	unit-test-frontend,
	unit-test-backend-1-gpu,
	unit-test-backend-2-gpu,
	unit-test-backend-4-gpu,
	unit-test-backend-8-gpu-h200,
	performance-test-1-gpu-part-1,
	performance-test-1-gpu-part-2,
	performance-test-1-gpu-part-3,
	performance-test-2-gpu,
	accuracy-test-1-gpu,
	accuracy-test-2-gpu,
	unit-test-deepep-4-gpu,
	unit-test-deepep-8-gpu,
	unit-test-backend-4-gpu-b200,
	unit-test-backend-4-gpu-gb200,
	]
	if: always()
	runs-on: ubuntu-latest
	steps:
	- name: Check all dependent job statuses
	run: \|
	# Convert the 'needs' context to a JSON string
	json_needs='${{ toJson(needs) }}'

	# Get a list of all job names from the JSON keys
	job_names=$(echo "$json_needs" \| jq -r 'keys_unsorted[]')

	for job in $job_names; do
	# For each job, extract its result
	result=$(echo "$json_needs" \| jq -r --arg j "$job" '.[$j].result')

	# Print the job name and its result
	echo "$job: $result"

	# Check for failure or cancellation and exit if found
	if [[ "$result" == "failure" \|\| "$result" == "cancelled" ]]; then
	echo "The above jobs failed."
	exit 1
	fi
	done

	# If the loop completes, all jobs were successful
	echo "All jobs completed successfully"
	exit 0

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Support JetVLM #46241

Workflow file

Support JetVLM #46241

Uh oh!

Jobs

Run details

Workflow file for this run