# Captured from the GitHub Actions run page for "Support JetVLM" (#46241).
# Workflow file for this run:
# PR Test workflow.
# Starts on pushes to main, on pull requests targeting main, and on manual dispatch.
name: PR Test

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
    # Only new commits and label changes on a pull request start a run.
    types:
      - synchronize
      - labeled
  workflow_dispatch:
    inputs:
      version:
        description: 'FlashInfer version'
        required: true
        type: choice
        default: 'release'
        options:
          - 'release'
          - 'nightly'

# One run per ref: a newer run on the same ref cancels the one still in progress.
concurrency:
  group: pr-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
# =============================================== check changes ====================================================
# Gate job. It fails early for pull requests that are drafts or lack the 'run-ci' label,
# rate-limits actors without write/maintain/admin permission, and publishes three
# path-filter outputs (main_package, sgl_kernel, multimodal_gen) that later jobs read.
check-changes:
  runs-on: ubuntu-latest
  outputs:
    main_package: ${{ steps.filter.outputs.main_package }}
    sgl_kernel: ${{ steps.filter.outputs.sgl_kernel }}
    multimodal_gen: ${{ steps.filter.outputs.multimodal_gen }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Fail if the PR does not have the 'run-ci' label
      if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'run-ci')
      run: |
        echo "This pull request does not have the 'run-ci' label. Failing the workflow."
        exit 1

    - name: Fail if the PR is a draft
      if: github.event_name == 'pull_request' && github.event.pull_request.draft == true
      run: |
        echo "This pull request is a draft. Failing the workflow."
        exit 1

    - name: Enforce rate limit for low-permission actors
      if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch'
      uses: actions/github-script@v7
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        script: |
          const HOURS = 2;
          const owner = context.repo.owner;
          const repo = context.repo.repo;
          const eventName = context.eventName;

          // Use the triggering actor recorded on this run; fall back to context.actor.
          const curRun = await github.rest.actions.getWorkflowRun({
            owner, repo, run_id: context.runId
          });
          const triggeringActor = curRun.data.triggering_actor?.login || context.actor;

          // True when the user has write, maintain, or admin permission on the repository.
          // A 404/403 from the permission lookup is treated as "no high permission".
          async function hasHighPermission(username) {
            try {
              const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ owner, repo, username });
              const perm = data.permission || 'none';
              return perm === 'write' || perm === 'maintain' || perm === 'admin';
            } catch (e) {
              if (e.status === 404 || e.status === 403) return false;
              throw e;
            }
          }

          if (await hasHighPermission(triggeringActor)) {
            core.info(`Triggering user '${triggeringActor}' has high permission. No rate limit applied.`);
            return;
          }

          const cutoff = new Date(Date.now() - HOURS * 60 * 60 * 1000);
          core.info(`Checking for workflow runs since ${cutoff.toISOString()} (last ${HOURS} hours) for event '${eventName}'.`);

          // Ask the API only for runs created since the cutoff and walk every page.
          // Reading just the first 100 runs could miss this actor's earlier run when
          // more than 100 runs of the same event were created inside the window.
          const createdSince = cutoff.toISOString().replace(/\.\d+Z$/, 'Z');
          const runs = await github.paginate(github.rest.actions.listWorkflowRuns, {
            owner,
            repo,
            workflow_id: 'pr-test.yml',
            event: eventName,
            created: `>=${createdSince}`,
            per_page: 100,
          });

          // Ignore the current run and anything older than the cutoff, then match on
          // either the run's actor or its triggering actor.
          const recentFound = runs.find((run) => {
            if (String(run.id) === String(context.runId)) return false;
            if (new Date(run.created_at) < cutoff) return false;
            return (run.actor?.login === triggeringActor) || (run.triggering_actor?.login === triggeringActor);
          });

          if (recentFound) {
            core.setFailed(
              `User '${triggeringActor}' already triggered '${context.workflow}' via '${eventName}' at ${recentFound.created_at}. ` +
              `Please wait ${HOURS} hours before triggering again.`
            );
          } else {
            core.info(`No recent runs detected within the last ${HOURS} hours; proceeding.`);
          }

    # Each filter name becomes a step output that is exposed through the job outputs above.
    - name: Detect file changes
      id: filter
      uses: dorny/paths-filter@v3
      with:
        filters: |
          main_package:
            - "python/sglang/!(multimodal_gen)/**"
            - "python/*.toml"
            - "scripts/ci/**"
            - "test/**"
            - ".github/workflows/pr-test.yml"
          sgl_kernel:
            - "sgl-kernel/**"
          multimodal_gen:
            - "python/sglang/multimodal_gen/**"
# =============================================== sgl-kernel ====================================================
# Builds the sgl-kernel wheel on the x64 build node when files under sgl-kernel/ changed,
# and uploads it as the artifact wheel-python<version>-cuda<version>.
sgl-kernel-build-wheels:
  name: Build Wheel
  needs: [check-changes]
  if: needs.check-changes.outputs.sgl_kernel == 'true'
  runs-on: x64-kernel-build-node
  strategy:
    matrix:
      include:
        - python-version: "3.10"
          cuda-version: "12.9"
        # Add back when CUDA 13.0 is supported on CI
        # - python-version: "3.10"
        #   cuda-version: "13.0"
  steps:
    - name: Cleanup
      run: |
        # Quote the path and require that it exists: with an empty or unset
        # GITHUB_WORKSPACE the unquoted form would expand to `sudo rm -rf /*`.
        if [ -d "$GITHUB_WORKSPACE" ]; then
          sudo rm -rf "$GITHUB_WORKSPACE"/* || true
        else
          echo "$GITHUB_WORKSPACE does not exist, nothing to clean"
        fi

    - uses: actions/checkout@v4
      with:
        submodules: "recursive"

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}

    - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
      run: |
        cd sgl-kernel
        ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"

    - name: Upload artifacts
      uses: actions/upload-artifact@v4
      with:
        name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
        path: sgl-kernel/dist/*
# Builds the sgl-kernel wheel on the ARM build node when files under sgl-kernel/ changed.
# The uploaded artifact name carries an -aarch64 suffix.
sgl-kernel-build-wheels-arm:
  name: Build Wheel Arm
  runs-on: arm-kernel-build-node
  needs:
    - check-changes
  if: needs.check-changes.outputs.sgl_kernel == 'true'
  strategy:
    matrix:
      include:
        - python-version: "3.10"
          cuda-version: "12.9"
  steps:
    # Empty the workspace only when the directory exists.
    - name: Cleanup
      run: |
        if [ -d "$GITHUB_WORKSPACE" ]; then
          sudo rm -rf "$GITHUB_WORKSPACE"/* || true
        else
          echo "$GITHUB_WORKSPACE does not exist, nothing to clean"
        fi

    - uses: actions/checkout@v4
      with:
        submodules: recursive

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}

    - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
      run: |
        cd sgl-kernel
        ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"

    - name: Upload artifacts
      uses: actions/upload-artifact@v4
      with:
        path: sgl-kernel/dist/*
        name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}-aarch64
# Runs the pytest suite in sgl-kernel/tests after installing the wheel built by sgl-kernel-build-wheels.
sgl-kernel-unit-test:
  runs-on: 1-gpu-runner
  needs:
    - check-changes
    - sgl-kernel-build-wheels
  if: needs.check-changes.outputs.sgl_kernel == 'true'
  env:
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - uses: actions/checkout@v4

    # List, then empty, sgl-kernel/dist before the wheel is downloaded into it.
    - name: Cleanup
      run: |
        ls -alh sgl-kernel/dist || true
        rm -rf sgl-kernel/dist/* || true

    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    # The sgl_kernel change flag is handed to the install script as CUSTOM_BUILD_SGL_KERNEL.
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        cd sgl-kernel
        pytest tests/
# Runs test/srt/test_mla_deepseek_v3.py after installing the wheel built by sgl-kernel-build-wheels.
sgl-kernel-mla-test:
  runs-on: 1-gpu-runner
  needs:
    - check-changes
    - sgl-kernel-build-wheels
  if: needs.check-changes.outputs.sgl_kernel == 'true'
  env:
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - uses: actions/checkout@v4

    # List, then empty, sgl-kernel/dist before the wheel is downloaded into it.
    - name: Cleanup
      run: |
        ls -alh sgl-kernel/dist || true
        rm -rf sgl-kernel/dist/* || true

    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/srt
        python3 test_mla_deepseek_v3.py
# Executes every sgl-kernel/benchmark/bench_*.py script with a 60-second limit each.
# A script that fails or times out only prints a warning; the loop continues.
sgl-kernel-benchmark-test:
  runs-on: 1-gpu-runner
  needs:
    - check-changes
    - sgl-kernel-build-wheels
  if: needs.check-changes.outputs.sgl_kernel == 'true'
  env:
    CI: true
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - uses: actions/checkout@v4

    # List, then empty, sgl-kernel/dist before the wheel is downloaded into it.
    - name: Cleanup
      run: |
        ls -alh sgl-kernel/dist || true
        rm -rf sgl-kernel/dist/* || true

    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Run benchmark tests
      timeout-minutes: 45
      run: |
        cd sgl-kernel/benchmark
        echo "Running sgl-kernel benchmark tests in CI mode..."

        echo "CI environment variable: $CI"
        echo "GITHUB_ACTIONS environment variable: $GITHUB_ACTIONS"

        for bench_file in bench_*.py; do
          echo "Testing $bench_file..."
          timeout 60 python3 "$bench_file" || echo "Warning: $bench_file timed out or failed, continuing..."
          echo "Completed $bench_file"
          echo "---"
        done

        echo "All benchmark tests completed!"
# =============================================== multimodal_gen ====================================================
# Placeholder job: runs only when the multimodal_gen filter matched, and just prints two messages.
multimodal-gen-test:
  runs-on: ubuntu-latest
  needs:
    - check-changes
  if: needs.check-changes.outputs.multimodal_gen == 'true'
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Run placeholder test
      run: |
        echo "Running multimodal_gen tests..."
        echo "This is a placeholder for future tests."
# A single CUDA 13 smoke test that verifies the kernel builds and runs.
# TODO: Re-enable this test once it passes on CI.
# cuda13-kernel-smoke-test:
# needs: [check-changes, sgl-kernel-build-wheels]
# if: needs.check-changes.outputs.sgl_kernel == 'true'
# runs-on: x64-cu13-kernel-tests
# steps:
# - uses: actions/checkout@v4
# - name: Cleanup
# run: |
# ls -alh sgl-kernel/dist || true
# rm -rf sgl-kernel/dist/* || true
# - name: Download CUDA 13.0 artifacts
# uses: actions/download-artifact@v4
# with:
# path: sgl-kernel/dist/
# merge-multiple: true
# pattern: wheel-python3.10-cuda13.0
# - name: Install dependencies
# run: |
# CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
# - name: Run kernel unit tests
# timeout-minutes: 30
# run: |
# cd sgl-kernel
# pytest tests/
# =============================================== primary ====================================================
# Runs the per-commit suite in test/lang.
# always() keeps the job eligible when sgl-kernel-build-wheels was skipped; !failure() and
# !cancelled() still block it after an upstream failure or a cancelled run.
unit-test-frontend:
  runs-on: 1-gpu-runner
  needs:
    - check-changes
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 10
      run: |
        cd test/lang
        python3 run_suite.py --suite per-commit
# Runs the per-commit-1-gpu suite in test/srt, split into 15 partitions with at most 5 running at once.
unit-test-backend-1-gpu:
  runs-on: 1-gpu-runner
  needs:
    - check-changes
    - unit-test-frontend
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 1-gpu-runner
  strategy:
    fail-fast: false
    max-parallel: 5
    matrix:
      # One entry per partition; the count matches --auto-partition-size below.
      part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/srt
        python3 run_suite.py --suite per-commit-1-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 15
# Runs the per-commit-2-gpu suite in test/srt as two partitions, after the 1-GPU backend tests.
unit-test-backend-2-gpu:
  runs-on: 2-gpu-runner
  needs:
    - check-changes
    - unit-test-backend-1-gpu
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 2-gpu-runner
  strategy:
    fail-fast: false
    matrix:
      part: [0, 1]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        cd test/srt
        python3 run_suite.py --suite per-commit-2-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
# Runs the per-commit-4-gpu suite in test/srt as two partitions on the 4-gpu-h100 runner.
unit-test-backend-4-gpu:
  runs-on: 4-gpu-h100
  needs:
    - check-changes
    - unit-test-backend-2-gpu
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 4-gpu-h100
  strategy:
    fail-fast: false
    matrix:
      part: [0, 1]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 20
      run: |
        cd test/srt
        python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
# Runs the per-commit-8-gpu-h200 suite in test/srt as three partitions on the 8-gpu-h200 runner.
unit-test-backend-8-gpu-h200:
  runs-on: 8-gpu-h200
  needs:
    - check-changes
    - unit-test-backend-2-gpu
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 8-gpu-h200
  strategy:
    fail-fast: false
    matrix:
      part: [0, 1, 2]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 20
      run: |
        cd test/srt
        python3 run_suite.py --suite per-commit-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 3
# Runs the per-commit-8-gpu-h20 suite in test/srt as two partitions on the 8-gpu-h20 runner.
unit-test-backend-8-gpu-h20:
  runs-on: 8-gpu-h20
  needs:
    - check-changes
    - unit-test-backend-2-gpu
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 8-gpu-h20
    # Comma-separated device list exported to every step of this job.
    SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4"
  strategy:
    fail-fast: false
    matrix:
      part: [0, 1]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 20
      run: |
        cd test/srt
        python3 run_suite.py --suite per-commit-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
# First group of single-GPU benchmarks from test/srt: test_bench_one_batch and test_bench_serving
# cases, each step limited to 10 minutes.
performance-test-1-gpu-part-1:
  runs-on: 1-gpu-runner
  needs:
    - check-changes
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Benchmark single latency
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default

    - name: Benchmark online latency
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default

    - name: Benchmark offline throughput
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default

    - name: Benchmark offline throughput (Non-streaming, small batch size)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size

    - name: Benchmark online latency (EAGLE)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle

    - name: Benchmark online latency (LoRA)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
        python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates
# Second group of single-GPU benchmarks from test/srt/test_bench_serving, each step limited to 10 minutes.
performance-test-1-gpu-part-2:
  runs-on: 1-gpu-runner
  needs:
    - check-changes
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Benchmark offline throughput (w/o RadixAttention)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache

    - name: Benchmark offline throughput (w/ Triton)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend

    - name: Benchmark offline throughput (w/ FP8)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8

    - name: Benchmark VLM offline throughput
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput

    - name: Benchmark VLM online latency
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency
# Third group of single-GPU benchmarks: the two score-API cases from test/srt/test_bench_serving.
performance-test-1-gpu-part-3:
  runs-on: 1-gpu-runner
  needs:
    - check-changes
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Benchmark Scores online latency and throughput
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_latency_throughput

    - name: Benchmark Scores online latency and throughput (batch size scaling)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_batch_scaling
# Two-GPU benchmarks from test/srt, run after the 2-GPU backend tests; each step is limited to 10 minutes.
performance-test-2-gpu:
  runs-on: 2-gpu-runner
  needs:
    - check-changes
    - unit-test-backend-2-gpu
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 2-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

    - name: Benchmark single latency (TP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1

    - name: Benchmark single latency + torch.compile (TP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1

    - name: Benchmark offline throughput (TP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default

    - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

    - name: Benchmark offline PP decode throughput (PP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode

    - name: Benchmark offline PP prefill throughput (PP=2)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill
# Runs test/srt/test_eval_accuracy_large.py on one GPU.
# The install step also clones merrymercy/human-eval and installs it in editable mode.
accuracy-test-1-gpu:
  runs-on: 1-gpu-runner
  needs:
    - check-changes
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 1-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
        git clone https://github.com/merrymercy/human-eval.git
        cd human-eval
        pip install -e .

    - name: Evaluate accuracy
      timeout-minutes: 20
      run: |
        cd test/srt
        python3 test_eval_accuracy_large.py
# Runs test/srt/test_moe_eval_accuracy_large.py on two GPUs, after accuracy-test-1-gpu.
# The install step also clones merrymercy/human-eval and installs it in editable mode.
accuracy-test-2-gpu:
  runs-on: 2-gpu-runner
  needs:
    - check-changes
    - accuracy-test-1-gpu
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 2-gpu-runner
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
        git clone https://github.com/merrymercy/human-eval.git
        cd human-eval
        pip install -e .

    - name: Evaluate accuracy (TP=2)
      timeout-minutes: 20
      run: |
        cd test/srt
        python3 test_moe_eval_accuracy_large.py
# Runs the per-commit-4-gpu-deepep suite in test/srt on the 4-gpu-h100 runner.
# Dependencies come from scripts/ci/ci_install_deepep.sh rather than ci_install_dependency.sh.
unit-test-deepep-4-gpu:
  runs-on: 4-gpu-h100
  needs:
    - check-changes
    - unit-test-backend-2-gpu
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 4-gpu-h100
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh

    - name: Run test
      timeout-minutes: 20
      run: |
        cd test/srt
        python3 run_suite.py --suite per-commit-4-gpu-deepep
# Runs the per-commit-8-gpu-h200-deepep suite in test/srt on the 8-gpu-h200 runner.
# Dependencies come from scripts/ci/ci_install_deepep.sh rather than ci_install_dependency.sh.
unit-test-deepep-8-gpu:
  runs-on: 8-gpu-h200
  needs:
    - check-changes
    - unit-test-backend-2-gpu
    - sgl-kernel-build-wheels
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 8-gpu-h200
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9
        path: sgl-kernel/dist/
        merge-multiple: true

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh

    - name: Run test
      timeout-minutes: 20
      run: |
        cd test/srt
        python3 run_suite.py --suite per-commit-8-gpu-h200-deepep
# Runs the per-commit-4-gpu-b200 suite in test/srt as a single partition on the 4-gpu-b200 runner.
# The install script is invoked with IS_BLACKWELL=1.
unit-test-backend-4-gpu-b200:
  needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
  if: always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  runs-on: 4-gpu-b200
  env:
    RUNNER_LABELS: 4-gpu-b200
  # NOTE(review): no matrix is defined for this job, so fail-fast presumably has no
  # effect here; confirm before removing.
  strategy:
    fail-fast: false
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # Same action version as every other download step in this workflow (was @v6 here only).
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh

    # The step limit is 45 minutes even though --timeout-per-file allows 3600 seconds per file.
    - name: Run test
      timeout-minutes: 45
      run: |
        cd test/srt
        python3 run_suite.py --suite per-commit-4-gpu-b200 --auto-partition-id 0 --auto-partition-size 1 --timeout-per-file 3600
# Runs the per-commit-4-gpu-gb200 suite in test/srt as a single partition on the 4-gpu-gb200 runner.
# It depends on the ARM wheel build and downloads the -aarch64 artifact.
unit-test-backend-4-gpu-gb200:
  runs-on: 4-gpu-gb200
  needs:
    - check-changes
    - unit-test-backend-2-gpu
    - sgl-kernel-build-wheels-arm
  if: >-
    always() && !failure() && !cancelled() &&
    ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  env:
    RUNNER_LABELS: 4-gpu-gb200
  # NOTE(review): no matrix is defined for this job, so fail-fast presumably has no
  # effect here; confirm before removing.
  strategy:
    fail-fast: false
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # The built wheel is fetched only when sgl-kernel changed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        pattern: wheel-python3.10-cuda12.9-aarch64
        path: sgl-kernel/dist/
        merge-multiple: true

    # The install script is invoked with IS_BLACKWELL=1 and GRACE_BLACKWELL=1.
    - name: Install dependencies
      run: |
        CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 GRACE_BLACKWELL=1 bash scripts/ci/ci_install_deepep.sh

    # The step limit is 45 minutes even though --timeout-per-file allows 3600 seconds per file.
    - name: Run test
      timeout-minutes: 45
      run: |
        cd test/srt
        python3 run_suite.py --suite per-commit-4-gpu-gb200 --auto-partition-id 0 --auto-partition-size 1 --timeout-per-file 3600
# Final gate. Always runs, then fails if any job listed in `needs` ended in
# "failure" or "cancelled"; skipped jobs are accepted.
pr-test-finish:
  needs:
    - check-changes
    - sgl-kernel-build-wheels
    # The ARM wheel build and the H20 suite are listed so that their failures
    # are also caught by this gate.
    - sgl-kernel-build-wheels-arm
    - sgl-kernel-unit-test
    - sgl-kernel-mla-test
    - sgl-kernel-benchmark-test
    - multimodal-gen-test
    - unit-test-frontend
    - unit-test-backend-1-gpu
    - unit-test-backend-2-gpu
    - unit-test-backend-4-gpu
    - unit-test-backend-8-gpu-h200
    - unit-test-backend-8-gpu-h20
    - performance-test-1-gpu-part-1
    - performance-test-1-gpu-part-2
    - performance-test-1-gpu-part-3
    - performance-test-2-gpu
    - accuracy-test-1-gpu
    - accuracy-test-2-gpu
    - unit-test-deepep-4-gpu
    - unit-test-deepep-8-gpu
    - unit-test-backend-4-gpu-b200
    - unit-test-backend-4-gpu-gb200
  if: always()
  runs-on: ubuntu-latest
  steps:
    - name: Check all dependent job statuses
      env:
        # The 'needs' context is passed through the environment so its JSON text is
        # never spliced into the shell script (a quote inside it cannot break the script).
        NEEDS_JSON: ${{ toJson(needs) }}
      run: |
        # Get a list of all job names from the JSON keys
        job_names=$(echo "$NEEDS_JSON" | jq -r 'keys_unsorted[]')

        for job in $job_names; do
          # For each job, extract its result
          result=$(echo "$NEEDS_JSON" | jq -r --arg j "$job" '.[$j].result')

          # Print the job name and its result
          echo "$job: $result"

          # Check for failure or cancellation and exit if found
          if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
            echo "The above jobs failed."
            exit 1
          fi
        done

        # If the loop completes, no job failed or was cancelled
        echo "All jobs completed successfully"
        exit 0