[VLM] Support piecewise cuda graph for vlm #38256

Workflow file for this run

name: PR Test (AMD)

on:
  push:
    branches: [ main ]
    paths:
      - "python/**"
      - "!python/sglang/multimodal_gen/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-amd.yml"
  pull_request:
    branches: [ main ]
    paths:
      - "python/**"
      - "!python/sglang/multimodal_gen/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-amd.yml"
    types: [synchronize, labeled]
  workflow_dispatch:
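
# Keep only the newest run per ref: an already-running workflow for the same branch/PR is cancelled.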
concurrency:
  group: pr-test-amd-${{ github.ref }}
  cancel-in-progress: true

jobs:
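  # Gate job: fails fast on PRs that lack the 'run-ci' label or are still drafts,
  # and publishes change filters that downstream jobs use to decide whether to run.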
  check-changes:
    runs-on: ubuntu-latest
    outputs:
      main_package: ${{ steps.filter.outputs.main_package }}
      sgl_kernel: ${{ steps.filter.outputs.sgl_kernel }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Fail if the PR does not have the 'run-ci' label
        if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'run-ci')
        run: |
          echo "This pull request does not have the 'run-ci' label. Failing the workflow."
          exit 1

      - name: Fail if the PR is a draft
        if: github.event_name == 'pull_request' && github.event.pull_request.draft == true
        run: |
          echo "This pull request is a draft. Failing the workflow."
          exit 1

      - name: Detect file changes
        id: filter
        uses: dorny/paths-filter@v3
        with:
          filters: |
            main_package:
              - "python/**"
              - "scripts/ci/**"
              - "test/**"
              - ".github/workflows/pr-test-amd.yml"
            sgl_kernel:
              - "sgl-kernel/**"

  # =============================================== sgl-kernel ====================================================
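  # Kernel unit tests: run only when files under sgl-kernel/ changed.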
  sgl-kernel-unit-test-amd:
    needs: [check-changes]
    if: needs.check-changes.outputs.sgl_kernel == 'true'
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi300-gpu-1]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: |
          bash scripts/ci/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 14
        run: |
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
          docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py

  # =============================================== primary ====================================================
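  # The jobs below share one gating pattern: 'always() && !failure() && !cancelled()' lets a job run even
  # when an upstream job was skipped, while still skipping it after a failure or cancellation; the second
  # clause additionally requires at least one change filter to match.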
  unit-test-frontend-amd:
    needs: [check-changes]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi300-gpu-1]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: |
          bash scripts/ci/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 10
        run: |
          docker exec -w /sglang-checkout/test/lang ci_sglang python3 run_suite.py --suite per-commit
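
  # The 1-GPU backend suite is sharded into 12 partitions (matrix.part) that run in parallel.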
  unit-test-backend-1-gpu-amd:
    needs: [check-changes, unit-test-frontend-amd]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi300-gpu-1]
        part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 12

  unit-test-backend-2-gpu-amd:
    needs: [check-changes]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi300-gpu-2]
        part: [0, 1]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
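
  # The 8-GPU suite runs after the 2-GPU suite and is skipped if that suite fails or is cancelled.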
  unit-test-backend-8-gpu-amd:
    needs: [check-changes, unit-test-backend-2-gpu-amd]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    env:
      RUNNER_LABELS: linux-mi300-gpu-8
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi300-gpu-8]
        part: [0, 1, 2]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 60
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600
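
  # The performance and accuracy jobs below reuse the same container setup and change-filter gating as the unit-test jobs.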
  performance-test-1-gpu-part-1-amd:
    needs: [check-changes]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi300-gpu-1]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Benchmark single latency
        timeout-minutes: 20
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default

      - name: Benchmark online latency
        timeout-minutes: 15
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default

      - name: Benchmark offline throughput
        timeout-minutes: 15
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default

      - name: Benchmark offline throughput (Non-streaming, small batch size)
        timeout-minutes: 15
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size

  performance-test-1-gpu-part-2-amd:
    needs: [check-changes]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi300-gpu-1]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Benchmark offline throughput (w/o RadixAttention)
        timeout-minutes: 15
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache

      - name: Benchmark offline throughput (w/ Triton)
        timeout-minutes: 15
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend

      - name: Benchmark offline throughput (w/ FP8)
        timeout-minutes: 15
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8

  performance-test-2-gpu-amd:
    needs: [check-changes, unit-test-backend-2-gpu-amd]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi300-gpu-2]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Benchmark dummy grok (TP=2)
        timeout-minutes: 30
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py

      - name: Benchmark single latency (TP=2)
        timeout-minutes: 25
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1

      - name: Benchmark single latency + torch.compile (TP=2)
        timeout-minutes: 25
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1

      - name: Benchmark offline throughput (TP=2)
        timeout-minutes: 25
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default

      - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
        timeout-minutes: 25
        run: |
          bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

  accuracy-test-1-gpu-amd:
    needs: [check-changes]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi300-gpu-1]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Evaluate Accuracy
        timeout-minutes: 30
        run: |
          bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py
          bash scripts/ci/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
          bash scripts/ci/amd_ci_exec.sh python3 models/test_qwen_models.py

  accuracy-test-2-gpu-amd:
    needs: [check-changes, accuracy-test-1-gpu-amd]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi300-gpu-2]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Evaluate accuracy (TP=2)
        timeout-minutes: 30
        run: |
          bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
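
  # Aggregation job: prints each dependent job's result and fails if any of them failed or was cancelled,
  # giving the run a single final status.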
  pr-test-amd-finish:
    needs:
      [
        check-changes,
        sgl-kernel-unit-test-amd,
        unit-test-frontend-amd,
        unit-test-backend-1-gpu-amd,
        unit-test-backend-2-gpu-amd,
        unit-test-backend-8-gpu-amd,
        performance-test-1-gpu-part-1-amd,
        performance-test-1-gpu-part-2-amd,
        performance-test-2-gpu-amd,
        accuracy-test-1-gpu-amd,
        accuracy-test-2-gpu-amd,
      ]
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Check all dependent job statuses
        run: |
          # Convert the 'needs' context to a JSON string
          json_needs='${{ toJson(needs) }}'
          # Get a list of all job names from the JSON keys
          job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]')
          for job in $job_names; do
            # For each job, extract its result
            result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result')
            # Print the job name and its result
            echo "$job: $result"
            # Check for failure or cancellation and exit if found
            if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
              echo "The above jobs failed."
              exit 1
            fi
          done
          # If the loop completes, all jobs were successful
          echo "All jobs completed successfully"
          exit 0