Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2602,9 +2602,9 @@ minimaxm3-fp4-mi355x-vllm-disagg:
- isl: 8192
osl: 1024
search-space:
# 1P TP4 + 1D TP4 (2 nodes total), conc sweep 1..512 (single job, looped)
# 1P TP4 + 1D TP4 (2 nodes total), conc sweep 1..256 (single job, looped)
- spec-decoding: "none"
conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 ]
conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
prefill:
num-worker: 1
tp: 4
Expand Down
1 change: 1 addition & 0 deletions benchmarks/multi_node/amd_utils/models_vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ MiniMax-M3-MXFP4:
prefill_flags: "--tensor-parallel-size 8 --max-num-batched-tokens 32768 --max-num-seqs 512 --block-size 128 --language-model-only --attention-backend TRITON_ATTN --moe-backend aiter --no-enable-prefix-caching --gpu-memory-utilization 0.90 --tool-call-parser minimax_m3 --reasoning-parser minimax_m3 --enable-auto-tool-choice"
decode_flags: "--tensor-parallel-size 8 --max-num-batched-tokens 32768 --max-num-seqs 512 --block-size 128 --language-model-only --attention-backend TRITON_ATTN --moe-backend aiter --no-enable-prefix-caching --gpu-memory-utilization 0.90 --tool-call-parser minimax_m3 --reasoning-parser minimax_m3 --enable-auto-tool-choice"
env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_MOE=1 VLLM_USE_BREAKABLE_CUDAGRAPH=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
# Space-separated NAME=value pairs applied only on prefill workers:
# server_vllm.sh reads this key into PREFILL_MODEL_ENVS and exports each pair
# in its prefill branches. Values must not contain whitespace, because the
# list is split on whitespace when it is exported.
prefill_env: "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB=2048"
hf_dir: "models--amd--MiniMax-M3-MXFP4"

gpt-oss-120b:
Expand Down
12 changes: 12 additions & 0 deletions benchmarks/multi_node/amd_utils/server_vllm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,12 @@ pf = bash_escape(m.get('prefill_flags', '--tensor-parallel-size 8'))
df = bash_escape(m.get('decode_flags', '--tensor-parallel-size 8'))
ev = bash_escape(m.get('env', ''))
dev = bash_escape(m.get('decode_env', ''))
pev = bash_escape(m.get('prefill_env', ''))
print(f'PREFILL_SERVER_CONFIG=\"{pf}\"')
print(f'DECODE_SERVER_CONFIG=\"{df}\"')
print(f'MODEL_ENVS=\"{ev}\"')
print(f'DECODE_MODEL_ENVS=\"{dev}\"')
print(f'PREFILL_MODEL_ENVS=\"{pev}\"')
")"

echo "Loaded model configuration for: $MODEL_NAME"
Expand Down Expand Up @@ -251,6 +253,11 @@ if [ "$NODE_RANK" -eq 0 ]; then

setup_vllm_env

# Apply the prefill-only environment overrides on the rank-0 node.
# PREFILL_MODEL_ENVS is built from the model's 'prefill_env' entry and holds a
# space-separated list of NAME=value pairs (empty when the model defines none,
# in which case the loop body never runs).
# The expansion is deliberately left unquoted so the shell splits the list
# into one word per pair. Consequently a value cannot contain whitespace, and
# any glob characters in it would be subject to pathname expansion.
for env_pair in ${PREFILL_MODEL_ENVS}; do
# Quoted so each pair reaches 'export' as a single NAME=value argument.
export "$env_pair"
# Echo every override so the applied settings show up in the node's output.
echo "[PREFILL_ENV] $env_pair"
done

# Router is started as an external container by job.slurm (VLLM_ROUTER_IMAGE)
echo "Using external vllm-router container (started by job.slurm on this node)"

Expand Down Expand Up @@ -420,6 +427,11 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then

setup_vllm_env

# Apply the prefill-only environment overrides on the additional prefill
# nodes (0 < NODE_RANK < xP), before PREFILL_CMD is assembled below.
# PREFILL_MODEL_ENVS is built from the model's 'prefill_env' entry and holds a
# space-separated list of NAME=value pairs (empty when the model defines none,
# in which case the loop body never runs).
# The expansion is deliberately left unquoted so the shell splits the list
# into one word per pair. Consequently a value cannot contain whitespace, and
# any glob characters in it would be subject to pathname expansion.
for env_pair in ${PREFILL_MODEL_ENVS}; do
# Quoted so each pair reaches 'export' as a single NAME=value argument.
export "$env_pair"
# Echo every override so the applied settings show up in the node's output.
echo "[PREFILL_ENV] $env_pair"
done

SERVED_MODEL="${MODEL_NAME}"
PREFILL_CMD="vllm serve ${MODEL_PATH} \
--served-model-name ${SERVED_MODEL} \
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4255,3 +4255,10 @@
- "Reuse the pinned vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e image, text-only target path, TRITON_ATTN, automatic tool choice, MiniMax-M3 parsers, VLLM_USE_BREAKABLE_CUDAGRAPH=0, default KV-cache dtype, and automatic MoE backend selection."
- "Pass --use-chat-template for MTP acceptance and mirror the existing MiniMax-M3 MXFP8 MI355X MTP TP/EP/DP-attention search space at 1k1k and 8k1k."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1939

- config-keys:
- minimaxm3-fp4-mi355x-vllm-disagg
description:
- "Enable prefill-only INT4 quick-reduce for MiniMax-M3 MXFP4 disagg: set VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 and VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB=2048 on the prefill workers via a new prefill_env channel (mirrors the existing decode_env path in server_vllm.sh)."
- "Cap the 1P1D TP4 concurrency sweep at 256 (was 512); 2P1D TP4 unchanged at 128/256/512."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1943