Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12369,6 +12369,44 @@ minimaxm3-fp8-b200-vllm-mtp:
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of MiniMax-M3 NVFP4
# (nvidia/MiniMax-M3-NVFP4) B200 single-node vLLM, pairing the target with the
# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). Runs on the
# b200-dgxc cluster. MiniMax-M3 modelopt NVFP4 support (vllm-project/vllm
# PR #46380) is baked into the perf container image, so no runtime patch is
# needed; prompts are routed through the chat template. Target weights are
# pre-staged at /scratch/fsw/models/MiniMax-M3-NVFP4 (launch_b200-dgxc.sh
# resolves MODEL_PATH for minimaxm3-fp4); the EAGLE3 draft is fetched next to
# the target weights.
minimaxm3-fp4-b200-vllm-mtp:
  image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41
  model: nvidia/MiniMax-M3-NVFP4
  model-prefix: minimaxm3
  runner: b200-dgxc
  precision: fp4
  framework: vllm
  multinode: false
  scenarios:
    fixed-seq-len:
      # Each search-space row is one sweep: a parallelism layout plus the
      # concurrency range (conc-start..conc-end) to benchmark it over.
      # NOTE(review): tp / ep / dp-attn presumably reach the benchmark script
      # as TP / EP_SIZE / DP_ATTENTION — confirm against the sweep generator.
      - isl: 1024
        osl: 1024
        search-space:
          - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
          - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
          - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
          - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
          - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp }
          - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
      # Long-prompt scenario; the dp-attn rows stop at lower concurrency than
      # in the 1k/1k scenario above.
      - isl: 8192
        osl: 1024
        search-space:
          - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
          - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
          - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
          - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
          - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
          - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
# minimaxm3-fp8-b300-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens).
Expand Down
107 changes: 107 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b200_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/env bash

# MiniMax-M3 NVFP4 B200 single-node vLLM recipe with EAGLE3 speculative
# decoding — same shape as minimaxm3_fp8_b200_mtp.sh but uses the
# nvidia/MiniMax-M3-NVFP4 checkpoint. MiniMax-M3 modelopt NVFP4 support
# (vllm-project/vllm PR #46380) is baked into the perf container image, so no
# runtime patch is needed.
#
# Required environment: MODEL, TP, EP_SIZE, DP_ATTENTION, CONC, ISL, OSL,
#   MAX_MODEL_LEN, RANDOM_RANGE_RATIO, RESULT_FILENAME, PORT.
# Optional environment: EVAL_ONLY / RUN_EVAL ("true" enables the respective
#   path); SLURM_JOB_ID / SLURMD_NODENAME (logged when present).
#
# Flow: fetch weights -> start `vllm serve` in the background -> wait until it
# is ready -> run the serving benchmark -> optionally run lm-eval.

source "$(dirname "$0")/../../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  EP_SIZE \
  DP_ATTENTION \
  CONC \
  ISL \
  OSL \
  MAX_MODEL_LEN \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"

# launch_b200-dgxc.sh rewrites MODEL to the pre-downloaded path; only download
# the target when handed a bare HF id (b200-cw / b200-nb runners). The EAGLE3
# draft is never pre-staged, so fetch it either way: next to the target weights
# when MODEL is a local path, into the HF cache otherwise.
if [[ "$MODEL" != /* ]]; then
  # Cache warm-up only: a failure here is not treated as fatal by this script.
  # NOTE(review): presumably vLLM can still resolve the HF ids itself — confirm.
  hf download "$MODEL"
  hf download "$DRAFT_MODEL"
  DRAFT_MODEL_PATH="$DRAFT_MODEL"
else
  # Sibling directory of the target weights, named after the draft repo.
  DRAFT_MODEL_PATH="$(dirname "$MODEL")/${DRAFT_MODEL##*/}"
  # Download only when the directory is missing or empty. A non-empty directory
  # is trusted as-is; this check cannot tell a complete download from a
  # partial one.
  if [[ ! -d "$DRAFT_MODEL_PATH" || -z "$(ls -A "$DRAFT_MODEL_PATH" 2>/dev/null)" ]]; then
    # The server cannot start without the draft head, so stop here instead of
    # launching vLLM against a missing or partially written directory.
    if ! hf download "$DRAFT_MODEL" --local-dir "$DRAFT_MODEL_PATH"; then
      echo "ERROR: failed to download $DRAFT_MODEL into $DRAFT_MODEL_PATH" >&2
      exit 1
    fi
  fi
fi

if [[ -n "$SLURM_JOB_ID" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

nvidia-smi

SERVER_LOG=/workspace/server.log

export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_FLOAT32_MATMUL_PRECISION=high

# Parallelism flags, kept as an array so each flag stays a single argument
# without relying on word-splitting of an unquoted string.
#   DP_ATTENTION=true : TP=1 with data-parallel size TP, plus expert parallel
#   EP_SIZE > 1       : tensor parallel TP, plus expert parallel
#   otherwise         : tensor parallel TP only
if [[ "${DP_ATTENTION}" == "true" ]]; then
  parallel_args=(--tensor-parallel-size=1 "--data-parallel-size=$TP" --enable-expert-parallel)
elif [[ "$EP_SIZE" -gt 1 ]]; then
  parallel_args=("--tensor-parallel-size=$TP" --enable-expert-parallel)
else
  parallel_args=("--tensor-parallel-size=$TP")
fi

# use 3 speculative tokens for all configs for now
NUM_SPEC_TOKENS=3

# JSON passed to --speculative-config; built once so the serve command below
# stays readable.
printf -v spec_config \
  '{"method": "eagle3", "model": "%s", "num_speculative_tokens": %s, "attention_backend": "FLASH_ATTN"}' \
  "$DRAFT_MODEL_PATH" "$NUM_SPEC_TOKENS"

if [[ "${EVAL_ONLY}" == "true" ]]; then
  setup_eval_context
  # NOTE(review): EVAL_MAX_MODEL_LEN is presumably set by setup_eval_context —
  # confirm in benchmark_lib.sh.
  MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi

# PORT is consumed by the server, the readiness probe and the benchmark client
# but is not covered by check_env_vars above. Without it `--port` would swallow
# the next flag, so fail fast with a clear message before anything is started.
: "${PORT:?PORT must be set before launching the vLLM server}"

start_gpu_monitor

set -x
vllm serve "$MODEL" --port "$PORT" \
  "${parallel_args[@]}" \
  --gpu-memory-utilization 0.90 \
  --max-model-len "$MAX_MODEL_LEN" \
  --block-size 128 \
  --language-model-only \
  --max-cudagraph-capture-size 2048 \
  --max-num-batched-tokens "$((ISL * 2))" \
  --speculative-config "$spec_config" \
  --stream-interval 20 --no-enable-prefix-caching \
  --trust-remote-code > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

# 10 prompts per concurrency slot; prompts go through the chat template.
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code \
  --use-chat-template

if [[ "${RUN_EVAL}" == "true" ]]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

stop_gpu_monitor
set +x
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4196,3 +4196,12 @@
description:
- "Initial submission: MiniMax-M3 MXFP4 disagg (prefill/decode) on MI355X with vLLM over the MoRI-IO KV connector (8k/1k)."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1914

- config-keys:
- minimaxm3-fp4-b200-vllm-mtp
description:
- "Add MiniMax-M3 NVFP4 (nvidia/MiniMax-M3-NVFP4) B200 single-node aggregated vLLM benchmark with EAGLE3 speculative decoding (spec-decoding: mtp, 3 draft tokens via Inferact/MiniMax-M3-EAGLE3), runner b200-dgxc"
- "Image vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41 (bakes in MiniMax-M3 modelopt NVFP4 support, vllm-project/vllm PR #46380; no runtime patch needed); prompts routed through the chat template"
- "Target weights pre-staged at /scratch/fsw/models/MiniMax-M3-NVFP4 (added minimaxm3-fp4 MODEL_PATH branch to launch_b200-dgxc.sh); EAGLE3 draft fetched next to the target weights; --block-size 128 (MSA), --language-model-only"
- "Sweeps tp 4/8 with and without EP and dp-attn at 1k1k (conc 1-512) and 8k1k (conc 1-256)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1933
4 changes: 4 additions & 0 deletions runners/launch_b200-dgxc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then
# tree (root-owned); it lives in the sa-shared-writable gharunners tree.
export MODEL_PATH="/lustre/fsw/gharunners/models/MiniMax-M3-MXFP8"
export SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8"
elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp4" ]]; then
# NVFP4 checkpoint, pre-staged on the b200-dgxc scratch tree.
export MODEL_PATH="/scratch/fsw/models/MiniMax-M3-NVFP4"
export SRT_SLURM_MODEL_PREFIX="minimax-m3-nvfp4"
else
echo "Unsupported model prefix/precision: $MODEL_PREFIX/$PRECISION"
echo "Available models under /lustre/fsw/models:"
Expand Down
Loading