
Commit cc4c980

[None][feat] Add Qwen3-Next to layer-wise benchmarks (NVIDIA#9065)
Signed-off-by: Tailing Yuan <[email protected]>
1 parent fdb0787 commit cc4c980

File tree

18 files changed: +772, -450 lines

.pre-commit-config.yaml

Lines changed: 0 additions & 3 deletions
@@ -83,7 +83,6 @@ common-files: &common_files |
 examples/infinitebench/compute_scores.py |
 examples/infinitebench/construct_synthetic_dataset.py |
 examples/infinitebench/eval_utils.py |
-examples/layer_wise_benchmarks/run_single.py |
 examples/llm-api/_tensorrt_engine/llm_eagle_decoding.py |
 examples/llm-api/_tensorrt_engine/llm_eagle2_decoding.py |
 examples/llm-api/_tensorrt_engine/llm_inference_customize.py |
@@ -811,7 +810,6 @@ common-files: &common_files |
 tensorrt_llm/serve/tool_parser/utils.py |
 tensorrt_llm/tools/__init__.py |
 tensorrt_llm/tools/importlib_utils.py |
-tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py |
 tensorrt_llm/tools/multimodal_builder.py |
 tensorrt_llm/tools/onnx_utils.py |
 tensorrt_llm/tools/plugin_gen/__init__.py |
@@ -1188,7 +1186,6 @@ common-files: &common_files |
 tests/unittest/tools/plugin_gen/test_core.py |
 tests/unittest/tools/plugin_gen/test_plugin_gen.py |
 tests/unittest/tools/plugin_gen/test_shape_infer.py |
-tests/unittest/tools/test_layer_wise_benchmarks.py |
 tests/unittest/tools/test_prepare_dataset.py |
 tests/unittest/tools/test_test_to_stage_mapping.py |
 tests/unittest/trt/__init__.py |

examples/layer_wise_benchmarks/README.md

Lines changed: 13 additions & 3 deletions
@@ -15,7 +15,7 @@ pip install -e ../..
 **Step 3:** In the container, run benchmarks and generate profiles:
 
 ```bash
-# Run DeepSeek-R1
+# Run DeepSeek-R1 NVFP4
 NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml
 
@@ -24,7 +24,7 @@ NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSee
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM
 
 # Run DeepSeek-V3.2-Exp with 32k context length
-NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --max-num-tokens $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
+NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769
 
 # Run with attention TP
@@ -48,6 +48,10 @@ NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --moe-back
 # Scale TEP=16 to 4 GPUs: reduce the number of attention heads and experts
 NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp
 
+# Run Qwen3-Next (balanced routing is not implemented)
+NP=2 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified
+NP=2 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model Qwen/Qwen3-Next-80B-A3B-Instruct --layer-indices 6,7 --no-enable-attention-dp --moe-backend TRTLLM --balance-method NotModified
+
 # Run with DeepEP A2A
 NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_ctx.yaml --moe-backend WIDEEP
 NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
@@ -76,7 +80,7 @@ It uses the image recorded in `../../jenkins/current_image_tags.properties`. The
 **Step 3:** Run benchmarks to generate profiles. Run the following command on the controller node, where `NODES` ≤ the number of allocated nodes:
 
 ```bash
-# Run DeepSeek-R1 with wide ep: uses MNNVL A2A if applicable
+# Run DeepSeek-R1 NVFP4 with wide ep: uses MNNVL A2A if applicable
 SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
 
 # Run with attention TP and TRTLLMGen
@@ -93,3 +97,9 @@ SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run_single.sh config
 ## Parse profiles
 
 Coming soon.
+
+## Troubleshooting
+
+1. Error `fp8 blockscale gemm only support Hopper` on Blackwell.
+
+   The default MoE backend "CUTLASS" does not support FP8 weights. Choose the same MoE backend as your end-to-end config; a typical fix is to add `--moe-backend DEEPGEMM`, `--moe-backend TRTLLM`, or `--moe-backend WIDEEP`.

examples/layer_wise_benchmarks/config_ctx.yaml

Lines changed: 0 additions & 1 deletion
@@ -9,7 +9,6 @@ max_seq_len: 9220 # 8192 + 1024 + 4
 enable_attention_dp: true
 
 # Model init args
-max_num_tokens: 20480
 moe_backend: CUTLASS
 use_cuda_graph: false
 
examples/layer_wise_benchmarks/config_gen.yaml

Lines changed: 0 additions & 1 deletion
@@ -9,7 +9,6 @@ max_seq_len: 9220 # 8192 + 1024 + 4
 enable_attention_dp: true
 
 # Model init args
-max_num_tokens: 4096 # MTP3 as max
 moe_backend: CUTLASS
 use_cuda_graph: true
 
examples/layer_wise_benchmarks/run_single.py

Lines changed: 57 additions & 59 deletions
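Note: both YAML configs above drop their hard-coded `max_num_tokens`. The script below now derives it (and `max_batch_size`, which replaces the old hard-coded value of 2048) from the per-iteration arguments when neither the YAML nor the command line sets them; a minimal restatement of the fallback that appears in the third hunk below:

```python
# Restated from the third hunk below (not a verbatim copy): fallbacks applied
# after CLI arguments are merged with the YAML config.
if args.max_batch_size is None:
    args.max_batch_size = args.batch_size                        # size the KV cache for the benchmark batch
if args.max_num_tokens is None:
    args.max_num_tokens = args.max_batch_size * args.seq_len_q   # worst case: full query length per request
```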
@@ -8,8 +8,7 @@
 from tensorrt_llm._torch.autotuner import AutoTuner, autotune
 from tensorrt_llm._torch.modules.multi_stream_utils import with_multi_stream
 from tensorrt_llm._utils import local_mpi_rank, mpi_rank, mpi_world_size
-from tensorrt_llm.tools.layer_wise_benchmarks.deepseekv3_runner import (
-    BalanceMethod, DeepSeekV3Runner)
+from tensorrt_llm.tools.layer_wise_benchmarks import BalanceMethod, get_runner_cls
 
 
 def comma_separated_ints(s):
@@ -23,30 +22,25 @@ def comma_separated_ints(s):
 parser.add_argument(
     "--layer-indices",
     type=comma_separated_ints,
-    help="Comma separated indices of layers, should be a contiguous range")
+    help="Comma separated indices of layers, should be a contiguous range",
+)
 parser.add_argument("--run-type", type=str, choices=["CTX", "GEN"])
 parser.add_argument("--scaled-from", type=int)
 # KV cache related args
+parser.add_argument("--max-batch-size", type=int)
 parser.add_argument("--tokens-per-block", type=int)
 parser.add_argument("--max-seq-len", type=int)
 group = parser.add_mutually_exclusive_group(required=False)
-group.add_argument("--enable-attention-dp",
-                   action="store_true",
-                   dest="enable_attention_dp")
-group.add_argument("--no-enable-attention-dp",
-                   action="store_false",
-                   dest="enable_attention_dp")
+group.add_argument("--enable-attention-dp", action="store_true", dest="enable_attention_dp")
+group.add_argument("--no-enable-attention-dp", action="store_false", dest="enable_attention_dp")
 parser.set_defaults(enable_attention_dp=None)
 # Model init args
 parser.add_argument("--max-num-tokens", type=int)
 parser.add_argument("--moe-backend", type=str)
+parser.add_argument("--moe-max-num-tokens", type=int)
 group = parser.add_mutually_exclusive_group(required=False)
-group.add_argument("--use-cuda-graph",
-                   action="store_true",
-                   dest="use_cuda_graph")
-group.add_argument("--no-use-cuda-graph",
-                   action="store_false",
-                   dest="use_cuda_graph")
+group.add_argument("--use-cuda-graph", action="store_true", dest="use_cuda_graph")
+group.add_argument("--no-use-cuda-graph", action="store_false", dest="use_cuda_graph")
 parser.set_defaults(use_cuda_graph=None)
 # Per iteration args
 parser.add_argument("--batch-size", type=int)
@@ -59,8 +53,12 @@ def comma_separated_ints(s):
     config = yaml.safe_load(f)
 del args.config_path
 for k, v in vars(args).items():
-    if v is None:
+    if v is None and k in config:
         setattr(args, k, config[k])
+if args.max_batch_size is None:
+    args.max_batch_size = args.batch_size
+if args.max_num_tokens is None:
+    args.max_num_tokens = args.max_batch_size * args.seq_len_q
 print(args)
 
 # MPI args
@@ -70,43 +68,49 @@ def comma_separated_ints(s):
 torch.cuda.set_device(local_rank)
 
 # Create KV cache manager
-mapping = DeepSeekV3Runner.create_mapping(
-    enable_attention_dp=args.enable_attention_dp)
-max_batch_size = 2048
-kv_cache_manager = DeepSeekV3Runner.create_kv_cache_manager(
+Runner = get_runner_cls(args.model)
+mapping = Runner.create_mapping(enable_attention_dp=args.enable_attention_dp)
+kv_cache_manager = Runner.create_kv_cache_manager(
     args.model,
     mapping,
     tokens_per_block=args.tokens_per_block,
-    max_batch_size=max_batch_size,
+    max_batch_size=args.max_batch_size,
     max_seq_len=args.max_seq_len,
-    layer_indices=args.layer_indices)
-attn_workspace = torch.empty((0, ), device="cuda", dtype=torch.int8)
+    layer_indices=args.layer_indices,
+)
+attn_workspace = torch.empty((0,), device="cuda", dtype=torch.int8)
 
 # Create other global objects
 AutoTuner.get().clear_cache()
 capture_stream = torch.cuda.Stream()
 
 # Create Runner
-runner = DeepSeekV3Runner(args.model,
-                          mapping,
-                          moe_backend=args.moe_backend,
-                          layer_indices=args.layer_indices,
-                          scaled_from=args.scaled_from,
-                          max_seq_len=args.max_seq_len,
-                          max_num_tokens=args.max_num_tokens,
-                          use_cuda_graph=args.use_cuda_graph)
+runner = Runner(
+    args.model,
+    mapping,
+    moe_backend=args.moe_backend,
+    layer_indices=args.layer_indices,
+    scaled_from=args.scaled_from,
+    max_seq_len=args.max_seq_len,
+    max_num_tokens=args.max_num_tokens,
+    moe_max_num_tokens=args.moe_max_num_tokens,
+    use_cuda_graph=args.use_cuda_graph,
+)
 
 # Warm up
-assert args.batch_size <= max_batch_size
+assert args.batch_size <= args.max_batch_size
 assert args.seq_len_q + args.seq_len_kv_cache <= args.max_seq_len
-run_pack = runner.create_run_pack(args.run_type,
-                                  batch_size=args.batch_size,
-                                  seq_len_q=args.seq_len_q,
-                                  seq_len_kv_cache=args.seq_len_kv_cache,
-                                  kv_cache_manager=kv_cache_manager,
-                                  attn_workspace=attn_workspace)
-runner.replace_routing_method(balance_method=BalanceMethod[args.balance_method],
-                              balance_ratio=args.balance_ratio)
+run_pack = runner.create_run_pack(
+    args.run_type,
+    batch_size=args.batch_size,
+    seq_len_q=args.seq_len_q,
+    seq_len_kv_cache=args.seq_len_kv_cache,
+    kv_cache_manager=kv_cache_manager,
+    attn_workspace=attn_workspace,
+)
+runner.replace_routing_method(
+    balance_method=BalanceMethod[args.balance_method], balance_ratio=args.balance_ratio
+)
 capture_stream.wait_stream(torch.cuda.current_stream())
 with torch.cuda.stream(capture_stream):
     run_pack()
@@ -120,21 +124,15 @@ def comma_separated_ints(s):
 if args.use_cuda_graph:
     with with_multi_stream(True):
         g = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(g,
-                              stream=capture_stream,
-                              capture_error_mode="global"):
+        with torch.cuda.graph(g, stream=capture_stream, capture_error_mode="global"):
             run_pack()
 
 warmup_times = 20
 run_times = 100
-events = [
-    torch.cuda.Event(enable_timing=True)
-    for _ in range(warmup_times + run_times + 1)
-]
+events = [torch.cuda.Event(enable_timing=True) for _ in range(warmup_times + run_times + 1)]
 for i in range(warmup_times + run_times):
     events[i].record()
-    with nvtx.annotate(
-            f"b={args.batch_size} s={args.seq_len_q} EP{world_size}"):
+    with nvtx.annotate(f"b={args.batch_size} s={args.seq_len_q} EP{world_size}"):
         if args.use_cuda_graph:
             g.replay()
         else:
@@ -144,16 +142,16 @@ def comma_separated_ints(s):
 
 # Print statistics
 # Print before `cudaProfilerStop` to ensure messages are included in the profile
-time_list = [
-    start.elapsed_time(stop) for start, stop in zip(events, events[1:])
-]
+time_list = [start.elapsed_time(stop) for start, stop in zip(events, events[1:])]
 time_list = time_list[warmup_times:]
-print(f"[RANK {rank}]"
-      f" min {np.min(time_list) * 1000:.1f}"
-      f" max {np.max(time_list) * 1000:.1f}"
-      f" mean {np.mean(time_list) * 1000:.1f}"
-      f" median {np.median(time_list) * 1000:.1f}"
-      f" P90 {np.percentile(time_list, 90) * 1000:.1f}"
-      f" (us)")
+print(
+    f"[RANK {rank}]"
+    f" min {np.min(time_list) * 1000:.1f}"
+    f" max {np.max(time_list) * 1000:.1f}"
+    f" mean {np.mean(time_list) * 1000:.1f}"
+    f" median {np.median(time_list) * 1000:.1f}"
+    f" P90 {np.percentile(time_list, 90) * 1000:.1f}"
+    f" (us)"
+)
 
 torch.cuda.cudart().cudaProfilerStop()
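Taken together, the timing hunks above form a simple CUDA-event measurement loop. A condensed sketch of the pattern follows; the final event record and the synchronization before reading the timings are outside the hunks shown, so those two lines are assumptions, not quotes from the script:

```python
import numpy as np
import torch


def time_run_pack(run_pack, warmup_times=20, run_times=100):
    """Condensed sketch of run_single.py's measurement loop, not the script itself."""
    events = [torch.cuda.Event(enable_timing=True) for _ in range(warmup_times + run_times + 1)]
    for i in range(warmup_times + run_times):
        events[i].record()
        run_pack()  # or g.replay() when a CUDA graph was captured
    events[-1].record()       # assumed: closes the last timing interval
    torch.cuda.synchronize()  # assumed: ensure all events completed before reading them
    # Event.elapsed_time() returns milliseconds; multiply by 1000 to report microseconds
    times = [a.elapsed_time(b) for a, b in zip(events, events[1:])][warmup_times:]
    print(f"mean {np.mean(times) * 1000:.1f}"
          f" median {np.median(times) * 1000:.1f}"
          f" P90 {np.percentile(times, 90) * 1000:.1f} (us)")
```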

pyproject.toml

Lines changed: 0 additions & 3 deletions
@@ -123,7 +123,6 @@ exclude = [
 "examples/infinitebench/compute_scores.py",
 "examples/infinitebench/construct_synthetic_dataset.py",
 "examples/infinitebench/eval_utils.py",
-"examples/layer_wise_benchmarks/run_single.py",
 "examples/llm-api/_tensorrt_engine/llm_eagle_decoding.py",
 "examples/llm-api/_tensorrt_engine/llm_eagle2_decoding.py",
 "examples/llm-api/_tensorrt_engine/llm_inference_customize.py",
@@ -851,7 +850,6 @@ exclude = [
 "tensorrt_llm/serve/tool_parser/utils.py",
 "tensorrt_llm/tools/__init__.py",
 "tensorrt_llm/tools/importlib_utils.py",
-"tensorrt_llm/tools/layer_wise_benchmarks/deepseekv3_runner.py",
 "tensorrt_llm/tools/multimodal_builder.py",
 "tensorrt_llm/tools/onnx_utils.py",
 "tensorrt_llm/tools/plugin_gen/__init__.py",
@@ -1228,7 +1226,6 @@ exclude = [
 "tests/unittest/tools/plugin_gen/test_core.py",
 "tests/unittest/tools/plugin_gen/test_plugin_gen.py",
 "tests/unittest/tools/plugin_gen/test_shape_infer.py",
-"tests/unittest/tools/test_layer_wise_benchmarks.py",
 "tests/unittest/tools/test_prepare_dataset.py",
 "tests/unittest/tools/test_test_to_stage_mapping.py",
 "tests/unittest/trt/__init__.py",
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+from .runner_factory import get_runner_cls
+from .runner_interface import BalanceMethod
+
+__all__ = [
+    "BalanceMethod",
+    "get_runner_cls",
+]
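This new module (its path is not shown in this view, but the `from tensorrt_llm.tools.layer_wise_benchmarks import ...` line in `run_single.py` indicates it is the package `__init__.py`) re-exports the factory that selects a runner class per model. `runner_factory.py` itself is not included in the diff above; purely as an illustration of the dispatch it enables, with hypothetical module and class names:

```python
# Hypothetical sketch only: runner_factory.py is not shown in this commit view,
# and qwen3_next_runner / Qwen3NextRunner are assumed names.
def get_runner_cls(model_name: str):
    """Map a model name to a layer-wise benchmark runner class."""
    if "Qwen3-Next" in model_name:
        from .qwen3_next_runner import Qwen3NextRunner
        return Qwen3NextRunner
    # DeepSeek-V3 / R1 checkpoints keep using the pre-existing runner
    from .deepseekv3_runner import DeepSeekV3Runner
    return DeepSeekV3Runner
```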
