Commit 564556c

Merge branch 'main' into clean_cuda_graph
2 parents 990f4db + f3224cc · commit 564556c

25 files changed: +1117 −67 lines

.github/workflows/blossom-ci.yml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ jobs:
       startsWith(github.event.comment.body, '/bot skip --comment') ||
       startsWith(github.event.comment.body, '/bot reuse-pipeline') ||
       startsWith(github.event.comment.body, '/bot kill')) && contains(
-
fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025", "karljang", "ajrasane", "jthomson04", "fredricz-20070104", "aalanwyr", "samuellees", "nvamyt", "jinzh-nvidia", "zheyuf", "yumin066", "sychen52", "xxi-nv", "barneuman", "xuanzic", "yufeiwu-nv", "richardhuo-nv", "dcaox", "tshmilnvidia", "anish-shanbhag", "zhangcl", "timothygao8710", "jthomson04", 
"faradawn", "govind-ramnarayan","Boreas618","baize97","jieli-matrix","qiangxu1996"]'),
+
fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025", "karljang", "ajrasane", "jthomson04", "fredricz-20070104", "aalanwyr", "samuellees", "nvamyt", "jinzh-nvidia", "zheyuf", "yumin066", "sychen52", "xxi-nv", "barneuman", "xuanzic", "yufeiwu-nv", "richardhuo-nv", "dcaox", "tshmilnvidia", "anish-shanbhag", "zhangcl", "timothygao8710", "jthomson04", 
"faradawn", "govind-ramnarayan","Boreas618","baize97","jieli-matrix","qiangxu1996","atrifex","mlefeb01","Wong4j","JadoTu"]'),
       github.actor)
     steps:
       - name: Check if comment is issued by authorized person

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -82,4 +82,5 @@ compile_commands.json
 .devcontainer/docker-compose.override.yml

 # Enroot sqsh files
+enroot/sw-tensorrt-docker+*.sqsh
 enroot/tensorrt_llm.devel.sqsh

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu

Lines changed: 4 additions & 3 deletions
@@ -1311,9 +1311,6 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
     setupIfSelected(TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaledConfig{}, quant_params.fp8_mxfp4);
     setupIfSelected(TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaledConfig{}, quant_params.mxfp8_mxfp4);

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
-#endif
     assert(gemm_m <= INT32_MAX);
     assert(gemm1_n > 0 && gemm1_n <= INT32_MAX);
     assert(gemm1_k > 0 && gemm1_k <= INT32_MAX);

@@ -1332,6 +1329,10 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
         reinterpret_cast<TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::SFA const*>(
             quant_params.groupwise.fc2.weight_scales),
         bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert);
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.launch_dependents;");
+#endif
 }

 // ========================== Permutation things =======================================
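
The two hunks above move the PTX `griddepcontrol.launch_dependents;` instruction, part of CUDA Programmatic Dependent Launch (PDL), from before the shape assertions to the very end of the kernel, presumably so that dependent grids are only released once the kernel has finished writing its outputs. As a rough illustration of the mechanism only (not code from this repository; the kernel names and launch setup are hypothetical), a producer/consumer pair using PDL on sm_90+ might look like this:

```cuda
// Minimal PDL sketch. Assumption: names and launch setup are illustrative only.
#include <cuda_runtime.h>

__global__ void producer_kernel(int* out)
{
    out[threadIdx.x] = static_cast<int>(threadIdx.x); // produce results first
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Signal that dependent grids may launch, only after the writes above,
    // mirroring the placement chosen in this commit.
    asm volatile("griddepcontrol.launch_dependents;");
#endif
}

__global__ void consumer_kernel(int const* in, int* out)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Wait until the producer grid has signaled (or exited) before reading
    // memory it wrote.
    asm volatile("griddepcontrol.wait;");
#endif
    out[threadIdx.x] = 2 * in[threadIdx.x];
}

int main()
{
    int *a = nullptr, *b = nullptr;
    cudaMalloc(&a, 32 * sizeof(int));
    cudaMalloc(&b, 32 * sizeof(int));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    producer_kernel<<<1, 32, 0, stream>>>(a);

    // Launch the consumer with programmatic stream serialization allowed so it
    // can overlap with the tail of the producer instead of waiting for it to
    // fully retire.
    cudaLaunchAttribute attr{};
    attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attr.val.programmaticStreamSerializationAllowed = 1;

    cudaLaunchConfig_t cfg{};
    cfg.gridDim = dim3(1);
    cfg.blockDim = dim3(32);
    cfg.stream = stream;
    cfg.attrs = &attr;
    cfg.numAttrs = 1;
    cudaLaunchKernelEx(&cfg, consumer_kernel, const_cast<int const*>(a), b);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    cudaFree(a);
    cudaFree(b);
    return 0;
}
```

The `TRTLLM_ENABLE_PDL=1` environment variable that appears in the benchmark commands added by this commit presumably toggles this behavior at runtime.
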
Lines changed: 95 additions & 0 deletions (new file)

@@ -0,0 +1,95 @@
# Layer-wise Benchmarks

## Generate profiles

### Run with MPI

**Step 1:** Start a container using Docker, Enroot, or another runtime. Please refer to `../../jenkins/current_image_tags.properties` for the Docker image URI.

**Step 2:** In the container, install `tensorrt_llm`:

```bash
pip install -e ../..
```

**Step 3:** In the container, run benchmarks and generate profiles:

```bash
# Run DeepSeek-R1
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml

# Run DeepSeek-V3.2-Exp
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM

# Run DeepSeek-V3.2-Exp with 32k context length
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --max-num-tokens $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769

# Run with attention TP
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp

# Run with attention TP and TRTLLMGen
NP=4 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM
NP=4 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM

# Run with MTP3
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --batch-size 32 --seq-len-q 4

# Run 4 layers
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --layer-indices 5,6,7,8
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --layer-indices 5,6,7,8

# Scale DEP=16 to 4 GPUs: reduces the number of experts; uses MNNVL A2A if applicable
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP

# Scale TEP=16 to 4 GPUs: reduces the number of attention heads and experts
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp

# Run with DeepEP A2A
NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_ctx.yaml --moe-backend WIDEEP
NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
```

### Run with Slurm

> Tip: If you already have a running job with the environment installed, skip steps 1 and 2 and go straight to step 3 (see the reuse example after step 3). In that case the job's container must have been started with `--container-name aaa`; if that name is not "layer_wise_benchmarks", run `export CONTAINER_NAME=aaa`.

**Step 1:** On the controller node, allocate one or multiple nodes and record the `SLURM_JOB_ID`:

```bash
SLURM_JOB_ID=$(NODES=4 TIME=02:00:00 ./slurm_alloc.sh)
```

Fill in the variables in `./slurm_alloc.sh` before running it.

**Step 2:** Start a container and install `tensorrt_llm`. Run the following command on the controller node:

```bash
SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_init_containers.sh
```

This uses the image recorded in `../../jenkins/current_image_tags.properties`. The image is downloaded to `../../enroot/` only once.

**Step 3:** Run benchmarks to generate profiles. Run the following command on the controller node, where `NODES` &le; the number of allocated nodes:

```bash
# Run DeepSeek-R1 with wide EP: uses MNNVL A2A if applicable
SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP

# Run with attention TP and TRTLLMGen
SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM

# Run with DeepEPLowLatency
SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP

# You can run 4-GPU and 8-GPU tasks without reallocating the Slurm job
SLURM_JOB_ID=$SLURM_JOB_ID NODES=1 NP=4 ./slurm_launch.sh ./run_single.sh config_ctx.yaml
SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run_single.sh config_ctx.yaml
```
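
As mentioned in the tip above, an already-allocated job whose container is up can be reused directly. A sketch, assuming the existing container was started with `--container-name my_env` (a placeholder name):

```bash
export CONTAINER_NAME=my_env   # only needed if the name is not "layer_wise_benchmarks"
SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
```
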

## Parse profiles

Coming soon.
Lines changed: 21 additions & 0 deletions (new file; presumably the `config_ctx.yaml` used in the commands above)

@@ -0,0 +1,21 @@
model: nvidia/DeepSeek-R1-0528-FP4-v2
layer_indices: [5]
run_type: CTX
scaled_from: null

# KV cache related args
tokens_per_block: 32
max_seq_len: 9220 # 8192 + 1024 + 4
enable_attention_dp: true

# Model init args
max_num_tokens: 20480
moe_backend: CUTLASS
use_cuda_graph: false

# Per iteration args
batch_size: 1
seq_len_q: 8193
seq_len_kv_cache: 0
balance_method: Balanced
balance_ratio: null
Lines changed: 21 additions & 0 deletions (new file; presumably the `config_gen.yaml` used in the commands above)

@@ -0,0 +1,21 @@
model: nvidia/DeepSeek-R1-0528-FP4-v2
layer_indices: [5]
run_type: GEN
scaled_from: null

# KV cache related args
tokens_per_block: 32
max_seq_len: 9220 # 8192 + 1024 + 4
enable_attention_dp: true

# Model init args
max_num_tokens: 4096 # MTP3 as max
moe_backend: CUTLASS
use_cuda_graph: true

# Per iteration args
batch_size: 128
seq_len_q: 1 # Set to (1 + MTP)
seq_len_kv_cache: 8193
balance_method: Balanced
balance_ratio: null
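
The command-line flags used throughout the README above appear to mirror these YAML keys (snake_case key, `--kebab-case` flag): the config file supplies the defaults and the flags override them for a single run. A sketch using the MTP3 example from the README (assumption: this is how `run_single.sh` merges the two):

```bash
# config_gen.yaml sets batch_size: 128 and seq_len_q: 1; the flags below
# override those two keys for this run only.
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --batch-size 32 --seq-len-q 4
```
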
Lines changed: 10 additions & 0 deletions (new file; presumably the `./mpi_launch.sh` wrapper used in the README above)

@@ -0,0 +1,10 @@
#!/bin/bash

set -euo pipefail

# Clear inherited Slurm and MPI environment variables
unset $(env | grep -i slurm | awk -F'=' '{print $1}')
unset $(env | grep MPI | awk -F'=' '{print $1}')

set -x
mpirun --allow-run-as-root --np ${NP} "$@"
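
For reference, the wrapper is invoked as in the README above: `NP` sets the number of ranks handed to `mpirun --np`, and everything after the wrapper name is forwarded verbatim via `"$@"`.

```bash
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml
```
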
