Commit 564556c

Merge branch 'main' into clean_cuda_graph
2 parents 990f4db + f3224cc · commit 564556c

25 files changed: +1117 −67 lines

.github/workflows/blossom-ci.yml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ jobs:
       startsWith(github.event.comment.body, '/bot skip --comment') ||
       startsWith(github.event.comment.body, '/bot reuse-pipeline') ||
       startsWith(github.event.comment.body, '/bot kill')) && contains(
-
fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025", "karljang", "ajrasane", "jthomson04", "fredricz-20070104", "aalanwyr", "samuellees", "nvamyt", "jinzh-nvidia", "zheyuf", "yumin066", "sychen52", "xxi-nv", "barneuman", "xuanzic", "yufeiwu-nv", "richardhuo-nv", "dcaox", "tshmilnvidia", "anish-shanbhag", "zhangcl", "timothygao8710", "jthomson04", 
"faradawn", "govind-ramnarayan","Boreas618","baize97","jieli-matrix","qiangxu1996"]'),
+
fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv","lancelly","heyuhhh","mayani-nv","flin3500","sunnyqgg","kris1025", "karljang", "ajrasane", "jthomson04", "fredricz-20070104", "aalanwyr", "samuellees", "nvamyt", "jinzh-nvidia", "zheyuf", "yumin066", "sychen52", "xxi-nv", "barneuman", "xuanzic", "yufeiwu-nv", "richardhuo-nv", "dcaox", "tshmilnvidia", "anish-shanbhag", "zhangcl", "timothygao8710", "jthomson04", 
"faradawn", "govind-ramnarayan","Boreas618","baize97","jieli-matrix","qiangxu1996","atrifex","mlefeb01","Wong4j","JadoTu"]'),
       github.actor)
     steps:
       - name: Check if comment is issued by authorized person

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -82,4 +82,5 @@ compile_commands.json
 .devcontainer/docker-compose.override.yml

 # Enroot sqsh files
+enroot/sw-tensorrt-docker+*.sqsh
 enroot/tensorrt_llm.devel.sqsh

cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu

Lines changed: 4 additions & 3 deletions
@@ -1311,9 +1311,6 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
     setupIfSelected(TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaledConfig{}, quant_params.fp8_mxfp4);
     setupIfSelected(TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaledConfig{}, quant_params.mxfp8_mxfp4);

-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
-#endif
     assert(gemm_m <= INT32_MAX);
     assert(gemm1_n > 0 && gemm1_n <= INT32_MAX);
     assert(gemm1_k > 0 && gemm1_k <= INT32_MAX);

@@ -1332,6 +1329,10 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir
         reinterpret_cast<TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::SFA const*>(
             quant_params.groupwise.fc2.weight_scales),
         bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert);
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.launch_dependents;");
+#endif
 }

 // ========================== Permutation things =======================================
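
The two hunks above move the PTX `griddepcontrol.launch_dependents;` instruction, part of CUDA Programmatic Dependent Launch (PDL), from before the shape assertions to the very end of the kernel, presumably so that dependent grids are only released once the kernel has finished writing its outputs. As a rough illustration of the mechanism only (not code from this repository; the kernel names and launch setup are hypothetical), a producer/consumer pair using PDL on sm_90+ might look like this:

```cuda
// Minimal PDL sketch. Assumption: names and launch setup are illustrative only.
#include <cuda_runtime.h>

__global__ void producer_kernel(int* out)
{
    out[threadIdx.x] = static_cast<int>(threadIdx.x); // produce results first
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Signal that dependent grids may launch, only after the writes above,
    // mirroring the placement chosen in this commit.
    asm volatile("griddepcontrol.launch_dependents;");
#endif
}

__global__ void consumer_kernel(int const* in, int* out)
{
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Wait until the producer grid has signaled (or exited) before reading
    // memory it wrote.
    asm volatile("griddepcontrol.wait;");
#endif
    out[threadIdx.x] = 2 * in[threadIdx.x];
}

int main()
{
    int *a = nullptr, *b = nullptr;
    cudaMalloc(&a, 32 * sizeof(int));
    cudaMalloc(&b, 32 * sizeof(int));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    producer_kernel<<<1, 32, 0, stream>>>(a);

    // Launch the consumer with programmatic stream serialization allowed so it
    // can overlap with the tail of the producer instead of waiting for it to
    // fully retire.
    cudaLaunchAttribute attr{};
    attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
    attr.val.programmaticStreamSerializationAllowed = 1;

    cudaLaunchConfig_t cfg{};
    cfg.gridDim = dim3(1);
    cfg.blockDim = dim3(32);
    cfg.stream = stream;
    cfg.attrs = &attr;
    cfg.numAttrs = 1;
    cudaLaunchKernelEx(&cfg, consumer_kernel, const_cast<int const*>(a), b);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    cudaFree(a);
    cudaFree(b);
    return 0;
}
```

The `TRTLLM_ENABLE_PDL=1` environment variable that appears in the benchmark commands added by this commit presumably toggles this behavior at runtime.
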
Lines changed: 95 additions & 0 deletions (new file)

@@ -0,0 +1,95 @@
# Layer-wise Benchmarks

## Generate profiles

### Run with MPI

**Step 1:** Start a container using Docker, Enroot, or another runtime. Please refer to `../../jenkins/current_image_tags.properties` for the Docker image URI.

**Step 2:** In the container, install `tensorrt_llm`:

```bash
pip install -e ../..
```

**Step 3:** In the container, run benchmarks and generate profiles:

```bash
# Run DeepSeek-R1
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml

# Run DeepSeek-V3.2-Exp
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --moe-backend DEEPGEMM

# Run DeepSeek-V3.2-Exp with 32k context length
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --max-num-tokens $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --batch-size 1 --seq-len-q 32769
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --model deepseek-ai/DeepSeek-V3.2-Exp --tokens-per-block 64 --max-seq-len $((32768 + 1024 + 4)) --moe-backend DEEPGEMM --seq-len-kv-cache 32769

# Run with attention TP
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp

# Run with attention TP and TRTLLMGen
NP=4 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --no-enable-attention-dp --moe-backend TRTLLM
NP=4 TRTLLM_ENABLE_PDL=1 ./mpi_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM

# Run with MTP3
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --batch-size 32 --seq-len-q 4

# Run 4 layers
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml --layer-indices 5,6,7,8
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --layer-indices 5,6,7,8

# Scale DEP=16 to 4 GPUs: reduces the number of experts; uses MNNVL A2A if applicable
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --moe-backend WIDEEP

# Scale TEP=16 to 4 GPUs: reduces the number of attention heads and experts
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --scaled-from 16 --no-enable-attention-dp

# Run with DeepEP A2A
NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_ctx.yaml --moe-backend WIDEEP
NP=4 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEP ./mpi_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
```

### Run with Slurm

> Tip: If you already have a running job with the environment installed, skip steps 1 and 2 and go straight to step 3 (see the reuse example after step 3). In that case the job's container must have been started with `--container-name aaa`; if that name is not "layer_wise_benchmarks", run `export CONTAINER_NAME=aaa`.

**Step 1:** On the controller node, allocate one or multiple nodes and record the `SLURM_JOB_ID`:

```bash
SLURM_JOB_ID=$(NODES=4 TIME=02:00:00 ./slurm_alloc.sh)
```

Fill in the variables in `./slurm_alloc.sh` before running it.

**Step 2:** Start a container and install `tensorrt_llm`. Run the following command on the controller node:

```bash
SLURM_JOB_ID=$SLURM_JOB_ID ./slurm_init_containers.sh
```

This uses the image recorded in `../../jenkins/current_image_tags.properties`. The image is downloaded to `../../enroot/` only once.

**Step 3:** Run benchmarks to generate profiles. Run the following command on the controller node, where `NODES` &le; the number of allocated nodes:

```bash
# Run DeepSeek-R1 with wide EP: uses MNNVL A2A if applicable
SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP

# Run with attention TP and TRTLLMGen
SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_ENABLE_PDL=1 ./slurm_launch.sh ./run_single.sh config_gen.yaml --no-enable-attention-dp --moe-backend TRTLLM

# Run with DeepEPLowLatency
SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 TRTLLM_FORCE_ALLTOALL_METHOD=DeepEPLowLatency ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP

# You can run 4-GPU and 8-GPU tasks without reallocating the Slurm job
SLURM_JOB_ID=$SLURM_JOB_ID NODES=1 NP=4 ./slurm_launch.sh ./run_single.sh config_ctx.yaml
SLURM_JOB_ID=$SLURM_JOB_ID NODES=2 NP=8 ./slurm_launch.sh ./run_single.sh config_ctx.yaml
```
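
As mentioned in the tip above, an already-allocated job whose container is up can be reused directly. A sketch, assuming the existing container was started with `--container-name my_env` (a placeholder name):

```bash
export CONTAINER_NAME=my_env   # only needed if the name is not "layer_wise_benchmarks"
SLURM_JOB_ID=$SLURM_JOB_ID NODES=4 NP=16 ./slurm_launch.sh ./run_single.sh config_gen.yaml --moe-backend WIDEEP
```
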

## Parse profiles

Coming soon.
Lines changed: 21 additions & 0 deletions (new file; presumably the `config_ctx.yaml` used in the commands above)

@@ -0,0 +1,21 @@
model: nvidia/DeepSeek-R1-0528-FP4-v2
layer_indices: [5]
run_type: CTX
scaled_from: null

# KV cache related args
tokens_per_block: 32
max_seq_len: 9220 # 8192 + 1024 + 4
enable_attention_dp: true

# Model init args
max_num_tokens: 20480
moe_backend: CUTLASS
use_cuda_graph: false

# Per iteration args
batch_size: 1
seq_len_q: 8193
seq_len_kv_cache: 0
balance_method: Balanced
balance_ratio: null
Lines changed: 21 additions & 0 deletions (new file; presumably the `config_gen.yaml` used in the commands above)

@@ -0,0 +1,21 @@
model: nvidia/DeepSeek-R1-0528-FP4-v2
layer_indices: [5]
run_type: GEN
scaled_from: null

# KV cache related args
tokens_per_block: 32
max_seq_len: 9220 # 8192 + 1024 + 4
enable_attention_dp: true

# Model init args
max_num_tokens: 4096 # MTP3 as max
moe_backend: CUTLASS
use_cuda_graph: true

# Per iteration args
batch_size: 128
seq_len_q: 1 # Set to (1 + MTP)
seq_len_kv_cache: 8193
balance_method: Balanced
balance_ratio: null
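
The command-line flags used throughout the README above appear to mirror these YAML keys (snake_case key, `--kebab-case` flag): the config file supplies the defaults and the flags override them for a single run. A sketch using the MTP3 example from the README (assumption: this is how `run_single.sh` merges the two):

```bash
# config_gen.yaml sets batch_size: 128 and seq_len_q: 1; the flags below
# override those two keys for this run only.
NP=4 ./mpi_launch.sh ./run_single.sh config_gen.yaml --batch-size 32 --seq-len-q 4
```
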
Lines changed: 10 additions & 0 deletions (new file; presumably the `./mpi_launch.sh` wrapper used in the README above)

@@ -0,0 +1,10 @@
#!/bin/bash

set -euo pipefail

# Clear inherited Slurm and MPI environment variables
unset $(env | grep -i slurm | awk -F'=' '{print $1}')
unset $(env | grep MPI | awk -F'=' '{print $1}')

set -x
mpirun --allow-run-as-root --np ${NP} "$@"
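
For reference, the wrapper is invoked as in the README above: `NP` sets the number of ranks handed to `mpirun --np`, and everything after the wrapper name is forwarded verbatim via `"$@"`.

```bash
NP=4 ./mpi_launch.sh ./run_single.sh config_ctx.yaml
```
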
