
Commit e4fea88

rebase

Signed-off-by: junq <[email protected]>
2 parents 81e729a + 5e6f1bc, commit e4fea88

30 files changed: +1426 -845 lines

examples/disaggregated/slurm/benchmark/config.yaml

Lines changed: 3 additions & 2 deletions

@@ -5,7 +5,7 @@ slurm:
   account: "<account>"
   job_time: "02:00:00"
   job_name: "<job_name>"
-  numa_bind: true
+  numa_bind: true # Only enable for GB200 NVL72

 # Benchmark Mode
 benchmark:
@@ -42,7 +42,6 @@ profiling:
   nsys_on: false # Set to true to enable profiling

 worker_config:
-  eplb_num_slots: 0 # Number of slots for EPLB
   gen:
     tensor_parallel_size: 8
     moe_expert_parallel_size: 8
@@ -77,6 +76,8 @@ worker_config:
     moe_config:
       backend: CUTLASS
       use_low_precision_moe_combine: true
+      load_balancer:
+        num_slots: 0
     cache_transceiver_config:
       max_tokens_in_buffer: 4608
       backend: DEFAULT
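This change relocates the EPLB slot count from a top-level `worker_config` key into the gen worker's MoE config. A minimal sketch of reading the relocated value (the local `config.yaml` path is an assumption for illustration; the nested key is the one the updated `submit.py` reads):

```python
import yaml

# Sketch: read the EPLB slot count from its new location in the benchmark config.
with open("config.yaml") as f:  # path is an assumption for this sketch
    config = yaml.safe_load(f)

# Old (removed): config["worker_config"]["eplb_num_slots"]
num_slots = config["worker_config"]["gen"]["moe_config"]["load_balancer"]["num_slots"]
print(f"EPLB slots: {num_slots}")
```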

examples/disaggregated/slurm/benchmark/disaggr_torch.slurm

Lines changed: 7 additions & 7 deletions

@@ -7,8 +7,8 @@ gpus_per_node=${1}
 numa_bind=${2}
 ctx_nodes=${3} # Number of nodes needed for ctx workers
 gen_nodes=${4} # Number of nodes needed for gen workers
-ctx_tp_size=${5} # Tensor parallel size for ctx workers
-gen_tp_size=${6} # Tensor parallel size for gen workers
+ctx_world_size=${5} # World size for ctx workers
+gen_world_size=${6} # World size for gen workers

 # Worker configuration
 num_ctx_servers=${7}
@@ -47,8 +47,8 @@ echo " gpus_per_node: ${gpus_per_node}"
 echo " numa_bind: ${numa_bind}"
 echo " ctx_nodes: ${ctx_nodes}"
 echo " gen_nodes: ${gen_nodes}"
-echo " ctx_tp_size: ${ctx_tp_size}"
-echo " gen_tp_size: ${gen_tp_size}"
+echo " ctx_world_size: ${ctx_world_size}"
+echo " gen_world_size: ${gen_world_size}"
 echo
 echo "Worker Configuration:"
 echo " num_ctx_servers: ${num_ctx_servers}"
@@ -123,7 +123,7 @@ if [ -d "${trtllm_repo}" ]; then

     echo "Installing TensorRT-LLM..."
     if ! srun --container-name=${container_name} \
-        --container-mounts=${container_mount} \
+        --container-mounts=${container_mount} --no-container-mount-home \
         --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
         bash -c "cd ${trtllm_repo} && pip install -e ." \
         &> ${full_logdir}/install.log; then
@@ -167,7 +167,7 @@ echo "ctx_nodes_num_in_single_server: ${ctx_nodes_num_in_single_server}"
 echo "Starting gen workers..."
 for i in $(seq 0 $((num_gen_servers - 1))); do
     srun -l -N ${gen_nodes_num_in_single_server} \
-        --ntasks=${gen_tp_size} \
+        --ntasks=$((gen_world_size)) \
         --ntasks-per-node=${gpus_per_node} \
         --container-image=${container_image} \
         --container-name=${container_name} \
@@ -182,7 +182,7 @@ done
 echo "Starting ctx workers..."
 for i in $(seq 0 $((num_ctx_servers - 1))); do
     srun -l -N ${ctx_nodes_num_in_single_server} \
-        --ntasks=${ctx_tp_size} \
+        --ntasks=$((ctx_world_size)) \
         --ntasks-per-node=${gpus_per_node} \
        --container-image=${container_image} \
        --container-name=${container_name} \

examples/disaggregated/slurm/benchmark/submit.py

Lines changed: 16 additions & 13 deletions

@@ -39,9 +39,9 @@ def save_worker_config(config, output_path, worker_type):
        yaml.dump(worker_config, f, default_flow_style=False)


-def calculate_nodes(tp_size, num_servers, gpus_per_node):
-    """Calculate required nodes based on tensor parallel size and server count."""
-    return (tp_size + gpus_per_node - 1) // gpus_per_node * num_servers
+def calculate_nodes(world_size, num_servers, gpus_per_node):
+    """Calculate required nodes based on world size and server count."""
+    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers


def submit_job(config):
@@ -50,10 +50,6 @@ def submit_job(config):
    hw_config = config['hardware']
    env_config = config['environment']

-    # Calculate nodes based on tensor parallel sizes
-    ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size']
-    gen_tp_size = config['worker_config']['gen']['tensor_parallel_size']
-
    # Get number of servers from config
    ctx_num = hw_config['num_ctx_servers']
    gen_num = hw_config['num_gen_servers']
@@ -63,9 +59,16 @@ def submit_job(config):
    mtp_size = gen_config.get('speculative_config',
                              {}).get('num_nextn_predict_layers', 0)

-    ctx_nodes = calculate_nodes(ctx_tp_size, ctx_num,
+    # Calculate nodes based on world sizes
+    ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size']
+    ctx_pp_size = config['worker_config']['ctx']['pipeline_parallel_size']
+    ctx_world_size = ctx_tp_size * ctx_pp_size
+    ctx_nodes = calculate_nodes(ctx_world_size, ctx_num,
                                hw_config['gpus_per_node'])
-    gen_nodes = calculate_nodes(gen_tp_size, gen_num,
+    gen_tp_size = config['worker_config']['gen']['tensor_parallel_size']
+    gen_pp_size = config['worker_config']['gen']['pipeline_parallel_size']
+    gen_world_size = gen_tp_size * gen_pp_size
+    gen_nodes = calculate_nodes(gen_world_size, gen_num,
                                hw_config['gpus_per_node'])
    total_nodes = ctx_nodes + gen_nodes
    total_tasks = total_nodes * hw_config['gpus_per_node']
@@ -82,9 +85,9 @@ def submit_job(config):

    # Determine directory suffix based on attention_dp
    if gen_enable_attention_dp:
-        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['eplb_num_slots']}_mtp{mtp_size}"
+        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['gen']['moe_config']['load_balancer']['num_slots']}_mtp{mtp_size}"
    else:
-        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['eplb_num_slots']}_mtp{mtp_size}"
+        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['gen']['moe_config']['load_balancer']['num_slots']}_mtp{mtp_size}"

    # Create full log directory path
    log_dir = os.path.join(log_base, dir_suffix)
@@ -114,8 +117,8 @@ def submit_job(config):
        str(slurm_config['numa_bind']).lower(),
        str(ctx_nodes),  # Number of nodes needed for ctx workers
        str(gen_nodes),  # Number of nodes needed for gen workers
-        str(ctx_tp_size),  # Tensor parallel size for ctx workers
-        str(gen_tp_size),  # Tensor parallel size for gen workers
+        str(ctx_world_size),  # World size for ctx workers
+        str(gen_world_size),  # World size for gen workers

        # Worker configuration
        str(ctx_num),
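The node calculation now uses the full world size (TP x PP) instead of the tensor-parallel size alone, which changes the result whenever `pipeline_parallel_size` > 1. A small sketch of the difference, reusing the same ceiling-division formula as `calculate_nodes` (the example numbers are illustrative, not from this commit):

```python
def calculate_nodes(world_size, num_servers, gpus_per_node):
    """Same formula as submit.py: ceil(world_size / gpus_per_node) * num_servers."""
    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers

# Illustrative worker with TP=8, PP=2 on 8-GPU nodes, one server:
tp, pp, gpus_per_node, num_servers = 8, 2, 8, 1
print(calculate_nodes(tp, num_servers, gpus_per_node))       # old behavior: 1 node
print(calculate_nodes(tp * pp, num_servers, gpus_per_node))  # new behavior: 2 nodes
```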

examples/wide_ep/slurm_scripts/README.md

Lines changed: 3 additions & 2 deletions

@@ -34,6 +34,7 @@ Before running the scripts, ensure you have:
 ### Run Benchmarks

 ```bash
-# Please find the `submit.py` script and an example `config.yaml` in the `examples/disaggregated/slurm/benchmark/` directory.
-python3 submit.py -c your_config.yaml
+# Please find the `submit.py` script in the `examples/disaggregated/slurm/benchmark/` directory.
+# An example `config.yaml` for wide EP: `examples/wide_ep/slurm_scripts/config.yaml`.
+python3 submit.py -c config.yaml
 ```
examples/wide_ep/slurm_scripts/config.yaml (new file)

Lines changed: 113 additions & 0 deletions

@@ -0,0 +1,113 @@
+# SLURM Configuration
+slurm:
+  script_file: "disaggr_torch.slurm"
+  partition: "<partition>"
+  account: "<account>"
+  job_time: "02:00:00"
+  job_name: "<job_name>"
+  numa_bind: true # Only enable for GB200 NVL72
+
+# Hardware Configuration
+hardware:
+  gpus_per_node: 4 # Modify this with your hardware configuration
+  num_ctx_servers: 2 # Number of context servers
+  num_gen_servers: 1 # Number of generation servers
+
+# Benchmark Mode
+benchmark:
+  mode: "e2e" # Options: e2e, gen_only
+  use_nv_sa_benchmark: false # Whether to use NVIDIA SA benchmark script
+  multi_round: 1 # Number of benchmark rounds
+  benchmark_ratio: 0.8 # Benchmark ratio
+  streaming: true # Enable streaming mode
+  concurrency_list: "1024"
+
+# Sequence Configuration
+sequence:
+  input_length: 8196 # Input sequence length
+  output_length: 1024 # Output sequence length
+
+# Environment Configuration
+environment:
+  container_mount: "<container_mount>" # Format: path1:path1,path2:path2
+  container_image: "<container_image>"
+  model_path: "<model_path>"
+  trtllm_repo: "<trtllm_repo>"
+  build_wheel: false # Don't build the wheel when launching multiple jobs
+  dataset_file: "<dataset_file>"
+  work_dir: "<full_path_to_work_dir>"
+
+# Profiling Configuration
+profiling:
+  nsys_on: false # Set to true to enable profiling
+
+# Worker Configuration
+worker_config:
+  gen:
+    enable_layerwise_nvtx_marker: true
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    enable_attention_dp: true
+    enable_lm_head_tp_in_adp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 128
+    max_num_tokens: 512
+    max_seq_len: 9236
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 32
+        - 64
+        - 128
+        - 256
+        - 512
+        - 768
+        - 1024
+        - 2048
+        - 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.6
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+      use_low_precision_moe_combine: true
+      load_balancer:
+        num_slots: 288
+        layer_updates_per_iter: 1
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: DEFAULT
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    enable_layerwise_nvtx_marker: true
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 8212
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: DEFAULT
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
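As a rough check, plugging this wide-EP config into the updated `submit.py` logic gives the allocation it will request (the arithmetic below simply mirrors `calculate_nodes`; nothing here is new configuration):

```python
def calculate_nodes(world_size, num_servers, gpus_per_node):
    # Same ceiling-division formula as submit.py.
    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers

gpus_per_node = 4
ctx_nodes = calculate_nodes(4 * 1, 2, gpus_per_node)   # ctx: TP=4, PP=1, 2 servers -> 2 nodes
gen_nodes = calculate_nodes(32 * 1, 1, gpus_per_node)  # gen: TP=32, PP=1, 1 server -> 8 nodes
total_nodes = ctx_nodes + gen_nodes                    # 10 nodes
total_tasks = total_nodes * gpus_per_node              # 40 tasks
print(ctx_nodes, gen_nodes, total_nodes, total_tasks)
```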

jenkins/L0_Test.groovy

Lines changed: 25 additions & 9 deletions

@@ -113,7 +113,7 @@ CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
 MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"

 // GPU types that require open driver
-REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000"]
+REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000", "rtx-pro-6000d"]

 // GPU types that don't support dynamic driver flashing
 REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
@@ -1386,6 +1386,21 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                path: /vol/scratch1/scratch.svc_tensorrt_blossom
        """
    }
+    def llmModelVolume = """
+                - name: scratch-trt-llm-data
+                  nfs:
+                    server: 10.117.145.14
+                    path: /vol/scratch1/scratch.michaeln_blossom
+    """
+    if (type.contains("6000d")) {
+        // rtx-pro-6000d nodes are located in Austin DC, we use the FlexCache to speed up the data access.
+        llmModelVolume = """
+                - name: scratch-trt-llm-data
+                  nfs:
+                    server: 10.20.162.212
+                    path: /vol/scratch26/scratch.trt_llm_data
+    """
+    }

    def podConfig = [
        cloud: targetCould,
@@ -1432,10 +1447,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                - name: dshm
                  emptyDir:
                    medium: Memory
-                - name: scratch-trt-llm-data
-                  nfs:
-                    server: 10.117.145.14
-                    path: /vol/scratch1/scratch.michaeln_blossom
+                ${llmModelVolume}
                ${pvcVolume}
        """.stripIndent(),
    ]
@@ -2578,9 +2590,6 @@ def launchTestJobs(pipeline, testFilter)
    x86TestConfigs = [
        "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
        "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
-        "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
        "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
        "A10-PyTorch-1": ["a10", "l0_a10", 1, 2],
        "A10-PyTorch-2": ["a10", "l0_a10", 2, 2],
@@ -2664,10 +2673,14 @@ def launchTestJobs(pipeline, testFilter)
        // "DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4],
        // "DGX_H200-4_GPUs-TensorRT-Post-Merge-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 3, 4],
        // "DGX_H200-4_GPUs-TensorRT-Post-Merge-3": ["dgx-h200-x4", "l0_dgx_h200", 3, 3, 4],
-        // Disable RTXPro6000 stages due to nodes will be offline temporarily
+        // Disable RTXPro6000 stages due to nodes will be offline temporarily.
+        // [TODO] Split tests between RTXPro6000 and RTXPro6000D and move reasonable mount of tests to pre-merge.
        // "RTXPro6000-PyTorch-Post-Merge-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
        // "RTXPro6000-4_GPUs-PyTorch-Post-Merge-1": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 1, 2, 4],
        // "RTXPro6000-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 2, 2, 4],
+        "RTXPro6000D-PyTorch-Post-Merge-1": ["rtx-pro-6000d", "l0_rtx_pro_6000", 1, 1],
+        "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-1": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 1, 2, 4],
+        "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
    ]

    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(key.contains("-CU12-") ? LLM_DOCKER_IMAGE_12_9 : LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
@@ -2689,6 +2702,9 @@ def launchTestJobs(pipeline, testFilter)
    fullSet = parallelJobs.keySet()

    x86SlurmTestConfigs = [
+        "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
+        "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
+        "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
        "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
        "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 2, 4],
        "DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4],

tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py

Lines changed: 9 additions & 0 deletions

@@ -49,6 +49,15 @@ def __init__(self, alpha: float, output_dtype: torch.dtype):
                f"SM version {get_sm_version()} is not supported for CuteDSLNVFP4BlackwellLinear, it only supports SM 100"
            )

+    # rewrite the hash function because the value of self.alpha doesn't affect the tactic.
+    def __hash__(self):
+        return hash((self.output_dtype, ))
+
+    def __eq__(self, other):
+        if not isinstance(other, CuteDSLNVFP4BlackwellLinear):
+            return False
+        return self.output_dtype == other.output_dtype
+
    def get_valid_tactics(
        self,
        inputs: List[torch.Tensor],
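The custom `__hash__`/`__eq__` make cache lookups keyed on the op instance ignore `alpha`, since only `output_dtype` influences tactic selection. A stand-alone sketch of the idea, deliberately detached from the real class (the class name and cache dict below are hypothetical):

```python
class FakeLinearOp:
    """Hypothetical stand-in: alpha is excluded from hashing/equality on purpose."""

    def __init__(self, alpha: float, output_dtype: str):
        self.alpha = alpha
        self.output_dtype = output_dtype

    def __hash__(self):
        return hash((self.output_dtype, ))

    def __eq__(self, other):
        if not isinstance(other, FakeLinearOp):
            return False
        return self.output_dtype == other.output_dtype


tactic_cache = {}  # hypothetical tactic cache keyed by the op instance
tactic_cache[FakeLinearOp(alpha=0.5, output_dtype="bf16")] = "tactic_A"
# Different alpha, same dtype -> the same cache entry is reused.
print(tactic_cache[FakeLinearOp(alpha=2.0, output_dtype="bf16")])  # tactic_A
```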

tensorrt_llm/_torch/models/checkpoints/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -3,6 +3,7 @@
 from .hf.config_loader import HfConfigLoader
 from .hf.gemma3_weight_mapper import Gemma3HfWeightMapper
 from .hf.llama4_weight_mapper import Llama4HfWeightMapper
+from .hf.llava_next_weight_mapper import LlavaNextHfWeightMapper
 from .hf.mixtral_weight_mapper import MixtralHfWeightMapper
 from .hf.nemotron_h_weight_mapper import NemotronHHfWeightMapper
 from .hf.qwen2_moe_weight_mapper import Qwen2MoeHfWeightMapper
@@ -17,5 +18,5 @@
    "BaseCheckpointLoader", "HfCheckpointLoader", "NemotronHHfWeightMapper",
    "Gemma3HfWeightMapper", "MixtralHfWeightMapper", "Llama4HfWeightMapper",
    "Qwen2MoeHfWeightMapper", "Qwen3MoeHfWeightMapper", "Qwen2VLHfWeightMapper",
-    "Qwen3NextHfWeightMapper"
+    "Qwen3NextHfWeightMapper", "LlavaNextHfWeightMapper"
]
tensorrt_llm/_torch/models/checkpoints/hf/llava_next_weight_mapper.py (new file)

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+from tensorrt_llm._torch.models.checkpoints.hf.weight_mapper import HfWeightMapper
+from tensorrt_llm._torch.models.modeling_utils import register_mapper
+
+
+@register_mapper("HF", "LlavaNextForConditionalGeneration")
+class LlavaNextHfWeightMapper(HfWeightMapper):
+    def preprocess_weights(self, weights: dict) -> dict:
+        transformed_weights = {}
+        for key, value in weights.items():
+            if key.startswith("model."):
+                new_key = key[len("model.") :]
+                transformed_weights[new_key] = value
+            else:
+                transformed_weights[key] = value
+        return transformed_weights
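The mapper drops the leading `model.` prefix from checkpoint keys so they match the module names used by the PyTorch implementation. A quick stand-alone illustration with made-up keys (it reproduces the loop above as a dict comprehension rather than calling the class, whose constructor arguments aren't shown here):

```python
# Hypothetical checkpoint keys, just to show the prefix handling.
weights = {
    "model.language_model.embed_tokens.weight": "tensor_a",
    "lm_head.weight": "tensor_b",
}

# Same transformation the mapper applies in preprocess_weights:
transformed = {
    (k[len("model."):] if k.startswith("model.") else k): v
    for k, v in weights.items()
}
print(transformed)
# {'language_model.embed_tokens.weight': 'tensor_a', 'lm_head.weight': 'tensor_b'}
```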
