
Commit e4fea88

rebase

Signed-off-by: junq <[email protected]>
2 parents 81e729a + 5e6f1bc, commit e4fea88

30 files changed: +1426 -845 lines

examples/disaggregated/slurm/benchmark/config.yaml

Lines changed: 3 additions & 2 deletions

@@ -5,7 +5,7 @@ slurm:
   account: "<account>"
   job_time: "02:00:00"
   job_name: "<job_name>"
-  numa_bind: true
+  numa_bind: true # Only enable for GB200 NVL72

 # Benchmark Mode
 benchmark:
@@ -42,7 +42,6 @@ profiling:
   nsys_on: false # Set to true to enable profiling

 worker_config:
-  eplb_num_slots: 0 # Number of slots for EPLB
   gen:
     tensor_parallel_size: 8
     moe_expert_parallel_size: 8
@@ -77,6 +76,8 @@ worker_config:
     moe_config:
       backend: CUTLASS
       use_low_precision_moe_combine: true
+      load_balancer:
+        num_slots: 0
     cache_transceiver_config:
       max_tokens_in_buffer: 4608
       backend: DEFAULT
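This change relocates the EPLB slot count from a top-level `worker_config` key into the gen worker's MoE config. A minimal sketch of reading the relocated value (the local `config.yaml` path is an assumption for illustration; the nested key is the one the updated `submit.py` reads):

```python
import yaml

# Sketch: read the EPLB slot count from its new location in the benchmark config.
with open("config.yaml") as f:  # path is an assumption for this sketch
    config = yaml.safe_load(f)

# Old (removed): config["worker_config"]["eplb_num_slots"]
num_slots = config["worker_config"]["gen"]["moe_config"]["load_balancer"]["num_slots"]
print(f"EPLB slots: {num_slots}")
```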

examples/disaggregated/slurm/benchmark/disaggr_torch.slurm

Lines changed: 7 additions & 7 deletions

@@ -7,8 +7,8 @@ gpus_per_node=${1}
 numa_bind=${2}
 ctx_nodes=${3} # Number of nodes needed for ctx workers
 gen_nodes=${4} # Number of nodes needed for gen workers
-ctx_tp_size=${5} # Tensor parallel size for ctx workers
-gen_tp_size=${6} # Tensor parallel size for gen workers
+ctx_world_size=${5} # World size for ctx workers
+gen_world_size=${6} # World size for gen workers

 # Worker configuration
 num_ctx_servers=${7}
@@ -47,8 +47,8 @@ echo " gpus_per_node: ${gpus_per_node}"
 echo " numa_bind: ${numa_bind}"
 echo " ctx_nodes: ${ctx_nodes}"
 echo " gen_nodes: ${gen_nodes}"
-echo " ctx_tp_size: ${ctx_tp_size}"
-echo " gen_tp_size: ${gen_tp_size}"
+echo " ctx_world_size: ${ctx_world_size}"
+echo " gen_world_size: ${gen_world_size}"
 echo
 echo "Worker Configuration:"
 echo " num_ctx_servers: ${num_ctx_servers}"
@@ -123,7 +123,7 @@ if [ -d "${trtllm_repo}" ]; then

     echo "Installing TensorRT-LLM..."
     if ! srun --container-name=${container_name} \
-        --container-mounts=${container_mount} \
+        --container-mounts=${container_mount} --no-container-mount-home \
         --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
         bash -c "cd ${trtllm_repo} && pip install -e ." \
         &> ${full_logdir}/install.log; then
@@ -167,7 +167,7 @@ echo "ctx_nodes_num_in_single_server: ${ctx_nodes_num_in_single_server}"
 echo "Starting gen workers..."
 for i in $(seq 0 $((num_gen_servers - 1))); do
     srun -l -N ${gen_nodes_num_in_single_server} \
-        --ntasks=${gen_tp_size} \
+        --ntasks=$((gen_world_size)) \
         --ntasks-per-node=${gpus_per_node} \
         --container-image=${container_image} \
         --container-name=${container_name} \
@@ -182,7 +182,7 @@ done
 echo "Starting ctx workers..."
 for i in $(seq 0 $((num_ctx_servers - 1))); do
     srun -l -N ${ctx_nodes_num_in_single_server} \
-        --ntasks=${ctx_tp_size} \
+        --ntasks=$((ctx_world_size)) \
         --ntasks-per-node=${gpus_per_node} \
        --container-image=${container_image} \
        --container-name=${container_name} \

examples/disaggregated/slurm/benchmark/submit.py

Lines changed: 16 additions & 13 deletions

@@ -39,9 +39,9 @@ def save_worker_config(config, output_path, worker_type):
        yaml.dump(worker_config, f, default_flow_style=False)


-def calculate_nodes(tp_size, num_servers, gpus_per_node):
-    """Calculate required nodes based on tensor parallel size and server count."""
-    return (tp_size + gpus_per_node - 1) // gpus_per_node * num_servers
+def calculate_nodes(world_size, num_servers, gpus_per_node):
+    """Calculate required nodes based on world size and server count."""
+    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers


def submit_job(config):
@@ -50,10 +50,6 @@ def submit_job(config):
    hw_config = config['hardware']
    env_config = config['environment']

-    # Calculate nodes based on tensor parallel sizes
-    ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size']
-    gen_tp_size = config['worker_config']['gen']['tensor_parallel_size']
-
    # Get number of servers from config
    ctx_num = hw_config['num_ctx_servers']
    gen_num = hw_config['num_gen_servers']
@@ -63,9 +59,16 @@ def submit_job(config):
    mtp_size = gen_config.get('speculative_config',
                              {}).get('num_nextn_predict_layers', 0)

-    ctx_nodes = calculate_nodes(ctx_tp_size, ctx_num,
+    # Calculate nodes based on world sizes
+    ctx_tp_size = config['worker_config']['ctx']['tensor_parallel_size']
+    ctx_pp_size = config['worker_config']['ctx']['pipeline_parallel_size']
+    ctx_world_size = ctx_tp_size * ctx_pp_size
+    ctx_nodes = calculate_nodes(ctx_world_size, ctx_num,
                                hw_config['gpus_per_node'])
-    gen_nodes = calculate_nodes(gen_tp_size, gen_num,
+    gen_tp_size = config['worker_config']['gen']['tensor_parallel_size']
+    gen_pp_size = config['worker_config']['gen']['pipeline_parallel_size']
+    gen_world_size = gen_tp_size * gen_pp_size
+    gen_nodes = calculate_nodes(gen_world_size, gen_num,
                                hw_config['gpus_per_node'])
    total_nodes = ctx_nodes + gen_nodes
    total_tasks = total_nodes * hw_config['gpus_per_node']
@@ -82,9 +85,9 @@ def submit_job(config):

    # Determine directory suffix based on attention_dp
    if gen_enable_attention_dp:
-        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['eplb_num_slots']}_mtp{mtp_size}"
+        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['gen']['moe_config']['load_balancer']['num_slots']}_mtp{mtp_size}"
    else:
-        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['eplb_num_slots']}_mtp{mtp_size}"
+        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{config['worker_config']['gen']['moe_config']['load_balancer']['num_slots']}_mtp{mtp_size}"

    # Create full log directory path
    log_dir = os.path.join(log_base, dir_suffix)
@@ -114,8 +117,8 @@ def submit_job(config):
        str(slurm_config['numa_bind']).lower(),
        str(ctx_nodes),  # Number of nodes needed for ctx workers
        str(gen_nodes),  # Number of nodes needed for gen workers
-        str(ctx_tp_size),  # Tensor parallel size for ctx workers
-        str(gen_tp_size),  # Tensor parallel size for gen workers
+        str(ctx_world_size),  # World size for ctx workers
+        str(gen_world_size),  # World size for gen workers

        # Worker configuration
        str(ctx_num),
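The node calculation now uses the full world size (TP x PP) instead of the tensor-parallel size alone, which changes the result whenever `pipeline_parallel_size` > 1. A small sketch of the difference, reusing the same ceiling-division formula as `calculate_nodes` (the example numbers are illustrative, not from this commit):

```python
def calculate_nodes(world_size, num_servers, gpus_per_node):
    """Same formula as submit.py: ceil(world_size / gpus_per_node) * num_servers."""
    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers

# Illustrative worker with TP=8, PP=2 on 8-GPU nodes, one server:
tp, pp, gpus_per_node, num_servers = 8, 2, 8, 1
print(calculate_nodes(tp, num_servers, gpus_per_node))       # old behavior: 1 node
print(calculate_nodes(tp * pp, num_servers, gpus_per_node))  # new behavior: 2 nodes
```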

examples/wide_ep/slurm_scripts/README.md

Lines changed: 3 additions & 2 deletions

@@ -34,6 +34,7 @@ Before running the scripts, ensure you have:
 ### Run Benchmarks

 ```bash
-# Please find the `submit.py` script and an example `config.yaml` in the `examples/disaggregated/slurm/benchmark/` directory.
-python3 submit.py -c your_config.yaml
+# Please find the `submit.py` script in the `examples/disaggregated/slurm/benchmark/` directory.
+# An example `config.yaml` for wide EP: `examples/wide_ep/slurm_scripts/config.yaml`.
+python3 submit.py -c config.yaml
 ```
examples/wide_ep/slurm_scripts/config.yaml (new file)

Lines changed: 113 additions & 0 deletions

@@ -0,0 +1,113 @@
+# SLURM Configuration
+slurm:
+  script_file: "disaggr_torch.slurm"
+  partition: "<partition>"
+  account: "<account>"
+  job_time: "02:00:00"
+  job_name: "<job_name>"
+  numa_bind: true # Only enable for GB200 NVL72
+
+# Hardware Configuration
+hardware:
+  gpus_per_node: 4 # Modify this with your hardware configuration
+  num_ctx_servers: 2 # Number of context servers
+  num_gen_servers: 1 # Number of generation servers
+
+# Benchmark Mode
+benchmark:
+  mode: "e2e" # Options: e2e, gen_only
+  use_nv_sa_benchmark: false # Whether to use NVIDIA SA benchmark script
+  multi_round: 1 # Number of benchmark rounds
+  benchmark_ratio: 0.8 # Benchmark ratio
+  streaming: true # Enable streaming mode
+  concurrency_list: "1024"
+
+# Sequence Configuration
+sequence:
+  input_length: 8196 # Input sequence length
+  output_length: 1024 # Output sequence length
+
+# Environment Configuration
+environment:
+  container_mount: "<container_mount>" # Format: path1:path1,path2:path2
+  container_image: "<container_image>"
+  model_path: "<model_path>"
+  trtllm_repo: "<trtllm_repo>"
+  build_wheel: false # Don't build the wheel when launching multiple jobs
+  dataset_file: "<dataset_file>"
+  work_dir: "<full_path_to_work_dir>"
+
+# Profiling Configuration
+profiling:
+  nsys_on: false # Set to true to enable profiling
+
+# Worker Configuration
+worker_config:
+  gen:
+    enable_layerwise_nvtx_marker: true
+    tensor_parallel_size: 32
+    moe_expert_parallel_size: 32
+    enable_attention_dp: true
+    enable_lm_head_tp_in_adp: true
+    pipeline_parallel_size: 1
+    max_batch_size: 128
+    max_num_tokens: 512
+    max_seq_len: 9236
+    cuda_graph_config:
+      enable_padding: true
+      batch_sizes:
+        - 1
+        - 2
+        - 4
+        - 8
+        - 16
+        - 32
+        - 64
+        - 128
+        - 256
+        - 512
+        - 768
+        - 1024
+        - 2048
+        - 128
+    print_iter_log: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.6
+      dtype: fp8
+    moe_config:
+      backend: WIDEEP
+      use_low_precision_moe_combine: true
+      load_balancer:
+        num_slots: 288
+        layer_updates_per_iter: 1
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: DEFAULT
+    stream_interval: 20
+    num_postprocess_workers: 4
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
+  ctx:
+    enable_layerwise_nvtx_marker: true
+    max_batch_size: 1
+    max_num_tokens: 8448
+    max_seq_len: 8212
+    tensor_parallel_size: 4
+    moe_expert_parallel_size: 4
+    enable_attention_dp: true
+    pipeline_parallel_size: 1
+    print_iter_log: true
+    cuda_graph_config: null
+    disable_overlap_scheduler: true
+    kv_cache_config:
+      enable_block_reuse: false
+      free_gpu_memory_fraction: 0.75
+      dtype: fp8
+    cache_transceiver_config:
+      max_tokens_in_buffer: 8448
+      backend: DEFAULT
+    speculative_config:
+      decoding_type: MTP
+      num_nextn_predict_layers: 3
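As a rough check, plugging this wide-EP config into the updated `submit.py` logic gives the allocation it will request (the arithmetic below simply mirrors `calculate_nodes`; nothing here is new configuration):

```python
def calculate_nodes(world_size, num_servers, gpus_per_node):
    # Same ceiling-division formula as submit.py.
    return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers

gpus_per_node = 4
ctx_nodes = calculate_nodes(4 * 1, 2, gpus_per_node)   # ctx: TP=4, PP=1, 2 servers -> 2 nodes
gen_nodes = calculate_nodes(32 * 1, 1, gpus_per_node)  # gen: TP=32, PP=1, 1 server -> 8 nodes
total_nodes = ctx_nodes + gen_nodes                    # 10 nodes
total_tasks = total_nodes * gpus_per_node              # 40 tasks
print(ctx_nodes, gen_nodes, total_nodes, total_tasks)
```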

jenkins/L0_Test.groovy

Lines changed: 25 additions & 9 deletions

@@ -113,7 +113,7 @@ CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
 MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"

 // GPU types that require open driver
-REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000"]
+REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000", "rtx-pro-6000d"]

 // GPU types that don't support dynamic driver flashing
 REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
@@ -1386,6 +1386,21 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                path: /vol/scratch1/scratch.svc_tensorrt_blossom
        """
    }
+    def llmModelVolume = """
+                - name: scratch-trt-llm-data
+                  nfs:
+                    server: 10.117.145.14
+                    path: /vol/scratch1/scratch.michaeln_blossom
+    """
+    if (type.contains("6000d")) {
+        // rtx-pro-6000d nodes are located in Austin DC, we use the FlexCache to speed up the data access.
+        llmModelVolume = """
+                - name: scratch-trt-llm-data
+                  nfs:
+                    server: 10.20.162.212
+                    path: /vol/scratch26/scratch.trt_llm_data
+    """
+    }

    def podConfig = [
        cloud: targetCould,
@@ -1432,10 +1447,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                - name: dshm
                  emptyDir:
                    medium: Memory
-                - name: scratch-trt-llm-data
-                  nfs:
-                    server: 10.117.145.14
-                    path: /vol/scratch1/scratch.michaeln_blossom
+                ${llmModelVolume}
                ${pvcVolume}
        """.stripIndent(),
    ]
@@ -2578,9 +2590,6 @@ def launchTestJobs(pipeline, testFilter)
    x86TestConfigs = [
        "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
        "DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
-        "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
-        "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
        "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
        "A10-PyTorch-1": ["a10", "l0_a10", 1, 2],
        "A10-PyTorch-2": ["a10", "l0_a10", 2, 2],
@@ -2664,10 +2673,14 @@ def launchTestJobs(pipeline, testFilter)
        // "DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4],
        // "DGX_H200-4_GPUs-TensorRT-Post-Merge-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 3, 4],
        // "DGX_H200-4_GPUs-TensorRT-Post-Merge-3": ["dgx-h200-x4", "l0_dgx_h200", 3, 3, 4],
-        // Disable RTXPro6000 stages due to nodes will be offline temporarily
+        // Disable RTXPro6000 stages due to nodes will be offline temporarily.
+        // [TODO] Split tests between RTXPro6000 and RTXPro6000D and move reasonable mount of tests to pre-merge.
        // "RTXPro6000-PyTorch-Post-Merge-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
        // "RTXPro6000-4_GPUs-PyTorch-Post-Merge-1": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 1, 2, 4],
        // "RTXPro6000-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 2, 2, 4],
+        "RTXPro6000D-PyTorch-Post-Merge-1": ["rtx-pro-6000d", "l0_rtx_pro_6000", 1, 1],
+        "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-1": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 1, 2, 4],
+        "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
    ]

    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(key.contains("-CU12-") ? LLM_DOCKER_IMAGE_12_9 : LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
@@ -2689,6 +2702,9 @@ def launchTestJobs(pipeline, testFilter)
    fullSet = parallelJobs.keySet()

    x86SlurmTestConfigs = [
+        "DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
+        "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
+        "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
        "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
        "DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 2, 4],
        "DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4],

tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py

Lines changed: 9 additions & 0 deletions

@@ -49,6 +49,15 @@ def __init__(self, alpha: float, output_dtype: torch.dtype):
                f"SM version {get_sm_version()} is not supported for CuteDSLNVFP4BlackwellLinear, it only supports SM 100"
            )

+    # rewrite the hash function because the value of self.alpha doesn't affect the tactic.
+    def __hash__(self):
+        return hash((self.output_dtype, ))
+
+    def __eq__(self, other):
+        if not isinstance(other, CuteDSLNVFP4BlackwellLinear):
+            return False
+        return self.output_dtype == other.output_dtype
+
    def get_valid_tactics(
        self,
        inputs: List[torch.Tensor],
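The custom `__hash__`/`__eq__` make cache lookups keyed on the op instance ignore `alpha`, since only `output_dtype` influences tactic selection. A stand-alone sketch of the idea, deliberately detached from the real class (the class name and cache dict below are hypothetical):

```python
class FakeLinearOp:
    """Hypothetical stand-in: alpha is excluded from hashing/equality on purpose."""

    def __init__(self, alpha: float, output_dtype: str):
        self.alpha = alpha
        self.output_dtype = output_dtype

    def __hash__(self):
        return hash((self.output_dtype, ))

    def __eq__(self, other):
        if not isinstance(other, FakeLinearOp):
            return False
        return self.output_dtype == other.output_dtype


tactic_cache = {}  # hypothetical tactic cache keyed by the op instance
tactic_cache[FakeLinearOp(alpha=0.5, output_dtype="bf16")] = "tactic_A"
# Different alpha, same dtype -> the same cache entry is reused.
print(tactic_cache[FakeLinearOp(alpha=2.0, output_dtype="bf16")])  # tactic_A
```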

tensorrt_llm/_torch/models/checkpoints/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -3,6 +3,7 @@
 from .hf.config_loader import HfConfigLoader
 from .hf.gemma3_weight_mapper import Gemma3HfWeightMapper
 from .hf.llama4_weight_mapper import Llama4HfWeightMapper
+from .hf.llava_next_weight_mapper import LlavaNextHfWeightMapper
 from .hf.mixtral_weight_mapper import MixtralHfWeightMapper
 from .hf.nemotron_h_weight_mapper import NemotronHHfWeightMapper
 from .hf.qwen2_moe_weight_mapper import Qwen2MoeHfWeightMapper
@@ -17,5 +18,5 @@
    "BaseCheckpointLoader", "HfCheckpointLoader", "NemotronHHfWeightMapper",
    "Gemma3HfWeightMapper", "MixtralHfWeightMapper", "Llama4HfWeightMapper",
    "Qwen2MoeHfWeightMapper", "Qwen3MoeHfWeightMapper", "Qwen2VLHfWeightMapper",
-    "Qwen3NextHfWeightMapper"
+    "Qwen3NextHfWeightMapper", "LlavaNextHfWeightMapper"
]
tensorrt_llm/_torch/models/checkpoints/hf/llava_next_weight_mapper.py (new file)

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+from tensorrt_llm._torch.models.checkpoints.hf.weight_mapper import HfWeightMapper
+from tensorrt_llm._torch.models.modeling_utils import register_mapper
+
+
+@register_mapper("HF", "LlavaNextForConditionalGeneration")
+class LlavaNextHfWeightMapper(HfWeightMapper):
+    def preprocess_weights(self, weights: dict) -> dict:
+        transformed_weights = {}
+        for key, value in weights.items():
+            if key.startswith("model."):
+                new_key = key[len("model.") :]
+                transformed_weights[new_key] = value
+            else:
+                transformed_weights[key] = value
+        return transformed_weights
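The mapper drops the leading `model.` prefix from checkpoint keys so they match the module names used by the PyTorch implementation. A quick stand-alone illustration with made-up keys (it reproduces the loop above as a dict comprehension rather than calling the class, whose constructor arguments aren't shown here):

```python
# Hypothetical checkpoint keys, just to show the prefix handling.
weights = {
    "model.language_model.embed_tokens.weight": "tensor_a",
    "lm_head.weight": "tensor_b",
}

# Same transformation the mapper applies in preprocess_weights:
transformed = {
    (k[len("model."):] if k.startswith("model.") else k): v
    for k, v in weights.items()
}
print(transformed)
# {'language_model.embed_tokens.weight': 'tensor_a', 'lm_head.weight': 'tensor_b'}
```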
