Commit fd27001

Merge branch 'main' into users/nzmora/auto-select-moe-kernel-config-main
2 parents e8c21ce + 15de45d commit fd27001

83 files changed: +141 additions and -12682 deletions

Some content is hidden: large commits have some content hidden by default, so only a subset of the 83 changed files is shown below.

cpp/tests/resources/data/test_model_lora_config.json

Lines changed: 0 additions & 30 deletions

@@ -93,36 +93,6 @@
         ],
         "trtllm_modules_to_hf_modules": {}
     },
-    "auto_parallel_config": {
-        "world_size": 1,
-        "gpus_per_node": 8,
-        "cluster_key": "A100-PCIe-80GB",
-        "cluster_info": null,
-        "sharding_cost_model": "alpha_beta",
-        "comm_cost_model": "alpha_beta",
-        "enable_pipeline_parallelism": false,
-        "enable_shard_unbalanced_shape": false,
-        "enable_shard_dynamic_shape": false,
-        "enable_reduce_scatter": true,
-        "builder_flags": null,
-        "debug_mode": false,
-        "infer_shape": true,
-        "validation_mode": false,
-        "same_buffer_io": {
-            "past_key_value_(\\d+)": "present_key_value_\\1"
-        },
-        "same_spec_io": {},
-        "sharded_io_allowlist": [
-            "past_key_value_\\d+",
-            "present_key_value_\\d*"
-        ],
-        "fast_reduce": true,
-        "fill_weights": false,
-        "parallel_config_cache": null,
-        "profile_cache": null,
-        "dump_path": null,
-        "debug_outputs": []
-    },
     "weight_sparsity": false,
     "weight_streaming": false,
     "plugin_config": {

examples/models/core/llama/README.md

Lines changed: 0 additions & 10 deletions

@@ -132,16 +132,6 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_fp16_wq \
              --output_dir ./tmp/llama/7B/trt_engines/weight_only/1-gpu/ \
              --gemm_plugin auto

-# Build LLaMA 7B using 2-way auto parallelism (deprecated).
-python convert_checkpoint.py --model_dir ./tmp/llama/7B/ \
-                            --output_dir ./tllm_checkpoint_1gpu_fp16 \
-                            --dtype float16
-
-trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_fp16 \
-            --output_dir ./tmp/llama/7B/trt_engines/fp16/2-gpu/ \
-            --gemm_plugin auto \
-            --auto_parallel 2
-
 # Build LLaMA 7B using 2-way tensor parallelism.
 python convert_checkpoint.py --model_dir ./tmp/llama/7B/ \
                             --output_dir ./tllm_checkpoint_2gpu_tp2 \

tensorrt_llm/__init__.py

Lines changed: 0 additions & 3 deletions

@@ -77,7 +77,6 @@ def _preload_python_lib():
                      mpi_barrier, mpi_comm, mpi_rank, mpi_world_size,
                      set_mpi_comm, str_dtype_to_torch, str_dtype_to_trt,
                      torch_dtype_to_trt)
-from .auto_parallel import AutoParallelConfig, auto_parallel
 from .builder import BuildConfig, Builder, BuilderConfig, build
 from .disaggregated_params import DisaggregatedParams
 from .functional import Tensor, constant
@@ -130,8 +129,6 @@ def _preload_python_lib():
     'Module',
     'functional',
     'models',
-    'auto_parallel',
-    'AutoParallelConfig',
     'quantization',
     'tools',
     'LLM',

tensorrt_llm/_torch/auto_deploy/llm_args.py

Lines changed: 5 additions & 2 deletions

@@ -391,10 +391,13 @@ def validate_parallel_config(self):
        rank to automatically shard the model. This is just to ensure that other objects in the
        runtime that may read parallel_config can do so.
        """
+
+        # Set tp_size = self.world_size so that _ParallelConfig.world_size will return the
+        # correct value (computed as tp_size * pp_size * cp_size). This does not necessarily
+        # mean that TP will actually be used.
         self._parallel_config = _ParallelConfig(
-            auto_parallel=True, gpus_per_node=self.gpus_per_node
+            tp_size=self.world_size, gpus_per_node=self.gpus_per_node
         )
-        self._parallel_config.world_size = self.world_size
         return self

     @model_validator(mode="after")
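For context, a minimal self-contained sketch of the arithmetic the new comment refers to. ParallelConfigSketch below is a hypothetical stand-in, not the real _ParallelConfig; only the tp_size * pp_size * cp_size relationship comes from the diff, the rest is illustrative:

from dataclasses import dataclass

# Hypothetical stand-in for _ParallelConfig, used only to illustrate why
# passing tp_size=self.world_size keeps the derived world_size correct.
@dataclass
class ParallelConfigSketch:
    tp_size: int = 1
    pp_size: int = 1
    cp_size: int = 1
    gpus_per_node: int = 8

    @property
    def world_size(self) -> int:
        # Same relationship as in the diff comment: tp_size * pp_size * cp_size.
        return self.tp_size * self.pp_size * self.cp_size

# With the auto_parallel flag gone, setting tp_size to the requested world size
# (pp/cp left at 1) reproduces that value, even though the auto-deploy backend
# shards the model per rank rather than running classic tensor parallelism.
cfg = ParallelConfigSketch(tp_size=4)
assert cfg.world_size == 4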

tensorrt_llm/_torch/device_mesh.py

Lines changed: 0 additions & 3 deletions

@@ -74,18 +74,15 @@ def moe_ep_group_pg(self):
     # Access rank
     @property
     def tp_rank(self) -> int:
-        assert not self.auto_parallel, "Auto parallel is not currently supported in Ray mode."
         return self.tp_group_pg.rank()

     @property
     def pp_rank(self) -> int:
-        assert not self.auto_parallel, "Auto parallel is not currently supported in Ray mode."
         return self.pp_group_pg.rank()

     @property
     def cp_rank(self) -> int:
         # TODO: WIP
-        assert not self.auto_parallel, "Auto parallel is not currently supported in Ray mode."
         return self.cp_group_pg.rank()

     # Access group ranks

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 28 additions & 21 deletions

@@ -20,6 +20,7 @@
                               MultimodalRuntimeData)
 from tensorrt_llm.inputs.registry import (create_input_processor,
                                           create_input_processor_with_hash)
+from tensorrt_llm.llmapi.llm_args import TorchLlmArgs
 from tensorrt_llm.logger import logger
 from tensorrt_llm.lora_helper import LoraConfig
 from tensorrt_llm.lora_manager import LoraModelConfig
@@ -38,7 +39,6 @@
 from ..expert_statistic import ExpertStatistic
 from ..memory_buffer_utils import with_shared_pool
 from ..metadata import KVCacheParams
-from ..models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader
 from ..models.modeling_multimodal_utils import filter_mm_token_from_input_ids
 from ..models.modeling_utils import DecoderModelForCausalLM
 from ..modules.fused_moe.moe_load_balancer import (MoeLoadBalancer,
@@ -52,7 +52,7 @@
 from ..utils import (get_model_extra_attrs,
                      set_per_request_piecewise_cuda_graph_flag,
                      set_torch_compiling, with_model_extra_attrs)
-from .config import PyTorchConfig
+from .config import PyTorchConfig, _construct_checkpoint_loader
 from .config_utils import is_mla
 from .cuda_graph_runner import CUDAGraphRunner
 from .guided_decoder import CapturableGuidedDecoder
@@ -131,29 +131,36 @@ def __init__(
         *,
         model_path: str,
         pytorch_backend_config: PyTorchConfig,
-        checkpoint_loader: BaseCheckpointLoader,
-        batch_size: int = 8,
-        max_beam_width: int = 1,
-        max_num_tokens: int = 8192,
-        max_seq_len: Optional[int] = None,
         mapping: Optional[Mapping] = None,
         attn_runtime_features: Optional[AttentionRuntimeFeatures] = None,
         dist: Optional[MPIDist] = None,
         spec_config: Optional["DecodingBaseConfig"] = None,
-        sparse_attention_config: Optional["SparseAttentionConfig"] = None,
-        lora_config: Optional[LoraConfig] = None,
         is_draft_model: bool = False,
         drafting_loop_wrapper: Optional[Callable[[torch.nn.Module],
                                                  torch.nn.Module]] = None,
         model: Optional[torch.nn.Module] = None,
+        llm_args: Optional[TorchLlmArgs] = None,
     ):
+        assert llm_args is not None, "llm_args must be provided for PyTorchModelEngine"
+
         self.forward_pass_callable = None
         self.ub_buffers = None
-        self.batch_size = batch_size
+        (
+            max_beam_width,
+            max_num_tokens,
+            max_seq_len,
+            max_batch_size,
+        ) = llm_args.get_runtime_sizes()
+
+        self.batch_size = max_batch_size
         self.max_num_tokens = max_num_tokens
         self.max_seq_len = max_seq_len
         self.max_beam_width = max_beam_width

+        checkpoint_loader = _construct_checkpoint_loader(
+            llm_args.backend, llm_args.checkpoint_loader,
+            llm_args.checkpoint_format)
+
         self.mapping = mapping
         if mapping.has_pp():
             init_pp_comm(mapping)
@@ -171,7 +178,7 @@ def __init__(
             spec_config.max_total_draft_tokens = 0
         self.spec_config = spec_config
         self.is_spec_decode = spec_config is not None
-        self.sparse_attention_config = sparse_attention_config
+        self.sparse_attention_config = None if is_draft_model else llm_args.sparse_attention_config
         self.enable_spec_decode = self.is_spec_decode
         self.is_draft_model = is_draft_model

@@ -181,13 +188,15 @@ def __init__(
         self.input_processor_with_hash = create_input_processor_with_hash(
             self.input_processor)
         if model is None:
+            lora_config: Optional[
+                LoraConfig] = None if is_draft_model else llm_args.lora_config
             loader = ModelLoader(
                 pytorch_backend_config=pytorch_backend_config,
                 mapping=self.mapping,
                 spec_config=self.spec_config,
                 sparse_attention_config=self.sparse_attention_config,
-                max_num_tokens=max_num_tokens,
-                max_seq_len=max_seq_len,
+                max_num_tokens=self.max_num_tokens,
+                max_seq_len=self.max_seq_len,
                 lora_config=lora_config,
             )
             self.model, moe_load_balancer = loader.load(
@@ -273,29 +282,27 @@ def __init__(

         self.attn_backend = get_attention_backend(
             pytorch_backend_config.attn_backend,
-            sparse_attn_config=sparse_attention_config)
+            sparse_attn_config=self.sparse_attention_config)

         if self.is_spec_decode:
             self.spec_metadata = None
             update_spec_config_from_model_config(self.spec_config,
                                                  self.model.config)
-            max_num_draft_tokens = self.original_max_total_draft_tokens * batch_size
+            max_num_draft_tokens = self.original_max_total_draft_tokens * self.batch_size
             self.draft_tokens_cuda = torch.empty((max_num_draft_tokens, ),
                                                  dtype=torch.int,
                                                  device='cuda')
             self.gather_ids_cuda = torch.empty((self.max_num_tokens, ),
                                                dtype=torch.int,
                                                device='cuda')
-            self.num_accepted_draft_tokens_cuda = torch.empty((batch_size, ),
-                                                              dtype=torch.int,
-                                                              device='cuda')
+            self.num_accepted_draft_tokens_cuda = torch.empty(
+                (self.batch_size, ), dtype=torch.int, device='cuda')
             self.previous_pos_indices_cuda = torch.empty(
                 (self.max_num_tokens, ), dtype=torch.int, device='cuda')
             self.previous_pos_id_offsets_cuda = torch.zeros(
                 (self.max_num_tokens, ), dtype=torch.int, device='cuda')
-            self.previous_kv_lens_offsets_cuda = torch.zeros((batch_size, ),
-                                                             dtype=torch.int,
-                                                             device='cuda')
+            self.previous_kv_lens_offsets_cuda = torch.zeros(
+                (self.batch_size, ), dtype=torch.int, device='cuda')
             self.without_logits = self.spec_config.spec_dec_mode.without_logits(
             ) or self.model_is_wrapped
             self.max_draft_len = spec_config.max_draft_len
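To make the constructor change concrete, here is a minimal, self-contained sketch of the contract PyTorchModelEngine now relies on. LlmArgsSketch is a hypothetical stand-in for TorchLlmArgs: only get_runtime_sizes() and the order in which its result is unpacked come from the diff above; the defaults mirror the removed keyword defaults (batch_size=8, max_beam_width=1, max_num_tokens=8192, max_seq_len=None), and everything else is illustrative:

from typing import NamedTuple, Optional

class RuntimeSizes(NamedTuple):
    max_beam_width: int
    max_num_tokens: int
    max_seq_len: Optional[int]
    max_batch_size: int

class LlmArgsSketch:
    """Hypothetical stand-in for TorchLlmArgs, illustrating that one object now
    carries the runtime limits plus the LoRA/sparse-attention/checkpoint settings
    that used to be separate constructor arguments."""

    def __init__(self, max_beam_width: int = 1, max_num_tokens: int = 8192,
                 max_seq_len: Optional[int] = None, max_batch_size: int = 8):
        self._sizes = RuntimeSizes(max_beam_width, max_num_tokens,
                                   max_seq_len, max_batch_size)
        self.lora_config = None
        self.sparse_attention_config = None

    def get_runtime_sizes(self) -> RuntimeSizes:
        return self._sizes

# The engine unpacks the sizes in exactly this order (see the diff above).
args = LlmArgsSketch(max_batch_size=16)
max_beam_width, max_num_tokens, max_seq_len, max_batch_size = args.get_runtime_sizes()
assert max_batch_size == 16

As the diff also shows, draft engines reuse the same llm_args but force lora_config and sparse_attention_config to None through the is_draft_model checks.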

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 4 additions & 23 deletions

@@ -32,7 +32,7 @@
 from ._util import (KvCacheCreator, _adjust_torch_mem_fraction,
                     create_py_executor_instance, instantiate_sampler, is_mla,
                     validate_feature_combination)
-from .config import PyTorchConfig, _construct_checkpoint_loader
+from .config import PyTorchConfig
 from .config_utils import is_mla
 from .guided_decoder import CapturableGuidedDecoder, GuidedDecoder
 from .kv_cache_connector import KvCacheConnectorManager
@@ -234,11 +234,6 @@ def create_py_executor(
     mm_encoder_only = llm_args.mm_encoder_only
     enable_chunked_context = llm_args.enable_chunked_prefill

-    assert llm_args.backend == "pytorch", "_construct_checkpoint_loader expects different parameters for autodeploy"
-    checkpoint_loader = _construct_checkpoint_loader(llm_args.backend,
-                                                     llm_args.checkpoint_loader,
-                                                     llm_args.checkpoint_format)
-
     (
         max_beam_width,
         max_num_tokens,
@@ -305,8 +300,6 @@
     has_draft_model_engine = spec_config.spec_dec_mode.has_draft_model()
     has_spec_drafter = spec_config.spec_dec_mode.has_spec_drafter()

-    sparse_attention_config = llm_args.sparse_attention_config
-
     # chunk_unit_size may be changed to 64 when using flash mla
     attn_runtime_features = AttentionRuntimeFeatures(
         chunked_prefill=enable_chunked_context,
@@ -322,17 +315,11 @@
     model_engine = PyTorchModelEngine(
         model_path=checkpoint_dir,
         pytorch_backend_config=pytorch_backend_config,
-        batch_size=max_batch_size,
-        max_beam_width=max_beam_width,
-        max_num_tokens=max_num_tokens,
-        max_seq_len=max_seq_len,
         mapping=mapping,
         attn_runtime_features=attn_runtime_features,
         dist=dist,
         spec_config=spec_config,
-        sparse_attention_config=sparse_attention_config,
-        lora_config=lora_config,
-        checkpoint_loader=checkpoint_loader,
+        llm_args=llm_args,
     )

     validate_feature_combination(llm_args, model_engine,
@@ -369,19 +356,13 @@ def drafting_loop_wrapper(model):
         draft_model_engine = PyTorchModelEngine(
             model_path=spec_config.speculative_model_dir,
             pytorch_backend_config=draft_pytorch_backend_config,
-            batch_size=max_batch_size,
-            max_beam_width=max_beam_width,
-            max_num_tokens=max_num_tokens,
-            # Note: The draft model engine will infer its own max_seq_len.
-            # We'll stop drafting when we hit the max.
-            max_seq_len=max_seq_len,
             mapping=mapping,
             attn_runtime_features=attn_runtime_features,
             dist=dist,
             spec_config=draft_spec_config,
-            checkpoint_loader=checkpoint_loader,
             is_draft_model=True,
             drafting_loop_wrapper=drafting_loop_wrapper,
+            llm_args=llm_args,
         )
         # For DeepseekV3 MTP, we need to set the num_hidden_layers to 1 for the draft model
         if spec_config.spec_dec_mode.is_mtp_eagle():
@@ -574,7 +555,7 @@ def drafting_loop_wrapper(model):
         pytorch_backend_config=pytorch_backend_config,
         speculative_config=spec_config,
         profiling_stage_data=profiling_stage_data,
-        sparse_attention_config=sparse_attention_config,
+        sparse_attention_config=llm_args.sparse_attention_config,
     )
     estimating_kv_cache = kv_cache_creator.try_prepare_estimation()
     with mem_monitor.observe_creation_stage(

tensorrt_llm/_utils.py

Lines changed: 0 additions & 33 deletions

@@ -25,7 +25,6 @@
 import trace
 import weakref
 from contextlib import contextmanager
-from dataclasses import asdict
 from enum import EnumMeta
 from functools import lru_cache, partial, wraps
 from pathlib import Path
@@ -799,38 +798,6 @@ def localtrace(frame, why, arg):
     return wrapper


-class DictConversion:
-
-    @classmethod
-    def from_dict(cls, config: Dict[str, Any]):
-        obj = cls()
-        fields = obj.__dataclass_fields__
-        for key, value in config.items():
-            assert hasattr(obj, key), f"cannot find {key} in {obj}"
-            field_cls = fields[key].type
-            if (isinstance(field_cls, type)
-                    and issubclass(field_cls, DictConversion)
-                    and isinstance(value, dict)):
-                value = field_cls.from_dict(value)
-            setattr(obj, key, value)
-        return obj
-
-    def to_dict(self):
-        return asdict(self)
-
-    @classmethod
-    def from_json_file(cls, file):
-        with open(file) as f:
-            return cls.from_dict(json.load(f))
-
-    def set_defaults(self, **kwargs):
-        for key, default in kwargs.items():
-            value = getattr(self, key)
-            if (value is None
-                    or (isinstance(value, (list, dict)) and len(value) == 0)):
-                setattr(self, key, default)
-
-
 class BaseEnumMeta(EnumMeta):

     def __contains__(cls, item):

tensorrt_llm/auto_parallel/__init__.py

Lines changed: 0 additions & 9 deletions
This file was deleted.
