Commit 7d34d45

Superjomn and nv-guomingz authored and committed

chore: add tags to API reference

Signed-off-by: Superjomn <[email protected]>
Signed-off-by: nv-guomingz <[email protected]>
1 parent 37d0b68 commit 7d34d45

File tree

11 files changed: +464 −68 lines

docs/source/_static/custom.css

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+.tag {
+    padding: 2px 5px;
+    border-radius: 4px;
+    font-size: 0.8em;
+    margin-right: 5px;
+    color: #000;
+}
+
+code.beta {
+    display: inline-block;
+    background-color: #6c757d;
+    color: #999;
+}
+
+code.prototype {
+    display: inline-block;
+    background-color: #fd7e14;
+    color: #fff;
+}
+
+code.deprecated {
+    display: inline-block;
+    background-color: red;
+    color: #fff;
+}

docs/source/conf.py

Lines changed: 23 additions & 0 deletions
@@ -12,6 +12,7 @@
 import sys
 
 import pygit2
+from docutils import nodes
 
 sys.path.insert(0, os.path.abspath('.'))
 
@@ -60,10 +61,16 @@
     'sphinx_togglebutton',
 ]
 
+autodoc_member_order = 'bysource'
 autodoc_pydantic_model_show_json = True
 autodoc_pydantic_model_show_config_summary = True
 autodoc_pydantic_field_doc_policy = "description"
 autodoc_pydantic_model_show_field_list = True  # Display field list with descriptions
+autodoc_pydantic_model_member_order = "groupwise"
+autodoc_pydantic_model_hide_pydantic_methods = True
+autodoc_pydantic_field_list_validators = False
+autodoc_pydantic_settings_signature_prefix = ""  # remove any prefix
+autodoc_pydantic_settings_hide_reused_validator = True  # hide reused validators for cleaner pages
 
 myst_url_schemes = {
     "http":
@@ -143,10 +150,26 @@
 print('CPP_INCLUDE_DIR', CPP_INCLUDE_DIR)
 print('CPP_GEN_DIR', CPP_GEN_DIR)
 
+html_css_files = [
+    'custom.css',
+]
+
+
+def tag_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
+    """A custom role for displaying tags."""
+    tag_name = text.lower()
+    node = nodes.literal(text, text, classes=['tag', tag_name])
+    return [node], []
+
 
 def setup(app):
     from helper import generate_examples, generate_llmapi
 
+    from tensorrt_llm.llmapi.utils import tag_llm_params
+    tag_llm_params()
+
+    app.add_role('tag', tag_role)
+
     generate_examples()
     generate_llmapi()
 
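How the pieces fit together: the `tag_role` registered here turns `:tag:`beta`` in any docstring or `.rst` source into an inline `<code class="tag beta">` element, which the rules in `custom.css` above style as a small colored badge (beta, prototype, deprecated). Below is a minimal sketch of a docstring using the role; the docstring text is illustrative, not from this commit, and in practice `tag_llm_params()` injects the tags into the LLM-argument docs automatically:

```python
# Hypothetical docstring showing the role in use; Sphinx expands :tag:`beta`
# via tag_role() into nodes.literal(..., classes=['tag', 'beta']), and the
# custom.css rules above render it as a small rounded badge.
def get_stats(self, timeout: float = 2) -> list:
    """Get iteration statistics from the runtime. :tag:`beta`"""
    ...
```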

docs/source/helper.py

Lines changed: 22 additions & 6 deletions
@@ -286,6 +286,18 @@ def extract_all_and_eval(file_path):
     return local_vars
 
 
+def get_pydantic_methods() -> list[str]:
+    from pydantic import BaseModel
+
+    class Dummy(BaseModel):
+        pass
+
+    methods = set(
+        [method for method in dir(Dummy) if not method.startswith('_')])
+    methods.discard("__init__")
+    return list(methods)
+
+
 def generate_llmapi():
     root_dir = Path(__file__).parent.parent.parent.resolve()
 
@@ -301,14 +313,18 @@ def generate_llmapi():
     for cls_name in public_classes_names:
         cls_name = cls_name.strip()
         options = [
-            " :members:", " :undoc-members:", " :show-inheritance:"
+            " :members:",
+            " :undoc-members:",
+            " :show-inheritance:",
+            " :special-members: __init__",
+            " :member-order: groupwise",
         ]
 
-        if cls_name != 'LLM':  # Conditionally add :special-members: __init__
-            options.append(" :special-members: __init__")
-
-        if cls_name in ['TrtLLM', 'TorchLLM', 'LLM']:
-            options.append(" :inherited-members:")
+        options.append(" :inherited-members:")
+        if cls_name in ["TorchLlmArgs", "TrtLlmArgs"]:
+            # exclude the many methods Pydantic adds to every model
+            options.append(
+                f" :exclude-members: {','.join(get_pydantic_methods())}")
 
         content += f".. autoclass:: tensorrt_llm.llmapi.{cls_name}\n"
         content += "\n".join(options) + "\n\n"
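A quick way to see what `get_pydantic_methods()` actually excludes (a standalone check, not part of the commit): every public method that `pydantic.BaseModel` contributes to a model's `dir()` listing ends up in the generated `:exclude-members:` option, so the `TorchLlmArgs`/`TrtLlmArgs` pages show only their own fields and methods.

```python
# Standalone check (not part of the commit) of what the helper collects:
# the public methods pydantic.BaseModel contributes to every model.
from pydantic import BaseModel

class Dummy(BaseModel):
    pass

# On Pydantic v2 this prints names such as 'construct', 'copy', 'dict',
# 'json', 'model_dump', 'model_validate', ... — the members stripped
# from the generated autodoc pages.
print(sorted(m for m in dir(Dummy) if not m.startswith('_')))
```

Note that the `methods.discard("__init__")` call in the helper is purely defensive: `__init__` starts with an underscore, so the comprehension has already filtered it out.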

tensorrt_llm/bench/benchmark/throughput.py

Lines changed: 4 additions & 0 deletions
@@ -382,12 +382,16 @@ def throughput_command(
             logger.warning(
                 "Ignore extended_runtime_perf_knob_config for pytorch backend."
             )
+        if kwargs.pop("batching_type", None):
+            logger.warning("Ignore batching_type for pytorch backend.")
         llm = PyTorchLLM(**kwargs)
     elif runtime_config.backend == "_autodeploy":
         if kwargs.pop("extended_runtime_perf_knob_config", None):
             logger.warning(
                 "Ignore extended_runtime_perf_knob_config for _autodeploy backend."
             )
+        if kwargs.pop("batching_type", None):
+            logger.warning("Ignore batching_type for _autodeploy backend.")
         llm = AutoDeployLLM(**kwargs)
     else:
         llm = LLM(**kwargs)
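The new guard mirrors the existing `extended_runtime_perf_knob_config` check: pop the unsupported key out of `kwargs` so it never reaches the backend constructor, and warn only when the caller actually set it. A minimal, self-contained sketch of the pattern (names are illustrative, not from the codebase):

```python
import logging

logger = logging.getLogger(__name__)

def drop_unsupported(kwargs: dict, key: str, backend: str) -> None:
    # Pop the key unconditionally so the constructor never sees it;
    # warn only if the caller supplied a truthy value for it.
    if kwargs.pop(key, None):
        logger.warning("Ignore %s for %s backend.", key, backend)

opts = {"max_batch_size": 8, "batching_type": "STATIC"}
drop_unsupported(opts, "batching_type", "pytorch")
assert "batching_type" not in opts
```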

tensorrt_llm/llmapi/llm.py

Lines changed: 15 additions & 7 deletions
@@ -40,7 +40,7 @@
                              _xgrammar_tokenizer_info)
 # TODO[chunweiy]: move the following symbols back to utils scope, and remove the following import
 from .utils import (append_docstring, exception_handler, get_device_count,
-                    print_colored_debug)
+                    print_colored_debug, set_api_status)
 
 
 class RequestOutput(DetokenizedGenerationResultBase, GenerationResult):
@@ -212,6 +212,7 @@ def __init__(self,
         atexit.register(LLM._shutdown_wrapper, weakref.ref(self))
 
     @property
+    @set_api_status("beta")
     def llm_id(self) -> str:
         if self._llm_id is None:
             hostname = socket.gethostname()
@@ -422,6 +423,7 @@ def generate_async(
         return RequestOutput._from_generation_result(result, prompt,
                                                      self.tokenizer)
 
+    @set_api_status("beta")
     def get_stats(self, timeout: Optional[float] = 2) -> List[dict]:
         '''Get iteration statistics from the runtime.
         To collect statistics, call this function after prompts have been submitted with LLM().generate().
@@ -435,6 +437,7 @@ def get_stats(self, timeout: Optional[float] = 2) -> List[dict]:
         '''
         return self._executor.get_stats(timeout=timeout)
 
+    @set_api_status("beta")
     def get_stats_async(self, timeout: Optional[float] = 2) -> IterationResult:
         '''Get iteration statistics from the runtime.
         To collect statistics, you can call this function in an async coroutine or the /metrics endpoint (if you're using trtllm-serve)
@@ -448,6 +451,7 @@ def get_stats_async(self, timeout: Optional[float] = 2) -> IterationResult:
         '''
         return self._executor.aget_stats(timeout=timeout)
 
+    @set_api_status("beta")
     def get_kv_cache_events(self, timeout: Optional[float] = 2) -> List[dict]:
         '''Get iteration KV events from the runtime.
 
@@ -469,6 +473,7 @@ def get_kv_cache_events(self, timeout: Optional[float] = 2) -> List[dict]:
         '''
         return self._executor.get_kv_events(timeout=timeout)
 
+    @set_api_status("beta")
     def get_kv_cache_events_async(self,
                                   timeout: Optional[float] = 2
                                   ) -> IterationResult:
@@ -667,6 +672,7 @@ def tokenizer(self) -> Optional[TokenizerBase]:
     def tokenizer(self, tokenizer: TokenizerBase):
         self._tokenizer = tokenizer
 
+    @set_api_status("beta")
     def shutdown(self) -> None:
         if hasattr(self, "_executor") and self._executor is not None:
             self._executor.shutdown()
@@ -924,12 +930,17 @@ def _build_model(self):
             max_beam_width=self.args.max_beam_width,
             scheduler_config=PybindMirror.maybe_to_pybind(
                 self.args.scheduler_config),
-            batching_type=PybindMirror.maybe_to_pybind(self.args.batching_type)
-            or tllm.BatchingType.INFLIGHT,
             max_batch_size=max_batch_size,
             max_num_tokens=max_num_tokens,
             gather_generation_logits=self.args.gather_generation_logits)
 
+        if hasattr(self.args,
+                   "batching_type") and self.args.batching_type is not None:
+            self._executor_config.batching_type = PybindMirror.maybe_to_pybind(
+                self.args.batching_type)
+        else:
+            self._executor_config.batching_type = tllm.BatchingType.INFLIGHT
+
         if self.args.kv_cache_config is not None:
             self._executor_config.kv_cache_config = PybindMirror.maybe_to_pybind(
                 self.args.kv_cache_config)
@@ -957,7 +968,6 @@ def _build_model(self):
                 f"Unsupported guided decoding backend {self.args.guided_decoding_backend}"
             )
 
-        self._executor_config.normalize_log_probs = self.args.normalize_log_probs
         self._executor_config.enable_chunked_context = self.args.enable_chunked_prefill
         self._executor_config.max_beam_width = self.args.max_beam_width
         if self.args.cache_transceiver_config is not None:
@@ -1040,13 +1050,11 @@ def __init__(self,
                          revision, tokenizer_revision, **kwargs)
 
 
-_LLM_REPR = "TorchLLM"
-
 # sphinx will ignore the LLM's docstring if it is not explicitly set
 LLM.__doc__ = \
     f"""LLM class is the main class for running a LLM model.
 
-This class is an alias of {_LLM_REPR}.
+For more details about the arguments, please refer to :class:`TorchLlmArgs`.
 
 Parameters:
 """ + TORCH_LLM_DOCSTRING
