Commit 2aade46

[TRTLLM-8214][feat] Support Qwen3 tool parser (#8216)
Signed-off-by: Pengyun Lin <[email protected]>
1 parent 7411839 commit 2aade46

16 files changed: +1405 −19 lines

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -78,3 +78,4 @@ nvidia-cutlass-dsl==4.2.1; python_version >= "3.10"
 numba-cuda>=0.19.0 # WAR for nvbugs/5501820
 plotly
 numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing
+partial_json_parser
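
The new dependency presumably serves the tool parser's streaming path, where function-call arguments arrive as incomplete JSON prefixes. A minimal sketch of the library's behavior (the printed value is illustrative):

# partial_json_parser recovers a best-effort object from an incomplete JSON
# prefix, which is the shape of streamed tool-call argument deltas.
from partial_json_parser import loads

print(loads('{"city": "Par'))  # e.g. {'city': 'Par'}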

tensorrt_llm/commands/serve.py

Lines changed: 13 additions & 3 deletions
@@ -33,6 +33,7 @@
 from tensorrt_llm.llmapi.reasoning_parser import ReasoningParserFactory
 from tensorrt_llm.logger import logger, severity_map
 from tensorrt_llm.serve import OpenAIDisaggServer, OpenAIServer
+from tensorrt_llm.serve.tool_parser import ToolParserFactory
 
 # Global variable to store the Popen object of the child process
 _child_p_global: Optional[subprocess.Popen] = None
@@ -150,6 +151,7 @@ def launch_server(
     host: str,
     port: int,
     llm_args: dict,
+    tool_parser: Optional[str] = None,
     metadata_server_cfg: Optional[MetadataServerConfig] = None,
     server_role: Optional[ServerRole] = None,
     disagg_cluster_config: Optional[DisaggClusterConfig] = None,
@@ -173,6 +175,7 @@ def launch_server(
 
     server = OpenAIServer(llm=llm,
                           model=model,
+                          tool_parser=tool_parser,
                           server_role=server_role,
                           metadata_server_cfg=metadata_server_cfg,
                           disagg_cluster_config=disagg_cluster_config,
@@ -311,6 +314,12 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
     default=None,
     help="[Experimental] Specify the parser for reasoning models.",
 )
+@click.option(
+    "--tool_parser",
+    type=click.Choice(ToolParserFactory.parsers.keys()),
+    default=None,
+    help="[Experimental] Specify the parser for tool models.",
+)
 @click.option("--metadata_server_config_file",
               type=str,
               default=None,
@@ -352,7 +361,8 @@ def serve(
     gpus_per_node: Optional[int], kv_cache_free_gpu_memory_fraction: float,
     num_postprocess_workers: int, trust_remote_code: bool,
     extra_llm_api_options: Optional[str], reasoning_parser: Optional[str],
-    metadata_server_config_file: Optional[str], server_role: Optional[str],
+    tool_parser: Optional[str], metadata_server_config_file: Optional[str],
+    server_role: Optional[str],
     fail_fast_on_attention_window_too_large: bool,
     otlp_traces_endpoint: Optional[str], enable_chunked_prefill: bool,
     disagg_cluster_uri: Optional[str], media_io_kwargs: Optional[str]):
@@ -423,8 +433,8 @@ def serve(
 
     multimodal_server_config = MultimodalServerConfig(
         media_io_kwargs=parsed_media_io_kwargs)
-    launch_server(host, port, llm_args, metadata_server_cfg, server_role,
-                  disagg_cluster_config, multimodal_server_config)
+    launch_server(host, port, llm_args, tool_parser, metadata_server_cfg,
+                  server_role, disagg_cluster_config, multimodal_server_config)
 
 
 @click.command("mm_embedding_serve")
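
With this change, trtllm-serve accepts an experimental --tool_parser flag whose valid values are the keys registered on the factory. A quick way to list the available choices (a sketch; assumes tensorrt_llm with this commit is importable):

from tensorrt_llm.serve.tool_parser import ToolParserFactory

# The same registry backs click.Choice(...) for the new --tool_parser flag.
print(list(ToolParserFactory.parsers.keys()))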

tensorrt_llm/serve/chat_utils.py

Lines changed: 9 additions & 0 deletions
@@ -1,3 +1,4 @@
+import uuid
 from functools import partial
 from typing import (Any, Callable, Coroutine, Dict, Iterable, List, Literal,
                     Optional, Tuple, TypeAlias, TypedDict, Union, cast)
@@ -220,3 +221,11 @@ def check_multiple_response(n: int, backend: Optional[str]):
     if n > 1 and backend == "pytorch":
         raise ValueError(
             "Multiple response is not supported in PyTorch workflow")
+
+
+def make_tool_call_id(id_type: str = "random", func_name=None, idx=None):
+    if id_type == "kimi_k2":
+        return f"functions.{func_name}:{idx}"
+    else:
+        # by default return random
+        return f"chatcmpl-tool-{uuid.uuid4().hex}"

tensorrt_llm/serve/openai_server.py

Lines changed: 3 additions & 0 deletions
@@ -78,12 +78,14 @@ class OpenAIServer:
     def __init__(self,
                  llm: Union[LLM, MultimodalEncoder],
                  model: str,
+                 tool_parser: Optional[str],
                  server_role: Optional[ServerRole],
                  metadata_server_cfg: MetadataServerConfig,
                  disagg_cluster_config: Optional[DisaggClusterConfig] = None,
                  multimodal_server_config: Optional[MultimodalServerConfig] = None):
         self.llm = llm
         self.tokenizer = llm.tokenizer
+        self.tool_parser = tool_parser
         self.metadata_server = create_metadata_server(metadata_server_cfg)
         self.disagg_cluster_config = disagg_cluster_config
         self.multimodal_server_config = multimodal_server_config
@@ -532,6 +534,7 @@ async def create_chat_response(
             prompt["multi_modal_data"] = mm_data
 
         postproc_args.reasoning_parser = self.llm.args.reasoning_parser
+        postproc_args.tool_parser = self.tool_parser
         if conversation and conversation[-1].get(
                 "content") and conversation[-1].get("role") == get_role():
             postproc_args.last_message_content = conversation[-1]["content"]
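
End to end, this wiring lets a standard OpenAI-style request that carries tools come back with parsed tool_calls. A hedged client-side sketch (endpoint, model name, and tool schema are illustrative, not part of this commit):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
resp = client.chat.completions.create(
    model="Qwen3",  # placeholder: whatever model the server was launched with
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=[{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
            },
        },
    }],
)
choice = resp.choices[0]
# When the server-side parser detects a call, finish_reason becomes
# "tool_calls" and the parsed call appears in message.tool_calls
# (per the postprocessing changes below).
print(choice.finish_reason, choice.message.tool_calls)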

tensorrt_llm/serve/postprocess_handlers.py

Lines changed: 97 additions & 15 deletions
@@ -10,6 +10,7 @@
                                        ReasoningParserFactory)
 from ..llmapi.tokenizer import TransformersTokenizer
 # yapf: disable
+from .chat_utils import make_tool_call_id
 from .harmony_adapter import (handle_non_streaming_response,
                               handle_streaming_response)
 from .openai_protocol import (ChatCompletionLogProbs,
@@ -23,18 +24,22 @@
                               CompletionRequest, CompletionResponse,
                               CompletionResponseChoice,
                               CompletionResponseStreamChoice,
-                              CompletionStreamResponse, DeltaMessage,
-                              FunctionCall, PromptTokensDetails, StreamOptions,
-                              ToolCall, UsageInfo, to_disaggregated_params)
+                              CompletionStreamResponse, DeltaFunctionCall,
+                              DeltaMessage, DeltaToolCall, FunctionCall,
+                              PromptTokensDetails, StreamOptions, ToolCall,
+                              UsageInfo, to_disaggregated_params)
+from .tool_parser.base_tool_parser import BaseToolParser
+from .tool_parser.core_types import ToolCallItem
+from .tool_parser.tool_parser_factory import ToolParserFactory
 
 # yapf: enable
 
 
 @dataclass(kw_only=True)
 class ChatPostprocArgs(PostprocArgs):
     echo: bool = False
-    role: str = None
-    model: str = None
+    role: str
+    model: str
     num_choices: int = 1
     tools: Optional[List[ChatCompletionToolsParam]] = None
     tool_choice: Optional[Union[Literal["none"],
@@ -44,8 +49,11 @@ class ChatPostprocArgs(PostprocArgs):
     stream_options: Optional[StreamOptions] = None
     last_message_content: Optional[str] = None
     reasoning_parser: Optional[str] = None
+    tool_parser: Optional[str] = None
     reasoning_parser_dict: dict[int, BaseReasoningParser] = field(
         default_factory=dict)
+    tool_parser_dict: dict[int, BaseToolParser] = field(default_factory=dict)
+    has_tool_call: dict[int, bool] = field(default_factory=dict)
 
     @classmethod
     def from_request(cls, request: ChatCompletionRequest):
@@ -116,6 +124,31 @@ def apply_reasoning_parser(args: ChatPostprocArgs, output_index: int, text: str,
     return content, reasoning_content
 
 
+def apply_tool_parser(args: ChatPostprocArgs, output_index: int, text: str,
+                      streaming: bool) -> Tuple[str, List[ToolCallItem]]:
+    tool_parser = None
+    tools = args.tools
+    if args.tool_parser is not None and tools is not None:
+        if output_index not in args.tool_parser_dict:
+            args.tool_parser_dict[
+                output_index] = ToolParserFactory.create_tool_parser(
+                    args.tool_parser)
+        tool_parser = args.tool_parser_dict[output_index]
+
+    if tool_parser is not None and tools is not None:
+        if not streaming:
+            result = tool_parser.detect_and_parse(text, tools)
+        else:
+            result = tool_parser.parse_streaming_increment(text, tools)
+        normal_text, calls = result.normal_text, result.calls
+        if result.calls:
+            args.has_tool_call[output_index] = True
+    else:
+        normal_text, calls = text, []
+
+    return normal_text, calls
+
+
 @nvtx_range_debug("chat_stream_post_processor")
 def chat_stream_post_processor(rsp: GenerationResultBase,
                                args: ChatPostprocArgs) -> List[str]:
@@ -176,27 +209,63 @@ def yield_first_chat(num_tokens: int,
         if args.tool_choice and type(
                 args.tool_choice) is ChatCompletionNamedToolChoiceParam:
             delta_message = DeltaMessage(tool_calls=[
-                ToolCall(function=FunctionCall(
-                    name=args.tool_choice.function.name, arguments=delta_text))
-            ])
+                DeltaToolCall(
+                    function=DeltaFunctionCall(
+                        name=args.tool_choice.function.name,
+                        arguments=delta_text),
+                    index=i,
+                ),
+            ], )
         else:
-            delta_message = DeltaMessage(content=delta_text,
-                                         reasoning_content=reasoning_delta_text)
+            delta_text, calls = apply_tool_parser(args, i, delta_text, True)
+            tool_calls = []
+            for call_item in calls:
+                # Tool call ID should be generated only once per tool call
+                if call_item.name:
+                    # First chunk: include ID and function name
+                    tool_call_id = make_tool_call_id()
+                    function_name = call_item.name
+                else:
+                    # Subsequent chunks: null ID and name for argument deltas
+                    tool_call_id = None
+                    function_name = None
+
+                tool_calls.append(
+                    DeltaToolCall(
+                        id=tool_call_id,
+                        index=call_item.tool_index,
+                        function=DeltaFunctionCall(
+                            name=function_name,
+                            arguments=call_item.parameters,
+                        ),
+                    ))
+            if tool_calls or delta_text or reasoning_delta_text or output.finish_reason:
+                delta_message = DeltaMessage(
+                    content=delta_text,
+                    reasoning_content=reasoning_delta_text,
+                    tool_calls=tool_calls if tool_calls else None)
+            else:
+                continue
 
         choice = ChatCompletionResponseStreamChoice(
             index=i,
             delta=delta_message,
-            finish_reason=None,
             avg_decoded_tokens_per_iter=getattr(rsp,
                                                 'avg_decoded_tokens_per_iter',
-                                                None))
+                                                None),
+            stop_reason=output.stop_reason,
+        )
         if args.return_logprobs:
             logprobs = output.logprobs_diff
             token_ids = output.token_ids_diff
             choice.logprobs = create_logprobs(token_ids, args.tokenizer,
                                               logprobs, args.top_logprobs)
         if output.finish_reason is not None:
-            choice.finish_reason = output.finish_reason
+            if output.finish_reason == "stop" and args.has_tool_call.get(
+                    i, False):
+                choice.finish_reason = "tool_calls"
+            else:
+                choice.finish_reason = output.finish_reason
             choice.stop_reason = output.stop_reason
             finish_reason_sent[i] = True
         chunk = ChatCompletionStreamResponse(choices=[choice], model=args.model)
@@ -247,21 +316,34 @@ def chat_response_post_processor(
                     name=args.tool_choice.function.name, arguments=text))
             ])
         else:
+            if text is None:
+                text = ""
+            text, calls = apply_tool_parser(args, output.index, text, False)
+            tool_calls = [
+                ToolCall(function=FunctionCall(name=call.name or "",
+                                               arguments=call.parameters))
+                for call in calls
+            ]
             message = ChatMessage(role=role,
                                   content=text,
-                                  reasoning_content=reasoning_text)
+                                  reasoning_content=reasoning_text,
+                                  tool_calls=tool_calls)
         disaggregated_params = to_disaggregated_params(
             output.disaggregated_params)
         choice = ChatCompletionResponseChoice(
             index=output.index,
            message=message,
-            finish_reason=output.finish_reason,
             stop_reason=output.stop_reason,
            disaggregated_params=disaggregated_params,
            avg_decoded_tokens_per_iter=getattr(rsp,
                                                'avg_decoded_tokens_per_iter',
                                                None),
        )
+        if output.finish_reason == "stop" and args.has_tool_call.get(
+                output.index, False):
+            choice.finish_reason = "tool_calls"
+        else:
+            choice.finish_reason = output.finish_reason
 
         if args.return_logprobs:
             choice.logprobs = create_logprobs(output.token_ids, args.tokenizer,
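
For clarity, here is the parser lifecycle that apply_tool_parser drives, as a standalone sketch. The "qwen3" registry key and the input variables are assumptions; the methods and result fields (normal_text, calls, and ToolCallItem's tool_index/name/parameters) are the ones this commit uses:

from tensorrt_llm.serve.tool_parser import ToolParserFactory

parser = ToolParserFactory.create_tool_parser("qwen3")  # assumed registry key

# Non-streaming: one-shot detection over the full generated text.
full_text = "<tool_call>...</tool_call>"  # placeholder model output
tools = []  # placeholder: ChatCompletionToolsParam list from the request
result = parser.detect_and_parse(full_text, tools)
print(result.normal_text)
for call in result.calls:  # ToolCallItem
    print(call.tool_index, call.name, call.parameters)

# Streaming: feed text deltas. The first increment of a call carries .name;
# later increments carry only argument fragments, which is why the stream
# post-processor mints a tool call ID only when call_item.name is set.
result = parser.parse_streaming_increment("<tool_call>", tools)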
tensorrt_llm/serve/tool_parser/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+from .tool_parser_factory import ToolParserFactory
+
+__all__ = ["ToolParserFactory"]
