Commit 447e521

Better conform to TRT-LLM style guidelines and add an LLM API argument to explicitly enable the async worker for testing purposes
Signed-off-by: Dan Hansen <[email protected]>
1 parent 6e31c1c commit 447e521

File tree

6 files changed: +34 -24 lines changed


tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 6 additions & 5 deletions
@@ -815,7 +815,7 @@ def create_py_executor_instance(
 def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
                               max_batch_size: int,
                               speculative_config: SpeculativeConfig,
-                              max_beam_width: int, use_async_worker: bool):
+                              max_beam_width: int, enable_async_worker: bool):
     max_num_sequences = max_batch_size * mapping.pp_size
     max_draft_len = (0 if speculative_config is None else
                      speculative_config.max_draft_len)
@@ -828,7 +828,7 @@ def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
         max_total_draft_tokens=max_total_draft_tokens,
         max_num_sequences=max_num_sequences,
         max_beam_width=max_beam_width,
-        use_async_worker=use_async_worker,
+        enable_async_worker=enable_async_worker,
     )


@@ -839,15 +839,16 @@ def instantiate_sampler(engine: PyTorchModelEngine,
                         speculative_config: SpeculativeConfig,
                         decoding_config: trtllm.DecodingConfig,
                         kv_cache_config: KvCacheConfig):
-    use_async_worker = confidential_compute_enabled()
+    enable_async_worker = (confidential_compute_enabled() or
+                           pytorch_backend_config.sampler_enable_async_worker)

     sampler_args = create_torch_sampler_args(
         mapping,
         max_seq_len=engine.max_seq_len,
         max_batch_size=max_batch_size,
         speculative_config=speculative_config,
         max_beam_width=max_beam_width,
-        use_async_worker=use_async_worker)
+        enable_async_worker=enable_async_worker)
     decoding_mode = get_decoding_mode(decoding_config=decoding_config,
                                       max_beam_width=max_beam_width)
     if mapping.cp_config.get('cp_type') == CpType.STAR:
@@ -874,7 +875,7 @@ def instantiate_sampler(engine: PyTorchModelEngine,
             max_beam_width=max_beam_width,
             decoding_config=decoding_config,
             kv_cache_config=kv_cache_config,
-            use_async_worker=use_async_worker)
+            enable_async_worker=enable_async_worker)
     if not engine.model.model_config.is_generation:
         # NOTE: choose sampler based on model type
         return EarlyStopSampler()

tensorrt_llm/_torch/pyexecutor/config.py

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ class PyTorchConfig:
     The type of sampler to use. Options are TRTLLMSampler, TorchSampler or auto.
     Defaults to auto, which will use TorchSampler unless BeamSearch is requested.
     """
+    sampler_enable_async_worker: bool = False

     kv_cache_dtype: str = "auto"
     mamba_ssm_cache_dtype: str = "auto"

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 2 additions & 2 deletions
@@ -52,7 +52,7 @@
                           LlmResponse, get_draft_token_length)
 from .model_engine import ModelEngine
 from .resource_manager import ResourceManager
-from .sampler import Sampler, SampleState, SampleStateTensors
+from .sampler import AsyncWorkerMixin, Sampler, SampleState, SampleStateTensors
 from .scheduler import RequestScheduler, ScheduledRequests

 # Environment variable to specify iteration ranges for profiling start/stop.
@@ -479,7 +479,7 @@ def shutdown(self):
         if self.draft_model_engine is not None:
             del self.draft_model_engine
         # Stop the sampler's async worker, if it was used
-        if hasattr(self.sampler, 'async_worker_stop'):
+        if isinstance(self.sampler, AsyncWorkerMixin):
             self.sampler.async_worker_stop()

     def can_enqueue_requests(self) -> bool:

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 2 additions & 1 deletion
@@ -38,6 +38,7 @@
 from .kv_cache_connector import KvCacheConnectorManager
 from .model_engine import PyTorchModelEngine
 from .py_executor import PyExecutor
+from .sampler import AsyncWorkerMixin


 class _ExecutorCreationStage(enum.Enum):
@@ -181,7 +182,7 @@ def update_sampler_max_seq_len(max_seq_len, sampler):


 def maybe_start_sampler_async_worker(sampler):
-    if hasattr(sampler, 'async_worker_start') and sampler.use_async_worker:
+    if isinstance(sampler, AsyncWorkerMixin) and sampler.enable_async_worker:
         sampler.async_worker_start()

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 17 additions & 16 deletions
@@ -17,7 +17,7 @@
 from abc import ABC, abstractmethod
 from collections import defaultdict
 from collections.abc import Iterable
-from concurrent.futures import Future, ThreadPoolExecutor
+from concurrent import futures
 from dataclasses import dataclass
 from itertools import repeat
 from typing import Any, Callable, List, Optional, TypeVar, cast
@@ -82,7 +82,7 @@

 @dataclass(kw_only=True)
 class SampleStateTensors:
-    new_tokens: torch.Tensor | Future[torch.Tensor]
+    new_tokens: torch.Tensor | futures.Future[torch.Tensor]
     log_probs: torch.Tensor | None = None

     def values(self):
@@ -574,12 +574,12 @@ class AsyncWorkerMixin:
     def _async_worker_active(self) -> bool:
         return self._async_worker is not None

-    def _async_worker_init(self, use_async_worker: bool):
-        self.use_async_worker = use_async_worker
+    def _async_worker_init(self, enable_async_worker: bool):
+        self.enable_async_worker = enable_async_worker
         self._async_worker = None

     def async_worker_start(self):
-        assert self.use_async_worker
+        assert self.enable_async_worker
         assert not self._async_worker_active()

         def _async_worker_initializer(device_id):
@@ -590,7 +590,7 @@ def _async_worker_initializer(device_id):
             # blocking copies from gating subsequent async work
             torch.cuda.set_stream(torch.cuda.Stream())

-        self._async_worker = ThreadPoolExecutor(
+        self._async_worker = futures.ThreadPoolExecutor(
             max_workers=1,
             initializer=_async_worker_initializer,
             initargs=(torch.cuda.current_device(),),
@@ -653,7 +653,7 @@ class Args:
         max_num_sequences: int
         max_beam_width: int
         max_total_draft_tokens: int
-        use_async_worker: Optional[bool] = False
+        enable_async_worker: Optional[bool] = False

     def __init__(self, args: Args):
         self.max_seq_len = args.max_seq_len
@@ -674,7 +674,7 @@ def __init__(self, args: Args):
         self._global_seed = 42
         self._generator = None

-        self._async_worker_init(args.use_async_worker)
+        self._async_worker_init(args.enable_async_worker)

     def get_generator(self, device: torch.device) -> torch.Generator:
         """Get a deterministic generator for the specified device.
@@ -755,9 +755,8 @@ def handle_logprobs(
         if request.py_return_log_probs:
             if self._async_worker_active():
                 # These should be futures if we used the async worker
-                assert isinstance(request.py_topk_logprobs_values, Future) and isinstance(
-                    request.py_topk_logprobs_vals, Future
-                )
+                assert isinstance(request.py_topk_logprobs_values, futures.Future)
+                assert isinstance(request.py_topk_logprobs_vals, futures.Future)
                 topk_log_probs_vals = request.py_topk_logprobs_vals.result()
                 topk_log_probs_indices = request.py_topk_logprobs_indices.result()
             else:
@@ -1079,7 +1078,7 @@ def update_requests(
         assert state.host is not None

         if self._async_worker_active():
-            assert isinstance(state.host.new_tokens, Future)
+            assert isinstance(state.host.new_tokens, futures.Future)
             new_tokens = state.host.new_tokens.result()
         else:
             new_tokens = state.host.new_tokens
@@ -1686,7 +1685,9 @@ class SampleStateTensorsHostTRTLLM(SampleStateTensors):
 class SampleStateTRTLLM(SampleState):
     finalize_events: dict[str, CudaEvent] | None = None
     """`Optional` to accommodate `_forward_step_inter_pp` which creates a `SampleState` without `finalize_events`"""
-    host: Optional[SampleStateTensorsHostTRTLLM | Future[SampleStateTensorsHostTRTLLM]] = None
+    host: Optional[SampleStateTensorsHostTRTLLM | futures.Future[SampleStateTensorsHostTRTLLM]] = (
+        None
+    )


 class TRTLLMSampler(Sampler, AsyncWorkerMixin):
@@ -1709,7 +1710,7 @@ def __init__(
         max_beam_width: int,
         decoding_config: Optional[DecodingConfig] = None,
         kv_cache_config: Optional[KvCacheConfig] = None,
-        use_async_worker: Optional[bool] = False,
+        enable_async_worker: Optional[bool] = False,
     ):
         vocab_size = model.config.vocab_size
         num_hidden_layers = model.config.num_hidden_layers
@@ -1760,7 +1761,7 @@ def __init__(
         self._initialize_store()
         self._instantiate_algorithms()

-        self._async_worker_init(use_async_worker)
+        self._async_worker_init(enable_async_worker)

     def _initialize_store(self):
         torch_stream = torch.cuda.current_stream().cuda_stream
@@ -1983,7 +1984,7 @@ def update_requests(

         if self._async_worker_active():
             # Wait for and "unpack" the host tensors
-            assert isinstance(state.host, Future)
+            assert isinstance(state.host, futures.Future)
             state.host = state.host.result()

         beam_width = self.beam_width(state.scheduled_requests.all_requests())
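
For readers following the futures-based flow above, here is a minimal, self-contained sketch (not the TRT-LLM implementation) of the pattern the AsyncWorkerMixin changes imply: a single-thread executor started on demand, work submitted to it off the critical path, and consumers resolving concurrent.futures.Future results. The AsyncCopyWorker class and its copy_to_host helper are illustrative stand-ins; the real mixin additionally pins the CUDA device and installs a dedicated copy stream in the worker's initializer, as shown in the diff.

# Simplified sketch only; mirrors the interface names from the diff, not its internals.
from concurrent import futures


class AsyncCopyWorker:
    """Toy stand-in for AsyncWorkerMixin: one worker thread, results exposed as futures."""

    def __init__(self, enable_async_worker: bool):
        self.enable_async_worker = enable_async_worker
        self._async_worker = None

    def _async_worker_active(self) -> bool:
        return self._async_worker is not None

    def async_worker_start(self):
        assert self.enable_async_worker and not self._async_worker_active()
        # A real implementation would pass an initializer that sets the CUDA
        # device and a dedicated copy stream, as the diff does.
        self._async_worker = futures.ThreadPoolExecutor(max_workers=1)

    def async_worker_stop(self):
        if self._async_worker_active():
            self._async_worker.shutdown(wait=True)
            self._async_worker = None

    def copy_to_host(self, data):
        # Placeholder for a device-to-host copy; returns a Future when the worker is active.
        if self._async_worker_active():
            return self._async_worker.submit(list, data)
        return list(data)


worker = AsyncCopyWorker(enable_async_worker=True)
worker.async_worker_start()
result = worker.copy_to_host(range(4))
print(result.result() if isinstance(result, futures.Future) else result)  # [0, 1, 2, 3]
worker.async_worker_stop()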

tensorrt_llm/llmapi/llm_args.py

Lines changed: 6 additions & 0 deletions
@@ -2449,6 +2449,11 @@ class TorchLlmArgs(BaseLlmArgs):
         "The type of sampler to use. Options are TRTLLMSampler, TorchSampler or auto. Defaults to auto, which will use TorchSampler unless BeamSearch is requested.",
         status="beta")

+    sampler_enable_async_worker: bool = Field(
+        default=False,
+        description="Enable the async worker in the sampler for D->H copies",
+        status="beta")
+
     enable_iter_perf_stats: bool = Field(
         default=False,
         description="Enable iteration performance statistics.",
@@ -2822,6 +2827,7 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
             use_low_precision_moe_combine=self.moe_config.
             use_low_precision_moe_combine,
             sampler_type=self.sampler_type,
+            sampler_enable_async_worker=self.sampler_enable_async_worker,
             kv_cache_dtype=self.kv_cache_config.dtype,
             mamba_ssm_cache_dtype=self.kv_cache_config.mamba_ssm_cache_dtype,
             enable_iter_perf_stats=self.enable_iter_perf_stats,
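
As a usage note, a hedged sketch of how the new flag could be exercised through the LLM API for testing: this assumes sampler_enable_async_worker is passed as a constructor keyword like other TorchLlmArgs fields, and the model path below is a placeholder.

# Illustrative only: assumes the TorchLlmArgs field is accepted as an LLM kwarg.
from tensorrt_llm import LLM

llm = LLM(
    model="/path/to/model",            # placeholder checkpoint path
    sampler_enable_async_worker=True,  # explicitly enable the sampler's async D->H worker
)
outputs = llm.generate(["Hello, world"])
print(outputs[0].outputs[0].text)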
