
Commit 6e31c1c

Refactor to pull all common code under the AsyncWorkerMixin
Signed-off-by: Dan Hansen <[email protected]>
1 parent c80b6ca

4 files changed: 194 additions, 224 deletions
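
The AsyncWorkerMixin named in the commit message is not itself visible in this excerpt; the call sites below only pin down its surface: a use_async_worker flag plus async_worker_start() and async_worker_stop() lifecycle hooks. As a minimal sketch of how such a mixin might be shaped, assuming a queue-backed worker thread, and with every name beyond those three being hypothetical:

import queue
import threading


class AsyncWorkerMixin:
    """Queue-backed worker thread for a sampler's host-side work (sketch)."""

    def _async_worker_init(self, use_async_worker):
        # Hypothetical initializer; only use_async_worker, async_worker_start
        # and async_worker_stop are visible at this commit's call sites.
        self.use_async_worker = use_async_worker
        self._work_queue = queue.Queue()
        self._worker = None

    def async_worker_start(self):
        # Started once by py_executor_creator, after the executor is built.
        self._worker = threading.Thread(target=self._worker_loop, daemon=True)
        self._worker.start()

    def async_worker_stop(self):
        # Called from PyExecutor.shutdown(); None is the shutdown sentinel.
        if self._worker is not None:
            self._work_queue.put(None)
            self._worker.join()
            self._worker = None

    def _async_worker_submit(self, fn, *args):
        # Run fn on the worker when enabled, inline otherwise.
        if self.use_async_worker and self._worker is not None:
            self._work_queue.put((fn, args))
        else:
            fn(*args)

    def _worker_loop(self):
        # Drain submitted work until the sentinel arrives.
        while True:
            item = self._work_queue.get()
            if item is None:
                break
            fn, args = item
            fn(*args)

The duck-typed hasattr checks in the diffs below mean a sampler that has no host-side work to offload simply omits the mixin.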

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 7 additions & 8 deletions

@@ -9,8 +9,8 @@
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_utils import \
     MODEL_CLASS_VISION_ENCODER_MAPPING
-from tensorrt_llm._utils import str_dtype_to_binding, torch_dtype_to_str
-from tensorrt_llm._utils import str_dtype_to_binding, torch_dtype_to_str, confidential_compute_enabled
+from tensorrt_llm._utils import (confidential_compute_enabled,
+                                 str_dtype_to_binding, torch_dtype_to_str)
 from tensorrt_llm.bindings.executor import DecodingMode
 from tensorrt_llm.llmapi.llm_args import (CacheTransceiverConfig,
                                           EagleDecodingConfig, KvCacheConfig,

@@ -815,8 +815,7 @@ def create_py_executor_instance(
 def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
                               max_batch_size: int,
                               speculative_config: SpeculativeConfig,
-                              max_beam_width: int,
-                              use_host_copy_thread: bool):
+                              max_beam_width: int, use_async_worker: bool):
     max_num_sequences = max_batch_size * mapping.pp_size
     max_draft_len = (0 if speculative_config is None else
                      speculative_config.max_draft_len)

@@ -829,7 +828,7 @@ def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
         max_total_draft_tokens=max_total_draft_tokens,
         max_num_sequences=max_num_sequences,
         max_beam_width=max_beam_width,
-        use_host_copy_thread=use_host_copy_thread,
+        use_async_worker=use_async_worker,
     )


@@ -840,15 +839,15 @@ def instantiate_sampler(engine: PyTorchModelEngine,
                         speculative_config: SpeculativeConfig,
                         decoding_config: trtllm.DecodingConfig,
                         kv_cache_config: KvCacheConfig):
-    use_host_copy_thread = confidential_compute_enabled()
+    use_async_worker = confidential_compute_enabled()

     sampler_args = create_torch_sampler_args(
         mapping,
         max_seq_len=engine.max_seq_len,
         max_batch_size=max_batch_size,
         speculative_config=speculative_config,
         max_beam_width=max_beam_width,
-        use_host_copy_thread=use_host_copy_thread)
+        use_async_worker=use_async_worker)
     decoding_mode = get_decoding_mode(decoding_config=decoding_config,
                                       max_beam_width=max_beam_width)
     if mapping.cp_config.get('cp_type') == CpType.STAR:

@@ -875,7 +874,7 @@ def instantiate_sampler(engine: PyTorchModelEngine,
         max_beam_width=max_beam_width,
         decoding_config=decoding_config,
         kv_cache_config=kv_cache_config,
-        use_host_copy_thread=use_host_copy_thread)
+        use_async_worker=use_async_worker)
     if not engine.model.model_config.is_generation:
         # NOTE: choose sampler based on model type
         return EarlyStopSampler()
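
The arithmetic at the top of create_torch_sampler_args is easy to check by hand; with toy values (not taken from this commit):

# Toy values, mirroring the derivation in the hunk above (assumption: one
# batch can be in flight per pipeline-parallel stage, hence the multiply).
max_batch_size, pp_size = 8, 2
max_num_sequences = max_batch_size * pp_size    # 16
speculative_config = None
max_draft_len = (0 if speculative_config is None else
                 speculative_config.max_draft_len)  # 0 without speculation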

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 3 additions & 3 deletions

@@ -478,9 +478,9 @@ def shutdown(self):
         del self.model_engine
         if self.draft_model_engine is not None:
             del self.draft_model_engine
-        # Stop the sampler's host copy thread, if it was used
-        if hasattr(self.sampler, 'stop_host_copy_thread'):
-            self.sampler.stop_host_copy_thread()
+        # Stop the sampler's async worker, if it was used
+        if hasattr(self.sampler, 'async_worker_stop'):
+            self.sampler.async_worker_stop()

     def can_enqueue_requests(self) -> bool:
         """

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 7 additions & 5 deletions

@@ -179,9 +179,11 @@ def update_sampler_max_seq_len(max_seq_len, sampler):
     assert hasattr(sampler, "max_seq_len")
     sampler.max_seq_len = max_seq_len

-def maybe_start_sampler_host_copy_thread(sampler):
-    if hasattr(sampler, 'start_host_copy_thread') and sampler.use_host_copy_thread:
-        sampler.start_host_copy_thread()
+
+def maybe_start_sampler_async_worker(sampler):
+    if hasattr(sampler, 'async_worker_start') and sampler.use_async_worker:
+        sampler.async_worker_start()
+

 def get_guided_decoding_config(guided_decoding_backend: str,
                                tokenizer: Optional[TokenizerBase] = None):

@@ -674,8 +676,8 @@ def drafting_loop_wrapper(model):
     _adjust_torch_mem_fraction(pytorch_backend_config)

     # Now that we've got the instance of py_executor that we're going to keep,
-    # start the sampler's host copy thread, if needed
-    maybe_start_sampler_host_copy_thread(sampler)
+    # start the sampler's async worker, if needed
+    maybe_start_sampler_async_worker(sampler)

     py_executor.start_worker()
     return py_executor
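
The comment in the second hunk spells out the ordering: the worker starts only once the py_executor instance is the one being kept, so a failure earlier in construction cannot leak a running thread. A toy run of that sequence (stand-in classes, not the real constructors):

class ToySampler:
    # Stand-in exposing just the two attributes the helper checks.
    use_async_worker = True
    started = False

    def async_worker_start(self):
        self.started = True


def maybe_start_sampler_async_worker(sampler):
    # Same guard as the helper added in the first hunk above.
    if hasattr(sampler, 'async_worker_start') and sampler.use_async_worker:
        sampler.async_worker_start()


sampler = ToySampler()
# ... py_executor would be constructed here; if that raises, no thread started ...
maybe_start_sampler_async_worker(sampler)
assert sampler.started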
