Commit 0664af4

[https:/nvbugs/5508301][feat] Move D->H copies to a worker thread when confidential compute is active
Signed-off-by: Dan Hansen <[email protected]>
1 parent 2420918 commit 0664af4

5 files changed: +223 -46 lines changed

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 10 additions & 3 deletions
@@ -10,6 +10,7 @@
 from tensorrt_llm._torch.models.modeling_utils import \
     MODEL_CLASS_VISION_ENCODER_MAPPING
 from tensorrt_llm._utils import str_dtype_to_binding, torch_dtype_to_str
+from tensorrt_llm._utils import str_dtype_to_binding, torch_dtype_to_str, confidential_compute_enabled
 from tensorrt_llm.bindings.executor import DecodingMode
 from tensorrt_llm.llmapi.llm_args import (CacheTransceiverConfig,
                                           EagleDecodingConfig, KvCacheConfig,
@@ -816,7 +817,8 @@ def create_py_executor_instance(
 def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
                               max_batch_size: int,
                               speculative_config: SpeculativeConfig,
-                              max_beam_width: int):
+                              max_beam_width: int,
+                              use_host_copy_thread: bool):
     max_num_sequences = max_batch_size * mapping.pp_size
     max_draft_len = (0 if speculative_config is None else
                      speculative_config.max_draft_len)
@@ -829,6 +831,7 @@ def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
         max_total_draft_tokens=max_total_draft_tokens,
         max_num_sequences=max_num_sequences,
         max_beam_width=max_beam_width,
+        use_host_copy_thread=use_host_copy_thread,
     )
 
 
@@ -839,12 +842,15 @@ def instantiate_sampler(engine: PyTorchModelEngine,
                         speculative_config: SpeculativeConfig,
                         decoding_config: trtllm.DecodingConfig,
                         kv_cache_config: KvCacheConfig):
+    use_host_copy_thread = confidential_compute_enabled()
+
     sampler_args = create_torch_sampler_args(
         mapping,
         max_seq_len=engine.max_seq_len,
         max_batch_size=max_batch_size,
         speculative_config=speculative_config,
-        max_beam_width=max_beam_width)
+        max_beam_width=max_beam_width,
+        use_host_copy_thread=use_host_copy_thread)
     decoding_mode = get_decoding_mode(decoding_config=decoding_config,
                                       max_beam_width=max_beam_width)
     if mapping.cp_config.get('cp_type') == CpType.STAR:
@@ -870,7 +876,8 @@ def instantiate_sampler(engine: PyTorchModelEngine,
             max_batch_size=max_batch_size,
             max_beam_width=max_beam_width,
             decoding_config=decoding_config,
-            kv_cache_config=kv_cache_config)
+            kv_cache_config=kv_cache_config,
+            use_host_copy_thread=use_host_copy_thread)
     if not engine.model.model_config.is_generation:
         # NOTE: choose sampler based on model type
         return EarlyStopSampler()
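
The sampler-side changes that consume use_host_copy_thread live in sampler.py, which is not part of this excerpt, so the worker itself is not shown. The following is a rough sketch only, assuming a queue-fed thread plus a side CUDA stream; the class name, queue discipline, and stream handling are assumptions, not the commit's code:

# Hypothetical sketch only; the commit's sampler.py changes are not shown here.
import queue
import threading
from typing import Optional

import torch


class _HostCopyWorker:
    """Drains device-to-host copies on a side thread so the main loop never blocks."""

    _SENTINEL = object()

    def __init__(self):
        self._work = queue.Queue()
        self._thread: Optional[threading.Thread] = None
        # A dedicated stream lets copies overlap with compute on the default stream.
        self._stream = torch.cuda.Stream()

    def start(self):
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def submit(self, src: torch.Tensor, dst: torch.Tensor,
               done: threading.Event):
        # Record the producer stream's position; the worker waits on this
        # event so it never copies a half-written tensor.
        ready = torch.cuda.Event()
        ready.record()
        self._work.put((ready, src, dst, done))

    def stop(self):
        self._work.put(self._SENTINEL)
        if self._thread is not None:
            self._thread.join()
            self._thread = None

    def _loop(self):
        while True:
            item = self._work.get()
            if item is self._SENTINEL:
                return
            ready, src, dst, done = item
            with torch.cuda.stream(self._stream):
                self._stream.wait_event(ready)
                # dst should be pinned host memory for a truly async copy.
                dst.copy_(src, non_blocking=True)
            # Synchronizing here blocks only this worker thread.
            self._stream.synchronize()
            done.set()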

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 3 additions & 0 deletions
@@ -475,6 +475,9 @@ def shutdown(self):
             del self.model_engine
         if self.draft_model_engine is not None:
             del self.draft_model_engine
+        # Stop the sampler's host copy thread, if it was used
+        if hasattr(self.sampler, 'stop_host_copy_thread'):
+            self.sampler.stop_host_copy_thread()
 
     def can_enqueue_requests(self) -> bool:
         """

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 7 additions & 0 deletions
@@ -179,6 +179,9 @@ def update_sampler_max_seq_len(max_seq_len, sampler):
     assert hasattr(sampler, "max_seq_len")
     sampler.max_seq_len = max_seq_len
 
+def maybe_start_sampler_host_copy_thread(sampler):
+    if hasattr(sampler, 'start_host_copy_thread') and sampler.use_host_copy_thread:
+        sampler.start_host_copy_thread()
 
 def get_guided_decoding_config(guided_decoding_backend: str,
                                tokenizer: Optional[TokenizerBase] = None):
@@ -667,5 +670,9 @@ def drafting_loop_wrapper(model):
 
     _adjust_torch_mem_fraction(pytorch_backend_config)
 
+    # Now that we've got the instance of py_executor that we're going to keep,
+    # start the sampler's host copy thread, if needed
+    maybe_start_sampler_host_copy_thread(sampler)
+
     py_executor.start_worker()
     return py_executor
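
The comment above suggests that create_py_executor can build executor instances it later discards, so the worker thread is started only on the instance that is kept. A defensive way to satisfy both hooks, assuming sampler-side code that this excerpt does not include, is to make start and stop idempotent:

# Hypothetical sampler-side hooks; the real TorchSampler changes are not
# shown in this excerpt.
class TorchSamplerSketch:
    def __init__(self, use_host_copy_thread: bool):
        self.use_host_copy_thread = use_host_copy_thread
        self._host_copy_worker = None

    def start_host_copy_thread(self):
        # Safe to call more than once; only the first call spawns the thread.
        if self.use_host_copy_thread and self._host_copy_worker is None:
            self._host_copy_worker = _HostCopyWorker()
            self._host_copy_worker.start()

    def stop_host_copy_thread(self):
        # Safe to call on a sampler whose thread was never started.
        if self._host_copy_worker is not None:
            self._host_copy_worker.stop()
            self._host_copy_worker = None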
