Skip to content

Commit abef7de

Browse files
committed
Fix some issues discovered in testing
Signed-off-by: Dan Hansen <[email protected]>
1 parent 1756418 commit abef7de

File tree

3 files changed

+14
-7
lines changed

3 files changed

+14
-7
lines changed

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,8 @@ def shutdown(self):
450450
for key in keys:
451451
del self.virtual_memory_pools[key]
452452
# Stop the sampler's async worker, if it was used
453-
if isinstance(self.sampler, AsyncWorkerMixin):
453+
if (isinstance(self.sampler, AsyncWorkerMixin)
454+
and self.async_worker_enabled()):
454455
self.sampler.async_worker_stop()
455456

456457
def can_enqueue_requests(self) -> bool:

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,8 @@ def update_sampler_max_seq_len(max_seq_len, sampler):
196196

197197

198198
def maybe_start_sampler_async_worker(sampler):
199-
if isinstance(sampler, AsyncWorkerMixin) and sampler.enable_async_worker:
199+
if (isinstance(sampler, AsyncWorkerMixin)
200+
and sampler.async_worker_enabled()):
200201
sampler.async_worker_start()
201202

202203

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -605,12 +605,15 @@ def _async_worker_active(self) -> bool:
605605
return self._async_worker is not None
606606

607607
def _async_worker_init(self, enable_async_worker: bool):
608-
self.enable_async_worker = enable_async_worker
608+
self._enable_async_worker = enable_async_worker
609609
self._async_worker = None
610610
self._async_worker_futures: list[futures.Future[any]] = []
611611

612+
def async_worker_enabled(self):
613+
return hasattr(self, "_enable_async_worker") and self._enable_async_worker
614+
612615
def async_worker_start(self):
613-
assert self.enable_async_worker
616+
assert self.async_worker_enabled()
614617
assert not self._async_worker_active()
615618

616619
def _async_worker_initializer(device_id):
@@ -628,10 +631,12 @@ def _async_worker_initializer(device_id):
628631
)
629632

630633
def async_worker_stop(self):
631-
if self._async_worker_active():
632-
self._async_worker.shutdown(wait=True)
633-
self._async_worker = None
634+
assert self.async_worker_enabled()
635+
assert self._async_worker_active()
636+
self._async_worker.shutdown(wait=True)
637+
self._async_worker = None
634638

639+
@torch.inference_mode()
635640
def _async_worker_run(self, ready: torch.cuda.Event, func, /, *args, **kwargs):
636641
# Make sure the async work takes place after all prior operations on
637642
# the primary stream. synchronize() is intentionally chosen instead of

0 commit comments

Comments (0)