@@ -144,7 +144,14 @@ def build_from_config(cls, ad_config: LlmArgs):
144144 build_and_optimize = InferenceOptimizer (factory = factory , config = ad_config .transforms )
145145
146146 # construct engine
147- return cls (build_and_optimize , seq_info , device , max_beam_width , reporting_info )
147+ return cls (
148+ build_and_optimize ,
149+ seq_info ,
150+ device ,
151+ max_beam_width ,
152+ ad_config .sampler_type ,
153+ reporting_info ,
154+ )
148155
149156 @torch .inference_mode ()
150157 def __init__ (
@@ -153,6 +160,7 @@ def __init__(
153160 seq_info : SequenceInfo ,
154161 device : DeviceLikeType ,
155162 max_beam_width : int = 1 ,
163+ sampler_type : SamplerType = SamplerType .TorchSampler ,
156164 reporting_info : ReportingInfo = ReportingInfo (),
157165 ) -> None :
158166 """Initialize the engine with model and sequence information."""
@@ -168,6 +176,7 @@ def __init__(
168176 self .llm_args .batch_wait_timeout_iters = 0
169177 self .llm_args .batch_wait_max_tokens_ratio = 0.0
170178 self .llm_args .max_num_tokens = seq_info .max_num_tokens
179+ self .sampler_type = sampler_type
171180 self .iter_counter = 0
172181 self .iter_states = {}
173182
@@ -301,10 +310,12 @@ def _compute_logits(self) -> List[torch.Tensor]:
301310 logits : torch .Tensor = self .model (** self .cache_seq_interface .named_args )[0 ]
302311
303312 # Ensure logits are float32 as TRTLLMSampler expects float32
304- if logits .dtype != torch .float32 :
305- print ("Changing logits dtype to float32" )
306- print (f"Old logits.dtype: { logits .dtype } " )
307- logits = logits .float ()
313+ # TODO(govind): Should this be put into the AD graph so it can be fused with other operations?
314+ if self .sampler_type == SamplerType .TRTLLMSampler and logits .dtype != torch .float32 :
315+ ad_logger .info (
316+ f"Logits type { logits .dtype } is not supported by TRTLLMSampler. Casting to float32."
317+ )
318+ logits = logits .to (torch .float32 )
308319
309320 # return a list of tensors
310321 return self .cache_seq_interface .info .unnest_sequences (logits )
@@ -351,6 +362,57 @@ def __init__(self, ad_config: LlmArgs):
351362 self .config .num_attention_heads = factory .num_attention_heads
352363
353364
def get_torch_dtype(ad_config: LlmArgs):
    """Resolve the torch dtype configured for the model.

    If ``ad_config.dtype`` is the sentinel string "auto", the dtype is taken
    from the model factory instead. A string dtype name is converted to its
    ``torch.dtype`` equivalent via ``str_dtype_to_torch``; anything else is
    returned unchanged.
    """
    resolved = ad_config.dtype
    # "auto" defers the dtype decision to the model factory.
    if resolved == "auto":
        resolved = ad_config.create_factory().dtype
    # The config/factory may hand back a string name rather than a torch.dtype.
    return str_dtype_to_torch(resolved) if isinstance(resolved, str) else resolved
373+
374+
def instantiate_sampler(
    ad_config: LlmArgs,
    max_num_sequences: int,
    max_draft_len: int,
    max_total_draft_tokens: int,
    dist_mapping: Mapping,
):
    """Build the sampler selected by ``ad_config.sampler_type``.

    Args:
        ad_config: AutoDeploy LLM configuration (sampler type, seq/beam limits,
            decoding and KV-cache configs).
        max_num_sequences: Maximum number of concurrent sequences.
        max_draft_len: Maximum draft length for speculative decoding.
        max_total_draft_tokens: Total draft-token budget for speculative decoding.
        dist_mapping: Distributed mapping passed through to the TRTLLM sampler.

    Returns:
        A configured ``TorchSampler`` or ``TRTLLMSampler`` instance.

    Raises:
        ValueError: If the configured sampler type is not supported.
    """
    sampler_type = ad_config.sampler_type

    if sampler_type == SamplerType.TorchSampler:
        # Search sampler with speculative-decoding support.
        return TorchSampler(
            TorchSampler.Args(
                max_seq_len=ad_config.max_seq_len,
                max_draft_len=max_draft_len,
                max_total_draft_tokens=max_total_draft_tokens,
                max_num_sequences=max_num_sequences,
                max_beam_width=ad_config.max_beam_width,
                disable_overlap_scheduler=ad_config.disable_overlap_scheduler,
            )
        )

    if sampler_type == SamplerType.TRTLLMSampler:
        return TRTLLMSampler(
            model=TRTLLMSamplerModelConfig(ad_config=ad_config),
            model_dtype=get_torch_dtype(ad_config),
            mapping=dist_mapping,
            decoding_mode=get_decoding_mode(ad_config.decoding_config, ad_config.max_beam_width),
            disable_overlap_scheduler=ad_config.disable_overlap_scheduler,
            max_seq_len=ad_config.max_seq_len,
            max_batch_size=ad_config.max_batch_size,
            max_beam_width=ad_config.max_beam_width,
            decoding_config=ad_config.decoding_config,
            kv_cache_config=ad_config.kv_cache_config,
        )

    raise ValueError(f"Sampler type {ad_config.sampler_type} is not supported.")
414+
415+
354416def create_autodeploy_executor (ad_config : LlmArgs , tokenizer : Optional [TokenizerBase ] = None ):
355417 """Create an AutoDeploy executor from the given configuration and tokenizer.
356418 The tokenizer is required for guided decoding.
@@ -447,42 +509,14 @@ def create_autodeploy_executor(ad_config: LlmArgs, tokenizer: Optional[Tokenizer
447509 )
448510 scheduler = SimpleScheduler (capacitor_scheduler , mb_scheduler )
449511
450- if ad_config .sampler_type == SamplerType .TorchSampler :
451- # search sampler with speculative decoding
452- sampler_args = TorchSampler .Args (
453- max_seq_len = ad_config .max_seq_len ,
454- max_draft_len = max_draft_len ,
455- max_total_draft_tokens = max_total_draft_tokens ,
456- max_num_sequences = max_num_sequences ,
457- max_beam_width = ad_config .max_beam_width ,
458- disable_overlap_scheduler = ad_config .disable_overlap_scheduler ,
459- )
460- sampler = TorchSampler (sampler_args )
512+ sampler = instantiate_sampler (
513+ ad_config = ad_config ,
514+ max_num_sequences = max_num_sequences ,
515+ max_draft_len = max_draft_len ,
516+ max_total_draft_tokens = max_total_draft_tokens ,
517+ dist_mapping = dist_mapping ,
518+ )
461519
462- elif ad_config .sampler_type == SamplerType .TRTLLMSampler :
463- tllm_model_config = TRTLLMSamplerModelConfig (ad_config = ad_config )
464- decoding_mode = get_decoding_mode (ad_config .decoding_config , ad_config .max_beam_width )
465- # if the model dtype is "auto", we infer it from the model config
466- model_dtype = ad_config .dtype
467- print (f"model_dtype: { model_dtype } " )
468- if model_dtype == "auto" :
469- model_dtype = ad_config .create_factory ().dtype
470- print (f"model_dtype was auto. Setting to: { model_dtype } " )
471- if isinstance (model_dtype , str ):
472- model_dtype = str_dtype_to_torch (model_dtype )
473- print (f"model_dtype was string. Setting to: { model_dtype } " )
474- sampler = TRTLLMSampler (
475- model = tllm_model_config ,
476- model_dtype = model_dtype ,
477- mapping = dist_mapping ,
478- decoding_mode = decoding_mode ,
479- disable_overlap_scheduler = ad_config .disable_overlap_scheduler ,
480- max_seq_len = ad_config .max_seq_len ,
481- max_batch_size = ad_config .max_batch_size ,
482- max_beam_width = ad_config .max_beam_width ,
483- decoding_config = ad_config .decoding_config ,
484- kv_cache_config = ad_config .kv_cache_config ,
485- )
486520 # Guided (istructured) decoding.
487521 guided_decoder = None
488522 if (
0 commit comments