Skip to content

Commit 64ec271

Browse files
committed
short-circuit autotune logic when not in autotune mode; address feedback; last-minute fix for expert weight dtype
Signed-off-by: Anthony Chang <[email protected]>
1 parent ae7eace commit 64ec271

File tree

3 files changed

+9
-3
lines changed

3 files changed

+9
-3
lines changed

.clangd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ CompileFlags:
2323
- cuda
2424
# Allow variadic CUDA functions
2525
- "-Xclang=-fcuda-allow-variadic-functions"
26+
- "-I/mnt/trtllm-gen/amodel/cuda/gpgpu_internal/include"
2627

2728
---
2829

tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ def prepare_dummy_topk_and_hook(
5656
Tuple of (routing_logits_for_tuner, topk_weights_for_tuner, topk_ids_for_tuner, tuning_config_with_hook)
5757
"""
5858

59+
# NOTE: This prevents auto-tuning related code from being executed in actual runs
60+
tuner = AutoTuner.get()
61+
if not tuner.is_tuning_mode:
62+
return routing_logits, topk_weights, topk_ids, base_tuning_config
63+
5964
if routing_logits is None:
6065
routing_logits_for_tuner = torch.randn(hidden_states.shape[0],
6166
num_experts,
@@ -91,6 +96,7 @@ def prepare_dummy_topk_and_hook(
9196
# Attention DP: topk is pre-computed, no routing needed
9297
topk_ids_for_tuner, topk_weights_for_tuner = routing_method.apply(
9398
routing_logits_for_tuner)
99+
topk_weights_for_tuner = topk_weights_for_tuner.to(torch.bfloat16)
94100
# Don't pass routing_logits to avoid C++ warning about all three being provided
95101
routing_logits_for_tuner = None
96102
else:
@@ -122,7 +128,7 @@ def recreate_dummy_topk_if_needed(
122128
topk_ids_for_tuner, topk_weights_for_tuner = routing_method.apply(
123129
routing_logits_for_tuner)
124130
inputs[-1] = topk_ids_for_tuner
125-
inputs[-2] = topk_weights_for_tuner
131+
inputs[-2] = topk_weights_for_tuner.to(torch.bfloat16)
126132
# Note: routing_logits is None in attention DP, no need to adjust
127133
assert inputs[0] is None
128134

tensorrt_llm/_torch/modules/fused_moe/routing.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def routing_method_type(self):
219219
return RoutingMethodType.Default
220220

221221

222-
class Deepseekv3RoutingImpl():
222+
class Deepseekv3RoutingImpl:
223223

224224
def __init__(
225225
self,
@@ -556,7 +556,6 @@ def routing_method_type(self) -> RoutingMethodType:
556556
return RoutingMethodType.RenormalizeNaive
557557

558558

559-
# Mapping from RoutingMethodType to the corresponding class
560559
ROUTING_METHOD_TYPE_TO_CLASS: Dict[RoutingMethodType,
561560
Type[BaseMoeRoutingMethod]] = {
562561
RoutingMethodType.Default:

0 commit comments

Comments (0)