1 | 1 | from dataclasses import dataclass, replace |
2 | | -from enum import Enum |
3 | 2 | from functools import lru_cache |
4 | 3 | from typing import List, Optional, Tuple, Union |
5 | 4 |
6 | 5 | import torch |
7 | | -from torch.distributions import Normal |
8 | 6 |
9 | | -from tensorrt_llm._torch.utils import (Fp4QuantizedTensor, fp4_utils, |
| 7 | +from tensorrt_llm._torch.utils import (Fp4QuantizedTensor, TopkIdsGenMethod, |
| 8 | + create_dummy_topk_ids, fp4_utils, |
10 | 9 | get_last_power_of_2_num_tokens_buckets, |
11 | 10 | last_positive_power_of_2, |
12 | 11 | next_positive_power_of_2) |
15 | 14 | OptimizationProfile, TunableRunner, TuningConfig) |
16 | 15 |
17 | 16 |
18 | | -class TopkIdsGenMethod(Enum): |
19 | | - """ |
20 | | - Methods for generating dummy topk_ids for autotuning. |
21 | | -
22 | | - - UNIFORM: Uniform distribution; this performs the worst as it does not reflect real runs |
23 | | - - RANDINT: Uniform with duplicates; this performs better than UNIFORM and GAUSSIAN |
24 | | - - GAUSSIAN: Gaussian distribution |
25 | | - """ |
26 | | - |
27 | | - UNIFORM = "uniform" |
28 | | - RANDINT = "randint" |
29 | | - GAUSSIAN = "gaussian" |
30 | | - |
31 | | - |
32 | | -def create_dummy_topk_ids( |
33 | | - num_tokens: int, |
34 | | - num_experts: int, |
35 | | - top_k: int, |
36 | | - device: torch.device, |
37 | | - method: TopkIdsGenMethod, |
38 | | -) -> torch.Tensor: |
39 | | - """ |
40 | | - Factory function to create dummy topk_ids for autotuning. |
41 | | -
42 | | - Args: |
43 | | - num_tokens: Number of tokens (batch dimension) |
44 | | - num_experts: Number of experts to choose from |
45 | | - top_k: Number of experts to select per token |
46 | | - device: Device to create tensor on |
47 | | - method: Generation method (see TopkIdsGenMethod) |
48 | | -
49 | | - Returns: |
50 | | - topk_ids tensor of shape (num_tokens, top_k) with dtype int32 |
51 | | - """ |
52 | | - # Note: RANDINT is uniform distribution with replacement which can cause duplicates. However we |
53 | | - # settle with RANDINT for the moment because, in practice, MoE tuned with RANDINT performs better |
54 | | - # than both GAUSSIAN and UNIFORM. In the future, we should adopt GAUSSIAN(mu, sigma) because the |
55 | | - # topk_id for each token is guaranteed to be unique. |
56 | | - |
57 | | - if method == TopkIdsGenMethod.UNIFORM: |
58 | | - rand_scores = torch.rand(num_tokens, num_experts, device=device) |
59 | | - topk_ids = rand_scores.argsort(dim=1)[:, :top_k] |
60 | | - |
61 | | - elif method == TopkIdsGenMethod.RANDINT: |
62 | | - topk_ids = torch.randint(0, |
63 | | - num_experts, (num_tokens, top_k), |
64 | | - device=device) |
65 | | - |
66 | | - elif method == TopkIdsGenMethod.GAUSSIAN: |
67 | | - # Make variance proportional to num_experts |
68 | | - sigma = num_experts / 3.0 |
69 | | - # Off-center mean to get slightly long-tail distribution |
70 | | - mean = 2 * num_experts / 3 |
71 | | - normal = Normal(loc=mean, scale=sigma) |
72 | | - |
73 | | - expert_indices = torch.arange(num_experts, |
74 | | - device=device, |
75 | | - dtype=torch.float32) |
76 | | - |
77 | | - weights = torch.exp(normal.log_prob(expert_indices)) |
78 | | - |
79 | | - weights_expanded = weights.unsqueeze(0).expand(num_tokens, -1) |
80 | | - topk_ids = torch.multinomial(weights_expanded, |
81 | | - num_samples=top_k, |
82 | | - replacement=False) |
83 | | - |
84 | | - return topk_ids.to(torch.int32).to(device) |
85 | | - |
86 | | - |
87 | 17 | def prepare_dummy_topk_and_hook( |
88 | 18 | topk_weights: Optional[torch.Tensor], |
89 | 19 | topk_ids: Optional[torch.Tensor], |
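For context, a minimal sketch of calling the relocated helper through its new location, assuming `TopkIdsGenMethod` and `create_dummy_topk_ids` are re-exported from `tensorrt_llm._torch.utils` as the updated import above suggests; the CUDA device and the token/expert/top-k sizes are illustrative assumptions, not values from this change:

```python
import torch

# New import location introduced by this change (previously defined in this file).
from tensorrt_llm._torch.utils import TopkIdsGenMethod, create_dummy_topk_ids

# Illustrative sizes: 128 tokens routed over 64 experts with top-8 selection.
topk_ids = create_dummy_topk_ids(
    num_tokens=128,
    num_experts=64,
    top_k=8,
    device=torch.device("cuda"),  # assumes a CUDA device is available
    method=TopkIdsGenMethod.RANDINT,  # the method the in-code note prefers for MoE tuning
)

# Per the docstring, the result has shape (num_tokens, top_k) and dtype int32.
assert topk_ids.shape == (128, 8) and topk_ids.dtype == torch.int32
```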