
Commit d8b0589

[None][perf] Adjust select_alltoall_method_type. (#8950)
Signed-off-by: Bo Li <[email protected]>
1 parent 46dd988 commit d8b0589

7 files changed: +62 −48 lines


cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu

Lines changed: 6 additions & 0 deletions
@@ -51,6 +51,12 @@ namespace tensorrt_llm::kernels::mnnvl_throughput
         __VA_ARGS__; \
         break; \
     } \
+    case 6: \
+    { \
+        constexpr int TOP_K = 6; \
+        __VA_ARGS__; \
+        break; \
+    } \
     case 4: \
     { \
         constexpr int TOP_K = 4; \
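
The new case extends the macro's compile-time top-k dispatch so that a top-k of 6 gets its own TOP_K instantiation. As a schematic Python analogue of that dispatch pattern (purely illustrative; the names SUPPORTED_TOP_K, dispatch_top_k, and kernel_body are not from the source):

# Illustrative sketch only: mirrors how the CUDA macro switches on the runtime
# top-k value and binds a compile-time TOP_K inside each case before running the body.
SUPPORTED_TOP_K = (4, 6, 8)  # assumed set; the diff shows only case 4 and the new case 6


def dispatch_top_k(top_k, kernel_body):
    if top_k not in SUPPORTED_TOP_K:
        raise ValueError(f"no specialized kernel for top_k={top_k}")
    # analogous to `constexpr int TOP_K = ...; __VA_ARGS__; break;`
    return kernel_body(top_k)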

cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ namespace torch_ext
 namespace mnnvl_throughput
 {
 
-// TODO: Is Alignment necessary?obu guo
+// TODO: Is Alignment necessary?
 // Helper function to align offset to specified byte boundary
 inline size_t alignOffset(size_t offset, size_t alignment)
 {

cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp

Lines changed: 1 addition & 1 deletion
@@ -554,7 +554,7 @@ class Bf16MxE2m1BlockScaleMoeRunner : public torch::CustomClassHolder
         topk_group, intermediate_size, valid_hidden_size, valid_intermediate_size, local_expert_offset,
         local_num_experts, routed_scaling_factor, tileN, routing_method_type, mDtypeAct, *mRunners[tileN], config,
         topk_weights, topk_ids,
-        /*output=*/torch::nullopt); // TODO: Support user-provided output
+        /*out_tensor=*/torch::nullopt); // TODO: Support user-provided output
 }
 
 private:

tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py

Lines changed: 27 additions & 31 deletions
@@ -151,7 +151,7 @@ def __init__(
                 model_config.mapping)
         elif self.moe_alltoall_backend == "mnnvlthroughput":
             workspace_mb = int(
-                os.environ.get("TRTLLM_MOE_A2A_WORKSPACE_MB", "512"))
+                os.environ.get("TRTLLM_MOE_A2A_WORKSPACE_MB", "2048"))
             self.moe_a2a = MoeAlltoAll(
                 mapping=self.mapping,
                 max_num_tokens=model_config.max_num_tokens,
@@ -213,6 +213,17 @@ def has_int8_woq_per_channel(self):
         ) and not self.quant_config.layer_quant_mode.has_per_group_scaling()
 
     def select_alltoall_method_type(self) -> AlltoallMethodType:
+        # If no attention DP, no need to use AlltoAll.
+        if self.mapping.dp_size == 1:
+            return AlltoallMethodType.NotEnabled
+
+        # AlltoAll cannot support MoE TP.
+        if self.mapping.moe_tp_size != 1:
+            return AlltoallMethodType.NotEnabled
+
+        if not MnnvlMemory.supports_mnnvl():
+            return AlltoallMethodType.NotEnabled
+
         all2all_method_type = os.environ.get("TRTLLM_FORCE_ALLTOALL_METHOD")
         if all2all_method_type is not None:
             if AlltoallMethodType[all2all_method_type] in [
@@ -224,18 +235,13 @@ def select_alltoall_method_type(self) -> AlltoallMethodType:
                 )
             return AlltoallMethodType[all2all_method_type]
 
-        if not self.mapping.enable_attention_dp:
-            return AlltoallMethodType.NotEnabled
-
-        if self.mapping.tp_size == 1:
-            return AlltoallMethodType.NotEnabled
-
         if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
             return AlltoallMethodType.NotEnabled
 
-        if not (self.mapping.moe_ep_size > self.routing_method.experts_per_token
-                and MnnvlMemory.supports_mnnvl()):
-            return AlltoallMethodType.NotEnabled
+        # TODO: We found that MNNVL performs better than NCCL AllGather/ReduceScatter,
+        # regardless of the relationship between EP size and topK. We favor AlltoAll for now.
+        # if not self.mapping.moe_ep_size > self.routing_method.experts_per_token:
+        #     return AlltoallMethodType.NotEnabled
 
         return AlltoallMethodType.MNNVL
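
Pieced together from the two hunks above, the resulting selection logic reads roughly as follows. This is a consolidated sketch for readability, not a verbatim copy of the file; the validation inside the forced-override branch is abbreviated because the diff only shows its edges.

def select_alltoall_method_type(self) -> AlltoallMethodType:
    # Hard requirements come first: attention DP enabled, no MoE TP, MNNVL available.
    if self.mapping.dp_size == 1:
        return AlltoallMethodType.NotEnabled
    if self.mapping.moe_tp_size != 1:
        return AlltoallMethodType.NotEnabled
    if not MnnvlMemory.supports_mnnvl():
        return AlltoallMethodType.NotEnabled

    # An explicit override is honored only once the hard requirements pass.
    forced = os.environ.get("TRTLLM_FORCE_ALLTOALL_METHOD")
    if forced is not None:
        # (the real code validates the forced type here; elided in this sketch)
        return AlltoallMethodType[forced]

    if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
        return AlltoallMethodType.NotEnabled

    # The old "EP size must exceed top-k" gate is now commented out upstream:
    # MNNVL was measured to beat NCCL AllGather/ReduceScatter regardless of it.
    return AlltoallMethodType.MNNVL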

@@ -247,9 +253,9 @@ def enable_alltoall(self):
 
     @cached_property
     def moe_alltoall_backend(self):
-        # "mnnvllatency" (default) or "mnnvlthroughput"
+        # "mnnvlthroughput" (default) or "mnnvllatency"
         return os.environ.get("TRTLLM_MOE_ALLTOALL_BACKEND",
-                              "mnnvllatency").strip().lower()
+                              "mnnvlthroughput").strip().lower()
 
     def _supports_load_balancer(self) -> bool:
         """CutlassFusedMoE supports load balancer."""
@@ -751,25 +757,15 @@ def forward_fake(
         use_dp_padding: Optional[bool] = None,
         **kwargs,
     ) -> Union[torch.Tensor, List[torch.Tensor]]:
-        if not self.enable_alltoall:
-            return super().forward_fake(
-                x,
-                router_logits,
-                do_finalize=do_finalize,
-                output_dtype=output_dtype,
-                all_rank_num_tokens=all_rank_num_tokens,
-                use_dp_padding=use_dp_padding,
-                **kwargs,
-            )
-        else:
-            is_nvfp4_input = isinstance(x, Fp4QuantizedTensor)
-            data_type = output_dtype if is_nvfp4_input else x.dtype
-            num_tokens = all_rank_num_tokens[
-                self.parallel_rank] if all_rank_num_tokens else x.shape[0]
-            hidden_size = x.shape[1] * (2 if is_nvfp4_input else 1)
-            top_k = self.routing_method.experts_per_token
-            return x.new_empty((num_tokens, top_k, hidden_size),
-                               dtype=data_type)
+        return super().forward_fake(
+            x,
+            router_logits,
+            do_finalize=do_finalize,
+            output_dtype=output_dtype,
+            all_rank_num_tokens=all_rank_num_tokens,
+            use_dp_padding=use_dp_padding,
+            **kwargs,
+        )
 
     def load_weights(self, weights: List[Dict]):
         assert self._weights_created

tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py

Lines changed: 5 additions & 0 deletions
@@ -13,6 +13,7 @@
 from ...model_config import ModelConfig
 from ...utils import AuxStreamType, EventType, Fp4QuantizedTensor
 from .fused_moe_cutlass import CutlassFusedMoE
+from .interface import AlltoallMethodType
 from .quantization import (DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm,
                            MoEWeightLoadingMode, UnquantizedFusedMoEMethod)
 from .routing import BaseMoeRoutingMethod
@@ -462,6 +463,10 @@ def _get_quant_method(self):
         else:
             return UnquantizedFusedMoEMethod()
 
+    def select_alltoall_method_type(self) -> AlltoallMethodType:
+        """DeepGEMM backend currently doesn't support alltoall; honor overrides but default to disabled."""
+        return AlltoallMethodType.NotEnabled
+
     @nvtx_range("[DG] forward")
     def forward_chunk(
         self,

tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py

Lines changed: 20 additions & 13 deletions
@@ -128,7 +128,7 @@ def __init__(
                 model_config.mapping)
         elif self.moe_alltoall_backend == "mnnvlthroughput":
             workspace_mb = int(
-                os.environ.get("TRTLLM_MOE_A2A_WORKSPACE_MB", "512"))
+                os.environ.get("TRTLLM_MOE_A2A_WORKSPACE_MB", "2048"))
             self.moe_a2a = MoeAlltoAll(
                 mapping=self.mapping,
                 max_num_tokens=model_config.max_num_tokens,
@@ -154,6 +154,17 @@ def __init__(
         self.create_weights()
 
     def select_alltoall_method_type(self) -> AlltoallMethodType:
+        # If no attention DP, no need to use AlltoAll.
+        if self.mapping.dp_size == 1:
+            return AlltoallMethodType.NotEnabled
+
+        # AlltoAll cannot support MoE TP.
+        if self.mapping.moe_tp_size != 1:
+            return AlltoallMethodType.NotEnabled
+
+        if not MnnvlMemory.supports_mnnvl():
+            return AlltoallMethodType.NotEnabled
+
         all2all_method_type = os.environ.get("TRTLLM_FORCE_ALLTOALL_METHOD")
         if all2all_method_type is not None:
             if AlltoallMethodType[all2all_method_type] in [
@@ -165,18 +176,13 @@ def select_alltoall_method_type(self) -> AlltoallMethodType:
                 )
             return AlltoallMethodType[all2all_method_type]
 
-        if not self.mapping.enable_attention_dp:
-            return AlltoallMethodType.NotEnabled
-
-        if self.mapping.tp_size == 1:
-            return AlltoallMethodType.NotEnabled
-
         if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
             return AlltoallMethodType.NotEnabled
 
-        if not (self.mapping.moe_ep_size > self.routing_method.experts_per_token
-                and MnnvlMemory.supports_mnnvl()):
-            return AlltoallMethodType.NotEnabled
+        # TODO: We found that MNNVL performs better than NCCL AllGather/ReduceScatter,
+        # regardless of the relationship between EP size and topK. We favor AlltoAll for now.
+        # if not self.mapping.moe_ep_size > self.routing_method.experts_per_token:
+        #     return AlltoallMethodType.NotEnabled
 
         return AlltoallMethodType.MNNVL
 
@@ -192,9 +198,9 @@ def enable_alltoall(self):
 
     @cached_property
     def moe_alltoall_backend(self):
-        # "mnnvllatency" (default) or "mnnvlthroughput"
+        # "mnnvlthroughput" (default) or "mnnvllatency"
        return os.environ.get("TRTLLM_MOE_ALLTOALL_BACKEND",
-                              "mnnvllatency").strip().lower()
+                              "mnnvlthroughput").strip().lower()
 
     def _check_configs(self):
         assert self.has_deepseek_fp8_block_scales \
@@ -503,7 +509,8 @@ def forward_impl(
 
         moe_output: Optional[torch.Tensor] = None
         use_workspace_output = False
-        if self.enable_alltoall and self.moe_alltoall_backend == "mnnvlthroughput":
+        # TODO: use_workspace_output only supports w4a8_mxfp4_mxfp8 (gpt-oss) for now
+        if self.enable_alltoall and self.moe_alltoall_backend == "mnnvlthroughput" and self.has_w4a8_mxfp4_mxfp8:
             moe_output = self.moe_a2a.get_combine_payload_tensor_in_workspace(
                 runtime_max_tokens_per_rank, self.hidden_size, torch.bfloat16)
             use_workspace_output = True

tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_serve.py

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ def _run_serve_with_click(args):
         raise SystemExit(result.exit_code)
 
 
-@pytest.mark.timeout(360)
+@pytest.mark.timeout(500)
 def test_trtllm_serve_openai_chat_completion(tmp_path):
     # Prepare small model config and extra options yaml
     config = get_small_model_config("meta-llama/Meta-Llama-3.1-8B-Instruct")
@@ -58,7 +58,7 @@ def test_trtllm_serve_openai_chat_completion(tmp_path):
 
     start_time = time.time()
    last_err = None
-    while time.time() - start_time < 90:
+    while time.time() - start_time < 300:
        if not server.is_alive():
            raise RuntimeError("Server process exited prematurely")
        try:
