Fix rebase; make the allreduce + residual + RMSNorm fusion use the configured allreduce strategy; extend the RMSNorm fusion test to verify the strategy

MrGeva · MrGeva · commit 1dff84bc6aa5 · 2025-11-26T00:57:11.000-08:00
Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/trtllm_dist.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/trtllm_dist.py
@@ -30,13 +30,22 @@ def trtllm_allreduce(tensor, op, strategy: str, all_reduce_params=None):
     rank, world_size = get_rank_world_size()
     assert op == ReduceOp.SUM, "TRT-LLM all reduce only supports SUM op."
 
-    # Cache key includes rank, world_size, and dtype to handle different configurations
-    cache_key = (rank, world_size, tensor.dtype)
+    # Convert string strategy to enum
+    try:
+        strategy_enum = getattr(AllReduceStrategy, strategy)
+    except AttributeError:
+        raise ValueError(
+            f"Invalid allreduce strategy: {strategy}. "
+            f"Valid options: AUTO, NCCL, ONESHOT, TWOSHOT, MIN_LATENCY, "
+            f"LOWPRECISION, UB, MNNVL, NCCL_SYMMETRIC"
+        )
+
+    # Cache key includes rank, world_size, dtype, and strategy to handle different configurations
+    cache_key = (rank, world_size, tensor.dtype, strategy_enum)
     if cache_key not in _allreduce_cache:
         p_config = Mapping(world_size=world_size, tp_size=world_size, rank=rank)
-        # Use Strategy.AUTO for optimal performance
         _allreduce_cache[cache_key] = AllReduce(
-            mapping=p_config, strategy=strategy, dtype=tensor.dtype
+            mapping=p_config, strategy=strategy_enum, dtype=tensor.dtype
         )
 
     torch_op = _allreduce_cache[cache_key]
@@ -87,7 +96,11 @@ def trtllm_dist_all_reduce_fake(tensor, strategy):
     "dist::trtllm_fused_allreduce_residual_rmsnorm", mutates_args=(), device_types="cuda"
 )
 def trtllm_fused_allreduce_residual_rmsnorm(
-    tensor: torch.Tensor, residual: torch.Tensor, norm_weight: torch.Tensor, eps: float
+    tensor: torch.Tensor,
+    residual: torch.Tensor,
+    norm_weight: torch.Tensor,
+    eps: float,
+    strategy: str,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """Fused allreduce + residual + rmsnorm using TRT-LLM optimized kernel.
 
@@ -100,12 +113,18 @@ def trtllm_fused_allreduce_residual_rmsnorm(
         norm_weight=norm_weight,
         eps=eps,
     )
-    return trtllm_allreduce(tensor, ReduceOp.SUM, all_reduce_params=all_reduce_params)
+    return trtllm_allreduce(
+        tensor, ReduceOp.SUM, strategy=strategy, all_reduce_params=all_reduce_params
+    )
 
 
 @trtllm_fused_allreduce_residual_rmsnorm.register_fake
 def trtllm_fused_allreduce_residual_rmsnorm_fake(
-    tensor: torch.Tensor, residual: torch.Tensor, norm_weight: torch.Tensor, eps: float
+    tensor: torch.Tensor,
+    residual: torch.Tensor,
+    norm_weight: torch.Tensor,
+    eps: float,
+    strategy: str,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     return torch.empty_like(tensor), torch.empty_like(tensor)
 
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py b/tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py
@@ -5,6 +5,7 @@
 The torch backend (demollm mode) does not benefit from fusion.
 """
 
+from functools import partial
 from typing import Tuple
 
 import torch
@@ -28,11 +29,14 @@
 # ============================================================================
 
 
-def _make_allreduce_residual_rmsnorm_pattern(add_order: str = "residual_first"):
+def _make_allreduce_residual_rmsnorm_pattern(
+    add_order: str = "residual_first", strategy: str = "AUTO"
+):
     """Factory function to create pattern functions for allreduce+residual+rmsnorm fusion.
 
     Args:
         add_order: Either "residual_first" (residual + x) or "x_first" (x + residual)
+        strategy: AllReduce strategy to use in the pattern
 
     Returns:
         A pattern function that can be used with register_ad_pattern
@@ -50,7 +54,7 @@ def pattern_fn(
         Returns (normed, z)
         """
         input_dtype = x.dtype
-        hidden_states = torch.ops.auto_deploy.trtllm_dist_all_reduce(x)
+        hidden_states = torch.ops.auto_deploy.trtllm_dist_all_reduce(x, strategy)
 
         # Handle addition order
         if add_order == "residual_first":
@@ -70,10 +74,12 @@ def pattern_fn(
 
 
 def _allreduce_residual_rmsnorm_replacement(
-    x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float
+    x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float, strategy: str
 ):
     """Replacement using TRT-LLM fused kernel."""
-    return torch.ops.dist.trtllm_fused_allreduce_residual_rmsnorm(x, residual, weight, eps)
+    return torch.ops.dist.trtllm_fused_allreduce_residual_rmsnorm(
+        x, residual, weight, eps, strategy
+    )
 
 
 # ============================================================================
@@ -115,19 +121,22 @@ def _apply(
         # Instantiate Pattern Functions
         # ============================================================================
 
+        # Get the allreduce strategy from shared_config
+        strategy = shared_config.sharding_config.allreduce_strategy.name
+
         # TRT-LLM backend (MPI mode) - two patterns for different addition orders
         _allreduce_residual_rmsnorm_pattern_trtllm = _make_allreduce_residual_rmsnorm_pattern(
-            add_order="residual_first"
+            add_order="residual_first", strategy=strategy
         )
         _allreduce_residual_rmsnorm_pattern2_trtllm = _make_allreduce_residual_rmsnorm_pattern(
-            add_order="x_first"
+            add_order="x_first", strategy=strategy
         )
 
         # Register TRT-LLM backend patterns only (no torch backend fusion)
         # Pattern 1: residual + allreduce(x)
         register_ad_pattern(
             search_fn=_allreduce_residual_rmsnorm_pattern_trtllm,
-            replace_fn=_allreduce_residual_rmsnorm_replacement,
+            replace_fn=partial(_allreduce_residual_rmsnorm_replacement, strategy=strategy),
             patterns=patterns,
             dummy_args=dummy_args,
             op_ignore_types=op_ignore_types,
@@ -137,7 +146,7 @@ def _apply(
         # Pattern 2: allreduce(x) + residual
         register_ad_pattern(
             search_fn=_allreduce_residual_rmsnorm_pattern2_trtllm,
-            replace_fn=_allreduce_residual_rmsnorm_replacement,
+            replace_fn=partial(_allreduce_residual_rmsnorm_replacement, strategy=strategy),
             patterns=patterns,
             dummy_args=dummy_args,
             op_ignore_types=op_ignore_types,
diff --git a/tensorrt_llm/_torch/auto_deploy/utils/sharding_utils.py b/tensorrt_llm/_torch/auto_deploy/utils/sharding_utils.py
@@ -58,6 +58,8 @@ def validate_allreduce_strategy(v):
     if isinstance(v, int):
         return AllReduceStrategy(v)
     return v  # Let Pydantic handle other types
+
+
 def _get_dist_ops(backend: str):
     """Get the appropriate distributed ops based on backend availability.
 
@@ -585,7 +587,7 @@ def _shard_parameter_node(
 
     # add reduction node
     with gm.graph.inserting_after(node):
-        dist_node = gm.graph.call_function(fn_dist, args=dist_args)
+        dist_node = gm.graph.call_function(fn_dist, args=(node,) + tuple(dist_args))
         node.replace_all_uses_with(dist_node)
         dist_node.replace_input_with(dist_node, node)
 
@@ -1232,7 +1234,9 @@ def validate(self, gm: GraphModule = None, node: Node = None) -> bool:
 
     def apply(self, gm: GraphModule, node: Node) -> None:
         """Apply EP sharding transformation to the graph module."""
-        _insert_sharded_moe(gm, node, self.rank, self.world_size, self.allreduce_strategy, self.dist_backend, [])
+        _insert_sharded_moe(
+            gm, node, self.rank, self.world_size, self.allreduce_strategy, self.dist_backend, []
+        )
 
 
 class MXFP4EPShardingInfo(EPShardingInfo):
@@ -1246,7 +1250,9 @@ def validate(self, gm: GraphModule = None, node: Node = None) -> bool:
         return True
 
     def apply(self, gm: GraphModule, node: Node) -> None:
-        _insert_sharded_mxfp4_mlp_ep(gm, node, self.rank, self.world_size, self.allreduce_strategy, self.dist_backend)
+        _insert_sharded_mxfp4_mlp_ep(
+            gm, node, self.rank, self.world_size, self.allreduce_strategy, self.dist_backend
+        )
 
 
 class FP8EPShardingInfo(EPShardingInfo, QuantizationShardingMixin):
@@ -1263,7 +1269,13 @@ def scale_names(self) -> List[str]:
 
     def apply(self, gm: GraphModule, node: Node) -> None:
         _insert_sharded_moe(
-            gm, node, self.rank, self.world_size, self.allreduce_strategy, self.dist_backend, self.scale_names()
+            gm,
+            node,
+            self.rank,
+            self.world_size,
+            self.allreduce_strategy,
+            self.dist_backend,
+            self.scale_names(),
         )
 
 
@@ -1281,7 +1293,13 @@ def scale_names(self) -> List[str]:
 
     def apply(self, gm: GraphModule, node: Node) -> None:
         _insert_sharded_moe(
-            gm, node, self.rank, self.world_size, self.allreduce_strategy, self.dist_backend, self.scale_names()
+            gm,
+            node,
+            self.rank,
+            self.world_size,
+            self.allreduce_strategy,
+            self.dist_backend,
+            self.scale_names(),
         )
 
 
diff --git a/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py b/tests/unittest/_torch/auto_deploy/unit/multigpu/transformations/library/test_allreduce_residual_rmsnorm_fusion.py
@@ -32,12 +32,13 @@ def forward(self, hidden_states: torch.Tensor):
 class AllreduceResidualNorm(torch.nn.Module):
     """AllreduceResidualNorm pattern model that do residual plus x"""
 
-    def __init__(self, hidden_size, dtype):
+    def __init__(self, hidden_size, dtype, strategy):
         super().__init__()
         self.norm = RMSNorm(hidden_size, 1e-5, dtype)
+        self.strategy = strategy
 
     def forward(self, x, residual):
-        x = torch.ops.auto_deploy.torch_dist_all_reduce.default(x, "AUTO")
+        x = torch.ops.auto_deploy.trtllm_dist_all_reduce.default(x, self.strategy)
         y = residual + x
         normed = self.norm(y)
         return normed, y
@@ -46,18 +47,19 @@ def forward(self, x, residual):
 class AllreduceResidualNorm2(torch.nn.Module):
     """AllreduceResidualNorm pattern model that do x plus residual"""
 
-    def __init__(self, hidden_size, dtype):
+    def __init__(self, hidden_size, dtype, strategy):
         super().__init__()
         self.norm = RMSNorm(hidden_size, 1e-5, dtype)
+        self.strategy = strategy
 
     def forward(self, x, residual):
-        x = torch.ops.auto_deploy.torch_dist_all_reduce.default(x, "AUTO")
+        x = torch.ops.auto_deploy.trtllm_dist_all_reduce.default(x, self.strategy)
         y = x + residual
         normed = self.norm(y)
         return normed, y
 
 
-def _test_allreduce_fusion(port: int, ModuleCls):
+def _test_allreduce_fusion(port: int, ModuleCls, strategy: str):
     if not is_trtllm_op_available():
         pytest.skip("Require trtllm ops to run test_allreduce_fusion.")
 
@@ -69,7 +71,7 @@ def _test_allreduce_fusion(port: int, ModuleCls):
     residual = torch.randn(16, 16).to(dtype).cuda()
 
     # Trace the original model
-    model = ModuleCls(16, dtype)
+    model = ModuleCls(16, dtype, strategy=strategy)
     args = (
         x,
         residual,
@@ -78,10 +80,14 @@ def _test_allreduce_fusion(port: int, ModuleCls):
     # Run the original
     original_outputs, residual_original = gm(x, residual)
 
-    # Fuse ops
+    # Fuse ops with the specified strategy
     gm_transformed = InferenceOptimizer(
         None,
         {
+            "detect_sharding": {
+                "stage": "post_export",
+                "allreduce_strategy": strategy,
+            },
             "fuse_allreduce_residual_rmsnorm": {
                 "stage": "post_load_fusion",
             },
@@ -91,12 +97,21 @@ def _test_allreduce_fusion(port: int, ModuleCls):
     # Run the fused graph
     fused_outputs, residual_fused = gm_transformed(x, residual)
 
-    # Check if fused node in the graph
+    # Check if fused node in the graph and verify strategy
     has_fused_node = False
+    fused_node_strategy = None
     for node in gm_transformed.graph.nodes:
         if is_op(node, torch.ops.dist.trtllm_fused_allreduce_residual_rmsnorm):
             has_fused_node = True
+            # The fused node should have the strategy as the last argument
+            # args: (x, residual, weight, eps, strategy)
+            if len(node.args) >= 5:
+                fused_node_strategy = node.args[4]
+
     assert has_fused_node, "Fused node not found."
+    assert fused_node_strategy == strategy, (
+        f"Fused node strategy mismatch: expected '{strategy}', got '{fused_node_strategy}'"
+    )
 
     # Verify outputs are consistent
     assert torch.allclose(residual_original, residual_fused, atol=1e-5), (
@@ -117,11 +132,16 @@ def _test_allreduce_fusion(port: int, ModuleCls):
     [AllreduceResidualNorm, AllreduceResidualNorm2],
     ids=["residual_plus_x", "x_plus_residual"],
 )
-def test_allreduce_fusion(device_count, ModuleCls):
+@pytest.mark.parametrize(
+    "strategy",
+    ["AUTO", "NCCL", "ONESHOT"],
+    ids=["strategy_auto", "strategy_nccl", "strategy_oneshot"],
+)
+def test_allreduce_fusion(device_count, ModuleCls, strategy):
     if device_count <= 1:
         pytest.skip("Require multi GPUs to run test_allreduce_fusion.")
     port = dist.get_free_port()
 
     n_workers = device_count
     mpi_pool = MpiPoolSession(n_workers=n_workers)
-    mpi_pool.submit_sync(_test_allreduce_fusion, port=port, ModuleCls=ModuleCls)
+    mpi_pool.submit_sync(_test_allreduce_fusion, port=port, ModuleCls=ModuleCls, strategy=strategy)