[Dev] fix(megatron-fsdp): Resolve hang caused by non-deterministic reduce-scatter (#2252)

xuwchen · shjwudp · yanring · web-flow · commit c6e2b29c9195 · 2025-11-20T04:18:20.000Z
Co-authored-by: Jianbin Chang <shjwudp@gmail.com>
Co-authored-by: Zijie Yan <zijiey@nvidia.com>
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py
@@ -2782,6 +2782,9 @@ def reduce_gradients(
             outer_fsdp_group_grad_reduce (bool, optional): Whether to reduce gradients
                 across outer-DP groups. Defaults to False.
         """
+        # Sort parameters by their bucket IDs to ensure a deterministic processing order.
+        # Performing reduce-scatter operations out of order can lead to hangs.
+        params = sorted(list(params), key=lambda x: self.buffer.param_to_param_group[x])
         for param in params:
             bucket_id = self.buffer.param_to_param_group[param]
             param_group = self.buffer.parameter_groups[bucket_id]