
Commit eaba87c

Reduce-scatter implementation with FP32 accumulation
- Include plumbing to use this implementation for actual model training
- Add async_op=True option for reduce_scatter_with_fp32_accumulation
- Add unit test

Signed-off-by: Deepak Narayanan <[email protected]>
1 parent f6d1db9 commit eaba87c


6 files changed: +200 -2 lines changed


megatron/core/distributed/distributed_data_parallel_config.py

Lines changed: 5 additions & 0 deletions
@@ -49,6 +49,11 @@ class DistributedDataParallelConfig:
     message size (which for ring algorithms is bucket_size / dp_size) apparently needs
     to be divisible by a power of 2 for high busbw."""
 
+    reduce_scatter_with_fp32_accumulation: bool = False
+    """If true, use a reduce-scatter implementation which sends lower-precision values
+    over the wire (using an all-to-all to keep total communication overhead in line
+    with the standard ring implementation) but performs accumulation locally in FP32."""
+
     average_in_collective: bool = False
     """If true, compute average in collective directly, as opposed to dividing by the
     dp_size first and then computing sum in the collective."""
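For reference, the new option can also be set directly on the config object rather than through the training-script flag added later in this commit. A minimal sketch, assuming DistributedDataParallelConfig is exported from megatron.core.distributed as in current Megatron-LM; the distributed optimizer is enabled because the reduce-scatter path is only used in that case:

from megatron.core.distributed import DistributedDataParallelConfig

# Enable the low-precision-wire / FP32-accumulation reduce-scatter for grad sync.
ddp_config = DistributedDataParallelConfig(
    use_distributed_optimizer=True,               # reduce-scatter is used for grad sync here
    reduce_scatter_with_fp32_accumulation=True,   # new field added in this commit
)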

megatron/core/distributed/param_and_grad_buffer.py

Lines changed: 20 additions & 2 deletions
@@ -20,6 +20,7 @@
 from ..fp8_utils import is_float8tensor, is_mxfp8tensor, modify_underlying_storage
 from ..utils import is_torch_min_version, log_on_each_pipeline_stage
 from .distributed_data_parallel_config import DistributedDataParallelConfig
+from .reduce_scatter_with_fp32_accumulation import reduce_scatter_with_fp32_accumulation
 
 logger = logging.getLogger(__name__)
 
@@ -151,6 +152,13 @@ def __init__(
         if self.ddp_config.num_distributed_optimizer_instances > 1:
             self.inter_distributed_optimizer_instance_group = None
             self.communication_stream = None
+            assert (
+                not self.ddp_config.reduce_scatter_with_fp32_accumulation
+            ), "RS w/ FP32 accumulation not supported with num_distributed_optimizer_instances > 1"
+
+        global dist_reduce_scatter_func
+        if self.ddp_config.reduce_scatter_with_fp32_accumulation:
+            dist_reduce_scatter_func = reduce_scatter_with_fp32_accumulation
 
         self.reset()
         self.param_gather_handle = None
@@ -382,6 +390,7 @@ def start_grad_sync(self):
             communication_group = self.data_parallel_group
 
         # Coalesce communication kernels across buckets in the bucket group.
+        grad_reduce_handle = None
         with stream_context, _coalescing_manager(communication_group, async_ops=async_op) as cm:
             for idx, bucket in enumerate(self.buckets):
                 if self.ddp_config.use_distributed_optimizer:
@@ -392,7 +401,7 @@
                     local_data_view = self.cached_grad_buffer_shard_list[idx][
                         self.intra_distributed_optimizer_instance_rank
                     ]
-                    dist_reduce_scatter_func(
+                    grad_reduce_handle = dist_reduce_scatter_func(
                         local_data_view,
                         bucket.grad_data,
                         op=reduce_op,
@@ -434,7 +443,16 @@
                     )
 
         if async_op:
-            self.grad_reduce_handle = cm
+            if self.ddp_config.reduce_scatter_with_fp32_accumulation:
+                assert (
+                    len(self.buckets) == 1
+                ), "Only 1 bucket supported with reduce_scatter_with_fp32_accumulation=True"
+                # torch.distributed._coalescing_manager does not correctly handle calling our custom
+                # collective handle's .wait() method, so we take matters into our own hands here.
+                assert grad_reduce_handle is not None
+                self.grad_reduce_handle = grad_reduce_handle
+            else:
+                self.grad_reduce_handle = cm
         else:
             # When using `_coalescing_manager`, even if a synchronous op (async_op=False) is used,
             # `cm` is not None, which is different from when `_coalescing_manager` is not used in
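The async path above boils down to choosing which object's .wait() the caller should later invoke. A condensed sketch of that choice (names mirror the diff; this is an illustrative standalone function, not the actual Megatron method):

def pick_grad_reduce_handle(ddp_config, buckets, grad_reduce_handle, cm):
    # With the FP32-accumulation implementation, the handle returned by the collective
    # itself must be waited on: its wait() finishes the all-to-all *and* performs the
    # local FP32 sum plus downcast, which the coalescing manager cannot drive.
    if ddp_config.reduce_scatter_with_fp32_accumulation:
        assert len(buckets) == 1, "only a single bucket is supported with this option"
        assert grad_reduce_handle is not None
        return grad_reduce_handle
    # Standard path: the coalescing manager handle covers all buckets.
    return cm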

megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py

Lines changed: 92 additions & 0 deletions

@@ -0,0 +1,92 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+
+from typing import Any
+
+import torch
+
+
+class _ReduceScatterWithFP32AccumulationWorkHandle:
+    """Work handle to return to user when using reduce_scatter_with_fp32_accumulation with
+    async_op=True."""
+
+    def __init__(
+        self,
+        all_to_all_handle: Any,
+        all_to_all_output_tensor: torch.Tensor,
+        output_tensor: torch.Tensor,
+        world_size: int,
+    ):
+        """Initialize WorkHandle object."""
+        self.all_to_all_handle = all_to_all_handle
+        self.all_to_all_output_tensor = all_to_all_output_tensor
+        self.output_tensor = output_tensor
+        self.world_size = world_size
+
+    def wait(self):
+        """Wait until communication (and associated computation) is completed."""
+        # Wait for communication to complete if needed.
+        if self.all_to_all_handle is not None:
+            self.all_to_all_handle.wait()
+
+        # Accumulate into an FP32 sum.
+        output_tensor_in_fp32 = torch.sum(
+            self.all_to_all_output_tensor.view((self.world_size, -1)), dim=0, dtype=torch.float32
+        )
+        assert output_tensor_in_fp32.dtype == torch.float32
+
+        # Copy downcasted sum into output_tensor.
+        self.output_tensor.copy_(output_tensor_in_fp32)
+
+
+def reduce_scatter_with_fp32_accumulation(
+    output_tensor: torch.Tensor,
+    input_tensor: torch.Tensor,
+    op: torch.distributed.ReduceOp,
+    group: torch.distributed.ProcessGroup,
+    async_op: bool,
+):
+    """Reduce-scatter with FP32 accumulation.
+
+    Collects input_tensor in lower precision using an all-to-all, then locally accumulates in FP32
+    precision, then downcasts the final sum back into the right location in input_tensor.
+
+
+    Args:
+        output_tensor (torch.Tensor): Output tensor with reduce-scattered output (only the shard).
+        input_tensor (torch.Tensor): Input tensor that needs to be reduce-scattered.
+        op (torch.distributed.ReduceOp): Only torch.distributed.ReduceOp.SUM is supported.
+        group (torch.distributed.ProcessGroup): Process group to use for reduce-scatter.
+        async_op (bool): If True, return a work handle whose .wait() completes the reduction.
+    """
+    # Make sure arguments conform to the implementation.
+    assert op == torch.distributed.ReduceOp.SUM
+
+    # Get world_size.
+    if group is None:
+        world_size = torch.distributed.get_world_size()
+    else:
+        world_size = group.size()
+
+    # Make sure input_tensor size is divisible by world size.
+    assert input_tensor.numel() % world_size == 0
+
+    # Call all_to_all (every rank should have their respective gradient shards collected from
+    # all ranks). We also create a tensor for the all-to-all output (the all-to-all collective
+    # cannot be performed in-place).
+    all_to_all_output_tensor = torch.empty_like(input_tensor)
+    all_to_all_handle = torch.distributed.all_to_all_single(
+        output=all_to_all_output_tensor, input=input_tensor, group=group, async_op=async_op
+    )
+
+    # Create a work handle to finish communication and reduction.
+    reduce_scatter_handle = _ReduceScatterWithFP32AccumulationWorkHandle(
+        all_to_all_handle, all_to_all_output_tensor, output_tensor, world_size
+    )
+    if async_op:
+        # Return work handle; consumers can call .wait() to ensure communication and associated
+        # reduction complete.
+        return reduce_scatter_handle
+    else:
+        # Wait on work handle.
+        reduce_scatter_handle.wait()
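A usage sketch for the new helper, assuming the module lands at megatron/core/distributed/reduce_scatter_with_fp32_accumulation.py (consistent with the relative import in param_and_grad_buffer.py), that torch.distributed is already initialized, and that the flat buffer's size is divisible by the group size; the wrapper name reduce_scatter_grad_bucket is illustrative only:

import torch

from megatron.core.distributed.reduce_scatter_with_fp32_accumulation import (
    reduce_scatter_with_fp32_accumulation,
)


def reduce_scatter_grad_bucket(grad_bucket: torch.Tensor, group=None, async_op=False):
    world_size = torch.distributed.get_world_size(group)
    rank = torch.distributed.get_rank(group)
    shard_size = grad_bucket.numel() // world_size
    # The output shard is a view into the input buffer, mirroring how grad-buffer
    # shards are passed in param_and_grad_buffer.py.
    output_shard = grad_bucket[rank * shard_size : (rank + 1) * shard_size]
    handle = reduce_scatter_with_fp32_accumulation(
        output_shard,
        grad_bucket,
        op=torch.distributed.ReduceOp.SUM,
        group=group,
        async_op=async_op,
    )
    if async_op:
        # wait() completes the all-to-all plus the local FP32 accumulation and downcast.
        handle.wait()
    return output_shard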

megatron/training/arguments.py

Lines changed: 4 additions & 0 deletions
@@ -2636,6 +2636,10 @@ def _add_distributed_args(parser):
                        'of 2 (2^16) to ensure NCCL collectives have high bus bandwidth at large DP counts, '
                        'since NCCL message size (which for ring algorithms is bucket_size / dp_size) '
                        'apparently needs to be divisible by a power of 2 for high busbw.')
+    group.add_argument('--ddp-reduce-scatter-with-fp32-accumulation', action='store_true',
+                       default=False, help='If set, use a reduce-scatter implementation which sends lower-precision '
+                       'values over the wire (using an all-to-all to keep total communication overhead in line '
+                       'with the standard ring implementation) but performs accumulation locally in FP32.')
     group.add_argument('--ddp-average-in-collective', action='store_true',
                        default=False, help='If set, average directly in data-parallel communication collective.')
     group.add_argument('--overlap-param-gather', action='store_true',

megatron/training/training.py

Lines changed: 1 addition & 0 deletions
@@ -971,6 +971,7 @@ def build_model():
         else:
             kwargs['bucket_size'] = args.ddp_bucket_size
         kwargs['pad_buckets_for_high_nccl_busbw'] = args.ddp_pad_buckets_for_high_nccl_busbw
+        kwargs['reduce_scatter_with_fp32_accumulation'] = args.ddp_reduce_scatter_with_fp32_accumulation
         kwargs['average_in_collective'] = args.ddp_average_in_collective
         if args.use_megatron_fsdp and args.use_precision_aware_optimizer:
             kwargs["preserve_fp32_weights"] = False
Lines changed: 78 additions & 0 deletions

@@ -0,0 +1,78 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+
+import pytest
+import torch
+
+# Import our reduce_scatter implementation and shard_buffer (used for
+# checks in the test).
+from megatron.core.distributed.param_and_grad_buffer import (
+    reduce_scatter_with_fp32_accumulation,
+    shard_buffer,
+)
+from tests.unit_tests.test_utilities import Utils
+
+
+def get_non_matching_values(tensor1_shard, tensor2_shard):
+    mask = torch.isclose(tensor1_shard, tensor2_shard)
+    indices = (~mask).nonzero()
+    return indices, tensor1_shard[indices], tensor2_shard[indices]
+
+
+class TestReduceScatterWithFP32Accumulation:
+    @classmethod
+    def setup_class(cls):
+        Utils.initialize_model_parallel()
+
+    @classmethod
+    def teardown_class(cls):
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.parametrize("async_op", [True, False])
+    @pytest.mark.parametrize("baseline_reduce_scatter_in_fp32", [True, False])
+    def test_reduce_scatter_with_fp32_accumulation(
+        self, async_op: bool, baseline_reduce_scatter_in_fp32: bool
+    ):
+        num_tests = 20
+        rank = Utils.rank
+        world_size = Utils.world_size
+        for _ in range(num_tests):
+            # Initialize input tensors.
+            tensor1 = torch.rand(100000, device='cuda', dtype=torch.bfloat16)
+            tensor2 = tensor1.clone()
+
+            # Make sure the kwargs passed to the two APIs are *identical*.
+            kwargs = {"op": torch.distributed.ReduceOp.SUM, "group": None, "async_op": async_op}
+
+            # Reduce-scatter with all-to-alls.
+            args = [
+                shard_buffer(tensor1, world_size)[rank],
+                tensor1,
+            ]  # Output tensor is a view into the original input.
+            handle = reduce_scatter_with_fp32_accumulation(*args, **kwargs)
+            if async_op:
+                assert handle is not None
+                handle.wait()
+            tensor1_shard = shard_buffer(tensor1, world_size)[rank]
+
+            if baseline_reduce_scatter_in_fp32:
+                tensor2 = tensor2.float()
+
+            # Reduce-scatter with the standard reduce-scatter API.
+            args = [
+                shard_buffer(tensor2, world_size)[rank],
+                tensor2,
+            ]  # Output tensor is a view into the original input.
+            handle = torch.distributed.reduce_scatter_tensor(*args, **kwargs)
+            if async_op:
+                assert handle is not None
+                handle.wait()
+            tensor2_shard = shard_buffer(tensor2, world_size)[rank]
+            if baseline_reduce_scatter_in_fp32:  # Cast result back to bfloat16.
+                tensor2_shard = tensor2_shard.bfloat16()
+
+            # Compare results: results should match when doing FP32 reduction and not match when
+            # doing direct BF16 reduction. We only look at the relevant shard of tensor1 and tensor2.
+            assert (
+                torch.allclose(tensor1_shard, tensor2_shard) == baseline_reduce_scatter_in_fp32
+            ), f"{get_non_matching_values(tensor1_shard, tensor2_shard)}"

0 commit comments
