
Commit c93ad11

wait for futures while syncing fragments
Summary:

- we currently wait on the pg work's future when preparing a fragment
- if we use gloo, this blocks the CPU
- move the wait call to when we perform the actual sync of the fragment
- the manager allreduce also returns the work object, so we can wait on that as well when performing the sync (the new calling convention is sketched after the file summary below)
- use the HTTP transport instead of the PG transport -- the PG transport fails to resolve the address when running locally
- deep copy the state dict when sending a checkpoint, because if the replica moves on to the next step, the state dict can change before the checkpoint is sent

Test Plan:

gloo overlaps now

<img width="1284" height="662" alt="image" src="https://github.com/user-attachments/assets/e9b88e52-8053-432b-83a3-e689bcc4f9d4" />

nccl still overlaps

<img width="1283" height="664" alt="image" src="https://github.com/user-attachments/assets/cbd0a352-1529-42f7-b8d9-d45bd0e84a97" />
1 parent 949a981 commit c93ad11

File tree: 10 files changed (+84 −52 lines)

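The core change is the calling convention of the fault-tolerant allreduce: it now returns the c10d `Work` handle alongside the `Future`, and both waits are deferred from `prepare_sync` to `perform_sync`. The sketch below is a minimal, self-contained illustration of that pattern, not torchft code; `_FakeWork`, `fake_allreduce`, `prepare_sync`, and `perform_sync` are hypothetical stand-ins that only mirror the shapes used in the diffs that follow.

```python
# Illustrative sketch of the new (work, future) contract and deferred waits.
from typing import Optional

import torch


class _FakeWork:
    """Stand-in for a torch.distributed Work handle; wait() is where gloo would block the CPU."""

    def wait(self) -> bool:
        return True


def fake_allreduce(t: torch.Tensor) -> tuple[Optional[_FakeWork], torch.futures.Future]:
    # Mirrors the new Manager.allreduce return shape: (Work | None, Future).
    fut: torch.futures.Future = torch.futures.Future()
    fut.set_result(t)
    return _FakeWork(), fut


def prepare_sync(grads: list[torch.Tensor]) -> tuple[list[_FakeWork], list[torch.futures.Future]]:
    # Kick off the collectives but do NOT wait here; with gloo, waiting now
    # would block the CPU and prevent overlap with the next local steps.
    works, futs = [], []
    for g in grads:
        work, fut = fake_allreduce(g)
        if work is not None:
            works.append(work)
        futs.append(fut)
    return works, futs


def perform_sync(works: list[_FakeWork], futs: list[torch.futures.Future]) -> None:
    # Block only when the reduced results are actually consumed.
    for w in works:
        w.wait()
    for f in futs:
        f.wait()


works, futs = prepare_sync([torch.ones(4), torch.zeros(4)])
# ... the CPU keeps doing useful work here (further local steps) ...
perform_sync(works, futs)
```

With gloo, `work.wait()` blocks the calling thread, so deferring it to `perform_sync` is what lets communication overlap with the following local steps, as shown in the test-plan traces above.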

torchft/collectives.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -18,6 +18,7 @@
     AllreduceOptions,
     AllToAllOptions,
     ReduceScatterOptions,
+    Work,
 )
 from torch.futures import Future

@@ -288,7 +289,7 @@ def allreduce_quantized(
     opts: AllreduceOptions | ReduceOp,
     process_group: "ProcessGroup",
     sync_stream: cuda.Stream | None = None,
-) -> Future[list[torch.Tensor]]:
+) -> tuple[Work, Future[list[torch.Tensor]]]:
     """
     Performs a quantized all-reduce operation on a list of tensors.

@@ -379,17 +380,17 @@ def allreduce_quantized(
         [torch.split(quantized_tensors_out.view(world_size, -1), 1)[rank]],
         _to_allgather_options(allreduce_opts),
     )
-    work.wait()
     fut = work.get_future()

     def callback(fut: Future[list[torch.Tensor]]) -> list[torch.Tensor]:
         # Dequantize and copy to output buffer.
         nonlocal tensors, quantized_tensors, world_size, sync_stream

         with torch.cuda.stream(sync_stream):
+            fut.wait()
             # Dequantize the result back to the original precision
             fused_dequantize_from_fp8(tensors, quantized_tensors, world_size)
             return tensors

     fut = fut.then(callback)
-    return fut
+    return (work, fut)
```
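A hedged caller sketch for the new signature (the helper `quantized_sum` below is illustrative, not part of torchft; it assumes the caller already has a process group and input tensors set up, as in `collectives_test.py`):

```python
import torch
from torch.distributed import ReduceOp

from torchft.collectives import allreduce_quantized


def quantized_sum(tensors: list[torch.Tensor], pg) -> list[torch.Tensor]:
    # Kick off the quantized allreduce; nothing is waited on here, so the
    # communication can overlap other work.
    work, fut = allreduce_quantized(tensors, ReduceOp.SUM, pg)
    # ... overlap compute here ...
    # Waiting on the returned future also covers the dequantize-and-copy
    # callback, which now runs on the sync stream; callers that only need the
    # result can discard the handle with `_, fut = allreduce_quantized(...)`.
    return fut.wait()
```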

torchft/collectives_test.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -94,7 +94,7 @@ def _run_all_reduce_collective(
         )
     ]

-    fut = allreduce_quantized(tensors, reduce_op, pg)
+    _, fut = allreduce_quantized(tensors, reduce_op, pg)
     fut.wait()

     work = pg.allreduce([expected], reduce_op)
```

torchft/ddp.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -68,7 +68,8 @@ def __init__(self, manager: "Manager", module: nn.Module, **kwargs: object) -> N
         def _comm_hook(
             state: "Manager", bucket: dist.GradBucket
         ) -> torch.futures.Future[torch.Tensor]:
-            return state.allreduce(bucket.buffer())
+            _, fut = state.allreduce(bucket.buffer())
+            return fut


 class PureDistributedDataParallel(nn.Module):
```

torchft/ddp_test.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -39,14 +39,16 @@ def test_ddp(self) -> None:

         call_count = 0

-        def allreduce(tensor: torch.Tensor) -> Future[torch.Tensor]:
+        def allreduce(
+            tensor: torch.Tensor,
+        ) -> tuple[torch.Tensor, Future[torch.Tensor]]:
             nonlocal call_count

             call_count += 1

             fut = Future()  # pyre-fixme[29]: not a function
             fut.set_result(tensor)
-            return fut
+            return tensor, fut

         manager.allreduce = allreduce
```

torchft/local_sgd.py

Lines changed: 31 additions & 13 deletions
```diff
@@ -18,6 +18,7 @@
 import torch
 import torch.distributed as dist
 from torch import nn, optim
+from torch.distributed.distributed_c10d import Work
 from torch.distributed.tensor import DTensor
 from torch.nn.parameter import Parameter
 from torch.optim.optimizer import Optimizer

@@ -154,7 +155,8 @@ def _average(self) -> list[torch.Tensor]:
         for p in self._model.parameters():
             # Create a new tensor to store the averaged parameter
             avg_param = extract_local_tensor(p)
-            works.append(self._manager.allreduce(avg_param))
+            (work, fut) = self._manager.allreduce(avg_param)
+            works.append(fut)
             averaged_parameters.append(avg_param)
         for work in works:
             work.wait()

@@ -201,6 +203,7 @@ def __init__(

         # Stores pending all reduce
         self._allreduce_futures: list[torch.futures.Future[torch.Tensor]] = []
+        self._allreduce_work: list[Work] = []
         self._stream: Optional[torch.cuda.Stream] = (
             torch.cuda.Stream() if torch.cuda.is_available() else None
         )

@@ -377,6 +380,7 @@ def wait(self) -> None:
         self._stop_event = None

         self._allreduce_futures = []
+        self._allreduce_work = []

     @torch.profiler.record_function("torchft::local_sgd::prepare_sync")
     def prepare_sync(self) -> None:

@@ -399,13 +403,6 @@ def prepare_sync(self) -> None:
         ):
             self._average_grads()

-            for work in self._allreduce_futures:
-                work.wait()
-
-            if self._stream is not None:
-                self._stop_event = torch.cuda.Event()
-                self._stop_event.record()
-
     @torch.profiler.record_function("torchft::local_sgd::perform_sync")
     def perform_sync(self) -> bool:
         """

@@ -415,6 +412,21 @@ def perform_sync(self) -> bool:
         # Waiting for an allreduce before it has been sent is currently not supported.
         assert len(self._allreduce_futures) > 0

+        with (
+            torch.cuda.stream(self._stream)
+            if self._stream is not None
+            else nullcontext()
+        ):
+            for work in self._allreduce_work:
+                work.wait()
+
+            for fut in self._allreduce_futures:
+                fut.wait()
+
+            if self._stream is not None:
+                self._stop_event = torch.cuda.Event()
+                self._stop_event.record()
+
         self.wait()

         # save the parameters so they can be used for merging

@@ -464,10 +476,13 @@ def _allreduce_per_param(self) -> None:
         """Performs allreduce on each gradient tensor separately (original method)."""
         for name, p in self._model_fragment.named_parameters():
             # Perform allreduce on the pseudogradients
-            work = self._manager.allreduce(
+            (work, fut) = self._manager.allreduce(
                 self._grads[name], should_quantize=self.should_quantize
             )
-            self._allreduce_futures.append(work)
+            self._allreduce_futures.append(fut)
+
+            if work is not None:
+                self._allreduce_work.append(work)

     def _bucketize_and_allreduce(
         self,

@@ -508,7 +523,7 @@ def _bucketize_and_allreduce(
                 pack_offset += numel
                 flat_index += 1

-            work = self._manager.allreduce(
+            (work, fut) = self._manager.allreduce(
                 flat_buffer, should_quantize=self.should_quantize
             )

@@ -517,8 +532,11 @@ def callback(fut: torch.futures.Future[torch.Tensor]) -> None:
                 for t, pack_offset, numel in bucket_tensors:
                     t.copy_(flat_buffer[pack_offset : pack_offset + numel].view_as(t))

-            work = work.then(callback)
-            self._allreduce_futures.append(work)
+            fut = fut.then(callback)
+
+            self._allreduce_futures.append(fut)
+            if work is not None:
+                self._allreduce_work.append(work)

             offset += chunk_size
```
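For reference, the pattern introduced in `perform_sync` above reduces to the following self-contained sketch (illustrative only; `pending_work` and `pending_futs` are placeholder lists standing in for `_allreduce_work` and `_allreduce_futures`). The waits are issued under the side stream so that, with NCCL, synchronization attaches to that stream; with gloo the wait still blocks the CPU, but only here at sync time rather than in `prepare_sync`.

```python
from contextlib import nullcontext
from typing import Optional

import torch

# Placeholders for self._allreduce_work / self._allreduce_futures.
pending_work: list = []
pending_futs: list = []

stream: Optional[torch.cuda.Stream] = (
    torch.cuda.Stream() if torch.cuda.is_available() else None
)
stop_event: Optional[torch.cuda.Event] = None

with torch.cuda.stream(stream) if stream is not None else nullcontext():
    # Block on the pending collectives only now, when the averaged results
    # are actually needed.
    for work in pending_work:
        work.wait()
    for fut in pending_futs:
        fut.wait()

    if stream is not None:
        # Record an event so later code can wait on it from another stream
        # instead of synchronizing the whole device.
        stop_event = torch.cuda.Event()
        stop_event.record()
```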

torchft/local_sgd_test.py

Lines changed: 15 additions & 5 deletions
```diff
@@ -11,6 +11,7 @@
 import torch
 from parameterized import parameterized
 from torch import Tensor, nn, optim
+from torch.distributed.distributed_c10d import Work
 from torch.distributed.tensor import DTensor

 from torchft.local_sgd import DiLoCo, LocalSGD, extract_local_tensor

@@ -26,6 +27,15 @@ def create_manager() -> MagicMock:

     manager.errored.return_value = None

+    def mock_allreduce(
+        tensor: torch.Tensor, should_quantize: bool = False
+    ) -> tuple[Work | None, torch.futures.Future[Tensor]]:
+        fut = torch.futures.Future()  # pyre-fixme[29]: not a function
+        fut.set_result(tensor)
+        return (None, fut)
+
+    manager.allreduce.side_effect = mock_allreduce
+
     return manager

@@ -66,7 +76,7 @@ class LocalSGDTest(TestCase):
     def test_local_sgd_healthy(self) -> None:
         model = SimpleModel()
         optimizer = optim.SGD(model.parameters())
-        manager = create_autospec(Manager)
+        manager = create_manager()
         with LocalSGD(manager, model, optimizer, sync_every=2) as local_sgd:
             self.assertEqual(local_sgd._local_step, 0)
             inp = torch.rand(2, 3)

@@ -242,11 +252,11 @@ def test_bucketization_correctness(self) -> None:
         # Define fake allreduce: multiplies buffer by 2
         def fake_allreduce(
             tensor: Tensor, should_quantize: bool
-        ) -> torch.futures.Future[Tensor]:
+        ) -> tuple[Work | None, torch.futures.Future[Tensor]]:
             tensor.mul_(2)
             fut = torch.futures.Future()  # pyre-fixme[29]: not a function
             fut.set_result(tensor)
-            return fut
+            return (None, fut)

         manager.allreduce.side_effect = fake_allreduce

@@ -286,11 +296,11 @@ def test_gradient_correctness(self) -> None:
         # Define fake allreduce: multiplies buffer by 2
         def fake_allreduce(
             tensor: Tensor, should_quantize: bool
-        ) -> torch.futures.Future[Tensor]:
+        ) -> tuple[Work | None, torch.futures.Future[Tensor]]:
             tensor.mul_(2)
             fut = torch.futures.Future()  # pyre-fixme[29]: not a function
             fut.set_result(tensor)
-            return fut
+            return (None, fut)

         manager.allreduce.side_effect = fake_allreduce
```

torchft/manager.py

Lines changed: 9 additions & 8 deletions
```diff
@@ -26,6 +26,7 @@
 """

 import concurrent.futures
+import copy
 import logging
 import os
 import socket

@@ -39,7 +40,7 @@

 import torch
 from torch.distributed import ReduceOp, TCPStore
-from torch.distributed.distributed_c10d import AllreduceOptions, ReduceOp
+from torch.distributed.distributed_c10d import AllreduceOptions, ReduceOp, Work

 from torchft._torchft import ManagerClient, ManagerServer
 from torchft.checkpointing import CheckpointTransport, HTTPTransport

@@ -345,7 +346,7 @@ def shutdown(self, wait: bool = True) -> None:
     @torch.profiler.record_function("torchft::manager::allreduce")
     def allreduce(
         self, tensor: torch.Tensor, should_quantize: bool = False
-    ) -> torch.futures.Future[torch.Tensor]:
+    ) -> tuple[Work | None, torch.futures.Future[torch.Tensor]]:
         """
         Fault tolerant allreduce the tensor and return a Future that will be completed when
         the tensor is ready.

@@ -367,7 +368,7 @@ def allreduce(
         if self.errored():
             fut = torch.futures.Future()  # pyre-fixme[29]: not a function
             fut.set_result(tensor)
-            return fut
+            return (None, fut)

         self.wait_quorum()
         num_participants: int = self.num_participants()

@@ -380,12 +381,11 @@ def allreduce(
             # Run the allreduce async and save the work object so we can wait on
             # it later.
             if should_quantize and IS_TRITON_AVAILABLE:
-                fut = allreduce_quantized(
+                (work, fut) = allreduce_quantized(
                     [tensor], ReduceOp.SUM, self._pg, torch.cuda.current_stream()
                 )
             else:
                 work = self._pg.allreduce([tensor], ReduceOp.SUM)
-                work.wait()
                 fut = work.get_future()

             stream: Optional[torch.cuda.Stream] = (

@@ -403,6 +403,7 @@ def callback(
                 # change the stream to avoid making the callback stream
                 # dependent on process group stream running the allreduce
                 with torch.cuda.stream(stream) if stream is not None else nullcontext():
+                    fut.wait()
                     fut.value()
                     tensor /= num_participants

@@ -411,7 +412,7 @@ def callback(
             fut = fut.then(callback)

             fut = self.wrap_future(fut, tensor)
-            return fut
+            return (work, fut)

         except Exception as e:
             self._logger.exception(

@@ -421,7 +422,7 @@ def callback(

             fut = torch.futures.Future()  # pyre-fixme[29]: not a function
             fut.set_result(tensor)
-            return fut
+            return (None, fut)

     def report_error(self, e: Exception) -> None:
         """

@@ -646,7 +647,7 @@ def _async_quorum(
                 self._checkpoint_transport.send_checkpoint(
                     dst_ranks=quorum.recover_dst_replica_ranks,
                     step=max_step,
-                    state_dict=self._manager_state_dict(),
+                    state_dict=copy.deepcopy(self._manager_state_dict()),
                     timeout=self._timeout,
                 )
```
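The `copy.deepcopy` in `_async_quorum` guards against the live state dict being mutated while the checkpoint is still being sent. A minimal, self-contained illustration of the race it avoids (the toy `params` dict below is a stand-in, not torchft's actual state dict):

```python
import copy

import torch

params = {"w": torch.zeros(3)}  # stand-in for the manager's state dict

snapshot_shallow = dict(params)        # still aliases the same tensor
snapshot_deep = copy.deepcopy(params)  # independent copy

params["w"] += 1.0  # the replica moves on to the next step (in-place update)

print(snapshot_shallow["w"])  # tensor([1., 1., 1.]) -- reflects the mutation
print(snapshot_deep["w"])     # tensor([0., 0., 0.]) -- consistent snapshot
```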

torchft/manager_integ_test.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -634,7 +634,7 @@ def all_reduce_callback(

         manager.start_quorum()
         t1 = torch.ones((1, 3), device=device)
-        fut = manager.allreduce(t1)
+        (_, fut) = manager.allreduce(t1)
         fut.wait()
         return t1
     return None
```
