Option 2: call work.wait() inside the future callback

tushar00jain · tushar00jain · commit 92ad240bb29f · 2025-07-25T21:29:41.000-07:00
diff --git a/torchft/manager.py b/torchft/manager.py
@@ -397,12 +397,13 @@ def allreduce(self, tensor: torch.Tensor, should_quantize: bool = False) -> Work
             def callback(
                 fut: torch.futures.Future[List[torch.Tensor]],
             ) -> torch.Tensor:
-                nonlocal tensor, stream, num_participants
+                nonlocal tensor, stream, num_participants, work
 
                 # change the stream to avoid making the callback stream
                 # dependent on process group stream running the allreduce
                 with torch.cuda.stream(stream) if stream is not None else nullcontext():
                     # Setup stream dependency
+                    work.wait()
                     fut.wait()
                     fut.value()
                     tensor /= num_participants