
fix stream dependencies in callbacks #246

Merged (2 commits) on Aug 1, 2025
2 changes: 2 additions & 0 deletions torchft/collectives.py
@@ -387,6 +387,8 @@ def callback(fut: Future[list[torch.Tensor]]) -> list[torch.Tensor]:
         nonlocal tensors, quantized_tensors, world_size, sync_stream
 
         with torch.cuda.stream(sync_stream):
+            # Setup stream dependency
+            fut.wait()
             # Dequantize the result back to the original precision
             fused_dequantize_from_fp8(tensors, quantized_tensors, world_size)
             return tensors
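
The hunk above relies on torch.futures.Future.wait() performing CUDA stream synchronization: when the future's value lives on GPU, wait() blocks the caller's current stream on the stream that produced that value. A minimal, self-contained sketch of the same pattern, assuming an initialized NCCL process group and a CUDA device; the names async_allreduce_with_callback, grad, and side_stream are illustrative and not torchft API:

import torch
import torch.distributed as dist


def async_allreduce_with_callback(
    grad: torch.Tensor, side_stream: torch.cuda.Stream
) -> torch.futures.Future:
    # Kick off an asynchronous allreduce; the result is produced on the
    # process group's internal stream, not on the caller's stream.
    fut = dist.all_reduce(grad, async_op=True).get_future()

    def callback(fut: torch.futures.Future) -> torch.Tensor:
        # Run post-processing on a dedicated side stream.
        with torch.cuda.stream(side_stream):
            # wait() makes side_stream wait on the stream that produced the
            # allreduce result, so the division below cannot run too early.
            fut.wait()
            grad.div_(dist.get_world_size())
        return grad

    return fut.then(callback)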
11 changes: 8 additions & 3 deletions torchft/local_sgd.py
@@ -513,9 +513,14 @@ def _bucketize_and_allreduce(
             )
 
             def callback(fut: torch.futures.Future[torch.Tensor]) -> None:
-                nonlocal bucket_tensors, flat_buffer
-                for t, pack_offset, numel in bucket_tensors:
-                    t.copy_(flat_buffer[pack_offset : pack_offset + numel].view_as(t))
+                with torch.cuda.stream(self._stream) if self._stream else nullcontext():
+                    nonlocal bucket_tensors, flat_buffer
+                    # Setup stream dependency
+                    fut.wait()
+                    for t, pack_offset, numel in bucket_tensors:
+                        t.copy_(
+                            flat_buffer[pack_offset : pack_offset + numel].view_as(t)
+                        )
 
             work = work.then(callback)
             self._allreduce_futures.append(work)
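
A small sketch of the conditional stream guard plus copy-back used above, assuming the guard is only needed when a CUDA stream is configured; the function name copy_back and its arguments are illustrative, not the torchft API:

from contextlib import nullcontext
from typing import List, Optional

import torch


def copy_back(
    fut: torch.futures.Future,
    flat_buffer: torch.Tensor,
    outputs: List[torch.Tensor],
    stream: Optional[torch.cuda.Stream],
) -> None:
    # Enter the configured CUDA stream when one is set; on CPU-only runs
    # nullcontext() turns the with-block into a no-op wrapper.
    with torch.cuda.stream(stream) if stream is not None else nullcontext():
        # Block this stream on the stream that filled flat_buffer.
        fut.wait()
        offset = 0
        for out in outputs:
            out.copy_(flat_buffer[offset : offset + out.numel()].view_as(out))
            offset += out.numel()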
2 changes: 2 additions & 0 deletions torchft/manager.py
@@ -403,6 +403,8 @@ def callback(
             # change the stream to avoid making the callback stream
             # dependent on process group stream running the allreduce
             with torch.cuda.stream(stream) if stream is not None else nullcontext():
+                # Setup stream dependency
+                fut.wait()
                 fut.value()
                 tensor /= num_participants
 
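
The distinction the added lines lean on: for a completed future, Future.value() returns the result (and re-raises any stored error) but does not synchronize the caller's CUDA stream with the stream that produced it, while Future.wait() does. A minimal sketch of an averaging callback built around that, assuming it is attached with fut.then(...); make_avg_callback and its arguments are illustrative names, not torchft API:

from contextlib import nullcontext
from typing import Optional

import torch


def make_avg_callback(
    tensor: torch.Tensor,
    num_participants: int,
    stream: Optional[torch.cuda.Stream],
):
    def callback(fut: torch.futures.Future) -> torch.Tensor:
        with torch.cuda.stream(stream) if stream is not None else nullcontext():
            # wait() adds the stream dependency; value() alone would not.
            fut.wait()
            fut.value()
            # Average in place; ordered after the allreduce on this stream.
            tensor.div_(num_participants)
        return tensor

    return callback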
9 changes: 4 additions & 5 deletions train_diloco.py
@@ -34,7 +34,7 @@
     ProcessGroupGloo,
     ProcessGroupNCCL,
 )
-from torchft.checkpointing.pg_transport import PGTransport
+from torchft.checkpointing.http_transport import HTTPTransport
 from torchft.local_sgd import DiLoCo
 
 logging.basicConfig(level=logging.INFO)
@@ -67,13 +67,12 @@ def state_dict():
         timeout=timedelta(seconds=10),
     )
     if torch.cuda.is_available() and USE_NCCL
-    else ProcessGroupGloo(timeout=timedelta(seconds=5))
+    else ProcessGroupGloo(timeout=timedelta(seconds=10))
 )
 
-transport = PGTransport(
-    pg,
+transport = HTTPTransport(
+    timeout=timedelta(seconds=10),
-    device=device,
     num_chunks=0,
 )
 
 manager = Manager(