pytorch · tushar00jain · Jul 24, 2025 · Jul 24, 2025
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
@@ -27,6 +27,7 @@ jobs:
           lintrunner init
 
           pip install .[dev] -v
+          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
       - name: Run lintrunner
         run: |
           set -eux

diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml
@@ -15,11 +15,7 @@ jobs:
           - runs-on: "linux.2xlarge"
             gpu-arch-type: "cpu"
             gpu-arch-version: ""
-            torch-version: "stable"
-          - runs-on: "linux.g5.12xlarge.nvidia.gpu"
-            gpu-arch-type: "cuda"
-            gpu-arch-version: "12.4"
-            torch-version: "stable"
+            torch-version: "nightly"
           - runs-on: "linux.g5.12xlarge.nvidia.gpu"
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.4"

diff --git a/torchft/local_sgd.py b/torchft/local_sgd.py
@@ -399,13 +399,6 @@ def prepare_sync(self) -> None:
         ):
             self._average_grads()
 
-            for work in self._allreduce_futures:
-                work.wait()
-
-            if self._stream is not None:
-                self._stop_event = torch.cuda.Event()
-                self._stop_event.record()
-
     @torch.profiler.record_function("torchft::local_sgd::perform_sync")
     def perform_sync(self) -> bool:
         """
@@ -415,6 +408,18 @@ def perform_sync(self) -> bool:
         # Waiting for an allreduce before it has been sent is currently not supported.
         assert len(self._allreduce_futures) > 0
 
+        with (
+            torch.cuda.stream(self._stream)
+            if self._stream is not None
+            else nullcontext()
+        ):
+            for work in self._allreduce_futures:
+                work.wait()
+
+            if self._stream is not None:
+                self._stop_event = torch.cuda.Event()
+                self._stop_event.record()
+
         self.wait()
 
         # save the parameters so they can be used for merging

diff --git a/torchft/manager.py b/torchft/manager.py
@@ -385,7 +385,7 @@ def allreduce(
                 )
             else:
                 work = self._pg.allreduce([tensor], ReduceOp.SUM)
-                work.wait()
+                work.block_current_stream()
                 fut = work.get_future()
 
             stream: Optional[torch.cuda.Stream] = (

diff --git a/train_diloco.py b/train_diloco.py
@@ -34,7 +34,7 @@
     ProcessGroupGloo,
     ProcessGroupNCCL,
 )
-from torchft.checkpointing.pg_transport import PGTransport
+from torchft.checkpointing.http_transport import HTTPTransport
 from torchft.local_sgd import DiLoCo
 
 logging.basicConfig(level=logging.INFO)
@@ -67,13 +67,12 @@ def state_dict():
             timeout=timedelta(seconds=10),
         )
         if torch.cuda.is_available() and USE_NCCL
-        else ProcessGroupGloo(timeout=timedelta(seconds=5))
+        else ProcessGroupGloo(timeout=timedelta(seconds=10))
     )
 
-    transport = PGTransport(
-        pg,
+    transport = HTTPTransport(
         timeout=timedelta(seconds=10),
-        device=device,
+        num_chunks=0,
     )
 
     manager = Manager(