use block_current_stream work api

tushar00jain · tushar00jain · commit 3c9fd6970836 · 2025-07-22T14:21:54.000-07:00
Summary: Use `block_current_stream` instead of `wait` on the allreduce work handle, to avoid blocking the CPU when using Gloo.

Test Plan:

## Before
<img width="1285" height="667" alt="image" src="https://github.com/user-attachments/assets/082b55ce-efac-46f8-adba-df92ffec864f" />

## After
<img width="1283" height="662" alt="image" src="https://github.com/user-attachments/assets/923bcc7b-740a-43d8-baed-19e3d77a7889" />
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
@@ -27,6 +27,7 @@ jobs:
           lintrunner init
 
           pip install .[dev] -v
+          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
       - name: Run lintrunner
         run: |
           set -eux
diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml
@@ -15,11 +15,7 @@ jobs:
           - runs-on: "linux.2xlarge"
             gpu-arch-type: "cpu"
             gpu-arch-version: ""
-            torch-version: "stable"
-          - runs-on: "linux.g5.12xlarge.nvidia.gpu"
-            gpu-arch-type: "cuda"
-            gpu-arch-version: "12.4"
-            torch-version: "stable"
+            torch-version: "nigthly"
           - runs-on: "linux.g5.12xlarge.nvidia.gpu"
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.4"
diff --git a/torchft/manager.py b/torchft/manager.py
@@ -373,7 +373,7 @@ def allreduce(
                 )
             else:
                 work = self._pg.allreduce([tensor], ReduceOp.SUM)
-                work.wait()
+                work.block_current_stream()
                 fut = work.get_future()
 
             stream: Optional[torch.cuda.Stream] = (

Original file line number	Diff line number	Diff line change
`@@ -373,7 +373,7 @@ def allreduce(`
`373`	`373`	`)`
`374`	`374`	`else:`
`375`	`375`	`work = self._pg.allreduce([tensor], ReduceOp.SUM)`
`376`		`- work.wait()`
	`376`	`+ work.block_current_stream()`
`377`	`377`	`fut = work.get_future()`
`378`	`378`
`379`	`379`	`stream: Optional[torch.cuda.Stream] = (`