@@ -65,6 +65,7 @@ def dummy_init_pg() -> None:
 def _test_pg(
     pg: ProcessGroup,
     example_tensor: torch.Tensor = torch.randn((2, 3), dtype=torch.float32),
+    skip: list[str] = [],
 ) -> Dict[str, dist._Work]:
     """
     Helper function to test a set of collective operations on a given process group.
@@ -124,6 +125,8 @@ def check_tensors(arg: Any) -> None: # pyre-ignore[2]
     works: Dict[str, dist._Work] = {}

     for coll_str, args in collectives:
+        if coll_str in skip:
+            continue
         try:
             coll = getattr(pg, coll_str)
             work = coll(*args)
@@ -496,7 +499,12 @@ def run_reduce_scatter_tensor_coalesced_test(


 class ProcessGroupTest(TestCase):
-    def test_gloo_apis(self) -> None:
+    @parameterized.expand(["cpu", "cuda"])
+    def test_gloo_apis(self, device: str) -> None:
+        if device == "cuda" and not torch.cuda.is_available():
+            self.skipTest("CUDA is not available")
+            return
+
         store = TCPStore(
             host_name="localhost", port=0, is_master=True, wait_for_workers=False
         )
@@ -507,11 +515,23 @@ def test_gloo_apis(self) -> None:

         self.assertEqual(pg.size(), 1)

-        _test_pg(pg)
+        _test_pg(
+            pg,
+            torch.tensor([2], device=device),
+            skip=(
+                # https://github.com/pytorch/pytorch/issues/152645
+                [
+                    "allreduce_coalesced",
+                    "allgather_into_tensor_coalesced",
+                ]
+                if device == "cuda"
+                else []
+            ),
+        )

-        m = nn.Linear(3, 4)
+        m = nn.Linear(3, 4).to(device)
         m = torch.nn.parallel.DistributedDataParallel(m, process_group=pg)
-        m(torch.rand(2, 3))
+        m(torch.rand(2, 3, device=device))

     def test_gloo_timeout(self) -> None:
         store = TCPStore(
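
For context, the new `skip` argument follows a simple skip-list pattern: `_test_pg` iterates over named collectives, looks each one up on the process group, and now bypasses any name listed in `skip` (here, the coalesced collectives that are broken on CUDA per pytorch/pytorch#152645). Below is a minimal, self-contained sketch of that pattern; the helper name `run_collectives` and its `collectives` argument are illustrative and not part of the original test file.

```python
# Sketch only, assuming a process group `pg` is already constructed elsewhere.
from __future__ import annotations

from typing import Any, Dict, Sequence, Tuple

import torch.distributed as dist


def run_collectives(
    pg: dist.ProcessGroup,
    collectives: Sequence[Tuple[str, Tuple[Any, ...]]],
    skip: Sequence[str] = (),
) -> Dict[str, dist._Work]:
    """Invoke each named collective on ``pg`` unless its name is in ``skip``."""
    works: Dict[str, dist._Work] = {}
    for coll_str, args in collectives:
        if coll_str in skip:
            # Skip collectives known to be broken on this device, e.g.
            # "allreduce_coalesced" on CUDA (pytorch/pytorch#152645).
            continue
        coll = getattr(pg, coll_str)  # e.g. pg.allreduce, pg.broadcast, ...
        works[coll_str] = coll(*args)
    return works
```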