import logging
import math
import threading
+ from contextlib import nullcontext
from types import TracebackType
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type
@@ -197,9 +198,10 @@ def __init__(
        self._outer_optimizer = outer_optimizer

        # Stores pending all reduce
-         self._allreduce_futures: list[
-             torch.futures.Future[None] | torch.futures.Future[torch.Tensor]
-         ] = []
+         self._allreduce_futures: list[torch.futures.Future[torch.Tensor]] = []
+         self._stream: Optional[torch.cuda.Stream] = (
+             torch.cuda.Stream() if torch.cuda.is_available() else None
+         )

        if bucket_cap_mb is not None:
            self.bucket_cap_mb = int(bucket_cap_mb * 1024 * 1024)
@@ -222,13 +224,15 @@ def __init__(
                t = t.pin_memory()
            self.original_parameters[name] = t

+     @torch.profiler.record_function("torchft::local_sgd::save_parameters")
    def save_parameters(self) -> None:
        with torch.no_grad():
            # TODO: consider running copy on a separate stream
            for name, p in self._model_fragment.named_parameters():
                param_to_local = extract_local_tensor(p.data)
                self.original_parameters[name].copy_(param_to_local, non_blocking=True)

+     @torch.profiler.record_function("torchft::local_sgd::restore_parameters")
    def restore_parameters(self) -> None:
        with torch.no_grad():
            # TODO: consider running copy on a separate stream
@@ -248,6 +252,7 @@ def restore_parameters(self) -> None:
                else:
                    p.data.copy_(self.original_parameters[name], non_blocking=False)

+     @torch.profiler.record_function("torchft::local_sgd::wait")
    def wait(self) -> None:
        """
        Waits for the previously scheduled allreduce to finish
@@ -256,6 +261,9 @@ def wait(self) -> None:
        for work in self._allreduce_futures:
            work.wait()

+         if self._stream is not None:
+             self._stream.synchronize()
+
        self._allreduce_futures = []

    def should_prepare_fragment(self, step: int) -> bool:
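To make the new stream bookkeeping concrete, here is a minimal, self-contained sketch (not torchft's implementation; the class and attribute names are illustrative) of the pattern `wait()` follows after this change: drain the pending allreduce futures, then synchronize the optional side stream before touching the parameters.

```python
from typing import List, Optional

import torch


class PendingAllreduces:
    """Illustrative stand-in for the fragment's future/stream bookkeeping."""

    def __init__(self) -> None:
        self.futures: List[torch.futures.Future] = []
        # Only allocate a side stream when CUDA is actually available.
        self.stream: Optional[torch.cuda.Stream] = (
            torch.cuda.Stream() if torch.cuda.is_available() else None
        )

    def wait(self) -> None:
        # Block until every scheduled allreduce has produced a result.
        for fut in self.futures:
            fut.wait()
        # Work queued on the side stream (e.g. pseudogradient math) must
        # also finish before the parameters are read or updated.
        if self.stream is not None:
            self.stream.synchronize()
        self.futures = []
```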
@@ -272,22 +280,31 @@ def should_sync_fragment(self, step: int) -> bool:
        step_to_sync = step - self._fragment_sync_offset - self._fragment_sync_delay
        return step_to_sync % self._sync_every == 0

+     @torch.profiler.record_function("torchft::local_sgd::prepare_sync")
    def prepare_sync(self) -> None:
        """
        Calculates the pseudogradients, averages them across the manager group, and
        starts an allreduce on the pseudogradients without waiting for it to finish.
        """
-         # Set the .grad field of each parameter to its pseudogradient
-         for name, p in self._model_fragment.named_parameters():
-             local_param = extract_local_tensor(p.data)
-             pseudogradient = local_param - self.original_parameters[name].to(p.device)
-             if isinstance(p, DTensor):
-                 p.grad._local_tensor = pseudogradient
-             else:
-                 p.grad = pseudogradient
+         with (
+             torch.cuda.stream(self._stream)
+             if self._stream is not None
+             else nullcontext()
+         ):
+             # Set the .grad field of each parameter to its pseudogradient
+             for name, p in self._model_fragment.named_parameters():
+                 local_param = extract_local_tensor(p.data)
+                 pseudogradient = local_param - self.original_parameters[name].to(
+                     p.device
+                 )
+                 if isinstance(p, DTensor):
+                     p.grad._local_tensor = pseudogradient
+                 else:
+                     p.grad = pseudogradient

-         self._average_grads()
+             self._average_grads()

+     @torch.profiler.record_function("torchft::local_sgd::perform_sync")
    def perform_sync(self) -> bool:
        """
        Overrides the sync method to wait for the scheduled allreduce to finish and
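The new `prepare_sync()` body boils down to the pattern sketched below: compute the pseudogradients under a side CUDA stream when one is available, otherwise fall back to `nullcontext`. The sketch is illustrative only; it uses plain tensors (no DTensor, no manager allreduce), and `compute_pseudogradients` and its arguments are hypothetical names, not torchft's API.

```python
from contextlib import nullcontext
from typing import Dict, Optional

import torch


def compute_pseudogradients(
    model: torch.nn.Module,
    snapshot: Dict[str, torch.Tensor],
    stream: Optional[torch.cuda.Stream],
) -> None:
    # Run on the side stream if we have one, otherwise a no-op context.
    ctx = torch.cuda.stream(stream) if stream is not None else nullcontext()
    with ctx:
        for name, p in model.named_parameters():
            # Pseudogradient = current local weights minus the snapshot taken
            # before the inner steps; stored in .grad so an outer optimizer
            # can apply it later.
            p.grad = p.data - snapshot[name].to(p.device)
        # The real code would launch an (async) allreduce of the .grad
        # tensors here, on the same stream.


# Example usage: snapshot, train locally (omitted), then derive pseudogradients.
model = torch.nn.Linear(4, 4)
snapshot = {n: p.detach().clone() for n, p in model.named_parameters()}
stream = torch.cuda.Stream() if torch.cuda.is_available() else None
compute_pseudogradients(model, snapshot, stream)
```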
@@ -467,16 +484,6 @@ def __init__(
        if fragment_update_alpha < 0 or fragment_update_alpha > 1:
            raise ValueError("fragment_update_alpha must be between 0 and 1")

-         # TODO: Support multiple fragments
-         # This requires changing the manager to support `should_commit` for each
-         # fragment separately.
-         if len(model_fragments) != 1:
-             raise ValueError("Multiple fragments are not supported yet")
-
-         # TODO: Support `fragment_sync_delay`
-         if fragment_sync_delay != 0:
-             raise ValueError("Fragment synchronization delay is not supported yet")
-
        # TODO: Support `fragment_update_alpha`
        if fragment_update_alpha != 0.0:
            raise ValueError(
@@ -522,6 +529,8 @@ def __init__(
                use_bucketization,
                bucket_cap_mb,
                should_quantize,
+                 fragment_sync_delay,
+                 fragment_update_alpha,
            )
            for i, model_fragment in enumerate(model_fragments)
        ]
@@ -606,16 +615,20 @@ def _step_post_hook(
        step = self._local_step

        # Start sending fragments
-         for fragment in self._fragments:
+         for i, fragment in enumerate(self._fragments):
            if not fragment.should_prepare_fragment(step):
                continue

+             logger.debug(f"preparing fragment {i} at step {step}")
+
            fragment.prepare_sync()

-         for fragment in self._fragments:
+         for i, fragment in enumerate(self._fragments):
            if not fragment.should_sync_fragment(step):
                continue

+             logger.debug(f"syncing fragment {i} at step {step}")
+
            if not fragment.perform_sync():
                # Cancel all the previously scheduled allreduce by simply
                # waiting for them. They should have failed but lets be
@@ -655,3 +668,17 @@ def _step_post_hook(
        # training data by looping here. Otherwise that training data goes to
        # waste after recovery
        self._quorum_loop()
+
+         # We need to make sure `_local_step` is still
+         # the same across all replicas if `quorum_id` changed.
+         #
+         # We can't guarantee that a majority of replicas in this new quorum
+         # has the latest `max_step`.
+         #
+         # TODO: This guarantee is currently lacking
+         # in torchft unless `shrink_only` is set.
+         #
+         # After the quorum though, everyone will have the same
+         # `local_step` because replicas with the chosen
+         # `max_step` will have the same `local_step`. That is
+         # because we don't take additional steps after commit.
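Taken together, the `_step_post_hook` hunks above implement a two-phase loop: every fragment that is due first starts its (async) sync, and only then do we block on the ones whose results must land at this step. The sketch below summarizes that control flow; `Fragment` is a stand-in protocol built from the method names in this file, not torchft's actual class, and `step_post_hook` is a hypothetical free function.

```python
import logging
from typing import Protocol, Sequence

logger = logging.getLogger(__name__)


class Fragment(Protocol):
    def should_prepare_fragment(self, step: int) -> bool: ...
    def prepare_sync(self) -> None: ...
    def should_sync_fragment(self, step: int) -> bool: ...
    def perform_sync(self) -> bool: ...


def step_post_hook(fragments: Sequence[Fragment], step: int) -> None:
    # Phase 1: start the (async) allreduce for every fragment that is due.
    for i, fragment in enumerate(fragments):
        if fragment.should_prepare_fragment(step):
            logger.debug(f"preparing fragment {i} at step {step}")
            fragment.prepare_sync()

    # Phase 2: wait for and apply the syncs that must land at this step.
    for i, fragment in enumerate(fragments):
        if fragment.should_sync_fragment(step):
            logger.debug(f"syncing fragment {i} at step {step}")
            if not fragment.perform_sync():
                # A failed commit means the pending results are discarded;
                # recovery is handled by the caller.
                return
```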