
Commit a19a0e3

fix infinite recovery

Summary:
- We don't increase `max_step` when a node is catching up, because we never call `should_commit` on that path.
- This can leave the node permanently behind, putting it into an infinite recovery loop.
- Fix: simply call `should_commit` there as well.
- Note: this can result in the global parameters falling out of sync; the diff includes an RFC on how to fix that.
1 parent 77b6330 commit a19a0e3
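
In code terms, the fix lands in the early-return path of `perform_sync`. A condensed sketch distilled from the diff below (class context omitted):

    def perform_sync(self) -> bool:
        if len(self._allreduce_futures) == 0:
            # Previously this was `return True`: a catching-up node never
            # called should_commit(), so its max_step never advanced and
            # it stayed behind forever, re-entering recovery every step.
            # Going through should_commit() advances max_step and breaks
            # the loop.
            return self._manager.should_commit()
        ...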

File tree

1 file changed: +37 -1 lines changed

torchft/local_sgd.py

Lines changed: 37 additions & 1 deletion
@@ -351,13 +351,39 @@ def perform_sync(self) -> bool:
         steps using the outer optimizer.
         """
         if len(self._allreduce_futures) == 0:
-            return True
+            assert self._fragment_sync_delay > 0
+            # This can happen when using `fragment_sync_delay`. The node
+            # might not have participated in the sync of this fragment.
+            #
+            # The allreduce for the nodes that did participate might
+            # actually succeed, and in that case we shouldn't allow
+            # recovery from this node.
+            #
+            # We do need to increase the `max_step` here so we
+            # don't end up in an infinite loop of needing to recover.
+            #
+            # TODO: We can add an `is_catching_up` flag to the state_dict
+            # to disallow recoveries from this node. Such nodes can
+            # be excluded from the `max_step` calculation unless all
+            # nodes are catching up.
+            return self._manager.should_commit()

         self.wait()

         # Restore the parameters back to the previous state
         self.restore_parameters()

+        # This can return success even if the allreduce failed, because
+        # the process group could have been reconfigured while the
+        # allreduce was in flight. The in-flight allreduce may or may
+        # not have been aborted.
+        #
+        # We consider it successful anyway.
+        #
+        # TODO: We can track errors per allreduce to
+        # let the commit fail here. But this has the downside of
+        # reconfiguring the pg too many times, resulting in
+        # more aborts and more commit failures.
         should_commit = self._manager.should_commit()

         if should_commit:
@@ -702,6 +728,16 @@ def _step_post_hook(
         # waste after recovery
         self._quorum_loop()

+        # TODO: Since we do quorum after commit, there might be a big gap until
+        # the next allreduce. This increases the chances of nodes failing,
+        # and so of the allreduce failing.
+        # - We could maybe do a quorum again right before preparing a fragment
+        #   using `shrink_only`. This might make it tricky for new nodes to join
+        #   though.
+        # - Maintain a sequence number in the state dict that gets bumped at every
+        #   quorum call. Then we can do a quorum right before the allreduce and avoid
+        #   doing quorums after commit.
+
         # We need to make sure `_local_step` is still
         # the same across all replicas if `quorum_id` changed.
         #
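
The first hunk's TODO proposes an `is_catching_up` flag. A minimal sketch of what that could look like, assuming a dict-based state_dict and a max-over-steps quorum calculation; the key name, helper functions, and filtering rule are illustrative, not part of this commit:

    from typing import Any

    def make_state_dict(local_step: int, participated_in_sync: bool) -> dict[str, Any]:
        # Hypothetical: a node that skipped this fragment's sync advertises
        # itself as catching up so peers don't recover from its stale weights.
        return {"step": local_step, "is_catching_up": not participated_in_sync}

    def compute_max_step(states: list[dict[str, Any]]) -> int:
        # Hypothetical: exclude catching-up nodes from the max_step
        # calculation unless all nodes are catching up (otherwise there
        # would be no candidates left).
        eligible = [s for s in states if not s["is_catching_up"]] or states
        return max(s["step"] for s in eligible)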
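
The second TODO in that hunk, tracking errors per allreduce so the commit can fail, could look roughly like this; the class and method names are invented for illustration:

    class AllreduceErrorTracker:
        # Hypothetical: record failures of individual allreduces so the
        # commit can fail instead of treating them as successful.
        def __init__(self) -> None:
            self._errors: list[Exception] = []

        def record(self, exc: Exception) -> None:
            self._errors.append(exc)

        def commit_ok(self) -> bool:
            # Fail the commit if any allreduce recorded an error, then reset.
            ok = not self._errors
            self._errors.clear()
            return ok

As the TODO notes, the downside is that failing commits triggers more process-group reconfigurations, which in turn cause more aborts and more commit failures.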
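
For the quorum-timing TODO in `_step_post_hook`, a rough sketch of the sequence-number idea; everything here is hypothetical, including the assumption that `start_quorum` is the manager's quorum entry point:

    class FragmentSyncer:
        # Hypothetical sketch of the TODO, not the commit's code.
        def __init__(self, manager) -> None:
            self._manager = manager
            self._quorum_seq = 0  # bumped on every quorum call

        def _quorum(self) -> None:
            self._manager.start_quorum()
            self._quorum_seq += 1

        def state_dict(self) -> dict:
            # Replicas compare quorum_seq during recovery to tell whether
            # they observed the same quorum round.
            return {"quorum_seq": self._quorum_seq}

        def sync(self, fragment) -> None:
            # Run the quorum right before the allreduce instead of after
            # commit, shrinking the gap in which node failures can break
            # the in-flight allreduce.
            self._quorum()
            fragment.allreduce()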
