Skip to content

Commit 10686e2

Browse files
Fix compatibility between data.py and train_ddp.py with replica rank terminology (meta-pytorch#187)
1 parent 4f5837b commit 10686e2

File tree

2 files changed

+3
-3
lines changed

torchft/data.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class DistributedSampler(data.distributed.DistributedSampler):
4646
def __init__(
4747
self,
4848
dataset: data.Dataset,
49-
replica_rank: int,
49+
replica_group_id: int,
5050
num_replica_groups: int,
5151
group_rank: Optional[int] = None,
5252
num_replicas: Optional[int] = None,
@@ -65,7 +65,7 @@ def __init__(
6565
if num_replicas is None:
6666
num_replicas = dist.get_world_size()
6767

68-
self.global_rank: int = group_rank + num_replicas * replica_rank
68+
self.global_rank: int = group_rank + num_replicas * replica_group_id
6969
self.global_world_size: int = num_replicas * num_replica_groups
7070

7171
super().__init__(

train_ddp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def main() -> None:
5151
# majority of groups will be available so few batches will be dropped.
5252
sampler = DistributedSampler(
5353
trainset,
54-
replica_group=REPLICA_GROUP_ID,
54+
replica_group_id=REPLICA_GROUP_ID,
5555
num_replica_groups=NUM_REPLICA_GROUPS,
5656
group_rank=0,
5757
# for DDP we can use replica groups of size 1, FSDP/PP/CP would need more.

0 commit comments

Comments (0)