Commit 871fa11

kaiyuan-li and dcci authored
Throw KeyError when getting partially pushed dtensor (#49)
* raise KeyError at key miss
* just raise key error
* just raise key error
* [torchstore] Rework the readme.
* Update to account for Lucas' comments.
* test
* sync
* test update
* Add partial DTensor commit detection with file-based sync
* Update README.md to match upstream/main and remove PR comment
* Simplify test_partial_put by removing sync primitives and using ranks_to_skip_put
* verify exists in test
* fmt

---------

Co-authored-by: Davide Italiano <[email protected]>
1 parent 49b6b7a commit 871fa11
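
In short: reading a key whose DTensor shards have not all been stored now fails loudly instead of returning an incomplete tensor. A minimal sketch of the user-visible behavior, assuming an initialized store in which only some ranks have finished their ts.put for "my_key" (the key name is illustrative):

    import torchstore as ts

    async def demo() -> None:
        # Assumes ts.initialize(...) has run and only some ranks have
        # completed ts.put("my_key", dtensor) for their shard.
        try:
            await ts.get("my_key")
        except KeyError as err:
            # e.g. "DTensor 'my_key' is only partially committed. ..."
            print(err)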

File tree

  tests/test_tensor_slice.py
  tests/utils.py
  torchstore/controller.py

3 files changed: +115 -4 lines

tests/test_tensor_slice.py

Lines changed: 49 additions & 2 deletions

@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import os
+import tempfile
 
 import pytest
 import torch
@@ -120,8 +121,6 @@ async def test_tensor_slice_inplace():
 @pytest.mark.asyncio
 async def test_put_dtensor_get_full_tensor():
     """Test basic DTensor put/get functionality with separate put and get meshes using shared DTensorActor"""
-    import tempfile
-
     await ts.initialize(num_storage_volumes=2, strategy=ts.LocalRankStrategy())
 
     original_tensor = torch.arange(16).reshape(4, 4).float()
@@ -151,5 +150,53 @@ async def test_put_dtensor_get_full_tensor():
     await ts.shutdown()
 
 
+@pytest.mark.asyncio
+async def test_partial_put():
+    """
+    Verify the behavior when a dtensor is partially put.
+    1. Create two put actors. Each of them should put half of a DTensor.
+    2. Rank 1 will skip the put operation (using ranks_to_skip_put=[1]).
+    3. After rank 0 completes its put, we call get() which should raise a KeyError
+       because the DTensor is not fully committed (only rank 0's shard is stored).
+    """
+
+    await ts.initialize(num_storage_volumes=2, strategy=ts.LocalRankStrategy())
+
+    original_tensor = torch.arange(16).reshape(4, 4).float()
+
+    with tempfile.TemporaryDirectory() as filesystem_store_dir:
+        try:
+            put_mesh = await spawn_actors(
+                2,
+                DTensorActor,
+                "dtensor_put_mesh",
+                mesh_shape=(2,),
+                original_tensor=original_tensor,
+                placements=[Shard(0)],
+                file_store_name=os.path.join(filesystem_store_dir, "put_test"),
+                visible_devices="0,1",
+                ranks_to_skip_put=[1],  # Rank 1 will skip the put
+            )
+
+            # Execute the put - rank 0 will put, rank 1 will skip
+            await put_mesh.do_put.call()
+
+            assert not await ts.exists("test_key")
+            # Try to get the tensor - should raise KeyError because only rank 0 has committed
+            with pytest.raises(KeyError) as exc_info:
+                await ts.get("test_key")
+
+            # Verify the error message mentions partial commit
+            assert "partially committed" in str(
+                exc_info.value
+            ), f"Error message should mention partial commit: {exc_info.value}"
+
+        finally:
+            # Clean up process groups
+            await put_mesh.destroy_process_group.call()
+            await put_mesh._proc_mesh.stop()
+            await ts.shutdown()
+
+
 if __name__ == "__main__":
     main(__file__)
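
Not part of this diff, but for readers of test_partial_put: because a partial commit now surfaces as a KeyError, a consumer can treat that error as "not ready yet" and poll. A hypothetical helper sketch (read_when_ready is illustrative, not a torchstore API; only ts.get comes from this codebase):

    import asyncio

    import torchstore as ts

    async def read_when_ready(key: str, retries: int = 10, delay: float = 0.5):
        for _ in range(retries):
            try:
                # Raises KeyError while the key is missing or only partially committed
                return await ts.get(key)
            except KeyError:
                await asyncio.sleep(delay)
        raise TimeoutError(f"{key} was never fully committed")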

tests/utils.py

Lines changed: 11 additions & 0 deletions

@@ -9,6 +9,8 @@
 from itertools import product
 from logging import getLogger
 
+from typing import List
+
 import pytest
 import torch
 import torchstore as ts
@@ -53,13 +55,17 @@ def __init__(
         placements,
         file_store_name,
         visible_devices="0,1,2,3,4,5,6,7",
+        ranks_to_skip_put: (
+            List[int] | None
+        ) = None,  # ranks that should skip put operation
     ):
         self.rank = current_rank().rank
         self.mesh_shape = mesh_shape
         self.world_size = math.prod(mesh_shape)
         self.original_tensor = original_tensor
         self.placements = placements
         self.file_store_name = file_store_name
+        self.ranks_to_skip_put = ranks_to_skip_put or []
 
         # torchstore will fail without this (see LocalRankStrategy)
         os.environ["LOCAL_RANK"] = str(self.rank)
@@ -95,6 +101,11 @@ async def do_put(self):
         tensor = self.original_tensor.to("cpu")
         dtensor = distribute_tensor(tensor, device_mesh, placements=self.placements)
 
+        # Skip put if this rank is in the skip list
+        if self.rank in self.ranks_to_skip_put:
+            self.rlog(f"Skipping put for rank {self.rank}")
+            return
+
         self.rlog(f"calling put with {dtensor=}")
         await ts.put(self.shared_key, dtensor)
 
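A note on the ranks_to_skip_put default above: the parameter is typed List[int] | None and normalized with `or []` because a mutable default like ranks_to_skip_put=[] would be a single list shared across every instance. A standalone sketch of the idiom (Example is hypothetical, for illustration only):

    from typing import List

    class Example:
        def __init__(self, ranks_to_skip_put: List[int] | None = None):
            # Normalize per instance; a literal [] default would be shared
            self.ranks_to_skip_put = ranks_to_skip_put or []

        def should_skip(self, rank: int) -> bool:
            # Mirrors the guard added to DTensorActor.do_put
            return rank in self.ranks_to_skip_put

    assert Example(ranks_to_skip_put=[1]).should_skip(1)
    assert not Example().should_skip(0)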
torchstore/controller.py

Lines changed: 55 additions & 2 deletions

@@ -6,6 +6,7 @@
 
 from dataclasses import dataclass, field
 from enum import auto, Enum
+from itertools import product
 from typing import Dict, List, Mapping, Optional, Set
 
 from monarch.actor import Actor, endpoint
@@ -61,6 +62,46 @@ def assert_initialized(self) -> None:
             self.is_initialized
         ), "Please call torchstore.initialize before attempting to use store."
 
+    def _is_dtensor_fully_committed(
+        self, key: str, volume_map: Dict[str, StorageInfo]
+    ) -> bool:
+        """
+        Check if all shards of a DTensor have been committed.
+
+        For a DTensor to be fully committed, we need all coordinates in the mesh
+        to have been stored. The mesh_shape tells us the total number of shards,
+        and coordinates tell us which shards we have.
+
+        Args:
+            key (str): The key to check.
+            volume_map (Dict[str, StorageInfo]): Mapping from storage volume IDs to StorageInfo.
+
+        Returns:
+            bool: True if fully committed, False if partial.
+        """
+        # Collect all tensor slices across all storage volumes
+        all_slices = set()
+        mesh_shape = None
+
+        for storage_info in volume_map.values():
+            if storage_info.object_type != ObjectType.TENSOR_SLICE:
+                return True  # Not a DTensor, so it's "fully committed"
+
+            for tensor_slice in storage_info.tensor_slices:
+                all_slices.add(tensor_slice.coordinates)
+                if mesh_shape is None:
+                    mesh_shape = tensor_slice.mesh_shape
+                else:
+                    assert (
+                        mesh_shape == tensor_slice.mesh_shape
+                    ), "Inconsistent mesh shapes in stored slices"
+
+        # Generate all expected coordinates for the mesh
+        expected_coords = set(product(*(range(s) for s in mesh_shape)))
+
+        # Check if we have all coordinates
+        return all_slices == expected_coords
+
     @endpoint
     async def init(
         self,
@@ -116,13 +157,25 @@ def locate_volumes(
            objects containing metadata about the stored data shards.
 
        Raises:
-            KeyError: If the key is not found in any storage volumes.
+            KeyError: If the key is not found in any storage volumes, or if the key
+                is a DTensor that is only partially committed.
        """
        self.assert_initialized()
 
        if key not in self.keys_to_storage_volumes:
            raise KeyError(f"Unable to locate {key} in any storage volumes.")
-        return self.keys_to_storage_volumes[key]
+
+        volume_map = self.keys_to_storage_volumes[key]
+
+        # Check if this is a DTensor and if it's fully committed
+        if not self._is_dtensor_fully_committed(key, volume_map):
+            raise KeyError(
+                f"DTensor '{key}' is only partially committed. "
+                f"Not all shards have been stored yet. "
+                f"Please ensure all ranks complete their put() operations."
+            )
+
+        return volume_map
 
    @endpoint
    def notify_put(self, key: str, request: Request, storage_volume_id: str) -> None:
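
To make the completeness check concrete: _is_dtensor_fully_committed treats a DTensor as fully committed exactly when the stored shard coordinates equal the full Cartesian product of the mesh axes. A self-contained sketch of that accounting (TensorSlice here is a hypothetical stand-in for torchstore's real slice metadata):

    from dataclasses import dataclass
    from itertools import product
    from typing import Set, Tuple

    @dataclass(frozen=True)
    class TensorSlice:
        coordinates: Tuple[int, ...]
        mesh_shape: Tuple[int, ...]

    def is_fully_committed(slices: Set[TensorSlice]) -> bool:
        mesh_shapes = {s.mesh_shape for s in slices}
        assert len(mesh_shapes) == 1, "Inconsistent mesh shapes in stored slices"
        (mesh_shape,) = mesh_shapes
        expected = set(product(*(range(dim) for dim in mesh_shape)))
        return {s.coordinates for s in slices} == expected

    # A 1-D mesh split into two shards: only rank 0's shard present -> partial
    assert not is_fully_committed({TensorSlice((0,), (2,))})
    assert is_fully_committed({TensorSlice((0,), (2,)), TensorSlice((1,), (2,))})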
