Monarch V1 Support. Necessary for direct actor-to-actor communication (#56)

LucasLLC · web-flow · commit 662299faf4fd · 2025-10-13T18:03:00.000-04:00
* latest rdma updates from monarch

* remove test code

* remove test code

* working v1

* removing test code

* v1

* add v1 gate

* nits

* linter
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -4,6 +4,16 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from torchstore.constants import MONARCH_HOSTMESH_V1
+
+if MONARCH_HOSTMESH_V1:
+    from monarch._rust_bindings.monarch_hyperactor.channel import ChannelTransport
+    from monarch._rust_bindings.monarch_hyperactor.config import configure
+
+    configure(
+        default_transport=ChannelTransport.MetaTlsWithHostname,
+    )
+
 import math
 import os
 import tempfile
diff --git a/torchstore/api.py b/torchstore/api.py
@@ -8,10 +8,10 @@
 
 import torch
 
-from monarch.actor import get_or_spawn_controller
-
 import torchstore.state_dict_utils
 from torchstore.client import LocalClient
+
+from torchstore.constants import MONARCH_HOSTMESH_V1
 from torchstore.controller import Controller
 from torchstore.storage_volume import StorageVolume
 from torchstore.strategy import (
@@ -21,6 +21,11 @@
 )
 from torchstore.transport.pipe import TensorSlice
 
+if MONARCH_HOSTMESH_V1:
+    from monarch._src.actor.v1.proc_mesh import get_or_spawn_controller
+else:
+    from monarch.actor import get_or_spawn_controller
+
 
 # I need to keep this somewhere, so here we go
 DEFAULT_TORCHSTORE_NAME: str = "TorchStore"
diff --git a/torchstore/constants.py b/torchstore/constants.py
@@ -0,0 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+
+MONARCH_HOSTMESH_V1 = os.environ.get("MONARCH_HOSTMESH_V1", "0").lower() in (
+    "1",
+    "true",
+)
diff --git a/torchstore/storage_volume.py b/torchstore/storage_volume.py
@@ -188,6 +188,7 @@ def _handle_dtensor(
     async def put(
         self, key: str, transport_buffer: TransportBuffer, request: Request
     ) -> None:
+
         if request.is_object:
             self.kv[key] = {"obj": request.objects}
             return
diff --git a/torchstore/transport/buffers.py b/torchstore/transport/buffers.py
@@ -17,7 +17,7 @@
 
     def RDMABuffer(*args: Any, **kwargs: Any) -> Any:
         raise NotImplementedError(
-            "RDMABuffer is not available. This environemnt was likely not built with tensor_engine supoprt."
+            "RDMABuffer is not available. This environemnt was likely not built with rdma support."
         )
 
 
@@ -27,12 +27,10 @@ def RDMABuffer(*args: Any, **kwargs: Any) -> Any:
     os.environ.get("TORCHSTORE_RDMA_CHUNK_SIZE_MB", str(1024 * 32))
 )
 
-# assert RDMA_CHUNK_SIZE_MB <= 1024, "Monarch does not support 1gb chunks via rdma"
-
 
 def rdma_available() -> bool:
     rdma_enabled = (
-        os.environ.get("TORCHSTORE_RDMA_ENABLED", "0") == "1"
+        os.environ.get("TORCHSTORE_RDMA_ENABLED", "1") == "1"
     )  # TODO: enable on this build
     return rdma_enabled and monarch_rdma_available()
 
@@ -111,11 +109,13 @@ def allocate(self, tensor_like: Union[torch.Tensor, Tuple]) -> None:
             return
         elif isinstance(tensor_like, Tuple):
             # we know the size of the tensor from fetching metadata
-            tensor = torch.empty(tensor_like[0], dtype=tensor_like[1])
+            tensor = torch.empty(
+                tensor_like[0], dtype=tensor_like[1], device=torch.device("cpu")
+            )
         else:
             # we have an inplace tensor, allocate a copy
             assert isinstance(tensor_like, torch.Tensor)
-            tensor = torch.empty_like(tensor_like)
+            tensor = torch.empty_like(tensor_like, device=torch.device("cpu"))
 
         # store tensor meta
         self.shape = tensor.shape
@@ -125,7 +125,10 @@ def allocate(self, tensor_like: Union[torch.Tensor, Tuple]) -> None:
         self._assert_valid_tensor(tensor)
 
         byte_view_chunks = self._create_byte_views_from_tensor(tensor)
-        self.tensor_refs = [torch.empty_like(chunk) for chunk in byte_view_chunks]
+        self.tensor_refs = [
+            torch.empty_like(chunk, device=torch.device("cpu"))
+            for chunk in byte_view_chunks
+        ]
         self.rdma_buffers = [RDMABuffer(chunk) for chunk in self.tensor_refs]
 
         chunk_sizes = set()
@@ -140,7 +143,9 @@ def update(self, other_buffer: "TransportBuffer") -> None:
     async def read_into(self, tensor: Optional[torch.Tensor] = None) -> torch.Tensor:
         if tensor is None:
             # allocate a tensor to return
-            tensor = torch.empty(self.shape, dtype=self.dtype)
+            tensor = torch.empty(
+                self.shape, dtype=self.dtype, device=torch.device("cpu")
+            )
 
         self._assert_valid_tensor(tensor)
         assert self.rdma_buffers is not None
diff --git a/torchstore/utils.py b/torchstore/utils.py
@@ -10,7 +10,12 @@
 
 import torch
 
-from monarch.actor import ProcMesh, this_host
+from torchstore.constants import MONARCH_HOSTMESH_V1
+
+if MONARCH_HOSTMESH_V1:
+    from monarch._src.actor.v1.host_mesh import this_host
+else:
+    from monarch.actor import this_host
 
 
 if TYPE_CHECKING:
@@ -29,7 +34,7 @@ async def spawn_actors(num_processes, actor_cls, name, mesh=None, **init_args):
         actors = mesh.spawn(f"{name}_{str(uuid.uuid4())[:8]}", actor_cls, **init_args)
         return actors
 
-    assert isinstance(mesh, ProcMesh)
+    assert hasattr(mesh, "spawn")
     actors = mesh.spawn(f"{name}_{str(uuid.uuid4())[:8]}", actor_cls, **init_args)
 
     return actors