
Commit eb54e7f

apaszke authored and Google-ML-Automation committed
[Mosaic GPU] Add support for async copies to peer devices
PiperOrigin-RevId: 761977946
1 parent 2302a2e commit eb54e7f

File tree

7 files changed: +200 −18 lines changed


jax/experimental/mosaic/gpu/core.py

Lines changed: 12 additions & 1 deletion
@@ -677,14 +677,25 @@ def as_gpu_kernel(
   if launch_ctx.is_device_collective and not supports_cross_device_collectives():
     raise RuntimeError("Kernel is a cross-device collective but no support is available.")
 
-  expected_arg_treedef = jax.tree.structure(in_shape)
+  expected_arg_tys, expected_arg_treedef = jax.tree.flatten(in_shape)
   def _check_args(*args):
     arg_treedef = jax.tree.structure(args)
     if arg_treedef != expected_arg_treedef:
       raise ValueError(
           f"Invalid argument structure: expected {expected_arg_treedef}, got"
           f" {arg_treedef}, ({args=})"
       )
+    for arg, expected_ty in zip(args, expected_arg_tys):
+      if arg.shape != expected_ty.shape:
+        raise ValueError(
+            f"Argument shape mismatch: expected {expected_ty.shape}, got"
+            f" {arg.shape}"
+        )
+      if arg.dtype != expected_ty.dtype:
+        raise ValueError(
+            f"Argument dtype mismatch: expected {expected_ty.dtype}, got"
+            f" {arg.dtype}"
+        )
 
   def bind(*args) -> Any:
     return mosaic_gpu_p.bind(*args, module=module, out_types=out_shape)
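For context, the check added above works because jax.tree.flatten returns both the flattened leaves (the jax.ShapeDtypeStructs passed as in_shape) and the treedef, so the wrapper can validate structure, shapes, and dtypes before binding the kernel. A minimal standalone sketch of that pattern (not the as_gpu_kernel code path itself; the example spec is made up):

import jax
import jax.numpy as jnp
import numpy as np

# Hypothetical expected input spec, standing in for the in_shape argument.
in_shape = (jax.ShapeDtypeStruct((64, 64), jnp.float32),)
expected_arg_tys, expected_arg_treedef = jax.tree.flatten(in_shape)

def check_args(*args):
  # The structure must match before shapes and dtypes are compared pairwise.
  if jax.tree.structure(args) != expected_arg_treedef:
    raise ValueError("argument structure mismatch")
  for arg, expected_ty in zip(args, expected_arg_tys):
    if arg.shape != expected_ty.shape:
      raise ValueError(f"expected shape {expected_ty.shape}, got {arg.shape}")
    if arg.dtype != expected_ty.dtype:
      raise ValueError(f"expected dtype {expected_ty.dtype}, got {arg.dtype}")

check_args(np.zeros((64, 64), np.float32))    # passes
# check_args(np.zeros((32, 64), np.float32))  # would raise: shape mismatch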

jax/experimental/mosaic/gpu/launch_context.py

Lines changed: 53 additions & 1 deletion
@@ -400,6 +400,7 @@ def _get_tma_desc(
       self,
       gmem_ref,
       gmem_transform: tuple[MemRefTransform, ...],
+      gmem_peer_id: int | ir.Value | None,
       transformed_slice_shape: tuple[int, ...],
       swizzle: int | None,
       reduction_op: Literal[
@@ -408,6 +409,7 @@ def _get_tma_desc(
   ):
     tma_desc_key = (gmem_ref, transformed_slice_shape, swizzle, gmem_transform)
     if (tma_desc := self.tma_descriptors.get(tma_desc_key, None)) is None:
+      i32 = ir.IntegerType.get_signless(32)
       i64 = ir.IntegerType.get_signless(64)
       ptr_ty = ir.Type.parse("!llvm.ptr")
       def init_tma_desc(host_ptr):
@@ -432,6 +434,24 @@ def init_tma_desc(host_ptr):
         base_ptr = llvm.getelementptr(
             ptr_ty, alloc_ptr, [as_i64(offset)], [llvm_dyn], ref_ty.element_type, llvm.GEPNoWrapFlags.none,
         )
+        if gmem_peer_id is not None:
+          if not isinstance(gmem_peer_id, ir.Value):
+            peer_id = c(gmem_peer_id, i32)
+          else:
+            try:
+              peer_id = _replicate_peer_id_computation(gmem_peer_id)
+            except ReplicationError as e:
+              raise ValueError(
+                  "Failed to reproduce the gmem_peer_id computation on the host"
+              ) from e
+          self._ensure_nvshmem_decls()
+          base_ptr = llvm.call(
+              base_ptr.type,
+              [base_ptr, peer_id],
+              [],
+              [],
+              callee="nvshmem_ptr",
+          )
         rank = ref_ty.rank
         assert rank * 2 == len(sizes_and_strides)
         swizzle_arg = (
@@ -507,6 +527,7 @@ def async_copy(
       dst_ref,
       gmem_slice: Any = (),
       gmem_transform: MemRefTransform | tuple[MemRefTransform, ...] = (),
+      gmem_peer_id: int | ir.Value | None = None,
       barrier: utils.BarrierRef | None = None,
       swizzle: int | None = None,
       arrive: bool | None = None,
@@ -750,7 +771,8 @@ def partition_dim(dim: int, idx: ir.Value, num_chunks: int):
       multicast_mask = None
 
     tma_desc = self._get_tma_desc(
-        gmem_ref, gmem_transform, tuple(slice_shape), swizzle, reduction_op,
+        gmem_ref, gmem_transform, gmem_peer_id,
+        tuple(slice_shape), swizzle, reduction_op,
     )
 
     # We constuct TMA descriptors in column-major order.
@@ -893,3 +915,33 @@ def device_id(self) -> ir.Value:
     self._ensure_nvshmem_decls()
     i32 = ir.IntegerType.get_signless(32)
     return llvm.call(i32, [], [], [], callee="nvshmem_my_pe")
+
+
+class ReplicationError(Exception):
+  pass
+
+def _replicate_peer_id_computation(peer_id: ir.Value, fuel=8) -> ir.Value:
+  if fuel == 0:
+    raise ReplicationError(
+        "gmem_peer_id computation is too complicated to recompute on the host"
+    )
+  if isinstance(peer_id, ir.BlockArgument):
+    raise ReplicationError("Can't recompute a value that's a block argument")
+  op = peer_id.owner.opview
+  # We accept all arith ops
+  if op.OPERATION_NAME.startswith("arith."):
+    new_operands = [
+        _replicate_peer_id_computation(x, fuel - 1) for x in op.operands
+    ]
+    result_types = [r.type for r in op.results]
+    new_attributes = {na.name: na.attr for na in op.attributes}
+    new_op = ir.Operation.create(
+        op.OPERATION_NAME, result_types, new_operands, new_attributes
+    )
+    return new_op.results if len(new_op.results) > 1 else new_op.result
+  if isinstance(op, llvm.CallOp) and op.callee.value == "nvshmem_my_pe":
+    i32 = ir.IntegerType.get_signless(32)
+    return llvm.call(i32, [], [], [], callee="nvshmem_my_pe")
+  raise ReplicationError(
+      f"Unrecognized op can't be recomputed on the host: {op}"
+  )
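Taken together, these changes let async_copy target a peer device's memory: when gmem_peer_id is set, the TMA descriptor is built from the address returned by nvshmem_ptr(local_ptr, peer_id), and if the peer id is an in-kernel MLIR value (e.g. derived from nvshmem_my_pe via ctx.device_id()), _replicate_peer_id_computation re-creates that computation on the host, where descriptor initialization runs. A sketch of the kernel-side usage, mirroring the new test below (shapes and scratch layout are illustrative):

from jax._src.lib.mlir import ir
from jax._src.lib.mlir.dialects import arith

def kernel(ctx, src, dst, scratch):
  tmp, barrier = scratch
  i32 = ir.IntegerType.get_signless(32)
  # On a 2-device mesh, the peer is simply the other device: 1 - device_id.
  peer = arith.subi(arith.constant(i32, 1), ctx.device_id())
  # Stage the local GMEM shard into SMEM...
  ctx.async_copy(src_ref=src, dst_ref=tmp, barrier=barrier)
  barrier.wait()
  # ...then write it into the peer device's GMEM output via gmem_peer_id.
  ctx.async_copy(src_ref=tmp, dst_ref=dst, gmem_peer_id=peer)
  ctx.await_async_copy(0)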

jaxlib/mosaic/gpu/BUILD

Lines changed: 2 additions & 0 deletions
@@ -122,6 +122,8 @@ cc_library(
     # Linker may prune these symbols if they are not explicitly exported.
     linkopts = [
         "-Wl,--export-dynamic-symbol='mosaic_gpu_*'",
+        "-Wl,--export-dynamic-symbol='nvshmem_my_pe'",
+        "-Wl,--export-dynamic-symbol='nvshmem_ptr'",
         "-Wl,--export-dynamic-symbol='nvshmemx_barrier_all_on_stream'",
         "-Wl,--export-dynamic-symbol='nvshmemx_cumodule_init'",
         "-Wl,--export-dynamic-symbol='nvshmemx_init_status'",

tests/mosaic/BUILD

Lines changed: 21 additions & 0 deletions
@@ -63,6 +63,27 @@ jax_multiplatform_test(
     ]),
 )
 
+jax_multiplatform_test(
+    name = "gpu_test_distributed",
+    srcs = ["gpu_test_distributed.py"],
+    args = [
+        "--num_processes=2",
+        "--gpus_per_process=1",
+    ],
+    enable_backends = [],
+    enable_configs = ["gpu_h100x2"],
+    env = {"XLA_FLAGS": "--xla_gpu_autotune_level=0 --xla_gpu_experimental_enable_nvshmem=true"},
+    tags = ["multiaccelerator"],
+    deps = [
+        "//jax:experimental",
+        "//jax:mosaic_gpu",
+        "//jax:test_multiprocess",
+    ] + py_deps([
+        "absl/testing",
+        "numpy",
+    ]),
+)
+
 jax_py_test(
     name = "gpu_dialect_test",
     srcs = ["gpu_dialect_test.py"],

tests/mosaic/gpu_test_distributed.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+# Copyright 2025 The JAX Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from absl.testing import parameterized
+import jax
+from jax._src import config
+from jax._src import test_util as jtu
+from jax._src import test_multiprocess as jt_multiprocess
+from jax._src.interpreters import mlir
+from jax._src.lib.mlir import ir
+from jax._src.lib.mlir.dialects import arith
+from jax.experimental.mosaic.gpu import dialect as mgpu_dialect  # pylint: disable=g-importing-member
+from jax.experimental import shard
+from jax.experimental import multihost_utils
+import jax.numpy as jnp
+import numpy as np
+try:
+  import jax._src.lib.mosaic_gpu  # noqa: F401
+  HAS_MOSAIC_GPU = True
+except ImportError:
+  HAS_MOSAIC_GPU = False
+else:
+  import jax.experimental.mosaic.gpu as mgpu
+
+
+# ruff: noqa: F405
+# pylint: disable=g-complex-comprehension
+P = jax.sharding.PartitionSpec
+
+
+class TestCase(parameterized.TestCase):
+
+  def setUp(self):
+    if not HAS_MOSAIC_GPU:
+      self.skipTest("jaxlib built without Mosaic GPU")
+    if (not jtu.test_device_matches(["cuda"]) or
+        not jtu.is_cuda_compute_capability_at_least("9.0")):
+      self.skipTest("Only works on GPU with capability >= sm90")
+    if not mgpu.supports_cross_device_collectives():
+      self.skipTest("NVSHMEM library unavailable.")
+    if jax.process_count() == 1:
+      self.skipTest("Test requires multiple processes.")
+    if jax.device_count() != jax.process_count():
+      self.skipTest("Need 1 device per process")
+    super().setUp()
+    self.prng = np.random.default_rng(1234)
+    self.context = mlir.make_ir_context()
+    if mgpu_dialect is not None:
+      mgpu_dialect.register_dialect(self.context)
+    self.enter_context(config.traceback_filtering("off"))
+    self.enter_context(self.context)
+    self.enter_context(ir.Location.unknown())
+
+
+class ProfilerTest(TestCase):
+
+  def test_remote_async_copy(self):
+    i32 = ir.IntegerType.get_signless(32)
+    def kernel(ctx, src, dst, scratch):
+      tmp, barrier = scratch
+      other_device = arith.subi(arith.constant(i32, 1), ctx.device_id())
+      ctx.async_copy(src_ref=src, dst_ref=tmp, barrier=barrier)
+      barrier.wait()
+      ctx.async_copy(src_ref=tmp, dst_ref=dst, gmem_peer_id=other_device)
+      ctx.await_async_copy(0)
+    mesh = jax.make_mesh(
+        (2,), ("x",), axis_types=(jax.sharding.AxisType.Explicit,)
+    )
+    with jax.sharding.use_mesh(mesh):
+      x_np = np.arange(64 * 64, dtype=jnp.float32).reshape(64, 64)
+      x = shard.reshard(x_np, P("x"))
+      y = jax.jit(
+          jax.shard_map(
+              lambda x: mgpu.as_gpu_kernel(
+                  kernel, (1, 1, 1), (128, 1, 1), x, x, (x, mgpu.TMABarrier())
+              )(x),
+              out_specs=P("x"),
+              check_vma=False,
+          )
+      )(x)
+      y_np = multihost_utils.process_allgather(y, tiled=True)
+      np.testing.assert_array_equal(
+          y_np, np.concatenate(np.split(x_np, 2)[::-1], axis=0)
+      )
+
+
+if __name__ == "__main__":
+  jt_multiprocess.main()
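The assertion above encodes the expected result of the exchange: with two processes, each device writes its 32-row shard into the other device's output, so the gathered array is simply the two shards swapped. A quick NumPy check of that expression, for reference:

import numpy as np

x_np = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
top, bottom = np.split(x_np, 2)              # the two 32-row shards
swapped = np.concatenate([bottom, top], axis=0)
np.testing.assert_array_equal(
    swapped, np.concatenate(np.split(x_np, 2)[::-1], axis=0)
)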

tests/pallas/BUILD

Lines changed: 1 addition & 0 deletions
@@ -864,6 +864,7 @@ jax_multiplatform_test(
         "notap",
     ],
     deps = [
+        "//jax:experimental",
         "//jax:pallas",
         "//jax:pallas_experimental_gpu_ops",
        "//jax:pallas_mosaic_gpu",

tests/pallas/mgpu_collective_matmul_test.py

Lines changed: 11 additions & 16 deletions
@@ -27,6 +27,7 @@
 from jax._src.pallas import pallas_call
 from jax.experimental.mosaic import gpu as mgpu
 from jax.experimental.pallas.ops.gpu import collective_matmul_mgpu
+from jax.experimental import shard
 import jax.numpy as jnp
 import numpy as np
 
@@ -51,8 +52,13 @@ def setUp(self):
     if os.environ.get("XLA_PYTHON_CLIENT_ALLOCATOR", "") == "platform":
       self.skipTest("NVSHMEM doesn't work with the platform allocator.")
     context_stack = contextlib.ExitStack()
-    context_stack.enter_context(pallas_call._PALLAS_USE_MOSAIC_GPU(True))
     self.addCleanup(context_stack.close)
+    context_stack.enter_context(pallas_call._PALLAS_USE_MOSAIC_GPU(True))
+    num_devices = jax.device_count()
+    mesh = jax.make_mesh(
+        (num_devices,), ("x",), axis_types=(jax.sharding.AxisType.Explicit,)
+    )
+    context_stack.enter_context(jax.sharding.use_mesh(mesh))
 
   @parameterized.product(
       m_shard=(1024, 8192),
@@ -90,28 +96,17 @@ def test_all_gather_lhs_matmul(
     k1, k2 = random.split(random.key(1234), num=2)
     lhs = random.normal(k1, (num_devices * m_shard, k), dtype)
     rhs = random.normal(k2, (k, num_devices * n_shard), dtype)
-
-    mesh = jax.sharding.Mesh(jax.devices(), ["x"])
-    lhs = jax.device_put(lhs, jax.sharding.NamedSharding(mesh, P("x", None)))
-    rhs = jax.device_put(rhs, jax.sharding.NamedSharding(mesh, P(None, "x")))
+    lhs = shard.reshard(lhs, P("x", None))
+    rhs = shard.reshard(rhs, P(None, "x"))
 
     def run(body):
       out = jax.jit(
-          jax.shard_map(
-              body,
-              mesh=mesh,
-              in_specs=(P("x", None), P(None, "x")),
-              out_specs=P(None, "x"),
-              check_vma=False,
-          )
+          jax.shard_map(body, out_specs=P(None, "x"), check_vma=False)
       )(lhs, rhs)
       # Gather output, for NumPy comparison on the host.
       out = jax.shard_map(
           lambda x: lax.all_gather(x, "x", axis=1, tiled=True),
-          mesh=mesh,
-          in_specs=P(None, "x"),
-          out_specs=P(None),
-          check_vma=False,
+          out_specs=P(None), check_vma=False,
       )(out)
       return out
 
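The updated test relies on the explicit-mesh workflow: jax.sharding.use_mesh installs the mesh as ambient context, shard.reshard places the operands according to a PartitionSpec, and jax.shard_map then picks up the mesh and input specs implicitly. A condensed sketch of that pattern, under the assumption of toy shapes and a trivial body:

import jax
import jax.numpy as jnp
from jax.experimental import shard

P = jax.sharding.PartitionSpec
mesh = jax.make_mesh(
    (jax.device_count(),), ("x",), axis_types=(jax.sharding.AxisType.Explicit,)
)
with jax.sharding.use_mesh(mesh):
  x = shard.reshard(jnp.ones((8 * jax.device_count(), 8)), P("x", None))
  # With an explicit mesh in scope, shard_map needs no mesh= or in_specs=.
  y = jax.jit(
      jax.shard_map(lambda a: a * 2, out_specs=P("x", None), check_vma=False)
  )(x)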
