Commit 5a448b8

apaszke authored and Google-ML-Automation committed
[Mosaic GPU] Add support for async copies to peer devices
PiperOrigin-RevId: 762370447
1 parent dc0cdf7 commit 5a448b8

4 files changed: +177 −1 lines changed

jax/experimental/mosaic/gpu/launch_context.py

Lines changed: 54 additions & 1 deletion
@@ -400,6 +400,7 @@ def _get_tma_desc(
       self,
       gmem_ref,
       gmem_transform: tuple[MemRefTransform, ...],
+      gmem_peer_id: int | ir.Value | None,
       transformed_slice_shape: tuple[int, ...],
       swizzle: int | None,
       reduction_op: Literal[
@@ -408,6 +409,7 @@ def _get_tma_desc(
   ):
     tma_desc_key = (gmem_ref, transformed_slice_shape, swizzle, gmem_transform)
     if (tma_desc := self.tma_descriptors.get(tma_desc_key, None)) is None:
+      i32 = ir.IntegerType.get_signless(32)
       i64 = ir.IntegerType.get_signless(64)
       ptr_ty = ir.Type.parse("!llvm.ptr")
       def init_tma_desc(host_ptr):
@@ -432,6 +434,25 @@ def init_tma_desc(host_ptr):
         base_ptr = llvm.getelementptr(
             ptr_ty, alloc_ptr, [as_i64(offset)], [llvm_dyn], ref_ty.element_type, llvm.GEPNoWrapFlags.none,
         )
+        if gmem_peer_id is not None:
+          if not isinstance(gmem_peer_id, ir.Value):
+            peer_id = c(gmem_peer_id, i32)
+          else:
+            try:
+              # We try to reproduce the gmem_peer_id computation on the host.
+              peer_id = _recompute_peer_id(gmem_peer_id)
+            except ReplicationError as e:
+              raise ValueError(
+                  "Failed to recompute the async_copy peer id on the host"
+              ) from e
+          self._ensure_nvshmem_decls()
+          base_ptr = llvm.call(
+              base_ptr.type,
+              [base_ptr, peer_id],
+              [],
+              [],
+              callee="nvshmem_ptr",
+          )
         rank = ref_ty.rank
         assert rank * 2 == len(sizes_and_strides)
         swizzle_arg = (
@@ -507,6 +528,7 @@ def async_copy(
       dst_ref,
       gmem_slice: Any = (),
       gmem_transform: MemRefTransform | tuple[MemRefTransform, ...] = (),
+      gmem_peer_id: int | ir.Value | None = None,
       barrier: utils.BarrierRef | None = None,
       swizzle: int | None = None,
       arrive: bool | None = None,
@@ -750,7 +772,8 @@ def partition_dim(dim: int, idx: ir.Value, num_chunks: int):
       multicast_mask = None

     tma_desc = self._get_tma_desc(
-        gmem_ref, gmem_transform, tuple(slice_shape), swizzle, reduction_op,
+        gmem_ref, gmem_transform, gmem_peer_id,
+        tuple(slice_shape), swizzle, reduction_op,
     )

     # We constuct TMA descriptors in column-major order.
@@ -893,3 +916,33 @@ def device_id(self) -> ir.Value:
     self._ensure_nvshmem_decls()
     i32 = ir.IntegerType.get_signless(32)
     return llvm.call(i32, [], [], [], callee="nvshmem_my_pe")
+
+
+class ReplicationError(Exception):
+  pass
+
+def _recompute_peer_id(peer_id: ir.Value, fuel=8) -> ir.Value:
+  if fuel == 0:
+    raise ReplicationError(
+        "gmem_peer_id computation is too complicated to recompute on the host"
+    )
+  if isinstance(peer_id, ir.BlockArgument):
+    raise ReplicationError("Can't recompute a value that's a block argument")
+  op = peer_id.owner.opview
+  # We accept all arith ops
+  if op.OPERATION_NAME.startswith("arith."):
+    new_operands = [_recompute_peer_id(x, fuel - 1) for x in op.operands]
+    result_types = [r.type for r in op.results]
+    new_attributes = {na.name: na.attr for na in op.attributes}
+    new_op = ir.Operation.create(
+        op.OPERATION_NAME, result_types, new_operands, new_attributes
+    )
+    return new_op.results if len(new_op.results) > 1 else new_op.result
+  # nvshmem_my_pe queries the device id of the current process and works on both
+  # the host and the device.
+  if isinstance(op, llvm.CallOp) and op.callee.value == "nvshmem_my_pe":
+    i32 = ir.IntegerType.get_signless(32)
+    return llvm.call(i32, [], [], [], callee="nvshmem_my_pe")
+  raise ReplicationError(
+      f"Unrecognized op can't be recomputed on the host: {op}"
+  )
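Usage note (not part of the diff): the new gmem_peer_id argument to async_copy accepts either a Python int or an ir.Value holding the peer's device id. When it is an ir.Value, _get_tma_desc replays the computation on the host via _recompute_peer_id (only arith ops and nvshmem_my_pe calls are supported) and then rewrites the TMA base pointer through nvshmem_ptr so the descriptor targets the peer's copy of the buffer. A minimal kernel-body sketch, mirroring the test added in this commit and assuming exactly two devices:

from jax._src.lib.mlir import ir
from jax._src.lib.mlir.dialects import arith

def kernel(ctx, src, dst, scratch):
  tmp, barrier = scratch
  i32 = ir.IntegerType.get_signless(32)
  # With two devices, the peer is simply 1 - my_device_id (an assumption of
  # this sketch, not something the API requires).
  peer = arith.subi(arith.constant(i32, 1), ctx.device_id())
  # Stage the local GMEM data into SMEM first.
  ctx.async_copy(src_ref=src, dst_ref=tmp, barrier=barrier)
  barrier.wait()
  # Push the staged tile into the peer device's `dst` buffer; since `peer` is
  # an ir.Value built only from arith ops and ctx.device_id(), the host can
  # recompute it before constructing the TMA descriptor.
  ctx.async_copy(src_ref=tmp, dst_ref=dst, gmem_peer_id=peer)
  ctx.await_async_copy(0)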

jaxlib/mosaic/gpu/BUILD

Lines changed: 2 additions & 0 deletions
@@ -122,6 +122,8 @@ cc_library(
     # Linker may prune these symbols if they are not explicitly exported.
     linkopts = [
         "-Wl,--export-dynamic-symbol='mosaic_gpu_*'",
+        "-Wl,--export-dynamic-symbol='nvshmem_my_pe'",
+        "-Wl,--export-dynamic-symbol='nvshmem_ptr'",
         "-Wl,--export-dynamic-symbol='nvshmemx_barrier_all_on_stream'",
         "-Wl,--export-dynamic-symbol='nvshmemx_cumodule_init'",
         "-Wl,--export-dynamic-symbol='nvshmemx_init_status'",

tests/mosaic/BUILD

Lines changed: 21 additions & 0 deletions
@@ -63,6 +63,27 @@ jax_multiplatform_test(
     ]),
 )

+jax_multiplatform_test(
+    name = "gpu_test_distributed",
+    srcs = ["gpu_test_distributed.py"],
+    args = [
+        "--num_processes=2",
+        "--gpus_per_process=1",
+    ],
+    enable_backends = [],
+    enable_configs = ["gpu_h100x2"],
+    env = {"XLA_FLAGS": "--xla_gpu_autotune_level=0 --xla_gpu_experimental_enable_nvshmem=true"},
+    tags = ["multiaccelerator"],
+    deps = [
+        "//jax:experimental",
+        "//jax:mosaic_gpu",
+        "//jax:test_multiprocess",
+    ] + py_deps([
+        "absl/testing",
+        "numpy",
+    ]),
+)
+
 jax_py_test(
     name = "gpu_dialect_test",
     srcs = ["gpu_dialect_test.py"],

tests/mosaic/gpu_test_distributed.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+# Copyright 2025 The JAX Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from absl.testing import parameterized
+import jax
+from jax._src import config
+from jax._src import test_util as jtu
+from jax._src import test_multiprocess as jt_multiprocess
+from jax._src.interpreters import mlir
+from jax._src.lib.mlir import ir
+from jax._src.lib.mlir.dialects import arith
+from jax.experimental.mosaic.gpu import dialect as mgpu_dialect  # pylint: disable=g-importing-member
+from jax.experimental import shard
+from jax.experimental import multihost_utils
+import jax.numpy as jnp
+import numpy as np
+try:
+  import jax._src.lib.mosaic_gpu  # noqa: F401
+  HAS_MOSAIC_GPU = True
+except ImportError:
+  HAS_MOSAIC_GPU = False
+else:
+  import jax.experimental.mosaic.gpu as mgpu
+
+
+# ruff: noqa: F405
+# pylint: disable=g-complex-comprehension
+P = jax.sharding.PartitionSpec
+
+
+class TestCase(parameterized.TestCase):
+
+  def setUp(self):
+    if not HAS_MOSAIC_GPU:
+      self.skipTest("jaxlib built without Mosaic GPU")
+    if (not jtu.test_device_matches(["cuda"]) or
+        not jtu.is_cuda_compute_capability_at_least("9.0")):
+      self.skipTest("Only works on GPU with capability >= sm90")
+    if not mgpu.supports_cross_device_collectives():
+      self.skipTest("NVSHMEM library unavailable.")
+    if jax.process_count() == 1:
+      self.skipTest("Test requires multiple processes.")
+    if jax.device_count() != jax.process_count():
+      self.skipTest("Need 1 device per process")
+    super().setUp()
+    self.prng = np.random.default_rng(1234)
+    self.context = mlir.make_ir_context()
+    if mgpu_dialect is not None:
+      mgpu_dialect.register_dialect(self.context)
+    self.enter_context(config.traceback_filtering("off"))
+    self.enter_context(self.context)
+    self.enter_context(ir.Location.unknown())
+
+
+class ProfilerTest(TestCase):
+
+  def test_remote_async_copy(self):
+    i32 = ir.IntegerType.get_signless(32)
+    def kernel(ctx, src, dst, scratch):
+      tmp, barrier = scratch
+      other_device = arith.subi(arith.constant(i32, 1), ctx.device_id())
+      ctx.async_copy(src_ref=src, dst_ref=tmp, barrier=barrier)
+      barrier.wait()
+      ctx.async_copy(src_ref=tmp, dst_ref=dst, gmem_peer_id=other_device)
+      ctx.await_async_copy(0)
+    mesh = jax.make_mesh(
+        (2,), ("x",), axis_types=(jax.sharding.AxisType.Explicit,)
+    )
+    with jax.sharding.use_mesh(mesh):
+      x_np = np.arange(64 * 64, dtype=jnp.float32).reshape(64, 64)
+      x = shard.reshard(x_np, P("x"))
+      y = jax.jit(
+          jax.shard_map(
+              lambda x: mgpu.as_gpu_kernel(
+                  kernel, (1, 1, 1), (128, 1, 1), x, x, (x, mgpu.TMABarrier())
+              )(x),
+              out_specs=P("x"),
+              check_vma=False,
+          )
+      )(x)
+      y_np = multihost_utils.process_allgather(y, tiled=True)
+      np.testing.assert_array_equal(
+          y_np, np.concatenate(np.split(x_np, 2)[::-1], axis=0)
+      )
+
+
+if __name__ == "__main__":
+  jt_multiprocess.main()
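The final assertion encodes the expected data movement: each of the two processes holds a 32x64 shard of x along axis 0, the kernel writes its shard into the peer's output buffer, and the gathered result is therefore the input with its two row-halves swapped. A standalone numpy restatement of that expectation (illustration only, not part of the commit):

import numpy as np

x_np = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
top, bottom = np.split(x_np, 2)                  # shard 0 and shard 1 along rows
swapped = np.concatenate([bottom, top], axis=0)  # each shard lands on the peer
# Same expression the test uses for the expected value:
assert np.array_equal(swapped, np.concatenate(np.split(x_np, 2)[::-1], axis=0))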
