Commit 7014bde

apaszke authored and Google-ML-Automation committed

[Pallas:MGPU] Add a first prototype of an all_gather collective matmul kernel

It's not very optimized at the moment and is unlikely to outperform the baseline of a raw all_gather followed by a matmul, but it computes the right numbers. We are already aware of a few places that could be optimized and we'll start rolling those optimizations out soon.

PiperOrigin-RevId: 761939624

1 parent 2232201 commit 7014bde
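
For reference, the unfused baseline mentioned above is an all_gather of the row-sharded lhs followed by a local matmul on each device. A minimal sketch, assuming a one-dimensional mesh with axis name "x" (mirroring the test added in this commit; the function name is illustrative):

    import functools

    import jax
    from jax import lax
    from jax.sharding import PartitionSpec as P

    mesh = jax.sharding.Mesh(jax.devices(), ["x"])

    @jax.jit
    @functools.partial(
        jax.shard_map,
        mesh=mesh,
        in_specs=(P("x", None), P(None, "x")),  # lhs sharded over rows, rhs over columns
        out_specs=P(None, "x"),
        check_vma=False,
    )
    def all_gather_then_matmul(lhs_shard, rhs_shard):
      # Gather the full lhs on every device, then multiply by the local rhs shard.
      lhs_full = lax.all_gather(lhs_shard, "x", axis=0, tiled=True)
      return lhs_full @ rhs_shard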

File tree: 4 files changed, +342 −1 lines

  jax/_src/pallas/core.py
  jax/experimental/pallas/ops/gpu/collective_matmul_mgpu.py
  tests/pallas/BUILD
  tests/pallas/mgpu_collective_matmul_test.py

jax/_src/pallas/core.py

Lines changed: 1 addition & 1 deletion
@@ -971,7 +971,7 @@ def _convert_block_spec_to_block_mapping(
 class ScratchShape(Protocol):
   def get_array_aval(self) -> jax_core.AbstractValue:
     ...
-  def get_ref_aval(self) -> state.AbstractRef:
+  def get_ref_aval(self) -> state.AbstractRef | TransformedRef:
     ...

jax/experimental/pallas/ops/gpu/collective_matmul_mgpu.py (new file)

Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
# Copyright 2025 The JAX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A collective matmul kernel implemented using Mosaic GPU."""

import functools
import jax
from jax import lax
from jax.experimental import pallas as pl
from jax.experimental.pallas import mosaic_gpu as plgpu
import jax.numpy as jnp


def _find_swizzle(dim_size_bits: int, what: str):
  for swizzle_bytes in (128, 64, 32, 16):
    if dim_size_bits % (swizzle_bytes * 8) == 0:
      return swizzle_bytes
  raise ValueError(
      f"No valid out swizzle for {what}: its minor dimension has"
      f" {dim_size_bits} bits, which is not a multiple of 128"
  )


# TODO(apaszke): Add grid tiling
def all_gather_lhs_matmul(
    lhs: jax.Array,
    rhs: jax.Array,
    axis_name,
    *,
    block_m: int,
    block_n: int,
    block_k: int,
    max_concurrent_steps: int,
) -> jax.Array:
  if (num_devices := jax.device_count()) != jax.process_count():
    raise ValueError("The kernel only supports one device per process")
  if (axis_size := lax.axis_size(axis_name)) != num_devices:
    raise ValueError("The kernel can only work over all devices in a Mesh.")
  if max_concurrent_steps < 2:
    raise ValueError("max_concurrent_steps must be >= 2")

  num_sms = 132  # There are 132 SMs on an H100 SXM GPU.

  m_shard, k = lhs.shape
  k2, n_shard = rhs.shape
  if k != k2:
    raise ValueError(
        f"lhs and rhs must have the same contraction size, got {k} and {k2}."
    )
  if (element_type := lhs.dtype) != rhs.dtype:
    raise ValueError(
        f"lhs and rhs must have the same element type, got {element_type} and"
        f" {rhs.dtype}."
    )
  if k % block_k != 0:
    raise NotImplementedError(f"k={k} must be a multiple of block_k={block_k}")
  if m_shard % block_m != 0:
    raise NotImplementedError(f"m_shard={m_shard} must be a multiple of block_m={block_m}")
  if n_shard % block_n != 0:
    raise NotImplementedError(f"n_shard={n_shard} must be a multiple of block_n={block_n}")
  if n_shard != block_n:
    raise NotImplementedError(
        f"n_shard={n_shard} must be equal to block_n={block_n}"
    )

  swizzle = min(
      _find_swizzle(block_k * jnp.finfo(element_type).bits, "lhs"),
      _find_swizzle(block_n * jnp.finfo(element_type).bits, "rhs"),
  )
  transforms = (
      plgpu.TilingTransform((8, swizzle // jnp.dtype(element_type).itemsize)),
      plgpu.SwizzleTransform(swizzle),
  )

  def kernel_body(lhs_ref, rhs_ref, out_ref, scratch_ref, capacity_sem, received_sem):
    sm_id = lax.axis_index('sm')
    scratch_ref = scratch_ref.at[sm_id]

    dev_id = lax.axis_index(axis_name)
    send_dev_id = lax.rem(dev_id + axis_size - 1, axis_size)
    recv_dev_id = lax.rem(dev_id + 1, axis_size)
    # NOTE: Technically we should signal the recv_dev_id (and our signal would
    # be received from send_dev_id), but if everyone signals in a ring after a
    # barrier then it's equivalent to a local signal.
    pl.semaphore_signal(capacity_sem)
    send_scratch_ref = plgpu.remote_ref(
        scratch_ref, send_dev_id, device_id_type=pl.DeviceIdType.LOGICAL
    )

    def m_loop(mi, _):
      mi = mi * lax.axis_size('sm') + sm_id
      m_tile_slice = pl.ds(mi * block_m, block_m)

      # For some reason ptxas spills if we unroll the loop over k
      copy_block = 32
      def k_copy_loop(ki, _):
        k_slice = pl.ds(ki * copy_block, copy_block)
        scratch_ref[0, :, k_slice] = lhs_ref[m_tile_slice, k_slice]
      jax.lax.fori_loop(0, k // copy_block, k_copy_loop, None)

      def device_loop(device_offset, _):
        # Loop invariant: scratch_ref.at[scratch_slot] is ready to be used.
        # We're double buffering the scratch space. At each step, we read from
        # scratch_ref.at[scratch_slot] and write to scratch_ref.at[next_scratch_slot]
        # located on the send_dev_id. We swap the slots after completing a step,
        # which lets us overlap the copy with compute.
        scratch_slot = lax.rem(device_offset, 2)
        next_scratch_slot = 1 - scratch_slot

        @functools.partial(
            pl.run_scoped,
            acc_ref=plgpu.ACC((block_m, block_n)),
            out_smem=plgpu.SMEM((block_m, block_n), jnp.float16, transforms=transforms),
        )
        def _(acc_ref, out_smem):
          pl.semaphore_wait(capacity_sem)
          @functools.partial(
              plgpu.emit_pipeline,
              grid=(k // block_k,),
              in_specs=[
                  plgpu.BlockSpec((block_m, block_k), lambda k: (0, k), transforms=transforms),
                  plgpu.BlockSpec((block_k, block_n), lambda k: (k, 0), transforms=transforms),
              ],
              max_concurrent_steps=max_concurrent_steps,
              delay_release=1,
          )
          def k_loop(idxs, lhs_smem, rhs_smem):
            (ki,) = idxs
            plgpu.wgmma(acc_ref, lhs_smem, rhs_smem)
            k_slice = pl.ds(ki * block_k, block_k)
            # TODO(apaszke): No need to send on the last step
            # TODO(apaszke): Use an async copy. This is uncoalesced.
            send_scratch_ref[next_scratch_slot, :, k_slice] = lhs_smem[...]
          k_loop(scratch_ref.at[scratch_slot], rhs_ref)
          # TODO(apaszke): Both of those semaphores perform a .sys release.
          # This is very expensive and we should only do a single .sys fence.
          pl.semaphore_signal(capacity_sem, device_id=recv_dev_id, device_id_type=pl.DeviceIdType.LOGICAL)
          pl.semaphore_signal(received_sem, device_id=send_dev_id, device_id_type=pl.DeviceIdType.LOGICAL)
          # Make sure all TMAs have read SMEM before we overwrite it.
          plgpu.wait_smem_to_gmem(0, wait_read_only=True)
          out_smem[...] = acc_ref[...].astype(out_smem.dtype)
          plgpu.commit_smem()
          device_m_slice = pl.ds(
              lax.rem(device_offset + dev_id, num_devices) * m_shard, block_m
          )
          plgpu.copy_smem_to_gmem(
              out_smem, out_ref.at[device_m_slice].at[m_tile_slice]
          )
        # Wait for the next scratch to arrive --- see the loop invariant.
        pl.semaphore_wait(received_sem)
      jax.lax.fori_loop(0, num_devices, device_loop, None)
    grid_size = m_shard // block_m
    m_steps = grid_size // num_sms + jnp.int32(sm_id < grid_size % num_sms)
    # TODO(apaszke): Use the ND-loop helper.
    jax.lax.fori_loop(0, m_steps, m_loop, None)

  result, _ = plgpu.kernel(
      kernel_body,
      out_shape=[jax.ShapeDtypeStruct((axis_size * m_shard, n_shard), jnp.float16),
                 jax.ShapeDtypeStruct((num_sms, 2, block_m, k), jnp.float16)],
      scratch_shapes=[
          plgpu.SemaphoreType.REGULAR, plgpu.SemaphoreType.REGULAR,
      ],
      grid=(num_sms,),
      grid_names=('sm',),
  )(lhs, rhs)
  return result
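
As a side note on the schedule above: device_loop walks a ring in which each device starts from its own lhs shard and, at step device_offset, computes with the shard originally owned by device (dev_id + device_offset) % num_devices while forwarding it to its send_dev_id neighbor. A host-side sketch (an illustration assuming 4 devices, not part of the kernel) that checks this indexing against the output offset used above:

    # Simulates the ring schedule of device_loop on the host; nothing here
    # touches a GPU. The assert mirrors lax.rem(device_offset + dev_id, num_devices).
    num_devices = 4
    for dev_id in range(num_devices):
      held_shard = dev_id  # step 0: each device has copied its own lhs shard into scratch
      for device_offset in range(num_devices):
        out_block = (device_offset + dev_id) % num_devices  # row block written this step
        assert out_block == held_shard
        # Send the current shard to dev_id - 1 and receive the next one from dev_id + 1.
        held_shard = (held_shard + 1) % num_devices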

tests/pallas/BUILD

Lines changed: 29 additions & 0 deletions
@@ -842,6 +842,35 @@ jax_multiplatform_test(
     ]),
 )
 
+jax_multiplatform_test(
+    name = "mgpu_collective_matmul_test",
+    srcs = ["mgpu_collective_matmul_test.py"],
+    args = [
+        "--num_processes=2",
+        "--gpus_per_process=1",
+    ],
+    enable_backends = [],
+    enable_configs = [
+        "gpu_h100x2",
+    ],
+    env = {
+        "XLA_FLAGS": "--xla_gpu_experimental_enable_nvshmem=true",
+        "JAX_PALLAS_USE_MOSAIC_GPU": "1",
+    },
+    shard_count = 4,
+    tags = [
+        "manual",
+        "multiaccelerator",
+        "notap",
+    ],
+    deps = [
+        "//jax:pallas",
+        "//jax:pallas_experimental_gpu_ops",
+        "//jax:pallas_mosaic_gpu",
+        "//jax:test_multiprocess",
+    ] + py_deps("absl/testing") + py_deps("numpy"),
+)
+
 jax_multiplatform_test(
     name = "fuser_block_spec_test",
     srcs = [

tests/pallas/mgpu_collective_matmul_test.py (new file)

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
# Copyright 2025 The JAX Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test different parameterizations of our Mosaic GPU collective matmul."""

import contextlib
import functools
import os

from absl.testing import parameterized  # pylint: disable=g-multiple-import
import jax
from jax import lax
from jax import random
from jax._src import test_multiprocess as jt_multiprocess
from jax._src import test_util as jtu
from jax._src.pallas import pallas_call
from jax.experimental.mosaic import gpu as mgpu
from jax.experimental.pallas.ops.gpu import collective_matmul_mgpu
import jax.numpy as jnp
import numpy as np


P = jax.sharding.PartitionSpec


@jtu.with_config(jax_traceback_filtering="off")
class CollectiveMatmulTestCase(jtu.JaxTestCase):

  def setUp(self):
    super().setUp()
    if collective_matmul_mgpu is None:
      self.skipTest("Mosaic GPU not available.")
    if (not jtu.test_device_matches(["cuda"]) or
        not jtu.is_cuda_compute_capability_equal("9.0")):
      self.skipTest("Only works on GPU with capability sm90a")
    if not mgpu.supports_cross_device_collectives():
      self.skipTest("NVSHMEM library unavailable.")
    if jax.process_count() == 1:
      self.skipTest("Test requires multiple processes.")
    context_stack = contextlib.ExitStack()
    context_stack.enter_context(pallas_call._PALLAS_USE_MOSAIC_GPU(True))
    self.addCleanup(context_stack.close)

  @parameterized.product(
      m_shard=(1024, 8192),
      n_shard=(64, 128, 192),
      k=(256, 8192),
      block_m=(64, 128, 192),
      block_n=(64, 128, 192),
      block_k=(64, 128),
      max_concurrent_steps=(2, 4),
  )
  def test_all_gather_lhs_matmul(
      self,
      m_shard,
      n_shard,
      k,
      block_m,
      block_n,
      block_k,
      max_concurrent_steps,
  ):
    num_devices = jax.device_count()
    dtype = jnp.float16
    lhs_smem_size = block_m * block_k * max_concurrent_steps * 2
    rhs_smem_size = block_k * block_n * max_concurrent_steps * 2
    # H100 SMEM limit is 228kB.
    if lhs_smem_size + rhs_smem_size > 228_000:
      self.skipTest("This configuration requires too much SMEM.")
    if n_shard != block_n:
      self.skipTest("n_shard must be equal to block_n for now.")
    if n_shard % block_n:
      self.skipTest("n_shard must be divisible by block_n for now.")
    if m_shard % block_m:
      self.skipTest("m_shard must be divisible by block_m for now.")

    k1, k2 = random.split(random.key(1234), num=2)
    lhs = random.normal(k1, (num_devices * m_shard, k), dtype)
    rhs = random.normal(k2, (k, num_devices * n_shard), dtype)

    mesh = jax.sharding.Mesh(jax.devices(), ["x"])
    lhs = jax.device_put(lhs, jax.sharding.NamedSharding(mesh, P("x", None)))
    rhs = jax.device_put(rhs, jax.sharding.NamedSharding(mesh, P(None, "x")))

    def run(body):
      out = jax.jit(
          jax.shard_map(
              body,
              mesh=mesh,
              in_specs=(P("x", None), P(None, "x")),
              out_specs=P(None, "x"),
              check_vma=False,
          )
      )(lhs, rhs)
      # Gather output, for NumPy comparison on the host.
      out = jax.shard_map(
          lambda x: lax.all_gather(x, "x", axis=1, tiled=True),
          mesh=mesh,
          in_specs=P(None, "x"),
          out_specs=P(None),
          check_vma=False,
      )(out)
      return out

    out = run(
        functools.partial(
            collective_matmul_mgpu.all_gather_lhs_matmul,
            axis_name="x",
            block_m=block_m,
            block_n=block_n,
            block_k=block_k,
            max_concurrent_steps=max_concurrent_steps,
        )
    )
    ref_out = run(lambda x, y: lax.all_gather(x, "x", axis=0, tiled=True) @ y)
    np.testing.assert_allclose(out, ref_out)


if __name__ == "__main__":
  os.environ["XLA_FLAGS"] = (
      os.environ.get("XLA_FLAGS", "") + " --xla_gpu_autotune_level=0"
  )
  jt_multiprocess.main()
