@@ -49,6 +49,7 @@
 import jax.numpy as jnp
 
 
+WARP_SIZE = 32
 WARPGROUP_SIZE = 128
 
 
@@ -464,7 +465,7 @@ def _copy_gmem_to_smem_lowering(
     dst_transforms_treedef,
     barrier_transforms_treedef,
     collective_axes,
-    warpgroup_sync: bool = True,
+    for_warpgroup: bool = True,
 ):
   flat_src_transforms, flat_dst_transforms, flat_barrier_transforms = (
       util.split_list(
@@ -505,15 +506,23 @@ def _copy_gmem_to_smem_lowering(
   if ctx.module_ctx.lowering_semantics == mgpu.LoweringSemantics.Lane:
     if bytes % WARPGROUP_SIZE:
       raise NotImplementedError("Only aligned copies are supported")
-    # We arrive uniformly from each thread in the WG, so we need to divide the
-    # number of bytes by the number of threads in the WG.
-    # TODO: apaszke - Relax this. We can just select the WG leader and have it
-    # arrive with the whole transfer size, while everyone else arrives with 0.
-    # But we should continue using this scheme as it's likely to be faster.
-    bytes //= WARPGROUP_SIZE
-    if warpgroup_sync:
+    if for_warpgroup:
+      # We arrive uniformly from each thread in the WG, so we need to divide the
+      # number of bytes by the number of threads in the WG.
+      # TODO: apaszke - Relax this. We can just select the WG leader and have it
+      # arrive with the whole transfer size, while everyone else arrives with 0.
+      # But we should continue using this scheme as it's likely to be faster.
+      bytes //= WARPGROUP_SIZE
       mgpu.warpgroup_barrier()  # Make sure all reads have completed.
-    barrier.arrive_expect_tx(bytes)
+      barrier.arrive_expect_tx(bytes)
+    else:
+      # In Warp-level lowering, we arrive on each CUDA thread in a warp, but
+      # the barrier still expects a full 128 arrivals so we arrive 4 times
+      # on each CUDA thread instead.
+      bytes //= WARP_SIZE
+      barrier.arrive(arrival_count=3, can_complete=False)
+      barrier.arrive_expect_tx(bytes)
+
   ctx.launch_ctx.async_copy(
       src_ref=src,
       dst_ref=dst,
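For context, a minimal sketch (not part of the change) of the arrival and byte accounting assumed by the two branches above: a warpgroup is 4 warps of 32 CUDA threads, so the warp-level path arrives 4 times per thread (3 plain arrivals plus the `arrive_expect_tx` arrival) and divides the bytes by 32 instead of 128. The `transfer_bytes` value below is made up for illustration.

```python
# Sketch only: checks the arrival/byte accounting used by the lowering above.
WARP_SIZE = 32
WARPGROUP_SIZE = 128
transfer_bytes = 4096  # hypothetical transfer size; must be a multiple of 128

# Warpgroup-level lowering: each of the 128 threads arrives once via
# arrive_expect_tx, each expecting an equal share of the bytes.
assert WARPGROUP_SIZE * (transfer_bytes // WARPGROUP_SIZE) == transfer_bytes

# Warp-level lowering: only 32 threads arrive, so each thread contributes
# 3 plain arrivals plus the arrive_expect_tx arrival (4 total) to still reach
# 128 arrivals, and expects a 1/32 share of the bytes.
arrivals_per_thread = 3 + 1
assert WARP_SIZE * arrivals_per_thread == WARPGROUP_SIZE
assert WARP_SIZE * (transfer_bytes // WARP_SIZE) == transfer_bytes
```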
@@ -549,7 +558,7 @@ def _copy_gmem_to_smem_lowering(
     copy_gmem_to_smem_p,
     mgpu.LoweringSemantics.Lane,
     primitive_semantics=gpu_core.PrimitiveSemantics.Warp,
-)(functools.partial(_copy_gmem_to_smem_lowering, warpgroup_sync=False))
+)(functools.partial(_copy_gmem_to_smem_lowering, for_warpgroup=False))
 
 
 def copy_gmem_to_smem(
@@ -713,6 +722,8 @@ def _barrier_wait_pp_eqn(
 
 
 @lowering.register_lowering_rule(barrier_wait_p, mgpu.LoweringSemantics.Lane)
+@lowering.register_lowering_rule(barrier_wait_p, mgpu.LoweringSemantics.Lane,
+                                 gpu_core.PrimitiveSemantics.Warp)
 @lowering.register_lowering_rule(barrier_wait_p, mgpu.LoweringSemantics.Warpgroup)
 def _barrier_wait_lowering(
     ctx: lowering.LoweringRuleContext,