
Commit a827a27

justinjfu authored and Google-ML-Automation committed

[Mosaic GPU] Add support for loops, debug_print, and unary ops to Warp semantics.

PiperOrigin-RevId: 762036132

1 parent 210b5fc commit a827a27
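
Taken together, the changes below let code mapped over a plgpu.WarpMesh (via pl.core_map) use scalar unary ops, lax.fori_loop, and pl.debug_print. A minimal end-to-end sketch, closely mirroring the new tests in mosaic_gpu_test.py below (the two-placeholder debug_print format is an assumption; the tests themselves print a single value):

import functools

import jax
import jax.numpy as jnp
from jax import lax
from jax.experimental import pallas as pl
from jax.experimental.pallas import mosaic_gpu as plgpu

warp_mesh = plgpu.WarpMesh(axis_name="warp")

@functools.partial(plgpu.kernel, out_shape=jnp.zeros(128, jnp.int32))
def kernel(ref):
  ref[...] = ref[...]  # Prevent the kernel from being DCE'd.
  @pl.core_map(warp_mesh)
  def _():
    warp_id = lax.axis_index("warp")
    def body(i, _):
      # Scalar unary negation (lax.neg_p) and debug_print now lower per-warp.
      pl.debug_print("warp {} step {}", -warp_id, i)
    lax.fori_loop(0, 2, body, None)  # fori_loop traces to scan_p.

jax.block_until_ready(kernel())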

3 files changed: +139 −60 lines changed


jax/_src/pallas/mosaic_gpu/lowering.py

Lines changed: 21 additions & 0 deletions
@@ -1698,6 +1698,19 @@ def convert(ty, x):
     lax.not_p: lambda ctx, x: ~x,
 })
 
+def _unary_warp_lowering_rule(impl):
+  def _lowering_rule(ctx: LoweringRuleContext, x):
+    if not all(aval_in.shape == () for aval_in in ctx.avals_in):
+      raise NotImplementedError(
+          "Non-scalar arithmetic is not supported in warp-level lowering.")
+    return impl(x)
+  return _lowering_rule
+
+mosaic_lowering_rules[gpu_core.LANExWARP_SEMANTICS].update({
+    lax.neg_p: _unary_warp_lowering_rule(lambda x: -x),
+    lax.not_p: _unary_warp_lowering_rule(lambda x: ~x)
+})
+
 mosaic_lowering_rules[gpu_core.WGxWG_SEMANTICS].update({
     lax.neg_p: _lower_fun(lambda x: jnp.subtract(0, x), multiple_results=False),
     lax.not_p: _lower_fun(
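
Note the rank-0 guard: under warp-level semantics these rules accept scalars only, so negating a scalar lowers while negating an array raises the NotImplementedError above. A hedged fragment of warp-mapped user code (reusing warp_mesh from the sketch near the top):

@pl.core_map(warp_mesh)
def _():
  warp_id = lax.axis_index("warp")
  neg_id = -warp_id  # OK: rank-0, dispatched via _unary_warp_lowering_rule.
  # -jnp.ones((128,), jnp.float32)  # Would raise: non-scalar arithmetic.
  pl.debug_print("{}", neg_id)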
@@ -2163,6 +2176,8 @@ def _axis_index_warp_rule(ctx: LoweringRuleContext, *, axis_name: Hashable):
 
 
 @register_lowering_rule(primitives.debug_print_p, mgpu.LoweringSemantics.Lane)
+@register_lowering_rule(primitives.debug_print_p, mgpu.LoweringSemantics.Lane,
+                        gpu_core.PrimitiveSemantics.Warp)
 def _debug_print_lowering_rule(
     ctx: LoweringRuleContext,
     *args,
@@ -2171,13 +2186,17 @@
 ):
   del has_placeholders  # Unused.
   primitives.check_debug_print_format(fmt, *args)
+  scope = mgpu.ThreadSubset.WARPGROUP
+  if ctx.module_ctx.primitive_semantics == gpu_core.PrimitiveSemantics.Warp:
+    scope = mgpu.ThreadSubset.WARP
   if not any(aval.shape for aval in ctx.avals_in):
     mgpu.debug_print(
         fmt,
         *(
             _ensure_ir_value(arg, aval.dtype)
             for arg, aval in zip(args, ctx.avals_in)
         ),
+        scope=scope
     )
   elif len(ctx.avals_in) == 1:
     [arg] = args
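
When lowering under Warp primitive semantics, this rule narrows the print scope from one lane per warpgroup to one lane per warp; since a warpgroup is 128 threads (four warps), a scalar print inside warp-mapped code emits four lines, which is exactly what test_debug_print below asserts. A sketch of the observable effect:

# Inside warp-mapped code (see test_debug_print below):
pl.debug_print("warp: {}", lax.axis_index("warp"))
# -> prints "warp: 0" ... "warp: 3", one line per warp in the warpgroup.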
@@ -2461,6 +2480,8 @@ def loop(loop_index, body_args):
 
 @register_lowering_rule(lax.scan_p, mgpu.LoweringSemantics.Lane)
 @register_lowering_rule(lax.scan_p, mgpu.LoweringSemantics.Warpgroup)
+@register_lowering_rule(lax.scan_p, mgpu.LoweringSemantics.Lane,
+                        gpu_core.PrimitiveSemantics.Warp)
 def _scan_lowering_rule(
     ctx: LoweringRuleContext,
     *args,
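
lax.fori_loop and lax.scan both trace to scan_p, so registering the existing Lane rule under Warp primitive semantics is what makes loops inside warp-mapped code lower. The copy loop from test_single_warp_scan below is the canonical shape:

def loop_body(i, _):
  _slice = pl.ds(i, 1)
  plgpu.copy_smem_to_gmem(smem_ref.at[_slice], y_ref.at[_slice])
lax.fori_loop(0, 10, loop_body, None)  # Lowered through the scan_p rule above.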

jax/experimental/mosaic/gpu/utils.py

Lines changed: 6 additions & 2 deletions
@@ -144,7 +144,11 @@ def _debug_scalar_ty_format(arg):
     return "%f", arg
   raise NotImplementedError(f"Can't print the type {arg.type}")
 
-def debug_print(fmt, *args, uniform=True):
+def debug_print(fmt, *args, uniform=True, scope=None):
+  if not uniform and scope is not None:
+    raise ValueError("Cannot specify scope to a non-uniform debug_print.")
+  if scope is None:
+    scope = ThreadSubset.WARPGROUP
   type_formats = []
   new_args = []
   for arg in args:
@@ -168,7 +172,7 @@ def debug_print(fmt, *args, uniform=True):
       raise NotImplementedError(arg.type)
     type_formats.append(ty_format)
   ctx = (
-      functools.partial(single_thread, scope=ThreadSubset.WARPGROUP)
+      functools.partial(single_thread, scope=scope)
       if uniform
       else contextlib.nullcontext
   )
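
For callers using the Mosaic GPU utilities directly, the new keyword narrows a uniform print to one lane of a smaller thread subset, and combining it with uniform=False is rejected. A hypothetical use, assuming idx is a scalar ir.Value in a kernel body and mgpu aliases jax.experimental.mosaic.gpu (as in the lowering above):

mgpu.debug_print("idx: {}", idx, scope=mgpu.ThreadSubset.WARP)  # One print per warp.
# mgpu.debug_print("idx: {}", idx, uniform=False, scope=mgpu.ThreadSubset.WARP)
#   -> ValueError: Cannot specify scope to a non-uniform debug_print.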

tests/pallas/mosaic_gpu_test.py

Lines changed: 112 additions & 58 deletions
@@ -1569,64 +1569,6 @@ def kernel(x_ref, y_ref, o_ref):
     y = jax.lax.iota(jnp.float32, 128) * 3
     np.testing.assert_array_equal(kernel(x, y), x + y)
 
-  def test_warp_specialization_axis_index(self):
-    if self.LOWERING_SEMANTICS != plgpu.LoweringSemantics.Lane:
-      self.skipTest("Test only works on Lane semantics")
-    warp_mesh = plgpu.WarpMesh(axis_name="warp")
-    @functools.partial(plgpu.kernel,
-                       out_shape=jax.ShapeDtypeStruct((2, 128), jnp.int32))
-    def kernel(y_ref):
-      def scope(ones_smem_ref, threes_smem_ref):
-        # Prepare data to copy.
-        ones_smem_ref[:] = jnp.ones((1, 128), jnp.int32)
-        threes_smem_ref[:] = jnp.ones((1, 128), jnp.int32) * 3
-        plgpu.commit_smem()
-        @pl.core_map(warp_mesh)
-        def _():
-          warp_id = lax.axis_index("warp")
-          # We cannot load/store inside of core_map, so we issue async
-          # copies instead to produce a testable result.
-          @pl.when(warp_id == 1)
-          def _():
-            plgpu.copy_smem_to_gmem(ones_smem_ref, y_ref.at[0:1])
-          @pl.when(warp_id == 3)
-          def _():
-            plgpu.copy_smem_to_gmem(threes_smem_ref, y_ref.at[1:2])
-        plgpu.wait_smem_to_gmem(0)
-      pl.run_scoped(scope,
-                    plgpu.SMEM((1, 128), jnp.int32),
-                    plgpu.SMEM((1, 128), jnp.int32)
-                    )
-    result = kernel()
-    expected = jnp.stack((jnp.ones((128,), jnp.int32),
-                          jnp.ones((128,), jnp.int32) * 3), axis=0)
-    np.testing.assert_array_equal(result, expected)
-
-  def test_warp_mesh_errors_when_closing_over_array(self):
-    if self.LOWERING_SEMANTICS != plgpu.LoweringSemantics.Lane:
-      self.skipTest("Test only works on Lane semantics")
-    # We currently do not allow closing over arrays when mapping over
-    # a mesh, since we would need to present a view of the array local
-    # to each warp.
-    warp_mesh = plgpu.WarpMesh(axis_name="warp")
-    @functools.partial(plgpu.kernel,
-                       out_shape=jax.ShapeDtypeStruct((32, 32), jnp.float32),
-                       scratch_shapes=[plgpu.SMEM((32, 32), jnp.float32)])
-    def kernel(out_ref, smem_ref):
-      arr = jnp.ones((32, 32), dtype=jnp.float32)
-      @pl.core_map(warp_mesh)
-      def _():
-        smem_ref[...] = arr + 1
-      plgpu.commit_smem()
-      plgpu.copy_smem_to_gmem(smem_ref, out_ref)
-      plgpu.wait_smem_to_gmem(0)
-    with self.assertRaisesRegex(
-        mgpu_lowering.LoweringError,
-        "Can only close over scalars and Refs when using core_map with "
-        "WarpMesh",
-    ):
-      kernel()
-
   def test_smem_aliasing_works(self):
     self.skip_if_wg_semantics()

@@ -1825,6 +1767,118 @@ def body(idx, _):
     )
 
 
+class PallasCallWarpPrimitiveSemanticsTest(PallasTest):
+  def setUp(self):
+    super().setUp()
+    if self.LOWERING_SEMANTICS != plgpu.LoweringSemantics.Lane:
+      self.skipTest("Test only works on Lane semantics")
+
+  def test_axis_index(self):
+    warp_mesh = plgpu.WarpMesh(axis_name="warp")
+    @functools.partial(plgpu.kernel,
+                       out_shape=jax.ShapeDtypeStruct((2, 128), jnp.int32))
+    def kernel(y_ref):
+      def scope(ones_smem_ref, threes_smem_ref):
+        # Prepare data to copy.
+        ones_smem_ref[:] = jnp.ones((1, 128), jnp.int32)
+        threes_smem_ref[:] = jnp.ones((1, 128), jnp.int32) * 3
+        plgpu.commit_smem()
+        @pl.core_map(warp_mesh)
+        def _():
+          warp_id = lax.axis_index("warp")
+          # We cannot load/store inside of core_map, so we issue async
+          # copies instead to produce a testable result.
+          @pl.when(warp_id == 1)
+          def _():
+            plgpu.copy_smem_to_gmem(ones_smem_ref, y_ref.at[0:1])
+          @pl.when(warp_id == 3)
+          def _():
+            plgpu.copy_smem_to_gmem(threes_smem_ref, y_ref.at[1:2])
+        plgpu.wait_smem_to_gmem(0)
+      pl.run_scoped(scope,
+                    plgpu.SMEM((1, 128), jnp.int32),
+                    plgpu.SMEM((1, 128), jnp.int32)
+                    )
+    result = kernel()
+    expected = jnp.stack((jnp.ones((128,), jnp.int32),
+                          jnp.ones((128,), jnp.int32) * 3), axis=0)
+    np.testing.assert_array_equal(result, expected)
+
+  def test_errors_when_closing_over_array(self):
+    # We currently do not allow closing over arrays when mapping over
+    # a mesh, since we would need to present a view of the array local
+    # to each warp.
+    warp_mesh = plgpu.WarpMesh(axis_name="warp")
+    @functools.partial(plgpu.kernel,
+                       out_shape=jax.ShapeDtypeStruct((32, 32), jnp.float32),
+                       scratch_shapes=[plgpu.SMEM((32, 32), jnp.float32)])
+    def kernel(out_ref, smem_ref):
+      arr = jnp.ones((32, 32), dtype=jnp.float32)
+      @pl.core_map(warp_mesh)
+      def _():
+        smem_ref[...] = arr + 1
+      plgpu.commit_smem()
+      plgpu.copy_smem_to_gmem(smem_ref, out_ref)
+      plgpu.wait_smem_to_gmem(0)
+    with self.assertRaisesRegex(
+        mgpu_lowering.LoweringError,
+        "Can only close over scalars and Refs when using core_map with "
+        "WarpMesh",
+    ):
+      kernel()
+
+  def test_single_warp_scan(self):
+    warp_mesh = plgpu.WarpMesh(axis_name="warp")
+    @functools.partial(plgpu.kernel,
+                       out_shape=jax.ShapeDtypeStruct((10, 128), jnp.int32))
+    def kernel(y_ref):
+      def scope(smem_ref):
+        # Prepare data to copy.
+        for i in range(10):
+          smem_ref[i, :] = jnp.ones_like(smem_ref.at[i]) * i
+        plgpu.commit_smem()
+        @pl.core_map(warp_mesh)
+        def _():
+          warp_id = lax.axis_index("warp")
+          @pl.when(warp_id == 0)
+          def _():
+            def loop_body(i, _):
+              _slice = pl.ds(i, 1)
+              plgpu.copy_smem_to_gmem(smem_ref.at[_slice], y_ref.at[_slice])
+            lax.fori_loop(0, 10, loop_body, None)
+        plgpu.wait_smem_to_gmem(0)
+      pl.run_scoped(scope, plgpu.SMEM((10, 128), jnp.int32))
+    result = kernel()
+    expected = jnp.stack(
+        [jnp.ones((128,), jnp.int32) * i for i in range(10)], axis=0)
+    np.testing.assert_array_equal(result, expected)
+
+  def test_debug_print(self):
+    warp_mesh = plgpu.WarpMesh(axis_name="warp")
+    @functools.partial(
+        plgpu.kernel,
+        out_shape=jnp.zeros(128, np.int32),
+    )
+    def kernel(ref):
+      ref[...] = ref[...]  # Prevent kernel from being DCE'd
+      @pl.core_map(warp_mesh)
+      def _():
+        warp_id = lax.axis_index("warp")
+        pl.debug_print("warp: {}", warp_id)
+
+    with self.capture_stdout() as output:
+      jax.block_until_ready(kernel())
+    self.assertEqual(
+        set(output().splitlines()),
+        {
+            "warp: 0",
+            "warp: 1",
+            "warp: 2",
+            "warp: 3",
+        },
+    )
+
+
 class PallasCallWGTest(
     PallasCallTest, lowering_semantics=plgpu.LoweringSemantics.Warpgroup
 ):
