[mosaic-gpu] add multicast ptr support to TMA with overlapped gemm and all reduce examples #28679

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Status: Open. Wants to merge 2 commits into main.
5 changes: 5 additions & 0 deletions jax/_src/pallas/mosaic_gpu/primitives.py
@@ -237,6 +237,7 @@ def _copy_smem_to_gmem_lowering(
     has_user_predicate,
     commit_group,
     reduction_op,
+    team_id,
 ):
   if has_user_predicate:
     flat_args, user_predicate = flat_args[:-1], flat_args[-1]
@@ -268,6 +269,7 @@ def _copy_smem_to_gmem_lowering(
       predicate=predicate,
       arrive=commit_group,
       reduction_op=reduction_op,
+      team_id=team_id,
       **copy_params,
   )
   return ()
Expand Down Expand Up @@ -347,6 +349,7 @@ def copy_smem_to_gmem(
*,
commit_group: bool = True,
reduction_op: mgpu.ReductionOp | None = None,
team_id: int | None = None,
) -> None:
"""Asynchronously copies a SMEM reference to a GMEM reference.

@@ -361,6 +364,7 @@ def copy_smem_to_gmem(
     reduction_op: If set, perform the specified reduction operation when storing
       to GMEM. For example, using ``"add"`` is conceptually equivalent to
       doing ``dst += src``.
+    team_id: If set, the dst ref is translated to a multicast memory address.
Review comment (Member):
How does a Pallas user get control over teams? It's not a JAX-level concept so it doesn't make sense to surface it here. How does XLA manage that?

I think what you can do on the JAX level is take an axis name, and perform the reduction along that JAX mesh axis.

Finally: does it ever make sense to use team_id without reduction_op? If not, we should add checks.
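
For context, a minimal sketch of both suggestions, under stated assumptions: `plgpu.copy_smem_to_gmem` and `reduction_op` exist in Pallas, but the `reduction_axis` parameter is a hypothetical rendering of the reviewer's axis-name idea, and the team_id/reduction_op check is only a proposal; neither is code in this PR.

# Hypothetical sketch of the axis-name suggestion: the Pallas call would take
# a JAX mesh axis name and derive the multicast team from it, instead of
# exposing a device-level team_id. `reduction_axis` is invented here.
from jax.experimental.pallas import mosaic_gpu as plgpu

def store_with_allreduce(x_smem, out_gmem):
  plgpu.copy_smem_to_gmem(
      x_smem,
      out_gmem,
      reduction_op="add",
      reduction_axis="devices",  # hypothetical: reduce along this mesh axis
  )

# Proposed check for the last point: if team_id is only meaningful together
# with a reduction, reject the inconsistent combination at trace time.
def _check_team_id(team_id, reduction_op):
  if team_id is not None and reduction_op is None:
    raise ValueError("team_id requires reduction_op to be set")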


   See also:
     :func:`jax.experimental.mosaic.gpu.wait_smem_to_gmem`
@@ -389,6 +393,7 @@ def copy_smem_to_gmem(
       has_user_predicate=predicate is not None,
       commit_group=commit_group,
       reduction_op=reduction_op,
+      team_id=team_id,
   )
   return None
19 changes: 16 additions & 3 deletions jax/experimental/mosaic/gpu/core.py
@@ -116,8 +116,14 @@ def supports_cross_device_collectives():
 
 
 @mosaic_gpu_p.def_abstract_eval
-def _mosaic_gpu_abstract_eval(*_, module, out_types):
+def _mosaic_gpu_abstract_eval(
+    *_,
+    module,
+    out_types,
+    input_output_aliases,
+):
   del module  # Unused.
+  del input_output_aliases  # Unused.
   return [jax._src.core.ShapedArray(t.shape, t.dtype) for t in out_types]
 
 # TODO(apaszke): Implement a proper system for managing kernel lifetimes
@@ -618,8 +624,9 @@ def _run_serde_pass(
 def _declare_runtime_functions():
   """Declares the runtime functions that can be used by the generated code."""
   ptr_ty = ir.Type.parse("!llvm.ptr")
+  i32 = ir.IntegerType.get_signless(32)
   i64 = ir.IntegerType.get_signless(64)
-  arg_tys = [ptr_ty, ptr_ty, i64, i64, ptr_ty, ptr_ty, i64, ptr_ty]
+  arg_tys = [ptr_ty, ptr_ty, i64, i64, ptr_ty, ptr_ty, i64, ptr_ty, i32]
Review comment (Member):
Instead of changing the signature of this function, you could just add a new function in runtime.cc and call it to translate a regular pointer to an mc pointer. It doesn't have to be bundled with the TMA desc initialization.
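
A minimal sketch of that alternative, assuming a new runtime entry point implemented in runtime.cc; the name `mosaic_gpu_convert_to_multicast_ptr` and its signature are hypothetical. The MLIR-side declaration mirrors the existing `mosaic_gpu_init_tma_desc` pattern below and leaves the TMA descriptor signature untouched.

# Hypothetical standalone declaration: translate (regular_ptr, team_id) into
# a multicast pointer in its own runtime call. Function name and signature
# are illustrative only; they are not part of this PR.
from jaxlib.mlir import ir
from jaxlib.mlir.dialects import func

def _declare_multicast_translation():
  ptr_ty = ir.Type.parse("!llvm.ptr")
  i32 = ir.IntegerType.get_signless(32)
  fn_type = ir.FunctionType.get([ptr_ty, i32], [ptr_ty])  # returns mc ptr
  func.FuncOp(
      "mosaic_gpu_convert_to_multicast_ptr", fn_type, visibility="private"
  )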

   init_tma_desc_type = ir.FunctionType.get(arg_tys, [])
   func.FuncOp(
       "mosaic_gpu_init_tma_desc", init_tma_desc_type, visibility="private"
@@ -639,6 +646,7 @@ def as_gpu_kernel(
     kernel_name: str | None = None,
     ir_version: int | None = None,
     thread_semantics: LoweringSemantics = LoweringSemantics.Lane,
+    input_output_aliases: tuple[tuple[int, int], ...] = (),
Review comment (Member):
The addition of input_output_aliases is a separate change. Please send an independent PR for this.
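
For reference, a sketch of how the parameter might be used once split into its own PR, assuming the `(input_index, output_index)` pairs follow XLA's input/output aliasing convention; `body` and the shapes are placeholders.

# Hypothetical usage: alias input 0 to output 0 so the kernel may write its
# result into the donated input buffer. `body` is an elided kernel body and
# the aliasing semantics are assumed to mirror XLA's input/output aliasing.
import jax
import jax.numpy as jnp
import jax.experimental.mosaic.gpu as mgpu

kernel = mgpu.as_gpu_kernel(
    body,  # placeholder Mosaic GPU kernel body
    grid=(1, 1, 1),
    block=(128, 1, 1),
    in_shape=jax.ShapeDtypeStruct((1024,), jnp.float32),
    out_shape=jax.ShapeDtypeStruct((1024,), jnp.float32),
    smem_scratch_shape=(),
    input_output_aliases=((0, 0),),  # new parameter added by this PR
)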

 ):
   if isinstance(in_shape, list):
     in_shape = tuple(in_shape)
@@ -680,7 +688,12 @@ def _check_args(*args):
   )
 
   def bind(*args) -> Any:
-    return mosaic_gpu_p.bind(*args, module=module, out_types=out_shape)
+    return mosaic_gpu_p.bind(
+        *args,
+        module=module,
+        out_types=out_shape,
+        input_output_aliases=input_output_aliases,
+    )
 
   if prof_spec is not None:
     @jax.jit