From 0823dc3cefd1e8f3b74329623d5964edb8ea39a4 Mon Sep 17 00:00:00 2001 From: matrix72 Date: Wed, 5 Mar 2025 16:47:06 +0800 Subject: [PATCH] Add GPU transformation operations for barrier elimination and async group creation --- examples/BuddyGPU/transform.mlir | 48 +- midend/include/Dialect/GPU/TransformOps.td | 110 ++++ midend/include/Utils/GPUUtils.h | 6 + midend/lib/Dialect/GPU/TransformOps.cpp | 627 +++++++++++++++++++++ midend/lib/Utils/GPUUtils.cpp | 289 ++++++++++ 5 files changed, 1056 insertions(+), 24 deletions(-) diff --git a/examples/BuddyGPU/transform.mlir b/examples/BuddyGPU/transform.mlir index e2a02a9a97..7b6e711e5b 100644 --- a/examples/BuddyGPU/transform.mlir +++ b/examples/BuddyGPU/transform.mlir @@ -135,10 +135,10 @@ module attributes { transform.with_named_sequence } { // Rewrite bufferized scf.forall ops to distributed gpu.thread_id attribute. %mapped = transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [64, 2, 1] warp_size = 32 : (!transform.any_op) -> !transform.any_op - %15 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op + %14 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op // Removes unnecessary GPU barriers from the function. - // %15 = transform.buddy.eliminate_gpu_barriers %14 : (!transform.any_op) -> !transform.any_op + %15 = transform.buddy.eliminate_gpu_barriers %14 : (!transform.any_op) -> !transform.any_op // Perform canonicalization. transform.apply_patterns to %15 { @@ -191,7 +191,7 @@ module attributes { transform.with_named_sequence } { // Insert a gpu.barrier after a given scf.for loop %16 = transform.structured.match ops{["scf.for"]} in %15 : (!transform.any_op) -> !transform.op<"scf.for"> - // transform.buddy.synchronize_loop %16 : (!transform.op<"scf.for">) -> () + transform.buddy.synchronize_loop %16 : (!transform.op<"scf.for">) -> () transform.apply_patterns to %15 { @@ -223,77 +223,77 @@ module attributes { transform.with_named_sequence } { // mma operations, targetting warp level tensorcore operations. transform.buddy.vector.vector_to_mma_conversion %17 {use_mma_sync} : (!transform.any_op) -> () - // %18 = transform.buddy.eliminate_gpu_barriers %17 : (!transform.any_op) -> !transform.any_op + %18 = transform.buddy.eliminate_gpu_barriers %17 : (!transform.any_op) -> !transform.any_op // Perform canonicalization. 
- transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.scf.for_loop_canonicalization transform.apply_patterns.canonicalization } : !transform.any_op - transform.apply_cse to %17 : !transform.any_op + transform.apply_cse to %18 : !transform.any_op %all_loops_7 = transform.structured.match interface{LoopLikeInterface} - in %17 + in %18 : (!transform.any_op) -> !transform.any_op transform.apply_licm to %all_loops_7 : !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.vector.lower_masked_transfers } : !transform.any_op - %19 = transform.structured.match ops{["gpu.launch"]} in %17 : (!transform.any_op) -> !transform.any_op + %19 = transform.structured.match ops{["gpu.launch"]} in %18 : (!transform.any_op) -> !transform.any_op %fwfa = transform.structured.match ops{["memref.alloc"]} in %19 : (!transform.any_op) -> !transform.op<"memref.alloc"> // Do multi-buffering/array expansion to remove dependencies on the temporary allocation between consecutive loop iterations. transform.memref.multibuffer %fwfa {factor = 3 : i64, skip_analysis} : (!transform.op<"memref.alloc">) -> !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.vector.transfer_to_scf full_unroll = true } : !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.scf.for_loop_canonicalization transform.apply_patterns.canonicalization } : !transform.any_op - transform.apply_cse to %17 : !transform.any_op + transform.apply_cse to %18 : !transform.any_op %all_loops_8 = transform.structured.match interface{LoopLikeInterface} - in %17 + in %18 : (!transform.any_op) -> !transform.any_op transform.apply_licm to %all_loops_8 : !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.vector.lower_masked_transfers } : !transform.any_op // Convert sync copies to shared memory to async. - // transform.buddy.create_async_groups %17 {use_mma_sync} : (!transform.any_op) -> () - transform.apply_patterns to %17 { + transform.buddy.create_async_groups %18 {use_mma_sync} : (!transform.any_op) -> () + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.scf.for_loop_canonicalization transform.apply_patterns.canonicalization transform.apply_patterns.memref.fold_memref_alias_ops } : !transform.any_op %all_loops_9 = transform.structured.match interface{LoopLikeInterface} - in %17 + in %18 : (!transform.any_op) -> !transform.any_op transform.apply_licm to %all_loops_9 : !transform.any_op - transform.apply_cse to %17 : !transform.any_op + transform.apply_cse to %18 : !transform.any_op - %20 = transform.structured.match ops{["nvgpu.mma.sync"]} in %17 : (!transform.any_op) -> !transform.any_op + %20 = transform.structured.match ops{["nvgpu.mma.sync"]} in %18 : (!transform.any_op) -> !transform.any_op %21 = transform.get_parent_op %20 {deduplicate, op_name = "scf.for"} : (!transform.any_op) -> !transform.any_op // This applies software pipelining to a given scf.for loop. // The pipelining strategy will look for a copy to shared memory and pipeline it to overlap it with the rest of the loop. 
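+    // The multibuffer factor of 3 above provisions one shared-memory buffer
+    // per in-flight pipeline stage, matching the depth = 3 used by the
+    // (currently disabled) pipelining step below.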
// %22 = transform.buddy.pipeline_shared_memory_copies %21 {depth = 3 : i64, use_mma_sync, peel_epilogue} : (!transform.any_op) -> !transform.any_op // Perform canonicalization. - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.vector.lower_masks } : !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.vector.materialize_masks } : !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.scf.for_loop_canonicalization transform.apply_patterns.canonicalization @@ -301,10 +301,10 @@ module attributes { transform.with_named_sequence } { } : !transform.any_op %all_loops_10 = transform.structured.match interface{LoopLikeInterface} - in %17 + in %18 : (!transform.any_op) -> !transform.any_op transform.apply_licm to %all_loops_10 : !transform.any_op - transform.apply_cse to %17 : !transform.any_op + transform.apply_cse to %18 : !transform.any_op transform.yield } diff --git a/midend/include/Dialect/GPU/TransformOps.td b/midend/include/Dialect/GPU/TransformOps.td index 8eb7fac01d..fbc5072f86 100644 --- a/midend/include/Dialect/GPU/TransformOps.td +++ b/midend/include/Dialect/GPU/TransformOps.td @@ -124,4 +124,114 @@ def VectorToMMAConversionOp : Op { + let description = [{ + Removes unnecessary GPU barriers from the function. If a barrier does not + enforce any conflicting pair of memory effects, including a pair that is + enforced by another barrier, it is unnecessary and can be removed. + + #### Return modes + + Consumes the operand handle and produces a new handle to the function after + rewriting. + }]; + + let arguments = (ins TransformHandleTypeInterface:$target); + let results = (outs TransformHandleTypeInterface:$result); + + let assemblyFormat = [{ $target attr-dict `:` functional-type(operands, results)}]; + let cppNamespace = "mlir::buddy::gpu"; + + let builders = [ + OpBuilder<(ins "Value":$target)> + ]; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::func::FuncOp target, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + +def SynchronizeLoopOp : Op< + Transform_Dialect, "buddy.synchronize_loop", [ + DeclareOpInterfaceMethods, + TransformEachOpTrait, + TransformOpInterface, + ReportTrackingListenerFailuresOpTrait]> { + let description = [{ + This inserts a gpu.barrier after a given scf.for loop. + + #### Return modes + This transform consumes the scf.for handle and produces a result handle + which points to the new scf.for loop generated. It will fail if the loop + cannot be pipelined or if there are no shared memory copies. + }]; + + let arguments = ( + ins TransformHandleTypeInterface:$for_op); + let results = (outs); + + let cppNamespace = "mlir::buddy::gpu"; + + let assemblyFormat = [{ + $for_op + attr-dict + `:` functional-type(operands, results)}]; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::scf::ForOp forOp, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + +def CreateAsyncGroupsOp : + Op, + TransformEachOpTrait, + TransformOpInterface, + ReportTrackingListenerFailuresOpTrait]> { + let description = [{ + Convert copies to shared memory to async copies. 
This creates groups + of consecutive copies and emit wait operation right after. + The input operation is a `func.func`. + + `use_mma_sync` specifies whether `bypassL1` attributes should be added to the + async copies. + + #### Return modes + This op returns a handle to the transformed function, even if nothing + changed. + }]; + + let arguments = (ins TransformHandleTypeInterface:$target, + UnitAttr:$use_mma_sync); + let results = (outs); + + let assemblyFormat = [{ + $target + attr-dict + `:` functional-type(operands, results)}]; + let cppNamespace = "mlir::buddy::gpu"; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::func::FuncOp target, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + #endif // TRANSFORM_OPS_TD diff --git a/midend/include/Utils/GPUUtils.h b/midend/include/Utils/GPUUtils.h index 88605fe1d3..2b343fab1f 100644 --- a/midend/include/Utils/GPUUtils.h +++ b/midend/include/Utils/GPUUtils.h @@ -98,6 +98,12 @@ template void hoistStaticallyBoundAllocationsInFunc(RewriterBase &rewriter, func::FuncOp funcOp); +//===----------------------------------------------------------------------===// +// Utility from compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h +//===----------------------------------------------------------------------===// +void createAsyncGroups(RewriterBase &rewriter, func::FuncOp funcOp, + bool useMMASync); + } // namespace buddy::gpu } // namespace mlir diff --git a/midend/lib/Dialect/GPU/TransformOps.cpp b/midend/lib/Dialect/GPU/TransformOps.cpp index 3e689fc931..f4b164a405 100644 --- a/midend/lib/Dialect/GPU/TransformOps.cpp +++ b/midend/lib/Dialect/GPU/TransformOps.cpp @@ -207,5 +207,632 @@ buddy::gpu::VectorToMMAConversionOp::applyToOne( return listener.checkAndResetError(); } +//===---------------------------------------------------------------------===// +// EliminateGpuBarriersOp +//===---------------------------------------------------------------------===// + +/// Returns `true` if the op is known not to have any side effects, but does not +/// implement the MemoryEffectsOpInterface in the suitable way. +static bool isKnownNoEffectsOpWithoutInterface(Operation *op) { + // memref::AssumeAlignment is conceptually pure, but marking it as such would + // make DCE immediately remove it. + return isa(op); +} + +/// Returns `true` if the op is defines the parallel region that is subject to +/// barrier synchronization. +static bool isParallelRegionBoundary(Operation *op) { + if (op->hasAttr("__parallel_region_boundary_for_test")) + return true; + + // We consider functions inside executable variants that have the same symbol + // name as an export symbol. + auto func = dyn_cast(op); + if (!func) + return false; + auto parent = op->getParentOfType(); + if (!parent) + return false; + auto variant = parent->getParentOfType(); + if (!variant) + return false; + return true; +} + +/// Returns `true` if the op behaves like a sequential loop, e.g., the control +/// flow "wraps around" from the end of the body region back to its start. +static bool isSequentialLoopLike(Operation *op) { return isa(op); } + +/// Returns `true` if the regions of the op are guaranteed to be executed at +/// most once. Thus, if an operation in one of the nested regions of `op` is +/// executed than so are all the other operations in this region. 
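+/// For example, the then-region of an `scf.if` executes at most once per
+/// evaluation of the op, so effects collected from it are exact, whereas an
+/// `scf.for` body may execute any number of times and does not have this
+/// property.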
+static bool hasSingleExecutionBody(Operation *op) { + return isa(op); +} + +/// Returns `true` if the operation is known to produce a pointer-like object +/// distinct from any other object produced by a similar operation. For example, +/// an allocation produces such an object. +static bool producesDistinctBase(Operation *op) { + return isa_and_nonnull(op); +} + +/// Populates `effects` with all memory effects without associating them to a +/// specific value. +static void addAllValuelessEffects( + SmallVectorImpl &effects) { + effects.emplace_back(MemoryEffects::Effect::get()); + effects.emplace_back(MemoryEffects::Effect::get()); + effects.emplace_back(MemoryEffects::Effect::get()); + effects.emplace_back(MemoryEffects::Effect::get()); +} + +/// Collect the memory effects of the given op in 'effects'. Returns 'true' if +/// it could extract the effect information from the op, otherwise returns +/// 'false' and conservatively populates the list with all possible effects +/// associated with no particular value or symbol. +static bool +collectEffects(Operation *op, + SmallVectorImpl &effects, + bool ignoreBarriers = true) { + // Skip over barriers to avoid infinite recursion (those barriers would ask + // this barrier again). + if (ignoreBarriers && isa(op)) + return true; + + // Skip over ops that we know have no effects. + if (isKnownNoEffectsOpWithoutInterface(op)) + return true; + + // Collect effect instances the operation. Note that the implementation of + // getEffects erases all effect instances that have the type other than the + // template parameter so we collect them first in a local buffer and then + // copy. + if (auto iface = dyn_cast(op)) { + SmallVector localEffects; + iface.getEffects(localEffects); + llvm::append_range(effects, localEffects); + return true; + } + if (op->hasTrait()) { + for (auto ®ion : op->getRegions()) { + for (auto &block : region) { + for (auto &innerOp : block) + if (!collectEffects(&innerOp, effects, ignoreBarriers)) + return false; + } + } + return true; + } + + // We need to be conservative here in case the op doesn't have the interface + // and assume it can have any possible effect. + addAllValuelessEffects(effects); + return false; +} + +/// Collects memory effects from operations that may be executed before `op` in +/// a trivial structured control flow, e.g., without branches. Stops at the +/// parallel region boundary or at the barrier operation if `stopAtBarrier` is +/// set. Returns `true` if the memory effects added to `effects` are exact, +/// `false` if they are a conservative over-approximation. The latter means that +/// `effects` contain instances not associated with a specific value. +static bool +getEffectsBefore(Operation *op, + SmallVectorImpl &effects, + bool stopAtBarrier) { + if (!op->getBlock()) + return true; + + // If there is a non-structured control flow, bail. + Region *region = op->getBlock()->getParent(); + if (region && !llvm::hasSingleElement(region->getBlocks())) { + addAllValuelessEffects(effects); + return false; + } + + // Collect all effects before the op. + if (op != &op->getBlock()->front()) { + for (Operation *it = op->getPrevNode(); it != nullptr; + it = it->getPrevNode()) { + if (isa(it)) { + if (stopAtBarrier) + return true; + else + continue; + } + if (!collectEffects(it, effects)) + return false; + } + } + + // Stop if reached the parallel region boundary. + if (isParallelRegionBoundary(op->getParentOp())) + return true; + + // Otherwise, keep collecting above the parent operation. 
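+  // For example, for a barrier nested as func.func -> scf.for -> gpu.barrier,
+  // this also accumulates the effects of the statements preceding the scf.for
+  // in the function body, stopping at an enclosing barrier (when
+  // `stopAtBarrier` is set) or at the parallel region boundary.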
+ if (!getEffectsBefore(op->getParentOp(), effects, stopAtBarrier)) + return false; + + // If the op is loop-like, collect effects from the trailing operations until + // we hit a barrier because they can executed before the current operation by + // the previous iteration of this loop. For example, in the following loop + // + // for i = ... { + // op1 + // ... + // barrier + // op2 + // } + // + // the operation `op2` at iteration `i` is known to be executed before the + // operation `op1` at iteration `i+1` and the side effects must be ordered + // appropriately. + if (isSequentialLoopLike(op->getParentOp())) { + // Assuming loop terminators have no side effects. + return getEffectsBefore(op->getBlock()->getTerminator(), effects, + /*stopAtBarrier=*/true); + } + + // If the parent operation is not guaranteed to execute its (single-block) + // region once, walk the block. + bool conservative = false; + if (!hasSingleExecutionBody(op->getParentOp())) + op->getParentOp()->walk([&](Operation *in) { + if (conservative) + return WalkResult::interrupt(); + if (!collectEffects(in, effects)) { + conservative = true; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + + return !conservative; +} + +/// Collects memory effects from operations that may be executed after `op` in +/// a trivial structured control flow, e.g., without branches. Stops at the +/// parallel region boundary or at the barrier operation if `stopAtBarrier` is +/// set. Returns `true` if the memory effects added to `effects` are exact, +/// `false` if they are a conservative over-approximation. The latter means that +/// `effects` contain instances not associated with a specific value. +static bool +getEffectsAfter(Operation *op, + SmallVectorImpl &effects, + bool stopAtBarrier) { + if (!op->getBlock()) + return true; + + // If there is a non-structured control flow, bail. + Region *region = op->getBlock()->getParent(); + if (region && !llvm::hasSingleElement(region->getBlocks())) { + addAllValuelessEffects(effects); + return false; + } + + // Collect all effects after the op. + if (op != &op->getBlock()->back()) + for (Operation *it = op->getNextNode(); it != nullptr; + it = it->getNextNode()) { + if (isa(it)) { + if (stopAtBarrier) + return true; + continue; + } + if (!collectEffects(it, effects)) + return false; + } + + // Stop if reached the parallel region boundary. + if (isParallelRegionBoundary(op->getParentOp())) + return true; + + // Otherwise, keep collecting below the parent operation. + if (!getEffectsAfter(op->getParentOp(), effects, stopAtBarrier)) + return false; + + // If the op is loop-like, collect effects from the leading operations until + // we hit a barrier because they can executed after the current operation by + // the next iteration of this loop. For example, in the following loop + // + // for i = ... { + // op1 + // ... + // barrier + // op2 + // } + // + // the operation `op1` at iteration `i` is known to be executed after the + // operation `op2` at iteration `i-1` and the side effects must be ordered + // appropriately. + if (isSequentialLoopLike(op->getParentOp())) { + if (isa(op->getBlock()->front())) + return true; + + bool exact = collectEffects(&op->getBlock()->front(), effects); + return getEffectsAfter(&op->getBlock()->front(), effects, + /*stopAtBarrier=*/true) && + exact; + } + + // If the parent operation is not guaranteed to execute its (single-block) + // region once, walk the block. 
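+  // The walk below over-approximates: it collects the effects of every op in
+  // the parent region, and if any nested op cannot report its effects the
+  // result is flagged as conservative so the caller treats it as touching
+  // arbitrary memory.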
+ bool conservative = false; + if (!hasSingleExecutionBody(op->getParentOp())) + op->getParentOp()->walk([&](Operation *in) { + if (conservative) + return WalkResult::interrupt(); + if (!collectEffects(in, effects)) { + conservative = true; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + + return !conservative; +} + +/// Looks through known "view-like" ops to find the base memref. +static Value getBase(Value v) { + while (true) { + Operation *definingOp = v.getDefiningOp(); + if (!definingOp) + break; + + bool shouldContinue = + TypeSwitch(v.getDefiningOp()) + .Case( + [&](auto op) { + v = op.getSource(); + return true; + }) + .Case([&](auto op) { + v = op.getIn(); + return true; + }) + .Case([&](auto op) { + v = op.getSrc(); + return true; + }) + .Default([](Operation *) { return false; }); + if (!shouldContinue) + break; + } + return v; +} + +/// Returns `true` if the value is defined as a function argument. +static bool isFunctionArgument(Value v) { + auto arg = dyn_cast(v); + return arg && isa(arg.getOwner()->getParentOp()); +} + +/// Returns the operand that the operation "propagates" through it for capture +/// purposes. That is, if the value produced by this operation is captured, then +/// so is the returned value. + + +static Value propagatesCapture(Operation *op) { + return llvm::TypeSwitch(op) + .Case( + [](ViewLikeOpInterface viewLike) { return viewLike.getViewSource(); }) + .Case([](CastOpInterface castLike) { return castLike->getOperand(0); }) + .Case([](memref::TransposeOp transpose) { return transpose.getIn(); }) + .Case( + [](auto op) { return op.getSrc(); }) + .Default([](Operation *) { return Value(); }); +} + +/// Returns `true` if the given operation is known to capture the given value, +/// `false` if it is known not to capture the given value, `nullopt` if neither +/// is known. +static std::optional getKnownCapturingStatus(Operation *op, Value v) { + return llvm::TypeSwitch>(op) + // Store-like operations don't capture the destination, but do capture + // the value. + .Case( + [&](auto op) { return op.getValue() == v; }) + .Case( + [&](auto op) { return op.getValueToStore() == v; }) + // These operations are known not to capture. + .Case([](memref::DeallocOp) { return false; }) + // By default, we don't know anything. + .Default([](Operation *) { return std::nullopt; }); +} + +/// Returns `true` if the value may be captured by any of its users, i.e., if +/// the user may be storing this value into memory. This makes aliasing analysis +/// more conservative as it cannot assume the pointer-like value is only passed +/// around through SSA use-def. +static bool maybeCaptured(Value v) { + SmallVector todo = {v}; + while (!todo.empty()) { + Value v = todo.pop_back_val(); + for (Operation *user : v.getUsers()) { + // A user that is known to only read cannot capture. + auto iface = dyn_cast(user); + if (iface) { + SmallVector effects; + iface.getEffects(effects); + if (llvm::all_of(effects, + [](const MemoryEffects::EffectInstance &effect) { + return isa(effect.getEffect()); + })) { + continue; + } + } + + // When an operation is known to create an alias, consider if the + // source is captured as well. + if (Value v = propagatesCapture(user)) { + todo.push_back(v); + continue; + } + + std::optional knownCaptureStatus = getKnownCapturingStatus(user, v); + if (!knownCaptureStatus || *knownCaptureStatus) + return true; + } + } + + return false; +} + +/// Returns true if two values may be referencing aliasing memory. 
This is a +/// rather naive and conservative analysis. Values defined by different +/// allocation-like operations as well as values derived from those by casts and +/// views cannot alias each other. Similarly, values defined by allocations +/// inside a function cannot alias function arguments. Global values cannot +/// alias each other or local allocations. Values that are captured, i.e. +/// themselves potentially stored in memory, are considered as aliasing with +/// everything. This seems sufficient to achieve barrier removal in structured +/// control flow, more complex cases would require a proper dataflow analysis. +static bool mayAlias(Value first, Value second) { + + first = getBase(first); + second = getBase(second); + // Values derived from the same base memref do alias (unless we do a more + // advanced analysis to prove non-overlapping accesses). + if (first == second) { + return true; + } + + // Different globals cannot alias. + if (auto globFirst = first.getDefiningOp()) { + if (auto globSecond = second.getDefiningOp()) { + return globFirst.getNameAttr() == globSecond.getNameAttr(); + } + } + // if (auto subSpanFirst = + // first.getDefiningOp()) { + // if (auto subSpanSecond = + // second.getDefiningOp()) { + // return subSpanFirst.getBindingAttr() == subSpanSecond.getBindingAttr(); + // } + // } + + bool isDistinct[] = {producesDistinctBase(first.getDefiningOp()), + producesDistinctBase(second.getDefiningOp())}; + bool isGlobal[] = {first.getDefiningOp() != nullptr, + second.getDefiningOp() != nullptr}; + + // Non-equivalent distinct bases and globals cannot alias. At this point, we + // have already filtered out based on values being equal and global name being + // equal. + if ((isDistinct[0] || isGlobal[0]) && (isDistinct[1] || isGlobal[1])) + return false; + + bool isArg[] = {isFunctionArgument(first), isFunctionArgument(second)}; + + // Distinct bases (allocations) cannot have been passed as an argument. + if ((isDistinct[0] && isArg[1]) || (isDistinct[1] && isArg[0])) + return false; + + // Non-captured base distinct values cannot conflict with another base value. + if (isDistinct[0] && !maybeCaptured(first)) + return false; + if (isDistinct[1] && !maybeCaptured(second)) + return false; + + // Otherwise, conservatively assume aliasing. + return true; +} + +/// Returns `true` if the effect may be affecting memory aliasing the value. If +/// the effect is not associated with any value, it is assumed to affect all +/// memory and therefore aliases with everything. +static bool mayAlias(MemoryEffects::EffectInstance a, Value v2) { + if (Value v = a.getValue()) { + return mayAlias(v, v2); + } + return true; +} + +/// Returns `true` if the two effects may be affecting aliasing memory. If +/// an effect is not associated with any value, it is assumed to affect all +/// memory and therefore aliases with everything. Effects on different resources +/// cannot alias. +static bool mayAlias(MemoryEffects::EffectInstance a, + MemoryEffects::EffectInstance b) { + if (a.getResource()->getResourceID() != b.getResource()->getResourceID()) + return false; + if (Value v2 = b.getValue()) { + return mayAlias(a, v2); + } else if (Value v = a.getValue()) { + return mayAlias(b, v); + } + return true; +} + +/// Returns `true` if any of the "before" effect instances has a conflict with +/// any "after" instance for the purpose of barrier elimination. The effects are +/// supposed to be limited to a barrier synchronization scope. 
A conflict exists +/// if effects instances affect aliasing memory locations and at least on of +/// then as a write. As an exception, if the non-write effect is an allocation +/// effect, there is no conflict since we are only expected to see the +/// allocation happening in the same thread and it cannot be accessed from +/// another thread without capture (which we do handle in alias analysis). +static bool +haveConflictingEffects(ArrayRef beforeEffects, + ArrayRef afterEffects) { + for (const MemoryEffects::EffectInstance &before : beforeEffects) { + for (const MemoryEffects::EffectInstance &after : afterEffects) { + // If cannot alias, definitely no conflict. + if (!mayAlias(before, after)) + continue; + + // Read/read is not a conflict. + if (isa(before.getEffect()) && + isa(after.getEffect())) { + continue; + } + + // Allocate/* is not a conflict since the allocation happens within the + // thread context. + // TODO: This is not the case for */Free unless the allocation happened in + // the thread context, which we could also check for. + if (isa(before.getEffect()) || + isa(after.getEffect())) { + continue; + } + + // In the particular case that the before effect is a free, we only have 2 + // possibilities: + // 1. either the program is well-formed and there must be an interleaved + // alloc that must limit the scope of effect lookback and we can + // safely ignore the free -> read / free -> write and free -> free + // conflicts. + // 2. either the program is ill-formed and we are in undefined behavior + // territory. + if (isa(before.getEffect())) + continue; + + // Other kinds of effects create a conflict, e.g. read-after-write. + LLVM_DEBUG( + DBGS() << "found a conflict between (before): " << before.getValue() + << " read:" << isa(before.getEffect()) + << " write:" << isa(before.getEffect()) + << " alloc:" + << isa(before.getEffect()) << " free:" + << isa(before.getEffect()) << "\n"); + LLVM_DEBUG( + DBGS() << "and (after): " << after.getValue() + << " read:" << isa(after.getEffect()) + << " write:" << isa(after.getEffect()) + << " alloc:" << isa(after.getEffect()) + << " free:" << isa(after.getEffect()) + << "\n"); + return true; + } + } + + return false; +} + +namespace { +/// Barrier elimination pattern. If a barrier does not enforce any conflicting +/// pair of memory effects, including a pair that is enforced by another +/// barrier, it is unnecessary and can be removed. Adapted from +/// "High-Performance GPU-to-CPU Transpilation and Optimization via High-Level +/// Parallel Constructs" by Moses et.al. in PPoPP 2023 and implementation in +/// Polygeist. 
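+///
+/// A minimal sketch (hypothetical IR) of what this pattern removes: the
+/// second barrier below only separates reads of the same buffer, so it
+/// enforces no write/read conflict and is erased, while the first one is
+/// kept because it orders the store against the following load.
+///
+///   memref.store %v, %buf[%i] : memref<128xf32, 3>
+///   gpu.barrier            // kept: write-before-read conflict across it
+///   %a = memref.load %buf[%j] : memref<128xf32, 3>
+///   gpu.barrier            // removable: only reads on either side
+///   %b = memref.load %buf[%k] : memref<128xf32, 3>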
+class BarrierElimination final : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(mlir::gpu::BarrierOp barrier, + PatternRewriter &rewriter) const override { + LLVM_DEBUG(DBGS() << "checking the necessity of: " << barrier << " " + << barrier.getLoc() << "\n"); + + SmallVector beforeEffects; + getEffectsBefore(barrier, beforeEffects, /*stopAtBarrier=*/true); + + SmallVector afterEffects; + getEffectsAfter(barrier, afterEffects, /*stopAtBarrier=*/true); + + if (!haveConflictingEffects(beforeEffects, afterEffects)) { + LLVM_DEBUG(DBGS() << "the surrounding barriers are sufficient, removing " + << barrier << "\n"); + rewriter.eraseOp(barrier); + return success(); + } + + LLVM_DEBUG(DBGS() << "barrier is necessary: " << barrier << " " + << barrier.getLoc() << "\n"); + return failure(); + } +}; +} // namespace + +void buddy::gpu::EliminateGpuBarriersOp::build(OpBuilder &builder, + OperationState &state, + Value target) { + build(builder, state, target.getType(), target); +} + +DiagnosedSilenceableFailure +buddy::gpu::EliminateGpuBarriersOp::applyToOne( + transform::TransformRewriter &rewriter, func::FuncOp target, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { + RewritePatternSet patterns(target.getContext()); + patterns.insert(getContext()); + transform::ErrorCheckingTrackingListener listener(state, *this); + auto checkErrors = llvm::make_scope_exit([&]() { + // The TrackingListener API makes checking for errors mandatory. It is safe + // to drop payload ops during this transform, so we can ignore all errors. + (void)listener.checkAndResetError(); + }); + GreedyRewriteConfig config; + config.listener = &listener; + if (failed( + applyPatternsAndFoldGreedily(target, std::move(patterns), config))) { + return emitDefaultSilenceableFailure(target); + } + + results.push_back(target); + return DiagnosedSilenceableFailure::success(); +} + +//===----------------------------------------------------------------------===// +// SynchronizeLoopOp +//===----------------------------------------------------------------------===// + +void buddy::gpu::SynchronizeLoopOp::getEffects( + SmallVectorImpl &effects) { +transform::onlyReadsHandle(getForOp(), effects); +transform::modifiesPayload(effects); +} + +DiagnosedSilenceableFailure buddy::gpu::SynchronizeLoopOp::applyToOne( + transform::TransformRewriter &rewriter, scf::ForOp forOp, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { +rewriter.setInsertionPointAfter(forOp); +rewriter.create(forOp.getLoc()); +return DiagnosedSilenceableFailure::success(); +} + +//===---------------------------------------------------------------------===// +// CreateAsyncGroupsOp. 
+//===---------------------------------------------------------------------===// + +void buddy::gpu::CreateAsyncGroupsOp::getEffects( + SmallVectorImpl &effects) { +transform::onlyReadsHandle(getTarget(), effects); +transform::modifiesPayload(effects); +} + +DiagnosedSilenceableFailure buddy::gpu::CreateAsyncGroupsOp::applyToOne( + transform::TransformRewriter &rewriter, func::FuncOp target, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { +createAsyncGroups(rewriter, cast(target), + getUseMmaSync()); +return DiagnosedSilenceableFailure::success(); +} + #define GET_OP_CLASSES #include "GPU/TransformOps.cpp.inc" diff --git a/midend/lib/Utils/GPUUtils.cpp b/midend/lib/Utils/GPUUtils.cpp index 82058c8813..243e49d521 100644 --- a/midend/lib/Utils/GPUUtils.cpp +++ b/midend/lib/Utils/GPUUtils.cpp @@ -531,6 +531,295 @@ template void hoistStaticallyBoundAllocationsInFunc(RewriterBase &rewriter, func::FuncOp funcOp); +static bool isContiguousStore(Operation *write) { + if (auto transferWrite = dyn_cast(write)) { + if (!transferWrite.getPermutationMap().isMinorIdentity() || + !transferWrite.isDimInBounds(0) || transferWrite.getMask()) { + LDBG("--not a contiguous store op: " << *write); + return false; + } + return true; + } + if (isa(write)) { + return true; + } + LDBG("--not a store op: " << write->getName().getStringRef()); + return false; +} + +static Value getMemrefOperand(Operation *op) { + if (auto transferWrite = dyn_cast(op)) { + return transferWrite.getSource(); + } + if (auto transferRead = dyn_cast(op)) { + return transferRead.getSource(); + } + if (auto storeOp = dyn_cast(op)) { + return storeOp.getBase(); + } + if (auto loadOp = dyn_cast(op)) { + return loadOp.getBase(); + } + return Value(); +} + +static bool isContiguousRead(Operation *read) { + if (auto transferRead = dyn_cast(read)) { + if (!transferRead.isDimInBounds(0) || + !transferRead.getPermutationMap().isMinorIdentity()) { + LDBG("--not a contiguous load op: " << *read); + return false; + } + return true; + } + if (isa(read)) { + return true; + } + LDBG("--not a load op: " << read->getName().getStringRef()); + return false; +} + +struct MaskResult { + vector::CreateMaskOp maskOp; + vector::ExtractOp maybeExtractOp; +}; +static MaskResult getMask(Operation *op) { + auto transferRead = dyn_cast(op); + if (!transferRead || !transferRead.getMask()) + return MaskResult{}; + vector::ExtractOp maybeExtractOp = + transferRead.getMask().getDefiningOp(); + auto maskOp = + maybeExtractOp + ? 
maybeExtractOp.getVector().getDefiningOp() + : transferRead.getMask().getDefiningOp(); + if (maybeExtractOp) { + if (maybeExtractOp.getStaticPosition().size() + 1 != + llvm::cast(maskOp->getResultTypes().front()).getRank()) { + LDBG("----mask through extract unexpected position size -> Skip: " + << maybeExtractOp); + return MaskResult{}; + } + if (maybeExtractOp.getStaticPosition().size() != 1) { + LDBG("----only mask through 2-D -> 1-D extract supported atm -> Skip: " + << maybeExtractOp); + return MaskResult{}; + } + LDBG("----mask through extract: " << maybeExtractOp); + } + return MaskResult{maskOp, maybeExtractOp}; +} + +static Value getMaskValue(RewriterBase &rewriter, Operation *op) { + MaskResult maskResult = getMask(op); + if (!maskResult.maskOp) + return Value(); + Value count = maskResult.maskOp->getOperands().back(); + vector::ExtractOp maybeExtractOp = maskResult.maybeExtractOp; + if (maybeExtractOp) { + assert(maybeExtractOp.getStaticPosition().size() == 1 && + "expected single pos"); + int64_t sliceNum = maybeExtractOp.getStaticPosition()[0]; + // TODO: to support >2-D mask + extract, and all the cmp. + Location loc = op->getLoc(); + Value zero = rewriter.create(loc, 0); + Value cmp = rewriter.create( + loc, arith::CmpIPredicate::slt, + rewriter.create(loc, sliceNum), + maskResult.maskOp->getOperands().front()); + count = rewriter.create(loc, cmp, count, zero); + } + return count; +} + +static Value getValueStored(Operation *writeOp) { + if (auto transferWrite = dyn_cast(writeOp)) { + return transferWrite.getValue(); + } + if (auto storeOp = dyn_cast(writeOp)) { + return storeOp.getValueToStore(); + } + return Value(); +} + +static Operation::operand_range getIndices(Operation *op) { + if (auto vectorReadOp = dyn_cast(op)) + return vectorReadOp.getIndices(); + if (auto vectorStoreOp = dyn_cast(op)) + return vectorStoreOp.getIndices(); + if (auto transferReadOp = dyn_cast(op)) + return transferReadOp.getIndices(); + if (auto transferWriteOp = dyn_cast(op)) + return transferWriteOp.getIndices(); + llvm_unreachable("unsupported op type"); +} + +/// Return `true` if the conversion to async copy is legal. +static bool resultsInSupportedAsyncCopy(MemRefType memrefType, + Operation::operand_range indices, + VectorType vecType) { + constexpr int64_t kSupportedCpAsyncAlignmentsInBytes[3] = {4, 8, 16}; + // Condition 1: the vectory rank must be supported. + if (vecType.hasRank() != 1) { + LDBG("----> cp.async failed, not a 1-D vector: " << vecType); + return false; + } + + // Condition 2: the copy size must be supported. + bool supportedCopySize = false; + int64_t numElements = vecType.getNumElements(); + Type elementType = vecType.getElementType(); + for (int64_t alignmentInBytes : kSupportedCpAsyncAlignmentsInBytes) { + if (alignmentInBytes * 8 == + numElements * elementType.getIntOrFloatBitWidth()) { + supportedCopySize = true; + break; + } + } + if (!supportedCopySize) { + LDBG("----> cp.async alignment failed, " + << numElements << " elts * " << elementType.getIntOrFloatBitWidth() + << "b/elem = " << numElements * elementType.getIntOrFloatBitWidth() + << "b is not supported by cp.async"); + return false; + } + + // TODO: Condition 3: the alignments must be supported. For cp.async the + // NVIDIA doc (section 6.4.1) says: "The address must be naturally aligned to + // a multiple of the access size. If an address is not properly aligned, the + // resulting behavior is undefined.". 
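+  // For example, vector<1xf32> (4 bytes), vector<4xf16> (8 bytes) and
+  // vector<4xf32> or vector<8xf16> (16 bytes) satisfy condition 2, while
+  // vector<3xf32> (12 bytes) does not and the copy stays synchronous.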
+ return true; +} + +void createAsyncGroups(RewriterBase &rewriter, func::FuncOp funcOp, + bool useMMASync) { + LDBG("Start asyncGroups: useMMASync=" << useMMASync); + llvm::SmallSetVector copyToSharedMem; + // Look for all the copy that can be converted to async copy ops. + funcOp.walk([&](Operation *writeOp) { + if (!isContiguousStore(writeOp)) + return WalkResult::advance(); + LDBG("--candidate writeOp: " << *writeOp); + Value vectorVal = getValueStored(writeOp); + if (llvm::cast(vectorVal.getType()).getRank() != 1) { + LDBG("----writeOp is not an inbounds 1-D minor identity -> Skip"); + return WalkResult::advance(); + } + Value memrefOperand = getMemrefOperand(writeOp); + if (!hasSharedMemoryAddressSpace( + llvm::cast(memrefOperand.getType()))) { + LDBG("----address space is not workgroup -> Skip"); + return WalkResult::advance(); + } + Operation *readOp = vectorVal.getDefiningOp(); + if (readOp == nullptr || !isContiguousRead(readOp)) { + LDBG("----no contiguous readOp defining the writeOp -> Skip"); + return WalkResult::advance(); + } + + LDBG("--candidate readOp: " << *readOp); + if (auto transferRead = dyn_cast(readOp)) { + if (transferRead.getMask()) { + auto paddingCst = + transferRead.getPadding().getDefiningOp(); + if (!paddingCst || !paddingCst.value().isZero()) { + LDBG("----read padding value is not 0.f -> Skip"); + return WalkResult::advance(); + } + auto maskResult = getMask(transferRead); + if (!maskResult.maskOp) { + LDBG("----read mask is not a vector.create_mask op -> Skip: " + << transferRead.getMask()); + return WalkResult::advance(); + } + } + } + + // Check whether both accesses are supported before we emit: this is + // necessary to ensure the correctness of DeviceAsyncCopyOp. + VectorType vecType = llvm::cast(vectorVal.getType()); + Value storeBase = getMemrefOperand(writeOp); + Value loadBase = getMemrefOperand(readOp); + if (!resultsInSupportedAsyncCopy(cast(loadBase.getType()), + getIndices(readOp), vecType) || + !resultsInSupportedAsyncCopy(cast(storeBase.getType()), + getIndices(writeOp), vecType)) + return WalkResult::advance(); + + LDBG("--writeOp can be made async -> SUCCESS"); + copyToSharedMem.insert(writeOp); + return WalkResult::advance(); + }); + + while (!copyToSharedMem.empty()) { + SmallVector group; + Operation *writeOp = *copyToSharedMem.begin(); + LDBG("--START a group from: " << *writeOp); + // Start a group with the first write. + copyToSharedMem.remove(writeOp); + group.push_back(writeOp); + Operation *nextNode = writeOp; + // Look in the next nodes for more copies to add to the same group. + while ((nextNode = nextNode->getNextNode())) { + // Ignore ops without side effects + auto memInterface = dyn_cast(nextNode); + if (memInterface && memInterface.hasNoEffect() && + !nextNode->hasTrait()) + continue; + // ignore read from a different address space. + if (isa(nextNode)) { + Operation *readOp = nextNode; + Value memrefOperand = getMemrefOperand(readOp); + if (!hasSharedMemoryAddressSpace( + llvm::cast(memrefOperand.getType()))) { + continue; + } + } + if (copyToSharedMem.count(nextNode)) { + // found another copy, add it to the group. + copyToSharedMem.remove(nextNode); + group.push_back(nextNode); + continue; + } + // If the op is something else stop the accumulating op in the group. + LDBG("----> STOP accumulating into group due to: " << *nextNode); + break; + } + // emit the group. 
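+    // Each copy in the group becomes an nvgpu.device_async_copy from the
+    // load's source memref into the shared-memory destination; the per-copy
+    // tokens are then joined by a single nvgpu.device_async_create_group
+    // followed by an nvgpu.device_async_wait, and the original store ops are
+    // erased (the now-dead loads are left for later DCE).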
+ SmallVector tokens; + for (Operation *writeOp : group) { + rewriter.setInsertionPoint(writeOp); + Value vectorVal = getValueStored(writeOp); + auto vectorType = llvm::cast(vectorVal.getType()); + int64_t numElements = vectorType.getNumElements(); + Operation *readOp = vectorVal.getDefiningOp(); + Value storeBase = getMemrefOperand(writeOp); + Value loadBase = getMemrefOperand(readOp); + Value mask = getMaskValue(rewriter, readOp); + auto dstMemref = llvm::cast(storeBase.getType()); + int64_t sizeInBytes = + (dstMemref.getElementTypeBitWidth() * numElements) / 8; + UnitAttr bypassL1 = + useMMASync && sizeInBytes == 16 ? rewriter.getUnitAttr() : UnitAttr(); + Value token = rewriter.create( + writeOp->getLoc(), + nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()), storeBase, + getIndices(writeOp), loadBase, getIndices(readOp), + rewriter.getIndexAttr(numElements), mask, + /*bypassL1=*/bypassL1); + tokens.push_back(token); + } + // Create the group and wait for it right after. + Value groupToken = rewriter.create( + funcOp.getLoc(), nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()), + tokens); + rewriter.create(funcOp.getLoc(), groupToken, + nullptr); + // Clean up old stores. + for (Operation *writeOp : group) + rewriter.eraseOp(writeOp); + } +} } // namespace gpu } // namespace mlir::buddy #endif // UTILS_GPUUTILS_DEF
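
As a rough illustration of the net effect of `createAsyncGroups` (hypothetical
IR; buffer names, indices and shapes are made up for the sketch), a 1-D,
unmasked, in-bounds copy into workgroup memory such as

    %v = vector.transfer_read %global[%i], %cst {in_bounds = [true]}
           : memref<1024xf32>, vector<4xf32>
    vector.transfer_write %v, %shared[%j] {in_bounds = [true]}
           : vector<4xf32>, memref<128xf32, #gpu.address_space<workgroup>>

is rewritten to an asynchronous copy carrying the element count, grouped and
waited on right after (the `bypassL1` attribute is attached only when
`use_mma_sync` is set and the copy is 16 bytes wide):

    %token = nvgpu.device_async_copy %global[%i], %shared[%j], 4
               : memref<1024xf32> to memref<128xf32, #gpu.address_space<workgroup>>
    %group = nvgpu.device_async_create_group %token
    nvgpu.device_async_wait %group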