From 0823dc3cefd1e8f3b74329623d5964edb8ea39a4 Mon Sep 17 00:00:00 2001 From: matrix72 Date: Wed, 5 Mar 2025 16:47:06 +0800 Subject: [PATCH] Add GPU transformation operations for barrier elimination and async group creation --- examples/BuddyGPU/transform.mlir | 48 +- midend/include/Dialect/GPU/TransformOps.td | 110 ++++ midend/include/Utils/GPUUtils.h | 6 + midend/lib/Dialect/GPU/TransformOps.cpp | 627 +++++++++++++++++++++ midend/lib/Utils/GPUUtils.cpp | 289 ++++++++++ 5 files changed, 1056 insertions(+), 24 deletions(-) diff --git a/examples/BuddyGPU/transform.mlir b/examples/BuddyGPU/transform.mlir index e2a02a9a97..7b6e711e5b 100644 --- a/examples/BuddyGPU/transform.mlir +++ b/examples/BuddyGPU/transform.mlir @@ -135,10 +135,10 @@ module attributes { transform.with_named_sequence } { // Rewrite bufferized scf.forall ops to distributed gpu.thread_id attribute. %mapped = transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [64, 2, 1] warp_size = 32 : (!transform.any_op) -> !transform.any_op - %15 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op + %14 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op // Removes unnecessary GPU barriers from the function. - // %15 = transform.buddy.eliminate_gpu_barriers %14 : (!transform.any_op) -> !transform.any_op + %15 = transform.buddy.eliminate_gpu_barriers %14 : (!transform.any_op) -> !transform.any_op // Perform canonicalization. transform.apply_patterns to %15 { @@ -191,7 +191,7 @@ module attributes { transform.with_named_sequence } { // Insert a gpu.barrier after a given scf.for loop %16 = transform.structured.match ops{["scf.for"]} in %15 : (!transform.any_op) -> !transform.op<"scf.for"> - // transform.buddy.synchronize_loop %16 : (!transform.op<"scf.for">) -> () + transform.buddy.synchronize_loop %16 : (!transform.op<"scf.for">) -> () transform.apply_patterns to %15 { @@ -223,77 +223,77 @@ module attributes { transform.with_named_sequence } { // mma operations, targetting warp level tensorcore operations. transform.buddy.vector.vector_to_mma_conversion %17 {use_mma_sync} : (!transform.any_op) -> () - // %18 = transform.buddy.eliminate_gpu_barriers %17 : (!transform.any_op) -> !transform.any_op + %18 = transform.buddy.eliminate_gpu_barriers %17 : (!transform.any_op) -> !transform.any_op // Perform canonicalization. 
- transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.scf.for_loop_canonicalization transform.apply_patterns.canonicalization } : !transform.any_op - transform.apply_cse to %17 : !transform.any_op + transform.apply_cse to %18 : !transform.any_op %all_loops_7 = transform.structured.match interface{LoopLikeInterface} - in %17 + in %18 : (!transform.any_op) -> !transform.any_op transform.apply_licm to %all_loops_7 : !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.vector.lower_masked_transfers } : !transform.any_op - %19 = transform.structured.match ops{["gpu.launch"]} in %17 : (!transform.any_op) -> !transform.any_op + %19 = transform.structured.match ops{["gpu.launch"]} in %18 : (!transform.any_op) -> !transform.any_op %fwfa = transform.structured.match ops{["memref.alloc"]} in %19 : (!transform.any_op) -> !transform.op<"memref.alloc"> // Do multi-buffering/array expansion to remove dependencies on the temporary allocation between consecutive loop iterations. transform.memref.multibuffer %fwfa {factor = 3 : i64, skip_analysis} : (!transform.op<"memref.alloc">) -> !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.vector.transfer_to_scf full_unroll = true } : !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.scf.for_loop_canonicalization transform.apply_patterns.canonicalization } : !transform.any_op - transform.apply_cse to %17 : !transform.any_op + transform.apply_cse to %18 : !transform.any_op %all_loops_8 = transform.structured.match interface{LoopLikeInterface} - in %17 + in %18 : (!transform.any_op) -> !transform.any_op transform.apply_licm to %all_loops_8 : !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.vector.lower_masked_transfers } : !transform.any_op // Convert sync copies to shared memory to async. - // transform.buddy.create_async_groups %17 {use_mma_sync} : (!transform.any_op) -> () - transform.apply_patterns to %17 { + transform.buddy.create_async_groups %18 {use_mma_sync} : (!transform.any_op) -> () + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.scf.for_loop_canonicalization transform.apply_patterns.canonicalization transform.apply_patterns.memref.fold_memref_alias_ops } : !transform.any_op %all_loops_9 = transform.structured.match interface{LoopLikeInterface} - in %17 + in %18 : (!transform.any_op) -> !transform.any_op transform.apply_licm to %all_loops_9 : !transform.any_op - transform.apply_cse to %17 : !transform.any_op + transform.apply_cse to %18 : !transform.any_op - %20 = transform.structured.match ops{["nvgpu.mma.sync"]} in %17 : (!transform.any_op) -> !transform.any_op + %20 = transform.structured.match ops{["nvgpu.mma.sync"]} in %18 : (!transform.any_op) -> !transform.any_op %21 = transform.get_parent_op %20 {deduplicate, op_name = "scf.for"} : (!transform.any_op) -> !transform.any_op // This applies software pipelining to a given scf.for loop. // The pipelining strategy will look for a copy to shared memory and pipeline it to overlap it with the rest of the loop. 
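+    // The multibuffer factor of 3 above provisions one shared-memory buffer
+    // per in-flight pipeline stage, matching the depth = 3 used by the
+    // (currently disabled) pipelining step below.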
// %22 = transform.buddy.pipeline_shared_memory_copies %21 {depth = 3 : i64, use_mma_sync, peel_epilogue} : (!transform.any_op) -> !transform.any_op // Perform canonicalization. - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.vector.lower_masks } : !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.vector.materialize_masks } : !transform.any_op - transform.apply_patterns to %17 { + transform.apply_patterns to %18 { transform.apply_patterns.linalg.tiling_canonicalization transform.apply_patterns.scf.for_loop_canonicalization transform.apply_patterns.canonicalization @@ -301,10 +301,10 @@ module attributes { transform.with_named_sequence } { } : !transform.any_op %all_loops_10 = transform.structured.match interface{LoopLikeInterface} - in %17 + in %18 : (!transform.any_op) -> !transform.any_op transform.apply_licm to %all_loops_10 : !transform.any_op - transform.apply_cse to %17 : !transform.any_op + transform.apply_cse to %18 : !transform.any_op transform.yield } diff --git a/midend/include/Dialect/GPU/TransformOps.td b/midend/include/Dialect/GPU/TransformOps.td index 8eb7fac01d..fbc5072f86 100644 --- a/midend/include/Dialect/GPU/TransformOps.td +++ b/midend/include/Dialect/GPU/TransformOps.td @@ -124,4 +124,114 @@ def VectorToMMAConversionOp : Op { + let description = [{ + Removes unnecessary GPU barriers from the function. If a barrier does not + enforce any conflicting pair of memory effects, including a pair that is + enforced by another barrier, it is unnecessary and can be removed. + + #### Return modes + + Consumes the operand handle and produces a new handle to the function after + rewriting. + }]; + + let arguments = (ins TransformHandleTypeInterface:$target); + let results = (outs TransformHandleTypeInterface:$result); + + let assemblyFormat = [{ $target attr-dict `:` functional-type(operands, results)}]; + let cppNamespace = "mlir::buddy::gpu"; + + let builders = [ + OpBuilder<(ins "Value":$target)> + ]; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::func::FuncOp target, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + +def SynchronizeLoopOp : Op< + Transform_Dialect, "buddy.synchronize_loop", [ + DeclareOpInterfaceMethods, + TransformEachOpTrait, + TransformOpInterface, + ReportTrackingListenerFailuresOpTrait]> { + let description = [{ + This inserts a gpu.barrier after a given scf.for loop. + + #### Return modes + This transform consumes the scf.for handle and produces a result handle + which points to the new scf.for loop generated. It will fail if the loop + cannot be pipelined or if there are no shared memory copies. + }]; + + let arguments = ( + ins TransformHandleTypeInterface:$for_op); + let results = (outs); + + let cppNamespace = "mlir::buddy::gpu"; + + let assemblyFormat = [{ + $for_op + attr-dict + `:` functional-type(operands, results)}]; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::scf::ForOp forOp, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + +def CreateAsyncGroupsOp : + Op, + TransformEachOpTrait, + TransformOpInterface, + ReportTrackingListenerFailuresOpTrait]> { + let description = [{ + Convert copies to shared memory to async copies. 
This creates groups + of consecutive copies and emit wait operation right after. + The input operation is a `func.func`. + + `use_mma_sync` specifies whether `bypassL1` attributes should be added to the + async copies. + + #### Return modes + This op returns a handle to the transformed function, even if nothing + changed. + }]; + + let arguments = (ins TransformHandleTypeInterface:$target, + UnitAttr:$use_mma_sync); + let results = (outs); + + let assemblyFormat = [{ + $target + attr-dict + `:` functional-type(operands, results)}]; + let cppNamespace = "mlir::buddy::gpu"; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::func::FuncOp target, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + #endif // TRANSFORM_OPS_TD diff --git a/midend/include/Utils/GPUUtils.h b/midend/include/Utils/GPUUtils.h index 88605fe1d3..2b343fab1f 100644 --- a/midend/include/Utils/GPUUtils.h +++ b/midend/include/Utils/GPUUtils.h @@ -98,6 +98,12 @@ template void hoistStaticallyBoundAllocationsInFunc(RewriterBase &rewriter, func::FuncOp funcOp); +//===----------------------------------------------------------------------===// +// Utility from compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h +//===----------------------------------------------------------------------===// +void createAsyncGroups(RewriterBase &rewriter, func::FuncOp funcOp, + bool useMMASync); + } // namespace buddy::gpu } // namespace mlir diff --git a/midend/lib/Dialect/GPU/TransformOps.cpp b/midend/lib/Dialect/GPU/TransformOps.cpp index 3e689fc931..f4b164a405 100644 --- a/midend/lib/Dialect/GPU/TransformOps.cpp +++ b/midend/lib/Dialect/GPU/TransformOps.cpp @@ -207,5 +207,632 @@ buddy::gpu::VectorToMMAConversionOp::applyToOne( return listener.checkAndResetError(); } +//===---------------------------------------------------------------------===// +// EliminateGpuBarriersOp +//===---------------------------------------------------------------------===// + +/// Returns `true` if the op is known not to have any side effects, but does not +/// implement the MemoryEffectsOpInterface in the suitable way. +static bool isKnownNoEffectsOpWithoutInterface(Operation *op) { + // memref::AssumeAlignment is conceptually pure, but marking it as such would + // make DCE immediately remove it. + return isa(op); +} + +/// Returns `true` if the op is defines the parallel region that is subject to +/// barrier synchronization. +static bool isParallelRegionBoundary(Operation *op) { + if (op->hasAttr("__parallel_region_boundary_for_test")) + return true; + + // We consider functions inside executable variants that have the same symbol + // name as an export symbol. + auto func = dyn_cast(op); + if (!func) + return false; + auto parent = op->getParentOfType(); + if (!parent) + return false; + auto variant = parent->getParentOfType(); + if (!variant) + return false; + return true; +} + +/// Returns `true` if the op behaves like a sequential loop, e.g., the control +/// flow "wraps around" from the end of the body region back to its start. +static bool isSequentialLoopLike(Operation *op) { return isa(op); } + +/// Returns `true` if the regions of the op are guaranteed to be executed at +/// most once. Thus, if an operation in one of the nested regions of `op` is +/// executed than so are all the other operations in this region. 
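+/// For example, the then-region of an `scf.if` executes at most once per
+/// evaluation of the op, so effects collected from it are exact, whereas an
+/// `scf.for` body may execute any number of times and does not have this
+/// property.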
+static bool hasSingleExecutionBody(Operation *op) { + return isa(op); +} + +/// Returns `true` if the operation is known to produce a pointer-like object +/// distinct from any other object produced by a similar operation. For example, +/// an allocation produces such an object. +static bool producesDistinctBase(Operation *op) { + return isa_and_nonnull(op); +} + +/// Populates `effects` with all memory effects without associating them to a +/// specific value. +static void addAllValuelessEffects( + SmallVectorImpl &effects) { + effects.emplace_back(MemoryEffects::Effect::get()); + effects.emplace_back(MemoryEffects::Effect::get()); + effects.emplace_back(MemoryEffects::Effect::get()); + effects.emplace_back(MemoryEffects::Effect::get()); +} + +/// Collect the memory effects of the given op in 'effects'. Returns 'true' if +/// it could extract the effect information from the op, otherwise returns +/// 'false' and conservatively populates the list with all possible effects +/// associated with no particular value or symbol. +static bool +collectEffects(Operation *op, + SmallVectorImpl &effects, + bool ignoreBarriers = true) { + // Skip over barriers to avoid infinite recursion (those barriers would ask + // this barrier again). + if (ignoreBarriers && isa(op)) + return true; + + // Skip over ops that we know have no effects. + if (isKnownNoEffectsOpWithoutInterface(op)) + return true; + + // Collect effect instances the operation. Note that the implementation of + // getEffects erases all effect instances that have the type other than the + // template parameter so we collect them first in a local buffer and then + // copy. + if (auto iface = dyn_cast(op)) { + SmallVector localEffects; + iface.getEffects(localEffects); + llvm::append_range(effects, localEffects); + return true; + } + if (op->hasTrait()) { + for (auto ®ion : op->getRegions()) { + for (auto &block : region) { + for (auto &innerOp : block) + if (!collectEffects(&innerOp, effects, ignoreBarriers)) + return false; + } + } + return true; + } + + // We need to be conservative here in case the op doesn't have the interface + // and assume it can have any possible effect. + addAllValuelessEffects(effects); + return false; +} + +/// Collects memory effects from operations that may be executed before `op` in +/// a trivial structured control flow, e.g., without branches. Stops at the +/// parallel region boundary or at the barrier operation if `stopAtBarrier` is +/// set. Returns `true` if the memory effects added to `effects` are exact, +/// `false` if they are a conservative over-approximation. The latter means that +/// `effects` contain instances not associated with a specific value. +static bool +getEffectsBefore(Operation *op, + SmallVectorImpl &effects, + bool stopAtBarrier) { + if (!op->getBlock()) + return true; + + // If there is a non-structured control flow, bail. + Region *region = op->getBlock()->getParent(); + if (region && !llvm::hasSingleElement(region->getBlocks())) { + addAllValuelessEffects(effects); + return false; + } + + // Collect all effects before the op. + if (op != &op->getBlock()->front()) { + for (Operation *it = op->getPrevNode(); it != nullptr; + it = it->getPrevNode()) { + if (isa(it)) { + if (stopAtBarrier) + return true; + else + continue; + } + if (!collectEffects(it, effects)) + return false; + } + } + + // Stop if reached the parallel region boundary. + if (isParallelRegionBoundary(op->getParentOp())) + return true; + + // Otherwise, keep collecting above the parent operation. 
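+  // For example, for a barrier nested as func.func -> scf.for -> gpu.barrier,
+  // this also accumulates the effects of the statements preceding the scf.for
+  // in the function body, stopping at an enclosing barrier (when
+  // `stopAtBarrier` is set) or at the parallel region boundary.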
+ if (!getEffectsBefore(op->getParentOp(), effects, stopAtBarrier)) + return false; + + // If the op is loop-like, collect effects from the trailing operations until + // we hit a barrier because they can executed before the current operation by + // the previous iteration of this loop. For example, in the following loop + // + // for i = ... { + // op1 + // ... + // barrier + // op2 + // } + // + // the operation `op2` at iteration `i` is known to be executed before the + // operation `op1` at iteration `i+1` and the side effects must be ordered + // appropriately. + if (isSequentialLoopLike(op->getParentOp())) { + // Assuming loop terminators have no side effects. + return getEffectsBefore(op->getBlock()->getTerminator(), effects, + /*stopAtBarrier=*/true); + } + + // If the parent operation is not guaranteed to execute its (single-block) + // region once, walk the block. + bool conservative = false; + if (!hasSingleExecutionBody(op->getParentOp())) + op->getParentOp()->walk([&](Operation *in) { + if (conservative) + return WalkResult::interrupt(); + if (!collectEffects(in, effects)) { + conservative = true; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + + return !conservative; +} + +/// Collects memory effects from operations that may be executed after `op` in +/// a trivial structured control flow, e.g., without branches. Stops at the +/// parallel region boundary or at the barrier operation if `stopAtBarrier` is +/// set. Returns `true` if the memory effects added to `effects` are exact, +/// `false` if they are a conservative over-approximation. The latter means that +/// `effects` contain instances not associated with a specific value. +static bool +getEffectsAfter(Operation *op, + SmallVectorImpl &effects, + bool stopAtBarrier) { + if (!op->getBlock()) + return true; + + // If there is a non-structured control flow, bail. + Region *region = op->getBlock()->getParent(); + if (region && !llvm::hasSingleElement(region->getBlocks())) { + addAllValuelessEffects(effects); + return false; + } + + // Collect all effects after the op. + if (op != &op->getBlock()->back()) + for (Operation *it = op->getNextNode(); it != nullptr; + it = it->getNextNode()) { + if (isa(it)) { + if (stopAtBarrier) + return true; + continue; + } + if (!collectEffects(it, effects)) + return false; + } + + // Stop if reached the parallel region boundary. + if (isParallelRegionBoundary(op->getParentOp())) + return true; + + // Otherwise, keep collecting below the parent operation. + if (!getEffectsAfter(op->getParentOp(), effects, stopAtBarrier)) + return false; + + // If the op is loop-like, collect effects from the leading operations until + // we hit a barrier because they can executed after the current operation by + // the next iteration of this loop. For example, in the following loop + // + // for i = ... { + // op1 + // ... + // barrier + // op2 + // } + // + // the operation `op1` at iteration `i` is known to be executed after the + // operation `op2` at iteration `i-1` and the side effects must be ordered + // appropriately. + if (isSequentialLoopLike(op->getParentOp())) { + if (isa(op->getBlock()->front())) + return true; + + bool exact = collectEffects(&op->getBlock()->front(), effects); + return getEffectsAfter(&op->getBlock()->front(), effects, + /*stopAtBarrier=*/true) && + exact; + } + + // If the parent operation is not guaranteed to execute its (single-block) + // region once, walk the block. 
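+  // The walk below over-approximates: it collects the effects of every op in
+  // the parent region, and if any nested op cannot report its effects the
+  // result is flagged as conservative so the caller treats it as touching
+  // arbitrary memory.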
+ bool conservative = false; + if (!hasSingleExecutionBody(op->getParentOp())) + op->getParentOp()->walk([&](Operation *in) { + if (conservative) + return WalkResult::interrupt(); + if (!collectEffects(in, effects)) { + conservative = true; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + + return !conservative; +} + +/// Looks through known "view-like" ops to find the base memref. +static Value getBase(Value v) { + while (true) { + Operation *definingOp = v.getDefiningOp(); + if (!definingOp) + break; + + bool shouldContinue = + TypeSwitch(v.getDefiningOp()) + .Case( + [&](auto op) { + v = op.getSource(); + return true; + }) + .Case([&](auto op) { + v = op.getIn(); + return true; + }) + .Case([&](auto op) { + v = op.getSrc(); + return true; + }) + .Default([](Operation *) { return false; }); + if (!shouldContinue) + break; + } + return v; +} + +/// Returns `true` if the value is defined as a function argument. +static bool isFunctionArgument(Value v) { + auto arg = dyn_cast(v); + return arg && isa(arg.getOwner()->getParentOp()); +} + +/// Returns the operand that the operation "propagates" through it for capture +/// purposes. That is, if the value produced by this operation is captured, then +/// so is the returned value. + + +static Value propagatesCapture(Operation *op) { + return llvm::TypeSwitch(op) + .Case( + [](ViewLikeOpInterface viewLike) { return viewLike.getViewSource(); }) + .Case([](CastOpInterface castLike) { return castLike->getOperand(0); }) + .Case([](memref::TransposeOp transpose) { return transpose.getIn(); }) + .Case( + [](auto op) { return op.getSrc(); }) + .Default([](Operation *) { return Value(); }); +} + +/// Returns `true` if the given operation is known to capture the given value, +/// `false` if it is known not to capture the given value, `nullopt` if neither +/// is known. +static std::optional getKnownCapturingStatus(Operation *op, Value v) { + return llvm::TypeSwitch>(op) + // Store-like operations don't capture the destination, but do capture + // the value. + .Case( + [&](auto op) { return op.getValue() == v; }) + .Case( + [&](auto op) { return op.getValueToStore() == v; }) + // These operations are known not to capture. + .Case([](memref::DeallocOp) { return false; }) + // By default, we don't know anything. + .Default([](Operation *) { return std::nullopt; }); +} + +/// Returns `true` if the value may be captured by any of its users, i.e., if +/// the user may be storing this value into memory. This makes aliasing analysis +/// more conservative as it cannot assume the pointer-like value is only passed +/// around through SSA use-def. +static bool maybeCaptured(Value v) { + SmallVector todo = {v}; + while (!todo.empty()) { + Value v = todo.pop_back_val(); + for (Operation *user : v.getUsers()) { + // A user that is known to only read cannot capture. + auto iface = dyn_cast(user); + if (iface) { + SmallVector effects; + iface.getEffects(effects); + if (llvm::all_of(effects, + [](const MemoryEffects::EffectInstance &effect) { + return isa(effect.getEffect()); + })) { + continue; + } + } + + // When an operation is known to create an alias, consider if the + // source is captured as well. + if (Value v = propagatesCapture(user)) { + todo.push_back(v); + continue; + } + + std::optional knownCaptureStatus = getKnownCapturingStatus(user, v); + if (!knownCaptureStatus || *knownCaptureStatus) + return true; + } + } + + return false; +} + +/// Returns true if two values may be referencing aliasing memory. 
This is a +/// rather naive and conservative analysis. Values defined by different +/// allocation-like operations as well as values derived from those by casts and +/// views cannot alias each other. Similarly, values defined by allocations +/// inside a function cannot alias function arguments. Global values cannot +/// alias each other or local allocations. Values that are captured, i.e. +/// themselves potentially stored in memory, are considered as aliasing with +/// everything. This seems sufficient to achieve barrier removal in structured +/// control flow, more complex cases would require a proper dataflow analysis. +static bool mayAlias(Value first, Value second) { + + first = getBase(first); + second = getBase(second); + // Values derived from the same base memref do alias (unless we do a more + // advanced analysis to prove non-overlapping accesses). + if (first == second) { + return true; + } + + // Different globals cannot alias. + if (auto globFirst = first.getDefiningOp()) { + if (auto globSecond = second.getDefiningOp()) { + return globFirst.getNameAttr() == globSecond.getNameAttr(); + } + } + // if (auto subSpanFirst = + // first.getDefiningOp()) { + // if (auto subSpanSecond = + // second.getDefiningOp()) { + // return subSpanFirst.getBindingAttr() == subSpanSecond.getBindingAttr(); + // } + // } + + bool isDistinct[] = {producesDistinctBase(first.getDefiningOp()), + producesDistinctBase(second.getDefiningOp())}; + bool isGlobal[] = {first.getDefiningOp() != nullptr, + second.getDefiningOp() != nullptr}; + + // Non-equivalent distinct bases and globals cannot alias. At this point, we + // have already filtered out based on values being equal and global name being + // equal. + if ((isDistinct[0] || isGlobal[0]) && (isDistinct[1] || isGlobal[1])) + return false; + + bool isArg[] = {isFunctionArgument(first), isFunctionArgument(second)}; + + // Distinct bases (allocations) cannot have been passed as an argument. + if ((isDistinct[0] && isArg[1]) || (isDistinct[1] && isArg[0])) + return false; + + // Non-captured base distinct values cannot conflict with another base value. + if (isDistinct[0] && !maybeCaptured(first)) + return false; + if (isDistinct[1] && !maybeCaptured(second)) + return false; + + // Otherwise, conservatively assume aliasing. + return true; +} + +/// Returns `true` if the effect may be affecting memory aliasing the value. If +/// the effect is not associated with any value, it is assumed to affect all +/// memory and therefore aliases with everything. +static bool mayAlias(MemoryEffects::EffectInstance a, Value v2) { + if (Value v = a.getValue()) { + return mayAlias(v, v2); + } + return true; +} + +/// Returns `true` if the two effects may be affecting aliasing memory. If +/// an effect is not associated with any value, it is assumed to affect all +/// memory and therefore aliases with everything. Effects on different resources +/// cannot alias. +static bool mayAlias(MemoryEffects::EffectInstance a, + MemoryEffects::EffectInstance b) { + if (a.getResource()->getResourceID() != b.getResource()->getResourceID()) + return false; + if (Value v2 = b.getValue()) { + return mayAlias(a, v2); + } else if (Value v = a.getValue()) { + return mayAlias(b, v); + } + return true; +} + +/// Returns `true` if any of the "before" effect instances has a conflict with +/// any "after" instance for the purpose of barrier elimination. The effects are +/// supposed to be limited to a barrier synchronization scope. 
A conflict exists +/// if effects instances affect aliasing memory locations and at least on of +/// then as a write. As an exception, if the non-write effect is an allocation +/// effect, there is no conflict since we are only expected to see the +/// allocation happening in the same thread and it cannot be accessed from +/// another thread without capture (which we do handle in alias analysis). +static bool +haveConflictingEffects(ArrayRef beforeEffects, + ArrayRef afterEffects) { + for (const MemoryEffects::EffectInstance &before : beforeEffects) { + for (const MemoryEffects::EffectInstance &after : afterEffects) { + // If cannot alias, definitely no conflict. + if (!mayAlias(before, after)) + continue; + + // Read/read is not a conflict. + if (isa(before.getEffect()) && + isa(after.getEffect())) { + continue; + } + + // Allocate/* is not a conflict since the allocation happens within the + // thread context. + // TODO: This is not the case for */Free unless the allocation happened in + // the thread context, which we could also check for. + if (isa(before.getEffect()) || + isa(after.getEffect())) { + continue; + } + + // In the particular case that the before effect is a free, we only have 2 + // possibilities: + // 1. either the program is well-formed and there must be an interleaved + // alloc that must limit the scope of effect lookback and we can + // safely ignore the free -> read / free -> write and free -> free + // conflicts. + // 2. either the program is ill-formed and we are in undefined behavior + // territory. + if (isa(before.getEffect())) + continue; + + // Other kinds of effects create a conflict, e.g. read-after-write. + LLVM_DEBUG( + DBGS() << "found a conflict between (before): " << before.getValue() + << " read:" << isa(before.getEffect()) + << " write:" << isa(before.getEffect()) + << " alloc:" + << isa(before.getEffect()) << " free:" + << isa(before.getEffect()) << "\n"); + LLVM_DEBUG( + DBGS() << "and (after): " << after.getValue() + << " read:" << isa(after.getEffect()) + << " write:" << isa(after.getEffect()) + << " alloc:" << isa(after.getEffect()) + << " free:" << isa(after.getEffect()) + << "\n"); + return true; + } + } + + return false; +} + +namespace { +/// Barrier elimination pattern. If a barrier does not enforce any conflicting +/// pair of memory effects, including a pair that is enforced by another +/// barrier, it is unnecessary and can be removed. Adapted from +/// "High-Performance GPU-to-CPU Transpilation and Optimization via High-Level +/// Parallel Constructs" by Moses et.al. in PPoPP 2023 and implementation in +/// Polygeist. 
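+///
+/// A minimal sketch (hypothetical IR) of what this pattern removes: the
+/// second barrier below only separates reads of the same buffer, so it
+/// enforces no write/read conflict and is erased, while the first one is
+/// kept because it orders the store against the following load.
+///
+///   memref.store %v, %buf[%i] : memref<128xf32, 3>
+///   gpu.barrier            // kept: write-before-read conflict across it
+///   %a = memref.load %buf[%j] : memref<128xf32, 3>
+///   gpu.barrier            // removable: only reads on either side
+///   %b = memref.load %buf[%k] : memref<128xf32, 3>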
+class BarrierElimination final : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(mlir::gpu::BarrierOp barrier, + PatternRewriter &rewriter) const override { + LLVM_DEBUG(DBGS() << "checking the necessity of: " << barrier << " " + << barrier.getLoc() << "\n"); + + SmallVector beforeEffects; + getEffectsBefore(barrier, beforeEffects, /*stopAtBarrier=*/true); + + SmallVector afterEffects; + getEffectsAfter(barrier, afterEffects, /*stopAtBarrier=*/true); + + if (!haveConflictingEffects(beforeEffects, afterEffects)) { + LLVM_DEBUG(DBGS() << "the surrounding barriers are sufficient, removing " + << barrier << "\n"); + rewriter.eraseOp(barrier); + return success(); + } + + LLVM_DEBUG(DBGS() << "barrier is necessary: " << barrier << " " + << barrier.getLoc() << "\n"); + return failure(); + } +}; +} // namespace + +void buddy::gpu::EliminateGpuBarriersOp::build(OpBuilder &builder, + OperationState &state, + Value target) { + build(builder, state, target.getType(), target); +} + +DiagnosedSilenceableFailure +buddy::gpu::EliminateGpuBarriersOp::applyToOne( + transform::TransformRewriter &rewriter, func::FuncOp target, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { + RewritePatternSet patterns(target.getContext()); + patterns.insert(getContext()); + transform::ErrorCheckingTrackingListener listener(state, *this); + auto checkErrors = llvm::make_scope_exit([&]() { + // The TrackingListener API makes checking for errors mandatory. It is safe + // to drop payload ops during this transform, so we can ignore all errors. + (void)listener.checkAndResetError(); + }); + GreedyRewriteConfig config; + config.listener = &listener; + if (failed( + applyPatternsAndFoldGreedily(target, std::move(patterns), config))) { + return emitDefaultSilenceableFailure(target); + } + + results.push_back(target); + return DiagnosedSilenceableFailure::success(); +} + +//===----------------------------------------------------------------------===// +// SynchronizeLoopOp +//===----------------------------------------------------------------------===// + +void buddy::gpu::SynchronizeLoopOp::getEffects( + SmallVectorImpl &effects) { +transform::onlyReadsHandle(getForOp(), effects); +transform::modifiesPayload(effects); +} + +DiagnosedSilenceableFailure buddy::gpu::SynchronizeLoopOp::applyToOne( + transform::TransformRewriter &rewriter, scf::ForOp forOp, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { +rewriter.setInsertionPointAfter(forOp); +rewriter.create(forOp.getLoc()); +return DiagnosedSilenceableFailure::success(); +} + +//===---------------------------------------------------------------------===// +// CreateAsyncGroupsOp. 
+//===---------------------------------------------------------------------===// + +void buddy::gpu::CreateAsyncGroupsOp::getEffects( + SmallVectorImpl &effects) { +transform::onlyReadsHandle(getTarget(), effects); +transform::modifiesPayload(effects); +} + +DiagnosedSilenceableFailure buddy::gpu::CreateAsyncGroupsOp::applyToOne( + transform::TransformRewriter &rewriter, func::FuncOp target, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { +createAsyncGroups(rewriter, cast(target), + getUseMmaSync()); +return DiagnosedSilenceableFailure::success(); +} + #define GET_OP_CLASSES #include "GPU/TransformOps.cpp.inc" diff --git a/midend/lib/Utils/GPUUtils.cpp b/midend/lib/Utils/GPUUtils.cpp index 82058c8813..243e49d521 100644 --- a/midend/lib/Utils/GPUUtils.cpp +++ b/midend/lib/Utils/GPUUtils.cpp @@ -531,6 +531,295 @@ template void hoistStaticallyBoundAllocationsInFunc(RewriterBase &rewriter, func::FuncOp funcOp); +static bool isContiguousStore(Operation *write) { + if (auto transferWrite = dyn_cast(write)) { + if (!transferWrite.getPermutationMap().isMinorIdentity() || + !transferWrite.isDimInBounds(0) || transferWrite.getMask()) { + LDBG("--not a contiguous store op: " << *write); + return false; + } + return true; + } + if (isa(write)) { + return true; + } + LDBG("--not a store op: " << write->getName().getStringRef()); + return false; +} + +static Value getMemrefOperand(Operation *op) { + if (auto transferWrite = dyn_cast(op)) { + return transferWrite.getSource(); + } + if (auto transferRead = dyn_cast(op)) { + return transferRead.getSource(); + } + if (auto storeOp = dyn_cast(op)) { + return storeOp.getBase(); + } + if (auto loadOp = dyn_cast(op)) { + return loadOp.getBase(); + } + return Value(); +} + +static bool isContiguousRead(Operation *read) { + if (auto transferRead = dyn_cast(read)) { + if (!transferRead.isDimInBounds(0) || + !transferRead.getPermutationMap().isMinorIdentity()) { + LDBG("--not a contiguous load op: " << *read); + return false; + } + return true; + } + if (isa(read)) { + return true; + } + LDBG("--not a load op: " << read->getName().getStringRef()); + return false; +} + +struct MaskResult { + vector::CreateMaskOp maskOp; + vector::ExtractOp maybeExtractOp; +}; +static MaskResult getMask(Operation *op) { + auto transferRead = dyn_cast(op); + if (!transferRead || !transferRead.getMask()) + return MaskResult{}; + vector::ExtractOp maybeExtractOp = + transferRead.getMask().getDefiningOp(); + auto maskOp = + maybeExtractOp + ? 
maybeExtractOp.getVector().getDefiningOp() + : transferRead.getMask().getDefiningOp(); + if (maybeExtractOp) { + if (maybeExtractOp.getStaticPosition().size() + 1 != + llvm::cast(maskOp->getResultTypes().front()).getRank()) { + LDBG("----mask through extract unexpected position size -> Skip: " + << maybeExtractOp); + return MaskResult{}; + } + if (maybeExtractOp.getStaticPosition().size() != 1) { + LDBG("----only mask through 2-D -> 1-D extract supported atm -> Skip: " + << maybeExtractOp); + return MaskResult{}; + } + LDBG("----mask through extract: " << maybeExtractOp); + } + return MaskResult{maskOp, maybeExtractOp}; +} + +static Value getMaskValue(RewriterBase &rewriter, Operation *op) { + MaskResult maskResult = getMask(op); + if (!maskResult.maskOp) + return Value(); + Value count = maskResult.maskOp->getOperands().back(); + vector::ExtractOp maybeExtractOp = maskResult.maybeExtractOp; + if (maybeExtractOp) { + assert(maybeExtractOp.getStaticPosition().size() == 1 && + "expected single pos"); + int64_t sliceNum = maybeExtractOp.getStaticPosition()[0]; + // TODO: to support >2-D mask + extract, and all the cmp. + Location loc = op->getLoc(); + Value zero = rewriter.create(loc, 0); + Value cmp = rewriter.create( + loc, arith::CmpIPredicate::slt, + rewriter.create(loc, sliceNum), + maskResult.maskOp->getOperands().front()); + count = rewriter.create(loc, cmp, count, zero); + } + return count; +} + +static Value getValueStored(Operation *writeOp) { + if (auto transferWrite = dyn_cast(writeOp)) { + return transferWrite.getValue(); + } + if (auto storeOp = dyn_cast(writeOp)) { + return storeOp.getValueToStore(); + } + return Value(); +} + +static Operation::operand_range getIndices(Operation *op) { + if (auto vectorReadOp = dyn_cast(op)) + return vectorReadOp.getIndices(); + if (auto vectorStoreOp = dyn_cast(op)) + return vectorStoreOp.getIndices(); + if (auto transferReadOp = dyn_cast(op)) + return transferReadOp.getIndices(); + if (auto transferWriteOp = dyn_cast(op)) + return transferWriteOp.getIndices(); + llvm_unreachable("unsupported op type"); +} + +/// Return `true` if the conversion to async copy is legal. +static bool resultsInSupportedAsyncCopy(MemRefType memrefType, + Operation::operand_range indices, + VectorType vecType) { + constexpr int64_t kSupportedCpAsyncAlignmentsInBytes[3] = {4, 8, 16}; + // Condition 1: the vectory rank must be supported. + if (vecType.hasRank() != 1) { + LDBG("----> cp.async failed, not a 1-D vector: " << vecType); + return false; + } + + // Condition 2: the copy size must be supported. + bool supportedCopySize = false; + int64_t numElements = vecType.getNumElements(); + Type elementType = vecType.getElementType(); + for (int64_t alignmentInBytes : kSupportedCpAsyncAlignmentsInBytes) { + if (alignmentInBytes * 8 == + numElements * elementType.getIntOrFloatBitWidth()) { + supportedCopySize = true; + break; + } + } + if (!supportedCopySize) { + LDBG("----> cp.async alignment failed, " + << numElements << " elts * " << elementType.getIntOrFloatBitWidth() + << "b/elem = " << numElements * elementType.getIntOrFloatBitWidth() + << "b is not supported by cp.async"); + return false; + } + + // TODO: Condition 3: the alignments must be supported. For cp.async the + // NVIDIA doc (section 6.4.1) says: "The address must be naturally aligned to + // a multiple of the access size. If an address is not properly aligned, the + // resulting behavior is undefined.". 
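+  // For example, vector<1xf32> (4 bytes), vector<4xf16> (8 bytes) and
+  // vector<4xf32> or vector<8xf16> (16 bytes) satisfy condition 2, while
+  // vector<3xf32> (12 bytes) does not and the copy stays synchronous.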
+ return true; +} + +void createAsyncGroups(RewriterBase &rewriter, func::FuncOp funcOp, + bool useMMASync) { + LDBG("Start asyncGroups: useMMASync=" << useMMASync); + llvm::SmallSetVector copyToSharedMem; + // Look for all the copy that can be converted to async copy ops. + funcOp.walk([&](Operation *writeOp) { + if (!isContiguousStore(writeOp)) + return WalkResult::advance(); + LDBG("--candidate writeOp: " << *writeOp); + Value vectorVal = getValueStored(writeOp); + if (llvm::cast(vectorVal.getType()).getRank() != 1) { + LDBG("----writeOp is not an inbounds 1-D minor identity -> Skip"); + return WalkResult::advance(); + } + Value memrefOperand = getMemrefOperand(writeOp); + if (!hasSharedMemoryAddressSpace( + llvm::cast(memrefOperand.getType()))) { + LDBG("----address space is not workgroup -> Skip"); + return WalkResult::advance(); + } + Operation *readOp = vectorVal.getDefiningOp(); + if (readOp == nullptr || !isContiguousRead(readOp)) { + LDBG("----no contiguous readOp defining the writeOp -> Skip"); + return WalkResult::advance(); + } + + LDBG("--candidate readOp: " << *readOp); + if (auto transferRead = dyn_cast(readOp)) { + if (transferRead.getMask()) { + auto paddingCst = + transferRead.getPadding().getDefiningOp(); + if (!paddingCst || !paddingCst.value().isZero()) { + LDBG("----read padding value is not 0.f -> Skip"); + return WalkResult::advance(); + } + auto maskResult = getMask(transferRead); + if (!maskResult.maskOp) { + LDBG("----read mask is not a vector.create_mask op -> Skip: " + << transferRead.getMask()); + return WalkResult::advance(); + } + } + } + + // Check whether both accesses are supported before we emit: this is + // necessary to ensure the correctness of DeviceAsyncCopyOp. + VectorType vecType = llvm::cast(vectorVal.getType()); + Value storeBase = getMemrefOperand(writeOp); + Value loadBase = getMemrefOperand(readOp); + if (!resultsInSupportedAsyncCopy(cast(loadBase.getType()), + getIndices(readOp), vecType) || + !resultsInSupportedAsyncCopy(cast(storeBase.getType()), + getIndices(writeOp), vecType)) + return WalkResult::advance(); + + LDBG("--writeOp can be made async -> SUCCESS"); + copyToSharedMem.insert(writeOp); + return WalkResult::advance(); + }); + + while (!copyToSharedMem.empty()) { + SmallVector group; + Operation *writeOp = *copyToSharedMem.begin(); + LDBG("--START a group from: " << *writeOp); + // Start a group with the first write. + copyToSharedMem.remove(writeOp); + group.push_back(writeOp); + Operation *nextNode = writeOp; + // Look in the next nodes for more copies to add to the same group. + while ((nextNode = nextNode->getNextNode())) { + // Ignore ops without side effects + auto memInterface = dyn_cast(nextNode); + if (memInterface && memInterface.hasNoEffect() && + !nextNode->hasTrait()) + continue; + // ignore read from a different address space. + if (isa(nextNode)) { + Operation *readOp = nextNode; + Value memrefOperand = getMemrefOperand(readOp); + if (!hasSharedMemoryAddressSpace( + llvm::cast(memrefOperand.getType()))) { + continue; + } + } + if (copyToSharedMem.count(nextNode)) { + // found another copy, add it to the group. + copyToSharedMem.remove(nextNode); + group.push_back(nextNode); + continue; + } + // If the op is something else stop the accumulating op in the group. + LDBG("----> STOP accumulating into group due to: " << *nextNode); + break; + } + // emit the group. 
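+    // Each copy in the group becomes an nvgpu.device_async_copy from the
+    // load's source memref into the shared-memory destination; the per-copy
+    // tokens are then joined by a single nvgpu.device_async_create_group
+    // followed by an nvgpu.device_async_wait, and the original store ops are
+    // erased (the now-dead loads are left for later DCE).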
+ SmallVector tokens; + for (Operation *writeOp : group) { + rewriter.setInsertionPoint(writeOp); + Value vectorVal = getValueStored(writeOp); + auto vectorType = llvm::cast(vectorVal.getType()); + int64_t numElements = vectorType.getNumElements(); + Operation *readOp = vectorVal.getDefiningOp(); + Value storeBase = getMemrefOperand(writeOp); + Value loadBase = getMemrefOperand(readOp); + Value mask = getMaskValue(rewriter, readOp); + auto dstMemref = llvm::cast(storeBase.getType()); + int64_t sizeInBytes = + (dstMemref.getElementTypeBitWidth() * numElements) / 8; + UnitAttr bypassL1 = + useMMASync && sizeInBytes == 16 ? rewriter.getUnitAttr() : UnitAttr(); + Value token = rewriter.create( + writeOp->getLoc(), + nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()), storeBase, + getIndices(writeOp), loadBase, getIndices(readOp), + rewriter.getIndexAttr(numElements), mask, + /*bypassL1=*/bypassL1); + tokens.push_back(token); + } + // Create the group and wait for it right after. + Value groupToken = rewriter.create( + funcOp.getLoc(), nvgpu::DeviceAsyncTokenType::get(funcOp.getContext()), + tokens); + rewriter.create(funcOp.getLoc(), groupToken, + nullptr); + // Clean up old stores. + for (Operation *writeOp : group) + rewriter.eraseOp(writeOp); + } +} } // namespace gpu } // namespace mlir::buddy #endif // UTILS_GPUUTILS_DEF
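
As a rough illustration of the net effect of `createAsyncGroups` (hypothetical
IR; buffer names, indices and shapes are made up for the sketch), a 1-D,
unmasked, in-bounds copy into workgroup memory such as

    %v = vector.transfer_read %global[%i], %cst {in_bounds = [true]}
           : memref<1024xf32>, vector<4xf32>
    vector.transfer_write %v, %shared[%j] {in_bounds = [true]}
           : vector<4xf32>, memref<128xf32, #gpu.address_space<workgroup>>

is rewritten to an asynchronous copy carrying the element count, grouped and
waited on right after (the `bypassL1` attribute is attached only when
`use_mma_sync` is set and the copy is 16 bytes wide):

    %token = nvgpu.device_async_copy %global[%i], %shared[%j], 4
               : memref<1024xf32> to memref<128xf32, #gpu.address_space<workgroup>>
    %group = nvgpu.device_async_create_group %token
    nvgpu.device_async_wait %group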