[GPU] Add GPU barrier elimination and async group elimination #474

Open · wants to merge 1 commit into main
48 changes: 24 additions & 24 deletions examples/BuddyGPU/transform.mlir
@@ -135,10 +135,10 @@ module attributes { transform.with_named_sequence } {
// Rewrite bufferized scf.forall ops to distributed gpu.thread_id attribute.
%mapped = transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [64, 2, 1] warp_size = 32 : (!transform.any_op) -> !transform.any_op

- %15 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op
+ %14 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op

// Removes unnecessary GPU barriers from the function.
- // %15 = transform.buddy.eliminate_gpu_barriers %14 : (!transform.any_op) -> !transform.any_op
+ %15 = transform.buddy.eliminate_gpu_barriers %14 : (!transform.any_op) -> !transform.any_op

// Perform canonicalization.
transform.apply_patterns to %15 {
@@ -191,7 +191,7 @@ module attributes { transform.with_named_sequence } {

// Insert a gpu.barrier after a given scf.for loop
%16 = transform.structured.match ops{["scf.for"]} in %15 : (!transform.any_op) -> !transform.op<"scf.for">
- // transform.buddy.synchronize_loop %16 : (!transform.op<"scf.for">) -> ()
+ transform.buddy.synchronize_loop %16 : (!transform.op<"scf.for">) -> ()


transform.apply_patterns to %15 {
@@ -223,88 +223,88 @@ module attributes { transform.with_named_sequence } {
// mma operations, targeting warp-level tensorcore operations.
transform.buddy.vector.vector_to_mma_conversion %17 {use_mma_sync} : (!transform.any_op) -> ()

- // %18 = transform.buddy.eliminate_gpu_barriers %17 : (!transform.any_op) -> !transform.any_op
+ %18 = transform.buddy.eliminate_gpu_barriers %17 : (!transform.any_op) -> !transform.any_op

// Perform canonicalization.
- transform.apply_patterns to %17 {
+ transform.apply_patterns to %18 {
transform.apply_patterns.linalg.tiling_canonicalization
transform.apply_patterns.scf.for_loop_canonicalization
transform.apply_patterns.canonicalization
} : !transform.any_op
- transform.apply_cse to %17 : !transform.any_op
+ transform.apply_cse to %18 : !transform.any_op
%all_loops_7 = transform.structured.match interface{LoopLikeInterface}
- in %17
+ in %18
: (!transform.any_op) -> !transform.any_op
transform.apply_licm to %all_loops_7 : !transform.any_op
- transform.apply_patterns to %17 {
+ transform.apply_patterns to %18 {
transform.apply_patterns.linalg.tiling_canonicalization
transform.apply_patterns.vector.lower_masked_transfers
} : !transform.any_op

- %19 = transform.structured.match ops{["gpu.launch"]} in %17 : (!transform.any_op) -> !transform.any_op
+ %19 = transform.structured.match ops{["gpu.launch"]} in %18 : (!transform.any_op) -> !transform.any_op
%fwfa = transform.structured.match ops{["memref.alloc"]} in %19 : (!transform.any_op) -> !transform.op<"memref.alloc">

// Do multi-buffering/array expansion to remove dependencies on the temporary allocation between consecutive loop iterations.
transform.memref.multibuffer %fwfa {factor = 3 : i64, skip_analysis} : (!transform.op<"memref.alloc">) -> !transform.any_op

- transform.apply_patterns to %17 {
+ transform.apply_patterns to %18 {
transform.apply_patterns.vector.transfer_to_scf full_unroll = true
} : !transform.any_op
- transform.apply_patterns to %17 {
+ transform.apply_patterns to %18 {
transform.apply_patterns.linalg.tiling_canonicalization
transform.apply_patterns.scf.for_loop_canonicalization
transform.apply_patterns.canonicalization
} : !transform.any_op
- transform.apply_cse to %17 : !transform.any_op
+ transform.apply_cse to %18 : !transform.any_op
%all_loops_8 = transform.structured.match interface{LoopLikeInterface}
- in %17
+ in %18
: (!transform.any_op) -> !transform.any_op
transform.apply_licm to %all_loops_8 : !transform.any_op
- transform.apply_patterns to %17 {
+ transform.apply_patterns to %18 {
transform.apply_patterns.linalg.tiling_canonicalization
transform.apply_patterns.vector.lower_masked_transfers
} : !transform.any_op

// Convert sync copies to shared memory to async.
- // transform.buddy.create_async_groups %17 {use_mma_sync} : (!transform.any_op) -> ()
- transform.apply_patterns to %17 {
+ transform.buddy.create_async_groups %18 {use_mma_sync} : (!transform.any_op) -> ()
+ transform.apply_patterns to %18 {
transform.apply_patterns.linalg.tiling_canonicalization
transform.apply_patterns.scf.for_loop_canonicalization
transform.apply_patterns.canonicalization
transform.apply_patterns.memref.fold_memref_alias_ops
} : !transform.any_op
%all_loops_9 = transform.structured.match interface{LoopLikeInterface}
- in %17
+ in %18
: (!transform.any_op) -> !transform.any_op
transform.apply_licm to %all_loops_9 : !transform.any_op
- transform.apply_cse to %17 : !transform.any_op
+ transform.apply_cse to %18 : !transform.any_op


- %20 = transform.structured.match ops{["nvgpu.mma.sync"]} in %17 : (!transform.any_op) -> !transform.any_op
+ %20 = transform.structured.match ops{["nvgpu.mma.sync"]} in %18 : (!transform.any_op) -> !transform.any_op
%21 = transform.get_parent_op %20 {deduplicate, op_name = "scf.for"} : (!transform.any_op) -> !transform.any_op
// This applies software pipelining to a given scf.for loop.
// The pipelining strategy will look for a copy to shared memory and pipeline it to overlap it with the rest of the loop.
// %22 = transform.buddy.pipeline_shared_memory_copies %21 {depth = 3 : i64, use_mma_sync, peel_epilogue} : (!transform.any_op) -> !transform.any_op

// Perform canonicalization.
- transform.apply_patterns to %17 {
+ transform.apply_patterns to %18 {
transform.apply_patterns.vector.lower_masks
} : !transform.any_op
- transform.apply_patterns to %17 {
+ transform.apply_patterns to %18 {
transform.apply_patterns.vector.materialize_masks
} : !transform.any_op
- transform.apply_patterns to %17 {
+ transform.apply_patterns to %18 {
transform.apply_patterns.linalg.tiling_canonicalization
transform.apply_patterns.scf.for_loop_canonicalization
transform.apply_patterns.canonicalization
transform.apply_patterns.memref.fold_memref_alias_ops
} : !transform.any_op

%all_loops_10 = transform.structured.match interface{LoopLikeInterface}
- in %17
+ in %18
: (!transform.any_op) -> !transform.any_op
transform.apply_licm to %all_loops_10 : !transform.any_op
- transform.apply_cse to %17 : !transform.any_op
+ transform.apply_cse to %18 : !transform.any_op

transform.yield
}
110 changes: 110 additions & 0 deletions midend/include/Dialect/GPU/TransformOps.td
@@ -124,4 +124,114 @@ def VectorToMMAConversionOp : Op<Transform_Dialect, "buddy.vector.vector_to_mma_
}];
}

def EliminateGpuBarriersOp :
Op<Transform_Dialect, "buddy.eliminate_gpu_barriers",
[FunctionalStyleTransformOpTrait,
MemoryEffectsOpInterface,
TransformEachOpTrait,
TransformOpInterface,
ReportTrackingListenerFailuresOpTrait]> {
let description = [{
Removes unnecessary GPU barriers from the function. If a barrier does not
enforce any conflicting pair of memory effects, including a pair that is
enforced by another barrier, it is unnecessary and can be removed.

#### Return modes

Consumes the operand handle and produces a new handle to the function after
rewriting.
}];

let arguments = (ins TransformHandleTypeInterface:$target);
let results = (outs TransformHandleTypeInterface:$result);

let assemblyFormat = [{ $target attr-dict `:` functional-type(operands, results)}];
let cppNamespace = "mlir::buddy::gpu";

let builders = [
OpBuilder<(ins "Value":$target)>
];

let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::transform::TransformRewriter &rewriter,
::mlir::func::FuncOp target,
::mlir::transform::ApplyToEachResultList &results,
::mlir::transform::TransformState &state);
}];
}
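
For illustration, a minimal before/after sketch of the rewrite this op performs; the function, memref shapes, and value names below are hypothetical, not taken from this PR:

```mlir
// Hypothetical input: nothing is accessed between the two barriers, so the
// second one enforces no conflicting pair of memory effects that the first
// does not already enforce.
func.func @kernel_body(%shared: memref<128xf32, 3>, %tid: index, %v: f32) {
  memref.store %v, %shared[%tid] : memref<128xf32, 3>
  gpu.barrier
  gpu.barrier  // redundant: removed by buddy.eliminate_gpu_barriers
  %r = memref.load %shared[%tid] : memref<128xf32, 3>
  return
}
```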

def SynchronizeLoopOp : Op<
Transform_Dialect, "buddy.synchronize_loop", [
DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
TransformEachOpTrait,
TransformOpInterface,
ReportTrackingListenerFailuresOpTrait]> {
let description = [{
This inserts a gpu.barrier after a given scf.for loop.

#### Return modes
This transform modifies the IR in place and produces no result handles.
A `gpu.barrier` is inserted immediately after each matched `scf.for` loop.
}];

let arguments = (
ins TransformHandleTypeInterface:$for_op);
let results = (outs);

let cppNamespace = "mlir::buddy::gpu";

let assemblyFormat = [{
$for_op
attr-dict
`:` functional-type(operands, results)}];

let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::transform::TransformRewriter &rewriter,
::mlir::scf::ForOp forOp,
::mlir::transform::ApplyToEachResultList &results,
::mlir::transform::TransformState &state);
}];
}
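
A minimal sketch of the effect on hypothetical IR: the barrier lands directly behind the target loop, so every thread in the workgroup synchronizes before any of them consumes the loop's results.

```mlir
// Hypothetical after: transform.buddy.synchronize_loop inserted the barrier.
scf.for %i = %c0 to %c128 step %c32 {
  // ... stage data through workgroup memory, compute ...
}
gpu.barrier  // inserted right after the loop
```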

def CreateAsyncGroupsOp :
Op<Transform_Dialect, "buddy.create_async_groups",
[DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
TransformEachOpTrait,
TransformOpInterface,
ReportTrackingListenerFailuresOpTrait]> {
let description = [{
Converts synchronous copies to shared memory into asynchronous copies. This
creates groups of consecutive copies and emits a wait operation right after
each group. The input operation is a `func.func`.

`use_mma_sync` specifies whether `bypassL1` attributes should be added to the
async copies.

#### Return modes
This op modifies the function in place, even if nothing changed, and
produces no result handles.
}];

let arguments = (ins TransformHandleTypeInterface:$target,
UnitAttr:$use_mma_sync);
let results = (outs);

let assemblyFormat = [{
$target
attr-dict
`:` functional-type(operands, results)}];
let cppNamespace = "mlir::buddy::gpu";

let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::transform::TransformRewriter &rewriter,
::mlir::func::FuncOp target,
::mlir::transform::ApplyToEachResultList &results,
::mlir::transform::TransformState &state);
}];
}
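
As a sketch over hypothetical IR (the memref shapes and names are illustrative, not from this diff), the rewrite replaces a synchronous global-to-shared copy with an NVGPU async copy, groups it, and waits on the group; with `use_mma_sync` set, the copies would additionally carry the `bypassL1` attribute:

```mlir
// Hypothetical before: a synchronous copy routed through registers.
//   %v = vector.transfer_read %global[%i, %j], %pad
//          : memref<128x128xf16>, vector<8xf16>
//   vector.transfer_write %v, %shared[%ti, %tj]
//          : vector<8xf16>, memref<32x32xf16, 3>

// Hypothetical after: an async copy of 8 f16 elements, grouped behind one wait.
%token = nvgpu.device_async_copy %global[%i, %j], %shared[%ti, %tj], 8
  : memref<128x128xf16> to memref<32x32xf16, 3>
%group = nvgpu.device_async_create_group %token
nvgpu.device_async_wait %group { numGroups = 1 : i32 }
```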

#endif // TRANSFORM_OPS_TD
6 changes: 6 additions & 0 deletions midend/include/Utils/GPUUtils.h
@@ -98,6 +98,12 @@ template <typename AllocLikeOpType>
void hoistStaticallyBoundAllocationsInFunc(RewriterBase &rewriter,
func::FuncOp funcOp);

//===----------------------------------------------------------------------===//
// Utility from compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h
//===----------------------------------------------------------------------===//
void createAsyncGroups(RewriterBase &rewriter, func::FuncOp funcOp,
bool useMMASync);

} // namespace buddy::gpu
} // namespace mlir
