[TTIG_PrefetchOp] Add mask argument (#4011)

whitneywhtsang · web-flow · commit 53b2a515d679 · 2025-04-25T15:24:57.000Z
This PR adds a new optional `mask` argument to the prefetch operation. This prepares for prefetching tensors of pointers, since loads from tensors of pointers can be masked. Note: this change comes partially from #3634. --------- Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir
@@ -130,6 +130,15 @@ tt.func @triton_intel_gpu.extract(%ptr : !tt.ptr<tensor<32x32xf16>>) {
 
 // -----
 
+tt.func @triton_intel_gpu.prefetch(%arg0: !tt.ptr<tensor<2x32xf32>>, %arg1: tensor<4x32xi1>) {
+  // expected-note@-1 {{prior use here}}
+  // expected-error@+1 {{use of value '%arg1' expects different type than prior uses: 'tensor<2x32xi1>' vs 'tensor<4x32xi1>'}}
+  triton_intel_gpu.prefetch %arg0, %arg1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr<tensor<2x32xf32>>
+  tt.return
+}
+
+// -----
+
 #warp = #triton_intel_gpu.warp<{sizePerThread = [16, 64], threadsPerWarp = [1, 1], order = [1, 0]}>
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
diff --git a/test/TritonIntelGPU/tritonintelgpu.mlir b/test/TritonIntelGPU/tritonintelgpu.mlir
@@ -50,6 +50,17 @@ tt.func @simplify_scf_for(%arg0: tensor<16x8xf16>, %arg1: tensor<16x8xf16>, %arg
 
 // -----
 
+tt.func @triton_intel_gpu.prefetch(%arg0: !tt.ptr<tensor<2x32xf32>>, %arg1: tensor<2x32xi1>) {
+  // CHECK-LABEL: @triton_intel_gpu.prefetch
+  // CHECK:         triton_intel_gpu.prefetch %arg0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr<tensor<2x32xf32>>
+  triton_intel_gpu.prefetch %arg0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr<tensor<2x32xf32>>
+  // CHECK:         triton_intel_gpu.prefetch %arg0, %arg1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr<tensor<2x32xf32>>
+  triton_intel_gpu.prefetch %arg0, %arg1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr<tensor<2x32xf32>>
+  tt.return
+}
+
+// -----
+
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
   tt.func @triton_intel_gpu.sub_group_transpose(%local_buffer : !tt.ptr<f16, 3>, %src : tensor<16x16xf16>) -> tensor<16x16xf16> {
     // CHECK-LABEL: @triton_intel_gpu.sub_group_transpose
diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUOps.td b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUOps.td
@@ -107,7 +107,10 @@ def TTIG_ExtractOp : TTIG_Op<"extract", [Pure]> {
   let hasFolder = 1;
 }
 
-def TTIG_PrefetchOp : TTIG_Op<"prefetch"> {
+def TTIG_PrefetchOp : TTIG_Op<"prefetch", [
+  TypesMatchWith<"mask type matches ptr type", "ptr", "mask", "getI1SameShape(getPointeeType($_self))",
+                 "($_op.getOperands().size() <= 1) || std::equal_to<>()">,
+]> {
   let summary = "Tensor prefetch operation";
   let description = [{
     The `prefetch` operation prefetches an input tensor.
@@ -117,11 +120,20 @@ def TTIG_PrefetchOp : TTIG_Op<"prefetch"> {
           : !tt.ptr<tensor<256x32xf16>
       ```
   }];
-  let arguments = (ins AnyTypeOf<[TT_PtrLike, TT_TensorPtr]>:$ptr, TT_CacheModifierAttr:$cache,
-                       TT_EvictionPolicyAttr:$evict, BoolAttr:$isVolatile);
+  let arguments = (
+    ins AnyTypeOf<[TT_PtrLike, TT_TensorPtr]>:$ptr,
+    Optional<TT_BoolLike>:$mask,
+    TT_CacheModifierAttr:$cache,
+    TT_EvictionPolicyAttr:$evict,
+    BoolAttr:$isVolatile
+  );
   let results = (outs);
+  let builders = [
+    OpBuilder<(ins "Value":$ptr, "triton::CacheModifier":$cache,
+                   "triton::EvictionPolicy":$evict, "bool":$isVolatile)>
+  ];
   let assemblyFormat = [{
-    operands attr-dict `:` type($ptr)
+    $ptr (`,` $mask^)? attr-dict `:` type($ptr)
   }];
 }
 
diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Ops.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Ops.cpp
@@ -197,6 +197,12 @@ OpFoldResult ExtractOp::fold(FoldAdaptor adaptor) {
   return {};
 }
 
+void PrefetchOp::build(OpBuilder &builder, OperationState &state, Value ptr,
+                       CacheModifier cache, EvictionPolicy evict,
+                       bool isVolatile) {
+  PrefetchOp::build(builder, state, ptr, /*mask=*/{}, cache, evict, isVolatile);
+}
+
 LogicalResult SubGroupTransposeOp::verify() {
   RankedTensorType srcType = getSrc().getType();
   auto mod = getOperation()->getParentOfType<mlir::ModuleOp>();