From 9096ffe844676bfe814aaf63d33ef8691cc11506 Mon Sep 17 00:00:00 2001 From: "Lu,Chengjun" Date: Mon, 10 Mar 2025 15:23:11 +0000 Subject: [PATCH 01/11] Support tensor of pointer as the pointer parameter of the prefetching operation. Add a mask operand for boundary check. --- test/TritonIntelGPU/prefetch-to-llvm.mlir | 213 ++++++++++++----- .../LoadStoreOpToLLVM.cpp | 223 +++++++++++++++++- 2 files changed, 368 insertions(+), 68 deletions(-) diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir index 082f75d5fd..b4d493b825 100644 --- a/test/TritonIntelGPU/prefetch-to-llvm.mlir +++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir @@ -1,75 +1,158 @@ -// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm +// RUN: triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm --cse -canonicalize | FileCheck %s -// CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} -// CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x2cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} +// CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} { - tt.func public @matmul_with_prefetch(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64) { - // CHECK-LABEL: @matmul_with_prefetch +// CHECK-LABEL: llvm.func spir_kernelcc @prefetch_block_ptr( +// CHECK-SAME: %[[BASE:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !llvm.ptr<1>, +// CHECK-SAME: %[[BASE_HEIGHT:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i64, +// CHECK-SAME: %[[BASE_WIDTH:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i64, +// CHECK-SAME: %[[ROW_STRIDE:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i64) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { + tt.func public @prefetch_block_ptr(%arg0: !tt.ptr, %arg2: i64, %arg4: i64, %arg5: i64) { %c0_i32 = arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 - // CHECK: %[[ROW_MAJOR_BLOCK_PTR:.*]] = llvm.insertvalue %arg0, {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[VAL_17:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() - // CHECK: %[[VAL_18:.*]] = llvm.zext %[[VAL_17]] : i32 to i64 - // CHECK: %[[VAL_19:.*]] = llvm.trunc %[[VAL_18]] : i64 to i32 - // CHECK: %[[VAL_20:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_21:.*]] = llvm.urem %[[VAL_19]], %[[VAL_20]] : i32 - // CHECK: %[[VAL_22:.*]] = llvm.udiv %[[VAL_19]], %[[VAL_20]] : i32 - // CHECK: %[[VAL_23:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[VAL_24:.*]] = llvm.urem %[[VAL_22]], %[[VAL_23]] : i32 - // CHECK: %[[VAL_25:.*]] = llvm.udiv %[[VAL_22]], %[[VAL_23]] : i32 - // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: 
%[[ROW_MAJOR_HEIGHT_:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[ROW_MAJOR_WIDTH_:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[ROW_MAJOR_ROW_STRIDE_:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[ROW_MAJOR_BASE:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[VAL_34:.*]] = llvm.mul %[[ROW_MAJOR_WIDTH_]], {{.*}} : i64 - // CHECK: %[[ROW_MAJOR_WIDTH:.*]] = llvm.trunc %[[VAL_34]] : i64 to i32 - // CHECK: %[[ROW_MAJOR_HEIGHT:.*]] = llvm.trunc %[[ROW_MAJOR_HEIGHT_]] : i64 to i32 - // CHECK: %[[ROW_MAJOR_ROW_STRIDE:.*]] = llvm.mul %[[ROW_MAJOR_ROW_STRIDE_]], {{.*}} : i64 - // CHECK: %[[ROW_MAJOR_STRIDE:.*]] = llvm.trunc %[[ROW_MAJOR_ROW_STRIDE]] : i64 to i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_X_:.*]] = llvm.add {{.*}}, %[[ROW_MAJOR_OFFSET_X]] : i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_Y_:.*]] = llvm.add {{.*}}, %[[ROW_MAJOR_OFFSET_Y]] : i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_Y:.*]] = llvm.trunc %[[COLUMN_MAJOR_WARP_OFF_Y_]] : i32 to i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_X:.*]] = llvm.trunc %[[COLUMN_MAJOR_WARP_OFF_X_]] : i32 to i32 - // CHECK: %[[VAL_56:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_57:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_59:.*]] = llvm.insertelement %[[COLUMN_MAJOR_WARP_OFF_X]], {{.*}}{{\[}}%[[VAL_57]] : i32] : vector<2xi32> - // CHECK: %[[ROW_MAJOR_COORD:.*]] = llvm.insertelement %[[COLUMN_MAJOR_WARP_OFF_Y]], {{.*}}{{\[}}%[[VAL_56]] : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(%[[ROW_MAJOR_BASE]], %[[ROW_MAJOR_WIDTH]], %[[ROW_MAJOR_HEIGHT]], %[[ROW_MAJOR_STRIDE]], %[[ROW_MAJOR_COORD]]) {{.*}} : (!llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > - triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> - - // COM: The memory layout is same for the column major memory and row major memory. The prefetch function should be the same. 
- - // CHECK: %[[COLUMN_MAJOR_BLOCK_PTR:.*]] = llvm.insertvalue %arg1, {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_OFFSET_Y:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_OFFSET_X:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_HEIGHT_:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_WIDTH:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_COL_STRIDE:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_BASE:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[VAL_86:.*]] = llvm.mul %[[COLUMN_MAJOR_HEIGHT_]], {{.*}} : i64 - // CHECK: %[[COLUMN_MAJOR_HEIGHT:.*]] = llvm.trunc %[[VAL_86]] : i64 to i32 - // CHECK: %[[COLUMN_MAJOR_WIDTH_:.*]] = llvm.trunc %[[COLUMN_MAJOR_WIDTH]] : i64 to i32 - // CHECK: %[[VAL_90:.*]] = llvm.mul %[[COLUMN_MAJOR_COL_STRIDE]], {{.*}} : i64 - // CHECK: %[[COLUMN_MAJOR_STRIDE:.*]] = llvm.trunc %[[VAL_90]] : i64 to i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_X_:.*]] = llvm.add {{.*}}, %[[COLUMN_MAJOR_OFFSET_X]] : i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_Y_:.*]] = llvm.add {{.*}}, %[[COLUMN_MAJOR_OFFSET_Y]] : i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_Y:.*]] = llvm.trunc %[[COLUMN_MAJOR_WARP_OFF_Y_]] : i32 to i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_X:.*]] = llvm.trunc %[[COLUMN_MAJOR_WARP_OFF_X_]] : i32 to i32 - // CHECK: %[[VAL_108:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_109:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: llvm.insertelement %[[COLUMN_MAJOR_WARP_OFF_X]], {{.*}}{{\[}}%[[VAL_109]] : i32] : vector<2xi32> - // CHECK: %[[COLUMN_MAJOR_COORD:.*]] = llvm.insertelement %[[COLUMN_MAJOR_WARP_OFF_Y]], {{.*}}{{\[}}%[[VAL_108]] : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x2cPU3AS1viiiDv2_i(%[[COLUMN_MAJOR_BASE]], %[[COLUMN_MAJOR_HEIGHT]], %[[COLUMN_MAJOR_WIDTH_]], %[[COLUMN_MAJOR_STRIDE]], %[[COLUMN_MAJOR_COORD]]) {{.*}} : (!llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - %columnMajorPtr = tt.make_tensor_ptr %arg1, [%arg4, %arg3], [%c1_i64, %arg6], [%c0_i32, %c0_i32] {order = array} : > - triton_intel_gpu.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "column_major"} : !tt.ptr> + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 + // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 + // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 + // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 + // CHECK: %[[VAL_18:.*]] = llvm.urem %[[VAL_17]], %[[CST_1]] : i32 + // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], 
%[[CST_1]] : i32 + // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 + // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[ROW_MAJOR_BASE_WIDTH_I32:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 + // CHECK: %[[ROW_MAJOR_BASE_HEIGHT_I32:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 + // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 + // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 + // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 + // CHECK: %[[VAL_29:.*]] = llvm.add %[[VAL_28]], %[[CST_0]] : i32 + // CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_20]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 + // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 + // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 + // CHECK: %[[OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 + // CHECK: %[[OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 + // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[OFFSETS:.*]] = llvm.insertelement %[[OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH_I32]], %[[ROW_MAJOR_BASE_HEIGHT_I32]], %[[PITCH]], %[[OFFSETS]]) {{.*}} + %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > + triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> + // COM: The memory layout is same for the column major memory and row major memory. The prefetch should be the same. 
+ + // CHECK: %[[VAL_38:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {no_unwind, will_return} : () -> i32 + // CHECK: %[[VAL_39:.*]] = llvm.zext %[[VAL_38]] : i32 to i64 + // CHECK: %[[VAL_40:.*]] = llvm.trunc %[[VAL_39]] : i64 to i32 + // CHECK: %[[VAL_41:.*]] = llvm.urem %[[VAL_40]], %[[CST_1]] : i32 + // CHECK: %[[VAL_42:.*]] = llvm.udiv %[[VAL_40]], %[[CST_1]] : i32 + // CHECK: %[[VAL_43:.*]] = llvm.urem %[[VAL_42]], %[[CST_8]] : i32 + // CHECK: %[[VAL_44:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[COLUM_MAJOR_BASE_WIDTH_I32:.*]] = llvm.trunc %[[VAL_44]] : i64 to i32 + // CHECK: %[[COLUM_MAJOR_BASE_HEIGHT_I32:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_47:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[COLUM_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_47]] : i64 to i32 + // CHECK: %[[VAL_49:.*]] = llvm.mul %[[VAL_41]], %[[CST_32]] : i32 + // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 + // CHECK: %[[VAL_51:.*]] = llvm.urem %[[VAL_50]], %[[CST_32]] : i32 + // CHECK: %[[VAL_52:.*]] = llvm.add %[[VAL_51]], %[[CST_0]] : i32 + // CHECK: %[[VAL_53:.*]] = llvm.mul %[[VAL_43]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_54:.*]] = llvm.add %[[VAL_53]], %[[CST_0]] : i32 + // CHECK: %[[VAL_55:.*]] = llvm.urem %[[VAL_54]], %[[CST_16]] : i32 + // CHECK: %[[VAL_56:.*]] = llvm.add %[[VAL_55]], %[[CST_0]] : i32 + // CHECK: %[[VAL_57:.*]] = llvm.trunc %[[VAL_56]] : i32 to i32 + // CHECK: %[[VAL_58:.*]] = llvm.trunc %[[VAL_52]] : i32 to i32 + // CHECK: %[[VAL_59:.*]] = llvm.insertelement %[[VAL_58]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[COLUM_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[VAL_57]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[COLUM_MAJOR_BASE_WIDTH_I32]], %[[COLUM_MAJOR_BASE_HEIGHT_I32]], %[[COLUM_MAJOR_PITCH]], %[[COLUM_MAJOR_OFFSETS]]) {{.*}} + %columnMajorPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%c1_i64, %arg5], [%c0_i32, %c0_i32] {order = array} : > + triton_intel_gpu.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "column_major"} : !tt.ptr> + + // COM: The memory is not structured densely. Not to prefetch it to the cache. 
// CHECK-NOT: block_prefetch - %nonContiguousPtr = tt.make_tensor_ptr %arg1, [%arg4, %arg3], [%arg6, %arg6], [%c0_i32, %c0_i32] {order = array} : > - triton_intel_gpu.prefetch %nonContiguousPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> + %nonContiguousPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%arg5, %arg5], [%c0_i32, %c0_i32] {order = array} : > + triton_intel_gpu.prefetch %nonContiguousPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> + tt.return + } +} + +// ----- + +// CHECK: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i +#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> +module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} { + // CHECK-LABEL: llvm.func spir_kernelcc @prefetch_tensor_of_pointers + tt.func public @prefetch_tensor_of_pointers(%tensor_of_ptr: tensor<64x32x!tt.ptr, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>>) { + // CHECK: %[[MASK:.*]] = llvm.mlir.constant(1 : i8) : i8 + // CHECK: %[[VAL_2:.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[BASE_HEIGHT:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK: %[[BASE_WIDTH:.*]] = llvm.mlir.constant(64 : i32) : i32 + // CHECK: %[[TRUE:.*]] = llvm.mlir.constant(true) : i1 + + // CHECK: %[[ADDR_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)> + // CHECK: %[[ADDR_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)> + // CHECK: %[[ADDR_16:.*]] = llvm.extractvalue {{.*}}[16] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)> + // CHECK: %[[ADDR_32:.*]] = llvm.extractvalue {{.*}}[32] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, 
ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)> + // CHECK: %[[ADDR_48:.*]] = llvm.extractvalue {{.*}}[48] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)> + // CHECK: %[[VAL_13:.*]] = llvm.ptrtoint %[[ADDR_0]] : !llvm.ptr<1> to i64 + // CHECK: %[[VAL_14:.*]] = llvm.ptrtoint %[[ADDR_1]] : !llvm.ptr<1> to i64 + // CHECK: %[[PITCH:.*]] = llvm.sub %[[VAL_14]], %[[VAL_13]] : i64 + // CHECK: %[[UNIFIED_PITCH:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[PITCH]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64 + // CHECK: %[[UNIFIED_PITCH_I32:.*]] = llvm.trunc %[[UNIFIED_PITCH]] : i64 to i32 + // CHECK: %[[VAL_18:.*]] = llvm.intr.umax(%[[UNIFIED_PITCH_I32]], %[[BASE_WIDTH]]) : (i32, i32) -> i32 + // CHECK: %[[PITCH_IN_BYTES_I32:.*]] = llvm.trunc %[[VAL_18]] : i32 to i32 + + // CHECK: %[[UNIFIED_MASK:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[MASK]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8 + // CHECK: %[[UNIFIED_MASK_I1:.*]] = llvm.trunc %[[UNIFIED_MASK]] : i8 to i1 + // CHECK: %[[OFFSET_Y:.*]] = llvm.select %[[UNIFIED_MASK_I1]], %[[CST_0]], %[[BASE_HEIGHT]] : i1, i32 + // CHECK: %[[UNIFIED_BASE:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_13]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64 + // CHECK: %[[VAL_26:.*]] = llvm.inttoptr %[[UNIFIED_BASE]] : i64 to !llvm.ptr<1> + // CHECK: %[[VAL_27:.*]] = llvm.insertelement %[[CST_0]], {{.*}} : vector<2xi32> + // CHECK: %[[OFFSETS:.*]] = llvm.insertelement %[[OFFSET_Y]], {{.*}} : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i(%[[VAL_26]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32]], %[[OFFSETS]]) + + // CHECK: %[[VAL_29:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[MASK]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8 + // CHECK: %[[VAL_30:.*]] = llvm.trunc %[[VAL_29]] : i8 to i1 + // CHECK: %[[VAL_31:.*]] = llvm.select %[[VAL_30]], %[[CST_0]], %[[BASE_HEIGHT]] : i1, i32 + // CHECK: %[[VAL_32:.*]] = llvm.ptrtoint %[[ADDR_16]] : !llvm.ptr<1> to i64 + // CHECK: %[[VAL_33:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_32]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64 + // CHECK: %[[VAL_34:.*]] = llvm.inttoptr %[[VAL_33]] : i64 to !llvm.ptr<1> + // CHECK: %[[VAL_35:.*]] = llvm.insertelement %[[VAL_31]], {{.*}} : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i(%[[VAL_34]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32]], %[[VAL_35]]) + + // CHECK: %[[VAL_36:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[MASK]], %[[CST_0]]) 
{convergent, no_unwind, will_return} : (i8, i32) -> i8 + // CHECK: %[[VAL_37:.*]] = llvm.trunc %[[VAL_36]] : i8 to i1 + // CHECK: %[[VAL_38:.*]] = llvm.select %[[VAL_37]], %[[CST_0]], %[[BASE_HEIGHT]] : i1, i32 + // CHECK: %[[VAL_39:.*]] = llvm.ptrtoint %[[ADDR_32]] : !llvm.ptr<1> to i64 + // CHECK: %[[VAL_40:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_39]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64 + // CHECK: %[[VAL_41:.*]] = llvm.inttoptr %[[VAL_40]] : i64 to !llvm.ptr<1> + // CHECK: %[[VAL_42:.*]] = llvm.insertelement %[[VAL_38]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i(%[[VAL_41]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32]], %[[VAL_42]]) + + // CHECK: %[[VAL_43:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[MASK]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8 + // CHECK: %[[VAL_44:.*]] = llvm.trunc %[[VAL_43]] : i8 to i1 + // CHECK: %[[VAL_45:.*]] = llvm.select %[[VAL_44]], %[[CST_0]], %[[BASE_HEIGHT]] : i1, i32 + // CHECK: %[[VAL_46:.*]] = llvm.ptrtoint %[[ADDR_48]] : !llvm.ptr<1> to i64 + // CHECK: %[[VAL_47:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_46]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64 + // CHECK: %[[VAL_48:.*]] = llvm.inttoptr %[[VAL_47]] : i64 to !llvm.ptr<1> + // CHECK: %[[VAL_49:.*]] = llvm.insertelement %[[VAL_45]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i(%[[VAL_48]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32]], %[[VAL_49]]) + + %mask_tensor = arith.constant dense<1> : tensor<64x32xi1, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>> + triton_intel_gpu.prefetch %tensor_of_ptr, %mask_tensor {boundaryCheck = array, cache = 1 : i32, evict = 1 : i32, isVolatile = false, operandSegmentSizes = array, triton_intel_gpu.block_io = "row_major"} : tensor<64x32x!tt.ptr, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>> + + // CHECK-COUNT-4: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i + + triton_intel_gpu.prefetch %tensor_of_ptr {boundaryCheck = array, cache = 1 : i32, evict = 1 : i32, isVolatile = false, operandSegmentSizes = array, triton_intel_gpu.block_io = "row_major"} : tensor<64x32x!tt.ptr, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>> + tt.return } } diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index e0adf72365..25f8b0a759 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -328,9 +328,7 @@ struct PrefetchOpConversion Value ptr = op.getPtr(); if (isTensorPointerType(ptr.getType())) return rewriteTensorPointerPrefetch(op, adaptor, rewriter); - - llvm_unreachable("Unexpected prefetch operation on 'regular' ptr"); - return failure(); + return rewriteRegularPointerPrefetch(op, adaptor, rewriter); } LogicalResult @@ -369,6 +367,9 @@ struct PrefetchOpConversion // Swap the shape to make it row major and then get the tiling // size base on row major shape. 
std::swap(tensorShape[0], tensorShape[1]); + + tensorType = RankedTensorType::get( + tensorShape, tensorType.getElementType(), tensorType.getEncoding()); } unsigned numWarps = triton::gpu::lookupNumWarps(op); @@ -481,6 +482,222 @@ struct PrefetchOpConversion rewriter.eraseOp(op); return success(); } + + LogicalResult + rewriteRegularPointerPrefetch(triton::gpu::intel::PrefetchOp op, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + + Attribute blockIOAttr = + op->getAttr(TritonIntelGPUDialect::getBlockIOAttrName()); + if (!blockIOAttr) { + llvm_unreachable("Unexpected prefetch operation on unstructured memory " + "which may pollute the cache"); + return failure(); + } + + // Only support rank 2 block pointer, either row major or column major. + StringRef memoryLayoutInfo = cast(blockIOAttr).getValue(); + assert((memoryLayoutInfo == "row_major" || + memoryLayoutInfo == "column_major") && + "Only row_major or column_major is supported"); + + const bool memoryRowMajor = (memoryLayoutInfo == "row_major"); + + // TODO: To support more layouts on memory. + if (!memoryRowMajor) { + return failure(); + } + + Value ptr = op.getPtr(); + auto tensorOfPointers = cast(ptr.getType()); + + // TODO: To support more layouts in register. + if (!hasDotDpasEncoding(tensorOfPointers)) + return failure(); + + auto encoding = getDotEncoding(tensorOfPointers).value(); + auto dpasLayout = cast(encoding.getParent()); + auto warpsPerCTA = dpasLayout.getWarpsPerCTA(); + auto cluster = dpasLayout.getRepCluster(); + SmallVector repCluster{cluster.begin(), cluster.end()}; + auto tensorShape = tensorOfPointers.getShape(); + + DpasEncodingAttr::OpIdx opIdx; + auto getOpIdx = [&]() -> DpasEncodingAttr::OpIdx { + auto dotLayout = getDotEncoding(tensorOfPointers).value(); + return static_cast(dotLayout.getOpIdx()); + }; + opIdx = getOpIdx(); + + auto repetitions = dpasLayout.getDPASRepetitions(tensorShape, opIdx); + // getDPASRepetitions always return rank 3 size. + SmallVector numReps{repetitions.begin() + 1, repetitions.end()}; + SmallVector shardTensorShape; + if (opIdx == DpasEncodingAttr::OpIdx::OperandA) { + auto opAShape = dpasLayout.getShapeA(); + shardTensorShape = {std::min(tensorShape[0], opAShape[0]), + tensorShape[1]}; + warpsPerCTA[1] = 1; + repCluster[1] = 1; + numReps[1] = 1; + } else { + auto opBShape = dpasLayout.getShapeB(); + shardTensorShape = {tensorShape[0], + std::min(tensorShape[1], opBShape[1])}; + warpsPerCTA[0] = 1; + repCluster[0] = 1; + numReps[0] = 1; + } + + auto ptrType = cast(tensorOfPointers.getElementType()); + Type elementType = ptrType.getPointeeType(); + RankedTensorType tensorType = RankedTensorType::get( + shardTensorShape, elementType, tensorOfPointers.getEncoding()); + + SmallVector prefetchShape = + get2DPrefetchShapePerWarp(tensorType); + + Value mask = op.getMask(); + unsigned maskConstancyHor = std::numeric_limits::max(), + maskConstancyVer = std::numeric_limits::max(); + if (mask) { + if (auto maskTy = dyn_cast_or_null(mask.getType())) { + auto axisInfo = const_cast( + axisAnalysisPass) + .getAxisInfo(mask); + if (axisInfo) { + maskConstancyHor = axisInfo->getConstancy(1); + maskConstancyVer = axisInfo->getConstancy(0); + } else { + maskConstancyHor = 1; + maskConstancyVer = 1; + } + } + /*else { + // scalar mask. No need to check the constancy. 
+      }*/
+    }
+    prefetchShape = {std::min(prefetchShape[0], maskConstancyVer),
+                     std::min(prefetchShape[1], maskConstancyHor)};
+
+    SmallVector numPrefetchsPerRep = {
+        mlir::ceil(shardTensorShape[0], prefetchShape[0]),
+        mlir::ceil(shardTensorShape[1], prefetchShape[1])};
+
+    Type eltTy = tensorType.getElementType();
+    unsigned elemSizeInBits = eltTy.getIntOrFloatBitWidth();
+    unsigned tileWidthInElem = prefetchShape[1];
+    unsigned tileHeightInElem = prefetchShape[0];
+    unsigned vBlocks = 1;
+    switch (elemSizeInBits) {
+    case 8:
+      if (tileWidthInElem == 64) {
+        // OCL interface supports 8b_?r32x2c for 64 bytes per row of 8-bit
+        // elements.
+        vBlocks = 2;
+        tileWidthInElem = 32;
+      }
+      break;
+    case 16:
+      if (tileWidthInElem == 32) {
+        // OCL interface supports 16b_?r16x2c for 64 bytes per row of 16-bit
+        // elements.
+        vBlocks = 2;
+        tileWidthInElem = 16;
+      }
+      break;
+    }
+
+    auto mod = rewriter.getBlock()->getParent()->getParentOfType();
+    Location loc = op.getLoc();
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
+
+    std::map, Value> baseAddrs, masks;
+    Value llPtr = adaptor.getPtr();
+    Value llMask = adaptor.getMask();
+
+    SmallVector ptrElems, maskElems;
+    // Get the LLVM values for pointers
+    ptrElems = unpackLLElements(loc, llPtr, rewriter);
+    if (llMask) {
+      maskElems = unpackLLElements(loc, llMask, rewriter);
+    }
+
+    // re-arrange the baseAddrs and masks to for large 2D block IO.
+    // Layout is unrelated to the scalar type.
+    SmallVector> offsets =
+        mlir::emitOffsetForLayout(encoding, tensorOfPointers);
+    for (size_t i = 0; i < ptrElems.size(); ++i) {
+      SmallVector offset = offsets[i];
+      baseAddrs[offset] = ptrElems[i];
+      if (llMask && maskElems.size() > 1)
+        masks[offset] = maskElems[i];
+    }
+
+    Value base, baseWidth, baseHeight, rowStrideInBytes, colStride, offsetBaseX,
+        offsetBaseY;
+
+    baseWidth = b.i32_val(vBlocks * tileWidthInElem * (elemSizeInBits / 8));
+    baseHeight = b.i32_val(tileHeightInElem);
+    offsetBaseX = b.i32_val(0);
+    offsetBaseY = b.i32_val(0);
+    rowStrideInBytes = b.sub(b.ptrtoint(i64_ty, baseAddrs[{1, 0}]),
+                             b.ptrtoint(i64_ty, baseAddrs[{0, 0}]));
+    rowStrideInBytes =
+        targetInfo.shuffleIdx(rewriter, loc, rowStrideInBytes, 0);
+    rowStrideInBytes = b.umax(b.trunc(i32_ty, rowStrideInBytes), baseWidth);
+    rowStrideInBytes = b.trunc(i32_ty, rowStrideInBytes);
+
+    for (int row = 0; row < numReps[0]; ++row) {
+      for (int col = 0; col < numReps[1]; ++col) {
+        // Prefetch the data for each repetition.
+ for (int i = 0; i < numPrefetchsPerRep[0]; ++i) + for (int j = 0; j < numPrefetchsPerRep[1]; ++j) { + unsigned offsetN = col * warpsPerCTA[1] * shardTensorShape[1] + + j * prefetchShape[1]; + unsigned offsetM = row * warpsPerCTA[0] * shardTensorShape[0] + + i * prefetchShape[0]; + Value pred; + if (llMask) { + if (maskElems.size() > 1) { + pred = targetInfo.shuffleIdx(rewriter, loc, + masks[{offsetM, offsetN}], 0); + } else { + pred = maskElems[0]; + } + } else { + pred = b.int_val(1, 1); + } + Value offsetY = b.select(pred, b.i32_val(0), baseHeight); + auto addr = targetInfo.shuffleIdx(rewriter, loc, + baseAddrs[{offsetM, offsetN}], 0); + + auto newOp = rewriter.create( + loc, + /*ptr*/ addr, + /*base_width*/ baseWidth, + /*base_height*/ baseHeight, + /*base_pitch*/ rowStrideInBytes, + /*x*/ offsetBaseX, + /*y*/ offsetY, + /*elem_size_in_bits*/ elemSizeInBits, + /*tile_width*/ tileWidthInElem, + /*tile_height*/ tileHeightInElem, + /*v_blocks*/ vBlocks, + /*cache_opt*/ TritonGEN::LoadCacheControl::L1C_L3C); + if (failed(newOp.verify())) { + // Explicitly invoke verifier because `triton_gen` ops are + // immediately lowered further to a builtin call. + return failure(); + } + } + } + } + + rewriter.eraseOp(op); + return success(); + } }; struct LoadOpToBlockIOConversion From 6d4aad2a5c5d9cf07f2276385f80bc5b2abddf22 Mon Sep 17 00:00:00 2001 From: "Lu,Chengjun" Date: Mon, 10 Mar 2025 15:27:47 +0000 Subject: [PATCH 02/11] Support the tensor of pointer in the matmul loop pipelining. --- third_party/intel/backend/compiler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 5b146b0559..ba29779103 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -310,7 +310,8 @@ def make_ttgir(mod, metadata, opt, properties): intel.passes.ttgpuir.add_accelerate_matmul(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_materialize_block_pointer(pm) - intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False, XPUBackend.get_split_barrier_scope(opt)) + intel.passes.ttgpuir.add_remove_layout_conversions(pm) + intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, True, XPUBackend.get_split_barrier_scope(opt)) passes.ttgpuir.add_fuse_nested_loops(pm) passes.ttgpuir.add_optimize_thread_locality(pm) From 6fb6ced3c07845fdb265b3185e0b633c756182d3 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Thu, 24 Apr 2025 17:28:03 +0000 Subject: [PATCH 03/11] Fix failures Signed-off-by: Whitney Tsang --- .../intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 25f8b0a759..c48fcdd06c 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -518,7 +518,7 @@ struct PrefetchOpConversion auto encoding = getDotEncoding(tensorOfPointers).value(); auto dpasLayout = cast(encoding.getParent()); - auto warpsPerCTA = dpasLayout.getWarpsPerCTA(); + SmallVector warpsPerCTA(dpasLayout.getWarpsPerCTA()); auto cluster = dpasLayout.getRepCluster(); SmallVector repCluster{cluster.begin(), cluster.end()}; auto tensorShape = tensorOfPointers.getShape(); From d36a089a1ebc05cd1f07daf0605e7554a5aa0763 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Thu, 24 Apr 2025 17:58:21 
+0000 Subject: [PATCH 04/11] [TritonIntelGPUPipeline] Remove supportRegularPtr option Signed-off-by: Whitney Tsang --- test/TritonIntelGPU/loop-pipeline.mlir | 2 +- test/TritonIntelGPU/split-barrier.mlir | 4 ++-- third_party/intel/backend/compiler.py | 2 +- .../Dialect/TritonIntelGPU/Transforms/Passes.td | 3 --- .../Pipeliner/MatmulLoopPipeline.cpp | 11 ++--------- .../lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h | 1 - .../Pipeliner/SoftwarePipeliner.cpp | 8 ++++---- third_party/intel/triton_xpu.cc | 4 ++-- 8 files changed, 12 insertions(+), 23 deletions(-) diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir index 0cdf686a98..bb8a86f63e 100644 --- a/test/TritonIntelGPU/loop-pipeline.mlir +++ b/test/TritonIntelGPU/loop-pipeline.mlir @@ -1,4 +1,4 @@ -// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 support-regular-ptr=true" | FileCheck %s +// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3" | FileCheck %s // CHECK: #[[$BLOCK_0:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}> // CHECK: #[[$BLOCK_1:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}> diff --git a/test/TritonIntelGPU/split-barrier.mlir b/test/TritonIntelGPU/split-barrier.mlir index a2db6e5c93..b40bd446d9 100644 --- a/test/TritonIntelGPU/split-barrier.mlir +++ b/test/TritonIntelGPU/split-barrier.mlir @@ -1,5 +1,5 @@ -// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 support-regular-ptr=true split-barriers-scope=workgroup" | FileCheck %s --check-prefixes=CHECK,WORKGROUP_SCOPE -// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 support-regular-ptr=true split-barriers-scope=subgroup" | FileCheck %s --check-prefixes=CHECK,SUBGROUP_SCOPE +// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 split-barriers-scope=workgroup" | FileCheck %s --check-prefixes=CHECK,WORKGROUP_SCOPE +// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 split-barriers-scope=subgroup" | FileCheck %s --check-prefixes=CHECK,SUBGROUP_SCOPE // CHECK: #[[$BLOCK:.+]] = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 16], warpsPerCTA = [8, 4], order = [1, 0]}> // CHECK: #[[$DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index ba29779103..8ca0dba87a 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -311,7 +311,7 @@ def make_ttgir(mod, metadata, opt, properties): intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_materialize_block_pointer(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) - intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, True, XPUBackend.get_split_barrier_scope(opt)) + intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, XPUBackend.get_split_barrier_scope(opt)) passes.ttgpuir.add_fuse_nested_loops(pm) passes.ttgpuir.add_optimize_thread_locality(pm) diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td index b0734715c2..2acedddb91 100644 --- 
a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td +++ b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td @@ -122,9 +122,6 @@ def TritonIntelGPUPipeline : Pass<"tritonintelgpu-pipeline", "mlir::ModuleOp"> { Option<"numStages", "num-stages", "int32_t", /*default*/"3", "number of pipeline stages">, - Option<"supportRegularPtr", "support-regular-ptr", - "bool", /*default*/"false", - "Enable support for prefetching non-block pointers">, Option<"splitBarrierScope", "split-barriers-scope", "enum SplitBarrierScope", "SplitBarrierScope::None", "insert split barriers in a loop", diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp index e6bd165d3d..12d8dacd28 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp @@ -117,8 +117,7 @@ static std::optional loadDotOperand(tt::LoadOp loadOp) { /// Collect loads to pipeline. Return success if we can pipeline this loop. static void collectOpsToPipeline(scf::ForOp forOp, - SmallVectorImpl &loadOps, - bool supportRegularPtr) { + SmallVectorImpl &loadOps) { assert(loadOps.empty() && "Expecting an empty list of load operations"); ModuleOp moduleOp = forOp->getParentOfType(); @@ -128,11 +127,6 @@ static void collectOpsToPipeline(scf::ForOp forOp, // operations in the loop body block. for (Operation &op : forOp) { if (auto loadOp = dyn_cast(&op)) { - Value ptr = loadOp.getPtr(); - bool isBlockPtr = mlir::triton::isTensorPointerType(ptr.getType()); - if (!isBlockPtr && !supportRegularPtr) - continue; - // Check if the memory is structed densely. If not, we do not prefetch it // to avoid polluting the cache. Attribute blockIOAttr = @@ -303,12 +297,11 @@ createSchedule(scf::ForOp forOp, int numStages) { } bool ttgi::preProcessLoopAndGetSchedule(scf::ForOp &forOp, int numStages, - bool supportRegularPtr, mlir::scf::PipeliningOption &options) { // 1. First collect "interesting" operations with a stage where to schedule // them. This gives a coarse scheduling for the loop. 
SmallVector loads; - collectOpsToPipeline(forOp, loads, supportRegularPtr); + collectOpsToPipeline(forOp, loads); if (loads.empty()) { LDBG("No loads to pipeline"); return false; diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h index 9ef16b1d12..a5f29580d0 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h @@ -6,7 +6,6 @@ namespace mlir::triton::gpu::intel { bool preProcessLoopAndGetSchedule(scf::ForOp &forOp, int numStages, - bool supportRegularPtr, mlir::scf::PipeliningOption &options); } // namespace mlir::triton::gpu::intel diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp index db6c7fe37b..53ac3bf07a 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp @@ -39,14 +39,14 @@ static bool preCondition(scf::ForOp forOp) { } static void -pipelineLoop(scf::ForOp forOp, int numStages, bool supportRegularPtr, +pipelineLoop(scf::ForOp forOp, int numStages, std::optional barrierScope = std::nullopt) { mlir::scf::PipeliningOption options; if (!preCondition(forOp)) return; - bool foundSchedule = ttgi::preProcessLoopAndGetSchedule( - forOp, numStages, supportRegularPtr, options); + bool foundSchedule = + ttgi::preProcessLoopAndGetSchedule(forOp, numStages, options); if (!foundSchedule) return; @@ -108,7 +108,7 @@ struct IntelGPUPipelinePass getOperation()->walk([&](scf::ForOp forOp) { loops.push_back(forOp); }); for (scf::ForOp forOp : loops) { - pipelineLoop(forOp, numStages, supportRegularPtr, barrierScope); + pipelineLoop(forOp, numStages, barrierScope); } } }; diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index e80c59e27c..1e3cd76991 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -88,8 +88,8 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) { gpu::intel::createTritonIntelGPUAccelerateMatmul); ADD_PASS_WRAPPER_0("add_rewrite_stack_ptr", gpu::intel::createTritonIntelGPURewriteStackPtr); - ADD_PASS_WRAPPER_OPT_3("add_pipeline", - gpu::intel::createTritonIntelGPUPipeline, int, bool, + ADD_PASS_WRAPPER_OPT_2("add_pipeline", + gpu::intel::createTritonIntelGPUPipeline, int, enum gpu::intel::SplitBarrierScope); ADD_PASS_WRAPPER_0("add_remove_layout_conversions", gpu::intel::createTritonIntelGPURemoveLayoutConversions); From 78b26b8ef3d96e6a79627d8c1fa202161b1fb2a0 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Mon, 28 Apr 2025 00:28:59 +0000 Subject: [PATCH 05/11] address review comment --- .../LoadStoreOpToLLVM.cpp | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index c48fcdd06c..98d87127ad 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -530,30 +530,38 @@ struct PrefetchOpConversion }; opIdx = getOpIdx(); - auto repetitions = dpasLayout.getDPASRepetitions(tensorShape, opIdx); - // getDPASRepetitions always return rank 3 size. 
+    SmallVector repetitions =
+        dpasLayout.getDPASRepetitions(tensorShape, opIdx);
+    assert(repetitions.size() == 3 &&
+           "getDPASRepetitions always return rank 3 size");
     SmallVector numReps{repetitions.begin() + 1, repetitions.end()};
     SmallVector shardTensorShape;
-    if (opIdx == DpasEncodingAttr::OpIdx::OperandA) {
+    switch (opIdx) {
+    case DpasEncodingAttr::OpIdx::OperandA: {
       auto opAShape = dpasLayout.getShapeA();
       shardTensorShape = {std::min(tensorShape[0], opAShape[0]),
                           tensorShape[1]};
       warpsPerCTA[1] = 1;
       repCluster[1] = 1;
       numReps[1] = 1;
-    } else {
+    } break;
+    case DpasEncodingAttr::OpIdx::OperandB: {
       auto opBShape = dpasLayout.getShapeB();
       shardTensorShape = {tensorShape[0],
                           std::min(tensorShape[1], opBShape[1])};
       warpsPerCTA[0] = 1;
       repCluster[0] = 1;
       numReps[0] = 1;
+    } break;
+    case DpasEncodingAttr::OpIdx::OperandC: {
+      llvm_unreachable("unexpected OpIdx::OperandC");
+    } break;
     }
 
     auto ptrType = cast(tensorOfPointers.getElementType());
     Type elementType = ptrType.getPointeeType();
-    RankedTensorType tensorType = RankedTensorType::get(
-        shardTensorShape, elementType, tensorOfPointers.getEncoding());
+    auto tensorType = RankedTensorType::get(shardTensorShape, elementType,
+                                            tensorOfPointers.getEncoding());
 
     SmallVector prefetchShape =
         get2DPrefetchShapePerWarp(tensorType);
@@ -562,6 +570,7 @@ struct PrefetchOpConversion
     unsigned maskConstancyHor = std::numeric_limits::max(),
              maskConstancyVer = std::numeric_limits::max();
     if (mask) {
+      // No need to check the constancy of scalar mask.
       if (auto maskTy = dyn_cast_or_null(mask.getType())) {
         auto axisInfo = const_cast(
                             axisAnalysisPass)
                             .getAxisInfo(mask);
@@ -574,9 +583,6 @@ struct PrefetchOpConversion
           maskConstancyVer = 1;
         }
       }
-      /*else {
-        // scalar mask. No need to check the constancy.
-      }*/
     }
     prefetchShape = {std::min(prefetchShape[0], maskConstancyVer),
                      std::min(prefetchShape[1], maskConstancyHor)};

From e3d441a9df00bdd18a9ef22f03700559458a3034 Mon Sep 17 00:00:00 2001
From: "Tiotto, Ettore"
Date: Mon, 28 Apr 2025 21:52:35 +0000
Subject: [PATCH 06/11] Fix failing CI test

Signed-off-by: Tiotto, Ettore
---
 test/TritonIntelGPU/prefetch-to-llvm.mlir     | 13 ++++++++++++
 .../LoadStoreOpToLLVM.cpp                     | 21 +++++++++++++++----
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir
index b4d493b825..27f81be915 100644
--- a/test/TritonIntelGPU/prefetch-to-llvm.mlir
+++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir
@@ -156,3 +156,16 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
     tt.return
   }
 }
+
+// -----
+
+// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and that no 2D block prefetch operation is generated.
+#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [4, 1], A = [32, 8], B = [8, 16], C = [32, 16]}> +module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} { + // CHECK-LABEL: llvm.func spir_kernelcc @kernel + tt.func public @kernel(%arg0 : tensor<128x32x!tt.ptr, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>) { + // CHECK-NOT: intel_sub_group_2d_block_prefetch + triton_intel_gpu.prefetch %arg0 {boundaryCheck = array, cache = 1 : i32, evict = 1 : i32, isVolatile = false, operandSegmentSizes = array, triton_intel_gpu.block_io = "row_major"} : tensor<128x32x!tt.ptr, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + tt.return + } +} diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 98d87127ad..c647d7cd76 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -325,10 +325,18 @@ struct PrefetchOpConversion LogicalResult matchAndRewrite(triton::gpu::intel::PrefetchOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const final { - Value ptr = op.getPtr(); - if (isTensorPointerType(ptr.getType())) - return rewriteTensorPointerPrefetch(op, adaptor, rewriter); - return rewriteRegularPointerPrefetch(op, adaptor, rewriter); + LogicalResult res = + isTensorPointerType(op.getPtr().getType()) + ? rewriteTensorPointerPrefetch(op, adaptor, rewriter) + : rewriteRegularPointerPrefetch(op, adaptor, rewriter); + + // FIXME: the prefetch lowering code should never fail. Currently it does in + // some cases. We should address those cases instead of removing the + // prefetch operation. + if (failed(res)) + rewriter.eraseOp(op); + + return success(); } LogicalResult @@ -641,6 +649,11 @@ struct PrefetchOpConversion masks[offset] = maskElems[i]; } + // baseAddrs[{0, 0}] and baseAddrs[{1, 0}] are currently used to calculate + // the pitch. + if (baseAddrs.count({0, 0}) == 0 || baseAddrs.count({1, 0}) == 0) + return failure(); + Value base, baseWidth, baseHeight, rowStrideInBytes, colStride, offsetBaseX, offsetBaseY; From 41971ca8d795fa013e37abf2490d4cf8a8028a9e Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Tue, 29 Apr 2025 18:12:07 +0000 Subject: [PATCH 07/11] Only prefetch 2D loads Signed-off-by: Tiotto, Ettore --- .../Pipeliner/MatmulLoopPipeline.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp index 12d8dacd28..e9b335067f 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp @@ -127,8 +127,8 @@ static void collectOpsToPipeline(scf::ForOp forOp, // operations in the loop body block. for (Operation &op : forOp) { if (auto loadOp = dyn_cast(&op)) { - // Check if the memory is structed densely. If not, we do not prefetch it - // to avoid polluting the cache. + // Check if the memory is structured densely. If not, we do not prefetch + // it to avoid polluting the cache. 
Attribute blockIOAttr = loadOp->getAttr(mlir::triton::gpu::intel::TritonIntelGPUDialect:: getBlockIOAttrName()); @@ -137,6 +137,11 @@ static void collectOpsToPipeline(scf::ForOp forOp, continue; } + if (cast(loadOp.getType()).getRank() != 2) { + LDBG("Skipping LoadOp with non 2D tensor type" << *loadOp); + continue; + } + std::optional loadWithDotOperand = loadDotOperand(loadOp); if (loadWithDotOperand.has_value()) loadOps.push_back(loadWithDotOperand.value()); From 2bf17c7b1a41ecb4b094c0d8da0a01acfd758b04 Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Wed, 30 Apr 2025 17:36:31 +0000 Subject: [PATCH 08/11] Refactor LoadStoreOpToLLVM.cpp Signed-off-by: Tiotto, Ettore --- .../LoadStoreOpToLLVM.cpp | 118 ++++++++---------- 1 file changed, 50 insertions(+), 68 deletions(-) diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index f5e51d7980..4d3f245030 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -316,7 +316,7 @@ struct BlockIOConversionBase : public LoadStoreConversionBase { // Determine whether the given LoadOp can be lowered to using block IO // instructions. - bool isLoadCandidate(triton::LoadOp op) const { + static bool isLoadCandidate(triton::LoadOp op) { Attribute blockIOAttr = op->getAttr(TritonIntelGPUDialect::getBlockIOAttrName()); if (!blockIOAttr) @@ -332,7 +332,7 @@ struct BlockIOConversionBase : public LoadStoreConversionBase { std::enable_if_t::value, bool> = true> - bool isMemoryRowMajor(OpTy op) const { + static bool isMemoryRowMajor(OpTy op) { Attribute blockIOAttr = op->getAttr(TritonIntelGPUDialect::getBlockIOAttrName()); assert(blockIOAttr && "Expecting block IO attribute"); @@ -347,7 +347,7 @@ struct BlockIOConversionBase : public LoadStoreConversionBase { return memoryLayoutInfo == "row_major"; } - DpasEncodingAttr::OpIdx getOpIdx(RankedTensorType tensorTy) const { + static DpasEncodingAttr::OpIdx getOpIdx(RankedTensorType tensorTy) { if (hasDpasEncoding(tensorTy)) return DpasEncodingAttr::OpIdx::OperandC; @@ -356,7 +356,7 @@ struct BlockIOConversionBase : public LoadStoreConversionBase { return static_cast(dotLayout.getOpIdx()); } - DpasEncodingAttr getDpasLayout(RankedTensorType tensorTy) const { + static DpasEncodingAttr getDpasLayout(RankedTensorType tensorTy) { Attribute encoding = tensorTy.getEncoding(); return cast( hasDpasEncoding(tensorTy) @@ -544,14 +544,10 @@ struct PrefetchOpConversion rewriteRegularPointerPrefetch(triton::gpu::intel::PrefetchOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { - Attribute blockIOAttr = op->getAttr(TritonIntelGPUDialect::getBlockIOAttrName()); - if (!blockIOAttr) { - llvm_unreachable("Unexpected prefetch operation on unstructured memory " - "which may pollute the cache"); + if (!blockIOAttr) return failure(); - } // Only support rank 2 block pointer, either row major or column major. StringRef memoryLayoutInfo = cast(blockIOAttr).getValue(); @@ -562,50 +558,41 @@ struct PrefetchOpConversion const bool memoryRowMajor = (memoryLayoutInfo == "row_major"); // TODO: To support more layouts on memory. - if (!memoryRowMajor) { + if (!memoryRowMajor) return failure(); - } - - Value ptr = op.getPtr(); - auto tensorOfPointers = cast(ptr.getType()); - // TODO: To support more layouts in register. 
- if (!hasDotDpasEncoding(tensorOfPointers)) + auto tensorOfPointers = cast(op.getPtr().getType()); + std::optional encoding = + getDotEncoding(tensorOfPointers); + if (!encoding) return failure(); - auto encoding = getDotEncoding(tensorOfPointers).value(); - auto dpasLayout = cast(encoding.getParent()); + auto dpasLayout = cast(encoding->getParent()); SmallVector warpsPerCTA(dpasLayout.getWarpsPerCTA()); - auto cluster = dpasLayout.getRepCluster(); + ArrayRef cluster = dpasLayout.getRepCluster(); SmallVector repCluster{cluster.begin(), cluster.end()}; - auto tensorShape = tensorOfPointers.getShape(); - - DpasEncodingAttr::OpIdx opIdx; - auto getOpIdx = [&]() -> DpasEncodingAttr::OpIdx { - auto dotLayout = getDotEncoding(tensorOfPointers).value(); - return static_cast(dotLayout.getOpIdx()); - }; - opIdx = getOpIdx(); - + ArrayRef tensorShape = tensorOfPointers.getShape(); + DpasEncodingAttr::OpIdx opIdx = getOpIdx(tensorOfPointers); SmallVector repetitions = dpasLayout.getDPASRepetitions(tensorShape, opIdx); assert(repetitions.size() == 3 && "getDPASRepetitions always return rank 3 size"); SmallVector numReps{repetitions.begin() + 1, repetitions.end()}; + SmallVector shardTensorShape; switch (opIdx) { case DpasEncodingAttr::OpIdx::OperandA: { - auto opAShape = dpasLayout.getShapeA(); - shardTensorShape = {std::min(tensorShape[0], opAShape[0]), - tensorShape[1]}; + shardTensorShape = { + std::min(tensorShape[0], dpasLayout.getShapeA()[0]), + tensorShape[1]}; warpsPerCTA[1] = 1; repCluster[1] = 1; numReps[1] = 1; } break; case DpasEncodingAttr::OpIdx::OperandB: { - auto opBShape = dpasLayout.getShapeB(); - shardTensorShape = {tensorShape[0], - std::min(tensorShape[1], opBShape[1])}; + shardTensorShape = { + tensorShape[0], + std::min(tensorShape[1], dpasLayout.getShapeB()[1])}; warpsPerCTA[0] = 1; repCluster[0] = 1; numReps[0] = 1; @@ -620,27 +607,26 @@ struct PrefetchOpConversion auto tensorType = RankedTensorType::get(shardTensorShape, elementType, tensorOfPointers.getEncoding()); - SmallVector prefetchShape = - get2DPrefetchShapePerWarp(tensorType); - Value mask = op.getMask(); unsigned maskConstancyHor = std::numeric_limits::max(), maskConstancyVer = std::numeric_limits::max(); if (mask) { // No need to check the constancy of scalar mask. if (auto maskTy = dyn_cast_or_null(mask.getType())) { - auto axisInfo = const_cast( - axisAnalysisPass) - .getAxisInfo(mask); + maskConstancyHor = maskConstancyVer = 1; + AxisInfo *axisInfo = + const_cast( + axisAnalysisPass) + .getAxisInfo(mask); if (axisInfo) { maskConstancyHor = axisInfo->getConstancy(1); maskConstancyVer = axisInfo->getConstancy(0); - } else { - maskConstancyHor = 1; - maskConstancyVer = 1; } } } + + SmallVector prefetchShape = + get2DPrefetchShapePerWarp(tensorType); prefetchShape = {std::min(prefetchShape[0], maskConstancyVer), std::min(prefetchShape[1], maskConstancyHor)}; @@ -680,17 +666,16 @@ struct PrefetchOpConversion Value llPtr = adaptor.getPtr(); Value llMask = adaptor.getMask(); - SmallVector ptrElems, maskElems; // Get the LLVM values for pointers - ptrElems = unpackLLElements(loc, llPtr, rewriter); - if (llMask) { + SmallVector ptrElems = unpackLLElements(loc, llPtr, rewriter); + SmallVector maskElems; + if (llMask) maskElems = unpackLLElements(loc, llMask, rewriter); - } // re-arrange the baseAddrs and masks to for large 2D block IO. // Layout is unrelated to the scalar type. 
SmallVector> offsets = - mlir::emitOffsetForLayout(encoding, tensorOfPointers); + emitOffsetForLayout(*encoding, tensorOfPointers); for (size_t i = 0; i < ptrElems.size(); ++i) { SmallVector offset = offsets[i]; baseAddrs[offset] = ptrElems[i]; @@ -703,15 +688,13 @@ struct PrefetchOpConversion if (baseAddrs.count({0, 0}) == 0 || baseAddrs.count({1, 0}) == 0) return failure(); - Value base, baseWidth, baseHeight, rowStrideInBytes, colStride, offsetBaseX, - offsetBaseY; - - baseWidth = b.i32_val(vBlocks * tileWidthInElem * (elemSizeInBits / 8)); - baseHeight = b.i32_val(tileHeightInElem); - offsetBaseX = b.i32_val(0); - offsetBaseY = b.i32_val(0); - rowStrideInBytes = b.sub(b.ptrtoint(i64_ty, baseAddrs[{1, 0}]), - b.ptrtoint(i64_ty, baseAddrs[{0, 0}])); + Value baseWidth = + b.i32_val(vBlocks * tileWidthInElem * (elemSizeInBits / 8)); + Value baseHeight = b.i32_val(tileHeightInElem); + Value offsetBaseX = b.i32_val(0); + Value offsetBaseY = b.i32_val(0); + Value rowStrideInBytes = b.sub(b.ptrtoint(i64_ty, baseAddrs[{1, 0}]), + b.ptrtoint(i64_ty, baseAddrs[{0, 0}])); rowStrideInBytes = targetInfo.shuffleIdx(rewriter, loc, rowStrideInBytes, 0); rowStrideInBytes = b.umax(b.trunc(i32_ty, rowStrideInBytes), baseWidth); @@ -727,19 +710,18 @@ struct PrefetchOpConversion unsigned offsetM = row * warpsPerCTA[0] * shardTensorShape[0] + i * prefetchShape[0]; Value pred; - if (llMask) { - if (maskElems.size() > 1) { - pred = targetInfo.shuffleIdx(rewriter, loc, - masks[{offsetM, offsetN}], 0); - } else { - pred = maskElems[0]; - } - } else { + if (llMask) + pred = (maskElems.size() > 1) + ? targetInfo.shuffleIdx(rewriter, loc, + masks[{offsetM, offsetN}], 0) + : maskElems[0]; + + else pred = b.int_val(1, 1); - } + Value offsetY = b.select(pred, b.i32_val(0), baseHeight); - auto addr = targetInfo.shuffleIdx(rewriter, loc, - baseAddrs[{offsetM, offsetN}], 0); + Value addr = targetInfo.shuffleIdx( + rewriter, loc, baseAddrs[{offsetM, offsetN}], 0); auto newOp = rewriter.create( loc, From d28fefe5e6d24b5a61d1c12a6027b14bf1033e36 Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Thu, 1 May 2025 19:34:44 +0000 Subject: [PATCH 09/11] Fix failing gemm bmk Signed-off-by: Tiotto, Ettore --- test/TritonIntelGPU/prefetch-to-llvm.mlir | 80 +++++++++---------- .../LoadStoreOpToLLVM.cpp | 5 +- 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir index 27f81be915..ec1a684054 100644 --- a/test/TritonIntelGPU/prefetch-to-llvm.mlir +++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir @@ -1,5 +1,6 @@ // RUN: triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm --cse -canonicalize | FileCheck %s +// CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} // CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @prefetch_block_ptr( @@ -11,13 +12,14 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} %c0_i32 = arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 - // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 - // 
CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 - // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 + // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 + // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 @@ -25,10 +27,10 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], %[[CST_1]] : i32 // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: %[[ROW_MAJOR_BASE_WIDTH_I32:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 - // CHECK: %[[ROW_MAJOR_BASE_HEIGHT_I32:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[ROW_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 + // CHECK: %[[ROW_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 + // CHECK: %[[ROW_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 @@ -37,40 +39,38 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 - // CHECK: %[[OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 - // CHECK: %[[OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 - // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[OFFSETS:.*]] = llvm.insertelement %[[OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH_I32]], %[[ROW_MAJOR_BASE_HEIGHT_I32]], %[[PITCH]], %[[OFFSETS]]) {{.*}} + // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 + // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 + // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], 
%[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> - // COM: The memory layout is same for the column major memory and row major memory. The prefetch should be the same. - - // CHECK: %[[VAL_38:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {no_unwind, will_return} : () -> i32 - // CHECK: %[[VAL_39:.*]] = llvm.zext %[[VAL_38]] : i32 to i64 - // CHECK: %[[VAL_40:.*]] = llvm.trunc %[[VAL_39]] : i64 to i32 - // CHECK: %[[VAL_41:.*]] = llvm.urem %[[VAL_40]], %[[CST_1]] : i32 - // CHECK: %[[VAL_42:.*]] = llvm.udiv %[[VAL_40]], %[[CST_1]] : i32 - // CHECK: %[[VAL_43:.*]] = llvm.urem %[[VAL_42]], %[[CST_8]] : i32 - // CHECK: %[[VAL_44:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: %[[COLUM_MAJOR_BASE_WIDTH_I32:.*]] = llvm.trunc %[[VAL_44]] : i64 to i32 - // CHECK: %[[COLUM_MAJOR_BASE_HEIGHT_I32:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 - // CHECK: %[[VAL_47:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[COLUM_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_47]] : i64 to i32 - // CHECK: %[[VAL_49:.*]] = llvm.mul %[[VAL_41]], %[[CST_32]] : i32 + // CHECK: %[[VAL_32:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[VAL_33:.*]] = llvm.zext %[[VAL_32]] : i32 to i64 + // CHECK: %[[VAL_34:.*]] = llvm.trunc %[[VAL_33]] : i64 to i32 + // CHECK: %[[VAL_35:.*]] = llvm.urem %[[VAL_34]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_36:.*]] = llvm.udiv %[[VAL_34]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_37:.*]] = llvm.urem %[[VAL_36]], %[[CST_4]] : i32 + // CHECK: %[[VAL_38:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[COL_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_38]] : i64 to i32 + // CHECK: %[[COL_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_41:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[COL_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_41]] : i64 to i32 + // CHECK: %[[VAL_43:.*]] = llvm.mul %[[VAL_35]], %[[CST_16]] : i32 + // CHECK: %[[VAL_44:.*]] = llvm.add %[[VAL_43]], %[[CST_0]] : i32 + // CHECK: %[[VAL_45:.*]] = llvm.urem %[[VAL_44]], %[[CST_32]] : i32 + // CHECK: %[[VAL_46:.*]] = llvm.add %[[VAL_45]], %[[CST_0]] : i32 + // CHECK: %[[VAL_47:.*]] = llvm.mul %[[VAL_37]], %[[CST_4]] : i32 + // CHECK: %[[VAL_48:.*]] = llvm.add %[[VAL_47]], %[[CST_0]] : i32 + // CHECK: %[[VAL_49:.*]] = llvm.urem %[[VAL_48]], %[[CST_16]] : i32 // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 - // CHECK: %[[VAL_51:.*]] = llvm.urem %[[VAL_50]], %[[CST_32]] : i32 - // CHECK: %[[VAL_52:.*]] = llvm.add %[[VAL_51]], %[[CST_0]] : i32 - // CHECK: %[[VAL_53:.*]] = llvm.mul %[[VAL_43]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_54:.*]] = llvm.add %[[VAL_53]], %[[CST_0]] : i32 - // CHECK: %[[VAL_55:.*]] = llvm.urem %[[VAL_54]], %[[CST_16]] : i32 - // CHECK: %[[VAL_56:.*]] = llvm.add %[[VAL_55]], %[[CST_0]] : i32 - // CHECK: %[[VAL_57:.*]] = llvm.trunc %[[VAL_56]] : i32 to i32 - // CHECK: %[[VAL_58:.*]] = llvm.trunc %[[VAL_52]] : i32 to i32 - // CHECK: %[[VAL_59:.*]] = llvm.insertelement %[[VAL_58]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[COLUM_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[VAL_57]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], 
%[[COLUM_MAJOR_BASE_WIDTH_I32]], %[[COLUM_MAJOR_BASE_HEIGHT_I32]], %[[COLUM_MAJOR_PITCH]], %[[COLUM_MAJOR_OFFSETS]]) {{.*}} + // CHECK: %[[COL_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_50]] : i32 to i32 + // CHECK: %[[COL_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_46]] : i32 to i32 + // CHECK: %[[VAL_54:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[COL_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(%[[BASE]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSETS]]) {{.*}} %columnMajorPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%c1_i64, %arg5], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "column_major"} : !tt.ptr> diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 4d3f245030..e0e4564cb4 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -425,8 +425,9 @@ struct PrefetchOpConversion // size base on row major shape. std::swap(tensorShape[0], tensorShape[1]); - tensorType = RankedTensorType::get( - tensorShape, tensorType.getElementType(), tensorType.getEncoding()); + // tensorType = RankedTensorType::get( + // tensorShape, tensorType.getElementType(), + // tensorType.getEncoding()); } unsigned numWarps = triton::gpu::lookupNumWarps(op); From 95f71264d9c75e5a0bd95b8731e2ecb8019d63bd Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Fri, 2 May 2025 16:03:36 +0000 Subject: [PATCH 10/11] Fix merge Signed-off-by: Tiotto, Ettore --- test/TritonIntelGPU/prefetch-to-llvm.mlir | 126 ++++++++---------- third_party/intel/backend/compiler.py | 2 +- .../LoadStoreOpToLLVM.cpp | 5 +- 3 files changed, 60 insertions(+), 73 deletions(-) diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir index bfa9d1bd98..f02818a894 100644 --- a/test/TritonIntelGPU/prefetch-to-llvm.mlir +++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir @@ -12,79 +12,69 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} %c0_i32 = arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 - // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 - // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() - // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 - // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 - // CHECK: %[[VAL_18:.*]] = llvm.urem %[[VAL_17]], %[[CST_1]] : i32 - // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], %[[CST_1]] : i32 - // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 - // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: 
%[[ROW_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 - // CHECK: %[[ROW_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 - // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[ROW_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 - // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 - // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 - // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 - // CHECK: %[[VAL_29:.*]] = llvm.add %[[VAL_28]], %[[CST_0]] : i32 - // CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_20]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 - // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 - // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 - // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 - // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 -<<<<<<< HEAD - // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], %[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) + // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 + // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 + // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 + // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 + // CHECK: %[[VAL_18:.*]] = llvm.urem %[[VAL_17]], %[[CST_1]] : i32 + // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], %[[CST_1]] : i32 + // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 + // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[ROW_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 + // CHECK: %[[ROW_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[ROW_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 + // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 + // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 + // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 + // CHECK: %[[VAL_29:.*]] = llvm.add %[[VAL_28]], %[[CST_0]] : i32 + // CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_20]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 + // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 + // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 + // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 + // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 + // CHECK: %[[VAL_36:.*]] = llvm.insertelement 
%[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], %[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> -======= - - // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], %[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) - %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > - triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> - ->>>>>>> origin/main - // CHECK: %[[VAL_32:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() - // CHECK: %[[VAL_33:.*]] = llvm.zext %[[VAL_32]] : i32 to i64 - // CHECK: %[[VAL_34:.*]] = llvm.trunc %[[VAL_33]] : i64 to i32 - // CHECK: %[[VAL_35:.*]] = llvm.urem %[[VAL_34]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_36:.*]] = llvm.udiv %[[VAL_34]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_37:.*]] = llvm.urem %[[VAL_36]], %[[CST_4]] : i32 - // CHECK: %[[VAL_38:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: %[[COL_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_38]] : i64 to i32 - // CHECK: %[[COL_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 - // CHECK: %[[VAL_41:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[COL_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_41]] : i64 to i32 - // CHECK: %[[VAL_43:.*]] = llvm.mul %[[VAL_35]], %[[CST_16]] : i32 - // CHECK: %[[VAL_44:.*]] = llvm.add %[[VAL_43]], %[[CST_0]] : i32 - // CHECK: %[[VAL_45:.*]] = llvm.urem %[[VAL_44]], %[[CST_32]] : i32 - // CHECK: %[[VAL_46:.*]] = llvm.add %[[VAL_45]], %[[CST_0]] : i32 - // CHECK: %[[VAL_47:.*]] = llvm.mul %[[VAL_37]], %[[CST_4]] : i32 - // CHECK: %[[VAL_48:.*]] = llvm.add %[[VAL_47]], %[[CST_0]] : i32 - // CHECK: %[[VAL_49:.*]] = llvm.urem %[[VAL_48]], %[[CST_16]] : i32 - // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 - // CHECK: %[[COL_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_50]] : i32 to i32 - // CHECK: %[[COL_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_46]] : i32 to i32 - // CHECK: %[[VAL_54:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[COL_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(%[[BASE]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSETS]]) {{.*}} + // CHECK: %[[VAL_32:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[VAL_33:.*]] = llvm.zext %[[VAL_32]] : i32 to i64 + // CHECK: %[[VAL_34:.*]] = llvm.trunc %[[VAL_33]] : i64 to i32 + // CHECK: %[[VAL_35:.*]] = llvm.urem 
%[[VAL_34]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_36:.*]] = llvm.udiv %[[VAL_34]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_37:.*]] = llvm.urem %[[VAL_36]], %[[CST_4]] : i32 + // CHECK: %[[VAL_38:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[COL_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_38]] : i64 to i32 + // CHECK: %[[COL_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_41:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[COL_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_41]] : i64 to i32 + // CHECK: %[[VAL_43:.*]] = llvm.mul %[[VAL_35]], %[[CST_16]] : i32 + // CHECK: %[[VAL_44:.*]] = llvm.add %[[VAL_43]], %[[CST_0]] : i32 + // CHECK: %[[VAL_45:.*]] = llvm.urem %[[VAL_44]], %[[CST_32]] : i32 + // CHECK: %[[VAL_46:.*]] = llvm.add %[[VAL_45]], %[[CST_0]] : i32 + // CHECK: %[[VAL_47:.*]] = llvm.mul %[[VAL_37]], %[[CST_4]] : i32 + // CHECK: %[[VAL_48:.*]] = llvm.add %[[VAL_47]], %[[CST_0]] : i32 + // CHECK: %[[VAL_49:.*]] = llvm.urem %[[VAL_48]], %[[CST_16]] : i32 + // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 + // CHECK: %[[COL_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_50]] : i32 to i32 + // CHECK: %[[COL_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_46]] : i32 to i32 + // CHECK: %[[VAL_54:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[COL_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(%[[BASE]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSETS]]) {{.*}} %columnMajorPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%c1_i64, %arg5], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "column_major"} : !tt.ptr> - // COM: The memory is not structured densely. Not to prefetch it to the cache. + // COM: The memory is not structured densely. Ensure it is not prefetched to the cache. // CHECK-NOT: block_prefetch %nonContiguousPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%arg5, %arg5], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %nonContiguousPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> @@ -169,7 +159,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} // ----- -// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and not 2D block prefetch operation gets generated. +// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and 2D block prefetch operations aren't generated. 
#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [4, 1], A = [32, 8], B = [8, 16], C = [32, 16]}> module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @kernel diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 8ca0dba87a..0fad28fd9e 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -310,7 +310,7 @@ def make_ttgir(mod, metadata, opt, properties): intel.passes.ttgpuir.add_accelerate_matmul(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_materialize_block_pointer(pm) - intel.passes.ttgpuir.add_remove_layout_conversions(pm) + ## intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, XPUBackend.get_split_barrier_scope(opt)) passes.ttgpuir.add_fuse_nested_loops(pm) diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 616520c3ec..d29841b5f2 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -427,10 +427,6 @@ struct PrefetchOpConversion // Swap the shape to make it row major and then get the tiling // size base on row major shape. std::swap(tensorShape[0], tensorShape[1]); - - // tensorType = RankedTensorType::get( - // tensorShape, tensorType.getElementType(), - // tensorType.getEncoding()); } unsigned numWarps = triton::gpu::lookupNumWarps(op); @@ -713,6 +709,7 @@ struct PrefetchOpConversion j * prefetchShape[1]; unsigned offsetM = row * warpsPerCTA[0] * shardTensorShape[0] + i * prefetchShape[0]; + Value pred; if (llMask) pred = (maskElems.size() > 1) From cad0433e1b4b6f7b1308a57e056476529a835e4a Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Fri, 2 May 2025 19:33:41 +0000 Subject: [PATCH 11/11] recover original remaining changes --- test/TritonIntelGPU/prefetch-to-llvm.mlir | 119 +++++++++--------- third_party/intel/backend/compiler.py | 2 +- .../LoadStoreOpToLLVM.cpp | 3 + 3 files changed, 64 insertions(+), 60 deletions(-) diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir index f02818a894..ce802dbcec 100644 --- a/test/TritonIntelGPU/prefetch-to-llvm.mlir +++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir @@ -1,6 +1,5 @@ // RUN: triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm --cse -canonicalize | FileCheck %s -// CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} // CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @prefetch_block_ptr( @@ -12,69 +11,71 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} %c0_i32 = 
arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 - // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 - // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() - // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 - // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 - // CHECK: %[[VAL_18:.*]] = llvm.urem %[[VAL_17]], %[[CST_1]] : i32 - // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], %[[CST_1]] : i32 - // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 - // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: %[[ROW_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 - // CHECK: %[[ROW_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 - // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[ROW_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 - // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 - // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 - // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 - // CHECK: %[[VAL_29:.*]] = llvm.add %[[VAL_28]], %[[CST_0]] : i32 - // CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_20]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 - // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 - // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 - // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 - // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 - // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], %[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 + // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 + // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 + // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 + // CHECK: %[[VAL_18:.*]] = llvm.urem %[[VAL_17]], %[[CST_1]] : i32 + // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], %[[CST_1]] : i32 + // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 + // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[ROW_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 + 
// CHECK: %[[ROW_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[ROW_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 + // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 + // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 + // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 + // CHECK: %[[VAL_29:.*]] = llvm.add %[[VAL_28]], %[[CST_0]] : i32 + // CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_20]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 + // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 + // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 + // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 + // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 + + // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], %[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> - // CHECK: %[[VAL_32:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() - // CHECK: %[[VAL_33:.*]] = llvm.zext %[[VAL_32]] : i32 to i64 - // CHECK: %[[VAL_34:.*]] = llvm.trunc %[[VAL_33]] : i64 to i32 - // CHECK: %[[VAL_35:.*]] = llvm.urem %[[VAL_34]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_36:.*]] = llvm.udiv %[[VAL_34]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_37:.*]] = llvm.urem %[[VAL_36]], %[[CST_4]] : i32 - // CHECK: %[[VAL_38:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: %[[COL_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_38]] : i64 to i32 - // CHECK: %[[COL_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 - // CHECK: %[[VAL_41:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[COL_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_41]] : i64 to i32 - // CHECK: %[[VAL_43:.*]] = llvm.mul %[[VAL_35]], %[[CST_16]] : i32 - // CHECK: %[[VAL_44:.*]] = llvm.add %[[VAL_43]], %[[CST_0]] : i32 - // CHECK: %[[VAL_45:.*]] = llvm.urem %[[VAL_44]], %[[CST_32]] : i32 - // CHECK: %[[VAL_46:.*]] = llvm.add %[[VAL_45]], %[[CST_0]] : i32 - // CHECK: %[[VAL_47:.*]] = llvm.mul %[[VAL_37]], %[[CST_4]] : i32 - // CHECK: %[[VAL_48:.*]] = llvm.add %[[VAL_47]], %[[CST_0]] : i32 - // CHECK: %[[VAL_49:.*]] = llvm.urem %[[VAL_48]], %[[CST_16]] : i32 - // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 - // CHECK: %[[COL_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_50]] : i32 to i32 - // CHECK: %[[COL_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_46]] : i32 to i32 - // CHECK: %[[VAL_54:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[COL_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(%[[BASE]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], 
%[[COL_MAJOR_OFFSETS]]) {{.*}} + // COM: The memory layout is same for the column major memory and row major memory. The prefetch should be the same. + + // CHECK: %[[VAL_38:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {no_unwind, will_return} : () -> i32 + // CHECK: %[[VAL_39:.*]] = llvm.zext %[[VAL_38]] : i32 to i64 + // CHECK: %[[VAL_40:.*]] = llvm.trunc %[[VAL_39]] : i64 to i32 + // CHECK: %[[VAL_41:.*]] = llvm.urem %[[VAL_40]], %[[CST_1]] : i32 + // CHECK: %[[VAL_42:.*]] = llvm.udiv %[[VAL_40]], %[[CST_1]] : i32 + // CHECK: %[[VAL_43:.*]] = llvm.urem %[[VAL_42]], %[[CST_8]] : i32 + // CHECK: %[[VAL_44:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[COLUM_MAJOR_BASE_WIDTH_I32:.*]] = llvm.trunc %[[VAL_44]] : i64 to i32 + // CHECK: %[[COLUM_MAJOR_BASE_HEIGHT_I32:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_47:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[COLUM_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_47]] : i64 to i32 + // CHECK: %[[VAL_49:.*]] = llvm.mul %[[VAL_41]], %[[CST_32]] : i32 + // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 + // CHECK: %[[VAL_51:.*]] = llvm.urem %[[VAL_50]], %[[CST_32]] : i32 + // CHECK: %[[VAL_52:.*]] = llvm.add %[[VAL_51]], %[[CST_0]] : i32 + // CHECK: %[[VAL_53:.*]] = llvm.mul %[[VAL_43]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_54:.*]] = llvm.add %[[VAL_53]], %[[CST_0]] : i32 + // CHECK: %[[VAL_55:.*]] = llvm.urem %[[VAL_54]], %[[CST_16]] : i32 + // CHECK: %[[VAL_56:.*]] = llvm.add %[[VAL_55]], %[[CST_0]] : i32 + // CHECK: %[[VAL_57:.*]] = llvm.trunc %[[VAL_56]] : i32 to i32 + // CHECK: %[[VAL_58:.*]] = llvm.trunc %[[VAL_52]] : i32 to i32 + // CHECK: %[[VAL_59:.*]] = llvm.insertelement %[[VAL_58]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[COLUM_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[VAL_57]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[COLUM_MAJOR_BASE_WIDTH_I32]], %[[COLUM_MAJOR_BASE_HEIGHT_I32]], %[[COLUM_MAJOR_PITCH]], %[[COLUM_MAJOR_OFFSETS]]) {{.*}} %columnMajorPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%c1_i64, %arg5], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "column_major"} : !tt.ptr> - // COM: The memory is not structured densely. Ensure it is not prefetched to the cache. + // COM: The memory is not structured densely. Not to prefetch it to the cache. // CHECK-NOT: block_prefetch %nonContiguousPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%arg5, %arg5], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %nonContiguousPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> @@ -159,7 +160,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} // ----- -// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and 2D block prefetch operations aren't generated. +// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and not 2D block prefetch operation gets generated. 
#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [4, 1], A = [32, 8], B = [8, 16], C = [32, 16]}> module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @kernel diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 0fad28fd9e..8ca0dba87a 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -310,7 +310,7 @@ def make_ttgir(mod, metadata, opt, properties): intel.passes.ttgpuir.add_accelerate_matmul(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_materialize_block_pointer(pm) - ## intel.passes.ttgpuir.add_remove_layout_conversions(pm) + intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, XPUBackend.get_split_barrier_scope(opt)) passes.ttgpuir.add_fuse_nested_loops(pm) diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 9e31a33b4a..a6903c4a07 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -427,6 +427,9 @@ struct PrefetchOpConversion // Swap the shape to make it row major and then get the tiling // size base on row major shape. std::swap(tensorShape[0], tensorShape[1]); + tensorType = RankedTensorType::get( + tensorShape, tensorType.getElementType(), + tensorType.getEncoding()); } unsigned numWarps = triton::gpu::lookupNumWarps(op);
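
The mask handling added to PrefetchOpConversion in this series clamps the per-warp 2D prefetch tile by the mask's constancy reported by axis analysis, so that a single predicate stays valid for the whole block being prefetched. Below is a minimal standalone sketch of that idea; it is not part of the patch, and every name in it (MaskConstancy, clampPrefetchShape) is illustrative rather than taken from the Triton code base.

// Illustrative sketch only (not part of the patch). It mirrors the intent of
// the mask handling in PrefetchOpConversion: the per-warp 2D prefetch tile is
// clamped so that every element of the tile shares the same mask value.
#include <algorithm>
#include <array>
#include <cstdint>
#include <limits>
#include <optional>

// Constancy of the mask along {dim0 (rows), dim1 (cols)}; std::nullopt means
// the mask is scalar (or absent), so it imposes no limit on the tile shape.
using MaskConstancy = std::optional<std::array<uint64_t, 2>>;

std::array<uint64_t, 2> clampPrefetchShape(std::array<uint64_t, 2> shape,
                                           const MaskConstancy &constancy) {
  uint64_t ver = std::numeric_limits<uint64_t>::max();
  uint64_t hor = std::numeric_limits<uint64_t>::max();
  if (constancy) {
    ver = (*constancy)[0]; // rows over which the mask is uniform
    hor = (*constancy)[1]; // cols over which the mask is uniform
  }
  return {std::min(shape[0], ver), std::min(shape[1], hor)};
}

For instance, a 16x32 per-warp tile combined with a tensor mask whose constancy is {8, 32} would be clamped to an 8x32 tile, consistent with the prefetchShape = {min(prefetchShape[0], maskConstancyVer), min(prefetchShape[1], maskConstancyHor)} update shown earlier in the series.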