diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir
index 3624fcd166..4635fedea1 100644
--- a/test/TritonIntelGPU/loop-pipeline.mlir
+++ b/test/TritonIntelGPU/loop-pipeline.mlir
@@ -100,11 +100,11 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
       %74 = tt.splat %73 : i32 -> tensor<1x32xi32, #blocked>
       %75 = arith.cmpi slt, %33, %74 : tensor<1x32xi32, #blocked>
       %76 = tt.broadcast %75 : tensor<1x32xi1, #blocked> -> tensor<64x32xi1, #blocked>
-      %77 = tt.load %arg11, %76, %cst_0 : tensor<64x32x!tt.ptr<f16>, #blocked>
+      %77 = tt.load %arg11, %76, %cst_0 {triton_intel_gpu.block_io = "row_major"} : tensor<64x32x!tt.ptr<f16>, #blocked>
       %78 = tt.splat %73 : i32 -> tensor<32x1xi32, #blocked1>
       %79 = arith.cmpi slt, %40, %78 : tensor<32x1xi32, #blocked1>
       %80 = tt.broadcast %79 : tensor<32x1xi1, #blocked1> -> tensor<32x256xi1, #blocked1>
-      %81 = tt.load %arg12, %80, %cst_1 : tensor<32x256x!tt.ptr<f16>, #blocked1>
+      %81 = tt.load %arg12, %80, %cst_1 {triton_intel_gpu.block_io = "row_major"} : tensor<32x256x!tt.ptr<f16>, #blocked1>
       %82 = ttg.convert_layout %77 : tensor<64x32xf16, #blocked> -> tensor<64x32xf16, #dot0>
       %83 = ttg.convert_layout %81 : tensor<32x256xf16, #blocked1> -> tensor<32x256xf16, #dot1>
       %84 = tt.dot %82, %83, %arg10, inputPrecision = tf32 : tensor<64x32xf16, #dot0> * tensor<32x256xf16, #dot1> -> tensor<64x256xf32, #dpas>
@@ -175,8 +175,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // CHECK: tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
     // CHECK-NEXT: scf.yield
     %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c64_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>) : i32 {
-      %56 = tt.load %arg11 {boundaryCheck = array} : !tt.ptr<tensor<128x64xf16, #dot0>>
-      %57 = tt.load %arg12 {boundaryCheck = array} : !tt.ptr<tensor<64x256xf16, #dot1>>
+      %56 = tt.load %arg11 {boundaryCheck = array, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<128x64xf16, #dot0>>
+      %57 = tt.load %arg12 {boundaryCheck = array, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<64x256xf16, #dot1>>
       %58 = tt.dot %56, %57, %arg10, inputPrecision = tf32 : tensor<128x64xf16, #dot0> * tensor<64x256xf16, #dot1> -> tensor<128x256xf32, #dpas>
       %59 = tt.advance %arg11, [%c0_i32, %c64_i32] : <tensor<128x64xf16, #dot0>>
       %60 = tt.advance %arg12, [%c64_i32, %c0_i32] : <tensor<64x256xf16, #dot1>>
@@ -248,8 +248,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // CHECK: tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
     // CHECK-NEXT: scf.yield
     %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c64_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>) : i32 {
-      %56 = tt.load %arg11 {boundaryCheck = array} : !tt.ptr<tensor<128x64xf16, #dot0>>
-      %57 = tt.load %arg12 {boundaryCheck = array} : !tt.ptr<tensor<64x256xf16, #dot1>>
+      %56 = tt.load %arg11 {boundaryCheck = array, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<128x64xf16, #dot0>>
+      %57 = tt.load %arg12 {boundaryCheck = array, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<64x256xf16, #dot1>>
       %58 = tt.dot %56, %57, %arg10, inputPrecision = tf32 : tensor<128x64xf16, #dot0> * tensor<64x256xf16, #dot1> -> tensor<128x256xf32, #dpas>
       %102 = tt.addptr %arg8, %c4_i32 : !tt.ptr, i32
       %100 = arith.addi %c0_i32, %c4_i32 : i32
diff --git a/test/TritonIntelGPU/split-barrier.mlir b/test/TritonIntelGPU/split-barrier.mlir
index 9edb69774c..db559c3e8b 100644
--- a/test/TritonIntelGPU/split-barrier.mlir
+++ b/test/TritonIntelGPU/split-barrier.mlir
@@ -33,8 +33,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // CHECK-NEXT: scf.yield
     %23:3 = scf.for %arg2 = %c0_i32 to %c64_i32 step %c64_i32 iter_args(%arg3 = %cst, %arg4 = %18, %arg5 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>) : i32 {
       %55:3 = scf.for %arg9 = %c0_i32 to %c64_i32 step %c64_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>) : i32 {
-        %56 = tt.load %arg11 {boundaryCheck = array} : !tt.ptr<tensor<128x64xf16, #dot0>>
-        %57 = tt.load %arg12 {boundaryCheck = array} : !tt.ptr<tensor<64x256xf16, #dot1>>
+        %56 = tt.load %arg11 {boundaryCheck = array, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<128x64xf16, #dot0>>
+        %57 = tt.load %arg12 {boundaryCheck = array, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<64x256xf16, #dot1>>
         %58 = tt.dot %56, %57, %arg10, inputPrecision = tf32 : tensor<128x64xf16, #dot0> * tensor<64x256xf16, #dot1> -> tensor<128x256xf32, #dpas>
         %59 = tt.advance %arg11, [%c0_i32, %c64_i32] : <tensor<128x64xf16, #dot0>>
         %60 = tt.advance %arg12, [%c64_i32, %c0_i32] : <tensor<64x256xf16, #dot1>>
@@ -79,8 +79,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // SUBGROUP_SCOPE: spirv.INTEL.ControlBarrierWait
     // CHECK-NEXT: scf.yield
     %23:3 = scf.for %arg9 = %c0_i32 to %c64_i32 step %c64_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>) : i32 {
-      %56 = tt.load %arg11 {boundaryCheck = array} : !tt.ptr<tensor<128x64xf16, #dot0>>
-      %57 = tt.load %arg12 {boundaryCheck = array} : !tt.ptr<tensor<64x256xf16, #dot1>>
+      %56 = tt.load %arg11 {boundaryCheck = array, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<128x64xf16, #dot0>>
+      %57 = tt.load %arg12 {boundaryCheck = array, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<64x256xf16, #dot1>>
       %58 = tt.dot %56, %57, %arg10, inputPrecision = tf32 : tensor<128x64xf16, #dot0> * tensor<64x256xf16, #dot1> -> tensor<128x256xf32, #dpas>
       %59 = tt.advance %arg11, [%c0_i32, %c64_i32] : <tensor<128x64xf16, #dot0>>
       %60 = tt.advance %arg12, [%c64_i32, %c0_i32] : <tensor<64x256xf16, #dot1>>
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
index 2b6742fb42..1788ba753b 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
@@ -132,6 +132,16 @@ static void collectOpsToPipeline(scf::ForOp forOp,
     if (!isBlockPtr && !supportRegularPtr)
       continue;
 
+    // Check if the memory is structured densely. If not, we do not prefetch it
+    // to avoid polluting the cache.
+    Attribute blockIOAttr =
+        loadOp->getAttr(mlir::triton::gpu::intel::TritonIntelGPUDialect::
+                            getBlockIOAttrName());
+    if (!blockIOAttr) {
+      LDBG("Skipping LoadOp without block_io attribute" << *loadOp);
+      continue;
+    }
+
     std::optional<LoadDotOperand> loadWithDotOperand = loadDotOperand(loadOp);
     if (loadWithDotOperand.has_value())
       loadOps.push_back(loadWithDotOperand.value());
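
Note on the MatmulLoopPipeline.cpp hunk above: collectOpsToPipeline now keeps only tt.load ops that carry the triton_intel_gpu.block_io attribute, so loads whose memory is not known to be densely structured are no longer prefetched (avoiding cache pollution); this is why every pipelined load in the updated tests is annotated with block_io = "row_major". The snippet below is an illustrative sketch only, with assumed SSA names, shapes, and layouts rather than values taken from the tests:

    // Kept as a pipelining/prefetch candidate: annotated as row-major block IO.
    %a = tt.load %ptr_a, %mask_a, %other_a {triton_intel_gpu.block_io = "row_major"} : tensor<64x32x!tt.ptr<f16>, #blocked>
    // Skipped by collectOpsToPipeline: no block_io attribute on the load.
    %b = tt.load %ptr_b, %mask_b, %other_b : tensor<64x32x!tt.ptr<f16>, #blocked>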