From 9096ffe844676bfe814aaf63d33ef8691cc11506 Mon Sep 17 00:00:00 2001 From: "Lu,Chengjun" Date: Mon, 10 Mar 2025 15:23:11 +0000 Subject: [PATCH 01/11] Support tensor of pointer as the pointer parameter of the prefetching operation. Add a mask operand for boundary check. --- test/TritonIntelGPU/prefetch-to-llvm.mlir | 213 ++++++++++++----- .../LoadStoreOpToLLVM.cpp | 223 +++++++++++++++++- 2 files changed, 368 insertions(+), 68 deletions(-) diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir index 082f75d5fd..b4d493b825 100644 --- a/test/TritonIntelGPU/prefetch-to-llvm.mlir +++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir @@ -1,75 +1,158 @@ -// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm +// RUN: triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm --cse -canonicalize | FileCheck %s -// CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} -// CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x2cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} +// CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} { - tt.func public @matmul_with_prefetch(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64) { - // CHECK-LABEL: @matmul_with_prefetch +// CHECK-LABEL: llvm.func spir_kernelcc @prefetch_block_ptr( +// CHECK-SAME: %[[BASE:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !llvm.ptr<1>, +// CHECK-SAME: %[[BASE_HEIGHT:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i64, +// CHECK-SAME: %[[BASE_WIDTH:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i64, +// CHECK-SAME: %[[ROW_STRIDE:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i64) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { + tt.func public @prefetch_block_ptr(%arg0: !tt.ptr, %arg2: i64, %arg4: i64, %arg5: i64) { %c0_i32 = arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 - // CHECK: %[[ROW_MAJOR_BLOCK_PTR:.*]] = llvm.insertvalue %arg0, {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[VAL_17:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() - // CHECK: %[[VAL_18:.*]] = llvm.zext %[[VAL_17]] : i32 to i64 - // CHECK: %[[VAL_19:.*]] = llvm.trunc %[[VAL_18]] : i64 to i32 - // CHECK: %[[VAL_20:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_21:.*]] = llvm.urem %[[VAL_19]], %[[VAL_20]] : i32 - // CHECK: %[[VAL_22:.*]] = llvm.udiv %[[VAL_19]], %[[VAL_20]] : i32 - // CHECK: %[[VAL_23:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK: %[[VAL_24:.*]] = llvm.urem %[[VAL_22]], %[[VAL_23]] : i32 - // CHECK: %[[VAL_25:.*]] = llvm.udiv %[[VAL_22]], %[[VAL_23]] : i32 - // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: 
%[[ROW_MAJOR_HEIGHT_:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[ROW_MAJOR_WIDTH_:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[ROW_MAJOR_ROW_STRIDE_:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[ROW_MAJOR_BASE:.*]] = llvm.extractvalue %[[ROW_MAJOR_BLOCK_PTR]][6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[VAL_34:.*]] = llvm.mul %[[ROW_MAJOR_WIDTH_]], {{.*}} : i64 - // CHECK: %[[ROW_MAJOR_WIDTH:.*]] = llvm.trunc %[[VAL_34]] : i64 to i32 - // CHECK: %[[ROW_MAJOR_HEIGHT:.*]] = llvm.trunc %[[ROW_MAJOR_HEIGHT_]] : i64 to i32 - // CHECK: %[[ROW_MAJOR_ROW_STRIDE:.*]] = llvm.mul %[[ROW_MAJOR_ROW_STRIDE_]], {{.*}} : i64 - // CHECK: %[[ROW_MAJOR_STRIDE:.*]] = llvm.trunc %[[ROW_MAJOR_ROW_STRIDE]] : i64 to i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_X_:.*]] = llvm.add {{.*}}, %[[ROW_MAJOR_OFFSET_X]] : i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_Y_:.*]] = llvm.add {{.*}}, %[[ROW_MAJOR_OFFSET_Y]] : i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_Y:.*]] = llvm.trunc %[[COLUMN_MAJOR_WARP_OFF_Y_]] : i32 to i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_X:.*]] = llvm.trunc %[[COLUMN_MAJOR_WARP_OFF_X_]] : i32 to i32 - // CHECK: %[[VAL_56:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_57:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_59:.*]] = llvm.insertelement %[[COLUMN_MAJOR_WARP_OFF_X]], {{.*}}{{\[}}%[[VAL_57]] : i32] : vector<2xi32> - // CHECK: %[[ROW_MAJOR_COORD:.*]] = llvm.insertelement %[[COLUMN_MAJOR_WARP_OFF_Y]], {{.*}}{{\[}}%[[VAL_56]] : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(%[[ROW_MAJOR_BASE]], %[[ROW_MAJOR_WIDTH]], %[[ROW_MAJOR_HEIGHT]], %[[ROW_MAJOR_STRIDE]], %[[ROW_MAJOR_COORD]]) {{.*}} : (!llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > - triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> - - // COM: The memory layout is same for the column major memory and row major memory. The prefetch function should be the same. 
- - // CHECK: %[[COLUMN_MAJOR_BLOCK_PTR:.*]] = llvm.insertvalue %arg1, {{.*}}[6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_OFFSET_Y:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_OFFSET_X:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_HEIGHT_:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_WIDTH:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_COL_STRIDE:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[COLUMN_MAJOR_BASE:.*]] = llvm.extractvalue %[[COLUMN_MAJOR_BLOCK_PTR]][6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> - // CHECK: %[[VAL_86:.*]] = llvm.mul %[[COLUMN_MAJOR_HEIGHT_]], {{.*}} : i64 - // CHECK: %[[COLUMN_MAJOR_HEIGHT:.*]] = llvm.trunc %[[VAL_86]] : i64 to i32 - // CHECK: %[[COLUMN_MAJOR_WIDTH_:.*]] = llvm.trunc %[[COLUMN_MAJOR_WIDTH]] : i64 to i32 - // CHECK: %[[VAL_90:.*]] = llvm.mul %[[COLUMN_MAJOR_COL_STRIDE]], {{.*}} : i64 - // CHECK: %[[COLUMN_MAJOR_STRIDE:.*]] = llvm.trunc %[[VAL_90]] : i64 to i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_X_:.*]] = llvm.add {{.*}}, %[[COLUMN_MAJOR_OFFSET_X]] : i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_Y_:.*]] = llvm.add {{.*}}, %[[COLUMN_MAJOR_OFFSET_Y]] : i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_Y:.*]] = llvm.trunc %[[COLUMN_MAJOR_WARP_OFF_Y_]] : i32 to i32 - // CHECK: %[[COLUMN_MAJOR_WARP_OFF_X:.*]] = llvm.trunc %[[COLUMN_MAJOR_WARP_OFF_X_]] : i32 to i32 - // CHECK: %[[VAL_108:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[VAL_109:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: llvm.insertelement %[[COLUMN_MAJOR_WARP_OFF_X]], {{.*}}{{\[}}%[[VAL_109]] : i32] : vector<2xi32> - // CHECK: %[[COLUMN_MAJOR_COORD:.*]] = llvm.insertelement %[[COLUMN_MAJOR_WARP_OFF_Y]], {{.*}}{{\[}}%[[VAL_108]] : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x2cPU3AS1viiiDv2_i(%[[COLUMN_MAJOR_BASE]], %[[COLUMN_MAJOR_HEIGHT]], %[[COLUMN_MAJOR_WIDTH_]], %[[COLUMN_MAJOR_STRIDE]], %[[COLUMN_MAJOR_COORD]]) {{.*}} : (!llvm.ptr<1>{{.*}}, i32, i32, i32, vector<2xi32>) -> () - %columnMajorPtr = tt.make_tensor_ptr %arg1, [%arg4, %arg3], [%c1_i64, %arg6], [%c0_i32, %c0_i32] {order = array} : > - triton_intel_gpu.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "column_major"} : !tt.ptr> + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 + // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 + // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 + // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 + // CHECK: %[[VAL_18:.*]] = llvm.urem %[[VAL_17]], %[[CST_1]] : i32 + // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], 
%[[CST_1]] : i32 + // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 + // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[ROW_MAJOR_BASE_WIDTH_I32:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 + // CHECK: %[[ROW_MAJOR_BASE_HEIGHT_I32:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 + // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 + // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 + // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 + // CHECK: %[[VAL_29:.*]] = llvm.add %[[VAL_28]], %[[CST_0]] : i32 + // CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_20]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 + // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 + // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 + // CHECK: %[[OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 + // CHECK: %[[OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 + // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[OFFSETS:.*]] = llvm.insertelement %[[OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH_I32]], %[[ROW_MAJOR_BASE_HEIGHT_I32]], %[[PITCH]], %[[OFFSETS]]) {{.*}} + %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > + triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> + // COM: The memory layout is same for the column major memory and row major memory. The prefetch should be the same. 
+ + // CHECK: %[[VAL_38:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {no_unwind, will_return} : () -> i32 + // CHECK: %[[VAL_39:.*]] = llvm.zext %[[VAL_38]] : i32 to i64 + // CHECK: %[[VAL_40:.*]] = llvm.trunc %[[VAL_39]] : i64 to i32 + // CHECK: %[[VAL_41:.*]] = llvm.urem %[[VAL_40]], %[[CST_1]] : i32 + // CHECK: %[[VAL_42:.*]] = llvm.udiv %[[VAL_40]], %[[CST_1]] : i32 + // CHECK: %[[VAL_43:.*]] = llvm.urem %[[VAL_42]], %[[CST_8]] : i32 + // CHECK: %[[VAL_44:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[COLUM_MAJOR_BASE_WIDTH_I32:.*]] = llvm.trunc %[[VAL_44]] : i64 to i32 + // CHECK: %[[COLUM_MAJOR_BASE_HEIGHT_I32:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_47:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[COLUM_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_47]] : i64 to i32 + // CHECK: %[[VAL_49:.*]] = llvm.mul %[[VAL_41]], %[[CST_32]] : i32 + // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 + // CHECK: %[[VAL_51:.*]] = llvm.urem %[[VAL_50]], %[[CST_32]] : i32 + // CHECK: %[[VAL_52:.*]] = llvm.add %[[VAL_51]], %[[CST_0]] : i32 + // CHECK: %[[VAL_53:.*]] = llvm.mul %[[VAL_43]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_54:.*]] = llvm.add %[[VAL_53]], %[[CST_0]] : i32 + // CHECK: %[[VAL_55:.*]] = llvm.urem %[[VAL_54]], %[[CST_16]] : i32 + // CHECK: %[[VAL_56:.*]] = llvm.add %[[VAL_55]], %[[CST_0]] : i32 + // CHECK: %[[VAL_57:.*]] = llvm.trunc %[[VAL_56]] : i32 to i32 + // CHECK: %[[VAL_58:.*]] = llvm.trunc %[[VAL_52]] : i32 to i32 + // CHECK: %[[VAL_59:.*]] = llvm.insertelement %[[VAL_58]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[COLUM_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[VAL_57]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[COLUM_MAJOR_BASE_WIDTH_I32]], %[[COLUM_MAJOR_BASE_HEIGHT_I32]], %[[COLUM_MAJOR_PITCH]], %[[COLUM_MAJOR_OFFSETS]]) {{.*}} + %columnMajorPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%c1_i64, %arg5], [%c0_i32, %c0_i32] {order = array} : > + triton_intel_gpu.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "column_major"} : !tt.ptr> + + // COM: The memory is not structured densely. Not to prefetch it to the cache. 
// CHECK-NOT: block_prefetch - %nonContiguousPtr = tt.make_tensor_ptr %arg1, [%arg4, %arg3], [%arg6, %arg6], [%c0_i32, %c0_i32] {order = array} : > - triton_intel_gpu.prefetch %nonContiguousPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> + %nonContiguousPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%arg5, %arg5], [%c0_i32, %c0_i32] {order = array} : > + triton_intel_gpu.prefetch %nonContiguousPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> + tt.return + } +} + +// ----- + +// CHECK: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i +#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> +module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} { + // CHECK-LABEL: llvm.func spir_kernelcc @prefetch_tensor_of_pointers + tt.func public @prefetch_tensor_of_pointers(%tensor_of_ptr: tensor<64x32x!tt.ptr, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>>) { + // CHECK: %[[MASK:.*]] = llvm.mlir.constant(1 : i8) : i8 + // CHECK: %[[VAL_2:.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[BASE_HEIGHT:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK: %[[BASE_WIDTH:.*]] = llvm.mlir.constant(64 : i32) : i32 + // CHECK: %[[TRUE:.*]] = llvm.mlir.constant(true) : i1 + + // CHECK: %[[ADDR_0:.*]] = llvm.extractvalue {{.*}}[0] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)> + // CHECK: %[[ADDR_1:.*]] = llvm.extractvalue {{.*}}[1] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)> + // CHECK: %[[ADDR_16:.*]] = llvm.extractvalue {{.*}}[16] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)> + // CHECK: %[[ADDR_32:.*]] = llvm.extractvalue {{.*}}[32] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, 
ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)> + // CHECK: %[[ADDR_48:.*]] = llvm.extractvalue {{.*}}[48] : !llvm.struct<(ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>, ptr<1>)> + // CHECK: %[[VAL_13:.*]] = llvm.ptrtoint %[[ADDR_0]] : !llvm.ptr<1> to i64 + // CHECK: %[[VAL_14:.*]] = llvm.ptrtoint %[[ADDR_1]] : !llvm.ptr<1> to i64 + // CHECK: %[[PITCH:.*]] = llvm.sub %[[VAL_14]], %[[VAL_13]] : i64 + // CHECK: %[[UNIFIED_PITCH:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[PITCH]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64 + // CHECK: %[[UNIFIED_PITCH_I32:.*]] = llvm.trunc %[[UNIFIED_PITCH]] : i64 to i32 + // CHECK: %[[VAL_18:.*]] = llvm.intr.umax(%[[UNIFIED_PITCH_I32]], %[[BASE_WIDTH]]) : (i32, i32) -> i32 + // CHECK: %[[PITCH_IN_BYTES_I32:.*]] = llvm.trunc %[[VAL_18]] : i32 to i32 + + // CHECK: %[[UNIFIED_MASK:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[MASK]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8 + // CHECK: %[[UNIFIED_MASK_I1:.*]] = llvm.trunc %[[UNIFIED_MASK]] : i8 to i1 + // CHECK: %[[OFFSET_Y:.*]] = llvm.select %[[UNIFIED_MASK_I1]], %[[CST_0]], %[[BASE_HEIGHT]] : i1, i32 + // CHECK: %[[UNIFIED_BASE:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_13]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64 + // CHECK: %[[VAL_26:.*]] = llvm.inttoptr %[[UNIFIED_BASE]] : i64 to !llvm.ptr<1> + // CHECK: %[[VAL_27:.*]] = llvm.insertelement %[[CST_0]], {{.*}} : vector<2xi32> + // CHECK: %[[OFFSETS:.*]] = llvm.insertelement %[[OFFSET_Y]], {{.*}} : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i(%[[VAL_26]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32]], %[[OFFSETS]]) + + // CHECK: %[[VAL_29:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[MASK]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8 + // CHECK: %[[VAL_30:.*]] = llvm.trunc %[[VAL_29]] : i8 to i1 + // CHECK: %[[VAL_31:.*]] = llvm.select %[[VAL_30]], %[[CST_0]], %[[BASE_HEIGHT]] : i1, i32 + // CHECK: %[[VAL_32:.*]] = llvm.ptrtoint %[[ADDR_16]] : !llvm.ptr<1> to i64 + // CHECK: %[[VAL_33:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_32]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64 + // CHECK: %[[VAL_34:.*]] = llvm.inttoptr %[[VAL_33]] : i64 to !llvm.ptr<1> + // CHECK: %[[VAL_35:.*]] = llvm.insertelement %[[VAL_31]], {{.*}} : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i(%[[VAL_34]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32]], %[[VAL_35]]) + + // CHECK: %[[VAL_36:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[MASK]], %[[CST_0]]) 
{convergent, no_unwind, will_return} : (i8, i32) -> i8 + // CHECK: %[[VAL_37:.*]] = llvm.trunc %[[VAL_36]] : i8 to i1 + // CHECK: %[[VAL_38:.*]] = llvm.select %[[VAL_37]], %[[CST_0]], %[[BASE_HEIGHT]] : i1, i32 + // CHECK: %[[VAL_39:.*]] = llvm.ptrtoint %[[ADDR_32]] : !llvm.ptr<1> to i64 + // CHECK: %[[VAL_40:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_39]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64 + // CHECK: %[[VAL_41:.*]] = llvm.inttoptr %[[VAL_40]] : i64 to !llvm.ptr<1> + // CHECK: %[[VAL_42:.*]] = llvm.insertelement %[[VAL_38]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i(%[[VAL_41]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32]], %[[VAL_42]]) + + // CHECK: %[[VAL_43:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflecj(%[[MASK]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i8, i32) -> i8 + // CHECK: %[[VAL_44:.*]] = llvm.trunc %[[VAL_43]] : i8 to i1 + // CHECK: %[[VAL_45:.*]] = llvm.select %[[VAL_44]], %[[CST_0]], %[[BASE_HEIGHT]] : i1, i32 + // CHECK: %[[VAL_46:.*]] = llvm.ptrtoint %[[ADDR_48]] : !llvm.ptr<1> to i64 + // CHECK: %[[VAL_47:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflelj(%[[VAL_46]], %[[CST_0]]) {convergent, no_unwind, will_return} : (i64, i32) -> i64 + // CHECK: %[[VAL_48:.*]] = llvm.inttoptr %[[VAL_47]] : i64 to !llvm.ptr<1> + // CHECK: %[[VAL_49:.*]] = llvm.insertelement %[[VAL_45]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i(%[[VAL_48]], %[[BASE_WIDTH]], %[[BASE_HEIGHT]], %[[PITCH_IN_BYTES_I32]], %[[VAL_49]]) + + %mask_tensor = arith.constant dense<1> : tensor<64x32xi1, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>> + triton_intel_gpu.prefetch %tensor_of_ptr, %mask_tensor {boundaryCheck = array, cache = 1 : i32, evict = 1 : i32, isVolatile = false, operandSegmentSizes = array, triton_intel_gpu.block_io = "row_major"} : tensor<64x32x!tt.ptr, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>> + + // CHECK-COUNT-4: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_8r16x2cPU3AS1viiiDv2_i + + triton_intel_gpu.prefetch %tensor_of_ptr {boundaryCheck = array, cache = 1 : i32, evict = 1 : i32, isVolatile = false, operandSegmentSizes = array, triton_intel_gpu.block_io = "row_major"} : tensor<64x32x!tt.ptr, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>> + tt.return } } diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index e0adf72365..25f8b0a759 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -328,9 +328,7 @@ struct PrefetchOpConversion Value ptr = op.getPtr(); if (isTensorPointerType(ptr.getType())) return rewriteTensorPointerPrefetch(op, adaptor, rewriter); - - llvm_unreachable("Unexpected prefetch operation on 'regular' ptr"); - return failure(); + return rewriteRegularPointerPrefetch(op, adaptor, rewriter); } LogicalResult @@ -369,6 +367,9 @@ struct PrefetchOpConversion // Swap the shape to make it row major and then get the tiling // size base on row major shape. 
std::swap(tensorShape[0], tensorShape[1]); + + tensorType = RankedTensorType::get( + tensorShape, tensorType.getElementType(), tensorType.getEncoding()); } unsigned numWarps = triton::gpu::lookupNumWarps(op); @@ -481,6 +482,222 @@ struct PrefetchOpConversion rewriter.eraseOp(op); return success(); } + + LogicalResult + rewriteRegularPointerPrefetch(triton::gpu::intel::PrefetchOp op, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + + Attribute blockIOAttr = + op->getAttr(TritonIntelGPUDialect::getBlockIOAttrName()); + if (!blockIOAttr) { + llvm_unreachable("Unexpected prefetch operation on unstructured memory " + "which may pollute the cache"); + return failure(); + } + + // Only support rank 2 block pointer, either row major or column major. + StringRef memoryLayoutInfo = cast(blockIOAttr).getValue(); + assert((memoryLayoutInfo == "row_major" || + memoryLayoutInfo == "column_major") && + "Only row_major or column_major is supported"); + + const bool memoryRowMajor = (memoryLayoutInfo == "row_major"); + + // TODO: To support more layouts on memory. + if (!memoryRowMajor) { + return failure(); + } + + Value ptr = op.getPtr(); + auto tensorOfPointers = cast(ptr.getType()); + + // TODO: To support more layouts in register. + if (!hasDotDpasEncoding(tensorOfPointers)) + return failure(); + + auto encoding = getDotEncoding(tensorOfPointers).value(); + auto dpasLayout = cast(encoding.getParent()); + auto warpsPerCTA = dpasLayout.getWarpsPerCTA(); + auto cluster = dpasLayout.getRepCluster(); + SmallVector repCluster{cluster.begin(), cluster.end()}; + auto tensorShape = tensorOfPointers.getShape(); + + DpasEncodingAttr::OpIdx opIdx; + auto getOpIdx = [&]() -> DpasEncodingAttr::OpIdx { + auto dotLayout = getDotEncoding(tensorOfPointers).value(); + return static_cast(dotLayout.getOpIdx()); + }; + opIdx = getOpIdx(); + + auto repetitions = dpasLayout.getDPASRepetitions(tensorShape, opIdx); + // getDPASRepetitions always return rank 3 size. + SmallVector numReps{repetitions.begin() + 1, repetitions.end()}; + SmallVector shardTensorShape; + if (opIdx == DpasEncodingAttr::OpIdx::OperandA) { + auto opAShape = dpasLayout.getShapeA(); + shardTensorShape = {std::min(tensorShape[0], opAShape[0]), + tensorShape[1]}; + warpsPerCTA[1] = 1; + repCluster[1] = 1; + numReps[1] = 1; + } else { + auto opBShape = dpasLayout.getShapeB(); + shardTensorShape = {tensorShape[0], + std::min(tensorShape[1], opBShape[1])}; + warpsPerCTA[0] = 1; + repCluster[0] = 1; + numReps[0] = 1; + } + + auto ptrType = cast(tensorOfPointers.getElementType()); + Type elementType = ptrType.getPointeeType(); + RankedTensorType tensorType = RankedTensorType::get( + shardTensorShape, elementType, tensorOfPointers.getEncoding()); + + SmallVector prefetchShape = + get2DPrefetchShapePerWarp(tensorType); + + Value mask = op.getMask(); + unsigned maskConstancyHor = std::numeric_limits::max(), + maskConstancyVer = std::numeric_limits::max(); + if (mask) { + if (auto maskTy = dyn_cast_or_null(mask.getType())) { + auto axisInfo = const_cast( + axisAnalysisPass) + .getAxisInfo(mask); + if (axisInfo) { + maskConstancyHor = axisInfo->getConstancy(1); + maskConstancyVer = axisInfo->getConstancy(0); + } else { + maskConstancyHor = 1; + maskConstancyVer = 1; + } + } + /*else { + // scalar mask. No need to check the constancy. 
+      }*/
+    }
+    prefetchShape = {std::min(prefetchShape[0], maskConstancyVer),
+                     std::min(prefetchShape[1], maskConstancyHor)};
+
+    SmallVector numPrefetchsPerRep = {
+        mlir::ceil(shardTensorShape[0], prefetchShape[0]),
+        mlir::ceil(shardTensorShape[1], prefetchShape[1])};
+
+    Type eltTy = tensorType.getElementType();
+    unsigned elemSizeInBits = eltTy.getIntOrFloatBitWidth();
+    unsigned tileWidthInElem = prefetchShape[1];
+    unsigned tileHeightInElem = prefetchShape[0];
+    unsigned vBlocks = 1;
+    switch (elemSizeInBits) {
+    case 8:
+      if (tileWidthInElem == 64) {
+        // OCL interface supports 8b_?r32x2c for 64 bytes per row of 8-bit
+        // elements.
+        vBlocks = 2;
+        tileWidthInElem = 32;
+      }
+      break;
+    case 16:
+      if (tileWidthInElem == 32) {
+        // OCL interface supports 16b_?r16x2c for 64 bytes per row of 16-bit
+        // elements.
+        vBlocks = 2;
+        tileWidthInElem = 16;
+      }
+      break;
+    }
+
+    auto mod = rewriter.getBlock()->getParent()->getParentOfType();
+    Location loc = op.getLoc();
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
+
+    std::map, Value> baseAddrs, masks;
+    Value llPtr = adaptor.getPtr();
+    Value llMask = adaptor.getMask();
+
+    SmallVector ptrElems, maskElems;
+    // Get the LLVM values for pointers
+    ptrElems = unpackLLElements(loc, llPtr, rewriter);
+    if (llMask) {
+      maskElems = unpackLLElements(loc, llMask, rewriter);
+    }
+
+    // re-arrange the baseAddrs and masks to for large 2D block IO.
+    // Layout is unrelated to the scalar type.
+    SmallVector> offsets =
+        mlir::emitOffsetForLayout(encoding, tensorOfPointers);
+    for (size_t i = 0; i < ptrElems.size(); ++i) {
+      SmallVector offset = offsets[i];
+      baseAddrs[offset] = ptrElems[i];
+      if (llMask && maskElems.size() > 1)
+        masks[offset] = maskElems[i];
+    }
+
+    Value base, baseWidth, baseHeight, rowStrideInBytes, colStride, offsetBaseX,
+        offsetBaseY;
+
+    baseWidth = b.i32_val(vBlocks * tileWidthInElem * (elemSizeInBits / 8));
+    baseHeight = b.i32_val(tileHeightInElem);
+    offsetBaseX = b.i32_val(0);
+    offsetBaseY = b.i32_val(0);
+    rowStrideInBytes = b.sub(b.ptrtoint(i64_ty, baseAddrs[{1, 0}]),
+                             b.ptrtoint(i64_ty, baseAddrs[{0, 0}]));
+    rowStrideInBytes =
+        targetInfo.shuffleIdx(rewriter, loc, rowStrideInBytes, 0);
+    rowStrideInBytes = b.umax(b.trunc(i32_ty, rowStrideInBytes), baseWidth);
+    rowStrideInBytes = b.trunc(i32_ty, rowStrideInBytes);
+
+    for (int row = 0; row < numReps[0]; ++row) {
+      for (int col = 0; col < numReps[1]; ++col) {
+        // Prefetch the data for each repetition.
+ for (int i = 0; i < numPrefetchsPerRep[0]; ++i) + for (int j = 0; j < numPrefetchsPerRep[1]; ++j) { + unsigned offsetN = col * warpsPerCTA[1] * shardTensorShape[1] + + j * prefetchShape[1]; + unsigned offsetM = row * warpsPerCTA[0] * shardTensorShape[0] + + i * prefetchShape[0]; + Value pred; + if (llMask) { + if (maskElems.size() > 1) { + pred = targetInfo.shuffleIdx(rewriter, loc, + masks[{offsetM, offsetN}], 0); + } else { + pred = maskElems[0]; + } + } else { + pred = b.int_val(1, 1); + } + Value offsetY = b.select(pred, b.i32_val(0), baseHeight); + auto addr = targetInfo.shuffleIdx(rewriter, loc, + baseAddrs[{offsetM, offsetN}], 0); + + auto newOp = rewriter.create( + loc, + /*ptr*/ addr, + /*base_width*/ baseWidth, + /*base_height*/ baseHeight, + /*base_pitch*/ rowStrideInBytes, + /*x*/ offsetBaseX, + /*y*/ offsetY, + /*elem_size_in_bits*/ elemSizeInBits, + /*tile_width*/ tileWidthInElem, + /*tile_height*/ tileHeightInElem, + /*v_blocks*/ vBlocks, + /*cache_opt*/ TritonGEN::LoadCacheControl::L1C_L3C); + if (failed(newOp.verify())) { + // Explicitly invoke verifier because `triton_gen` ops are + // immediately lowered further to a builtin call. + return failure(); + } + } + } + } + + rewriter.eraseOp(op); + return success(); + } }; struct LoadOpToBlockIOConversion From 6d4aad2a5c5d9cf07f2276385f80bc5b2abddf22 Mon Sep 17 00:00:00 2001 From: "Lu,Chengjun" Date: Mon, 10 Mar 2025 15:27:47 +0000 Subject: [PATCH 02/11] Support the tensor of pointer in the matmul loop pipelining. --- third_party/intel/backend/compiler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 5b146b0559..ba29779103 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -310,7 +310,8 @@ def make_ttgir(mod, metadata, opt, properties): intel.passes.ttgpuir.add_accelerate_matmul(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_materialize_block_pointer(pm) - intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False, XPUBackend.get_split_barrier_scope(opt)) + intel.passes.ttgpuir.add_remove_layout_conversions(pm) + intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, True, XPUBackend.get_split_barrier_scope(opt)) passes.ttgpuir.add_fuse_nested_loops(pm) passes.ttgpuir.add_optimize_thread_locality(pm) From 6fb6ced3c07845fdb265b3185e0b633c756182d3 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Thu, 24 Apr 2025 17:28:03 +0000 Subject: [PATCH 03/11] Fix failures Signed-off-by: Whitney Tsang --- .../intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 25f8b0a759..c48fcdd06c 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -518,7 +518,7 @@ struct PrefetchOpConversion auto encoding = getDotEncoding(tensorOfPointers).value(); auto dpasLayout = cast(encoding.getParent()); - auto warpsPerCTA = dpasLayout.getWarpsPerCTA(); + SmallVector warpsPerCTA(dpasLayout.getWarpsPerCTA()); auto cluster = dpasLayout.getRepCluster(); SmallVector repCluster{cluster.begin(), cluster.end()}; auto tensorShape = tensorOfPointers.getShape(); From d36a089a1ebc05cd1f07daf0605e7554a5aa0763 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Thu, 24 Apr 2025 17:58:21 
+0000 Subject: [PATCH 04/11] [TritonIntelGPUPipeline] Remove supportRegularPtr option Signed-off-by: Whitney Tsang --- test/TritonIntelGPU/loop-pipeline.mlir | 2 +- test/TritonIntelGPU/split-barrier.mlir | 4 ++-- third_party/intel/backend/compiler.py | 2 +- .../Dialect/TritonIntelGPU/Transforms/Passes.td | 3 --- .../Pipeliner/MatmulLoopPipeline.cpp | 11 ++--------- .../lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h | 1 - .../Pipeliner/SoftwarePipeliner.cpp | 8 ++++---- third_party/intel/triton_xpu.cc | 4 ++-- 8 files changed, 12 insertions(+), 23 deletions(-) diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir index 0cdf686a98..bb8a86f63e 100644 --- a/test/TritonIntelGPU/loop-pipeline.mlir +++ b/test/TritonIntelGPU/loop-pipeline.mlir @@ -1,4 +1,4 @@ -// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 support-regular-ptr=true" | FileCheck %s +// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3" | FileCheck %s // CHECK: #[[$BLOCK_0:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}> // CHECK: #[[$BLOCK_1:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}> diff --git a/test/TritonIntelGPU/split-barrier.mlir b/test/TritonIntelGPU/split-barrier.mlir index a2db6e5c93..b40bd446d9 100644 --- a/test/TritonIntelGPU/split-barrier.mlir +++ b/test/TritonIntelGPU/split-barrier.mlir @@ -1,5 +1,5 @@ -// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 support-regular-ptr=true split-barriers-scope=workgroup" | FileCheck %s --check-prefixes=CHECK,WORKGROUP_SCOPE -// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 support-regular-ptr=true split-barriers-scope=subgroup" | FileCheck %s --check-prefixes=CHECK,SUBGROUP_SCOPE +// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 split-barriers-scope=workgroup" | FileCheck %s --check-prefixes=CHECK,WORKGROUP_SCOPE +// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 split-barriers-scope=subgroup" | FileCheck %s --check-prefixes=CHECK,SUBGROUP_SCOPE // CHECK: #[[$BLOCK:.+]] = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 16], warpsPerCTA = [8, 4], order = [1, 0]}> // CHECK: #[[$DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index ba29779103..8ca0dba87a 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -311,7 +311,7 @@ def make_ttgir(mod, metadata, opt, properties): intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_materialize_block_pointer(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) - intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, True, XPUBackend.get_split_barrier_scope(opt)) + intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, XPUBackend.get_split_barrier_scope(opt)) passes.ttgpuir.add_fuse_nested_loops(pm) passes.ttgpuir.add_optimize_thread_locality(pm) diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td index b0734715c2..2acedddb91 100644 --- 
a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td +++ b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td @@ -122,9 +122,6 @@ def TritonIntelGPUPipeline : Pass<"tritonintelgpu-pipeline", "mlir::ModuleOp"> { Option<"numStages", "num-stages", "int32_t", /*default*/"3", "number of pipeline stages">, - Option<"supportRegularPtr", "support-regular-ptr", - "bool", /*default*/"false", - "Enable support for prefetching non-block pointers">, Option<"splitBarrierScope", "split-barriers-scope", "enum SplitBarrierScope", "SplitBarrierScope::None", "insert split barriers in a loop", diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp index e6bd165d3d..12d8dacd28 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp @@ -117,8 +117,7 @@ static std::optional loadDotOperand(tt::LoadOp loadOp) { /// Collect loads to pipeline. Return success if we can pipeline this loop. static void collectOpsToPipeline(scf::ForOp forOp, - SmallVectorImpl &loadOps, - bool supportRegularPtr) { + SmallVectorImpl &loadOps) { assert(loadOps.empty() && "Expecting an empty list of load operations"); ModuleOp moduleOp = forOp->getParentOfType(); @@ -128,11 +127,6 @@ static void collectOpsToPipeline(scf::ForOp forOp, // operations in the loop body block. for (Operation &op : forOp) { if (auto loadOp = dyn_cast(&op)) { - Value ptr = loadOp.getPtr(); - bool isBlockPtr = mlir::triton::isTensorPointerType(ptr.getType()); - if (!isBlockPtr && !supportRegularPtr) - continue; - // Check if the memory is structed densely. If not, we do not prefetch it // to avoid polluting the cache. Attribute blockIOAttr = @@ -303,12 +297,11 @@ createSchedule(scf::ForOp forOp, int numStages) { } bool ttgi::preProcessLoopAndGetSchedule(scf::ForOp &forOp, int numStages, - bool supportRegularPtr, mlir::scf::PipeliningOption &options) { // 1. First collect "interesting" operations with a stage where to schedule // them. This gives a coarse scheduling for the loop. 
SmallVector loads; - collectOpsToPipeline(forOp, loads, supportRegularPtr); + collectOpsToPipeline(forOp, loads); if (loads.empty()) { LDBG("No loads to pipeline"); return false; diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h index 9ef16b1d12..a5f29580d0 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h @@ -6,7 +6,6 @@ namespace mlir::triton::gpu::intel { bool preProcessLoopAndGetSchedule(scf::ForOp &forOp, int numStages, - bool supportRegularPtr, mlir::scf::PipeliningOption &options); } // namespace mlir::triton::gpu::intel diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp index db6c7fe37b..53ac3bf07a 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp @@ -39,14 +39,14 @@ static bool preCondition(scf::ForOp forOp) { } static void -pipelineLoop(scf::ForOp forOp, int numStages, bool supportRegularPtr, +pipelineLoop(scf::ForOp forOp, int numStages, std::optional barrierScope = std::nullopt) { mlir::scf::PipeliningOption options; if (!preCondition(forOp)) return; - bool foundSchedule = ttgi::preProcessLoopAndGetSchedule( - forOp, numStages, supportRegularPtr, options); + bool foundSchedule = + ttgi::preProcessLoopAndGetSchedule(forOp, numStages, options); if (!foundSchedule) return; @@ -108,7 +108,7 @@ struct IntelGPUPipelinePass getOperation()->walk([&](scf::ForOp forOp) { loops.push_back(forOp); }); for (scf::ForOp forOp : loops) { - pipelineLoop(forOp, numStages, supportRegularPtr, barrierScope); + pipelineLoop(forOp, numStages, barrierScope); } } }; diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index e80c59e27c..1e3cd76991 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -88,8 +88,8 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) { gpu::intel::createTritonIntelGPUAccelerateMatmul); ADD_PASS_WRAPPER_0("add_rewrite_stack_ptr", gpu::intel::createTritonIntelGPURewriteStackPtr); - ADD_PASS_WRAPPER_OPT_3("add_pipeline", - gpu::intel::createTritonIntelGPUPipeline, int, bool, + ADD_PASS_WRAPPER_OPT_2("add_pipeline", + gpu::intel::createTritonIntelGPUPipeline, int, enum gpu::intel::SplitBarrierScope); ADD_PASS_WRAPPER_0("add_remove_layout_conversions", gpu::intel::createTritonIntelGPURemoveLayoutConversions); From 78b26b8ef3d96e6a79627d8c1fa202161b1fb2a0 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Mon, 28 Apr 2025 00:28:59 +0000 Subject: [PATCH 05/11] address review comment --- .../LoadStoreOpToLLVM.cpp | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index c48fcdd06c..98d87127ad 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -530,30 +530,38 @@ struct PrefetchOpConversion }; opIdx = getOpIdx(); - auto repetitions = dpasLayout.getDPASRepetitions(tensorShape, opIdx); - // getDPASRepetitions always return rank 3 size. 
+    SmallVector repetitions =
+        dpasLayout.getDPASRepetitions(tensorShape, opIdx);
+    assert(repetitions.size() == 3 &&
+           "getDPASRepetitions always return rank 3 size");
     SmallVector numReps{repetitions.begin() + 1, repetitions.end()};
     SmallVector shardTensorShape;
-    if (opIdx == DpasEncodingAttr::OpIdx::OperandA) {
+    switch (opIdx) {
+    case DpasEncodingAttr::OpIdx::OperandA: {
       auto opAShape = dpasLayout.getShapeA();
       shardTensorShape = {std::min(tensorShape[0], opAShape[0]),
                           tensorShape[1]};
       warpsPerCTA[1] = 1;
       repCluster[1] = 1;
       numReps[1] = 1;
-    } else {
+    } break;
+    case DpasEncodingAttr::OpIdx::OperandB: {
       auto opBShape = dpasLayout.getShapeB();
       shardTensorShape = {tensorShape[0],
                           std::min(tensorShape[1], opBShape[1])};
       warpsPerCTA[0] = 1;
       repCluster[0] = 1;
       numReps[0] = 1;
+    } break;
+    case DpasEncodingAttr::OpIdx::OperandC: {
+      llvm_unreachable("unexpected OpIdx::OperandC");
+    } break;
     }
 
     auto ptrType = cast(tensorOfPointers.getElementType());
     Type elementType = ptrType.getPointeeType();
-    RankedTensorType tensorType = RankedTensorType::get(
-        shardTensorShape, elementType, tensorOfPointers.getEncoding());
+    auto tensorType = RankedTensorType::get(shardTensorShape, elementType,
+                                            tensorOfPointers.getEncoding());
 
     SmallVector prefetchShape =
         get2DPrefetchShapePerWarp(tensorType);
@@ -562,6 +570,7 @@ struct PrefetchOpConversion
     unsigned maskConstancyHor = std::numeric_limits::max(),
              maskConstancyVer = std::numeric_limits::max();
     if (mask) {
+      // No need to check the constancy of scalar mask.
       if (auto maskTy = dyn_cast_or_null(mask.getType())) {
         auto axisInfo = const_cast(
                             axisAnalysisPass)
                             .getAxisInfo(mask);
@@ -574,9 +583,6 @@ struct PrefetchOpConversion
           maskConstancyVer = 1;
         }
       }
-      /*else {
-        // scalar mask. No need to check the constancy.
-      }*/
     }
     prefetchShape = {std::min(prefetchShape[0], maskConstancyVer),
                      std::min(prefetchShape[1], maskConstancyHor)};

From e3d441a9df00bdd18a9ef22f03700559458a3034 Mon Sep 17 00:00:00 2001
From: "Tiotto, Ettore"
Date: Mon, 28 Apr 2025 21:52:35 +0000
Subject: [PATCH 06/11] Fix failing CI test

Signed-off-by: Tiotto, Ettore
---
 test/TritonIntelGPU/prefetch-to-llvm.mlir     | 13 ++++++++++++
 .../LoadStoreOpToLLVM.cpp                     | 21 +++++++++++++++----
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir
index b4d493b825..27f81be915 100644
--- a/test/TritonIntelGPU/prefetch-to-llvm.mlir
+++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir
@@ -156,3 +156,16 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
     tt.return
   }
 }
+
+// -----
+
+// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and that no 2D block prefetch operation is generated.
+#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [4, 1], A = [32, 8], B = [8, 16], C = [32, 16]}> +module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} { + // CHECK-LABEL: llvm.func spir_kernelcc @kernel + tt.func public @kernel(%arg0 : tensor<128x32x!tt.ptr, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>) { + // CHECK-NOT: intel_sub_group_2d_block_prefetch + triton_intel_gpu.prefetch %arg0 {boundaryCheck = array, cache = 1 : i32, evict = 1 : i32, isVolatile = false, operandSegmentSizes = array, triton_intel_gpu.block_io = "row_major"} : tensor<128x32x!tt.ptr, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + tt.return + } +} diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 98d87127ad..c647d7cd76 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -325,10 +325,18 @@ struct PrefetchOpConversion LogicalResult matchAndRewrite(triton::gpu::intel::PrefetchOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const final { - Value ptr = op.getPtr(); - if (isTensorPointerType(ptr.getType())) - return rewriteTensorPointerPrefetch(op, adaptor, rewriter); - return rewriteRegularPointerPrefetch(op, adaptor, rewriter); + LogicalResult res = + isTensorPointerType(op.getPtr().getType()) + ? rewriteTensorPointerPrefetch(op, adaptor, rewriter) + : rewriteRegularPointerPrefetch(op, adaptor, rewriter); + + // FIXME: the prefetch lowering code should never fail. Currently it does in + // some cases. We should address those cases instead of removing the + // prefetch operation. + if (failed(res)) + rewriter.eraseOp(op); + + return success(); } LogicalResult @@ -641,6 +649,11 @@ struct PrefetchOpConversion masks[offset] = maskElems[i]; } + // baseAddrs[{0, 0}] and baseAddrs[{1, 0}] are currently used to calculate + // the pitch. + if (baseAddrs.count({0, 0}) == 0 || baseAddrs.count({1, 0}) == 0) + return failure(); + Value base, baseWidth, baseHeight, rowStrideInBytes, colStride, offsetBaseX, offsetBaseY; From 41971ca8d795fa013e37abf2490d4cf8a8028a9e Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Tue, 29 Apr 2025 18:12:07 +0000 Subject: [PATCH 07/11] Only prefetch 2D loads Signed-off-by: Tiotto, Ettore --- .../Pipeliner/MatmulLoopPipeline.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp index 12d8dacd28..e9b335067f 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp @@ -127,8 +127,8 @@ static void collectOpsToPipeline(scf::ForOp forOp, // operations in the loop body block. for (Operation &op : forOp) { if (auto loadOp = dyn_cast(&op)) { - // Check if the memory is structed densely. If not, we do not prefetch it - // to avoid polluting the cache. + // Check if the memory is structured densely. If not, we do not prefetch + // it to avoid polluting the cache. 
Attribute blockIOAttr = loadOp->getAttr(mlir::triton::gpu::intel::TritonIntelGPUDialect:: getBlockIOAttrName()); @@ -137,6 +137,11 @@ static void collectOpsToPipeline(scf::ForOp forOp, continue; } + if (cast(loadOp.getType()).getRank() != 2) { + LDBG("Skipping LoadOp with non 2D tensor type" << *loadOp); + continue; + } + std::optional loadWithDotOperand = loadDotOperand(loadOp); if (loadWithDotOperand.has_value()) loadOps.push_back(loadWithDotOperand.value()); From 2bf17c7b1a41ecb4b094c0d8da0a01acfd758b04 Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Wed, 30 Apr 2025 17:36:31 +0000 Subject: [PATCH 08/11] Refactor LoadStoreOpToLLVM.cpp Signed-off-by: Tiotto, Ettore --- .../LoadStoreOpToLLVM.cpp | 118 ++++++++---------- 1 file changed, 50 insertions(+), 68 deletions(-) diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index f5e51d7980..4d3f245030 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -316,7 +316,7 @@ struct BlockIOConversionBase : public LoadStoreConversionBase { // Determine whether the given LoadOp can be lowered to using block IO // instructions. - bool isLoadCandidate(triton::LoadOp op) const { + static bool isLoadCandidate(triton::LoadOp op) { Attribute blockIOAttr = op->getAttr(TritonIntelGPUDialect::getBlockIOAttrName()); if (!blockIOAttr) @@ -332,7 +332,7 @@ struct BlockIOConversionBase : public LoadStoreConversionBase { std::enable_if_t::value, bool> = true> - bool isMemoryRowMajor(OpTy op) const { + static bool isMemoryRowMajor(OpTy op) { Attribute blockIOAttr = op->getAttr(TritonIntelGPUDialect::getBlockIOAttrName()); assert(blockIOAttr && "Expecting block IO attribute"); @@ -347,7 +347,7 @@ struct BlockIOConversionBase : public LoadStoreConversionBase { return memoryLayoutInfo == "row_major"; } - DpasEncodingAttr::OpIdx getOpIdx(RankedTensorType tensorTy) const { + static DpasEncodingAttr::OpIdx getOpIdx(RankedTensorType tensorTy) { if (hasDpasEncoding(tensorTy)) return DpasEncodingAttr::OpIdx::OperandC; @@ -356,7 +356,7 @@ struct BlockIOConversionBase : public LoadStoreConversionBase { return static_cast(dotLayout.getOpIdx()); } - DpasEncodingAttr getDpasLayout(RankedTensorType tensorTy) const { + static DpasEncodingAttr getDpasLayout(RankedTensorType tensorTy) { Attribute encoding = tensorTy.getEncoding(); return cast( hasDpasEncoding(tensorTy) @@ -544,14 +544,10 @@ struct PrefetchOpConversion rewriteRegularPointerPrefetch(triton::gpu::intel::PrefetchOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { - Attribute blockIOAttr = op->getAttr(TritonIntelGPUDialect::getBlockIOAttrName()); - if (!blockIOAttr) { - llvm_unreachable("Unexpected prefetch operation on unstructured memory " - "which may pollute the cache"); + if (!blockIOAttr) return failure(); - } // Only support rank 2 block pointer, either row major or column major. StringRef memoryLayoutInfo = cast(blockIOAttr).getValue(); @@ -562,50 +558,41 @@ struct PrefetchOpConversion const bool memoryRowMajor = (memoryLayoutInfo == "row_major"); // TODO: To support more layouts on memory. - if (!memoryRowMajor) { + if (!memoryRowMajor) return failure(); - } - - Value ptr = op.getPtr(); - auto tensorOfPointers = cast(ptr.getType()); - // TODO: To support more layouts in register. 
- if (!hasDotDpasEncoding(tensorOfPointers)) + auto tensorOfPointers = cast(op.getPtr().getType()); + std::optional encoding = + getDotEncoding(tensorOfPointers); + if (!encoding) return failure(); - auto encoding = getDotEncoding(tensorOfPointers).value(); - auto dpasLayout = cast(encoding.getParent()); + auto dpasLayout = cast(encoding->getParent()); SmallVector warpsPerCTA(dpasLayout.getWarpsPerCTA()); - auto cluster = dpasLayout.getRepCluster(); + ArrayRef cluster = dpasLayout.getRepCluster(); SmallVector repCluster{cluster.begin(), cluster.end()}; - auto tensorShape = tensorOfPointers.getShape(); - - DpasEncodingAttr::OpIdx opIdx; - auto getOpIdx = [&]() -> DpasEncodingAttr::OpIdx { - auto dotLayout = getDotEncoding(tensorOfPointers).value(); - return static_cast(dotLayout.getOpIdx()); - }; - opIdx = getOpIdx(); - + ArrayRef tensorShape = tensorOfPointers.getShape(); + DpasEncodingAttr::OpIdx opIdx = getOpIdx(tensorOfPointers); SmallVector repetitions = dpasLayout.getDPASRepetitions(tensorShape, opIdx); assert(repetitions.size() == 3 && "getDPASRepetitions always return rank 3 size"); SmallVector numReps{repetitions.begin() + 1, repetitions.end()}; + SmallVector shardTensorShape; switch (opIdx) { case DpasEncodingAttr::OpIdx::OperandA: { - auto opAShape = dpasLayout.getShapeA(); - shardTensorShape = {std::min(tensorShape[0], opAShape[0]), - tensorShape[1]}; + shardTensorShape = { + std::min(tensorShape[0], dpasLayout.getShapeA()[0]), + tensorShape[1]}; warpsPerCTA[1] = 1; repCluster[1] = 1; numReps[1] = 1; } break; case DpasEncodingAttr::OpIdx::OperandB: { - auto opBShape = dpasLayout.getShapeB(); - shardTensorShape = {tensorShape[0], - std::min(tensorShape[1], opBShape[1])}; + shardTensorShape = { + tensorShape[0], + std::min(tensorShape[1], dpasLayout.getShapeB()[1])}; warpsPerCTA[0] = 1; repCluster[0] = 1; numReps[0] = 1; @@ -620,27 +607,26 @@ struct PrefetchOpConversion auto tensorType = RankedTensorType::get(shardTensorShape, elementType, tensorOfPointers.getEncoding()); - SmallVector prefetchShape = - get2DPrefetchShapePerWarp(tensorType); - Value mask = op.getMask(); unsigned maskConstancyHor = std::numeric_limits::max(), maskConstancyVer = std::numeric_limits::max(); if (mask) { // No need to check the constancy of scalar mask. if (auto maskTy = dyn_cast_or_null(mask.getType())) { - auto axisInfo = const_cast( - axisAnalysisPass) - .getAxisInfo(mask); + maskConstancyHor = maskConstancyVer = 1; + AxisInfo *axisInfo = + const_cast( + axisAnalysisPass) + .getAxisInfo(mask); if (axisInfo) { maskConstancyHor = axisInfo->getConstancy(1); maskConstancyVer = axisInfo->getConstancy(0); - } else { - maskConstancyHor = 1; - maskConstancyVer = 1; } } } + + SmallVector prefetchShape = + get2DPrefetchShapePerWarp(tensorType); prefetchShape = {std::min(prefetchShape[0], maskConstancyVer), std::min(prefetchShape[1], maskConstancyHor)}; @@ -680,17 +666,16 @@ struct PrefetchOpConversion Value llPtr = adaptor.getPtr(); Value llMask = adaptor.getMask(); - SmallVector ptrElems, maskElems; // Get the LLVM values for pointers - ptrElems = unpackLLElements(loc, llPtr, rewriter); - if (llMask) { + SmallVector ptrElems = unpackLLElements(loc, llPtr, rewriter); + SmallVector maskElems; + if (llMask) maskElems = unpackLLElements(loc, llMask, rewriter); - } // re-arrange the baseAddrs and masks to for large 2D block IO. // Layout is unrelated to the scalar type. 
SmallVector> offsets = - mlir::emitOffsetForLayout(encoding, tensorOfPointers); + emitOffsetForLayout(*encoding, tensorOfPointers); for (size_t i = 0; i < ptrElems.size(); ++i) { SmallVector offset = offsets[i]; baseAddrs[offset] = ptrElems[i]; @@ -703,15 +688,13 @@ struct PrefetchOpConversion if (baseAddrs.count({0, 0}) == 0 || baseAddrs.count({1, 0}) == 0) return failure(); - Value base, baseWidth, baseHeight, rowStrideInBytes, colStride, offsetBaseX, - offsetBaseY; - - baseWidth = b.i32_val(vBlocks * tileWidthInElem * (elemSizeInBits / 8)); - baseHeight = b.i32_val(tileHeightInElem); - offsetBaseX = b.i32_val(0); - offsetBaseY = b.i32_val(0); - rowStrideInBytes = b.sub(b.ptrtoint(i64_ty, baseAddrs[{1, 0}]), - b.ptrtoint(i64_ty, baseAddrs[{0, 0}])); + Value baseWidth = + b.i32_val(vBlocks * tileWidthInElem * (elemSizeInBits / 8)); + Value baseHeight = b.i32_val(tileHeightInElem); + Value offsetBaseX = b.i32_val(0); + Value offsetBaseY = b.i32_val(0); + Value rowStrideInBytes = b.sub(b.ptrtoint(i64_ty, baseAddrs[{1, 0}]), + b.ptrtoint(i64_ty, baseAddrs[{0, 0}])); rowStrideInBytes = targetInfo.shuffleIdx(rewriter, loc, rowStrideInBytes, 0); rowStrideInBytes = b.umax(b.trunc(i32_ty, rowStrideInBytes), baseWidth); @@ -727,19 +710,18 @@ struct PrefetchOpConversion unsigned offsetM = row * warpsPerCTA[0] * shardTensorShape[0] + i * prefetchShape[0]; Value pred; - if (llMask) { - if (maskElems.size() > 1) { - pred = targetInfo.shuffleIdx(rewriter, loc, - masks[{offsetM, offsetN}], 0); - } else { - pred = maskElems[0]; - } - } else { + if (llMask) + pred = (maskElems.size() > 1) + ? targetInfo.shuffleIdx(rewriter, loc, + masks[{offsetM, offsetN}], 0) + : maskElems[0]; + + else pred = b.int_val(1, 1); - } + Value offsetY = b.select(pred, b.i32_val(0), baseHeight); - auto addr = targetInfo.shuffleIdx(rewriter, loc, - baseAddrs[{offsetM, offsetN}], 0); + Value addr = targetInfo.shuffleIdx( + rewriter, loc, baseAddrs[{offsetM, offsetN}], 0); auto newOp = rewriter.create( loc, From d28fefe5e6d24b5a61d1c12a6027b14bf1033e36 Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Thu, 1 May 2025 19:34:44 +0000 Subject: [PATCH 09/11] Fix failing gemm bmk Signed-off-by: Tiotto, Ettore --- test/TritonIntelGPU/prefetch-to-llvm.mlir | 80 +++++++++---------- .../LoadStoreOpToLLVM.cpp | 5 +- 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir index 27f81be915..ec1a684054 100644 --- a/test/TritonIntelGPU/prefetch-to-llvm.mlir +++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir @@ -1,5 +1,6 @@ // RUN: triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm --cse -canonicalize | FileCheck %s +// CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} // CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @prefetch_block_ptr( @@ -11,13 +12,14 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} %c0_i32 = arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 - // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 - // 
CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 - // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 + // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 + // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 @@ -25,10 +27,10 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], %[[CST_1]] : i32 // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: %[[ROW_MAJOR_BASE_WIDTH_I32:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 - // CHECK: %[[ROW_MAJOR_BASE_HEIGHT_I32:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[ROW_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 + // CHECK: %[[ROW_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 + // CHECK: %[[ROW_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 @@ -37,40 +39,38 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 - // CHECK: %[[OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 - // CHECK: %[[OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 - // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[OFFSETS:.*]] = llvm.insertelement %[[OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH_I32]], %[[ROW_MAJOR_BASE_HEIGHT_I32]], %[[PITCH]], %[[OFFSETS]]) {{.*}} + // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 + // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 + // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], 
%[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> - // COM: The memory layout is same for the column major memory and row major memory. The prefetch should be the same. - - // CHECK: %[[VAL_38:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {no_unwind, will_return} : () -> i32 - // CHECK: %[[VAL_39:.*]] = llvm.zext %[[VAL_38]] : i32 to i64 - // CHECK: %[[VAL_40:.*]] = llvm.trunc %[[VAL_39]] : i64 to i32 - // CHECK: %[[VAL_41:.*]] = llvm.urem %[[VAL_40]], %[[CST_1]] : i32 - // CHECK: %[[VAL_42:.*]] = llvm.udiv %[[VAL_40]], %[[CST_1]] : i32 - // CHECK: %[[VAL_43:.*]] = llvm.urem %[[VAL_42]], %[[CST_8]] : i32 - // CHECK: %[[VAL_44:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: %[[COLUM_MAJOR_BASE_WIDTH_I32:.*]] = llvm.trunc %[[VAL_44]] : i64 to i32 - // CHECK: %[[COLUM_MAJOR_BASE_HEIGHT_I32:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 - // CHECK: %[[VAL_47:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[COLUM_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_47]] : i64 to i32 - // CHECK: %[[VAL_49:.*]] = llvm.mul %[[VAL_41]], %[[CST_32]] : i32 + // CHECK: %[[VAL_32:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[VAL_33:.*]] = llvm.zext %[[VAL_32]] : i32 to i64 + // CHECK: %[[VAL_34:.*]] = llvm.trunc %[[VAL_33]] : i64 to i32 + // CHECK: %[[VAL_35:.*]] = llvm.urem %[[VAL_34]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_36:.*]] = llvm.udiv %[[VAL_34]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_37:.*]] = llvm.urem %[[VAL_36]], %[[CST_4]] : i32 + // CHECK: %[[VAL_38:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[COL_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_38]] : i64 to i32 + // CHECK: %[[COL_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_41:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[COL_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_41]] : i64 to i32 + // CHECK: %[[VAL_43:.*]] = llvm.mul %[[VAL_35]], %[[CST_16]] : i32 + // CHECK: %[[VAL_44:.*]] = llvm.add %[[VAL_43]], %[[CST_0]] : i32 + // CHECK: %[[VAL_45:.*]] = llvm.urem %[[VAL_44]], %[[CST_32]] : i32 + // CHECK: %[[VAL_46:.*]] = llvm.add %[[VAL_45]], %[[CST_0]] : i32 + // CHECK: %[[VAL_47:.*]] = llvm.mul %[[VAL_37]], %[[CST_4]] : i32 + // CHECK: %[[VAL_48:.*]] = llvm.add %[[VAL_47]], %[[CST_0]] : i32 + // CHECK: %[[VAL_49:.*]] = llvm.urem %[[VAL_48]], %[[CST_16]] : i32 // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 - // CHECK: %[[VAL_51:.*]] = llvm.urem %[[VAL_50]], %[[CST_32]] : i32 - // CHECK: %[[VAL_52:.*]] = llvm.add %[[VAL_51]], %[[CST_0]] : i32 - // CHECK: %[[VAL_53:.*]] = llvm.mul %[[VAL_43]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_54:.*]] = llvm.add %[[VAL_53]], %[[CST_0]] : i32 - // CHECK: %[[VAL_55:.*]] = llvm.urem %[[VAL_54]], %[[CST_16]] : i32 - // CHECK: %[[VAL_56:.*]] = llvm.add %[[VAL_55]], %[[CST_0]] : i32 - // CHECK: %[[VAL_57:.*]] = llvm.trunc %[[VAL_56]] : i32 to i32 - // CHECK: %[[VAL_58:.*]] = llvm.trunc %[[VAL_52]] : i32 to i32 - // CHECK: %[[VAL_59:.*]] = llvm.insertelement %[[VAL_58]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[COLUM_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[VAL_57]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], 
%[[COLUM_MAJOR_BASE_WIDTH_I32]], %[[COLUM_MAJOR_BASE_HEIGHT_I32]], %[[COLUM_MAJOR_PITCH]], %[[COLUM_MAJOR_OFFSETS]]) {{.*}} + // CHECK: %[[COL_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_50]] : i32 to i32 + // CHECK: %[[COL_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_46]] : i32 to i32 + // CHECK: %[[VAL_54:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[COL_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(%[[BASE]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSETS]]) {{.*}} %columnMajorPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%c1_i64, %arg5], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "column_major"} : !tt.ptr> diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 4d3f245030..e0e4564cb4 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -425,8 +425,9 @@ struct PrefetchOpConversion // size base on row major shape. std::swap(tensorShape[0], tensorShape[1]); - tensorType = RankedTensorType::get( - tensorShape, tensorType.getElementType(), tensorType.getEncoding()); + // tensorType = RankedTensorType::get( + // tensorShape, tensorType.getElementType(), + // tensorType.getEncoding()); } unsigned numWarps = triton::gpu::lookupNumWarps(op); From 95f71264d9c75e5a0bd95b8731e2ecb8019d63bd Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Fri, 2 May 2025 16:03:36 +0000 Subject: [PATCH 10/11] Fix merge Signed-off-by: Tiotto, Ettore --- test/TritonIntelGPU/prefetch-to-llvm.mlir | 126 ++++++++---------- third_party/intel/backend/compiler.py | 2 +- .../LoadStoreOpToLLVM.cpp | 5 +- 3 files changed, 60 insertions(+), 73 deletions(-) diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir index bfa9d1bd98..f02818a894 100644 --- a/test/TritonIntelGPU/prefetch-to-llvm.mlir +++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir @@ -12,79 +12,69 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} %c0_i32 = arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 - // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 - // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() - // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 - // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 - // CHECK: %[[VAL_18:.*]] = llvm.urem %[[VAL_17]], %[[CST_1]] : i32 - // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], %[[CST_1]] : i32 - // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 - // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: 
%[[ROW_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 - // CHECK: %[[ROW_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 - // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[ROW_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 - // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 - // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 - // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 - // CHECK: %[[VAL_29:.*]] = llvm.add %[[VAL_28]], %[[CST_0]] : i32 - // CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_20]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 - // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 - // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 - // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 - // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 -<<<<<<< HEAD - // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], %[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) + // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 + // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 + // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 + // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 + // CHECK: %[[VAL_18:.*]] = llvm.urem %[[VAL_17]], %[[CST_1]] : i32 + // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], %[[CST_1]] : i32 + // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 + // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[ROW_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 + // CHECK: %[[ROW_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[ROW_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 + // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 + // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 + // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 + // CHECK: %[[VAL_29:.*]] = llvm.add %[[VAL_28]], %[[CST_0]] : i32 + // CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_20]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 + // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 + // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 + // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 + // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 + // CHECK: %[[VAL_36:.*]] = llvm.insertelement 
%[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], %[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> -======= - - // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], %[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) - %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > - triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> - ->>>>>>> origin/main - // CHECK: %[[VAL_32:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() - // CHECK: %[[VAL_33:.*]] = llvm.zext %[[VAL_32]] : i32 to i64 - // CHECK: %[[VAL_34:.*]] = llvm.trunc %[[VAL_33]] : i64 to i32 - // CHECK: %[[VAL_35:.*]] = llvm.urem %[[VAL_34]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_36:.*]] = llvm.udiv %[[VAL_34]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_37:.*]] = llvm.urem %[[VAL_36]], %[[CST_4]] : i32 - // CHECK: %[[VAL_38:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: %[[COL_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_38]] : i64 to i32 - // CHECK: %[[COL_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 - // CHECK: %[[VAL_41:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[COL_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_41]] : i64 to i32 - // CHECK: %[[VAL_43:.*]] = llvm.mul %[[VAL_35]], %[[CST_16]] : i32 - // CHECK: %[[VAL_44:.*]] = llvm.add %[[VAL_43]], %[[CST_0]] : i32 - // CHECK: %[[VAL_45:.*]] = llvm.urem %[[VAL_44]], %[[CST_32]] : i32 - // CHECK: %[[VAL_46:.*]] = llvm.add %[[VAL_45]], %[[CST_0]] : i32 - // CHECK: %[[VAL_47:.*]] = llvm.mul %[[VAL_37]], %[[CST_4]] : i32 - // CHECK: %[[VAL_48:.*]] = llvm.add %[[VAL_47]], %[[CST_0]] : i32 - // CHECK: %[[VAL_49:.*]] = llvm.urem %[[VAL_48]], %[[CST_16]] : i32 - // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 - // CHECK: %[[COL_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_50]] : i32 to i32 - // CHECK: %[[COL_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_46]] : i32 to i32 - // CHECK: %[[VAL_54:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[COL_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(%[[BASE]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSETS]]) {{.*}} + // CHECK: %[[VAL_32:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[VAL_33:.*]] = llvm.zext %[[VAL_32]] : i32 to i64 + // CHECK: %[[VAL_34:.*]] = llvm.trunc %[[VAL_33]] : i64 to i32 + // CHECK: %[[VAL_35:.*]] = llvm.urem 
%[[VAL_34]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_36:.*]] = llvm.udiv %[[VAL_34]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_37:.*]] = llvm.urem %[[VAL_36]], %[[CST_4]] : i32 + // CHECK: %[[VAL_38:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[COL_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_38]] : i64 to i32 + // CHECK: %[[COL_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_41:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[COL_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_41]] : i64 to i32 + // CHECK: %[[VAL_43:.*]] = llvm.mul %[[VAL_35]], %[[CST_16]] : i32 + // CHECK: %[[VAL_44:.*]] = llvm.add %[[VAL_43]], %[[CST_0]] : i32 + // CHECK: %[[VAL_45:.*]] = llvm.urem %[[VAL_44]], %[[CST_32]] : i32 + // CHECK: %[[VAL_46:.*]] = llvm.add %[[VAL_45]], %[[CST_0]] : i32 + // CHECK: %[[VAL_47:.*]] = llvm.mul %[[VAL_37]], %[[CST_4]] : i32 + // CHECK: %[[VAL_48:.*]] = llvm.add %[[VAL_47]], %[[CST_0]] : i32 + // CHECK: %[[VAL_49:.*]] = llvm.urem %[[VAL_48]], %[[CST_16]] : i32 + // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 + // CHECK: %[[COL_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_50]] : i32 to i32 + // CHECK: %[[COL_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_46]] : i32 to i32 + // CHECK: %[[VAL_54:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[COL_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(%[[BASE]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSETS]]) {{.*}} %columnMajorPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%c1_i64, %arg5], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "column_major"} : !tt.ptr> - // COM: The memory is not structured densely. Not to prefetch it to the cache. + // COM: The memory is not structured densely. Ensure it is not prefetched to the cache. // CHECK-NOT: block_prefetch %nonContiguousPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%arg5, %arg5], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %nonContiguousPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> @@ -169,7 +159,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} // ----- -// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and not 2D block prefetch operation gets generated. +// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and 2D block prefetch operations aren't generated. 
#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [4, 1], A = [32, 8], B = [8, 16], C = [32, 16]}> module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @kernel diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 8ca0dba87a..0fad28fd9e 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -310,7 +310,7 @@ def make_ttgir(mod, metadata, opt, properties): intel.passes.ttgpuir.add_accelerate_matmul(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_materialize_block_pointer(pm) - intel.passes.ttgpuir.add_remove_layout_conversions(pm) + ## intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, XPUBackend.get_split_barrier_scope(opt)) passes.ttgpuir.add_fuse_nested_loops(pm) diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 616520c3ec..d29841b5f2 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -427,10 +427,6 @@ struct PrefetchOpConversion // Swap the shape to make it row major and then get the tiling // size base on row major shape. std::swap(tensorShape[0], tensorShape[1]); - - // tensorType = RankedTensorType::get( - // tensorShape, tensorType.getElementType(), - // tensorType.getEncoding()); } unsigned numWarps = triton::gpu::lookupNumWarps(op); @@ -713,6 +709,7 @@ struct PrefetchOpConversion j * prefetchShape[1]; unsigned offsetM = row * warpsPerCTA[0] * shardTensorShape[0] + i * prefetchShape[0]; + Value pred; if (llMask) pred = (maskElems.size() > 1) From cad0433e1b4b6f7b1308a57e056476529a835e4a Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Fri, 2 May 2025 19:33:41 +0000 Subject: [PATCH 11/11] recover original remaining changes --- test/TritonIntelGPU/prefetch-to-llvm.mlir | 119 +++++++++--------- third_party/intel/backend/compiler.py | 2 +- .../LoadStoreOpToLLVM.cpp | 3 + 3 files changed, 64 insertions(+), 60 deletions(-) diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir index f02818a894..ce802dbcec 100644 --- a/test/TritonIntelGPU/prefetch-to-llvm.mlir +++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir @@ -1,6 +1,5 @@ // RUN: triton-opt %s -split-input-file --convert-triton-intel-gpu-to-llvm --cse -canonicalize | FileCheck %s -// CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} // CHECK-DAG: llvm.func spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @prefetch_block_ptr( @@ -12,69 +11,71 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} %c0_i32 = 
arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 - // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 - // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() - // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 - // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 - // CHECK: %[[VAL_18:.*]] = llvm.urem %[[VAL_17]], %[[CST_1]] : i32 - // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], %[[CST_1]] : i32 - // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 - // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: %[[ROW_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 - // CHECK: %[[ROW_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 - // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[ROW_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 - // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 - // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 - // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 - // CHECK: %[[VAL_29:.*]] = llvm.add %[[VAL_28]], %[[CST_0]] : i32 - // CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_20]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 - // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 - // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 - // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 - // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 - // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], %[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) + // CHECK-DAG: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-DAG: %[[CST_2_I32:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 + // CHECK-DAG: %[[CST_2:.*]] = llvm.mlir.constant(2 : i64) : i64 + // CHECK-DAG: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: %[[CST_1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[VAL_15:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() + // CHECK: %[[VAL_16:.*]] = llvm.zext %[[VAL_15]] : i32 to i64 + // CHECK: %[[VAL_17:.*]] = llvm.trunc %[[VAL_16]] : i64 to i32 + // CHECK: %[[VAL_18:.*]] = llvm.urem %[[VAL_17]], %[[CST_1]] : i32 + // CHECK: %[[VAL_19:.*]] = llvm.udiv %[[VAL_17]], %[[CST_1]] : i32 + // CHECK: %[[VAL_20:.*]] = llvm.urem %[[VAL_19]], %[[CST_8]] : i32 + // CHECK: %[[VAL_21:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[ROW_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_21]] : i64 to i32 + 
// CHECK: %[[ROW_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_24:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[ROW_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 + // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_18]], %[[CST_32]] : i32 + // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 + // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 + // CHECK: %[[VAL_29:.*]] = llvm.add %[[VAL_28]], %[[CST_0]] : i32 + // CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_20]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 + // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 + // CHECK: %[[VAL_33:.*]] = llvm.add %[[VAL_32]], %[[CST_0]] : i32 + // CHECK: %[[ROW_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_33]] : i32 to i32 + // CHECK: %[[ROW_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_29]] : i32 to i32 + + // CHECK: %[[VAL_36:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[ROW_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[ROW_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[ROW_MAJOR_BASE_WIDTH]], %[[ROW_MAJOR_BASE_HEIGHT]], %[[ROW_MAJOR_PITCH]], %[[ROW_MAJOR_OFFSETS]]) %rowMajorPtr = tt.make_tensor_ptr %arg0, [%arg2, %arg4], [%arg5, %c1_i64], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %rowMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "row_major"} : !tt.ptr> - // CHECK: %[[VAL_32:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() - // CHECK: %[[VAL_33:.*]] = llvm.zext %[[VAL_32]] : i32 to i64 - // CHECK: %[[VAL_34:.*]] = llvm.trunc %[[VAL_33]] : i64 to i32 - // CHECK: %[[VAL_35:.*]] = llvm.urem %[[VAL_34]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_36:.*]] = llvm.udiv %[[VAL_34]], %[[CST_2_I32]] : i32 - // CHECK: %[[VAL_37:.*]] = llvm.urem %[[VAL_36]], %[[CST_4]] : i32 - // CHECK: %[[VAL_38:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 - // CHECK: %[[COL_MAJOR_BASE_WIDTH:.*]] = llvm.trunc %[[VAL_38]] : i64 to i32 - // CHECK: %[[COL_MAJOR_BASE_HEIGHT:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 - // CHECK: %[[VAL_41:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 - // CHECK: %[[COL_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_41]] : i64 to i32 - // CHECK: %[[VAL_43:.*]] = llvm.mul %[[VAL_35]], %[[CST_16]] : i32 - // CHECK: %[[VAL_44:.*]] = llvm.add %[[VAL_43]], %[[CST_0]] : i32 - // CHECK: %[[VAL_45:.*]] = llvm.urem %[[VAL_44]], %[[CST_32]] : i32 - // CHECK: %[[VAL_46:.*]] = llvm.add %[[VAL_45]], %[[CST_0]] : i32 - // CHECK: %[[VAL_47:.*]] = llvm.mul %[[VAL_37]], %[[CST_4]] : i32 - // CHECK: %[[VAL_48:.*]] = llvm.add %[[VAL_47]], %[[CST_0]] : i32 - // CHECK: %[[VAL_49:.*]] = llvm.urem %[[VAL_48]], %[[CST_16]] : i32 - // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 - // CHECK: %[[COL_MAJOR_OFFSET_Y:.*]] = llvm.trunc %[[VAL_50]] : i32 to i32 - // CHECK: %[[COL_MAJOR_OFFSET_X:.*]] = llvm.trunc %[[VAL_46]] : i32 to i32 - // CHECK: %[[VAL_54:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_X]], {{.*}} : i32] : vector<2xi32> - // CHECK: %[[COL_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[COL_MAJOR_OFFSET_Y]], {{.*}} : i32] : vector<2xi32> - // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_4r16x1cPU3AS1viiiDv2_i(%[[BASE]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], 
%[[COL_MAJOR_OFFSETS]]) {{.*}} + // COM: The memory layout is same for the column major memory and row major memory. The prefetch should be the same. + + // CHECK: %[[VAL_38:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {no_unwind, will_return} : () -> i32 + // CHECK: %[[VAL_39:.*]] = llvm.zext %[[VAL_38]] : i32 to i64 + // CHECK: %[[VAL_40:.*]] = llvm.trunc %[[VAL_39]] : i64 to i32 + // CHECK: %[[VAL_41:.*]] = llvm.urem %[[VAL_40]], %[[CST_1]] : i32 + // CHECK: %[[VAL_42:.*]] = llvm.udiv %[[VAL_40]], %[[CST_1]] : i32 + // CHECK: %[[VAL_43:.*]] = llvm.urem %[[VAL_42]], %[[CST_8]] : i32 + // CHECK: %[[VAL_44:.*]] = llvm.mul %[[BASE_WIDTH]], %[[CST_2]] : i64 + // CHECK: %[[COLUM_MAJOR_BASE_WIDTH_I32:.*]] = llvm.trunc %[[VAL_44]] : i64 to i32 + // CHECK: %[[COLUM_MAJOR_BASE_HEIGHT_I32:.*]] = llvm.trunc %[[BASE_HEIGHT]] : i64 to i32 + // CHECK: %[[VAL_47:.*]] = llvm.mul %[[ROW_STRIDE]], %[[CST_2]] : i64 + // CHECK: %[[COLUM_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_47]] : i64 to i32 + // CHECK: %[[VAL_49:.*]] = llvm.mul %[[VAL_41]], %[[CST_32]] : i32 + // CHECK: %[[VAL_50:.*]] = llvm.add %[[VAL_49]], %[[CST_0]] : i32 + // CHECK: %[[VAL_51:.*]] = llvm.urem %[[VAL_50]], %[[CST_32]] : i32 + // CHECK: %[[VAL_52:.*]] = llvm.add %[[VAL_51]], %[[CST_0]] : i32 + // CHECK: %[[VAL_53:.*]] = llvm.mul %[[VAL_43]], %[[CST_2_I32]] : i32 + // CHECK: %[[VAL_54:.*]] = llvm.add %[[VAL_53]], %[[CST_0]] : i32 + // CHECK: %[[VAL_55:.*]] = llvm.urem %[[VAL_54]], %[[CST_16]] : i32 + // CHECK: %[[VAL_56:.*]] = llvm.add %[[VAL_55]], %[[CST_0]] : i32 + // CHECK: %[[VAL_57:.*]] = llvm.trunc %[[VAL_56]] : i32 to i32 + // CHECK: %[[VAL_58:.*]] = llvm.trunc %[[VAL_52]] : i32 to i32 + // CHECK: %[[VAL_59:.*]] = llvm.insertelement %[[VAL_58]], {{.*}} : i32] : vector<2xi32> + // CHECK: %[[COLUM_MAJOR_OFFSETS:.*]] = llvm.insertelement %[[VAL_57]], {{.*}} : i32] : vector<2xi32> + // CHECK: llvm.call spir_funccc @_Z45intel_sub_group_2d_block_prefetch_16b_2r16x2cPU3AS1viiiDv2_i(%[[BASE]], %[[COLUM_MAJOR_BASE_WIDTH_I32]], %[[COLUM_MAJOR_BASE_HEIGHT_I32]], %[[COLUM_MAJOR_PITCH]], %[[COLUM_MAJOR_OFFSETS]]) {{.*}} %columnMajorPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%c1_i64, %arg5], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, triton_intel_gpu.block_io = "column_major"} : !tt.ptr> - // COM: The memory is not structured densely. Ensure it is not prefetched to the cache. + // COM: The memory is not structured densely. Not to prefetch it to the cache. // CHECK-NOT: block_prefetch %nonContiguousPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%arg5, %arg5], [%c0_i32, %c0_i32] {order = array} : > triton_intel_gpu.prefetch %nonContiguousPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> @@ -159,7 +160,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} // ----- -// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and 2D block prefetch operations aren't generated. +// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and not 2D block prefetch operation gets generated. 
#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [4, 1], A = [32, 8], B = [8, 16], C = [32, 16]}> module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @kernel diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 0fad28fd9e..8ca0dba87a 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -310,7 +310,7 @@ def make_ttgir(mod, metadata, opt, properties): intel.passes.ttgpuir.add_accelerate_matmul(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_materialize_block_pointer(pm) - ## intel.passes.ttgpuir.add_remove_layout_conversions(pm) + intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, XPUBackend.get_split_barrier_scope(opt)) passes.ttgpuir.add_fuse_nested_loops(pm) diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 9e31a33b4a..a6903c4a07 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -427,6 +427,9 @@ struct PrefetchOpConversion // Swap the shape to make it row major and then get the tiling // size base on row major shape. std::swap(tensorShape[0], tensorShape[1]); + tensorType = RankedTensorType::get( + tensorShape, tensorType.getElementType(), + tensorType.getEncoding()); } unsigned numWarps = triton::gpu::lookupNumWarps(op);
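
The mask handling added to PrefetchOpConversion in this series clamps the per-warp 2D prefetch tile by the mask's constancy reported by axis analysis, so that a single predicate stays valid for the whole block being prefetched. Below is a minimal standalone sketch of that idea; it is not part of the patch, and every name in it (MaskConstancy, clampPrefetchShape) is illustrative rather than taken from the Triton code base.

// Illustrative sketch only (not part of the patch). It mirrors the intent of
// the mask handling in PrefetchOpConversion: the per-warp 2D prefetch tile is
// clamped so that every element of the tile shares the same mask value.
#include <algorithm>
#include <array>
#include <cstdint>
#include <limits>
#include <optional>

// Constancy of the mask along {dim0 (rows), dim1 (cols)}; std::nullopt means
// the mask is scalar (or absent), so it imposes no limit on the tile shape.
using MaskConstancy = std::optional<std::array<uint64_t, 2>>;

std::array<uint64_t, 2> clampPrefetchShape(std::array<uint64_t, 2> shape,
                                           const MaskConstancy &constancy) {
  uint64_t ver = std::numeric_limits<uint64_t>::max();
  uint64_t hor = std::numeric_limits<uint64_t>::max();
  if (constancy) {
    ver = (*constancy)[0]; // rows over which the mask is uniform
    hor = (*constancy)[1]; // cols over which the mask is uniform
  }
  return {std::min(shape[0], ver), std::min(shape[1], hor)};
}

For instance, a 16x32 per-warp tile combined with a tensor mask whose constancy is {8, 32} would be clamped to an 8x32 tile, consistent with the prefetchShape = {min(prefetchShape[0], maskConstancyVer), min(prefetchShape[1], maskConstancyHor)} update shown earlier in the series.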