@@ -79,32 +79,39 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
79
79
%51 = arith.muli %arg7 , %c32_i32 : i32
80
80
%52 = tt.splat %51 : i32 -> tensor <32 x256 xi32 , #blocked1 >
81
81
// COM: There are 3 stages in loop pipelining: the first 2 prefetching stages are before the loop, and the last one is inside the loop.
82
- // CHECK: triton_intel_gpu.prefetch {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
83
- // CHECK-NEXT: triton_intel_gpu.prefetch {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
84
- // CHECK: triton_intel_gpu.prefetch {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
85
- // CHECK-NEXT: triton_intel_gpu.prefetch {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
86
- // CHECK: scf.for %[[VAL_92:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args(%[[VAL_93:.*]] = {{.*}}, %[[VAL_94:.*]] = {{.*}}, %[[VAL_95:.*]] = {{.*}}, %[[VAL_96:.*]] = {{.*}}, %[[VAL_97:.*]] = {{.*}}) -> (tensor<64x256xf32, #[[$DPAS]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>) : i32 {
82
+ // CHECK: %[[LOAD_MASK:.*]] = arith.cmpi slt, {{.*}} : tensor<1x32xi32, #[[$BLOCK_0]]>
83
+ // CHECK: %[[LOAD_MASK_2D:.*]] = tt.broadcast %[[LOAD_MASK]] : tensor<1x32xi1, #[[$BLOCK_0]]> -> tensor<64x32xi1, #[[$BLOCK_0]]>
84
+ // CHECK: %[[LOOP_MASK:.*]] = tt.splat {{.*}} : i1 -> tensor<64x32xi1, #[[$BLOCK_0]]>
85
+ // CHECK: %[[PREFETCH_MASK:.*]] = arith.andi %[[LOOP_MASK]], %[[LOAD_MASK_2D]] : tensor<64x32xi1, #[[$BLOCK_0]]>
86
+ // CHECK: triton_intel_gpu.prefetch {{.*}}, %[[PREFETCH_MASK]] {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>
87
+ // CHECK: %[[LOAD_MASK_2:.*]] = arith.cmpi slt, {{.*}} : tensor<32x1xi32, #[[$BLOCK_1]]>
88
+ // CHECK: %[[LOAD_MASK_2D_2:.*]] = tt.broadcast %[[LOAD_MASK_2]] : tensor<32x1xi1, #[[$BLOCK_1]]> -> tensor<32x256xi1, #[[$BLOCK_1]]>
89
+ // CHECK: %[[LOOP_MASK:.*]] = tt.splat {{.*}} : i1 -> tensor<32x256xi1, #[[$BLOCK_1]]>
90
+ // CHECK: %[[PREFETCH_MASK:.*]] = arith.andi %[[LOOP_MASK]], %[[LOAD_MASK_2D_2]] : tensor<32x256xi1, #[[$BLOCK_1]]>
91
+ // CHECK: triton_intel_gpu.prefetch {{.*}}, %[[PREFETCH_MASK]] {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>
92
+ // CHECK: triton_intel_gpu.prefetch {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>
93
+ // CHECK: triton_intel_gpu.prefetch {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>
94
+ // CHECK: scf.for %[[VAL_92:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args(%[[VAL_93:.*]] = {{.*}}, %[[VAL_94:.*]] = {{.*}}, %[[VAL_95:.*]] = {{.*}}, %[[VAL_96:.*]] = {{.*}}, %[[VAL_97:.*]] = {{.*}}, %[[VAL_98:.*]] = {{.*}}, %[[VAL_99:.*]] = {{.*}}, %[[VAL_100:.*]] = {{.*}}, %[[VAL_101:.*]] = {{.*}}) -> (tensor<64x256xf32, #[[$DPAS]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>) : i32 {
87
95
// CHECK: %[[VAL_106:.*]] = tt.addptr %[[VAL_94]], {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi32, #[[$BLOCK_0]]>
88
96
// CHECK: %[[VAL_107:.*]] = tt.addptr %[[VAL_95]], {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<32x256xi32, #[[$BLOCK_1]]>
89
- // CHECK: triton_intel_gpu.prefetch %[[VAL_106]] {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
90
- // CHECK: triton_intel_gpu.prefetch %[[VAL_107]] {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
91
- // CHECK: %[[VAL_116:.*]] = tt.load %[[VAL_96]], {{.*}}, {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
92
- // CHECK: %[[VAL_120:.*]] = tt.load %[[VAL_97 ]], {{.*}}, {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
97
+ // CHECK: triton_intel_gpu.prefetch %[[VAL_106]], {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>
98
+ // CHECK: triton_intel_gpu.prefetch %[[VAL_107]], {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>
99
+ // CHECK: %[[VAL_116:.*]] = tt.load %[[VAL_96]], {{.*}}, {{.*}} {triton_intel_gpu.block_io = "row_major"} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
100
+ // CHECK: %[[VAL_120:.*]] = tt.load %[[VAL_99]], {{.*}}, {{.*}} {triton_intel_gpu.block_io = "row_major"} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
93
101
// CHECK: %[[VAL_121:.*]] = ttg.convert_layout %[[VAL_116]] : tensor<64x32xf16, #[[$BLOCK_0]]> -> tensor<64x32xf16, #{{.*}}<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>
94
102
// CHECK: %[[VAL_122:.*]] = ttg.convert_layout %[[VAL_120]] : tensor<32x256xf16, #[[$BLOCK_1]]> -> tensor<32x256xf16, #{{.*}}<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>
95
103
// CHECK: %[[VAL_123:.*]] = tt.dot %[[VAL_121]], %[[VAL_122]], %[[VAL_93]], inputPrecision = tf32 : tensor<64x32xf16, #{{.*}}<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<32x256xf16, #{{.*}}<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<64x256xf32, #[[$DPAS]]>
96
- // CHECK: scf.yield %[[VAL_123]], %[[VAL_106]], %[[VAL_107]], %[[VAL_94]], %[[VAL_95]] : tensor<64x256xf32, #[[$DPAS]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
97
104
%53:3 = scf.for %arg9 = %c0_i32 to %50 step %c1_i32 iter_args (%arg10 = %cst_2 , %arg11 = %38 , %arg12 = %48 ) -> (tensor <64 x256 xf32 , #dpas >, tensor <64 x32 x!tt.ptr <f16 >, #blocked >, tensor <32 x256 x!tt.ptr <f16 >, #blocked1 >) : i32 {
98
105
%72 = arith.muli %arg9 , %c32_i32 : i32
99
106
%73 = arith.subi %arg5 , %72 : i32
100
107
%74 = tt.splat %73 : i32 -> tensor <1 x32 xi32 , #blocked >
101
108
%75 = arith.cmpi slt , %33 , %74 : tensor <1 x32 xi32 , #blocked >
102
109
%76 = tt.broadcast %75 : tensor <1 x32 xi1 , #blocked > -> tensor <64 x32 xi1 , #blocked >
103
- %77 = tt.load %arg11 , %76 , %cst_0 : tensor <64 x32 x!tt.ptr <f16 >, #blocked >
110
+ %77 = tt.load %arg11 , %76 , %cst_0 { triton_intel_gpu.block_io = " row_major " } : tensor <64 x32 x!tt.ptr <f16 >, #blocked >
104
111
%78 = tt.splat %73 : i32 -> tensor <32 x1 xi32 , #blocked1 >
105
112
%79 = arith.cmpi slt , %40 , %78 : tensor <32 x1 xi32 , #blocked1 >
106
113
%80 = tt.broadcast %79 : tensor <32 x1 xi1 , #blocked1 > -> tensor <32 x256 xi1 , #blocked1 >
107
- %81 = tt.load %arg12 , %80 , %cst_1 : tensor <32 x256 x!tt.ptr <f16 >, #blocked1 >
114
+ %81 = tt.load %arg12 , %80 , %cst_1 { triton_intel_gpu.block_io = " row_major " } : tensor <32 x256 x!tt.ptr <f16 >, #blocked1 >
108
115
%82 = ttg.convert_layout %77 : tensor <64 x32 xf16 , #blocked > -> tensor <64 x32 xf16 , #dot0 >
109
116
%83 = ttg.convert_layout %81 : tensor <32 x256 xf16 , #blocked1 > -> tensor <32 x256 xf16 , #dot1 >
110
117
%84 = tt.dot %82 , %83 , %arg10 , inputPrecision = tf32 : tensor <64 x32 xf16 , #dot0 > * tensor <32 x256 xf16 , #dot1 > -> tensor <64 x256 xf32 , #dpas >
@@ -175,8 +182,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
175
182
// CHECK: tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
176
183
// CHECK-NEXT: scf.yield
177
184
%23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c64_i32 iter_args (%arg10 = %cst , %arg11 = %18 , %arg12 = %22 ) -> (tensor <128 x256 xf32 , #dpas >, !tt.ptr <tensor <128 x64 xf16 , #dot0 >>, !tt.ptr <tensor <64 x256 xf16 , #dot1 >>) : i32 {
178
- %56 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <128 x64 xf16 , #dot0 >>
179
- %57 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <64 x256 xf16 , #dot1 >>
185
+ %56 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <128 x64 xf16 , #dot0 >>
186
+ %57 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <64 x256 xf16 , #dot1 >>
180
187
%58 = tt.dot %56 , %57 , %arg10 , inputPrecision = tf32 : tensor <128 x64 xf16 , #dot0 > * tensor <64 x256 xf16 , #dot1 > -> tensor <128 x256 xf32 , #dpas >
181
188
%59 = tt.advance %arg11 , [%c0_i32 , %c64_i32 ] : <tensor <128 x64 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>>
182
189
%60 = tt.advance %arg12 , [%c64_i32 , %c0_i32 ] : <tensor <64 x256 xf16 , #ttg.dot_op <{opIdx = 1 , parent = #dpas , kWidth = 2 }>>>
0 commit comments