@@ -100,11 +100,11 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
100
100
%74 = tt.splat %73 : i32 -> tensor <1 x32 xi32 , #blocked >
101
101
%75 = arith.cmpi slt , %33 , %74 : tensor <1 x32 xi32 , #blocked >
102
102
%76 = tt.broadcast %75 : tensor <1 x32 xi1 , #blocked > -> tensor <64 x32 xi1 , #blocked >
103
- %77 = tt.load %arg11 , %76 , %cst_0 : tensor <64 x32 x!tt.ptr <f16 >, #blocked >
103
+ %77 = tt.load %arg11 , %76 , %cst_0 { triton_intel_gpu.block_io = " row_major " } : tensor <64 x32 x!tt.ptr <f16 >, #blocked >
104
104
%78 = tt.splat %73 : i32 -> tensor <32 x1 xi32 , #blocked1 >
105
105
%79 = arith.cmpi slt , %40 , %78 : tensor <32 x1 xi32 , #blocked1 >
106
106
%80 = tt.broadcast %79 : tensor <32 x1 xi1 , #blocked1 > -> tensor <32 x256 xi1 , #blocked1 >
107
- %81 = tt.load %arg12 , %80 , %cst_1 : tensor <32 x256 x!tt.ptr <f16 >, #blocked1 >
107
+ %81 = tt.load %arg12 , %80 , %cst_1 { triton_intel_gpu.block_io = " row_major " } : tensor <32 x256 x!tt.ptr <f16 >, #blocked1 >
108
108
%82 = ttg.convert_layout %77 : tensor <64 x32 xf16 , #blocked > -> tensor <64 x32 xf16 , #dot0 >
109
109
%83 = ttg.convert_layout %81 : tensor <32 x256 xf16 , #blocked1 > -> tensor <32 x256 xf16 , #dot1 >
110
110
%84 = tt.dot %82 , %83 , %arg10 , inputPrecision = tf32 : tensor <64 x32 xf16 , #dot0 > * tensor <32 x256 xf16 , #dot1 > -> tensor <64 x256 xf32 , #dpas >
@@ -175,8 +175,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
175
175
// CHECK: tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
176
176
// CHECK-NEXT: scf.yield
177
177
%23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c64_i32 iter_args (%arg10 = %cst , %arg11 = %18 , %arg12 = %22 ) -> (tensor <128 x256 xf32 , #dpas >, !tt.ptr <tensor <128 x64 xf16 , #dot0 >>, !tt.ptr <tensor <64 x256 xf16 , #dot1 >>) : i32 {
178
- %56 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <128 x64 xf16 , #dot0 >>
179
- %57 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <64 x256 xf16 , #dot1 >>
178
+ %56 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <128 x64 xf16 , #dot0 >>
179
+ %57 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <64 x256 xf16 , #dot1 >>
180
180
%58 = tt.dot %56 , %57 , %arg10 , inputPrecision = tf32 : tensor <128 x64 xf16 , #dot0 > * tensor <64 x256 xf16 , #dot1 > -> tensor <128 x256 xf32 , #dpas >
181
181
%59 = tt.advance %arg11 , [%c0_i32 , %c64_i32 ] : <tensor <128 x64 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth = 1 }>>>
182
182
%60 = tt.advance %arg12 , [%c64_i32 , %c0_i32 ] : <tensor <64 x256 xf16 , #ttg.dot_op <{opIdx = 1 , parent = #dpas , kWidth = 2 }>>>
@@ -248,8 +248,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
248
248
// CHECK: tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
249
249
// CHECK-NEXT: scf.yield
250
250
%23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c64_i32 iter_args (%arg10 = %cst , %arg11 = %18 , %arg12 = %22 ) -> (tensor <128 x256 xf32 , #dpas >, !tt.ptr <tensor <128 x64 xf16 , #dot0 >>, !tt.ptr <tensor <64 x256 xf16 , #dot1 >>) : i32 {
251
- %56 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <128 x64 xf16 , #dot0 >>
252
- %57 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <64 x256 xf16 , #dot1 >>
251
+ %56 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <128 x64 xf16 , #dot0 >>
252
+ %57 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <64 x256 xf16 , #dot1 >>
253
253
%58 = tt.dot %56 , %57 , %arg10 , inputPrecision = tf32 : tensor <128 x64 xf16 , #dot0 > * tensor <64 x256 xf16 , #dot1 > -> tensor <128 x256 xf32 , #dpas >
254
254
%102 = tt.addptr %arg8 , %c4_i32 : !tt.ptr <i32 >, i32
255
255
%100 = arith.addi %c0_i32 , %c4_i32 : i32
0 commit comments