From 440dd9aa6e56436c509d40c1338fde64d276f246 Mon Sep 17 00:00:00 2001
From: Whitney Tsang <whitney.tsang@intel.com>
Date: Sun, 27 Apr 2025 23:39:50 +0000
Subject: [PATCH 1/2] [MatmulLoopPipeline] Populate LoadOp mask to PrefetchOp

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
---
 test/TritonIntelGPU/loop-pipeline.mlir | 28 +++++++++++++------
 .../Pipeliner/MatmulLoopPipeline.cpp   |  4 +--
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir
index e16e1405a1..83111893b8 100644
--- a/test/TritonIntelGPU/loop-pipeline.mlir
+++ b/test/TritonIntelGPU/loop-pipeline.mlir
@@ -79,25 +79,35 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
     %51 = arith.muli %arg7, %c32_i32 : i32
     %52 = tt.splat %51 : i32 -> tensor<32x256xi32, #blocked1>
     // COM: There are 3 stages in loop pipelining, the first 2 prefetching stages are before the loop and the last one is inside the loop.
-    // CHECK: %[[LOAD_MASK:.*]] = arith.cmpi slt, {{.*}}
-    // CHECK: %[[LOOP_MASK:.*]] = tt.splat %[[LOAD_MASK]] : i1 -> tensor<64x32xi1, #[[$BLOCK_0]]>
-    // CHECK: triton_intel_gpu.prefetch {{.*}}, %[[LOOP_MASK]] {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
+    // CHECK: %[[LOAD_MASK:.*]] = arith.cmpi slt, {{.*}} : tensor<1x32xi32, #[[$BLOCK_0]]>
+    // CHECK-NEXT: %[[LOAD_MASK_2D:.*]] = tt.broadcast %[[LOAD_MASK]] : tensor<1x32xi1, #[[$BLOCK_0]]> -> tensor<64x32xi1, #[[$BLOCK_0]]>
+    // CHECK-NEXT: %[[LOOP_MASK:.*]] = tt.splat {{.*}} : i1 -> tensor<64x32xi1, #[[$BLOCK_0]]>
+    // CHECK-NEXT: %[[PREFETCH_MASK:.*]] = arith.andi %[[LOOP_MASK]], %[[LOAD_MASK_2D]] : tensor<64x32xi1, #[[$BLOCK_0]]>
+    // CHECK-NEXT: triton_intel_gpu.prefetch {{.*}}, %[[PREFETCH_MASK]] {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
     // CHECK: triton_intel_gpu.prefetch {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
     // CHECK: triton_intel_gpu.prefetch {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
     // CHECK: triton_intel_gpu.prefetch {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
-    // CHECK: scf.for %[[VAL_92:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args(%[[VAL_93:.*]] = {{.*}}, %[[VAL_94:.*]] = {{.*}}, %[[VAL_95:.*]] = {{.*}}, %[[VAL_96:.*]] = {{.*}}, %[[VAL_97:.*]] = {{.*}}) -> (tensor<64x256xf32, #[[$DPAS]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>) : i32 {
-    // CHECK: %[[LOAD_MASK:.*]] = arith.cmpi slt, {{.*}}
+    // CHECK: scf.for %[[VAL_92:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args(%[[VAL_93:.*]] = {{.*}}, %[[VAL_94:.*]] = {{.*}}, %[[VAL_95:.*]] = {{.*}}, %[[VAL_96:.*]] = {{.*}}, %[[ARG_13:.*]] = {{.*}}, %[[ARG_14:.*]] = {{.*}}, %[[VAL_97:.*]] = {{.*}}, %[[ARG_16:.*]] = {{.*}}, %[[ARG_17:.*]] = {{.*}}) ->
+    // CHECK-SAME: (tensor<64x256xf32, #[[$DPAS]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>) : i32 {
     // CHECK: %[[VAL_106:.*]] = tt.addptr %[[VAL_94]], {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi32, #[[$BLOCK_0]]>
     // CHECK: %[[VAL_107:.*]] = tt.addptr %[[VAL_95]], {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<32x256xi32, #[[$BLOCK_1]]>
-    // CHECK: %[[LOOP_MASK:.*]] = tt.splat %[[LOAD_MASK]] : i1 -> tensor<64x32xi1, #[[$BLOCK_0]]>
-    // CHECK: triton_intel_gpu.prefetch %[[VAL_106]], %[[LOOP_MASK]] {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
-    // CHECK: triton_intel_gpu.prefetch %[[VAL_107]], {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
+    // CHECK: %[[LOAD_MASK:.*]] = arith.cmpi slt, {{.*}} : tensor<1x32xi32, #blocked>
+    // CHECK-NEXT: %[[LOAD_MASK_2D_1:.*]] = tt.broadcast %[[LOAD_MASK]] : tensor<1x32xi1, #blocked> -> tensor<64x32xi1, #blocked>
+    // CHECK-NEXT: %[[LOOP_MASK:.*]] = tt.splat {{.*}} : i1 -> tensor<64x32xi1, #[[$BLOCK_0]]>
+    // CHECK-NEXT: %[[PREFETCH_MASK:.*]] = arith.andi %[[LOOP_MASK]], %[[LOAD_MASK_2D_1]] : tensor<64x32xi1, #blocked>
+    // CHECK-NEXT: triton_intel_gpu.prefetch %[[VAL_106]], %[[PREFETCH_MASK]] {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
+    // CHECK: %[[LOAD_MASK:.*]] = arith.cmpi slt, {{.*}} : tensor<32x1xi32, #blocked1>
+    // CHECK-NEXT: %[[LOAD_MASK_2D_2:.*]] = tt.broadcast %[[LOAD_MASK]] : tensor<32x1xi1, #blocked1> -> tensor<32x256xi1, #blocked1>
+    // CHECK-NEXT: %[[LOOP_MASK:.*]] = tt.splat {{.*}} : i1 -> tensor<32x256xi1, #[[$BLOCK_1]]>
+    // CHECK-NEXT: %[[PREFETCH_MASK:.*]] = arith.andi %[[LOOP_MASK]], %[[LOAD_MASK_2D_2]] : tensor<32x256xi1, #blocked1>
+    // CHECK-NEXT: triton_intel_gpu.prefetch %[[VAL_107]], %[[PREFETCH_MASK]] {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
     // CHECK: %[[VAL_116:.*]] = tt.load %[[VAL_96]], {{.*}}, {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
     // CHECK: %[[VAL_120:.*]] = tt.load %[[VAL_97]], {{.*}}, {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
     // CHECK: %[[VAL_121:.*]] = ttg.convert_layout %[[VAL_116]] : tensor<64x32xf16, #[[$BLOCK_0]]> -> tensor<64x32xf16, #{{.*}}<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>
     // CHECK: %[[VAL_122:.*]] = ttg.convert_layout %[[VAL_120]] : tensor<32x256xf16, #[[$BLOCK_1]]> -> tensor<32x256xf16, #{{.*}}<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>
     // CHECK: %[[VAL_123:.*]] = tt.dot %[[VAL_121]], %[[VAL_122]], %[[VAL_93]], inputPrecision = tf32 : tensor<64x32xf16, #{{.*}}<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<32x256xf16, #{{.*}}<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<64x256xf32, #[[$DPAS]]>
-    // CHECK: scf.yield %[[VAL_123]], %[[VAL_106]], %[[VAL_107]], %[[VAL_94]], %[[VAL_95]] : tensor<64x256xf32, #[[$DPAS]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
+    // CHECK: scf.yield %[[VAL_123]], %[[VAL_106]], %[[VAL_107]], %[[VAL_94]], %[[ARG_14]], %[[LOAD_MASK_2D_1]], %[[VAL_95]], %[[ARG_17]], %[[LOAD_MASK_2D_2]] :
+    // CHECK-SAME: tensor<64x256xf32, #[[$DPAS]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>
     %53:3 = scf.for %arg9 = %c0_i32 to %50 step %c1_i32 iter_args(%arg10 = %cst_2, %arg11 = %38, %arg12 = %48) -> (tensor<64x256xf32, #dpas>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x256x!tt.ptr<f16>, #blocked1>) : i32 {
       %72 = arith.muli %arg9, %c32_i32 : i32
       %73 = arith.subi %arg5, %72 : i32
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
index d65be0221c..e6bd165d3d 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
@@ -89,8 +89,8 @@ static void
 createPrefetchOp(scf::ForOp &forOp, tt::LoadOp loadOp) {
   OpBuilder builder(forOp);
   builder.setInsertionPoint(loadOp);
   auto prefetchOp = builder.create<ttgi::PrefetchOp>(
-      loadOp->getLoc(), loadOp.getPtr(), loadOp.getCache(), loadOp.getEvict(),
-      loadOp.getIsVolatile());
+      loadOp->getLoc(), loadOp.getPtr(), loadOp.getMask(), loadOp.getCache(),
+      loadOp.getEvict(), loadOp.getIsVolatile());
   // inherit attributes from the load operation
   auto attrs = loadOp->getAttrDictionary();
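
For context between the two patches: PATCH 1 threads the load's mask through to the prefetch the pipeliner emits for it. A minimal sketch of the resulting helper follows; the attribute-copying tail is completed here on the assumption that the dictionary is applied to the new op, and tt / ttgi are this pass's usual aliases for mlir::triton and mlir::triton::gpu::intel:

    static void createPrefetchOp(scf::ForOp &forOp, tt::LoadOp loadOp) {
      OpBuilder builder(forOp);
      // Insert right before the load so the prefetch uses the same pointer
      // and mask SSA values as the load itself.
      builder.setInsertionPoint(loadOp);
      // The new third operand forwards the load's mask (null for unmasked
      // loads), keeping the prefetch within the load's bounds.
      auto prefetchOp = builder.create<ttgi::PrefetchOp>(
          loadOp->getLoc(), loadOp.getPtr(), loadOp.getMask(), loadOp.getCache(),
          loadOp.getEvict(), loadOp.getIsVolatile());
      // Inherit attributes (e.g. triton_intel_gpu.block_io) from the load;
      // PATCH 2's updated CHECK line pins down exactly this inheritance.
      prefetchOp->setAttrs(loadOp->getAttrDictionary());
    }
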
From 2550d8fa82c666465807f1b2c8da575c06cad45a Mon Sep 17 00:00:00 2001
From: Whitney Tsang <whitney.tsang@intel.com>
Date: Mon, 28 Apr 2025 16:45:24 +0000
Subject: [PATCH 2/2] address review comments

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
---
 test/TritonIntelGPU/loop-pipeline.mlir | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir
index 83111893b8..0cdf686a98 100644
--- a/test/TritonIntelGPU/loop-pipeline.mlir
+++ b/test/TritonIntelGPU/loop-pipeline.mlir
@@ -20,7 +20,6 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
     %c0_i32 = arith.constant 0 : i32
     %c1_i32 = arith.constant 1 : i32
     %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x32xf16, #blocked>
-    %cst_1 = arith.constant dense<0.000000e+00> : tensor<32x256xf16, #blocked1>
     %c63_i32 = arith.constant 63 : i32
     %c255_i32 = arith.constant 255 : i32
     %c31_i32 = arith.constant 31 : i32
@@ -83,31 +82,30 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
     // CHECK-NEXT: %[[LOAD_MASK_2D:.*]] = tt.broadcast %[[LOAD_MASK]] : tensor<1x32xi1, #[[$BLOCK_0]]> -> tensor<64x32xi1, #[[$BLOCK_0]]>
     // CHECK-NEXT: %[[LOOP_MASK:.*]] = tt.splat {{.*}} : i1 -> tensor<64x32xi1, #[[$BLOCK_0]]>
     // CHECK-NEXT: %[[PREFETCH_MASK:.*]] = arith.andi %[[LOOP_MASK]], %[[LOAD_MASK_2D]] : tensor<64x32xi1, #[[$BLOCK_0]]>
-    // CHECK-NEXT: triton_intel_gpu.prefetch {{.*}}, %[[PREFETCH_MASK]] {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
+    // CHECK-NEXT: triton_intel_gpu.prefetch {{.*}}, %[[PREFETCH_MASK]] {{.*}}triton_intel_gpu.block_io = "row_major"{{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
     // CHECK: triton_intel_gpu.prefetch {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
     // CHECK: triton_intel_gpu.prefetch {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
     // CHECK: triton_intel_gpu.prefetch {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
     // CHECK: scf.for %[[VAL_92:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args(%[[VAL_93:.*]] = {{.*}}, %[[VAL_94:.*]] = {{.*}}, %[[VAL_95:.*]] = {{.*}}, %[[VAL_96:.*]] = {{.*}}, %[[ARG_13:.*]] = {{.*}}, %[[ARG_14:.*]] = {{.*}}, %[[VAL_97:.*]] = {{.*}}, %[[ARG_16:.*]] = {{.*}}, %[[ARG_17:.*]] = {{.*}}) ->
-    // CHECK-SAME: (tensor<64x256xf32, #[[$DPAS]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>) : i32 {
+    // CHECK-SAME: (tensor<64x256xf32, #[[$DPAS]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, i32, i32) : i32 {
     // CHECK: %[[VAL_106:.*]] = tt.addptr %[[VAL_94]], {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi32, #[[$BLOCK_0]]>
     // CHECK: %[[VAL_107:.*]] = tt.addptr %[[VAL_95]], {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<32x256xi32, #[[$BLOCK_1]]>
-    // CHECK: %[[LOAD_MASK:.*]] = arith.cmpi slt, {{.*}} : tensor<1x32xi32, #blocked>
+    // CHECK: %[[VAL_108:.*]] = arith.subi {{.*}} : i32
+    // CHECK: %[[VAL_109:.*]] = tt.splat %[[VAL_108]] : i32 -> tensor<1x32xi32, #blocked>
+    // CHECK-NEXT: %[[LOAD_MASK:.*]] = arith.cmpi slt, {{.*}}, %[[VAL_109]] : tensor<1x32xi32, #blocked>
     // CHECK-NEXT: %[[LOAD_MASK_2D_1:.*]] = tt.broadcast %[[LOAD_MASK]] : tensor<1x32xi1, #blocked> -> tensor<64x32xi1, #blocked>
     // CHECK-NEXT: %[[LOOP_MASK:.*]] = tt.splat {{.*}} : i1 -> tensor<64x32xi1, #[[$BLOCK_0]]>
     // CHECK-NEXT: %[[PREFETCH_MASK:.*]] = arith.andi %[[LOOP_MASK]], %[[LOAD_MASK_2D_1]] : tensor<64x32xi1, #blocked>
     // CHECK-NEXT: triton_intel_gpu.prefetch %[[VAL_106]], %[[PREFETCH_MASK]] {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
-    // CHECK: %[[LOAD_MASK:.*]] = arith.cmpi slt, {{.*}} : tensor<32x1xi32, #blocked1>
-    // CHECK-NEXT: %[[LOAD_MASK_2D_2:.*]] = tt.broadcast %[[LOAD_MASK]] : tensor<32x1xi1, #blocked1> -> tensor<32x256xi1, #blocked1>
-    // CHECK-NEXT: %[[LOOP_MASK:.*]] = tt.splat {{.*}} : i1 -> tensor<32x256xi1, #[[$BLOCK_1]]>
-    // CHECK-NEXT: %[[PREFETCH_MASK:.*]] = arith.andi %[[LOOP_MASK]], %[[LOAD_MASK_2D_2]] : tensor<32x256xi1, #blocked1>
+    // CHECK: %[[PREFETCH_MASK:.*]] = tt.splat {{.*}} : i1 -> tensor<32x256xi1, #[[$BLOCK_1]]>
     // CHECK-NEXT: triton_intel_gpu.prefetch %[[VAL_107]], %[[PREFETCH_MASK]] {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
     // CHECK: %[[VAL_116:.*]] = tt.load %[[VAL_96]], {{.*}}, {{.*}} : tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>
-    // CHECK: %[[VAL_120:.*]] = tt.load %[[VAL_97]], {{.*}}, {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
+    // CHECK: %[[VAL_120:.*]] = tt.load %[[VAL_97]] {{.*}} : tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>
     // CHECK: %[[VAL_121:.*]] = ttg.convert_layout %[[VAL_116]] : tensor<64x32xf16, #[[$BLOCK_0]]> -> tensor<64x32xf16, #{{.*}}<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>
     // CHECK: %[[VAL_122:.*]] = ttg.convert_layout %[[VAL_120]] : tensor<32x256xf16, #[[$BLOCK_1]]> -> tensor<32x256xf16, #{{.*}}<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>
     // CHECK: %[[VAL_123:.*]] = tt.dot %[[VAL_121]], %[[VAL_122]], %[[VAL_93]], inputPrecision = tf32 : tensor<64x32xf16, #{{.*}}<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<32x256xf16, #{{.*}}<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<64x256xf32, #[[$DPAS]]>
-    // CHECK: scf.yield %[[VAL_123]], %[[VAL_106]], %[[VAL_107]], %[[VAL_94]], %[[ARG_14]], %[[LOAD_MASK_2D_1]], %[[VAL_95]], %[[ARG_17]], %[[LOAD_MASK_2D_2]] :
-    // CHECK-SAME: tensor<64x256xf32, #[[$DPAS]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>, tensor<32x256xi1, #[[$BLOCK_1]]>
+    // CHECK: scf.yield %[[VAL_123]], %[[VAL_106]], %[[VAL_107]], %[[VAL_94]], %[[ARG_14]], %[[LOAD_MASK_2D_1]], %[[VAL_95]], %[[ARG_17]], %[[VAL_108]] :
+    // CHECK-SAME: tensor<64x256xf32, #[[$DPAS]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, tensor<64x32x!tt.ptr<f16>, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<64x32xi1, #[[$BLOCK_0]]>, tensor<32x256x!tt.ptr<f16>, #[[$BLOCK_1]]>, i32, i32
     %53:3 = scf.for %arg9 = %c0_i32 to %50 step %c1_i32 iter_args(%arg10 = %cst_2, %arg11 = %38, %arg12 = %48) -> (tensor<64x256xf32, #dpas>, tensor<64x32x!tt.ptr<f16>, #blocked>, tensor<32x256x!tt.ptr<f16>, #blocked1>) : i32 {
      %72 = arith.muli %arg9, %c32_i32 : i32
      %73 = arith.subi %arg5, %72 : i32
@@ -118,7 +116,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
       %78 = tt.splat %73 : i32 -> tensor<32x1xi32, #blocked1>
       %79 = arith.cmpi slt, %40, %78 : tensor<32x1xi32, #blocked1>
       %80 = tt.broadcast %79 : tensor<32x1xi1, #blocked1> -> tensor<32x256xi1, #blocked1>
-      %81 = tt.load %arg12, %80, %cst_1 {triton_intel_gpu.block_io = "row_major"} : tensor<32x256x!tt.ptr<f16>, #blocked1>
+      %81 = tt.load %arg12 {triton_intel_gpu.block_io = "row_major"} : tensor<32x256x!tt.ptr<f16>, #blocked1>
       %82 = ttg.convert_layout %77 : tensor<64x32xf16, #blocked> -> tensor<64x32xf16, #dot0>
       %83 = ttg.convert_layout %81 : tensor<32x256xf16, #blocked1> -> tensor<32x256xf16, #dot1>
       %84 = tt.dot %82, %83, %arg10, inputPrecision = tf32 : tensor<64x32xf16, #dot0> * tensor<32x256xf16, #dot1> -> tensor<64x256xf32, #dpas>
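
Taken together, the two patches pin down the mask plumbing for pipelined prefetches: the scalar stage guard ("is the prefetched iteration still in range?") is splatted to the mask tensor shape and, when the load is masked, ANDed with the load's broadcast bounds mask (the tt.splat / arith.andi pair in the CHECK lines above). A sketch of that combination in builder code follows; the helper name and parameter list are illustrative rather than taken from the patch, and the usual aliases tt = mlir::triton and ttgi = mlir::triton::gpu::intel plus the relevant includes are assumed:

    static void emitGuardedPrefetch(OpBuilder &builder, Location loc, Value ptr,
                                    Value loadMask, Value stageGuard,
                                    tt::CacheModifier cache,
                                    tt::EvictionPolicy evict) {
      // The prefetch mask matches the pointer tensor's shape and encoding,
      // with an i1 element type.
      auto ptrTy = cast<RankedTensorType>(ptr.getType());
      auto maskTy = RankedTensorType::get(ptrTy.getShape(), builder.getI1Type(),
                                          ptrTy.getEncoding());
      // Splat the scalar loop guard to tensor shape (the tt.splat above).
      Value guard = builder.create<tt::SplatOp>(loc, maskTy, stageGuard);
      // AND in the load's bounds mask when present (the arith.andi above); an
      // unmasked load needs only the stage guard, which is why PATCH 2 expects
      // a bare tt.splat once the second load loses its mask.
      Value mask = loadMask ? builder.create<arith::AndIOp>(loc, guard, loadMask)
                                  .getResult()
                            : guard;
      builder.create<ttgi::PrefetchOp>(loc, ptr, mask, cache, evict,
                                       /*isVolatile=*/false);
    }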