Skip to content

[mlir][draft] Incorrect sizes/offsets after tile + fuse #150203

@banach-space

Description

@banach-space

REPRO

$ mlir-opt --transform-interpreter tile_and_fuse.mlir -cse -test-transform-dialect-erase-schedule --split-input-file -cse
#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @pack_scalable_prod(%2:tensor<64x32xf32>) ->tensor<?x32x?x1xf32>
{
    %c0 = arith.constant 0 : index
    %c8 = arith.constant 8 : index
    %vscale = vector.vscale
    %c8_vscale = arith.muli %vscale, %c8 : index
    %0 = affine.apply affine_map<()[s0] -> (64 ceildiv s0)>()[%c8_vscale]
    %3 = tensor.empty(%0, %c8_vscale) : tensor<?x32x?x1xf32>
    %4 = tensor.empty() : tensor<64x32xf32>
    %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<64x32xf32>) outs(%4 : tensor<64x32xf32>) {
    ^bb0(%in: f32, %out: f32):
      %7 = arith.addf %in, %in : f32
      linalg.yield %7 : f32
    } -> tensor<64x32xf32>
    %pack = linalg.pack %5 inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 1] into %3 : tensor<64x32xf32> -> tensor<?x32x?x1xf32>
    return %pack: tensor<?x32x?x1xf32>
  }

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%module : !transform.any_op {transform.readonly}) {
    %generic = transform.structured.match ops{["linalg.generic"]} in %module
      : (!transform.any_op) -> !transform.any_op
    %pack = transform.structured.match ops{["linalg.pack"]} in %module
      : (!transform.any_op) -> !transform.any_op

    %tiled_unpack, %loops = transform.structured.tile_using_forall %pack tile_sizes [[8], 1]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op, %new_containing_op =
      transform.structured.fuse_into_containing_op %generic into %loops
      : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)

    transform.yield
  }
}

// -----

// Fixed-width version for comparison

#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @pack_fixed_prod(%2:tensor<64x32xf32>) ->tensor<8x32x8x1xf32>
{
    %c0 = arith.constant 0 : index
    %c8 = arith.constant 8 : index
    %3 = tensor.empty() : tensor<8x32x8x1xf32>
    %4 = tensor.empty() : tensor<64x32xf32>
    %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<64x32xf32>) outs(%4 : tensor<64x32xf32>) {
    ^bb0(%in: f32, %out: f32):
      %7 = arith.addf %in, %in : f32
      linalg.yield %7 : f32
    } -> tensor<64x32xf32>
    %pack = linalg.pack %5 inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %3 : tensor<64x32xf32> -> tensor<8x32x8x1xf32>
    return %pack: tensor<8x32x8x1xf32>
  }

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%module : !transform.any_op {transform.readonly}) {
    %generic = transform.structured.match ops{["linalg.generic"]} in %module
      : (!transform.any_op) -> !transform.any_op
    %pack = transform.structured.match ops{["linalg.pack"]} in %module
      : (!transform.any_op) -> !transform.any_op

    %tiled_unpack, %loops = transform.structured.tile_using_forall %pack tile_sizes [8, 1]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op, %new_containing_op =
      transform.structured.fuse_into_containing_op %generic into %loops
      : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)

    transform.yield
  }
}

ISSUE

After the transformation, the scalable version produces IR containing the following linalg.pack op (excerpt):

#map = affine_map<()[s0] -> (64 ceildiv s0)>
#map2 = affine_map<(d0) -> (d0 * 8)>
#map3 = affine_map<(d0)[s0] -> (-d0 + s0, 8)>

    %0 = affine.apply #map()[%c8_vscale]
    %1 = tensor.empty(%0, %c8_vscale) : tensor<?x32x?x1xf32>

    %4 = scf.forall (%arg1, %arg2) in (%3, 32) shared_outs(%arg3 = %1) -> (tensor<?x32x?x1xf32>) {
      %5 = affine.apply #map2(%arg1)
      %6 = affine.min #map3(%5)[%dim]

      %9 = linalg.generic {
      } -> tensor<?x1xf32>

      %dim_1 = tensor.dim %arg3, %c2 : tensor<?x32x?x1xf32>
      %extracted_slice_2 = tensor.extract_slice %arg3[%5, %arg2, 0, 0] [%6, 1, %dim_1, 1] [1, 1, 1, 1] : tensor<?x32x?x1xf32> to tensor<?x1x?x1xf32>
      %pack = linalg.pack %9 inner_dims_pos = [0, 1] inner_tiles = [%c8_vscale, 1] into %extracted_slice_2 : tensor<?x1xf32> -> tensor<?x1x?x1xf32>
  }

Note the lack of vscale in the offset (%5 = d0 * 8) and size (%6 = min(-d0 + s0, 8)) computations feeding the tensor.extract_slice that produces %extracted_slice_2 — they use the fixed constant 8 instead of 8 * vscale (%c8_vscale).

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions