[TorchToLinalg] Improve broadcast lowerings in strict symbolic modes (#2505)

qedawkins · web-flow · commit 6f81ad72938d · 2023-10-05T15:15:26.000-04:00
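With strict symbolic shapes, we can assume that numpy-style dynamic
broadcasts (a dynamic input dimension being expanded at runtime) never
occur. Under this assumption the lowering is improved: the broadcast is
emitted as a plain `linalg.generic` with static indexing maps instead of
the `tensor.extract`-based fallback.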
diff --git a/lib/Conversion/TorchToLinalg/DataMovement.cpp b/lib/Conversion/TorchToLinalg/DataMovement.cpp
@@ -1095,31 +1095,35 @@ class ConvertAtenBroadcastToOp : public OpConversionPattern<AtenBroadcastToOp> {
     // which in this case is `inShapeConverted` because this shape will yield
     // us the dimension size of the output.
     SmallVector<bool> useBroadcastToShape;
-    for (auto x : inShape) {
+    int64_t inputRank = self.getType().cast<RankedTensorType>().getRank();
+    for (size_t i = inShape.size() - inputRank, e = inShape.size(); i < e;
+         ++i) {
       int64_t dim;
-      if (!matchPattern(x, m_TorchConstantInt(&dim))) {
-        Operation *defOp = x.getDefiningOp();
-        if (isa<AtenSizeOp, AtenSizeIntOp>(defOp))
-          useBroadcastToShape.push_back(true);
-        else
+      if (matchPattern(inShape[i], m_TorchConstantInt(&dim))) {
+        if (dim < 0) {
           useBroadcastToShape.push_back(false);
+        } else {
+          useBroadcastToShape.push_back(true);
+        }
       } else {
-        useBroadcastToShape.push_back(false);
+        // Note: Dynamic -1 (inferred) broadcast shapes are unimplemented.
+        useBroadcastToShape.push_back(true);
       }
     }
 
     SmallVector<Value> inShapeConverted = getTypeConvertedValues(
         rewriter, op.getLoc(), getTypeConverter(), inShape);
+    auto newResultType =
+        getTypeConverter()->convertType(op.getType()).cast<RankedTensorType>();
     Value result;
-    if (failed(torch_to_linalg::broadcastToGivenShape(op, rewriter, self,
-                                                      inShapeConverted, result,
-                                                      useBroadcastToShape))) {
+    if (failed(torch_to_linalg::broadcastToGivenShape(
+            op, rewriter, self, inShapeConverted, newResultType, result,
+            useBroadcastToShape))) {
       return rewriter.notifyMatchFailure(
           op, "unable to perform broadcast operation");
     }
 
-    Type newResultType = getTypeConverter()->convertType(op.getType());
-    rewriter.replaceOpWithNewOp<tensor::CastOp>(op, newResultType, result);
+    rewriter.replaceOp(op, result);
     return success();
   }
 };
@@ -1177,7 +1181,7 @@ class ConvertAtenCopyOp : public OpConversionPattern<AtenCopyOp> {
       selfSizes[i] = castIndexToInt64(rewriter, loc, selfSizes[i]);
     Value broadcastedSrc;
     if (failed(torch_to_linalg::broadcastToGivenShape(
-            op, rewriter, src, selfSizes, broadcastedSrc))) {
+            op, rewriter, src, selfSizes, selfType, broadcastedSrc))) {
       return rewriter.notifyMatchFailure(
           op, "unable to perform broadcast operation");
     }
diff --git a/lib/Conversion/TorchToLinalg/Linear.cpp b/lib/Conversion/TorchToLinalg/Linear.cpp
@@ -295,13 +295,24 @@ class ConvertAtenMatmulOp : public OpConversionPattern<AtenMatmulOp> {
 
       // Broadcast the batch dimensions of both the matrices.
       Value broadcastedLhs, broadcastedRhs;
+      // TODO: Improve usage of static shape information.
+      SmallVector<int64_t> lhsTargetShape(lhsBroadcastToShape.size(),
+                                          ShapedType::kDynamic);
+      auto lhsBroadcastType =
+          RankedTensorType::get(lhsTargetShape, lhsType.getElementType());
       if (failed(torch_to_linalg::broadcastToGivenShape(
-              op, rewriter, lhs, lhsBroadcastToShape, broadcastedLhs))) {
+              op, rewriter, lhs, lhsBroadcastToShape, lhsBroadcastType,
+              broadcastedLhs))) {
         return rewriter.notifyMatchFailure(
             op, "unable to perform broadcast operation");
       }
+      SmallVector<int64_t> rhsTargetShape(rhsBroadcastToShape.size(),
+                                          ShapedType::kDynamic);
+      auto rhsBroadcastType =
+          RankedTensorType::get(rhsTargetShape, rhsType.getElementType());
       if (failed(torch_to_linalg::broadcastToGivenShape(
-              op, rewriter, rhs, rhsBroadcastToShape, broadcastedRhs))) {
+              op, rewriter, rhs, rhsBroadcastToShape, rhsBroadcastType,
+              broadcastedRhs))) {
         return rewriter.notifyMatchFailure(
             op, "unable to perform broadcast operation");
       }
diff --git a/lib/Conversion/TorchToLinalg/Utils.cpp b/lib/Conversion/TorchToLinalg/Utils.cpp
@@ -327,22 +327,28 @@ Value torch_to_linalg::createElementwiseLinalgGeneric(
 // Broadcasts input tensor based on the broadcastToShape.
 LogicalResult torch_to_linalg::broadcastToGivenShape(
     Operation *op, PatternRewriter &rewriter, Value input,
-    SmallVector<Value> broadcastToShape, Value &result,
-    SmallVector<bool> useBroadcastToShape) {
+    SmallVector<Value> broadcastToShape, RankedTensorType broadcastType,
+    Value &result, SmallVector<bool> useBroadcastToShape) {
   RankedTensorType inputType = input.getType().cast<RankedTensorType>();
+  int64_t inputRank = inputType.getRank();
+  int64_t outputRank = broadcastToShape.size();
+  ArrayRef<int64_t> outputShape = broadcastType.getShape();
   SmallVector<int64_t> inputShape =
       makeShapeTorchCompatible(inputType.getShape());
-  if (broadcastToShape.size() < inputShape.size()) {
+  if (outputRank < inputRank) {
     return rewriter.notifyMatchFailure(
         op, "invalid shape: broadcastToShape size must not be smaller than the "
             "size of the input shape");
   }
 
   Type elementType = inputType.getElementType();
   Location loc = op->getLoc();
-  SmallVector<Value> outShape;
+  SmallVector<OpFoldResult> outShape;
   bool elideDynamicBroadcastCheck = isAssumingStrictSymbolicShapes(rewriter);
 
+  // Vector indicating broadcasted status when assuming strict symbolic shapes.
+  SmallVector<bool> broadcastedStatus;
+
   // Create affine map and shapes for tensor initialization.
   SmallVector<AffineExpr> outExpr;
   Value zero =
@@ -351,10 +357,39 @@ LogicalResult torch_to_linalg::broadcastToGivenShape(
       rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0));
   Value oneIndex =
       rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(1));
-  size_t diff = broadcastToShape.size() - inputShape.size();
-  for (size_t i = 0; i < broadcastToShape.size(); i++) {
+  size_t diff = outputRank - inputRank;
+  bool hasDynamicNumpyBroadcast = false;
+  for (size_t i = 0, e = outputRank; i < e; i++) {
     Value shapeValue = broadcastToShape[i];
     size_t j = i - diff;
+    bool isDynamic = i >= diff && inputShape[j] == kUnknownSize;
+
+    // Inherit static output shapes if present.
+    if (outputShape[i] != ShapedType::kDynamic) {
+      outShape.push_back(rewriter.getIndexAttr(outputShape[i]));
+      if (i < diff) {
+        if (outputShape[i] < 0) {
+          return rewriter.notifyMatchFailure(
+              op, "invalid shape: negative values not allowed in new broadcast "
+                  "dimensions");
+        }
+        continue;
+      }
+      if (isDynamic) {
+        hasDynamicNumpyBroadcast = true;
+      } else if (inputShape[j] != outputShape[i] && inputShape[j] != 1) {
+        return rewriter.notifyMatchFailure(
+            op, "invalid shape: static mismatch in input and output broadcast "
+                "shapes");
+      }
+
+      // If strict symbolic shapes are assumed and the input shape is dynamic,
+      // we can assume that dim is not broadcasted.
+      broadcastedStatus.push_back(inputShape[j] != outputShape[i] &&
+                                  !isDynamic);
+      continue;
+    }
+
     if (i < diff) {
       if (!elideDynamicBroadcastCheck) {
         Value isValid = rewriter.create<arith::CmpIOp>(
@@ -374,24 +409,80 @@ LogicalResult torch_to_linalg::broadcastToGivenShape(
       Value select = rewriter.create<arith::SelectOp>(
           loc, isNegative, oneIndex, castIntToIndex(rewriter, loc, shapeValue));
       outShape.push_back(select);
-    } else {
-      // Case of dynamic input dimension wherein the shape to broadcast will
-      // yield us the dimension size of the output.
-      Value dim = getDimOp(rewriter, loc, input, j);
-      if (!useBroadcastToShape.empty()) {
-        if (useBroadcastToShape[i])
-          dim = castIntToIndex(rewriter, loc, broadcastToShape[j]);
+      broadcastedStatus.push_back(true);
+      continue;
+    }
+
+    // Case of dynamic input dimension wherein the shape to broadcast will
+    // yield us the dimension size of the output.
+    Value dim;
+    if (!useBroadcastToShape.empty() && useBroadcastToShape[j]) {
+      dim = castIntToIndex(rewriter, loc, broadcastToShape[i]);
+      if (isDynamic) {
+        hasDynamicNumpyBroadcast = true;
       }
-      outShape.push_back(dim);
+      if (!elideDynamicBroadcastCheck) {
+        Value isValid = rewriter.create<arith::CmpIOp>(
+            loc, arith::CmpIPredicate::sge, shapeValue, zero);
+        rewriter.create<cf::AssertOp>(
+            loc, isValid,
+            rewriter.getStringAttr(
+                "unimplemented: dynamic negative broadcast sizes"));
+      }
+    } else {
+      dim = getDimOp(rewriter, loc, input, j);
     }
+    // With strict symbols, we can safely assume this dimension is not
+    // broadcasted.
+    broadcastedStatus.push_back(false);
+    outShape.push_back(dim);
   }
 
-  Value outTensor = rewriter.create<tensor::EmptyOp>(
-      loc, getAsOpFoldResult(outShape), elementType);
+  Value outTensor =
+      rewriter.create<tensor::EmptyOp>(loc, outShape, elementType);
+
+  // If we know there are no ? -> ? broadcasted dims, or we are assuming
+  // strict symbols, we can safely use standard linalg style broadcasting
+  // semantics.
+  if (!hasDynamicNumpyBroadcast || elideDynamicBroadcastCheck) {
+    // If no dims are broadcasted and the rank doesn't change, we can just fold
+    // the op away entirely.
+    if (!llvm::any_of(broadcastedStatus, [](bool b) { return b; }) &&
+        inputRank == outputRank) {
+      result = rewriter.create<tensor::CastOp>(loc, outTensor.getType(), input);
+      return success();
+    }
+
+    SmallVector<AffineExpr> inputExprs;
+    for (int64_t i = 0, e = inputRank; i < e; ++i) {
+      if (broadcastedStatus[i]) {
+        inputExprs.push_back(rewriter.getAffineConstantExpr(0));
+        continue;
+      }
+      inputExprs.push_back(rewriter.getAffineDimExpr(i + diff));
+    }
+
+    SmallVector<AffineMap> indexingMaps = {
+        AffineMap::get(outputRank, 0, inputExprs, rewriter.getContext()),
+        rewriter.getMultiDimIdentityMap(outputRank)};
+    SmallVector<utils::IteratorType> iteratorTypes(
+        outputRank, utils::IteratorType::parallel);
+    result = rewriter
+                 .create<linalg::GenericOp>(
+                     loc, outTensor.getType(), input, outTensor, indexingMaps,
+                     iteratorTypes,
+                     [&](OpBuilder &b, Location loc, ValueRange args) {
+                       b.create<linalg::YieldOp>(loc, args[0]);
+                     })
+                 .getResult(0);
+    return success();
+  }
 
+  // Fall back to numpy-style dynamic broadcasting in the form of a single
+  // linalg op.
   SmallVector<AffineMap> indexingMaps = {
-      rewriter.getMultiDimIdentityMap(broadcastToShape.size())};
-  SmallVector<utils::IteratorType> iteratorTypes(broadcastToShape.size(),
+      rewriter.getMultiDimIdentityMap(outputRank)};
+  SmallVector<utils::IteratorType> iteratorTypes(outputRank,
                                                  utils::IteratorType::parallel);
   result = rewriter
                .create<linalg::GenericOp>(
@@ -402,7 +493,7 @@ LogicalResult torch_to_linalg::broadcastToGivenShape(
                      // would be used to extract values from the input tensor
                      // later on.
                      SmallVector<Value> loopIndices;
-                     for (size_t i = 0; i < broadcastToShape.size(); ++i) {
+                     for (size_t i = 0, e = outputRank; i < e; ++i) {
                        if (i < diff)
                          continue;
                        loopIndices.push_back(b.create<linalg::IndexOp>(loc, i));
@@ -411,7 +502,7 @@ LogicalResult torch_to_linalg::broadcastToGivenShape(
                      // the i-th input dimension is not 1, else it contains a
                      // zero index.
                      SmallVector<Value> inputIndicesToExtract;
-                     for (size_t i = 0, n = inputShape.size(); i < n; i++) {
+                     for (size_t i = 0, n = inputRank; i < n; i++) {
                        if (inputShape[i] == 1) {
                          inputIndicesToExtract.push_back(zeroIndex);
                        } else {
diff --git a/lib/Conversion/TorchToLinalg/Utils.h b/lib/Conversion/TorchToLinalg/Utils.h
@@ -73,10 +73,12 @@ Value createElementwiseLinalgGeneric(
     function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuild);
 
 // Broadcasts input tensor based on the broadcastToShape.
-LogicalResult
-broadcastToGivenShape(Operation *op, PatternRewriter &rewriter, Value input,
-                      SmallVector<Value> broadcastToShape, Value &result,
-                      SmallVector<bool> useBroadcastToShape = {});
+LogicalResult broadcastToGivenShape(Operation *op, PatternRewriter &rewriter,
+                                    Value input,
+                                    SmallVector<Value> broadcastToShape,
+                                    RankedTensorType broadcastType,
+                                    Value &result,
+                                    SmallVector<bool> useBroadcastToShape = {});
 
 // Cast a tensor to a rank-equivalent tensor of unknown size, i.e. <1x2xf32> ->
 // <?x?xf32>
diff --git a/test/Conversion/TorchToLinalg/broadcast.mlir b/test/Conversion/TorchToLinalg/broadcast.mlir
@@ -0,0 +1,90 @@
+// RUN: torch-mlir-opt <%s -convert-torch-to-linalg -canonicalize -split-input-file -mlir-print-local-scope -verify-diagnostics | FileCheck %s
+
+// CHECK-LABEL:   func.func @torch.aten.broadcast_to$simple_static(
+// CHECK:           %[[INIT_TENSOR:.*]] = tensor.empty() : tensor<3x4x2xf32>
+// CHECK:           %[[GENERIC:.*]] = linalg.generic
+// CHECK-SAME:        indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>]
+// CHECK-SAME:        iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:        ins({{.*}} : tensor<4x2xf32>) outs({{.*}} : tensor<3x4x2xf32>) {
+// CHECK:           ^bb0(%[[IN:.*]]: f32, %{{.*}}: f32):
+// CHECK:             linalg.yield %[[IN]] : f32
+// CHECK:           } -> tensor<3x4x2xf32>
+func.func @torch.aten.broadcast_to$simple_static(%arg0: !torch.vtensor<[4,2],f32>) -> !torch.vtensor<[3,4,2],f32> {
+  %int3 = torch.constant.int 3
+  %int4 = torch.constant.int 4
+  %int2 = torch.constant.int 2
+  %list = torch.prim.ListConstruct %int3, %int4, %int2 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
+  %0 = torch.aten.broadcast_to %arg0, %list : !torch.vtensor<[4,2],f32>, !torch.list<int> -> !torch.vtensor<[3,4,2],f32>
+  return %0 : !torch.vtensor<[3,4,2],f32>
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @torch.aten.broadcast_to$static_numpy_broadcast(
+// CHECK:           %[[INIT_TENSOR:.*]] = tensor.empty() : tensor<1x4x2xf32>
+// CHECK:           %[[GENERIC:.*]] = linalg.generic
+// CHECK-SAME:        indexing_maps = [affine_map<(d0, d1, d2) -> (d0, 0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>]
+// CHECK-SAME:        iterator_types = ["parallel", "parallel", "parallel"]}
+// CHECK-SAME:        ins({{.*}} : tensor<1x1x2xf32>) outs({{.*}} : tensor<1x4x2xf32>) {
+// CHECK:           ^bb0(%[[IN:.*]]: f32, %{{.*}}: f32):
+// CHECK:             linalg.yield %[[IN]] : f32
+// CHECK:           } -> tensor<1x4x2xf32>
+func.func @torch.aten.broadcast_to$static_numpy_broadcast(%arg0: !torch.vtensor<[1,1,2],f32>) -> !torch.vtensor<[1,4,2],f32> {
+  %int1 = torch.constant.int 1
+  %int4 = torch.constant.int 4
+  %int2 = torch.constant.int 2
+  %list = torch.prim.ListConstruct %int1, %int4, %int2 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
+  %0 = torch.aten.broadcast_to %arg0, %list : !torch.vtensor<[1,1,2],f32>, !torch.list<int> -> !torch.vtensor<[1,4,2],f32>
+  return %0 : !torch.vtensor<[1,4,2],f32>
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @torch.aten.broadcast_to$empty_input(
+// CHECK:           %[[INIT_TENSOR:.*]] = tensor.empty({{.*}}) : tensor<?xf32>
+// CHECK:           %[[GENERIC:.*]] = linalg.generic
+// CHECK-SAME:        indexing_maps = [affine_map<(d0) -> ()>, affine_map<(d0) -> (d0)>]
+// CHECK-SAME:        iterator_types = ["parallel"]}
+// CHECK-SAME:        ins({{.*}} : tensor<f32>) outs({{.*}} : tensor<?xf32>) {
+// CHECK:           ^bb0(%[[IN:.*]]: f32, %{{.*}}: f32):
+// CHECK:             linalg.yield %[[IN]] : f32
+// CHECK:           } -> tensor<?xf32>
+func.func @torch.aten.broadcast_to$empty_input(%arg0: !torch.vtensor<[],f32>, %arg1: !torch.int) -> !torch.vtensor<[?],f32> {
+  %list = torch.prim.ListConstruct %arg1 : (!torch.int) -> !torch.list<int>
+  %0 = torch.aten.broadcast_to %arg0, %list : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[?],f32>
+  return %0 : !torch.vtensor<[?],f32>
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @torch.aten.broadcast_to$strict_dynamic_broadcast(
+// CHECK:           %[[INIT_TENSOR:.*]] = tensor.empty({{.*}}) : tensor<?x?xf32>
+// CHECK:           %[[GENERIC:.*]] = linalg.generic
+// CHECK-SAME:        indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>]
+// CHECK-SAME:        iterator_types = ["parallel", "parallel"]}
+// CHECK-SAME:        ins({{.*}} : tensor<?xf32>) outs({{.*}} : tensor<?x?xf32>) {
+// CHECK:           ^bb0(%[[IN:.*]]: f32, %{{.*}}: f32):
+// CHECK:             linalg.yield %[[IN]] : f32
+// CHECK:           } -> tensor<?x?xf32>
+func.func @torch.aten.broadcast_to$strict_dynamic_broadcast(%arg0: !torch.vtensor<[?],f32>, %arg1: !torch.int, %arg2: !torch.int) -> !torch.vtensor<[?,?],f32> attributes {torch.assume_strict_symbolic_shapes} {
+  %list = torch.prim.ListConstruct %arg1, %arg2 : (!torch.int, !torch.int) -> !torch.list<int>
+  %0 = torch.aten.broadcast_to %arg0, %list : !torch.vtensor<[?],f32>, !torch.list<int> -> !torch.vtensor<[?,?],f32>
+  return %0 : !torch.vtensor<[?,?],f32>
+}
+
+// -----
+
+/// Without strict symbolic shapes there is nothing we can do; verify we hit the fallback path.
+// CHECK-LABEL:   func.func @torch.aten.broadcast_to$pure_dynamic_broadcast(
+// CHECK:           %[[INIT_TENSOR:.*]] = tensor.empty({{.*}}) : tensor<?x?xf32>
+// CHECK:           %[[GENERIC:.*]] = linalg.generic
+// CHECK-SAME:        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>]
+// CHECK-SAME:        iterator_types = ["parallel", "parallel"]}
+// CHECK-SAME:        outs({{.*}} : tensor<?x?xf32>) {
+// CHECK:           ^bb0(%[[OUT:.+]]: f32):
+// CHECK:             tensor.extract
+func.func @torch.aten.broadcast_to$pure_dynamic_broadcast(%arg0: !torch.vtensor<[?],f32>, %arg1: !torch.int, %arg2: !torch.int) -> !torch.vtensor<[?,?],f32> {
+  %list = torch.prim.ListConstruct %arg1, %arg2 : (!torch.int, !torch.int) -> !torch.list<int>
+  %0 = torch.aten.broadcast_to %arg0, %list : !torch.vtensor<[?],f32>, !torch.list<int> -> !torch.vtensor<[?,?],f32>
+  return %0 : !torch.vtensor<[?,?],f32>
+}