
Commit 596b58e

Authored by Ivan Garcia (ivangarcia44) and co-authored by Ivan Garcia
Add support for transposed grouped convolution in torch to linalg lowering (#4056)
The conversion of the convolution torch operation to linalg currently works for grouped convolution (number of groups > 1) and for transposed convolution, but it failed when both were used at the same time. This change set corrects that. The core of the changes is in Linear.cpp.

In transposed grouped convolution, the output filter dimension of the weights is the one divided by the number of groups, not the input channel dimension (see the "Variables" section in both links below for details). This was one of the fixes.
https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html

The other issue was that the weight expansion had to happen before the channel/filter dimension permutation/flip. The expansion deals with adjacent dimensions, but in the final weight tensor the group and input channel dimensions are no longer adjacent; once the dimensions are flipped, the expand_shape operation cannot generate the expected dimension format. See the comment in the code for details.

@rsuderman @vivekkhandelwal1 @zjgarvey @penguin-wwy @ubfx @sahas3 @dixinzhou @rafaelubalmw

---------

Co-authored-by: Ivan Garcia <[email protected]>
1 parent 96da98b commit 596b58e
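
The weight-layout fact behind the first fix can be checked directly against PyTorch. The sketch below is illustrative only (it is not part of the change) and reuses the shapes of the new e2e test: for a transposed convolution the weight is laid out as (C_in, C_out/groups, kH, kW), so it is the output filter dimension, not the input channel dimension, that is divided by the number of groups, and the grouped op is equivalent to running each input-channel group separately and concatenating the results.

```python
import torch
import torch.nn.functional as F

# Shapes mirror the new ConvolutionModule2DGroupedTranspose test:
# N=1, C_in=2, C_out=4, groups=2, 3x3 kernel.
x = torch.randn(1, 2, 5, 7)
w = torch.randn(2, 2, 3, 3)  # (C_in, C_out/groups, kH, kW)
b = torch.randn(4)

ref = F.conv_transpose2d(x, w, b, stride=2, padding=1, groups=2)

# Equivalent manual grouping: split the input-channel dim of both the
# input and the weight, run each group independently, and concatenate
# the per-group outputs along the channel dim.
outs = [
    F.conv_transpose2d(x[:, g : g + 1], w[g : g + 1], stride=2, padding=1)
    for g in range(2)
]
manual = torch.cat(outs, dim=1) + b.view(1, -1, 1, 1)

assert torch.allclose(ref, manual, atol=1e-5)
```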

File tree

4 files changed: +185 −31 lines changed

- lib/Conversion/TorchToLinalg/Linear.cpp
- projects/pt1/e2e_testing/xfail_sets.py
- projects/pt1/python/torch_mlir_e2e_test/test_suite/conv.py
- test/Conversion/TorchToLinalg/convolution.mlir

lib/Conversion/TorchToLinalg/Linear.cpp

Lines changed: 77 additions & 31 deletions
@@ -955,7 +955,55 @@ class ConvertAtenConvolutionOp : public OpConversionPattern<AtenConvolutionOp> {
       if (isa<mlir::IntegerType>(inputDTy))
         pad = rewriter.create<arith::TruncIOp>(op.getLoc(), inputDTy, pad);
     }
+
+    // The expandWeight lambda function below is used to expand the group
+    // dimension. For the normal case the group dimension is expanded out
+    // of the output filter dimension:
+    //   expand F,C,H,W -> G,F/G,C,H,W
+    //
+    // Note that the group dimension has to be the first dimension. For the
+    // transposed convolution case, the group convolution is extracted out
+    // of the input channel dimension. But note that the input channel
+    // dimension is interchanged with the output filter dimension (due to
+    // the transposed operation). Because of this the group and input
+    // channel dimensions will not be adjacent and the expand_shape
+    // operation will not work.
+    //
+    // For this reason, in the transposed convolution case the expandWeight
+    // lambda needs to be executed before this dimension flipping by doing
+    // these two steps:
+    //
+    //   Expansion: C,F,H,W -> G,C/G,F,H,W
+    //
+    //   Dimension interchange: G,C/G,F,H,W -> G,F,C/G,H,W
+    //
+    auto expandWeight = [&](Value tensor) {
+      auto inType = cast<RankedTensorType>(tensor.getType());
+      auto inShape = makeShapeTorchCompatible(inType.getShape());
+
+      SmallVector<int64_t> outShape{numGroups,
+                                    (inShape[0] == kUnknownSize
+                                         ? kUnknownSize
+                                         : (inShape[0] / numGroups)),
+                                    inShape[1]};
+      outShape.append(inShape.begin() + 2, inShape.end());
+
+      SmallVector<ReassociationIndices> indices{};
+      int currIndex = 0;
+      indices.push_back({0, 1});
+      currIndex += 2;
+      for (int i = currIndex; i <= (long)inShape.size(); i++)
+        indices.push_back({i});
+
+      auto retType = inType.clone(makeShapeLLVMCompatible(outShape));
+      return rewriter.create<tensor::ExpandShapeOp>(loc, retType, tensor,
+                                                    indices);
+    };
+
     if (transposed) {
+      bool isGroupedConv = numGroups > 1;
+      weight = isGroupedConv ? expandWeight(weight) : weight;
+
       Value c0 =
           rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0));
       Value c1 =
@@ -965,25 +1013,40 @@ class ConvertAtenConvolutionOp : public OpConversionPattern<AtenConvolutionOp> {

     // Transpose and flip weight
     SmallVector<Value> weightInitDims = getTensorSizes(rewriter, loc, weight);
-    std::iter_swap(weightInitDims.begin(), weightInitDims.begin() + 1);
-    outDims[1] = weightInitDims[0];
+    if (isGroupedConv) {
+      // We need to skip the first dimension (group) in this case, also the
+      // output dimension needs to consider the number of groups.
+      std::iter_swap(weightInitDims.begin() + 1, weightInitDims.begin() + 2);
+      auto numGroupsVal =
+          rewriter.create<mlir::arith::ConstantIndexOp>(loc, numGroups);
+      outDims[1] = rewriter.createOrFold<mlir::arith::MulIOp>(
+          loc, weightInitDims[1], numGroupsVal);
+    } else {
+      std::iter_swap(weightInitDims.begin(), weightInitDims.begin() + 1);
+      outDims[1] = weightInitDims[0];
+    }
+    auto weightRank = weightInitDims.size();
     Value weightInitTensor =
         createZeroInitTensor(rewriter, loc, weightInitDims, weightDTy);
     SmallVector<utils::IteratorType> iteratorTypes(
-        inRank, utils::IteratorType::parallel);
+        weightRank, utils::IteratorType::parallel);
     SmallVector<AffineMap> indexingMaps{
-        AffineMap::getMultiDimIdentityMap(inRank, context)};
+        AffineMap::getMultiDimIdentityMap(weightRank, context)};
     weight = rewriter
                  .create<linalg::GenericOp>(
                      loc, weightInitTensor.getType(), ValueRange{},
                      weightInitTensor, indexingMaps, iteratorTypes,
                      [&](OpBuilder &b, Location loc, ValueRange args) {
                        SmallVector<Value> indices;
-                       for (size_t i = 0; i < inRank; i++)
+                       for (size_t i = 0; i < weightRank; i++)
                          indices.push_back(b.create<linalg::IndexOp>(loc, i));
-                       std::iter_swap(indices.begin(), indices.begin() + 1);
-                       // Flip only the spatial dimensions (from 2 to inRank)
-                       for (size_t flipDim = 2; flipDim < inRank; flipDim++) {
+                       auto fcIdxSwapOffset = isGroupedConv ? 1 : 0;
+                       std::iter_swap(indices.begin() + fcIdxSwapOffset,
+                                      indices.begin() + fcIdxSwapOffset + 1);
+                       // Flip only the spatial dimensions (from 2 to
+                       // weightRank)
+                       for (size_t flipDim = fcIdxSwapOffset + 2;
+                            flipDim < weightRank; flipDim++) {
                          indices[flipDim] = b.create<arith::SubIOp>(
                              loc,
                              b.create<arith::SubIOp>(
@@ -1373,43 +1436,26 @@ class ConvertAtenConvolutionOp : public OpConversionPattern<AtenConvolutionOp> {
                                                     indices);
     };

-    // expand F,C,H,W -> G,F/G,C,H,W
-    auto expandWeight = [&](Value tensor) {
-      auto inType = cast<RankedTensorType>(tensor.getType());
-      auto inShape = makeShapeTorchCompatible(inType.getShape());
-
-      SmallVector<int64_t> outShape{
-          numGroups,
-          (inShape[0] == kUnknownSize ? kUnknownSize : inShape[0] / numGroups)};
-      outShape.append(inShape.begin() + 1, inShape.end());
-
-      SmallVector<ReassociationIndices> indices{{0, 1}};
-      for (auto i = 2; i <= (long)inShape.size(); i++)
-        indices.push_back({i});
-
-      auto retType = inType.clone(makeShapeLLVMCompatible(outShape));
-      return rewriter.create<tensor::ExpandShapeOp>(loc, retType, tensor,
-                                                    indices);
-    };
-
     Value paddedInputExpanded = expandGroups(paddedInput, 1);
-    Value weightExpanded = expandWeight(weight);
+    // If we have a transposed convolution, this needs to be handled before
+    // dimension permutation. See comments in the expandWeight lambda
+    // definition for details.
+    weight = transposed ? weight : expandWeight(weight);
     auto expandOutputTensor = expandGroups(outputTensor, 1);

     // TODO: add 1D and 3D case
     if (!inputZp) {
       conv = rewriter
                  .create<linalg::Conv2DNgchwGfchwOp>(
                      loc, expandOutputTensor.getResultType(),
-                     ValueRange{paddedInputExpanded, weightExpanded},
+                     ValueRange{paddedInputExpanded, weight},
                      expandOutputTensor.getResult(), stridesAttr, dilationAttr)
                  .getResult(0);
     } else {
       conv = rewriter
                  .create<linalg::Conv2DNgchwGfchwQOp>(
                      loc, expandOutputTensor.getResultType(),
-                     ValueRange{paddedInputExpanded, weightExpanded, inputZp,
-                                weightZp},
+                     ValueRange{paddedInputExpanded, weight, inputZp, weightZp},
                      expandOutputTensor.getResult(), stridesAttr, dilationAttr)
                  .getResult(0);
     }
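
To see concretely why the expansion must precede the filter/channel interchange, here is a minimal PyTorch sketch with made-up shapes (G=2, C_in=4, C_out=6; none of these names come from the patch). A single reshape can only split a dimension into adjacent size factors, so the (G, C/G) split has to happen while the input channel is still the leading dimension; the interchange afterwards produces the G,F,C/G,H,W filter layout that linalg.conv_2d_ngchw_gfchw consumes.

```python
import torch

G, C_in, C_out, kH, kW = 2, 4, 6, 3, 3
# Transposed-conv weight layout (see the ConvTranspose2d docs linked in the
# commit message): (C_in, C_out/G, kH, kW), i.e. C,F,H,W.
w = torch.randn(C_in, C_out // G, kH, kW)

# Step 1, expansion while C_in is still leading: C,F,H,W -> G,C/G,F,H,W.
expanded = w.reshape(G, C_in // G, C_out // G, kH, kW)

# Step 2, dimension interchange: G,C/G,F,H,W -> G,F,C/G,H,W.
grouped = expanded.transpose(1, 2)
assert grouped.shape == (G, C_out // G, C_in // G, kH, kW)

# Doing the interchange first yields F,C,H,W, where the two factors of the
# group split (G and C_in/G) are no longer adjacent dimensions, so no single
# reshape/expand_shape of this tensor can produce the G,F,C/G,H,W layout.
swapped_first = w.transpose(0, 1)  # (C_out/G, C_in, kH, kW)
```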

projects/pt1/e2e_testing/xfail_sets.py

Lines changed: 2 additions & 0 deletions
@@ -3537,6 +3537,7 @@
     "ConvolutionModule2DTransposeStridedStatic_basic",
     "ConvolutionModule2DTransposeStrided_basic",
     "ConvolutionModule2DTranspose_basic",
+    "ConvolutionModule2DGroupedTranspose_basic",
     "CumsumInputDtypeInt32Module_basic",
     "CumsumModule_basic",
     "CumsumStaticModule_basic",
@@ -4113,6 +4114,7 @@
     "ConvolutionModule2DTransposeStridedStatic_basic",
     "ConvolutionModule2DTransposeStrided_basic",
     "ConvolutionModule2DTranspose_basic",
+    "ConvolutionModule2DGroupedTranspose_basic",
     "CopyModule_basic",
     "CopyWithDifferentDTypesAndSizesModule_basic",
     "CopyWithDifferentDTypesModule_basic",

projects/pt1/python/torch_mlir_e2e_test/test_suite/conv.py

Lines changed: 32 additions & 0 deletions
@@ -1725,3 +1725,35 @@ def DeformConv2D_basic(module, tu: TestUtils):
     offset = tu.rand(N, offset_dim1, Hout, Wout)
     weight = tu.rand(Cout, Cin, Hker, Wker)
     module.forward(input, offset, weight)
+
+
+class ConvolutionModule2DGroupedTranspose(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    @export
+    @annotate_args(
+        [
+            None,
+            ([1, 2, 5, 7], torch.float32, True),
+            ([2, 2, 3, 3], torch.float32, True),
+            ([4], torch.float32, True),
+        ]
+    )
+    def forward(self, inputVec, weight, bias):
+        return torch.ops.aten.convolution(
+            inputVec,
+            weight,
+            bias=bias,
+            stride=[2, 2],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=True,
+            output_padding=[0, 0],
+            groups=2,
+        )
+
+
+@register_test_case(module_factory=lambda: ConvolutionModule2DGroupedTranspose())
+def ConvolutionModule2DGroupedTranspose_basic(module, tu: TestUtils):
+    module.forward(tu.rand(1, 2, 5, 7), tu.rand(2, 2, 3, 3), tu.rand(4))
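
As a sanity check on this configuration, the expected output spatial size follows the transposed-convolution shape formula from the ConvTranspose2d documentation linked in the commit message; the snippet below is illustrative and not part of the test suite.

```python
# H_out = (H_in - 1) * stride - 2 * padding + dilation * (kernel - 1)
#         + output_padding + 1   (per the ConvTranspose2d docs)
def conv_transpose_out(size, stride=2, padding=1, dilation=1, kernel=3, output_padding=0):
    return (size - 1) * stride - 2 * padding + dilation * (kernel - 1) + output_padding + 1

print(conv_transpose_out(5), conv_transpose_out(7))  # 9 13: output is 1x4x9x13
```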

test/Conversion/TorchToLinalg/convolution.mlir

Lines changed: 74 additions & 0 deletions
@@ -76,3 +76,77 @@ func.func @conv_broadcast(%arg0: !torch.vtensor<[1,80,3000],f32>, %arg1: !torch.
   %2 = torch.aten.convolution %arg0, %arg1, %arg2, %0, %0, %0, %false, %1, %int1 : !torch.vtensor<[1,80,3000],f32>, !torch.vtensor<[1024,80,3],f32>, !torch.vtensor<[1024],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int -> !torch.vtensor<[1,1024,3000],f32>
   return %2 : !torch.vtensor<[1,1024,3000],f32>
 }
+
+// CHECK-LABEL: func.func @transposedConv2D(
+// CHECK-SAME: %[[INPUT_TENSOR:.*]]: !torch.vtensor<[1,2,5,7],f32>) -> !torch.vtensor<[1,4,10,14],f32>
+// CHECK: = linalg.generic
+// CHECK-SAME: outs(%[[BROADCASTED_WEIGHTS_INIT:.*]] : tensor<4x2x3x3xf32>) {
+// CHECK: %[[WEIGHTS:.*]] = tensor.extract
+// CHECK-SAME: : tensor<2x4x3x3xf32>
+// CHECK-NEXT: linalg.yield %[[BROADCASTED_WEIGHTS:.*]] : f32
+// CHECK-NEXT: } -> tensor<4x2x3x3xf32>
+// CHECK: %[[BROADCASTED_BIAS:.*]] = linalg.broadcast ins(%[[BIAS:.*]] : tensor<4xf32>) outs(%[[BROADCASTED_BIAS_INIT:.*]] : tensor<1x4x11x15xf32>) dimensions = [0, 2, 3]
+// CHECK: %[[CONV_RESULT:.*]] = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
+// CHECK-SAME: ins(%[[INPUT_TENSOR_ADAPTED:.*]], %[[BROADCASTED_WEIGHTS:.*]] : tensor<1x2x13x17xf32>, tensor<4x2x3x3xf32>) outs(%[[BROADCASTED_BIAS:.*]] : tensor<1x4x11x15xf32>) -> tensor<1x4x11x15xf32>
+// CHECK-NEXT: %[[OUTPUT_TENSOR_DYN:.*]] = tensor.cast %[[CONV_RESULT:.*]] : tensor<1x4x11x15xf32> to tensor<1x4x?x?xf32>
+// CHECK-NEXT: %[[OUTPUT_TENSOR:.*]] = tensor.cast %[[OUTPUT_TENSOR_DYN:.*]] : tensor<1x4x?x?xf32> to tensor<1x4x10x14xf32>
+func.func @transposedConv2D(%arg0: !torch.vtensor<[1,2,5,7],f32>) -> !torch.vtensor<[1,4,10,14],f32> attributes {torch.assume_strict_symbolic_shapes} {
+  %int0 = torch.constant.int 0
+  %true = torch.constant.bool true
+  %int1 = torch.constant.int 1
+  %int2 = torch.constant.int 2
+  %0 = torch.vtensor.literal(dense_resource<torch_tensor_2_4_3_3_torch.float32> : tensor<2x4x3x3xf32>) : !torch.vtensor<[2,4,3,3],f32>
+  %1 = torch.vtensor.literal(dense_resource<torch_tensor_4_torch.float32> : tensor<4xf32>) : !torch.vtensor<[4],f32>
+  %2 = torch.prim.ListConstruct %int2, %int2 : (!torch.int, !torch.int) -> !torch.list<int>
+  %3 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %4 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %5 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %6 = torch.aten.convolution %arg0, %0, %1, %2, %3, %4, %true, %5, %int1 : !torch.vtensor<[1,2,5,7],f32>, !torch.vtensor<[2,4,3,3],f32>, !torch.vtensor<[4],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int -> !torch.vtensor<[1,4,10,14],f32>
+  return %6 : !torch.vtensor<[1,4,10,14],f32>
+}
+
+// CHECK-LABEL: func.func @groupedConvolution2D(
+// CHECK-SAME: %[[INPUT_TENSOR:.*]]: !torch.vtensor<[1,4,5,7],f32>) -> !torch.vtensor<[1,4,5,7],f32>
+// CHECK: %[[BROADCASTED_BIAS:.*]] = linalg.broadcast ins(%[[BIAS:.*]] : tensor<4xf32>) outs(%[[BROADCASTED_BIAS_INIT:.*]] : tensor<1x4x5x7xf32>) dimensions = [0, 2, 3]
+// CHECK: %[[CONV_RESULT:.*]] = linalg.conv_2d_ngchw_gfchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
+// CHECK-SAME: ins(%[[INPUT_TENSOR_ADAPTED:.*]], %[[BROADCASTED_WEIGHTS:.*]] : tensor<1x2x2x7x9xf32>, tensor<2x2x2x3x3xf32>) outs(%[[BROADCASTED_BIAS:.*]] : tensor<1x2x2x5x7xf32>) -> tensor<1x2x2x5x7xf32>
+// CHECK-NEXT: %[[OUTPUT_TENSOR:.*]] = tensor.collapse_shape
+// CHECK-SAME: tensor<1x2x2x5x7xf32> into tensor<1x4x5x7xf32>
+func.func @groupedConvolution2D(%arg0: !torch.vtensor<[1,4,5,7],f32>) -> !torch.vtensor<[1,4,5,7],f32> attributes {torch.assume_strict_symbolic_shapes} {
+  %int0 = torch.constant.int 0
+  %false = torch.constant.bool false
+  %int1 = torch.constant.int 1
+  %int2 = torch.constant.int 2
+  %0 = torch.vtensor.literal(dense_resource<torch_tensor_4_2_3_3_torch.float32> : tensor<4x2x3x3xf32>) : !torch.vtensor<[4,2,3,3],f32>
+  %1 = torch.vtensor.literal(dense_resource<torch_tensor_4_torch.float32> : tensor<4xf32>) : !torch.vtensor<[4],f32>
+  %2 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %3 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %4 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %5 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %6 = torch.aten.convolution %arg0, %0, %1, %2, %3, %4, %false, %5, %int2 : !torch.vtensor<[1,4,5,7],f32>, !torch.vtensor<[4,2,3,3],f32>, !torch.vtensor<[4],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int -> !torch.vtensor<[1,4,5,7],f32>
+  return %6 : !torch.vtensor<[1,4,5,7],f32>
+}
+
+// CHECK-LABEL: func.func @transposedGroupedConvolution2D(
+// CHECK-SAME: %[[INPUT_TENSOR:.*]]: !torch.vtensor<[1,2,5,7],f32>) -> !torch.vtensor<[1,4,10,14],f32>
+// CHECK: %[[BROADCASTED_BIAS:.*]] = linalg.broadcast ins(%[[BIAS:.*]] : tensor<4xf32>) outs(%[[BROADCASTED_BIAS_INIT:.*]] : tensor<1x4x11x15xf32>) dimensions = [0, 2, 3]
+// CHECK: %[[CONV_RESULT:.*]] = linalg.conv_2d_ngchw_gfchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
+// CHECK-SAME: ins(%[[INPUT_TENSOR_ADAPTED:.*]], %[[BROADCASTED_WEIGHTS:.*]] : tensor<1x2x1x13x17xf32>, tensor<2x2x1x3x3xf32>) outs(%[[BROADCASTED_BIAS:.*]] : tensor<1x2x2x11x15xf32>) -> tensor<1x2x2x11x15xf32>
+// CHECK-NEXT: %[[COLLAPSED_TENSOR:.*]] = tensor.collapse_shape
+// CHECK-SAME: tensor<1x2x2x11x15xf32> into tensor<1x4x11x15xf32>
+// CHECK-NEXT: %[[OUTPUT_TENSOR_DYN:.*]] = tensor.cast %[[COLLAPSED_TENSOR:.*]] : tensor<1x4x11x15xf32> to tensor<1x4x?x?xf32>
+// CHECK-NEXT: %[[OUTPUT_TENSOR:.*]] = tensor.cast %[[OUTPUT_TENSOR_DYN:.*]] : tensor<1x4x?x?xf32> to tensor<1x4x10x14xf32>
+func.func @transposedGroupedConvolution2D(%arg0: !torch.vtensor<[1,2,5,7],f32>) -> !torch.vtensor<[1,4,10,14],f32> attributes {torch.assume_strict_symbolic_shapes} {
+  %int0 = torch.constant.int 0
+  %true = torch.constant.bool true
+  %int1 = torch.constant.int 1
+  %int2 = torch.constant.int 2
+  %0 = torch.vtensor.literal(dense_resource<torch_tensor_2_2_3_3_torch.float32> : tensor<2x2x3x3xf32>) : !torch.vtensor<[2,2,3,3],f32>
+  %1 = torch.vtensor.literal(dense_resource<torch_tensor_4_torch.float32> : tensor<4xf32>) : !torch.vtensor<[4],f32>
+  %2 = torch.prim.ListConstruct %int2, %int2 : (!torch.int, !torch.int) -> !torch.list<int>
+  %3 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %4 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %5 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %6 = torch.aten.convolution %arg0, %0, %1, %2, %3, %4, %true, %5, %int2 : !torch.vtensor<[1,2,5,7],f32>, !torch.vtensor<[2,2,3,3],f32>, !torch.vtensor<[4],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int -> !torch.vtensor<[1,4,10,14],f32>
+  return %6 : !torch.vtensor<[1,4,10,14],f32>
+}
