From 751e1fc0f4967b26971eead6dc9a0f40503a6cf7 Mon Sep 17 00:00:00 2001
From: HarryZ <uqbarz@gmail.com>
Date: Mon, 3 Mar 2025 16:34:02 +0800
Subject: [PATCH 1/7] [feat] reconstruct resize op, nearest and bilinear

---
 midend/lib/Utils/DIPUtils.cpp | 357 ++++++++++++++++++++++++++++++++++
 1 file changed, 357 insertions(+)

diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp
index d68451bb51..4519ceb1d1 100644
--- a/midend/lib/Utils/DIPUtils.cpp
+++ b/midend/lib/Utils/DIPUtils.cpp
@@ -987,6 +987,363 @@ void fillPixelsBilinearInterpolate4D(
       });
 }
 
+void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
+                                              MLIRContext *ctx, Value input,
+                                              Value output,
+                                              Value horizontalScalingFactor,
+                                              Value verticalScalingFactor) {
+  Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
+  Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+  // Last valid input row/col (extent - 1) as i16, used to clamp source coords.
+  Value inputRow = builder.create<memref::DimOp>(loc, input, c0);
+  Value inputRowMinus1 = builder.create<arith::IndexCastUIOp>(
+      loc, builder.getI16Type(),
+      builder.create<arith::SubIOp>(loc, inputRow, c1));
+  Value inputCol = builder.create<memref::DimOp>(loc, input, c1);
+  Value inputColMinus1 = builder.create<arith::IndexCastUIOp>(
+      loc, builder.getI16Type(),
+      builder.create<arith::SubIOp>(loc, inputCol, c1));
+
+  Value outputRow = builder.create<memref::DimOp>(loc, output, c0);
+  Value outputCol = builder.create<memref::DimOp>(loc, output, c1);
+  // Per-output-column table of clamped source columns (i16); no dealloc here.
+  MemRefType dynamicTypeI16 =
+      MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16));
+  Value srcXPosVec =
+      builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputCol);
+  builder.create<scf::ForOp>(
+      loc, c0, outputCol, c1, std::nullopt,
+      [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
+        Value srcXPos = xBuilder.create<arith::FPToUIOp>(
+            xLoc, xBuilder.getI16Type(),
+            xBuilder.create<arith::MulFOp>(xLoc,
+                                           indexToF32(xBuilder, xLoc, xiv),
+                                           horizontalScalingFactor));
+        srcXPos =
+            xBuilder.create<arith::MinSIOp>(xLoc, srcXPos, inputColMinus1);
+        xBuilder.create<memref::StoreOp>(xLoc, srcXPos, srcXPosVec,
+                                         ValueRange{xiv});
+        xBuilder.create<scf::YieldOp>(xLoc);
+      });
+  // Per output row: clamp the source row, then gather pixels via the table.
+  builder.create<scf::ForOp>(
+      loc, c0, outputRow, c1, std::nullopt,
+      [&](OpBuilder &yBuilder, Location yLoc, Value yiv, ValueRange) {
+        Value srcYPos = yBuilder.create<arith::FPToUIOp>(
+            yLoc, yBuilder.getI16Type(),
+            yBuilder.create<arith::MulFOp>(
+                yLoc, indexToF32(yBuilder, yLoc, yiv), verticalScalingFactor));
+        srcYPos =
+            yBuilder.create<arith::MinSIOp>(yLoc, srcYPos, inputRowMinus1);
+        srcYPos = yBuilder.create<arith::IndexCastOp>(
+            yLoc, yBuilder.getIndexType(), srcYPos);
+        yBuilder.create<scf::ForOp>(
+            loc, c0, outputCol, c1, std::nullopt,
+            [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
+              Value srcXPos = xBuilder.create<memref::LoadOp>(xLoc, srcXPosVec,
+                                                              ValueRange{xiv});
+              srcXPos = xBuilder.create<arith::IndexCastOp>(
+                  xLoc, xBuilder.getIndexType(), srcXPos);
+              Value srcPixel = xBuilder.create<memref::LoadOp>(
+                  xLoc, input, ValueRange{srcYPos, srcXPos});
+              xBuilder.create<memref::StoreOp>(xLoc, srcPixel, output,
+                                               ValueRange{yiv, xiv});
+              xBuilder.create<scf::YieldOp>(xLoc);
+            });
+        yBuilder.create<scf::YieldOp>(yLoc);
+      });
+}
+
+void processScaling(OpBuilder &builder, Location loc, Value output,
+                    Value scalingFactor, Value input, Value xOffset,
+                    Value iAlpha) {
+  static const int SHIFT = 11;
+  static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT;
+  static const int HALF = 1 << (SHIFT - 1);
+
+  Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
+  Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+  Value c2 = builder.create<arith::ConstantIndexOp>(loc, 2);
+  Value c0I32 =
+      builder.create<arith::IndexCastUIOp>(loc, builder.getI32Type(), c0);
+  Value c0F = builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)0.0f,
+                                                     builder.getF32Type());
+  Value cDot5F = builder.create<arith::ConstantFloatOp>(
+      loc, (llvm::APFloat)0.5f, builder.getF32Type());
+  Value c1F = builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)1.0f,
+                                                     builder.getF32Type());
+  Value inputMinus1 = builder.create<arith::IndexCastUIOp>(
+      loc, builder.getI32Type(), builder.create<arith::SubIOp>(loc, input, c1));
+  Value inputMinus2 = builder.create<arith::IndexCastUIOp>(
+      loc, builder.getI32Type(), builder.create<arith::SubIOp>(loc, input, c2));
+  builder.create<scf::ForOp>(
+      loc, c0, output, c1, std::nullopt,
+      [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
+        //  float fx = (float)((dx + 0.5) * scale_x - 0.5);
+        Value xivF = indexToF32(xBuilder, xLoc, xiv);
+        Value temp1 = xBuilder.create<arith::AddFOp>(xLoc, xivF, cDot5F);
+        Value temp2 =
+            xBuilder.create<arith::MulFOp>(xLoc, temp1, scalingFactor);
+        Value fx = xBuilder.create<arith::SubFOp>(xLoc, temp2, cDot5F);
+        Value sx =
+            xBuilder.create<arith::FPToSIOp>(xLoc, xBuilder.getI32Type(), fx);
+        fx = xBuilder.create<arith::SubFOp>(
+            xLoc, fx,
+            xBuilder.create<arith::SIToFPOp>(xLoc, xBuilder.getF32Type(), sx));
+        Value lowerThanZero = xBuilder.create<arith::CmpIOp>(
+            xLoc, arith::CmpIPredicate::slt, sx, c0I32);
+        Value greaterThan = xBuilder.create<arith::CmpIOp>(
+            xLoc, arith::CmpIPredicate::sge, sx, inputMinus1);
+        sx = xBuilder.create<arith::SelectOp>(xLoc, lowerThanZero, c0I32, sx);
+        fx = xBuilder.create<arith::SelectOp>(xLoc, lowerThanZero, c0F, fx);
+        sx = xBuilder.create<arith::SelectOp>(xLoc, greaterThan, inputMinus2,
+                                              sx);
+        fx = xBuilder.create<arith::SelectOp>(xLoc, greaterThan, c0F, fx);
+        xBuilder.create<memref::StoreOp>(xLoc, sx, xOffset, ValueRange{xiv});
+
+        //  ialpha[dx * 2]     = (short)((1.f - fx) * INTER_RESIZE_COEF_SCALE);
+        //  ialpha[dx * 2 + 1] = (short)(fx * INTER_RESIZE_COEF_SCALE);
+        Value fxScale = xBuilder.create<arith::MulFOp>(
+            xLoc, fx,
+            xBuilder.create<arith::ConstantFloatOp>(
+                xLoc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE,
+                xBuilder.getF32Type()));
+        Value oneMinusFx = xBuilder.create<arith::SubFOp>(xLoc, c1F, fx);
+        Value oneMinusFxScale = xBuilder.create<arith::MulFOp>(
+            xLoc, oneMinusFx,
+            xBuilder.create<arith::ConstantFloatOp>(
+                xLoc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE,
+                xBuilder.getF32Type()));
+
+        Value index0 = xBuilder.create<arith::MulIOp>(xLoc, xiv, c2);
+        Value index1 = xBuilder.create<arith::AddIOp>(xLoc, index0, c1);
+
+        Value val0 = xBuilder.create<arith::FPToSIOp>(
+            xLoc, xBuilder.getI16Type(), oneMinusFxScale);
+        Value val1 = xBuilder.create<arith::FPToSIOp>(
+            xLoc, xBuilder.getI16Type(), fxScale);
+
+        xBuilder.create<memref::StoreOp>(xLoc, val0, iAlpha,
+                                         ValueRange{index0});
+        xBuilder.create<memref::StoreOp>(xLoc, val1, iAlpha,
+                                         ValueRange{index1});
+        xBuilder.create<scf::YieldOp>(xLoc);
+      });
+}
+
+void calcInterpolation(OpBuilder &builder, Location loc, Value &sy,
+                       Value &syPrev, Value offset, Value input,
+                       Value outputWidth, Value iAlpha, Value buffer) {
+  static const int SHIFT = 11;
+  static const int HALF = 1 << (SHIFT - 1);
+  auto inElemTy = input.getType().cast<MemRefType>().getElementType();
+  Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
+  Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+  Value c0I =
+      builder.create<arith::ConstantIntOp>(loc, 0, builder.getI32Type());
+  Value c1I =
+      builder.create<arith::ConstantIntOp>(loc, 1, builder.getI32Type());
+  Value c2 = builder.create<arith::ConstantIndexOp>(loc, 2);
+  Value c2I =
+      builder.create<arith::ConstantIntOp>(loc, 2, builder.getI32Type());
+  Value notEqual =
+      builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ne, sy, syPrev);
+  builder.create<scf::IfOp>(
+      loc, notEqual, [&](OpBuilder &tBuilder, Location tLoc) {
+        tBuilder.create<scf::ForOp>(
+            tLoc, c0, outputWidth, c1, std::nullopt,
+            [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
+              Value sx = xBuilder.create<memref::LoadOp>(xLoc, offset,
+                                                         ValueRange{xiv});
+              Value sxPlus1 = xBuilder.create<arith::AddIOp>(xLoc, sx, c1I);
+              Value xMul2 = xBuilder.create<arith::MulIOp>(xLoc, xiv, c2);
+              xMul2 = xBuilder.create<arith::IndexCastOp>(
+                  xLoc, xBuilder.getI32Type(), xMul2);
+              Value xMul2Plus1 =
+                  xBuilder.create<arith::AddIOp>(xLoc, xMul2, c1I);
+              Value index0 = xBuilder.create<arith::IndexCastOp>(
+                  xLoc, xBuilder.getIndexType(), xMul2);
+              Value index1 = xBuilder.create<arith::IndexCastOp>(
+                  xLoc, xBuilder.getIndexType(), xMul2Plus1);
+              Value a0 = xBuilder.create<memref::LoadOp>(xLoc, iAlpha,
+                                                         ValueRange{index0});
+              Value a1 = xBuilder.create<memref::LoadOp>(xLoc, iAlpha,
+                                                         ValueRange{index1});
+              Value sxIndex = xBuilder.create<arith::IndexCastOp>(
+                  xLoc, xBuilder.getIndexType(), sx);
+              Value sxPlus1Index = xBuilder.create<arith::IndexCastOp>(
+                  xLoc, xBuilder.getIndexType(), sxPlus1);
+              Value syIndex = xBuilder.create<arith::IndexCastOp>(
+                  xLoc, xBuilder.getIndexType(), sy);
+              Value v0 = xBuilder.create<memref::LoadOp>(
+                  xLoc, input, ValueRange{syIndex, sxIndex});
+              v0 = xBuilder.create<arith::FPToSIOp>(xLoc, xBuilder.getI32Type(),
+                                                    v0);
+              Value v1 = xBuilder.create<memref::LoadOp>(
+                  xLoc, input, ValueRange{syIndex, sxPlus1Index});
+              v1 = xBuilder.create<arith::FPToSIOp>(xLoc, xBuilder.getI32Type(),
+                                                    v1);
+              Value a0I32 = xBuilder.create<arith::ExtSIOp>(
+                  xLoc, xBuilder.getI32Type(), a0);
+              Value a1I32 = xBuilder.create<arith::ExtSIOp>(
+                  xLoc, xBuilder.getI32Type(), a1);
+              Value v0MulA0 = xBuilder.create<arith::MulIOp>(xLoc, v0, a0I32);
+              Value v1MulA1 = xBuilder.create<arith::MulIOp>(xLoc, v1, a1I32);
+              Value add1 =
+                  xBuilder.create<arith::AddIOp>(xLoc, v0MulA0, v1MulA1);
+              Value half = xBuilder.create<arith::ConstantIntOp>(
+                  xLoc, HALF, xBuilder.getI32Type());
+              Value add2 = xBuilder.create<arith::AddIOp>(xLoc, add1, half);
+              Value shift = xBuilder.create<arith::ConstantIntOp>(
+                  xLoc, SHIFT, xBuilder.getI32Type());
+              Value resShifted =
+                  xBuilder.create<arith::ShRSIOp>(xLoc, add2, shift);
+              xBuilder.create<memref::StoreOp>(xLoc, resShifted, buffer,
+                                               ValueRange{xiv});
+              xBuilder.create<scf::YieldOp>(xLoc);
+            });
+        // syPrev = sy;
+        tBuilder.create<scf::YieldOp>(tLoc);
+      });
+  syPrev = builder.create<arith::SelectOp>(loc, notEqual, sy, syPrev);
+}
+
+void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc,
+                                      MLIRContext *ctx, Value input,
+                                      Value output,
+                                      Value horizontalScalingFactor,
+                                      Value verticalScalingFactor) {
+  static const int SHIFT = 11;
+  static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT;
+  static const int HALF = 1 << (SHIFT - 1);
+  auto inElemTy = input.getType().cast<MemRefType>().getElementType();
+  Value cMinus1 = builder.create<arith::ConstantIndexOp>(loc, -1);
+  Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
+  Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+  Value c1I =
+      builder.create<arith::ConstantIntOp>(loc, 1, builder.getI32Type());
+  Value c2 = builder.create<arith::ConstantIndexOp>(loc, 2);
+  Value c2I =
+      builder.create<arith::ConstantIntOp>(loc, 2, builder.getI32Type());
+  Value c1F = builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)1.0f,
+                                                     builder.getF32Type());
+
+  Value inputRow = builder.create<memref::DimOp>(loc, input, c0);
+  Value inputCol = builder.create<memref::DimOp>(loc, input, c1);
+
+  Value outputRow = builder.create<memref::DimOp>(loc, output, c0);
+  Value outputCol = builder.create<memref::DimOp>(loc, output, c1);
+
+  MemRefType dynamicTypeI32 =
+      MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 32));
+  Value xOffset =
+      builder.create<memref::AllocOp>(loc, dynamicTypeI32, outputCol);
+  MemRefType dynamicTypeI16 =
+      MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16));
+  Value outputColMul2 = builder.create<arith::MulIOp>(loc, outputCol, c2);
+  Value iAlpha =
+      builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputColMul2);
+
+  processScaling(builder, loc, outputCol, horizontalScalingFactor, inputCol,
+                 xOffset, iAlpha);
+
+  Value yOffset =
+      builder.create<memref::AllocOp>(loc, dynamicTypeI32, outputRow);
+  Value outputRowMul2 = builder.create<arith::MulIOp>(loc, outputRow, c2);
+  Value iBeta =
+      builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputRowMul2);
+
+  processScaling(builder, loc, outputRow, verticalScalingFactor, inputRow,
+                 yOffset, iBeta);
+
+  Value bufferWidth = outputCol;
+  Value buffer0 =
+      builder.create<memref::AllocOp>(loc, dynamicTypeI32, bufferWidth);
+  Value buffer1 =
+      builder.create<memref::AllocOp>(loc, dynamicTypeI32, bufferWidth);
+  Value prevSy0 =
+      builder.create<arith::IndexCastOp>(loc, builder.getI32Type(), cMinus1);
+  Value prevSy1 =
+      builder.create<arith::IndexCastOp>(loc, builder.getI32Type(), cMinus1);
+
+  // builder.create<scf::ParallelOp>(
+  //   loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{c1},
+  //   [&](OpBuilder &yBuilder, Location yLoc, ValueRange ivs) {
+  //     Value yiv = ivs[0];
+  //   }
+  // );
+  builder.create<scf::ForOp>(
+      loc, c0, outputRow, c1, std::nullopt,
+      [&](OpBuilder &yBuilder, Location yLoc, Value yiv, ValueRange) {
+        Value sy =
+            yBuilder.create<memref::LoadOp>(yLoc, yOffset, ValueRange{yiv});
+        Value syNext = yBuilder.create<arith::AddIOp>(yLoc, sy, c1I);
+        calcInterpolation(yBuilder, yLoc, sy, prevSy0, xOffset, input,
+                          outputCol, iAlpha, buffer0);
+        calcInterpolation(yBuilder, yLoc, syNext, prevSy1, xOffset, input,
+                          outputCol, iAlpha, buffer1);
+
+        Value yMul2 = yBuilder.create<arith::MulIOp>(yLoc, yiv, c2);
+        yMul2 = yBuilder.create<arith::IndexCastOp>(yLoc, yBuilder.getI32Type(),
+                                                    yMul2);
+        Value yMul2Plus1 = yBuilder.create<arith::AddIOp>(yLoc, yMul2, c1I);
+        Value index0 = yBuilder.create<arith::IndexCastOp>(
+            yLoc, yBuilder.getIndexType(), yMul2);
+        Value index1 = yBuilder.create<arith::IndexCastOp>(
+            yLoc, yBuilder.getIndexType(), yMul2Plus1);
+        Value b0 =
+            yBuilder.create<memref::LoadOp>(yLoc, iBeta, ValueRange{index0});
+        b0 = yBuilder.create<arith::ExtSIOp>(yLoc, yBuilder.getI32Type(), b0);
+        Value b1 =
+            yBuilder.create<memref::LoadOp>(yLoc, iBeta, ValueRange{index1});
+        b1 = yBuilder.create<arith::ExtSIOp>(yLoc, yBuilder.getI32Type(), b1);
+        // Value b0 = yBuilder.create<memref::LoadOp>(yLoc, iBeta,
+        // ValueRange{yiv}); b0 = yBuilder.create<arith::ExtSIOp>(yLoc,
+        // yBuilder.getI32Type(), b0); Value yPlus1 =
+        // builder.create<arith::AddIOp>(yLoc, yiv, c1); Value b1 =
+        // yBuilder.create<memref::LoadOp>(yLoc, iBeta, ValueRange{yPlus1}); b1
+        // = yBuilder.create<arith::ExtSIOp>(yLoc, yBuilder.getI32Type(), b1);
+        yBuilder.create<scf::ForOp>(
+            yLoc, c0, bufferWidth, c1, std::nullopt,
+            [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
+              Value buffer0X = xBuilder.create<memref::LoadOp>(xLoc, buffer0,
+                                                               ValueRange{xiv});
+              Value buffer1X = xBuilder.create<memref::LoadOp>(xLoc, buffer1,
+                                                               ValueRange{xiv});
+              Value b0MulBuffer0 =
+                  xBuilder.create<arith::MulIOp>(xLoc, b0, buffer0X);
+              Value b1MulBuffer1 =
+                  xBuilder.create<arith::MulIOp>(xLoc, b1, buffer1X);
+              Value add = xBuilder.create<arith::AddIOp>(xLoc, b0MulBuffer0,
+                                                         b1MulBuffer1);
+              Value half = xBuilder.create<arith::ConstantIntOp>(
+                  xLoc, HALF, xBuilder.getI32Type());
+              Value addHalf = xBuilder.create<arith::AddIOp>(xLoc, add, half);
+              Value shift = xBuilder.create<arith::ConstantIntOp>(
+                  xLoc, SHIFT, xBuilder.getI32Type());
+              Value resShifted =
+                  xBuilder.create<arith::ShRSIOp>(xLoc, addHalf, shift);
+              Value zero = xBuilder.create<arith::ConstantIntOp>(
+                  xLoc, 0, xBuilder.getI32Type());
+              Value twoFiftyFive = xBuilder.create<arith::ConstantIntOp>(
+                  xLoc, 255, xBuilder.getI32Type());
+              Value maxVal =
+                  xBuilder.create<arith::MaxSIOp>(xLoc, resShifted, zero);
+              Value clampedVal =
+                  xBuilder.create<arith::MinSIOp>(xLoc, maxVal, twoFiftyFive);
+              FloatType type = inElemTy.isF32()
+                                   ? FloatType::getF32(xBuilder.getContext())
+                                   : FloatType::getF64(xBuilder.getContext());
+              Value clampedValF =
+                  xBuilder.create<arith::SIToFPOp>(xLoc, type, clampedVal);
+              xBuilder.create<memref::StoreOp>(xLoc, clampedValF, output,
+                                               ValueRange{yiv, xiv});
+              xBuilder.create<scf::YieldOp>(xLoc);
+            });
+        yBuilder.create<scf::YieldOp>(yLoc);
+      });
+}
+
 // Helper function for resizing an image using nearest neighbour interpolation
 // mechanism.
 void NearestNeighbourInterpolationResizing(

From 91b75294e13885d1a3f5a8b9f00a6c8ebb92be88 Mon Sep 17 00:00:00 2001
From: HarryZ <uqbarz@gmail.com>
Date: Fri, 7 Mar 2025 14:07:23 +0800
Subject: [PATCH 2/7] [feat] vectorization of resize op

---
 midend/lib/Utils/DIPUtils.cpp | 397 +++++++++++++++++++++-------------
 1 file changed, 245 insertions(+), 152 deletions(-)

diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp
index 4519ceb1d1..ae0c6da7c7 100644
--- a/midend/lib/Utils/DIPUtils.cpp
+++ b/midend/lib/Utils/DIPUtils.cpp
@@ -1056,89 +1056,123 @@ void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
 
 void processScaling(OpBuilder &builder, Location loc, Value output,
                     Value scalingFactor, Value input, Value xOffset,
-                    Value iAlpha) {
+                    Value iAlpha, int64_t stride) {
   static const int SHIFT = 11;
   static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT;
   static const int HALF = 1 << (SHIFT - 1);
 
+  VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType());
+  VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type());
+  VectorType vectorTyI16 =
+      VectorType::get({stride}, IntegerType::get(builder.getContext(), 16));
+  VectorType vectorTyI32 =
+      VectorType::get({stride}, IntegerType::get(builder.getContext(), 32));
+  VectorType vectorTyF32 =
+      VectorType::get({stride}, FloatType::getF32(builder.getContext()));
+
   Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
   Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
   Value c2 = builder.create<arith::ConstantIndexOp>(loc, 2);
-  Value c0I32 =
-      builder.create<arith::IndexCastUIOp>(loc, builder.getI32Type(), c0);
-  Value c0F = builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)0.0f,
-                                                     builder.getF32Type());
-  Value cDot5F = builder.create<arith::ConstantFloatOp>(
-      loc, (llvm::APFloat)0.5f, builder.getF32Type());
+  Value c0I32Vec = builder.create<vector::SplatOp>(
+      loc, vectorTyI32,
+      builder.create<arith::IndexCastUIOp>(loc, builder.getI32Type(), c0));
+  Value c0FVec = builder.create<vector::SplatOp>(
+      loc, vectorTyF32,
+      builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)0.0f,
+                                             builder.getF32Type()));
+  Value cDot5FVec = builder.create<vector::SplatOp>(
+      loc, vectorTyF32,
+      builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)0.5f,
+                                             builder.getF32Type()));
+  Value scalingFactorVec =
+      builder.create<vector::SplatOp>(loc, vectorTyF32, scalingFactor);
   Value c1F = builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)1.0f,
                                                      builder.getF32Type());
+  Value c1FVec = builder.create<vector::SplatOp>(loc, vectorTyF32, c1F);
   Value inputMinus1 = builder.create<arith::IndexCastUIOp>(
       loc, builder.getI32Type(), builder.create<arith::SubIOp>(loc, input, c1));
+  Value inputMinus1Vec =
+      builder.create<vector::SplatOp>(loc, vectorTyI32, inputMinus1);
   Value inputMinus2 = builder.create<arith::IndexCastUIOp>(
       loc, builder.getI32Type(), builder.create<arith::SubIOp>(loc, input, c2));
+  Value inputMinus2Vec =
+      builder.create<vector::SplatOp>(loc, vectorTyI32, inputMinus2);
+  Value stepVec = iotaVec0F32(builder, loc, stride);
+  Value strideVal = builder.create<arith::ConstantIndexOp>(loc, stride);
+  Value scaleCoefVec = builder.create<vector::SplatOp>(
+      loc, vectorTyF32,
+      builder.create<arith::ConstantFloatOp>(
+          loc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE,
+          builder.getF32Type()));
   builder.create<scf::ForOp>(
-      loc, c0, output, c1, std::nullopt,
+      loc, c0, output, strideVal, std::nullopt,
       [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
         //  float fx = (float)((dx + 0.5) * scale_x - 0.5);
-        Value xivF = indexToF32(xBuilder, xLoc, xiv);
-        Value temp1 = xBuilder.create<arith::AddFOp>(xLoc, xivF, cDot5F);
+        Value xivFVec = xBuilder.create<vector::SplatOp>(
+            xLoc, vectorTyF32, indexToF32(xBuilder, xLoc, xiv));
+        xivFVec = xBuilder.create<arith::AddFOp>(xLoc, xivFVec, stepVec);
+        Value temp1 = xBuilder.create<arith::AddFOp>(xLoc, xivFVec, cDot5FVec);
         Value temp2 =
-            xBuilder.create<arith::MulFOp>(xLoc, temp1, scalingFactor);
-        Value fx = xBuilder.create<arith::SubFOp>(xLoc, temp2, cDot5F);
-        Value sx =
-            xBuilder.create<arith::FPToSIOp>(xLoc, xBuilder.getI32Type(), fx);
+            xBuilder.create<arith::MulFOp>(xLoc, temp1, scalingFactorVec);
+        Value fx = xBuilder.create<arith::SubFOp>(xLoc, temp2, cDot5FVec);
+        Value sx = xBuilder.create<arith::FPToSIOp>(xLoc, vectorTyI32, fx);
         fx = xBuilder.create<arith::SubFOp>(
-            xLoc, fx,
-            xBuilder.create<arith::SIToFPOp>(xLoc, xBuilder.getF32Type(), sx));
+            xLoc, fx, xBuilder.create<arith::SIToFPOp>(xLoc, vectorTyF32, sx));
         Value lowerThanZero = xBuilder.create<arith::CmpIOp>(
-            xLoc, arith::CmpIPredicate::slt, sx, c0I32);
+            xLoc, arith::CmpIPredicate::slt, sx, c0I32Vec);
         Value greaterThan = xBuilder.create<arith::CmpIOp>(
-            xLoc, arith::CmpIPredicate::sge, sx, inputMinus1);
-        sx = xBuilder.create<arith::SelectOp>(xLoc, lowerThanZero, c0I32, sx);
-        fx = xBuilder.create<arith::SelectOp>(xLoc, lowerThanZero, c0F, fx);
-        sx = xBuilder.create<arith::SelectOp>(xLoc, greaterThan, inputMinus2,
+            xLoc, arith::CmpIPredicate::sge, sx, inputMinus1Vec);
+        sx =
+            xBuilder.create<arith::SelectOp>(xLoc, lowerThanZero, c0I32Vec, sx);
+        fx = xBuilder.create<arith::SelectOp>(xLoc, lowerThanZero, c0FVec, fx);
+        sx = xBuilder.create<arith::SelectOp>(xLoc, greaterThan, inputMinus2Vec,
                                               sx);
-        fx = xBuilder.create<arith::SelectOp>(xLoc, greaterThan, c0F, fx);
-        xBuilder.create<memref::StoreOp>(xLoc, sx, xOffset, ValueRange{xiv});
+        fx = xBuilder.create<arith::SelectOp>(xLoc, greaterThan, c0FVec, fx);
+        Value maskVal = xBuilder.create<arith::SubIOp>(xLoc, output, xiv);
+        Value maskVec =
+            xBuilder.create<vector::CreateMaskOp>(xLoc, vectorTyI1, maskVal);
+        xBuilder.create<vector::MaskedStoreOp>(xLoc, xOffset, ValueRange{xiv},
+                                               maskVec, sx);
 
         //  ialpha[dx * 2]     = (short)((1.f - fx) * INTER_RESIZE_COEF_SCALE);
         //  ialpha[dx * 2 + 1] = (short)(fx * INTER_RESIZE_COEF_SCALE);
-        Value fxScale = xBuilder.create<arith::MulFOp>(
-            xLoc, fx,
-            xBuilder.create<arith::ConstantFloatOp>(
-                xLoc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE,
-                xBuilder.getF32Type()));
-        Value oneMinusFx = xBuilder.create<arith::SubFOp>(xLoc, c1F, fx);
-        Value oneMinusFxScale = xBuilder.create<arith::MulFOp>(
-            xLoc, oneMinusFx,
-            xBuilder.create<arith::ConstantFloatOp>(
-                xLoc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE,
-                xBuilder.getF32Type()));
+        Value fxScale = xBuilder.create<arith::MulFOp>(xLoc, fx, scaleCoefVec);
+        Value oneMinusFx = xBuilder.create<arith::SubFOp>(xLoc, c1FVec, fx);
+        Value oneMinusFxScale =
+            xBuilder.create<arith::MulFOp>(xLoc, oneMinusFx, scaleCoefVec);
 
         Value index0 = xBuilder.create<arith::MulIOp>(xLoc, xiv, c2);
         Value index1 = xBuilder.create<arith::AddIOp>(xLoc, index0, c1);
 
-        Value val0 = xBuilder.create<arith::FPToSIOp>(
-            xLoc, xBuilder.getI16Type(), oneMinusFxScale);
-        Value val1 = xBuilder.create<arith::FPToSIOp>(
-            xLoc, xBuilder.getI16Type(), fxScale);
+        Value val0 = xBuilder.create<arith::FPToSIOp>(xLoc, vectorTyI16,
+                                                      oneMinusFxScale);
+        Value val1 =
+            xBuilder.create<arith::FPToSIOp>(xLoc, vectorTyI16, fxScale);
 
-        xBuilder.create<memref::StoreOp>(xLoc, val0, iAlpha,
+        SmallVector<int64_t> maskVec1;
+        for (int i = 0; i < stride; i++) {
+          maskVec1.push_back(i);
+          maskVec1.push_back(i + stride);
+        }
+        Value storeBack =
+            xBuilder.create<vector::ShuffleOp>(xLoc, val0, val1, maskVec1);
+        xBuilder.create<vector::StoreOp>(xLoc, storeBack, iAlpha,
                                          ValueRange{index0});
-        xBuilder.create<memref::StoreOp>(xLoc, val1, iAlpha,
-                                         ValueRange{index1});
         xBuilder.create<scf::YieldOp>(xLoc);
       });
 }
 
 void calcInterpolation(OpBuilder &builder, Location loc, Value &sy,
                        Value &syPrev, Value offset, Value input,
-                       Value outputWidth, Value iAlpha, Value buffer) {
+                       Value outputWidth, Value iAlpha, Value buffer,
+                       int64_t stride) {
   static const int SHIFT = 11;
   static const int HALF = 1 << (SHIFT - 1);
   auto inElemTy = input.getType().cast<MemRefType>().getElementType();
   Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
   Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+  Value c1I1 =
+      builder.create<arith::ConstantIntOp>(loc, 1, builder.getI1Type());
   Value c0I =
       builder.create<arith::ConstantIntOp>(loc, 0, builder.getI32Type());
   Value c1I =
@@ -1146,115 +1180,169 @@ void calcInterpolation(OpBuilder &builder, Location loc, Value &sy,
   Value c2 = builder.create<arith::ConstantIndexOp>(loc, 2);
   Value c2I =
       builder.create<arith::ConstantIntOp>(loc, 2, builder.getI32Type());
+
+  Value strideVal = builder.create<arith::ConstantIndexOp>(loc, stride);
+  Value strideInt =
+      builder.create<arith::IndexCastOp>(loc, builder.getI32Type(), strideVal);
+  Value outputStrideRatio =
+      builder.create<arith::DivUIOp>(loc, outputWidth, strideVal);
+  Value outputMultiple = builder.create<arith::MulIOp>(
+      loc, builder.create<arith::AddIOp>(loc, outputStrideRatio, c1),
+      strideVal);
+  VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType());
+  VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type());
+  VectorType vectorTyI16 =
+      VectorType::get({stride}, IntegerType::get(builder.getContext(), 16));
+  VectorType vectorTyI32 =
+      VectorType::get({stride}, IntegerType::get(builder.getContext(), 32));
+  VectorType vectorTyF32 =
+      VectorType::get({stride}, FloatType::getF32(builder.getContext()));
+
+  Value c1I1Vec = builder.create<vector::SplatOp>(loc, vectorTyI1, c1I1);
+  Value c1IVec = builder.create<vector::SplatOp>(loc, vectorTyI32, c1I);
+  Value c2IVec = builder.create<vector::SplatOp>(loc, vectorTyI32, c2I);
+  Value half =
+      builder.create<arith::ConstantIntOp>(loc, HALF, builder.getI32Type());
+  Value halfVec = builder.create<vector::SplatOp>(loc, vectorTyI32, half);
+  Value shift =
+      builder.create<arith::ConstantIntOp>(loc, SHIFT, builder.getI32Type());
+  Value shiftVec = builder.create<vector::SplatOp>(loc, vectorTyI32, shift);
+
+  auto passThruConstantOp =
+      builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorTyF32));
+  auto passThruConstantOpI32 =
+      builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorTyI32));
+  auto passThruConstantOpI16 =
+      builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorTyI16));
   Value notEqual =
       builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ne, sy, syPrev);
-  builder.create<scf::IfOp>(
-      loc, notEqual, [&](OpBuilder &tBuilder, Location tLoc) {
-        tBuilder.create<scf::ForOp>(
-            tLoc, c0, outputWidth, c1, std::nullopt,
-            [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
-              Value sx = xBuilder.create<memref::LoadOp>(xLoc, offset,
-                                                         ValueRange{xiv});
-              Value sxPlus1 = xBuilder.create<arith::AddIOp>(xLoc, sx, c1I);
-              Value xMul2 = xBuilder.create<arith::MulIOp>(xLoc, xiv, c2);
-              xMul2 = xBuilder.create<arith::IndexCastOp>(
-                  xLoc, xBuilder.getI32Type(), xMul2);
-              Value xMul2Plus1 =
-                  xBuilder.create<arith::AddIOp>(xLoc, xMul2, c1I);
-              Value index0 = xBuilder.create<arith::IndexCastOp>(
-                  xLoc, xBuilder.getIndexType(), xMul2);
-              Value index1 = xBuilder.create<arith::IndexCastOp>(
-                  xLoc, xBuilder.getIndexType(), xMul2Plus1);
-              Value a0 = xBuilder.create<memref::LoadOp>(xLoc, iAlpha,
-                                                         ValueRange{index0});
-              Value a1 = xBuilder.create<memref::LoadOp>(xLoc, iAlpha,
-                                                         ValueRange{index1});
-              Value sxIndex = xBuilder.create<arith::IndexCastOp>(
-                  xLoc, xBuilder.getIndexType(), sx);
-              Value sxPlus1Index = xBuilder.create<arith::IndexCastOp>(
-                  xLoc, xBuilder.getIndexType(), sxPlus1);
-              Value syIndex = xBuilder.create<arith::IndexCastOp>(
-                  xLoc, xBuilder.getIndexType(), sy);
-              Value v0 = xBuilder.create<memref::LoadOp>(
-                  xLoc, input, ValueRange{syIndex, sxIndex});
-              v0 = xBuilder.create<arith::FPToSIOp>(xLoc, xBuilder.getI32Type(),
-                                                    v0);
-              Value v1 = xBuilder.create<memref::LoadOp>(
-                  xLoc, input, ValueRange{syIndex, sxPlus1Index});
-              v1 = xBuilder.create<arith::FPToSIOp>(xLoc, xBuilder.getI32Type(),
-                                                    v1);
-              Value a0I32 = xBuilder.create<arith::ExtSIOp>(
-                  xLoc, xBuilder.getI32Type(), a0);
-              Value a1I32 = xBuilder.create<arith::ExtSIOp>(
-                  xLoc, xBuilder.getI32Type(), a1);
-              Value v0MulA0 = xBuilder.create<arith::MulIOp>(xLoc, v0, a0I32);
-              Value v1MulA1 = xBuilder.create<arith::MulIOp>(xLoc, v1, a1I32);
-              Value add1 =
-                  xBuilder.create<arith::AddIOp>(xLoc, v0MulA0, v1MulA1);
-              Value half = xBuilder.create<arith::ConstantIntOp>(
-                  xLoc, HALF, xBuilder.getI32Type());
-              Value add2 = xBuilder.create<arith::AddIOp>(xLoc, add1, half);
-              Value shift = xBuilder.create<arith::ConstantIntOp>(
-                  xLoc, SHIFT, xBuilder.getI32Type());
-              Value resShifted =
-                  xBuilder.create<arith::ShRSIOp>(xLoc, add2, shift);
-              xBuilder.create<memref::StoreOp>(xLoc, resShifted, buffer,
-                                               ValueRange{xiv});
-              xBuilder.create<scf::YieldOp>(xLoc);
-            });
-        // syPrev = sy;
-        tBuilder.create<scf::YieldOp>(tLoc);
-      });
+  builder.create<
+      scf::IfOp>(loc, notEqual, [&](OpBuilder &tBuilder, Location tLoc) {
+    tBuilder.create<scf::ForOp>(
+        tLoc, c0, outputMultiple, strideVal, std::nullopt,
+        [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
+          Value maskVal =
+              xBuilder.create<arith::SubIOp>(xLoc, outputWidth, xiv);
+          Value maskVec =
+              xBuilder.create<vector::CreateMaskOp>(xLoc, vectorTyI1, maskVal);
+          Value sxVec = xBuilder.create<vector::MaskedLoadOp>(
+              xLoc, vectorTyI32, offset, ValueRange{xiv}, maskVec,
+              passThruConstantOpI32);
+          Value sxPlus1Vec =
+              xBuilder.create<arith::AddIOp>(xLoc, sxVec, c1IVec);
+          Value index0 = xBuilder.create<arith::MulIOp>(xLoc, xiv, c2);
+          Value index1 =
+              xBuilder.create<arith::AddIOp>(xLoc, index0, strideVal);
+          Value a0Vec = xBuilder.create<vector::LoadOp>(xLoc, vectorTyI16,
+                                                        iAlpha, index0);
+          Value a1Vec = xBuilder.create<vector::LoadOp>(xLoc, vectorTyI16,
+                                                        iAlpha, index1);
+          Value sxVecIndex =
+              xBuilder.create<arith::IndexCastOp>(xLoc, vectorTyIndex, sxVec);
+          Value sxPlus1VecIndex = xBuilder.create<arith::IndexCastOp>(
+              xLoc, vectorTyIndex, sxPlus1Vec);
+          Value syIndex = xBuilder.create<arith::IndexCastOp>(
+              xLoc, xBuilder.getIndexType(), sy);
+          Value v0Vec = xBuilder.create<vector::GatherOp>(
+              xLoc, vectorTyF32, input, ValueRange{syIndex, c0}, sxVecIndex,
+              c1I1Vec, passThruConstantOp);
+          v0Vec = xBuilder.create<arith::FPToSIOp>(xLoc, vectorTyI32, v0Vec);
+          Value v1Vec = xBuilder.create<vector::GatherOp>(
+              xLoc, vectorTyF32, input, ValueRange{syIndex, c0},
+              sxPlus1VecIndex, c1I1Vec, passThruConstantOp);
+          v1Vec = xBuilder.create<arith::FPToSIOp>(xLoc, vectorTyI32, v1Vec);
+
+          a0Vec = xBuilder.create<arith::ExtSIOp>(xLoc, vectorTyI32, a0Vec);
+          a1Vec = xBuilder.create<arith::ExtSIOp>(xLoc, vectorTyI32, a1Vec);
+          SmallVector<int64_t> maskVec1, maskVec2;
+          for (int i = 0; i < stride; i++) {
+            maskVec1.push_back(i * 2);
+            maskVec2.push_back(i * 2 + 1);
+          }
+          Value adder1 =
+              xBuilder.create<vector::ShuffleOp>(xLoc, a0Vec, a1Vec, maskVec1);
+          Value adder2 =
+              xBuilder.create<vector::ShuffleOp>(xLoc, a0Vec, a1Vec, maskVec2);
+          Value aMulv0 = xBuilder.create<arith::MulIOp>(xLoc, adder1, v0Vec);
+          Value aMulv1 = xBuilder.create<arith::MulIOp>(xLoc, adder2, v1Vec);
+          Value adder = xBuilder.create<arith::AddIOp>(xLoc, aMulv0, aMulv1);
+          adder = builder.create<arith::AddIOp>(loc, adder, halfVec);
+          Value resShifted =
+              xBuilder.create<arith::ShRSIOp>(xLoc, adder, shiftVec);
+          xBuilder.create<vector::MaskedStoreOp>(xLoc, buffer, ValueRange{xiv},
+                                                 maskVec, resShifted);
+          xBuilder.create<scf::YieldOp>(xLoc);
+        });
+    tBuilder.create<scf::YieldOp>(tLoc);
+  });
   syPrev = builder.create<arith::SelectOp>(loc, notEqual, sy, syPrev);
 }
 
 void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc,
                                       MLIRContext *ctx, Value input,
-                                      Value output,
+                                      Value output, int64_t stride,
                                       Value horizontalScalingFactor,
                                       Value verticalScalingFactor) {
   static const int SHIFT = 11;
   static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT;
   static const int HALF = 1 << (SHIFT - 1);
   auto inElemTy = input.getType().cast<MemRefType>().getElementType();
+  VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType());
+  VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type());
+  VectorType vectorTyI32 =
+      VectorType::get({stride}, IntegerType::get(builder.getContext(), 32));
+  VectorType vectorTyF = VectorType::get(
+      {stride}, inElemTy.isF32() ? FloatType::getF32(builder.getContext())
+                                 : FloatType::getF64(builder.getContext()));
+  auto passThruConstantOpI32 =
+      builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorTyI32));
   Value cMinus1 = builder.create<arith::ConstantIndexOp>(loc, -1);
   Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
   Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
-  Value c1I =
+  Value c1I32 =
       builder.create<arith::ConstantIntOp>(loc, 1, builder.getI32Type());
   Value c2 = builder.create<arith::ConstantIndexOp>(loc, 2);
-  Value c2I =
-      builder.create<arith::ConstantIntOp>(loc, 2, builder.getI32Type());
-  Value c1F = builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)1.0f,
-                                                     builder.getF32Type());
-
   Value inputRow = builder.create<memref::DimOp>(loc, input, c0);
   Value inputCol = builder.create<memref::DimOp>(loc, input, c1);
 
   Value outputRow = builder.create<memref::DimOp>(loc, output, c0);
   Value outputCol = builder.create<memref::DimOp>(loc, output, c1);
 
+  Value strideVal = builder.create<arith::ConstantIndexOp>(loc, stride);
+  Value outputColStrideRatio =
+      builder.create<arith::DivUIOp>(loc, outputCol, strideVal);
+  Value outputColMultiple = builder.create<arith::MulIOp>(
+      loc, builder.create<arith::AddIOp>(loc, outputColStrideRatio, c1),
+      strideVal);
+  Value outputColMultiple2 =
+      builder.create<arith::MulIOp>(loc, outputColMultiple, c2);
+  Value outputRowStrideRatio =
+      builder.create<arith::DivUIOp>(loc, outputRow, strideVal);
+  Value outputRowMultiple = builder.create<arith::MulIOp>(
+      loc, builder.create<arith::AddIOp>(loc, outputRowStrideRatio, c1),
+      strideVal);
+  Value outputRowMultiple2 =
+      builder.create<arith::MulIOp>(loc, outputRowMultiple, c2);
+
   MemRefType dynamicTypeI32 =
       MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 32));
   Value xOffset =
       builder.create<memref::AllocOp>(loc, dynamicTypeI32, outputCol);
   MemRefType dynamicTypeI16 =
       MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16));
-  Value outputColMul2 = builder.create<arith::MulIOp>(loc, outputCol, c2);
   Value iAlpha =
-      builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputColMul2);
+      builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputColMultiple2);
 
   processScaling(builder, loc, outputCol, horizontalScalingFactor, inputCol,
-                 xOffset, iAlpha);
+                 xOffset, iAlpha, stride);
 
   Value yOffset =
       builder.create<memref::AllocOp>(loc, dynamicTypeI32, outputRow);
-  Value outputRowMul2 = builder.create<arith::MulIOp>(loc, outputRow, c2);
   Value iBeta =
-      builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputRowMul2);
+      builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputRowMultiple2);
 
   processScaling(builder, loc, outputRow, verticalScalingFactor, inputRow,
-                 yOffset, iBeta);
+                 yOffset, iBeta, stride);
 
   Value bufferWidth = outputCol;
   Value buffer0 =
@@ -1266,6 +1354,19 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc,
   Value prevSy1 =
       builder.create<arith::IndexCastOp>(loc, builder.getI32Type(), cMinus1);
 
+  Value halfVec = builder.create<vector::SplatOp>(
+      loc, vectorTyI32,
+      builder.create<arith::ConstantIntOp>(loc, HALF, builder.getI32Type()));
+  Value shiftVec = builder.create<vector::SplatOp>(
+      loc, vectorTyI32,
+      builder.create<arith::ConstantIntOp>(loc, SHIFT, builder.getI32Type()));
+  Value zeroVec = builder.create<vector::SplatOp>(
+      loc, vectorTyI32,
+      builder.create<arith::ConstantIntOp>(loc, 0, builder.getI32Type()));
+  Value twoFiftyFiveVec = builder.create<vector::SplatOp>(
+      loc, vectorTyI32,
+      builder.create<arith::ConstantIntOp>(loc, 255, builder.getI32Type()));
+
   // builder.create<scf::ParallelOp>(
   //   loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{c1},
   //   [&](OpBuilder &yBuilder, Location yLoc, ValueRange ivs) {
@@ -1277,67 +1378,59 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc,
       [&](OpBuilder &yBuilder, Location yLoc, Value yiv, ValueRange) {
         Value sy =
             yBuilder.create<memref::LoadOp>(yLoc, yOffset, ValueRange{yiv});
-        Value syNext = yBuilder.create<arith::AddIOp>(yLoc, sy, c1I);
+        Value syNext = yBuilder.create<arith::AddIOp>(yLoc, sy, c1I32);
         calcInterpolation(yBuilder, yLoc, sy, prevSy0, xOffset, input,
-                          outputCol, iAlpha, buffer0);
+                          outputCol, iAlpha, buffer0, stride);
         calcInterpolation(yBuilder, yLoc, syNext, prevSy1, xOffset, input,
-                          outputCol, iAlpha, buffer1);
+                          outputCol, iAlpha, buffer1, stride);
 
+        //  calc index
         Value yMul2 = yBuilder.create<arith::MulIOp>(yLoc, yiv, c2);
         yMul2 = yBuilder.create<arith::IndexCastOp>(yLoc, yBuilder.getI32Type(),
                                                     yMul2);
-        Value yMul2Plus1 = yBuilder.create<arith::AddIOp>(yLoc, yMul2, c1I);
+        Value yMul2Plus1 = yBuilder.create<arith::AddIOp>(yLoc, yMul2, c1I32);
         Value index0 = yBuilder.create<arith::IndexCastOp>(
             yLoc, yBuilder.getIndexType(), yMul2);
         Value index1 = yBuilder.create<arith::IndexCastOp>(
             yLoc, yBuilder.getIndexType(), yMul2Plus1);
-        Value b0 =
-            yBuilder.create<memref::LoadOp>(yLoc, iBeta, ValueRange{index0});
+        Value b0 = yBuilder.create<memref::LoadOp>(yLoc, iBeta, index0);
         b0 = yBuilder.create<arith::ExtSIOp>(yLoc, yBuilder.getI32Type(), b0);
-        Value b1 =
-            yBuilder.create<memref::LoadOp>(yLoc, iBeta, ValueRange{index1});
+        Value b0Vec = yBuilder.create<vector::SplatOp>(yLoc, vectorTyI32, b0);
+        Value b1 = yBuilder.create<memref::LoadOp>(yLoc, iBeta, index1);
         b1 = yBuilder.create<arith::ExtSIOp>(yLoc, yBuilder.getI32Type(), b1);
-        // Value b0 = yBuilder.create<memref::LoadOp>(yLoc, iBeta,
-        // ValueRange{yiv}); b0 = yBuilder.create<arith::ExtSIOp>(yLoc,
-        // yBuilder.getI32Type(), b0); Value yPlus1 =
-        // builder.create<arith::AddIOp>(yLoc, yiv, c1); Value b1 =
-        // yBuilder.create<memref::LoadOp>(yLoc, iBeta, ValueRange{yPlus1}); b1
-        // = yBuilder.create<arith::ExtSIOp>(yLoc, yBuilder.getI32Type(), b1);
+        Value b1Vec = yBuilder.create<vector::SplatOp>(yLoc, vectorTyI32, b1);
+
         yBuilder.create<scf::ForOp>(
-            yLoc, c0, bufferWidth, c1, std::nullopt,
+            yLoc, c0, bufferWidth, strideVal, std::nullopt,
             [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
-              Value buffer0X = xBuilder.create<memref::LoadOp>(xLoc, buffer0,
-                                                               ValueRange{xiv});
-              Value buffer1X = xBuilder.create<memref::LoadOp>(xLoc, buffer1,
-                                                               ValueRange{xiv});
+              Value maskVal =
+                  xBuilder.create<arith::SubIOp>(xLoc, bufferWidth, xiv);
+              Value maskVec = xBuilder.create<vector::CreateMaskOp>(
+                  xLoc, vectorTyI1, maskVal);
+              Value buffer0X = xBuilder.create<vector::MaskedLoadOp>(
+                  xLoc, vectorTyI32, buffer0, ValueRange{xiv}, maskVec,
+                  passThruConstantOpI32);
+              Value buffer1X = xBuilder.create<vector::MaskedLoadOp>(
+                  xLoc, vectorTyI32, buffer1, ValueRange{xiv}, maskVec,
+                  passThruConstantOpI32);
               Value b0MulBuffer0 =
-                  xBuilder.create<arith::MulIOp>(xLoc, b0, buffer0X);
+                  xBuilder.create<arith::MulIOp>(xLoc, b0Vec, buffer0X);
               Value b1MulBuffer1 =
-                  xBuilder.create<arith::MulIOp>(xLoc, b1, buffer1X);
+                  xBuilder.create<arith::MulIOp>(xLoc, b1Vec, buffer1X);
               Value add = xBuilder.create<arith::AddIOp>(xLoc, b0MulBuffer0,
                                                          b1MulBuffer1);
-              Value half = xBuilder.create<arith::ConstantIntOp>(
-                  xLoc, HALF, xBuilder.getI32Type());
-              Value addHalf = xBuilder.create<arith::AddIOp>(xLoc, add, half);
-              Value shift = xBuilder.create<arith::ConstantIntOp>(
-                  xLoc, SHIFT, xBuilder.getI32Type());
+              Value addHalf =
+                  xBuilder.create<arith::AddIOp>(xLoc, add, halfVec);
               Value resShifted =
-                  xBuilder.create<arith::ShRSIOp>(xLoc, addHalf, shift);
-              Value zero = xBuilder.create<arith::ConstantIntOp>(
-                  xLoc, 0, xBuilder.getI32Type());
-              Value twoFiftyFive = xBuilder.create<arith::ConstantIntOp>(
-                  xLoc, 255, xBuilder.getI32Type());
+                  xBuilder.create<arith::ShRSIOp>(xLoc, addHalf, shiftVec);
               Value maxVal =
-                  xBuilder.create<arith::MaxSIOp>(xLoc, resShifted, zero);
-              Value clampedVal =
-                  xBuilder.create<arith::MinSIOp>(xLoc, maxVal, twoFiftyFive);
-              FloatType type = inElemTy.isF32()
-                                   ? FloatType::getF32(xBuilder.getContext())
-                                   : FloatType::getF64(xBuilder.getContext());
+                  xBuilder.create<arith::MaxSIOp>(xLoc, resShifted, zeroVec);
+              Value clampedVal = xBuilder.create<arith::MinSIOp>(
+                  xLoc, maxVal, twoFiftyFiveVec);
               Value clampedValF =
-                  xBuilder.create<arith::SIToFPOp>(xLoc, type, clampedVal);
-              xBuilder.create<memref::StoreOp>(xLoc, clampedValF, output,
-                                               ValueRange{yiv, xiv});
+                  xBuilder.create<arith::SIToFPOp>(xLoc, vectorTyF, clampedVal);
+              xBuilder.create<vector::MaskedStoreOp>(
+                  xLoc, output, ValueRange{yiv, xiv}, maskVec, clampedValF);
               xBuilder.create<scf::YieldOp>(xLoc);
             });
         yBuilder.create<scf::YieldOp>(yLoc);

From d220a8ed64f47bdd19896608534366f82eb2eb9e Mon Sep 17 00:00:00 2001
From: HarryZ <uqbarz@gmail.com>
Date: Fri, 7 Mar 2025 19:54:59 +0800
Subject: [PATCH 3/7] [feat] parallel resize2d op

---
 .../lib/Conversion/LowerDIP/LowerDIPPass.cpp  | 108 ++---
 midend/lib/Utils/DIPUtils.cpp                 | 440 ++++++++----------
 2 files changed, 233 insertions(+), 315 deletions(-)

diff --git a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
index ef7fc9b9b0..3cd65126de 100644
--- a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
+++ b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
@@ -314,85 +314,45 @@ class DIPResize2DOpLowering : public OpRewritePattern<dip::Resize2DOp> {
                                << inElemTy << "is passed";
     }
 
-    Value c0 = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-    Value c1 = rewriter.create<arith::ConstantIndexOp>(loc, 1);
-    Value c0F32 = indexToF32(rewriter, loc, c0);
-
-    Value inputRow = rewriter.create<memref::DimOp>(loc, input, c0);
-    Value inputCol = rewriter.create<memref::DimOp>(loc, input, c1);
-
-    Value outputRow = rewriter.create<memref::DimOp>(loc, output, c0);
-    Value outputCol = rewriter.create<memref::DimOp>(loc, output, c1);
-
-    // Determine lower bound for second call of resize function (this is done
-    // for efficient tail processing).
-    Value outputColStrideRatio =
-        rewriter.create<arith::DivUIOp>(loc, outputCol, strideVal);
-    Value outputColMultiple =
-        rewriter.create<arith::MulIOp>(loc, strideVal, outputColStrideRatio);
-
-    SmallVector<Value, 8> lowerBounds1{c0, c0};
-    SmallVector<Value, 8> upperBounds1{outputRow, outputColMultiple};
-
-    SmallVector<int64_t, 8> steps{1, stride};
-    Value strideTailVal =
-        rewriter.create<arith::SubIOp>(loc, outputCol, outputColMultiple);
-
-    SmallVector<Value, 8> lowerBounds2{c0, outputColMultiple};
-    SmallVector<Value, 8> upperBounds2{outputRow, outputCol};
-
-    FloatType f32 = FloatType::getF32(ctx);
-    VectorType vectorTy32 = VectorType::get({stride}, f32);
-
-    Value horizontalScalingFactorVec = rewriter.create<vector::SplatOp>(
-        loc, vectorTy32, horizontalScalingFactor);
-    Value verticalScalingFactorVec = rewriter.create<vector::SplatOp>(
-        loc, vectorTy32, verticalScalingFactor);
-
-    // Obtain extreme allocatable value(s) in input and output for bounding
-    // purpose.
-    Value inputRowLastElem = rewriter.create<arith::SubIOp>(loc, inputRow, c1);
-    Value inputRowLastElemF32 = indexToF32(rewriter, loc, inputRowLastElem);
-
-    Value inputColLastElem = rewriter.create<arith::SubIOp>(loc, inputCol, c1);
-    Value inputColLastElemF32 = indexToF32(rewriter, loc, inputColLastElem);
-
-    Value outputRowLastElem =
-        rewriter.create<arith::SubIOp>(loc, outputRow, c1);
-    Value outputRowLastElemF32 = indexToF32(rewriter, loc, outputRowLastElem);
-
-    Value outputColLastElem =
-        rewriter.create<arith::SubIOp>(loc, outputCol, c1);
-    Value outputColLastElemF32 = indexToF32(rewriter, loc, outputColLastElem);
+    VectorType vectorTyI1 = VectorType::get({stride}, rewriter.getI1Type());
+    VectorType vectorTyI16 =
+        VectorType::get({stride}, IntegerType::get(rewriter.getContext(), 16));
+    VectorType vectorTyI32 =
+        VectorType::get({stride}, IntegerType::get(rewriter.getContext(), 32));
+    VectorType vectorTyF32 =
+        VectorType::get({stride}, FloatType::getF32(rewriter.getContext()));
+    VectorType vectorTyIndex =
+        VectorType::get({stride}, rewriter.getIndexType());
+    VectorType vectorResTy = VectorType::get(
+        {stride}, inElemTy.isF32() ? FloatType::getF32(rewriter.getContext())
+                                   : FloatType::getF64(rewriter.getContext()));
+
+    static const int SHIFT = 11;
+    static const int HALF = 1 << (SHIFT - 1);
+    static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT;
+    Value half =
+        rewriter.create<arith::ConstantIntOp>(loc, HALF, rewriter.getI32Type());
+    Value halfVec = rewriter.create<vector::SplatOp>(loc, vectorTyI32, half);
+    Value shift = rewriter.create<arith::ConstantIntOp>(loc, SHIFT,
+                                                        rewriter.getI32Type());
+    Value shiftVec = rewriter.create<vector::SplatOp>(loc, vectorTyI32, shift);
+    Value scaleVec = rewriter.create<vector::SplatOp>(
+        loc, vectorTyF32,
+        rewriter.create<arith::ConstantFloatOp>(
+            loc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE,
+            rewriter.getF32Type()));
 
     if (interpolationAttr ==
         dip::InterpolationType::NearestNeighbourInterpolation) {
-      dip::NearestNeighbourInterpolationResizing(
-          rewriter, loc, ctx, lowerBounds1, upperBounds1, steps, strideVal,
-          input, output, horizontalScalingFactorVec, verticalScalingFactorVec,
-          outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32,
-          inputColLastElemF32, vectorTy32, stride, c0, c0F32);
-
-      dip::NearestNeighbourInterpolationResizing(
-          rewriter, loc, ctx, lowerBounds2, upperBounds2, steps, strideTailVal,
-          input, output, horizontalScalingFactorVec, verticalScalingFactorVec,
-          outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32,
-          inputColLastElemF32, vectorTy32, stride, c0, c0F32);
+      dip::NearestNeighbourInterpolationResizingNew(
+          rewriter, loc, ctx, input, output, horizontalScalingFactor,
+          verticalScalingFactor);
     } else if (interpolationAttr ==
                dip::InterpolationType::BilinearInterpolation) {
-      Value c1F32 = indexToF32(rewriter, loc, c1);
-
-      dip::BilinearInterpolationResizing(
-          rewriter, loc, ctx, lowerBounds1, upperBounds1, steps, strideVal,
-          input, output, horizontalScalingFactorVec, verticalScalingFactorVec,
-          outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32,
-          inputColLastElemF32, vectorTy32, stride, c0, c0F32, c1F32);
-
-      dip::BilinearInterpolationResizing(
-          rewriter, loc, ctx, lowerBounds2, upperBounds2, steps, strideTailVal,
-          input, output, horizontalScalingFactorVec, verticalScalingFactorVec,
-          outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32,
-          inputColLastElemF32, vectorTy32, stride, c0, c0F32, c1F32);
+      dip::BilinearInterpolationResizingNew(
+          rewriter, loc, ctx, input, output, stride, horizontalScalingFactor,
+          verticalScalingFactor, halfVec, shiftVec, scaleVec, vectorResTy,
+          vectorTyI32, vectorTyI16, vectorTyIndex, vectorTyF32, vectorTyI1);
     }
 
     // Remove the original resize operation.
diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp
index ae0c6da7c7..256624055f 100644
--- a/midend/lib/Utils/DIPUtils.cpp
+++ b/midend/lib/Utils/DIPUtils.cpp
@@ -1056,20 +1056,9 @@ void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
 
 void processScaling(OpBuilder &builder, Location loc, Value output,
                     Value scalingFactor, Value input, Value xOffset,
-                    Value iAlpha, int64_t stride) {
-  static const int SHIFT = 11;
-  static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT;
-  static const int HALF = 1 << (SHIFT - 1);
-
-  VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType());
-  VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type());
-  VectorType vectorTyI16 =
-      VectorType::get({stride}, IntegerType::get(builder.getContext(), 16));
-  VectorType vectorTyI32 =
-      VectorType::get({stride}, IntegerType::get(builder.getContext(), 32));
-  VectorType vectorTyF32 =
-      VectorType::get({stride}, FloatType::getF32(builder.getContext()));
-
+                    Value iAlpha, int64_t stride, Value scaleVec,
+                    VectorType vectorTyI32, VectorType vectorTyF32,
+                    VectorType vectorTyI1, VectorType vectorTyI16) {
   Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
   Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
   Value c2 = builder.create<arith::ConstantIndexOp>(loc, 2);
@@ -1080,15 +1069,17 @@ void processScaling(OpBuilder &builder, Location loc, Value output,
       loc, vectorTyF32,
       builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)0.0f,
                                              builder.getF32Type()));
+  Value c1FVec = builder.create<vector::SplatOp>(
+      loc, vectorTyF32,
+      builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)1.0f,
+                                             builder.getF32Type()));
   Value cDot5FVec = builder.create<vector::SplatOp>(
       loc, vectorTyF32,
       builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)0.5f,
                                              builder.getF32Type()));
   Value scalingFactorVec =
       builder.create<vector::SplatOp>(loc, vectorTyF32, scalingFactor);
-  Value c1F = builder.create<arith::ConstantFloatOp>(loc, (llvm::APFloat)1.0f,
-                                                     builder.getF32Type());
-  Value c1FVec = builder.create<vector::SplatOp>(loc, vectorTyF32, c1F);
+
   Value inputMinus1 = builder.create<arith::IndexCastUIOp>(
       loc, builder.getI32Type(), builder.create<arith::SubIOp>(loc, input, c1));
   Value inputMinus1Vec =
@@ -1099,11 +1090,7 @@ void processScaling(OpBuilder &builder, Location loc, Value output,
       builder.create<vector::SplatOp>(loc, vectorTyI32, inputMinus2);
   Value stepVec = iotaVec0F32(builder, loc, stride);
   Value strideVal = builder.create<arith::ConstantIndexOp>(loc, stride);
-  Value scaleCoefVec = builder.create<vector::SplatOp>(
-      loc, vectorTyF32,
-      builder.create<arith::ConstantFloatOp>(
-          loc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE,
-          builder.getF32Type()));
+
   builder.create<scf::ForOp>(
       loc, c0, output, strideVal, std::nullopt,
       [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
@@ -1129,35 +1116,32 @@ void processScaling(OpBuilder &builder, Location loc, Value output,
                                               sx);
         fx = xBuilder.create<arith::SelectOp>(xLoc, greaterThan, c0FVec, fx);
         Value maskVal = xBuilder.create<arith::SubIOp>(xLoc, output, xiv);
-        Value maskVec =
-            xBuilder.create<vector::CreateMaskOp>(xLoc, vectorTyI1, maskVal);
-        xBuilder.create<vector::MaskedStoreOp>(xLoc, xOffset, ValueRange{xiv},
-                                               maskVec, sx);
+        xBuilder.create<vector::MaskedStoreOp>(
+            xLoc, xOffset, ValueRange{xiv},
+            xBuilder.create<vector::CreateMaskOp>(xLoc, vectorTyI1, maskVal),
+            sx);
 
         //  ialpha[dx * 2]     = (short)((1.f - fx) * INTER_RESIZE_COEF_SCALE);
         //  ialpha[dx * 2 + 1] = (short)(fx * INTER_RESIZE_COEF_SCALE);
-        Value fxScale = xBuilder.create<arith::MulFOp>(xLoc, fx, scaleCoefVec);
+        Value fxScale = xBuilder.create<arith::MulFOp>(xLoc, fx, scaleVec);
         Value oneMinusFx = xBuilder.create<arith::SubFOp>(xLoc, c1FVec, fx);
         Value oneMinusFxScale =
-            xBuilder.create<arith::MulFOp>(xLoc, oneMinusFx, scaleCoefVec);
-
-        Value index0 = xBuilder.create<arith::MulIOp>(xLoc, xiv, c2);
-        Value index1 = xBuilder.create<arith::AddIOp>(xLoc, index0, c1);
+            xBuilder.create<arith::MulFOp>(xLoc, oneMinusFx, scaleVec);
 
         Value val0 = xBuilder.create<arith::FPToSIOp>(xLoc, vectorTyI16,
                                                       oneMinusFxScale);
         Value val1 =
             xBuilder.create<arith::FPToSIOp>(xLoc, vectorTyI16, fxScale);
 
-        SmallVector<int64_t> maskVec1;
+        SmallVector<int64_t> maskVec;
         for (int i = 0; i < stride; i++) {
-          maskVec1.push_back(i);
-          maskVec1.push_back(i + stride);
+          maskVec.push_back(i);
+          maskVec.push_back(i + stride);
         }
-        Value storeBack =
-            xBuilder.create<vector::ShuffleOp>(xLoc, val0, val1, maskVec1);
-        xBuilder.create<vector::StoreOp>(xLoc, storeBack, iAlpha,
-                                         ValueRange{index0});
+        Value index0 = xBuilder.create<arith::MulIOp>(xLoc, xiv, c2);
+        xBuilder.create<vector::StoreOp>(
+            xLoc, xBuilder.create<vector::ShuffleOp>(xLoc, val0, val1, maskVec),
+            iAlpha, ValueRange{index0});
         xBuilder.create<scf::YieldOp>(xLoc);
       });
 }
@@ -1165,136 +1149,104 @@ void processScaling(OpBuilder &builder, Location loc, Value output,
 void calcInterpolation(OpBuilder &builder, Location loc, Value &sy,
                        Value &syPrev, Value offset, Value input,
                        Value outputWidth, Value iAlpha, Value buffer,
-                       int64_t stride) {
-  static const int SHIFT = 11;
-  static const int HALF = 1 << (SHIFT - 1);
-  auto inElemTy = input.getType().cast<MemRefType>().getElementType();
+                       int64_t stride, Value halfVec, Value shiftVec,
+                       VectorType vectorResTy, VectorType vectorTyIndex,
+                       VectorType vectorTyI32, VectorType vectorTyI16,
+                       VectorType vectorTyI1) {
   Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
   Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
-  Value c1I1 =
-      builder.create<arith::ConstantIntOp>(loc, 1, builder.getI1Type());
-  Value c0I =
-      builder.create<arith::ConstantIntOp>(loc, 0, builder.getI32Type());
-  Value c1I =
-      builder.create<arith::ConstantIntOp>(loc, 1, builder.getI32Type());
   Value c2 = builder.create<arith::ConstantIndexOp>(loc, 2);
-  Value c2I =
-      builder.create<arith::ConstantIntOp>(loc, 2, builder.getI32Type());
 
   Value strideVal = builder.create<arith::ConstantIndexOp>(loc, stride);
-  Value strideInt =
-      builder.create<arith::IndexCastOp>(loc, builder.getI32Type(), strideVal);
-  Value outputStrideRatio =
-      builder.create<arith::DivUIOp>(loc, outputWidth, strideVal);
-  Value outputMultiple = builder.create<arith::MulIOp>(
-      loc, builder.create<arith::AddIOp>(loc, outputStrideRatio, c1),
-      strideVal);
-  VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType());
-  VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type());
-  VectorType vectorTyI16 =
-      VectorType::get({stride}, IntegerType::get(builder.getContext(), 16));
-  VectorType vectorTyI32 =
-      VectorType::get({stride}, IntegerType::get(builder.getContext(), 32));
-  VectorType vectorTyF32 =
-      VectorType::get({stride}, FloatType::getF32(builder.getContext()));
-
-  Value c1I1Vec = builder.create<vector::SplatOp>(loc, vectorTyI1, c1I1);
-  Value c1IVec = builder.create<vector::SplatOp>(loc, vectorTyI32, c1I);
-  Value c2IVec = builder.create<vector::SplatOp>(loc, vectorTyI32, c2I);
-  Value half =
-      builder.create<arith::ConstantIntOp>(loc, HALF, builder.getI32Type());
-  Value halfVec = builder.create<vector::SplatOp>(loc, vectorTyI32, half);
-  Value shift =
-      builder.create<arith::ConstantIntOp>(loc, SHIFT, builder.getI32Type());
-  Value shiftVec = builder.create<vector::SplatOp>(loc, vectorTyI32, shift);
-
-  auto passThruConstantOp =
-      builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorTyF32));
-  auto passThruConstantOpI32 =
+
+  Value c1I1Vec = builder.create<vector::SplatOp>(
+      loc, vectorTyI1,
+      builder.create<arith::ConstantIntOp>(loc, 1, builder.getI1Type()));
+  Value c1Vec = builder.create<vector::SplatOp>(loc, vectorTyIndex, c1);
+
+  auto passThruRes =
+      builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorResTy));
+  auto passThruI32 =
       builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorTyI32));
-  auto passThruConstantOpI16 =
-      builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorTyI16));
   Value notEqual =
       builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ne, sy, syPrev);
-  builder.create<
-      scf::IfOp>(loc, notEqual, [&](OpBuilder &tBuilder, Location tLoc) {
-    tBuilder.create<scf::ForOp>(
-        tLoc, c0, outputMultiple, strideVal, std::nullopt,
-        [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
-          Value maskVal =
-              xBuilder.create<arith::SubIOp>(xLoc, outputWidth, xiv);
-          Value maskVec =
-              xBuilder.create<vector::CreateMaskOp>(xLoc, vectorTyI1, maskVal);
-          Value sxVec = xBuilder.create<vector::MaskedLoadOp>(
-              xLoc, vectorTyI32, offset, ValueRange{xiv}, maskVec,
-              passThruConstantOpI32);
-          Value sxPlus1Vec =
-              xBuilder.create<arith::AddIOp>(xLoc, sxVec, c1IVec);
-          Value index0 = xBuilder.create<arith::MulIOp>(xLoc, xiv, c2);
-          Value index1 =
-              xBuilder.create<arith::AddIOp>(xLoc, index0, strideVal);
-          Value a0Vec = xBuilder.create<vector::LoadOp>(xLoc, vectorTyI16,
-                                                        iAlpha, index0);
-          Value a1Vec = xBuilder.create<vector::LoadOp>(xLoc, vectorTyI16,
-                                                        iAlpha, index1);
-          Value sxVecIndex =
-              xBuilder.create<arith::IndexCastOp>(xLoc, vectorTyIndex, sxVec);
-          Value sxPlus1VecIndex = xBuilder.create<arith::IndexCastOp>(
-              xLoc, vectorTyIndex, sxPlus1Vec);
-          Value syIndex = xBuilder.create<arith::IndexCastOp>(
-              xLoc, xBuilder.getIndexType(), sy);
-          Value v0Vec = xBuilder.create<vector::GatherOp>(
-              xLoc, vectorTyF32, input, ValueRange{syIndex, c0}, sxVecIndex,
-              c1I1Vec, passThruConstantOp);
-          v0Vec = xBuilder.create<arith::FPToSIOp>(xLoc, vectorTyI32, v0Vec);
-          Value v1Vec = xBuilder.create<vector::GatherOp>(
-              xLoc, vectorTyF32, input, ValueRange{syIndex, c0},
-              sxPlus1VecIndex, c1I1Vec, passThruConstantOp);
-          v1Vec = xBuilder.create<arith::FPToSIOp>(xLoc, vectorTyI32, v1Vec);
-
-          a0Vec = xBuilder.create<arith::ExtSIOp>(xLoc, vectorTyI32, a0Vec);
-          a1Vec = xBuilder.create<arith::ExtSIOp>(xLoc, vectorTyI32, a1Vec);
-          SmallVector<int64_t> maskVec1, maskVec2;
-          for (int i = 0; i < stride; i++) {
-            maskVec1.push_back(i * 2);
-            maskVec2.push_back(i * 2 + 1);
-          }
-          Value adder1 =
-              xBuilder.create<vector::ShuffleOp>(xLoc, a0Vec, a1Vec, maskVec1);
-          Value adder2 =
-              xBuilder.create<vector::ShuffleOp>(xLoc, a0Vec, a1Vec, maskVec2);
-          Value aMulv0 = xBuilder.create<arith::MulIOp>(xLoc, adder1, v0Vec);
-          Value aMulv1 = xBuilder.create<arith::MulIOp>(xLoc, adder2, v1Vec);
-          Value adder = xBuilder.create<arith::AddIOp>(xLoc, aMulv0, aMulv1);
-          adder = builder.create<arith::AddIOp>(loc, adder, halfVec);
-          Value resShifted =
-              xBuilder.create<arith::ShRSIOp>(xLoc, adder, shiftVec);
-          xBuilder.create<vector::MaskedStoreOp>(xLoc, buffer, ValueRange{xiv},
-                                                 maskVec, resShifted);
-          xBuilder.create<scf::YieldOp>(xLoc);
-        });
-    tBuilder.create<scf::YieldOp>(tLoc);
-  });
+  builder.create<scf::IfOp>(
+      loc, notEqual, [&](OpBuilder &tBuilder, Location tLoc) {
+        tBuilder.create<scf::ForOp>(
+            tLoc, c0, outputWidth, strideVal, std::nullopt,
+            [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
+              Value maskVal =
+                  xBuilder.create<arith::SubIOp>(xLoc, outputWidth, xiv);
+              Value maskVec = xBuilder.create<vector::CreateMaskOp>(
+                  xLoc, vectorTyI1, maskVal);
+              Value sxVec = xBuilder.create<arith::IndexCastOp>(
+                  xLoc, vectorTyIndex,
+                  xBuilder.create<vector::MaskedLoadOp>(xLoc, vectorTyI32,
+                                                        offset, ValueRange{xiv},
+                                                        maskVec, passThruI32));
+              Value sxPlus1Vec =
+                  xBuilder.create<arith::AddIOp>(xLoc, sxVec, c1Vec);
+
+              Value index0 = xBuilder.create<arith::MulIOp>(xLoc, xiv, c2);
+              Value index1 =
+                  xBuilder.create<arith::AddIOp>(xLoc, index0, strideVal);
+              Value a0Vec = xBuilder.create<arith::ExtSIOp>(
+                  xLoc, vectorTyI32,
+                  xBuilder.create<vector::LoadOp>(xLoc, vectorTyI16, iAlpha,
+                                                  index0));
+              Value a1Vec = xBuilder.create<arith::ExtSIOp>(
+                  xLoc, vectorTyI32,
+                  xBuilder.create<vector::LoadOp>(xLoc, vectorTyI16, iAlpha,
+                                                  index1));
+
+              Value syIndex = xBuilder.create<arith::IndexCastOp>(
+                  xLoc, xBuilder.getIndexType(), sy);
+              Value v0Vec = xBuilder.create<arith::FPToSIOp>(
+                  xLoc, vectorTyI32,
+                  xBuilder.create<vector::GatherOp>(
+                      xLoc, vectorResTy, input, ValueRange{syIndex, c0}, sxVec,
+                      c1I1Vec, passThruRes));
+              Value v1Vec = xBuilder.create<arith::FPToSIOp>(
+                  xLoc, vectorTyI32,
+                  xBuilder.create<vector::GatherOp>(
+                      xLoc, vectorResTy, input, ValueRange{syIndex, c0},
+                      sxPlus1Vec, c1I1Vec, passThruRes));
+
+              SmallVector<int64_t> maskVec1, maskVec2;
+              for (int i = 0; i < stride; i++) {
+                maskVec1.push_back(i * 2);
+                maskVec2.push_back(i * 2 + 1);
+              }
+              Value a0ShuffleVec = xBuilder.create<vector::ShuffleOp>(
+                  xLoc, a0Vec, a1Vec, maskVec1);
+              Value a1ShuffleVec = xBuilder.create<vector::ShuffleOp>(
+                  xLoc, a0Vec, a1Vec, maskVec2);
+              Value aMulv0 =
+                  xBuilder.create<arith::MulIOp>(xLoc, a0ShuffleVec, v0Vec);
+              Value aMulv1 =
+                  xBuilder.create<arith::MulIOp>(xLoc, a1ShuffleVec, v1Vec);
+              Value addRes =
+                  xBuilder.create<arith::AddIOp>(xLoc, aMulv0, aMulv1);
+              addRes = builder.create<arith::AddIOp>(loc, addRes, halfVec);
+              Value resShifted =
+                  xBuilder.create<arith::ShRSIOp>(xLoc, addRes, shiftVec);
+              xBuilder.create<vector::MaskedStoreOp>(
+                  xLoc, buffer, ValueRange{xiv}, maskVec, resShifted);
+              xBuilder.create<scf::YieldOp>(xLoc);
+            });
+        tBuilder.create<scf::YieldOp>(tLoc);
+      });
   syPrev = builder.create<arith::SelectOp>(loc, notEqual, sy, syPrev);
 }
 
-void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc,
-                                      MLIRContext *ctx, Value input,
-                                      Value output, int64_t stride,
-                                      Value horizontalScalingFactor,
-                                      Value verticalScalingFactor) {
-  static const int SHIFT = 11;
-  static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT;
-  static const int HALF = 1 << (SHIFT - 1);
-  auto inElemTy = input.getType().cast<MemRefType>().getElementType();
-  VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType());
-  VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type());
-  VectorType vectorTyI32 =
-      VectorType::get({stride}, IntegerType::get(builder.getContext(), 32));
-  VectorType vectorTyF = VectorType::get(
-      {stride}, inElemTy.isF32() ? FloatType::getF32(builder.getContext())
-                                 : FloatType::getF64(builder.getContext()));
-  auto passThruConstantOpI32 =
+void BilinearInterpolationResizingNew(
+    OpBuilder &builder, Location loc, MLIRContext *ctx, Value input,
+    Value output, int64_t stride, Value horizontalScalingFactor,
+    Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec,
+    VectorType vectorResTy, VectorType vectorTyI32, VectorType vectorTyI16,
+    VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1) {
+
+  auto passThruI32 =
       builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorTyI32));
   Value cMinus1 = builder.create<arith::ConstantIndexOp>(loc, -1);
   Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
@@ -1302,9 +1254,9 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc,
   Value c1I32 =
       builder.create<arith::ConstantIntOp>(loc, 1, builder.getI32Type());
   Value c2 = builder.create<arith::ConstantIndexOp>(loc, 2);
+
   Value inputRow = builder.create<memref::DimOp>(loc, input, c0);
   Value inputCol = builder.create<memref::DimOp>(loc, input, c1);
-
   Value outputRow = builder.create<memref::DimOp>(loc, output, c0);
   Value outputCol = builder.create<memref::DimOp>(loc, output, c1);
 
@@ -1334,7 +1286,8 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc,
       builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputColMultiple2);
 
   processScaling(builder, loc, outputCol, horizontalScalingFactor, inputCol,
-                 xOffset, iAlpha, stride);
+                 xOffset, iAlpha, stride, scaleVec, vectorTyI32, vectorTyF32,
+                 vectorTyI1, vectorTyI16);
 
   Value yOffset =
       builder.create<memref::AllocOp>(loc, dynamicTypeI32, outputRow);
@@ -1342,24 +1295,11 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc,
       builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputRowMultiple2);
 
   processScaling(builder, loc, outputRow, verticalScalingFactor, inputRow,
-                 yOffset, iBeta, stride);
+                 yOffset, iBeta, stride, scaleVec, vectorTyI32, vectorTyF32,
+                 vectorTyI1, vectorTyI16);
 
   Value bufferWidth = outputCol;
-  Value buffer0 =
-      builder.create<memref::AllocOp>(loc, dynamicTypeI32, bufferWidth);
-  Value buffer1 =
-      builder.create<memref::AllocOp>(loc, dynamicTypeI32, bufferWidth);
-  Value prevSy0 =
-      builder.create<arith::IndexCastOp>(loc, builder.getI32Type(), cMinus1);
-  Value prevSy1 =
-      builder.create<arith::IndexCastOp>(loc, builder.getI32Type(), cMinus1);
-
-  Value halfVec = builder.create<vector::SplatOp>(
-      loc, vectorTyI32,
-      builder.create<arith::ConstantIntOp>(loc, HALF, builder.getI32Type()));
-  Value shiftVec = builder.create<vector::SplatOp>(
-      loc, vectorTyI32,
-      builder.create<arith::ConstantIntOp>(loc, SHIFT, builder.getI32Type()));
+
   Value zeroVec = builder.create<vector::SplatOp>(
       loc, vectorTyI32,
       builder.create<arith::ConstantIntOp>(loc, 0, builder.getI32Type()));
@@ -1367,74 +1307,92 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc,
       loc, vectorTyI32,
       builder.create<arith::ConstantIntOp>(loc, 255, builder.getI32Type()));
 
-  // builder.create<scf::ParallelOp>(
-  //   loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{c1},
-  //   [&](OpBuilder &yBuilder, Location yLoc, ValueRange ivs) {
-  //     Value yiv = ivs[0];
-  //   }
-  // );
-  builder.create<scf::ForOp>(
-      loc, c0, outputRow, c1, std::nullopt,
-      [&](OpBuilder &yBuilder, Location yLoc, Value yiv, ValueRange) {
-        Value sy =
-            yBuilder.create<memref::LoadOp>(yLoc, yOffset, ValueRange{yiv});
-        Value syNext = yBuilder.create<arith::AddIOp>(yLoc, sy, c1I32);
-        calcInterpolation(yBuilder, yLoc, sy, prevSy0, xOffset, input,
-                          outputCol, iAlpha, buffer0, stride);
-        calcInterpolation(yBuilder, yLoc, syNext, prevSy1, xOffset, input,
-                          outputCol, iAlpha, buffer1, stride);
-
-        //  calc index
-        Value yMul2 = yBuilder.create<arith::MulIOp>(yLoc, yiv, c2);
-        yMul2 = yBuilder.create<arith::IndexCastOp>(yLoc, yBuilder.getI32Type(),
-                                                    yMul2);
-        Value yMul2Plus1 = yBuilder.create<arith::AddIOp>(yLoc, yMul2, c1I32);
-        Value index0 = yBuilder.create<arith::IndexCastOp>(
-            yLoc, yBuilder.getIndexType(), yMul2);
-        Value index1 = yBuilder.create<arith::IndexCastOp>(
-            yLoc, yBuilder.getIndexType(), yMul2Plus1);
-        Value b0 = yBuilder.create<memref::LoadOp>(yLoc, iBeta, index0);
-        b0 = yBuilder.create<arith::ExtSIOp>(yLoc, yBuilder.getI32Type(), b0);
-        Value b0Vec = yBuilder.create<vector::SplatOp>(yLoc, vectorTyI32, b0);
-        Value b1 = yBuilder.create<memref::LoadOp>(yLoc, iBeta, index1);
-        b1 = yBuilder.create<arith::ExtSIOp>(yLoc, yBuilder.getI32Type(), b1);
-        Value b1Vec = yBuilder.create<vector::SplatOp>(yLoc, vectorTyI32, b1);
+  auto resizeLoop = [&](OpBuilder &yBuilder, Location yLoc, Value yiv,
+                        ValueRange) {
+    Value buffer0 =
+        yBuilder.create<memref::AllocOp>(loc, dynamicTypeI32, bufferWidth);
+    Value buffer1 =
+        yBuilder.create<memref::AllocOp>(loc, dynamicTypeI32, bufferWidth);
+    Value prevSy0 = yBuilder.create<arith::IndexCastOp>(
+        loc, yBuilder.getI32Type(), cMinus1);
+    Value prevSy1 = yBuilder.create<arith::IndexCastOp>(
+        loc, yBuilder.getI32Type(), cMinus1);
+
+    Value sy = yBuilder.create<memref::LoadOp>(yLoc, yOffset, ValueRange{yiv});
+    Value syNext = yBuilder.create<arith::AddIOp>(yLoc, sy, c1I32);
+    calcInterpolation(yBuilder, yLoc, sy, prevSy0, xOffset, input, outputCol,
+                      iAlpha, buffer0, stride, halfVec, shiftVec, vectorResTy,
+                      vectorTyIndex, vectorTyI32, vectorTyI16, vectorTyI1);
+    calcInterpolation(yBuilder, yLoc, syNext, prevSy1, xOffset, input,
+                      outputCol, iAlpha, buffer1, stride, halfVec, shiftVec,
+                      vectorResTy, vectorTyIndex, vectorTyI32, vectorTyI16,
+                      vectorTyI1);
+
+    Value index0 = yBuilder.create<arith::MulIOp>(yLoc, yiv, c2);
+    Value index1 = yBuilder.create<arith::AddIOp>(yLoc, index0, c1);
+    Value b0 = yBuilder.create<arith::ExtSIOp>(
+        yLoc, yBuilder.getI32Type(),
+        yBuilder.create<memref::LoadOp>(yLoc, iBeta, index0));
+    Value b0Vec = yBuilder.create<vector::SplatOp>(yLoc, vectorTyI32, b0);
+    Value b1 = yBuilder.create<arith::ExtSIOp>(
+        yLoc, yBuilder.getI32Type(),
+        yBuilder.create<memref::LoadOp>(yLoc, iBeta, index1));
+    Value b1Vec = yBuilder.create<vector::SplatOp>(yLoc, vectorTyI32, b1);
+
+    yBuilder.create<scf::ForOp>(
+        yLoc, c0, bufferWidth, strideVal, std::nullopt,
+        [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
+          Value maskVal =
+              xBuilder.create<arith::SubIOp>(xLoc, bufferWidth, xiv);
+          Value maskVec =
+              xBuilder.create<vector::CreateMaskOp>(xLoc, vectorTyI1, maskVal);
+          Value buffer0X = xBuilder.create<vector::MaskedLoadOp>(
+              xLoc, vectorTyI32, buffer0, ValueRange{xiv}, maskVec,
+              passThruI32);
+          Value buffer1X = xBuilder.create<vector::MaskedLoadOp>(
+              xLoc, vectorTyI32, buffer1, ValueRange{xiv}, maskVec,
+              passThruI32);
+          Value b0MulBuffer0 =
+              xBuilder.create<arith::MulIOp>(xLoc, b0Vec, buffer0X);
+          Value b1MulBuffer1 =
+              xBuilder.create<arith::MulIOp>(xLoc, b1Vec, buffer1X);
+          Value bufferRes =
+              xBuilder.create<arith::AddIOp>(xLoc, b0MulBuffer0, b1MulBuffer1);
+          Value addHalf =
+              xBuilder.create<arith::AddIOp>(xLoc, bufferRes, halfVec);
+          Value resShifted =
+              xBuilder.create<arith::ShRSIOp>(xLoc, addHalf, shiftVec);
+          Value maxVal =
+              xBuilder.create<arith::MaxSIOp>(xLoc, resShifted, zeroVec);
+          Value clampedVal =
+              xBuilder.create<arith::MinSIOp>(xLoc, maxVal, twoFiftyFiveVec);
+          Value clampedValF =
+              xBuilder.create<arith::SIToFPOp>(xLoc, vectorResTy, clampedVal);
+          xBuilder.create<vector::MaskedStoreOp>(
+              xLoc, output, ValueRange{yiv, xiv}, maskVec, clampedValF);
+          xBuilder.create<scf::YieldOp>(xLoc);
+        });
+    yBuilder.create<memref::DeallocOp>(loc, buffer0);
+    yBuilder.create<memref::DeallocOp>(loc, buffer1);
+    yBuilder.create<scf::YieldOp>(yLoc);
+  };
 
-        yBuilder.create<scf::ForOp>(
-            yLoc, c0, bufferWidth, strideVal, std::nullopt,
-            [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
-              Value maskVal =
-                  xBuilder.create<arith::SubIOp>(xLoc, bufferWidth, xiv);
-              Value maskVec = xBuilder.create<vector::CreateMaskOp>(
-                  xLoc, vectorTyI1, maskVal);
-              Value buffer0X = xBuilder.create<vector::MaskedLoadOp>(
-                  xLoc, vectorTyI32, buffer0, ValueRange{xiv}, maskVec,
-                  passThruConstantOpI32);
-              Value buffer1X = xBuilder.create<vector::MaskedLoadOp>(
-                  xLoc, vectorTyI32, buffer1, ValueRange{xiv}, maskVec,
-                  passThruConstantOpI32);
-              Value b0MulBuffer0 =
-                  xBuilder.create<arith::MulIOp>(xLoc, b0Vec, buffer0X);
-              Value b1MulBuffer1 =
-                  xBuilder.create<arith::MulIOp>(xLoc, b1Vec, buffer1X);
-              Value add = xBuilder.create<arith::AddIOp>(xLoc, b0MulBuffer0,
-                                                         b1MulBuffer1);
-              Value addHalf =
-                  xBuilder.create<arith::AddIOp>(xLoc, add, halfVec);
-              Value resShifted =
-                  xBuilder.create<arith::ShRSIOp>(xLoc, addHalf, shiftVec);
-              Value maxVal =
-                  xBuilder.create<arith::MaxSIOp>(xLoc, resShifted, zeroVec);
-              Value clampedVal = xBuilder.create<arith::MinSIOp>(
-                  xLoc, maxVal, twoFiftyFiveVec);
-              Value clampedValF =
-                  xBuilder.create<arith::SIToFPOp>(xLoc, vectorTyF, clampedVal);
-              xBuilder.create<vector::MaskedStoreOp>(
-                  xLoc, output, ValueRange{yiv, xiv}, maskVec, clampedValF);
-              xBuilder.create<scf::YieldOp>(xLoc);
-            });
-        yBuilder.create<scf::YieldOp>(yLoc);
+  Value batch = builder.create<arith::ConstantIndexOp>(loc, stride);
+  builder.create<scf::ParallelOp>(
+      loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{batch},
+      [&](OpBuilder &tBuilder, Location tLoc, ValueRange ivs) {
+        Value tStart = ivs[0];
+        Value tEnd = tBuilder.create<arith::AddIOp>(tLoc, tStart, batch);
+        tEnd = tBuilder.create<arith::MinSIOp>(tLoc, tEnd, outputRow);
+        tBuilder.create<scf::ForOp>(tLoc, tStart, tEnd, c1, std::nullopt,
+                                    resizeLoop);
+        tBuilder.create<scf::YieldOp>(tLoc);
       });
+
+  builder.create<memref::DeallocOp>(loc, xOffset);
+  builder.create<memref::DeallocOp>(loc, iAlpha);
+  builder.create<memref::DeallocOp>(loc, yOffset);
+  builder.create<memref::DeallocOp>(loc, iBeta);
 }
 
 // Helper function for resizing an image using nearest neighbour interpolation

From 2e3625748dbad99bc897727cb4a0d9733d9ce257 Mon Sep 17 00:00:00 2001
From: HarryZ <uqbarz@gmail.com>
Date: Fri, 7 Mar 2025 20:06:00 +0800
Subject: [PATCH 4/7] [feat] finish resize2d acc op

---
 examples/DIPDialect/CMakeLists.txt     |  3 +-
 frontend/Interfaces/lib/CMakeLists.txt | 47 +++++++++++++++-----------
 midend/include/Utils/DIPUtils.h        | 13 +++++++
 3 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/examples/DIPDialect/CMakeLists.txt b/examples/DIPDialect/CMakeLists.txt
index 7b2f075c9c..259d4f4f8c 100644
--- a/examples/DIPDialect/CMakeLists.txt
+++ b/examples/DIPDialect/CMakeLists.txt
@@ -1,4 +1,5 @@
-set(DIP_LIBS ${JPEG_LIBRARY} ${PNG_LIBRARIES} BuddyLibDIP)
+set(DIP_LIBS ${JPEG_LIBRARY} ${PNG_LIBRARIES} BuddyLibDIP omp)
+link_directories(${LLVM_LIBS})
 
 if(BUDDY_ENABLE_OPENCV)
   find_package(OpenCV REQUIRED CONFIG)
diff --git a/frontend/Interfaces/lib/CMakeLists.txt b/frontend/Interfaces/lib/CMakeLists.txt
index c4172a8b23..1286289c22 100644
--- a/frontend/Interfaces/lib/CMakeLists.txt
+++ b/frontend/Interfaces/lib/CMakeLists.txt
@@ -17,16 +17,23 @@ endif ()
 add_custom_command(OUTPUT DIP.o
         COMMAND ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DIP.mlir
         -lower-dip="DIP-strip-mining=${SPLITING_SIZE}"
-        -arith-expand
+        -affine-parallelize
         -lower-affine
-        -convert-scf-to-cf
-        -convert-math-to-llvm
+        -convert-scf-to-openmp
+        -convert-vector-to-scf
         -convert-vector-to-llvm
+        -memref-expand
+        -arith-expand
+        -convert-arith-to-llvm
         -finalize-memref-to-llvm
+        -convert-scf-to-cf
+        -convert-openmp-to-llvm
+        -convert-math-to-llvm
         -convert-func-to-llvm
         -reconcile-unrealized-casts |
         ${LLVM_TOOLS_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
         ${LLVM_TOOLS_BINARY_DIR}/llc
+        -relocation-model=pic
         -mtriple=${BUDDY_TARGET_TRIPLE}
         -mattr=${BUDDY_OPT_ATTR}
         --filetype=obj
@@ -47,8 +54,8 @@ SET_TARGET_PROPERTIES(BuddyLibDIP PROPERTIES
 
 add_custom_command(
   OUTPUT DAP.o
-  COMMAND 
-    ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir 
+  COMMAND
+    ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir
       -lower-dap="DAP-vector-splitting=${SPLITING_SIZE}"
       --convert-linalg-to-affine-loops
       -arith-expand
@@ -59,9 +66,9 @@ add_custom_command(
       -finalize-memref-to-llvm
       -llvm-request-c-wrappers
       -convert-func-to-llvm
-      -reconcile-unrealized-casts | 
+      -reconcile-unrealized-casts |
     ${LLVM_TOOLS_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
-    ${LLVM_TOOLS_BINARY_DIR}/llc 
+    ${LLVM_TOOLS_BINARY_DIR}/llc
       -mtriple=${BUDDY_TARGET_TRIPLE}
       -mattr=${BUDDY_OPT_ATTR}
       --filetype=obj
@@ -71,25 +78,25 @@ add_custom_command(
 
 add_custom_command(
   OUTPUT DAP-extend.o
-  COMMAND 
-    ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP-extend.mlir 
+  COMMAND
+    ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP-extend.mlir
       -extend-dap
       -one-shot-bufferize
       -convert-linalg-to-loops
       -convert-scf-to-cf
       -expand-strided-metadata
       -lower-affine
-      -convert-vector-to-llvm 
-      -memref-expand 
+      -convert-vector-to-llvm
+      -memref-expand
       -arith-expand
       -convert-arith-to-llvm
-      -finalize-memref-to-llvm 
+      -finalize-memref-to-llvm
       -convert-math-to-llvm
       -llvm-request-c-wrappers
       -convert-func-to-llvm
-      -reconcile-unrealized-casts | 
+      -reconcile-unrealized-casts |
     ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
-    ${LLVM_TOOLS_BINARY_DIR}/llc 
+    ${LLVM_TOOLS_BINARY_DIR}/llc
       -mtriple=${BUDDY_TARGET_TRIPLE}
       -mattr=${BUDDY_OPT_ATTR}
       -filetype=obj -relocation-model=pic
@@ -99,7 +106,7 @@ add_custom_command(
 
 add_custom_command(
   OUTPUT DAPVectorization.o
-  COMMAND 
+  COMMAND
     cat ${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir |
     sed -e 's/@buddy_fir/@buddy_fir_vectorization/'
         -e 's/@buddy_iir/@buddy_iir_vectorization/'
@@ -115,9 +122,9 @@ add_custom_command(
       -finalize-memref-to-llvm
       -llvm-request-c-wrappers
       -convert-func-to-llvm
-      -reconcile-unrealized-casts | 
+      -reconcile-unrealized-casts |
     ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
-    ${LLVM_TOOLS_BINARY_DIR}/llc 
+    ${LLVM_TOOLS_BINARY_DIR}/llc
       -mtriple=${BUDDY_TARGET_TRIPLE}
       -mattr=${BUDDY_OPT_ATTR}
       -filetype=obj
@@ -125,9 +132,9 @@ add_custom_command(
   DEPENDS mlir-translate llc buddy-opt
 )
 
-add_library(BuddyLibDAP STATIC 
-  DAP.o 
-  DAP-extend.o 
+add_library(BuddyLibDAP STATIC
+  DAP.o
+  DAP-extend.o
   DAPVectorization.o
 )
 
diff --git a/midend/include/Utils/DIPUtils.h b/midend/include/Utils/DIPUtils.h
index c5bd3104d1..6b0bcdecf1 100644
--- a/midend/include/Utils/DIPUtils.h
+++ b/midend/include/Utils/DIPUtils.h
@@ -176,6 +176,19 @@ void fillPixelsBilinearInterpolate4D(
     Value inputRowLastElemF32, Value inputColLastElemF32, Value c0F32,
     Value c1F32, Value dataCondition);
 
+void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
+                                              MLIRContext *ctx, Value input,
+                                              Value output,
+                                              Value horizontalScalingFactor,
+                                              Value verticalScalingFactor);
+
+void BilinearInterpolationResizingNew(
+    OpBuilder &builder, Location loc, MLIRContext *ctx, Value input,
+    Value output, int64_t stride, Value horizontalScalingFactor,
+    Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec,
+    VectorType vectorResTy, VectorType vectorTyI32, VectorType vectorTyI16,
+    VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1);
+
 // Helper function for resizing an image using nearest neighbour interpolation
 // mechanism.
 void NearestNeighbourInterpolationResizing(

From d64526c009415f54662cc7f775963d63719f913f Mon Sep 17 00:00:00 2001
From: HarryZ <uqbarz@gmail.com>
Date: Tue, 11 Mar 2025 18:56:28 +0800
Subject: [PATCH 5/7] [feat] vectorize nearest-neighbour resize (currently slower)

---
 midend/include/Utils/DIPUtils.h               | 10 +--
 .../lib/Conversion/LowerDIP/LowerDIPPass.cpp  |  5 +-
 midend/lib/Utils/DIPUtils.cpp                 | 75 +++++++++++++------
 3 files changed, 61 insertions(+), 29 deletions(-)

diff --git a/midend/include/Utils/DIPUtils.h b/midend/include/Utils/DIPUtils.h
index 6b0bcdecf1..04679440e5 100644
--- a/midend/include/Utils/DIPUtils.h
+++ b/midend/include/Utils/DIPUtils.h
@@ -176,11 +176,11 @@ void fillPixelsBilinearInterpolate4D(
     Value inputRowLastElemF32, Value inputColLastElemF32, Value c0F32,
     Value c1F32, Value dataCondition);
 
-void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
-                                              MLIRContext *ctx, Value input,
-                                              Value output,
-                                              Value horizontalScalingFactor,
-                                              Value verticalScalingFactor);
+void NearestNeighbourInterpolationResizingNew(
+    OpBuilder &builder, Location loc, MLIRContext *ctx, Value input,
+    Value output, int64_t stride, Value horizontalScalingFactor,
+    Value verticalScalingFactor, VectorType vectorResTy, VectorType vectorTyI16,
+    VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1);
 
 void BilinearInterpolationResizingNew(
     OpBuilder &builder, Location loc, MLIRContext *ctx, Value input,
diff --git a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
index 3cd65126de..578563e08b 100644
--- a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
+++ b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
@@ -345,8 +345,9 @@ class DIPResize2DOpLowering : public OpRewritePattern<dip::Resize2DOp> {
     if (interpolationAttr ==
         dip::InterpolationType::NearestNeighbourInterpolation) {
       dip::NearestNeighbourInterpolationResizingNew(
-          rewriter, loc, ctx, input, output, horizontalScalingFactor,
-          verticalScalingFactor);
+          rewriter, loc, ctx, input, output, stride, horizontalScalingFactor,
+          verticalScalingFactor, vectorResTy, vectorTyI16, vectorTyIndex,
+          vectorTyF32, vectorTyI1);
     } else if (interpolationAttr ==
                dip::InterpolationType::BilinearInterpolation) {
       dip::BilinearInterpolationResizingNew(
diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp
index 256624055f..3ddefccbd0 100644
--- a/midend/lib/Utils/DIPUtils.cpp
+++ b/midend/lib/Utils/DIPUtils.cpp
@@ -987,18 +987,30 @@ void fillPixelsBilinearInterpolate4D(
       });
 }
 
-void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
-                                              MLIRContext *ctx, Value input,
-                                              Value output,
-                                              Value horizontalScalingFactor,
-                                              Value verticalScalingFactor) {
+void NearestNeighbourInterpolationResizingNew(
+    OpBuilder &builder, Location loc, MLIRContext *ctx, Value input,
+    Value output, int64_t stride, Value horizontalScalingFactor,
+    Value verticalScalingFactor, VectorType vectorResTy, VectorType vectorTyI16,
+    VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1) {
   Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
   Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
+  Value strideVal = builder.create<arith::ConstantIndexOp>(loc, stride);
+  Value stepVec = iotaVec0F32(builder, loc, stride);
+  Value horizontalVec = builder.create<vector::SplatOp>(
+      loc, vectorTyF32, horizontalScalingFactor);
+  Value verticalVec =
+      builder.create<vector::SplatOp>(loc, vectorTyF32, verticalScalingFactor);
+  auto passThruRes =
+      builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorResTy));
+  auto passThruI16 =
+      builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorTyI16));
 
   Value inputRow = builder.create<memref::DimOp>(loc, input, c0);
   Value inputRowMinus1 = builder.create<arith::IndexCastUIOp>(
       loc, builder.getI16Type(),
       builder.create<arith::SubIOp>(loc, inputRow, c1));
+  Value inputRowMinus1Vec =
+      builder.create<vector::SplatOp>(loc, vectorTyI16, inputRowMinus1);
   Value inputCol = builder.create<memref::DimOp>(loc, input, c1);
   Value inputColMinus1 = builder.create<arith::IndexCastUIOp>(
       loc, builder.getI16Type(),
@@ -1006,22 +1018,33 @@ void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
 
   Value outputRow = builder.create<memref::DimOp>(loc, output, c0);
   Value outputCol = builder.create<memref::DimOp>(loc, output, c1);
+  Value outputColStrideRatio =
+      builder.create<arith::DivUIOp>(loc, outputCol, strideVal);
+  Value outputColMultiple = builder.create<arith::MulIOp>(
+      loc, builder.create<arith::AddIOp>(loc, outputColStrideRatio, c1),
+      strideVal);
 
   MemRefType dynamicTypeI16 =
       MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16));
   Value srcXPosVec =
-      builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputCol);
+      builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputColMultiple);
   builder.create<scf::ForOp>(
-      loc, c0, outputCol, c1, std::nullopt,
+      loc, c0, outputColMultiple, strideVal, std::nullopt,
       [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
+        // Value maskVal = xBuilder.create<arith::SubIOp>(xLoc, outputCol, xiv);
+        Value xivFVec = xBuilder.create<vector::SplatOp>(
+            xLoc, vectorTyF32, indexToF32(xBuilder, xLoc, xiv));
+        xivFVec = xBuilder.create<arith::AddFOp>(xLoc, xivFVec, stepVec);
         Value srcXPos = xBuilder.create<arith::FPToUIOp>(
-            xLoc, xBuilder.getI16Type(),
-            xBuilder.create<arith::MulFOp>(xLoc,
-                                           indexToF32(xBuilder, xLoc, xiv),
-                                           horizontalScalingFactor));
+            xLoc, vectorTyI16,
+            xBuilder.create<arith::MulFOp>(xLoc, xivFVec, horizontalVec));
         srcXPos =
-            xBuilder.create<arith::MinSIOp>(xLoc, srcXPos, inputRowMinus1);
-        xBuilder.create<memref::StoreOp>(xLoc, srcXPos, srcXPosVec,
+            xBuilder.create<arith::MinSIOp>(xLoc, srcXPos, inputRowMinus1Vec);
+        // xBuilder.create<vector::MaskedStoreOp>(
+        //       xLoc, srcXPosVec, ValueRange{xiv},
+        //       xBuilder.create<vector::CreateMaskOp>(xLoc, vectorTyI1,
+        //       maskVal), srcXPos);
+        xBuilder.create<vector::StoreOp>(xLoc, srcXPos, srcXPosVec,
                                          ValueRange{xiv});
         xBuilder.create<scf::YieldOp>(xLoc);
       });
@@ -1038,16 +1061,24 @@ void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
         srcYPos = yBuilder.create<arith::IndexCastOp>(
             yLoc, yBuilder.getIndexType(), srcYPos);
         yBuilder.create<scf::ForOp>(
-            loc, c0, outputCol, c1, std::nullopt,
+            loc, c0, outputCol, strideVal, std::nullopt,
             [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
-              Value srcXPos = xBuilder.create<memref::LoadOp>(xLoc, srcXPosVec,
-                                                              ValueRange{xiv});
-              srcXPos = xBuilder.create<arith::IndexCastOp>(
-                  xLoc, xBuilder.getIndexType(), srcXPos);
-              Value srcPixel = xBuilder.create<memref::LoadOp>(
-                  xLoc, input, ValueRange{srcYPos, srcXPos});
-              xBuilder.create<memref::StoreOp>(xLoc, srcPixel, output,
-                                               ValueRange{yiv, xiv});
+              Value maskVal =
+                  xBuilder.create<arith::SubIOp>(xLoc, outputCol, xiv);
+              Value maskVec = xBuilder.create<vector::CreateMaskOp>(
+                  xLoc, vectorTyI1, maskVal);
+              Value srcXPos = xBuilder.create<vector::MaskedLoadOp>(
+                  xLoc, vectorTyI16, srcXPosVec, ValueRange{xiv}, maskVec,
+                  passThruI16);
+              // Value srcXPos = xBuilder.create<vector::LoadOp>(
+              //     xLoc, vectorTyI16, srcXPosVec, ValueRange{xiv});
+              srcXPos = xBuilder.create<arith::IndexCastOp>(xLoc, vectorTyIndex,
+                                                            srcXPos);
+              Value srcPixel = xBuilder.create<vector::GatherOp>(
+                  xLoc, vectorResTy, input, ValueRange{srcYPos, c0}, srcXPos,
+                  maskVec, passThruRes);
+              xBuilder.create<vector::MaskedStoreOp>(
+                  xLoc, output, ValueRange{yiv, xiv}, maskVec, srcPixel);
               xBuilder.create<scf::YieldOp>(xLoc);
             });
         yBuilder.create<scf::YieldOp>(yLoc);

From 7fed3aa3376c7469e97f151f24b7ba83cbf1b87c Mon Sep 17 00:00:00 2001
From: HarryZ <uqbarz@gmail.com>
Date: Tue, 11 Mar 2025 19:26:10 +0800
Subject: [PATCH 6/7] [feat] parallel resize2d nearest op

---
 midend/include/Utils/DIPUtils.h               | 10 +--
 .../lib/Conversion/LowerDIP/LowerDIPPass.cpp  |  4 +-
 midend/lib/Utils/DIPUtils.cpp                 | 82 ++++++-------------
 3 files changed, 32 insertions(+), 64 deletions(-)

diff --git a/midend/include/Utils/DIPUtils.h b/midend/include/Utils/DIPUtils.h
index 04679440e5..e5c3cfdf30 100644
--- a/midend/include/Utils/DIPUtils.h
+++ b/midend/include/Utils/DIPUtils.h
@@ -176,11 +176,11 @@ void fillPixelsBilinearInterpolate4D(
     Value inputRowLastElemF32, Value inputColLastElemF32, Value c0F32,
     Value c1F32, Value dataCondition);
 
-void NearestNeighbourInterpolationResizingNew(
-    OpBuilder &builder, Location loc, MLIRContext *ctx, Value input,
-    Value output, int64_t stride, Value horizontalScalingFactor,
-    Value verticalScalingFactor, VectorType vectorResTy, VectorType vectorTyI16,
-    VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1);
+void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
+                                              MLIRContext *ctx, Value input,
+                                              Value output, int64_t stride,
+                                              Value horizontalScalingFactor,
+                                              Value verticalScalingFactor);
 
 void BilinearInterpolationResizingNew(
     OpBuilder &builder, Location loc, MLIRContext *ctx, Value input,
diff --git a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
index 578563e08b..a250bd8068 100644
--- a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
+++ b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
@@ -300,7 +300,6 @@ class DIPResize2DOpLowering : public OpRewritePattern<dip::Resize2DOp> {
     Value verticalScalingFactor = op->getOperand(2);
     Value output = op->getOperand(3);
     auto interpolationAttr = op.getInterpolationType();
-    Value strideVal = rewriter.create<arith::ConstantIndexOp>(loc, stride);
 
     auto inElemTy = input.getType().cast<MemRefType>().getElementType();
     dip::DIP_ERROR error =
@@ -346,8 +345,7 @@ class DIPResize2DOpLowering : public OpRewritePattern<dip::Resize2DOp> {
         dip::InterpolationType::NearestNeighbourInterpolation) {
       dip::NearestNeighbourInterpolationResizingNew(
           rewriter, loc, ctx, input, output, stride, horizontalScalingFactor,
-          verticalScalingFactor, vectorResTy, vectorTyI16, vectorTyIndex,
-          vectorTyF32, vectorTyI1);
+          verticalScalingFactor);
     } else if (interpolationAttr ==
                dip::InterpolationType::BilinearInterpolation) {
       dip::BilinearInterpolationResizingNew(
diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp
index 3ddefccbd0..5d4282fe83 100644
--- a/midend/lib/Utils/DIPUtils.cpp
+++ b/midend/lib/Utils/DIPUtils.cpp
@@ -987,30 +987,18 @@ void fillPixelsBilinearInterpolate4D(
       });
 }
 
-void NearestNeighbourInterpolationResizingNew(
-    OpBuilder &builder, Location loc, MLIRContext *ctx, Value input,
-    Value output, int64_t stride, Value horizontalScalingFactor,
-    Value verticalScalingFactor, VectorType vectorResTy, VectorType vectorTyI16,
-    VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1) {
+void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
+                                              MLIRContext *ctx, Value input,
+                                              Value output, int64_t stride,
+                                              Value horizontalScalingFactor,
+                                              Value verticalScalingFactor) {
   Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
   Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
-  Value strideVal = builder.create<arith::ConstantIndexOp>(loc, stride);
-  Value stepVec = iotaVec0F32(builder, loc, stride);
-  Value horizontalVec = builder.create<vector::SplatOp>(
-      loc, vectorTyF32, horizontalScalingFactor);
-  Value verticalVec =
-      builder.create<vector::SplatOp>(loc, vectorTyF32, verticalScalingFactor);
-  auto passThruRes =
-      builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorResTy));
-  auto passThruI16 =
-      builder.create<arith::ConstantOp>(loc, builder.getZeroAttr(vectorTyI16));
 
   Value inputRow = builder.create<memref::DimOp>(loc, input, c0);
   Value inputRowMinus1 = builder.create<arith::IndexCastUIOp>(
       loc, builder.getI16Type(),
       builder.create<arith::SubIOp>(loc, inputRow, c1));
-  Value inputRowMinus1Vec =
-      builder.create<vector::SplatOp>(loc, vectorTyI16, inputRowMinus1);
   Value inputCol = builder.create<memref::DimOp>(loc, input, c1);
   Value inputColMinus1 = builder.create<arith::IndexCastUIOp>(
       loc, builder.getI16Type(),
@@ -1018,40 +1006,30 @@ void NearestNeighbourInterpolationResizingNew(
 
   Value outputRow = builder.create<memref::DimOp>(loc, output, c0);
   Value outputCol = builder.create<memref::DimOp>(loc, output, c1);
-  Value outputColStrideRatio =
-      builder.create<arith::DivUIOp>(loc, outputCol, strideVal);
-  Value outputColMultiple = builder.create<arith::MulIOp>(
-      loc, builder.create<arith::AddIOp>(loc, outputColStrideRatio, c1),
-      strideVal);
 
   MemRefType dynamicTypeI16 =
       MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16));
   Value srcXPosVec =
-      builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputColMultiple);
+      builder.create<memref::AllocOp>(loc, dynamicTypeI16, outputCol);
   builder.create<scf::ForOp>(
-      loc, c0, outputColMultiple, strideVal, std::nullopt,
+      loc, c0, outputCol, c1, std::nullopt,
       [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
-        // Value maskVal = xBuilder.create<arith::SubIOp>(xLoc, outputCol, xiv);
-        Value xivFVec = xBuilder.create<vector::SplatOp>(
-            xLoc, vectorTyF32, indexToF32(xBuilder, xLoc, xiv));
-        xivFVec = xBuilder.create<arith::AddFOp>(xLoc, xivFVec, stepVec);
         Value srcXPos = xBuilder.create<arith::FPToUIOp>(
-            xLoc, vectorTyI16,
-            xBuilder.create<arith::MulFOp>(xLoc, xivFVec, horizontalVec));
+            xLoc, xBuilder.getI16Type(),
+            xBuilder.create<arith::MulFOp>(xLoc,
+                                           indexToF32(xBuilder, xLoc, xiv),
+                                           horizontalScalingFactor));
         srcXPos =
-            xBuilder.create<arith::MinSIOp>(xLoc, srcXPos, inputRowMinus1Vec);
-        // xBuilder.create<vector::MaskedStoreOp>(
-        //       xLoc, srcXPosVec, ValueRange{xiv},
-        //       xBuilder.create<vector::CreateMaskOp>(xLoc, vectorTyI1,
-        //       maskVal), srcXPos);
-        xBuilder.create<vector::StoreOp>(xLoc, srcXPos, srcXPosVec,
+            xBuilder.create<arith::MinSIOp>(xLoc, srcXPos, inputColMinus1);
+        xBuilder.create<memref::StoreOp>(xLoc, srcXPos, srcXPosVec,
                                          ValueRange{xiv});
         xBuilder.create<scf::YieldOp>(xLoc);
       });
 
-  builder.create<scf::ForOp>(
-      loc, c0, outputRow, c1, std::nullopt,
-      [&](OpBuilder &yBuilder, Location yLoc, Value yiv, ValueRange) {
+  builder.create<scf::ParallelOp>(
+      loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{c1},
+      [&](OpBuilder &yBuilder, Location yLoc, ValueRange ivs) {
+        Value yiv = ivs[0];
         Value srcYPos = yBuilder.create<arith::FPToUIOp>(
             yLoc, yBuilder.getI16Type(),
             yBuilder.create<arith::MulFOp>(
@@ -1061,24 +1039,16 @@ void NearestNeighbourInterpolationResizingNew(
         srcYPos = yBuilder.create<arith::IndexCastOp>(
             yLoc, yBuilder.getIndexType(), srcYPos);
         yBuilder.create<scf::ForOp>(
-            loc, c0, outputCol, strideVal, std::nullopt,
+            loc, c0, outputCol, c1, std::nullopt,
             [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) {
-              Value maskVal =
-                  xBuilder.create<arith::SubIOp>(xLoc, outputCol, xiv);
-              Value maskVec = xBuilder.create<vector::CreateMaskOp>(
-                  xLoc, vectorTyI1, maskVal);
-              Value srcXPos = xBuilder.create<vector::MaskedLoadOp>(
-                  xLoc, vectorTyI16, srcXPosVec, ValueRange{xiv}, maskVec,
-                  passThruI16);
-              // Value srcXPos = xBuilder.create<vector::LoadOp>(
-              //     xLoc, vectorTyI16, srcXPosVec, ValueRange{xiv});
-              srcXPos = xBuilder.create<arith::IndexCastOp>(xLoc, vectorTyIndex,
-                                                            srcXPos);
-              Value srcPixel = xBuilder.create<vector::GatherOp>(
-                  xLoc, vectorResTy, input, ValueRange{srcYPos, c0}, srcXPos,
-                  maskVec, passThruRes);
-              xBuilder.create<vector::MaskedStoreOp>(
-                  xLoc, output, ValueRange{yiv, xiv}, maskVec, srcPixel);
+              Value srcXPos = xBuilder.create<memref::LoadOp>(xLoc, srcXPosVec,
+                                                              ValueRange{xiv});
+              srcXPos = xBuilder.create<arith::IndexCastOp>(
+                  xLoc, xBuilder.getIndexType(), srcXPos);
+              Value srcPixel = xBuilder.create<memref::LoadOp>(
+                  xLoc, input, ValueRange{srcYPos, srcXPos});
+              xBuilder.create<memref::StoreOp>(xLoc, srcPixel, output,
+                                               ValueRange{yiv, xiv});
               xBuilder.create<scf::YieldOp>(xLoc);
             });
         yBuilder.create<scf::YieldOp>(yLoc);

From 791c62dde39d6878da970e415aef36e448fda5dd Mon Sep 17 00:00:00 2001
From: HarryZ <uqbarz@gmail.com>
Date: Tue, 11 Mar 2025 19:31:49 +0800
Subject: [PATCH 7/7] [fix] cleanup old resize2d code

---
 midend/include/Utils/DIPUtils.h               | 38 +++-----
 .../lib/Conversion/LowerDIP/LowerDIPPass.cpp  |  4 +-
 midend/lib/Utils/DIPUtils.cpp                 | 88 ++-----------------
 3 files changed, 21 insertions(+), 109 deletions(-)

diff --git a/midend/include/Utils/DIPUtils.h b/midend/include/Utils/DIPUtils.h
index e5c3cfdf30..edb96721b8 100644
--- a/midend/include/Utils/DIPUtils.h
+++ b/midend/include/Utils/DIPUtils.h
@@ -176,29 +176,13 @@ void fillPixelsBilinearInterpolate4D(
     Value inputRowLastElemF32, Value inputColLastElemF32, Value c0F32,
     Value c1F32, Value dataCondition);
 
-void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
-                                              MLIRContext *ctx, Value input,
-                                              Value output, int64_t stride,
-                                              Value horizontalScalingFactor,
-                                              Value verticalScalingFactor);
-
-void BilinearInterpolationResizingNew(
-    OpBuilder &builder, Location loc, MLIRContext *ctx, Value input,
-    Value output, int64_t stride, Value horizontalScalingFactor,
-    Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec,
-    VectorType vectorResTy, VectorType vectorTyI32, VectorType vectorTyI16,
-    VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1);
-
 // Helper function for resizing an image using nearest neighbour interpolation
 // mechanism.
-void NearestNeighbourInterpolationResizing(
-    OpBuilder &builder, Location loc, MLIRContext *ctx,
-    SmallVector<Value, 8> lowerBounds, SmallVector<Value, 8> upperBounds,
-    SmallVector<int64_t, 8> steps, Value strideVal, Value input, Value output,
-    Value horizontalScalingFactorVec, Value verticalScalingFactorVec,
-    Value outputRowLastElemF32, Value outputColLastElemF32,
-    Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32,
-    int64_t stride, Value c0, Value c0F32);
+void NearestNeighbourInterpolationResizing(OpBuilder &builder, Location loc,
+                                           MLIRContext *ctx, Value input,
+                                           Value output, int64_t stride,
+                                           Value horizontalScalingFactor,
+                                           Value verticalScalingFactor);
 
 // Helper function for resizing 4D an image using nearest neighbour
 // interpolation mechanism.
@@ -213,13 +197,11 @@ void NearestNeighbourInterpolationResizing4D(
 
 // Helper function for resizing an image using bilinear interpolation mechanism.
 void BilinearInterpolationResizing(
-    OpBuilder &builder, Location loc, MLIRContext *ctx,
-    SmallVector<Value, 8> lowerBounds, SmallVector<Value, 8> upperBounds,
-    SmallVector<int64_t, 8> steps, Value strideVal, Value input, Value output,
-    Value horizontalScalingFactorVec, Value verticalScalingFactorVec,
-    Value outputRowLastElemF32, Value outputColLastElemF32,
-    Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32,
-    int64_t stride, Value c0, Value c0F32, Value c1F32);
+    OpBuilder &builder, Location loc, MLIRContext *ctx, Value input,
+    Value output, int64_t stride, Value horizontalScalingFactor,
+    Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec,
+    VectorType vectorResTy, VectorType vectorTyI32, VectorType vectorTyI16,
+    VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1);
 
 // Helper function for resizing 4D an image using bilinear interpolation
 // mechanism.
diff --git a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
index a250bd8068..f2fa6e2a71 100644
--- a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
+++ b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp
@@ -343,12 +343,12 @@ class DIPResize2DOpLowering : public OpRewritePattern<dip::Resize2DOp> {
 
     if (interpolationAttr ==
         dip::InterpolationType::NearestNeighbourInterpolation) {
-      dip::NearestNeighbourInterpolationResizingNew(
+      dip::NearestNeighbourInterpolationResizing(
           rewriter, loc, ctx, input, output, stride, horizontalScalingFactor,
           verticalScalingFactor);
     } else if (interpolationAttr ==
                dip::InterpolationType::BilinearInterpolation) {
-      dip::BilinearInterpolationResizingNew(
+      dip::BilinearInterpolationResizing(
           rewriter, loc, ctx, input, output, stride, horizontalScalingFactor,
           verticalScalingFactor, halfVec, shiftVec, scaleVec, vectorResTy,
           vectorTyI32, vectorTyI16, vectorTyIndex, vectorTyF32, vectorTyI1);
diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp
index 5d4282fe83..d89a2ddd92 100644
--- a/midend/lib/Utils/DIPUtils.cpp
+++ b/midend/lib/Utils/DIPUtils.cpp
@@ -987,11 +987,13 @@ void fillPixelsBilinearInterpolate4D(
       });
 }
 
-void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc,
-                                              MLIRContext *ctx, Value input,
-                                              Value output, int64_t stride,
-                                              Value horizontalScalingFactor,
-                                              Value verticalScalingFactor) {
+// Helper function for resizing an image using nearest neighbour interpolation
+// mechanism.
+void NearestNeighbourInterpolationResizing(OpBuilder &builder, Location loc,
+                                           MLIRContext *ctx, Value input,
+                                           Value output, int64_t stride,
+                                           Value horizontalScalingFactor,
+                                           Value verticalScalingFactor) {
   Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
   Value c1 = builder.create<arith::ConstantIndexOp>(loc, 1);
 
@@ -1240,7 +1242,8 @@ void calcInterpolation(OpBuilder &builder, Location loc, Value &sy,
   syPrev = builder.create<arith::SelectOp>(loc, notEqual, sy, syPrev);
 }
 
-void BilinearInterpolationResizingNew(
+// Helper function for resizing an image using bilinear interpolation mechanism.
+void BilinearInterpolationResizing(
     OpBuilder &builder, Location loc, MLIRContext *ctx, Value input,
     Value output, int64_t stride, Value horizontalScalingFactor,
     Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec,
@@ -1396,38 +1399,6 @@ void BilinearInterpolationResizingNew(
   builder.create<memref::DeallocOp>(loc, iBeta);
 }
 
-// Helper function for resizing an image using nearest neighbour interpolation
-// mechanism.
-void NearestNeighbourInterpolationResizing(
-    OpBuilder &builder, Location loc, MLIRContext *ctx,
-    SmallVector<Value, 8> lowerBounds, SmallVector<Value, 8> upperBounds,
-    SmallVector<int64_t, 8> steps, Value strideVal, Value input, Value output,
-    Value horizontalScalingFactorVec, Value verticalScalingFactorVec,
-    Value outputRowLastElemF32, Value outputColLastElemF32,
-    Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32,
-    int64_t stride, Value c0, Value c0F32) {
-  affine::buildAffineLoopNest(
-      builder, loc, lowerBounds, upperBounds, steps,
-      [&](OpBuilder &builder, Location loc, ValueRange ivs) {
-        Value ivs0F32 = indexToF32(builder, loc, ivs[0]);
-        Value yVec = builder.create<vector::SplatOp>(loc, vectorTy32, ivs0F32);
-        Value xVec = iotaVec(builder, loc, ctx, ivs[1], strideVal, vectorTy32,
-                             c0, stride);
-
-        Value resXVecInterm = builder.create<arith::MulFOp>(
-            loc, xVec, horizontalScalingFactorVec);
-        Value resYVecInterm =
-            builder.create<arith::MulFOp>(loc, yVec, verticalScalingFactorVec);
-
-        Value resXVec = roundOff(builder, loc, resXVecInterm);
-        Value resYVec = roundOff(builder, loc, resYVecInterm);
-
-        fillPixels(builder, loc, xVec, yVec, resXVec, resYVec, input, output,
-                   c0, strideVal, outputRowLastElemF32, outputColLastElemF32,
-                   inputRowLastElemF32, inputColLastElemF32, c0F32);
-      });
-}
-
 // Helper function for resizing 4D an image using nearest neighbour
 // interpolation mechanism.
 void NearestNeighbourInterpolationResizing4D(
@@ -1461,47 +1432,6 @@ void NearestNeighbourInterpolationResizing4D(
       });
 }
 
-// Helper function for resizing an image using bilinear interpolation mechanism.
-void BilinearInterpolationResizing(
-    OpBuilder &builder, Location loc, MLIRContext *ctx,
-    SmallVector<Value, 8> lowerBounds, SmallVector<Value, 8> upperBounds,
-    SmallVector<int64_t, 8> steps, Value strideVal, Value input, Value output,
-    Value horizontalScalingFactorVec, Value verticalScalingFactorVec,
-    Value outputRowLastElemF32, Value outputColLastElemF32,
-    Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32,
-    int64_t stride, Value c0, Value c0F32, Value c1F32) {
-  affine::buildAffineLoopNest(
-      builder, loc, lowerBounds, upperBounds, steps,
-      [&](OpBuilder &builder, Location loc, ValueRange ivs) {
-        Value ivs0F32 = indexToF32(builder, loc, ivs[0]);
-        Value yVec = builder.create<vector::SplatOp>(loc, vectorTy32, ivs0F32);
-        Value xVec = iotaVec(builder, loc, ctx, ivs[1], strideVal, vectorTy32,
-                             c0, stride);
-
-        Value xVecInterm = builder.create<arith::MulFOp>(
-            loc, xVec, horizontalScalingFactorVec);
-        Value yVecInterm =
-            builder.create<arith::MulFOp>(loc, yVec, verticalScalingFactorVec);
-
-        Value xVecInterm_L = builder.create<math::FloorOp>(loc, xVecInterm);
-        Value xVecInterm_H = builder.create<math::CeilOp>(loc, xVecInterm);
-
-        Value yVecInterm_L = builder.create<math::FloorOp>(loc, yVecInterm);
-        Value yVecInterm_H = builder.create<math::CeilOp>(loc, yVecInterm);
-
-        Value xVecWeight =
-            builder.create<arith::SubFOp>(loc, xVecInterm, xVecInterm_L);
-        Value yVecWeight =
-            builder.create<arith::SubFOp>(loc, yVecInterm, yVecInterm_L);
-
-        fillPixelsBilinearInterpolate(
-            builder, loc, xVec, yVec, xVecInterm_L, yVecInterm_L, xVecInterm_H,
-            yVecInterm_H, input, output, c0, strideVal, xVecWeight, yVecWeight,
-            outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32,
-            inputColLastElemF32, c0F32, c1F32);
-      });
-}
-
 // Helper function for resizing 4D an image using bilinear interpolation
 // mechanism.
 void BilinearInterpolationResizing4D(