From 751e1fc0f4967b26971eead6dc9a0f40503a6cf7 Mon Sep 17 00:00:00 2001 From: HarryZ Date: Mon, 3 Mar 2025 16:34:02 +0800 Subject: [PATCH 1/7] [feat] reconstruct resize op, nearest and bilinear --- midend/lib/Utils/DIPUtils.cpp | 357 ++++++++++++++++++++++++++++++++++ 1 file changed, 357 insertions(+) diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp index d68451bb51..4519ceb1d1 100644 --- a/midend/lib/Utils/DIPUtils.cpp +++ b/midend/lib/Utils/DIPUtils.cpp @@ -987,6 +987,363 @@ void fillPixelsBilinearInterpolate4D( }); } +void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, + MLIRContext *ctx, Value input, + Value output, + Value horizontalScalingFactor, + Value verticalScalingFactor) { + Value c0 = builder.create(loc, 0); + Value c1 = builder.create(loc, 1); + + Value inputRow = builder.create(loc, input, c0); + Value inputRowMinus1 = builder.create( + loc, builder.getI16Type(), + builder.create(loc, inputRow, c1)); + Value inputCol = builder.create(loc, input, c1); + Value inputColMinus1 = builder.create( + loc, builder.getI16Type(), + builder.create(loc, inputCol, c1)); + + Value outputRow = builder.create(loc, output, c0); + Value outputCol = builder.create(loc, output, c1); + + MemRefType dynamicTypeI16 = + MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16)); + Value srcXPosVec = + builder.create(loc, dynamicTypeI16, outputCol); + builder.create( + loc, c0, outputCol, c1, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + Value srcXPos = xBuilder.create( + xLoc, xBuilder.getI16Type(), + xBuilder.create(xLoc, + indexToF32(xBuilder, xLoc, xiv), + horizontalScalingFactor)); + srcXPos = + xBuilder.create(xLoc, srcXPos, inputRowMinus1); + xBuilder.create(xLoc, srcXPos, srcXPosVec, + ValueRange{xiv}); + xBuilder.create(xLoc); + }); + + builder.create( + loc, c0, outputRow, c1, std::nullopt, + [&](OpBuilder &yBuilder, Location yLoc, Value yiv, ValueRange) { + Value srcYPos = yBuilder.create( + yLoc, yBuilder.getI16Type(), + yBuilder.create( + yLoc, indexToF32(yBuilder, yLoc, yiv), verticalScalingFactor)); + srcYPos = + yBuilder.create(yLoc, srcYPos, inputColMinus1); + srcYPos = yBuilder.create( + yLoc, yBuilder.getIndexType(), srcYPos); + yBuilder.create( + loc, c0, outputCol, c1, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + Value srcXPos = xBuilder.create(xLoc, srcXPosVec, + ValueRange{xiv}); + srcXPos = xBuilder.create( + xLoc, xBuilder.getIndexType(), srcXPos); + Value srcPixel = xBuilder.create( + xLoc, input, ValueRange{srcYPos, srcXPos}); + xBuilder.create(xLoc, srcPixel, output, + ValueRange{yiv, xiv}); + xBuilder.create(xLoc); + }); + yBuilder.create(yLoc); + }); +} + +void processScaling(OpBuilder &builder, Location loc, Value output, + Value scalingFactor, Value input, Value xOffset, + Value iAlpha) { + static const int SHIFT = 11; + static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT; + static const int HALF = 1 << (SHIFT - 1); + + Value c0 = builder.create(loc, 0); + Value c1 = builder.create(loc, 1); + Value c2 = builder.create(loc, 2); + Value c0I32 = + builder.create(loc, builder.getI32Type(), c0); + Value c0F = builder.create(loc, (llvm::APFloat)0.0f, + builder.getF32Type()); + Value cDot5F = builder.create( + loc, (llvm::APFloat)0.5f, builder.getF32Type()); + Value c1F = builder.create(loc, (llvm::APFloat)1.0f, + builder.getF32Type()); + Value inputMinus1 = builder.create( + loc, builder.getI32Type(), builder.create(loc, input, c1)); + Value inputMinus2 = builder.create( + loc, builder.getI32Type(), builder.create(loc, input, c2)); + builder.create( + loc, c0, output, c1, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + // float fx = (float)((dx + 0.5) * scale_x - 0.5); + Value xivF = indexToF32(xBuilder, xLoc, xiv); + Value temp1 = xBuilder.create(xLoc, xivF, cDot5F); + Value temp2 = + xBuilder.create(xLoc, temp1, scalingFactor); + Value fx = xBuilder.create(xLoc, temp2, cDot5F); + Value sx = + xBuilder.create(xLoc, xBuilder.getI32Type(), fx); + fx = xBuilder.create( + xLoc, fx, + xBuilder.create(xLoc, xBuilder.getF32Type(), sx)); + Value lowerThanZero = xBuilder.create( + xLoc, arith::CmpIPredicate::slt, sx, c0I32); + Value greaterThan = xBuilder.create( + xLoc, arith::CmpIPredicate::sge, sx, inputMinus1); + sx = xBuilder.create(xLoc, lowerThanZero, c0I32, sx); + fx = xBuilder.create(xLoc, lowerThanZero, c0F, fx); + sx = xBuilder.create(xLoc, greaterThan, inputMinus2, + sx); + fx = xBuilder.create(xLoc, greaterThan, c0F, fx); + xBuilder.create(xLoc, sx, xOffset, ValueRange{xiv}); + + // ialpha[dx * 2] = (short)((1.f - fx) * INTER_RESIZE_COEF_SCALE); + // ialpha[dx * 2 + 1] = (short)(fx * INTER_RESIZE_COEF_SCALE); + Value fxScale = xBuilder.create( + xLoc, fx, + xBuilder.create( + xLoc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE, + xBuilder.getF32Type())); + Value oneMinusFx = xBuilder.create(xLoc, c1F, fx); + Value oneMinusFxScale = xBuilder.create( + xLoc, oneMinusFx, + xBuilder.create( + xLoc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE, + xBuilder.getF32Type())); + + Value index0 = xBuilder.create(xLoc, xiv, c2); + Value index1 = xBuilder.create(xLoc, index0, c1); + + Value val0 = xBuilder.create( + xLoc, xBuilder.getI16Type(), oneMinusFxScale); + Value val1 = xBuilder.create( + xLoc, xBuilder.getI16Type(), fxScale); + + xBuilder.create(xLoc, val0, iAlpha, + ValueRange{index0}); + xBuilder.create(xLoc, val1, iAlpha, + ValueRange{index1}); + xBuilder.create(xLoc); + }); +} + +void calcInterpolation(OpBuilder &builder, Location loc, Value &sy, + Value &syPrev, Value offset, Value input, + Value outputWidth, Value iAlpha, Value buffer) { + static const int SHIFT = 11; + static const int HALF = 1 << (SHIFT - 1); + auto inElemTy = input.getType().cast().getElementType(); + Value c0 = builder.create(loc, 0); + Value c1 = builder.create(loc, 1); + Value c0I = + builder.create(loc, 0, builder.getI32Type()); + Value c1I = + builder.create(loc, 1, builder.getI32Type()); + Value c2 = builder.create(loc, 2); + Value c2I = + builder.create(loc, 2, builder.getI32Type()); + Value notEqual = + builder.create(loc, arith::CmpIPredicate::ne, sy, syPrev); + builder.create( + loc, notEqual, [&](OpBuilder &tBuilder, Location tLoc) { + tBuilder.create( + tLoc, c0, outputWidth, c1, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + Value sx = xBuilder.create(xLoc, offset, + ValueRange{xiv}); + Value sxPlus1 = xBuilder.create(xLoc, sx, c1I); + Value xMul2 = xBuilder.create(xLoc, xiv, c2); + xMul2 = xBuilder.create( + xLoc, xBuilder.getI32Type(), xMul2); + Value xMul2Plus1 = + xBuilder.create(xLoc, xMul2, c1I); + Value index0 = xBuilder.create( + xLoc, xBuilder.getIndexType(), xMul2); + Value index1 = xBuilder.create( + xLoc, xBuilder.getIndexType(), xMul2Plus1); + Value a0 = xBuilder.create(xLoc, iAlpha, + ValueRange{index0}); + Value a1 = xBuilder.create(xLoc, iAlpha, + ValueRange{index1}); + Value sxIndex = xBuilder.create( + xLoc, xBuilder.getIndexType(), sx); + Value sxPlus1Index = xBuilder.create( + xLoc, xBuilder.getIndexType(), sxPlus1); + Value syIndex = xBuilder.create( + xLoc, xBuilder.getIndexType(), sy); + Value v0 = xBuilder.create( + xLoc, input, ValueRange{syIndex, sxIndex}); + v0 = xBuilder.create(xLoc, xBuilder.getI32Type(), + v0); + Value v1 = xBuilder.create( + xLoc, input, ValueRange{syIndex, sxPlus1Index}); + v1 = xBuilder.create(xLoc, xBuilder.getI32Type(), + v1); + Value a0I32 = xBuilder.create( + xLoc, xBuilder.getI32Type(), a0); + Value a1I32 = xBuilder.create( + xLoc, xBuilder.getI32Type(), a1); + Value v0MulA0 = xBuilder.create(xLoc, v0, a0I32); + Value v1MulA1 = xBuilder.create(xLoc, v1, a1I32); + Value add1 = + xBuilder.create(xLoc, v0MulA0, v1MulA1); + Value half = xBuilder.create( + xLoc, HALF, xBuilder.getI32Type()); + Value add2 = xBuilder.create(xLoc, add1, half); + Value shift = xBuilder.create( + xLoc, SHIFT, xBuilder.getI32Type()); + Value resShifted = + xBuilder.create(xLoc, add2, shift); + xBuilder.create(xLoc, resShifted, buffer, + ValueRange{xiv}); + xBuilder.create(xLoc); + }); + // syPrev = sy; + tBuilder.create(tLoc); + }); + syPrev = builder.create(loc, notEqual, sy, syPrev); +} + +void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc, + MLIRContext *ctx, Value input, + Value output, + Value horizontalScalingFactor, + Value verticalScalingFactor) { + static const int SHIFT = 11; + static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT; + static const int HALF = 1 << (SHIFT - 1); + auto inElemTy = input.getType().cast().getElementType(); + Value cMinus1 = builder.create(loc, -1); + Value c0 = builder.create(loc, 0); + Value c1 = builder.create(loc, 1); + Value c1I = + builder.create(loc, 1, builder.getI32Type()); + Value c2 = builder.create(loc, 2); + Value c2I = + builder.create(loc, 2, builder.getI32Type()); + Value c1F = builder.create(loc, (llvm::APFloat)1.0f, + builder.getF32Type()); + + Value inputRow = builder.create(loc, input, c0); + Value inputCol = builder.create(loc, input, c1); + + Value outputRow = builder.create(loc, output, c0); + Value outputCol = builder.create(loc, output, c1); + + MemRefType dynamicTypeI32 = + MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 32)); + Value xOffset = + builder.create(loc, dynamicTypeI32, outputCol); + MemRefType dynamicTypeI16 = + MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16)); + Value outputColMul2 = builder.create(loc, outputCol, c2); + Value iAlpha = + builder.create(loc, dynamicTypeI16, outputColMul2); + + processScaling(builder, loc, outputCol, horizontalScalingFactor, inputCol, + xOffset, iAlpha); + + Value yOffset = + builder.create(loc, dynamicTypeI32, outputRow); + Value outputRowMul2 = builder.create(loc, outputRow, c2); + Value iBeta = + builder.create(loc, dynamicTypeI16, outputRowMul2); + + processScaling(builder, loc, outputRow, verticalScalingFactor, inputRow, + yOffset, iBeta); + + Value bufferWidth = outputCol; + Value buffer0 = + builder.create(loc, dynamicTypeI32, bufferWidth); + Value buffer1 = + builder.create(loc, dynamicTypeI32, bufferWidth); + Value prevSy0 = + builder.create(loc, builder.getI32Type(), cMinus1); + Value prevSy1 = + builder.create(loc, builder.getI32Type(), cMinus1); + + // builder.create( + // loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{c1}, + // [&](OpBuilder &yBuilder, Location yLoc, ValueRange ivs) { + // Value yiv = ivs[0]; + // } + // ); + builder.create( + loc, c0, outputRow, c1, std::nullopt, + [&](OpBuilder &yBuilder, Location yLoc, Value yiv, ValueRange) { + Value sy = + yBuilder.create(yLoc, yOffset, ValueRange{yiv}); + Value syNext = yBuilder.create(yLoc, sy, c1I); + calcInterpolation(yBuilder, yLoc, sy, prevSy0, xOffset, input, + outputCol, iAlpha, buffer0); + calcInterpolation(yBuilder, yLoc, syNext, prevSy1, xOffset, input, + outputCol, iAlpha, buffer1); + + Value yMul2 = yBuilder.create(yLoc, yiv, c2); + yMul2 = yBuilder.create(yLoc, yBuilder.getI32Type(), + yMul2); + Value yMul2Plus1 = yBuilder.create(yLoc, yMul2, c1I); + Value index0 = yBuilder.create( + yLoc, yBuilder.getIndexType(), yMul2); + Value index1 = yBuilder.create( + yLoc, yBuilder.getIndexType(), yMul2Plus1); + Value b0 = + yBuilder.create(yLoc, iBeta, ValueRange{index0}); + b0 = yBuilder.create(yLoc, yBuilder.getI32Type(), b0); + Value b1 = + yBuilder.create(yLoc, iBeta, ValueRange{index1}); + b1 = yBuilder.create(yLoc, yBuilder.getI32Type(), b1); + // Value b0 = yBuilder.create(yLoc, iBeta, + // ValueRange{yiv}); b0 = yBuilder.create(yLoc, + // yBuilder.getI32Type(), b0); Value yPlus1 = + // builder.create(yLoc, yiv, c1); Value b1 = + // yBuilder.create(yLoc, iBeta, ValueRange{yPlus1}); b1 + // = yBuilder.create(yLoc, yBuilder.getI32Type(), b1); + yBuilder.create( + yLoc, c0, bufferWidth, c1, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + Value buffer0X = xBuilder.create(xLoc, buffer0, + ValueRange{xiv}); + Value buffer1X = xBuilder.create(xLoc, buffer1, + ValueRange{xiv}); + Value b0MulBuffer0 = + xBuilder.create(xLoc, b0, buffer0X); + Value b1MulBuffer1 = + xBuilder.create(xLoc, b1, buffer1X); + Value add = xBuilder.create(xLoc, b0MulBuffer0, + b1MulBuffer1); + Value half = xBuilder.create( + xLoc, HALF, xBuilder.getI32Type()); + Value addHalf = xBuilder.create(xLoc, add, half); + Value shift = xBuilder.create( + xLoc, SHIFT, xBuilder.getI32Type()); + Value resShifted = + xBuilder.create(xLoc, addHalf, shift); + Value zero = xBuilder.create( + xLoc, 0, xBuilder.getI32Type()); + Value twoFiftyFive = xBuilder.create( + xLoc, 255, xBuilder.getI32Type()); + Value maxVal = + xBuilder.create(xLoc, resShifted, zero); + Value clampedVal = + xBuilder.create(xLoc, maxVal, twoFiftyFive); + FloatType type = inElemTy.isF32() + ? FloatType::getF32(xBuilder.getContext()) + : FloatType::getF64(xBuilder.getContext()); + Value clampedValF = + xBuilder.create(xLoc, type, clampedVal); + xBuilder.create(xLoc, clampedValF, output, + ValueRange{yiv, xiv}); + xBuilder.create(xLoc); + }); + yBuilder.create(yLoc); + }); +} + // Helper function for resizing an image using nearest neighbour interpolation // mechanism. void NearestNeighbourInterpolationResizing( From 91b75294e13885d1a3f5a8b9f00a6c8ebb92be88 Mon Sep 17 00:00:00 2001 From: HarryZ Date: Fri, 7 Mar 2025 14:07:23 +0800 Subject: [PATCH 2/7] [feat] vectorization of resize op --- midend/lib/Utils/DIPUtils.cpp | 397 +++++++++++++++++++++------------- 1 file changed, 245 insertions(+), 152 deletions(-) diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp index 4519ceb1d1..ae0c6da7c7 100644 --- a/midend/lib/Utils/DIPUtils.cpp +++ b/midend/lib/Utils/DIPUtils.cpp @@ -1056,89 +1056,123 @@ void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, void processScaling(OpBuilder &builder, Location loc, Value output, Value scalingFactor, Value input, Value xOffset, - Value iAlpha) { + Value iAlpha, int64_t stride) { static const int SHIFT = 11; static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT; static const int HALF = 1 << (SHIFT - 1); + VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType()); + VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type()); + VectorType vectorTyI16 = + VectorType::get({stride}, IntegerType::get(builder.getContext(), 16)); + VectorType vectorTyI32 = + VectorType::get({stride}, IntegerType::get(builder.getContext(), 32)); + VectorType vectorTyF32 = + VectorType::get({stride}, FloatType::getF32(builder.getContext())); + Value c0 = builder.create(loc, 0); Value c1 = builder.create(loc, 1); Value c2 = builder.create(loc, 2); - Value c0I32 = - builder.create(loc, builder.getI32Type(), c0); - Value c0F = builder.create(loc, (llvm::APFloat)0.0f, - builder.getF32Type()); - Value cDot5F = builder.create( - loc, (llvm::APFloat)0.5f, builder.getF32Type()); + Value c0I32Vec = builder.create( + loc, vectorTyI32, + builder.create(loc, builder.getI32Type(), c0)); + Value c0FVec = builder.create( + loc, vectorTyF32, + builder.create(loc, (llvm::APFloat)0.0f, + builder.getF32Type())); + Value cDot5FVec = builder.create( + loc, vectorTyF32, + builder.create(loc, (llvm::APFloat)0.5f, + builder.getF32Type())); + Value scalingFactorVec = + builder.create(loc, vectorTyF32, scalingFactor); Value c1F = builder.create(loc, (llvm::APFloat)1.0f, builder.getF32Type()); + Value c1FVec = builder.create(loc, vectorTyF32, c1F); Value inputMinus1 = builder.create( loc, builder.getI32Type(), builder.create(loc, input, c1)); + Value inputMinus1Vec = + builder.create(loc, vectorTyI32, inputMinus1); Value inputMinus2 = builder.create( loc, builder.getI32Type(), builder.create(loc, input, c2)); + Value inputMinus2Vec = + builder.create(loc, vectorTyI32, inputMinus2); + Value stepVec = iotaVec0F32(builder, loc, stride); + Value strideVal = builder.create(loc, stride); + Value scaleCoefVec = builder.create( + loc, vectorTyF32, + builder.create( + loc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE, + builder.getF32Type())); builder.create( - loc, c0, output, c1, std::nullopt, + loc, c0, output, strideVal, std::nullopt, [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { // float fx = (float)((dx + 0.5) * scale_x - 0.5); - Value xivF = indexToF32(xBuilder, xLoc, xiv); - Value temp1 = xBuilder.create(xLoc, xivF, cDot5F); + Value xivFVec = xBuilder.create( + xLoc, vectorTyF32, indexToF32(xBuilder, xLoc, xiv)); + xivFVec = xBuilder.create(xLoc, xivFVec, stepVec); + Value temp1 = xBuilder.create(xLoc, xivFVec, cDot5FVec); Value temp2 = - xBuilder.create(xLoc, temp1, scalingFactor); - Value fx = xBuilder.create(xLoc, temp2, cDot5F); - Value sx = - xBuilder.create(xLoc, xBuilder.getI32Type(), fx); + xBuilder.create(xLoc, temp1, scalingFactorVec); + Value fx = xBuilder.create(xLoc, temp2, cDot5FVec); + Value sx = xBuilder.create(xLoc, vectorTyI32, fx); fx = xBuilder.create( - xLoc, fx, - xBuilder.create(xLoc, xBuilder.getF32Type(), sx)); + xLoc, fx, xBuilder.create(xLoc, vectorTyF32, sx)); Value lowerThanZero = xBuilder.create( - xLoc, arith::CmpIPredicate::slt, sx, c0I32); + xLoc, arith::CmpIPredicate::slt, sx, c0I32Vec); Value greaterThan = xBuilder.create( - xLoc, arith::CmpIPredicate::sge, sx, inputMinus1); - sx = xBuilder.create(xLoc, lowerThanZero, c0I32, sx); - fx = xBuilder.create(xLoc, lowerThanZero, c0F, fx); - sx = xBuilder.create(xLoc, greaterThan, inputMinus2, + xLoc, arith::CmpIPredicate::sge, sx, inputMinus1Vec); + sx = + xBuilder.create(xLoc, lowerThanZero, c0I32Vec, sx); + fx = xBuilder.create(xLoc, lowerThanZero, c0FVec, fx); + sx = xBuilder.create(xLoc, greaterThan, inputMinus2Vec, sx); - fx = xBuilder.create(xLoc, greaterThan, c0F, fx); - xBuilder.create(xLoc, sx, xOffset, ValueRange{xiv}); + fx = xBuilder.create(xLoc, greaterThan, c0FVec, fx); + Value maskVal = xBuilder.create(xLoc, output, xiv); + Value maskVec = + xBuilder.create(xLoc, vectorTyI1, maskVal); + xBuilder.create(xLoc, xOffset, ValueRange{xiv}, + maskVec, sx); // ialpha[dx * 2] = (short)((1.f - fx) * INTER_RESIZE_COEF_SCALE); // ialpha[dx * 2 + 1] = (short)(fx * INTER_RESIZE_COEF_SCALE); - Value fxScale = xBuilder.create( - xLoc, fx, - xBuilder.create( - xLoc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE, - xBuilder.getF32Type())); - Value oneMinusFx = xBuilder.create(xLoc, c1F, fx); - Value oneMinusFxScale = xBuilder.create( - xLoc, oneMinusFx, - xBuilder.create( - xLoc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE, - xBuilder.getF32Type())); + Value fxScale = xBuilder.create(xLoc, fx, scaleCoefVec); + Value oneMinusFx = xBuilder.create(xLoc, c1FVec, fx); + Value oneMinusFxScale = + xBuilder.create(xLoc, oneMinusFx, scaleCoefVec); Value index0 = xBuilder.create(xLoc, xiv, c2); Value index1 = xBuilder.create(xLoc, index0, c1); - Value val0 = xBuilder.create( - xLoc, xBuilder.getI16Type(), oneMinusFxScale); - Value val1 = xBuilder.create( - xLoc, xBuilder.getI16Type(), fxScale); + Value val0 = xBuilder.create(xLoc, vectorTyI16, + oneMinusFxScale); + Value val1 = + xBuilder.create(xLoc, vectorTyI16, fxScale); - xBuilder.create(xLoc, val0, iAlpha, + SmallVector maskVec1; + for (int i = 0; i < stride; i++) { + maskVec1.push_back(i); + maskVec1.push_back(i + stride); + } + Value storeBack = + xBuilder.create(xLoc, val0, val1, maskVec1); + xBuilder.create(xLoc, storeBack, iAlpha, ValueRange{index0}); - xBuilder.create(xLoc, val1, iAlpha, - ValueRange{index1}); xBuilder.create(xLoc); }); } void calcInterpolation(OpBuilder &builder, Location loc, Value &sy, Value &syPrev, Value offset, Value input, - Value outputWidth, Value iAlpha, Value buffer) { + Value outputWidth, Value iAlpha, Value buffer, + int64_t stride) { static const int SHIFT = 11; static const int HALF = 1 << (SHIFT - 1); auto inElemTy = input.getType().cast().getElementType(); Value c0 = builder.create(loc, 0); Value c1 = builder.create(loc, 1); + Value c1I1 = + builder.create(loc, 1, builder.getI1Type()); Value c0I = builder.create(loc, 0, builder.getI32Type()); Value c1I = @@ -1146,115 +1180,169 @@ void calcInterpolation(OpBuilder &builder, Location loc, Value &sy, Value c2 = builder.create(loc, 2); Value c2I = builder.create(loc, 2, builder.getI32Type()); + + Value strideVal = builder.create(loc, stride); + Value strideInt = + builder.create(loc, builder.getI32Type(), strideVal); + Value outputStrideRatio = + builder.create(loc, outputWidth, strideVal); + Value outputMultiple = builder.create( + loc, builder.create(loc, outputStrideRatio, c1), + strideVal); + VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType()); + VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type()); + VectorType vectorTyI16 = + VectorType::get({stride}, IntegerType::get(builder.getContext(), 16)); + VectorType vectorTyI32 = + VectorType::get({stride}, IntegerType::get(builder.getContext(), 32)); + VectorType vectorTyF32 = + VectorType::get({stride}, FloatType::getF32(builder.getContext())); + + Value c1I1Vec = builder.create(loc, vectorTyI1, c1I1); + Value c1IVec = builder.create(loc, vectorTyI32, c1I); + Value c2IVec = builder.create(loc, vectorTyI32, c2I); + Value half = + builder.create(loc, HALF, builder.getI32Type()); + Value halfVec = builder.create(loc, vectorTyI32, half); + Value shift = + builder.create(loc, SHIFT, builder.getI32Type()); + Value shiftVec = builder.create(loc, vectorTyI32, shift); + + auto passThruConstantOp = + builder.create(loc, builder.getZeroAttr(vectorTyF32)); + auto passThruConstantOpI32 = + builder.create(loc, builder.getZeroAttr(vectorTyI32)); + auto passThruConstantOpI16 = + builder.create(loc, builder.getZeroAttr(vectorTyI16)); Value notEqual = builder.create(loc, arith::CmpIPredicate::ne, sy, syPrev); - builder.create( - loc, notEqual, [&](OpBuilder &tBuilder, Location tLoc) { - tBuilder.create( - tLoc, c0, outputWidth, c1, std::nullopt, - [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { - Value sx = xBuilder.create(xLoc, offset, - ValueRange{xiv}); - Value sxPlus1 = xBuilder.create(xLoc, sx, c1I); - Value xMul2 = xBuilder.create(xLoc, xiv, c2); - xMul2 = xBuilder.create( - xLoc, xBuilder.getI32Type(), xMul2); - Value xMul2Plus1 = - xBuilder.create(xLoc, xMul2, c1I); - Value index0 = xBuilder.create( - xLoc, xBuilder.getIndexType(), xMul2); - Value index1 = xBuilder.create( - xLoc, xBuilder.getIndexType(), xMul2Plus1); - Value a0 = xBuilder.create(xLoc, iAlpha, - ValueRange{index0}); - Value a1 = xBuilder.create(xLoc, iAlpha, - ValueRange{index1}); - Value sxIndex = xBuilder.create( - xLoc, xBuilder.getIndexType(), sx); - Value sxPlus1Index = xBuilder.create( - xLoc, xBuilder.getIndexType(), sxPlus1); - Value syIndex = xBuilder.create( - xLoc, xBuilder.getIndexType(), sy); - Value v0 = xBuilder.create( - xLoc, input, ValueRange{syIndex, sxIndex}); - v0 = xBuilder.create(xLoc, xBuilder.getI32Type(), - v0); - Value v1 = xBuilder.create( - xLoc, input, ValueRange{syIndex, sxPlus1Index}); - v1 = xBuilder.create(xLoc, xBuilder.getI32Type(), - v1); - Value a0I32 = xBuilder.create( - xLoc, xBuilder.getI32Type(), a0); - Value a1I32 = xBuilder.create( - xLoc, xBuilder.getI32Type(), a1); - Value v0MulA0 = xBuilder.create(xLoc, v0, a0I32); - Value v1MulA1 = xBuilder.create(xLoc, v1, a1I32); - Value add1 = - xBuilder.create(xLoc, v0MulA0, v1MulA1); - Value half = xBuilder.create( - xLoc, HALF, xBuilder.getI32Type()); - Value add2 = xBuilder.create(xLoc, add1, half); - Value shift = xBuilder.create( - xLoc, SHIFT, xBuilder.getI32Type()); - Value resShifted = - xBuilder.create(xLoc, add2, shift); - xBuilder.create(xLoc, resShifted, buffer, - ValueRange{xiv}); - xBuilder.create(xLoc); - }); - // syPrev = sy; - tBuilder.create(tLoc); - }); + builder.create< + scf::IfOp>(loc, notEqual, [&](OpBuilder &tBuilder, Location tLoc) { + tBuilder.create( + tLoc, c0, outputMultiple, strideVal, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + Value maskVal = + xBuilder.create(xLoc, outputWidth, xiv); + Value maskVec = + xBuilder.create(xLoc, vectorTyI1, maskVal); + Value sxVec = xBuilder.create( + xLoc, vectorTyI32, offset, ValueRange{xiv}, maskVec, + passThruConstantOpI32); + Value sxPlus1Vec = + xBuilder.create(xLoc, sxVec, c1IVec); + Value index0 = xBuilder.create(xLoc, xiv, c2); + Value index1 = + xBuilder.create(xLoc, index0, strideVal); + Value a0Vec = xBuilder.create(xLoc, vectorTyI16, + iAlpha, index0); + Value a1Vec = xBuilder.create(xLoc, vectorTyI16, + iAlpha, index1); + Value sxVecIndex = + xBuilder.create(xLoc, vectorTyIndex, sxVec); + Value sxPlus1VecIndex = xBuilder.create( + xLoc, vectorTyIndex, sxPlus1Vec); + Value syIndex = xBuilder.create( + xLoc, xBuilder.getIndexType(), sy); + Value v0Vec = xBuilder.create( + xLoc, vectorTyF32, input, ValueRange{syIndex, c0}, sxVecIndex, + c1I1Vec, passThruConstantOp); + v0Vec = xBuilder.create(xLoc, vectorTyI32, v0Vec); + Value v1Vec = xBuilder.create( + xLoc, vectorTyF32, input, ValueRange{syIndex, c0}, + sxPlus1VecIndex, c1I1Vec, passThruConstantOp); + v1Vec = xBuilder.create(xLoc, vectorTyI32, v1Vec); + + a0Vec = xBuilder.create(xLoc, vectorTyI32, a0Vec); + a1Vec = xBuilder.create(xLoc, vectorTyI32, a1Vec); + SmallVector maskVec1, maskVec2; + for (int i = 0; i < stride; i++) { + maskVec1.push_back(i * 2); + maskVec2.push_back(i * 2 + 1); + } + Value adder1 = + xBuilder.create(xLoc, a0Vec, a1Vec, maskVec1); + Value adder2 = + xBuilder.create(xLoc, a0Vec, a1Vec, maskVec2); + Value aMulv0 = xBuilder.create(xLoc, adder1, v0Vec); + Value aMulv1 = xBuilder.create(xLoc, adder2, v1Vec); + Value adder = xBuilder.create(xLoc, aMulv0, aMulv1); + adder = builder.create(loc, adder, halfVec); + Value resShifted = + xBuilder.create(xLoc, adder, shiftVec); + xBuilder.create(xLoc, buffer, ValueRange{xiv}, + maskVec, resShifted); + xBuilder.create(xLoc); + }); + tBuilder.create(tLoc); + }); syPrev = builder.create(loc, notEqual, sy, syPrev); } void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, - Value output, + Value output, int64_t stride, Value horizontalScalingFactor, Value verticalScalingFactor) { static const int SHIFT = 11; static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT; static const int HALF = 1 << (SHIFT - 1); auto inElemTy = input.getType().cast().getElementType(); + VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType()); + VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type()); + VectorType vectorTyI32 = + VectorType::get({stride}, IntegerType::get(builder.getContext(), 32)); + VectorType vectorTyF = VectorType::get( + {stride}, inElemTy.isF32() ? FloatType::getF32(builder.getContext()) + : FloatType::getF64(builder.getContext())); + auto passThruConstantOpI32 = + builder.create(loc, builder.getZeroAttr(vectorTyI32)); Value cMinus1 = builder.create(loc, -1); Value c0 = builder.create(loc, 0); Value c1 = builder.create(loc, 1); - Value c1I = + Value c1I32 = builder.create(loc, 1, builder.getI32Type()); Value c2 = builder.create(loc, 2); - Value c2I = - builder.create(loc, 2, builder.getI32Type()); - Value c1F = builder.create(loc, (llvm::APFloat)1.0f, - builder.getF32Type()); - Value inputRow = builder.create(loc, input, c0); Value inputCol = builder.create(loc, input, c1); Value outputRow = builder.create(loc, output, c0); Value outputCol = builder.create(loc, output, c1); + Value strideVal = builder.create(loc, stride); + Value outputColStrideRatio = + builder.create(loc, outputCol, strideVal); + Value outputColMultiple = builder.create( + loc, builder.create(loc, outputColStrideRatio, c1), + strideVal); + Value outputColMultiple2 = + builder.create(loc, outputColMultiple, c2); + Value outputRowStrideRatio = + builder.create(loc, outputRow, strideVal); + Value outputRowMultiple = builder.create( + loc, builder.create(loc, outputRowStrideRatio, c1), + strideVal); + Value outputRowMultiple2 = + builder.create(loc, outputRowMultiple, c2); + MemRefType dynamicTypeI32 = MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 32)); Value xOffset = builder.create(loc, dynamicTypeI32, outputCol); MemRefType dynamicTypeI16 = MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16)); - Value outputColMul2 = builder.create(loc, outputCol, c2); Value iAlpha = - builder.create(loc, dynamicTypeI16, outputColMul2); + builder.create(loc, dynamicTypeI16, outputColMultiple2); processScaling(builder, loc, outputCol, horizontalScalingFactor, inputCol, - xOffset, iAlpha); + xOffset, iAlpha, stride); Value yOffset = builder.create(loc, dynamicTypeI32, outputRow); - Value outputRowMul2 = builder.create(loc, outputRow, c2); Value iBeta = - builder.create(loc, dynamicTypeI16, outputRowMul2); + builder.create(loc, dynamicTypeI16, outputRowMultiple2); processScaling(builder, loc, outputRow, verticalScalingFactor, inputRow, - yOffset, iBeta); + yOffset, iBeta, stride); Value bufferWidth = outputCol; Value buffer0 = @@ -1266,6 +1354,19 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc, Value prevSy1 = builder.create(loc, builder.getI32Type(), cMinus1); + Value halfVec = builder.create( + loc, vectorTyI32, + builder.create(loc, HALF, builder.getI32Type())); + Value shiftVec = builder.create( + loc, vectorTyI32, + builder.create(loc, SHIFT, builder.getI32Type())); + Value zeroVec = builder.create( + loc, vectorTyI32, + builder.create(loc, 0, builder.getI32Type())); + Value twoFiftyFiveVec = builder.create( + loc, vectorTyI32, + builder.create(loc, 255, builder.getI32Type())); + // builder.create( // loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{c1}, // [&](OpBuilder &yBuilder, Location yLoc, ValueRange ivs) { @@ -1277,67 +1378,59 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc, [&](OpBuilder &yBuilder, Location yLoc, Value yiv, ValueRange) { Value sy = yBuilder.create(yLoc, yOffset, ValueRange{yiv}); - Value syNext = yBuilder.create(yLoc, sy, c1I); + Value syNext = yBuilder.create(yLoc, sy, c1I32); calcInterpolation(yBuilder, yLoc, sy, prevSy0, xOffset, input, - outputCol, iAlpha, buffer0); + outputCol, iAlpha, buffer0, stride); calcInterpolation(yBuilder, yLoc, syNext, prevSy1, xOffset, input, - outputCol, iAlpha, buffer1); + outputCol, iAlpha, buffer1, stride); + // calc index Value yMul2 = yBuilder.create(yLoc, yiv, c2); yMul2 = yBuilder.create(yLoc, yBuilder.getI32Type(), yMul2); - Value yMul2Plus1 = yBuilder.create(yLoc, yMul2, c1I); + Value yMul2Plus1 = yBuilder.create(yLoc, yMul2, c1I32); Value index0 = yBuilder.create( yLoc, yBuilder.getIndexType(), yMul2); Value index1 = yBuilder.create( yLoc, yBuilder.getIndexType(), yMul2Plus1); - Value b0 = - yBuilder.create(yLoc, iBeta, ValueRange{index0}); + Value b0 = yBuilder.create(yLoc, iBeta, index0); b0 = yBuilder.create(yLoc, yBuilder.getI32Type(), b0); - Value b1 = - yBuilder.create(yLoc, iBeta, ValueRange{index1}); + Value b0Vec = yBuilder.create(yLoc, vectorTyI32, b0); + Value b1 = yBuilder.create(yLoc, iBeta, index1); b1 = yBuilder.create(yLoc, yBuilder.getI32Type(), b1); - // Value b0 = yBuilder.create(yLoc, iBeta, - // ValueRange{yiv}); b0 = yBuilder.create(yLoc, - // yBuilder.getI32Type(), b0); Value yPlus1 = - // builder.create(yLoc, yiv, c1); Value b1 = - // yBuilder.create(yLoc, iBeta, ValueRange{yPlus1}); b1 - // = yBuilder.create(yLoc, yBuilder.getI32Type(), b1); + Value b1Vec = yBuilder.create(yLoc, vectorTyI32, b1); + yBuilder.create( - yLoc, c0, bufferWidth, c1, std::nullopt, + yLoc, c0, bufferWidth, strideVal, std::nullopt, [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { - Value buffer0X = xBuilder.create(xLoc, buffer0, - ValueRange{xiv}); - Value buffer1X = xBuilder.create(xLoc, buffer1, - ValueRange{xiv}); + Value maskVal = + xBuilder.create(xLoc, bufferWidth, xiv); + Value maskVec = xBuilder.create( + xLoc, vectorTyI1, maskVal); + Value buffer0X = xBuilder.create( + xLoc, vectorTyI32, buffer0, ValueRange{xiv}, maskVec, + passThruConstantOpI32); + Value buffer1X = xBuilder.create( + xLoc, vectorTyI32, buffer1, ValueRange{xiv}, maskVec, + passThruConstantOpI32); Value b0MulBuffer0 = - xBuilder.create(xLoc, b0, buffer0X); + xBuilder.create(xLoc, b0Vec, buffer0X); Value b1MulBuffer1 = - xBuilder.create(xLoc, b1, buffer1X); + xBuilder.create(xLoc, b1Vec, buffer1X); Value add = xBuilder.create(xLoc, b0MulBuffer0, b1MulBuffer1); - Value half = xBuilder.create( - xLoc, HALF, xBuilder.getI32Type()); - Value addHalf = xBuilder.create(xLoc, add, half); - Value shift = xBuilder.create( - xLoc, SHIFT, xBuilder.getI32Type()); + Value addHalf = + xBuilder.create(xLoc, add, halfVec); Value resShifted = - xBuilder.create(xLoc, addHalf, shift); - Value zero = xBuilder.create( - xLoc, 0, xBuilder.getI32Type()); - Value twoFiftyFive = xBuilder.create( - xLoc, 255, xBuilder.getI32Type()); + xBuilder.create(xLoc, addHalf, shiftVec); Value maxVal = - xBuilder.create(xLoc, resShifted, zero); - Value clampedVal = - xBuilder.create(xLoc, maxVal, twoFiftyFive); - FloatType type = inElemTy.isF32() - ? FloatType::getF32(xBuilder.getContext()) - : FloatType::getF64(xBuilder.getContext()); + xBuilder.create(xLoc, resShifted, zeroVec); + Value clampedVal = xBuilder.create( + xLoc, maxVal, twoFiftyFiveVec); Value clampedValF = - xBuilder.create(xLoc, type, clampedVal); - xBuilder.create(xLoc, clampedValF, output, - ValueRange{yiv, xiv}); + xBuilder.create(xLoc, vectorTyF, clampedVal); + xBuilder.create( + xLoc, output, ValueRange{yiv, xiv}, maskVec, clampedValF); xBuilder.create(xLoc); }); yBuilder.create(yLoc); From d220a8ed64f47bdd19896608534366f82eb2eb9e Mon Sep 17 00:00:00 2001 From: HarryZ Date: Fri, 7 Mar 2025 19:54:59 +0800 Subject: [PATCH 3/7] [feat] parallel resize2d op --- .../lib/Conversion/LowerDIP/LowerDIPPass.cpp | 108 ++--- midend/lib/Utils/DIPUtils.cpp | 440 ++++++++---------- 2 files changed, 233 insertions(+), 315 deletions(-) diff --git a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp index ef7fc9b9b0..3cd65126de 100644 --- a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp +++ b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp @@ -314,85 +314,45 @@ class DIPResize2DOpLowering : public OpRewritePattern { << inElemTy << "is passed"; } - Value c0 = rewriter.create(loc, 0); - Value c1 = rewriter.create(loc, 1); - Value c0F32 = indexToF32(rewriter, loc, c0); - - Value inputRow = rewriter.create(loc, input, c0); - Value inputCol = rewriter.create(loc, input, c1); - - Value outputRow = rewriter.create(loc, output, c0); - Value outputCol = rewriter.create(loc, output, c1); - - // Determine lower bound for second call of resize function (this is done - // for efficient tail processing). - Value outputColStrideRatio = - rewriter.create(loc, outputCol, strideVal); - Value outputColMultiple = - rewriter.create(loc, strideVal, outputColStrideRatio); - - SmallVector lowerBounds1{c0, c0}; - SmallVector upperBounds1{outputRow, outputColMultiple}; - - SmallVector steps{1, stride}; - Value strideTailVal = - rewriter.create(loc, outputCol, outputColMultiple); - - SmallVector lowerBounds2{c0, outputColMultiple}; - SmallVector upperBounds2{outputRow, outputCol}; - - FloatType f32 = FloatType::getF32(ctx); - VectorType vectorTy32 = VectorType::get({stride}, f32); - - Value horizontalScalingFactorVec = rewriter.create( - loc, vectorTy32, horizontalScalingFactor); - Value verticalScalingFactorVec = rewriter.create( - loc, vectorTy32, verticalScalingFactor); - - // Obtain extreme allocatable value(s) in input and output for bounding - // purpose. - Value inputRowLastElem = rewriter.create(loc, inputRow, c1); - Value inputRowLastElemF32 = indexToF32(rewriter, loc, inputRowLastElem); - - Value inputColLastElem = rewriter.create(loc, inputCol, c1); - Value inputColLastElemF32 = indexToF32(rewriter, loc, inputColLastElem); - - Value outputRowLastElem = - rewriter.create(loc, outputRow, c1); - Value outputRowLastElemF32 = indexToF32(rewriter, loc, outputRowLastElem); - - Value outputColLastElem = - rewriter.create(loc, outputCol, c1); - Value outputColLastElemF32 = indexToF32(rewriter, loc, outputColLastElem); + VectorType vectorTyI1 = VectorType::get({stride}, rewriter.getI1Type()); + VectorType vectorTyI16 = + VectorType::get({stride}, IntegerType::get(rewriter.getContext(), 16)); + VectorType vectorTyI32 = + VectorType::get({stride}, IntegerType::get(rewriter.getContext(), 32)); + VectorType vectorTyF32 = + VectorType::get({stride}, FloatType::getF32(rewriter.getContext())); + VectorType vectorTyIndex = + VectorType::get({stride}, rewriter.getIndexType()); + VectorType vectorResTy = VectorType::get( + {stride}, inElemTy.isF32() ? FloatType::getF32(rewriter.getContext()) + : FloatType::getF64(rewriter.getContext())); + + static const int SHIFT = 11; + static const int HALF = 1 << (SHIFT - 1); + static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT; + Value half = + rewriter.create(loc, HALF, rewriter.getI32Type()); + Value halfVec = rewriter.create(loc, vectorTyI32, half); + Value shift = rewriter.create(loc, SHIFT, + rewriter.getI32Type()); + Value shiftVec = rewriter.create(loc, vectorTyI32, shift); + Value scaleVec = rewriter.create( + loc, vectorTyF32, + rewriter.create( + loc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE, + rewriter.getF32Type())); if (interpolationAttr == dip::InterpolationType::NearestNeighbourInterpolation) { - dip::NearestNeighbourInterpolationResizing( - rewriter, loc, ctx, lowerBounds1, upperBounds1, steps, strideVal, - input, output, horizontalScalingFactorVec, verticalScalingFactorVec, - outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32, - inputColLastElemF32, vectorTy32, stride, c0, c0F32); - - dip::NearestNeighbourInterpolationResizing( - rewriter, loc, ctx, lowerBounds2, upperBounds2, steps, strideTailVal, - input, output, horizontalScalingFactorVec, verticalScalingFactorVec, - outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32, - inputColLastElemF32, vectorTy32, stride, c0, c0F32); + dip::NearestNeighbourInterpolationResizingNew( + rewriter, loc, ctx, input, output, horizontalScalingFactor, + verticalScalingFactor); } else if (interpolationAttr == dip::InterpolationType::BilinearInterpolation) { - Value c1F32 = indexToF32(rewriter, loc, c1); - - dip::BilinearInterpolationResizing( - rewriter, loc, ctx, lowerBounds1, upperBounds1, steps, strideVal, - input, output, horizontalScalingFactorVec, verticalScalingFactorVec, - outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32, - inputColLastElemF32, vectorTy32, stride, c0, c0F32, c1F32); - - dip::BilinearInterpolationResizing( - rewriter, loc, ctx, lowerBounds2, upperBounds2, steps, strideTailVal, - input, output, horizontalScalingFactorVec, verticalScalingFactorVec, - outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32, - inputColLastElemF32, vectorTy32, stride, c0, c0F32, c1F32); + dip::BilinearInterpolationResizingNew( + rewriter, loc, ctx, input, output, stride, horizontalScalingFactor, + verticalScalingFactor, halfVec, shiftVec, scaleVec, vectorResTy, + vectorTyI32, vectorTyI16, vectorTyIndex, vectorTyF32, vectorTyI1); } // Remove the original resize operation. diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp index ae0c6da7c7..256624055f 100644 --- a/midend/lib/Utils/DIPUtils.cpp +++ b/midend/lib/Utils/DIPUtils.cpp @@ -1056,20 +1056,9 @@ void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, void processScaling(OpBuilder &builder, Location loc, Value output, Value scalingFactor, Value input, Value xOffset, - Value iAlpha, int64_t stride) { - static const int SHIFT = 11; - static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT; - static const int HALF = 1 << (SHIFT - 1); - - VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType()); - VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type()); - VectorType vectorTyI16 = - VectorType::get({stride}, IntegerType::get(builder.getContext(), 16)); - VectorType vectorTyI32 = - VectorType::get({stride}, IntegerType::get(builder.getContext(), 32)); - VectorType vectorTyF32 = - VectorType::get({stride}, FloatType::getF32(builder.getContext())); - + Value iAlpha, int64_t stride, Value scaleVec, + VectorType vectorTyI32, VectorType vectorTyF32, + VectorType vectorTyI1, VectorType vectorTyI16) { Value c0 = builder.create(loc, 0); Value c1 = builder.create(loc, 1); Value c2 = builder.create(loc, 2); @@ -1080,15 +1069,17 @@ void processScaling(OpBuilder &builder, Location loc, Value output, loc, vectorTyF32, builder.create(loc, (llvm::APFloat)0.0f, builder.getF32Type())); + Value c1FVec = builder.create( + loc, vectorTyF32, + builder.create(loc, (llvm::APFloat)1.0f, + builder.getF32Type())); Value cDot5FVec = builder.create( loc, vectorTyF32, builder.create(loc, (llvm::APFloat)0.5f, builder.getF32Type())); Value scalingFactorVec = builder.create(loc, vectorTyF32, scalingFactor); - Value c1F = builder.create(loc, (llvm::APFloat)1.0f, - builder.getF32Type()); - Value c1FVec = builder.create(loc, vectorTyF32, c1F); + Value inputMinus1 = builder.create( loc, builder.getI32Type(), builder.create(loc, input, c1)); Value inputMinus1Vec = @@ -1099,11 +1090,7 @@ void processScaling(OpBuilder &builder, Location loc, Value output, builder.create(loc, vectorTyI32, inputMinus2); Value stepVec = iotaVec0F32(builder, loc, stride); Value strideVal = builder.create(loc, stride); - Value scaleCoefVec = builder.create( - loc, vectorTyF32, - builder.create( - loc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE, - builder.getF32Type())); + builder.create( loc, c0, output, strideVal, std::nullopt, [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { @@ -1129,35 +1116,32 @@ void processScaling(OpBuilder &builder, Location loc, Value output, sx); fx = xBuilder.create(xLoc, greaterThan, c0FVec, fx); Value maskVal = xBuilder.create(xLoc, output, xiv); - Value maskVec = - xBuilder.create(xLoc, vectorTyI1, maskVal); - xBuilder.create(xLoc, xOffset, ValueRange{xiv}, - maskVec, sx); + xBuilder.create( + xLoc, xOffset, ValueRange{xiv}, + xBuilder.create(xLoc, vectorTyI1, maskVal), + sx); // ialpha[dx * 2] = (short)((1.f - fx) * INTER_RESIZE_COEF_SCALE); // ialpha[dx * 2 + 1] = (short)(fx * INTER_RESIZE_COEF_SCALE); - Value fxScale = xBuilder.create(xLoc, fx, scaleCoefVec); + Value fxScale = xBuilder.create(xLoc, fx, scaleVec); Value oneMinusFx = xBuilder.create(xLoc, c1FVec, fx); Value oneMinusFxScale = - xBuilder.create(xLoc, oneMinusFx, scaleCoefVec); - - Value index0 = xBuilder.create(xLoc, xiv, c2); - Value index1 = xBuilder.create(xLoc, index0, c1); + xBuilder.create(xLoc, oneMinusFx, scaleVec); Value val0 = xBuilder.create(xLoc, vectorTyI16, oneMinusFxScale); Value val1 = xBuilder.create(xLoc, vectorTyI16, fxScale); - SmallVector maskVec1; + SmallVector maskVec; for (int i = 0; i < stride; i++) { - maskVec1.push_back(i); - maskVec1.push_back(i + stride); + maskVec.push_back(i); + maskVec.push_back(i + stride); } - Value storeBack = - xBuilder.create(xLoc, val0, val1, maskVec1); - xBuilder.create(xLoc, storeBack, iAlpha, - ValueRange{index0}); + Value index0 = xBuilder.create(xLoc, xiv, c2); + xBuilder.create( + xLoc, xBuilder.create(xLoc, val0, val1, maskVec), + iAlpha, ValueRange{index0}); xBuilder.create(xLoc); }); } @@ -1165,136 +1149,104 @@ void processScaling(OpBuilder &builder, Location loc, Value output, void calcInterpolation(OpBuilder &builder, Location loc, Value &sy, Value &syPrev, Value offset, Value input, Value outputWidth, Value iAlpha, Value buffer, - int64_t stride) { - static const int SHIFT = 11; - static const int HALF = 1 << (SHIFT - 1); - auto inElemTy = input.getType().cast().getElementType(); + int64_t stride, Value halfVec, Value shiftVec, + VectorType vectorResTy, VectorType vectorTyIndex, + VectorType vectorTyI32, VectorType vectorTyI16, + VectorType vectorTyI1) { Value c0 = builder.create(loc, 0); Value c1 = builder.create(loc, 1); - Value c1I1 = - builder.create(loc, 1, builder.getI1Type()); - Value c0I = - builder.create(loc, 0, builder.getI32Type()); - Value c1I = - builder.create(loc, 1, builder.getI32Type()); Value c2 = builder.create(loc, 2); - Value c2I = - builder.create(loc, 2, builder.getI32Type()); Value strideVal = builder.create(loc, stride); - Value strideInt = - builder.create(loc, builder.getI32Type(), strideVal); - Value outputStrideRatio = - builder.create(loc, outputWidth, strideVal); - Value outputMultiple = builder.create( - loc, builder.create(loc, outputStrideRatio, c1), - strideVal); - VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType()); - VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type()); - VectorType vectorTyI16 = - VectorType::get({stride}, IntegerType::get(builder.getContext(), 16)); - VectorType vectorTyI32 = - VectorType::get({stride}, IntegerType::get(builder.getContext(), 32)); - VectorType vectorTyF32 = - VectorType::get({stride}, FloatType::getF32(builder.getContext())); - - Value c1I1Vec = builder.create(loc, vectorTyI1, c1I1); - Value c1IVec = builder.create(loc, vectorTyI32, c1I); - Value c2IVec = builder.create(loc, vectorTyI32, c2I); - Value half = - builder.create(loc, HALF, builder.getI32Type()); - Value halfVec = builder.create(loc, vectorTyI32, half); - Value shift = - builder.create(loc, SHIFT, builder.getI32Type()); - Value shiftVec = builder.create(loc, vectorTyI32, shift); - - auto passThruConstantOp = - builder.create(loc, builder.getZeroAttr(vectorTyF32)); - auto passThruConstantOpI32 = + + Value c1I1Vec = builder.create( + loc, vectorTyI1, + builder.create(loc, 1, builder.getI1Type())); + Value c1Vec = builder.create(loc, vectorTyIndex, c1); + + auto passThruRes = + builder.create(loc, builder.getZeroAttr(vectorResTy)); + auto passThruI32 = builder.create(loc, builder.getZeroAttr(vectorTyI32)); - auto passThruConstantOpI16 = - builder.create(loc, builder.getZeroAttr(vectorTyI16)); Value notEqual = builder.create(loc, arith::CmpIPredicate::ne, sy, syPrev); - builder.create< - scf::IfOp>(loc, notEqual, [&](OpBuilder &tBuilder, Location tLoc) { - tBuilder.create( - tLoc, c0, outputMultiple, strideVal, std::nullopt, - [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { - Value maskVal = - xBuilder.create(xLoc, outputWidth, xiv); - Value maskVec = - xBuilder.create(xLoc, vectorTyI1, maskVal); - Value sxVec = xBuilder.create( - xLoc, vectorTyI32, offset, ValueRange{xiv}, maskVec, - passThruConstantOpI32); - Value sxPlus1Vec = - xBuilder.create(xLoc, sxVec, c1IVec); - Value index0 = xBuilder.create(xLoc, xiv, c2); - Value index1 = - xBuilder.create(xLoc, index0, strideVal); - Value a0Vec = xBuilder.create(xLoc, vectorTyI16, - iAlpha, index0); - Value a1Vec = xBuilder.create(xLoc, vectorTyI16, - iAlpha, index1); - Value sxVecIndex = - xBuilder.create(xLoc, vectorTyIndex, sxVec); - Value sxPlus1VecIndex = xBuilder.create( - xLoc, vectorTyIndex, sxPlus1Vec); - Value syIndex = xBuilder.create( - xLoc, xBuilder.getIndexType(), sy); - Value v0Vec = xBuilder.create( - xLoc, vectorTyF32, input, ValueRange{syIndex, c0}, sxVecIndex, - c1I1Vec, passThruConstantOp); - v0Vec = xBuilder.create(xLoc, vectorTyI32, v0Vec); - Value v1Vec = xBuilder.create( - xLoc, vectorTyF32, input, ValueRange{syIndex, c0}, - sxPlus1VecIndex, c1I1Vec, passThruConstantOp); - v1Vec = xBuilder.create(xLoc, vectorTyI32, v1Vec); - - a0Vec = xBuilder.create(xLoc, vectorTyI32, a0Vec); - a1Vec = xBuilder.create(xLoc, vectorTyI32, a1Vec); - SmallVector maskVec1, maskVec2; - for (int i = 0; i < stride; i++) { - maskVec1.push_back(i * 2); - maskVec2.push_back(i * 2 + 1); - } - Value adder1 = - xBuilder.create(xLoc, a0Vec, a1Vec, maskVec1); - Value adder2 = - xBuilder.create(xLoc, a0Vec, a1Vec, maskVec2); - Value aMulv0 = xBuilder.create(xLoc, adder1, v0Vec); - Value aMulv1 = xBuilder.create(xLoc, adder2, v1Vec); - Value adder = xBuilder.create(xLoc, aMulv0, aMulv1); - adder = builder.create(loc, adder, halfVec); - Value resShifted = - xBuilder.create(xLoc, adder, shiftVec); - xBuilder.create(xLoc, buffer, ValueRange{xiv}, - maskVec, resShifted); - xBuilder.create(xLoc); - }); - tBuilder.create(tLoc); - }); + builder.create( + loc, notEqual, [&](OpBuilder &tBuilder, Location tLoc) { + tBuilder.create( + tLoc, c0, outputWidth, strideVal, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + Value maskVal = + xBuilder.create(xLoc, outputWidth, xiv); + Value maskVec = xBuilder.create( + xLoc, vectorTyI1, maskVal); + Value sxVec = xBuilder.create( + xLoc, vectorTyIndex, + xBuilder.create(xLoc, vectorTyI32, + offset, ValueRange{xiv}, + maskVec, passThruI32)); + Value sxPlus1Vec = + xBuilder.create(xLoc, sxVec, c1Vec); + + Value index0 = xBuilder.create(xLoc, xiv, c2); + Value index1 = + xBuilder.create(xLoc, index0, strideVal); + Value a0Vec = xBuilder.create( + xLoc, vectorTyI32, + xBuilder.create(xLoc, vectorTyI16, iAlpha, + index0)); + Value a1Vec = xBuilder.create( + xLoc, vectorTyI32, + xBuilder.create(xLoc, vectorTyI16, iAlpha, + index1)); + + Value syIndex = xBuilder.create( + xLoc, xBuilder.getIndexType(), sy); + Value v0Vec = xBuilder.create( + xLoc, vectorTyI32, + xBuilder.create( + xLoc, vectorResTy, input, ValueRange{syIndex, c0}, sxVec, + c1I1Vec, passThruRes)); + Value v1Vec = xBuilder.create( + xLoc, vectorTyI32, + xBuilder.create( + xLoc, vectorResTy, input, ValueRange{syIndex, c0}, + sxPlus1Vec, c1I1Vec, passThruRes)); + + SmallVector maskVec1, maskVec2; + for (int i = 0; i < stride; i++) { + maskVec1.push_back(i * 2); + maskVec2.push_back(i * 2 + 1); + } + Value a0ShuffleVec = xBuilder.create( + xLoc, a0Vec, a1Vec, maskVec1); + Value a1ShuffleVec = xBuilder.create( + xLoc, a0Vec, a1Vec, maskVec2); + Value aMulv0 = + xBuilder.create(xLoc, a0ShuffleVec, v0Vec); + Value aMulv1 = + xBuilder.create(xLoc, a1ShuffleVec, v1Vec); + Value addRes = + xBuilder.create(xLoc, aMulv0, aMulv1); + addRes = builder.create(loc, addRes, halfVec); + Value resShifted = + xBuilder.create(xLoc, addRes, shiftVec); + xBuilder.create( + xLoc, buffer, ValueRange{xiv}, maskVec, resShifted); + xBuilder.create(xLoc); + }); + tBuilder.create(tLoc); + }); syPrev = builder.create(loc, notEqual, sy, syPrev); } -void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc, - MLIRContext *ctx, Value input, - Value output, int64_t stride, - Value horizontalScalingFactor, - Value verticalScalingFactor) { - static const int SHIFT = 11; - static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT; - static const int HALF = 1 << (SHIFT - 1); - auto inElemTy = input.getType().cast().getElementType(); - VectorType vectorTyIndex = VectorType::get({stride}, builder.getIndexType()); - VectorType vectorTyI1 = VectorType::get({stride}, builder.getI1Type()); - VectorType vectorTyI32 = - VectorType::get({stride}, IntegerType::get(builder.getContext(), 32)); - VectorType vectorTyF = VectorType::get( - {stride}, inElemTy.isF32() ? FloatType::getF32(builder.getContext()) - : FloatType::getF64(builder.getContext())); - auto passThruConstantOpI32 = +void BilinearInterpolationResizingNew( + OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, + Value output, int64_t stride, Value horizontalScalingFactor, + Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec, + VectorType vectorResTy, VectorType vectorTyI32, VectorType vectorTyI16, + VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1) { + + auto passThruI32 = builder.create(loc, builder.getZeroAttr(vectorTyI32)); Value cMinus1 = builder.create(loc, -1); Value c0 = builder.create(loc, 0); @@ -1302,9 +1254,9 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc, Value c1I32 = builder.create(loc, 1, builder.getI32Type()); Value c2 = builder.create(loc, 2); + Value inputRow = builder.create(loc, input, c0); Value inputCol = builder.create(loc, input, c1); - Value outputRow = builder.create(loc, output, c0); Value outputCol = builder.create(loc, output, c1); @@ -1334,7 +1286,8 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc, builder.create(loc, dynamicTypeI16, outputColMultiple2); processScaling(builder, loc, outputCol, horizontalScalingFactor, inputCol, - xOffset, iAlpha, stride); + xOffset, iAlpha, stride, scaleVec, vectorTyI32, vectorTyF32, + vectorTyI1, vectorTyI16); Value yOffset = builder.create(loc, dynamicTypeI32, outputRow); @@ -1342,24 +1295,11 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc, builder.create(loc, dynamicTypeI16, outputRowMultiple2); processScaling(builder, loc, outputRow, verticalScalingFactor, inputRow, - yOffset, iBeta, stride); + yOffset, iBeta, stride, scaleVec, vectorTyI32, vectorTyF32, + vectorTyI1, vectorTyI16); Value bufferWidth = outputCol; - Value buffer0 = - builder.create(loc, dynamicTypeI32, bufferWidth); - Value buffer1 = - builder.create(loc, dynamicTypeI32, bufferWidth); - Value prevSy0 = - builder.create(loc, builder.getI32Type(), cMinus1); - Value prevSy1 = - builder.create(loc, builder.getI32Type(), cMinus1); - - Value halfVec = builder.create( - loc, vectorTyI32, - builder.create(loc, HALF, builder.getI32Type())); - Value shiftVec = builder.create( - loc, vectorTyI32, - builder.create(loc, SHIFT, builder.getI32Type())); + Value zeroVec = builder.create( loc, vectorTyI32, builder.create(loc, 0, builder.getI32Type())); @@ -1367,74 +1307,92 @@ void BilinearInterpolationResizingNew(OpBuilder &builder, Location loc, loc, vectorTyI32, builder.create(loc, 255, builder.getI32Type())); - // builder.create( - // loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{c1}, - // [&](OpBuilder &yBuilder, Location yLoc, ValueRange ivs) { - // Value yiv = ivs[0]; - // } - // ); - builder.create( - loc, c0, outputRow, c1, std::nullopt, - [&](OpBuilder &yBuilder, Location yLoc, Value yiv, ValueRange) { - Value sy = - yBuilder.create(yLoc, yOffset, ValueRange{yiv}); - Value syNext = yBuilder.create(yLoc, sy, c1I32); - calcInterpolation(yBuilder, yLoc, sy, prevSy0, xOffset, input, - outputCol, iAlpha, buffer0, stride); - calcInterpolation(yBuilder, yLoc, syNext, prevSy1, xOffset, input, - outputCol, iAlpha, buffer1, stride); - - // calc index - Value yMul2 = yBuilder.create(yLoc, yiv, c2); - yMul2 = yBuilder.create(yLoc, yBuilder.getI32Type(), - yMul2); - Value yMul2Plus1 = yBuilder.create(yLoc, yMul2, c1I32); - Value index0 = yBuilder.create( - yLoc, yBuilder.getIndexType(), yMul2); - Value index1 = yBuilder.create( - yLoc, yBuilder.getIndexType(), yMul2Plus1); - Value b0 = yBuilder.create(yLoc, iBeta, index0); - b0 = yBuilder.create(yLoc, yBuilder.getI32Type(), b0); - Value b0Vec = yBuilder.create(yLoc, vectorTyI32, b0); - Value b1 = yBuilder.create(yLoc, iBeta, index1); - b1 = yBuilder.create(yLoc, yBuilder.getI32Type(), b1); - Value b1Vec = yBuilder.create(yLoc, vectorTyI32, b1); + auto resizeLoop = [&](OpBuilder &yBuilder, Location yLoc, Value yiv, + ValueRange) { + Value buffer0 = + yBuilder.create(loc, dynamicTypeI32, bufferWidth); + Value buffer1 = + yBuilder.create(loc, dynamicTypeI32, bufferWidth); + Value prevSy0 = yBuilder.create( + loc, yBuilder.getI32Type(), cMinus1); + Value prevSy1 = yBuilder.create( + loc, yBuilder.getI32Type(), cMinus1); + + Value sy = yBuilder.create(yLoc, yOffset, ValueRange{yiv}); + Value syNext = yBuilder.create(yLoc, sy, c1I32); + calcInterpolation(yBuilder, yLoc, sy, prevSy0, xOffset, input, outputCol, + iAlpha, buffer0, stride, halfVec, shiftVec, vectorResTy, + vectorTyIndex, vectorTyI32, vectorTyI16, vectorTyI1); + calcInterpolation(yBuilder, yLoc, syNext, prevSy1, xOffset, input, + outputCol, iAlpha, buffer1, stride, halfVec, shiftVec, + vectorResTy, vectorTyIndex, vectorTyI32, vectorTyI16, + vectorTyI1); + + Value index0 = yBuilder.create(yLoc, yiv, c2); + Value index1 = yBuilder.create(yLoc, index0, c1); + Value b0 = yBuilder.create( + yLoc, yBuilder.getI32Type(), + yBuilder.create(yLoc, iBeta, index0)); + Value b0Vec = yBuilder.create(yLoc, vectorTyI32, b0); + Value b1 = yBuilder.create( + yLoc, yBuilder.getI32Type(), + yBuilder.create(yLoc, iBeta, index1)); + Value b1Vec = yBuilder.create(yLoc, vectorTyI32, b1); + + yBuilder.create( + yLoc, c0, bufferWidth, strideVal, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + Value maskVal = + xBuilder.create(xLoc, bufferWidth, xiv); + Value maskVec = + xBuilder.create(xLoc, vectorTyI1, maskVal); + Value buffer0X = xBuilder.create( + xLoc, vectorTyI32, buffer0, ValueRange{xiv}, maskVec, + passThruI32); + Value buffer1X = xBuilder.create( + xLoc, vectorTyI32, buffer1, ValueRange{xiv}, maskVec, + passThruI32); + Value b0MulBuffer0 = + xBuilder.create(xLoc, b0Vec, buffer0X); + Value b1MulBuffer1 = + xBuilder.create(xLoc, b1Vec, buffer1X); + Value bufferRes = + xBuilder.create(xLoc, b0MulBuffer0, b1MulBuffer1); + Value addHalf = + xBuilder.create(xLoc, bufferRes, halfVec); + Value resShifted = + xBuilder.create(xLoc, addHalf, shiftVec); + Value maxVal = + xBuilder.create(xLoc, resShifted, zeroVec); + Value clampedVal = + xBuilder.create(xLoc, maxVal, twoFiftyFiveVec); + Value clampedValF = + xBuilder.create(xLoc, vectorResTy, clampedVal); + xBuilder.create( + xLoc, output, ValueRange{yiv, xiv}, maskVec, clampedValF); + xBuilder.create(xLoc); + }); + yBuilder.create(loc, buffer0); + yBuilder.create(loc, buffer1); + yBuilder.create(yLoc); + }; - yBuilder.create( - yLoc, c0, bufferWidth, strideVal, std::nullopt, - [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { - Value maskVal = - xBuilder.create(xLoc, bufferWidth, xiv); - Value maskVec = xBuilder.create( - xLoc, vectorTyI1, maskVal); - Value buffer0X = xBuilder.create( - xLoc, vectorTyI32, buffer0, ValueRange{xiv}, maskVec, - passThruConstantOpI32); - Value buffer1X = xBuilder.create( - xLoc, vectorTyI32, buffer1, ValueRange{xiv}, maskVec, - passThruConstantOpI32); - Value b0MulBuffer0 = - xBuilder.create(xLoc, b0Vec, buffer0X); - Value b1MulBuffer1 = - xBuilder.create(xLoc, b1Vec, buffer1X); - Value add = xBuilder.create(xLoc, b0MulBuffer0, - b1MulBuffer1); - Value addHalf = - xBuilder.create(xLoc, add, halfVec); - Value resShifted = - xBuilder.create(xLoc, addHalf, shiftVec); - Value maxVal = - xBuilder.create(xLoc, resShifted, zeroVec); - Value clampedVal = xBuilder.create( - xLoc, maxVal, twoFiftyFiveVec); - Value clampedValF = - xBuilder.create(xLoc, vectorTyF, clampedVal); - xBuilder.create( - xLoc, output, ValueRange{yiv, xiv}, maskVec, clampedValF); - xBuilder.create(xLoc); - }); - yBuilder.create(yLoc); + Value batch = builder.create(loc, stride); + builder.create( + loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{batch}, + [&](OpBuilder &tBuilder, Location tLoc, ValueRange ivs) { + Value tStart = ivs[0]; + Value tEnd = tBuilder.create(tLoc, tStart, batch); + tEnd = tBuilder.create(tLoc, tEnd, outputRow); + tBuilder.create(tLoc, tStart, tEnd, c1, std::nullopt, + resizeLoop); + tBuilder.create(tLoc); }); + + builder.create(loc, xOffset); + builder.create(loc, iAlpha); + builder.create(loc, yOffset); + builder.create(loc, iBeta); } // Helper function for resizing an image using nearest neighbour interpolation From 2e3625748dbad99bc897727cb4a0d9733d9ce257 Mon Sep 17 00:00:00 2001 From: HarryZ Date: Fri, 7 Mar 2025 20:06:00 +0800 Subject: [PATCH 4/7] [feat] finish resize2d acc op --- examples/DIPDialect/CMakeLists.txt | 3 +- frontend/Interfaces/lib/CMakeLists.txt | 47 +++++++++++++++----------- midend/include/Utils/DIPUtils.h | 13 +++++++ 3 files changed, 42 insertions(+), 21 deletions(-) diff --git a/examples/DIPDialect/CMakeLists.txt b/examples/DIPDialect/CMakeLists.txt index 7b2f075c9c..259d4f4f8c 100644 --- a/examples/DIPDialect/CMakeLists.txt +++ b/examples/DIPDialect/CMakeLists.txt @@ -1,4 +1,5 @@ -set(DIP_LIBS ${JPEG_LIBRARY} ${PNG_LIBRARIES} BuddyLibDIP) +set(DIP_LIBS ${JPEG_LIBRARY} ${PNG_LIBRARIES} BuddyLibDIP omp) +link_directories(${LLVM_LIBS}) if(BUDDY_ENABLE_OPENCV) find_package(OpenCV REQUIRED CONFIG) diff --git a/frontend/Interfaces/lib/CMakeLists.txt b/frontend/Interfaces/lib/CMakeLists.txt index c4172a8b23..1286289c22 100644 --- a/frontend/Interfaces/lib/CMakeLists.txt +++ b/frontend/Interfaces/lib/CMakeLists.txt @@ -17,16 +17,23 @@ endif () add_custom_command(OUTPUT DIP.o COMMAND ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DIP.mlir -lower-dip="DIP-strip-mining=${SPLITING_SIZE}" - -arith-expand + -affine-parallelize -lower-affine - -convert-scf-to-cf - -convert-math-to-llvm + -convert-scf-to-openmp + -convert-vector-to-scf -convert-vector-to-llvm + -memref-expand + -arith-expand + -convert-arith-to-llvm -finalize-memref-to-llvm + -convert-scf-to-cf + -convert-openmp-to-llvm + -convert-math-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | ${LLVM_TOOLS_BINARY_DIR}/mlir-translate --mlir-to-llvmir | ${LLVM_TOOLS_BINARY_DIR}/llc + -relocation-model=pic -mtriple=${BUDDY_TARGET_TRIPLE} -mattr=${BUDDY_OPT_ATTR} --filetype=obj @@ -47,8 +54,8 @@ SET_TARGET_PROPERTIES(BuddyLibDIP PROPERTIES add_custom_command( OUTPUT DAP.o - COMMAND - ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir + COMMAND + ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir -lower-dap="DAP-vector-splitting=${SPLITING_SIZE}" --convert-linalg-to-affine-loops -arith-expand @@ -59,9 +66,9 @@ add_custom_command( -finalize-memref-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm - -reconcile-unrealized-casts | + -reconcile-unrealized-casts | ${LLVM_TOOLS_BINARY_DIR}/mlir-translate --mlir-to-llvmir | - ${LLVM_TOOLS_BINARY_DIR}/llc + ${LLVM_TOOLS_BINARY_DIR}/llc -mtriple=${BUDDY_TARGET_TRIPLE} -mattr=${BUDDY_OPT_ATTR} --filetype=obj @@ -71,25 +78,25 @@ add_custom_command( add_custom_command( OUTPUT DAP-extend.o - COMMAND - ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP-extend.mlir + COMMAND + ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP-extend.mlir -extend-dap -one-shot-bufferize -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata -lower-affine - -convert-vector-to-llvm - -memref-expand + -convert-vector-to-llvm + -memref-expand -arith-expand -convert-arith-to-llvm - -finalize-memref-to-llvm + -finalize-memref-to-llvm -convert-math-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm - -reconcile-unrealized-casts | + -reconcile-unrealized-casts | ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_TOOLS_BINARY_DIR}/llc + ${LLVM_TOOLS_BINARY_DIR}/llc -mtriple=${BUDDY_TARGET_TRIPLE} -mattr=${BUDDY_OPT_ATTR} -filetype=obj -relocation-model=pic @@ -99,7 +106,7 @@ add_custom_command( add_custom_command( OUTPUT DAPVectorization.o - COMMAND + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir | sed -e 's/@buddy_fir/@buddy_fir_vectorization/' -e 's/@buddy_iir/@buddy_iir_vectorization/' @@ -115,9 +122,9 @@ add_custom_command( -finalize-memref-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm - -reconcile-unrealized-casts | + -reconcile-unrealized-casts | ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_TOOLS_BINARY_DIR}/llc + ${LLVM_TOOLS_BINARY_DIR}/llc -mtriple=${BUDDY_TARGET_TRIPLE} -mattr=${BUDDY_OPT_ATTR} -filetype=obj @@ -125,9 +132,9 @@ add_custom_command( DEPENDS mlir-translate llc buddy-opt ) -add_library(BuddyLibDAP STATIC - DAP.o - DAP-extend.o +add_library(BuddyLibDAP STATIC + DAP.o + DAP-extend.o DAPVectorization.o ) diff --git a/midend/include/Utils/DIPUtils.h b/midend/include/Utils/DIPUtils.h index c5bd3104d1..6b0bcdecf1 100644 --- a/midend/include/Utils/DIPUtils.h +++ b/midend/include/Utils/DIPUtils.h @@ -176,6 +176,19 @@ void fillPixelsBilinearInterpolate4D( Value inputRowLastElemF32, Value inputColLastElemF32, Value c0F32, Value c1F32, Value dataCondition); +void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, + MLIRContext *ctx, Value input, + Value output, + Value horizontalScalingFactor, + Value verticalScalingFactor); + +void BilinearInterpolationResizingNew( + OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, + Value output, int64_t stride, Value horizontalScalingFactor, + Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec, + VectorType vectorResTy, VectorType vectorTyI32, VectorType vectorTyI16, + VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1); + // Helper function for resizing an image using nearest neighbour interpolation // mechanism. void NearestNeighbourInterpolationResizing( From d64526c009415f54662cc7f775963d63719f913f Mon Sep 17 00:00:00 2001 From: HarryZ Date: Tue, 11 Mar 2025 18:56:28 +0800 Subject: [PATCH 5/7] [feat] add nearest resize vector, but slower --- midend/include/Utils/DIPUtils.h | 10 +-- .../lib/Conversion/LowerDIP/LowerDIPPass.cpp | 5 +- midend/lib/Utils/DIPUtils.cpp | 75 +++++++++++++------ 3 files changed, 61 insertions(+), 29 deletions(-) diff --git a/midend/include/Utils/DIPUtils.h b/midend/include/Utils/DIPUtils.h index 6b0bcdecf1..04679440e5 100644 --- a/midend/include/Utils/DIPUtils.h +++ b/midend/include/Utils/DIPUtils.h @@ -176,11 +176,11 @@ void fillPixelsBilinearInterpolate4D( Value inputRowLastElemF32, Value inputColLastElemF32, Value c0F32, Value c1F32, Value dataCondition); -void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, - MLIRContext *ctx, Value input, - Value output, - Value horizontalScalingFactor, - Value verticalScalingFactor); +void NearestNeighbourInterpolationResizingNew( + OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, + Value output, int64_t stride, Value horizontalScalingFactor, + Value verticalScalingFactor, VectorType vectorResTy, VectorType vectorTyI16, + VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1); void BilinearInterpolationResizingNew( OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, diff --git a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp index 3cd65126de..578563e08b 100644 --- a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp +++ b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp @@ -345,8 +345,9 @@ class DIPResize2DOpLowering : public OpRewritePattern { if (interpolationAttr == dip::InterpolationType::NearestNeighbourInterpolation) { dip::NearestNeighbourInterpolationResizingNew( - rewriter, loc, ctx, input, output, horizontalScalingFactor, - verticalScalingFactor); + rewriter, loc, ctx, input, output, stride, horizontalScalingFactor, + verticalScalingFactor, vectorResTy, vectorTyI16, vectorTyIndex, + vectorTyF32, vectorTyI1); } else if (interpolationAttr == dip::InterpolationType::BilinearInterpolation) { dip::BilinearInterpolationResizingNew( diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp index 256624055f..3ddefccbd0 100644 --- a/midend/lib/Utils/DIPUtils.cpp +++ b/midend/lib/Utils/DIPUtils.cpp @@ -987,18 +987,30 @@ void fillPixelsBilinearInterpolate4D( }); } -void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, - MLIRContext *ctx, Value input, - Value output, - Value horizontalScalingFactor, - Value verticalScalingFactor) { +void NearestNeighbourInterpolationResizingNew( + OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, + Value output, int64_t stride, Value horizontalScalingFactor, + Value verticalScalingFactor, VectorType vectorResTy, VectorType vectorTyI16, + VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1) { Value c0 = builder.create(loc, 0); Value c1 = builder.create(loc, 1); + Value strideVal = builder.create(loc, stride); + Value stepVec = iotaVec0F32(builder, loc, stride); + Value horizontalVec = builder.create( + loc, vectorTyF32, horizontalScalingFactor); + Value verticalVec = + builder.create(loc, vectorTyF32, verticalScalingFactor); + auto passThruRes = + builder.create(loc, builder.getZeroAttr(vectorResTy)); + auto passThruI16 = + builder.create(loc, builder.getZeroAttr(vectorTyI16)); Value inputRow = builder.create(loc, input, c0); Value inputRowMinus1 = builder.create( loc, builder.getI16Type(), builder.create(loc, inputRow, c1)); + Value inputRowMinus1Vec = + builder.create(loc, vectorTyI16, inputRowMinus1); Value inputCol = builder.create(loc, input, c1); Value inputColMinus1 = builder.create( loc, builder.getI16Type(), @@ -1006,22 +1018,33 @@ void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, Value outputRow = builder.create(loc, output, c0); Value outputCol = builder.create(loc, output, c1); + Value outputColStrideRatio = + builder.create(loc, outputCol, strideVal); + Value outputColMultiple = builder.create( + loc, builder.create(loc, outputColStrideRatio, c1), + strideVal); MemRefType dynamicTypeI16 = MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16)); Value srcXPosVec = - builder.create(loc, dynamicTypeI16, outputCol); + builder.create(loc, dynamicTypeI16, outputColMultiple); builder.create( - loc, c0, outputCol, c1, std::nullopt, + loc, c0, outputColMultiple, strideVal, std::nullopt, [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + // Value maskVal = xBuilder.create(xLoc, outputCol, xiv); + Value xivFVec = xBuilder.create( + xLoc, vectorTyF32, indexToF32(xBuilder, xLoc, xiv)); + xivFVec = xBuilder.create(xLoc, xivFVec, stepVec); Value srcXPos = xBuilder.create( - xLoc, xBuilder.getI16Type(), - xBuilder.create(xLoc, - indexToF32(xBuilder, xLoc, xiv), - horizontalScalingFactor)); + xLoc, vectorTyI16, + xBuilder.create(xLoc, xivFVec, horizontalVec)); srcXPos = - xBuilder.create(xLoc, srcXPos, inputRowMinus1); - xBuilder.create(xLoc, srcXPos, srcXPosVec, + xBuilder.create(xLoc, srcXPos, inputRowMinus1Vec); + // xBuilder.create( + // xLoc, srcXPosVec, ValueRange{xiv}, + // xBuilder.create(xLoc, vectorTyI1, + // maskVal), srcXPos); + xBuilder.create(xLoc, srcXPos, srcXPosVec, ValueRange{xiv}); xBuilder.create(xLoc); }); @@ -1038,16 +1061,24 @@ void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, srcYPos = yBuilder.create( yLoc, yBuilder.getIndexType(), srcYPos); yBuilder.create( - loc, c0, outputCol, c1, std::nullopt, + loc, c0, outputCol, strideVal, std::nullopt, [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { - Value srcXPos = xBuilder.create(xLoc, srcXPosVec, - ValueRange{xiv}); - srcXPos = xBuilder.create( - xLoc, xBuilder.getIndexType(), srcXPos); - Value srcPixel = xBuilder.create( - xLoc, input, ValueRange{srcYPos, srcXPos}); - xBuilder.create(xLoc, srcPixel, output, - ValueRange{yiv, xiv}); + Value maskVal = + xBuilder.create(xLoc, outputCol, xiv); + Value maskVec = xBuilder.create( + xLoc, vectorTyI1, maskVal); + Value srcXPos = xBuilder.create( + xLoc, vectorTyI16, srcXPosVec, ValueRange{xiv}, maskVec, + passThruI16); + // Value srcXPos = xBuilder.create( + // xLoc, vectorTyI16, srcXPosVec, ValueRange{xiv}); + srcXPos = xBuilder.create(xLoc, vectorTyIndex, + srcXPos); + Value srcPixel = xBuilder.create( + xLoc, vectorResTy, input, ValueRange{srcYPos, c0}, srcXPos, + maskVec, passThruRes); + xBuilder.create( + xLoc, output, ValueRange{yiv, xiv}, maskVec, srcPixel); xBuilder.create(xLoc); }); yBuilder.create(yLoc); From 7fed3aa3376c7469e97f151f24b7ba83cbf1b87c Mon Sep 17 00:00:00 2001 From: HarryZ Date: Tue, 11 Mar 2025 19:26:10 +0800 Subject: [PATCH 6/7] [feat] parallel resize2d nearest op --- midend/include/Utils/DIPUtils.h | 10 +-- .../lib/Conversion/LowerDIP/LowerDIPPass.cpp | 4 +- midend/lib/Utils/DIPUtils.cpp | 82 ++++++------------- 3 files changed, 32 insertions(+), 64 deletions(-) diff --git a/midend/include/Utils/DIPUtils.h b/midend/include/Utils/DIPUtils.h index 04679440e5..e5c3cfdf30 100644 --- a/midend/include/Utils/DIPUtils.h +++ b/midend/include/Utils/DIPUtils.h @@ -176,11 +176,11 @@ void fillPixelsBilinearInterpolate4D( Value inputRowLastElemF32, Value inputColLastElemF32, Value c0F32, Value c1F32, Value dataCondition); -void NearestNeighbourInterpolationResizingNew( - OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, - Value output, int64_t stride, Value horizontalScalingFactor, - Value verticalScalingFactor, VectorType vectorResTy, VectorType vectorTyI16, - VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1); +void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, + MLIRContext *ctx, Value input, + Value output, int64_t stride, + Value horizontalScalingFactor, + Value verticalScalingFactor); void BilinearInterpolationResizingNew( OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, diff --git a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp index 578563e08b..a250bd8068 100644 --- a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp +++ b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp @@ -300,7 +300,6 @@ class DIPResize2DOpLowering : public OpRewritePattern { Value verticalScalingFactor = op->getOperand(2); Value output = op->getOperand(3); auto interpolationAttr = op.getInterpolationType(); - Value strideVal = rewriter.create(loc, stride); auto inElemTy = input.getType().cast().getElementType(); dip::DIP_ERROR error = @@ -346,8 +345,7 @@ class DIPResize2DOpLowering : public OpRewritePattern { dip::InterpolationType::NearestNeighbourInterpolation) { dip::NearestNeighbourInterpolationResizingNew( rewriter, loc, ctx, input, output, stride, horizontalScalingFactor, - verticalScalingFactor, vectorResTy, vectorTyI16, vectorTyIndex, - vectorTyF32, vectorTyI1); + verticalScalingFactor); } else if (interpolationAttr == dip::InterpolationType::BilinearInterpolation) { dip::BilinearInterpolationResizingNew( diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp index 3ddefccbd0..5d4282fe83 100644 --- a/midend/lib/Utils/DIPUtils.cpp +++ b/midend/lib/Utils/DIPUtils.cpp @@ -987,30 +987,18 @@ void fillPixelsBilinearInterpolate4D( }); } -void NearestNeighbourInterpolationResizingNew( - OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, - Value output, int64_t stride, Value horizontalScalingFactor, - Value verticalScalingFactor, VectorType vectorResTy, VectorType vectorTyI16, - VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1) { +void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, + MLIRContext *ctx, Value input, + Value output, int64_t stride, + Value horizontalScalingFactor, + Value verticalScalingFactor) { Value c0 = builder.create(loc, 0); Value c1 = builder.create(loc, 1); - Value strideVal = builder.create(loc, stride); - Value stepVec = iotaVec0F32(builder, loc, stride); - Value horizontalVec = builder.create( - loc, vectorTyF32, horizontalScalingFactor); - Value verticalVec = - builder.create(loc, vectorTyF32, verticalScalingFactor); - auto passThruRes = - builder.create(loc, builder.getZeroAttr(vectorResTy)); - auto passThruI16 = - builder.create(loc, builder.getZeroAttr(vectorTyI16)); Value inputRow = builder.create(loc, input, c0); Value inputRowMinus1 = builder.create( loc, builder.getI16Type(), builder.create(loc, inputRow, c1)); - Value inputRowMinus1Vec = - builder.create(loc, vectorTyI16, inputRowMinus1); Value inputCol = builder.create(loc, input, c1); Value inputColMinus1 = builder.create( loc, builder.getI16Type(), @@ -1018,40 +1006,30 @@ void NearestNeighbourInterpolationResizingNew( Value outputRow = builder.create(loc, output, c0); Value outputCol = builder.create(loc, output, c1); - Value outputColStrideRatio = - builder.create(loc, outputCol, strideVal); - Value outputColMultiple = builder.create( - loc, builder.create(loc, outputColStrideRatio, c1), - strideVal); MemRefType dynamicTypeI16 = MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16)); Value srcXPosVec = - builder.create(loc, dynamicTypeI16, outputColMultiple); + builder.create(loc, dynamicTypeI16, outputCol); builder.create( - loc, c0, outputColMultiple, strideVal, std::nullopt, + loc, c0, outputCol, c1, std::nullopt, [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { - // Value maskVal = xBuilder.create(xLoc, outputCol, xiv); - Value xivFVec = xBuilder.create( - xLoc, vectorTyF32, indexToF32(xBuilder, xLoc, xiv)); - xivFVec = xBuilder.create(xLoc, xivFVec, stepVec); Value srcXPos = xBuilder.create( - xLoc, vectorTyI16, - xBuilder.create(xLoc, xivFVec, horizontalVec)); + xLoc, xBuilder.getI16Type(), + xBuilder.create(xLoc, + indexToF32(xBuilder, xLoc, xiv), + horizontalScalingFactor)); srcXPos = - xBuilder.create(xLoc, srcXPos, inputRowMinus1Vec); - // xBuilder.create( - // xLoc, srcXPosVec, ValueRange{xiv}, - // xBuilder.create(xLoc, vectorTyI1, - // maskVal), srcXPos); - xBuilder.create(xLoc, srcXPos, srcXPosVec, + xBuilder.create(xLoc, srcXPos, inputRowMinus1); + xBuilder.create(xLoc, srcXPos, srcXPosVec, ValueRange{xiv}); xBuilder.create(xLoc); }); - builder.create( - loc, c0, outputRow, c1, std::nullopt, - [&](OpBuilder &yBuilder, Location yLoc, Value yiv, ValueRange) { + builder.create( + loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{c1}, + [&](OpBuilder &yBuilder, Location yLoc, ValueRange ivs) { + Value yiv = ivs[0]; Value srcYPos = yBuilder.create( yLoc, yBuilder.getI16Type(), yBuilder.create( @@ -1061,24 +1039,16 @@ void NearestNeighbourInterpolationResizingNew( srcYPos = yBuilder.create( yLoc, yBuilder.getIndexType(), srcYPos); yBuilder.create( - loc, c0, outputCol, strideVal, std::nullopt, + loc, c0, outputCol, c1, std::nullopt, [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { - Value maskVal = - xBuilder.create(xLoc, outputCol, xiv); - Value maskVec = xBuilder.create( - xLoc, vectorTyI1, maskVal); - Value srcXPos = xBuilder.create( - xLoc, vectorTyI16, srcXPosVec, ValueRange{xiv}, maskVec, - passThruI16); - // Value srcXPos = xBuilder.create( - // xLoc, vectorTyI16, srcXPosVec, ValueRange{xiv}); - srcXPos = xBuilder.create(xLoc, vectorTyIndex, - srcXPos); - Value srcPixel = xBuilder.create( - xLoc, vectorResTy, input, ValueRange{srcYPos, c0}, srcXPos, - maskVec, passThruRes); - xBuilder.create( - xLoc, output, ValueRange{yiv, xiv}, maskVec, srcPixel); + Value srcXPos = xBuilder.create(xLoc, srcXPosVec, + ValueRange{xiv}); + srcXPos = xBuilder.create( + xLoc, xBuilder.getIndexType(), srcXPos); + Value srcPixel = xBuilder.create( + xLoc, input, ValueRange{srcYPos, srcXPos}); + xBuilder.create(xLoc, srcPixel, output, + ValueRange{yiv, xiv}); xBuilder.create(xLoc); }); yBuilder.create(yLoc); From 791c62dde39d6878da970e415aef36e448fda5dd Mon Sep 17 00:00:00 2001 From: HarryZ Date: Tue, 11 Mar 2025 19:31:49 +0800 Subject: [PATCH 7/7] [fix] cleanup old resize2d code --- midend/include/Utils/DIPUtils.h | 38 +++----- .../lib/Conversion/LowerDIP/LowerDIPPass.cpp | 4 +- midend/lib/Utils/DIPUtils.cpp | 88 ++----------------- 3 files changed, 21 insertions(+), 109 deletions(-) diff --git a/midend/include/Utils/DIPUtils.h b/midend/include/Utils/DIPUtils.h index e5c3cfdf30..edb96721b8 100644 --- a/midend/include/Utils/DIPUtils.h +++ b/midend/include/Utils/DIPUtils.h @@ -176,29 +176,13 @@ void fillPixelsBilinearInterpolate4D( Value inputRowLastElemF32, Value inputColLastElemF32, Value c0F32, Value c1F32, Value dataCondition); -void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, - MLIRContext *ctx, Value input, - Value output, int64_t stride, - Value horizontalScalingFactor, - Value verticalScalingFactor); - -void BilinearInterpolationResizingNew( - OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, - Value output, int64_t stride, Value horizontalScalingFactor, - Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec, - VectorType vectorResTy, VectorType vectorTyI32, VectorType vectorTyI16, - VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1); - // Helper function for resizing an image using nearest neighbour interpolation // mechanism. -void NearestNeighbourInterpolationResizing( - OpBuilder &builder, Location loc, MLIRContext *ctx, - SmallVector lowerBounds, SmallVector upperBounds, - SmallVector steps, Value strideVal, Value input, Value output, - Value horizontalScalingFactorVec, Value verticalScalingFactorVec, - Value outputRowLastElemF32, Value outputColLastElemF32, - Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32, - int64_t stride, Value c0, Value c0F32); +void NearestNeighbourInterpolationResizing(OpBuilder &builder, Location loc, + MLIRContext *ctx, Value input, + Value output, int64_t stride, + Value horizontalScalingFactor, + Value verticalScalingFactor); // Helper function for resizing 4D an image using nearest neighbour // interpolation mechanism. @@ -213,13 +197,11 @@ void NearestNeighbourInterpolationResizing4D( // Helper function for resizing an image using bilinear interpolation mechanism. void BilinearInterpolationResizing( - OpBuilder &builder, Location loc, MLIRContext *ctx, - SmallVector lowerBounds, SmallVector upperBounds, - SmallVector steps, Value strideVal, Value input, Value output, - Value horizontalScalingFactorVec, Value verticalScalingFactorVec, - Value outputRowLastElemF32, Value outputColLastElemF32, - Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32, - int64_t stride, Value c0, Value c0F32, Value c1F32); + OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, + Value output, int64_t stride, Value horizontalScalingFactor, + Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec, + VectorType vectorResTy, VectorType vectorTyI32, VectorType vectorTyI16, + VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1); // Helper function for resizing 4D an image using bilinear interpolation // mechanism. diff --git a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp index a250bd8068..f2fa6e2a71 100644 --- a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp +++ b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp @@ -343,12 +343,12 @@ class DIPResize2DOpLowering : public OpRewritePattern { if (interpolationAttr == dip::InterpolationType::NearestNeighbourInterpolation) { - dip::NearestNeighbourInterpolationResizingNew( + dip::NearestNeighbourInterpolationResizing( rewriter, loc, ctx, input, output, stride, horizontalScalingFactor, verticalScalingFactor); } else if (interpolationAttr == dip::InterpolationType::BilinearInterpolation) { - dip::BilinearInterpolationResizingNew( + dip::BilinearInterpolationResizing( rewriter, loc, ctx, input, output, stride, horizontalScalingFactor, verticalScalingFactor, halfVec, shiftVec, scaleVec, vectorResTy, vectorTyI32, vectorTyI16, vectorTyIndex, vectorTyF32, vectorTyI1); diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp index 5d4282fe83..d89a2ddd92 100644 --- a/midend/lib/Utils/DIPUtils.cpp +++ b/midend/lib/Utils/DIPUtils.cpp @@ -987,11 +987,13 @@ void fillPixelsBilinearInterpolate4D( }); } -void NearestNeighbourInterpolationResizingNew(OpBuilder &builder, Location loc, - MLIRContext *ctx, Value input, - Value output, int64_t stride, - Value horizontalScalingFactor, - Value verticalScalingFactor) { +// Helper function for resizing an image using nearest neighbour interpolation +// mechanism. +void NearestNeighbourInterpolationResizing(OpBuilder &builder, Location loc, + MLIRContext *ctx, Value input, + Value output, int64_t stride, + Value horizontalScalingFactor, + Value verticalScalingFactor) { Value c0 = builder.create(loc, 0); Value c1 = builder.create(loc, 1); @@ -1240,7 +1242,8 @@ void calcInterpolation(OpBuilder &builder, Location loc, Value &sy, syPrev = builder.create(loc, notEqual, sy, syPrev); } -void BilinearInterpolationResizingNew( +// Helper function for resizing an image using bilinear interpolation mechanism. +void BilinearInterpolationResizing( OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, Value output, int64_t stride, Value horizontalScalingFactor, Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec, @@ -1396,38 +1399,6 @@ void BilinearInterpolationResizingNew( builder.create(loc, iBeta); } -// Helper function for resizing an image using nearest neighbour interpolation -// mechanism. -void NearestNeighbourInterpolationResizing( - OpBuilder &builder, Location loc, MLIRContext *ctx, - SmallVector lowerBounds, SmallVector upperBounds, - SmallVector steps, Value strideVal, Value input, Value output, - Value horizontalScalingFactorVec, Value verticalScalingFactorVec, - Value outputRowLastElemF32, Value outputColLastElemF32, - Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32, - int64_t stride, Value c0, Value c0F32) { - affine::buildAffineLoopNest( - builder, loc, lowerBounds, upperBounds, steps, - [&](OpBuilder &builder, Location loc, ValueRange ivs) { - Value ivs0F32 = indexToF32(builder, loc, ivs[0]); - Value yVec = builder.create(loc, vectorTy32, ivs0F32); - Value xVec = iotaVec(builder, loc, ctx, ivs[1], strideVal, vectorTy32, - c0, stride); - - Value resXVecInterm = builder.create( - loc, xVec, horizontalScalingFactorVec); - Value resYVecInterm = - builder.create(loc, yVec, verticalScalingFactorVec); - - Value resXVec = roundOff(builder, loc, resXVecInterm); - Value resYVec = roundOff(builder, loc, resYVecInterm); - - fillPixels(builder, loc, xVec, yVec, resXVec, resYVec, input, output, - c0, strideVal, outputRowLastElemF32, outputColLastElemF32, - inputRowLastElemF32, inputColLastElemF32, c0F32); - }); -} - // Helper function for resizing 4D an image using nearest neighbour // interpolation mechanism. void NearestNeighbourInterpolationResizing4D( @@ -1461,47 +1432,6 @@ void NearestNeighbourInterpolationResizing4D( }); } -// Helper function for resizing an image using bilinear interpolation mechanism. -void BilinearInterpolationResizing( - OpBuilder &builder, Location loc, MLIRContext *ctx, - SmallVector lowerBounds, SmallVector upperBounds, - SmallVector steps, Value strideVal, Value input, Value output, - Value horizontalScalingFactorVec, Value verticalScalingFactorVec, - Value outputRowLastElemF32, Value outputColLastElemF32, - Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32, - int64_t stride, Value c0, Value c0F32, Value c1F32) { - affine::buildAffineLoopNest( - builder, loc, lowerBounds, upperBounds, steps, - [&](OpBuilder &builder, Location loc, ValueRange ivs) { - Value ivs0F32 = indexToF32(builder, loc, ivs[0]); - Value yVec = builder.create(loc, vectorTy32, ivs0F32); - Value xVec = iotaVec(builder, loc, ctx, ivs[1], strideVal, vectorTy32, - c0, stride); - - Value xVecInterm = builder.create( - loc, xVec, horizontalScalingFactorVec); - Value yVecInterm = - builder.create(loc, yVec, verticalScalingFactorVec); - - Value xVecInterm_L = builder.create(loc, xVecInterm); - Value xVecInterm_H = builder.create(loc, xVecInterm); - - Value yVecInterm_L = builder.create(loc, yVecInterm); - Value yVecInterm_H = builder.create(loc, yVecInterm); - - Value xVecWeight = - builder.create(loc, xVecInterm, xVecInterm_L); - Value yVecWeight = - builder.create(loc, yVecInterm, yVecInterm_L); - - fillPixelsBilinearInterpolate( - builder, loc, xVec, yVec, xVecInterm_L, yVecInterm_L, xVecInterm_H, - yVecInterm_H, input, output, c0, strideVal, xVecWeight, yVecWeight, - outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32, - inputColLastElemF32, c0F32, c1F32); - }); -} - // Helper function for resizing 4D an image using bilinear interpolation // mechanism. void BilinearInterpolationResizing4D(