diff --git a/examples/DIPDialect/CMakeLists.txt b/examples/DIPDialect/CMakeLists.txt index 7b2f075c9c..259d4f4f8c 100644 --- a/examples/DIPDialect/CMakeLists.txt +++ b/examples/DIPDialect/CMakeLists.txt @@ -1,4 +1,5 @@ -set(DIP_LIBS ${JPEG_LIBRARY} ${PNG_LIBRARIES} BuddyLibDIP) +set(DIP_LIBS ${JPEG_LIBRARY} ${PNG_LIBRARIES} BuddyLibDIP omp) +link_directories(${LLVM_LIBS}) if(BUDDY_ENABLE_OPENCV) find_package(OpenCV REQUIRED CONFIG) diff --git a/frontend/Interfaces/lib/CMakeLists.txt b/frontend/Interfaces/lib/CMakeLists.txt index c4172a8b23..1286289c22 100644 --- a/frontend/Interfaces/lib/CMakeLists.txt +++ b/frontend/Interfaces/lib/CMakeLists.txt @@ -17,16 +17,23 @@ endif () add_custom_command(OUTPUT DIP.o COMMAND ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DIP.mlir -lower-dip="DIP-strip-mining=${SPLITING_SIZE}" - -arith-expand + -affine-parallelize -lower-affine - -convert-scf-to-cf - -convert-math-to-llvm + -convert-scf-to-openmp + -convert-vector-to-scf -convert-vector-to-llvm + -memref-expand + -arith-expand + -convert-arith-to-llvm -finalize-memref-to-llvm + -convert-scf-to-cf + -convert-openmp-to-llvm + -convert-math-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | ${LLVM_TOOLS_BINARY_DIR}/mlir-translate --mlir-to-llvmir | ${LLVM_TOOLS_BINARY_DIR}/llc + -relocation-model=pic -mtriple=${BUDDY_TARGET_TRIPLE} -mattr=${BUDDY_OPT_ATTR} --filetype=obj @@ -47,8 +54,8 @@ SET_TARGET_PROPERTIES(BuddyLibDIP PROPERTIES add_custom_command( OUTPUT DAP.o - COMMAND - ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir + COMMAND + ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir -lower-dap="DAP-vector-splitting=${SPLITING_SIZE}" --convert-linalg-to-affine-loops -arith-expand @@ -59,9 +66,9 @@ add_custom_command( -finalize-memref-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm - -reconcile-unrealized-casts | + -reconcile-unrealized-casts | ${LLVM_TOOLS_BINARY_DIR}/mlir-translate --mlir-to-llvmir | - ${LLVM_TOOLS_BINARY_DIR}/llc + ${LLVM_TOOLS_BINARY_DIR}/llc -mtriple=${BUDDY_TARGET_TRIPLE} -mattr=${BUDDY_OPT_ATTR} --filetype=obj @@ -71,25 +78,25 @@ add_custom_command( add_custom_command( OUTPUT DAP-extend.o - COMMAND - ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP-extend.mlir + COMMAND + ${CMAKE_BINARY_DIR}/bin/buddy-opt ${CMAKE_CURRENT_SOURCE_DIR}/DAP-extend.mlir -extend-dap -one-shot-bufferize -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata -lower-affine - -convert-vector-to-llvm - -memref-expand + -convert-vector-to-llvm + -memref-expand -arith-expand -convert-arith-to-llvm - -finalize-memref-to-llvm + -finalize-memref-to-llvm -convert-math-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm - -reconcile-unrealized-casts | + -reconcile-unrealized-casts | ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_TOOLS_BINARY_DIR}/llc + ${LLVM_TOOLS_BINARY_DIR}/llc -mtriple=${BUDDY_TARGET_TRIPLE} -mattr=${BUDDY_OPT_ATTR} -filetype=obj -relocation-model=pic @@ -99,7 +106,7 @@ add_custom_command( add_custom_command( OUTPUT DAPVectorization.o - COMMAND + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/DAP.mlir | sed -e 's/@buddy_fir/@buddy_fir_vectorization/' -e 's/@buddy_iir/@buddy_iir_vectorization/' @@ -115,9 +122,9 @@ add_custom_command( -finalize-memref-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm - -reconcile-unrealized-casts | + -reconcile-unrealized-casts | ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir | - ${LLVM_TOOLS_BINARY_DIR}/llc + ${LLVM_TOOLS_BINARY_DIR}/llc -mtriple=${BUDDY_TARGET_TRIPLE} -mattr=${BUDDY_OPT_ATTR} -filetype=obj @@ -125,9 +132,9 @@ add_custom_command( DEPENDS mlir-translate llc buddy-opt ) -add_library(BuddyLibDAP STATIC - DAP.o - DAP-extend.o +add_library(BuddyLibDAP STATIC + DAP.o + DAP-extend.o DAPVectorization.o ) diff --git a/midend/include/Utils/DIPUtils.h b/midend/include/Utils/DIPUtils.h index c5bd3104d1..edb96721b8 100644 --- a/midend/include/Utils/DIPUtils.h +++ b/midend/include/Utils/DIPUtils.h @@ -178,14 +178,11 @@ void fillPixelsBilinearInterpolate4D( // Helper function for resizing an image using nearest neighbour interpolation // mechanism. -void NearestNeighbourInterpolationResizing( - OpBuilder &builder, Location loc, MLIRContext *ctx, - SmallVector lowerBounds, SmallVector upperBounds, - SmallVector steps, Value strideVal, Value input, Value output, - Value horizontalScalingFactorVec, Value verticalScalingFactorVec, - Value outputRowLastElemF32, Value outputColLastElemF32, - Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32, - int64_t stride, Value c0, Value c0F32); +void NearestNeighbourInterpolationResizing(OpBuilder &builder, Location loc, + MLIRContext *ctx, Value input, + Value output, int64_t stride, + Value horizontalScalingFactor, + Value verticalScalingFactor); // Helper function for resizing 4D an image using nearest neighbour // interpolation mechanism. @@ -200,13 +197,11 @@ void NearestNeighbourInterpolationResizing4D( // Helper function for resizing an image using bilinear interpolation mechanism. void BilinearInterpolationResizing( - OpBuilder &builder, Location loc, MLIRContext *ctx, - SmallVector lowerBounds, SmallVector upperBounds, - SmallVector steps, Value strideVal, Value input, Value output, - Value horizontalScalingFactorVec, Value verticalScalingFactorVec, - Value outputRowLastElemF32, Value outputColLastElemF32, - Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32, - int64_t stride, Value c0, Value c0F32, Value c1F32); + OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, + Value output, int64_t stride, Value horizontalScalingFactor, + Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec, + VectorType vectorResTy, VectorType vectorTyI32, VectorType vectorTyI16, + VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1); // Helper function for resizing 4D an image using bilinear interpolation // mechanism. diff --git a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp index ef7fc9b9b0..f2fa6e2a71 100644 --- a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp +++ b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp @@ -300,7 +300,6 @@ class DIPResize2DOpLowering : public OpRewritePattern { Value verticalScalingFactor = op->getOperand(2); Value output = op->getOperand(3); auto interpolationAttr = op.getInterpolationType(); - Value strideVal = rewriter.create(loc, stride); auto inElemTy = input.getType().cast().getElementType(); dip::DIP_ERROR error = @@ -314,85 +313,45 @@ class DIPResize2DOpLowering : public OpRewritePattern { << inElemTy << "is passed"; } - Value c0 = rewriter.create(loc, 0); - Value c1 = rewriter.create(loc, 1); - Value c0F32 = indexToF32(rewriter, loc, c0); - - Value inputRow = rewriter.create(loc, input, c0); - Value inputCol = rewriter.create(loc, input, c1); - - Value outputRow = rewriter.create(loc, output, c0); - Value outputCol = rewriter.create(loc, output, c1); - - // Determine lower bound for second call of resize function (this is done - // for efficient tail processing). - Value outputColStrideRatio = - rewriter.create(loc, outputCol, strideVal); - Value outputColMultiple = - rewriter.create(loc, strideVal, outputColStrideRatio); - - SmallVector lowerBounds1{c0, c0}; - SmallVector upperBounds1{outputRow, outputColMultiple}; - - SmallVector steps{1, stride}; - Value strideTailVal = - rewriter.create(loc, outputCol, outputColMultiple); - - SmallVector lowerBounds2{c0, outputColMultiple}; - SmallVector upperBounds2{outputRow, outputCol}; - - FloatType f32 = FloatType::getF32(ctx); - VectorType vectorTy32 = VectorType::get({stride}, f32); - - Value horizontalScalingFactorVec = rewriter.create( - loc, vectorTy32, horizontalScalingFactor); - Value verticalScalingFactorVec = rewriter.create( - loc, vectorTy32, verticalScalingFactor); - - // Obtain extreme allocatable value(s) in input and output for bounding - // purpose. - Value inputRowLastElem = rewriter.create(loc, inputRow, c1); - Value inputRowLastElemF32 = indexToF32(rewriter, loc, inputRowLastElem); - - Value inputColLastElem = rewriter.create(loc, inputCol, c1); - Value inputColLastElemF32 = indexToF32(rewriter, loc, inputColLastElem); - - Value outputRowLastElem = - rewriter.create(loc, outputRow, c1); - Value outputRowLastElemF32 = indexToF32(rewriter, loc, outputRowLastElem); - - Value outputColLastElem = - rewriter.create(loc, outputCol, c1); - Value outputColLastElemF32 = indexToF32(rewriter, loc, outputColLastElem); + VectorType vectorTyI1 = VectorType::get({stride}, rewriter.getI1Type()); + VectorType vectorTyI16 = + VectorType::get({stride}, IntegerType::get(rewriter.getContext(), 16)); + VectorType vectorTyI32 = + VectorType::get({stride}, IntegerType::get(rewriter.getContext(), 32)); + VectorType vectorTyF32 = + VectorType::get({stride}, FloatType::getF32(rewriter.getContext())); + VectorType vectorTyIndex = + VectorType::get({stride}, rewriter.getIndexType()); + VectorType vectorResTy = VectorType::get( + {stride}, inElemTy.isF32() ? FloatType::getF32(rewriter.getContext()) + : FloatType::getF64(rewriter.getContext())); + + static const int SHIFT = 11; + static const int HALF = 1 << (SHIFT - 1); + static const int INTER_RESIZE_COEF_SCALE = 1 << SHIFT; + Value half = + rewriter.create(loc, HALF, rewriter.getI32Type()); + Value halfVec = rewriter.create(loc, vectorTyI32, half); + Value shift = rewriter.create(loc, SHIFT, + rewriter.getI32Type()); + Value shiftVec = rewriter.create(loc, vectorTyI32, shift); + Value scaleVec = rewriter.create( + loc, vectorTyF32, + rewriter.create( + loc, (llvm::APFloat)(float)INTER_RESIZE_COEF_SCALE, + rewriter.getF32Type())); if (interpolationAttr == dip::InterpolationType::NearestNeighbourInterpolation) { dip::NearestNeighbourInterpolationResizing( - rewriter, loc, ctx, lowerBounds1, upperBounds1, steps, strideVal, - input, output, horizontalScalingFactorVec, verticalScalingFactorVec, - outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32, - inputColLastElemF32, vectorTy32, stride, c0, c0F32); - - dip::NearestNeighbourInterpolationResizing( - rewriter, loc, ctx, lowerBounds2, upperBounds2, steps, strideTailVal, - input, output, horizontalScalingFactorVec, verticalScalingFactorVec, - outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32, - inputColLastElemF32, vectorTy32, stride, c0, c0F32); + rewriter, loc, ctx, input, output, stride, horizontalScalingFactor, + verticalScalingFactor); } else if (interpolationAttr == dip::InterpolationType::BilinearInterpolation) { - Value c1F32 = indexToF32(rewriter, loc, c1); - dip::BilinearInterpolationResizing( - rewriter, loc, ctx, lowerBounds1, upperBounds1, steps, strideVal, - input, output, horizontalScalingFactorVec, verticalScalingFactorVec, - outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32, - inputColLastElemF32, vectorTy32, stride, c0, c0F32, c1F32); - - dip::BilinearInterpolationResizing( - rewriter, loc, ctx, lowerBounds2, upperBounds2, steps, strideTailVal, - input, output, horizontalScalingFactorVec, verticalScalingFactorVec, - outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32, - inputColLastElemF32, vectorTy32, stride, c0, c0F32, c1F32); + rewriter, loc, ctx, input, output, stride, horizontalScalingFactor, + verticalScalingFactor, halfVec, shiftVec, scaleVec, vectorResTy, + vectorTyI32, vectorTyI16, vectorTyIndex, vectorTyF32, vectorTyI1); } // Remove the original resize operation. diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp index d68451bb51..d89a2ddd92 100644 --- a/midend/lib/Utils/DIPUtils.cpp +++ b/midend/lib/Utils/DIPUtils.cpp @@ -989,34 +989,414 @@ void fillPixelsBilinearInterpolate4D( // Helper function for resizing an image using nearest neighbour interpolation // mechanism. -void NearestNeighbourInterpolationResizing( - OpBuilder &builder, Location loc, MLIRContext *ctx, - SmallVector lowerBounds, SmallVector upperBounds, - SmallVector steps, Value strideVal, Value input, Value output, - Value horizontalScalingFactorVec, Value verticalScalingFactorVec, - Value outputRowLastElemF32, Value outputColLastElemF32, - Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32, - int64_t stride, Value c0, Value c0F32) { - affine::buildAffineLoopNest( - builder, loc, lowerBounds, upperBounds, steps, - [&](OpBuilder &builder, Location loc, ValueRange ivs) { - Value ivs0F32 = indexToF32(builder, loc, ivs[0]); - Value yVec = builder.create(loc, vectorTy32, ivs0F32); - Value xVec = iotaVec(builder, loc, ctx, ivs[1], strideVal, vectorTy32, - c0, stride); +void NearestNeighbourInterpolationResizing(OpBuilder &builder, Location loc, + MLIRContext *ctx, Value input, + Value output, int64_t stride, + Value horizontalScalingFactor, + Value verticalScalingFactor) { + Value c0 = builder.create(loc, 0); + Value c1 = builder.create(loc, 1); - Value resXVecInterm = builder.create( - loc, xVec, horizontalScalingFactorVec); - Value resYVecInterm = - builder.create(loc, yVec, verticalScalingFactorVec); + Value inputRow = builder.create(loc, input, c0); + Value inputRowMinus1 = builder.create( + loc, builder.getI16Type(), + builder.create(loc, inputRow, c1)); + Value inputCol = builder.create(loc, input, c1); + Value inputColMinus1 = builder.create( + loc, builder.getI16Type(), + builder.create(loc, inputCol, c1)); + + Value outputRow = builder.create(loc, output, c0); + Value outputCol = builder.create(loc, output, c1); + + MemRefType dynamicTypeI16 = + MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16)); + Value srcXPosVec = + builder.create(loc, dynamicTypeI16, outputCol); + builder.create( + loc, c0, outputCol, c1, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + Value srcXPos = xBuilder.create( + xLoc, xBuilder.getI16Type(), + xBuilder.create(xLoc, + indexToF32(xBuilder, xLoc, xiv), + horizontalScalingFactor)); + srcXPos = + xBuilder.create(xLoc, srcXPos, inputRowMinus1); + xBuilder.create(xLoc, srcXPos, srcXPosVec, + ValueRange{xiv}); + xBuilder.create(xLoc); + }); - Value resXVec = roundOff(builder, loc, resXVecInterm); - Value resYVec = roundOff(builder, loc, resYVecInterm); + builder.create( + loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{c1}, + [&](OpBuilder &yBuilder, Location yLoc, ValueRange ivs) { + Value yiv = ivs[0]; + Value srcYPos = yBuilder.create( + yLoc, yBuilder.getI16Type(), + yBuilder.create( + yLoc, indexToF32(yBuilder, yLoc, yiv), verticalScalingFactor)); + srcYPos = + yBuilder.create(yLoc, srcYPos, inputColMinus1); + srcYPos = yBuilder.create( + yLoc, yBuilder.getIndexType(), srcYPos); + yBuilder.create( + loc, c0, outputCol, c1, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + Value srcXPos = xBuilder.create(xLoc, srcXPosVec, + ValueRange{xiv}); + srcXPos = xBuilder.create( + xLoc, xBuilder.getIndexType(), srcXPos); + Value srcPixel = xBuilder.create( + xLoc, input, ValueRange{srcYPos, srcXPos}); + xBuilder.create(xLoc, srcPixel, output, + ValueRange{yiv, xiv}); + xBuilder.create(xLoc); + }); + yBuilder.create(yLoc); + }); +} - fillPixels(builder, loc, xVec, yVec, resXVec, resYVec, input, output, - c0, strideVal, outputRowLastElemF32, outputColLastElemF32, - inputRowLastElemF32, inputColLastElemF32, c0F32); +void processScaling(OpBuilder &builder, Location loc, Value output, + Value scalingFactor, Value input, Value xOffset, + Value iAlpha, int64_t stride, Value scaleVec, + VectorType vectorTyI32, VectorType vectorTyF32, + VectorType vectorTyI1, VectorType vectorTyI16) { + Value c0 = builder.create(loc, 0); + Value c1 = builder.create(loc, 1); + Value c2 = builder.create(loc, 2); + Value c0I32Vec = builder.create( + loc, vectorTyI32, + builder.create(loc, builder.getI32Type(), c0)); + Value c0FVec = builder.create( + loc, vectorTyF32, + builder.create(loc, (llvm::APFloat)0.0f, + builder.getF32Type())); + Value c1FVec = builder.create( + loc, vectorTyF32, + builder.create(loc, (llvm::APFloat)1.0f, + builder.getF32Type())); + Value cDot5FVec = builder.create( + loc, vectorTyF32, + builder.create(loc, (llvm::APFloat)0.5f, + builder.getF32Type())); + Value scalingFactorVec = + builder.create(loc, vectorTyF32, scalingFactor); + + Value inputMinus1 = builder.create( + loc, builder.getI32Type(), builder.create(loc, input, c1)); + Value inputMinus1Vec = + builder.create(loc, vectorTyI32, inputMinus1); + Value inputMinus2 = builder.create( + loc, builder.getI32Type(), builder.create(loc, input, c2)); + Value inputMinus2Vec = + builder.create(loc, vectorTyI32, inputMinus2); + Value stepVec = iotaVec0F32(builder, loc, stride); + Value strideVal = builder.create(loc, stride); + + builder.create( + loc, c0, output, strideVal, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + // float fx = (float)((dx + 0.5) * scale_x - 0.5); + Value xivFVec = xBuilder.create( + xLoc, vectorTyF32, indexToF32(xBuilder, xLoc, xiv)); + xivFVec = xBuilder.create(xLoc, xivFVec, stepVec); + Value temp1 = xBuilder.create(xLoc, xivFVec, cDot5FVec); + Value temp2 = + xBuilder.create(xLoc, temp1, scalingFactorVec); + Value fx = xBuilder.create(xLoc, temp2, cDot5FVec); + Value sx = xBuilder.create(xLoc, vectorTyI32, fx); + fx = xBuilder.create( + xLoc, fx, xBuilder.create(xLoc, vectorTyF32, sx)); + Value lowerThanZero = xBuilder.create( + xLoc, arith::CmpIPredicate::slt, sx, c0I32Vec); + Value greaterThan = xBuilder.create( + xLoc, arith::CmpIPredicate::sge, sx, inputMinus1Vec); + sx = + xBuilder.create(xLoc, lowerThanZero, c0I32Vec, sx); + fx = xBuilder.create(xLoc, lowerThanZero, c0FVec, fx); + sx = xBuilder.create(xLoc, greaterThan, inputMinus2Vec, + sx); + fx = xBuilder.create(xLoc, greaterThan, c0FVec, fx); + Value maskVal = xBuilder.create(xLoc, output, xiv); + xBuilder.create( + xLoc, xOffset, ValueRange{xiv}, + xBuilder.create(xLoc, vectorTyI1, maskVal), + sx); + + // ialpha[dx * 2] = (short)((1.f - fx) * INTER_RESIZE_COEF_SCALE); + // ialpha[dx * 2 + 1] = (short)(fx * INTER_RESIZE_COEF_SCALE); + Value fxScale = xBuilder.create(xLoc, fx, scaleVec); + Value oneMinusFx = xBuilder.create(xLoc, c1FVec, fx); + Value oneMinusFxScale = + xBuilder.create(xLoc, oneMinusFx, scaleVec); + + Value val0 = xBuilder.create(xLoc, vectorTyI16, + oneMinusFxScale); + Value val1 = + xBuilder.create(xLoc, vectorTyI16, fxScale); + + SmallVector maskVec; + for (int i = 0; i < stride; i++) { + maskVec.push_back(i); + maskVec.push_back(i + stride); + } + Value index0 = xBuilder.create(xLoc, xiv, c2); + xBuilder.create( + xLoc, xBuilder.create(xLoc, val0, val1, maskVec), + iAlpha, ValueRange{index0}); + xBuilder.create(xLoc); + }); +} + +void calcInterpolation(OpBuilder &builder, Location loc, Value &sy, + Value &syPrev, Value offset, Value input, + Value outputWidth, Value iAlpha, Value buffer, + int64_t stride, Value halfVec, Value shiftVec, + VectorType vectorResTy, VectorType vectorTyIndex, + VectorType vectorTyI32, VectorType vectorTyI16, + VectorType vectorTyI1) { + Value c0 = builder.create(loc, 0); + Value c1 = builder.create(loc, 1); + Value c2 = builder.create(loc, 2); + + Value strideVal = builder.create(loc, stride); + + Value c1I1Vec = builder.create( + loc, vectorTyI1, + builder.create(loc, 1, builder.getI1Type())); + Value c1Vec = builder.create(loc, vectorTyIndex, c1); + + auto passThruRes = + builder.create(loc, builder.getZeroAttr(vectorResTy)); + auto passThruI32 = + builder.create(loc, builder.getZeroAttr(vectorTyI32)); + Value notEqual = + builder.create(loc, arith::CmpIPredicate::ne, sy, syPrev); + builder.create( + loc, notEqual, [&](OpBuilder &tBuilder, Location tLoc) { + tBuilder.create( + tLoc, c0, outputWidth, strideVal, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + Value maskVal = + xBuilder.create(xLoc, outputWidth, xiv); + Value maskVec = xBuilder.create( + xLoc, vectorTyI1, maskVal); + Value sxVec = xBuilder.create( + xLoc, vectorTyIndex, + xBuilder.create(xLoc, vectorTyI32, + offset, ValueRange{xiv}, + maskVec, passThruI32)); + Value sxPlus1Vec = + xBuilder.create(xLoc, sxVec, c1Vec); + + Value index0 = xBuilder.create(xLoc, xiv, c2); + Value index1 = + xBuilder.create(xLoc, index0, strideVal); + Value a0Vec = xBuilder.create( + xLoc, vectorTyI32, + xBuilder.create(xLoc, vectorTyI16, iAlpha, + index0)); + Value a1Vec = xBuilder.create( + xLoc, vectorTyI32, + xBuilder.create(xLoc, vectorTyI16, iAlpha, + index1)); + + Value syIndex = xBuilder.create( + xLoc, xBuilder.getIndexType(), sy); + Value v0Vec = xBuilder.create( + xLoc, vectorTyI32, + xBuilder.create( + xLoc, vectorResTy, input, ValueRange{syIndex, c0}, sxVec, + c1I1Vec, passThruRes)); + Value v1Vec = xBuilder.create( + xLoc, vectorTyI32, + xBuilder.create( + xLoc, vectorResTy, input, ValueRange{syIndex, c0}, + sxPlus1Vec, c1I1Vec, passThruRes)); + + SmallVector maskVec1, maskVec2; + for (int i = 0; i < stride; i++) { + maskVec1.push_back(i * 2); + maskVec2.push_back(i * 2 + 1); + } + Value a0ShuffleVec = xBuilder.create( + xLoc, a0Vec, a1Vec, maskVec1); + Value a1ShuffleVec = xBuilder.create( + xLoc, a0Vec, a1Vec, maskVec2); + Value aMulv0 = + xBuilder.create(xLoc, a0ShuffleVec, v0Vec); + Value aMulv1 = + xBuilder.create(xLoc, a1ShuffleVec, v1Vec); + Value addRes = + xBuilder.create(xLoc, aMulv0, aMulv1); + addRes = builder.create(loc, addRes, halfVec); + Value resShifted = + xBuilder.create(xLoc, addRes, shiftVec); + xBuilder.create( + xLoc, buffer, ValueRange{xiv}, maskVec, resShifted); + xBuilder.create(xLoc); + }); + tBuilder.create(tLoc); + }); + syPrev = builder.create(loc, notEqual, sy, syPrev); +} + +// Helper function for resizing an image using bilinear interpolation mechanism. +void BilinearInterpolationResizing( + OpBuilder &builder, Location loc, MLIRContext *ctx, Value input, + Value output, int64_t stride, Value horizontalScalingFactor, + Value verticalScalingFactor, Value halfVec, Value shiftVec, Value scaleVec, + VectorType vectorResTy, VectorType vectorTyI32, VectorType vectorTyI16, + VectorType vectorTyIndex, VectorType vectorTyF32, VectorType vectorTyI1) { + + auto passThruI32 = + builder.create(loc, builder.getZeroAttr(vectorTyI32)); + Value cMinus1 = builder.create(loc, -1); + Value c0 = builder.create(loc, 0); + Value c1 = builder.create(loc, 1); + Value c1I32 = + builder.create(loc, 1, builder.getI32Type()); + Value c2 = builder.create(loc, 2); + + Value inputRow = builder.create(loc, input, c0); + Value inputCol = builder.create(loc, input, c1); + Value outputRow = builder.create(loc, output, c0); + Value outputCol = builder.create(loc, output, c1); + + Value strideVal = builder.create(loc, stride); + Value outputColStrideRatio = + builder.create(loc, outputCol, strideVal); + Value outputColMultiple = builder.create( + loc, builder.create(loc, outputColStrideRatio, c1), + strideVal); + Value outputColMultiple2 = + builder.create(loc, outputColMultiple, c2); + Value outputRowStrideRatio = + builder.create(loc, outputRow, strideVal); + Value outputRowMultiple = builder.create( + loc, builder.create(loc, outputRowStrideRatio, c1), + strideVal); + Value outputRowMultiple2 = + builder.create(loc, outputRowMultiple, c2); + + MemRefType dynamicTypeI32 = + MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 32)); + Value xOffset = + builder.create(loc, dynamicTypeI32, outputCol); + MemRefType dynamicTypeI16 = + MemRefType::get(ShapedType::kDynamic, IntegerType::get(ctx, 16)); + Value iAlpha = + builder.create(loc, dynamicTypeI16, outputColMultiple2); + + processScaling(builder, loc, outputCol, horizontalScalingFactor, inputCol, + xOffset, iAlpha, stride, scaleVec, vectorTyI32, vectorTyF32, + vectorTyI1, vectorTyI16); + + Value yOffset = + builder.create(loc, dynamicTypeI32, outputRow); + Value iBeta = + builder.create(loc, dynamicTypeI16, outputRowMultiple2); + + processScaling(builder, loc, outputRow, verticalScalingFactor, inputRow, + yOffset, iBeta, stride, scaleVec, vectorTyI32, vectorTyF32, + vectorTyI1, vectorTyI16); + + Value bufferWidth = outputCol; + + Value zeroVec = builder.create( + loc, vectorTyI32, + builder.create(loc, 0, builder.getI32Type())); + Value twoFiftyFiveVec = builder.create( + loc, vectorTyI32, + builder.create(loc, 255, builder.getI32Type())); + + auto resizeLoop = [&](OpBuilder &yBuilder, Location yLoc, Value yiv, + ValueRange) { + Value buffer0 = + yBuilder.create(loc, dynamicTypeI32, bufferWidth); + Value buffer1 = + yBuilder.create(loc, dynamicTypeI32, bufferWidth); + Value prevSy0 = yBuilder.create( + loc, yBuilder.getI32Type(), cMinus1); + Value prevSy1 = yBuilder.create( + loc, yBuilder.getI32Type(), cMinus1); + + Value sy = yBuilder.create(yLoc, yOffset, ValueRange{yiv}); + Value syNext = yBuilder.create(yLoc, sy, c1I32); + calcInterpolation(yBuilder, yLoc, sy, prevSy0, xOffset, input, outputCol, + iAlpha, buffer0, stride, halfVec, shiftVec, vectorResTy, + vectorTyIndex, vectorTyI32, vectorTyI16, vectorTyI1); + calcInterpolation(yBuilder, yLoc, syNext, prevSy1, xOffset, input, + outputCol, iAlpha, buffer1, stride, halfVec, shiftVec, + vectorResTy, vectorTyIndex, vectorTyI32, vectorTyI16, + vectorTyI1); + + Value index0 = yBuilder.create(yLoc, yiv, c2); + Value index1 = yBuilder.create(yLoc, index0, c1); + Value b0 = yBuilder.create( + yLoc, yBuilder.getI32Type(), + yBuilder.create(yLoc, iBeta, index0)); + Value b0Vec = yBuilder.create(yLoc, vectorTyI32, b0); + Value b1 = yBuilder.create( + yLoc, yBuilder.getI32Type(), + yBuilder.create(yLoc, iBeta, index1)); + Value b1Vec = yBuilder.create(yLoc, vectorTyI32, b1); + + yBuilder.create( + yLoc, c0, bufferWidth, strideVal, std::nullopt, + [&](OpBuilder &xBuilder, Location xLoc, Value xiv, ValueRange) { + Value maskVal = + xBuilder.create(xLoc, bufferWidth, xiv); + Value maskVec = + xBuilder.create(xLoc, vectorTyI1, maskVal); + Value buffer0X = xBuilder.create( + xLoc, vectorTyI32, buffer0, ValueRange{xiv}, maskVec, + passThruI32); + Value buffer1X = xBuilder.create( + xLoc, vectorTyI32, buffer1, ValueRange{xiv}, maskVec, + passThruI32); + Value b0MulBuffer0 = + xBuilder.create(xLoc, b0Vec, buffer0X); + Value b1MulBuffer1 = + xBuilder.create(xLoc, b1Vec, buffer1X); + Value bufferRes = + xBuilder.create(xLoc, b0MulBuffer0, b1MulBuffer1); + Value addHalf = + xBuilder.create(xLoc, bufferRes, halfVec); + Value resShifted = + xBuilder.create(xLoc, addHalf, shiftVec); + Value maxVal = + xBuilder.create(xLoc, resShifted, zeroVec); + Value clampedVal = + xBuilder.create(xLoc, maxVal, twoFiftyFiveVec); + Value clampedValF = + xBuilder.create(xLoc, vectorResTy, clampedVal); + xBuilder.create( + xLoc, output, ValueRange{yiv, xiv}, maskVec, clampedValF); + xBuilder.create(xLoc); + }); + yBuilder.create(loc, buffer0); + yBuilder.create(loc, buffer1); + yBuilder.create(yLoc); + }; + + Value batch = builder.create(loc, stride); + builder.create( + loc, ValueRange{c0}, ValueRange{outputRow}, ValueRange{batch}, + [&](OpBuilder &tBuilder, Location tLoc, ValueRange ivs) { + Value tStart = ivs[0]; + Value tEnd = tBuilder.create(tLoc, tStart, batch); + tEnd = tBuilder.create(tLoc, tEnd, outputRow); + tBuilder.create(tLoc, tStart, tEnd, c1, std::nullopt, + resizeLoop); + tBuilder.create(tLoc); }); + + builder.create(loc, xOffset); + builder.create(loc, iAlpha); + builder.create(loc, yOffset); + builder.create(loc, iBeta); } // Helper function for resizing 4D an image using nearest neighbour @@ -1052,47 +1432,6 @@ void NearestNeighbourInterpolationResizing4D( }); } -// Helper function for resizing an image using bilinear interpolation mechanism. -void BilinearInterpolationResizing( - OpBuilder &builder, Location loc, MLIRContext *ctx, - SmallVector lowerBounds, SmallVector upperBounds, - SmallVector steps, Value strideVal, Value input, Value output, - Value horizontalScalingFactorVec, Value verticalScalingFactorVec, - Value outputRowLastElemF32, Value outputColLastElemF32, - Value inputRowLastElemF32, Value inputColLastElemF32, VectorType vectorTy32, - int64_t stride, Value c0, Value c0F32, Value c1F32) { - affine::buildAffineLoopNest( - builder, loc, lowerBounds, upperBounds, steps, - [&](OpBuilder &builder, Location loc, ValueRange ivs) { - Value ivs0F32 = indexToF32(builder, loc, ivs[0]); - Value yVec = builder.create(loc, vectorTy32, ivs0F32); - Value xVec = iotaVec(builder, loc, ctx, ivs[1], strideVal, vectorTy32, - c0, stride); - - Value xVecInterm = builder.create( - loc, xVec, horizontalScalingFactorVec); - Value yVecInterm = - builder.create(loc, yVec, verticalScalingFactorVec); - - Value xVecInterm_L = builder.create(loc, xVecInterm); - Value xVecInterm_H = builder.create(loc, xVecInterm); - - Value yVecInterm_L = builder.create(loc, yVecInterm); - Value yVecInterm_H = builder.create(loc, yVecInterm); - - Value xVecWeight = - builder.create(loc, xVecInterm, xVecInterm_L); - Value yVecWeight = - builder.create(loc, yVecInterm, yVecInterm_L); - - fillPixelsBilinearInterpolate( - builder, loc, xVec, yVec, xVecInterm_L, yVecInterm_L, xVecInterm_H, - yVecInterm_H, input, output, c0, strideVal, xVecWeight, yVecWeight, - outputRowLastElemF32, outputColLastElemF32, inputRowLastElemF32, - inputColLastElemF32, c0F32, c1F32); - }); -} - // Helper function for resizing 4D an image using bilinear interpolation // mechanism. void BilinearInterpolationResizing4D(