@@ -100,6 +100,42 @@ static mlir::Value emitX86MaskAddLogic(CIRGenBuilderTy &builder,
100100 return builder.createBitcast (resVec, ops[0 ].getType ());
101101}
102102
103+ static mlir::Value emitX86MaskUnpack (CIRGenBuilderTy &builder,
104+ mlir::Location loc,
105+ const std::string &intrinsicName,
106+ SmallVectorImpl<mlir::Value> &ops) {
107+ unsigned numElems = cast<cir::IntType>(ops[0 ].getType ()).getWidth ();
108+
109+ // Convert both operands to mask vectors.
110+ mlir::Value lhs = getMaskVecValue (builder, loc, ops[0 ], numElems);
111+ mlir::Value rhs = getMaskVecValue (builder, loc, ops[1 ], numElems);
112+
113+ mlir::Type i32Ty = builder.getSInt32Ty ();
114+
115+ // Create indices for extracting the first half of each vector.
116+ SmallVector<mlir::Attribute, 32 > halfIndices;
117+ for (auto i : llvm::seq<unsigned >(0 , numElems / 2 ))
118+ halfIndices.push_back (cir::IntAttr::get (i32Ty, i));
119+
120+ // Extract first half of each vector. This gives better codegen than
121+ // doing it in a single shuffle.
122+ mlir::Value lhsHalf = builder.createVecShuffle (loc, lhs, lhs, halfIndices);
123+ mlir::Value rhsHalf = builder.createVecShuffle (loc, rhs, rhs, halfIndices);
124+
125+ // Create indices for concatenating the vectors.
126+ // NOTE: Operands are swapped to match the intrinsic definition.
127+ // After the half extraction, both vectors have numElems/2 elements.
128+ // In createVecShuffle(rhsHalf, lhsHalf, indices), indices [0..numElems/2-1] select
129+ // from rhsHalf, and indices [numElems/2..numElems-1] select from lhsHalf.
130+ SmallVector<mlir::Attribute, 64 > concatIndices;
131+ for (auto i : llvm::seq<unsigned >(0 , numElems))
132+ concatIndices.push_back (cir::IntAttr::get (i32Ty, i));
133+
134+ // Concat the vectors (RHS first, then LHS).
135+ mlir::Value res = builder.createVecShuffle (loc, rhsHalf, lhsHalf, concatIndices);
136+ return builder.createBitcast (res, ops[0 ].getType ());
137+ }
138+
103139static mlir::Value emitX86MaskLogic (CIRGenBuilderTy &builder,
104140 mlir::Location loc,
105141 cir::BinOpKind binOpKind,
@@ -244,42 +280,15 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
244280 getContext ().BuiltinInfo .getName (builtinID));
245281 return {};
246282
247- case X86::BI__builtin_ia32_kunpckdi:
283+ case X86::BI__builtin_ia32_kunpckhi:
284+ return emitX86MaskUnpack (builder, getLoc (expr->getExprLoc ()),
285+ " x86.avx512.kunpackb" , ops);
248286 case X86::BI__builtin_ia32_kunpcksi:
249- case X86::BI__builtin_ia32_kunpckhi: {
250- // Get the number of elements from the bit width of the first operand.
251- unsigned numElems = cast<cir::IntType>(ops[0 ].getType ()).getWidth ();
252-
253- // Convert both operands to mask vectors.
254- mlir::Value lhs = getMaskVecValue (*this , expr, ops[0 ], numElems);
255- mlir::Value rhs = getMaskVecValue (*this , expr, ops[1 ], numElems);
256-
257- mlir::Location loc = getLoc (expr->getExprLoc ());
258-
259- // Create indices for extracting the first half of each vector.
260- SmallVector<mlir::Attribute, 32 > halfIndices;
261- mlir::Type i32Ty = builder.getSInt32Ty ();
262- for (auto i : llvm::seq<unsigned >(0 , numElems / 2 ))
263- halfIndices.push_back (cir::IntAttr::get (i32Ty, i));
264-
265- // Extract first half of each vector. This gives better codegen than
266- // doing it in a single shuffle.
267- lhs = builder.createVecShuffle (loc, lhs, lhs, halfIndices);
268- rhs = builder.createVecShuffle (loc, rhs, rhs, halfIndices);
269-
270- // Create indices for concatenating the vectors.
271- // NOTE: Operands are swapped to match the intrinsic definition.
272- // After the half extraction, both vectors have numElems/2 elements.
273- // In createVecShuffle(rhs, lhs, indices), indices [0..numElems/2-1] select
274- // from rhs, and indices [numElems/2..numElems-1] select from lhs.
275- SmallVector<mlir::Attribute, 64 > concatIndices;
276- for (auto i : llvm::seq<unsigned >(0 , numElems))
277- concatIndices.push_back (cir::IntAttr::get (i32Ty, i));
278-
279- // Concat the vectors (RHS first, then LHS).
280- mlir::Value res = builder.createVecShuffle (loc, rhs, lhs, concatIndices);
281- return builder.createBitcast (res, ops[0 ].getType ());
282- }
287+ return emitX86MaskUnpack (builder, getLoc (expr->getExprLoc ()),
288+ " x86.avx512.kunpackw" , ops);
289+ case X86::BI__builtin_ia32_kunpckdi:
290+ return emitX86MaskUnpack (builder, getLoc (expr->getExprLoc ()),
291+ " x86.avx512.kunpackd" , ops);
283292
284293 case X86::BI_mm_setcsr:
285294 case X86::BI__builtin_ia32_ldmxcsr: {
0 commit comments