Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
2f02de3
feat: Add support for kunpack builtins
ahmednoursphinx Nov 19, 2025
ad1525e
chore: format files
ahmednoursphinx Nov 19, 2025
2b75e07
refactor: rename property to include full name
ahmednoursphinx Nov 19, 2025
052c48e
refactor: move logic to helper function and modify function name
ahmednoursphinx Nov 20, 2025
dab0645
refactor: rearrange funcs
ahmednoursphinx Nov 20, 2025
faab7d2
refactor: optimize createVecShuffle
ahmednoursphinx Nov 20, 2025
bb801bc
refactor: move test functions
ahmednoursphinx Nov 20, 2025
212c5df
chore: Format files
ahmednoursphinx Nov 20, 2025
b7ceb04
Merge branch 'main' into issue_-167765_kunpck
ahmednoursphinx Nov 20, 2025
cf728a9
refactor: remove duplicate test
ahmednoursphinx Nov 20, 2025
107d4a1
Update CIRGenBuiltinX86.cpp
ahmednoursphinx Nov 20, 2025
bff4131
Update CIRGenBuiltinX86.cpp
ahmednoursphinx Nov 20, 2025
553aa74
refactor: remove redundant tests
ahmednoursphinx Nov 20, 2025
349fb13
Update avx512bw-builtins.c
ahmednoursphinx Nov 20, 2025
14d8eab
Merge branch 'main' into issue_-167765_kunpck
ahmednoursphinx Nov 25, 2025
19e365d
refactor: Use getMaskVecValue func
ahmednoursphinx Nov 25, 2025
0f28143
feat: add test
ahmednoursphinx Nov 25, 2025
2c68211
chore: fix test
ahmednoursphinx Nov 25, 2025
ce3956e
Merge branch 'main' into issue_-167765_kunpck
ahmednoursphinx Nov 30, 2025
f56a71f
chore: fix merge conflict by adding test to the correct location
ahmednoursphinx Nov 30, 2025
36cd79a
chore: update CIR label
ahmednoursphinx Nov 30, 2025
bef4953
feat: add Also a test for _mm512_kunpackw and _mm512_kunpackd.
ahmednoursphinx Nov 30, 2025
416e122
feat: refactor to use a function instead of inlining and give name to…
ahmednoursphinx Nov 30, 2025
e7a6386
chore: Format files
ahmednoursphinx Nov 30, 2025
8318cea
feat: use builder
ahmednoursphinx Nov 30, 2025
8521696
refactor: fix tests
ahmednoursphinx Nov 30, 2025
6cbd359
Merge branch 'main' into issue_-167765_kunpck
ahmednoursphinx Dec 2, 2025
02b22e6
Merge branch 'main' into issue_-167765_kunpck
ahmednoursphinx Dec 2, 2025
cf8238b
chore: fix merge conflict
ahmednoursphinx Dec 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 48 additions & 3 deletions clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,28 @@ static mlir::Value emitIntrinsicCallOp(CIRGenFunction &cgf, const CallExpr *e,
.getResult();
}

static mlir::Value getMaskVecValue(CIRGenBuilderTy &builder, mlir::Value mask,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is function is also being added in #168591

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

refactored to specifically refer to kunpack operation

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that's what we want. The function will be shared. I was just mentioning it because we'll want the same implementation in both PRs and whichever one is merged second will need to be rebased on the other.

unsigned numElems) {
auto maskIntType = mlir::cast<cir::IntType>(mask.getType());
unsigned maskWidth = maskIntType.getWidth();

// Create a vector of bool type with maskWidth elements
auto maskVecType =
cir::VectorType::get(builder.getContext(),
cir::BoolType::get(builder.getContext()), maskWidth);
mlir::Value maskVec = builder.createBitcast(mask, maskVecType);

// If we have less than 8 elements, then the starting mask was an i8 and
// we need to extract down to the right number of elements.
if (numElems < 8) {
llvm::SmallVector<int64_t, 4> indices;
for (unsigned i = 0; i != numElems; ++i)
indices.push_back(i);
maskVec = builder.createVecShuffle(mask.getLoc(), maskVec, indices);
}
return maskVec;
}

// OG has unordered comparison as a form of optimization in addition to
// ordered comparison, while CIR doesn't.
//
Expand Down Expand Up @@ -169,6 +191,32 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_vec_set_v16hi:
case X86::BI__builtin_ia32_vec_set_v8si:
case X86::BI__builtin_ia32_vec_set_v4di:

case X86::BI__builtin_ia32_kunpckdi:
case X86::BI__builtin_ia32_kunpcksi:
case X86::BI__builtin_ia32_kunpckhi: {
auto maskIntType = mlir::cast<cir::IntType>(ops[0].getType());
unsigned numElems = maskIntType.getWidth();
mlir::Value lhs = getMaskVecValue(builder, ops[0], numElems);
mlir::Value rhs = getMaskVecValue(builder, ops[1], numElems);
llvm::SmallVector<int64_t, 64> indices;
for (unsigned i = 0; i != numElems; ++i)
indices.push_back(i);

// First extract half of each vector. This gives better codegen than
// doing it in a single shuffle.
mlir::Location loc = getLoc(expr->getExprLoc());
lhs = builder.createVecShuffle(
loc, lhs, llvm::ArrayRef(indices.data(), numElems / 2));
rhs = builder.createVecShuffle(
loc, rhs, llvm::ArrayRef(indices.data(), numElems / 2));
// Concat the vectors.
// NOTE: Operands are swapped to match the intrinsic definition.
mlir::Value res = builder.createVecShuffle(
loc, rhs, lhs, llvm::ArrayRef(indices.data(), numElems));
return builder.createBitcast(res, ops[0].getType());
}

case X86::BI_mm_setcsr:
case X86::BI__builtin_ia32_ldmxcsr:
case X86::BI_mm_getcsr:
Expand Down Expand Up @@ -675,9 +723,6 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
case X86::BI__builtin_ia32_kmovw:
case X86::BI__builtin_ia32_kmovd:
case X86::BI__builtin_ia32_kmovq:
case X86::BI__builtin_ia32_kunpckdi:
case X86::BI__builtin_ia32_kunpcksi:
case X86::BI__builtin_ia32_kunpckhi:
case X86::BI__builtin_ia32_sqrtsh_round_mask:
case X86::BI__builtin_ia32_sqrtsd_round_mask:
case X86::BI__builtin_ia32_sqrtss_round_mask:
Expand Down
92 changes: 92 additions & 0 deletions clang/test/CIR/CodeGen/X86/avx512-kunpck-builtins.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512bw -fclangir -emit-cir -o %t.cir -Wall -Werror
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512bw -fclangir -emit-llvm -o %t.ll -Wall -Werror
// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s

// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512f -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG

// This test exercises the kunpck (mask unpack) builtins for AVX-512.

#include <immintrin.h>

__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
// CIR-LABEL: test_mm512_kunpackb
// LLVM-LABEL: test_mm512_kunpackb
// OGCG-LABEL: test_mm512_kunpackb
return _mm512_kunpackb(__A, __B);
// CIR: [[MASK_A:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.int<u, 16>), !cir.vector<!cir.bool x 16>
// CIR: [[EXTRACT_A:%.*]] = cir.vec.shuffle([[MASK_A]], {{.*}}) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i]
// CIR: [[MASK_B:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.int<u, 16>), !cir.vector<!cir.bool x 16>
// CIR: [[EXTRACT_B:%.*]] = cir.vec.shuffle([[MASK_B]], {{.*}}) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i]
// CIR: [[CONCAT:%.*]] = cir.vec.shuffle([[EXTRACT_B]], [[EXTRACT_A]]) [#cir.int<0> : !s32i, #cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i]
// CIR: {{%.*}} = cir.cast(bitcast, [[CONCAT]] : !cir.vector<!cir.bool x 16>), !cir.int<u, 16>

// LLVM: [[A_BITCAST:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
// LLVM: [[A_SHUFFLE:%.*]] = shufflevector <16 x i1> [[A_BITCAST]], <16 x i1> [[A_BITCAST]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// LLVM: [[B_BITCAST:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
// LLVM: [[B_SHUFFLE:%.*]] = shufflevector <16 x i1> [[B_BITCAST]], <16 x i1> [[B_BITCAST]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// LLVM: [[CONCAT:%.*]] = shufflevector <8 x i1> [[B_SHUFFLE]], <8 x i1> [[A_SHUFFLE]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// LLVM: bitcast <16 x i1> [[CONCAT]] to i16

// OGCG: [[A_BITCAST:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
// OGCG: [[A_SHUFFLE:%.*]] = shufflevector <16 x i1> [[A_BITCAST]], <16 x i1> [[A_BITCAST]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// OGCG: [[B_BITCAST:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
// OGCG: [[B_SHUFFLE:%.*]] = shufflevector <16 x i1> [[B_BITCAST]], <16 x i1> [[B_BITCAST]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// OGCG: [[CONCAT:%.*]] = shufflevector <8 x i1> [[B_SHUFFLE]], <8 x i1> [[A_SHUFFLE]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// OGCG: bitcast <16 x i1> [[CONCAT]] to i16
}

__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) {
// CIR-LABEL: test_mm512_kunpackw
// LLVM-LABEL: test_mm512_kunpackw
// OGCG-LABEL: test_mm512_kunpackw
return _mm512_kunpackw(__A, __B);
// CIR: [[MASK_A:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.int<u, 32>), !cir.vector<!cir.bool x 32>
// CIR: [[EXTRACT_A:%.*]] = cir.vec.shuffle([[MASK_A]], {{.*}})
// CIR: [[MASK_B:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.int<u, 32>), !cir.vector<!cir.bool x 32>
// CIR: [[EXTRACT_B:%.*]] = cir.vec.shuffle([[MASK_B]], {{.*}})
// CIR: [[CONCAT:%.*]] = cir.vec.shuffle([[EXTRACT_B]], [[EXTRACT_A]])
// CIR: {{%.*}} = cir.cast(bitcast, [[CONCAT]] : !cir.vector<!cir.bool x 32>), !cir.int<u, 32>

// LLVM: [[A_BITCAST:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
// LLVM: [[A_SHUFFLE:%.*]] = shufflevector <32 x i1> [[A_BITCAST]], <32 x i1> [[A_BITCAST]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// LLVM: [[B_BITCAST:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
// LLVM: [[B_SHUFFLE:%.*]] = shufflevector <32 x i1> [[B_BITCAST]], <32 x i1> [[B_BITCAST]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// LLVM: [[CONCAT:%.*]] = shufflevector <16 x i1> [[B_SHUFFLE]], <16 x i1> [[A_SHUFFLE]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
// LLVM: bitcast <32 x i1> [[CONCAT]] to i32

// OGCG: [[A_BITCAST:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
// OGCG: [[A_SHUFFLE:%.*]] = shufflevector <32 x i1> [[A_BITCAST]], <32 x i1> [[A_BITCAST]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// OGCG: [[B_BITCAST:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
// OGCG: [[B_SHUFFLE:%.*]] = shufflevector <32 x i1> [[B_BITCAST]], <32 x i1> [[B_BITCAST]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// OGCG: [[CONCAT:%.*]] = shufflevector <16 x i1> [[B_SHUFFLE]], <16 x i1> [[A_SHUFFLE]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
// OGCG: bitcast <32 x i1> [[CONCAT]] to i32
}

__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) {
// CIR-LABEL: test_mm512_kunpackd
// LLVM-LABEL: test_mm512_kunpackd
// OGCG-LABEL: test_mm512_kunpackd
return _mm512_kunpackd(__A, __B);
// CIR: [[MASK_A:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.int<u, 64>), !cir.vector<!cir.bool x 64>
// CIR: [[EXTRACT_A:%.*]] = cir.vec.shuffle([[MASK_A]], {{.*}})
// CIR: [[MASK_B:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.int<u, 64>), !cir.vector<!cir.bool x 64>
// CIR: [[EXTRACT_B:%.*]] = cir.vec.shuffle([[MASK_B]], {{.*}})
// CIR: [[CONCAT:%.*]] = cir.vec.shuffle([[EXTRACT_B]], [[EXTRACT_A]])
// CIR: {{%.*}} = cir.cast(bitcast, [[CONCAT]] : !cir.vector<!cir.bool x 64>), !cir.int<u, 64>

// LLVM: [[A_BITCAST:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
// LLVM: [[A_SHUFFLE:%.*]] = shufflevector <64 x i1> [[A_BITCAST]], <64 x i1> [[A_BITCAST]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
// LLVM: [[B_BITCAST:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
// LLVM: [[B_SHUFFLE:%.*]] = shufflevector <64 x i1> [[B_BITCAST]], <64 x i1> [[B_BITCAST]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
// LLVM: [[CONCAT:%.*]] = shufflevector <32 x i1> [[B_SHUFFLE]], <32 x i1> [[A_SHUFFLE]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
// LLVM: bitcast <64 x i1> [[CONCAT]] to i64

// OGCG: [[A_BITCAST:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
// OGCG: [[A_SHUFFLE:%.*]] = shufflevector <64 x i1> [[A_BITCAST]], <64 x i1> [[A_BITCAST]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
// OGCG: [[B_BITCAST:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
// OGCG: [[B_SHUFFLE:%.*]] = shufflevector <64 x i1> [[B_BITCAST]], <64 x i1> [[B_BITCAST]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
// OGCG: [[CONCAT:%.*]] = shufflevector <32 x i1> [[B_SHUFFLE]], <32 x i1> [[A_SHUFFLE]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
// OGCG: bitcast <64 x i1> [[CONCAT]] to i64
}

Loading