Skip to content

Commit f6b3152

Browse files
pkwasnie-inteligcbot
authored andcommitted
Optimize SPIR-V / OpenCL C "bitselect" builtin function
Add a dedicated intrinsic that guarantees "bitselect" builtin is implemented with one "bfn" instruction.
1 parent 9023e09 commit f6b3152

File tree

12 files changed

+147
-13
lines changed

12 files changed

+147
-13
lines changed

IGC/BiFModule/Headers/bif_flag_controls.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ BIF_FLAG_CONTROL(bool, UseNativeFP16AtomicMinMax)
2323
BIF_FLAG_CONTROL(bool, HasInt64SLMAtomicCAS)
2424
BIF_FLAG_CONTROL(bool, UseNativeFP64GlobalAtomicAdd)
2525
BIF_FLAG_CONTROL(bool, UseNative64BitIntBuiltin)
26+
BIF_FLAG_CONTROL(bool, UseBfn)
2627
BIF_FLAG_CONTROL(bool, HasThreadPauseSupport)
2728
BIF_FLAG_CONTROL(bool, UseNative64BitFloatBuiltin)
2829
BIF_FLAG_CONTROL(bool, hasHWLocalThreadID)

IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,10 @@ double __builtin_IB_dmin(double, double) __attribute__((const));
224224
double __builtin_IB_dmax(double, double) __attribute__((const));
225225
#endif
226226

227+
// Boolean function on three sources
228+
short __builtin_IB_bfn_i16(short, short, short, uchar) __attribute__((const));
229+
int __builtin_IB_bfn_i32(int, int, int, uchar) __attribute__((const));
230+
227231
// Atomic operations
228232
int __builtin_IB_atomic_add_global_i32(__global int*, int);
229233
int __builtin_IB_atomic_add_local_i32(__local int*, int);

IGC/BiFModule/Implementation/Relational/bitselect.cl

Lines changed: 50 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,43 +9,81 @@ SPDX-License-Identifier: MIT
99
#include "../include/BiF_Definitions.cl"
1010
#include "../../Headers/spirv.h"
1111

12+
// Bitselect can be implemented with the following boolean function:
13+
// s0 & s1 | ~s0 & s2
14+
// where s0 = c, s1 = b, s2 = a
15+
// This maps to boolean function 0xD8.
1216

1317
INLINE
1418
char SPIRV_OVERLOADABLE SPIRV_OCL_BUILTIN(bitselect, _i8_i8_i8, )( char a, char b, char c )
1519
{
16-
char temp;
17-
temp = (c & b) | (~c & a);
18-
return temp;
20+
if (BIF_FLAG_CTRL_GET(UseBfn))
21+
{
22+
return (char) __builtin_IB_bfn_i16((short)as_uchar(c), (short)as_uchar(b), (short)as_uchar(a), 0xD8);
23+
}
24+
else
25+
{
26+
char temp;
27+
temp = (c & b) | (~c & a);
28+
return temp;
29+
}
1930
}
2031

2132
GENERATE_SPIRV_OCL_VECTOR_FUNCTIONS_3ARGS( bitselect, char, char, i8 )
2233

2334
INLINE
2435
short SPIRV_OVERLOADABLE SPIRV_OCL_BUILTIN(bitselect, _i16_i16_i16, )( short a, short b, short c )
2536
{
26-
short temp;
27-
temp = (c & b) | (~c & a);
28-
return temp;
37+
if (BIF_FLAG_CTRL_GET(UseBfn))
38+
{
39+
return __builtin_IB_bfn_i16(c, b, a, 0xD8);
40+
}
41+
else
42+
{
43+
short temp;
44+
temp = (c & b) | (~c & a);
45+
return temp;
46+
}
2947
}
3048

3149
GENERATE_SPIRV_OCL_VECTOR_FUNCTIONS_3ARGS( bitselect, short, short, i16 )
3250

3351
INLINE
3452
int SPIRV_OVERLOADABLE SPIRV_OCL_BUILTIN(bitselect, _i32_i32_i32, )( int a, int b, int c )
3553
{
36-
int temp;
37-
temp = (c & b) | (~c & a);
38-
return temp;
54+
if (BIF_FLAG_CTRL_GET(UseBfn))
55+
{
56+
return __builtin_IB_bfn_i32(c, b, a, 0xD8);
57+
}
58+
else
59+
{
60+
int temp;
61+
temp = (c & b) | (~c & a);
62+
return temp;
63+
}
3964
}
4065

4166
GENERATE_SPIRV_OCL_VECTOR_FUNCTIONS_3ARGS( bitselect, int, int, i32 )
4267

4368
INLINE
4469
long SPIRV_OVERLOADABLE SPIRV_OCL_BUILTIN(bitselect, _i64_i64_i64, )( long a, long b, long c )
4570
{
46-
long temp;
47-
temp = (c & b) | (~c & a);
48-
return temp;
71+
if (BIF_FLAG_CTRL_GET(UseBfn))
72+
{
73+
int2 tmpA = as_int2(a);
74+
int2 tmpB = as_int2(b);
75+
int2 tmpC = as_int2(c);
76+
int2 tmpResult;
77+
tmpResult.s0 = __builtin_IB_bfn_i32(tmpC.s0, tmpB.s0, tmpA.s0, 0xD8);
78+
tmpResult.s1 = __builtin_IB_bfn_i32(tmpC.s1, tmpB.s1, tmpA.s1, 0xD8);
79+
return as_long(tmpResult);
80+
}
81+
else
82+
{
83+
long temp;
84+
temp = (c & b) | (~c & a);
85+
return temp;
86+
}
4987
}
5088

5189
GENERATE_SPIRV_OCL_VECTOR_FUNCTIONS_3ARGS( bitselect, long, long, i64 )

IGC/Compiler/Builtins/BIFFlagCtrl/BIFFlagCtrlResolution.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ void BIFFlagCtrlResolution::FillFlagCtrl() {
6666
PtrCGC->platform.hasThreadPauseSupport());
6767
BIF_FLAG_CTRL_SET(UseNative64BitFloatBuiltin,
6868
!PtrCGC->platform.hasNoFP64Inst());
69+
BIF_FLAG_CTRL_SET(UseBfn, IGC_IS_FLAG_ENABLED(EnableBfn) &&
70+
PtrCGC->platform.supportBfnInstruction());
6971
BIF_FLAG_CTRL_SET(hasHWLocalThreadID, PtrCGC->platform.hasHWLocalThreadID());
7072
BIF_FLAG_CTRL_SET(CRMacros, PtrCGC->platform.hasCorrectlyRoundedMacros());
7173
BIF_FLAG_CTRL_SET(

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9604,6 +9604,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
96049604
case GenISAIntrinsic::GenISA_WaveClusteredInterleave:
96059605
emitWaveClusteredInterleave(inst);
96069606
break;
9607+
case GenISAIntrinsic::GenISA_bfn:
9608+
emitBfn(inst);
9609+
break;
96079610
case GenISAIntrinsic::GenISA_dp4a_ss:
96089611
case GenISAIntrinsic::GenISA_dp4a_uu:
96099612
case GenISAIntrinsic::GenISA_dp4a_su:
@@ -17789,6 +17792,19 @@ void EmitPass::emitFPOWithNonDefaultRoundingMode(llvm::GenIntrinsicInst* inst)
1778917792
ResetRoundingMode(inst);
1779017793
}
1779117794

17795+
void EmitPass::emitBfn(llvm::GenIntrinsicInst* inst)
17796+
{
17797+
IGC_ASSERT_MESSAGE(isa<ConstantInt>(inst->getArgOperand(3)), "booleanFuncCtrl must be const!");
17798+
const uint8_t booleanFuncCtrl = int_cast<uint8_t>(cast<ConstantInt>(inst->getArgOperand(3))->getZExtValue());
17799+
17800+
CVariable* src0 = GetSymbol(inst->getOperand(0));
17801+
CVariable* src1 = GetSymbol(inst->getOperand(1));
17802+
CVariable* src2 = GetSymbol(inst->getOperand(2));
17803+
17804+
m_encoder->Bfn(booleanFuncCtrl, m_destination, src0, src1, src2);
17805+
m_encoder->Push();
17806+
}
17807+
1779217808
void EmitPass::emitftoi(llvm::GenIntrinsicInst* inst)
1779317809
{
1779417810
IGC_ASSERT_MESSAGE(inst->getOperand(0)->getType()->isFloatingPointTy(), "Unsupported type");

IGC/Compiler/CISACodeGen/EmitVISAPass.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,8 @@ class EmitPass : public llvm::FunctionPass
455455
void emitftoi(llvm::GenIntrinsicInst* inst);
456456
void emitCtlz(const SSource& source);
457457

458+
void emitBfn(llvm::GenIntrinsicInst* inst);
459+
458460

459461
// VME
460462
void emitVMESendIME(llvm::GenIntrinsicInst* inst);

IGC/Compiler/CISACodeGen/WIAnalysis.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1508,7 +1508,8 @@ WIAnalysis::WIDependancy WIAnalysisRunner::calculate_dep(const CallInst* inst)
15081508
GII_id == GenISAIntrinsic::GenISA_LSC2DBlockCreateAddrPayload ||
15091509
GII_id == GenISAIntrinsic::GenISA_LSC2DBlockCopyAddrPayload ||
15101510
GII_id == GenISAIntrinsic::GenISA_PredicatedLoad ||
1511-
GII_id == GenISAIntrinsic::GenISA_PredicatedStore)
1511+
GII_id == GenISAIntrinsic::GenISA_PredicatedStore ||
1512+
GII_id == GenISAIntrinsic::GenISA_bfn)
15121513
{
15131514
switch (GII_id)
15141515
{

IGC/Compiler/CISACodeGen/opCode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ DECLARE_OPCODE(GenISA_dp4a_ss, GenISAIntrinsic, llvm_dp4a_ss, false, true, true,
156156
DECLARE_OPCODE(GenISA_dp4a_uu, GenISAIntrinsic, llvm_dp4a_uu, false, true, true, true, false, false, false)
157157
DECLARE_OPCODE(GenISA_dp4a_su, GenISAIntrinsic, llvm_dp4a_su, false, true, true, true, false, false, false)
158158
DECLARE_OPCODE(GenISA_dp4a_us, GenISAIntrinsic, llvm_dp4a_us, false, true, true, true, false, false, false)
159+
DECLARE_OPCODE(GenISA_bfn, GenISAIntrinsic, llvm_bfn, false, false, false, false, false, false, false)
159160

160161
// GS Intrinsics
161162
DECLARE_OPCODE(GenISA_OUTPUTGS, GenISAIntrinsic, llvm_output_gs, false, false, false, false, false, false, false)

IGC/Compiler/Optimizer/OCLBIUtils.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1758,6 +1758,9 @@ CBuiltinsResolver::CBuiltinsResolver(CImagesBI::ParamMap* paramMap, CImagesBI::I
17581758

17591759
m_CommandMap[StringRef("__builtin_IB_samplepos")] = CSamplePos::create();
17601760

1761+
m_CommandMap["__builtin_IB_bfn_i16"] = CSimpleIntrinMapping::create(GenISAIntrinsic::GenISA_bfn);
1762+
m_CommandMap["__builtin_IB_bfn_i32"] = CSimpleIntrinMapping::create(GenISAIntrinsic::GenISA_bfn);
1763+
17611764
// `dp4a` built-ins
17621765
m_CommandMap["__builtin_IB_dp4a_ss"] = CSimpleIntrinMapping::create(GenISAIntrinsic::GenISA_dp4a_ss, false);
17631766
m_CommandMap["__builtin_IB_dp4a_uu"] = CSimpleIntrinMapping::create(GenISAIntrinsic::GenISA_dp4a_uu, false);

IGC/GenISAIntrinsics/generator/input/Intrinsic_definitions.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11730,3 +11730,31 @@ intrinsics:
1173011730
memory_effects:
1173111731
- !<MemoryRestriction>
1173211732
memory_access: !MemoryAccessType NoModRef
11733+
- !<IntrinsicDefinition>
11734+
name: "GenISA_bfn"
11735+
comment: "Performs an arbitrary boolean logical operation on three sources."
11736+
return_definition: !<ReturnDefinition>
11737+
type_definition: *any_int
11738+
comment: "result"
11739+
arguments:
11740+
- !<ArgumentDefinition>
11741+
name: Arg0
11742+
type_definition: *ref_0_
11743+
comment: "source0 (a)"
11744+
- !<ArgumentDefinition>
11745+
name: Arg1
11746+
type_definition: *ref_0_
11747+
comment: "source1 (b)"
11748+
- !<ArgumentDefinition>
11749+
name: Arg2
11750+
type_definition: *ref_0_
11751+
comment: "source2 (c)"
11752+
- !<ArgumentDefinition>
11753+
name: Arg3
11754+
type_definition: *i8
11755+
comment: "Boolean function"
11756+
attributes:
11757+
- !AttributeID "NoUnwind"
11758+
memory_effects:
11759+
- !<MemoryRestriction>
11760+
memory_access: !MemoryAccessType NoModRef

IGC/VectorCompiler/lib/GenXOpts/CMTrans/GenXBIFFlagCtrlResolution.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ void GenXBIFFlagCtrlResolution::FillFlagCtrl() {
8989
BIF_FLAG_CTRL_SET(hasHWLocalThreadID, false);
9090
BIF_FLAG_CTRL_SET(APIRS, false);
9191
BIF_FLAG_CTRL_SET(UseLSC, false);
92+
BIF_FLAG_CTRL_SET(UseBfn, false);
9293
BIF_FLAG_CTRL_SET(ForceL1Prefetch, false);
9394
BIF_FLAG_CTRL_SET(UseNativeFP64GlobalAtomicAdd, false);
9495
BIF_FLAG_CTRL_SET(MaxHWThreadIDPerSubDevice, 1);

IGC/ocloc_tests/Builtins/bitselect.cl

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/*========================== begin_copyright_notice ============================
2+
3+
Copyright (C) 2025 Intel Corporation
4+
5+
SPDX-License-Identifier: MIT
6+
7+
============================= end_copyright_notice ===========================*/
8+
9+
// REQUIRES: regkeys, xe2-supported
10+
// UNSUPPORTED: sys32
11+
12+
// RUN: ocloc compile -file %s -options " -igc_opts 'DumpVISAASMToConsole=1'" -device bmg | FileCheck %s
13+
14+
// CHECK: .kernel "test_i32"
15+
// CHECK: lsc_load.ugm (M1, 32) [[A:V[0-9]+]]:d32 flat[{{V[0-9]+}}]:a64
16+
// CHECK: lsc_load.ugm (M1, 32) [[B:V[0-9]+]]:d32 flat[{{V[0-9]+}}]:a64
17+
// CHECK: lsc_load.ugm (M1, 32) [[C:V[0-9]+]]:d32 flat[{{V[0-9]+}}]:a64
18+
// CHECK: bfn.xd8 (M1, 32) [[RESULT:V[0-9]+]](0,0)<1> [[C]](0,0)<1;1,0> [[B]](0,0)<1;1,0> [[A]](0,0)<1;1,0>
19+
// CHECK: lsc_store.ugm (M1, 32) flat[{{V[0-9]+}}]:a64 [[RESULT]]:d32
20+
__attribute__((intel_reqd_sub_group_size(32)))
21+
kernel void test_i32(global int* a, global int* b, global int* c) {
22+
int i = get_global_id(0);
23+
a[i] = bitselect(a[i], b[i], c[i]);
24+
}
25+
26+
// CHECK: .kernel "test_i64"
27+
// CHECK: lsc_load.ugm (M1, 32) [[A:V[0-9]+]]:d32x2 flat[{{V[0-9]+}}]:a64
28+
// CHECK: lsc_load.ugm (M1, 32) [[B:V[0-9]+]]:d32x2 flat[{{V[0-9]+}}]:a64
29+
// CHECK: lsc_load.ugm (M1, 32) [[C:V[0-9]+]]:d32x2 flat[{{V[0-9]+}}]:a64
30+
// CHECK: bfn.xd8 (M1, 32) [[RESULT:.+]](0,0)<1> [[C]](0,0)<1;1,0> [[B]](0,0)<1;1,0> [[A]](0,0)<1;1,0>
31+
// CHECK: bfn.xd8 (M1, 32) [[RESULT]](2,0)<1> [[C]](2,0)<1;1,0> [[B]](2,0)<1;1,0> [[A]](2,0)<1;1,0>
32+
// CHECK: lsc_store.ugm (M1, 32) flat[{{V[0-9]+}}]:a64 [[RESULT]]:d32x2
33+
__attribute__((intel_reqd_sub_group_size(32)))
34+
kernel void test_i64(global long* a, global long* b, global long* c) {
35+
int i = get_global_id(0);
36+
a[i] = bitselect(a[i], b[i], c[i]);
37+
}

0 commit comments

Comments
 (0)