Skip to content

Commit 9ffcf1f

Browse files
committed
aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]
This is similar to the recent improvements to the Advanced SIMD popcount expansion by using SVE. We can utilize SVE to generate more efficient code for scalar mode popcount too. Changes since v1: * v2: Add a new VNx1BI mode and a new test case for V1DI. * v3: Abandon VNx1BI changes and add a new variant of aarch64_ptrue_reg. PR target/113860 gcc/ChangeLog: * config/aarch64/aarch64-protos.h (aarch64_ptrue_reg): New function. * config/aarch64/aarch64-simd.md (popcount<mode>2): Update pattern to also support V1DI mode. * config/aarch64/aarch64.cc (aarch64_ptrue_reg): New function. * config/aarch64/aarch64.md (popcount<mode>2): Add TARGET_SVE support. * config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator. (SVE_VDQ_I): Add V1DI. (bitsize): Likewise. (VPRED): Likewise. (VEC_POP_MODE): New mode attribute. (vec_pop_mode): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/popcnt-sve.c: Update test. * gcc.target/aarch64/popcnt11.c: New test. * gcc.target/aarch64/popcnt12.c: New test. Signed-off-by: Pengxuan Zheng <[email protected]>
1 parent 774ad67 commit 9ffcf1f

File tree

8 files changed

+139
-11
lines changed

8 files changed

+139
-11
lines changed

gcc/config/aarch64/aarch64-protos.h

+1
Original file line numberDiff line numberDiff line change
@@ -917,6 +917,7 @@ rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx);
917917
void aarch64_expand_mov_immediate (rtx, rtx);
918918
rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type);
919919
rtx aarch64_ptrue_reg (machine_mode);
920+
rtx aarch64_ptrue_reg (machine_mode, unsigned int);
920921
rtx aarch64_pfalse_reg (machine_mode);
921922
bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
922923
void aarch64_emit_sve_pred_move (rtx, rtx, rtx);

gcc/config/aarch64/aarch64-simd.md

+12-3
Original file line numberDiff line numberDiff line change
@@ -3516,19 +3516,28 @@
35163516
)
35173517

35183518
(define_expand "popcount<mode>2"
3519-
[(set (match_operand:VDQHSD 0 "register_operand")
3520-
(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
3519+
[(set (match_operand:VDQHSD_V1DI 0 "register_operand")
3520+
(popcount:VDQHSD_V1DI
3521+
(match_operand:VDQHSD_V1DI 1 "register_operand")))]
35213522
"TARGET_SIMD"
35223523
{
35233524
if (TARGET_SVE)
35243525
{
3525-
rtx p = aarch64_ptrue_reg (<VPRED>mode);
3526+
rtx p = aarch64_ptrue_reg (<VPRED>mode, <bitsize> == 64 ? 8 : 16);
35263527
emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
35273528
p,
35283529
operands[1]));
35293530
DONE;
35303531
}
35313532

3533+
if (<MODE>mode == V1DImode)
3534+
{
3535+
rtx out = gen_reg_rtx (DImode);
3536+
emit_insn (gen_popcountdi2 (out, gen_lowpart (DImode, operands[1])));
3537+
emit_move_insn (operands[0], gen_lowpart (<MODE>mode, out));
3538+
DONE;
3539+
}
3540+
35323541
/* Generate a byte popcount. */
35333542
machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
35343543
machine_mode mode2 = <bitsize> == 64 ? V2SImode : V4SImode;

gcc/config/aarch64/aarch64.cc

+21
Original file line numberDiff line numberDiff line change
@@ -3630,6 +3630,27 @@ aarch64_ptrue_reg (machine_mode mode)
36303630
return gen_lowpart (mode, reg);
36313631
}
36323632

3633+
/* Return an all-true (restricted to the leading VL bits) predicate register of
3634+
mode MODE. */
3635+
3636+
rtx
3637+
aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
3638+
{
3639+
gcc_assert (aarch64_sve_pred_mode_p (mode));
3640+
3641+
rtx_vector_builder builder (VNx16BImode, vl, 2);
3642+
3643+
for (int i = 0; i < vl; i++)
3644+
builder.quick_push (CONST1_RTX (BImode));
3645+
3646+
for (int i = 0; i < vl; i++)
3647+
builder.quick_push (CONST0_RTX (BImode));
3648+
3649+
rtx const_vec = builder.build ();
3650+
rtx reg = force_reg (VNx16BImode, const_vec);
3651+
return gen_lowpart (mode, reg);
3652+
}
3653+
36333654
/* Return an all-false predicate register of mode MODE. */
36343655

36353656
rtx

gcc/config/aarch64/aarch64.md

+9
Original file line numberDiff line numberDiff line change
@@ -5345,6 +5345,15 @@
53455345
(popcount:ALLI (match_operand:ALLI 1 "register_operand")))]
53465346
"TARGET_CSSC ? GET_MODE_BITSIZE (<MODE>mode) >= 32 : TARGET_SIMD"
53475347
{
5348+
if (!TARGET_CSSC && TARGET_SVE && <MODE>mode != QImode)
5349+
{
5350+
rtx tmp = gen_reg_rtx (<VEC_POP_MODE>mode);
5351+
rtx op1 = gen_lowpart (<VEC_POP_MODE>mode, operands[1]);
5352+
emit_insn (gen_popcount<vec_pop_mode>2 (tmp, op1));
5353+
emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
5354+
DONE;
5355+
}
5356+
53485357
if (!TARGET_CSSC)
53495358
{
53505359
rtx v = gen_reg_rtx (V8QImode);

gcc/config/aarch64/iterators.md

+13-3
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,8 @@
290290
;; Advanced SIMD modes for H, S and D types.
291291
(define_mode_iterator VDQHSD [V4HI V8HI V2SI V4SI V2DI])
292292

293+
;; As VDQHSD, with the addition of V1DI.
(define_mode_iterator VDQHSD_V1DI [VDQHSD V1DI])
294+
293295
;; Advanced SIMD and scalar integer modes for H and S.
294296
(define_mode_iterator VSDQ_HSI [V4HI V8HI V2SI V4SI HI SI])
295297

@@ -559,7 +561,7 @@
559561
(define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
560562

561563
;; All SVE and Advanced SIMD integer vector modes.
562-
(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
564+
(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I V1DI])
563565

564566
;; SVE integer vector modes whose elements are 16 bits or wider.
565567
(define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
@@ -1235,7 +1237,7 @@
12351237
(define_mode_attr bitsize [(V8QI "64") (V16QI "128")
12361238
(V4HI "64") (V8HI "128")
12371239
(V2SI "64") (V4SI "128")
1238-
(V2DI "128")])
1240+
(V1DI "64") (V2DI "128")])
12391241

12401242
;; Map a floating point or integer mode to the appropriate register name prefix
12411243
(define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
@@ -2297,7 +2299,7 @@
22972299
(VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
22982300
(V8QI "VNx8BI") (V16QI "VNx16BI")
22992301
(V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
2300-
(V4SI "VNx4BI") (V2DI "VNx2BI")])
2302+
(V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])
23012303

23022304
;; ...and again in lower case.
23032305
(define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
@@ -2331,6 +2333,14 @@
23312333
(VNx4SI "VNx8SI") (VNx4SF "VNx8SF")
23322334
(VNx2DI "VNx4DI") (VNx2DF "VNx4DF")])
23332335

2336+
;; The 64-bit Advanced SIMD mode in which popcount is computed for each scalar mode.
2337+
(define_mode_attr VEC_POP_MODE [(QI "V8QI") (HI "V4HI")
2338+
(SI "V2SI") (DI "V1DI")])
2339+
2340+
;; ...and again in lower case.
2341+
(define_mode_attr vec_pop_mode [(QI "v8qi") (HI "v4hi")
2342+
(SI "v2si") (DI "v1di")])
2343+
23342344
;; On AArch64 the By element instruction doesn't have a 2S variant.
23352345
;; However because the instruction always selects a pair of values
23362346
;; The normal 3SAME instruction can be used here instead.

gcc/testsuite/gcc.target/aarch64/popcnt-sve.c

+5-5
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
/*
66
** f_v4hi:
7-
** ptrue (p[0-7]).b, all
7+
** ptrue (p[0-7]).b, vl8
88
** ldr d([0-9]+), \[x0\]
99
** cnt z\2.h, \1/m, z\2.h
1010
** str d\2, \[x1\]
@@ -21,7 +21,7 @@ f_v4hi (unsigned short *__restrict b, unsigned short *__restrict d)
2121

2222
/*
2323
** f_v8hi:
24-
** ptrue (p[0-7]).b, all
24+
** ptrue (p[0-7]).b, vl16
2525
** ldr q([0-9]+), \[x0\]
2626
** cnt z\2.h, \1/m, z\2.h
2727
** str q\2, \[x1\]
@@ -42,7 +42,7 @@ f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d)
4242

4343
/*
4444
** f_v2si:
45-
** ptrue (p[0-7]).b, all
45+
** ptrue (p[0-7]).b, vl8
4646
** ldr d([0-9]+), \[x0\]
4747
** cnt z\2.s, \1/m, z\2.s
4848
** str d\2, \[x1\]
@@ -57,7 +57,7 @@ f_v2si (unsigned int *__restrict b, unsigned int *__restrict d)
5757

5858
/*
5959
** f_v4si:
60-
** ptrue (p[0-7]).b, all
60+
** ptrue (p[0-7]).b, vl16
6161
** ldr q([0-9]+), \[x0\]
6262
** cnt z\2.s, \1/m, z\2.s
6363
** str q\2, \[x1\]
@@ -74,7 +74,7 @@ f_v4si (unsigned int *__restrict b, unsigned int *__restrict d)
7474

7575
/*
7676
** f_v2di:
77-
** ptrue (p[0-7]).b, all
77+
** ptrue (p[0-7]).b, vl16
7878
** ldr q([0-9]+), \[x0\]
7979
** cnt z\2.d, \1/m, z\2.d
8080
** str q\2, \[x1\]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/* { dg-do compile } */
2+
/* { dg-options "-O2 -march=armv8.2-a+sve" } */
3+
/* { dg-final { check-function-bodies "**" "" "" } } */
4+
5+
/*
6+
** f_qi:
7+
** ldr b([0-9]+), \[x0\]
8+
** cnt v\1.8b, v\1.8b
9+
** smov w0, v\1.b\[0\]
10+
** ret
11+
*/
12+
/* 8-bit operand: the expected body for f_qi uses the Advanced SIMD byte CNT
   and contains no PTRUE/predicated CNT pair, unlike the wider cases.  */
unsigned
13+
f_qi (unsigned char *a)
14+
{
15+
return __builtin_popcountg (a[0]);
16+
}
17+
18+
/*
19+
** f_hi:
20+
** ldr h([0-9]+), \[x0\]
21+
** ptrue (p[0-7]).b, vl8
22+
** cnt z\1.h, \2/m, z\1.h
23+
** smov w0, v\1.h\[0\]
24+
** ret
25+
*/
26+
/* 16-bit operand: the expected body for f_hi is a predicated CNT on .h
   lanes, governed by a "ptrue ... vl8" predicate.  */
unsigned
27+
f_hi (unsigned short *a)
28+
{
29+
return __builtin_popcountg (a[0]);
30+
}
31+
32+
/*
33+
** f_si:
34+
** ldr s([0-9]+), \[x0\]
35+
** ptrue (p[0-7]).b, vl8
36+
** cnt z\1.s, \2/m, z\1.s
37+
** umov x0, v\1.d\[0\]
38+
** ret
39+
*/
40+
/* 32-bit operand: the expected body for f_si is a predicated CNT on .s
   lanes, governed by a "ptrue ... vl8" predicate.  */
unsigned
41+
f_si (unsigned int *a)
42+
{
43+
return __builtin_popcountg (a[0]);
44+
}
45+
46+
/*
47+
** f_di:
48+
** ldr d([0-9]+), \[x0\]
49+
** ptrue (p[0-7])\.b, vl8
50+
** cnt z\1\.d, \2/m, z\1\.d
51+
** fmov x0, d\1
52+
** ret
53+
*/
54+
/* 64-bit operand: the expected body for f_di is a predicated CNT on .d
   lanes, governed by a "ptrue ... vl8" predicate.  */
unsigned
55+
f_di (unsigned long *a)
56+
{
57+
return __builtin_popcountg (a[0]);
58+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/* { dg-do compile } */
2+
/* { dg-options "-O2 -fgimple" } */
3+
/* { dg-final { check-function-bodies "**" "" "" } } */
4+
5+
#pragma GCC target "+nosve"
6+
7+
/*
8+
** foo:
9+
** cnt (v[0-9]+\.8b), v0\.8b
10+
** addv b0, \1
11+
** ret
12+
*/
13+
/* Apply .POPCOUNT directly to a __Uint64x1_t value using GIMPLE input
   (-fgimple).  SVE is disabled by the "+nosve" pragma in this file, and the
   expected body for foo is an Advanced SIMD CNT followed by ADDV.  */
__Uint64x1_t __GIMPLE
14+
foo (__Uint64x1_t x)
15+
{
16+
__Uint64x1_t z;
17+
18+
z = .POPCOUNT (x);
19+
return z;
20+
}

0 commit comments

Comments
 (0)