Skip to content

Commit f98dd29

Browse files
committed
cmd/compile: rework unbounded shift lowering on PPC64
This reduces unbounded shift latency by one cycle, and may generate less instructions in some cases. When there is a choice whether to use doubleword or word shifts, use doubleword shifts. Doubleword shifts have fewer hardware scheduling restrictions across P8/P9/P10. Likewise, rework the shift sequence to allow the compare/shift/overshift values to compute in parallel, then choose the correct value. Some ANDCCconst rules also need reworked to ensure they simplify when used for their flag value. This commonly occurs when prove fails to identify a bounded shift (e.g foo32<<uint(x&31)). Change-Id: Ifc6ff4a865d68675e57745056db414b0eb6f2d34 Reviewed-on: https://go-review.googlesource.com/c/go/+/442597 TryBot-Result: Gopher Robot <[email protected]> Reviewed-by: Lynn Boger <[email protected]> Reviewed-by: Than McIntosh <[email protected]> Run-TryBot: Paul Murphy <[email protected]> Reviewed-by: Ian Lance Taylor <[email protected]>
1 parent f60a2a9 commit f98dd29

File tree

3 files changed

+953
-861
lines changed

3 files changed

+953
-861
lines changed

src/cmd/compile/internal/ssa/_gen/PPC64.rules

Lines changed: 51 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -172,61 +172,60 @@
172172
// Lower bounded shifts first. No need to check shift value.
173173
(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
174174
(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
175-
(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
176-
(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLW x y)
175+
(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
176+
(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SLD x y)
177177
(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD x y)
178178
(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW x y)
179-
(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVHZreg x) y)
180-
(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRW (MOVBZreg x) y)
179+
(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD (MOVHZreg x) y)
180+
(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SRD (MOVBZreg x) y)
181181
(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD x y)
182182
(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW x y)
183-
(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVHreg x) y)
184-
(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAW (MOVBreg x) y)
183+
(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD (MOVHreg x) y)
184+
(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SRAD (MOVBreg x) y)
185+
186+
// Unbounded shifts. Go shifts saturate to 0 or -1 when shifting beyond the number of
187+
// bits in a type, PPC64 shifts do not (see the ISA for details).
188+
//
189+
// Note, y is always non-negative.
190+
//
191+
// Note, ISELZ is intentionally not used in lower. Where possible, ISEL is converted to ISELZ in late lower
192+
// after all the ISEL folding rules have been exercised.
193+
194+
((Rsh64U|Lsh64)x64 <t> x y) => (ISEL [0] (S(R|L)D <t> x y) (MOVDconst [0]) (CMPUconst y [64]))
195+
((Rsh64U|Lsh64)x32 <t> x y) => (ISEL [0] (S(R|L)D <t> x y) (MOVDconst [0]) (CMPWUconst y [64]))
196+
((Rsh64U|Lsh64)x16 <t> x y) => (ISEL [2] (S(R|L)D <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFC0] y)))
197+
((Rsh64U|Lsh64)x8 <t> x y) => (ISEL [2] (S(R|L)D <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00C0] y)))
198+
(Rsh64x(64|32) <t> x y) => (ISEL [0] (SRAD <t> x y) (SRADconst <t> x [63]) (CMP(U|WU)const y [64]))
199+
(Rsh64x16 <t> x y) => (ISEL [2] (SRAD <t> x y) (SRADconst <t> x [63]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFC0] y)))
200+
(Rsh64x8 <t> x y) => (ISEL [2] (SRAD <t> x y) (SRADconst <t> x [63]) (Select1 <types.TypeFlags> (ANDCCconst [0x00C0] y)))
201+
202+
((Rsh32U|Lsh32)x64 <t> x y) => (ISEL [0] (S(R|L)W <t> x y) (MOVDconst [0]) (CMPUconst y [32]))
203+
((Rsh32U|Lsh32)x32 <t> x y) => (ISEL [0] (S(R|L)W <t> x y) (MOVDconst [0]) (CMPWUconst y [32]))
204+
((Rsh32U|Lsh32)x16 <t> x y) => (ISEL [2] (S(R|L)W <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFE0] y)))
205+
((Rsh32U|Lsh32)x8 <t> x y) => (ISEL [2] (S(R|L)W <t> x y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00E0] y)))
206+
(Rsh32x(64|32) <t> x y) => (ISEL [0] (SRAW <t> x y) (SRAWconst <t> x [31]) (CMP(U|WU)const y [32]))
207+
(Rsh32x16 <t> x y) => (ISEL [2] (SRAW <t> x y) (SRAWconst <t> x [31]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFE0] y)))
208+
(Rsh32x8 <t> x y) => (ISEL [2] (SRAW <t> x y) (SRAWconst <t> x [31]) (Select1 <types.TypeFlags> (ANDCCconst [0x00E0] y)))
209+
210+
((Rsh16U|Lsh16)x64 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (CMPUconst y [16]))
211+
((Rsh16U|Lsh16)x32 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (CMPWUconst y [16]))
212+
((Rsh16U|Lsh16)x16 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF0] y)))
213+
((Rsh16U|Lsh16)x8 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVHZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F0] y)))
214+
(Rsh16x(64|32) <t> x y) => (ISEL [0] (SRAD <t> (MOVHreg x) y) (SRADconst <t> (MOVHreg x) [15]) (CMP(U|WU)const y [16]))
215+
(Rsh16x16 <t> x y) => (ISEL [2] (SRAD <t> (MOVHreg x) y) (SRADconst <t> (MOVHreg x) [15]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF0] y)))
216+
(Rsh16x8 <t> x y) => (ISEL [2] (SRAD <t> (MOVHreg x) y) (SRADconst <t> (MOVHreg x) [15]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F0] y)))
217+
218+
((Rsh8U|Lsh8)x64 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (CMPUconst y [8]))
219+
((Rsh8U|Lsh8)x32 <t> x y) => (ISEL [0] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (CMPWUconst y [8]))
220+
((Rsh8U|Lsh8)x16 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF8] y)))
221+
((Rsh8U|Lsh8)x8 <t> x y) => (ISEL [2] (S(R|L)D <t> (MOVBZreg x) y) (MOVDconst [0]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F8] y)))
222+
(Rsh8x(64|32) <t> x y) => (ISEL [0] (SRAD <t> (MOVBreg x) y) (SRADconst <t> (MOVBreg x) [7]) (CMP(U|WU)const y [8]))
223+
(Rsh8x16 <t> x y) => (ISEL [2] (SRAD <t> (MOVBreg x) y) (SRADconst <t> (MOVBreg x) [7]) (Select1 <types.TypeFlags> (ANDCCconst [0xFFF8] y)))
224+
(Rsh8x8 <t> x y) => (ISEL [2] (SRAD <t> (MOVBreg x) y) (SRADconst <t> (MOVBreg x) [7]) (Select1 <types.TypeFlags> (ANDCCconst [0x00F8] y)))
225+
226+
// Catch bounded shifts in situations like foo<<uint(shift&63) which might not be caught by the prove pass.
227+
(CMP(U|WU)const [d] (Select0 (ANDCCconst z [c]))) && uint64(d) > uint64(c) => (FlagLT)
185228

186-
// non-constant rotates
187-
// If shift > 64 then use -1 as shift count to shift all bits.
188-
((Lsh64|Rsh64|Rsh64U)x64 x y) => (S(L|RA|R)D x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
189-
((Rsh32|Rsh32U|Lsh32)x64 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
190-
191-
(Rsh(16|16U)x64 x y) => (SR(AW|W) ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
192-
(Lsh16x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
193-
194-
(Rsh(8|8U)x64 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
195-
(Lsh8x64 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
196-
197-
((Rsh64|Rsh64U|Lsh64)x32 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [64]))))
198-
((Rsh32|Rsh32U|Lsh32)x32 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [32]))))
199-
200-
(Rsh(16|16U)x32 x y) => (SR(AW|W) ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
201-
(Lsh16x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [16]))))
202-
203-
(Rsh(8|8U)x32 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
204-
(Lsh8x32 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU y (MOVDconst [8]))))
205-
206-
((Rsh64|Rsh64U|Lsh64)x16 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [64]))))
207-
208-
((Rsh32|Rsh32U|Lsh32)x16 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [32]))))
209-
210-
(Rsh(16|16U)x16 x y) => (S(RA|R)W ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16]))))
211-
(Lsh16x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [16]))))
212-
213-
(Rsh(8|8U)x16 x y) => (SR(AW|W) ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8]))))
214-
(Lsh8x16 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt16to64 y) (MOVDconst [8]))))
215-
216-
217-
((Rsh64|Rsh64U|Lsh64)x8 x y) => (S(RA|R|L)D x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [64]))))
218-
219-
((Rsh32|Rsh32U|Lsh32)x8 x y) => (S(RA|R|L)W x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [32]))))
220-
221-
(Rsh(16|16U)x8 x y) => (S(RA|R)W ((Sign|Zero)Ext16to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16]))))
222-
(Lsh16x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [16]))))
223-
224-
(Rsh(8|8U)x8 x y) => (S(RA|R)W ((Sign|Zero)Ext8to32 x) (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8]))))
225-
(Lsh8x8 x y) => (SLW x (ISEL [0] y (MOVDconst [-1]) (CMPU (ZeroExt8to64 y) (MOVDconst [8]))))
226-
227-
// Cleaning up shift ops
228-
(ISEL [0] (Select0 (ANDCCconst [d] y)) (MOVDconst [-1]) (CMPU (Select0 (ANDCCconst [d] y)) (MOVDconst [c]))) && c >= d => (Select0 (ANDCCconst [d] y))
229-
(ISEL [0] (Select0 (ANDCCconst [d] y)) (MOVDconst [-1]) (CMPUconst [c] (Select0 (ANDCCconst [d] y)))) && c >= d => (Select0 (ANDCCconst [d] y))
230229
(ORN x (MOVDconst [-1])) => x
231230

232231
(S(RAD|RD|LD) x (MOVDconst [c])) => (S(RAD|RD|LD)const [c&63 | (c>>6&1*63)] x)
@@ -563,11 +562,12 @@
563562
(OR x (MOVDconst [c])) && isU32Bit(c) => (ORconst [c] x)
564563

565564
// Simplify consts
566-
(Select0 (ANDCCconst [c] (Select0 (ANDCCconst [d] x)))) => (Select0 (ANDCCconst [c&d] x))
565+
(ANDCCconst [c] (Select0 (ANDCCconst [d] x))) => (ANDCCconst [c&d] x)
567566
(ORconst [c] (ORconst [d] x)) => (ORconst [c|d] x)
568567
(XORconst [c] (XORconst [d] x)) => (XORconst [c^d] x)
569568
(Select0 (ANDCCconst [-1] x)) => x
570569
(Select0 (ANDCCconst [0] _)) => (MOVDconst [0])
570+
(Select1 (ANDCCconst [0] _)) => (FlagEQ)
571571
(XORconst [0] x) => x
572572
(ORconst [-1] _) => (MOVDconst [-1])
573573
(ORconst [0] x) => x

0 commit comments

Comments
 (0)