Skip to content

Commit

Permalink
[X86][SchedModel] SSE reciprocal square root instruction latencies.
Browse files Browse the repository at this point in the history
The SSE rsqrt instruction (a fast reciprocal square root estimate) was
grouped in the same scheduling IIC_SSE_SQRT* class as the accurate (but very
slow) SSE sqrt instruction. For code which uses rsqrt (possibly with
Newton-Raphson iterations), this poor scheduling was hurting performance.

This patch splits off the rsqrt instruction from the sqrt instruction scheduling
classes and creates new IIC_SSE_RSQRT* classes with latency values based on
Agner's table.

Differential Revision: http://reviews.llvm.org/D5370

Patch by Simon Pilgrim.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@218517 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
adibiagio committed Sep 26, 2014
1 parent a0d5d7a commit a5ab9ba
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 15 deletions.
16 changes: 13 additions & 3 deletions lib/Target/X86/X86InstrSSE.td
Original file line number Diff line number Diff line change
Expand Up @@ -3344,6 +3344,16 @@ def SSE_SQRTSD : OpndItins<
>;
}

// Operand itineraries for the SSE reciprocal square root estimate
// instructions. Both definitions are bound to the WriteFRsqrt scheduling
// class so that rsqrt is no longer scheduled like the much slower sqrt.
// NOTE(review): the _RR / _RM suffixes presumably select the
// register-source and memory-source itinerary classes — confirm against
// the OpndItins definition.
let Sched = WriteFRsqrt in {
// Packed single-precision rsqrt (rsqrtps).
def SSE_RSQRTPS : OpndItins<
IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
>;

// Scalar single-precision rsqrt (rsqrtss).
def SSE_RSQRTSS : OpndItins<
IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
>;
}

let Sched = WriteFRcp in {
def SSE_RCPP : OpndItins<
IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
Expand Down Expand Up @@ -3622,10 +3632,10 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,

// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>,
sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>;
defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
Expand Down
1 change: 1 addition & 0 deletions lib/Target/X86/X86SchedHaswell.td
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
Expand Down
1 change: 1 addition & 0 deletions lib/Target/X86/X86SchedSandyBridge.td
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
Expand Down
18 changes: 12 additions & 6 deletions lib/Target/X86/X86Schedule.td
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,13 @@ def WriteZero : SchedWrite;
defm WriteJump : X86SchedWritePair;

// Floating point. This covers both scalar and vector operations.
defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
defm WriteFDiv : X86SchedWritePair; // Floating point division.
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal.
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
defm WriteFDiv : X86SchedWritePair; // Floating point division.
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
Expand Down Expand Up @@ -314,6 +315,11 @@ def IIC_SSE_SQRTPD_RM : InstrItinClass;
def IIC_SSE_SQRTSD_RR : InstrItinClass;
def IIC_SSE_SQRTSD_RM : InstrItinClass;

def IIC_SSE_RSQRTPS_RR : InstrItinClass;
def IIC_SSE_RSQRTPS_RM : InstrItinClass;
def IIC_SSE_RSQRTSS_RR : InstrItinClass;
def IIC_SSE_RSQRTSS_RM : InstrItinClass;

def IIC_SSE_RCPP_RR : InstrItinClass;
def IIC_SSE_RCPP_RM : InstrItinClass;
def IIC_SSE_RCPS_RR : InstrItinClass;
Expand Down
5 changes: 5 additions & 0 deletions lib/Target/X86/X86ScheduleAtom.td
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,11 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,

InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,

InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
Expand Down
12 changes: 6 additions & 6 deletions lib/Target/X86/X86ScheduleBtVer2.td
Original file line number Diff line number Diff line change
Expand Up @@ -163,15 +163,15 @@ defm : JWriteResIntPair<WriteJump, JALU01, 1>;
// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
// FIXME: Double precision latencies
// FIXME: SS vs PS latencies
// FIXME: RSQRT latencies
// FIXME: ymm latencies
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>;
defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;

def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {
Expand Down
1 change: 1 addition & 0 deletions lib/Target/X86/X86ScheduleSLM.td
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
// Scalar and vector floating point.
defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
Expand Down

0 comments on commit a5ab9ba

Please sign in to comment.