@@ -586,29 +586,55 @@ void X86FrameLowering::emitStackProbeInlineGeneric(
586586 const uint64_t StackProbeSize = TLI.getStackProbeSize (MF);
587587 uint64_t ProbeChunk = StackProbeSize * 8 ;
588588
589+ uint64_t MaxAlign =
590+ TRI->needsStackRealignment (MF) ? calculateMaxStackAlign (MF) : 0 ;
591+
589592 // Synthesize a loop or unroll it, depending on the number of iterations.
593+ // BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bytes are
594+ // left between the unaligned rsp and current rsp.
590595 if (Offset > ProbeChunk) {
591- emitStackProbeInlineGenericLoop (MF, MBB, MBBI, DL, Offset);
596+ emitStackProbeInlineGenericLoop (MF, MBB, MBBI, DL, Offset,
597+ MaxAlign % StackProbeSize);
592598 } else {
593- emitStackProbeInlineGenericBlock (MF, MBB, MBBI, DL, Offset);
599+ emitStackProbeInlineGenericBlock (MF, MBB, MBBI, DL, Offset,
600+ MaxAlign % StackProbeSize);
594601 }
595602}
596603
597604void X86FrameLowering::emitStackProbeInlineGenericBlock (
598605 MachineFunction &MF, MachineBasicBlock &MBB,
599- MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
600- uint64_t Offset ) const {
606+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
607+ uint64_t AlignOffset ) const {
601608
602609 const X86Subtarget &STI = MF.getSubtarget <X86Subtarget>();
603610 const X86TargetLowering &TLI = *STI.getTargetLowering ();
604611 const unsigned Opc = getSUBriOpcode (Uses64BitFramePtr, Offset);
605612 const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
606613 const uint64_t StackProbeSize = TLI.getStackProbeSize (MF);
614+
607615 uint64_t CurrentOffset = 0 ;
608- // 0 Thanks to return address being saved on the stack
609- uint64_t CurrentProbeOffset = 0 ;
610616
611- // For the first N - 1 pages, just probe. I tried to take advantage of
617+ assert (AlignOffset < StackProbeSize);
618+
619+ // If the allocation (plus alignment slack) does not fit within one page,
619+ // emit a first sub and probe up to the next page boundary.
620+ if (StackProbeSize < Offset + AlignOffset) {
621+
622+ MachineInstr *MI = BuildMI (MBB, MBBI, DL, TII.get (Opc), StackPtr)
623+ .addReg (StackPtr)
624+ .addImm (StackProbeSize - AlignOffset)
625+ .setMIFlag (MachineInstr::FrameSetup);
626+ MI->getOperand (3 ).setIsDead (); // The EFLAGS implicit def is dead.
627+
628+ addRegOffset (BuildMI (MBB, MBBI, DL, TII.get (MovMIOpc))
629+ .setMIFlag (MachineInstr::FrameSetup),
630+ StackPtr, false , 0 )
631+ .addImm (0 )
632+ .setMIFlag (MachineInstr::FrameSetup);
633+ NumFrameExtraProbe++;
634+ CurrentOffset = StackProbeSize - AlignOffset;
635+ }
636+
637+ // For the next N - 1 pages, just probe. I tried to take advantage of
612638 // natural probes but it implies much more logic and there was very few
613639 // interesting natural probes to interleave.
614640 while (CurrentOffset + StackProbeSize < Offset) {
@@ -626,9 +652,9 @@ void X86FrameLowering::emitStackProbeInlineGenericBlock(
626652 .setMIFlag (MachineInstr::FrameSetup);
627653 NumFrameExtraProbe++;
628654 CurrentOffset += StackProbeSize;
629- CurrentProbeOffset += StackProbeSize;
630655 }
631656
657+ // No need to probe the tail, it is smaller than a page.
632658 uint64_t ChunkSize = Offset - CurrentOffset;
633659 MachineInstr *MI = BuildMI (MBB, MBBI, DL, TII.get (Opc), StackPtr)
634660 .addReg (StackPtr)
@@ -639,15 +665,35 @@ void X86FrameLowering::emitStackProbeInlineGenericBlock(
639665
640666void X86FrameLowering::emitStackProbeInlineGenericLoop (
641667 MachineFunction &MF, MachineBasicBlock &MBB,
642- MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
643- uint64_t Offset ) const {
668+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
669+ uint64_t AlignOffset ) const {
644670 assert (Offset && " null offset" );
645671
646672 const X86Subtarget &STI = MF.getSubtarget <X86Subtarget>();
647673 const X86TargetLowering &TLI = *STI.getTargetLowering ();
648674 const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
649675 const uint64_t StackProbeSize = TLI.getStackProbeSize (MF);
650676
677+ if (AlignOffset) {
678+ if (AlignOffset < StackProbeSize) {
679+ // Perform a first smaller allocation followed by a probe.
680+ const unsigned SUBOpc = getSUBriOpcode (Uses64BitFramePtr, AlignOffset);
681+ MachineInstr *MI = BuildMI (MBB, MBBI, DL, TII.get (SUBOpc), StackPtr)
682+ .addReg (StackPtr)
683+ .addImm (AlignOffset)
684+ .setMIFlag (MachineInstr::FrameSetup);
685+ MI->getOperand (3 ).setIsDead (); // The EFLAGS implicit def is dead.
686+
687+ addRegOffset (BuildMI (MBB, MBBI, DL, TII.get (MovMIOpc))
688+ .setMIFlag (MachineInstr::FrameSetup),
689+ StackPtr, false , 0 )
690+ .addImm (0 )
691+ .setMIFlag (MachineInstr::FrameSetup);
692+ NumFrameExtraProbe++;
693+ Offset -= AlignOffset;
694+ }
695+ }
696+
651697 // Synthesize a loop
652698 NumFrameLoopProbe++;
653699 const BasicBlock *LLVM_BB = MBB.getBasicBlock ();
@@ -666,17 +712,17 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
666712
667713 // save loop bound
668714 {
669- const unsigned Opc = getSUBriOpcode (Uses64BitFramePtr, Offset);
670- BuildMI (MBB, MBBI, DL, TII.get (Opc ), FinalStackProbed)
715+ const unsigned SUBOpc = getSUBriOpcode (Uses64BitFramePtr, Offset);
716+ BuildMI (MBB, MBBI, DL, TII.get (SUBOpc ), FinalStackProbed)
671717 .addReg (FinalStackProbed)
672718 .addImm (Offset / StackProbeSize * StackProbeSize)
673719 .setMIFlag (MachineInstr::FrameSetup);
674720 }
675721
676722 // allocate a page
677723 {
678- const unsigned Opc = getSUBriOpcode (Uses64BitFramePtr, StackProbeSize);
679- BuildMI (testMBB, DL, TII.get (Opc ), StackPtr)
724+ const unsigned SUBOpc = getSUBriOpcode (Uses64BitFramePtr, StackProbeSize);
725+ BuildMI (testMBB, DL, TII.get (SUBOpc ), StackPtr)
680726 .addReg (StackPtr)
681727 .addImm (StackProbeSize)
682728 .setMIFlag (MachineInstr::FrameSetup);
@@ -1052,13 +1098,149 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
10521098 uint64_t MaxAlign) const {
10531099 uint64_t Val = -MaxAlign;
10541100 unsigned AndOp = getANDriOpcode (Uses64BitFramePtr, Val);
1055- MachineInstr *MI = BuildMI (MBB, MBBI, DL, TII.get (AndOp), Reg)
1056- .addReg (Reg)
1057- .addImm (Val)
1058- .setMIFlag (MachineInstr::FrameSetup);
10591101
1060- // The EFLAGS implicit def is dead.
1061- MI->getOperand (3 ).setIsDead ();
1102+ MachineFunction &MF = *MBB.getParent ();
1103+ const X86Subtarget &STI = MF.getSubtarget <X86Subtarget>();
1104+ const X86TargetLowering &TLI = *STI.getTargetLowering ();
1105+ const uint64_t StackProbeSize = TLI.getStackProbeSize (MF);
1106+ const bool EmitInlineStackProbe = TLI.hasInlineStackProbe (MF);
1107+
1108+ // We want to make sure that (in worst case) less than StackProbeSize bytes
1109+ // are not probed after the AND. This assumption is used in
1110+ // emitStackProbeInlineGeneric.
1111+ if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) {
1112+ {
1113+ NumFrameLoopProbe++;
1114+ MachineBasicBlock *entryMBB =
1115+ MF.CreateMachineBasicBlock (MBB.getBasicBlock ());
1116+ MachineBasicBlock *headMBB =
1117+ MF.CreateMachineBasicBlock (MBB.getBasicBlock ());
1118+ MachineBasicBlock *bodyMBB =
1119+ MF.CreateMachineBasicBlock (MBB.getBasicBlock ());
1120+ MachineBasicBlock *footMBB =
1121+ MF.CreateMachineBasicBlock (MBB.getBasicBlock ());
1122+
1123+ MachineFunction::iterator MBBIter = MBB.getIterator ();
1124+ MF.insert (MBBIter, entryMBB);
1125+ MF.insert (MBBIter, headMBB);
1126+ MF.insert (MBBIter, bodyMBB);
1127+ MF.insert (MBBIter, footMBB);
1128+ const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
1129+ Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;
1130+
1131+ // Setup entry block
1132+ {
1133+
1134+ entryMBB->splice (entryMBB->end (), &MBB, MBB.begin (), MBBI);
1135+ BuildMI (entryMBB, DL, TII.get (TargetOpcode::COPY), FinalStackProbed)
1136+ .addReg (StackPtr)
1137+ .setMIFlag (MachineInstr::FrameSetup);
1138+ MachineInstr *MI =
1139+ BuildMI (entryMBB, DL, TII.get (AndOp), FinalStackProbed)
1140+ .addReg (FinalStackProbed)
1141+ .addImm (Val)
1142+ .setMIFlag (MachineInstr::FrameSetup);
1143+
1144+ // The EFLAGS implicit def is dead.
1145+ MI->getOperand (3 ).setIsDead ();
1146+
1147+ BuildMI (entryMBB, DL,
1148+ TII.get (Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
1149+ .addReg (FinalStackProbed)
1150+ .addReg (StackPtr)
1151+ .setMIFlag (MachineInstr::FrameSetup);
1152+ BuildMI (entryMBB, DL, TII.get (X86::JCC_1))
1153+ .addMBB (&MBB)
1154+ .addImm (X86::COND_E)
1155+ .setMIFlag (MachineInstr::FrameSetup);
1156+ entryMBB->addSuccessor (headMBB);
1157+ entryMBB->addSuccessor (&MBB);
1158+ }
1159+
1160+ // Loop entry block
1161+
1162+ {
1163+ const unsigned SUBOpc =
1164+ getSUBriOpcode (Uses64BitFramePtr, StackProbeSize);
1165+ BuildMI (headMBB, DL, TII.get (SUBOpc), StackPtr)
1166+ .addReg (StackPtr)
1167+ .addImm (StackProbeSize)
1168+ .setMIFlag (MachineInstr::FrameSetup);
1169+
1170+ BuildMI (headMBB, DL,
1171+ TII.get (Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
1172+ .addReg (FinalStackProbed)
1173+ .addReg (StackPtr)
1174+ .setMIFlag (MachineInstr::FrameSetup);
1175+
1176+ // jump
1177+ BuildMI (headMBB, DL, TII.get (X86::JCC_1))
1178+ .addMBB (footMBB)
1179+ .addImm (X86::COND_B)
1180+ .setMIFlag (MachineInstr::FrameSetup);
1181+
1182+ headMBB->addSuccessor (bodyMBB);
1183+ headMBB->addSuccessor (footMBB);
1184+ }
1185+
1186+ // setup loop body
1187+ {
1188+ addRegOffset (BuildMI (bodyMBB, DL, TII.get (MovMIOpc))
1189+ .setMIFlag (MachineInstr::FrameSetup),
1190+ StackPtr, false , 0 )
1191+ .addImm (0 )
1192+ .setMIFlag (MachineInstr::FrameSetup);
1193+
1194+ const unsigned SUBOpc =
1195+ getSUBriOpcode (Uses64BitFramePtr, StackProbeSize);
1196+ BuildMI (bodyMBB, DL, TII.get (SUBOpc), StackPtr)
1197+ .addReg (StackPtr)
1198+ .addImm (StackProbeSize)
1199+ .setMIFlag (MachineInstr::FrameSetup);
1200+
1201+ // cmp with stack pointer bound
1202+ BuildMI (bodyMBB, DL,
1203+ TII.get (Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
1204+ .addReg (FinalStackProbed)
1205+ .addReg (StackPtr)
1206+ .setMIFlag (MachineInstr::FrameSetup);
1207+
1208+ // jump
1209+ BuildMI (bodyMBB, DL, TII.get (X86::JCC_1))
1210+ .addMBB (bodyMBB)
1211+ .addImm (X86::COND_B)
1212+ .setMIFlag (MachineInstr::FrameSetup);
1213+ bodyMBB->addSuccessor (bodyMBB);
1214+ bodyMBB->addSuccessor (footMBB);
1215+ }
1216+
1217+ // setup loop footer
1218+ {
1219+ BuildMI (footMBB, DL, TII.get (TargetOpcode::COPY), StackPtr)
1220+ .addReg (FinalStackProbed)
1221+ .setMIFlag (MachineInstr::FrameSetup);
1222+ addRegOffset (BuildMI (footMBB, DL, TII.get (MovMIOpc))
1223+ .setMIFlag (MachineInstr::FrameSetup),
1224+ StackPtr, false , 0 )
1225+ .addImm (0 )
1226+ .setMIFlag (MachineInstr::FrameSetup);
1227+ footMBB->addSuccessor (&MBB);
1228+ }
1229+
1230+ recomputeLiveIns (*headMBB);
1231+ recomputeLiveIns (*bodyMBB);
1232+ recomputeLiveIns (*footMBB);
1233+ recomputeLiveIns (MBB);
1234+ }
1235+ } else {
1236+ MachineInstr *MI = BuildMI (MBB, MBBI, DL, TII.get (AndOp), Reg)
1237+ .addReg (Reg)
1238+ .addImm (Val)
1239+ .setMIFlag (MachineInstr::FrameSetup);
1240+
1241+ // The EFLAGS implicit def is dead.
1242+ MI->getOperand (3 ).setIsDead ();
1243+ }
10621244}
10631245
10641246bool X86FrameLowering::has128ByteRedZone (const MachineFunction& MF) const {
0 commit comments