[PHIElimination] Reuse existing COPY in predecessor basic block (Take Two) #146806
Conversation
@llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-backend-powerpc

Author: Guy David (guy-david)

Changes

During lowering, the PHI instruction is detached from the function and thus it is not considered a user of its operands.

Full diff: https://github.com/llvm/llvm-project/pull/146806.diff — 3 Files Affected:
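For context, here is a minimal sketch of the pattern in question; the block names, virtual registers (%src, %other, %phi) and the gpr32/$w0/B opcodes are invented for illustration, borrowed from the AArch64 test below rather than taken from the patch. The incoming value of a PHI is itself produced by a COPY in the predecessor, and because the PHI is detached while it is being lowered, %src can look unused in MachineRegisterInfo even though the PHI still reads it. The patch below therefore inserts the per-predecessor copies first and folds redundant chains afterwards.

; Minimal sketch, not a complete .mir file; names are illustrative.
bb.0:
  %src:gpr32 = COPY $w0          ; existing COPY in the predecessor
  B %bb.2

bb.1:
  %other:gpr32 = IMPLICIT_DEF
  B %bb.2

bb.2:
  ; While this PHI is being lowered it is detached from the function,
  ; so %src is not reported as having this use in MachineRegisterInfo.
  %phi:gpr32 = PHI %src, %bb.0, %other, %bb.1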
diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp
index 86523c22a419d..71e53a52a4061 100644
--- a/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/llvm/lib/CodeGen/PHIElimination.cpp
@@ -541,6 +541,7 @@ void PHIEliminationImpl::LowerPHINode(MachineBasicBlock &MBB,
// Now loop over all of the incoming arguments, changing them to copy into the
// IncomingReg register in the corresponding predecessor basic block.
SmallPtrSet<MachineBasicBlock *, 8> MBBsInsertedInto;
+ SmallVector<MachineInstr *, 8> InsertedCopies;
for (int i = NumSrcs - 1; i >= 0; --i) {
Register SrcReg = MPhi->getOperand(i * 2 + 1).getReg();
unsigned SrcSubReg = MPhi->getOperand(i * 2 + 1).getSubReg();
@@ -581,20 +582,6 @@ void PHIEliminationImpl::LowerPHINode(MachineBasicBlock &MBB,
continue;
}
- // Reuse an existing copy in the block if possible.
- if (IncomingReg.isVirtual()) {
- MachineInstr *DefMI = MRI->getUniqueVRegDef(SrcReg);
- const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
- const TargetRegisterClass *IncomingRC = MRI->getRegClass(IncomingReg);
- if (DefMI && DefMI->isCopy() && DefMI->getParent() == &opBlock &&
- MRI->use_empty(SrcReg) && IncomingRC->hasSuperClassEq(SrcRC)) {
- DefMI->getOperand(0).setReg(IncomingReg);
- if (LV)
- LV->getVarInfo(SrcReg).AliveBlocks.clear();
- continue;
- }
- }
-
// Find a safe location to insert the copy, this may be the first terminator
// in the block (or end()).
MachineBasicBlock::iterator InsertPos =
@@ -621,6 +608,7 @@ void PHIEliminationImpl::LowerPHINode(MachineBasicBlock &MBB,
NewSrcInstr = TII->createPHISourceCopy(opBlock, InsertPos, nullptr,
SrcReg, SrcSubReg, IncomingReg);
}
+ InsertedCopies.emplace_back(NewSrcInstr);
}
// We only need to update the LiveVariables kill of SrcReg if this was the
@@ -744,6 +732,32 @@ void PHIEliminationImpl::LowerPHINode(MachineBasicBlock &MBB,
}
}
+ // Remove redundant COPY instruction chains, which were potentially added by
+ // the code above. This can simplify the CFG which later on prevents
+ // suboptimal block layout.
+ for (MachineInstr *NewCopy : InsertedCopies) {
+ if (NewCopy->isImplicitDef())
+ continue;
+ Register IncomingReg = NewCopy->getOperand(0).getReg();
+ if (!IncomingReg.isVirtual())
+ continue;
+ Register SrcReg = NewCopy->getOperand(1).getReg();
+ if (!MRI->hasOneUse(SrcReg))
+ continue;
+ MachineInstr *DefMI = MRI->getUniqueVRegDef(SrcReg);
+ if (!DefMI || !DefMI->isCopy() ||
+ DefMI->getParent() != NewCopy->getParent())
+ continue;
+ const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
+ const TargetRegisterClass *IncomingRC = MRI->getRegClass(IncomingReg);
+ if (!IncomingRC->hasSuperClassEq(SrcRC))
+ continue;
+ DefMI->getOperand(0).setReg(IncomingReg);
+ NewCopy->removeFromParent();
+ if (LV)
+ LV->getVarInfo(SrcReg).AliveBlocks.clear();
+ }
+
// Really delete the PHI instruction now, if it is not in the LoweredPHIs map.
if (EliminateNow) {
if (LIS)
diff --git a/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir b/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir
index 20020a8ed3fb7..fb85a0dbff30d 100644
--- a/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir
+++ b/llvm/test/CodeGen/AArch64/PHIElimination-reuse-copy.mir
@@ -135,14 +135,15 @@ body: |
; CHECK-NEXT: liveins: $nzcv
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:gpr32 = COPY killed [[COPY1]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY3]]
; CHECK-NEXT: Bcc 1, %bb.1, implicit $nzcv
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: liveins: $nzcv
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY killed [[COPY3]]
; CHECK-NEXT: B %bb.1
bb.0:
successors: %bb.1
diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll
index 9e0dc87f0ab8b..33d7395693b49 100644
--- a/llvm/test/CodeGen/PowerPC/vsx.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx.ll
@@ -2487,8 +2487,7 @@ define double @test82(double %a, double %b, double %c, double %d) {
; CHECK-FISL-LABEL: test82:
; CHECK-FISL: # %bb.0: # %entry
; CHECK-FISL-NEXT: stfd f2, -16(r1) # 8-byte Folded Spill
-; CHECK-FISL-NEXT: fmr f2, f1
-; CHECK-FISL-NEXT: stfd f2, -8(r1) # 8-byte Folded Spill
+; CHECK-FISL-NEXT: stfd f1, -8(r1) # 8-byte Folded Spill
; CHECK-FISL-NEXT: xscmpudp cr0, f3, f4
; CHECK-FISL-NEXT: beq cr0, .LBB67_2
; CHECK-FISL-NEXT: # %bb.1: # %entry
Force-pushed from 6f36e64 to 8acba7d.
Force-pushed from 8acba7d to 8061cf9.
You should probably mention in the commit message that this patch also solves a problem where debug info affected codegen. I've verified that this patch fixes the debug info problem I mentioned in #131837 (comment), but apart from checking that, I really don't feel confident reviewing this change.
Force-pushed from 8061cf9 to 0629fff.
@mikaelholmen @mstorsjo @macurtis-amd @sjoerdmeijer @sushgokh
With SHA f5c62ee plus this PR, our builds were passing. Now, trunk plus this PR has started failing again.
I've re-tested the cases I previously reported problems for, with this patch on top of our downstream compiler based on trunk version c79fcfe, and they still work.
The insertion point of the COPY isn't always optimal and can eventually lead to a worse block layout; see the regression test. This change affects many architectures, but the total instruction count in the test cases appears to be slightly lower.
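To illustrate the cleanup loop added in PHIElimination.cpp above, here is a hedged sketch of one predecessor before and after the fold; the names (%src, %phi) and the gpr32/$w0 register choices are invented, mirroring the AArch64 test in the diff.

; Right after LowerPHINode has inserted the per-predecessor copy:
bb.0:
  %src:gpr32 = COPY $w0          ; DefMI: defines the intermediate vreg
  %phi:gpr32 = COPY %src         ; NewCopy inserted for the lowered PHI
  B %bb.2

; After the cleanup loop: %src has a single use and the register classes
; are compatible, so DefMI is retargeted to %phi and NewCopy is removed.
bb.0:
  %phi:gpr32 = COPY $w0
  B %bb.2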
Force-pushed from 0629fff to c61ae62.
@macurtis-amd I think I was able to narrow down the issue you experienced and fix it here.
Reintroduces the reverted #131837.