
[AMDGPU] Generate waterfall for calls with SGPR(inreg) argument #146997

Open · wants to merge 4 commits into main
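
Background for the diff below: when a callee takes an argument inreg (that is, in an SGPR) but the caller's value is divergent and lives in a VGPR, a plain VGPR-to-SGPR COPY cannot produce a correct value for all lanes. The backend instead "waterfalls": it loops over the unique per-lane values, reads each one into a scalar register with v_readfirstlane_b32, and executes the call with exec restricted to the matching lanes. This PR reuses the existing waterfall machinery (emitLoadScalarOpsFromVGPRLoop / loadMBUFScalarOperandsFromVGPR) for SI_CALL_ISEL. As a minimal stand-alone C++ model of that loop, assuming a 64-lane wave and a plain uint64_t standing in for exec (all names here are illustrative, not LLVM API):

#include <array>
#include <bit>
#include <cstdint>

constexpr int WaveSize = 64;

// Run RunWithScalar once per unique value of VGPROperand across the active
// lanes, with the exec mask restricted to the lanes holding that value;
// this is the contract of a waterfall loop.
template <typename Op>
void waterfall(const std::array<uint32_t, WaveSize> &VGPROperand,
               uint64_t Exec, Op RunWithScalar) {
  uint64_t Remaining = Exec;
  while (Remaining) {
    int FirstLane = std::countr_zero(Remaining);  // v_readfirstlane_b32
    uint32_t Scalar = VGPROperand[FirstLane];     // candidate uniform value
    uint64_t Matching = 0;
    for (int Lane = 0; Lane < WaveSize; ++Lane)   // v_cmp_eq_u32
      if (((Remaining >> Lane) & 1) && VGPROperand[Lane] == Scalar)
        Matching |= uint64_t(1) << Lane;
    RunWithScalar(Scalar, Matching);              // body runs with exec = Matching
    Remaining &= ~Matching;                       // retire the lanes just served
  }
}

In the best case the loop body runs once (a wave-uniform value); in the worst case it runs once per lane, the same bound stated in the comment on emitLoadScalarOpsFromVGPRLoop in the diff.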
77 changes: 77 additions & 0 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -124,6 +124,12 @@ class SIFixSGPRCopies {
SmallVector<MachineInstr*, 4> RegSequences;
SmallVector<MachineInstr*, 4> PHINodes;
SmallVector<MachineInstr*, 4> S2VCopies;
struct V2PhySCopyInfo {
SmallVector<MachineOperand *> MOs;
SmallVector<Register> SGPRs;
};
DenseMap<MachineInstr *, V2PhySCopyInfo> WaterFalls;
DenseSet<MachineInstr *> V2PhySCopies;
unsigned NextVGPRToSGPRCopyID = 0;
MapVector<unsigned, V2SCopyInfo> V2SCopies;
DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
@@ -143,6 +149,7 @@ class SIFixSGPRCopies {
bool needToBeConvertedToVALU(V2SCopyInfo *I);
void analyzeVGPRToSGPRCopy(MachineInstr *MI);
void lowerVGPR2SGPRCopies(MachineFunction &MF);
void lowerPhysicalSGPRInsts(MachineFunction &MF);
// Handles copies whose source register is:
// 1. Physical register
// 2. AGPR
@@ -770,6 +777,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
}
}

lowerPhysicalSGPRInsts(MF);
lowerVGPR2SGPRCopies(MF);
// Postprocessing
fixSCCCopies(MF);
@@ -800,6 +808,8 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) {
PHINodes.clear();
S2VCopies.clear();
PHISources.clear();
WaterFalls.clear();
V2PhySCopies.clear();

return true;
}
@@ -901,6 +911,34 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
MI, MI.getDebugLoc())) {
I = std::next(I);
MI.eraseFromParent();
} else if (SrcReg.isVirtual() && TRI->getRegSizeInBits(SrcReg, *MRI) ==
TRI->getRegSizeInBits(DstReg, *MRI)) {
auto I = MI.getIterator();
auto E = MI.getParent()->end();
// COPY can be erased if all its uses can be converted to waterfall.
bool CanErase = true;
// Only search the current block, since a physreg's def and use cannot
// cross blocks when MF.NoPhi = false.
while (++I != E) {
// Currently, we only support waterfall on SI_CALL_ISEL.
if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
MachineInstr *UseMI = &*I;
for (MachineOperand &MO : UseMI->operands()) {
if (MO.isReg() && MO.getReg() == DstReg) {
MO.setReg(SrcReg);
WaterFalls[UseMI].MOs.push_back(&MO);
WaterFalls[UseMI].SGPRs.push_back(DstReg);
}
}
} else if (I->readsRegister(DstReg, TRI))
CanErase = false;
if (I->findRegisterDefOperand(DstReg, TRI))
break;
}
if (CanErase)
V2PhySCopies.insert(&MI);
}
return true;
}
@@ -1128,6 +1166,45 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
}
}

void SIFixSGPRCopies::lowerPhysicalSGPRInsts(MachineFunction &MF) {
for (auto &Entry : WaterFalls) {
MachineInstr *MI = Entry.first;
V2PhySCopyInfo &Info = Entry.second;
if (Info.MOs.empty() || Info.SGPRs.size() != Info.MOs.size())
continue;

if (MI->getOpcode() == AMDGPU::SI_CALL_ISEL) {
// Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, plus the
// copies that follow, into the loop block; copies from and to physical
// registers must move along with the call.
unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();

// Also move the copies to physical registers into the loop block
MachineBasicBlock &MBB = *MI->getParent();
MachineBasicBlock::iterator Start(MI);
while (Start->getOpcode() != FrameSetupOpcode)
--Start;
MachineBasicBlock::iterator End(MI);
while (End->getOpcode() != FrameDestroyOpcode)
++End;

// Also include following copies of the return value
++End;
while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
MI->definesRegister(End->getOperand(1).getReg(), TRI))
++End;

llvm::loadMBUFScalarOperandsFromVGPR(*TII, *MI, Info.MOs, MDT, Start, End,
Info.SGPRs);
}
}
// In some O0 tests the COPY to the SGPR has no use, so no waterfall was
// recorded; only erase the copies when waterfalls were actually created.
if (!WaterFalls.empty())
for (auto &Entry : V2PhySCopies)
Entry->eraseFromParent();
}

void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32();
for (MachineBasicBlock &MBB : MF) {
67 changes: 42 additions & 25 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6540,13 +6540,10 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p ScalarOps across all lanes. In the best case we execute 1
// iteration, in the worst case we execute 64 (once per lane).
static void
emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
MachineRegisterInfo &MRI,
MachineBasicBlock &LoopBB,
MachineBasicBlock &BodyBB,
const DebugLoc &DL,
ArrayRef<MachineOperand *> ScalarOps) {
static void emitLoadScalarOpsFromVGPRLoop(
const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB,
MachineBasicBlock &BodyBB, const DebugLoc &DL,
ArrayRef<MachineOperand *> ScalarOps, SmallVector<Register> PhySGPRs = {}) {
MachineFunction &MF = *LoopBB.getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -6561,7 +6558,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,

MachineBasicBlock::iterator I = LoopBB.begin();
Register CondReg;

int Idx = 0;
for (MachineOperand *ScalarOp : ScalarOps) {
unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
unsigned NumSubRegs = RegSize / 32;
@@ -6591,7 +6588,16 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
}

// Update ScalarOp operand to use the SGPR ScalarOp.
ScalarOp->setReg(CurReg);
if (PhySGPRs.empty())
ScalarOp->setReg(CurReg);
else {
// Insert the copy into the same block as the use.
BuildMI(*ScalarOp->getParent()->getParent(),
ScalarOp->getParent()->getIterator(), DL, TII.get(AMDGPU::COPY),
PhySGPRs[Idx])
.addReg(CurReg);
ScalarOp->setReg(PhySGPRs[Idx]);
}
ScalarOp->setIsKill();
} else {
SmallVector<Register, 8> ReadlanePieces;
@@ -6660,9 +6666,18 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
}

// Update ScalarOp operand to use the SGPR ScalarOp.
ScalarOp->setReg(SScalarOp);
if (PhySGPRs.empty())
ScalarOp->setReg(SScalarOp);
else {
BuildMI(*ScalarOp->getParent()->getParent(),
ScalarOp->getParent()->getIterator(), DL, TII.get(AMDGPU::COPY),
PhySGPRs[Idx])
.addReg(SScalarOp);
ScalarOp->setReg(PhySGPRs[Idx]);
}
ScalarOp->setIsKill();
}
Idx++;
}

Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
@@ -6686,12 +6701,13 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
// with SGPRs by iterating over all unique values across all lanes.
// Returns the loop basic block that now contains \p MI.
static MachineBasicBlock *
loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
ArrayRef<MachineOperand *> ScalarOps,
MachineDominatorTree *MDT,
MachineBasicBlock::iterator Begin = nullptr,
MachineBasicBlock::iterator End = nullptr) {
MachineBasicBlock *llvm::loadMBUFScalarOperandsFromVGPR(
const SIInstrInfo &TII, MachineInstr &MI,
ArrayRef<MachineOperand *> ScalarOps, MachineDominatorTree *MDT,
MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
SmallVector<Register> PhySGPRs) {
assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
"Physical SGPRs must be empty or match the number of scalar operands");
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -6777,7 +6793,8 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
}
}

emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps,
PhySGPRs);
Review comment · Copilot AI · Jul 4, 2025:

Inserting a waterfall loop alters control flow and can affect debug info and profiling data. Ensure debug locations and profiling intrinsics are preserved or updated for accurate performance analysis.
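One way to address this, sketched here rather than taken from the PR: attribute the instructions materialized inside the loop to the original call's DebugLoc, which emitLoadScalarOpsFromVGPRLoop already threads through its BuildMI calls as DL. PhysReg and ScalarReg below are placeholder names:

// Sketch only (assumed placeholder registers): keep the new COPY attributed
// to the call site so debuggers and profilers still map it correctly.
BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII.get(AMDGPU::COPY), PhysReg)
    .addReg(ScalarReg);
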
MachineBasicBlock::iterator First = RemainderBB->begin();
// Restore SCC
@@ -6998,13 +7015,13 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
: AMDGPU::OpName::srsrc;
MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);

AMDGPU::OpName SampOpName =
isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);

return CreatedBB;
}
@@ -7032,8 +7049,8 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
++End;
CreatedBB =
loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT,
Start, End);
}
}

@@ -7215,19 +7232,19 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
// Legalize a VGPR Rsrc and soffset together.
if (!isSoffsetLegal) {
MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
CreatedBB =
loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI,
{Rsrc, Soffset}, MDT);
return CreatedBB;
}
CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
return CreatedBB;
}
}

// Legalize a VGPR soffset.
if (!isSoffsetLegal) {
MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
return CreatedBB;
}
return CreatedBB;
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1554,6 +1554,17 @@ bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
Register VReg,
const MachineInstr &DefMI);

/// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
/// with SGPRs by iterating over all unique values across all lanes.
/// Returns the loop basic block that now contains \p MI.
MachineBasicBlock *
loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
ArrayRef<MachineOperand *> ScalarOps,
MachineDominatorTree *MDT,
MachineBasicBlock::iterator Begin = nullptr,
MachineBasicBlock::iterator End = nullptr,
SmallVector<Register> PhySGPRs = {});

namespace AMDGPU {

LLVM_READONLY
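
Taken together: the helper's new trailing parameters default to empty, so the pre-existing MUBUF/MIMG call sites compile unchanged, while the new SIFixSGPRCopies path brackets the whole call sequence and names the ABI SGPRs to copy into. Both call styles as they appear in the diff above:

// Pre-existing style: defaults apply; the loop rewrites the virtual-register
// scalar operands in place.
CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);

// New style from SIFixSGPRCopies::lowerPhysicalSGPRInsts: Start/End span the
// ADJCALLSTACKUP..ADJCALLSTACKDOWN sequence, and each unique value is copied
// into the matching physical SGPR before SI_CALL_ISEL executes.
llvm::loadMBUFScalarOperandsFromVGPR(*TII, *MI, Info.MOs, MDT, Start, End,
                                     Info.SGPRs);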