From 6d8627244fd72f9ad45337485f4aa6c440ecfaa9 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Fri, 4 Jul 2025 12:21:38 +0800 Subject: [PATCH 1/3] generate waterfall for calls with sgpr argument(inreg) --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 82 ++++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 67 ++- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 11 + llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 44 ++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 7 + ...l-args-inreg-no-sgpr-for-csrspill-xfail.ll | 436 +++++++++++++++++- .../AMDGPU/tail-call-inreg-arguments.error.ll | 226 ++++++++- 7 files changed, 822 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 1bf5b4a241780..b929c4e7f70e2 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -124,6 +124,13 @@ class SIFixSGPRCopies { SmallVector RegSequences; SmallVector PHINodes; SmallVector S2VCopies; + struct V2PysSCopyInfo { + bool CanConvert; + SmallVector MOs; + SmallVector SGPRs; + }; + DenseMap WaterFalls; + DenseMap V2SCanErase; unsigned NextVGPRToSGPRCopyID = 0; MapVector V2SCopies; DenseMap> SiblingPenalty; @@ -143,6 +150,7 @@ class SIFixSGPRCopies { bool needToBeConvertedToVALU(V2SCopyInfo *I); void analyzeVGPRToSGPRCopy(MachineInstr *MI); void lowerVGPR2SGPRCopies(MachineFunction &MF); + void lowerPysicalSGPRInsts(MachineFunction &MF); // Handles copies which source register is: // 1. Physical register // 2. 
AGPR @@ -770,6 +778,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { } } + lowerPysicalSGPRInsts(MF); lowerVGPR2SGPRCopies(MF); // Postprocessing fixSCCCopies(MF); @@ -800,6 +809,8 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { PHINodes.clear(); S2VCopies.clear(); PHISources.clear(); + WaterFalls.clear(); + V2SCanErase.clear(); return true; } @@ -901,6 +912,37 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, MI, MI.getDebugLoc())) { I = std::next(I); MI.eraseFromParent(); + } else if (SrcReg.isVirtual() && TRI->getRegSizeInBits(SrcReg, *MRI) == + TRI->getRegSizeInBits(DstReg, *MRI)) { + // COPY can be erased if all its uses can be converted to waterfall. + if (V2SCanErase.count(&MI) == 0) + V2SCanErase[&MI] = true; + for (auto UseMI : TRI->findRegUsesFrom(&MI, DstReg, {DstReg}, {})) { + // Currently, we only support waterfall on SI_CALL_ISEL. + if (UseMI->getOpcode() != AMDGPU::SI_CALL_ISEL) { + V2SCanErase[&MI] = false; + continue; + } + // If CALL uses a physical register that is not dominated by its COPY + // def, we cannot create waterfall on UseMI. + // If we cannot create waterfall on UseMI, we cannot erase COPY. 
+        if (!MDT->dominates(&MI, UseMI)) {
+          WaterFalls[UseMI].CanConvert = false;
+          V2SCanErase[&MI] = false;
+          continue;
+        }
+        for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
+          if (UseMI->getOperand(i).isReg() &&
+              UseMI->getOperand(i).getReg() == DstReg) {
+            MachineOperand *MO = &UseMI->getOperand(i);
+            MO->setReg(SrcReg);
+            if (WaterFalls.count(UseMI) == 0)
+              WaterFalls[UseMI].CanConvert = true;
+            WaterFalls[UseMI].MOs.push_back(MO);
+            WaterFalls[UseMI].SGPRs.push_back(DstReg);
+          }
+        }
+      }
     }
     return true;
   }
@@ -1128,6 +1170,57 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
   }
 }
 
+/// Build a waterfall loop around every call recorded in WaterFalls, then
+/// erase the VGPR-to-physical-SGPR COPYs flagged in V2SCanErase.
+void SIFixSGPRCopies::lowerPysicalSGPRInsts(MachineFunction &MF) {
+  bool BuiltWaterfall = false;
+  for (auto &Entry : WaterFalls) {
+    MachineInstr *MI = Entry.first;
+    // Bind by reference; copying the info would copy both operand lists.
+    const V2PysSCopyInfo &Info = Entry.second;
+    if (!Info.CanConvert || Info.MOs.empty() ||
+        Info.SGPRs.size() != Info.MOs.size())
+      continue;
+
+    if (MI->getOpcode() == AMDGPU::SI_CALL_ISEL) {
+      // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
+      // following copies, we also need to move copies from and to physical
+      // registers into the loop block.
+      unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+      unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+
+      // Also move the copies to physical registers into the loop block
+      MachineBasicBlock &MBB = *MI->getParent();
+      MachineBasicBlock::iterator Start(MI);
+      while (Start->getOpcode() != FrameSetupOpcode)
+        --Start;
+      MachineBasicBlock::iterator End(MI);
+      while (End->getOpcode() != FrameDestroyOpcode)
+        ++End;
+
+      // Also include following copies of the return value
+      ++End;
+      while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
+             MI->definesRegister(End->getOperand(1).getReg(), TRI))
+        ++End;
+
+      llvm::loadMBUFScalarOperandsFromVGPR(*TII, *MI, Info.MOs, MDT, Start, End,
+                                           Info.SGPRs);
+    }
+    BuiltWaterfall = true;
+  }
+
+  // V2SCanErase covers the whole function, so it must be walked exactly once:
+  // erasing from inside the loop above would call eraseFromParent() on the
+  // same COPY again for every additional converted call.
+  if (!BuiltWaterfall)
+    return;
+  for (auto &CopyEntry : V2SCanErase)
+    if (CopyEntry.second)
+      CopyEntry.first->eraseFromParent();
+  V2SCanErase.clear();
+}
+
 void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
   bool IsWave32 = MF.getSubtarget().isWave32();
   for (MachineBasicBlock &MBB : MF) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2d37451eb32b9..2571da1bb8a68 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6540,13 +6540,10 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
 // Emit the actual waterfall loop, executing the wrapped instruction for each
 // unique value of \p ScalarOps across all lanes. In the best case we execute 1
 // iteration, in the worst case we execute 64 (once per lane).
-static void -emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, - MachineRegisterInfo &MRI, - MachineBasicBlock &LoopBB, - MachineBasicBlock &BodyBB, - const DebugLoc &DL, - ArrayRef ScalarOps) { +static void emitLoadScalarOpsFromVGPRLoop( + const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, + MachineBasicBlock &BodyBB, const DebugLoc &DL, + ArrayRef ScalarOps, SmallVector PhySGPRs = {}) { MachineFunction &MF = *LoopBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -6561,7 +6558,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineBasicBlock::iterator I = LoopBB.begin(); Register CondReg; - + int Idx = 0; for (MachineOperand *ScalarOp : ScalarOps) { unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI); unsigned NumSubRegs = RegSize / 32; @@ -6591,7 +6588,16 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, } // Update ScalarOp operand to use the SGPR ScalarOp. - ScalarOp->setReg(CurReg); + if (PhySGPRs.empty()) + ScalarOp->setReg(CurReg); + else { + // Insert into the same block of use + BuildMI(*ScalarOp->getParent()->getParent(), + ScalarOp->getParent()->getIterator(), DL, TII.get(AMDGPU::COPY), + PhySGPRs[Idx]) + .addReg(CurReg); + ScalarOp->setReg(PhySGPRs[Idx]); + } ScalarOp->setIsKill(); } else { SmallVector ReadlanePieces; @@ -6660,9 +6666,18 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, } // Update ScalarOp operand to use the SGPR ScalarOp. 
- ScalarOp->setReg(SScalarOp); + if (PhySGPRs.empty()) + ScalarOp->setReg(SScalarOp); + else { + BuildMI(*ScalarOp->getParent()->getParent(), + ScalarOp->getParent()->getIterator(), DL, TII.get(AMDGPU::COPY), + PhySGPRs[Idx]) + .addReg(SScalarOp); + ScalarOp->setReg(PhySGPRs[Idx]); + } ScalarOp->setIsKill(); } + Idx++; } Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); @@ -6686,12 +6701,13 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register // with SGPRs by iterating over all unique values across all lanes. // Returns the loop basic block that now contains \p MI. -static MachineBasicBlock * -loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, - ArrayRef ScalarOps, - MachineDominatorTree *MDT, - MachineBasicBlock::iterator Begin = nullptr, - MachineBasicBlock::iterator End = nullptr) { +MachineBasicBlock *llvm::loadMBUFScalarOperandsFromVGPR( + const SIInstrInfo &TII, MachineInstr &MI, + ArrayRef ScalarOps, MachineDominatorTree *MDT, + MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, + SmallVector PhySGPRs) { + assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) && + "Physical SGPRs must be empty or match the number of scalar operands"); MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget(); @@ -6777,7 +6793,8 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, } } - emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps); + emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps, + PhySGPRs); MachineBasicBlock::iterator First = RemainderBB->begin(); // Restore SCC @@ -6998,13 +7015,13 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, : AMDGPU::OpName::srsrc; MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName); if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) - CreatedBB = 
loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT); + CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT); AMDGPU::OpName SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp; MachineOperand *SSamp = getNamedOperand(MI, SampOpName); if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) - CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT); + CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT); return CreatedBB; } @@ -7032,8 +7049,8 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr)) ++End; - CreatedBB = - loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End); + CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, + Start, End); } } @@ -7215,11 +7232,11 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // Legalize a VGPR Rsrc and soffset together. if (!isSoffsetLegal) { MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset); - CreatedBB = - loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT); + CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, + {Rsrc, Soffset}, MDT); return CreatedBB; } - CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT); + CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT); return CreatedBB; } } @@ -7227,7 +7244,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // Legalize a VGPR soffset. 
if (!isSoffsetLegal) {
       MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
-      CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
+      CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
       return CreatedBB;
     }
     return CreatedBB;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 01dd3c9f4119e..77b0713275834 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1554,6 +1554,17 @@ bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
                                    Register VReg,
                                    const MachineInstr &DefMI);
 
+/// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
+/// with SGPRs by iterating over all unique values across all lanes.
+/// Returns the loop basic block that now contains \p MI.
+MachineBasicBlock *
+loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
+                               ArrayRef ScalarOps,
+                               MachineDominatorTree *MDT,
+                               MachineBasicBlock::iterator Begin = nullptr,
+                               MachineBasicBlock::iterator End = nullptr,
+                               SmallVector PhySGPRs = {});
+
 namespace AMDGPU {
 
 LLVM_READONLY
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 6754be1a0b619..6fc9e3a313ce4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4063,6 +4063,44 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
   return 0;
 }
 
+/// Collect the instructions that read \p TrgReg on any path starting right
+/// after \p StartMI. A path ends at the first instruction that defines one of
+/// \p StopAtDefs (that instruction is still checked for a read). When
+/// \p Opcodes is non-empty, only readers with one of those opcodes are kept.
+SmallVector<MachineInstr *>
+SIRegisterInfo::findRegUsesFrom(MachineInstr *StartMI, Register TrgReg,
+                                const DenseSet<Register> &StopAtDefs,
+                                const DenseSet<unsigned> &Opcodes) const {
+  SmallVector<MachineInstr *> Uses;
+  DenseSet<MachineBasicBlock *> Visited;
+  SmallVector<MachineBasicBlock *> Worklist;
+
+  // Scan [It, end) of MBB and queue its successors unless a stop def is hit.
+  auto ScanFrom = [&](MachineBasicBlock *MBB, MachineBasicBlock::iterator It) {
+    for (MachineBasicBlock::iterator E = MBB->end(); It != E; ++It) {
+      if (&*It != StartMI && It->readsRegister(TrgReg, this) &&
+          (Opcodes.empty() || Opcodes.count(It->getOpcode())))
+        Uses.push_back(&*It);
+      for (Register DefReg : StopAtDefs)
+        if (It->findRegisterDefOperand(DefReg, this))
+          return;
+    }
+    Worklist.append(MBB->succ_begin(), MBB->succ_end());
+  };
+
+  // Work on blocks rather than on "first instruction" pointers: StartMI may
+  // be the last instruction of its block and a successor may be empty, and
+  // neither end() nor begin() of an empty block may be dereferenced.
+  MachineBasicBlock::iterator First = std::next(StartMI->getIterator());
+  ScanFrom(StartMI->getParent(), First);
+  while (!Worklist.empty()) {
+    MachineBasicBlock *MBB = Worklist.pop_back_val();
+    if (Visited.insert(MBB).second)
+      ScanFrom(MBB, MBB->begin());
+  }
+  return Uses;
+}
+
 SmallVector
 SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
                                   const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 06a7a17b0246b..eca310d46f3e4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
 
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseSet.h"
 
 #define GET_REGINFO_HEADER
 #include "AMDGPUGenRegisterInfo.inc"
@@ -490,6 +491,15 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
                               const TargetRegisterClass &RC) const;
 
+  /// \returns the instructions reading \p TrgReg on paths starting after
+  /// \p StartMI; a path stops at the first def of a register in
+  /// \p StopAtDefs. A non-empty \p Opcodes restricts the result to those
+  /// opcodes.
+  SmallVector<MachineInstr *>
+  findRegUsesFrom(MachineInstr *StartMI, Register TrgReg,
+                  const DenseSet<Register> &StopAtDefs,
+                  const DenseSet<unsigned> &Opcodes) const;
+
   std::optional getVRegFlagValue(StringRef Name) const override {
     return Name == "WWM_REG" ? 
AMDGPU::VirtRegFlag::WWM_REG : std::optional{}; diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll index 34f4476f7fd6a..27d7f117711f6 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll @@ -1,22 +1,452 @@ -; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s - -; CHECK: illegal VGPR to SGPR copy +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope %s declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0 declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0 declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0 define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 { +; CHECK-LABEL: test_call_external_void_func_a15i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s40, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[42:43] +; CHECK-NEXT: v_writelane_b32 v40, s40, 32 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v40, s48, 8 +; CHECK-NEXT: v_writelane_b32 v40, s49, 9 +; CHECK-NEXT: 
v_writelane_b32 v40, s50, 10 +; CHECK-NEXT: v_writelane_b32 v40, s51, 11 +; CHECK-NEXT: v_writelane_b32 v40, s52, 12 +; CHECK-NEXT: v_writelane_b32 v40, s53, 13 +; CHECK-NEXT: v_writelane_b32 v40, s54, 14 +; CHECK-NEXT: v_writelane_b32 v40, s55, 15 +; CHECK-NEXT: v_writelane_b32 v40, s64, 16 +; CHECK-NEXT: v_writelane_b32 v40, s65, 17 +; CHECK-NEXT: v_writelane_b32 v40, s66, 18 +; CHECK-NEXT: v_writelane_b32 v40, s67, 19 +; CHECK-NEXT: v_writelane_b32 v40, s68, 20 +; CHECK-NEXT: v_writelane_b32 v40, s69, 21 +; CHECK-NEXT: v_writelane_b32 v40, s70, 22 +; CHECK-NEXT: v_writelane_b32 v40, s71, 23 +; CHECK-NEXT: v_writelane_b32 v40, s80, 24 +; CHECK-NEXT: v_writelane_b32 v40, s81, 25 +; CHECK-NEXT: v_writelane_b32 v40, s82, 26 +; CHECK-NEXT: v_writelane_b32 v40, s83, 27 +; CHECK-NEXT: v_writelane_b32 v40, s84, 28 +; CHECK-NEXT: v_writelane_b32 v40, s85, 29 +; CHECK-NEXT: v_writelane_b32 v40, s86, 30 +; CHECK-NEXT: s_mov_b32 s50, s29 +; CHECK-NEXT: s_mov_b32 s51, s28 +; CHECK-NEXT: s_mov_b32 s52, s27 +; CHECK-NEXT: s_mov_b32 s53, s26 +; CHECK-NEXT: s_mov_b32 s54, s25 +; CHECK-NEXT: s_mov_b32 s55, s24 +; CHECK-NEXT: s_mov_b32 s64, s23 +; CHECK-NEXT: s_mov_b32 s65, s22 +; CHECK-NEXT: s_mov_b32 s66, s21 +; CHECK-NEXT: s_mov_b32 s67, s20 +; CHECK-NEXT: s_mov_b32 s68, s19 +; CHECK-NEXT: s_mov_b32 s69, s18 +; CHECK-NEXT: s_mov_b32 s70, s17 +; CHECK-NEXT: s_mov_b32 s71, s16 +; CHECK-NEXT: s_mov_b32 s80, s15 +; CHECK-NEXT: s_mov_b32 s81, s14 +; CHECK-NEXT: s_mov_b32 s82, s13 +; CHECK-NEXT: s_mov_b32 s83, s12 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_mov_b64 s[84:85], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s87, 31 +; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s26, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s26, v0 +; CHECK-NEXT: s_and_saveexec_b64 
s[86:87], vcc +; CHECK-NEXT: s_getpc_b64 s[28:29] +; CHECK-NEXT: s_add_u32 s28, s28, external_void_func_a15i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s29, s29, external_void_func_a15i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s83 +; CHECK-NEXT: s_mov_b32 s13, s82 +; CHECK-NEXT: s_mov_b32 s14, s81 +; CHECK-NEXT: s_mov_b32 s15, s80 +; CHECK-NEXT: s_mov_b32 s0, s71 +; CHECK-NEXT: s_mov_b32 s1, s70 +; CHECK-NEXT: s_mov_b32 s2, s69 +; CHECK-NEXT: s_mov_b32 s3, s68 +; CHECK-NEXT: s_mov_b32 s16, s67 +; CHECK-NEXT: s_mov_b32 s17, s66 +; CHECK-NEXT: s_mov_b32 s18, s65 +; CHECK-NEXT: s_mov_b32 s19, s64 +; CHECK-NEXT: s_mov_b32 s20, s55 +; CHECK-NEXT: s_mov_b32 s21, s54 +; CHECK-NEXT: s_mov_b32 s22, s53 +; CHECK-NEXT: s_mov_b32 s23, s52 +; CHECK-NEXT: s_mov_b32 s24, s51 +; CHECK-NEXT: s_mov_b32 s25, s50 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[28:29] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[86:87] +; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[84:85] +; CHECK-NEXT: v_readlane_b32 s87, v40, 31 +; CHECK-NEXT: v_readlane_b32 s86, v40, 30 +; CHECK-NEXT: v_readlane_b32 s85, v40, 29 +; CHECK-NEXT: v_readlane_b32 s84, v40, 28 +; CHECK-NEXT: v_readlane_b32 s83, v40, 27 +; CHECK-NEXT: v_readlane_b32 s82, v40, 26 +; CHECK-NEXT: v_readlane_b32 s81, v40, 25 +; CHECK-NEXT: v_readlane_b32 s80, v40, 24 +; CHECK-NEXT: v_readlane_b32 s71, v40, 23 +; CHECK-NEXT: v_readlane_b32 s70, v40, 22 +; CHECK-NEXT: v_readlane_b32 s69, v40, 21 +; CHECK-NEXT: v_readlane_b32 s68, v40, 20 +; CHECK-NEXT: v_readlane_b32 s67, v40, 19 +; CHECK-NEXT: v_readlane_b32 s66, v40, 18 +; CHECK-NEXT: v_readlane_b32 s65, v40, 17 +; CHECK-NEXT: v_readlane_b32 s64, v40, 16 +; CHECK-NEXT: v_readlane_b32 s55, v40, 15 +; 
CHECK-NEXT: v_readlane_b32 s54, v40, 14 +; CHECK-NEXT: v_readlane_b32 s53, v40, 13 +; CHECK-NEXT: v_readlane_b32 s52, v40, 12 +; CHECK-NEXT: v_readlane_b32 s51, v40, 11 +; CHECK-NEXT: v_readlane_b32 s50, v40, 10 +; CHECK-NEXT: v_readlane_b32 s49, v40, 9 +; CHECK-NEXT: v_readlane_b32 s48, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v40, 32 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0) ret void } define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 { +; CHECK-LABEL: test_call_external_void_func_a16i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s40, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[42:43] +; CHECK-NEXT: v_writelane_b32 v40, s40, 32 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v40, s48, 8 +; CHECK-NEXT: v_writelane_b32 v40, s49, 9 +; CHECK-NEXT: 
v_writelane_b32 v40, s50, 10 +; CHECK-NEXT: v_writelane_b32 v40, s51, 11 +; CHECK-NEXT: v_writelane_b32 v40, s52, 12 +; CHECK-NEXT: v_writelane_b32 v40, s53, 13 +; CHECK-NEXT: v_writelane_b32 v40, s54, 14 +; CHECK-NEXT: v_writelane_b32 v40, s55, 15 +; CHECK-NEXT: v_writelane_b32 v40, s64, 16 +; CHECK-NEXT: v_writelane_b32 v40, s65, 17 +; CHECK-NEXT: v_writelane_b32 v40, s66, 18 +; CHECK-NEXT: v_writelane_b32 v40, s67, 19 +; CHECK-NEXT: v_writelane_b32 v40, s68, 20 +; CHECK-NEXT: v_writelane_b32 v40, s69, 21 +; CHECK-NEXT: v_writelane_b32 v40, s70, 22 +; CHECK-NEXT: v_writelane_b32 v40, s71, 23 +; CHECK-NEXT: v_writelane_b32 v40, s80, 24 +; CHECK-NEXT: v_writelane_b32 v40, s81, 25 +; CHECK-NEXT: v_writelane_b32 v40, s82, 26 +; CHECK-NEXT: v_writelane_b32 v40, s83, 27 +; CHECK-NEXT: v_writelane_b32 v40, s84, 28 +; CHECK-NEXT: v_writelane_b32 v40, s85, 29 +; CHECK-NEXT: v_writelane_b32 v40, s86, 30 +; CHECK-NEXT: s_mov_b32 s50, s29 +; CHECK-NEXT: s_mov_b32 s51, s28 +; CHECK-NEXT: s_mov_b32 s52, s27 +; CHECK-NEXT: s_mov_b32 s53, s26 +; CHECK-NEXT: s_mov_b32 s54, s25 +; CHECK-NEXT: s_mov_b32 s55, s24 +; CHECK-NEXT: s_mov_b32 s64, s23 +; CHECK-NEXT: s_mov_b32 s65, s22 +; CHECK-NEXT: s_mov_b32 s66, s21 +; CHECK-NEXT: s_mov_b32 s67, s20 +; CHECK-NEXT: s_mov_b32 s68, s19 +; CHECK-NEXT: s_mov_b32 s69, s18 +; CHECK-NEXT: s_mov_b32 s70, s17 +; CHECK-NEXT: s_mov_b32 s71, s16 +; CHECK-NEXT: s_mov_b32 s80, s15 +; CHECK-NEXT: s_mov_b32 s81, s14 +; CHECK-NEXT: s_mov_b32 s82, s13 +; CHECK-NEXT: s_mov_b32 s83, s12 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_mov_b64 s[84:85], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s87, 31 +; CHECK-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s26, v0 +; CHECK-NEXT: v_readfirstlane_b32 s27, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s26, 
v0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s27, v1 +; CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[86:87], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[28:29] +; CHECK-NEXT: s_add_u32 s28, s28, external_void_func_a16i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s29, s29, external_void_func_a16i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s83 +; CHECK-NEXT: s_mov_b32 s13, s82 +; CHECK-NEXT: s_mov_b32 s14, s81 +; CHECK-NEXT: s_mov_b32 s15, s80 +; CHECK-NEXT: s_mov_b32 s0, s71 +; CHECK-NEXT: s_mov_b32 s1, s70 +; CHECK-NEXT: s_mov_b32 s2, s69 +; CHECK-NEXT: s_mov_b32 s3, s68 +; CHECK-NEXT: s_mov_b32 s16, s67 +; CHECK-NEXT: s_mov_b32 s17, s66 +; CHECK-NEXT: s_mov_b32 s18, s65 +; CHECK-NEXT: s_mov_b32 s19, s64 +; CHECK-NEXT: s_mov_b32 s20, s55 +; CHECK-NEXT: s_mov_b32 s21, s54 +; CHECK-NEXT: s_mov_b32 s22, s53 +; CHECK-NEXT: s_mov_b32 s23, s52 +; CHECK-NEXT: s_mov_b32 s24, s51 +; CHECK-NEXT: s_mov_b32 s25, s50 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[28:29] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[86:87] +; CHECK-NEXT: s_cbranch_execnz .LBB1_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[84:85] +; CHECK-NEXT: v_readlane_b32 s87, v40, 31 +; CHECK-NEXT: v_readlane_b32 s86, v40, 30 +; CHECK-NEXT: v_readlane_b32 s85, v40, 29 +; CHECK-NEXT: v_readlane_b32 s84, v40, 28 +; CHECK-NEXT: v_readlane_b32 s83, v40, 27 +; CHECK-NEXT: v_readlane_b32 s82, v40, 26 +; CHECK-NEXT: v_readlane_b32 s81, v40, 25 +; CHECK-NEXT: v_readlane_b32 s80, v40, 24 +; CHECK-NEXT: v_readlane_b32 s71, v40, 23 +; CHECK-NEXT: v_readlane_b32 s70, v40, 22 +; CHECK-NEXT: v_readlane_b32 s69, v40, 21 +; CHECK-NEXT: v_readlane_b32 s68, v40, 20 +; CHECK-NEXT: v_readlane_b32 s67, v40, 19 +; 
CHECK-NEXT: v_readlane_b32 s66, v40, 18 +; CHECK-NEXT: v_readlane_b32 s65, v40, 17 +; CHECK-NEXT: v_readlane_b32 s64, v40, 16 +; CHECK-NEXT: v_readlane_b32 s55, v40, 15 +; CHECK-NEXT: v_readlane_b32 s54, v40, 14 +; CHECK-NEXT: v_readlane_b32 s53, v40, 13 +; CHECK-NEXT: v_readlane_b32 s52, v40, 12 +; CHECK-NEXT: v_readlane_b32 s51, v40, 11 +; CHECK-NEXT: v_readlane_b32 s50, v40, 10 +; CHECK-NEXT: v_readlane_b32 s49, v40, 9 +; CHECK-NEXT: v_readlane_b32 s48, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v40, 32 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0) ret void } define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s40, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[42:43] +; CHECK-NEXT: v_writelane_b32 v40, s40, 32 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 
v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v40, s48, 8 +; CHECK-NEXT: v_writelane_b32 v40, s49, 9 +; CHECK-NEXT: v_writelane_b32 v40, s50, 10 +; CHECK-NEXT: v_writelane_b32 v40, s51, 11 +; CHECK-NEXT: v_writelane_b32 v40, s52, 12 +; CHECK-NEXT: v_writelane_b32 v40, s53, 13 +; CHECK-NEXT: v_writelane_b32 v40, s54, 14 +; CHECK-NEXT: v_writelane_b32 v40, s55, 15 +; CHECK-NEXT: v_writelane_b32 v40, s64, 16 +; CHECK-NEXT: v_writelane_b32 v40, s65, 17 +; CHECK-NEXT: v_writelane_b32 v40, s66, 18 +; CHECK-NEXT: v_writelane_b32 v40, s67, 19 +; CHECK-NEXT: v_writelane_b32 v40, s68, 20 +; CHECK-NEXT: v_writelane_b32 v40, s69, 21 +; CHECK-NEXT: v_writelane_b32 v40, s70, 22 +; CHECK-NEXT: v_writelane_b32 v40, s71, 23 +; CHECK-NEXT: v_writelane_b32 v40, s80, 24 +; CHECK-NEXT: v_writelane_b32 v40, s81, 25 +; CHECK-NEXT: v_writelane_b32 v40, s82, 26 +; CHECK-NEXT: v_writelane_b32 v40, s83, 27 +; CHECK-NEXT: v_writelane_b32 v40, s84, 28 +; CHECK-NEXT: v_writelane_b32 v40, s85, 29 +; CHECK-NEXT: v_writelane_b32 v40, s86, 30 +; CHECK-NEXT: s_mov_b32 s50, s29 +; CHECK-NEXT: s_mov_b32 s51, s28 +; CHECK-NEXT: s_mov_b32 s52, s27 +; CHECK-NEXT: s_mov_b32 s53, s26 +; CHECK-NEXT: s_mov_b32 s54, s25 +; CHECK-NEXT: s_mov_b32 s55, s24 +; CHECK-NEXT: s_mov_b32 s64, s23 +; CHECK-NEXT: s_mov_b32 s65, s22 +; CHECK-NEXT: s_mov_b32 s66, s21 +; CHECK-NEXT: s_mov_b32 s67, s20 +; CHECK-NEXT: s_mov_b32 s68, s19 +; CHECK-NEXT: s_mov_b32 s69, s18 +; CHECK-NEXT: s_mov_b32 s70, s17 +; CHECK-NEXT: s_mov_b32 s71, s16 +; CHECK-NEXT: s_mov_b32 s80, s15 +; CHECK-NEXT: s_mov_b32 s81, s14 +; CHECK-NEXT: s_mov_b32 s82, s13 +; CHECK-NEXT: s_mov_b32 s83, s12 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_mov_b64 s[84:85], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 
v40, s87, 31 +; CHECK-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s26, v0 +; CHECK-NEXT: v_readfirstlane_b32 s27, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s26, v0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s27, v1 +; CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[86:87], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[28:29] +; CHECK-NEXT: s_add_u32 s28, s28, external_void_func_a15i32_inreg_i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s29, s29, external_void_func_a15i32_inreg_i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s83 +; CHECK-NEXT: s_mov_b32 s13, s82 +; CHECK-NEXT: s_mov_b32 s14, s81 +; CHECK-NEXT: s_mov_b32 s15, s80 +; CHECK-NEXT: s_mov_b32 s0, s71 +; CHECK-NEXT: s_mov_b32 s1, s70 +; CHECK-NEXT: s_mov_b32 s2, s69 +; CHECK-NEXT: s_mov_b32 s3, s68 +; CHECK-NEXT: s_mov_b32 s16, s67 +; CHECK-NEXT: s_mov_b32 s17, s66 +; CHECK-NEXT: s_mov_b32 s18, s65 +; CHECK-NEXT: s_mov_b32 s19, s64 +; CHECK-NEXT: s_mov_b32 s20, s55 +; CHECK-NEXT: s_mov_b32 s21, s54 +; CHECK-NEXT: s_mov_b32 s22, s53 +; CHECK-NEXT: s_mov_b32 s23, s52 +; CHECK-NEXT: s_mov_b32 s24, s51 +; CHECK-NEXT: s_mov_b32 s25, s50 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[28:29] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[86:87] +; CHECK-NEXT: s_cbranch_execnz .LBB2_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[84:85] +; CHECK-NEXT: v_readlane_b32 s87, v40, 31 +; CHECK-NEXT: v_readlane_b32 s86, v40, 30 +; CHECK-NEXT: v_readlane_b32 s85, v40, 29 +; CHECK-NEXT: v_readlane_b32 s84, v40, 28 +; CHECK-NEXT: v_readlane_b32 s83, v40, 27 +; CHECK-NEXT: v_readlane_b32 s82, v40, 26 +; CHECK-NEXT: v_readlane_b32 s81, v40, 25 +; CHECK-NEXT: v_readlane_b32 s80, v40, 24 +; 
CHECK-NEXT: v_readlane_b32 s71, v40, 23 +; CHECK-NEXT: v_readlane_b32 s70, v40, 22 +; CHECK-NEXT: v_readlane_b32 s69, v40, 21 +; CHECK-NEXT: v_readlane_b32 s68, v40, 20 +; CHECK-NEXT: v_readlane_b32 s67, v40, 19 +; CHECK-NEXT: v_readlane_b32 s66, v40, 18 +; CHECK-NEXT: v_readlane_b32 s65, v40, 17 +; CHECK-NEXT: v_readlane_b32 s64, v40, 16 +; CHECK-NEXT: v_readlane_b32 s55, v40, 15 +; CHECK-NEXT: v_readlane_b32 s54, v40, 14 +; CHECK-NEXT: v_readlane_b32 s53, v40, 13 +; CHECK-NEXT: v_readlane_b32 s52, v40, 12 +; CHECK-NEXT: v_readlane_b32 s51, v40, 11 +; CHECK-NEXT: v_readlane_b32 s50, v40, 10 +; CHECK-NEXT: v_readlane_b32 s49, v40, 9 +; CHECK-NEXT: v_readlane_b32 s48, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v40, 32 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll index 242b5e9aeaf42..c2482e678e978 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll @@ -1,13 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 2> %t.err < %s | FileCheck %s 
-; RUN: FileCheck -check-prefix=ERR %s < %t.err -; FIXME: These tests cannot be tail called, and should be executed in a waterfall loop. +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s declare hidden void @void_func_i32_inreg(i32 inreg) -; ERR: error: :0:0: in function tail_call_i32_inreg_divergent void (i32): illegal VGPR to SGPR copy -; ERR: error: :0:0: in function indirect_tail_call_i32_inreg_divergent void (i32): illegal VGPR to SGPR copy - define void @tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-LABEL: tail_call_i32_inreg_divergent: ; CHECK: ; %bb.0: @@ -17,19 +12,78 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 2 -; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s16, 18 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, void_func_i32_inreg@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i32_inreg@rel32@hi+12 -; CHECK-NEXT: ; illegal copy v0 to s0 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v40, s48, 8 +; CHECK-NEXT: v_writelane_b32 v40, s49, 9 +; CHECK-NEXT: v_writelane_b32 v40, s50, 10 +; CHECK-NEXT: v_writelane_b32 v40, s51, 11 +; CHECK-NEXT: v_writelane_b32 v40, s52, 12 +; CHECK-NEXT: v_writelane_b32 v40, s53, 13 +; CHECK-NEXT: v_writelane_b32 v40, s54, 14 +; CHECK-NEXT: v_writelane_b32 v40, s55, 15 +; CHECK-NEXT: v_writelane_b32 v40, s64, 16 +; CHECK-NEXT: s_mov_b32 s50, 
s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_mov_b64 s[54:55], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s65, 17 +; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s16, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s16, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[64:65], vcc +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, void_func_i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, void_func_i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 +; CHECK-NEXT: s_mov_b32 s0, s16 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[64:65] +; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[54:55] +; CHECK-NEXT: v_readlane_b32 s65, v40, 17 +; CHECK-NEXT: v_readlane_b32 s64, v40, 16 +; CHECK-NEXT: v_readlane_b32 s55, v40, 15 +; CHECK-NEXT: v_readlane_b32 s54, v40, 14 +; CHECK-NEXT: v_readlane_b32 s53, v40, 13 +; CHECK-NEXT: v_readlane_b32 s52, v40, 12 +; CHECK-NEXT: v_readlane_b32 s51, v40, 11 +; CHECK-NEXT: v_readlane_b32 s50, v40, 10 +; CHECK-NEXT: v_readlane_b32 s49, v40, 9 +; CHECK-NEXT: v_readlane_b32 s48, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: 
v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 -; CHECK-NEXT: v_readlane_b32 s4, v40, 2 +; CHECK-NEXT: v_readlane_b32 s4, v40, 18 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] @@ -51,21 +105,84 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s16, 2 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, constant@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, constant@rel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; CHECK-NEXT: v_writelane_b32 v40, s16, 20 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: ; illegal copy v0 to s0 +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v40, s48, 8 +; CHECK-NEXT: v_writelane_b32 v40, s49, 9 +; CHECK-NEXT: v_writelane_b32 v40, s50, 10 +; CHECK-NEXT: v_writelane_b32 v40, s51, 11 +; CHECK-NEXT: v_writelane_b32 v40, s52, 12 +; CHECK-NEXT: v_writelane_b32 v40, s53, 13 +; CHECK-NEXT: v_writelane_b32 v40, s54, 14 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s55, 15 +; CHECK-NEXT: v_writelane_b32 v40, s64, 16 +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, constant@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, constant@rel32@hi+12 +; 
CHECK-NEXT: v_writelane_b32 v40, s65, 17 +; CHECK-NEXT: s_load_dwordx2 s[64:65], s[4:5], 0x0 +; CHECK-NEXT: v_writelane_b32 v40, s66, 18 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[54:55], exec +; CHECK-NEXT: v_writelane_b32 v40, s67, 19 +; CHECK-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s16, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s16, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[66:67], vcc +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 +; CHECK-NEXT: s_mov_b32 s0, s16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[64:65] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[66:67] +; CHECK-NEXT: s_cbranch_execnz .LBB1_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[54:55] +; CHECK-NEXT: v_readlane_b32 s67, v40, 19 +; CHECK-NEXT: v_readlane_b32 s66, v40, 18 +; CHECK-NEXT: v_readlane_b32 s65, v40, 17 +; CHECK-NEXT: v_readlane_b32 s64, v40, 16 +; CHECK-NEXT: v_readlane_b32 s55, v40, 15 +; CHECK-NEXT: v_readlane_b32 s54, v40, 14 +; CHECK-NEXT: v_readlane_b32 s53, v40, 13 +; CHECK-NEXT: v_readlane_b32 s52, v40, 12 +; CHECK-NEXT: v_readlane_b32 s51, v40, 11 +; CHECK-NEXT: v_readlane_b32 s50, v40, 10 +; CHECK-NEXT: v_readlane_b32 s49, v40, 9 +; CHECK-NEXT: v_readlane_b32 s48, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: 
v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 -; CHECK-NEXT: v_readlane_b32 s4, v40, 2 +; CHECK-NEXT: v_readlane_b32 s4, v40, 20 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] @@ -76,3 +193,66 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) { tail call void %fptr(i32 inreg %vgpr) ret void } + +declare void @user(ptr addrspace(5)) + +define amdgpu_kernel void @v_multiple_frame_indexes_literal_offsets() #0 { +; CHECK-LABEL: v_multiple_frame_indexes_literal_offsets: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: v_mov_b32_e32 v3, 8 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s33, s16 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_movk_i32 s32, 0x400 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s15, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s15, v3 +; CHECK-NEXT: s_and_saveexec_b64 s[52:53], vcc +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, user@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, user@gotpcrel32@hi+12 +; 
CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: s_mov_b32 s0, s15 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: ; implicit-def: $vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[52:53] +; CHECK-NEXT: s_cbranch_execnz .LBB2_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_endpgm + %vgpr = call i32 @llvm.amdgcn.workitem.id.x() + %alloca0 = alloca [2 x i32], align 8, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + %cmp = icmp eq i32 %vgpr, 0 + %select = select i1 %cmp, ptr addrspace(5) %alloca0, ptr addrspace(5) %alloca1 + call void @user(ptr addrspace(5) inreg %select) + ret void +} + +declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } From 58313b68c6fd6e303abb15dadb4ed60254effffb Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Fri, 4 Jul 2025 13:38:57 +0800 Subject: [PATCH 2/3] search only within block --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 63 ++++++++++------------ llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 44 --------------- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 6 --- 3 files changed, 29 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index b929c4e7f70e2..c01a6b9ba046e 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -125,12 +125,11 @@ class SIFixSGPRCopies { SmallVector PHINodes; SmallVector S2VCopies; struct V2PysSCopyInfo { - bool CanConvert; SmallVector MOs; SmallVector 
SGPRs; }; DenseMap WaterFalls; - DenseMap V2SCanErase; + DenseSet V2PhySCopies; unsigned NextVGPRToSGPRCopyID = 0; MapVector V2SCopies; DenseMap> SiblingPenalty; @@ -810,7 +809,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { S2VCopies.clear(); PHISources.clear(); WaterFalls.clear(); - V2SCanErase.clear(); + V2PhySCopies.clear(); return true; } @@ -914,35 +913,32 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, MI.eraseFromParent(); } else if (SrcReg.isVirtual() && TRI->getRegSizeInBits(SrcReg, *MRI) == TRI->getRegSizeInBits(DstReg, *MRI)) { + auto I = MI.getIterator(); + auto E = MI.getParent()->end(); // COPY can be erased if all its uses can be converted to waterfall. - if (V2SCanErase.count(&MI) == 0) - V2SCanErase[&MI] = true; - for (auto UseMI : TRI->findRegUsesFrom(&MI, DstReg, {DstReg}, {})) { + bool CanErase = true; + // Only search current block since phyreg's def & use cannot cross + // blocks when MF.NoPhi = false. + while (++I != E) { // Currently, we only support waterfall on SI_CALL_ISEL. - if (UseMI->getOpcode() != AMDGPU::SI_CALL_ISEL) { - V2SCanErase[&MI] = false; - continue; - } - // If CALL has one pysical reg used which is not dominated by its COPY - // def, we cannot create waterfall on UseMI. - // If we cannot create waterfall on UseMI, we cannot erase COPY. 
- if (!MDT->dominates(&MI, UseMI)) { - WaterFalls[UseMI].CanConvert = false; - V2SCanErase[&MI] = false; - continue; - } - for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) { - if (UseMI->getOperand(i).isReg() && - UseMI->getOperand(i).getReg() == DstReg) { - MachineOperand *MO = &UseMI->getOperand(i); - MO->setReg(SrcReg); - if (WaterFalls.count(UseMI) == 0) - WaterFalls[UseMI].CanConvert = true; - WaterFalls[UseMI].MOs.push_back(MO); - WaterFalls[UseMI].SGPRs.push_back(DstReg); + if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) { + MachineInstr *UseMI = &*I; + for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) { + if (UseMI->getOperand(i).isReg() && + UseMI->getOperand(i).getReg() == DstReg) { + MachineOperand *MO = &UseMI->getOperand(i); + MO->setReg(SrcReg); + WaterFalls[UseMI].MOs.push_back(MO); + WaterFalls[UseMI].SGPRs.push_back(DstReg); + } } - } + } else if (I->readsRegister(DstReg, TRI)) + CanErase = false; + if (I->findRegisterDefOperand(DstReg, TRI)) + break; } + if (CanErase) + V2PhySCopies.insert(&MI); } return true; } @@ -1174,8 +1170,7 @@ void SIFixSGPRCopies::lowerPysicalSGPRInsts(MachineFunction &MF) { for (auto &Entry : WaterFalls) { MachineInstr *MI = Entry.first; struct V2PysSCopyInfo Info = Entry.second; - if (!Info.CanConvert || Info.MOs.size() == 0 || - Info.SGPRs.size() != Info.MOs.size()) + if (Info.MOs.size() == 0 || Info.SGPRs.size() != Info.MOs.size()) continue; if (MI->getOpcode() == AMDGPU::SI_CALL_ISEL) { @@ -1203,11 +1198,11 @@ void SIFixSGPRCopies::lowerPysicalSGPRInsts(MachineFunction &MF) { llvm::loadMBUFScalarOperandsFromVGPR(*TII, *MI, Info.MOs, MDT, Start, End, Info.SGPRs); } - - for (auto &Entry : V2SCanErase) - if (Entry.second) - Entry.first->eraseFromParent(); } + // Avoid some O0 tests where no use of COPY to SGPR + if (!WaterFalls.empty()) + for (auto &Entry : V2PhySCopies) + Entry->eraseFromParent(); } void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 6fc9e3a313ce4..6754be1a0b619 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -4063,50 +4063,6 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, return 0; } -SmallVector -SIRegisterInfo::findRegUsesFrom(MachineInstr *StartMI, Register TrgReg, - const DenseSet &StopAtDefs, - const DenseSet &Opcodes) const { - DenseSet Visited; - SmallVector Stack; - - Stack.push_back(&*std::next(StartMI->getIterator())); - - SmallVector Uses; - while (!Stack.empty()) { - MachineInstr *I = Stack.back(); - Stack.pop_back(); - if (!Visited.insert(I).second) - continue; - - MachineBasicBlock *MBB = I->getParent(); - MachineBasicBlock::iterator It = I->getIterator(); - MachineBasicBlock::iterator E = MBB->end(); - - bool DefFound = false; - while (It != E) { - if (It->readsRegister(TrgReg, this) && &*It != StartMI) - // Only add to Uses if the opcode is in the allowed set - if (Opcodes.empty() || Opcodes.count(It->getOpcode())) - Uses.push_back(&*It); - for (auto DefReg : StopAtDefs) - if (It->findRegisterDefOperand(DefReg, this)) { - DefFound = true; - break; - } - if (DefFound) - break; - It++; - } - if (DefFound) - continue; - // Push successors onto the stack to visit next. - for (auto *Succ : MBB->successors()) - Stack.push_back(&*(Succ->begin())); - } - return Uses; -} - SmallVector SIRegisterInfo::getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index eca310d46f3e4..7eebac4dedfd8 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -491,12 +491,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, const TargetRegisterClass &RC) const; - // \returns list of MI uses defined physical reg by a given \p MI. 
- SmallVector - findRegUsesFrom(MachineInstr *StartMI, Register TrgReg, - const DenseSet &StopAtDefs, - const DenseSet &Opcodes) const; - std::optional getVRegFlagValue(StringRef Name) const override { return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG : std::optional{}; From 4e8c6ef44f7a430b93ebda02df07b81d00361164 Mon Sep 17 00:00:00 2001 From: shore <372660931@qq.com> Date: Fri, 4 Jul 2025 13:41:44 +0800 Subject: [PATCH 3/3] remove header include --- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7eebac4dedfd8..06a7a17b0246b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -15,7 +15,6 @@ #define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseSet.h" #define GET_REGINFO_HEADER #include "AMDGPUGenRegisterInfo.inc"