diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 1bf5b4a241780..c01a6b9ba046e 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -124,6 +124,12 @@ class SIFixSGPRCopies { SmallVector<MachineInstr *, 4> RegSequences; SmallVector<MachineInstr *, 4> PHINodes; SmallVector<MachineInstr *, 4> S2VCopies; + struct V2PhySCopyInfo { + SmallVector<MachineOperand *> MOs; + SmallVector<Register> SGPRs; + }; + DenseMap<MachineInstr *, V2PhySCopyInfo> WaterFalls; + DenseSet<MachineInstr *> V2PhySCopies; unsigned NextVGPRToSGPRCopyID = 0; MapVector<unsigned, V2SCopyInfo> V2SCopies; DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty; @@ -143,6 +149,7 @@ class SIFixSGPRCopies { bool needToBeConvertedToVALU(V2SCopyInfo *I); void analyzeVGPRToSGPRCopy(MachineInstr *MI); void lowerVGPR2SGPRCopies(MachineFunction &MF); + void lowerPhysicalSGPRInsts(MachineFunction &MF); // Handles copies which source register is: // 1. Physical register // 2. AGPR @@ -770,6 +777,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { } } + lowerPhysicalSGPRInsts(MF); lowerVGPR2SGPRCopies(MF); // Postprocessing fixSCCCopies(MF); @@ -800,6 +808,8 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { PHINodes.clear(); S2VCopies.clear(); PHISources.clear(); + WaterFalls.clear(); + V2PhySCopies.clear(); return true; } @@ -901,6 +911,34 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, MI, MI.getDebugLoc())) { I = std::next(I); MI.eraseFromParent(); + } else if (SrcReg.isVirtual() && TRI->getRegSizeInBits(SrcReg, *MRI) == + TRI->getRegSizeInBits(DstReg, *MRI)) { + auto I = MI.getIterator(); + auto E = MI.getParent()->end(); + // The COPY can be erased if all of its uses can be converted to + // waterfall loops. + bool CanErase = true; + // Only search the current block: a physical register's def and use + // cannot cross block boundaries before the NoPHIs property is set. + while (++I != E) { + // Currently, we only support waterfall on SI_CALL_ISEL. + if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) { + MachineInstr *UseMI = &*I; + for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) { + if (UseMI->getOperand(i).isReg() && + UseMI->getOperand(i).getReg() == DstReg) { + MachineOperand *MO = &UseMI->getOperand(i); + MO->setReg(SrcReg); + WaterFalls[UseMI].MOs.push_back(MO); + WaterFalls[UseMI].SGPRs.push_back(DstReg); + } + } + } else if (I->readsRegister(DstReg, TRI)) { + CanErase = false; + } + if (I->findRegisterDefOperand(DstReg, TRI)) + break; + } + if (CanErase) + V2PhySCopies.insert(&MI); } return true; } @@ -1128,6 +1166,45 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { } } +void SIFixSGPRCopies::lowerPhysicalSGPRInsts(MachineFunction &MF) { + for (auto &Entry : WaterFalls) { + MachineInstr *MI = Entry.first; + const V2PhySCopyInfo &Info = Entry.second; + if (Info.MOs.empty() || Info.SGPRs.size() != Info.MOs.size()) + continue; + + if (MI->getOpcode() == AMDGPU::SI_CALL_ISEL) { + // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, along + // with the copies that follow the call, into the waterfall loop block; + // copies from and to physical registers have to move with the call.
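+ // Illustration (register names are examples only, not from a real + // test): a divergent '$sgpr0 = COPY %val' feeding SI_CALL_ISEL was + // rewritten above so the call temporarily consumes %val directly; the + // waterfall loop built below readfirstlanes %val and re-creates $sgpr0 + // inside the loop body for each unique lane value.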
+ unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); + unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + + // Find the call-frame setup/destroy instructions bracketing the call. + MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock::iterator Start(MI); + while (Start->getOpcode() != FrameSetupOpcode) + --Start; + MachineBasicBlock::iterator End(MI); + while (End->getOpcode() != FrameDestroyOpcode) + ++End; + + // Also include the copies of the return value that follow the call. + ++End; + while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && + MI->definesRegister(End->getOperand(1).getReg(), TRI)) + ++End; + + llvm::loadMBUFScalarOperandsFromVGPR(*TII, *MI, Info.MOs, MDT, Start, End, + Info.SGPRs); + } + } + // At -O0 the COPY into the physical SGPR may have no waterfall user; only + // erase the recorded copies when a waterfall loop was actually emitted. + if (!WaterFalls.empty()) + for (auto &Entry : V2PhySCopies) + Entry->eraseFromParent(); +} + void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { bool IsWave32 = MF.getSubtarget<GCNSubtarget>().isWave32(); for (MachineBasicBlock &MBB : MF) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2d37451eb32b9..2571da1bb8a68 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6540,13 +6540,10 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, // Emit the actual waterfall loop, executing the wrapped instruction for each // unique value of \p ScalarOps across all lanes. In the best case we execute 1 // iteration, in the worst case we execute 64 (once per lane). -static void -emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, - MachineRegisterInfo &MRI, - MachineBasicBlock &LoopBB, - MachineBasicBlock &BodyBB, - const DebugLoc &DL, - ArrayRef<MachineOperand *> ScalarOps) { +static void emitLoadScalarOpsFromVGPRLoop( + const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, + MachineBasicBlock &BodyBB, const DebugLoc &DL, + ArrayRef<MachineOperand *> ScalarOps, SmallVector<Register> PhySGPRs = {}) { MachineFunction &MF = *LoopBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -6561,7 +6558,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineBasicBlock::iterator I = LoopBB.begin(); Register CondReg; - + int Idx = 0; for (MachineOperand *ScalarOp : ScalarOps) { unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI); unsigned NumSubRegs = RegSize / 32; @@ -6591,7 +6588,16 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, } // Update ScalarOp operand to use the SGPR ScalarOp. - ScalarOp->setReg(CurReg); + if (PhySGPRs.empty()) { + ScalarOp->setReg(CurReg); + } else { + // Insert the copy in the same block as the use. + BuildMI(*ScalarOp->getParent()->getParent(), + ScalarOp->getParent()->getIterator(), DL, TII.get(AMDGPU::COPY), + PhySGPRs[Idx]) + .addReg(CurReg); + ScalarOp->setReg(PhySGPRs[Idx]); + } ScalarOp->setIsKill(); } else { SmallVector<Register, 8> ReadlanePieces; @@ -6660,9 +6666,18 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, } // Update ScalarOp operand to use the SGPR ScalarOp.
- ScalarOp->setReg(SScalarOp); + if (PhySGPRs.empty()) { + ScalarOp->setReg(SScalarOp); + } else { + BuildMI(*ScalarOp->getParent()->getParent(), + ScalarOp->getParent()->getIterator(), DL, TII.get(AMDGPU::COPY), + PhySGPRs[Idx]) + .addReg(SScalarOp); + ScalarOp->setReg(PhySGPRs[Idx]); + } ScalarOp->setIsKill(); } + Idx++; } Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); @@ -6686,12 +6701,13 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register // with SGPRs by iterating over all unique values across all lanes. // Returns the loop basic block that now contains \p MI. -static MachineBasicBlock * -loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, - ArrayRef<MachineOperand *> ScalarOps, - MachineDominatorTree *MDT, - MachineBasicBlock::iterator Begin = nullptr, - MachineBasicBlock::iterator End = nullptr) { +MachineBasicBlock *llvm::loadMBUFScalarOperandsFromVGPR( + const SIInstrInfo &TII, MachineInstr &MI, + ArrayRef<MachineOperand *> ScalarOps, MachineDominatorTree *MDT, + MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, + SmallVector<Register> PhySGPRs) { + assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) && + "Physical SGPRs must be empty or match the number of scalar operands"); MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -6777,7 +6793,8 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, } } - emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps); + emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps, + PhySGPRs); MachineBasicBlock::iterator First = RemainderBB->begin(); // Restore SCC @@ -6998,13 +7015,13 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, : AMDGPU::OpName::srsrc; MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName); if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) - CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT); + CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT); AMDGPU::OpName SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp; MachineOperand *SSamp = getNamedOperand(MI, SampOpName); if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) - CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT); + CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT); return CreatedBB; } @@ -7032,8 +7049,8 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr)) ++End; - CreatedBB = - loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End); + CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, + Start, End); } } @@ -7215,11 +7232,11 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // Legalize a VGPR Rsrc and soffset together.
if (!isSoffsetLegal) { MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset); - CreatedBB = - loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT); + CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, + {Rsrc, Soffset}, MDT); return CreatedBB; } - CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT); + CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT); return CreatedBB; } } @@ -7227,7 +7244,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // Legalize a VGPR soffset. if (!isSoffsetLegal) { MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset); - CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT); + CreatedBB = llvm::loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT); return CreatedBB; } return CreatedBB; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 01dd3c9f4119e..77b0713275834 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1554,6 +1554,17 @@ bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI); +/// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register +/// with SGPRs by iterating over all unique values across all lanes. +/// Returns the loop basic block that now contains \p MI. +MachineBasicBlock * +loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, + ArrayRef<MachineOperand *> ScalarOps, + MachineDominatorTree *MDT, + MachineBasicBlock::iterator Begin = nullptr, + MachineBasicBlock::iterator End = nullptr, + SmallVector<Register> PhySGPRs = {}); + namespace AMDGPU { LLVM_READONLY diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll index 34f4476f7fd6a..27d7f117711f6 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll @@ -1,22 +1,452 @@ -; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s - -; CHECK: illegal VGPR to SGPR copy +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope %s declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0 declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0 declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0 define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 { +; CHECK-LABEL: test_call_external_void_func_a15i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s40, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[42:43] +; CHECK-NEXT: v_writelane_b32 v40, s40, 32 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: 
v_writelane_b32 v40, s48, 8 +; CHECK-NEXT: v_writelane_b32 v40, s49, 9 +; CHECK-NEXT: v_writelane_b32 v40, s50, 10 +; CHECK-NEXT: v_writelane_b32 v40, s51, 11 +; CHECK-NEXT: v_writelane_b32 v40, s52, 12 +; CHECK-NEXT: v_writelane_b32 v40, s53, 13 +; CHECK-NEXT: v_writelane_b32 v40, s54, 14 +; CHECK-NEXT: v_writelane_b32 v40, s55, 15 +; CHECK-NEXT: v_writelane_b32 v40, s64, 16 +; CHECK-NEXT: v_writelane_b32 v40, s65, 17 +; CHECK-NEXT: v_writelane_b32 v40, s66, 18 +; CHECK-NEXT: v_writelane_b32 v40, s67, 19 +; CHECK-NEXT: v_writelane_b32 v40, s68, 20 +; CHECK-NEXT: v_writelane_b32 v40, s69, 21 +; CHECK-NEXT: v_writelane_b32 v40, s70, 22 +; CHECK-NEXT: v_writelane_b32 v40, s71, 23 +; CHECK-NEXT: v_writelane_b32 v40, s80, 24 +; CHECK-NEXT: v_writelane_b32 v40, s81, 25 +; CHECK-NEXT: v_writelane_b32 v40, s82, 26 +; CHECK-NEXT: v_writelane_b32 v40, s83, 27 +; CHECK-NEXT: v_writelane_b32 v40, s84, 28 +; CHECK-NEXT: v_writelane_b32 v40, s85, 29 +; CHECK-NEXT: v_writelane_b32 v40, s86, 30 +; CHECK-NEXT: s_mov_b32 s50, s29 +; CHECK-NEXT: s_mov_b32 s51, s28 +; CHECK-NEXT: s_mov_b32 s52, s27 +; CHECK-NEXT: s_mov_b32 s53, s26 +; CHECK-NEXT: s_mov_b32 s54, s25 +; CHECK-NEXT: s_mov_b32 s55, s24 +; CHECK-NEXT: s_mov_b32 s64, s23 +; CHECK-NEXT: s_mov_b32 s65, s22 +; CHECK-NEXT: s_mov_b32 s66, s21 +; CHECK-NEXT: s_mov_b32 s67, s20 +; CHECK-NEXT: s_mov_b32 s68, s19 +; CHECK-NEXT: s_mov_b32 s69, s18 +; CHECK-NEXT: s_mov_b32 s70, s17 +; CHECK-NEXT: s_mov_b32 s71, s16 +; CHECK-NEXT: s_mov_b32 s80, s15 +; CHECK-NEXT: s_mov_b32 s81, s14 +; CHECK-NEXT: s_mov_b32 s82, s13 +; CHECK-NEXT: s_mov_b32 s83, s12 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_mov_b64 s[84:85], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s87, 31 +; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s26, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s26, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[86:87], vcc +; CHECK-NEXT: s_getpc_b64 s[28:29] +; CHECK-NEXT: s_add_u32 s28, s28, external_void_func_a15i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s29, s29, external_void_func_a15i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s83 +; CHECK-NEXT: s_mov_b32 s13, s82 +; CHECK-NEXT: s_mov_b32 s14, s81 +; CHECK-NEXT: s_mov_b32 s15, s80 +; CHECK-NEXT: s_mov_b32 s0, s71 +; CHECK-NEXT: s_mov_b32 s1, s70 +; CHECK-NEXT: s_mov_b32 s2, s69 +; CHECK-NEXT: s_mov_b32 s3, s68 +; CHECK-NEXT: s_mov_b32 s16, s67 +; CHECK-NEXT: s_mov_b32 s17, s66 +; CHECK-NEXT: s_mov_b32 s18, s65 +; CHECK-NEXT: s_mov_b32 s19, s64 +; CHECK-NEXT: s_mov_b32 s20, s55 +; CHECK-NEXT: s_mov_b32 s21, s54 +; CHECK-NEXT: s_mov_b32 s22, s53 +; CHECK-NEXT: s_mov_b32 s23, s52 +; CHECK-NEXT: s_mov_b32 s24, s51 +; CHECK-NEXT: s_mov_b32 s25, s50 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[28:29] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[86:87] +; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[84:85] +; CHECK-NEXT: v_readlane_b32 s87, v40, 31 +; CHECK-NEXT: v_readlane_b32 s86, v40, 30 +; CHECK-NEXT: v_readlane_b32 s85, v40, 29 +; CHECK-NEXT: v_readlane_b32 s84, v40, 28 +; CHECK-NEXT: v_readlane_b32 s83, v40, 27 
+; CHECK-NEXT: v_readlane_b32 s82, v40, 26 +; CHECK-NEXT: v_readlane_b32 s81, v40, 25 +; CHECK-NEXT: v_readlane_b32 s80, v40, 24 +; CHECK-NEXT: v_readlane_b32 s71, v40, 23 +; CHECK-NEXT: v_readlane_b32 s70, v40, 22 +; CHECK-NEXT: v_readlane_b32 s69, v40, 21 +; CHECK-NEXT: v_readlane_b32 s68, v40, 20 +; CHECK-NEXT: v_readlane_b32 s67, v40, 19 +; CHECK-NEXT: v_readlane_b32 s66, v40, 18 +; CHECK-NEXT: v_readlane_b32 s65, v40, 17 +; CHECK-NEXT: v_readlane_b32 s64, v40, 16 +; CHECK-NEXT: v_readlane_b32 s55, v40, 15 +; CHECK-NEXT: v_readlane_b32 s54, v40, 14 +; CHECK-NEXT: v_readlane_b32 s53, v40, 13 +; CHECK-NEXT: v_readlane_b32 s52, v40, 12 +; CHECK-NEXT: v_readlane_b32 s51, v40, 11 +; CHECK-NEXT: v_readlane_b32 s50, v40, 10 +; CHECK-NEXT: v_readlane_b32 s49, v40, 9 +; CHECK-NEXT: v_readlane_b32 s48, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v40, 32 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0) ret void } define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 { +; CHECK-LABEL: test_call_external_void_func_a16i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s40, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[42:43] +; CHECK-NEXT: v_writelane_b32 v40, s40, 32 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v40, s48, 8 +; CHECK-NEXT: v_writelane_b32 v40, s49, 9 +; CHECK-NEXT: v_writelane_b32 v40, s50, 10 +; CHECK-NEXT: v_writelane_b32 v40, s51, 11 +; CHECK-NEXT: v_writelane_b32 v40, s52, 12 +; CHECK-NEXT: v_writelane_b32 v40, s53, 13 +; CHECK-NEXT: v_writelane_b32 v40, s54, 14 +; CHECK-NEXT: v_writelane_b32 v40, s55, 15 +; CHECK-NEXT: v_writelane_b32 v40, s64, 16 +; CHECK-NEXT: v_writelane_b32 v40, s65, 17 +; CHECK-NEXT: v_writelane_b32 v40, s66, 18 +; CHECK-NEXT: v_writelane_b32 v40, s67, 19 +; CHECK-NEXT: v_writelane_b32 v40, s68, 20 +; CHECK-NEXT: v_writelane_b32 v40, s69, 21 +; CHECK-NEXT: v_writelane_b32 v40, s70, 22 +; CHECK-NEXT: v_writelane_b32 v40, s71, 23 +; CHECK-NEXT: v_writelane_b32 v40, s80, 24 +; CHECK-NEXT: v_writelane_b32 v40, s81, 25 +; CHECK-NEXT: v_writelane_b32 v40, s82, 26 +; CHECK-NEXT: v_writelane_b32 v40, s83, 27 +; CHECK-NEXT: v_writelane_b32 v40, s84, 28 +; CHECK-NEXT: v_writelane_b32 v40, s85, 29 +; CHECK-NEXT: v_writelane_b32 v40, s86, 30 +; CHECK-NEXT: s_mov_b32 s50, s29 +; CHECK-NEXT: s_mov_b32 s51, s28 +; CHECK-NEXT: s_mov_b32 s52, s27 +; CHECK-NEXT: s_mov_b32 s53, s26 +; 
CHECK-NEXT: s_mov_b32 s54, s25 +; CHECK-NEXT: s_mov_b32 s55, s24 +; CHECK-NEXT: s_mov_b32 s64, s23 +; CHECK-NEXT: s_mov_b32 s65, s22 +; CHECK-NEXT: s_mov_b32 s66, s21 +; CHECK-NEXT: s_mov_b32 s67, s20 +; CHECK-NEXT: s_mov_b32 s68, s19 +; CHECK-NEXT: s_mov_b32 s69, s18 +; CHECK-NEXT: s_mov_b32 s70, s17 +; CHECK-NEXT: s_mov_b32 s71, s16 +; CHECK-NEXT: s_mov_b32 s80, s15 +; CHECK-NEXT: s_mov_b32 s81, s14 +; CHECK-NEXT: s_mov_b32 s82, s13 +; CHECK-NEXT: s_mov_b32 s83, s12 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_mov_b64 s[84:85], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s87, 31 +; CHECK-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s26, v0 +; CHECK-NEXT: v_readfirstlane_b32 s27, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s26, v0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s27, v1 +; CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[86:87], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[28:29] +; CHECK-NEXT: s_add_u32 s28, s28, external_void_func_a16i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s29, s29, external_void_func_a16i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s83 +; CHECK-NEXT: s_mov_b32 s13, s82 +; CHECK-NEXT: s_mov_b32 s14, s81 +; CHECK-NEXT: s_mov_b32 s15, s80 +; CHECK-NEXT: s_mov_b32 s0, s71 +; CHECK-NEXT: s_mov_b32 s1, s70 +; CHECK-NEXT: s_mov_b32 s2, s69 +; CHECK-NEXT: s_mov_b32 s3, s68 +; CHECK-NEXT: s_mov_b32 s16, s67 +; CHECK-NEXT: s_mov_b32 s17, s66 +; CHECK-NEXT: s_mov_b32 s18, s65 +; CHECK-NEXT: s_mov_b32 s19, s64 +; CHECK-NEXT: s_mov_b32 s20, s55 +; CHECK-NEXT: s_mov_b32 s21, s54 +; CHECK-NEXT: s_mov_b32 s22, s53 +; CHECK-NEXT: s_mov_b32 s23, s52 +; CHECK-NEXT: s_mov_b32 s24, s51 +; CHECK-NEXT: s_mov_b32 s25, s50 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[28:29] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[86:87] +; CHECK-NEXT: s_cbranch_execnz .LBB1_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[84:85] +; CHECK-NEXT: v_readlane_b32 s87, v40, 31 +; CHECK-NEXT: v_readlane_b32 s86, v40, 30 +; CHECK-NEXT: v_readlane_b32 s85, v40, 29 +; CHECK-NEXT: v_readlane_b32 s84, v40, 28 +; CHECK-NEXT: v_readlane_b32 s83, v40, 27 +; CHECK-NEXT: v_readlane_b32 s82, v40, 26 +; CHECK-NEXT: v_readlane_b32 s81, v40, 25 +; CHECK-NEXT: v_readlane_b32 s80, v40, 24 +; CHECK-NEXT: v_readlane_b32 s71, v40, 23 +; CHECK-NEXT: v_readlane_b32 s70, v40, 22 +; CHECK-NEXT: v_readlane_b32 s69, v40, 21 +; CHECK-NEXT: v_readlane_b32 s68, v40, 20 +; CHECK-NEXT: v_readlane_b32 s67, v40, 19 +; CHECK-NEXT: v_readlane_b32 s66, v40, 18 +; CHECK-NEXT: v_readlane_b32 s65, v40, 17 +; CHECK-NEXT: v_readlane_b32 s64, v40, 16 +; CHECK-NEXT: v_readlane_b32 s55, v40, 15 +; CHECK-NEXT: v_readlane_b32 s54, v40, 14 +; CHECK-NEXT: v_readlane_b32 s53, v40, 13 +; CHECK-NEXT: v_readlane_b32 s52, v40, 12 +; CHECK-NEXT: v_readlane_b32 s51, v40, 11 +; CHECK-NEXT: v_readlane_b32 s50, v40, 10 +; CHECK-NEXT: v_readlane_b32 s49, v40, 9 +; CHECK-NEXT: v_readlane_b32 s48, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: 
v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v40, 32 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0) ret void } define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 { +; CHECK-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s40, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[42:43] +; CHECK-NEXT: v_writelane_b32 v40, s40, 32 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v40, s48, 8 +; CHECK-NEXT: v_writelane_b32 v40, s49, 9 +; CHECK-NEXT: v_writelane_b32 v40, s50, 10 +; CHECK-NEXT: v_writelane_b32 v40, s51, 11 +; CHECK-NEXT: v_writelane_b32 v40, s52, 12 +; CHECK-NEXT: v_writelane_b32 v40, s53, 13 +; CHECK-NEXT: v_writelane_b32 v40, s54, 14 +; CHECK-NEXT: v_writelane_b32 v40, s55, 15 +; CHECK-NEXT: v_writelane_b32 v40, s64, 16 +; CHECK-NEXT: v_writelane_b32 v40, s65, 17 +; CHECK-NEXT: v_writelane_b32 v40, s66, 18 +; CHECK-NEXT: v_writelane_b32 v40, s67, 19 +; CHECK-NEXT: v_writelane_b32 v40, s68, 20 +; CHECK-NEXT: v_writelane_b32 v40, s69, 21 +; CHECK-NEXT: v_writelane_b32 v40, s70, 22 +; CHECK-NEXT: v_writelane_b32 v40, s71, 23 +; CHECK-NEXT: v_writelane_b32 v40, s80, 24 +; CHECK-NEXT: v_writelane_b32 v40, s81, 25 +; CHECK-NEXT: v_writelane_b32 v40, s82, 26 +; CHECK-NEXT: v_writelane_b32 v40, s83, 27 +; CHECK-NEXT: v_writelane_b32 v40, s84, 28 +; CHECK-NEXT: v_writelane_b32 v40, s85, 29 +; CHECK-NEXT: v_writelane_b32 v40, s86, 30 +; CHECK-NEXT: s_mov_b32 s50, s29 +; CHECK-NEXT: s_mov_b32 s51, s28 +; CHECK-NEXT: s_mov_b32 s52, s27 +; CHECK-NEXT: s_mov_b32 s53, s26 +; CHECK-NEXT: s_mov_b32 s54, s25 +; CHECK-NEXT: s_mov_b32 s55, s24 +; CHECK-NEXT: s_mov_b32 s64, s23 +; CHECK-NEXT: s_mov_b32 s65, s22 +; CHECK-NEXT: s_mov_b32 s66, s21 +; CHECK-NEXT: s_mov_b32 s67, s20 +; CHECK-NEXT: s_mov_b32 s68, s19 +; CHECK-NEXT: s_mov_b32 s69, s18 +; CHECK-NEXT: s_mov_b32 s70, s17 +; CHECK-NEXT: s_mov_b32 s71, s16 +; CHECK-NEXT: s_mov_b32 s80, s15 +; CHECK-NEXT: s_mov_b32 s81, s14 +; CHECK-NEXT: s_mov_b32 s82, s13 +; CHECK-NEXT: s_mov_b32 s83, s12 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_mov_b64 s[84:85], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s87, 31 +; CHECK-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s26, v0 +; CHECK-NEXT: v_readfirstlane_b32 s27, v1 +; 
CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s26, v0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s27, v1 +; CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[86:87], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[28:29] +; CHECK-NEXT: s_add_u32 s28, s28, external_void_func_a15i32_inreg_i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s29, s29, external_void_func_a15i32_inreg_i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s83 +; CHECK-NEXT: s_mov_b32 s13, s82 +; CHECK-NEXT: s_mov_b32 s14, s81 +; CHECK-NEXT: s_mov_b32 s15, s80 +; CHECK-NEXT: s_mov_b32 s0, s71 +; CHECK-NEXT: s_mov_b32 s1, s70 +; CHECK-NEXT: s_mov_b32 s2, s69 +; CHECK-NEXT: s_mov_b32 s3, s68 +; CHECK-NEXT: s_mov_b32 s16, s67 +; CHECK-NEXT: s_mov_b32 s17, s66 +; CHECK-NEXT: s_mov_b32 s18, s65 +; CHECK-NEXT: s_mov_b32 s19, s64 +; CHECK-NEXT: s_mov_b32 s20, s55 +; CHECK-NEXT: s_mov_b32 s21, s54 +; CHECK-NEXT: s_mov_b32 s22, s53 +; CHECK-NEXT: s_mov_b32 s23, s52 +; CHECK-NEXT: s_mov_b32 s24, s51 +; CHECK-NEXT: s_mov_b32 s25, s50 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[28:29] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[86:87] +; CHECK-NEXT: s_cbranch_execnz .LBB2_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[84:85] +; CHECK-NEXT: v_readlane_b32 s87, v40, 31 +; CHECK-NEXT: v_readlane_b32 s86, v40, 30 +; CHECK-NEXT: v_readlane_b32 s85, v40, 29 +; CHECK-NEXT: v_readlane_b32 s84, v40, 28 +; CHECK-NEXT: v_readlane_b32 s83, v40, 27 +; CHECK-NEXT: v_readlane_b32 s82, v40, 26 +; CHECK-NEXT: v_readlane_b32 s81, v40, 25 +; CHECK-NEXT: v_readlane_b32 s80, v40, 24 +; CHECK-NEXT: v_readlane_b32 s71, v40, 23 +; CHECK-NEXT: v_readlane_b32 s70, v40, 22 +; CHECK-NEXT: v_readlane_b32 s69, v40, 21 +; CHECK-NEXT: v_readlane_b32 s68, v40, 20 +; CHECK-NEXT: v_readlane_b32 s67, v40, 19 +; CHECK-NEXT: v_readlane_b32 s66, v40, 18 +; CHECK-NEXT: v_readlane_b32 s65, v40, 17 +; CHECK-NEXT: v_readlane_b32 s64, v40, 16 +; CHECK-NEXT: v_readlane_b32 s55, v40, 15 +; CHECK-NEXT: v_readlane_b32 s54, v40, 14 +; CHECK-NEXT: v_readlane_b32 s53, v40, 13 +; CHECK-NEXT: v_readlane_b32 s52, v40, 12 +; CHECK-NEXT: v_readlane_b32 s51, v40, 11 +; CHECK-NEXT: v_readlane_b32 s50, v40, 10 +; CHECK-NEXT: v_readlane_b32 s49, v40, 9 +; CHECK-NEXT: v_readlane_b32 s48, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s4, v40, 32 +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[6:7] +; CHECK-NEXT: s_mov_b32 s33, s4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll index 242b5e9aeaf42..c2482e678e978 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll +++ 
b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll @@ -1,13 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 2> %t.err < %s | FileCheck %s -; RUN: FileCheck -check-prefix=ERR %s < %t.err -; FIXME: These tests cannot be tail called, and should be executed in a waterfall loop. +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s declare hidden void @void_func_i32_inreg(i32 inreg) -; ERR: error: <unknown>:0:0: in function tail_call_i32_inreg_divergent void (i32): illegal VGPR to SGPR copy -; ERR: error: <unknown>:0:0: in function indirect_tail_call_i32_inreg_divergent void (i32): illegal VGPR to SGPR copy - define void @tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-LABEL: tail_call_i32_inreg_divergent: ; CHECK: ; %bb.0: @@ -17,19 +12,78 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 2 -; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s16, 18 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, void_func_i32_inreg@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, void_func_i32_inreg@rel32@hi+12 -; CHECK-NEXT: ; illegal copy v0 to s0 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v40, s48, 8 +; CHECK-NEXT: v_writelane_b32 v40, s49, 9 +; CHECK-NEXT: v_writelane_b32 v40, s50, 10 +; CHECK-NEXT: v_writelane_b32 v40, s51, 11 +; CHECK-NEXT: v_writelane_b32 v40, s52, 12 +; CHECK-NEXT: v_writelane_b32 v40, s53, 13 +; CHECK-NEXT: v_writelane_b32 v40, s54, 14 +; CHECK-NEXT: v_writelane_b32 v40, s55, 15 +; CHECK-NEXT: v_writelane_b32 v40, s64, 16 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_mov_b64 s[54:55], exec +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s65, 17 +; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s16, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s16, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[64:65], vcc +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, void_func_i32_inreg@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, void_func_i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 +; CHECK-NEXT: s_mov_b32 s0, s16 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, 
s[64:65] +; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[54:55] +; CHECK-NEXT: v_readlane_b32 s65, v40, 17 +; CHECK-NEXT: v_readlane_b32 s64, v40, 16 +; CHECK-NEXT: v_readlane_b32 s55, v40, 15 +; CHECK-NEXT: v_readlane_b32 s54, v40, 14 +; CHECK-NEXT: v_readlane_b32 s53, v40, 13 +; CHECK-NEXT: v_readlane_b32 s52, v40, 12 +; CHECK-NEXT: v_readlane_b32 s51, v40, 11 +; CHECK-NEXT: v_readlane_b32 s50, v40, 10 +; CHECK-NEXT: v_readlane_b32 s49, v40, 9 +; CHECK-NEXT: v_readlane_b32 s48, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 -; CHECK-NEXT: v_readlane_b32 s4, v40, 2 +; CHECK-NEXT: v_readlane_b32 s4, v40, 18 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] @@ -51,21 +105,84 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s16, 2 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, constant@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, constant@rel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; CHECK-NEXT: v_writelane_b32 v40, s16, 20 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: ; illegal copy v0 to s0 +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v40, s48, 8 +; CHECK-NEXT: v_writelane_b32 v40, s49, 9 +; CHECK-NEXT: v_writelane_b32 v40, s50, 10 +; CHECK-NEXT: v_writelane_b32 v40, s51, 11 +; CHECK-NEXT: v_writelane_b32 v40, s52, 12 +; CHECK-NEXT: v_writelane_b32 v40, s53, 13 +; CHECK-NEXT: v_writelane_b32 v40, s54, 14 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s55, 15 +; CHECK-NEXT: v_writelane_b32 v40, s64, 16 +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, constant@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, constant@rel32@hi+12 +; CHECK-NEXT: v_writelane_b32 v40, s65, 17 +; CHECK-NEXT: s_load_dwordx2 s[64:65], s[4:5], 0x0 +; CHECK-NEXT: v_writelane_b32 v40, s66, 18 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b32 s52, s13 +; CHECK-NEXT: s_mov_b32 s53, s12 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[54:55], exec +; CHECK-NEXT: v_writelane_b32 v40, s67, 19 +; CHECK-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s16, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s16, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[66:67], vcc +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 
s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s53 +; CHECK-NEXT: s_mov_b32 s13, s52 +; CHECK-NEXT: s_mov_b32 s14, s51 +; CHECK-NEXT: s_mov_b32 s15, s50 +; CHECK-NEXT: s_mov_b32 s0, s16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[64:65] +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[66:67] +; CHECK-NEXT: s_cbranch_execnz .LBB1_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_mov_b64 exec, s[54:55] +; CHECK-NEXT: v_readlane_b32 s67, v40, 19 +; CHECK-NEXT: v_readlane_b32 s66, v40, 18 +; CHECK-NEXT: v_readlane_b32 s65, v40, 17 +; CHECK-NEXT: v_readlane_b32 s64, v40, 16 +; CHECK-NEXT: v_readlane_b32 s55, v40, 15 +; CHECK-NEXT: v_readlane_b32 s54, v40, 14 +; CHECK-NEXT: v_readlane_b32 s53, v40, 13 +; CHECK-NEXT: v_readlane_b32 s52, v40, 12 +; CHECK-NEXT: v_readlane_b32 s51, v40, 11 +; CHECK-NEXT: v_readlane_b32 s50, v40, 10 +; CHECK-NEXT: v_readlane_b32 s49, v40, 9 +; CHECK-NEXT: v_readlane_b32 s48, v40, 8 +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 ; CHECK-NEXT: s_mov_b32 s32, s33 -; CHECK-NEXT: v_readlane_b32 s4, v40, 2 +; CHECK-NEXT: v_readlane_b32 s4, v40, 20 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] @@ -76,3 +193,66 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) { tail call void %fptr(i32 inreg %vgpr) ret void } + +declare void @user(ptr addrspace(5)) + +define amdgpu_kernel void @v_multiple_frame_indexes_literal_offsets() #0 { +; CHECK-LABEL: v_multiple_frame_indexes_literal_offsets: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: v_mov_b32_e32 v3, 8 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s33, s16 +; CHECK-NEXT: s_mov_b32 s50, s15 +; CHECK-NEXT: s_mov_b32 s51, s14 +; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] +; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5] +; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 +; CHECK-NEXT: s_movk_i32 s32, 0x400 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: v_readfirstlane_b32 s15, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s15, v3 +; CHECK-NEXT: s_and_saveexec_b64 s[52:53], vcc +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, user@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, user@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49] +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] +; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] +; CHECK-NEXT: s_mov_b32 s12, s51 +; CHECK-NEXT: s_mov_b32 s13, s50 +; CHECK-NEXT: s_mov_b32 s14, s33 +; CHECK-NEXT: s_mov_b32 s0, s15 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: 
s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: ; implicit-def: $vgpr3 +; CHECK-NEXT: ; implicit-def: $vgpr31 +; CHECK-NEXT: s_xor_b64 exec, exec, s[52:53] +; CHECK-NEXT: s_cbranch_execnz .LBB2_1 +; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_endpgm + %vgpr = call i32 @llvm.amdgcn.workitem.id.x() + %alloca0 = alloca [2 x i32], align 8, addrspace(5) + %alloca1 = alloca i32, align 4, addrspace(5) + %cmp = icmp eq i32 %vgpr, 0 + %select = select i1 %cmp, ptr addrspace(5) %alloca0, ptr addrspace(5) %alloca1 + call void @user(ptr addrspace(5) inreg %select) + ret void +} + +declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
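
A minimal usage sketch for the now-exported llvm::loadMBUFScalarOperandsFromVGPR with physical-SGPR destinations (a hypothetical call site, not part of this patch: CallMI, ArgIdx, MDT, Begin, and End stand in for values a caller already has, and AMDGPU::SGPR0 is an arbitrary example destination):

  // Record the divergent VGPR operand and the physical SGPR it must reach.
  SmallVector<MachineOperand *, 1> MOs = {&CallMI->getOperand(ArgIdx)};
  SmallVector<Register> PhySGPRs = {AMDGPU::SGPR0};
  // Wraps [Begin, End) in a waterfall loop; each iteration readfirstlanes the
  // VGPR and copies the uniform value into AMDGPU::SGPR0 before the use.
  llvm::loadMBUFScalarOperandsFromVGPR(*TII, *CallMI, MOs, MDT, Begin, End,
                                       PhySGPRs);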