From 9ab275122989c47cc0096739d03ff105c2d207d1 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 28 Dec 2024 20:39:27 +0100 Subject: [PATCH 1/8] JitArm64: Check GPRs/FPRs to push inside EmitBackpatchRoutine Preparation for the next commit, which will make EmitBackpatchRoutine allocate registers on its own. Because the register allocation will change during the call to EmitBackpatchRoutine, the set of GPRs/FPRs to push can't be computed prior to the call, so let's compute them during the call instead. --- Source/Core/Core/PowerPC/JitArm64/Jit.h | 8 +- .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 7 +- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 91 ++++++++++--------- .../JitArm64/JitArm64_LoadStoreFloating.cpp | 39 ++++---- .../JitArm64/JitArm64_LoadStorePaired.cpp | 32 +++---- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 90 +++++++++--------- 6 files changed, 137 insertions(+), 130 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 0c1ea0d64728..f037111a4479 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -273,11 +273,11 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA // !emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X30 // !emitting_routine && mode == Auto && jo.fastmem: X30 // - // Furthermore, any callee-saved register which isn't marked in gprs_to_push/fprs_to_push - // may be clobbered if mode != AlwaysFastAccess. + // If there are any other registers that the caller doesn't mind being overwritten, + // these can be indicated in scratch_gprs and scratch_fprs. void EmitBackpatchRoutine(u32 flags, MemAccessMode mode, Arm64Gen::ARM64Reg RS, - Arm64Gen::ARM64Reg addr, BitSet32 gprs_to_push = BitSet32(0), - BitSet32 fprs_to_push = BitSet32(0), bool emitting_routine = false); + Arm64Gen::ARM64Reg addr, BitSet32 scratch_gprs = BitSet32(0), + BitSet32 scratch_fprs = BitSet32(0), bool emitting_routine = false); // Loadstore routines void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index 1c54c00ebea3..059be57048f3 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -54,7 +54,7 @@ void JitArm64::DoBacktrace(uintptr_t access_address, SContext* ctx) } void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, ARM64Reg addr, - BitSet32 gprs_to_push, BitSet32 fprs_to_push, + BitSet32 scratch_gprs, BitSet32 scratch_fprs, bool emitting_routine) { const u32 access_size = BackPatchInfo::GetFlagSize(flags); @@ -65,6 +65,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, const bool emit_fast_access = mode != MemAccessMode::AlwaysSlowAccess; const bool emit_slow_access = mode != MemAccessMode::AlwaysFastAccess; + const BitSet32 gprs_to_push = + (emitting_routine ? CALLER_SAVED_GPRS : gpr.GetCallerSavedUsed()) & ~scratch_gprs; + const BitSet32 fprs_to_push = + (emitting_routine ? 
BitSet32(0xFFFFFFFF) : fpr.GetCallerSavedUsed()) & ~scratch_fprs; + bool in_far_code = false; const u8* fast_access_start = GetCodePtr(); std::optional slow_access_fixup; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index ebcb8142b7a4..7462cdd52977 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -123,14 +123,14 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o MOV(gpr.R(addr), addr_reg); } - BitSet32 regs_in_use = gpr.GetCallerSavedUsed(); - BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); + BitSet32 scratch_gprs; + BitSet32 scratch_fprs; if (!update || early_update) - regs_in_use[DecodeReg(ARM64Reg::W1)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (jo.memcheck || !jo.fastmem) - regs_in_use[DecodeReg(ARM64Reg::W0)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; if (!jo.memcheck) - regs_in_use[DecodeReg(dest_reg)] = 0; + scratch_gprs[DecodeReg(dest_reg)] = true; u32 access_size = BackPatchInfo::GetFlagSize(flags); u32 mmio_address = 0; @@ -140,22 +140,23 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o if (is_immediate && m_mmu.IsOptimizableRAMAddress(imm_addr, access_size)) { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, dest_reg, XA, regs_in_use, - fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, dest_reg, XA, scratch_gprs, + scratch_fprs); } else if (mmio_address) { - regs_in_use[DecodeReg(ARM64Reg::W1)] = 0; - regs_in_use[DecodeReg(ARM64Reg::W30)] = 0; - regs_in_use[DecodeReg(dest_reg)] = 0; - MMIOLoadToReg(m_system, m_system.GetMemory().GetMMIOMapping(), this, &m_float_emit, regs_in_use, - fprs_in_use, dest_reg, mmio_address, flags); + scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(ARM64Reg::W30)] = true; + scratch_gprs[DecodeReg(dest_reg)] = true; + MMIOLoadToReg(m_system, m_system.GetMemory().GetMMIOMapping(), this, &m_float_emit, + gpr.GetCallerSavedUsed() & ~scratch_gprs, + fpr.GetCallerSavedUsed() & ~scratch_fprs, dest_reg, mmio_address, flags); addr_reg_set = false; } else { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, XA, regs_in_use, fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, XA, scratch_gprs, scratch_fprs); } gpr.BindToRegister(dest, false, true); @@ -271,13 +272,13 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s MOV(gpr.R(dest), addr_reg); } - BitSet32 regs_in_use = gpr.GetCallerSavedUsed(); - BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); - regs_in_use[DecodeReg(ARM64Reg::W1)] = 0; + BitSet32 scratch_gprs; + BitSet32 scratch_fprs; + scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (!update || early_update) - regs_in_use[DecodeReg(ARM64Reg::W2)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; if (!jo.fastmem) - regs_in_use[DecodeReg(ARM64Reg::W0)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; u32 access_size = BackPatchInfo::GetFlagSize(flags); u32 mmio_address = 0; @@ -313,22 +314,24 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s else if (is_immediate && m_mmu.IsOptimizableRAMAddress(imm_addr, access_size)) { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, RS, XA, regs_in_use, fprs_in_use); + EmitBackpatchRoutine(flags, 
MemAccessMode::AlwaysFastAccess, RS, XA, scratch_gprs, + scratch_fprs); } else if (mmio_address) { - regs_in_use[DecodeReg(ARM64Reg::W1)] = 0; - regs_in_use[DecodeReg(ARM64Reg::W2)] = 0; - regs_in_use[DecodeReg(ARM64Reg::W30)] = 0; - regs_in_use[DecodeReg(RS)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(ARM64Reg::W30)] = true; + scratch_gprs[DecodeReg(RS)] = true; MMIOWriteRegToAddr(m_system, m_system.GetMemory().GetMMIOMapping(), this, &m_float_emit, - regs_in_use, fprs_in_use, RS, mmio_address, flags); + gpr.GetCallerSavedUsed() & ~scratch_gprs, + fpr.GetCallerSavedUsed() & ~scratch_fprs, RS, mmio_address, flags); addr_reg_set = false; } else { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::Auto, RS, XA, regs_in_use, fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, RS, XA, scratch_gprs, scratch_fprs); } if (update && !early_update) @@ -592,16 +595,16 @@ void JitArm64::lmw(UGeckoInstruction inst) else if (i != d) ADDI2R(addr_reg, addr_base_reg, (i - d) * 4); - BitSet32 regs_in_use = gpr.GetCallerSavedUsed(); - BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); - regs_in_use[DecodeReg(addr_reg)] = 0; + BitSet32 scratch_gprs; + BitSet32 scratch_fprs; + scratch_gprs[DecodeReg(addr_reg)] = true; if (jo.memcheck || !jo.fastmem) - regs_in_use[DecodeReg(ARM64Reg::W0)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; if (!jo.memcheck) - regs_in_use[DecodeReg(dest_reg)] = 0; + scratch_gprs[DecodeReg(dest_reg)] = true; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, EncodeRegTo64(addr_reg), regs_in_use, - fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, EncodeRegTo64(addr_reg), + scratch_gprs, scratch_fprs); gpr.BindToRegister(i, false, true); ASSERT(dest_reg == gpr.R(i)); @@ -710,15 +713,15 @@ void JitArm64::stmw(UGeckoInstruction inst) else if (i != s) ADDI2R(addr_reg, addr_base_reg, (i - s) * 4); - BitSet32 regs_in_use = gpr.GetCallerSavedUsed(); - BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); - regs_in_use[DecodeReg(ARM64Reg::W1)] = 0; - regs_in_use[DecodeReg(addr_reg)] = 0; + BitSet32 scratch_gprs; + BitSet32 scratch_fprs; + scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (!jo.fastmem) - regs_in_use[DecodeReg(ARM64Reg::W0)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, EncodeRegTo64(addr_reg), regs_in_use, - fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, EncodeRegTo64(addr_reg), scratch_gprs, + scratch_fprs); // To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores // after this instruction, flush registers that would be flushed after this instruction anyway. 
@@ -1044,14 +1047,14 @@ void JitArm64::dcbz(UGeckoInstruction inst) } } - BitSet32 gprs_to_push = gpr.GetCallerSavedUsed(); - BitSet32 fprs_to_push = fpr.GetCallerSavedUsed(); - gprs_to_push[DecodeReg(ARM64Reg::W1)] = 0; + BitSet32 scratch_gprs; + BitSet32 scratch_fprs; + scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (!jo.fastmem) - gprs_to_push[DecodeReg(ARM64Reg::W0)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, MemAccessMode::Auto, ARM64Reg::W1, - EncodeRegTo64(addr_reg), gprs_to_push, fprs_to_push); + EncodeRegTo64(addr_reg), scratch_gprs, scratch_fprs); if (using_dcbz_hack) SetJumpTarget(end_dcbz_hack); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index 6bac5dc6562d..28e89aeff434 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -164,23 +164,24 @@ void JitArm64::lfXX(UGeckoInstruction inst) MOV(gpr.R(a), addr_reg); } - BitSet32 regs_in_use = gpr.GetCallerSavedUsed(); - BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); + BitSet32 scratch_gprs; + BitSet32 scratch_fprs; if (!update || early_update) - regs_in_use[DecodeReg(ARM64Reg::W1)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (jo.memcheck || !jo.fastmem) - regs_in_use[DecodeReg(ARM64Reg::W0)] = 0; - fprs_in_use[DecodeReg(ARM64Reg::Q0)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; + scratch_fprs[DecodeReg(ARM64Reg::Q0)] = true; if (!jo.memcheck) - fprs_in_use[DecodeReg(VD)] = 0; + scratch_fprs[DecodeReg(VD)] = true; if (is_immediate && m_mmu.IsOptimizableRAMAddress(imm_addr, BackPatchInfo::GetFlagSize(flags))) { - EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, VD, XA, regs_in_use, fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, VD, XA, scratch_gprs, + scratch_fprs); } else { - EmitBackpatchRoutine(flags, MemAccessMode::Auto, VD, XA, regs_in_use, fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, VD, XA, scratch_gprs, scratch_fprs); } const ARM64Reg VD_again = fpr.RW(inst.FD, type, true); @@ -367,14 +368,14 @@ void JitArm64::stfXX(UGeckoInstruction inst) MOV(gpr.R(a), addr_reg); } - BitSet32 regs_in_use = gpr.GetCallerSavedUsed(); - BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); - regs_in_use[DecodeReg(ARM64Reg::W1)] = 0; + BitSet32 scratch_gprs; + BitSet32 scratch_fprs; + scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (!update || early_update) - regs_in_use[DecodeReg(ARM64Reg::W2)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; if (!jo.fastmem) - regs_in_use[DecodeReg(ARM64Reg::W0)] = 0; - fprs_in_use[DecodeReg(ARM64Reg::Q0)] = 0; + scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; + scratch_fprs[DecodeReg(ARM64Reg::Q0)] = true; if (is_immediate) { @@ -402,20 +403,20 @@ void JitArm64::stfXX(UGeckoInstruction inst) else if (m_mmu.IsOptimizableRAMAddress(imm_addr, BackPatchInfo::GetFlagSize(flags))) { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, V0, XA, regs_in_use, - fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, V0, XA, scratch_gprs, + scratch_fprs); } else { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::AlwaysSlowAccess, V0, XA, regs_in_use, - fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::AlwaysSlowAccess, V0, XA, scratch_gprs, + scratch_fprs); } } else { set_addr_reg_if_needed(); - 
EmitBackpatchRoutine(flags, MemAccessMode::Auto, V0, XA, regs_in_use, fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, V0, XA, scratch_gprs, scratch_fprs); } if (update && !early_update) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index eb8b4d015c4f..57970646f7b1 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -79,24 +79,23 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) if (js.assumeNoPairedQuantize) { - BitSet32 gprs_in_use = gpr.GetCallerSavedUsed(); - BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); + BitSet32 scratch_gprs; + BitSet32 scratch_fprs; - // Wipe the registers we are using as temporaries if (!update || early_update) - gprs_in_use[DecodeReg(ARM64Reg::W1)] = false; + scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (jo.memcheck || !jo.fastmem) - gprs_in_use[DecodeReg(ARM64Reg::W0)] = false; - fprs_in_use[DecodeReg(ARM64Reg::Q0)] = false; + scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; + scratch_fprs[DecodeReg(ARM64Reg::Q0)] = true; if (!jo.memcheck) - fprs_in_use[DecodeReg(VS)] = 0; + scratch_fprs[DecodeReg(VS)] = true; u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; if (!w) flags |= BackPatchInfo::FLAG_PAIR; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, VS, EncodeRegTo64(addr_reg), gprs_in_use, - fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, VS, EncodeRegTo64(addr_reg), scratch_gprs, + scratch_fprs); } else { @@ -239,22 +238,21 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) if (js.assumeNoPairedQuantize) { - BitSet32 gprs_in_use = gpr.GetCallerSavedUsed(); - BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); + BitSet32 scratch_gprs; + BitSet32 scratch_fprs; - // Wipe the registers we are using as temporaries - gprs_in_use[DecodeReg(ARM64Reg::W1)] = false; + scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (!update || early_update) - gprs_in_use[DecodeReg(ARM64Reg::W2)] = false; + scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; if (!jo.fastmem) - gprs_in_use[DecodeReg(ARM64Reg::W0)] = false; + scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; if (!w) flags |= BackPatchInfo::FLAG_PAIR; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, VS, EncodeRegTo64(addr_reg), gprs_in_use, - fprs_in_use); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, VS, EncodeRegTo64(addr_reg), scratch_gprs, + scratch_fprs); } else { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 56c26739a398..aefec65cada0 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -526,10 +526,10 @@ void JitArm64::GenerateQuantizedLoads() ARM64Reg temp_reg = ARM64Reg::X0; ARM64Reg addr_reg = ARM64Reg::X1; ARM64Reg scale_reg = ARM64Reg::X2; - BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 3}; + BitSet32 scratch_gprs{0, 3}; if (!jo.memcheck) - gprs_to_push &= ~BitSet32{1}; - BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1}; + scratch_gprs[1] = true; + BitSet32 scratch_fprs{0, 1}; ARM64FloatEmitter float_emit(this); const u8* start = GetCodePtr(); @@ -541,7 +541,7 @@ void JitArm64::GenerateQuantizedLoads() BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32; EmitBackpatchRoutine(flags, MemAccessMode::Auto, 
ARM64Reg::D0, addr_reg, - gprs_to_push & ~BitSet32{DecodeReg(scale_reg)}, fprs_to_push, true); + scratch_gprs | BitSet32{DecodeReg(scale_reg)}, scratch_fprs, true); RET(ARM64Reg::X30); } @@ -550,8 +550,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); @@ -568,8 +568,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); @@ -586,8 +586,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -603,8 +603,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -622,7 +622,7 @@ void JitArm64::GenerateQuantizedLoads() BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, - gprs_to_push & ~BitSet32{DecodeReg(scale_reg)}, fprs_to_push, true); + scratch_gprs | BitSet32{DecodeReg(scale_reg)}, scratch_fprs, true); RET(ARM64Reg::X30); } @@ -631,8 +631,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); @@ -649,8 +649,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0); 
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); @@ -667,8 +667,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -684,8 +684,8 @@ void JitArm64::GenerateQuantizedLoads() constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); @@ -736,12 +736,12 @@ void JitArm64::GenerateQuantizedStores() ARM64Reg temp_reg = ARM64Reg::X0; ARM64Reg scale_reg = ARM64Reg::X1; ARM64Reg addr_reg = ARM64Reg::X2; - BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 1}; + BitSet32 scratch_gprs{0, 1}; if (!jo.memcheck) - gprs_to_push &= ~BitSet32{2}; + scratch_gprs[2] = true; if (!jo.fastmem) - gprs_to_push &= ~BitSet32{3}; - BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1}; + scratch_gprs[3] = true; + BitSet32 scratch_fprs{0, 1}; ARM64FloatEmitter float_emit(this); const u8* start = GetCodePtr(); @@ -752,8 +752,8 @@ void JitArm64::GenerateQuantizedStores() constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); RET(ARM64Reg::X30); } @@ -771,8 +771,8 @@ void JitArm64::GenerateQuantizedStores() constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); RET(ARM64Reg::X30); } @@ -790,8 +790,8 @@ void JitArm64::GenerateQuantizedStores() constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); RET(ARM64Reg::X30); } @@ -808,8 +808,8 @@ void JitArm64::GenerateQuantizedStores() constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); RET(ARM64Reg::X30); } @@ -826,8 +826,8 @@ void JitArm64::GenerateQuantizedStores() constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_PAIR | 
BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); RET(ARM64Reg::X30); } @@ -837,8 +837,8 @@ void JitArm64::GenerateQuantizedStores() constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); RET(ARM64Reg::X30); } @@ -856,8 +856,8 @@ void JitArm64::GenerateQuantizedStores() constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); RET(ARM64Reg::X30); } @@ -875,8 +875,8 @@ void JitArm64::GenerateQuantizedStores() constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); RET(ARM64Reg::X30); } @@ -893,8 +893,8 @@ void JitArm64::GenerateQuantizedStores() constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); RET(ARM64Reg::X30); } @@ -911,8 +911,8 @@ void JitArm64::GenerateQuantizedStores() constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, gprs_to_push, - fprs_to_push, true); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, ARM64Reg::D0, addr_reg, scratch_gprs, + scratch_fprs, true); RET(ARM64Reg::X30); } From 527ad0b99b02cee7c51cf17c0e75e5266e064625 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 28 Dec 2024 19:34:31 +0100 Subject: [PATCH 2/8] JitArm64: Allocate scratch registers inside EmitBackpatchRoutine This cuts down on how much callers have to think about what registers EmitBackpatchRoutine is using. Also, by allocating registers dynamically instead of using a fixed set of registers, we improve codegen in cases where the fixed registers are taken but other registers are free. (These improvements don't apply to the emitting_routine == true case, where everything still works like before by necessity.) 
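To illustrate the approach, here is a minimal sketch of the allocation helper this commit introduces (simplified; the FPR case, which re-encodes the chosen register as a quad register, is omitted). It prefers a register the caller has already marked as scratch and only falls back to the register cache, which may have to spill something, when no free candidate is left:

    // Simplified sketch of the allocate_temp_reg helper added in this commit.
    const auto allocate_temp_reg = [&](Arm64RegCache& reg_cache,
                                       BitSet32& candidates) -> Arm64RegCache::ScopedARM64Reg {
      for (int i : candidates)
      {
        // A caller-provided scratch register can be reused at no cost.
        candidates[i] = false;
        return Arm64RegCache::ScopedARM64Reg(ARM64Reg(i));
      }
      // No free candidate left: let the register cache choose (and possibly evict) one.
      return reg_cache.GetScopedReg();
    };

In the emitting_routine == true case the code keeps passing the same fixed registers as before and bypasses this helper.
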
--- Source/Core/Core/PowerPC/JitArm64/Jit.h | 37 ++-- .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 164 +++++++++++++++--- 2 files changed, 165 insertions(+), 36 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index f037111a4479..d081271f7d46 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -252,29 +252,40 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA // // Registers used: // - // addr scratch - // Store: X2 X1 + // addr + // Store: X2 // Load: X1 - // Zero 256: X1 X30 - // Store float: X2 Q0 + // Zero 256: X1 + // Store float: X2 // Load float: X1 // // If mode == AlwaysFastAccess, the addr argument can be any register. // Otherwise it must be the register listed in the table above. // - // Additional scratch registers are used in the following situations: + // This routine allocates most scratch registers dynamically, but in the following + // situations, specific scratch registers have to be allocated in advance: // - // emitting_routine && mode == Auto: X0 - // emitting_routine && mode == Auto && !(flags & BackPatchInfo::FLAG_STORE): X3 - // emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X3 - // mode != AlwaysSlowAccess && !jo.fastmem: X0 - // !emitting_routine && mode != AlwaysFastAccess && jo.memcheck && - // (flags & BackPatchInfo::FLAG_LOAD): X0 - // !emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X30 - // !emitting_routine && mode == Auto && jo.fastmem: X30 + // emitting_routine && mode == Auto: X0 + // emitting_routine && mode == Auto && (flags & BackPatchInfo::FLAG_STORE): X1 + // emitting_routine && mode == Auto && !(flags & BackPatchInfo::FLAG_STORE): X3 + // emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X3 + // emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X0 + // emitting_routine && mode != AlwaysSlowAccess && + // (flags & BackPatchInfo::FLAG_STORE) && !(flags & BackPatchInfo::FLAG_FLOAT): X1 + // emitting_routine && mode != AlwaysSlowAccess && + // (flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT): Q0 + // emitting_routine && mode != AlwaysSlowAccess && + // (flags & BackPatchInfo::FLAG_ZERO_256): X30 + // !emitting_routine && mode == Auto && jo.fastmem: X30 // // If there are any other registers that the caller doesn't mind being overwritten, // these can be indicated in scratch_gprs and scratch_fprs. 
+ // + // In the following situations, certain host registers must not contain guest registers: + // + // !emitting_routine && mode != AlwaysFastAccess && jo.memcheck: X30 + // !emitting_routine && mode != AlwaysFastAccess && jo.memcheck && + // (flags & BackPatchInfo::FLAG_LOAD): X0 void EmitBackpatchRoutine(u32 flags, MemAccessMode mode, Arm64Gen::ARM64Reg RS, Arm64Gen::ARM64Reg addr, BitSet32 scratch_gprs = BitSet32(0), BitSet32 scratch_fprs = BitSet32(0), bool emitting_routine = false); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index 059be57048f3..ad11385eddc3 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -65,11 +65,140 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, const bool emit_fast_access = mode != MemAccessMode::AlwaysSlowAccess; const bool emit_slow_access = mode != MemAccessMode::AlwaysFastAccess; - const BitSet32 gprs_to_push = + const bool memcheck = jo.memcheck && !emitting_routine; + + BitSet32 temp_gpr_candidates = scratch_gprs; + BitSet32 temp_fpr_candidates = scratch_fprs; + temp_gpr_candidates[DecodeReg(addr)] = false; + if (flags & BackPatchInfo::FLAG_FLOAT) + temp_fpr_candidates[DecodeReg(RS)] = false; + else if (!(flags & BackPatchInfo::FLAG_ZERO_256)) + temp_gpr_candidates[DecodeReg(RS)] = false; + if (!emitting_routine && mode == MemAccessMode::Auto && jo.fastmem) + temp_gpr_candidates[30] = true; + + const auto allocate_temp_reg = [this](Arm64RegCache& reg_cache, + BitSet32& candidates) -> Arm64RegCache::ScopedARM64Reg { + for (int i : candidates) + { + candidates[i] = false; + ARM64Reg reg = ARM64Reg(i); + if (®_cache == &fpr) + reg = EncodeRegToQuad(reg); + return reg; + } + return reg_cache.GetScopedReg(); + }; + + const auto can_allocate_temp_reg_for_free = [](Arm64RegCache& reg_cache, BitSet32& candidates) { + return candidates != BitSet32{} || reg_cache.GetUnlockedRegisterCount() > 0; + }; + + Arm64RegCache::ScopedARM64Reg temp_gpr_1; + Arm64RegCache::ScopedARM64Reg temp_gpr_2; + Arm64RegCache::ScopedARM64Reg temp_gpr_3; + Arm64RegCache::ScopedARM64Reg temp_fpr_1; + + if (emit_fast_access) + { + if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT)) + { + temp_fpr_1 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::Q0) : + allocate_temp_reg(fpr, temp_fpr_candidates); + scratch_fprs[DecodeReg(temp_fpr_1)] = true; + } + else if (flags & BackPatchInfo::FLAG_STORE) + { + temp_gpr_1 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W1) : + allocate_temp_reg(gpr, temp_gpr_candidates); + scratch_gprs[DecodeReg(temp_gpr_1)] = true; + } + else if (flags & BackPatchInfo::FLAG_ZERO_256) + { + temp_gpr_1 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W30) : + allocate_temp_reg(gpr, temp_gpr_candidates); + scratch_gprs[DecodeReg(temp_gpr_1)] = true; + } + + if (!jo.fastmem) + { + temp_gpr_2 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W0) : + allocate_temp_reg(gpr, temp_gpr_candidates); + temp_gpr_3 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W3) : + allocate_temp_reg(gpr, temp_gpr_candidates); + scratch_gprs[DecodeReg(temp_gpr_2)] = true; + scratch_gprs[DecodeReg(temp_gpr_3)] = true; + } + else if (emit_slow_access && emitting_routine) + { + temp_gpr_2 = ARM64Reg::W0; + temp_gpr_3 = flags & BackPatchInfo::FLAG_STORE ? 
ARM64Reg::W1 : ARM64Reg::W3; + scratch_gprs[DecodeReg(temp_gpr_2)] = true; + scratch_gprs[DecodeReg(temp_gpr_3)] = true; + } + } + + // Setting memcheck_temp_gpr to W30 works, but because W30 is a register that needs to be pushed + // and popped, using W30 may require us to emit an extra push and pop instruction, depending on + // what other registers need pushing and popping. If we can find another register to use without + // having to evict anything from the register cache, let's do that instead of using W30. + ARM64Reg memcheck_temp_gpr = ARM64Reg::W30; + if (emit_slow_access && memcheck) + { + const auto is_suitable_as_memcheck_temp_gpr = [flags](ARM64Reg reg) { + return reg != ARM64Reg::INVALID_REG && reg != ARM64Reg::W30 && + (reg != ARM64Reg::W0 || !(flags & BackPatchInfo::FLAG_LOAD)); + }; + + const auto get_unset_temp_gpr = [&]() -> Arm64RegCache::ScopedARM64Reg& { + if (temp_gpr_1 == ARM64Reg::INVALID_REG) + return temp_gpr_1; + if (temp_gpr_2 == ARM64Reg::INVALID_REG) + return temp_gpr_2; + ASSERT(temp_gpr_3 == ARM64Reg::INVALID_REG); + return temp_gpr_3; + }; + + if (is_suitable_as_memcheck_temp_gpr(temp_gpr_1)) + { + memcheck_temp_gpr = temp_gpr_1; + } + else if (is_suitable_as_memcheck_temp_gpr(temp_gpr_2)) + { + memcheck_temp_gpr = temp_gpr_2; + } + else if (is_suitable_as_memcheck_temp_gpr(temp_gpr_3)) + { + memcheck_temp_gpr = temp_gpr_3; + } + else + { + while (can_allocate_temp_reg_for_free(gpr, temp_gpr_candidates)) + { + Arm64RegCache::ScopedARM64Reg& temp_gpr_x = get_unset_temp_gpr(); + temp_gpr_x = allocate_temp_reg(gpr, temp_gpr_candidates); + scratch_gprs[DecodeReg(temp_gpr_x)] = true; + if (is_suitable_as_memcheck_temp_gpr(temp_gpr_x)) + break; + } + } + + if (temp_fpr_1 == ARM64Reg::INVALID_REG && + can_allocate_temp_reg_for_free(fpr, temp_fpr_candidates)) + { + temp_fpr_1 = allocate_temp_reg(fpr, temp_fpr_candidates); + scratch_fprs[DecodeReg(temp_fpr_1)] = true; + } + } + + BitSet32 gprs_to_push = (emitting_routine ? CALLER_SAVED_GPRS : gpr.GetCallerSavedUsed()) & ~scratch_gprs; - const BitSet32 fprs_to_push = + BitSet32 fprs_to_push = (emitting_routine ? BitSet32(0xFFFFFFFF) : fpr.GetCallerSavedUsed()) & ~scratch_fprs; + if (!emitting_routine && mode == MemAccessMode::Auto && jo.fastmem) + gprs_to_push[30] = true; + bool in_far_code = false; const u8* fast_access_start = GetCodePtr(); std::optional slow_access_fixup; @@ -81,13 +210,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, if (!jo.fastmem) { - const ARM64Reg temp = emitting_routine ? ARM64Reg::W3 : ARM64Reg::W30; - - memory_base = EncodeRegTo64(temp); - memory_offset = ARM64Reg::W0; + memory_base = EncodeRegTo64(temp_gpr_3); + memory_offset = temp_gpr_2; - LSR(temp, addr, PowerPC::BAT_INDEX_SHIFT); - LDR(memory_base, MEM_REG, ArithOption(temp, true)); + LSR(temp_gpr_3, addr, PowerPC::BAT_INDEX_SHIFT); + LDR(memory_base, MEM_REG, ArithOption(temp_gpr_3, true)); if (emit_slow_access) { @@ -100,15 +227,12 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, } else if (emit_slow_access && emitting_routine) { - const ARM64Reg temp1 = flags & BackPatchInfo::FLAG_STORE ? 
ARM64Reg::W1 : ARM64Reg::W3; - const ARM64Reg temp2 = ARM64Reg::W0; - - slow_access_fixup = CheckIfSafeAddress(addr, temp1, temp2); + slow_access_fixup = CheckIfSafeAddress(addr, temp_gpr_3, temp_gpr_2); } if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT)) { - ARM64Reg temp = ARM64Reg::D0; + ARM64Reg temp = EncodeRegToDouble(temp_fpr_1); temp = ByteswapBeforeStore(this, &m_float_emit, temp, EncodeRegToDouble(RS), flags, true); m_float_emit.STR(access_size, temp, memory_base, memory_offset); @@ -122,7 +246,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, } else if (flags & BackPatchInfo::FLAG_STORE) { - ARM64Reg temp = ARM64Reg::W1; + ARM64Reg temp = temp_gpr_1; temp = ByteswapBeforeStore(this, &m_float_emit, temp, RS, flags, true); if (flags & BackPatchInfo::FLAG_SIZE_32) @@ -135,7 +259,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, else if (flags & BackPatchInfo::FLAG_ZERO_256) { // This literally only stores 32bytes of zeros to the target address - ARM64Reg temp = ARM64Reg::X30; + ARM64Reg temp = EncodeRegTo64(temp_gpr_1); ADD(temp, memory_base, memory_offset); STP(IndexType::Signed, ARM64Reg::ZR, ARM64Reg::ZR, temp, 0); STP(IndexType::Signed, ARM64Reg::ZR, ARM64Reg::ZR, temp, 16); @@ -156,8 +280,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, if (emit_slow_access) { - const bool memcheck = jo.memcheck && !emitting_routine; - if (emit_fast_access) { in_far_code = true; @@ -174,12 +296,9 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, if (slow_access_fixup) SetJumpTarget(*slow_access_fixup); - const ARM64Reg temp_gpr = ARM64Reg::W1; - const int temp_gpr_index = DecodeReg(temp_gpr); - BitSet32 gprs_to_push_early = {}; if (memcheck) - gprs_to_push_early[temp_gpr_index] = true; + gprs_to_push_early[DecodeReg(memcheck_temp_gpr)] = true; if (flags & BackPatchInfo::FLAG_LOAD) gprs_to_push_early[0] = true; @@ -270,11 +389,10 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, if (memcheck) { - const ARM64Reg temp_fpr = fprs_to_push[0] ? 
ARM64Reg::INVALID_REG : ARM64Reg::Q0; const u64 early_push_count = (gprs_to_push & gprs_to_push_early).Count(); const u64 early_push_size = Common::AlignUp(early_push_count, 2) * 8; - WriteConditionalExceptionExit(EXCEPTION_DSI, temp_gpr, temp_fpr, early_push_size); + WriteConditionalExceptionExit(EXCEPTION_DSI, memcheck_temp_gpr, temp_fpr_1, early_push_size); } if (flags & BackPatchInfo::FLAG_LOAD) From 53770f4abec65c1bc20bc40d534934700f435aae Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 29 Dec 2024 17:07:00 +0100 Subject: [PATCH 3/8] JitArm64: Remove now unnecessary locking of temp registers --- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 45 +++++-------------- .../JitArm64/JitArm64_LoadStoreFloating.cpp | 24 +++------- .../JitArm64/JitArm64_LoadStorePaired.cpp | 45 +++++++++---------- 3 files changed, 36 insertions(+), 78 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 7462cdd52977..97fa2d4354e2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -31,7 +31,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o { // We want to make sure to not get LR as a temp register gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); - if (jo.memcheck || !jo.fastmem) + if (jo.memcheck) gpr.Lock(ARM64Reg::W0); gpr.BindToRegister(dest, dest == (u32)addr || dest == (u32)offsetReg, false); @@ -127,7 +127,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o BitSet32 scratch_fprs; if (!update || early_update) scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; - if (jo.memcheck || !jo.fastmem) + if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; if (!jo.memcheck) scratch_gprs[DecodeReg(dest_reg)] = true; @@ -170,7 +170,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o } gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); - if (jo.memcheck || !jo.fastmem) + if (jo.memcheck) gpr.Unlock(ARM64Reg::W0); } @@ -178,9 +178,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s bool update) { // We want to make sure to not get LR as a temp register - gpr.Lock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); - if (!jo.fastmem) - gpr.Lock(ARM64Reg::W0); + gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); // Don't materialize zero. ARM64Reg RS = gpr.IsImm(value, 0) ? 
ARM64Reg::WZR : gpr.R(value); @@ -274,11 +272,8 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s BitSet32 scratch_gprs; BitSet32 scratch_fprs; - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (!update || early_update) scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; - if (!jo.fastmem) - scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; u32 access_size = BackPatchInfo::GetFlagSize(flags); u32 mmio_address = 0; @@ -319,7 +314,6 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s } else if (mmio_address) { - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; scratch_gprs[DecodeReg(ARM64Reg::W30)] = true; scratch_gprs[DecodeReg(RS)] = 0; @@ -341,9 +335,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s MOV(gpr.R(dest), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); - if (!jo.fastmem) - gpr.Unlock(ARM64Reg::W0); + gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); } FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM64Reg tmp, @@ -526,7 +518,7 @@ void JitArm64::lmw(UGeckoInstruction inst) s32 offset = inst.SIMM_16; gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); - if (jo.memcheck || !jo.fastmem) + if (jo.memcheck) gpr.Lock(ARM64Reg::W0); // MMU games make use of a >= d despite this being invalid according to the PEM. @@ -598,7 +590,7 @@ void JitArm64::lmw(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; scratch_gprs[DecodeReg(addr_reg)] = true; - if (jo.memcheck || !jo.fastmem) + if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; if (!jo.memcheck) scratch_gprs[DecodeReg(dest_reg)] = true; @@ -633,7 +625,7 @@ void JitArm64::lmw(UGeckoInstruction inst) } gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); - if (jo.memcheck || !jo.fastmem) + if (jo.memcheck) gpr.Unlock(ARM64Reg::W0); } @@ -645,9 +637,7 @@ void JitArm64::stmw(UGeckoInstruction inst) u32 a = inst.RA, s = inst.RS; s32 offset = inst.SIMM_16; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); - if (!jo.fastmem) - gpr.Lock(ARM64Reg::W0); + gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); ARM64Reg addr_reg = ARM64Reg::W2; bool a_is_addr_base_reg = false; @@ -715,10 +705,7 @@ void JitArm64::stmw(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; scratch_gprs[DecodeReg(addr_reg)] = true; - if (!jo.fastmem) - scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, EncodeRegTo64(addr_reg), scratch_gprs, scratch_fprs); @@ -753,9 +740,7 @@ void JitArm64::stmw(UGeckoInstruction inst) } } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); - if (!jo.fastmem) - gpr.Unlock(ARM64Reg::W0); + gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); } void JitArm64::dcbx(UGeckoInstruction inst) @@ -976,14 +961,8 @@ void JitArm64::dcbz(UGeckoInstruction inst) int a = inst.RA, b = inst.RB; gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); - if (!jo.fastmem) - gpr.Lock(ARM64Reg::W0); - Common::ScopeGuard register_guard([&] { - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); - if (!jo.fastmem) - gpr.Unlock(ARM64Reg::W0); - }); + Common::ScopeGuard register_guard([&] { gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); }); constexpr ARM64Reg addr_reg = ARM64Reg::W1; constexpr ARM64Reg temp_reg = ARM64Reg::W30; @@ -1050,8 +1029,6 @@ void JitArm64::dcbz(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; - if (!jo.fastmem) - 
scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, MemAccessMode::Auto, ARM64Reg::W1, EncodeRegTo64(addr_reg), scratch_gprs, scratch_fprs); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index 28e89aeff434..c53e5f0f028e 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -78,8 +78,7 @@ void JitArm64::lfXX(UGeckoInstruction inst) (flags & BackPatchInfo::FLAG_SIZE_64) != 0 ? RegType::LowerPair : RegType::DuplicatedSingle; gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); - fpr.Lock(ARM64Reg::Q0); - if (jo.memcheck || !jo.fastmem) + if (jo.memcheck) gpr.Lock(ARM64Reg::W0); const ARM64Reg VD = fpr.RW(inst.FD, type, false); @@ -168,9 +167,8 @@ void JitArm64::lfXX(UGeckoInstruction inst) BitSet32 scratch_fprs; if (!update || early_update) scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; - if (jo.memcheck || !jo.fastmem) + if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; - scratch_gprs[DecodeReg(ARM64Reg::Q0)] = true; if (!jo.memcheck) scratch_fprs[DecodeReg(VD)] = true; @@ -194,8 +192,7 @@ void JitArm64::lfXX(UGeckoInstruction inst) } gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); - fpr.Unlock(ARM64Reg::Q0); - if (jo.memcheck || !jo.fastmem) + if (jo.memcheck) gpr.Unlock(ARM64Reg::W0); } @@ -265,8 +262,6 @@ void JitArm64::stfXX(UGeckoInstruction inst) u32 imm_addr = 0; bool is_immediate = false; - fpr.Lock(ARM64Reg::Q0); - const bool have_single = fpr.IsSingle(inst.FS, true); Arm64FPRCache::ScopedARM64Reg V0 = @@ -279,9 +274,7 @@ void JitArm64::stfXX(UGeckoInstruction inst) V0 = std::move(single_reg); } - gpr.Lock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); - if (!jo.fastmem) - gpr.Lock(ARM64Reg::W0); + gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); ARM64Reg addr_reg = ARM64Reg::W2; @@ -370,12 +363,8 @@ void JitArm64::stfXX(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (!update || early_update) scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; - if (!jo.fastmem) - scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; - scratch_fprs[DecodeReg(ARM64Reg::Q0)] = true; if (is_immediate) { @@ -426,8 +415,5 @@ void JitArm64::stfXX(UGeckoInstruction inst) MOV(gpr.R(a), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); - fpr.Unlock(ARM64Reg::Q0); - if (!jo.fastmem) - gpr.Unlock(ARM64Reg::W0); + gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index 57970646f7b1..9a8f2a3ede68 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -39,13 +39,12 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) const int w = indexed ? 
inst.Wx : inst.W; gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); - fpr.Lock(ARM64Reg::Q0); if (!js.assumeNoPairedQuantize) { gpr.Lock(ARM64Reg::W0, ARM64Reg::W2, ARM64Reg::W3); - fpr.Lock(ARM64Reg::Q1); + fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1); } - else if (jo.memcheck || !jo.fastmem) + else if (jo.memcheck) { gpr.Lock(ARM64Reg::W0); } @@ -84,9 +83,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) if (!update || early_update) scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; - if (jo.memcheck || !jo.fastmem) + if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; - scratch_fprs[DecodeReg(ARM64Reg::Q0)] = true; if (!jo.memcheck) scratch_fprs[DecodeReg(VS)] = true; @@ -133,13 +131,12 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) } gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); - fpr.Unlock(ARM64Reg::Q0); if (!js.assumeNoPairedQuantize) { gpr.Unlock(ARM64Reg::W0, ARM64Reg::W2, ARM64Reg::W3); - fpr.Unlock(ARM64Reg::Q1); + fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); } - else if (jo.memcheck || !jo.fastmem) + else if (jo.memcheck) { gpr.Unlock(ARM64Reg::W0); } @@ -166,9 +163,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) const int i = indexed ? inst.Ix : inst.I; const int w = indexed ? inst.Wx : inst.W; - fpr.Lock(ARM64Reg::Q0); if (!js.assumeNoPairedQuantize) - fpr.Lock(ARM64Reg::Q1); + fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1); const bool have_single = fpr.IsSingle(inst.RS); @@ -204,11 +200,13 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) } } - gpr.Lock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); - if (!js.assumeNoPairedQuantize || !jo.fastmem) - gpr.Lock(ARM64Reg::W0); - if (!js.assumeNoPairedQuantize && !jo.fastmem) - gpr.Lock(ARM64Reg::W3); + gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + if (!js.assumeNoPairedQuantize) + { + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1); + if (!jo.fastmem) + gpr.Lock(ARM64Reg::W3); + } constexpr ARM64Reg type_reg = ARM64Reg::W0; constexpr ARM64Reg scale_reg = ARM64Reg::W1; @@ -241,11 +239,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (!update || early_update) scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; - if (!jo.fastmem) - scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; if (!w) @@ -278,12 +273,12 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) MOV(gpr.R(inst.RA), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); - fpr.Unlock(ARM64Reg::Q0); - if (!js.assumeNoPairedQuantize || !jo.fastmem) - gpr.Unlock(ARM64Reg::W0); - if (!js.assumeNoPairedQuantize && !jo.fastmem) - gpr.Unlock(ARM64Reg::W3); + gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); if (!js.assumeNoPairedQuantize) - fpr.Unlock(ARM64Reg::Q1); + { + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1); + if (!jo.fastmem) + gpr.Unlock(ARM64Reg::W3); + fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); + } } From 099b5d1afb7c982f732013a76c8de760a0c0c88d Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 29 Dec 2024 18:08:03 +0100 Subject: [PATCH 4/8] JitArm64: Automatically set whether RS is scratch When EmitBackpatchRoutine emits slow access code for a load, RS must not be in scratch_gprs/scratch_fprs if we have memchecks (otherwise the memcheck exit won't be able to save RS properly), and RS must be in scratch_gprs/scratch_fprs if we don't have memchecks (otherwise RS will be restored to the value it had before the load, overwriting the result of the load). 
Let's save callers from having to think about this by embedding the relevant logic inside EmitBackpatchRoutine. --- Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp | 8 ++++++++ Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp | 4 ---- .../Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp | 2 -- .../Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp | 2 -- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index ad11385eddc3..02299e0e2aee 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -67,6 +67,14 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, const bool memcheck = jo.memcheck && !emitting_routine; + if ((flags & BackPatchInfo::FLAG_LOAD)) + { + if ((flags & BackPatchInfo::FLAG_FLOAT)) + scratch_fprs[DecodeReg(RS)] = !memcheck; + else + scratch_gprs[DecodeReg(RS)] = !memcheck; + } + BitSet32 temp_gpr_candidates = scratch_gprs; BitSet32 temp_fpr_candidates = scratch_fprs; temp_gpr_candidates[DecodeReg(addr)] = false; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 97fa2d4354e2..6e85dc3d5580 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -129,8 +129,6 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; - if (!jo.memcheck) - scratch_gprs[DecodeReg(dest_reg)] = true; u32 access_size = BackPatchInfo::GetFlagSize(flags); u32 mmio_address = 0; @@ -592,8 +590,6 @@ void JitArm64::lmw(UGeckoInstruction inst) scratch_gprs[DecodeReg(addr_reg)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; - if (!jo.memcheck) - scratch_gprs[DecodeReg(dest_reg)] = true; EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, EncodeRegTo64(addr_reg), scratch_gprs, scratch_fprs); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index c53e5f0f028e..5deeaf37249e 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -169,8 +169,6 @@ void JitArm64::lfXX(UGeckoInstruction inst) scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; - if (!jo.memcheck) - scratch_fprs[DecodeReg(VD)] = true; if (is_immediate && m_mmu.IsOptimizableRAMAddress(imm_addr, BackPatchInfo::GetFlagSize(flags))) { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index 9a8f2a3ede68..b5731a215776 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -85,8 +85,6 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; - if (!jo.memcheck) - scratch_fprs[DecodeReg(VS)] = true; u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; if (!w) From d7f20fd5d51eaac6a8994d9283002f9932b092d9 Mon Sep 
17 00:00:00 2001 From: JosJuice Date: Sun, 29 Dec 2024 21:18:09 +0100 Subject: [PATCH 5/8] JitArm64: Stop encoding EmitBackpatchRoutine addr as 64-bit There's no reason why this register should be 64-bit. It always contains a 32-bit value, since guest addresses are 32-bit. --- Source/Core/Core/PowerPC/JitArm64/Jit.h | 10 +++---- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 28 ++++++++----------- .../JitArm64/JitArm64_LoadStoreFloating.cpp | 18 +++++------- .../JitArm64/JitArm64_LoadStorePaired.cpp | 6 ++-- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 4 +-- 5 files changed, 28 insertions(+), 38 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index d081271f7d46..eec712f01039 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -253,11 +253,11 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA // Registers used: // // addr - // Store: X2 - // Load: X1 - // Zero 256: X1 - // Store float: X2 - // Load float: X1 + // Store: W2 + // Load: W1 + // Zero 256: W1 + // Store float: W2 + // Load float: W1 // // If mode == AlwaysFastAccess, the addr argument can be any register. // Otherwise it must be the register listed in the table above. diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 6e85dc3d5580..2a2bdf05e269 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -107,12 +107,10 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o } } - ARM64Reg XA = EncodeRegTo64(addr_reg); - bool addr_reg_set = !is_immediate; const auto set_addr_reg_if_needed = [&] { if (!addr_reg_set) - MOVI2R(XA, imm_addr); + MOVI2R(addr_reg, imm_addr); }; const bool early_update = !jo.memcheck && dest != static_cast(addr); @@ -138,7 +136,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o if (is_immediate && m_mmu.IsOptimizableRAMAddress(imm_addr, access_size)) { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, dest_reg, XA, scratch_gprs, + EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, dest_reg, addr_reg, scratch_gprs, scratch_fprs); } else if (mmio_address) @@ -154,7 +152,8 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o else { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, XA, scratch_gprs, scratch_fprs); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, addr_reg, scratch_gprs, + scratch_fprs); } gpr.BindToRegister(dest, false, true); @@ -252,12 +251,10 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s } } - ARM64Reg XA = EncodeRegTo64(addr_reg); - bool addr_reg_set = !is_immediate; const auto set_addr_reg_if_needed = [&] { if (!addr_reg_set) - MOVI2R(XA, imm_addr); + MOVI2R(addr_reg, imm_addr); }; const bool early_update = !jo.memcheck && value != static_cast(dest); @@ -307,7 +304,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s else if (is_immediate && m_mmu.IsOptimizableRAMAddress(imm_addr, access_size)) { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, RS, XA, scratch_gprs, + EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, RS, addr_reg, scratch_gprs, scratch_fprs); } else if 
(mmio_address) @@ -323,7 +320,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s else { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::Auto, RS, XA, scratch_gprs, scratch_fprs); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, RS, addr_reg, scratch_gprs, scratch_fprs); } if (update && !early_update) @@ -591,8 +588,8 @@ void JitArm64::lmw(UGeckoInstruction inst) if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, EncodeRegTo64(addr_reg), - scratch_gprs, scratch_fprs); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, addr_reg, scratch_gprs, + scratch_fprs); gpr.BindToRegister(i, false, true); ASSERT(dest_reg == gpr.R(i)); @@ -703,8 +700,7 @@ void JitArm64::stmw(UGeckoInstruction inst) BitSet32 scratch_fprs; scratch_gprs[DecodeReg(addr_reg)] = true; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, EncodeRegTo64(addr_reg), scratch_gprs, - scratch_fprs); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, addr_reg, scratch_gprs, scratch_fprs); // To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores // after this instruction, flush registers that would be flushed after this instruction anyway. @@ -1026,8 +1022,8 @@ void JitArm64::dcbz(UGeckoInstruction inst) BitSet32 scratch_fprs; scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; - EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, MemAccessMode::Auto, ARM64Reg::W1, - EncodeRegTo64(addr_reg), scratch_gprs, scratch_fprs); + EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, MemAccessMode::Auto, ARM64Reg::W1, addr_reg, + scratch_gprs, scratch_fprs); if (using_dcbz_hack) SetJumpTarget(end_dcbz_hack); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index 5deeaf37249e..ff7d37df96a5 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -151,10 +151,8 @@ void JitArm64::lfXX(UGeckoInstruction inst) } } - ARM64Reg XA = EncodeRegTo64(addr_reg); - if (is_immediate) - MOVI2R(XA, imm_addr); + MOVI2R(addr_reg, imm_addr); const bool early_update = !jo.memcheck; if (update && early_update) @@ -172,12 +170,12 @@ void JitArm64::lfXX(UGeckoInstruction inst) if (is_immediate && m_mmu.IsOptimizableRAMAddress(imm_addr, BackPatchInfo::GetFlagSize(flags))) { - EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, VD, XA, scratch_gprs, + EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, VD, addr_reg, scratch_gprs, scratch_fprs); } else { - EmitBackpatchRoutine(flags, MemAccessMode::Auto, VD, XA, scratch_gprs, scratch_fprs); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, VD, addr_reg, scratch_gprs, scratch_fprs); } const ARM64Reg VD_again = fpr.RW(inst.FD, type, true); @@ -343,12 +341,10 @@ void JitArm64::stfXX(UGeckoInstruction inst) } } - ARM64Reg XA = EncodeRegTo64(addr_reg); - bool addr_reg_set = !is_immediate; const auto set_addr_reg_if_needed = [&] { if (!addr_reg_set) - MOVI2R(XA, imm_addr); + MOVI2R(addr_reg, imm_addr); }; const bool early_update = !jo.memcheck; @@ -390,20 +386,20 @@ void JitArm64::stfXX(UGeckoInstruction inst) else if (m_mmu.IsOptimizableRAMAddress(imm_addr, BackPatchInfo::GetFlagSize(flags))) { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::AlwaysFastAccess, V0, XA, scratch_gprs, + EmitBackpatchRoutine(flags, 
MemAccessMode::AlwaysFastAccess, V0, addr_reg, scratch_gprs, scratch_fprs); } else { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::AlwaysSlowAccess, V0, XA, scratch_gprs, + EmitBackpatchRoutine(flags, MemAccessMode::AlwaysSlowAccess, V0, addr_reg, scratch_gprs, scratch_fprs); } } else { set_addr_reg_if_needed(); - EmitBackpatchRoutine(flags, MemAccessMode::Auto, V0, XA, scratch_gprs, scratch_fprs); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, V0, addr_reg, scratch_gprs, scratch_fprs); } if (update && !early_update) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index b5731a215776..b225c83eab6d 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -90,8 +90,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) if (!w) flags |= BackPatchInfo::FLAG_PAIR; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, VS, EncodeRegTo64(addr_reg), scratch_gprs, - scratch_fprs); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, VS, addr_reg, scratch_gprs, scratch_fprs); } else { @@ -244,8 +243,7 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) if (!w) flags |= BackPatchInfo::FLAG_PAIR; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, VS, EncodeRegTo64(addr_reg), scratch_gprs, - scratch_fprs); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, VS, addr_reg, scratch_gprs, scratch_fprs); } else { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index aefec65cada0..bb28b9052b83 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -524,7 +524,7 @@ void JitArm64::GenerateQuantizedLoads() // Q0 is the return // Q1 is a temporary ARM64Reg temp_reg = ARM64Reg::X0; - ARM64Reg addr_reg = ARM64Reg::X1; + ARM64Reg addr_reg = ARM64Reg::W1; ARM64Reg scale_reg = ARM64Reg::X2; BitSet32 scratch_gprs{0, 3}; if (!jo.memcheck) @@ -735,7 +735,7 @@ void JitArm64::GenerateQuantizedStores() // Q1 is a temporary ARM64Reg temp_reg = ARM64Reg::X0; ARM64Reg scale_reg = ARM64Reg::X1; - ARM64Reg addr_reg = ARM64Reg::X2; + ARM64Reg addr_reg = ARM64Reg::W2; BitSet32 scratch_gprs{0, 1}; if (!jo.memcheck) scratch_gprs[2] = true; From c3b6e67fc7807f8935e7680e22db8ec022dde149 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 29 Dec 2024 21:00:44 +0100 Subject: [PATCH 6/8] JitArm64: Allow free choice of EmitBackpatchRoutine addr register --- Source/Core/Core/PowerPC/JitArm64/Jit.h | 7 +-- .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 44 +++++++++++++------ 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index eec712f01039..a0870be06c27 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -250,18 +250,15 @@ class JitArm64 : public JitBase, public Arm64Gen::ARM64CodeBlock, public CommonA // This is the core routine for accessing emulated memory, with support for // many different kinds of loads and stores as well as fastmem/backpatching. 
// - // Registers used: + // The addr parameter can be any register, but the code emitted for slow accesses + // will be slightly more efficient if the addr parameter is as follows: // - // addr // Store: W2 // Load: W1 // Zero 256: W1 // Store float: W2 // Load float: W1 // - // If mode == AlwaysFastAccess, the addr argument can be any register. - // Otherwise it must be the register listed in the table above. - // // This routine allocates most scratch registers dynamically, but in the following // situations, specific scratch registers have to be allocated in advance: // diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index 02299e0e2aee..3c11e83caa04 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -317,9 +317,18 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, if ((gprs_to_push & gprs_to_push_early).Count() & 1) gprs_to_push_early[30] = true; + // This temp GPR is only used when GPRs have been pushed, so we can choose almost any register + ARM64Reg temp_gpr_for_function_call = ARM64Reg::W8; + while (temp_gpr_for_function_call == addr || + (temp_gpr_for_function_call == RS && (flags & BackPatchInfo::FLAG_STORE))) + { + temp_gpr_for_function_call = + static_cast(static_cast(temp_gpr_for_function_call) + 1); + } + ABI_PushRegisters(gprs_to_push & gprs_to_push_early); ABI_PushRegisters(gprs_to_push & ~gprs_to_push_early); - m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30); + m_float_emit.ABI_PushRegisters(fprs_to_push, EncodeRegTo64(temp_gpr_for_function_call)); // PC is used by memory watchpoints (if enabled), profiling where to insert gather pipe // interrupt checks, and printing accurate PC locations in debug logs. @@ -328,14 +337,23 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, // so the caller has to store the PC themselves. if (!emitting_routine) { - MOVI2R(ARM64Reg::W30, js.compilerPC); - STR(IndexType::Unsigned, ARM64Reg::W30, PPC_REG, PPCSTATE_OFF(pc)); + MOVI2R(temp_gpr_for_function_call, js.compilerPC); + STR(IndexType::Unsigned, temp_gpr_for_function_call, PPC_REG, PPCSTATE_OFF(pc)); } if (flags & BackPatchInfo::FLAG_STORE) { ARM64Reg src_reg = RS; const ARM64Reg dst_reg = access_size == 64 ? ARM64Reg::X1 : ARM64Reg::W1; + ARM64Reg temp_addr_reg = addr; + if (addr == ARM64Reg::W1) + { + // If addr is W1, we must move the address to a different register so we don't + // overwrite it when moving RS to W1. W2 is the optimal register to move to, + // because that's the register the address needs to be in for the function call. + temp_addr_reg = RS != ARM64Reg::W2 ? ARM64Reg::W2 : temp_gpr_for_function_call; + MOV(temp_addr_reg, addr); + } if (flags & BackPatchInfo::FLAG_FLOAT) { @@ -359,40 +377,40 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS, if (access_size == 64) { ABI_CallFunction(reverse ? &PowerPC::WriteU64SwapFromJit : &PowerPC::WriteU64FromJit, - &m_mmu, src_reg, ARM64Reg::W2); + &m_mmu, src_reg, temp_addr_reg); } else if (access_size == 32) { ABI_CallFunction(reverse ? &PowerPC::WriteU32SwapFromJit : &PowerPC::WriteU32FromJit, - &m_mmu, src_reg, ARM64Reg::W2); + &m_mmu, src_reg, temp_addr_reg); } else if (access_size == 16) { ABI_CallFunction(reverse ? 
&PowerPC::WriteU16SwapFromJit : &PowerPC::WriteU16FromJit, - &m_mmu, src_reg, ARM64Reg::W2); + &m_mmu, src_reg, temp_addr_reg); } else { - ABI_CallFunction(&PowerPC::WriteU8FromJit, &m_mmu, src_reg, ARM64Reg::W2); + ABI_CallFunction(&PowerPC::WriteU8FromJit, &m_mmu, src_reg, addr); } } else if (flags & BackPatchInfo::FLAG_ZERO_256) { - ABI_CallFunction(&PowerPC::ClearDCacheLineFromJit, &m_mmu, ARM64Reg::W1); + ABI_CallFunction(&PowerPC::ClearDCacheLineFromJit, &m_mmu, addr); } else { if (access_size == 64) - ABI_CallFunction(&PowerPC::ReadU64FromJit, &m_mmu, ARM64Reg::W1); + ABI_CallFunction(&PowerPC::ReadU64FromJit, &m_mmu, addr); else if (access_size == 32) - ABI_CallFunction(&PowerPC::ReadU32FromJit, &m_mmu, ARM64Reg::W1); + ABI_CallFunction(&PowerPC::ReadU32FromJit, &m_mmu, addr); else if (access_size == 16) - ABI_CallFunction(&PowerPC::ReadU16FromJit, &m_mmu, ARM64Reg::W1); + ABI_CallFunction(&PowerPC::ReadU16FromJit, &m_mmu, addr); else - ABI_CallFunction(&PowerPC::ReadU8FromJit, &m_mmu, ARM64Reg::W1); + ABI_CallFunction(&PowerPC::ReadU8FromJit, &m_mmu, addr); } - m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30); + m_float_emit.ABI_PopRegisters(fprs_to_push, EncodeRegTo64(temp_gpr_for_function_call)); ABI_PopRegisters(gprs_to_push & ~gprs_to_push_early); if (memcheck) From c88c6f5f18b291428410653412356b17467e67de Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 29 Dec 2024 21:28:59 +0100 Subject: [PATCH 7/8] JitArm64: Get rid of one MOV from lmw/stmw This is possible now that EmitBackpatchRoutine lets the caller choose addr register. Note: The removed MOV is actually traded for one added MOV in farcode. --- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 81 +++++++++++++------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 2a2bdf05e269..3f872600c6ce 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -519,21 +519,26 @@ void JitArm64::lmw(UGeckoInstruction inst) // MMU games make use of a >= d despite this being invalid according to the PEM. // If a >= d occurs, we must make sure to not re-read rA after starting doing the loads. 
ARM64Reg addr_reg = ARM64Reg::W1; + Arm64RegCache::ScopedARM64Reg addr_base_reg; bool a_is_addr_base_reg = false; if (!a) - MOVI2R(addr_reg, offset); + { + addr_base_reg = gpr.GetScopedReg(); + MOVI2R(addr_base_reg, offset); + } else if (gpr.IsImm(a)) - MOVI2R(addr_reg, gpr.GetImm(a) + offset); + { + addr_base_reg = gpr.GetScopedReg(); + MOVI2R(addr_base_reg, gpr.GetImm(a) + offset); + } else if (a < d && offset + (31 - d) * 4 < 0x1000) + { a_is_addr_base_reg = true; + } else - ADDI2R(addr_reg, gpr.R(a), offset, addr_reg); - - Arm64RegCache::ScopedARM64Reg addr_base_reg; - if (!a_is_addr_base_reg) { addr_base_reg = gpr.GetScopedReg(); - MOV(addr_base_reg, addr_reg); + ADDI2R(addr_base_reg, gpr.R(a), offset, addr_base_reg); } BitSet32 gprs_to_discard{}; @@ -576,11 +581,23 @@ void JitArm64::lmw(UGeckoInstruction inst) { gpr.BindToRegister(i, false, false); ARM64Reg dest_reg = gpr.R(i); + ARM64Reg current_iteration_addr_reg = addr_reg; if (a_is_addr_base_reg) - ADDI2R(addr_reg, gpr.R(a), offset + (i - d) * 4); - else if (i != d) - ADDI2R(addr_reg, addr_base_reg, (i - d) * 4); + { + const u32 current_iteration_offset = offset + (i - d) * 4; + if (current_iteration_offset != 0) + ADDI2R(addr_reg, gpr.R(a), current_iteration_offset); + else + current_iteration_addr_reg = gpr.R(a); + } + else + { + if (i != d) + ADDI2R(addr_reg, addr_base_reg, (i - d) * 4); + else + current_iteration_addr_reg = addr_base_reg; + } BitSet32 scratch_gprs; BitSet32 scratch_fprs; @@ -588,8 +605,8 @@ void JitArm64::lmw(UGeckoInstruction inst) if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, addr_reg, scratch_gprs, - scratch_fprs); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, current_iteration_addr_reg, + scratch_gprs, scratch_fprs); gpr.BindToRegister(i, false, true); ASSERT(dest_reg == gpr.R(i)); @@ -633,21 +650,26 @@ void JitArm64::stmw(UGeckoInstruction inst) gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); ARM64Reg addr_reg = ARM64Reg::W2; + Arm64RegCache::ScopedARM64Reg addr_base_reg; bool a_is_addr_base_reg = false; if (!a) - MOVI2R(addr_reg, offset); + { + addr_base_reg = gpr.GetScopedReg(); + MOVI2R(addr_base_reg, offset); + } else if (gpr.IsImm(a)) - MOVI2R(addr_reg, gpr.GetImm(a) + offset); + { + addr_base_reg = gpr.GetScopedReg(); + MOVI2R(addr_base_reg, gpr.GetImm(a) + offset); + } else if (offset + (31 - s) * 4 < 0x1000) + { a_is_addr_base_reg = true; + } else - ADDI2R(addr_reg, gpr.R(a), offset, addr_reg); - - Arm64GPRCache::ScopedARM64Reg addr_base_reg; - if (!a_is_addr_base_reg) { addr_base_reg = gpr.GetScopedReg(); - MOV(addr_base_reg, addr_reg); + ADDI2R(addr_base_reg, gpr.R(a), offset, addr_base_reg); } BitSet32 gprs_to_discard{}; @@ -690,17 +712,30 @@ void JitArm64::stmw(UGeckoInstruction inst) for (u32 i = s; i < 32; i++) { ARM64Reg src_reg = gpr.R(i); + ARM64Reg current_iteration_addr_reg = addr_reg; if (a_is_addr_base_reg) - ADDI2R(addr_reg, gpr.R(a), offset + (i - s) * 4); - else if (i != s) - ADDI2R(addr_reg, addr_base_reg, (i - s) * 4); + { + const u32 current_iteration_offset = offset + (i - s) * 4; + if (current_iteration_offset != 0) + ADDI2R(addr_reg, gpr.R(a), current_iteration_offset); + else + current_iteration_addr_reg = gpr.R(a); + } + else + { + if (i != s) + ADDI2R(addr_reg, addr_base_reg, (i - s) * 4); + else + current_iteration_addr_reg = addr_base_reg; + } BitSet32 scratch_gprs; BitSet32 scratch_fprs; scratch_gprs[DecodeReg(addr_reg)] = true; - EmitBackpatchRoutine(flags, MemAccessMode::Auto, 
src_reg, addr_reg, scratch_gprs, scratch_fprs); + EmitBackpatchRoutine(flags, MemAccessMode::Auto, src_reg, current_iteration_addr_reg, + scratch_gprs, scratch_fprs); // To reduce register pressure and to avoid getting a pipeline-unfriendly long run of stores // after this instruction, flush registers that would be flushed after this instruction anyway. From 7417efe6006f8b01458e3d78184363276d6c0a65 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 29 Dec 2024 23:06:53 +0100 Subject: [PATCH 8/8] JitArm64: Use GetRegWithPreference for EmitBackpatchRoutine addr I'm adding a new function to the register cache called GetRegWithPreference. If the passed-in register is unlocked, it gets locked. Otherwise, GetReg is called. The function also has a GetScopedRegWithPreference variant. Then, I'm making JitArm64 call this function when allocating an address register for use with EmitBackpatchRoutine. This way, when register pressure is low we can use the optimal register, and when register pressure is high (but not completely full) we can sacrifice a bit of farcode size for not having to evict a register from the register cache. --- .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 43 ++++++++++--------- .../JitArm64/JitArm64_LoadStoreFloating.cpp | 16 +++---- .../JitArm64/JitArm64_LoadStorePaired.cpp | 26 +++++------ .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 25 +++++++++++ .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 11 ++++- 5 files changed, 78 insertions(+), 43 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 3f872600c6ce..44681cdf14e5 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -30,10 +30,12 @@ using namespace Arm64Gen; void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update) { // We want to make sure to not get LR as a temp register - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (jo.memcheck) gpr.Lock(ARM64Reg::W0); + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); + gpr.BindToRegister(dest, dest == (u32)addr || dest == (u32)offsetReg, false); ARM64Reg dest_reg = gpr.R(dest); ARM64Reg up_reg = ARM64Reg::INVALID_REG; @@ -45,7 +47,6 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o if (offsetReg != -1 && !gpr.IsImm(offsetReg)) off_reg = gpr.R(offsetReg); - ARM64Reg addr_reg = ARM64Reg::W1; u32 imm_addr = 0; bool is_immediate = false; @@ -124,7 +125,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; @@ -141,7 +142,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o } else if (mmio_address) { - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; scratch_gprs[DecodeReg(ARM64Reg::W30)] = true; scratch_gprs[DecodeReg(dest_reg)] = true; MMIOLoadToReg(m_system, m_system.GetMemory().GetMMIOMapping(), this, &m_float_emit, @@ -166,7 +167,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o MOV(gpr.R(addr), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (jo.memcheck) 
gpr.Unlock(ARM64Reg::W0); } @@ -175,7 +176,9 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s bool update) { // We want to make sure to not get LR as a temp register - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); + + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2); // Don't materialize zero. ARM64Reg RS = gpr.IsImm(value, 0) ? ARM64Reg::WZR : gpr.R(value); @@ -188,8 +191,6 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s if (dest != -1 && !gpr.IsImm(dest)) reg_dest = gpr.R(dest); - ARM64Reg addr_reg = ARM64Reg::W2; - u32 imm_addr = 0; bool is_immediate = false; @@ -268,7 +269,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; u32 access_size = BackPatchInfo::GetFlagSize(flags); u32 mmio_address = 0; @@ -309,7 +310,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s } else if (mmio_address) { - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; scratch_gprs[DecodeReg(ARM64Reg::W30)] = true; scratch_gprs[DecodeReg(RS)] = 0; MMIOWriteRegToAddr(m_system, m_system.GetMemory().GetMMIOMapping(), this, &m_float_emit, @@ -330,7 +331,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s MOV(gpr.R(dest), addr_reg); } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); } FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM64Reg tmp, @@ -512,13 +513,13 @@ void JitArm64::lmw(UGeckoInstruction inst) u32 a = inst.RA, d = inst.RD; s32 offset = inst.SIMM_16; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (jo.memcheck) gpr.Lock(ARM64Reg::W0); // MMU games make use of a >= d despite this being invalid according to the PEM. // If a >= d occurs, we must make sure to not re-read rA after starting doing the loads. 
- ARM64Reg addr_reg = ARM64Reg::W1; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); Arm64RegCache::ScopedARM64Reg addr_base_reg; bool a_is_addr_base_reg = false; if (!a) @@ -634,7 +635,7 @@ void JitArm64::lmw(UGeckoInstruction inst) } } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (jo.memcheck) gpr.Unlock(ARM64Reg::W0); } @@ -647,9 +648,9 @@ void JitArm64::stmw(UGeckoInstruction inst) u32 a = inst.RA, s = inst.RS; s32 offset = inst.SIMM_16; - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); - ARM64Reg addr_reg = ARM64Reg::W2; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2); Arm64RegCache::ScopedARM64Reg addr_base_reg; bool a_is_addr_base_reg = false; if (!a) @@ -767,7 +768,7 @@ void JitArm64::stmw(UGeckoInstruction inst) } } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); } void JitArm64::dcbx(UGeckoInstruction inst) @@ -987,11 +988,11 @@ void JitArm64::dcbz(UGeckoInstruction inst) int a = inst.RA, b = inst.RB; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); - Common::ScopeGuard register_guard([&] { gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); }); + Common::ScopeGuard register_guard([&] { gpr.Unlock(ARM64Reg::W30); }); - constexpr ARM64Reg addr_reg = ARM64Reg::W1; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); constexpr ARM64Reg temp_reg = ARM64Reg::W30; // HACK: Don't clear any memory in the [0x8000'0000, 0x8000'8000) region. @@ -1055,7 +1056,7 @@ void JitArm64::dcbz(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, MemAccessMode::Auto, ARM64Reg::W1, addr_reg, scratch_gprs, scratch_fprs); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index ff7d37df96a5..18e86a318533 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -77,12 +77,12 @@ void JitArm64::lfXX(UGeckoInstruction inst) const RegType type = (flags & BackPatchInfo::FLAG_SIZE_64) != 0 ? 
RegType::LowerPair : RegType::DuplicatedSingle; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (jo.memcheck) gpr.Lock(ARM64Reg::W0); + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); const ARM64Reg VD = fpr.RW(inst.FD, type, false); - ARM64Reg addr_reg = ARM64Reg::W1; if (update) { @@ -164,7 +164,7 @@ void JitArm64::lfXX(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; @@ -187,7 +187,7 @@ void JitArm64::lfXX(UGeckoInstruction inst) MOV(gpr.R(a), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (jo.memcheck) gpr.Unlock(ARM64Reg::W0); } @@ -270,9 +270,9 @@ void JitArm64::stfXX(UGeckoInstruction inst) V0 = std::move(single_reg); } - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); - ARM64Reg addr_reg = ARM64Reg::W2; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2); if (update) { @@ -358,7 +358,7 @@ void JitArm64::stfXX(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (is_immediate) { @@ -409,5 +409,5 @@ void JitArm64::stfXX(UGeckoInstruction inst) MOV(gpr.R(a), addr_reg); } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index b225c83eab6d..803d7a5dae67 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -38,10 +38,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) const int i = indexed ? inst.Ix : inst.I; const int w = indexed ? inst.Wx : inst.W; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Lock(ARM64Reg::W0, ARM64Reg::W2, ARM64Reg::W3); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3); fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1); } else if (jo.memcheck) @@ -50,7 +50,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) } constexpr ARM64Reg type_reg = ARM64Reg::W0; - constexpr ARM64Reg addr_reg = ARM64Reg::W1; + const auto addr_reg = js.assumeNoPairedQuantize ? 
gpr.GetScopedRegWithPreference(ARM64Reg::W1) : + Arm64RegCache::ScopedARM64Reg(ARM64Reg::W1); constexpr ARM64Reg scale_reg = ARM64Reg::W2; ARM64Reg VS = fpr.RW(inst.RS, RegType::Single, false); @@ -82,7 +83,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; @@ -127,10 +128,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) MOV(gpr.R(inst.RA), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Unlock(ARM64Reg::W0, ARM64Reg::W2, ARM64Reg::W3); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3); fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); } else if (jo.memcheck) @@ -197,17 +198,18 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) } } - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Lock(ARM64Reg::W0, ARM64Reg::W1); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2); if (!jo.fastmem) gpr.Lock(ARM64Reg::W3); } constexpr ARM64Reg type_reg = ARM64Reg::W0; constexpr ARM64Reg scale_reg = ARM64Reg::W1; - constexpr ARM64Reg addr_reg = ARM64Reg::W2; + const auto addr_reg = js.assumeNoPairedQuantize ? gpr.GetScopedRegWithPreference(ARM64Reg::W2) : + Arm64RegCache::ScopedARM64Reg(ARM64Reg::W2); if (inst.RA || update) // Always uses the register on update { @@ -237,7 +239,7 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; if (!w) @@ -269,10 +271,10 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) MOV(gpr.R(inst.RA), addr_reg); } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2); if (!jo.fastmem) gpr.Unlock(ARM64Reg::W3); fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index 465d0257afa4..60dde40c6175 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include "Common/Assert.h" @@ -63,6 +64,30 @@ ARM64Reg Arm64RegCache::GetReg() return ARM64Reg::INVALID_REG; } +ARM64Reg Arm64RegCache::GetRegWithPreference(Arm64Gen::ARM64Reg preferred) +{ + // In practice, the preferred register tends to be towards the end of m_host_registers, + // so we scan through m_host_registers backwards + for (auto& it : m_host_registers | std::views::reverse) + { + if (it.GetReg() == preferred) + { + if (it.IsLocked()) + { + return GetReg(); + } + else + { + it.Lock(); + return it.GetReg(); + } + } + } + ASSERT_MSG(DYNA_REC, false, "Preferred register {:#x} is not in register cache", + static_cast(preferred)); + return ARM64Reg::INVALID_REG; +} + void Arm64RegCache::UpdateLastUsed(BitSet32 regs_used) { for (size_t i = 0; i < m_guest_registers.size(); ++i) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index b98e17053186..72a2c0af5964 100644 --- 
a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -183,13 +183,16 @@ class Arm64RegCache // Returns a temporary register for use // Requires unlocking after done Arm64Gen::ARM64Reg GetReg(); + Arm64Gen::ARM64Reg GetRegWithPreference(Arm64Gen::ARM64Reg preferred); class ScopedARM64Reg { public: inline ScopedARM64Reg() = default; ScopedARM64Reg(const ScopedARM64Reg&) = delete; - explicit inline ScopedARM64Reg(Arm64RegCache& cache) : m_reg(cache.GetReg()), m_gpr(&cache) {} + inline ScopedARM64Reg(Arm64RegCache& cache, Arm64Gen::ARM64Reg reg) : m_reg(reg), m_gpr(&cache) + { + } inline ScopedARM64Reg(Arm64Gen::ARM64Reg reg) : m_reg(reg) {} inline ScopedARM64Reg(ScopedARM64Reg&& scoped_reg) { *this = std::move(scoped_reg); } inline ~ScopedARM64Reg() { Unlock(); } @@ -235,7 +238,11 @@ class Arm64RegCache // Returns a temporary register // Unlocking is implicitly handled through RAII - inline ScopedARM64Reg GetScopedReg() { return ScopedARM64Reg(*this); } + inline ScopedARM64Reg GetScopedReg() { return ScopedARM64Reg(*this, GetReg()); } + inline ScopedARM64Reg GetScopedRegWithPreference(Arm64Gen::ARM64Reg preferred) + { + return ScopedARM64Reg(*this, GetRegWithPreference(preferred)); + } void UpdateLastUsed(BitSet32 regs_used);
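
As a rough illustration of the caller-side pattern these patches aim for (this is only a
sketch, not part of the patches above; the register choices and the surrounding variables
such as flags, dest_reg, scratch_gprs and scratch_fprs are assumed to be set up as in
SafeLoadToReg), a load site that used to hard-lock W1 for the address can now express W1
as a preference only:

  // Before: W1 is pinned for the address even under register pressure,
  // which may force an eviction from the register cache.
  gpr.Lock(ARM64Reg::W1, ARM64Reg::W30);
  ARM64Reg addr_reg = ARM64Reg::W1;
  // ... compute the guest address into addr_reg ...
  EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, addr_reg, scratch_gprs,
                       scratch_fprs);
  gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30);

  // After: W1 is only a preference. If it is free, the slow-access call gets the
  // optimal register; if it is already locked, GetRegWithPreference falls back to
  // GetReg() and the slow-access code in farcode pays with an extra MOV instead of
  // the caller having to evict a cached guest register.
  gpr.Lock(ARM64Reg::W30);
  const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1);
  // ... compute the guest address into addr_reg ...
  EmitBackpatchRoutine(flags, MemAccessMode::Auto, dest_reg, addr_reg, scratch_gprs,
                       scratch_fprs);
  gpr.Unlock(ARM64Reg::W30);  // addr_reg releases itself via RAII

The trade-off is the one described in the last commit message: when register pressure is
low the preferred register is used and nothing changes, and when it is high a slightly
larger farcode sequence is accepted in exchange for not evicting anything from the
register cache.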