diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp index 3f872600c6ce..44681cdf14e5 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp @@ -30,10 +30,12 @@ using namespace Arm64Gen; void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update) { // We want to make sure to not get LR as a temp register - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (jo.memcheck) gpr.Lock(ARM64Reg::W0); + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); + gpr.BindToRegister(dest, dest == (u32)addr || dest == (u32)offsetReg, false); ARM64Reg dest_reg = gpr.R(dest); ARM64Reg up_reg = ARM64Reg::INVALID_REG; @@ -45,7 +47,6 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o if (offsetReg != -1 && !gpr.IsImm(offsetReg)) off_reg = gpr.R(offsetReg); - ARM64Reg addr_reg = ARM64Reg::W1; u32 imm_addr = 0; bool is_immediate = false; @@ -124,7 +125,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; @@ -141,7 +142,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o } else if (mmio_address) { - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; scratch_gprs[DecodeReg(ARM64Reg::W30)] = true; scratch_gprs[DecodeReg(dest_reg)] = true; MMIOLoadToReg(m_system, m_system.GetMemory().GetMMIOMapping(), this, &m_float_emit, @@ -166,7 +167,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o MOV(gpr.R(addr), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (jo.memcheck) gpr.Unlock(ARM64Reg::W0); } @@ -175,7 +176,9 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s bool update) { // We want to make sure to not get LR as a temp register - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); + + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2); // Don't materialize zero. ARM64Reg RS = gpr.IsImm(value, 0) ? ARM64Reg::WZR : gpr.R(value); @@ -188,8 +191,6 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s if (dest != -1 && !gpr.IsImm(dest)) reg_dest = gpr.R(dest); - ARM64Reg addr_reg = ARM64Reg::W2; - u32 imm_addr = 0; bool is_immediate = false; @@ -268,7 +269,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; u32 access_size = BackPatchInfo::GetFlagSize(flags); u32 mmio_address = 0; @@ -309,7 +310,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s } else if (mmio_address) { - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; scratch_gprs[DecodeReg(ARM64Reg::W30)] = true; scratch_gprs[DecodeReg(RS)] = 0; MMIOWriteRegToAddr(m_system, m_system.GetMemory().GetMMIOMapping(), this, &m_float_emit, @@ -330,7 +331,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s MOV(gpr.R(dest), addr_reg); } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); } FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM64Reg tmp, @@ -512,13 +513,13 @@ void JitArm64::lmw(UGeckoInstruction inst) u32 a = inst.RA, d = inst.RD; s32 offset = inst.SIMM_16; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (jo.memcheck) gpr.Lock(ARM64Reg::W0); // MMU games make use of a >= d despite this being invalid according to the PEM. // If a >= d occurs, we must make sure to not re-read rA after starting doing the loads. - ARM64Reg addr_reg = ARM64Reg::W1; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); Arm64RegCache::ScopedARM64Reg addr_base_reg; bool a_is_addr_base_reg = false; if (!a) @@ -634,7 +635,7 @@ void JitArm64::lmw(UGeckoInstruction inst) } } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (jo.memcheck) gpr.Unlock(ARM64Reg::W0); } @@ -647,9 +648,9 @@ void JitArm64::stmw(UGeckoInstruction inst) u32 a = inst.RA, s = inst.RS; s32 offset = inst.SIMM_16; - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); - ARM64Reg addr_reg = ARM64Reg::W2; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2); Arm64RegCache::ScopedARM64Reg addr_base_reg; bool a_is_addr_base_reg = false; if (!a) @@ -767,7 +768,7 @@ void JitArm64::stmw(UGeckoInstruction inst) } } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); } void JitArm64::dcbx(UGeckoInstruction inst) @@ -987,11 +988,11 @@ void JitArm64::dcbz(UGeckoInstruction inst) int a = inst.RA, b = inst.RB; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); - Common::ScopeGuard register_guard([&] { gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); }); + Common::ScopeGuard register_guard([&] { gpr.Unlock(ARM64Reg::W30); }); - constexpr ARM64Reg addr_reg = ARM64Reg::W1; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); constexpr ARM64Reg temp_reg = ARM64Reg::W30; // HACK: Don't clear any memory in the [0x8000'0000, 0x8000'8000) region. @@ -1055,7 +1056,7 @@ void JitArm64::dcbz(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, MemAccessMode::Auto, ARM64Reg::W1, addr_reg, scratch_gprs, scratch_fprs); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index ff7d37df96a5..18e86a318533 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -77,12 +77,12 @@ void JitArm64::lfXX(UGeckoInstruction inst) const RegType type = (flags & BackPatchInfo::FLAG_SIZE_64) != 0 ? RegType::LowerPair : RegType::DuplicatedSingle; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (jo.memcheck) gpr.Lock(ARM64Reg::W0); + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1); const ARM64Reg VD = fpr.RW(inst.FD, type, false); - ARM64Reg addr_reg = ARM64Reg::W1; if (update) { @@ -164,7 +164,7 @@ void JitArm64::lfXX(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; @@ -187,7 +187,7 @@ void JitArm64::lfXX(UGeckoInstruction inst) MOV(gpr.R(a), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (jo.memcheck) gpr.Unlock(ARM64Reg::W0); } @@ -270,9 +270,9 @@ void JitArm64::stfXX(UGeckoInstruction inst) V0 = std::move(single_reg); } - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); - ARM64Reg addr_reg = ARM64Reg::W2; + const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2); if (update) { @@ -358,7 +358,7 @@ void JitArm64::stfXX(UGeckoInstruction inst) BitSet32 scratch_gprs; BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (is_immediate) { @@ -409,5 +409,5 @@ void JitArm64::stfXX(UGeckoInstruction inst) MOV(gpr.R(a), addr_reg); } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index b225c83eab6d..803d7a5dae67 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -38,10 +38,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) const int i = indexed ? inst.Ix : inst.I; const int w = indexed ? inst.Wx : inst.W; - gpr.Lock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Lock(ARM64Reg::W0, ARM64Reg::W2, ARM64Reg::W3); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3); fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1); } else if (jo.memcheck) @@ -50,7 +50,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) } constexpr ARM64Reg type_reg = ARM64Reg::W0; - constexpr ARM64Reg addr_reg = ARM64Reg::W1; + const auto addr_reg = js.assumeNoPairedQuantize ? gpr.GetScopedRegWithPreference(ARM64Reg::W1) : + Arm64RegCache::ScopedARM64Reg(ARM64Reg::W1); constexpr ARM64Reg scale_reg = ARM64Reg::W2; ARM64Reg VS = fpr.RW(inst.RS, RegType::Single, false); @@ -82,7 +83,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W1)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; if (jo.memcheck) scratch_gprs[DecodeReg(ARM64Reg::W0)] = true; @@ -127,10 +128,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) MOV(gpr.R(inst.RA), addr_reg); } - gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Unlock(ARM64Reg::W0, ARM64Reg::W2, ARM64Reg::W3); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3); fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); } else if (jo.memcheck) @@ -197,17 +198,18 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) } } - gpr.Lock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Lock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Lock(ARM64Reg::W0, ARM64Reg::W1); + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2); if (!jo.fastmem) gpr.Lock(ARM64Reg::W3); } constexpr ARM64Reg type_reg = ARM64Reg::W0; constexpr ARM64Reg scale_reg = ARM64Reg::W1; - constexpr ARM64Reg addr_reg = ARM64Reg::W2; + const auto addr_reg = js.assumeNoPairedQuantize ? gpr.GetScopedRegWithPreference(ARM64Reg::W2) : + Arm64RegCache::ScopedARM64Reg(ARM64Reg::W2); if (inst.RA || update) // Always uses the register on update { @@ -237,7 +239,7 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) BitSet32 scratch_fprs; if (!update || early_update) - scratch_gprs[DecodeReg(ARM64Reg::W2)] = true; + scratch_gprs[DecodeReg(addr_reg)] = true; u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; if (!w) @@ -269,10 +271,10 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) MOV(gpr.R(inst.RA), addr_reg); } - gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30); + gpr.Unlock(ARM64Reg::W30); if (!js.assumeNoPairedQuantize) { - gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1); + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2); if (!jo.fastmem) gpr.Unlock(ARM64Reg::W3); fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index 465d0257afa4..60dde40c6175 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include "Common/Assert.h" @@ -63,6 +64,30 @@ ARM64Reg Arm64RegCache::GetReg() return ARM64Reg::INVALID_REG; } +ARM64Reg Arm64RegCache::GetRegWithPreference(Arm64Gen::ARM64Reg preferred) +{ + // In practice, the preferred register tends to be towards the end of m_host_registers, + // so we scan through m_host_registers backwards + for (auto& it : m_host_registers | std::views::reverse) + { + if (it.GetReg() == preferred) + { + if (it.IsLocked()) + { + return GetReg(); + } + else + { + it.Lock(); + return it.GetReg(); + } + } + } + ASSERT_MSG(DYNA_REC, false, "Preferred register {:#x} is not in register cache", + static_cast(preferred)); + return ARM64Reg::INVALID_REG; +} + void Arm64RegCache::UpdateLastUsed(BitSet32 regs_used) { for (size_t i = 0; i < m_guest_registers.size(); ++i) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index b98e17053186..72a2c0af5964 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -183,13 +183,16 @@ class Arm64RegCache // Returns a temporary register for use // Requires unlocking after done Arm64Gen::ARM64Reg GetReg(); + Arm64Gen::ARM64Reg GetRegWithPreference(Arm64Gen::ARM64Reg preferred); class ScopedARM64Reg { public: inline ScopedARM64Reg() = default; ScopedARM64Reg(const ScopedARM64Reg&) = delete; - explicit inline ScopedARM64Reg(Arm64RegCache& cache) : m_reg(cache.GetReg()), m_gpr(&cache) {} + inline ScopedARM64Reg(Arm64RegCache& cache, Arm64Gen::ARM64Reg reg) : m_reg(reg), m_gpr(&cache) + { + } inline ScopedARM64Reg(Arm64Gen::ARM64Reg reg) : m_reg(reg) {} inline ScopedARM64Reg(ScopedARM64Reg&& scoped_reg) { *this = std::move(scoped_reg); } inline ~ScopedARM64Reg() { Unlock(); } @@ -235,7 +238,11 @@ class Arm64RegCache // Returns a temporary register // Unlocking is implicitly handled through RAII - inline ScopedARM64Reg GetScopedReg() { return ScopedARM64Reg(*this); } + inline ScopedARM64Reg GetScopedReg() { return ScopedARM64Reg(*this, GetReg()); } + inline ScopedARM64Reg GetScopedRegWithPreference(Arm64Gen::ARM64Reg preferred) + { + return ScopedARM64Reg(*this, GetRegWithPreference(preferred)); + } void UpdateLastUsed(BitSet32 regs_used);