JitArm64: Use GetRegWithPreference for EmitBackpatchRoutine addr

I'm adding a new function to the register cache called
GetRegWithPreference. If the passed-in register is unlocked, it gets
locked and returned; otherwise, GetReg is called as a fallback. There is
also a GetScopedRegWithPreference variant that handles unlocking through
RAII.
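
In sketch form, the policy is: hand out the preferred host register if it
is free, otherwise defer to the normal allocation path. Here is a minimal
standalone model of that behavior (illustrative names and types, not
Dolphin's; the real implementation is in the register cache diff below):

#include <array>
#include <cstdio>

// Minimal model: a host register is either locked (in use) or free.
struct HostReg
{
  int reg;
  bool locked;
};

using RegBank = std::array<HostReg, 4>;

// Models GetReg(): hand out the first free register, whichever one that is.
// (The real allocator can also evict a guest register when none is free.)
int GetReg(RegBank& bank)
{
  for (HostReg& r : bank)
  {
    if (!r.locked)
    {
      r.locked = true;
      return r.reg;
    }
  }
  return -1;
}

// Models GetRegWithPreference(): grant the preferred register if it is
// free, otherwise fall back to the generic path above.
int GetRegWithPreference(RegBank& bank, int preferred)
{
  for (HostReg& r : bank)
  {
    if (r.reg == preferred)
    {
      if (r.locked)
        return GetReg(bank);
      r.locked = true;
      return r.reg;
    }
  }
  return -1;  // The preferred register is not part of the bank.
}

int main()
{
  RegBank bank{{{0, false}, {1, false}, {2, true}, {3, false}}};
  std::printf("%d\n", GetRegWithPreference(bank, 1));  // 1: free, so granted
  std::printf("%d\n", GetRegWithPreference(bank, 2));  // 0: locked, fallback
}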

Then, I'm making JitArm64 call this function when allocating an address
register for use with EmitBackpatchRoutine. This way, when register
pressure is low we can use the optimal register, and when register
pressure is high (but the cache is not completely full) we can trade a
bit of farcode size for not having to evict a register from the register
cache.
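
Condensed from the diffs below, the call-site change looks like this
(SafeLoadToReg shown; the other sites follow the same pattern):

// Before: W1 was hardcoded as the address register and locked up front.
gpr.Lock(ARM64Reg::W1, ARM64Reg::W30);
ARM64Reg addr_reg = ARM64Reg::W1;
// ... emit the memory access ...
gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30);

// After: prefer W1, but accept any other free register when W1 is taken.
// The scoped wrapper unlocks whichever register was granted on scope exit.
gpr.Lock(ARM64Reg::W30);
const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1);
// ... emit the memory access ...
gpr.Unlock(ARM64Reg::W30);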
JosJuice 2024-12-29 23:06:53 +01:00
parent c88c6f5f18
commit 7417efe600
5 changed files with 78 additions and 43 deletions

View File

@@ -30,10 +30,12 @@ using namespace Arm64Gen;
void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update)
{
// We want to make sure to not get LR as a temp register
- gpr.Lock(ARM64Reg::W1, ARM64Reg::W30);
+ gpr.Lock(ARM64Reg::W30);
if (jo.memcheck)
gpr.Lock(ARM64Reg::W0);
+ const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1);
gpr.BindToRegister(dest, dest == (u32)addr || dest == (u32)offsetReg, false);
ARM64Reg dest_reg = gpr.R(dest);
ARM64Reg up_reg = ARM64Reg::INVALID_REG;
@@ -45,7 +47,6 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o
if (offsetReg != -1 && !gpr.IsImm(offsetReg))
off_reg = gpr.R(offsetReg);
- ARM64Reg addr_reg = ARM64Reg::W1;
u32 imm_addr = 0;
bool is_immediate = false;
@@ -124,7 +125,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o
BitSet32 scratch_gprs;
BitSet32 scratch_fprs;
if (!update || early_update)
- scratch_gprs[DecodeReg(ARM64Reg::W1)] = true;
+ scratch_gprs[DecodeReg(addr_reg)] = true;
if (jo.memcheck)
scratch_gprs[DecodeReg(ARM64Reg::W0)] = true;
@@ -141,7 +142,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o
}
else if (mmio_address)
{
- scratch_gprs[DecodeReg(ARM64Reg::W1)] = true;
+ scratch_gprs[DecodeReg(addr_reg)] = true;
scratch_gprs[DecodeReg(ARM64Reg::W30)] = true;
scratch_gprs[DecodeReg(dest_reg)] = true;
MMIOLoadToReg(m_system, m_system.GetMemory().GetMMIOMapping(), this, &m_float_emit,
@@ -166,7 +167,7 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o
MOV(gpr.R(addr), addr_reg);
}
- gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30);
+ gpr.Unlock(ARM64Reg::W30);
if (jo.memcheck)
gpr.Unlock(ARM64Reg::W0);
}
@@ -175,7 +176,9 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
bool update)
{
// We want to make sure to not get LR as a temp register
- gpr.Lock(ARM64Reg::W2, ARM64Reg::W30);
+ gpr.Lock(ARM64Reg::W30);
+ const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2);
// Don't materialize zero.
ARM64Reg RS = gpr.IsImm(value, 0) ? ARM64Reg::WZR : gpr.R(value);
@@ -188,8 +191,6 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
if (dest != -1 && !gpr.IsImm(dest))
reg_dest = gpr.R(dest);
- ARM64Reg addr_reg = ARM64Reg::W2;
u32 imm_addr = 0;
bool is_immediate = false;
@@ -268,7 +269,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
BitSet32 scratch_gprs;
BitSet32 scratch_fprs;
if (!update || early_update)
- scratch_gprs[DecodeReg(ARM64Reg::W2)] = true;
+ scratch_gprs[DecodeReg(addr_reg)] = true;
u32 access_size = BackPatchInfo::GetFlagSize(flags);
u32 mmio_address = 0;
@@ -309,7 +310,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
}
else if (mmio_address)
{
- scratch_gprs[DecodeReg(ARM64Reg::W2)] = true;
+ scratch_gprs[DecodeReg(addr_reg)] = true;
scratch_gprs[DecodeReg(ARM64Reg::W30)] = true;
scratch_gprs[DecodeReg(RS)] = 0;
MMIOWriteRegToAddr(m_system, m_system.GetMemory().GetMMIOMapping(), this, &m_float_emit,
@@ -330,7 +331,7 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
MOV(gpr.R(dest), addr_reg);
}
- gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30);
+ gpr.Unlock(ARM64Reg::W30);
}
FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM64Reg tmp,
@@ -512,13 +513,13 @@ void JitArm64::lmw(UGeckoInstruction inst)
u32 a = inst.RA, d = inst.RD;
s32 offset = inst.SIMM_16;
- gpr.Lock(ARM64Reg::W1, ARM64Reg::W30);
+ gpr.Lock(ARM64Reg::W30);
if (jo.memcheck)
gpr.Lock(ARM64Reg::W0);
// MMU games make use of a >= d despite this being invalid according to the PEM.
// If a >= d occurs, we must make sure to not re-read rA after starting doing the loads.
- ARM64Reg addr_reg = ARM64Reg::W1;
+ const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1);
Arm64RegCache::ScopedARM64Reg addr_base_reg;
bool a_is_addr_base_reg = false;
if (!a)
@@ -634,7 +635,7 @@ void JitArm64::lmw(UGeckoInstruction inst)
}
}
- gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30);
+ gpr.Unlock(ARM64Reg::W30);
if (jo.memcheck)
gpr.Unlock(ARM64Reg::W0);
}
@@ -647,9 +648,9 @@ void JitArm64::stmw(UGeckoInstruction inst)
u32 a = inst.RA, s = inst.RS;
s32 offset = inst.SIMM_16;
- gpr.Lock(ARM64Reg::W2, ARM64Reg::W30);
+ gpr.Lock(ARM64Reg::W30);
- ARM64Reg addr_reg = ARM64Reg::W2;
+ const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2);
Arm64RegCache::ScopedARM64Reg addr_base_reg;
bool a_is_addr_base_reg = false;
if (!a)
@@ -767,7 +768,7 @@ void JitArm64::stmw(UGeckoInstruction inst)
}
}
- gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30);
+ gpr.Unlock(ARM64Reg::W30);
}
void JitArm64::dcbx(UGeckoInstruction inst)
@@ -987,11 +988,11 @@ void JitArm64::dcbz(UGeckoInstruction inst)
int a = inst.RA, b = inst.RB;
- gpr.Lock(ARM64Reg::W1, ARM64Reg::W30);
+ gpr.Lock(ARM64Reg::W30);
- Common::ScopeGuard register_guard([&] { gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30); });
+ Common::ScopeGuard register_guard([&] { gpr.Unlock(ARM64Reg::W30); });
- constexpr ARM64Reg addr_reg = ARM64Reg::W1;
+ const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1);
constexpr ARM64Reg temp_reg = ARM64Reg::W30;
// HACK: Don't clear any memory in the [0x8000'0000, 0x8000'8000) region.
@@ -1055,7 +1056,7 @@ void JitArm64::dcbz(UGeckoInstruction inst)
BitSet32 scratch_gprs;
BitSet32 scratch_fprs;
- scratch_gprs[DecodeReg(ARM64Reg::W1)] = true;
+ scratch_gprs[DecodeReg(addr_reg)] = true;
EmitBackpatchRoutine(BackPatchInfo::FLAG_ZERO_256, MemAccessMode::Auto, ARM64Reg::W1, addr_reg,
scratch_gprs, scratch_fprs);

View File

@@ -77,12 +77,12 @@ void JitArm64::lfXX(UGeckoInstruction inst)
const RegType type =
(flags & BackPatchInfo::FLAG_SIZE_64) != 0 ? RegType::LowerPair : RegType::DuplicatedSingle;
- gpr.Lock(ARM64Reg::W1, ARM64Reg::W30);
+ gpr.Lock(ARM64Reg::W30);
if (jo.memcheck)
gpr.Lock(ARM64Reg::W0);
+ const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W1);
const ARM64Reg VD = fpr.RW(inst.FD, type, false);
- ARM64Reg addr_reg = ARM64Reg::W1;
if (update)
{
@@ -164,7 +164,7 @@ void JitArm64::lfXX(UGeckoInstruction inst)
BitSet32 scratch_gprs;
BitSet32 scratch_fprs;
if (!update || early_update)
- scratch_gprs[DecodeReg(ARM64Reg::W1)] = true;
+ scratch_gprs[DecodeReg(addr_reg)] = true;
if (jo.memcheck)
scratch_gprs[DecodeReg(ARM64Reg::W0)] = true;
@@ -187,7 +187,7 @@ void JitArm64::lfXX(UGeckoInstruction inst)
MOV(gpr.R(a), addr_reg);
}
- gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30);
+ gpr.Unlock(ARM64Reg::W30);
if (jo.memcheck)
gpr.Unlock(ARM64Reg::W0);
}
@@ -270,9 +270,9 @@ void JitArm64::stfXX(UGeckoInstruction inst)
V0 = std::move(single_reg);
}
- gpr.Lock(ARM64Reg::W2, ARM64Reg::W30);
+ gpr.Lock(ARM64Reg::W30);
- ARM64Reg addr_reg = ARM64Reg::W2;
+ const Arm64RegCache::ScopedARM64Reg addr_reg = gpr.GetScopedRegWithPreference(ARM64Reg::W2);
if (update)
{
@@ -358,7 +358,7 @@ void JitArm64::stfXX(UGeckoInstruction inst)
BitSet32 scratch_gprs;
BitSet32 scratch_fprs;
if (!update || early_update)
- scratch_gprs[DecodeReg(ARM64Reg::W2)] = true;
+ scratch_gprs[DecodeReg(addr_reg)] = true;
if (is_immediate)
{
@@ -409,5 +409,5 @@ void JitArm64::stfXX(UGeckoInstruction inst)
MOV(gpr.R(a), addr_reg);
}
- gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30);
+ gpr.Unlock(ARM64Reg::W30);
}

View File

@@ -38,10 +38,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
const int i = indexed ? inst.Ix : inst.I;
const int w = indexed ? inst.Wx : inst.W;
- gpr.Lock(ARM64Reg::W1, ARM64Reg::W30);
+ gpr.Lock(ARM64Reg::W30);
if (!js.assumeNoPairedQuantize)
{
- gpr.Lock(ARM64Reg::W0, ARM64Reg::W2, ARM64Reg::W3);
+ gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3);
fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1);
}
else if (jo.memcheck)
@@ -50,7 +50,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
}
constexpr ARM64Reg type_reg = ARM64Reg::W0;
- constexpr ARM64Reg addr_reg = ARM64Reg::W1;
+ const auto addr_reg = js.assumeNoPairedQuantize ? gpr.GetScopedRegWithPreference(ARM64Reg::W1) :
+                                                   Arm64RegCache::ScopedARM64Reg(ARM64Reg::W1);
constexpr ARM64Reg scale_reg = ARM64Reg::W2;
ARM64Reg VS = fpr.RW(inst.RS, RegType::Single, false);
@@ -82,7 +83,7 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
BitSet32 scratch_fprs;
if (!update || early_update)
- scratch_gprs[DecodeReg(ARM64Reg::W1)] = true;
+ scratch_gprs[DecodeReg(addr_reg)] = true;
if (jo.memcheck)
scratch_gprs[DecodeReg(ARM64Reg::W0)] = true;
@@ -127,10 +128,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
MOV(gpr.R(inst.RA), addr_reg);
}
- gpr.Unlock(ARM64Reg::W1, ARM64Reg::W30);
+ gpr.Unlock(ARM64Reg::W30);
if (!js.assumeNoPairedQuantize)
{
- gpr.Unlock(ARM64Reg::W0, ARM64Reg::W2, ARM64Reg::W3);
+ gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3);
fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1);
}
else if (jo.memcheck)
@@ -197,17 +198,18 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
}
}
- gpr.Lock(ARM64Reg::W2, ARM64Reg::W30);
+ gpr.Lock(ARM64Reg::W30);
if (!js.assumeNoPairedQuantize)
{
- gpr.Lock(ARM64Reg::W0, ARM64Reg::W1);
+ gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2);
if (!jo.fastmem)
gpr.Lock(ARM64Reg::W3);
}
constexpr ARM64Reg type_reg = ARM64Reg::W0;
constexpr ARM64Reg scale_reg = ARM64Reg::W1;
- constexpr ARM64Reg addr_reg = ARM64Reg::W2;
+ const auto addr_reg = js.assumeNoPairedQuantize ? gpr.GetScopedRegWithPreference(ARM64Reg::W2) :
+                                                   Arm64RegCache::ScopedARM64Reg(ARM64Reg::W2);
if (inst.RA || update) // Always uses the register on update
{
@@ -237,7 +239,7 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
BitSet32 scratch_fprs;
if (!update || early_update)
- scratch_gprs[DecodeReg(ARM64Reg::W2)] = true;
+ scratch_gprs[DecodeReg(addr_reg)] = true;
u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
if (!w)
@@ -269,10 +271,10 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
MOV(gpr.R(inst.RA), addr_reg);
}
- gpr.Unlock(ARM64Reg::W2, ARM64Reg::W30);
+ gpr.Unlock(ARM64Reg::W30);
if (!js.assumeNoPairedQuantize)
{
- gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1);
+ gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2);
if (!jo.fastmem)
gpr.Unlock(ARM64Reg::W3);
fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1);

View File

@@ -5,6 +5,7 @@
#include <algorithm>
#include <cstddef>
+ #include <ranges>
#include <vector>
#include "Common/Assert.h"
@@ -63,6 +64,30 @@ ARM64Reg Arm64RegCache::GetReg()
return ARM64Reg::INVALID_REG;
}
+ ARM64Reg Arm64RegCache::GetRegWithPreference(Arm64Gen::ARM64Reg preferred)
+ {
+   // In practice, the preferred register tends to be towards the end of m_host_registers,
+   // so we scan through m_host_registers backwards
+   for (auto& it : m_host_registers | std::views::reverse)
+   {
+     if (it.GetReg() == preferred)
+     {
+       if (it.IsLocked())
+       {
+         return GetReg();
+       }
+       else
+       {
+         it.Lock();
+         return it.GetReg();
+       }
+     }
+   }
+   ASSERT_MSG(DYNA_REC, false, "Preferred register {:#x} is not in register cache",
+              static_cast<int>(preferred));
+   return ARM64Reg::INVALID_REG;
+ }
void Arm64RegCache::UpdateLastUsed(BitSet32 regs_used)
{
for (size_t i = 0; i < m_guest_registers.size(); ++i)

View File

@@ -183,13 +183,16 @@ public:
// Returns a temporary register for use
// Requires unlocking after done
Arm64Gen::ARM64Reg GetReg();
+ Arm64Gen::ARM64Reg GetRegWithPreference(Arm64Gen::ARM64Reg preferred);
class ScopedARM64Reg
{
public:
inline ScopedARM64Reg() = default;
ScopedARM64Reg(const ScopedARM64Reg&) = delete;
- explicit inline ScopedARM64Reg(Arm64RegCache& cache) : m_reg(cache.GetReg()), m_gpr(&cache) {}
+ inline ScopedARM64Reg(Arm64RegCache& cache, Arm64Gen::ARM64Reg reg) : m_reg(reg), m_gpr(&cache)
+ {
+ }
inline ScopedARM64Reg(Arm64Gen::ARM64Reg reg) : m_reg(reg) {}
inline ScopedARM64Reg(ScopedARM64Reg&& scoped_reg) { *this = std::move(scoped_reg); }
inline ~ScopedARM64Reg() { Unlock(); }
@@ -235,7 +238,11 @@ public:
// Returns a temporary register
// Unlocking is implicitly handled through RAII
- inline ScopedARM64Reg GetScopedReg() { return ScopedARM64Reg(*this); }
+ inline ScopedARM64Reg GetScopedReg() { return ScopedARM64Reg(*this, GetReg()); }
+ inline ScopedARM64Reg GetScopedRegWithPreference(Arm64Gen::ARM64Reg preferred)
+ {
+   return ScopedARM64Reg(*this, GetRegWithPreference(preferred));
+ }
void UpdateLastUsed(BitSet32 regs_used);
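
One detail of the paired load/store changes worth noting: when
js.assumeNoPairedQuantize is false, W1 (or W2 for stores) stays explicitly
locked for the quantizer call, so the scoped register merely wraps that
fixed register instead of owning an allocation; the cache-less
ScopedARM64Reg constructor never sets m_gpr, so its destructor presumably
has nothing to unlock and the explicit Unlock call remains responsible:

// From the psq_lXX diff above: the fast path allocates (and later
// releases) a register with preference W1; the slow path merely aliases
// the already-locked W1.
const auto addr_reg = js.assumeNoPairedQuantize ?
                          gpr.GetScopedRegWithPreference(ARM64Reg::W1) :
                          Arm64RegCache::ScopedARM64Reg(ARM64Reg::W1);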