JitArm64: Allocate scratch registers inside EmitBackpatchRoutine

This cuts down on how much callers have to think about what registers
EmitBackpatchRoutine is using. Also, by allocating registers dynamically
instead of using a fixed set of registers, we improve codegen in cases
where the fixed registers are taken but other registers are free.

(These improvements don't apply to the emitting_routine == true case,
where, by necessity, everything still works as it did before.)
This commit is contained in:
JosJuice 2024-12-28 19:34:31 +01:00
parent 9ab2751229
commit 527ad0b99b
2 changed files with 165 additions and 36 deletions

View File

@ -252,29 +252,40 @@ protected:
// //
// Registers used: // Registers used:
// //
// addr scratch // addr
// Store: X2 X1 // Store: X2
// Load: X1 // Load: X1
// Zero 256: X1 X30 // Zero 256: X1
// Store float: X2 Q0 // Store float: X2
// Load float: X1 // Load float: X1
// //
// If mode == AlwaysFastAccess, the addr argument can be any register. // If mode == AlwaysFastAccess, the addr argument can be any register.
// Otherwise it must be the register listed in the table above. // Otherwise it must be the register listed in the table above.
// //
// Additional scratch registers are used in the following situations: // This routine allocates most scratch registers dynamically, but in the following
// situations, specific scratch registers have to be allocated in advance:
// //
// emitting_routine && mode == Auto: X0 // emitting_routine && mode == Auto: X0
// emitting_routine && mode == Auto && (flags & BackPatchInfo::FLAG_STORE): X1
// emitting_routine && mode == Auto && !(flags & BackPatchInfo::FLAG_STORE): X3 // emitting_routine && mode == Auto && !(flags & BackPatchInfo::FLAG_STORE): X3
// emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X3 // emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X3
// mode != AlwaysSlowAccess && !jo.fastmem: X0 // emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X0
// !emitting_routine && mode != AlwaysFastAccess && jo.memcheck && // emitting_routine && mode != AlwaysSlowAccess &&
// (flags & BackPatchInfo::FLAG_LOAD): X0 // (flags & BackPatchInfo::FLAG_STORE) && !(flags & BackPatchInfo::FLAG_FLOAT): X1
// !emitting_routine && mode != AlwaysSlowAccess && !jo.fastmem: X30 // emitting_routine && mode != AlwaysSlowAccess &&
// (flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT): Q0
// emitting_routine && mode != AlwaysSlowAccess &&
// (flags & BackPatchInfo::FLAG_ZERO_256): X30
// !emitting_routine && mode == Auto && jo.fastmem: X30 // !emitting_routine && mode == Auto && jo.fastmem: X30
// //
// If there are any other registers that the caller doesn't mind being overwritten, // If there are any other registers that the caller doesn't mind being overwritten,
// these can be indicated in scratch_gprs and scratch_fprs. // these can be indicated in scratch_gprs and scratch_fprs.
//
// In the following situations, certain host registers must not contain guest registers:
//
// !emitting_routine && mode != AlwaysFastAccess && jo.memcheck: X30
// !emitting_routine && mode != AlwaysFastAccess && jo.memcheck &&
// (flags & BackPatchInfo::FLAG_LOAD): X0
void EmitBackpatchRoutine(u32 flags, MemAccessMode mode, Arm64Gen::ARM64Reg RS, void EmitBackpatchRoutine(u32 flags, MemAccessMode mode, Arm64Gen::ARM64Reg RS,
Arm64Gen::ARM64Reg addr, BitSet32 scratch_gprs = BitSet32(0), Arm64Gen::ARM64Reg addr, BitSet32 scratch_gprs = BitSet32(0),
BitSet32 scratch_fprs = BitSet32(0), bool emitting_routine = false); BitSet32 scratch_fprs = BitSet32(0), bool emitting_routine = false);

View File

@ -65,11 +65,140 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
const bool emit_fast_access = mode != MemAccessMode::AlwaysSlowAccess; const bool emit_fast_access = mode != MemAccessMode::AlwaysSlowAccess;
const bool emit_slow_access = mode != MemAccessMode::AlwaysFastAccess; const bool emit_slow_access = mode != MemAccessMode::AlwaysFastAccess;
const BitSet32 gprs_to_push = const bool memcheck = jo.memcheck && !emitting_routine;
BitSet32 temp_gpr_candidates = scratch_gprs;
BitSet32 temp_fpr_candidates = scratch_fprs;
temp_gpr_candidates[DecodeReg(addr)] = false;
if (flags & BackPatchInfo::FLAG_FLOAT)
temp_fpr_candidates[DecodeReg(RS)] = false;
else if (!(flags & BackPatchInfo::FLAG_ZERO_256))
temp_gpr_candidates[DecodeReg(RS)] = false;
if (!emitting_routine && mode == MemAccessMode::Auto && jo.fastmem)
temp_gpr_candidates[30] = true;
const auto allocate_temp_reg = [this](Arm64RegCache& reg_cache,
BitSet32& candidates) -> Arm64RegCache::ScopedARM64Reg {
for (int i : candidates)
{
candidates[i] = false;
ARM64Reg reg = ARM64Reg(i);
if (&reg_cache == &fpr)
reg = EncodeRegToQuad(reg);
return reg;
}
return reg_cache.GetScopedReg();
};
const auto can_allocate_temp_reg_for_free = [](Arm64RegCache& reg_cache, BitSet32& candidates) {
return candidates != BitSet32{} || reg_cache.GetUnlockedRegisterCount() > 0;
};
Arm64RegCache::ScopedARM64Reg temp_gpr_1;
Arm64RegCache::ScopedARM64Reg temp_gpr_2;
Arm64RegCache::ScopedARM64Reg temp_gpr_3;
Arm64RegCache::ScopedARM64Reg temp_fpr_1;
if (emit_fast_access)
{
if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT))
{
temp_fpr_1 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::Q0) :
allocate_temp_reg(fpr, temp_fpr_candidates);
scratch_fprs[DecodeReg(temp_fpr_1)] = true;
}
else if (flags & BackPatchInfo::FLAG_STORE)
{
temp_gpr_1 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W1) :
allocate_temp_reg(gpr, temp_gpr_candidates);
scratch_gprs[DecodeReg(temp_gpr_1)] = true;
}
else if (flags & BackPatchInfo::FLAG_ZERO_256)
{
temp_gpr_1 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W30) :
allocate_temp_reg(gpr, temp_gpr_candidates);
scratch_gprs[DecodeReg(temp_gpr_1)] = true;
}
if (!jo.fastmem)
{
temp_gpr_2 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W0) :
allocate_temp_reg(gpr, temp_gpr_candidates);
temp_gpr_3 = emitting_routine ? Arm64RegCache::ScopedARM64Reg(ARM64Reg::W3) :
allocate_temp_reg(gpr, temp_gpr_candidates);
scratch_gprs[DecodeReg(temp_gpr_2)] = true;
scratch_gprs[DecodeReg(temp_gpr_3)] = true;
}
else if (emit_slow_access && emitting_routine)
{
temp_gpr_2 = ARM64Reg::W0;
temp_gpr_3 = flags & BackPatchInfo::FLAG_STORE ? ARM64Reg::W1 : ARM64Reg::W3;
scratch_gprs[DecodeReg(temp_gpr_2)] = true;
scratch_gprs[DecodeReg(temp_gpr_3)] = true;
}
}
// Setting memcheck_temp_gpr to W30 works, but because W30 is a register that needs to be pushed
// and popped, using W30 may require us to emit an extra push and pop instruction, depending on
// what other registers need pushing and popping. If we can find another register to use without
// having to evict anything from the register cache, let's do that instead of using W30.
ARM64Reg memcheck_temp_gpr = ARM64Reg::W30;
if (emit_slow_access && memcheck)
{
const auto is_suitable_as_memcheck_temp_gpr = [flags](ARM64Reg reg) {
return reg != ARM64Reg::INVALID_REG && reg != ARM64Reg::W30 &&
(reg != ARM64Reg::W0 || !(flags & BackPatchInfo::FLAG_LOAD));
};
const auto get_unset_temp_gpr = [&]() -> Arm64RegCache::ScopedARM64Reg& {
if (temp_gpr_1 == ARM64Reg::INVALID_REG)
return temp_gpr_1;
if (temp_gpr_2 == ARM64Reg::INVALID_REG)
return temp_gpr_2;
ASSERT(temp_gpr_3 == ARM64Reg::INVALID_REG);
return temp_gpr_3;
};
if (is_suitable_as_memcheck_temp_gpr(temp_gpr_1))
{
memcheck_temp_gpr = temp_gpr_1;
}
else if (is_suitable_as_memcheck_temp_gpr(temp_gpr_2))
{
memcheck_temp_gpr = temp_gpr_2;
}
else if (is_suitable_as_memcheck_temp_gpr(temp_gpr_3))
{
memcheck_temp_gpr = temp_gpr_3;
}
else
{
while (can_allocate_temp_reg_for_free(gpr, temp_gpr_candidates))
{
Arm64RegCache::ScopedARM64Reg& temp_gpr_x = get_unset_temp_gpr();
temp_gpr_x = allocate_temp_reg(gpr, temp_gpr_candidates);
scratch_gprs[DecodeReg(temp_gpr_x)] = true;
if (is_suitable_as_memcheck_temp_gpr(temp_gpr_x))
break;
}
}
if (temp_fpr_1 == ARM64Reg::INVALID_REG &&
can_allocate_temp_reg_for_free(fpr, temp_fpr_candidates))
{
temp_fpr_1 = allocate_temp_reg(fpr, temp_fpr_candidates);
scratch_fprs[DecodeReg(temp_fpr_1)] = true;
}
}
BitSet32 gprs_to_push =
(emitting_routine ? CALLER_SAVED_GPRS : gpr.GetCallerSavedUsed()) & ~scratch_gprs; (emitting_routine ? CALLER_SAVED_GPRS : gpr.GetCallerSavedUsed()) & ~scratch_gprs;
const BitSet32 fprs_to_push = BitSet32 fprs_to_push =
(emitting_routine ? BitSet32(0xFFFFFFFF) : fpr.GetCallerSavedUsed()) & ~scratch_fprs; (emitting_routine ? BitSet32(0xFFFFFFFF) : fpr.GetCallerSavedUsed()) & ~scratch_fprs;
if (!emitting_routine && mode == MemAccessMode::Auto && jo.fastmem)
gprs_to_push[30] = true;
bool in_far_code = false; bool in_far_code = false;
const u8* fast_access_start = GetCodePtr(); const u8* fast_access_start = GetCodePtr();
std::optional<FixupBranch> slow_access_fixup; std::optional<FixupBranch> slow_access_fixup;
@ -81,13 +210,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
if (!jo.fastmem) if (!jo.fastmem)
{ {
const ARM64Reg temp = emitting_routine ? ARM64Reg::W3 : ARM64Reg::W30; memory_base = EncodeRegTo64(temp_gpr_3);
memory_offset = temp_gpr_2;
memory_base = EncodeRegTo64(temp); LSR(temp_gpr_3, addr, PowerPC::BAT_INDEX_SHIFT);
memory_offset = ARM64Reg::W0; LDR(memory_base, MEM_REG, ArithOption(temp_gpr_3, true));
LSR(temp, addr, PowerPC::BAT_INDEX_SHIFT);
LDR(memory_base, MEM_REG, ArithOption(temp, true));
if (emit_slow_access) if (emit_slow_access)
{ {
@ -100,15 +227,12 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
} }
else if (emit_slow_access && emitting_routine) else if (emit_slow_access && emitting_routine)
{ {
const ARM64Reg temp1 = flags & BackPatchInfo::FLAG_STORE ? ARM64Reg::W1 : ARM64Reg::W3; slow_access_fixup = CheckIfSafeAddress(addr, temp_gpr_3, temp_gpr_2);
const ARM64Reg temp2 = ARM64Reg::W0;
slow_access_fixup = CheckIfSafeAddress(addr, temp1, temp2);
} }
if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT)) if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT))
{ {
ARM64Reg temp = ARM64Reg::D0; ARM64Reg temp = EncodeRegToDouble(temp_fpr_1);
temp = ByteswapBeforeStore(this, &m_float_emit, temp, EncodeRegToDouble(RS), flags, true); temp = ByteswapBeforeStore(this, &m_float_emit, temp, EncodeRegToDouble(RS), flags, true);
m_float_emit.STR(access_size, temp, memory_base, memory_offset); m_float_emit.STR(access_size, temp, memory_base, memory_offset);
@ -122,7 +246,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
} }
else if (flags & BackPatchInfo::FLAG_STORE) else if (flags & BackPatchInfo::FLAG_STORE)
{ {
ARM64Reg temp = ARM64Reg::W1; ARM64Reg temp = temp_gpr_1;
temp = ByteswapBeforeStore(this, &m_float_emit, temp, RS, flags, true); temp = ByteswapBeforeStore(this, &m_float_emit, temp, RS, flags, true);
if (flags & BackPatchInfo::FLAG_SIZE_32) if (flags & BackPatchInfo::FLAG_SIZE_32)
@ -135,7 +259,7 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
else if (flags & BackPatchInfo::FLAG_ZERO_256) else if (flags & BackPatchInfo::FLAG_ZERO_256)
{ {
// This literally only stores 32bytes of zeros to the target address // This literally only stores 32bytes of zeros to the target address
ARM64Reg temp = ARM64Reg::X30; ARM64Reg temp = EncodeRegTo64(temp_gpr_1);
ADD(temp, memory_base, memory_offset); ADD(temp, memory_base, memory_offset);
STP(IndexType::Signed, ARM64Reg::ZR, ARM64Reg::ZR, temp, 0); STP(IndexType::Signed, ARM64Reg::ZR, ARM64Reg::ZR, temp, 0);
STP(IndexType::Signed, ARM64Reg::ZR, ARM64Reg::ZR, temp, 16); STP(IndexType::Signed, ARM64Reg::ZR, ARM64Reg::ZR, temp, 16);
@ -156,8 +280,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
if (emit_slow_access) if (emit_slow_access)
{ {
const bool memcheck = jo.memcheck && !emitting_routine;
if (emit_fast_access) if (emit_fast_access)
{ {
in_far_code = true; in_far_code = true;
@ -174,12 +296,9 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
if (slow_access_fixup) if (slow_access_fixup)
SetJumpTarget(*slow_access_fixup); SetJumpTarget(*slow_access_fixup);
const ARM64Reg temp_gpr = ARM64Reg::W1;
const int temp_gpr_index = DecodeReg(temp_gpr);
BitSet32 gprs_to_push_early = {}; BitSet32 gprs_to_push_early = {};
if (memcheck) if (memcheck)
gprs_to_push_early[temp_gpr_index] = true; gprs_to_push_early[DecodeReg(memcheck_temp_gpr)] = true;
if (flags & BackPatchInfo::FLAG_LOAD) if (flags & BackPatchInfo::FLAG_LOAD)
gprs_to_push_early[0] = true; gprs_to_push_early[0] = true;
@ -270,11 +389,10 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, MemAccessMode mode, ARM64Reg RS,
if (memcheck) if (memcheck)
{ {
const ARM64Reg temp_fpr = fprs_to_push[0] ? ARM64Reg::INVALID_REG : ARM64Reg::Q0;
const u64 early_push_count = (gprs_to_push & gprs_to_push_early).Count(); const u64 early_push_count = (gprs_to_push & gprs_to_push_early).Count();
const u64 early_push_size = Common::AlignUp(early_push_count, 2) * 8; const u64 early_push_size = Common::AlignUp(early_push_count, 2) * 8;
WriteConditionalExceptionExit(EXCEPTION_DSI, temp_gpr, temp_fpr, early_push_size); WriteConditionalExceptionExit(EXCEPTION_DSI, memcheck_temp_gpr, temp_fpr_1, early_push_size);
} }
if (flags & BackPatchInfo::FLAG_LOAD) if (flags & BackPatchInfo::FLAG_LOAD)