mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-25 07:21:14 +01:00
Jit64: Clean up ExtractWithByteOffset
RCOpArg::ExtractWithByteOffset is only used in one place: a special case of rlwinmx. ExtractWithByteOffset first stores the value of the specified register into m_ppc_state (unless it's already there), and then returns an offset into m_ppc_state. Our use of this function has two undesirable properties (except in the trivial case `offset == 0`): 1. ExtractWithByteOffset calls StoreFromRegister without going through any of the usual functions. This violated an assumption I made when working on my constant propagation PR and led to a hard-to-find bug. 2. If the specified register is in a host register and is dirty, ExtractWithByteOffset will store its value to m_ppc_state even when it's unnecessary. In particular, this always happens when rlwinmx uses the same register as input and output, since rlwinmx always allocates a host register for the output and marks it as dirty. Since ExtractWithByteOffset is only used in one place, I figure we might as well inline it there. This commit does that, and also alters rlwinmx's logic so the special case code is only triggered when the input is already in m_ppc_state. Input in `m_ppc_state`, before (11 bytes): mov esi, dword ptr [rbp-104] mov dword ptr [rbp-104], esi movzx esi, byte ptr [rbp-101] Input in `m_ppc_state`, after (5 bytes): movzx esi, byte ptr [rbp-101] Input in host register, before (8 bytes): mov dword ptr [rbp-104], esi movzx esi, byte ptr [rbp-101] Input in host register, after (3 bytes): shr edi, 0x18
This commit is contained in:
parent
f30d3c9092
commit
fb75ae410f
@ -2065,69 +2065,83 @@ void Jit64::rlwinmx(UGeckoInstruction inst)
|
||||
bool needs_sext = true;
|
||||
int mask_size = inst.ME - inst.MB + 1;
|
||||
|
||||
RCOpArg Rs = gpr.Use(s, RCMode::Read);
|
||||
RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
|
||||
RegCache::Realize(Rs, Ra);
|
||||
if (simple_mask && !(inst.SH & (mask_size - 1)) && !gpr.IsBound(s))
|
||||
{
|
||||
// optimized case: byte/word extract from m_ppc_state
|
||||
|
||||
if (a != s && left_shift && Rs.IsSimpleReg() && inst.SH <= 3)
|
||||
{
|
||||
LEA(32, Ra, MScaled(Rs.GetSimpleReg(), SCALE_1 << inst.SH, 0));
|
||||
}
|
||||
// common optimized case: byte/word extract
|
||||
else if (simple_mask && !(inst.SH & (mask_size - 1)))
|
||||
{
|
||||
MOVZX(32, mask_size, Ra, Rs.ExtractWithByteOffset(inst.SH ? (32 - inst.SH) >> 3 : 0));
|
||||
needs_sext = false;
|
||||
}
|
||||
// another optimized special case: byte/word extract plus rotate
|
||||
else if (simple_prerotate_mask && !left_shift)
|
||||
{
|
||||
MOVZX(32, prerotate_mask == 0xff ? 8 : 16, Ra, Rs);
|
||||
// Note: If a == s, calling Realize(Ra) will allocate a host register for Rs,
|
||||
// so we have to get mem_source from Rs before calling Realize(Ra)
|
||||
|
||||
RCOpArg Rs = gpr.Use(s, RCMode::Read);
|
||||
RegCache::Realize(Rs);
|
||||
OpArg mem_source = Rs.Location();
|
||||
if (inst.SH)
|
||||
ROL(32, Ra, Imm8(inst.SH));
|
||||
needs_sext = (mask & 0x80000000) != 0;
|
||||
}
|
||||
// Use BEXTR where possible: Only AMD implements this in one uop
|
||||
else if (field_extract && cpu_info.bBMI1 && cpu_info.vendor == CPUVendor::AMD)
|
||||
{
|
||||
MOV(32, R(RSCRATCH), Imm32((mask_size << 8) | (32 - inst.SH)));
|
||||
BEXTR(32, Ra, Rs, RSCRATCH);
|
||||
needs_sext = false;
|
||||
}
|
||||
else if (left_shift)
|
||||
{
|
||||
if (a != s)
|
||||
MOV(32, Ra, Rs);
|
||||
mem_source.AddMemOffset((32 - inst.SH) >> 3);
|
||||
Rs.Unlock();
|
||||
|
||||
SHL(32, Ra, Imm8(inst.SH));
|
||||
}
|
||||
else if (right_shift)
|
||||
{
|
||||
if (a != s)
|
||||
MOV(32, Ra, Rs);
|
||||
RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
|
||||
RegCache::Realize(Ra);
|
||||
MOVZX(32, mask_size, Ra, mem_source);
|
||||
|
||||
SHR(32, Ra, Imm8(inst.MB));
|
||||
needs_sext = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
RotateLeft(32, Ra, Rs, inst.SH);
|
||||
RCOpArg Rs = gpr.Use(s, RCMode::Read);
|
||||
RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
|
||||
RegCache::Realize(Rs, Ra);
|
||||
|
||||
if (!(inst.MB == 0 && inst.ME == 31))
|
||||
if (a != s && left_shift && Rs.IsSimpleReg() && inst.SH <= 3)
|
||||
{
|
||||
// we need flags if we're merging the branch
|
||||
if (inst.Rc && CheckMergedBranch(0))
|
||||
AND(32, Ra, Imm32(mask));
|
||||
else
|
||||
AndWithMask(Ra, mask);
|
||||
needs_sext = inst.MB == 0;
|
||||
needs_test = false;
|
||||
LEA(32, Ra, MScaled(Rs.GetSimpleReg(), SCALE_1 << inst.SH, 0));
|
||||
}
|
||||
// optimized case: byte/word extract plus rotate
|
||||
else if (simple_prerotate_mask && !left_shift)
|
||||
{
|
||||
MOVZX(32, prerotate_mask == 0xff ? 8 : 16, Ra, Rs);
|
||||
if (inst.SH)
|
||||
ROL(32, Ra, Imm8(inst.SH));
|
||||
needs_sext = (mask & 0x80000000) != 0;
|
||||
}
|
||||
// Use BEXTR where possible: Only AMD implements this in one uop
|
||||
else if (field_extract && cpu_info.bBMI1 && cpu_info.vendor == CPUVendor::AMD)
|
||||
{
|
||||
MOV(32, R(RSCRATCH), Imm32((mask_size << 8) | (32 - inst.SH)));
|
||||
BEXTR(32, Ra, Rs, RSCRATCH);
|
||||
needs_sext = false;
|
||||
}
|
||||
else if (left_shift)
|
||||
{
|
||||
if (a != s)
|
||||
MOV(32, Ra, Rs);
|
||||
|
||||
SHL(32, Ra, Imm8(inst.SH));
|
||||
}
|
||||
else if (right_shift)
|
||||
{
|
||||
if (a != s)
|
||||
MOV(32, Ra, Rs);
|
||||
|
||||
SHR(32, Ra, Imm8(inst.MB));
|
||||
needs_sext = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
RotateLeft(32, Ra, Rs, inst.SH);
|
||||
|
||||
if (!(inst.MB == 0 && inst.ME == 31))
|
||||
{
|
||||
// we need flags if we're merging the branch
|
||||
if (inst.Rc && CheckMergedBranch(0))
|
||||
AND(32, Ra, Imm32(mask));
|
||||
else
|
||||
AndWithMask(Ra, mask);
|
||||
needs_sext = inst.MB == 0;
|
||||
needs_test = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Rs.Unlock();
|
||||
Ra.Unlock();
|
||||
|
||||
if (inst.Rc)
|
||||
ComputeRC(a, needs_test, needs_sext);
|
||||
}
|
||||
|
@ -109,19 +109,6 @@ OpArg RCOpArg::Location() const
|
||||
return {};
|
||||
}
|
||||
|
||||
OpArg RCOpArg::ExtractWithByteOffset(int offset)
|
||||
{
|
||||
if (offset == 0)
|
||||
return Location();
|
||||
|
||||
ASSERT(rc);
|
||||
const preg_t preg = std::get<preg_t>(contents);
|
||||
rc->StoreFromRegister(preg, RegCache::FlushMode::MaintainState);
|
||||
OpArg result = rc->GetDefaultLocation(preg);
|
||||
result.AddMemOffset(offset);
|
||||
return result;
|
||||
}
|
||||
|
||||
void RCOpArg::Unlock()
|
||||
{
|
||||
if (const preg_t* preg = std::get_if<preg_t>(&contents))
|
||||
|
@ -47,9 +47,6 @@ public:
|
||||
bool IsSimpleReg(Gen::X64Reg reg) const { return Location().IsSimpleReg(reg); }
|
||||
Gen::X64Reg GetSimpleReg() const { return Location().GetSimpleReg(); }
|
||||
|
||||
// Use to extract bytes from a register using the regcache. offset is in bytes.
|
||||
Gen::OpArg ExtractWithByteOffset(int offset);
|
||||
|
||||
void Unlock();
|
||||
|
||||
bool IsImm() const;
|
||||
@ -159,6 +156,8 @@ public:
|
||||
u32 Imm32(preg_t preg) const { return R(preg).Imm32(); }
|
||||
s32 SImm32(preg_t preg) const { return R(preg).SImm32(); }
|
||||
|
||||
bool IsBound(preg_t preg) const { return m_regs[preg].IsBound(); }
|
||||
|
||||
RCOpArg Use(preg_t preg, RCMode mode);
|
||||
RCOpArg UseNoImm(preg_t preg, RCMode mode);
|
||||
RCOpArg BindOrImm(preg_t preg, RCMode mode);
|
||||
|
Loading…
x
Reference in New Issue
Block a user