Mirror of https://github.com/dolphin-emu/dolphin.git (synced 2025-02-14 00:09:24 +01:00)

Merge pull request #173 from delroth/movbe

Optimize memory access on Haswell by using MOVBE when possible.

This commit is contained in: a823edcc5b
@@ -44,6 +44,7 @@ struct CPUInfo
     bool bAES;
     // FXSAVE/FXRSTOR
     bool bFXSR;
+    bool bMOVBE;
     // This flag indicates that the hardware supports some mode
     // in which denormal inputs _and_ outputs are automatically set to (signed) zero.
     // TODO: ARM
@@ -159,6 +159,7 @@ void CPUInfo::Detect()
         if ((cpu_id[2] >> 9) & 1) bSSSE3 = true;
         if ((cpu_id[2] >> 19) & 1) bSSE4_1 = true;
         if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true;
+        if ((cpu_id[2] >> 22) & 1) bMOVBE = true;
         if ((cpu_id[2] >> 25) & 1) bAES = true;
 
         // To check DAZ support, we first need to check FXSAVE support.
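For reference: CPUID leaf 1 returns the feature flags in ECX, which this code keeps in cpu_id[2], so MOVBE support is exactly the bit 22 test added above. A minimal standalone version of the same check, sketched with the GCC/Clang <cpuid.h> intrinsic rather than Dolphin's own detection code:

    #include <cpuid.h>
    #include <cstdio>

    int main()
    {
        unsigned eax, ebx, ecx, edx;
        // Leaf 1 = processor info and feature bits; ECX bit 22 = MOVBE.
        if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            std::printf("MOVBE supported: %s\n", ((ecx >> 22) & 1) ? "yes" : "no");
        return 0;
    }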
@@ -263,6 +264,7 @@ std::string CPUInfo::Summarize()
     if (bAVX) sum += ", AVX";
     if (bFMA) sum += ", FMA";
     if (bAES) sum += ", AES";
+    if (bMOVBE) sum += ", MOVBE";
     if (bLongMode) sum += ", 64-bit support";
     return sum;
 }
@@ -804,6 +804,38 @@ void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
     src.WriteRest(this);
 }
 
+void XEmitter::MOVBE(int bits, const OpArg& dest, const OpArg& src)
+{
+    _assert_msg_(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it.");
+    if (bits == 8)
+    {
+        MOV(bits, dest, src);
+        return;
+    }
+
+    if (bits == 16)
+        Write8(0x66);
+
+    if (dest.IsSimpleReg())
+    {
+        _assert_msg_(DYNA_REC, !src.IsSimpleReg() && !src.IsImm(), "MOVBE: Loading from !mem");
+        src.WriteRex(this, bits, bits, dest.GetSimpleReg());
+        Write8(0x0F); Write8(0x38); Write8(0xF0);
+        src.WriteRest(this, 0, dest.GetSimpleReg());
+    }
+    else if (src.IsSimpleReg())
+    {
+        _assert_msg_(DYNA_REC, !dest.IsSimpleReg() && !dest.IsImm(), "MOVBE: Storing to !mem");
+        dest.WriteRex(this, bits, bits, src.GetSimpleReg());
+        Write8(0x0F); Write8(0x38); Write8(0xF1);
+        dest.WriteRest(this, 0, src.GetSimpleReg());
+    }
+    else
+    {
+        _assert_msg_(DYNA_REC, 0, "MOVBE: Not loading or storing to mem");
+    }
+}
+
 void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
 {
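Note on the encoding: there is no 8-bit form of MOVBE (byte-swapping a single byte is a no-op), hence the fallback to plain MOV above. For the other widths the emitted bytes follow the ISA's 0F 38 F0 /r (load) and 0F 38 F1 /r (store) forms, with 0x66 prepended for 16-bit operands and the REX prefix handled by WriteRex. A hand-worked illustration of the expected output (byte values computed from the encoding rules, not captured from the emitter):

    // movbe eax, dword ptr [rsi]  ->  0F 38 F0 06  (ModRM 0x06: mod=00, reg=eax, rm=rsi)
    const unsigned char movbe_load[]  = {0x0F, 0x38, 0xF0, 0x06};
    // movbe dword ptr [rsi], eax  ->  0F 38 F1 06  (same ModRM, store opcode)
    const unsigned char movbe_store[] = {0x0F, 0x38, 0xF1, 0x06};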
@@ -427,6 +427,9 @@ public:
     void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
     void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
 
+    // Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE.
+    void MOVBE(int dbits, const OpArg& dest, const OpArg& src);
+
     // WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
     void STMXCSR(OpArg memloc);
     void LDMXCSR(OpArg memloc);
@@ -314,8 +314,7 @@ void Jit64::stX(UGeckoInstruction inst)
     else if (Memory::IsRAMAddress(addr))
     {
         MOV(32, R(EAX), gpr.R(s));
-        BSWAP(accessSize, EAX);
-        WriteToConstRamAddress(accessSize, R(EAX), addr);
+        WriteToConstRamAddress(accessSize, EAX, addr, true);
         if (update)
             gpr.SetImmediate32(a, addr);
         return;
@@ -344,10 +343,10 @@ void Jit64::stX(UGeckoInstruction inst)
     gpr.FlushLockX(ABI_PARAM1);
     MOV(32, R(ABI_PARAM1), gpr.R(a));
     MOV(32, R(EAX), gpr.R(s));
-    BSWAP(32, EAX);
 #if _M_X86_64
-    MOV(accessSize, MComplex(RBX, ABI_PARAM1, SCALE_1, (u32)offset), R(EAX));
+    SwapAndStore(accessSize, MComplex(RBX, ABI_PARAM1, SCALE_1, (u32)offset), EAX);
 #else
+    BSWAP(32, EAX);
     AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
     MOV(accessSize, MDisp(ABI_PARAM1, (u32)Memory::base + (u32)offset), R(EAX));
 #endif
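This hunk and the ones that follow are all instances of the same rewrite: the emulated PowerPC is big-endian while x86 is little-endian, so every guest memory access previously paired a MOV with a BSWAP, and the new helpers fuse the pair into one MOVBE where available. On the 32-bit path (#else) there is no SwapAndStore, so the explicit BSWAP simply moves inside that branch. In plain C terms, the store side of the transformation does the following (standalone sketch; __builtin_bswap32 is a GCC/Clang builtin, not part of the diff):

    #include <cstdint>
    #include <cstring>

    // What SwapAndStore(32, dst, src) boils down to: byte-swap, then store.
    // With MOVBE, the two steps are a single instruction.
    static void store_be32(void* dst, uint32_t value)
    {
        uint32_t swapped = __builtin_bswap32(value); // the BSWAP half
        std::memcpy(dst, &swapped, sizeof(swapped)); // the MOV half
    }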
@@ -456,8 +455,7 @@ void Jit64::lmw(UGeckoInstruction inst)
     ADD(32, R(EAX), gpr.R(inst.RA));
     for (int i = inst.RD; i < 32; i++)
     {
-        MOV(32, R(ECX), MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
-        BSWAP(32, ECX);
+        LoadAndSwap(32, ECX, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
         gpr.BindToRegister(i, false, true);
         MOV(32, gpr.R(i), R(ECX));
     }
@@ -481,8 +479,7 @@ void Jit64::stmw(UGeckoInstruction inst)
     for (int i = inst.RD; i < 32; i++)
     {
         MOV(32, R(ECX), gpr.R(i));
-        BSWAP(32, ECX);
-        MOV(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), R(ECX));
+        SwapAndStore(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), ECX);
     }
     gpr.UnlockAllX();
 #else
@@ -96,8 +96,7 @@ void Jit64::lfd(UGeckoInstruction inst)
         MOVSD(xd, R(XMM0));
     } else {
 #if _M_X86_64
-        MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
-        BSWAP(64, EAX);
+        LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
         MOV(64, M(&temp64), R(EAX));
 
         MEMCHECK_START
@@ -21,15 +21,11 @@ void CommonAsmRoutines::GenFifoWrite(int size)
     PUSH(ESI);
     if (size != 32)
         PUSH(EDX);
-    BSWAP(size, ABI_PARAM1);
     MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
     MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
-    if (size != 32) {
-        MOV(32, R(EDX), R(ABI_PARAM1));
-        MOV(size, MComplex(RAX, RSI, 1, 0), R(EDX));
-    } else {
-        MOV(size, MComplex(RAX, RSI, 1, 0), R(ABI_PARAM1));
-    }
+    SwapAndStore(size, MComplex(RAX, RSI, 1, 0), ABI_PARAM1);
     ADD(32, R(ESI), Imm8(size >> 3));
     MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
     if (size != 32)
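The win here is more than one instruction: the old code needed EDX as scratch space for the swapped value whenever size != 32, so folding the swap into the store also removes the whole if/else. The net effect of the emitted sequence, sketched as ordinary C (names mirror GPFifo's, but the buffer size and the function itself are illustrative only):

    #include <cstdint>
    #include <cstring>

    static uint8_t  m_gatherPipe[128]; // size illustrative
    static uint32_t m_gatherPipeCount;

    // GenFifoWrite for size == 32: swap-and-store into the pipe, bump the count.
    static void fifo_write32(uint32_t value)
    {
        uint32_t be = __builtin_bswap32(value);                // SwapAndStore
        std::memcpy(m_gatherPipe + m_gatherPipeCount, &be, 4);
        m_gatherPipeCount += 4;                                // ADD(32, R(ESI), Imm8(size >> 3))
    }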
@@ -45,10 +41,9 @@ void CommonAsmRoutines::GenFifoFloatWrite()
     PUSH(EDX);
     MOVSS(M(&temp32), XMM0);
     MOV(32, R(EDX), M(&temp32));
-    BSWAP(32, EDX);
     MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
     MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
-    MOV(32, MComplex(RAX, RSI, 1, 0), R(EDX));
+    SwapAndStore(32, MComplex(RAX, RSI, 1, 0), EDX);
     ADD(32, R(ESI), Imm8(4));
     MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
     POP(EDX);
@@ -150,8 +145,7 @@ void CommonAsmRoutines::GenQuantizedStores()
     TEST(32, R(ECX), Imm32(0x0C000000));
     FixupBranch too_complex = J_CC(CC_NZ, true);
     MOV(64, R(RAX), M(&psTemp[0]));
-    BSWAP(64, RAX);
-    MOV(64, MComplex(RBX, RCX, SCALE_1, 0), R(RAX));
+    SwapAndStore(64, MComplex(RBX, RCX, SCALE_1, 0), RAX);
     FixupBranch skip_complex = J(true);
     SetJumpTarget(too_complex);
     ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
@@ -371,8 +365,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
         PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
     } else {
 #if _M_X86_64
-        MOV(64, R(RCX), MComplex(RBX, RCX, 1, 0));
-        BSWAP(64, RCX);
+        LoadAndSwap(64, RCX, MComplex(RBX, RCX, 1, 0));
         ROL(64, R(RCX), Imm8(32));
         MOVQ_xmm(XMM0, R(RCX));
 #else
@@ -16,6 +16,32 @@ using namespace Gen;
 static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 static u32 GC_ALIGNED16(float_buffer);
 
+void EmuCodeBlock::LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src)
+{
+    if (cpu_info.bMOVBE)
+    {
+        MOVBE(size, R(dst), src);
+    }
+    else
+    {
+        MOV(size, R(dst), src);
+        BSWAP(size, dst);
+    }
+}
+
+void EmuCodeBlock::SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src)
+{
+    if (cpu_info.bMOVBE)
+    {
+        MOVBE(size, dst, R(src));
+    }
+    else
+    {
+        BSWAP(size, src);
+        MOV(size, dst, R(src));
+    }
+}
+
 void EmuCodeBlock::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
 {
 #if _M_X86_64
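One subtlety in the fallback path: SwapAndStore's non-MOVBE branch runs BSWAP on src in place, so the value register comes back byte-swapped and callers must treat it as clobbered. The call sites in this diff all pass registers whose values are dead after the store, so this is safe here. A usage sketch in the emitter's own vocabulary (the offset name is illustrative):

    // Emitting a big-endian 32-bit guest store from inside an EmuCodeBlock member:
    MOV(32, R(EAX), gpr.R(s));                       // copy the guest value to scratch
    SwapAndStore(32, MDisp(RBX, guest_offset), EAX); // EAX is clobbered on the BSWAP path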
@@ -513,12 +539,15 @@ void EmuCodeBlock::SafeWriteFloatToReg(X64Reg xmm_value, X64Reg reg_addr, u32 registersInUse
     }
 }
 
-void EmuCodeBlock::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address)
+void EmuCodeBlock::WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap)
 {
 #if _M_X86_64
-    MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
+    if (swap)
+        SwapAndStore(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
+    else
+        MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), R(arg));
 #else
-    MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), arg);
+    MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), R(arg));
 #endif
 }
 
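The signature change follows from the same clobbering concern: the swap variant has to hand SwapAndStore a mutable register, so the parameter is narrowed from a const OpArg& to a plain X64Reg, and callers opt into the fused swap explicitly. The stX fast path earlier in this diff is the caller that does:

    MOV(32, R(EAX), gpr.R(s));
    WriteToConstRamAddress(accessSize, EAX, addr, true); // swap = true replaces the old BSWAP + store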
@@ -25,6 +25,9 @@ namespace MMIO { class Mapping; }
 class EmuCodeBlock : public Gen::X64CodeBlock
 {
 public:
+    void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src);
+    void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);
+
     void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
     void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset);
     // these return the address of the MOV, for backpatching
@@ -47,7 +50,7 @@ public:
     // Trashes both inputs and EAX.
     void SafeWriteFloatToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, u32 registersInUse, int flags = 0);
 
-    void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address);
+    void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
     void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
     void JitClearCA();
     void JitSetCA();