From d2de1ddabc55d65a84fccd480e3ddf34b7d98c8f Mon Sep 17 00:00:00 2001
From: Pierre Bourdon
Date: Sun, 16 Mar 2014 01:41:37 +0100
Subject: [PATCH 1/5] CPUDetect: add support for MOVBE detection

---
 Source/Core/Common/CPUDetect.h      | 1 +
 Source/Core/Common/x64CPUDetect.cpp | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/Source/Core/Common/CPUDetect.h b/Source/Core/Common/CPUDetect.h
index 51dd20b8ce..e5feb20f88 100644
--- a/Source/Core/Common/CPUDetect.h
+++ b/Source/Core/Common/CPUDetect.h
@@ -44,6 +44,7 @@ struct CPUInfo
 	bool bAES;
 	// FXSAVE/FXRSTOR
 	bool bFXSR;
+	bool bMOVBE;
 	// This flag indicates that the hardware supports some mode
 	// in which denormal inputs _and_ outputs are automatically set to (signed) zero.
 	// TODO: ARM
diff --git a/Source/Core/Common/x64CPUDetect.cpp b/Source/Core/Common/x64CPUDetect.cpp
index 64ded2df3f..e64cd9d1ee 100644
--- a/Source/Core/Common/x64CPUDetect.cpp
+++ b/Source/Core/Common/x64CPUDetect.cpp
@@ -159,6 +159,7 @@ void CPUInfo::Detect()
 		if ((cpu_id[2] >> 9) & 1) bSSSE3 = true;
 		if ((cpu_id[2] >> 19) & 1) bSSE4_1 = true;
 		if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true;
+		if ((cpu_id[2] >> 22) & 1) bMOVBE = true;
 		if ((cpu_id[2] >> 25) & 1) bAES = true;
 
 		// To check DAZ support, we first need to check FXSAVE support.
@@ -263,6 +264,7 @@ std::string CPUInfo::Summarize()
 	if (bAVX) sum += ", AVX";
 	if (bFMA) sum += ", FMA";
 	if (bAES) sum += ", AES";
+	if (bMOVBE) sum += ", MOVBE";
 	if (bLongMode) sum += ", 64-bit support";
 	return sum;
 }
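Note on the detection bit: CPUID leaf 1 reports MOVBE support in ECX bit 22, which is exactly what the (cpu_id[2] >> 22) & 1 test above reads -- Dolphin's __cpuid wrapper stores EAX..EDX into cpu_id[0..3], so cpu_id[2] holds ECX. A minimal standalone sketch of the same check, assuming a GCC/Clang toolchain with <cpuid.h>; the names here are illustrative, not Dolphin code:

    #include <cpuid.h>
    #include <cstdio>

    static bool HasMOVBE()
    {
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
        // Leaf 1 = feature flags; __get_cpuid returns 0 if the leaf is unsupported.
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return false;
        return (ecx >> 22) & 1;  // ECX bit 22 advertises MOVBE
    }

    int main()
    {
        std::printf("MOVBE supported: %s\n", HasMOVBE() ? "yes" : "no");
        return 0;
    }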
From b2597739ffccf2121874a3a52b21d0518425ab47 Mon Sep 17 00:00:00 2001
From: Pierre Bourdon
Date: Sun, 16 Mar 2014 03:43:12 +0100
Subject: [PATCH 2/5] x64Emitter: Add the MOVBE instruction.

---
 Source/Core/Common/x64Emitter.cpp | 32 ++++++++++++++++++++++++++++++++
 Source/Core/Common/x64Emitter.h   |  3 +++
 2 files changed, 35 insertions(+)

diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index ce4e5964af..6567fe5543 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -804,6 +804,38 @@ void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
 	src.WriteRest(this);
 }
 
+void XEmitter::MOVBE(int bits, const OpArg& dest, const OpArg& src)
+{
+	_assert_msg_(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it.");
+	if (bits == 8)
+	{
+		MOV(bits, dest, src);
+		return;
+	}
+
+	if (bits == 16)
+		Write8(0x66);
+
+	if (dest.IsSimpleReg())
+	{
+		_assert_msg_(DYNA_REC, !src.IsSimpleReg() && !src.IsImm(), "MOVBE: Loading from !mem");
+		src.WriteRex(this, bits, bits, dest.GetSimpleReg());
+		Write8(0x0F); Write8(0x38); Write8(0xF0);
+		src.WriteRest(this, 0, dest.GetSimpleReg());
+	}
+	else if (src.IsSimpleReg())
+	{
+		_assert_msg_(DYNA_REC, !dest.IsSimpleReg() && !dest.IsImm(), "MOVBE: Storing to !mem");
+		dest.WriteRex(this, bits, bits, src.GetSimpleReg());
+		Write8(0x0F); Write8(0x38); Write8(0xF1);
+		dest.WriteRest(this, 0, src.GetSimpleReg());
+	}
+	else
+	{
+		_assert_msg_(DYNA_REC, 0, "MOVBE: Not loading or storing to mem");
+	}
+}
+
 void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
 {
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index 8d9c90e3b0..025afb7b18 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -427,6 +427,9 @@ public:
 	void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
 	void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
 
+	// Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE.
+	void MOVBE(int bits, const OpArg& dest, const OpArg& src);
+
 	// WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
 	void STMXCSR(OpArg memloc);
 	void LDMXCSR(OpArg memloc);
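MOVBE exists only as a load (0F 38 F0 /r) and a store (0F 38 F1 /r): exactly one operand must be memory, there is no immediate form, and there is no 8-bit form -- hence the plain-MOV fallback for bits == 8 (swapping a single byte is a no-op) and the asserts rejecting the other shapes. A sketch of how a caller might drive the new entry point, assuming a hypothetical XEmitter `code` positioned at writable, executable memory; the function and register choices are illustrative:

    using namespace Gen;

    void EmitBigEndianCopy(XEmitter& code)
    {
        // The emitter asserts if the flag is not set, so callers gate on it first.
        if (!cpu_info.bMOVBE)
            return;

        code.MOVBE(32, R(EAX), MDisp(RSI, 0));  // movbe eax, dword [rsi] -- load + byte swap
        code.MOVBE(32, MDisp(RDI, 0), R(EAX));  // movbe dword [rdi], eax -- byte swap + store
    }

For 16-bit operands the implementation prepends the 0x66 operand-size prefix, and WriteRex() emits the REX.W prefix for 64-bit operands.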
From fcbe265adc48f30e6076b606170d8cff6adc94b9 Mon Sep 17 00:00:00 2001
From: Pierre Bourdon
Date: Sun, 16 Mar 2014 04:08:31 +0100
Subject: [PATCH 3/5] Jit_Util: Provide two util functions to load/store and
 swap values, using MOVBE internally when possible.

---
 .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp  | 35 +++++++++++++++++++++++++++++++++--
 Source/Core/Core/PowerPC/JitCommon/Jit_Util.h |  5 ++-
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 3601a1973c..0b30ab67ab 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -16,6 +16,32 @@ using namespace Gen;
 static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 static u32 GC_ALIGNED16(float_buffer);
 
+void EmuCodeBlock::LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src)
+{
+	if (cpu_info.bMOVBE)
+	{
+		MOVBE(size, R(dst), src);
+	}
+	else
+	{
+		MOV(size, R(dst), src);
+		BSWAP(size, dst);
+	}
+}
+
+void EmuCodeBlock::SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src)
+{
+	if (cpu_info.bMOVBE)
+	{
+		MOVBE(size, dst, R(src));
+	}
+	else
+	{
+		BSWAP(size, src);
+		MOV(size, dst, R(src));
+	}
+}
+
 void EmuCodeBlock::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
 {
 #if _M_X86_64
@@ -513,12 +539,15 @@
 	}
 }
 
-void EmuCodeBlock::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address)
+void EmuCodeBlock::WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap)
 {
 #if _M_X86_64
-	MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
+	if (swap)
+		SwapAndStore(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg);
+	else
+		MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), R(arg));
 #else
-	MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), arg);
+	MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), R(arg));
 #endif
 }
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index b452ca4741..24fa76a536 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -25,6 +25,9 @@ namespace MMIO { class Mapping; }
 class EmuCodeBlock : public Gen::X64CodeBlock
 {
 public:
+	void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src);
+	void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);
+
 	void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
 	void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset);
 	// these return the address of the MOV, for backpatching
@@ -47,7 +50,7 @@
 	// Trashes both inputs and EAX.
 	void SafeWriteFloatToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, u32 registersInUse, int flags = 0);
 
-	void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address);
+	void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
 	void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
 	void JitClearCA();
 	void JitSetCA();
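Whichever instruction sequence gets emitted, the two helpers implement a big-endian memory access on a little-endian host -- GameCube/Wii memory is big-endian, so every load or store the JIT produces needs this swap. The runtime effect, expressed in plain C++ for reference (a sketch assuming GCC/Clang's __builtin_bswap32; LoadBE32/StoreBE32 are illustrative names, not Dolphin functions):

    #include <cstdint>
    #include <cstring>

    // What LoadAndSwap(32, dst, src) computes: load, then byte-reverse.
    static uint32_t LoadBE32(const void* src)
    {
        uint32_t v;
        std::memcpy(&v, src, sizeof(v));
        return __builtin_bswap32(v);
    }

    // What SwapAndStore(32, dst, src) computes: byte-reverse, then store.
    static void StoreBE32(void* dst, uint32_t v)
    {
        v = __builtin_bswap32(v);
        std::memcpy(dst, &v, sizeof(v));
    }

One behavioral detail: on the BSWAP fallback path, SwapAndStore reverses the source register in place and does not restore it, so callers must treat that register as clobbered; the MOVBE path leaves it intact.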
From 6cb42859d4ba3d6e1b72ea2685abdd382c33c233 Mon Sep 17 00:00:00 2001
From: Pierre Bourdon
Date: Sun, 16 Mar 2014 03:44:03 +0100
Subject: [PATCH 4/5] JitAsmCommon: Use MOVBE everywhere it matters (!x86 only
 code, !old CPU support code).

---
 .../Core/PowerPC/JitCommon/JitAsmCommon.cpp | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
index 2856feec2a..5ef17670b9 100644
--- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp
@@ -21,15 +21,11 @@ void CommonAsmRoutines::GenFifoWrite(int size)
 	PUSH(ESI);
 	if (size != 32)
 		PUSH(EDX);
-	BSWAP(size, ABI_PARAM1);
 	MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
 	MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
-	if (size != 32) {
-		MOV(32, R(EDX), R(ABI_PARAM1));
-		MOV(size, MComplex(RAX, RSI, 1, 0), R(EDX));
-	} else {
-		MOV(size, MComplex(RAX, RSI, 1, 0), R(ABI_PARAM1));
-	}
+
+	SwapAndStore(size, MComplex(RAX, RSI, 1, 0), ABI_PARAM1);
+
 	ADD(32, R(ESI), Imm8(size >> 3));
 	MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
 	if (size != 32)
@@ -45,10 +41,9 @@ void CommonAsmRoutines::GenFifoFloatWrite()
 	PUSH(EDX);
 	MOVSS(M(&temp32), XMM0);
 	MOV(32, R(EDX), M(&temp32));
-	BSWAP(32, EDX);
 	MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe));
 	MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount));
-	MOV(32, MComplex(RAX, RSI, 1, 0), R(EDX));
+	SwapAndStore(32, MComplex(RAX, RSI, 1, 0), EDX);
 	ADD(32, R(ESI), Imm8(4));
 	MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI));
 	POP(EDX);
@@ -150,8 +145,7 @@ void CommonAsmRoutines::GenQuantizedStores()
 	TEST(32, R(ECX), Imm32(0x0C000000));
 	FixupBranch too_complex = J_CC(CC_NZ, true);
 	MOV(64, R(RAX), M(&psTemp[0]));
-	BSWAP(64, RAX);
-	MOV(64, MComplex(RBX, RCX, SCALE_1, 0), R(RAX));
+	SwapAndStore(64, MComplex(RBX, RCX, SCALE_1, 0), RAX);
 	FixupBranch skip_complex = J(true);
 	SetJumpTarget(too_complex);
 	ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
@@ -371,8 +365,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
 		PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
 	} else {
 #if _M_X86_64
-		MOV(64, R(RCX), MComplex(RBX, RCX, 1, 0));
-		BSWAP(64, RCX);
+		LoadAndSwap(64, RCX, MComplex(RBX, RCX, 1, 0));
 		ROL(64, R(RCX), Imm8(32));
 		MOVQ_xmm(XMM0, R(RCX));
 #else

From 745fe14269eaf08cd0773c2cb5e9536670484f72 Mon Sep 17 00:00:00 2001
From: Pierre Bourdon
Date: Sun, 16 Mar 2014 04:08:51 +0100
Subject: [PATCH 5/5] Jit64: Use LoadAndSwap/SwapAndStore where it makes sense.

---
 Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp  | 13 +++++--------
 .../Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp  |  3 +--
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
index a6ea29ca7f..b30a847181 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
@@ -314,8 +314,7 @@ void Jit64::stX(UGeckoInstruction inst)
 		else if (Memory::IsRAMAddress(addr))
 		{
 			MOV(32, R(EAX), gpr.R(s));
-			BSWAP(accessSize, EAX);
-			WriteToConstRamAddress(accessSize, R(EAX), addr);
+			WriteToConstRamAddress(accessSize, EAX, addr, true);
 			if (update)
 				gpr.SetImmediate32(a, addr);
 			return;
@@ -344,10 +343,10 @@ void Jit64::stX(UGeckoInstruction inst)
 			gpr.FlushLockX(ABI_PARAM1);
 			MOV(32, R(ABI_PARAM1), gpr.R(a));
 			MOV(32, R(EAX), gpr.R(s));
-			BSWAP(32, EAX);
 #if _M_X86_64
-			MOV(accessSize, MComplex(RBX, ABI_PARAM1, SCALE_1, (u32)offset), R(EAX));
+			SwapAndStore(accessSize, MComplex(RBX, ABI_PARAM1, SCALE_1, (u32)offset), EAX);
 #else
+			BSWAP(32, EAX);
 			AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
 			MOV(accessSize, MDisp(ABI_PARAM1, (u32)Memory::base + (u32)offset), R(EAX));
 #endif
@@ -456,8 +455,7 @@ void Jit64::lmw(UGeckoInstruction inst)
 		ADD(32, R(EAX), gpr.R(inst.RA));
 	for (int i = inst.RD; i < 32; i++)
 	{
-		MOV(32, R(ECX), MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
-		BSWAP(32, ECX);
+		LoadAndSwap(32, ECX, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4));
 		gpr.BindToRegister(i, false, true);
 		MOV(32, gpr.R(i), R(ECX));
 	}
@@ -481,8 +479,7 @@ void Jit64::stmw(UGeckoInstruction inst)
 	for (int i = inst.RD; i < 32; i++)
 	{
 		MOV(32, R(ECX), gpr.R(i));
-		BSWAP(32, ECX);
-		MOV(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), R(ECX));
+		SwapAndStore(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), ECX);
 	}
 	gpr.UnlockAllX();
 #else
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
index 1a0b03ad3d..cd0261e158 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp
@@ -96,8 +96,7 @@ void Jit64::lfd(UGeckoInstruction inst)
 		MOVSD(xd, R(XMM0));
 	} else {
 #if _M_X86_64
-		MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
-		BSWAP(64, EAX);
+		LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
 		MOV(64, M(&temp64), R(EAX));
 
 		MEMCHECK_START
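The shape of the call-site conversions in these last two patches, shown schematically against the helpers from patch 3 (a sketch; the operands are placeholders rather than lines from the tree, with RBX holding the emulated-memory base as in the hunks above):

    using namespace Gen;

    void Before(EmuCodeBlock& code)
    {
        code.MOV(32, R(ECX), MDisp(RBX, 0));   // load
        code.BSWAP(32, ECX);                   // swap
        code.BSWAP(32, EDX);                   // swap (clobbers EDX)
        code.MOV(32, MDisp(RBX, 4), R(EDX));   // store
    }

    void After(EmuCodeBlock& code)
    {
        code.LoadAndSwap(32, ECX, MDisp(RBX, 0));    // one MOVBE on capable CPUs
        code.SwapAndStore(32, MDisp(RBX, 4), EDX);   // likewise; EDX survives when MOVBE is used
    }

On MOVBE-capable hardware each pair collapses into a single instruction; on older CPUs the helpers expand to the familiar MOV/BSWAP sequences, so the generated code is no worse than before.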