From 43d56febc488cb26f0a2269a96265f2d15a68659 Mon Sep 17 00:00:00 2001 From: magumagu Date: Thu, 11 Dec 2014 14:12:20 -0800 Subject: [PATCH 1/2] JIT: use fastmem loads in MMU mode. Even in games that require MMU mode, loads outside the area specified by the BAT are rare, so fastmem is a substantial improvement. All of the interesting changes are in the backpatch handler, to make it generate DSI exceptions correctly. --- Source/Core/Core/HW/Memmap.h | 7 +++ Source/Core/Core/HW/MemmapFunctions.cpp | 52 ++++++++++++++++--- .../Core/PowerPC/JitCommon/JitAsmCommon.cpp | 20 +++---- .../Core/PowerPC/JitCommon/JitBackpatch.cpp | 18 ++++++- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 3 +- .../PowerPC/JitCommon/TrampolineCache.cpp | 45 ++++++++++------ 6 files changed, 110 insertions(+), 35 deletions(-) diff --git a/Source/Core/Core/HW/Memmap.h b/Source/Core/Core/HW/Memmap.h index 8c6a00c128..8a8b5b42e1 100644 --- a/Source/Core/Core/HW/Memmap.h +++ b/Source/Core/Core/HW/Memmap.h @@ -96,6 +96,13 @@ u16 Read_U16(const u32 _Address); u32 Read_U32(const u32 _Address); u64 Read_U64(const u32 _Address); +u32 Read_S8_Val(u32 address, u32 val); +u32 Read_U8_Val(u32 address, u32 val); +u32 Read_S16_Val(u32 address, u32 val); +u32 Read_U16_Val(u32 address, u32 val); +u32 Read_U32_Val(u32 address, u32 val); +u64 Read_U64_Val(u32 address, u64 val); + // Useful helper functions, used by ARM JIT float Read_F32(const u32 _Address); double Read_F64(const u32 _Address); diff --git a/Source/Core/Core/HW/MemmapFunctions.cpp b/Source/Core/Core/HW/MemmapFunctions.cpp index 810a45a00d..50cf3768f3 100644 --- a/Source/Core/Core/HW/MemmapFunctions.cpp +++ b/Source/Core/Core/HW/MemmapFunctions.cpp @@ -57,10 +57,12 @@ GXPeekZ // ---------------- // Overloaded byteswap functions, for use within the templated functions below. 
-inline u8 bswap(u8 val) {return val;} -inline u16 bswap(u16 val) {return Common::swap16(val);} -inline u32 bswap(u32 val) {return Common::swap32(val);} -inline u64 bswap(u64 val) {return Common::swap64(val);} +inline u8 bswap(u8 val) { return val; } +inline s8 bswap(s8 val) { return val; } +inline u16 bswap(u16 val) { return Common::swap16(val); } +inline s16 bswap(s16 val) { return Common::swap16(val); } +inline u32 bswap(u32 val) { return Common::swap32(val); } +inline u64 bswap(u64 val) { return Common::swap64(val); } // ================= @@ -89,8 +91,8 @@ static u32 EFB_Read(const u32 addr) static void GenerateDSIException(u32 _EffectiveAddress, bool _bWrite); -template <typename T> -inline void ReadFromHardware(T &_var, const u32 em_address, Memory::XCheckTLBFlag flag) +template <typename T, typename U> +inline void ReadFromHardware(U &_var, const u32 em_address, Memory::XCheckTLBFlag flag) { // TODO: Figure out the fastest order of tests for both read and write (they are probably different). if ((em_address & 0xC8000000) == 0xC8000000) @@ -98,7 +100,7 @@ inline void ReadFromHardware(T &_var, const u32 em_address, Memory::XCheckTLBFla if (em_address < 0xcc000000) _var = EFB_Read(em_address); else - _var = mmio_mapping->Read<T>(em_address); + _var = (T)mmio_mapping->Read<typename std::make_unsigned<T>::type>(em_address); } else if (((em_address & 0xF0000000) == 0x80000000) || ((em_address & 0xF0000000) == 0xC0000000) || @@ -449,6 +451,42 @@ float Read_F32(const u32 _Address) return cvt.d; } +u32 Read_U8_Val(u32 address, u32 val) +{ + ReadFromHardware<u8>(val, address, FLAG_READ); + return val; +} + +u32 Read_S8_Val(u32 address, u32 val) +{ + ReadFromHardware<s8>(val, address, FLAG_READ); + return val; +} + +u32 Read_U16_Val(u32 address, u32 val) +{ + ReadFromHardware<u16>(val, address, FLAG_READ); + return val; +} + +u32 Read_S16_Val(u32 address, u32 val) +{ + ReadFromHardware<s16>(val, address, FLAG_READ); + return val; +} + +u32 Read_U32_Val(u32 address, u32 val) +{ + ReadFromHardware<u32>(val, address, FLAG_READ); + return val; +} + +u64 
Read_U64_Val(u32 address, u64 val) +{ + ReadFromHardware<u64>(val, address, FLAG_READ); + return val; +} + u32 Read_U8_ZX(const u32 _Address) { return (u32)Read_U8(_Address); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp index a4bc801586..8f106b287b 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.cpp @@ -444,7 +444,7 @@ void CommonAsmRoutines::GenQuantizedLoads() const u8* loadPairedFloatTwo = AlignCode4(); if (jit->js.memcheck) { - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG); + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); ROL(64, R(RSCRATCH_EXTRA), Imm8(32)); MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA)); } @@ -464,7 +464,7 @@ void CommonAsmRoutines::GenQuantizedLoads() const u8* loadPairedFloatOne = AlignCode4(); if (jit->js.memcheck) { - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG); + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); MOVD_xmm(XMM0, R(RSCRATCH_EXTRA)); UNPCKLPS(XMM0, M(m_one)); } @@ -486,7 +486,7 @@ void CommonAsmRoutines::GenQuantizedLoads() if (jit->js.memcheck) { // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); ROR(16, R(RSCRATCH_EXTRA), Imm8(8)); } else @@ -512,7 +512,7 @@ void CommonAsmRoutines::GenQuantizedLoads() const u8* loadPairedU8One = AlignCode4(); if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, 
R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); else UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); @@ -525,7 +525,7 @@ void CommonAsmRoutines::GenQuantizedLoads() if (jit->js.memcheck) { // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); ROR(16, R(RSCRATCH_EXTRA), Imm8(8)); } else @@ -551,7 +551,7 @@ void CommonAsmRoutines::GenQuantizedLoads() const u8* loadPairedS8One = AlignCode4(); if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG); + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); else UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true); CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); @@ -563,7 +563,7 @@ void CommonAsmRoutines::GenQuantizedLoads() const u8* loadPairedU16Two = AlignCode4(); // TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); else UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); @@ -585,7 +585,7 @@ void CommonAsmRoutines::GenQuantizedLoads() 
const u8* loadPairedU16One = AlignCode4(); if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); else UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false); CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); @@ -596,7 +596,7 @@ void CommonAsmRoutines::GenQuantizedLoads() const u8* loadPairedS16Two = AlignCode4(); if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG); + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); else UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false); ROL(32, R(RSCRATCH_EXTRA), Imm8(16)); @@ -618,7 +618,7 @@ void CommonAsmRoutines::GenQuantizedLoads() const u8* loadPairedS16One = AlignCode4(); if (jit->js.memcheck) - SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG); + SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG); else UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true); CVTSI2SS(XMM0, R(RSCRATCH_EXTRA)); diff --git a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp index 38e298c2fd..3a693a3c71 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/JitBackpatch.cpp @@ -85,9 +85,25 @@ bool Jitx86Base::BackPatch(u32 emAddress, SContext* ctx) else bswapNopCount = 2; + int totalSize = info.instructionSize + bswapNopCount; + if (info.operandSize == 2 && !info.byteSwap) + { + if ((codePtr[totalSize] & 0xF0) == 0x40) + { + ++totalSize; + } + if 
(codePtr[totalSize] != 0xc1 || codePtr[totalSize + 2] != 0x10) + { + PanicAlert("BackPatch: didn't find expected shift %p", codePtr); + return nullptr; + } + info.signExtend = (codePtr[totalSize + 1] & 0x10) != 0; + totalSize += 3; + } + const u8 *trampoline = trampolines.GetReadTrampoline(info, registersInUse); emitter.CALL((void *)trampoline); - int padding = info.instructionSize + bswapNopCount - BACKPATCH_SIZE; + int padding = totalSize - BACKPATCH_SIZE; if (padding > 0) { emitter.NOP(padding); diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index a1ce65424d..ca217ee63d 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -296,8 +296,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress, { registersInUse[reg_value] = false; } - if (!jit->js.memcheck && - SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && + if (SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem && !opAddress.IsImm() && !(flags & (SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_FASTMEM)) #ifdef ENABLE_MEM_CHECK diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp index 63fbd20fdc..717dcd06a0 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp @@ -57,40 +57,55 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, B const u8* trampoline = GetCodePtr(); X64Reg addrReg = (X64Reg)info.scaledReg; X64Reg dataReg = (X64Reg)info.regOperandReg; + registersInUse[addrReg] = true; + registersInUse[dataReg] = false; // It's a read. Easy. // RSP alignment here is 8 due to the call. ABI_PushRegistersAndAdjustStack(registersInUse, 8); - if (addrReg != ABI_PARAM1) - MOV(32, R(ABI_PARAM1), R(addrReg)); + int dataRegSize = info.operandSize == 8 ? 
64 : 32; + + if (dataReg == ABI_PARAM1) + { + if (addrReg == ABI_PARAM2) + { + XCHG(dataRegSize, R(ABI_PARAM1), R(ABI_PARAM2)); + } + else + { + MOV(dataRegSize, R(ABI_PARAM2), R(dataReg)); + MOV(32, R(ABI_PARAM1), R(addrReg)); + } + } + else + { + if (addrReg != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R(addrReg)); + if (dataReg != ABI_PARAM2) + MOV(dataRegSize, R(ABI_PARAM2), R(dataReg)); + } if (info.displacement) ADD(32, R(ABI_PARAM1), Imm32(info.displacement)); switch (info.operandSize) { + case 8: + CALL((void *)&Memory::Read_U64_Val); + break; case 4: - CALL((void *)&Memory::Read_U32); + CALL((void *)&Memory::Read_U32_Val); break; case 2: - CALL((void *)&Memory::Read_U16); - SHL(32, R(ABI_RETURN), Imm8(16)); + CALL(info.signExtend ? (void *)&Memory::Read_S16_Val : (void *)&Memory::Read_U16_Val); break; case 1: - CALL((void *)&Memory::Read_U8); + CALL(info.signExtend ? (void *)&Memory::Read_S8_Val : (void *)&Memory::Read_U8_Val); break; } - if (info.signExtend && info.operandSize == 1) - { - // Need to sign extend value from Read_U8. - MOVSX(32, 8, dataReg, R(ABI_RETURN)); - } - else if (dataReg != EAX) - { - MOV(32, R(dataReg), R(ABI_RETURN)); - } + MOV(dataRegSize, R(dataReg), R(ABI_RETURN)); ABI_PopRegistersAndAdjustStack(registersInUse, 8); RET(); From e479606b077bbab60ace82948eae34c482ab5e25 Mon Sep 17 00:00:00 2001 From: magumagu Date: Fri, 12 Dec 2014 22:29:49 -0800 Subject: [PATCH 2/2] JIT: simplify code using MOVTwo. 
--- .../PowerPC/JitCommon/TrampolineCache.cpp | 21 +------------------ 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp index 717dcd06a0..fb1503b56d 100644 --- a/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/TrampolineCache.cpp @@ -65,26 +65,7 @@ const u8* TrampolineCache::GenerateReadTrampoline(const InstructionInfo &info, B ABI_PushRegistersAndAdjustStack(registersInUse, 8); int dataRegSize = info.operandSize == 8 ? 64 : 32; - - if (dataReg == ABI_PARAM1) - { - if (addrReg == ABI_PARAM2) - { - XCHG(dataRegSize, R(ABI_PARAM1), R(ABI_PARAM2)); - } - else - { - MOV(dataRegSize, R(ABI_PARAM2), R(dataReg)); - MOV(32, R(ABI_PARAM1), R(addrReg)); - } - } - else - { - if (addrReg != ABI_PARAM1) - MOV(32, R(ABI_PARAM1), R(addrReg)); - if (dataReg != ABI_PARAM2) - MOV(dataRegSize, R(ABI_PARAM2), R(dataReg)); - } + MOVTwo(dataRegSize, ABI_PARAM1, addrReg, ABI_PARAM2, dataReg); if (info.displacement) ADD(32, R(ABI_PARAM1), Imm32(info.displacement));