From 7ad90275934b63dcc4726a7c49ee9a64758dda3b Mon Sep 17 00:00:00 2001
From: comex
Date: Mon, 15 Sep 2014 23:03:07 -0400
Subject: [PATCH] Be pedantic about stack overflow on Linux and OS X.

Add some magic to the fault handler to handle stack overflow due to BLR
optimization, and disable the optimization if fastmem is not enabled.
---
 Source/Core/Common/MemoryUtil.cpp          |  19 ++++
 Source/Core/Common/MemoryUtil.h            |   4 +
 Source/Core/Common/x64Emitter.cpp          |   2 +
 Source/Core/Common/x64Emitter.h            |   2 +
 Source/Core/Core/PowerPC/Jit64/Jit.cpp     | 114 ++++++++++++++++++---
 Source/Core/Core/PowerPC/Jit64/Jit.h       |  13 +++
 Source/Core/Core/PowerPC/Jit64/JitAsm.cpp  |  37 +++++--
 Source/Core/Core/PowerPC/Jit64/JitAsm.h    |   4 +-
 Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp |   2 +-
 9 files changed, 174 insertions(+), 23 deletions(-)

diff --git a/Source/Core/Common/MemoryUtil.cpp b/Source/Core/Common/MemoryUtil.cpp
index f7e1d7d902..a741deef4f 100644
--- a/Source/Core/Common/MemoryUtil.cpp
+++ b/Source/Core/Common/MemoryUtil.cpp
@@ -158,6 +158,25 @@ void FreeAlignedMemory(void* ptr)
     }
 }
 
+void ReadProtectMemory(void* ptr, size_t size)
+{
+    bool error_occurred = false;
+
+#ifdef _WIN32
+    DWORD oldValue;
+    if (!VirtualProtect(ptr, size, PAGE_NOACCESS, &oldValue))
+        error_occurred = true;
+#else
+    int retval = mprotect(ptr, size, PROT_NONE);
+
+    if (retval != 0)
+        error_occurred = true;
+#endif
+
+    if (error_occurred)
+        PanicAlert("ReadProtectMemory failed!\n%s", GetLastErrorMsg());
+}
+
 void WriteProtectMemory(void* ptr, size_t size, bool allowExecute)
 {
     bool error_occurred = false;
diff --git a/Source/Core/Common/MemoryUtil.h b/Source/Core/Common/MemoryUtil.h
index 6f437fcda7..5f584f868d 100644
--- a/Source/Core/Common/MemoryUtil.h
+++ b/Source/Core/Common/MemoryUtil.h
@@ -12,8 +12,12 @@ void* AllocateMemoryPages(size_t size);
 void FreeMemoryPages(void* ptr, size_t size);
 void* AllocateAlignedMemory(size_t size,size_t alignment);
 void FreeAlignedMemory(void* ptr);
+void ReadProtectMemory(void* ptr, size_t size);
 void WriteProtectMemory(void* ptr, size_t size, bool executable = false);
 void UnWriteProtectMemory(void* ptr, size_t size, bool allowExecute = false);
 std::string MemUsage();
 
+void GuardMemoryMake(void* ptr, size_t size);
+void GuardMemoryUnmake(void* ptr, size_t size);
+
 inline int GetPageSize() { return 4096; }
diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index fa16cf2b36..75cd418379 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -1766,6 +1766,8 @@ void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI
 void XEmitter::LOCK() { Write8(0xF0); }
 void XEmitter::REP() { Write8(0xF3); }
 void XEmitter::REPNE() { Write8(0xF2); }
+void XEmitter::FSOverride() { Write8(0x64); }
+void XEmitter::GSOverride() { Write8(0x65); }
 
 void XEmitter::FWAIT()
 {
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index 8f41065668..8b655c2c42 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -467,6 +467,8 @@ public:
     void LOCK();
     void REP();
     void REPNE();
+    void FSOverride();
+    void GSOverride();
 
     // x87
     enum x87StatusWordBits {
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index d928d02927..92595f6acd 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -95,6 +95,83 @@ using namespace PowerPC;
    and such, but it's currently limited to integer ops only.
   This can definitely be made better. */
 
+// The BLR optimization is nice, but it means that JITted code can overflow the
+// native stack by repeatedly running BL. (The chance of this happening in any
+// retail game is close to 0, but correctness is correctness...) Also, the
+// overflow might not happen directly in the JITted code but in a C++ function
+// called from it, so we can't just adjust RSP in the case of a fault.
+// Instead, we have to keep extra stack space preallocated under the fault
+// point, which allows the code to continue, after wiping the JIT cache so we
+// can reset things at a safe point. Once this condition trips, the
+// optimization is permanently disabled, under the assumption that this will
+// never happen in practice.
+
+// On Unix, we just mark an appropriate region of the stack as PROT_NONE and
+// handle it the same way as fastmem faults. It's safe to take a fault with a
+// bad RSP, because on Linux we can use sigaltstack and on OS X we're already
+// on a separate thread.
+
+// On Windows, the OS gets upset if RSP doesn't work, and I don't know any
+// equivalent of sigaltstack. Windows supports guard pages which, when
+// accessed, immediately turn into regular pages but cause a trap... but
+// putting them in the path of RSP just leads to something (in the kernel?)
+// thinking a regular stack extension is required. So this protection is not
+// supported on Windows yet... We still use a separate stack for the sake of
+// simplicity.
+
+enum
+{
+    STACK_SIZE = 2 * 1024 * 1024,
+    SAFE_STACK_SIZE = 512 * 1024,
+    GUARD_SIZE = 0x10000, // two guards - bottom (permanent) and middle (see above)
+    GUARD_OFFSET = STACK_SIZE - SAFE_STACK_SIZE - GUARD_SIZE,
+};
+
+void Jit64::AllocStack()
+{
+#if !defined(_WIN32)
+    m_stack = (u8*)AllocateMemoryPages(STACK_SIZE);
+    ReadProtectMemory(m_stack, GUARD_SIZE);
+    ReadProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE);
+#endif
+}
+
+void Jit64::FreeStack()
+{
+#if !defined(_WIN32)
+    if (m_stack)
+    {
+        FreeMemoryPages(m_stack, STACK_SIZE);
+        m_stack = nullptr;
+    }
+#endif
+}
+
+bool Jit64::HandleFault(uintptr_t access_address, SContext* ctx)
+{
+    uintptr_t stack = (uintptr_t)m_stack, diff = access_address - stack;
+    // In the trap region?
+    if (stack && diff >= GUARD_OFFSET && diff < GUARD_OFFSET + GUARD_SIZE)
+    {
+        WARN_LOG(POWERPC, "BLR cache disabled due to excessive BL in the emulated program.");
+        m_enable_blr_optimization = false;
+        UnWriteProtectMemory(m_stack + GUARD_OFFSET, GUARD_SIZE);
+        // We're going to need to clear the whole cache to get rid of the bad
+        // CALLs, but we can't yet. Fake the downcount so we're forced to the
+        // dispatcher (no block linking), and clear the cache so we're sent to
+        // Jit. Yeah, it's kind of gross.
+        GetBlockCache()->InvalidateICache(0, 0xffffffff);
+        CoreTiming::ForceExceptionCheck(0);
+        m_clear_cache_asap = true;
+
+        return true;
+    }
+
+    return Jitx86Base::HandleFault(access_address, ctx);
+}
+
+
 void Jit64::Init()
 {
     jo.optimizeStack = true;
@@ -130,8 +207,18 @@ void Jit64::Init()
     trampolines.Init();
     AllocCodeSpace(CODE_SIZE);
+
+    // BLR optimization has the same consequences as block linking, as well as
+    // depending on the fault handler to be safe in the event of excessive BL.
+    m_enable_blr_optimization = jo.enableBlocklink && SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem;
+    m_clear_cache_asap = false;
+
+    m_stack = nullptr;
+    if (m_enable_blr_optimization)
+        AllocStack();
+
     blocks.Init();
-    asm_routines.Init();
+    asm_routines.Init(m_stack ? (m_stack + STACK_SIZE) : nullptr);
 
     // important: do this *after* generating the global asm routines, because we can't use farcode in them.
     // it'll crash because the farcode functions get cleared on JIT clears.
@@ -155,6 +242,7 @@ void Jit64::ClearCache()
 
 void Jit64::Shutdown()
 {
+    FreeStack();
     FreeCodeSpace();
 
     blocks.Shutdown();
@@ -251,11 +339,8 @@ bool Jit64::Cleanup()
 
 void Jit64::WriteExit(u32 destination, bool bl, u32 after)
 {
-    // BLR optimization has similar consequences to block linking.
-    if (!jo.enableBlocklink)
-    {
+    if (!m_enable_blr_optimization)
         bl = false;
-    }
 
     Cleanup();
@@ -313,17 +398,17 @@ void Jit64::JustWriteExit(u32 destination, bool bl, u32 after)
 
 void Jit64::WriteExitDestInRSCRATCH(bool bl, u32 after)
 {
-    if (!jo.enableBlocklink)
-    {
+    if (!m_enable_blr_optimization)
         bl = false;
-    }
+
+    MOV(32, PPCSTATE(pc), R(RSCRATCH));
+    Cleanup();
+
     if (bl)
     {
         MOV(32, R(RSCRATCH2), Imm32(after));
         PUSH(RSCRATCH2);
     }
-    MOV(32, PPCSTATE(pc), R(RSCRATCH));
-    Cleanup();
+
     SUB(32, PPCSTATE(downcount), Imm32(js.downcountAmount));
     if (bl)
     {
@@ -339,7 +424,7 @@ void Jit64::WriteExitDestInRSCRATCH(bool bl, u32 after)
 
 void Jit64::WriteBLRExit()
 {
-    if (!jo.enableBlocklink)
+    if (!m_enable_blr_optimization)
     {
         WriteExitDestInRSCRATCH();
         return;
@@ -428,8 +513,11 @@ void Jit64::Trace()
 
 void STACKALIGN Jit64::Jit(u32 em_address)
 {
-    if (GetSpaceLeft() < 0x10000 || farcode.GetSpaceLeft() < 0x10000 || blocks.IsFull() ||
-        SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache)
+    if (GetSpaceLeft() < 0x10000 ||
+        farcode.GetSpaceLeft() < 0x10000 ||
+        blocks.IsFull() ||
+        SConfig::GetInstance().m_LocalCoreStartupParameter.bJITNoBlockCache ||
+        m_clear_cache_asap)
     {
         ClearCache();
     }
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index cface00cb3..0391d258cc 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -18,6 +18,10 @@
 // ----------
 #pragma once
 
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
 #include "Common/x64ABI.h"
 #include "Common/x64Analyzer.h"
 #include "Common/x64Emitter.h"
@@ -40,6 +44,9 @@
 class Jit64 : public Jitx86Base
 {
 private:
+    void AllocStack();
+    void FreeStack();
+
     GPRRegCache gpr;
     FPURegCache fpr;
 
@@ -48,6 +55,10 @@ private:
     PPCAnalyst::CodeBuffer code_buffer;
     Jit64AsmRoutineManager asm_routines;
 
+    bool m_enable_blr_optimization;
+    bool m_clear_cache_asap;
+    u8* m_stack;
+
 public:
     Jit64() : code_buffer(32000) {}
     ~Jit64() {}
@@ -55,6 +66,8 @@
     void Init() override;
     void Shutdown() override;
 
+    bool HandleFault(uintptr_t access_address, SContext* ctx) override;
+
     // Jit!
     void Jit(u32 em_address) override;
diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
index dc307540f6..dcfffaa3e9 100644
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.cpp
@@ -23,8 +23,18 @@
     // for the shadow region before calls in this function. This call will
     // waste a bit of space for a second shadow, but whatever.
     ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, /*frame*/ 16);
+    if (m_stack_top)
+    {
+        // Pivot the stack to our custom one.
+        MOV(64, R(RSCRATCH), R(RSP));
+        MOV(64, R(RSP), Imm64((u64)m_stack_top - 0x20));
+        MOV(64, MDisp(RSP, 0x18), R(RSCRATCH));
+    }
+    else
+    {
+        MOV(64, M(&s_saved_rsp), R(RSP));
+    }
     // something that can't pass the BLR test
-    MOV(64, M(&s_saved_rsp), R(RSP));
     MOV(64, MDisp(RSP, 8), Imm32((u32)-1));
 
     // Two statically allocated registers.
@@ -46,7 +56,10 @@ void Jit64AsmRoutineManager::Generate()
     ABI_PopRegistersAndAdjustStack(1 << RSCRATCH, 0);
 #endif
 
-    MOV(64, R(RSP), M(&s_saved_rsp));
+    if (m_stack_top)
+        MOV(64, R(RSP), Imm64((u64)m_stack_top - 0x20));
+    else
+        MOV(64, R(RSP), M(&s_saved_rsp));
 
     SUB(32, PPCSTATE(downcount), R(RSCRATCH));
 
@@ -55,6 +68,8 @@
     // IMPORTANT - We jump on negative, not carry!!!
     FixupBranch bail = J_CC(CC_BE, true);
 
+    FixupBranch dbg_exit;
+
     if (SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging)
     {
         TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(PowerPC::CPU_STEPPING));
@@ -63,11 +78,7 @@
         ABI_CallFunction(reinterpret_cast<void*>(&PowerPC::CheckBreakPoints));
         ABI_PopRegistersAndAdjustStack(0, 0);
         TEST(32, M((void*)PowerPC::GetStatePtr()), Imm32(0xFFFFFFFF));
-        FixupBranch noBreakpoint = J_CC(CC_Z);
-        MOV(64, R(RSP), M(&s_saved_rsp));
-        ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
-        RET();
-        SetJumpTarget(noBreakpoint);
+        dbg_exit = J_CC(CC_NZ);
         SetJumpTarget(notStepping);
     }
 
@@ -155,7 +166,17 @@
     J_CC(CC_Z, outerLoop);
 
     //Landing pad for drec space
-    MOV(64, R(RSP), M(&s_saved_rsp));
+    if (SConfig::GetInstance().m_LocalCoreStartupParameter.bEnableDebugging)
+        SetJumpTarget(dbg_exit);
+    if (m_stack_top)
+    {
+        MOV(64, R(RSP), Imm64((u64)m_stack_top - 0x8));
+        POP(RSP);
+    }
+    else
+    {
+        MOV(64, R(RSP), M(&s_saved_rsp));
+    }
     ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
     RET();
diff --git a/Source/Core/Core/PowerPC/Jit64/JitAsm.h b/Source/Core/Core/PowerPC/Jit64/JitAsm.h
index e3cc4371f7..9272f5c8aa 100644
--- a/Source/Core/Core/PowerPC/Jit64/JitAsm.h
+++ b/Source/Core/Core/PowerPC/Jit64/JitAsm.h
@@ -25,10 +25,12 @@ class Jit64AsmRoutineManager : public CommonAsmRoutines
 private:
     void Generate();
     void GenerateCommon();
+    u8* m_stack_top;
 
 public:
-    void Init()
+    void Init(u8* stack_top)
     {
+        m_stack_top = stack_top;
         AllocCodeSpace(8192);
         Generate();
         WriteProtect();
diff --git a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp
index 81260249c7..9f9f9cf98c 100644
--- a/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp
+++ b/Source/Core/Core/PowerPC/Jit64IL/JitIL.cpp
@@ -272,7 +272,7 @@ void JitIL::Init()
     trampolines.Init();
     AllocCodeSpace(CODE_SIZE);
     blocks.Init();
-    asm_routines.Init();
+    asm_routines.Init(nullptr);
     farcode.Init(js.memcheck ? FARCODE_SIZE_MMU : FARCODE_SIZE);
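
For reference, the Unix mechanism the comment block in Jit.cpp relies on can be demonstrated outside Dolphin. Below is a minimal, Linux-flavored sketch that is not part of this patch; names such as g_region, g_page, and segv_handler, and the sizes used, are invented for illustration. It reserves a region, marks one page PROT_NONE the way ReadProtectMemory does, installs a SIGSEGV handler on an alternate stack via sigaltstack, and, when the guard trips, lifts the protection and lets the faulting instruction retry, which is the same recover-and-continue pattern Jit64::HandleFault implements.

// stack_guard_sketch.cpp - illustrative only; not Dolphin code.
#include <signal.h>
#include <cstdint>
#include <cstdio>
#include <sys/mman.h>
#include <unistd.h>

static uint8_t* g_region = nullptr;  // stand-in for Jit64::m_stack
static size_t g_page = 0;            // stand-in for GUARD_SIZE (one page here)

static void segv_handler(int, siginfo_t* info, void*)
{
    uintptr_t addr = (uintptr_t)info->si_addr;
    uintptr_t guard = (uintptr_t)g_region;
    if (addr >= guard && addr < guard + g_page)
    {
        // Fault hit our guard page: lift the protection (this sketch's analogue
        // of the UnWriteProtectMemory call in Jit64::HandleFault) and return,
        // which restarts the faulting instruction.
        mprotect(g_region, g_page, PROT_READ | PROT_WRITE);
        return;
    }
    _exit(1);  // not our fault; give up
}

int main()
{
    g_page = (size_t)sysconf(_SC_PAGESIZE);

    // Alternate signal stack, so the handler can run even if RSP points at or
    // near the guarded region - the sigaltstack trick the comment mentions.
    static uint8_t alt[64 * 1024];
    stack_t ss = {};
    ss.ss_sp = alt;
    ss.ss_size = sizeof(alt);
    if (sigaltstack(&ss, nullptr) != 0)
        return 1;

    struct sigaction sa = {};
    sa.sa_sigaction = segv_handler;
    sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
    sigemptyset(&sa.sa_mask);
    if (sigaction(SIGSEGV, &sa, nullptr) != 0)
        return 1;

    // Reserve a region and mark its first page PROT_NONE, like ReadProtectMemory.
    void* mem = mmap(nullptr, 4 * g_page, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (mem == MAP_FAILED)
        return 1;
    g_region = (uint8_t*)mem;
    mprotect(g_region, g_page, PROT_NONE);

    g_region[0] = 42;  // trips the guard; the handler unprotects and the write retries
    printf("recovered, value = %d\n", g_region[0]);
    return 0;
}

The alternate signal stack is the detail that makes this safe when the stack pointer itself is unusable: the handler gets its own stack to run on, which is why the comment notes that Linux can rely on sigaltstack while OS X already takes these faults on a separate thread.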