From f79520a90601e20c887055761cd6c7ab0463f329 Mon Sep 17 00:00:00 2001 From: mitaclaw <140017135+mitaclaw@users.noreply.github.com> Date: Tue, 23 Apr 2024 09:28:49 -0700 Subject: [PATCH 1/5] Cached Interpreter 2.0 It now supports variable-sized data payloads and memory range freeing. It's a little faster, too. --- Source/Core/Common/CodeBlock.h | 7 +- Source/Core/Core/CMakeLists.txt | 6 +- .../CachedInterpreter/CachedInterpreter.cpp | 376 +++++++++--------- .../CachedInterpreter/CachedInterpreter.h | 106 ++++- .../CachedInterpreterBlockCache.cpp | 41 ++ .../CachedInterpreterBlockCache.h | 35 ++ .../CachedInterpreterEmitter.cpp | 39 ++ .../CachedInterpreterEmitter.h | 84 ++++ .../InterpreterBlockCache.cpp | 14 - .../CachedInterpreter/InterpreterBlockCache.h | 17 - Source/Core/DolphinLib.props | 6 +- 11 files changed, 479 insertions(+), 252 deletions(-) create mode 100644 Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterBlockCache.cpp create mode 100644 Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterBlockCache.h create mode 100644 Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterEmitter.cpp create mode 100644 Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterEmitter.h delete mode 100644 Source/Core/Core/PowerPC/CachedInterpreter/InterpreterBlockCache.cpp delete mode 100644 Source/Core/Core/PowerPC/CachedInterpreter/InterpreterBlockCache.h diff --git a/Source/Core/Common/CodeBlock.h b/Source/Core/Common/CodeBlock.h index 9efc82edeb..8c6291dc63 100644 --- a/Source/Core/Common/CodeBlock.h +++ b/Source/Core/Common/CodeBlock.h @@ -17,7 +17,7 @@ namespace Common // having to prefix them with gen-> or something similar. // Example implementation: // class JIT : public CodeBlock {} -template +template class CodeBlock : public T { private: @@ -53,7 +53,10 @@ public: { region_size = size; total_region_size = size; - region = static_cast(Common::AllocateExecutableMemory(total_region_size)); + if constexpr (executable) + region = static_cast(Common::AllocateExecutableMemory(total_region_size)); + else + region = static_cast(Common::AllocateMemoryPages(total_region_size)); T::SetCodePtr(region, region + size); } diff --git a/Source/Core/Core/CMakeLists.txt b/Source/Core/Core/CMakeLists.txt index 54a2e18db5..6b0022d63c 100644 --- a/Source/Core/Core/CMakeLists.txt +++ b/Source/Core/Core/CMakeLists.txt @@ -481,8 +481,10 @@ add_library(core PowerPC/BreakPoints.h PowerPC/CachedInterpreter/CachedInterpreter.cpp PowerPC/CachedInterpreter/CachedInterpreter.h - PowerPC/CachedInterpreter/InterpreterBlockCache.cpp - PowerPC/CachedInterpreter/InterpreterBlockCache.h + PowerPC/CachedInterpreter/CachedInterpreterBlockCache.cpp + PowerPC/CachedInterpreter/CachedInterpreterBlockCache.h + PowerPC/CachedInterpreter/CachedInterpreterEmitter.cpp + PowerPC/CachedInterpreter/CachedInterpreterEmitter.h PowerPC/ConditionRegister.cpp PowerPC/ConditionRegister.h PowerPC/Expression.cpp diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp index f8b5c6f53b..7e740b5de9 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp @@ -6,6 +6,7 @@ #include "Common/CommonTypes.h" #include "Common/Logging/Log.h" #include "Core/ConfigManager.h" +#include "Core/Core.h" #include "Core/CoreTiming.h" #include "Core/HLE/HLE.h" #include "Core/HW/CPU.h" @@ -16,65 +17,7 @@ #include "Core/PowerPC/PowerPC.h" #include "Core/System.h" -struct CachedInterpreter::Instruction -{ - using CommonCallback = void (*)(UGeckoInstruction); - using ConditionalCallback = bool (*)(u32); - using InterpreterCallback = void (*)(Interpreter&, UGeckoInstruction); - using CachedInterpreterCallback = void (*)(CachedInterpreter&, UGeckoInstruction); - using ConditionalCachedInterpreterCallback = bool (*)(CachedInterpreter&, u32); - - Instruction() {} - Instruction(const CommonCallback c, UGeckoInstruction i) - : common_callback(c), data(i.hex), type(Type::Common) - { - } - - Instruction(const ConditionalCallback c, u32 d) - : conditional_callback(c), data(d), type(Type::Conditional) - { - } - - Instruction(const InterpreterCallback c, UGeckoInstruction i) - : interpreter_callback(c), data(i.hex), type(Type::Interpreter) - { - } - - Instruction(const CachedInterpreterCallback c, UGeckoInstruction i) - : cached_interpreter_callback(c), data(i.hex), type(Type::CachedInterpreter) - { - } - - Instruction(const ConditionalCachedInterpreterCallback c, u32 d) - : conditional_cached_interpreter_callback(c), data(d), - type(Type::ConditionalCachedInterpreter) - { - } - - enum class Type - { - Abort, - Common, - Conditional, - Interpreter, - CachedInterpreter, - ConditionalCachedInterpreter, - }; - - union - { - const CommonCallback common_callback = nullptr; - const ConditionalCallback conditional_callback; - const InterpreterCallback interpreter_callback; - const CachedInterpreterCallback cached_interpreter_callback; - const ConditionalCachedInterpreterCallback conditional_cached_interpreter_callback; - }; - - u32 data = 0; - Type type = Type::Abort; -}; - -CachedInterpreter::CachedInterpreter(Core::System& system) : JitBase(system) +CachedInterpreter::CachedInterpreter(Core::System& system) : JitBase(system), m_block_cache(*this) { } @@ -84,7 +27,8 @@ void CachedInterpreter::Init() { RefreshConfig(); - m_code.reserve(CODE_SIZE / sizeof(Instruction)); + AllocCodeSpace(CODE_SIZE); + ResetFreeMemoryRanges(); jo.enableBlocklink = false; @@ -100,11 +44,6 @@ void CachedInterpreter::Shutdown() m_block_cache.Shutdown(); } -u8* CachedInterpreter::GetCodePtr() -{ - return reinterpret_cast(m_code.data() + m_code.size()); -} - void CachedInterpreter::ExecuteOneBlock() { const u8* normal_entry = m_block_cache.Dispatch(); @@ -114,50 +53,23 @@ void CachedInterpreter::ExecuteOneBlock() return; } - const Instruction* code = reinterpret_cast(normal_entry); - auto& interpreter = m_system.GetInterpreter(); - - for (; code->type != Instruction::Type::Abort; ++code) + auto& ppc_state = m_ppc_state; + while (true) { - switch (code->type) - { - case Instruction::Type::Common: - code->common_callback(UGeckoInstruction(code->data)); + const auto callback = *reinterpret_cast(normal_entry); + if (const auto distance = callback(ppc_state, normal_entry + sizeof(callback))) + normal_entry += distance; + else break; - - case Instruction::Type::Conditional: - if (code->conditional_callback(code->data)) - return; - break; - - case Instruction::Type::Interpreter: - code->interpreter_callback(interpreter, UGeckoInstruction(code->data)); - break; - - case Instruction::Type::CachedInterpreter: - code->cached_interpreter_callback(*this, UGeckoInstruction(code->data)); - break; - - case Instruction::Type::ConditionalCachedInterpreter: - if (code->conditional_cached_interpreter_callback(*this, code->data)) - return; - break; - - default: - ERROR_LOG_FMT(POWERPC, "Unknown CachedInterpreter Instruction: {}", - static_cast(code->type)); - break; - } } } void CachedInterpreter::Run() { auto& core_timing = m_system.GetCoreTiming(); - auto& cpu = m_system.GetCPU(); - const CPU::State* state_ptr = cpu.GetStatePtr(); - while (cpu.GetState() == CPU::State::Running) + const CPU::State* state_ptr = m_system.GetCPU().GetStatePtr(); + while (*state_ptr == CPU::State::Running) { // Start new timing slice // NOTE: Exceptions may change PC @@ -177,93 +89,105 @@ void CachedInterpreter::SingleStep() ExecuteOneBlock(); } -void CachedInterpreter::EndBlock(CachedInterpreter& cached_interpreter, UGeckoInstruction data) +s32 CachedInterpreter::EndBlock(PowerPC::PowerPCState& ppc_state, const EndBlockOperands& operands) { - auto& ppc_state = cached_interpreter.m_ppc_state; + const auto& [downcount, num_load_stores, num_fp_inst] = operands; ppc_state.pc = ppc_state.npc; - ppc_state.downcount -= data.hex; - PowerPC::UpdatePerformanceMonitor(data.hex, 0, 0, ppc_state); + ppc_state.downcount -= downcount; + PowerPC::UpdatePerformanceMonitor(downcount, num_load_stores, num_fp_inst, ppc_state); + return 0; } -void CachedInterpreter::UpdateNumLoadStoreInstructions(CachedInterpreter& cached_interpreter, - UGeckoInstruction data) +s32 CachedInterpreter::Interpret(PowerPC::PowerPCState& ppc_state, + const InterpretOperands& operands) { - PowerPC::UpdatePerformanceMonitor(0, data.hex, 0, cached_interpreter.m_ppc_state); + const auto& [interpreter, func, current_pc, inst] = operands; + func(interpreter, inst); + return sizeof(AnyCallback) + sizeof(operands); } -void CachedInterpreter::UpdateNumFloatingPointInstructions(CachedInterpreter& cached_interpreter, - UGeckoInstruction data) +s32 CachedInterpreter::HLEFunction(PowerPC::PowerPCState& ppc_state, + const HLEFunctionOperands& operands) { - PowerPC::UpdatePerformanceMonitor(0, 0, data.hex, cached_interpreter.m_ppc_state); + const auto& [system, current_pc, hook_index] = operands; + HLE::Execute(Core::CPUThreadGuard{system}, current_pc, hook_index); + return sizeof(AnyCallback) + sizeof(operands); } -void CachedInterpreter::WritePC(CachedInterpreter& cached_interpreter, UGeckoInstruction data) +s32 CachedInterpreter::WritePC(PowerPC::PowerPCState& ppc_state, const WritePCOperands& operands) { - auto& ppc_state = cached_interpreter.m_ppc_state; - ppc_state.pc = data.hex; - ppc_state.npc = data.hex + 4; + const auto& [current_pc] = operands; + ppc_state.pc = current_pc; + ppc_state.npc = current_pc + 4; + return sizeof(AnyCallback) + sizeof(operands); } -void CachedInterpreter::WriteBrokenBlockNPC(CachedInterpreter& cached_interpreter, - UGeckoInstruction data) +s32 CachedInterpreter::WriteBrokenBlockNPC(PowerPC::PowerPCState& ppc_state, + const WritePCOperands& operands) { - cached_interpreter.m_ppc_state.npc = data.hex; + const auto& [current_pc] = operands; + ppc_state.npc = current_pc; + return sizeof(AnyCallback) + sizeof(operands); } -bool CachedInterpreter::CheckFPU(CachedInterpreter& cached_interpreter, u32 data) +s32 CachedInterpreter::CheckFPU(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands) { - auto& ppc_state = cached_interpreter.m_ppc_state; + const auto& [power_pc, downcount] = operands; if (!ppc_state.msr.FP) { ppc_state.Exceptions |= EXCEPTION_FPU_UNAVAILABLE; - cached_interpreter.m_system.GetPowerPC().CheckExceptions(); - ppc_state.downcount -= data; - return true; + power_pc.CheckExceptions(); + ppc_state.downcount -= downcount; + return 0; } - return false; + return sizeof(AnyCallback) + sizeof(operands); } -bool CachedInterpreter::CheckDSI(CachedInterpreter& cached_interpreter, u32 data) +s32 CachedInterpreter::CheckDSI(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands) { - auto& ppc_state = cached_interpreter.m_ppc_state; - if (ppc_state.Exceptions & EXCEPTION_DSI) + const auto& [power_pc, downcount] = operands; + if ((ppc_state.Exceptions & EXCEPTION_DSI) != 0) { - cached_interpreter.m_system.GetPowerPC().CheckExceptions(); - ppc_state.downcount -= data; - return true; + power_pc.CheckExceptions(); + ppc_state.downcount -= downcount; + return 0; } - return false; + return sizeof(AnyCallback) + sizeof(operands); } -bool CachedInterpreter::CheckProgramException(CachedInterpreter& cached_interpreter, u32 data) +s32 CachedInterpreter::CheckProgramException(PowerPC::PowerPCState& ppc_state, + const CheckHaltOperands& operands) { - auto& ppc_state = cached_interpreter.m_ppc_state; - if (ppc_state.Exceptions & EXCEPTION_PROGRAM) + const auto& [power_pc, downcount] = operands; + if ((ppc_state.Exceptions & EXCEPTION_PROGRAM) != 0) { - cached_interpreter.m_system.GetPowerPC().CheckExceptions(); - ppc_state.downcount -= data; - return true; + power_pc.CheckExceptions(); + ppc_state.downcount -= downcount; + return 0; } - return false; + return sizeof(AnyCallback) + sizeof(operands); } -bool CachedInterpreter::CheckBreakpoint(CachedInterpreter& cached_interpreter, u32 data) +s32 CachedInterpreter::CheckBreakpoint(PowerPC::PowerPCState& ppc_state, + const CheckHaltOperands& operands) { - if (cached_interpreter.m_system.GetPowerPC().CheckAndHandleBreakPoints()) + const auto& [power_pc, downcount] = operands; + if (power_pc.CheckAndHandleBreakPoints()) { - cached_interpreter.m_ppc_state.downcount -= data; - return true; + // Accessing PowerPCState through power_pc instead of ppc_state produces better assembly. + power_pc.GetPPCState().downcount -= downcount; + return 0; } - return false; + return sizeof(AnyCallback) + sizeof(operands); } -bool CachedInterpreter::CheckIdle(CachedInterpreter& cached_interpreter, u32 idle_pc) +s32 CachedInterpreter::CheckIdle(PowerPC::PowerPCState& ppc_state, + const CheckIdleOperands& operands) { - if (cached_interpreter.m_ppc_state.npc == idle_pc) - { - cached_interpreter.m_system.GetCoreTiming().Idle(); - } - return false; + const auto& [core_timing, idle_pc] = operands; + if (ppc_state.npc == idle_pc) + core_timing.Idle(); + return sizeof(AnyCallback) + sizeof(operands); } bool CachedInterpreter::HandleFunctionHooking(u32 address) @@ -274,27 +198,57 @@ bool CachedInterpreter::HandleFunctionHooking(u32 address) if (!result) return false; - m_code.emplace_back(WritePC, address); - m_code.emplace_back(Interpreter::HLEFunction, result.hook_index); + Write(WritePC, {address}); + Write(HLEFunction, {m_system, address, result.hook_index}); if (result.type != HLE::HookType::Replace) return false; - m_code.emplace_back(EndBlock, js.downcountAmount); - m_code.emplace_back(); + js.downcountAmount += js.st.numCycles; + Write(EndBlock, {js.downcountAmount, js.numLoadStoreInst, js.numFloatingPointInst}); return true; } -void CachedInterpreter::Jit(u32 address) +bool CachedInterpreter::SetEmitterStateToFreeCodeRegion() { - if (m_code.size() >= CODE_SIZE / sizeof(Instruction) - 0x1000 || - SConfig::GetInstance().bJITNoBlockCache) + const auto free = m_free_ranges.by_size_begin(); + if (free == m_free_ranges.by_size_end()) + { + WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in code region."); + return false; + } + SetCodePtr(free.from(), free.to()); + return true; +} + +void CachedInterpreter::FreeRanges() +{ + for (const auto& [from, to] : m_block_cache.GetRangesToFree()) + m_free_ranges.insert(from, to); + m_block_cache.ClearRangesToFree(); +} + +void CachedInterpreter::ResetFreeMemoryRanges() +{ + m_free_ranges.clear(); + m_free_ranges.insert(region, region + region_size); +} + +void CachedInterpreter::Jit(u32 em_address) +{ + Jit(em_address, true); +} + +void CachedInterpreter::Jit(u32 em_address, bool clear_cache_and_retry_on_failure) +{ + if (IsAlmostFull() || SConfig::GetInstance().bJITNoBlockCache) { ClearCache(); } + FreeRanges(); const u32 nextPC = - analyzer.Analyze(m_ppc_state.pc, &code_block, &m_code_buffer, m_code_buffer.size()); + analyzer.Analyze(em_address, &code_block, &m_code_buffer, m_code_buffer.size()); if (code_block.m_memory_exception) { // Address of instruction could not be translated @@ -305,9 +259,46 @@ void CachedInterpreter::Jit(u32 address) return; } - JitBlock* b = m_block_cache.AllocateBlock(m_ppc_state.pc); + if (SetEmitterStateToFreeCodeRegion()) + { + JitBlock* b = m_block_cache.AllocateBlock(em_address); + b->normalEntry = b->near_begin = GetWritableCodePtr(); - js.blockStart = m_ppc_state.pc; + if (DoJit(em_address, b, nextPC)) + { + // Record what memory region was used so we know what to free if this block gets invalidated. + b->near_end = GetWritableCodePtr(); + b->far_begin = b->far_end = nullptr; + + b->codeSize = static_cast(b->near_end - b->normalEntry); + b->originalSize = code_block.m_num_instructions; + + // Mark the memory region that this code block uses in the RangeSizeSet. + if (b->near_begin != b->near_end) + m_free_ranges.erase(b->near_begin, b->near_end); + + m_block_cache.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses); + + return; + } + } + + if (clear_cache_and_retry_on_failure) + { + WARN_LOG_FMT(DYNA_REC, "flushing code caches, please report if this happens a lot"); + ClearCache(); + Jit(em_address, false); + return; + } + + PanicAlertFmtT("JIT failed to find code space after a cache clear. This should never happen. " + "Please report this incident on the bug tracker. Dolphin will now exit."); + std::exit(-1); +} + +bool CachedInterpreter::DoJit(u32 em_address, JitBlock* b, u32 nextPC) +{ + js.blockStart = em_address; js.firstFPInstructionFound = false; js.fifoBytesSinceCheck = 0; js.downcountAmount = 0; @@ -315,85 +306,80 @@ void CachedInterpreter::Jit(u32 address) js.numFloatingPointInst = 0; js.curBlock = b; - b->normalEntry = b->near_begin = GetCodePtr(); + auto& interpreter = m_system.GetInterpreter(); + auto& power_pc = m_system.GetPowerPC(); + auto& cpu = m_system.GetCPU(); + auto& breakpoints = power_pc.GetBreakPoints(); for (u32 i = 0; i < code_block.m_num_instructions; i++) { PPCAnalyst::CodeOp& op = m_code_buffer[i]; + js.op = &op; + js.compilerPC = op.address; + js.instructionsLeft = (code_block.m_num_instructions - 1) - i; js.downcountAmount += op.opinfo->num_cycles; if (op.opinfo->flags & FL_LOADSTORE) ++js.numLoadStoreInst; if (op.opinfo->flags & FL_USE_FPU) ++js.numFloatingPointInst; - if (HandleFunctionHooking(op.address)) + if (HandleFunctionHooking(js.compilerPC)) break; if (!op.skip) { - const bool breakpoint = - m_enable_debugging && - m_system.GetPowerPC().GetBreakPoints().IsAddressBreakPoint(op.address); - const bool check_fpu = (op.opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound; + const bool breakpoint = IsDebuggingEnabled() && !cpu.IsStepping() && + breakpoints.IsAddressBreakPoint(js.compilerPC); + const bool check_fpu = (op.opinfo->flags & FL_USE_FPU) != 0 && !js.firstFPInstructionFound; const bool endblock = (op.opinfo->flags & FL_ENDBLOCK) != 0; - const bool memcheck = (op.opinfo->flags & FL_LOADSTORE) && jo.memcheck; + const bool memcheck = (op.opinfo->flags & FL_LOADSTORE) != 0 && jo.memcheck; const bool check_program_exception = !endblock && ShouldHandleFPExceptionForInstruction(&op); const bool idle_loop = op.branchIsIdleLoop; if (breakpoint || check_fpu || endblock || memcheck || check_program_exception) - m_code.emplace_back(WritePC, op.address); + Write(WritePC, {js.compilerPC}); if (breakpoint) - m_code.emplace_back(CheckBreakpoint, js.downcountAmount); + Write(CheckBreakpoint, {power_pc, js.downcountAmount}); if (check_fpu) { - m_code.emplace_back(CheckFPU, js.downcountAmount); + Write(CheckFPU, {power_pc, js.downcountAmount}); js.firstFPInstructionFound = true; } - m_code.emplace_back(Interpreter::GetInterpreterOp(op.inst), op.inst); + Write(Interpret, + {interpreter, Interpreter::GetInterpreterOp(op.inst), js.compilerPC, op.inst}); if (memcheck) - m_code.emplace_back(CheckDSI, js.downcountAmount); + Write(CheckDSI, {power_pc, js.downcountAmount}); if (check_program_exception) - m_code.emplace_back(CheckProgramException, js.downcountAmount); + Write(CheckProgramException, {power_pc, js.downcountAmount}); if (idle_loop) - m_code.emplace_back(CheckIdle, js.blockStart); + Write(CheckIdle, {m_system.GetCoreTiming(), js.blockStart}); if (endblock) - { - m_code.emplace_back(EndBlock, js.downcountAmount); - if (js.numLoadStoreInst != 0) - m_code.emplace_back(UpdateNumLoadStoreInstructions, js.numLoadStoreInst); - if (js.numFloatingPointInst != 0) - m_code.emplace_back(UpdateNumFloatingPointInstructions, js.numFloatingPointInst); - } + Write(EndBlock, {js.downcountAmount, js.numLoadStoreInst, js.numFloatingPointInst}); } } if (code_block.m_broken) { - m_code.emplace_back(WriteBrokenBlockNPC, nextPC); - m_code.emplace_back(EndBlock, js.downcountAmount); - if (js.numLoadStoreInst != 0) - m_code.emplace_back(UpdateNumLoadStoreInstructions, js.numLoadStoreInst); - if (js.numFloatingPointInst != 0) - m_code.emplace_back(UpdateNumFloatingPointInstructions, js.numFloatingPointInst); + Write(WriteBrokenBlockNPC, {nextPC}); + Write(EndBlock, {js.downcountAmount, js.numLoadStoreInst, js.numFloatingPointInst}); } - m_code.emplace_back(); - b->near_end = GetCodePtr(); - b->far_begin = nullptr; - b->far_end = nullptr; - - b->codeSize = static_cast(GetCodePtr() - b->normalEntry); - b->originalSize = code_block.m_num_instructions; - - m_block_cache.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses); + if (HasWriteFailed()) + { + WARN_LOG_FMT(DYNA_REC, "JIT ran out of space in code region during code generation."); + return false; + } + return true; } void CachedInterpreter::ClearCache() { - m_code.clear(); m_block_cache.Clear(); + m_block_cache.ClearRangesToFree(); + ClearCodeSpace(); + ResetFreeMemoryRanges(); RefreshConfig(); } diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h index 513ad7dbc9..b500ba81e7 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h @@ -3,14 +3,27 @@ #pragma once -#include +#include + +#include #include "Common/CommonTypes.h" -#include "Core/PowerPC/CachedInterpreter/InterpreterBlockCache.h" +#include "Core/PowerPC/CachedInterpreter/CachedInterpreterBlockCache.h" +#include "Core/PowerPC/CachedInterpreter/CachedInterpreterEmitter.h" #include "Core/PowerPC/JitCommon/JitBase.h" #include "Core/PowerPC/PPCAnalyst.h" -class CachedInterpreter : public JitBase +namespace CoreTiming +{ +class CoreTimingManager; +} +namespace CPU +{ +enum class State; +} +class Interpreter; + +class CachedInterpreter : public JitBase, public CachedInterpreterCodeBlock { public: explicit CachedInterpreter(Core::System& system); @@ -30,32 +43,85 @@ public: void SingleStep() override; void Jit(u32 address) override; + void Jit(u32 address, bool clear_cache_and_retry_on_failure); + bool DoJit(u32 address, JitBlock* b, u32 nextPC); JitBaseBlockCache* GetBlockCache() override { return &m_block_cache; } const char* GetName() const override { return "Cached Interpreter"; } const CommonAsmRoutinesBase* GetAsmRoutines() override { return nullptr; } private: - struct Instruction; - - u8* GetCodePtr(); void ExecuteOneBlock(); bool HandleFunctionHooking(u32 address); - static void EndBlock(CachedInterpreter& cached_interpreter, UGeckoInstruction data); - static void UpdateNumLoadStoreInstructions(CachedInterpreter& cached_interpreter, - UGeckoInstruction data); - static void UpdateNumFloatingPointInstructions(CachedInterpreter& cached_interpreter, - UGeckoInstruction data); - static void WritePC(CachedInterpreter& cached_interpreter, UGeckoInstruction data); - static void WriteBrokenBlockNPC(CachedInterpreter& cached_interpreter, UGeckoInstruction data); - static bool CheckFPU(CachedInterpreter& cached_interpreter, u32 data); - static bool CheckDSI(CachedInterpreter& cached_interpreter, u32 data); - static bool CheckProgramException(CachedInterpreter& cached_interpreter, u32 data); - static bool CheckBreakpoint(CachedInterpreter& cached_interpreter, u32 data); - static bool CheckIdle(CachedInterpreter& cached_interpreter, u32 idle_pc); + // Finds a free memory region and sets the code emitter to point at that region. + // Returns false if no free memory region can be found. + bool SetEmitterStateToFreeCodeRegion(); - BlockCache m_block_cache{*this}; - std::vector m_code; + void FreeRanges(); + void ResetFreeMemoryRanges(); + + struct EndBlockOperands; + struct InterpretOperands; + struct HLEFunctionOperands; + struct WritePCOperands; + struct CheckHaltOperands; + struct CheckIdleOperands; + + static s32 EndBlock(PowerPC::PowerPCState& ppc_state, const EndBlockOperands& operands); + static s32 Interpret(PowerPC::PowerPCState& ppc_state, const InterpretOperands& operands); + static s32 HLEFunction(PowerPC::PowerPCState& ppc_state, const HLEFunctionOperands& operands); + static s32 WritePC(PowerPC::PowerPCState& ppc_state, const WritePCOperands& operands); + static s32 WriteBrokenBlockNPC(PowerPC::PowerPCState& ppc_state, const WritePCOperands& operands); + static s32 CheckFPU(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands); + static s32 CheckDSI(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands); + static s32 CheckProgramException(PowerPC::PowerPCState& ppc_state, + const CheckHaltOperands& operands); + static s32 CheckBreakpoint(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands); + static s32 CheckIdle(PowerPC::PowerPCState& ppc_state, const CheckIdleOperands& operands); + + HyoutaUtilities::RangeSizeSet m_free_ranges; + CachedInterpreterBlockCache m_block_cache; +}; + +struct CachedInterpreter::EndBlockOperands +{ + u32 downcount; + u32 num_load_stores; + u32 num_fp_inst; + u32 : 32; +}; + +struct CachedInterpreter::InterpretOperands +{ + Interpreter& interpreter; + void (*func)(Interpreter&, UGeckoInstruction); // Interpreter::Instruction + u32 current_pc; + UGeckoInstruction inst; +}; + +struct CachedInterpreter::HLEFunctionOperands +{ + Core::System& system; + u32 current_pc; + u32 hook_index; +}; + +struct CachedInterpreter::WritePCOperands +{ + u32 current_pc; + u32 : 32; +}; + +struct CachedInterpreter::CheckHaltOperands +{ + PowerPC::PowerPCManager& power_pc; + u32 downcount; +}; + +struct CachedInterpreter::CheckIdleOperands +{ + CoreTiming::CoreTimingManager& core_timing; + u32 idle_pc; }; diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterBlockCache.cpp b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterBlockCache.cpp new file mode 100644 index 0000000000..3b4c4a0637 --- /dev/null +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterBlockCache.cpp @@ -0,0 +1,41 @@ +// Copyright 2024 Dolphin Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "Core/PowerPC/CachedInterpreter/CachedInterpreterBlockCache.h" + +#include "Core/PowerPC/CachedInterpreter/CachedInterpreterEmitter.h" +#include "Core/PowerPC/JitCommon/JitBase.h" + +CachedInterpreterBlockCache::CachedInterpreterBlockCache(JitBase& jit) : JitBaseBlockCache{jit} +{ +} + +void CachedInterpreterBlockCache::Init() +{ + JitBaseBlockCache::Init(); + ClearRangesToFree(); +} + +void CachedInterpreterBlockCache::DestroyBlock(JitBlock& block) +{ + JitBaseBlockCache::DestroyBlock(block); + + if (block.near_begin != block.near_end) + m_ranges_to_free_on_next_codegen.emplace_back(block.near_begin, block.near_end); +} + +void CachedInterpreterBlockCache::ClearRangesToFree() +{ + m_ranges_to_free_on_next_codegen.clear(); +} + +void CachedInterpreterBlockCache::WriteLinkBlock(const JitBlock::LinkData& source, + const JitBlock* dest) +{ +} + +void CachedInterpreterBlockCache::WriteDestroyBlock(const JitBlock& block) +{ + CachedInterpreterEmitter emitter(block.normalEntry, block.near_end); + emitter.Write(CachedInterpreterEmitter::PoisonCallback); +} diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterBlockCache.h b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterBlockCache.h new file mode 100644 index 0000000000..2e06bafb41 --- /dev/null +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterBlockCache.h @@ -0,0 +1,35 @@ +// Copyright 2024 Dolphin Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include + +#include "Common/CommonTypes.h" +#include "Core/PowerPC/JitCommon/JitCache.h" + +class JitBase; + +class CachedInterpreterBlockCache final : public JitBaseBlockCache +{ +public: + explicit CachedInterpreterBlockCache(JitBase& jit); + + void Init() override; + + void DestroyBlock(JitBlock& block) override; + + void ClearRangesToFree(); + + const std::vector>& GetRangesToFree() const + { + return m_ranges_to_free_on_next_codegen; + }; + +private: + void WriteLinkBlock(const JitBlock::LinkData& source, const JitBlock* dest) override; + void WriteDestroyBlock(const JitBlock& block) override; + + std::vector> m_ranges_to_free_on_next_codegen; +}; diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterEmitter.cpp b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterEmitter.cpp new file mode 100644 index 0000000000..8422ef9edb --- /dev/null +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterEmitter.cpp @@ -0,0 +1,39 @@ +// Copyright 2024 Dolphin Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "Core/PowerPC/CachedInterpreter/CachedInterpreterEmitter.h" + +#include +#include + +#include "Common/Assert.h" +#include "Common/MsgHandler.h" + +void CachedInterpreterEmitter::Write(AnyCallback callback, const void* operands, std::size_t size) +{ + DEBUG_ASSERT(reinterpret_cast(m_code) % alignof(AnyCallback) == 0); + if (m_code + sizeof(callback) + size >= m_code_end) + { + m_write_failed = true; + return; + } + std::memcpy(m_code, &callback, sizeof(callback)); + m_code += sizeof(callback); + std::memcpy(m_code, operands, size); + m_code += size; +} + +s32 CachedInterpreterEmitter::PoisonCallback(PowerPC::PowerPCState& ppc_state, const void* operands) +{ + ASSERT_MSG(DYNA_REC, false, + "The Cached Interpreter reached a poisoned callback. This should never happen!"); + return 0; +} + +void CachedInterpreterCodeBlock::PoisonMemory() +{ + DEBUG_ASSERT(reinterpret_cast(region) % alignof(AnyCallback) == 0); + DEBUG_ASSERT(region_size % sizeof(AnyCallback) == 0); + std::fill(reinterpret_cast(region), + reinterpret_cast(region + region_size), AnyCallbackCast(PoisonCallback)); +} diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterEmitter.h b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterEmitter.h new file mode 100644 index 0000000000..7c2ee24632 --- /dev/null +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreterEmitter.h @@ -0,0 +1,84 @@ +// Copyright 2024 Dolphin Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include + +#include "Common/CodeBlock.h" +#include "Common/CommonTypes.h" + +namespace PowerPC +{ +struct PowerPCState; +} + +class CachedInterpreterEmitter +{ +protected: + // The return value of most callbacks is the distance in memory to the next callback. + // If a callback returns 0, the block will be exited. The return value is signed to + // support block-linking. 32-bit return values seem to perform better than 64-bit ones. + template + using Callback = s32 (*)(PowerPC::PowerPCState& ppc_state, const Operands& operands); + using AnyCallback = s32 (*)(PowerPC::PowerPCState& ppc_state, const void* operands); + + template + static AnyCallback AnyCallbackCast(Callback callback) + { + return reinterpret_cast(callback); + } + static consteval AnyCallback AnyCallbackCast(AnyCallback callback) { return callback; } + +public: + CachedInterpreterEmitter() = default; + explicit CachedInterpreterEmitter(u8* begin, u8* end) : m_code(begin), m_code_end(end) {} + + template + void Write(Callback callback, const Operands& operands) + { + // I would use std::is_trivial_v, but almost every operands struct uses + // references instead of pointers to make the callback functions nicer. + static_assert( + std::is_trivially_copyable_v && std::is_trivially_destructible_v && + alignof(Operands) <= alignof(AnyCallback) && sizeof(Operands) % alignof(AnyCallback) == 0); + Write(AnyCallbackCast(callback), &operands, sizeof(Operands)); + } + void Write(AnyCallback callback) { Write(callback, nullptr, 0); } + + const u8* GetCodePtr() const { return m_code; } + u8* GetWritableCodePtr() { return m_code; } + const u8* GetCodeEnd() const { return m_code_end; }; + u8* GetWritableCodeEnd() { return m_code_end; }; + // Should be checked after a block of code has been generated to see if the code has been + // successfully written to memory. Do not call the generated code when this returns true! + bool HasWriteFailed() const { return m_write_failed; } + + void SetCodePtr(u8* begin, u8* end) + { + m_code = begin; + m_code_end = end; + m_write_failed = false; + }; + + static s32 PoisonCallback(PowerPC::PowerPCState& ppc_state, const void* operands); + +private: + void Write(AnyCallback callback, const void* operands, std::size_t size); + + // Pointer to memory where code will be emitted to. + u8* m_code = nullptr; + // Pointer past the end of the memory region we're allowed to emit to. + // Writes that would reach this memory are refused and will set the m_write_failed flag instead. + u8* m_code_end = nullptr; + // Set to true when a write request happens that would write past m_code_end. + // Must be cleared with SetCodePtr() afterwards. + bool m_write_failed = false; +}; + +class CachedInterpreterCodeBlock : public Common::CodeBlock +{ +private: + void PoisonMemory() override; +}; diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/InterpreterBlockCache.cpp b/Source/Core/Core/PowerPC/CachedInterpreter/InterpreterBlockCache.cpp deleted file mode 100644 index 4bcc0b3848..0000000000 --- a/Source/Core/Core/PowerPC/CachedInterpreter/InterpreterBlockCache.cpp +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2016 Dolphin Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include "Core/PowerPC/CachedInterpreter/InterpreterBlockCache.h" - -#include "Core/PowerPC/JitCommon/JitBase.h" - -BlockCache::BlockCache(JitBase& jit) : JitBaseBlockCache{jit} -{ -} - -void BlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const JitBlock* dest) -{ -} diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/InterpreterBlockCache.h b/Source/Core/Core/PowerPC/CachedInterpreter/InterpreterBlockCache.h deleted file mode 100644 index a20e457ef6..0000000000 --- a/Source/Core/Core/PowerPC/CachedInterpreter/InterpreterBlockCache.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2016 Dolphin Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include "Core/PowerPC/JitCommon/JitCache.h" - -class JitBase; - -class BlockCache final : public JitBaseBlockCache -{ -public: - explicit BlockCache(JitBase& jit); - -private: - void WriteLinkBlock(const JitBlock::LinkData& source, const JitBlock* dest) override; -}; diff --git a/Source/Core/DolphinLib.props b/Source/Core/DolphinLib.props index e236276ce9..5f1c59ac4d 100644 --- a/Source/Core/DolphinLib.props +++ b/Source/Core/DolphinLib.props @@ -428,7 +428,8 @@ - + + @@ -1089,7 +1090,8 @@ - + + From 0282fa7adbeea737bfdb8f11570d6d17ea94d47b Mon Sep 17 00:00:00 2001 From: mitaclaw <140017135+mitaclaw@users.noreply.github.com> Date: Sat, 20 Apr 2024 05:16:05 -0700 Subject: [PATCH 2/5] CachedInterpreter: Exception Check Callback Micro-Optimization This saves two register pushes / pops. --- .../Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp index 7e740b5de9..82b8058e7a 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp @@ -135,9 +135,9 @@ s32 CachedInterpreter::CheckFPU(PowerPC::PowerPCState& ppc_state, const CheckHal const auto& [power_pc, downcount] = operands; if (!ppc_state.msr.FP) { + ppc_state.downcount -= downcount; ppc_state.Exceptions |= EXCEPTION_FPU_UNAVAILABLE; power_pc.CheckExceptions(); - ppc_state.downcount -= downcount; return 0; } return sizeof(AnyCallback) + sizeof(operands); @@ -148,8 +148,8 @@ s32 CachedInterpreter::CheckDSI(PowerPC::PowerPCState& ppc_state, const CheckHal const auto& [power_pc, downcount] = operands; if ((ppc_state.Exceptions & EXCEPTION_DSI) != 0) { - power_pc.CheckExceptions(); ppc_state.downcount -= downcount; + power_pc.CheckExceptions(); return 0; } return sizeof(AnyCallback) + sizeof(operands); @@ -161,8 +161,8 @@ s32 CachedInterpreter::CheckProgramException(PowerPC::PowerPCState& ppc_state, const auto& [power_pc, downcount] = operands; if ((ppc_state.Exceptions & EXCEPTION_PROGRAM) != 0) { - power_pc.CheckExceptions(); ppc_state.downcount -= downcount; + power_pc.CheckExceptions(); return 0; } return sizeof(AnyCallback) + sizeof(operands); From 818647d694e613c79da10854a65b823e9aea3e8b Mon Sep 17 00:00:00 2001 From: mitaclaw <140017135+mitaclaw@users.noreply.github.com> Date: Sat, 4 May 2024 18:10:57 -0700 Subject: [PATCH 3/5] CachedInterpreter: WritePC optimizations WritePC is now needed far less, only for instructions that end the block. Unfortunately, WritePC still needs to update `PowerPCState::npc` to support the false path of conditional branch instructions. Both drawbacks should be smoothed over by optimized cached instructions in the future. --- .../CachedInterpreter/CachedInterpreter.cpp | 50 +++++++++---------- .../CachedInterpreter/CachedInterpreter.h | 1 + 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp index 82b8058e7a..745dc9c9b3 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp @@ -110,6 +110,7 @@ s32 CachedInterpreter::HLEFunction(PowerPC::PowerPCState& ppc_state, const HLEFunctionOperands& operands) { const auto& [system, current_pc, hook_index] = operands; + ppc_state.pc = current_pc; HLE::Execute(Core::CPUThreadGuard{system}, current_pc, hook_index); return sizeof(AnyCallback) + sizeof(operands); } @@ -132,9 +133,10 @@ s32 CachedInterpreter::WriteBrokenBlockNPC(PowerPC::PowerPCState& ppc_state, s32 CachedInterpreter::CheckFPU(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands) { - const auto& [power_pc, downcount] = operands; + const auto& [power_pc, current_pc, downcount] = operands; if (!ppc_state.msr.FP) { + ppc_state.pc = current_pc; ppc_state.downcount -= downcount; ppc_state.Exceptions |= EXCEPTION_FPU_UNAVAILABLE; power_pc.CheckExceptions(); @@ -145,9 +147,10 @@ s32 CachedInterpreter::CheckFPU(PowerPC::PowerPCState& ppc_state, const CheckHal s32 CachedInterpreter::CheckDSI(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands) { - const auto& [power_pc, downcount] = operands; + const auto& [power_pc, current_pc, downcount] = operands; if ((ppc_state.Exceptions & EXCEPTION_DSI) != 0) { + ppc_state.pc = current_pc; ppc_state.downcount -= downcount; power_pc.CheckExceptions(); return 0; @@ -158,9 +161,10 @@ s32 CachedInterpreter::CheckDSI(PowerPC::PowerPCState& ppc_state, const CheckHal s32 CachedInterpreter::CheckProgramException(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands) { - const auto& [power_pc, downcount] = operands; + const auto& [power_pc, current_pc, downcount] = operands; if ((ppc_state.Exceptions & EXCEPTION_PROGRAM) != 0) { + ppc_state.pc = current_pc; ppc_state.downcount -= downcount; power_pc.CheckExceptions(); return 0; @@ -171,7 +175,8 @@ s32 CachedInterpreter::CheckProgramException(PowerPC::PowerPCState& ppc_state, s32 CachedInterpreter::CheckBreakpoint(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands) { - const auto& [power_pc, downcount] = operands; + const auto& [power_pc, current_pc, downcount] = operands; + ppc_state.pc = current_pc; if (power_pc.CheckAndHandleBreakPoints()) { // Accessing PowerPCState through power_pc instead of ppc_state produces better assembly. @@ -198,7 +203,6 @@ bool CachedInterpreter::HandleFunctionHooking(u32 address) if (!result) return false; - Write(WritePC, {address}); Write(HLEFunction, {m_system, address, result.hook_index}); if (result.type != HLE::HookType::Replace) @@ -329,33 +333,27 @@ bool CachedInterpreter::DoJit(u32 em_address, JitBlock* b, u32 nextPC) if (!op.skip) { - const bool breakpoint = IsDebuggingEnabled() && !cpu.IsStepping() && - breakpoints.IsAddressBreakPoint(js.compilerPC); - const bool check_fpu = (op.opinfo->flags & FL_USE_FPU) != 0 && !js.firstFPInstructionFound; - const bool endblock = (op.opinfo->flags & FL_ENDBLOCK) != 0; - const bool memcheck = (op.opinfo->flags & FL_LOADSTORE) != 0 && jo.memcheck; - const bool check_program_exception = !endblock && ShouldHandleFPExceptionForInstruction(&op); - const bool idle_loop = op.branchIsIdleLoop; - - if (breakpoint || check_fpu || endblock || memcheck || check_program_exception) - Write(WritePC, {js.compilerPC}); - - if (breakpoint) - Write(CheckBreakpoint, {power_pc, js.downcountAmount}); - - if (check_fpu) + if (IsDebuggingEnabled() && !cpu.IsStepping() && + breakpoints.IsAddressBreakPoint(js.compilerPC)) { - Write(CheckFPU, {power_pc, js.downcountAmount}); + Write(CheckBreakpoint, {power_pc, js.compilerPC, js.downcountAmount}); + } + if (!js.firstFPInstructionFound && (op.opinfo->flags & FL_USE_FPU) != 0) + { + Write(CheckFPU, {power_pc, js.compilerPC, js.downcountAmount}); js.firstFPInstructionFound = true; } + const bool endblock = (op.opinfo->flags & FL_ENDBLOCK) != 0; + if (endblock) + Write(WritePC, {js.compilerPC}); Write(Interpret, {interpreter, Interpreter::GetInterpreterOp(op.inst), js.compilerPC, op.inst}); - if (memcheck) - Write(CheckDSI, {power_pc, js.downcountAmount}); - if (check_program_exception) - Write(CheckProgramException, {power_pc, js.downcountAmount}); - if (idle_loop) + if (jo.memcheck && (op.opinfo->flags & FL_LOADSTORE) != 0) + Write(CheckDSI, {power_pc, js.compilerPC, js.downcountAmount}); + if (!endblock && ShouldHandleFPExceptionForInstruction(&op)) + Write(CheckProgramException, {power_pc, js.compilerPC, js.downcountAmount}); + if (op.branchIsIdleLoop) Write(CheckIdle, {m_system.GetCoreTiming(), js.blockStart}); if (endblock) Write(EndBlock, {js.downcountAmount, js.numLoadStoreInst, js.numFloatingPointInst}); diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h index b500ba81e7..a1c87a8986 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h @@ -117,6 +117,7 @@ struct CachedInterpreter::WritePCOperands struct CachedInterpreter::CheckHaltOperands { PowerPC::PowerPCManager& power_pc; + u32 current_pc; u32 downcount; }; From ae43b10eff153c0322fcc254e1f63a92a8581d2e Mon Sep 17 00:00:00 2001 From: mitaclaw <140017135+mitaclaw@users.noreply.github.com> Date: Sun, 5 May 2024 00:48:40 -0700 Subject: [PATCH 4/5] CachedInterpreter: Use `CodeOp::canEndBlock` This was a bigger performance boost than I expected. --- .../Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp index 745dc9c9b3..98534ed903 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp @@ -344,18 +344,17 @@ bool CachedInterpreter::DoJit(u32 em_address, JitBlock* b, u32 nextPC) js.firstFPInstructionFound = true; } - const bool endblock = (op.opinfo->flags & FL_ENDBLOCK) != 0; - if (endblock) + if (op.canEndBlock) Write(WritePC, {js.compilerPC}); Write(Interpret, {interpreter, Interpreter::GetInterpreterOp(op.inst), js.compilerPC, op.inst}); if (jo.memcheck && (op.opinfo->flags & FL_LOADSTORE) != 0) Write(CheckDSI, {power_pc, js.compilerPC, js.downcountAmount}); - if (!endblock && ShouldHandleFPExceptionForInstruction(&op)) + if (!op.canEndBlock && ShouldHandleFPExceptionForInstruction(&op)) Write(CheckProgramException, {power_pc, js.compilerPC, js.downcountAmount}); if (op.branchIsIdleLoop) Write(CheckIdle, {m_system.GetCoreTiming(), js.blockStart}); - if (endblock) + if (op.canEndBlock) Write(EndBlock, {js.downcountAmount, js.numLoadStoreInst, js.numFloatingPointInst}); } } From 6c3024c3b1bf161c0c435640ccf5b8057a433f71 Mon Sep 17 00:00:00 2001 From: mitaclaw <140017135+mitaclaw@users.noreply.github.com> Date: Sun, 5 May 2024 21:34:52 -0700 Subject: [PATCH 5/5] CachedInterpreter: Combine Interpret, CheckDSI, CheckProgram, and WritePC I tried making the new templated Interpret callback test only the relevant exceptions (EXCEPTION_DSI, EXCEPTION_PROGRAM, or both), but didn't find a significant performance boost in it. As I am learning, the biggest bottleneck is the number of callbacks emitted, not usually the actual contents of them. --- .../CachedInterpreter/CachedInterpreter.cpp | 94 ++++++++++--------- .../CachedInterpreter/CachedInterpreter.h | 22 +++-- 2 files changed, 63 insertions(+), 53 deletions(-) diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp index 98534ed903..0960b77fb7 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.cpp @@ -98,11 +98,37 @@ s32 CachedInterpreter::EndBlock(PowerPC::PowerPCState& ppc_state, const EndBlock return 0; } +template s32 CachedInterpreter::Interpret(PowerPC::PowerPCState& ppc_state, const InterpretOperands& operands) { - const auto& [interpreter, func, current_pc, inst] = operands; - func(interpreter, inst); + if constexpr (write_pc) + { + ppc_state.pc = operands.current_pc; + ppc_state.npc = operands.current_pc + 4; + } + operands.func(operands.interpreter, operands.inst); + return sizeof(AnyCallback) + sizeof(operands); +} + +template +s32 CachedInterpreter::InterpretAndCheckExceptions( + PowerPC::PowerPCState& ppc_state, const InterpretAndCheckExceptionsOperands& operands) +{ + if constexpr (write_pc) + { + ppc_state.pc = operands.current_pc; + ppc_state.npc = operands.current_pc + 4; + } + operands.func(operands.interpreter, operands.inst); + + if ((ppc_state.Exceptions & (EXCEPTION_DSI | EXCEPTION_PROGRAM)) != 0) + { + ppc_state.pc = operands.current_pc; + ppc_state.downcount -= operands.downcount; + operands.power_pc.CheckExceptions(); + return 0; + } return sizeof(AnyCallback) + sizeof(operands); } @@ -115,16 +141,8 @@ s32 CachedInterpreter::HLEFunction(PowerPC::PowerPCState& ppc_state, return sizeof(AnyCallback) + sizeof(operands); } -s32 CachedInterpreter::WritePC(PowerPC::PowerPCState& ppc_state, const WritePCOperands& operands) -{ - const auto& [current_pc] = operands; - ppc_state.pc = current_pc; - ppc_state.npc = current_pc + 4; - return sizeof(AnyCallback) + sizeof(operands); -} - s32 CachedInterpreter::WriteBrokenBlockNPC(PowerPC::PowerPCState& ppc_state, - const WritePCOperands& operands) + const WriteBrokenBlockNPCOperands& operands) { const auto& [current_pc] = operands; ppc_state.npc = current_pc; @@ -145,33 +163,6 @@ s32 CachedInterpreter::CheckFPU(PowerPC::PowerPCState& ppc_state, const CheckHal return sizeof(AnyCallback) + sizeof(operands); } -s32 CachedInterpreter::CheckDSI(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands) -{ - const auto& [power_pc, current_pc, downcount] = operands; - if ((ppc_state.Exceptions & EXCEPTION_DSI) != 0) - { - ppc_state.pc = current_pc; - ppc_state.downcount -= downcount; - power_pc.CheckExceptions(); - return 0; - } - return sizeof(AnyCallback) + sizeof(operands); -} - -s32 CachedInterpreter::CheckProgramException(PowerPC::PowerPCState& ppc_state, - const CheckHaltOperands& operands) -{ - const auto& [power_pc, current_pc, downcount] = operands; - if ((ppc_state.Exceptions & EXCEPTION_PROGRAM) != 0) - { - ppc_state.pc = current_pc; - ppc_state.downcount -= downcount; - power_pc.CheckExceptions(); - return 0; - } - return sizeof(AnyCallback) + sizeof(operands); -} - s32 CachedInterpreter::CheckBreakpoint(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands) { @@ -344,14 +335,25 @@ bool CachedInterpreter::DoJit(u32 em_address, JitBlock* b, u32 nextPC) js.firstFPInstructionFound = true; } - if (op.canEndBlock) - Write(WritePC, {js.compilerPC}); - Write(Interpret, - {interpreter, Interpreter::GetInterpreterOp(op.inst), js.compilerPC, op.inst}); - if (jo.memcheck && (op.opinfo->flags & FL_LOADSTORE) != 0) - Write(CheckDSI, {power_pc, js.compilerPC, js.downcountAmount}); - if (!op.canEndBlock && ShouldHandleFPExceptionForInstruction(&op)) - Write(CheckProgramException, {power_pc, js.compilerPC, js.downcountAmount}); + // Instruction may cause a DSI Exception or Program Exception. + if ((jo.memcheck && (op.opinfo->flags & FL_LOADSTORE) != 0) || + (!op.canEndBlock && ShouldHandleFPExceptionForInstruction(&op))) + { + const InterpretAndCheckExceptionsOperands operands = { + {interpreter, Interpreter::GetInterpreterOp(op.inst), js.compilerPC, op.inst}, + power_pc, + js.downcountAmount}; + Write(op.canEndBlock ? InterpretAndCheckExceptions : + InterpretAndCheckExceptions, + operands); + } + else + { + const InterpretOperands operands = {interpreter, Interpreter::GetInterpreterOp(op.inst), + js.compilerPC, op.inst}; + Write(op.canEndBlock ? Interpret : Interpret, operands); + } + if (op.branchIsIdleLoop) Write(CheckIdle, {m_system.GetCoreTiming(), js.blockStart}); if (op.canEndBlock) diff --git a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h index a1c87a8986..8b4752dbb8 100644 --- a/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h +++ b/Source/Core/Core/PowerPC/CachedInterpreter/CachedInterpreter.h @@ -64,20 +64,22 @@ private: struct EndBlockOperands; struct InterpretOperands; + struct InterpretAndCheckExceptionsOperands; struct HLEFunctionOperands; - struct WritePCOperands; + struct WriteBrokenBlockNPCOperands; struct CheckHaltOperands; struct CheckIdleOperands; static s32 EndBlock(PowerPC::PowerPCState& ppc_state, const EndBlockOperands& operands); + template static s32 Interpret(PowerPC::PowerPCState& ppc_state, const InterpretOperands& operands); + template + static s32 InterpretAndCheckExceptions(PowerPC::PowerPCState& ppc_state, + const InterpretAndCheckExceptionsOperands& operands); static s32 HLEFunction(PowerPC::PowerPCState& ppc_state, const HLEFunctionOperands& operands); - static s32 WritePC(PowerPC::PowerPCState& ppc_state, const WritePCOperands& operands); - static s32 WriteBrokenBlockNPC(PowerPC::PowerPCState& ppc_state, const WritePCOperands& operands); + static s32 WriteBrokenBlockNPC(PowerPC::PowerPCState& ppc_state, + const WriteBrokenBlockNPCOperands& operands); static s32 CheckFPU(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands); - static s32 CheckDSI(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands); - static s32 CheckProgramException(PowerPC::PowerPCState& ppc_state, - const CheckHaltOperands& operands); static s32 CheckBreakpoint(PowerPC::PowerPCState& ppc_state, const CheckHaltOperands& operands); static s32 CheckIdle(PowerPC::PowerPCState& ppc_state, const CheckIdleOperands& operands); @@ -101,6 +103,12 @@ struct CachedInterpreter::InterpretOperands UGeckoInstruction inst; }; +struct CachedInterpreter::InterpretAndCheckExceptionsOperands : InterpretOperands +{ + PowerPC::PowerPCManager& power_pc; + u32 downcount; +}; + struct CachedInterpreter::HLEFunctionOperands { Core::System& system; @@ -108,7 +116,7 @@ struct CachedInterpreter::HLEFunctionOperands u32 hook_index; }; -struct CachedInterpreter::WritePCOperands +struct CachedInterpreter::WriteBrokenBlockNPCOperands { u32 current_pc; u32 : 32;