From 7cccedca1e95b755a923972ce08fbcd946e36bfb Mon Sep 17 00:00:00 2001
From: mitaclaw <140017135+mitaclaw@users.noreply.github.com>
Date: Thu, 7 Dec 2023 09:35:14 -0800
Subject: [PATCH] Jit64: Install BranchWatch

---
 Source/Core/Core/PowerPC/Jit64/Jit.cpp        |  13 +-
 Source/Core/Core/PowerPC/Jit64/Jit.h          |   6 +
 Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp | 142 +++++++++++++++++-
 .../Core/Core/PowerPC/Jit64/Jit_Integer.cpp   |  49 +++++-
 .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp |  35 +++++
 5 files changed, 234 insertions(+), 11 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
index 4e8fa16889..e83b9204aa 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp
@@ -1041,7 +1041,18 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
     if (HandleFunctionHooking(op.address))
       break;
 
-    if (!op.skip)
+    if (op.skip)
+    {
+      if (IsDebuggingEnabled())
+      {
+        // The only thing that currently sets op.skip is the BLR following optimization.
+        // If any non-branch instruction starts setting that too, this will need to be changed.
+        ASSERT(op.inst.hex == 0x4e800020);
+        WriteBranchWatch<true>(op.address, op.branchTo, op.inst, RSCRATCH, RSCRATCH2,
+                               CallerSavedRegistersInUse());
+      }
+    }
+    else
     {
       if ((opinfo->flags & FL_USE_FPU) && !js.firstFPInstructionFound)
       {
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 9fd3f9d7f7..0794dc34a3 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -98,6 +98,12 @@ public:
   void WriteExternalExceptionExit();
   void WriteRfiExitDestInRSCRATCH();
   void WriteIdleExit(u32 destination);
+  template <bool condition>
+  void WriteBranchWatch(u32 origin, u32 destination, UGeckoInstruction inst, Gen::X64Reg reg_a,
+                        Gen::X64Reg reg_b, BitSet32 caller_save);
+  void WriteBranchWatchDestInRSCRATCH(u32 origin, UGeckoInstruction inst, Gen::X64Reg reg_a,
+                                      Gen::X64Reg reg_b, BitSet32 caller_save);
+
   bool Cleanup();
 
   void GenerateConstantOverflow(bool overflow);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
index 20de3a33a4..bd611e2c7b 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Branch.cpp
@@ -7,6 +7,7 @@
 #include "Common/CommonTypes.h"
 #include "Common/x64Emitter.h"
 #include "Core/CoreTiming.h"
+#include "Core/Debugger/BranchWatch.h"
 #include "Core/PowerPC/Gekko.h"
 #include "Core/PowerPC/Jit64/RegCache/JitRegCache.h"
 #include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
@@ -66,6 +67,68 @@ void Jit64::rfi(UGeckoInstruction inst)
   WriteRfiExitDestInRSCRATCH();
 }
 
+template <bool condition>
+void Jit64::WriteBranchWatch(u32 origin, u32 destination, UGeckoInstruction inst, X64Reg reg_a,
+                             X64Reg reg_b, BitSet32 caller_save)
+{
+  MOV(64, R(reg_a), ImmPtr(&m_branch_watch));
+  MOVZX(32, 8, reg_b, MDisp(reg_a, Core::BranchWatch::GetOffsetOfRecordingActive()));
+  TEST(32, R(reg_b), R(reg_b));
+
+  FixupBranch branch_in = J_CC(CC_NZ, Jump::Near);
+  SwitchToFarCode();
+  SetJumpTarget(branch_in);
+
+  ABI_PushRegistersAndAdjustStack(caller_save, 0);
+  // Some call sites have an optimization to use ABI_PARAM1 as a scratch register.
+  if (reg_a != ABI_PARAM1)
+    MOV(64, R(ABI_PARAM1), R(reg_a));
+  MOV(64, R(ABI_PARAM2), Imm64(Core::FakeBranchWatchCollectionKey{origin, destination}));
+  MOV(32, R(ABI_PARAM3), Imm32(inst.hex));
+  ABI_CallFunction(m_ppc_state.msr.IR ? (condition ? &Core::BranchWatch::HitVirtualTrue_fk :
+                                                     &Core::BranchWatch::HitVirtualFalse_fk) :
+                                        (condition ? &Core::BranchWatch::HitPhysicalTrue_fk :
+                                                     &Core::BranchWatch::HitPhysicalFalse_fk));
+  ABI_PopRegistersAndAdjustStack(caller_save, 0);
+
+  FixupBranch branch_out = J(Jump::Near);
+  SwitchToNearCode();
+  SetJumpTarget(branch_out);
+}
+
+template void Jit64::WriteBranchWatch<true>(u32, u32, UGeckoInstruction, X64Reg, X64Reg, BitSet32);
+template void Jit64::WriteBranchWatch<false>(u32, u32, UGeckoInstruction, X64Reg, X64Reg, BitSet32);
+
+void Jit64::WriteBranchWatchDestInRSCRATCH(u32 origin, UGeckoInstruction inst, X64Reg reg_a,
+                                           X64Reg reg_b, BitSet32 caller_save)
+{
+  MOV(64, R(reg_a), ImmPtr(&m_branch_watch));
+  MOVZX(32, 8, reg_b, MDisp(reg_a, Core::BranchWatch::GetOffsetOfRecordingActive()));
+  TEST(32, R(reg_b), R(reg_b));
+
+  FixupBranch branch_in = J_CC(CC_NZ, Jump::Near);
+  SwitchToFarCode();
+  SetJumpTarget(branch_in);
+
+  // Assert RSCRATCH won't be clobbered before it is moved from.
+  static_assert(ABI_PARAM1 != RSCRATCH);
+
+  ABI_PushRegistersAndAdjustStack(caller_save, 0);
+  // Some call sites have an optimization to use ABI_PARAM1 as a scratch register.
+  if (reg_a != ABI_PARAM1)
+    MOV(64, R(ABI_PARAM1), R(reg_a));
+  MOV(32, R(ABI_PARAM3), R(RSCRATCH));
+  MOV(32, R(ABI_PARAM2), Imm32(origin));
+  MOV(32, R(ABI_PARAM4), Imm32(inst.hex));
+  ABI_CallFunction(m_ppc_state.msr.IR ? &Core::BranchWatch::HitVirtualTrue :
+                                        &Core::BranchWatch::HitPhysicalTrue);
+  ABI_PopRegistersAndAdjustStack(caller_save, 0);
+
+  FixupBranch branch_out = J(Jump::Near);
+  SwitchToNearCode();
+  SetJumpTarget(branch_out);
+}
+
 void Jit64::bx(UGeckoInstruction inst)
 {
   INSTRUCTION_START
@@ -81,6 +144,11 @@ void Jit64::bx(UGeckoInstruction inst)
   // Because PPCAnalyst::Flatten() merged the blocks.
   if (!js.isLastInstruction)
   {
+    if (IsDebuggingEnabled())
+    {
+      WriteBranchWatch<true>(js.compilerPC, js.op->branchTo, inst, RSCRATCH, RSCRATCH2,
+                             CallerSavedRegistersInUse());
+    }
     if (inst.LK && !js.op->skipLRStack)
     {
       // We have to fake the stack as the RET instruction was not
@@ -94,6 +162,11 @@
   gpr.Flush();
   fpr.Flush();
 
+  if (IsDebuggingEnabled())
+  {
+    // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+    WriteBranchWatch<true>(js.compilerPC, js.op->branchTo, inst, ABI_PARAM1, RSCRATCH, {});
+  }
 #ifdef ACID_TEST
   if (inst.LK)
     AND(32, PPCSTATE(cr), Imm32(~(0xFF000000)));
@@ -144,6 +217,11 @@ void Jit64::bcx(UGeckoInstruction inst)
   if (!js.isLastInstruction && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
       (inst.BO & BO_DONT_CHECK_CONDITION))
   {
+    if (IsDebuggingEnabled())
+    {
+      WriteBranchWatch<true>(js.compilerPC, js.op->branchTo, inst, RSCRATCH, RSCRATCH2,
+                             CallerSavedRegistersInUse());
+    }
     if (inst.LK && !js.op->skipLRStack)
     {
       // We have to fake the stack as the RET instruction was not
@@ -160,6 +238,11 @@
 
     gpr.Flush();
     fpr.Flush();
+    if (IsDebuggingEnabled())
+    {
+      // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+      WriteBranchWatch<true>(js.compilerPC, js.op->branchTo, inst, ABI_PARAM1, RSCRATCH, {});
+    }
     if (js.op->branchIsIdleLoop)
     {
       WriteIdleExit(js.op->branchTo);
@@ -179,8 +262,18 @@
   {
     gpr.Flush();
     fpr.Flush();
+    if (IsDebuggingEnabled())
+    {
+      // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+      WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, ABI_PARAM1, RSCRATCH, {});
+    }
     WriteExit(js.compilerPC + 4);
   }
+  else if (IsDebuggingEnabled())
+  {
+    WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, RSCRATCH, RSCRATCH2,
+                            CallerSavedRegistersInUse());
+  }
 }
 
 void Jit64::bcctrx(UGeckoInstruction inst)
@@ -204,6 +297,12 @@
     if (inst.LK_3)
       MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));  // LR = PC + 4;
     AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
+    if (IsDebuggingEnabled())
+    {
+      // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+      WriteBranchWatchDestInRSCRATCH(js.compilerPC, inst, ABI_PARAM1, RSCRATCH2,
+                                     BitSet32{RSCRATCH});
+    }
     WriteExitDestInRSCRATCH(inst.LK_3, js.compilerPC + 4);
   }
   else
   {
@@ -226,6 +325,12 @@
       RCForkGuard fpr_guard = fpr.Fork();
       gpr.Flush();
       fpr.Flush();
+      if (IsDebuggingEnabled())
+      {
+        // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+        WriteBranchWatchDestInRSCRATCH(js.compilerPC, inst, ABI_PARAM1, RSCRATCH2,
+                                       BitSet32{RSCRATCH});
+      }
       WriteExitDestInRSCRATCH(inst.LK_3, js.compilerPC + 4);
       // Would really like to continue the block here, but it ends. TODO.
     }
@@ -235,8 +340,18 @@
     {
       gpr.Flush();
      fpr.Flush();
+      if (IsDebuggingEnabled())
+      {
+        // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+        WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, ABI_PARAM1, RSCRATCH, {});
+      }
       WriteExit(js.compilerPC + 4);
     }
+    else if (IsDebuggingEnabled())
+    {
+      WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, RSCRATCH, RSCRATCH2,
+                              CallerSavedRegistersInUse());
+    }
   }
 }
@@ -270,10 +385,8 @@
   MOV(32, R(RSCRATCH), PPCSTATE_LR);
 
   // We don't have to do this because WriteBLRExit handles it for us. Specifically, since we only
-  // ever push
-  // divisible-by-four instruction addresses onto the stack, if the return address matches, we're
-  // already
-  // good. If it doesn't match, the mispredicted-BLR code handles the fixup.
+  // ever push divisible-by-four instruction addresses onto the stack, if the return address
+  // matches, we're already good. If it doesn't match, the mispredicted-BLR code handles the fixup.
   if (!m_enable_blr_optimization)
     AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
   if (inst.LK)
@@ -287,10 +400,21 @@
 
     if (js.op->branchIsIdleLoop)
     {
+      if (IsDebuggingEnabled())
+      {
+        // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+        WriteBranchWatch<true>(js.compilerPC, js.op->branchTo, inst, ABI_PARAM1, RSCRATCH, {});
+      }
       WriteIdleExit(js.op->branchTo);
     }
     else
     {
+      if (IsDebuggingEnabled())
+      {
+        // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+        WriteBranchWatchDestInRSCRATCH(js.compilerPC, inst, ABI_PARAM1, RSCRATCH2,
+                                       BitSet32{RSCRATCH});
+      }
       WriteBLRExit();
     }
   }
@@ -304,6 +428,16 @@
   {
     gpr.Flush();
     fpr.Flush();
+    if (IsDebuggingEnabled())
+    {
+      // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+      WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, ABI_PARAM1, RSCRATCH, {});
+    }
     WriteExit(js.compilerPC + 4);
   }
+  else if (IsDebuggingEnabled())
+  {
+    WriteBranchWatch<false>(js.compilerPC, js.compilerPC + 4, inst, RSCRATCH, RSCRATCH2,
+                            CallerSavedRegistersInUse());
+  }
 }
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
index ec2d6ae778..afc1c9a920 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_Integer.cpp
@@ -394,18 +394,25 @@ void Jit64::DoMergedBranch()
     if (next.LK)
       MOV(32, PPCSTATE_SPR(SPR_LR), Imm32(nextPC + 4));
 
-    WriteIdleExit(js.op[1].branchTo);
+    const u32 destination = js.op[1].branchTo;
+    if (IsDebuggingEnabled())
+    {
+      // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+      WriteBranchWatch<true>(nextPC, destination, next, ABI_PARAM1, RSCRATCH, {});
+    }
+    WriteIdleExit(destination);
   }
   else if (next.OPCD == 16)  // bcx
   {
     if (next.LK)
       MOV(32, PPCSTATE_SPR(SPR_LR), Imm32(nextPC + 4));
 
-    u32 destination;
-    if (next.AA)
-      destination = SignExt16(next.BD << 2);
-    else
-      destination = nextPC + SignExt16(next.BD << 2);
+    const u32 destination = js.op[1].branchTo;
+    if (IsDebuggingEnabled())
+    {
+      // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+      WriteBranchWatch<true>(nextPC, destination, next, ABI_PARAM1, RSCRATCH, {});
+    }
     WriteExit(destination, next.LK, nextPC + 4);
   }
   else if ((next.OPCD == 19) && (next.SUBOP10 == 528))  // bcctrx
@@ -414,6 +421,11 @@
       MOV(32, PPCSTATE_SPR(SPR_LR), Imm32(nextPC + 4));
     MOV(32, R(RSCRATCH), PPCSTATE_SPR(SPR_CTR));
     AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
+    if (IsDebuggingEnabled())
+    {
+      // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+      WriteBranchWatchDestInRSCRATCH(nextPC, next, ABI_PARAM1, RSCRATCH2, BitSet32{RSCRATCH});
+    }
     WriteExitDestInRSCRATCH(next.LK, nextPC + 4);
   }
   else if ((next.OPCD == 19) && (next.SUBOP10 == 16))  // bclrx
@@ -423,6 +435,11 @@
     MOV(32, R(RSCRATCH), PPCSTATE_SPR(SPR_LR));
     AND(32, R(RSCRATCH), Imm32(0xFFFFFFFC));
     if (next.LK)
       MOV(32, PPCSTATE_SPR(SPR_LR), Imm32(nextPC + 4));
+    if (IsDebuggingEnabled())
+    {
+      // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+      WriteBranchWatchDestInRSCRATCH(nextPC, next, ABI_PARAM1, RSCRATCH2, BitSet32{RSCRATCH});
+    }
     WriteBLRExit();
   }
   else
@@ -480,8 +497,18 @@
   {
     gpr.Flush();
     fpr.Flush();
+    if (IsDebuggingEnabled())
+    {
+      // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+      WriteBranchWatch<false>(nextPC, nextPC + 4, next, ABI_PARAM1, RSCRATCH, {});
+    }
     WriteExit(nextPC + 4);
   }
+  else if (IsDebuggingEnabled())
+  {
+    WriteBranchWatch<false>(nextPC, nextPC + 4, next, RSCRATCH, RSCRATCH2,
+                            CallerSavedRegistersInUse());
+  }
 }
 
 void Jit64::DoMergedBranchImmediate(s64 val)
@@ -515,8 +542,18 @@
   {
     gpr.Flush();
     fpr.Flush();
+    if (IsDebuggingEnabled())
+    {
+      // ABI_PARAM1 is safe to use after a GPR flush for an optimization in this function.
+      WriteBranchWatch<false>(nextPC, nextPC + 4, next, ABI_PARAM1, RSCRATCH, {});
+    }
     WriteExit(nextPC + 4);
   }
+  else if (IsDebuggingEnabled())
+  {
+    WriteBranchWatch<false>(nextPC, nextPC + 4, next, RSCRATCH, RSCRATCH2,
+                            CallerSavedRegistersInUse());
+  }
 }
 
 void Jit64::cmpXX(UGeckoInstruction inst)
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
index 6a1dc65141..05bd690694 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
@@ -15,6 +15,7 @@
 
 #include "Core/ConfigManager.h"
 #include "Core/CoreTiming.h"
+#include "Core/Debugger/BranchWatch.h"
 #include "Core/HW/CPU.h"
 #include "Core/HW/Memmap.h"
 #include "Core/PowerPC/Jit64/RegCache/JitRegCache.h"
@@ -300,6 +301,40 @@ void Jit64::dcbx(UGeckoInstruction inst)
 
   // Load the loop_counter register with the amount of invalidations to execute.
   LEA(32, loop_counter, MDisp(RSCRATCH2, 1));
+
+  if (IsDebuggingEnabled())
+  {
+    const X64Reg bw_reg_a = reg_cycle_count, bw_reg_b = reg_downcount;
+    const BitSet32 bw_caller_save = (CallerSavedRegistersInUse() | BitSet32{RSCRATCH2}) &
+                                    ~BitSet32{int(bw_reg_a), int(bw_reg_b)};
+
+    MOV(64, R(bw_reg_a), ImmPtr(&m_branch_watch));
+    MOVZX(32, 8, bw_reg_b, MDisp(bw_reg_a, Core::BranchWatch::GetOffsetOfRecordingActive()));
+    TEST(32, R(bw_reg_b), R(bw_reg_b));
+
+    FixupBranch branch_in = J_CC(CC_NZ, Jump::Near);
+    SwitchToFarCode();
+    SetJumpTarget(branch_in);
+
+    // Assert RSCRATCH2 won't be clobbered before it is moved from.
+    static_assert(RSCRATCH2 != ABI_PARAM1);
+
+    ABI_PushRegistersAndAdjustStack(bw_caller_save, 0);
+    MOV(64, R(ABI_PARAM1), R(bw_reg_a));
+    // RSCRATCH2 holds the amount of faked branch watch hits. Move RSCRATCH2 first, because
+    // ABI_PARAM2 clobbers RSCRATCH2 on Windows and ABI_PARAM3 clobbers RSCRATCH2 on Linux!
+    MOV(32, R(ABI_PARAM4), R(RSCRATCH2));
+    const PPCAnalyst::CodeOp& op = js.op[2];
+    MOV(64, R(ABI_PARAM2), Imm64(Core::FakeBranchWatchCollectionKey{op.address, op.branchTo}));
+    MOV(32, R(ABI_PARAM3), Imm32(op.inst.hex));
+    ABI_CallFunction(m_ppc_state.msr.IR ? &Core::BranchWatch::HitVirtualTrue_fk_n :
+                                          &Core::BranchWatch::HitPhysicalTrue_fk_n);
+    ABI_PopRegistersAndAdjustStack(bw_caller_save, 0);
+
+    FixupBranch branch_out = J(Jump::Near);
+    SwitchToNearCode();
+    SetJumpTarget(branch_out);
+  }
 
   X64Reg addr = RSCRATCH;