From dbf5dca11c22967af747a0b29231ffea853aee6b Mon Sep 17 00:00:00 2001 From: JosJuice Date: Fri, 20 Aug 2021 14:56:24 +0200 Subject: [PATCH 1/2] JitArm64: FIFO optimization improvements JitArm64 port of 789975e. --- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 50 +++++++++++++++++ Source/Core/Core/PowerPC/JitArm64/Jit.h | 2 + .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 56 ++++++++++--------- .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 7 ++- 4 files changed, 87 insertions(+), 28 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index 6f8aeaceec..c1519450ed 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -320,6 +320,50 @@ void JitArm64::FreeStack() #endif } +void JitArm64::IntializeSpeculativeConstants() +{ + // If the block depends on an input register which looks like a gather pipe or MMIO related + // constant, guess that it is actually a constant input, and specialize the block based on this + // assumption. This happens when there are branches in code writing to the gather pipe, but only + // the first block loads the constant. + // Insert a check at the start of the block to verify that the value is actually constant. + // This can save a lot of backpatching and optimize gather pipe writes in more places. 
+ const u8* fail = nullptr; + for (auto i : code_block.m_gpr_inputs) + { + u32 compile_time_value = PowerPC::ppcState.gpr[i]; + if (PowerPC::IsOptimizableGatherPipeWrite(compile_time_value) || + PowerPC::IsOptimizableGatherPipeWrite(compile_time_value - 0x8000) || + compile_time_value == 0xCC000000) + { + if (!fail) + { + SwitchToFarCode(); + fail = GetCodePtr(); + MOVI2R(DISPATCHER_PC, js.blockStart); + STR(IndexType::Unsigned, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(pc)); + MOVP2R(ARM64Reg::X8, &JitInterface::CompileExceptionCheck); + MOVI2R(ARM64Reg::W0, static_cast<u32>(JitInterface::ExceptionType::SpeculativeConstants)); + BLR(ARM64Reg::X8); + B(dispatcher_no_check); + SwitchToNearCode(); + } + + ARM64Reg tmp = gpr.GetReg(); + ARM64Reg value = gpr.R(i); + MOVI2R(tmp, compile_time_value); + CMP(value, tmp); + gpr.Unlock(tmp); + + FixupBranch no_fail = B(CCFlags::CC_EQ); + B(fail); + SetJumpTarget(no_fail); + + gpr.SetImmediate(i, compile_time_value, true); + } + } +} + void JitArm64::WriteExit(u32 destination, bool LK, u32 exit_address_after_return) { Cleanup(); @@ -806,6 +850,12 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) gpr.Start(js.gpa); fpr.Start(js.fpa); + if (js.noSpeculativeConstantsAddresses.find(js.blockStart) == + js.noSpeculativeConstantsAddresses.end()) + { + IntializeSpeculativeConstants(); + } + // Translate instructions for (u32 i = 0; i < code_block.m_num_instructions; i++) { diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 75565b771d..ac1b232578 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -285,6 +285,8 @@ protected: void ResetFreeMemoryRanges(); + void IntializeSpeculativeConstants(); + // AsmRoutines void GenerateAsm(); void GenerateCommonAsm(); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index 3913cdff79..83bbfe1b3a 100644 ---
a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -202,37 +202,40 @@ void Arm64GPRCache::FlushRegister(size_t index, bool maintain_state, ARM64Reg tm } else if (reg.GetType() == RegType::Immediate) { - if (!reg.GetImm()) + if (reg.IsDirty()) { - m_emit->STR(IndexType::Unsigned, bitsize == 64 ? ARM64Reg::ZR : ARM64Reg::WZR, PPC_REG, - u32(guest_reg.ppc_offset)); - } - else - { - bool allocated_tmp_reg = false; - if (tmp_reg != ARM64Reg::INVALID_REG) + if (!reg.GetImm()) { - ASSERT(IsGPR(tmp_reg)); + m_emit->STR(IndexType::Unsigned, bitsize == 64 ? ARM64Reg::ZR : ARM64Reg::WZR, PPC_REG, + u32(guest_reg.ppc_offset)); } else { - ASSERT_MSG(DYNA_REC, !maintain_state, - "Flushing immediate while maintaining state requires temporary register"); - tmp_reg = GetReg(); - allocated_tmp_reg = true; + bool allocated_tmp_reg = false; + if (tmp_reg != ARM64Reg::INVALID_REG) + { + ASSERT(IsGPR(tmp_reg)); + } + else + { + ASSERT_MSG(DYNA_REC, !maintain_state, + "Flushing immediate while maintaining state requires temporary register"); + tmp_reg = GetReg(); + allocated_tmp_reg = true; + } + + const ARM64Reg encoded_tmp_reg = bitsize != 64 ? tmp_reg : EncodeRegTo64(tmp_reg); + + m_emit->MOVI2R(encoded_tmp_reg, reg.GetImm()); + m_emit->STR(IndexType::Unsigned, encoded_tmp_reg, PPC_REG, u32(guest_reg.ppc_offset)); + + if (allocated_tmp_reg) + UnlockRegister(tmp_reg); } - const ARM64Reg encoded_tmp_reg = bitsize != 64 ? 
tmp_reg : EncodeRegTo64(tmp_reg); - - m_emit->MOVI2R(encoded_tmp_reg, reg.GetImm()); - m_emit->STR(IndexType::Unsigned, encoded_tmp_reg, PPC_REG, u32(guest_reg.ppc_offset)); - - if (allocated_tmp_reg) - UnlockRegister(tmp_reg); + if (!maintain_state) + reg.Flush(); } - - if (!maintain_state) - reg.Flush(); } } @@ -335,12 +338,13 @@ ARM64Reg Arm64GPRCache::R(const GuestRegInfo& guest_reg) return ARM64Reg::INVALID_REG; } -void Arm64GPRCache::SetImmediate(const GuestRegInfo& guest_reg, u32 imm) +void Arm64GPRCache::SetImmediate(const GuestRegInfo& guest_reg, u32 imm, bool dirty) { OpArg& reg = guest_reg.reg; if (reg.GetType() == RegType::Register) UnlockRegister(EncodeRegTo32(reg.GetReg())); reg.LoadToImm(imm); + reg.SetDirty(dirty); } void Arm64GPRCache::BindToRegister(const GuestRegInfo& guest_reg, bool will_read, bool will_write) @@ -373,8 +377,8 @@ void Arm64GPRCache::BindToRegister(const GuestRegInfo& guest_reg, bool will_read m_emit->MOVI2R(host_reg, reg.GetImm()); } reg.Load(host_reg); - // If the register had an immediate value, the register was effectively already dirty - reg.SetDirty(true); + if (will_write) + reg.SetDirty(true); } else if (will_write) { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index d814e846f2..a8e63eb006 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -262,7 +262,10 @@ public: Arm64Gen::ARM64Reg CR(size_t preg) { return R(GetGuestCR(preg)); } // Set a register to an immediate. Only valid for guest GPRs. - void SetImmediate(size_t preg, u32 imm) { SetImmediate(GetGuestGPR(preg), imm); } + void SetImmediate(size_t preg, u32 imm, bool dirty = true) + { + SetImmediate(GetGuestGPR(preg), imm, dirty); + } // Returns if a register is set as an immediate. Only valid for guest GPRs. 
bool IsImm(size_t preg) const { return GetGuestGPROpArg(preg).GetType() == RegType::Immediate; } @@ -345,7 +348,7 @@ private: GuestRegInfo GetGuestByIndex(size_t index); Arm64Gen::ARM64Reg R(const GuestRegInfo& guest_reg); - void SetImmediate(const GuestRegInfo& guest_reg, u32 imm); + void SetImmediate(const GuestRegInfo& guest_reg, u32 imm, bool dirty); void BindToRegister(const GuestRegInfo& guest_reg, bool will_read, bool will_write = true); void FlushRegisters(BitSet32 regs, bool maintain_state, Arm64Gen::ARM64Reg tmp_reg); From 351d095ffffccd424763cb77b13f716b5e4155c1 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Fri, 20 Aug 2021 15:21:24 +0200 Subject: [PATCH 2/2] JitArm64: Optimize a few tail calls Maybe "tail call" isn't quite the right term for what this code is doing, since it's jumping to the dispatcher rather than returning, but it's the same optimization as for a tail call. --- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index c1519450ed..a2aae7c2d3 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -344,8 +344,9 @@ void JitArm64::IntializeSpeculativeConstants() STR(IndexType::Unsigned, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(pc)); MOVP2R(ARM64Reg::X8, &JitInterface::CompileExceptionCheck); MOVI2R(ARM64Reg::W0, static_cast<u32>(JitInterface::ExceptionType::SpeculativeConstants)); - BLR(ARM64Reg::X8); - B(dispatcher_no_check); + // Write dispatcher_no_check to LR for tail call + MOVP2R(ARM64Reg::X30, dispatcher_no_check); + BR(ARM64Reg::X8); SwitchToNearCode(); } @@ -837,10 +838,11 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) SetJumpTarget(fail); MOVI2R(DISPATCHER_PC, js.blockStart); STR(IndexType::Unsigned, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(pc)); + MOVP2R(ARM64Reg::X8, &JitInterface::CompileExceptionCheck);
MOVI2R(ARM64Reg::W0, static_cast<u32>(JitInterface::ExceptionType::PairedQuantize)); - MOVP2R(ARM64Reg::X1, &JitInterface::CompileExceptionCheck); - BLR(ARM64Reg::X1); - B(dispatcher_no_check); + // Write dispatcher_no_check to LR for tail call + MOVP2R(ARM64Reg::X30, dispatcher_no_check); + BR(ARM64Reg::X8); SwitchToNearCode(); SetJumpTarget(no_fail); js.assumeNoPairedQuantize = true;