From 1d106ceaf5615f9e7fc3697776c27dcfd9431e5d Mon Sep 17 00:00:00 2001 From: JosJuice Date: Mon, 1 Feb 2021 22:14:16 +0100 Subject: [PATCH] JitArm64: Optimize ConvertSingleToDouble, part 2 If we can prove that FCVT will provide a correct conversion, we can use FCVT. This makes the common case a bit faster and the less likely cases (unfortunately including zero, which FCVT actually can convert correctly) a bit slower. --- Source/Core/Common/Arm64Emitter.cpp | 8 ++ Source/Core/Common/Arm64Emitter.h | 2 + Source/Core/Core/PowerPC/JitArm64/Jit.h | 11 ++- .../JitArm64/JitArm64_FloatingPoint.cpp | 97 ++++++++++++++++++- .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 29 ++++-- .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 6 +- 6 files changed, 139 insertions(+), 14 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 837d7efdc8..1a718d1e3e 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -3601,6 +3601,14 @@ void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) { Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); } +void ARM64FloatEmitter::FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, size >> 6, 0x1D, Rd, Rn, Rm); +} +void ARM64FloatEmitter::FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + EmitThreeSame(1, 2 | (size >> 6), 0x1D, Rd, Rn, Rm); +} void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) { diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index a076098cb2..58caec8d08 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -1094,6 +1094,8 @@ public: void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); + void FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); // Conditional select void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index cc65155ccd..1c60ae0aaf 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -154,8 +154,10 @@ public: void ConvertDoubleToSingleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); void ConvertDoubleToSinglePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); - void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); - void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg); + void ConvertSingleToDoubleLower(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg, + Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG); + void ConvertSingleToDoublePair(Arm64Gen::ARM64Reg dest_reg, Arm64Gen::ARM64Reg src_reg, + Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG); private: struct SlowmemHandler @@ -189,14 +191,18 @@ private: nearcode = GetWritableCodePtr(); SetCodePtrUnsafe(farcode.GetWritableCodePtr()); AlignCode16(); + m_in_farcode = true; } void SwitchToNearCode() { farcode.SetCodePtrUnsafe(GetWritableCodePtr()); SetCodePtrUnsafe(nearcode); + m_in_farcode = false; } + bool IsInFarCode() const { return m_in_farcode; } + // Dump a memory range of code void DumpCode(const u8* start, const u8* end); @@ -262,6 +268,7 @@ private: Arm64Gen::ARM64CodeBlock farcode; u8* nearcode; // Backed up when we switch to far code. + bool m_in_farcode = false; bool m_enable_blr_optimization; bool m_cleanup_after_stackfault = false; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index d3d5f7ddbf..59e27431cd 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -421,10 +421,35 @@ void JitArm64::ConvertDoubleToSinglePair(ARM64Reg dest_reg, ARM64Reg src_reg) ABI_PopRegisters(gpr_saved); } -void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg) +void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg) { + ASSERT(scratch_reg != src_reg); + + const bool switch_to_farcode = !IsInFarCode(); + FlushCarry(); + // Do we know that the input isn't NaN, and that the input isn't denormal or FPCR.FZ is not set? + // (This check unfortunately also catches zeroes) + + FixupBranch fast; + if (scratch_reg != ARM64Reg::INVALID_REG) + { + m_float_emit.FABS(EncodeRegToSingle(scratch_reg), EncodeRegToSingle(src_reg)); + m_float_emit.FCMP(EncodeRegToSingle(scratch_reg)); + fast = B(CCFlags::CC_GT); + + if (switch_to_farcode) + { + FixupBranch slow = B(); + + SwitchToFarCode(); + SetJumpTarget(slow); + } + } + + // If no (or if we don't have a scratch register), call the bit-exact routine + const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30}; ABI_PushRegisters(gpr_saved); @@ -433,12 +458,65 @@ void JitArm64::ConvertSingleToDoubleLower(ARM64Reg dest_reg, ARM64Reg src_reg) m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0); ABI_PopRegisters(gpr_saved); + + // If yes, do a fast conversion with FCVT + + if (scratch_reg != ARM64Reg::INVALID_REG) + { + FixupBranch continue1 = B(); + + if (switch_to_farcode) + SwitchToNearCode(); + + SetJumpTarget(fast); + + m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + + SetJumpTarget(continue1); + } } -void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg) +void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg, ARM64Reg scratch_reg) { + ASSERT(scratch_reg != src_reg); + + const bool switch_to_farcode = !IsInFarCode(); + FlushCarry(); + // Do we know that neither input is NaN, and that neither input is denormal or FPCR.FZ is not set? + // (This check unfortunately also catches zeroes) + + FixupBranch fast; + if (scratch_reg != ARM64Reg::INVALID_REG) + { + // Set each 32-bit element of scratch_reg to 0x0000'0000 or 0xFFFF'FFFF depending on whether + // the absolute value of the corresponding element in src_reg compares greater than 0 + m_float_emit.MOVI(8, EncodeRegToDouble(scratch_reg), 0); + m_float_emit.FACGT(32, EncodeRegToDouble(scratch_reg), EncodeRegToDouble(src_reg), + EncodeRegToDouble(scratch_reg)); + + // 0x0000'0000'0000'0000 (zero) -> 0x0000'0000'0000'0000 (zero) + // 0x0000'0000'FFFF'FFFF (denormal) -> 0xFF00'0000'FFFF'FFFF (normal) + // 0xFFFF'FFFF'0000'0000 (NaN) -> 0x00FF'FFFF'0000'0000 (normal) + // 0xFFFF'FFFF'FFFF'FFFF (NaN) -> 0xFFFF'FFFF'FFFF'FFFF (NaN) + m_float_emit.INS(8, EncodeRegToDouble(scratch_reg), 7, EncodeRegToDouble(scratch_reg), 0); + + // Is scratch_reg a NaN (0xFFFF'FFFF'FFFF'FFFF)? + m_float_emit.FCMP(EncodeRegToDouble(scratch_reg)); + fast = B(CCFlags::CC_VS); + + if (switch_to_farcode) + { + FixupBranch slow = B(); + + SwitchToFarCode(); + SetJumpTarget(slow); + } + } + + // If no (or if we don't have a scratch register), call the bit-exact routine + // Save X0-X4 and X30 if they're in use const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30}; ABI_PushRegisters(gpr_saved); @@ -452,4 +530,19 @@ void JitArm64::ConvertSingleToDoublePair(ARM64Reg dest_reg, ARM64Reg src_reg) m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0); ABI_PopRegisters(gpr_saved); + + // If yes, do a fast conversion with FCVTL + + if (scratch_reg != ARM64Reg::INVALID_REG) + { + FixupBranch continue1 = B(); + + if (switch_to_farcode) + SwitchToNearCode(); + + SetJumpTarget(fast); + m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg)); + + SetJumpTarget(continue1); + } } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index 4b2ecd81e7..1363863286 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -468,7 +468,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) return host_reg; // Else convert this register back to doubles. - m_jit->ConvertSingleToDoublePair(host_reg, host_reg); + const ARM64Reg tmp_reg = GetReg(); + m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg); + UnlockRegister(tmp_reg); + reg.Load(host_reg, RegType::Register); [[fallthrough]]; } @@ -483,7 +486,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) return host_reg; // Else convert this register back to a double. - m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); + const ARM64Reg tmp_reg = GetReg(); + m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg); + UnlockRegister(tmp_reg); + reg.Load(host_reg, RegType::LowerPair); [[fallthrough]]; } @@ -517,7 +523,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type) return host_reg; } - m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); + const ARM64Reg tmp_reg = GetReg(); + m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg); + UnlockRegister(tmp_reg); + reg.Load(host_reg, RegType::Duplicated); [[fallthrough]]; } @@ -594,7 +603,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type) { case RegType::Single: flush_reg = GetReg(); - m_jit->ConvertSingleToDoublePair(flush_reg, host_reg); + m_jit->ConvertSingleToDoublePair(flush_reg, host_reg, flush_reg); [[fallthrough]]; case RegType::Register: // We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit @@ -605,7 +614,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type) break; case RegType::DuplicatedSingle: flush_reg = GetReg(); - m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg); + m_jit->ConvertSingleToDoubleLower(flush_reg, host_reg, flush_reg); [[fallthrough]]; case RegType::Duplicated: // Store PSR1 (which is equal to PSR0) in memory. @@ -709,17 +718,20 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state) const bool dirty = reg.IsDirty(); RegType type = reg.GetType(); + // If FlushRegister calls GetReg with all registers locked, we can get infinite recursion + const ARM64Reg tmp_reg = GetUnlockedRegisterCount() > 0 ? GetReg() : ARM64Reg::INVALID_REG; + // If we're in single mode, just convert it back to a double. if (type == RegType::Single) { if (dirty) - m_jit->ConvertSingleToDoublePair(host_reg, host_reg); + m_jit->ConvertSingleToDoublePair(host_reg, host_reg, tmp_reg); type = RegType::Register; } if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle) { if (dirty) - m_jit->ConvertSingleToDoubleLower(host_reg, host_reg); + m_jit->ConvertSingleToDoubleLower(host_reg, host_reg, tmp_reg); if (type == RegType::DuplicatedSingle) type = RegType::Duplicated; @@ -771,6 +783,9 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state) reg.Flush(); } } + + if (tmp_reg != ARM64Reg::INVALID_REG) + UnlockRegister(tmp_reg); } void Arm64FPRCache::FlushRegisters(BitSet32 regs, bool maintain_state) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index 465e8fef67..8375687c87 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -168,6 +168,9 @@ public: void UpdateLastUsed(BitSet32 regs_used); + // Get available host registers + u32 GetUnlockedRegisterCount() const; + // Locks a register so a cache cannot use it // Useful for function calls template @@ -211,9 +214,6 @@ protected: void DiscardRegister(size_t preg); virtual void FlushRegister(size_t preg, bool maintain_state) = 0; - // Get available host registers - u32 GetUnlockedRegisterCount() const; - void IncrementAllUsed() { for (auto& reg : m_guest_registers)