From bfe8b1068dfba317689beac2509c7f804c90bbe9 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 10 Apr 2021 12:43:02 +0200 Subject: [PATCH] JitArm64: Implement FPRF updates --- Source/Core/Core/PowerPC/Gekko.h | 1 + Source/Core/Core/PowerPC/JitArm64/Jit.h | 3 + .../JitArm64/JitArm64_FloatingPoint.cpp | 61 ++++++++++-- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 15 +-- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 94 +++++++++++++++++++ .../Core/PowerPC/JitCommon/JitAsmCommon.h | 2 + 6 files changed, 162 insertions(+), 14 deletions(-) diff --git a/Source/Core/Core/PowerPC/Gekko.h b/Source/Core/Core/PowerPC/Gekko.h index 590bd5c3ea..9e3e006c9c 100644 --- a/Source/Core/Core/PowerPC/Gekko.h +++ b/Source/Core/Core/PowerPC/Gekko.h @@ -399,6 +399,7 @@ union UReg_MSR }; #define FPRF_SHIFT 12 +#define FPRF_WIDTH 5 #define FPRF_MASK (0x1F << FPRF_SHIFT) // FPSCR exception flags diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 2ec565943b..7d7bba6404 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -234,6 +234,7 @@ protected: void GenerateCommonAsm(); void GenerateConvertDoubleToSingle(); void GenerateConvertSingleToDouble(); + void GenerateFPRF(bool single); void GenerateQuantizedLoadStores(); // Profiling @@ -262,6 +263,8 @@ protected: Arm64Gen::ARM64Reg), bool Rc = false); + void SetFPRFIfNeeded(bool single, Arm64Gen::ARM64Reg reg); + // std::map m_fault_to_handler; std::map m_handler_to_loc; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 4410ecd564..57ad09c1c8 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -9,6 +9,7 @@ #include "Core/ConfigManager.h" #include "Core/Core.h" #include "Core/CoreTiming.h" +#include "Core/PowerPC/Gekko.h" #include "Core/PowerPC/JitArm64/Jit.h" #include "Core/PowerPC/JitArm64/JitArm64_RegCache.h" #include "Core/PowerPC/PPCTables.h" @@ -16,12 +17,25 @@ using namespace Arm64Gen; +void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg) +{ + if (!SConfig::GetInstance().bFPRF || !js.op->wantsFPRF) + return; + + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + + reg = single ? EncodeRegToSingle(reg) : EncodeRegToDouble(reg); + m_float_emit.FMOV(single ? ARM64Reg::W0 : ARM64Reg::X0, reg); + BL(single ? GetAsmRoutines()->fprf_single : GetAsmRoutines()->fprf_double); + + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); +} + void JitArm64::fp_arith(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITFloatingPointOff); FALLBACK_IF(inst.Rc); - FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF); u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; u32 op5 = inst.SUBOP5; @@ -120,13 +134,17 @@ void JitArm64::fp_arith(UGeckoInstruction inst) } } - if (single || packed) + const bool outputs_are_singles = single || packed; + + if (outputs_are_singles) { ASSERT_MSG(DYNA_REC, inputs_are_singles == inputs_are_singles_func(), "Register allocation turned singles into doubles in the middle of fp_arith"); fpr.FixSinglePrecision(d); } + + SetFPRFIfNeeded(outputs_are_singles, VD); } void JitArm64::fp_logic(UGeckoInstruction inst) @@ -252,7 +270,6 @@ void JitArm64::frspx(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITFloatingPointOff); FALLBACK_IF(inst.Rc); - FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF); const u32 b = inst.FB; const u32 d = inst.FD; @@ -269,6 +286,8 @@ void JitArm64::frspx(UGeckoInstruction inst) ASSERT_MSG(DYNA_REC, fpr.IsSingle(b, true), "Register allocation turned singles into doubles in the middle of frspx"); + + SetFPRFIfNeeded(true, VD); } else { @@ -276,6 +295,8 @@ void JitArm64::frspx(UGeckoInstruction inst) const ARM64Reg VD = fpr.RW(d, RegType::DuplicatedSingle); m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB)); + + SetFPRFIfNeeded(true, VD); } } @@ -283,7 +304,8 @@ void JitArm64::fcmpX(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITFloatingPointOff); - FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF); + + const bool fprf = SConfig::GetInstance().bFPRF && js.op->wantsFPRF; const u32 a = inst.FA; const u32 b = inst.FB; @@ -299,6 +321,14 @@ void JitArm64::fcmpX(UGeckoInstruction inst) gpr.BindCRToRegister(crf, false); const ARM64Reg XA = gpr.CR(crf); + ARM64Reg fpscr_reg; + if (fprf) + { + fpscr_reg = gpr.GetReg(); + LDR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr)); + ANDI2R(fpscr_reg, fpscr_reg, ~FPRF_MASK); + } + FixupBranch pNaN, pLesser, pGreater; FixupBranch continue1, continue2, continue3; ORR(XA, ARM64Reg::ZR, 32, 0, true); @@ -317,11 +347,16 @@ void JitArm64::fcmpX(UGeckoInstruction inst) // A == B ORR(XA, XA, 64 - 63, 0, true); + if (fprf) + ORRI2R(fpscr_reg, fpscr_reg, PowerPC::CR_EQ << FPRF_SHIFT); + continue1 = B(); SetJumpTarget(pNaN); MOVI2R(XA, PowerPC::ConditionRegister::PPCToInternal(PowerPC::CR_SO)); + if (fprf) + ORRI2R(fpscr_reg, fpscr_reg, PowerPC::CR_SO << FPRF_SHIFT); if (a != b) { @@ -329,12 +364,16 @@ void JitArm64::fcmpX(UGeckoInstruction inst) SetJumpTarget(pGreater); ORR(XA, XA, 0, 0, true); + if (fprf) + ORRI2R(fpscr_reg, fpscr_reg, PowerPC::CR_GT << FPRF_SHIFT); continue3 = B(); SetJumpTarget(pLesser); ORR(XA, XA, 64 - 62, 1, true); ORR(XA, XA, 0, 0, true); + if (fprf) + ORRI2R(fpscr_reg, fpscr_reg, PowerPC::CR_LT << FPRF_SHIFT); SetJumpTarget(continue2); SetJumpTarget(continue3); @@ -343,6 +382,12 @@ void JitArm64::fcmpX(UGeckoInstruction inst) ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a, true) && fpr.IsSingle(b, true)), "Register allocation turned singles into doubles in the middle of fcmpX"); + + if (fprf) + { + STR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr)); + gpr.Unlock(fpscr_reg); + } } void JitArm64::fctiwzx(UGeckoInstruction inst) @@ -371,12 +416,12 @@ void JitArm64::fctiwzx(UGeckoInstruction inst) } else { - const ARM64Reg V1 = gpr.GetReg(); + const ARM64Reg WA = gpr.GetReg(); - m_float_emit.FCVTS(V1, EncodeRegToDouble(VB), RoundingMode::Z); - m_float_emit.FMOV(EncodeRegToSingle(VD), V1); + m_float_emit.FCVTS(WA, EncodeRegToDouble(VB), RoundingMode::Z); + m_float_emit.FMOV(EncodeRegToSingle(VD), WA); - gpr.Unlock(V1); + gpr.Unlock(WA); } m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(VD), EncodeRegToDouble(V0)); fpr.Unlock(V0); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 7055a05079..068a4ed1bb 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -76,7 +76,6 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITPairedOff); FALLBACK_IF(inst.Rc); - FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF); const u32 a = inst.FA; const u32 c = inst.FC; @@ -99,6 +98,8 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst) "Register allocation turned singles into doubles in the middle of ps_mulsX"); fpr.FixSinglePrecision(d); + + SetFPRFIfNeeded(true, VD); } void JitArm64::ps_maddXX(UGeckoInstruction inst) @@ -106,7 +107,6 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITPairedOff); FALLBACK_IF(inst.Rc); - FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF); const u32 a = inst.FA; const u32 b = inst.FB; @@ -257,13 +257,15 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) break; } + if (V0Q != ARM64Reg::INVALID_REG) + fpr.Unlock(V0Q); + ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)), "Register allocation turned singles into doubles in the middle of ps_maddXX"); fpr.FixSinglePrecision(d); - if (V0Q != ARM64Reg::INVALID_REG) - fpr.Unlock(V0Q); + SetFPRFIfNeeded(true, VD); } void JitArm64::ps_sel(UGeckoInstruction inst) @@ -311,7 +313,6 @@ void JitArm64::ps_sumX(UGeckoInstruction inst) INSTRUCTION_START JITDISABLE(bJITPairedOff); FALLBACK_IF(inst.Rc); - FALLBACK_IF(SConfig::GetInstance().bFPRF && js.op->wantsFPRF); const u32 a = inst.FA; const u32 b = inst.FB; @@ -343,10 +344,12 @@ void JitArm64::ps_sumX(UGeckoInstruction inst) m_float_emit.INS(size, VD, upper ? 1 : 0, V0, upper ? 1 : 0); } + fpr.Unlock(V0); + ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)), "Register allocation turned singles into doubles in the middle of ps_sumX"); fpr.FixSinglePrecision(d); - fpr.Unlock(V0); + SetFPRFIfNeeded(true, VD); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 930d8240be..7dee0d8079 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -4,11 +4,14 @@ #include "Common/Arm64Emitter.h" #include "Common/CommonTypes.h" +#include "Common/FloatUtils.h" #include "Common/JitRegister.h" #include "Common/MathUtil.h" + #include "Core/CoreTiming.h" #include "Core/HW/CPU.h" #include "Core/HW/Memmap.h" +#include "Core/PowerPC/Gekko.h" #include "Core/PowerPC/JitArm64/Jit.h" #include "Core/PowerPC/JitCommon/JitAsmCommon.h" #include "Core/PowerPC/JitCommon/JitCache.h" @@ -203,6 +206,12 @@ void JitArm64::GenerateCommonAsm() GenerateConvertSingleToDouble(); JitRegister::Register(GetAsmRoutines()->cstd, GetCodePtr(), "JIT_cstd"); + GetAsmRoutines()->fprf_single = GetCodePtr(); + GenerateFPRF(true); + GetAsmRoutines()->fprf_double = GetCodePtr(); + GenerateFPRF(false); + JitRegister::Register(GetAsmRoutines()->fprf_single, GetCodePtr(), "JIT_FPRF"); + GenerateQuantizedLoadStores(); } @@ -272,6 +281,91 @@ void JitArm64::GenerateConvertSingleToDouble() RET(); } +// Input in X0. Outputs to memory (PPCState). Clobbers X0-X4 and flags. +void JitArm64::GenerateFPRF(bool single) +{ + const auto reg_encoder = single ? EncodeRegTo32 : EncodeRegTo64; + + const ARM64Reg input_reg = reg_encoder(ARM64Reg::W0); + const ARM64Reg temp_reg = reg_encoder(ARM64Reg::W1); + const ARM64Reg exp_reg = reg_encoder(ARM64Reg::W2); + + constexpr ARM64Reg fprf_reg = ARM64Reg::W3; + constexpr ARM64Reg fpscr_reg = ARM64Reg::W4; + + const auto INPUT_EXP_MASK = single ? Common::FLOAT_EXP : Common::DOUBLE_EXP; + const auto INPUT_FRAC_MASK = single ? Common::FLOAT_FRAC : Common::DOUBLE_FRAC; + constexpr u32 OUTPUT_SIGN_MASK = 0xC; + + // This code is duplicated for the most common cases for performance. + // For the less common cases, we branch to an existing copy of this code. + auto emit_write_fprf_and_ret = [&] { + BFI(fpscr_reg, fprf_reg, FPRF_SHIFT, FPRF_WIDTH); + STR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr)); + RET(); + }; + + // First of all, start the load of the old FPSCR value, in case it takes a while + LDR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr)); + + CMP(input_reg, 0); // Grab sign bit (conveniently the same bit for floats as for integers) + ANDI2R(exp_reg, input_reg, INPUT_EXP_MASK); // Grab exponent + + // Most branches handle the sign in the same way. Perform that handling before branching + MOVI2R(ARM64Reg::W3, Common::PPC_FPCLASS_PN); + MOVI2R(ARM64Reg::W1, Common::PPC_FPCLASS_NN); + CSEL(fprf_reg, ARM64Reg::W1, ARM64Reg::W3, CCFlags::CC_LT); + + FixupBranch zero_or_denormal = CBZ(exp_reg); + + // exp != 0 + MOVI2R(temp_reg, INPUT_EXP_MASK); + CMP(exp_reg, temp_reg); + FixupBranch nan_or_inf = B(CCFlags::CC_EQ); + + // exp != 0 && exp != EXP_MASK + const u8* normal = GetCodePtr(); + emit_write_fprf_and_ret(); + + // exp == 0 + SetJumpTarget(zero_or_denormal); + TSTI2R(input_reg, INPUT_FRAC_MASK); + FixupBranch denormal; + if (single) + { + // To match the interpreter, what we output should be based on how the input would be classified + // after conversion to double. Converting a denormal single to a double always results in a + // normal double, so for denormal singles we need to output PPC_FPCLASS_PN/PPC_FPCLASS_NN. + // TODO: Hardware test that the interpreter actually is correct. + B(CCFlags::CC_NEQ, normal); + } + else + { + denormal = B(CCFlags::CC_NEQ); + } + + // exp == 0 && frac == 0 + LSR(ARM64Reg::W1, fprf_reg, 3); + MOVI2R(fprf_reg, Common::PPC_FPCLASS_PZ & ~OUTPUT_SIGN_MASK); + BFI(fprf_reg, ARM64Reg::W1, 4, 1); + const u8* write_fprf_and_ret = GetCodePtr(); + emit_write_fprf_and_ret(); + + // exp == 0 && frac != 0 + if (!single) + SetJumpTarget(denormal); + ORRI2R(fprf_reg, fprf_reg, Common::PPC_FPCLASS_PD & ~OUTPUT_SIGN_MASK); + B(write_fprf_and_ret); + + // exp == EXP_MASK + SetJumpTarget(nan_or_inf); + TSTI2R(input_reg, INPUT_FRAC_MASK); + ORRI2R(ARM64Reg::W1, fprf_reg, Common::PPC_FPCLASS_PINF & ~OUTPUT_SIGN_MASK); + MOVI2R(ARM64Reg::W2, Common::PPC_FPCLASS_QNAN); + CSEL(fprf_reg, ARM64Reg::W1, ARM64Reg::W2, CCFlags::CC_EQ); + B(write_fprf_and_ret); +} + void JitArm64::GenerateQuantizedLoadStores() { // X0 is the scale diff --git a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h index c525e7849c..c4f4dc7ca9 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitAsmCommon.h @@ -27,6 +27,8 @@ struct CommonAsmRoutinesBase const u8* mfcr; const u8* cdts; const u8* cstd; + const u8* fprf_single; + const u8* fprf_double; // In: array index: GQR to use. // In: ECX: Address to read from.