diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 2aa3c99198..b24219171b 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -2310,6 +2310,12 @@ void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd (DecodeReg(Rn) << 5) | DecodeReg(Rd)); } +void ARM64FloatEmitter::EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + Write32((1 << 30) | (U << 29) | (0b11110001 << 21) | (size << 22) | (opcode << 12) | (1 << 11) | + (DecodeReg(Rn) << 5) | DecodeReg(Rd)); +} + void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) { ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __func__); @@ -3102,6 +3108,15 @@ void ARM64FloatEmitter::FSQRT(ARM64Reg Rd, ARM64Reg Rn) EmitScalar1Source(0, 0, IsDouble(Rd), 3, Rd, Rn); } +void ARM64FloatEmitter::FRECPE(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar2RegMisc(0, 2 | IsDouble(Rd), 0x1D, Rd, Rn); +} +void ARM64FloatEmitter::FRSQRTE(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalar2RegMisc(1, 2 | IsDouble(Rd), 0x1D, Rd, Rn); +} + // Scalar - 2 Source void ARM64FloatEmitter::FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 58caec8d08..1fd0c938e5 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -996,6 +996,8 @@ public: void FNEG(ARM64Reg Rd, ARM64Reg Rn); void FSQRT(ARM64Reg Rd, ARM64Reg Rn); void FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top = false); // Also generalized move between GPR/FP + void FRECPE(ARM64Reg Rd, ARM64Reg Rn); + void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn); // Scalar - 2 Source void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); @@ -1145,6 +1147,7 @@ private: ARM64Reg Rm); void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, 
ARM64Reg Rn); + void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 7d7bba6404..9936b55a2d 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -140,6 +140,8 @@ public: void fcmpX(UGeckoInstruction inst); void frspx(UGeckoInstruction inst); void fctiwzx(UGeckoInstruction inst); + void fresx(UGeckoInstruction inst); + void frsqrtex(UGeckoInstruction inst); // Paired void ps_maddXX(UGeckoInstruction inst); @@ -147,6 +149,8 @@ public: void ps_mulsX(UGeckoInstruction inst); void ps_sel(UGeckoInstruction inst); void ps_sumX(UGeckoInstruction inst); + void ps_res(UGeckoInstruction inst); + void ps_rsqrte(UGeckoInstruction inst); // Loadstore paired void psq_l(UGeckoInstruction inst); @@ -232,6 +236,8 @@ protected: // AsmRoutines void GenerateAsm(); void GenerateCommonAsm(); + void GenerateFres(); + void GenerateFrsqrte(); void GenerateConvertDoubleToSingle(); void GenerateConvertSingleToDouble(); void GenerateFPRF(bool single); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 57ad09c1c8..649a70037a 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -24,8 +24,16 @@ void JitArm64::SetFPRFIfNeeded(bool single, ARM64Reg reg) gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); - reg = single ? EncodeRegToSingle(reg) : EncodeRegToDouble(reg); - m_float_emit.FMOV(single ? ARM64Reg::W0 : ARM64Reg::X0, reg); + const ARM64Reg routine_input_reg = single ? 
ARM64Reg::W0 : ARM64Reg::X0; + if (IsVector(reg)) + { + m_float_emit.FMOV(routine_input_reg, single ? EncodeRegToSingle(reg) : EncodeRegToDouble(reg)); + } + else if (reg != routine_input_reg) + { + MOV(routine_input_reg, reg); + } + BL(single ? GetAsmRoutines()->fprf_single : GetAsmRoutines()->fprf_double); gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); @@ -430,6 +438,60 @@ void JitArm64::fctiwzx(UGeckoInstruction inst) "Register allocation turned singles into doubles in the middle of fctiwzx"); } +void JitArm64::fresx(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITFloatingPointOff); + FALLBACK_IF(inst.Rc); + + const u32 b = inst.FB; + const u32 d = inst.FD; + + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + fpr.Lock(ARM64Reg::Q0); + + const ARM64Reg VB = fpr.R(b, RegType::LowerPair); + m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB)); + m_float_emit.FRECPE(ARM64Reg::D0, EncodeRegToDouble(VB)); + + BL(GetAsmRoutines()->fres); + + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + fpr.Unlock(ARM64Reg::Q0); + + const ARM64Reg VD = fpr.RW(d, RegType::Duplicated); + m_float_emit.FMOV(EncodeRegToDouble(VD), ARM64Reg::X0); + + SetFPRFIfNeeded(false, ARM64Reg::X0); +} + +void JitArm64::frsqrtex(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITFloatingPointOff); + FALLBACK_IF(inst.Rc); + + const u32 b = inst.FB; + const u32 d = inst.FD; + + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + fpr.Lock(ARM64Reg::Q0); + + const ARM64Reg VB = fpr.R(b, RegType::LowerPair); + m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB)); + m_float_emit.FRSQRTE(ARM64Reg::D0, EncodeRegToDouble(VB)); + + BL(GetAsmRoutines()->frsqrte); + + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + fpr.Unlock(ARM64Reg::Q0); 
+ + const ARM64Reg VD = fpr.RW(d, RegType::LowerPair); + m_float_emit.FMOV(EncodeRegToDouble(VD), ARM64Reg::X0); + + SetFPRFIfNeeded(false, ARM64Reg::X0); +} + // Since the following float conversion functions are used in non-arithmetic PPC float // instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs // into QNaNs. This means we can't just use FCVT/FCVTL/FCVTN. diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 068a4ed1bb..1808d14f51 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -353,3 +353,67 @@ void JitArm64::ps_sumX(UGeckoInstruction inst) SetFPRFIfNeeded(true, VD); } + +void JitArm64::ps_res(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff); + FALLBACK_IF(inst.Rc); + + const u32 b = inst.FB; + const u32 d = inst.FD; + + gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + fpr.Lock(ARM64Reg::Q0); + + const ARM64Reg VB = fpr.R(b, RegType::Register); + const ARM64Reg VD = fpr.RW(d, RegType::Register); + + m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB)); + m_float_emit.FRECPE(64, ARM64Reg::Q0, EncodeRegToQuad(VB)); + BL(GetAsmRoutines()->fres); + m_float_emit.UMOV(64, ARM64Reg::X1, EncodeRegToQuad(VB), 1); + m_float_emit.DUP(64, ARM64Reg::Q0, ARM64Reg::Q0, 1); + m_float_emit.FMOV(EncodeRegToDouble(VD), ARM64Reg::X0); + BL(GetAsmRoutines()->fres); + m_float_emit.INS(64, EncodeRegToQuad(VD), 1, ARM64Reg::X0); + + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + fpr.Unlock(ARM64Reg::Q0); + + fpr.FixSinglePrecision(d); + + SetFPRFIfNeeded(true, VD); +} + +void JitArm64::ps_rsqrte(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITPairedOff); + FALLBACK_IF(inst.Rc); + + const u32 b = inst.FB; + const u32 d = inst.FD; + + 
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + fpr.Lock(ARM64Reg::Q0); + + const ARM64Reg VB = fpr.R(b, RegType::Register); + const ARM64Reg VD = fpr.RW(d, RegType::Register); + + m_float_emit.FMOV(ARM64Reg::X1, EncodeRegToDouble(VB)); + m_float_emit.FRSQRTE(64, ARM64Reg::Q0, EncodeRegToQuad(VB)); + BL(GetAsmRoutines()->frsqrte); + m_float_emit.UMOV(64, ARM64Reg::X1, EncodeRegToQuad(VB), 1); + m_float_emit.DUP(64, ARM64Reg::Q0, ARM64Reg::Q0, 1); + m_float_emit.FMOV(EncodeRegToDouble(VD), ARM64Reg::X0); + BL(GetAsmRoutines()->frsqrte); + m_float_emit.INS(64, EncodeRegToQuad(VD), 1, ARM64Reg::X0); + + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W4, ARM64Reg::W30); + fpr.Unlock(ARM64Reg::Q0); + + fpr.FixSinglePrecision(d); + + SetFPRFIfNeeded(true, VD); +} diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp index 4d63101718..471a4566c8 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp @@ -106,23 +106,23 @@ constexpr std::array table4{{ }}; constexpr std::array table4_2{{ - {10, &JitArm64::ps_sumX}, // ps_sum0 - {11, &JitArm64::ps_sumX}, // ps_sum1 - {12, &JitArm64::ps_mulsX}, // ps_muls0 - {13, &JitArm64::ps_mulsX}, // ps_muls1 - {14, &JitArm64::ps_maddXX}, // ps_madds0 - {15, &JitArm64::ps_maddXX}, // ps_madds1 - {18, &JitArm64::fp_arith}, // ps_div - {20, &JitArm64::fp_arith}, // ps_sub - {21, &JitArm64::fp_arith}, // ps_add - {23, &JitArm64::ps_sel}, // ps_sel - {24, &JitArm64::FallBackToInterpreter}, // ps_res - {25, &JitArm64::fp_arith}, // ps_mul - {26, &JitArm64::FallBackToInterpreter}, // ps_rsqrte - {28, &JitArm64::ps_maddXX}, // ps_msub - {29, &JitArm64::ps_maddXX}, // ps_madd - {30, &JitArm64::ps_maddXX}, // ps_nmsub - {31, &JitArm64::ps_maddXX}, // ps_nmadd + {10, &JitArm64::ps_sumX}, // ps_sum0 + {11, &JitArm64::ps_sumX}, // 
ps_sum1 + {12, &JitArm64::ps_mulsX}, // ps_muls0 + {13, &JitArm64::ps_mulsX}, // ps_muls1 + {14, &JitArm64::ps_maddXX}, // ps_madds0 + {15, &JitArm64::ps_maddXX}, // ps_madds1 + {18, &JitArm64::fp_arith}, // ps_div + {20, &JitArm64::fp_arith}, // ps_sub + {21, &JitArm64::fp_arith}, // ps_add + {23, &JitArm64::ps_sel}, // ps_sel + {24, &JitArm64::ps_res}, // ps_res + {25, &JitArm64::fp_arith}, // ps_mul + {26, &JitArm64::ps_rsqrte}, // ps_rsqrte + {28, &JitArm64::ps_maddXX}, // ps_msub + {29, &JitArm64::ps_maddXX}, // ps_madd + {30, &JitArm64::ps_maddXX}, // ps_nmsub + {31, &JitArm64::ps_maddXX}, // ps_nmadd }}; constexpr std::array table4_3{{ @@ -293,15 +293,15 @@ constexpr std::array table31{{ }}; constexpr std::array table59{{ - {18, &JitArm64::fp_arith}, // fdivsx - {20, &JitArm64::fp_arith}, // fsubsx - {21, &JitArm64::fp_arith}, // faddsx - {24, &JitArm64::FallBackToInterpreter}, // fresx - {25, &JitArm64::fp_arith}, // fmulsx - {28, &JitArm64::fp_arith}, // fmsubsx - {29, &JitArm64::fp_arith}, // fmaddsx - {30, &JitArm64::fp_arith}, // fnmsubsx - {31, &JitArm64::fp_arith}, // fnmaddsx + {18, &JitArm64::fp_arith}, // fdivsx + {20, &JitArm64::fp_arith}, // fsubsx + {21, &JitArm64::fp_arith}, // faddsx + {24, &JitArm64::fresx}, // fresx + {25, &JitArm64::fp_arith}, // fmulsx + {28, &JitArm64::fp_arith}, // fmsubsx + {29, &JitArm64::fp_arith}, // fmaddsx + {30, &JitArm64::fp_arith}, // fnmsubsx + {31, &JitArm64::fp_arith}, // fnmaddsx }}; constexpr std::array table63{{ @@ -324,16 +324,16 @@ constexpr std::array table63{{ }}; constexpr std::array table63_2{{ - {18, &JitArm64::fp_arith}, // fdivx - {20, &JitArm64::fp_arith}, // fsubx - {21, &JitArm64::fp_arith}, // faddx - {23, &JitArm64::fselx}, // fselx - {25, &JitArm64::fp_arith}, // fmulx - {26, &JitArm64::FallBackToInterpreter}, // frsqrtex - {28, &JitArm64::fp_arith}, // fmsubx - {29, &JitArm64::fp_arith}, // fmaddx - {30, &JitArm64::fp_arith}, // fnmsubx - {31, &JitArm64::fp_arith}, // fnmaddx + {18, 
&JitArm64::fp_arith}, // fdivx + {20, &JitArm64::fp_arith}, // fsubx + {21, &JitArm64::fp_arith}, // faddx + {23, &JitArm64::fselx}, // fselx + {25, &JitArm64::fp_arith}, // fmulx + {26, &JitArm64::frsqrtex}, // frsqrtex + {28, &JitArm64::fp_arith}, // fmsubx + {29, &JitArm64::fp_arith}, // fmaddx + {30, &JitArm64::fp_arith}, // fnmsubx + {31, &JitArm64::fp_arith}, // fnmaddx }}; constexpr std::array dynaOpTable = [] { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 7dee0d8079..e38a5706e2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -2,7 +2,10 @@ // Licensed under GPLv2+ // Refer to the license.txt file included. +#include + #include "Common/Arm64Emitter.h" +#include "Common/BitUtils.h" #include "Common/CommonTypes.h" #include "Common/FloatUtils.h" #include "Common/JitRegister.h" @@ -198,6 +201,14 @@ void JitArm64::GenerateAsm() void JitArm64::GenerateCommonAsm() { + GetAsmRoutines()->fres = GetCodePtr(); + GenerateFres(); + JitRegister::Register(GetAsmRoutines()->fres, GetCodePtr(), "JIT_fres"); + + GetAsmRoutines()->frsqrte = GetCodePtr(); + GenerateFrsqrte(); + JitRegister::Register(GetAsmRoutines()->frsqrte, GetCodePtr(), "JIT_frsqrte"); + GetAsmRoutines()->cdts = GetCodePtr(); GenerateConvertDoubleToSingle(); JitRegister::Register(GetAsmRoutines()->cdts, GetCodePtr(), "JIT_cdts"); @@ -215,6 +226,125 @@ void JitArm64::GenerateCommonAsm() GenerateQuantizedLoadStores(); } +// Input: X1 contains input, and D0 contains result of running the input through AArch64 FRECPE. +// Output in X0 and memory (PPCState). Clobbers X0-X4 and flags. +void JitArm64::GenerateFres() +{ + // The idea behind this implementation: AArch64's frecpe instruction calculates the exponent and + // sign the same way as PowerPC's fresx does. For the special inputs zero, NaN and infinity, + // even the mantissa matches. 
But the mantissa does not match for most other inputs, so in the + // normal case we calculate the mantissa using the table-based algorithm from the interpreter. + + UBFX(ARM64Reg::X2, ARM64Reg::X1, 52, 11); // Grab the exponent + m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0); + CMP(ARM64Reg::X2, 895); + ANDI2R(ARM64Reg::X3, ARM64Reg::X1, Common::DOUBLE_SIGN); + FixupBranch small_exponent = B(CCFlags::CC_LO); + + MOVI2R(ARM64Reg::X4, 1148LL); + CMP(ARM64Reg::X2, ARM64Reg::X4); + FixupBranch large_exponent = B(CCFlags::CC_HI); + + UBFX(ARM64Reg::X2, ARM64Reg::X1, 47, 5); // Grab upper part of mantissa + MOVP2R(ARM64Reg::X3, &Common::fres_expected); + ADD(ARM64Reg::X2, ARM64Reg::X3, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3)); + LDP(IndexType::Signed, ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::X2, 0); + UBFX(ARM64Reg::X1, ARM64Reg::X1, 37, 10); // Grab lower part of mantissa + MOVI2R(ARM64Reg::W4, 1); + ANDI2R(ARM64Reg::X0, ARM64Reg::X0, Common::DOUBLE_SIGN | Common::DOUBLE_EXP); + MADD(ARM64Reg::W1, ARM64Reg::W3, ARM64Reg::W1, ARM64Reg::W4); + SUB(ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W1, ArithOption(ARM64Reg::W1, ShiftType::LSR, 1)); + ORR(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X1, ArithOption(ARM64Reg::X1, ShiftType::LSL, 29)); + RET(); + + SetJumpTarget(small_exponent); + TSTI2R(ARM64Reg::X1, Common::DOUBLE_EXP | Common::DOUBLE_FRAC); + FixupBranch zero = B(CCFlags::CC_EQ); + MOVI2R(ARM64Reg::X4, + Common::BitCast(static_cast(std::numeric_limits::max()))); + ORR(ARM64Reg::X0, ARM64Reg::X3, ARM64Reg::X4); + RET(); + + SetJumpTarget(zero); + LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); + FixupBranch skip_set_zx = TBNZ(ARM64Reg::W4, 26); + ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2); + STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); + SetJumpTarget(skip_set_zx); + RET(); + + SetJumpTarget(large_exponent); + MOVI2R(ARM64Reg::X4, 0x7FF); + CMP(ARM64Reg::X2, ARM64Reg::X4); + 
CSEL(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X3, CCFlags::CC_EQ); + RET(); +} + +// Input: X1 contains input, and D0 contains result of running the input through AArch64 FRSQRTE. +// Output in X0 and memory (PPCState). Clobbers X0-X4 and flags. +void JitArm64::GenerateFrsqrte() +{ + // The idea behind this implementation: AArch64's frsqrte instruction calculates the exponent and + // sign the same way as PowerPC's frsqrtex does. For the special inputs zero, negative, NaN and + // inf, even the mantissa matches. But the mantissa does not match for most other inputs, so in + // the normal case we calculate the mantissa using the table-based algorithm from the interpreter. + + TSTI2R(ARM64Reg::X1, Common::DOUBLE_EXP | Common::DOUBLE_FRAC); + m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0); + FixupBranch zero = B(CCFlags::CC_EQ); + ANDI2R(ARM64Reg::X2, ARM64Reg::X1, Common::DOUBLE_EXP); + MOVI2R(ARM64Reg::X3, Common::DOUBLE_EXP); + CMP(ARM64Reg::X2, ARM64Reg::X3); + FixupBranch nan_or_inf = B(CCFlags::CC_EQ); + FixupBranch negative = TBNZ(ARM64Reg::X1, 63); + ANDI2R(ARM64Reg::X3, ARM64Reg::X1, Common::DOUBLE_FRAC); + FixupBranch normal = CBNZ(ARM64Reg::X2); + + // "Normalize" denormal values + CLZ(ARM64Reg::X3, ARM64Reg::X3); + SUB(ARM64Reg::X4, ARM64Reg::X3, 11); + MOVI2R(ARM64Reg::X2, 0x00C0'0000'0000'0000); + LSLV(ARM64Reg::X4, ARM64Reg::X1, ARM64Reg::X4); + SUB(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 52)); + ANDI2R(ARM64Reg::X3, ARM64Reg::X4, Common::DOUBLE_FRAC - 1); + + SetJumpTarget(normal); + LSR(ARM64Reg::X2, ARM64Reg::X2, 48); + ANDI2R(ARM64Reg::X2, ARM64Reg::X2, 0x10); + MOVP2R(ARM64Reg::X1, &Common::frsqrte_expected); + ORR(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSR, 48)); + EORI2R(ARM64Reg::X2, ARM64Reg::X2, 0x10); + ADD(ARM64Reg::X2, ARM64Reg::X1, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3)); + LDP(IndexType::Signed, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::X2, 0); 
+ UBFX(ARM64Reg::X3, ARM64Reg::X3, 37, 11); + ANDI2R(ARM64Reg::X0, ARM64Reg::X0, Common::DOUBLE_SIGN | Common::DOUBLE_EXP); + MSUB(ARM64Reg::W3, ARM64Reg::W3, ARM64Reg::W2, ARM64Reg::W1); + ORR(ARM64Reg::X0, ARM64Reg::X0, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 26)); + RET(); + + SetJumpTarget(zero); + LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); + FixupBranch skip_set_zx = TBNZ(ARM64Reg::W4, 26); + ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_ZX, ARM64Reg::W2); + STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); + SetJumpTarget(skip_set_zx); + RET(); + + SetJumpTarget(nan_or_inf); + MOVI2R(ARM64Reg::X3, Common::BitCast(-std::numeric_limits::infinity())); + CMP(ARM64Reg::X1, ARM64Reg::X3); + FixupBranch nan_or_positive_inf = B(CCFlags::CC_NEQ); + + SetJumpTarget(negative); + LDR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); + FixupBranch skip_set_vxsqrt = TBNZ(ARM64Reg::W4, 9); + ORRI2R(ARM64Reg::W4, ARM64Reg::W4, FPSCR_FX | FPSCR_VXSQRT, ARM64Reg::W2); + STR(IndexType::Unsigned, ARM64Reg::W4, PPC_REG, PPCSTATE_OFF(fpscr)); + SetJumpTarget(skip_set_vxsqrt); + SetJumpTarget(nan_or_positive_inf); + RET(); +} + // Input in X0, output in W1, clobbers X0-X3 and flags. 
void JitArm64::GenerateConvertDoubleToSingle() { diff --git a/Source/UnitTests/Core/CMakeLists.txt b/Source/UnitTests/Core/CMakeLists.txt index 6dd344e638..134f7da78b 100644 --- a/Source/UnitTests/Core/CMakeLists.txt +++ b/Source/UnitTests/Core/CMakeLists.txt @@ -25,6 +25,8 @@ elseif(_M_ARM_64) PowerPC/DivUtilsTest.cpp PowerPC/JitArm64/ConvertSingleDouble.cpp PowerPC/JitArm64/FPRF.cpp + PowerPC/JitArm64/Fres.cpp + PowerPC/JitArm64/Frsqrte.cpp PowerPC/JitArm64/MovI2R.cpp ) else() diff --git a/Source/UnitTests/Core/PowerPC/JitArm64/Fres.cpp b/Source/UnitTests/Core/PowerPC/JitArm64/Fres.cpp new file mode 100644 index 0000000000..027a3ac89f --- /dev/null +++ b/Source/UnitTests/Core/PowerPC/JitArm64/Fres.cpp @@ -0,0 +1,66 @@ +// Copyright 2021 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include + +#include "Common/Arm64Emitter.h" +#include "Common/BitUtils.h" +#include "Common/CommonTypes.h" +#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h" +#include "Core/PowerPC/JitArm64/Jit.h" +#include "Core/PowerPC/PowerPC.h" + +#include "../TestValues.h" + +#include + +namespace +{ +using namespace Arm64Gen; + +class TestFres : public JitArm64 +{ +public: + TestFres() + { + AllocCodeSpace(4096); + + const u8* raw_fres = GetCodePtr(); + GenerateFres(); + + fres = Common::BitCast(GetCodePtr()); + MOV(ARM64Reg::X15, ARM64Reg::X30); + MOV(ARM64Reg::X14, PPC_REG); + MOVP2R(PPC_REG, &PowerPC::ppcState); + MOV(ARM64Reg::X1, ARM64Reg::X0); + m_float_emit.FMOV(ARM64Reg::D0, ARM64Reg::X0); + m_float_emit.FRECPE(ARM64Reg::D0, ARM64Reg::D0); + BL(raw_fres); + MOV(ARM64Reg::X30, ARM64Reg::X15); + MOV(PPC_REG, ARM64Reg::X14); + RET(); + } + + std::function fres; +}; + +} // namespace + +TEST(JitArm64, Fres) +{ + TestFres test; + + for (const u64 ivalue : double_test_values) + { + const double dvalue = Common::BitCast(ivalue); + + const u64 expected = Common::BitCast(Common::ApproximateReciprocal(dvalue)); + const u64 actual = 
test.fres(ivalue); + + if (expected != actual) + fmt::print("{:016x} -> {:016x} == {:016x}\n", ivalue, actual, expected); + + EXPECT_EQ(expected, actual); + } +} diff --git a/Source/UnitTests/Core/PowerPC/JitArm64/Frsqrte.cpp b/Source/UnitTests/Core/PowerPC/JitArm64/Frsqrte.cpp new file mode 100644 index 0000000000..749b147dcb --- /dev/null +++ b/Source/UnitTests/Core/PowerPC/JitArm64/Frsqrte.cpp @@ -0,0 +1,66 @@ +// Copyright 2021 Dolphin Emulator Project +// Licensed under GPLv2+ +// Refer to the license.txt file included. + +#include + +#include "Common/Arm64Emitter.h" +#include "Common/BitUtils.h" +#include "Common/CommonTypes.h" +#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h" +#include "Core/PowerPC/JitArm64/Jit.h" +#include "Core/PowerPC/PowerPC.h" + +#include "../TestValues.h" + +#include + +namespace +{ +using namespace Arm64Gen; + +class TestFrsqrte : public JitArm64 +{ +public: + TestFrsqrte() + { + AllocCodeSpace(4096); + + const u8* raw_frsqrte = GetCodePtr(); + GenerateFrsqrte(); + + frsqrte = Common::BitCast(GetCodePtr()); + MOV(ARM64Reg::X15, ARM64Reg::X30); + MOV(ARM64Reg::X14, PPC_REG); + MOVP2R(PPC_REG, &PowerPC::ppcState); + MOV(ARM64Reg::X1, ARM64Reg::X0); + m_float_emit.FMOV(ARM64Reg::D0, ARM64Reg::X0); + m_float_emit.FRSQRTE(ARM64Reg::D0, ARM64Reg::D0); + BL(raw_frsqrte); + MOV(ARM64Reg::X30, ARM64Reg::X15); + MOV(PPC_REG, ARM64Reg::X14); + RET(); + } + + std::function frsqrte; +}; + +} // namespace + +TEST(JitArm64, Frsqrte) +{ + TestFrsqrte test; + + for (const u64 ivalue : double_test_values) + { + const double dvalue = Common::BitCast(ivalue); + + const u64 expected = Common::BitCast(Common::ApproximateReciprocalSquareRoot(dvalue)); + const u64 actual = test.frsqrte(ivalue); + + if (expected != actual) + fmt::print("{:016x} -> {:016x} == {:016x}\n", ivalue, actual, expected); + + EXPECT_EQ(expected, actual); + } +} diff --git a/Source/UnitTests/Core/PowerPC/TestValues.h b/Source/UnitTests/Core/PowerPC/TestValues.h index 
20fbbe913c..a1df5b0ff5 100644 --- a/Source/UnitTests/Core/PowerPC/TestValues.h +++ b/Source/UnitTests/Core/PowerPC/TestValues.h @@ -8,7 +8,7 @@ #include "Common/CommonTypes.h" -constexpr std::array double_test_values{ +constexpr std::array double_test_values{ // Special values 0x0000'0000'0000'0000, // positive zero 0x0000'0000'0000'0001, // smallest positive denormal @@ -54,13 +54,25 @@ constexpr std::array double_test_values{ 0x3680'1234'5678'9ABC, 0x36A0'1234'5678'9ABC, 0x36B0'1234'5678'9ABC, 0xB680'1234'5678'9ABC, 0xB6A0'1234'5678'9ABC, 0xB6B0'1234'5678'9ABC, + // (exp > 1148) Boundary case for fres + 0x47C0'0000'0000'0000, // 2^125 = fres result is non-zero + 0x47D0'0000'0000'0000, // 2^126 = fres result is zero + 0xC7C0'0000'0000'0000, // -2^125 = fres result is non-zero + 0xC7D0'0000'0000'0000, // -2^126 = fres result is zero + + // (exp < 895) Boundary case for fres + 0x37F0'0000'0000'0000, // 2^(-128) = fres result is non-max + 0x37E0'0000'0000'0000, // 2^(-129) = fres result is max + 0xB7F0'0000'0000'0000, // -2^(-128) = fres result is non-max + 0xB7E0'0000'0000'0000, // -2^(-129) = fres result is max + // Some typical numbers 0x3FF8'0000'0000'0000, // 1.5 0x408F'4000'0000'0000, // 1000 0xC008'0000'0000'0000, // -3 }; -constexpr std::array single_test_values{ +constexpr std::array single_test_values{ // Special values 0x0000'0000, // positive zero 0x0000'0001, // smallest positive denormal @@ -89,6 +101,12 @@ constexpr std::array single_test_values{ 0xFFC0'0000, // first negative QNaN 0xFFFF'FFFF, // last negative QNaN + // (exp > 252) Boundary case for fres + 0x7E00'0000, // 2^125 = fres result is non-zero + 0x7E80'0000, // 2^126 = fres result is zero + 0xFE00'0000, // -2^125 = fres result is non-zero + 0xFE80'0000, // -2^126 = fres result is zero + // Some typical numbers 0x3FC0'0000, // 1.5 0x447A'0000, // 1000 diff --git a/Source/UnitTests/UnitTests.vcxproj b/Source/UnitTests/UnitTests.vcxproj index 69349d51b0..3d288aab80 100644 --- 
a/Source/UnitTests/UnitTests.vcxproj +++ b/Source/UnitTests/UnitTests.vcxproj @@ -84,6 +84,8 @@ + +