From 5c41d3b602559af56b618c6ef890b6af2fa95704 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 20 Nov 2022 23:06:16 +0100 Subject: [PATCH 1/2] JitArm64: Refactor temp reg handling in fp_arith/ps_arith --- .../JitArm64/JitArm64_FloatingPoint.cpp | 63 ++++---- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 141 ++++++------------ 2 files changed, 83 insertions(+), 121 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 26c6dfd1b7..bc0e427a66 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -71,6 +71,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst) const bool use_c = op5 >= 25; // fmul and all kind of fmaddXX const bool use_b = op5 != 25; // fmul uses no B + const bool fma = use_b && use_c; + const bool negate_result = (op5 & ~0x1) == 30; const bool output_is_single = inst.OPCD == 59; const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA); @@ -92,43 +94,44 @@ void JitArm64::fp_arith(UGeckoInstruction inst) const ARM64Reg VA = reg_encoder(fpr.R(a, type)); const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG; - ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; + const ARM64Reg VC = use_c ? 
reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out)); ARM64Reg V0Q = ARM64Reg::INVALID_REG; - ARM64Reg V1Q = ARM64Reg::INVALID_REG; + ARM64Reg rounded_c_reg = VC; if (round_c) { ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); - V1Q = fpr.GetReg(); - - Force25BitPrecision(reg_encoder(V1Q), VC); - VC = reg_encoder(V1Q); - } - - ARM64Reg inaccurate_fma_temp_reg = VD; - if (inaccurate_fma && d == b) - { V0Q = fpr.GetReg(); - - inaccurate_fma_temp_reg = reg_encoder(V0Q); + rounded_c_reg = reg_encoder(V0Q); + Force25BitPrecision(rounded_c_reg, VC); } + ARM64Reg inaccurate_fma_reg = VD; + if (fma && inaccurate_fma && VD == VB) + { + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetReg(); + inaccurate_fma_reg = reg_encoder(V0Q); + } + + ARM64Reg result_reg = VD; + switch (op5) { case 18: - m_float_emit.FDIV(VD, VA, VB); + m_float_emit.FDIV(result_reg, VA, VB); break; case 20: - m_float_emit.FSUB(VD, VA, VB); + m_float_emit.FSUB(result_reg, VA, VB); break; case 21: - m_float_emit.FADD(VD, VA, VB); + m_float_emit.FADD(result_reg, VA, VB); break; case 25: - m_float_emit.FMUL(VD, VA, VC); + m_float_emit.FMUL(result_reg, VA, rounded_c_reg); break; // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic], // the subtly different definitions affect how signed zeroes are handled. 
@@ -138,39 +141,41 @@ void JitArm64::fp_arith(UGeckoInstruction inst) case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)" if (inaccurate_fma) { - m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC); - m_float_emit.FSUB(VD, inaccurate_fma_temp_reg, VB); + m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg); + m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB); } else { - m_float_emit.FNMSUB(VD, VA, VC, VB); + m_float_emit.FNMSUB(result_reg, VA, rounded_c_reg, VB); } - if (op5 == 30) - m_float_emit.FNEG(VD, VD); break; case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm" case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)" if (inaccurate_fma) { - m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC); - m_float_emit.FADD(VD, inaccurate_fma_temp_reg, VB); + m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg); + m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB); } else { - m_float_emit.FMADD(VD, VA, VC, VB); + m_float_emit.FMADD(result_reg, VA, rounded_c_reg, VB); } - if (op5 == 31) - m_float_emit.FNEG(VD, VD); break; default: ASSERT_MSG(DYNA_REC, 0, "fp_arith"); break; } + + // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case + // for any of AArch64's FMA instructions, so we negate using a separate instruction. 
+ if (negate_result) + m_float_emit.FNEG(VD, result_reg); + else if (result_reg != VD) + m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg)); + if (V0Q != ARM64Reg::INVALID_REG) fpr.Unlock(V0Q); - if (V1Q != ARM64Reg::INVALID_REG) - fpr.Unlock(V1Q); if (output_is_single) { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 239a235533..4e986b0ce2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -85,6 +85,9 @@ void JitArm64::ps_arith(UGeckoInstruction inst) const bool use_c = op5 == 25 || (op5 & ~0x13) == 12; // mul, muls, and all kinds of maddXX const bool use_b = op5 != 25 && (op5 & ~0x1) != 12; // mul and muls don't use B + const bool fma = use_b && use_c; + const bool negate_result = (op5 & ~0x1) == 30; + const bool msub = op5 == 28 || op5 == 30; const auto singles_func = [&] { return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c)); @@ -99,147 +102,108 @@ void JitArm64::ps_arith(UGeckoInstruction inst) const ARM64Reg VA = reg_encoder(fpr.R(a, type)); const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG; - ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; + const ARM64Reg VC = use_c ? 
reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; const ARM64Reg VD = reg_encoder(fpr.RW(d, type)); ARM64Reg V0Q = ARM64Reg::INVALID_REG; - ARM64Reg V0 = ARM64Reg::INVALID_REG; ARM64Reg V1Q = ARM64Reg::INVALID_REG; - const auto allocate_v0_if_needed = [&] { - if (V0Q == ARM64Reg::INVALID_REG) - { - V0Q = fpr.GetReg(); - V0 = reg_encoder(V0Q); - } - }; - + ARM64Reg rounded_c_reg = VC; if (round_c) { ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single"); - V1Q = fpr.GetReg(); - - Force25BitPrecision(reg_encoder(V1Q), VC); - VC = reg_encoder(V1Q); + V0Q = fpr.GetReg(); + rounded_c_reg = reg_encoder(V0Q); + Force25BitPrecision(rounded_c_reg, VC); } - ARM64Reg inaccurate_fma_temp_reg = VD; - if (inaccurate_fma && d == b) + ARM64Reg inaccurate_fma_reg = VD; + if (fma && inaccurate_fma && VD == VB) { - allocate_v0_if_needed(); - inaccurate_fma_temp_reg = V0; + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetReg(); + inaccurate_fma_reg = reg_encoder(V0Q); } ARM64Reg result_reg = VD; + if (fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg)) + { + V1Q = fpr.GetReg(); + result_reg = reg_encoder(V1Q); + } + switch (op5) { case 12: // ps_muls0: d = a * c.ps0 - m_float_emit.FMUL(size, VD, VA, VC, 0); + m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 0); break; case 13: // ps_muls1: d = a * c.ps1 - m_float_emit.FMUL(size, VD, VA, VC, 1); + m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1); break; case 14: // ps_madds0: d = a * c.ps0 + b if (inaccurate_fma) { - m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC, 0); - m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB); - } - else if (VD == VB) - { - m_float_emit.FMLA(size, VD, VA, VC, 0); - } - else if (VD != VA && VD != VC) - { - m_float_emit.MOV(VD, VB); - m_float_emit.FMLA(size, VD, VA, VC, 0); + m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0); + m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB); } else { - 
allocate_v0_if_needed(); - m_float_emit.MOV(V0, VB); - m_float_emit.FMLA(size, V0, VA, VC, 0); - result_reg = V0; + if (result_reg != VB) + m_float_emit.MOV(result_reg, VB); + m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg, 0); } break; case 15: // ps_madds1: d = a * c.ps1 + b if (inaccurate_fma) { - m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC, 1); - m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB); - } - else if (VD == VB) - { - m_float_emit.FMLA(size, VD, VA, VC, 1); - } - else if (VD != VA && VD != VC) - { - m_float_emit.MOV(VD, VB); - m_float_emit.FMLA(size, VD, VA, VC, 1); + m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1); + m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB); } else { - allocate_v0_if_needed(); - m_float_emit.MOV(V0, VB); - m_float_emit.FMLA(size, V0, VA, VC, 1); - result_reg = V0; + if (result_reg != VB) + m_float_emit.MOV(result_reg, VB); + m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg, 1); } break; case 18: // ps_div - m_float_emit.FDIV(size, VD, VA, VB); + m_float_emit.FDIV(size, result_reg, VA, VB); break; case 20: // ps_sub - m_float_emit.FSUB(size, VD, VA, VB); + m_float_emit.FSUB(size, result_reg, VA, VB); break; case 21: // ps_add - m_float_emit.FADD(size, VD, VA, VB); + m_float_emit.FADD(size, result_reg, VA, VB); break; case 25: // ps_mul - m_float_emit.FMUL(size, VD, VA, VC); + m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg); break; case 28: // ps_msub: d = a * c - b case 30: // ps_nmsub: d = -(a * c - b) if (inaccurate_fma) { - m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC); - m_float_emit.FSUB(size, VD, inaccurate_fma_temp_reg, VB); - } - else if (VD != VA && VD != VC) - { - m_float_emit.FNEG(size, VD, VB); - m_float_emit.FMLA(size, VD, VA, VC); + m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg); + m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB); } else { - allocate_v0_if_needed(); - m_float_emit.FNEG(size, V0, VB); - 
m_float_emit.FMLA(size, V0, VA, VC); - result_reg = V0; + m_float_emit.FNEG(size, result_reg, VB); + m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg); } break; case 29: // ps_madd: d = a * c + b case 31: // ps_nmadd: d = -(a * c + b) if (inaccurate_fma) { - m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC); - m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB); - } - else if (VD == VB) - { - m_float_emit.FMLA(size, VD, VA, VC); - } - else if (VD != VA && VD != VC) - { - m_float_emit.MOV(VD, VB); - m_float_emit.FMLA(size, VD, VA, VC); + m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg); + m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB); } else { - allocate_v0_if_needed(); - m_float_emit.MOV(V0, VB); - m_float_emit.FMLA(size, V0, VA, VC); - result_reg = V0; + if (result_reg != VB) + m_float_emit.MOV(result_reg, VB); + m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg); } break; default: @@ -247,19 +211,12 @@ void JitArm64::ps_arith(UGeckoInstruction inst) break; } - switch (op5) - { - case 30: // ps_nmsub - case 31: // ps_nmadd - // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case - // for any of AArch64's FMA instructions, so we negate using a separate instruction. + // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case + // for any of AArch64's FMA instructions, so we negate using a separate instruction. 
+ if (negate_result) m_float_emit.FNEG(size, VD, result_reg); - break; - default: - if (result_reg != VD) - m_float_emit.MOV(VD, result_reg); - break; - } + else if (result_reg != VD) + m_float_emit.MOV(VD, result_reg); if (V0Q != ARM64Reg::INVALID_REG) fpr.Unlock(V0Q); From 06e60ac327c223f189103e2ed15e6896f92717e2 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 3 Dec 2022 17:37:51 +0100 Subject: [PATCH 2/2] JitArm64: Implement accurate NaNs For quite some time now, we've had a setting on x86-64 that makes Dolphin handle NaNs in a more accurate but slower way. There's only one game that cares about this, Dragon Ball: Revenge of King Piccolo, and what that game cares about more specifically is that the default NaN (or "generated NaN" as I believe it's called in PowerPC documentation) is the same as on PowerPC. On ARM, the default NaN is the same as on PowerPC, so for the longest time we didn't need to do anything special to get Dragon Ball: Revenge of King Piccolo working. However, in 93e636a I changed how we handle FMA instructions in a way that resulted in the sign of NaNs becoming inverted for nmadd/nmsub instructions, breaking the game. To fix this, let's implement the AccurateNaNs setting, like on x86-64. 
--- Source/Core/Common/Arm64Emitter.cpp | 28 +++ Source/Core/Common/Arm64Emitter.h | 8 + Source/Core/Core/PowerPC/JitArm64/Jit.h | 4 + .../JitArm64/JitArm64_FloatingPoint.cpp | 133 +++++++++++++- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 171 +++++++++++++++++- 5 files changed, 329 insertions(+), 15 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 084ab24902..87dae220df 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -2173,6 +2173,12 @@ void ARM64FloatEmitter::EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Re (DecodeReg(Rn) << 5) | DecodeReg(Rd)); } +void ARM64FloatEmitter::EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) +{ + Write32((1 << 30) | (U << 29) | (0b111100011 << 20) | (size << 22) | (opcode << 12) | (1 << 11) | + (DecodeReg(Rn) << 5) | DecodeReg(Rd)); +} + void ARM64FloatEmitter::Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) { ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "Singles are not supported!"); @@ -2985,6 +2991,28 @@ void ARM64FloatEmitter::FRSQRTE(ARM64Reg Rd, ARM64Reg Rn) EmitScalar2RegMisc(1, IsDouble(Rd) ? 3 : 2, 0x1D, Rd, Rn); } +// Scalar - pairwise +void ARM64FloatEmitter::FADDP(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalarPairwise(1, IsDouble(Rd), 0b01101, Rd, Rn); +} +void ARM64FloatEmitter::FMAXP(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalarPairwise(1, IsDouble(Rd), 0b01111, Rd, Rn); +} +void ARM64FloatEmitter::FMINP(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalarPairwise(1, IsDouble(Rd) ? 3 : 2, 0b01111, Rd, Rn); +} +void ARM64FloatEmitter::FMAXNMP(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalarPairwise(1, IsDouble(Rd), 0b01100, Rd, Rn); +} +void ARM64FloatEmitter::FMINNMP(ARM64Reg Rd, ARM64Reg Rn) +{ + EmitScalarPairwise(1, IsDouble(Rd) ? 
3 : 2, 0b01100, Rd, Rn); +} + // Scalar - 2 Source void ARM64FloatEmitter::ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 09a2633660..baaf598741 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -1130,6 +1130,13 @@ public: void FRECPE(ARM64Reg Rd, ARM64Reg Rn); void FRSQRTE(ARM64Reg Rd, ARM64Reg Rn); + // Scalar - pairwise + void FADDP(ARM64Reg Rd, ARM64Reg Rn); + void FMAXP(ARM64Reg Rd, ARM64Reg Rn); + void FMINP(ARM64Reg Rd, ARM64Reg Rn); + void FMAXNMP(ARM64Reg Rd, ARM64Reg Rn); + void FMINNMP(ARM64Reg Rd, ARM64Reg Rn); + // Scalar - 2 Source void ADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void FADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); @@ -1296,6 +1303,7 @@ private: void EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd, ARM64Reg Rn); void EmitScalar2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitScalarPairwise(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void Emit2RegMisc(bool Q, bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn); diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h index 9372fffd2d..ae87d815c8 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.h +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h @@ -177,6 +177,10 @@ public: void FloatCompare(UGeckoInstruction inst, bool upper = false); + // temp_gpr can be INVALID_REG if single is true + void EmitQuietNaNBitConstant(Arm64Gen::ARM64Reg dest_reg, bool single, + Arm64Gen::ARM64Reg temp_gpr); + bool IsFPRStoreSafe(size_t guest_reg) const; protected: diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp 
index bc0e427a66..c5624ee6d4 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -3,6 +3,8 @@ #include "Core/PowerPC/JitArm64/Jit.h" +#include <optional> + #include "Common/Arm64Emitter.h" #include "Common/CPUDetect.h" #include "Common/CommonTypes.h" @@ -66,14 +68,20 @@ void JitArm64::fp_arith(UGeckoInstruction inst) FALLBACK_IF(inst.Rc); FALLBACK_IF(jo.fp_exceptions || (jo.div_by_zero_exceptions && inst.SUBOP5 == 18)); - u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD; - u32 op5 = inst.SUBOP5; + const u32 a = inst.FA; + const u32 b = inst.FB; + const u32 c = inst.FC; + const u32 d = inst.FD; + const u32 op5 = inst.SUBOP5; const bool use_c = op5 >= 25; // fmul and all kind of fmaddXX const bool use_b = op5 != 25; // fmul uses no B const bool fma = use_b && use_c; const bool negate_result = (op5 & ~0x1) == 30; + // Addition and subtraction can't generate new NaNs, they can only take NaNs from inputs + const bool can_generate_nan = (op5 & ~0x1) != 20; + const bool output_is_single = inst.OPCD == 59; const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA); const bool round_c = use_c && output_is_single && !js.op->fprIsSingle[inst.FC]; @@ -84,13 +92,12 @@ void JitArm64::fp_arith(UGeckoInstruction inst) }; const bool inputs_are_singles = inputs_are_singles_func(); - const RegType type = - (inputs_are_singles && output_is_single) ? RegType::LowerPairSingle : RegType::LowerPair; + const bool single = inputs_are_singles && output_is_single; + const RegType type = single ? RegType::LowerPairSingle : RegType::LowerPair; const RegType type_out = output_is_single ? (inputs_are_singles ? RegType::DuplicatedSingle : RegType::Duplicated) : RegType::LowerPair; - const auto reg_encoder = - (inputs_are_singles && output_is_single) ? EncodeRegToSingle : EncodeRegToDouble; + const auto reg_encoder = single ? 
EncodeRegToSingle : EncodeRegToDouble; const ARM64Reg VA = reg_encoder(fpr.R(a, type)); const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG; @@ -98,6 +105,7 @@ void JitArm64::fp_arith(UGeckoInstruction inst) const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out)); ARM64Reg V0Q = ARM64Reg::INVALID_REG; + ARM64Reg V1Q = ARM64Reg::INVALID_REG; ARM64Reg rounded_c_reg = VC; if (round_c) @@ -118,6 +126,21 @@ void JitArm64::fp_arith(UGeckoInstruction inst) } ARM64Reg result_reg = VD; + const bool preserve_d = + m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC)); + if (preserve_d) + { + V1Q = fpr.GetReg(); + result_reg = reg_encoder(V1Q); + } + + const ARM64Reg temp_gpr = m_accurate_nans && !single ? gpr.GetReg() : ARM64Reg::INVALID_REG; + + if (m_accurate_nans) + { + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetReg(); + } switch (op5) { @@ -166,6 +189,74 @@ void JitArm64::fp_arith(UGeckoInstruction inst) break; } + std::vector<FixupBranch> nan_fixups; + if (m_accurate_nans) + { + // Check if we need to handle NaNs + m_float_emit.FCMP(result_reg); + FixupBranch no_nan = B(CCFlags::CC_VC); + FixupBranch nan = B(); + SetJumpTarget(no_nan); + + SwitchToFarCode(); + SetJumpTarget(nan); + + const ARM64Reg quiet_bit_reg = reg_encoder(V0Q); + + EmitQuietNaNBitConstant(quiet_bit_reg, inputs_are_singles && output_is_single, temp_gpr); + + std::vector<ARM64Reg> inputs; + inputs.push_back(VA); + if (use_b && VA != VB) + inputs.push_back(VB); + if (use_c && VA != VC && (!use_b || VB != VC)) + inputs.push_back(VC); + + // If any inputs are NaNs, pick the first NaN of them and OR it with the quiet bit + for (size_t i = 0; i < inputs.size(); ++i) + { + // Skip checking if the input is a NaN if it's the last input and we're guaranteed to have at + // least one NaN input + const bool check_input = can_generate_nan || i != inputs.size() - 1; + + const ARM64Reg input = inputs[i]; + FixupBranch skip; + if (check_input) + { + m_float_emit.FCMP(input); + skip = 
B(CCFlags::CC_VC); + } + + m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(input), + EncodeRegToDouble(quiet_bit_reg)); + nan_fixups.push_back(B()); + + if (check_input) + SetJumpTarget(skip); + } + + std::optional<FixupBranch> nan_early_fixup; + if (can_generate_nan) + { + // There was no NaN in any of the inputs, so the NaN must have been generated by the + // arithmetic instruction. In this case, the result is already correct. + if (negate_result) + { + if (result_reg != VD) + m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg)); + nan_fixups.push_back(B()); + } + else + { + nan_early_fixup = B(); + } + } + + SwitchToNearCode(); + + if (nan_early_fixup) + SetJumpTarget(*nan_early_fixup); + } // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case // for any of AArch64's FMA instructions, so we negate using a separate instruction. @@ -174,8 +265,15 @@ void JitArm64::fp_arith(UGeckoInstruction inst) else if (result_reg != VD) m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg)); + for (FixupBranch fixup : nan_fixups) + SetJumpTarget(fixup); + if (V0Q != ARM64Reg::INVALID_REG) fpr.Unlock(V0Q); + if (V1Q != ARM64Reg::INVALID_REG) + fpr.Unlock(V1Q); + if (temp_gpr != ARM64Reg::INVALID_REG) + gpr.Unlock(temp_gpr); if (output_is_single) { @@ -787,6 +885,29 @@ void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, AR } } +void JitArm64::EmitQuietNaNBitConstant(ARM64Reg dest_reg, bool single, ARM64Reg temp_gpr) +{ + // dest_reg = QNaN & ~SNaN + // + // (Alternatively, dest_reg = QNaN would also work, but that would take + // two instructions to emit even for singles) + + if (single) + { + m_float_emit.MOVI(32, dest_reg, 0x40, 16); + } + else + { + ASSERT(temp_gpr != ARM64Reg::INVALID_REG); + + MOVI2R(EncodeRegTo64(temp_gpr), 0x0008'0000'0000'0000); + if (IsQuad(dest_reg)) + m_float_emit.DUP(64, dest_reg, EncodeRegTo64(temp_gpr)); + else + m_float_emit.FMOV(dest_reg, 
EncodeRegTo64(temp_gpr)); + } +} + bool JitArm64::IsFPRStoreSafe(size_t guest_reg) const { return js.fpr_is_store_safe[guest_reg]; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 4e986b0ce2..4c0730f9d0 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -83,8 +83,11 @@ void JitArm64::ps_arith(UGeckoInstruction inst) const u32 d = inst.FD; const u32 op5 = inst.SUBOP5; + const bool muls = (op5 & ~0x1) == 12; + const bool madds = (op5 & ~0x1) == 14; const bool use_c = op5 == 25 || (op5 & ~0x13) == 12; // mul, muls, and all kinds of maddXX - const bool use_b = op5 != 25 && (op5 & ~0x1) != 12; // mul and muls don't use B + const bool use_b = op5 != 25 && !muls; // mul and muls don't use B + const bool duplicated_c = muls || madds; const bool fma = use_b && use_c; const bool negate_result = (op5 & ~0x1) == 30; const bool msub = op5 == 28 || op5 == 30; @@ -107,6 +110,8 @@ void JitArm64::ps_arith(UGeckoInstruction inst) ARM64Reg V0Q = ARM64Reg::INVALID_REG; ARM64Reg V1Q = ARM64Reg::INVALID_REG; + ARM64Reg V2Q = ARM64Reg::INVALID_REG; + ARM64Reg V3Q = ARM64Reg::INVALID_REG; ARM64Reg rounded_c_reg = VC; if (round_c) @@ -127,12 +132,29 @@ void JitArm64::ps_arith(UGeckoInstruction inst) } ARM64Reg result_reg = VD; - if (fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg)) + const bool need_accurate_fma_reg = + fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg); + const bool preserve_d = + m_accurate_nans && (VD == VA || (use_b && VD == VB) || (use_c && VD == VC)); + if (need_accurate_fma_reg || preserve_d) { V1Q = fpr.GetReg(); result_reg = reg_encoder(V1Q); } + const ARM64Reg temp_gpr = m_accurate_nans && !singles ? 
gpr.GetReg() : ARM64Reg::INVALID_REG; + + if (m_accurate_nans) + { + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetReg(); + + V2Q = fpr.GetReg(); + + if (duplicated_c || VD == result_reg) + V3Q = fpr.GetReg(); + } + switch (op5) { case 12: // ps_muls0: d = a * c.ps0 @@ -211,6 +233,69 @@ void JitArm64::ps_arith(UGeckoInstruction inst) break; } + FixupBranch nan_fixup; + if (m_accurate_nans) + { + const ARM64Reg nan_temp_reg = singles ? EncodeRegToSingle(V0Q) : EncodeRegToDouble(V0Q); + const ARM64Reg nan_temp_reg_paired = reg_encoder(V0Q); + + const ARM64Reg zero_reg = reg_encoder(V2Q); + + // Check if we need to handle NaNs + + m_float_emit.FMAXP(nan_temp_reg, result_reg); + m_float_emit.FCMP(nan_temp_reg); + FixupBranch no_nan = B(CCFlags::CC_VC); + FixupBranch nan = B(); + SetJumpTarget(no_nan); + + SwitchToFarCode(); + SetJumpTarget(nan); + + // Pick the right NaNs + + m_float_emit.MOVI(8, zero_reg, 0); + + const auto check_input = [&](ARM64Reg input) { + m_float_emit.FACGE(size, nan_temp_reg_paired, input, zero_reg); + m_float_emit.BIF(result_reg, input, nan_temp_reg_paired); + }; + + ARM64Reg c_reg_for_nan_purposes = VC; + if (duplicated_c) + { + c_reg_for_nan_purposes = reg_encoder(V3Q); + m_float_emit.DUP(size, c_reg_for_nan_purposes, VC, op5 & 0x1); + } + + if (use_c) + check_input(c_reg_for_nan_purposes); + + if (use_b && (!use_c || VB != c_reg_for_nan_purposes)) + check_input(VB); + + if ((!use_b || VA != VB) && (!use_c || VA != c_reg_for_nan_purposes)) + check_input(VA); + + // Make the NaNs quiet + + const ARM64Reg quiet_bit_reg = VD == result_reg ? 
reg_encoder(V3Q) : VD; + EmitQuietNaNBitConstant(quiet_bit_reg, singles, temp_gpr); + + m_float_emit.FACGE(size, nan_temp_reg_paired, result_reg, zero_reg); + m_float_emit.ORR(quiet_bit_reg, quiet_bit_reg, result_reg); + if (negate_result) + m_float_emit.FNEG(size, result_reg, result_reg); + if (VD == result_reg) + m_float_emit.BIF(VD, quiet_bit_reg, nan_temp_reg_paired); + else // quiet_bit_reg == VD + m_float_emit.BIT(VD, result_reg, nan_temp_reg_paired); + + nan_fixup = B(); + + SwitchToNearCode(); + } + // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case // for any of AArch64's FMA instructions, so we negate using a separate instruction. if (negate_result) @@ -218,10 +303,19 @@ void JitArm64::ps_arith(UGeckoInstruction inst) else if (result_reg != VD) m_float_emit.MOV(VD, result_reg); + if (m_accurate_nans) + SetJumpTarget(nan_fixup); + if (V0Q != ARM64Reg::INVALID_REG) fpr.Unlock(V0Q); if (V1Q != ARM64Reg::INVALID_REG) fpr.Unlock(V1Q); + if (V2Q != ARM64Reg::INVALID_REG) + fpr.Unlock(V2Q); + if (V3Q != ARM64Reg::INVALID_REG) + fpr.Unlock(V3Q); + if (temp_gpr != ARM64Reg::INVALID_REG) + gpr.Unlock(temp_gpr); ASSERT_MSG(DYNA_REC, singles == singles_func(), "Register allocation turned singles into doubles in the middle of ps_arith"); @@ -283,32 +377,91 @@ void JitArm64::ps_sumX(UGeckoInstruction inst) const u32 c = inst.FC; const u32 d = inst.FD; - const bool upper = inst.SUBOP5 == 11; + const bool upper = inst.SUBOP5 & 0x1; const bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c); const RegType type = singles ? RegType::Single : RegType::Register; const u8 size = singles ? 32 : 64; const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad; + const auto scalar_reg_encoder = singles ? 
EncodeRegToSingle : EncodeRegToDouble; const ARM64Reg VA = fpr.R(a, type); const ARM64Reg VB = fpr.R(b, type); const ARM64Reg VC = fpr.R(c, type); const ARM64Reg VD = fpr.RW(d, type); const ARM64Reg V0 = fpr.GetReg(); + const ARM64Reg V1 = m_accurate_nans ? fpr.GetReg() : ARM64Reg::INVALID_REG; + const ARM64Reg temp_gpr = m_accurate_nans && !singles ? gpr.GetReg() : ARM64Reg::INVALID_REG; - m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(upper ? VA : VB), upper ? 0 : 1); - if (d != c) + m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VB), 1); + + FixupBranch a_nan_done, b_nan_done; + if (m_accurate_nans) { - m_float_emit.FADD(size, reg_encoder(VD), reg_encoder(V0), reg_encoder(upper ? VB : VA)); - m_float_emit.INS(size, VD, upper ? 0 : 1, VC, upper ? 0 : 1); + const auto check_nan = [&](ARM64Reg input) { + m_float_emit.FCMP(scalar_reg_encoder(input)); + FixupBranch not_nan = B(CCFlags::CC_VC); + FixupBranch nan = B(); + SetJumpTarget(not_nan); + + SwitchToFarCode(); + SetJumpTarget(nan); + + EmitQuietNaNBitConstant(scalar_reg_encoder(V1), singles, temp_gpr); + + if (upper) + { + m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input)); + m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V1)); + } + else if (d != c) + { + m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(V1), EncodeRegToDouble(input)); + m_float_emit.INS(size, VD, 1, VC, 1); + } + else + { + m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(input)); + m_float_emit.INS(size, VD, 0, V1, 0); + } + + FixupBranch nan_done = B(); + SwitchToNearCode(); + + return nan_done; + }; + + a_nan_done = check_nan(VA); + b_nan_done = check_nan(V0); + } + + if (upper) + { + m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA)); + m_float_emit.TRN1(size, reg_encoder(VD), reg_encoder(VC), reg_encoder(V0)); + } + else if (d != c) + { + m_float_emit.FADD(scalar_reg_encoder(VD), 
scalar_reg_encoder(V0), scalar_reg_encoder(VA)); + m_float_emit.INS(size, VD, 1, VC, 1); } else { - m_float_emit.FADD(size, reg_encoder(V0), reg_encoder(V0), reg_encoder(upper ? VB : VA)); - m_float_emit.INS(size, VD, upper ? 1 : 0, V0, upper ? 1 : 0); + m_float_emit.FADD(scalar_reg_encoder(V0), scalar_reg_encoder(V0), scalar_reg_encoder(VA)); + m_float_emit.INS(size, VD, 0, V0, 0); + } + + if (m_accurate_nans) + { + SetJumpTarget(a_nan_done); + SetJumpTarget(b_nan_done); } fpr.Unlock(V0); + if (m_accurate_nans) + fpr.Unlock(V1); + if (temp_gpr != ARM64Reg::INVALID_REG) + gpr.Unlock(temp_gpr); ASSERT_MSG(DYNA_REC, singles == (fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c)), "Register allocation turned singles into doubles in the middle of ps_sumX");