From 5c41d3b602559af56b618c6ef890b6af2fa95704 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sun, 20 Nov 2022 23:06:16 +0100 Subject: [PATCH] JitArm64: Refactor temp reg handling in fp_arith/ps_arith --- .../JitArm64/JitArm64_FloatingPoint.cpp | 63 ++++---- .../Core/PowerPC/JitArm64/JitArm64_Paired.cpp | 141 ++++++------------ 2 files changed, 83 insertions(+), 121 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp index 26c6dfd1b7..bc0e427a66 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp @@ -71,6 +71,8 @@ void JitArm64::fp_arith(UGeckoInstruction inst) const bool use_c = op5 >= 25; // fmul and all kind of fmaddXX const bool use_b = op5 != 25; // fmul uses no B + const bool fma = use_b && use_c; + const bool negate_result = (op5 & ~0x1) == 30; const bool output_is_single = inst.OPCD == 59; const bool inaccurate_fma = op5 > 25 && !Config::Get(Config::SESSION_USE_FMA); @@ -92,43 +94,44 @@ void JitArm64::fp_arith(UGeckoInstruction inst) const ARM64Reg VA = reg_encoder(fpr.R(a, type)); const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG; - ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; + const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; const ARM64Reg VD = reg_encoder(fpr.RW(d, type_out)); ARM64Reg V0Q = ARM64Reg::INVALID_REG; - ARM64Reg V1Q = ARM64Reg::INVALID_REG; + ARM64Reg rounded_c_reg = VC; if (round_c) { ASSERT_MSG(DYNA_REC, !inputs_are_singles, "Tried to apply 25-bit precision to single"); - V1Q = fpr.GetReg(); - - Force25BitPrecision(reg_encoder(V1Q), VC); - VC = reg_encoder(V1Q); - } - - ARM64Reg inaccurate_fma_temp_reg = VD; - if (inaccurate_fma && d == b) - { V0Q = fpr.GetReg(); - - inaccurate_fma_temp_reg = reg_encoder(V0Q); + rounded_c_reg = reg_encoder(V0Q); + Force25BitPrecision(rounded_c_reg, VC); } + ARM64Reg inaccurate_fma_reg = VD; + if (fma && inaccurate_fma && VD == VB) + { + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetReg(); + inaccurate_fma_reg = reg_encoder(V0Q); + } + + ARM64Reg result_reg = VD; + switch (op5) { case 18: - m_float_emit.FDIV(VD, VA, VB); + m_float_emit.FDIV(result_reg, VA, VB); break; case 20: - m_float_emit.FSUB(VD, VA, VB); + m_float_emit.FSUB(result_reg, VA, VB); break; case 21: - m_float_emit.FADD(VD, VA, VB); + m_float_emit.FADD(result_reg, VA, VB); break; case 25: - m_float_emit.FMUL(VD, VA, VC); + m_float_emit.FMUL(result_reg, VA, rounded_c_reg); break; // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic], // the subtly different definitions affect how signed zeroes are handled. @@ -138,39 +141,41 @@ void JitArm64::fp_arith(UGeckoInstruction inst) case 30: // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)" if (inaccurate_fma) { - m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC); - m_float_emit.FSUB(VD, inaccurate_fma_temp_reg, VB); + m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg); + m_float_emit.FSUB(result_reg, inaccurate_fma_reg, VB); } else { - m_float_emit.FNMSUB(VD, VA, VC, VB); + m_float_emit.FNMSUB(result_reg, VA, rounded_c_reg, VB); } - if (op5 == 30) - m_float_emit.FNEG(VD, VD); break; case 29: // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm" case 31: // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)" if (inaccurate_fma) { - m_float_emit.FMUL(inaccurate_fma_temp_reg, VA, VC); - m_float_emit.FADD(VD, inaccurate_fma_temp_reg, VB); + m_float_emit.FMUL(inaccurate_fma_reg, VA, rounded_c_reg); + m_float_emit.FADD(result_reg, inaccurate_fma_reg, VB); } else { - m_float_emit.FMADD(VD, VA, VC, VB); + m_float_emit.FMADD(result_reg, VA, rounded_c_reg, VB); } - if (op5 == 31) - m_float_emit.FNEG(VD, VD); break; default: ASSERT_MSG(DYNA_REC, 0, "fp_arith"); break; } + + // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case + // for any of AArch64's FMA instructions, so we negate using a separate instruction. + if (negate_result) + m_float_emit.FNEG(VD, result_reg); + else if (result_reg != VD) + m_float_emit.MOV(EncodeRegToDouble(VD), EncodeRegToDouble(result_reg)); + if (V0Q != ARM64Reg::INVALID_REG) fpr.Unlock(V0Q); - if (V1Q != ARM64Reg::INVALID_REG) - fpr.Unlock(V1Q); if (output_is_single) { diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp index 239a235533..4e986b0ce2 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp @@ -85,6 +85,9 @@ void JitArm64::ps_arith(UGeckoInstruction inst) const bool use_c = op5 == 25 || (op5 & ~0x13) == 12; // mul, muls, and all kinds of maddXX const bool use_b = op5 != 25 && (op5 & ~0x1) != 12; // mul and muls don't use B + const bool fma = use_b && use_c; + const bool negate_result = (op5 & ~0x1) == 30; + const bool msub = op5 == 28 || op5 == 30; const auto singles_func = [&] { return fpr.IsSingle(a) && (!use_b || fpr.IsSingle(b)) && (!use_c || fpr.IsSingle(c)); @@ -99,147 +102,108 @@ void JitArm64::ps_arith(UGeckoInstruction inst) const ARM64Reg VA = reg_encoder(fpr.R(a, type)); const ARM64Reg VB = use_b ? reg_encoder(fpr.R(b, type)) : ARM64Reg::INVALID_REG; - ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; + const ARM64Reg VC = use_c ? reg_encoder(fpr.R(c, type)) : ARM64Reg::INVALID_REG; const ARM64Reg VD = reg_encoder(fpr.RW(d, type)); ARM64Reg V0Q = ARM64Reg::INVALID_REG; - ARM64Reg V0 = ARM64Reg::INVALID_REG; ARM64Reg V1Q = ARM64Reg::INVALID_REG; - const auto allocate_v0_if_needed = [&] { - if (V0Q == ARM64Reg::INVALID_REG) - { - V0Q = fpr.GetReg(); - V0 = reg_encoder(V0Q); - } - }; - + ARM64Reg rounded_c_reg = VC; if (round_c) { ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single"); - V1Q = fpr.GetReg(); - - Force25BitPrecision(reg_encoder(V1Q), VC); - VC = reg_encoder(V1Q); + V0Q = fpr.GetReg(); + rounded_c_reg = reg_encoder(V0Q); + Force25BitPrecision(rounded_c_reg, VC); } - ARM64Reg inaccurate_fma_temp_reg = VD; - if (inaccurate_fma && d == b) + ARM64Reg inaccurate_fma_reg = VD; + if (fma && inaccurate_fma && VD == VB) { - allocate_v0_if_needed(); - inaccurate_fma_temp_reg = V0; + if (V0Q == ARM64Reg::INVALID_REG) + V0Q = fpr.GetReg(); + inaccurate_fma_reg = reg_encoder(V0Q); } ARM64Reg result_reg = VD; + if (fma && !inaccurate_fma && (msub || VD != VB) && (VD == VA || VD == rounded_c_reg)) + { + V1Q = fpr.GetReg(); + result_reg = reg_encoder(V1Q); + } + switch (op5) { case 12: // ps_muls0: d = a * c.ps0 - m_float_emit.FMUL(size, VD, VA, VC, 0); + m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 0); break; case 13: // ps_muls1: d = a * c.ps1 - m_float_emit.FMUL(size, VD, VA, VC, 1); + m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg, 1); break; case 14: // ps_madds0: d = a * c.ps0 + b if (inaccurate_fma) { - m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC, 0); - m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB); - } - else if (VD == VB) - { - m_float_emit.FMLA(size, VD, VA, VC, 0); - } - else if (VD != VA && VD != VC) - { - m_float_emit.MOV(VD, VB); - m_float_emit.FMLA(size, VD, VA, VC, 0); + m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 0); + m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB); } else { - allocate_v0_if_needed(); - m_float_emit.MOV(V0, VB); - m_float_emit.FMLA(size, V0, VA, VC, 0); - result_reg = V0; + if (result_reg != VB) + m_float_emit.MOV(result_reg, VB); + m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg, 0); } break; case 15: // ps_madds1: d = a * c.ps1 + b if (inaccurate_fma) { - m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC, 1); - m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB); - } - else if (VD == VB) - { - m_float_emit.FMLA(size, VD, VA, VC, 1); - } - else if (VD != VA && VD != VC) - { - m_float_emit.MOV(VD, VB); - m_float_emit.FMLA(size, VD, VA, VC, 1); + m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg, 1); + m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB); } else { - allocate_v0_if_needed(); - m_float_emit.MOV(V0, VB); - m_float_emit.FMLA(size, V0, VA, VC, 1); - result_reg = V0; + if (result_reg != VB) + m_float_emit.MOV(result_reg, VB); + m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg, 1); } break; case 18: // ps_div - m_float_emit.FDIV(size, VD, VA, VB); + m_float_emit.FDIV(size, result_reg, VA, VB); break; case 20: // ps_sub - m_float_emit.FSUB(size, VD, VA, VB); + m_float_emit.FSUB(size, result_reg, VA, VB); break; case 21: // ps_add - m_float_emit.FADD(size, VD, VA, VB); + m_float_emit.FADD(size, result_reg, VA, VB); break; case 25: // ps_mul - m_float_emit.FMUL(size, VD, VA, VC); + m_float_emit.FMUL(size, result_reg, VA, rounded_c_reg); break; case 28: // ps_msub: d = a * c - b case 30: // ps_nmsub: d = -(a * c - b) if (inaccurate_fma) { - m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC); - m_float_emit.FSUB(size, VD, inaccurate_fma_temp_reg, VB); - } - else if (VD != VA && VD != VC) - { - m_float_emit.FNEG(size, VD, VB); - m_float_emit.FMLA(size, VD, VA, VC); + m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg); + m_float_emit.FSUB(size, result_reg, inaccurate_fma_reg, VB); } else { - allocate_v0_if_needed(); - m_float_emit.FNEG(size, V0, VB); - m_float_emit.FMLA(size, V0, VA, VC); - result_reg = V0; + m_float_emit.FNEG(size, result_reg, VB); + m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg); } break; case 29: // ps_madd: d = a * c + b case 31: // ps_nmadd: d = -(a * c + b) if (inaccurate_fma) { - m_float_emit.FMUL(size, inaccurate_fma_temp_reg, VA, VC); - m_float_emit.FADD(size, VD, inaccurate_fma_temp_reg, VB); - } - else if (VD == VB) - { - m_float_emit.FMLA(size, VD, VA, VC); - } - else if (VD != VA && VD != VC) - { - m_float_emit.MOV(VD, VB); - m_float_emit.FMLA(size, VD, VA, VC); + m_float_emit.FMUL(size, inaccurate_fma_reg, VA, rounded_c_reg); + m_float_emit.FADD(size, result_reg, inaccurate_fma_reg, VB); } else { - allocate_v0_if_needed(); - m_float_emit.MOV(V0, VB); - m_float_emit.FMLA(size, V0, VA, VC); - result_reg = V0; + if (result_reg != VB) + m_float_emit.MOV(result_reg, VB); + m_float_emit.FMLA(size, result_reg, VA, rounded_c_reg); } break; default: @@ -247,19 +211,12 @@ void JitArm64::ps_arith(UGeckoInstruction inst) break; } - switch (op5) - { - case 30: // ps_nmsub - case 31: // ps_nmadd - // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case - // for any of AArch64's FMA instructions, so we negate using a separate instruction. + // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case + // for any of AArch64's FMA instructions, so we negate using a separate instruction. + if (negate_result) m_float_emit.FNEG(size, VD, result_reg); - break; - default: - if (result_reg != VD) - m_float_emit.MOV(VD, result_reg); - break; - } + else if (result_reg != VD) + m_float_emit.MOV(VD, result_reg); if (V0Q != ARM64Reg::INVALID_REG) fpr.Unlock(V0Q);