diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
index 499557f812..ace5e193d1 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp
@@ -345,13 +345,18 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
     RegCache::Realize(Ra, Rb, Rc, Rd);
   }
 
-  X64Reg scratch_xmm = !use_fma && inst.SUBOP5 == 30 ? XMM1 : XMM0;
-  X64Reg result_xmm = scratch_xmm == XMM0 ? XMM1 : XMM0;
+  const bool subtract = inst.SUBOP5 == 28 || inst.SUBOP5 == 30;  // msub, nmsub
+  const bool negate = inst.SUBOP5 == 30 || inst.SUBOP5 == 31;    // nmsub, nmadd
+  const bool madds0 = inst.SUBOP5 == 14;
+  const bool madds1 = inst.SUBOP5 == 15;
+
+  X64Reg scratch_xmm = XMM0;
+  X64Reg result_xmm = XMM1;
   if (software_fma)
   {
     for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
     {
-      if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15)  // (i == 0 || madds0) && !madds1
+      if ((i == 0 || madds0) && !madds1)
       {
         if (round_input)
           Force25BitPrecision(XMM1, Rc, XMM2);
@@ -381,7 +386,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
         MOVHLPS(XMM2, Rb.GetSimpleReg());
       }
 
-      if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30)  // msub, nmsub
+      if (subtract)
        XORPS(XMM2, MConst(psSignBits));
 
       BitSet32 registers_in_use = CallerSavedRegistersInUse();
@@ -399,128 +404,87 @@
     {
       result_xmm = XMM0;
     }
-
-    if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31)  // nmsub, nmadd
-      XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
   }
   else
   {
-    switch (inst.SUBOP5)
+    if (madds0)
     {
-    case 14:  // madds0
       MOVDDUP(result_xmm, Rc);
       if (round_input)
         Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
-      break;
-    case 15:  // madds1
+    }
+    else if (madds1)
+    {
       avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3);
       if (round_input)
         Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
-      break;
-    default:
-      if (single && round_input)
+    }
+    else
+    {
+      if (round_input)
         Force25BitPrecision(result_xmm, Rc, scratch_xmm);
       else
         MOVAPD(result_xmm, Rc);
-      break;
     }
 
     if (use_fma)
     {
-      switch (inst.SUBOP5)
+      if (subtract)
       {
-      case 28:  // msub
         if (packed)
           VFMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
         else
           VFMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
-      case 14:  // madds0
-      case 15:  // madds1
-      case 29:  // madd
-        if (packed)
-          VFMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
-        else
-          VFMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
-      // PowerPC and x86 define NMADD/NMSUB differently
-      // x86: D = -A*C (+/-) B
-      // PPC: D = -(A*C (+/-) B)
-      // so we have to swap them; the ADD/SUB here isn't a typo.
-      case 30:  // nmsub
-        if (packed)
-          VFNMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
-        else
-          VFNMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
-      case 31:  // nmadd
-        if (packed)
-          VFNMSUB132PD(result_xmm, Rb.GetSimpleReg(), Ra);
-        else
-          VFNMSUB132SD(result_xmm, Rb.GetSimpleReg(), Ra);
-        break;
-      }
-    }
-    else
-    {
-      if (inst.SUBOP5 == 30)  // nmsub
-      {
-        // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
-        // so handle it separately.
-        MOVAPD(scratch_xmm, Rb);
-        if (packed)
-        {
-          MULPD(result_xmm, Ra);
-          SUBPD(scratch_xmm, R(result_xmm));
-        }
-        else
-        {
-          MULSD(result_xmm, Ra);
-          SUBSD(scratch_xmm, R(result_xmm));
-        }
-        result_xmm = scratch_xmm;
       }
       else
       {
         if (packed)
-        {
-          MULPD(result_xmm, Ra);
-          if (inst.SUBOP5 == 28)  // msub
-            SUBPD(result_xmm, Rb);
-          else  //(n)madd(s[01])
-            ADDPD(result_xmm, Rb);
-        }
+          VFMADD132PD(result_xmm, Rb.GetSimpleReg(), Ra);
         else
-        {
-          MULSD(result_xmm, Ra);
-          if (inst.SUBOP5 == 28)
-            SUBSD(result_xmm, Rb);
-          else
-            ADDSD(result_xmm, Rb);
-        }
-        if (inst.SUBOP5 == 31)  // nmadd
-          XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
+          VFMADD132SD(result_xmm, Rb.GetSimpleReg(), Ra);
+      }
+    }
+    else
+    {
+      if (packed)
+      {
+        MULPD(result_xmm, Ra);
+        if (subtract)
+          SUBPD(result_xmm, Rb);
+        else
+          ADDPD(result_xmm, Rb);
+      }
+      else
+      {
+        MULSD(result_xmm, Ra);
+        if (subtract)
+          SUBSD(result_xmm, Rb);
+        else
+          ADDSD(result_xmm, Rb);
       }
     }
   }
 
+  // Using x64's nmadd/nmsub would require us to swap the sign of the addend
+  // (i.e. PPC nmadd maps to x64 nmsub), which can cause problems with signed zeroes.
+  // Also, PowerPC's nmadd/nmsub round before the final negation, unlike x64's nmadd/nmsub.
+  // So, negate using a separate instruction instead of using x64's nmadd/nmsub.
+  if (negate)
+    XORPD(result_xmm, MConst(packed ? psSignBits2 : psSignBits));
+
   if (SConfig::GetInstance().bAccurateNaNs && result_xmm == XMM0)
   {
     // HandleNaNs needs to clobber XMM0
-    MOVAPD(XMM1, R(result_xmm));
-    result_xmm = XMM1;
+    MOVAPD(Rd, R(result_xmm));
+    result_xmm = Rd;
   }
 
+  HandleNaNs(inst, result_xmm, result_xmm, XMM0);
+
   if (single)
-  {
-    HandleNaNs(inst, result_xmm, result_xmm, XMM0);
     FinalizeSingleResult(Rd, R(result_xmm), packed, true);
-  }
   else
-  {
-    HandleNaNs(inst, result_xmm, result_xmm, XMM0);
     FinalizeDoubleResult(Rd, R(result_xmm));
-  }
 }
 
 void Jit64::fsign(UGeckoInstruction inst)
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index 09afcbb3e0..05c338e9c4 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -178,18 +178,24 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
     case 25:
       m_float_emit.FMUL(VD, VA, VC);
       break;
-    case 28:
+    case 28:  // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
       m_float_emit.FNMSUB(VD, VA, VC, VB);
-      break;  // fmsub: "D = A*C - B" vs "Vd = (-Va) + Vn*Vm"
-    case 29:
+      break;
+    case 29:  // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
       m_float_emit.FMADD(VD, VA, VC, VB);
-      break;  // fmadd: "D = A*C + B" vs "Vd = Va + Vn*Vm"
-    case 30:
-      m_float_emit.FMSUB(VD, VA, VC, VB);
-      break;  // fnmsub: "D = -(A*C - B)" vs "Vd = Va + (-Vn)*Vm"
-    case 31:
-      m_float_emit.FNMADD(VD, VA, VC, VB);
-      break;  // fnmadd: "D = -(A*C + B)" vs "Vd = (-Va) + (-Vn)*Vm"
+      break;
+    // While it may seem like PowerPC's nmadd/nmsub map to AArch64's nmadd/msub [sic],
+    // the subtly different definitions affect how signed zeroes are handled.
+    // Also, PowerPC's nmadd/nmsub perform rounding before the final negation.
+    // So, negate using a separate instruction instead of using AArch64's nmadd/msub.
+    case 30:  // fnmsub: "D = -(A*C - B)" vs "Vd = -((-Va) + Vn*Vm)"
+      m_float_emit.FNMSUB(VD, VA, VC, VB);
+      m_float_emit.FNEG(VD, VD);
+      break;
+    case 31:  // fnmadd: "D = -(A*C + B)" vs "Vd = -(Va + Vn*Vm)"
+      m_float_emit.FMADD(VD, VA, VC, VB);
+      m_float_emit.FNEG(VD, VD);
+      break;
     default:
       ASSERT_MSG(DYNA_REC, 0, "fp_arith");
       break;
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
index 86a5112d59..1e8edb5036 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@@ -147,26 +147,29 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
   ARM64Reg V0 = ARM64Reg::INVALID_REG;
   ARM64Reg V1Q = ARM64Reg::INVALID_REG;
 
-  if (round_c || (d != b && (d == a || d == c)))
-  {
-    V0Q = fpr.GetReg();
-    V0 = reg_encoder(V0Q);
-  }
+  const auto allocate_v0_if_needed = [&] {
+    if (V0Q == ARM64Reg::INVALID_REG)
+    {
+      V0Q = fpr.GetReg();
+      V0 = reg_encoder(V0Q);
+    }
+  };
 
   if (round_c)
   {
     ASSERT_MSG(DYNA_REC, !singles, "Tried to apply 25-bit precision to single");
 
+    allocate_v0_if_needed();
     V1Q = fpr.GetReg();
 
     Force25BitPrecision(reg_encoder(V1Q), VC, V0);
     VC = reg_encoder(V1Q);
   }
 
+  ARM64Reg result_reg = VD;
   switch (op5)
   {
-  case 14:  // ps_madds0
-    // d = a * c.ps0 + b
+  case 14:  // ps_madds0: d = a * c.ps0 + b
     if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC, 0);
@@ -178,13 +181,13 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     }
     else
     {
+      allocate_v0_if_needed();
       m_float_emit.MOV(V0, VB);
       m_float_emit.FMLA(size, V0, VA, VC, 0);
-      m_float_emit.MOV(VD, V0);
+      result_reg = V0;
     }
     break;
-  case 15:  // ps_madds1
-    // d = a * c.ps1 + b
+  case 15:  // ps_madds1: d = a * c.ps1 + b
     if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC, 1);
@@ -196,34 +199,29 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     }
     else
     {
+      allocate_v0_if_needed();
       m_float_emit.MOV(V0, VB);
       m_float_emit.FMLA(size, V0, VA, VC, 1);
-      m_float_emit.MOV(VD, V0);
+      result_reg = V0;
     }
     break;
-  case 28:  // ps_msub
-    // d = a * c - b
-    if (VD == VB)
-    {
-      // d = -(-a * c + b)
-      // rounding is incorrect if the rounding mode is +/- infinity
-      m_float_emit.FMLS(size, VD, VA, VC);
-      m_float_emit.FNEG(size, VD, VD);
-    }
-    else if (VD != VA && VD != VC)
+  case 28:  // ps_msub: d = a * c - b
+  case 30:  // ps_nmsub: d = -(a * c - b)
+    if (VD != VA && VD != VC)
     {
       m_float_emit.FNEG(size, VD, VB);
       m_float_emit.FMLA(size, VD, VA, VC);
     }
     else
     {
+      allocate_v0_if_needed();
       m_float_emit.FNEG(size, V0, VB);
       m_float_emit.FMLA(size, V0, VA, VC);
-      m_float_emit.MOV(VD, V0);
+      result_reg = V0;
     }
     break;
-  case 29:  // ps_madd
-    // d = a * c + b
+  case 29:  // ps_madd: d = a * c + b
+  case 31:  // ps_nmadd: d = -(a * c + b)
     if (VD == VB)
     {
       m_float_emit.FMLA(size, VD, VA, VC);
@@ -235,53 +233,10 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
     }
     else
     {
+      allocate_v0_if_needed();
       m_float_emit.MOV(V0, VB);
       m_float_emit.FMLA(size, V0, VA, VC);
-      m_float_emit.MOV(VD, V0);
-    }
-    break;
-  case 30:  // ps_nmsub
-    // d = -(a * c - b)
-    // =>
-    // d = -a * c + b
-    // Note: PowerPC rounds before the final negation.
-    // We don't handle this at the moment because it's
-    // only relevant when rounding to +/- infinity.
- if (VD == VB) - { - m_float_emit.FMLS(size, VD, VA, VC); - } - else if (VD != VA && VD != VC) - { - m_float_emit.MOV(VD, VB); - m_float_emit.FMLS(size, VD, VA, VC); - } - else - { - m_float_emit.MOV(V0, VB); - m_float_emit.FMLS(size, V0, VA, VC); - m_float_emit.MOV(VD, V0); - } - break; - case 31: // ps_nmadd - // d = -(a * c + b) - if (VD == VB) - { - m_float_emit.FMLA(size, VD, VA, VC); - m_float_emit.FNEG(size, VD, VD); - } - else if (VD != VA && VD != VC) - { - // d = -a * c - b - // See rounding note at ps_nmsub. - m_float_emit.FNEG(size, VD, VB); - m_float_emit.FMLS(size, VD, VA, VC); - } - else - { - m_float_emit.MOV(V0, VB); - m_float_emit.FMLA(size, V0, VA, VC); - m_float_emit.FNEG(size, VD, V0); + result_reg = V0; } break; default: @@ -289,6 +244,20 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst) break; } + switch (op5) + { + case 30: // ps_nmsub + case 31: // ps_nmadd + // PowerPC's nmadd/nmsub perform rounding before the final negation, which is not the case + // for any of AArch64's FMA instructions, so we negate using a separate instruction. + m_float_emit.FNEG(size, VD, result_reg); + break; + default: + if (result_reg != VD) + m_float_emit.MOV(VD, result_reg); + break; + } + if (V0Q != ARM64Reg::INVALID_REG) fpr.Unlock(V0Q); if (V1Q != ARM64Reg::INVALID_REG)
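
The x64 half of the patch can be sanity-checked against a scalar model (an illustrative sketch only, not Dolphin code; fmadd_model is a made-up name, std::fma stands in for the emitted FMA, and lane selection for madds0/madds1 is left out):

#include <cmath>

// Scalar model of the unified fmaddXX lowering: one fused multiply-add with an
// optionally negated addend, then an optional negation of the rounded result.
double fmadd_model(unsigned subop5, double a, double b, double c)
{
  const bool subtract = subop5 == 28 || subop5 == 30;  // msub, nmsub
  const bool negate = subop5 == 30 || subop5 == 31;    // nmsub, nmadd
  const double result = std::fma(a, c, subtract ? -b : b);
  return negate ? -result : result;  // the trailing XORPD/FNEG
}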
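
The signed-zero hazard called out in the new comments is easy to reproduce on the host (a minimal sketch; assumes the default round-to-nearest mode):

#include <cmath>
#include <cstdio>

int main()
{
  // When A*C == B, PowerPC's nmsub rounds A*C - B to +0.0 and then negates it
  // to -0.0, while x86's FNMADD and AArch64's FMSUB compute -(A*C) + B = +0.0.
  const double a = 1.0, c = 1.0, b = 1.0;
  const double round_then_negate = -std::fma(a, c, -b);  // -(A*C - B) == -0.0
  const double negate_then_round = std::fma(-a, c, b);   // -(A*C) + B == +0.0
  std::printf("%g vs %g\n", round_then_negate, negate_then_round);  // "-0 vs 0"
}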
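
The rounding half of the rationale is only observable under directed rounding. A sketch of that as well (assumes the fma is evaluated at run time rather than constant-folded, e.g. a build with -O0 or -frounding-math):

#include <cfenv>
#include <cmath>
#include <cstdio>

int main()
{
  std::fesetround(FE_UPWARD);
  volatile double a = 0.1, c = 0.1, b = 0.1;  // a*c + b is inexact
  const double round_then_negate = -std::fma(a, c, b);   // PPC nmadd ordering
  const double negate_then_round = std::fma(-a, c, -b);  // negation fused into the FMA
  std::printf("%.17g\n%.17g\n", round_then_negate, negate_then_round);  // 1 ULP apart
}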
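
Likewise, a scalar model of the merged ps_msub/ps_nmsub path (function names are invented): FNEG of the addend is exact, so the FMLA still rounds exactly once, and ps_nmsub only adds the final FNEG emitted by the trailing switch:

#include <cmath>

double ps_msub_lane(double a, double c, double b)
{
  const double neg_b = -b;       // FNEG: exact sign flip, never rounds
  return std::fma(a, c, neg_b);  // FMLA: fused multiply-add, rounds once
}

double ps_nmsub_lane(double a, double c, double b)
{
  return -ps_msub_lane(a, c, b);  // FNEG after rounding, matching PowerPC
}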