Merge pull request #9748 from JosJuice/fma-accuracy

Interpreter/Jit64: Emulate FMA accurately in more cases
This commit is contained in:
Tilka 2021-06-06 02:29:42 +01:00 committed by GitHub
commit 6c0180fc61
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 190 additions and 152 deletions

View File

@ -238,7 +238,7 @@ inline FPResult NI_sub(UReg_FPSCR* fpscr, double a, double b)
// inputs are checked for NaN is still a, b, c. // inputs are checked for NaN is still a, b, c.
inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b) inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
{ {
FPResult result{a * c}; FPResult result{std::fma(a, c, b)};
if (std::isnan(result.value)) if (std::isnan(result.value))
{ {
@ -263,27 +263,7 @@ inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
return result; return result;
} }
result.SetException(fpscr, FPSCR_VXIMZ); result.SetException(fpscr, std::isnan(a * c) ? FPSCR_VXIMZ : FPSCR_VXISI);
result.value = PPC_NAN;
return result;
}
result.value += b;
if (std::isnan(result.value))
{
if (Common::IsSNAN(b))
result.SetException(fpscr, FPSCR_VXSNAN);
fpscr->ClearFIFR();
if (std::isnan(b))
{
result.value = MakeQuiet(b);
return result;
}
result.SetException(fpscr, FPSCR_VXISI);
result.value = PPC_NAN; result.value = PPC_NAN;
return result; return result;
} }
@ -296,7 +276,7 @@ inline FPResult NI_madd(UReg_FPSCR* fpscr, double a, double c, double b)
inline FPResult NI_msub(UReg_FPSCR* fpscr, double a, double c, double b) inline FPResult NI_msub(UReg_FPSCR* fpscr, double a, double c, double b)
{ {
FPResult result{a * c}; FPResult result{std::fma(a, c, -b)};
if (std::isnan(result.value)) if (std::isnan(result.value))
{ {
@ -321,27 +301,7 @@ inline FPResult NI_msub(UReg_FPSCR* fpscr, double a, double c, double b)
return result; return result;
} }
result.SetException(fpscr, FPSCR_VXIMZ); result.SetException(fpscr, std::isnan(a * c) ? FPSCR_VXIMZ : FPSCR_VXISI);
result.value = PPC_NAN;
return result;
}
result.value -= b;
if (std::isnan(result.value))
{
if (Common::IsSNAN(b))
result.SetException(fpscr, FPSCR_VXSNAN);
fpscr->ClearFIFR();
if (std::isnan(b))
{
result.value = MakeQuiet(b);
return result;
}
result.SetException(fpscr, FPSCR_VXISI);
result.value = PPC_NAN; result.value = PPC_NAN;
return result; return result;
} }

View File

@ -3,6 +3,8 @@
// Refer to the license.txt file included. // Refer to the license.txt file included.
#include <algorithm> #include <algorithm>
#include <cmath>
#include <limits>
#include <vector> #include <vector>
#include "Common/Assert.h" #include "Common/Assert.h"
@ -239,72 +241,139 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
JITDISABLE(bJITFloatingPointOff); JITDISABLE(bJITFloatingPointOff);
FALLBACK_IF(inst.Rc); FALLBACK_IF(inst.Rc);
// While we don't know if any games are actually affected (replays seem to work with all the usual
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
// be extra careful and use software FMA on CPUs that don't have hardware FMA.
const bool software_fma = !cpu_info.bFMA && Core::WantsDeterminism();
int a = inst.FA; int a = inst.FA;
int b = inst.FB; int b = inst.FB;
int c = inst.FC; int c = inst.FC;
int d = inst.FD; int d = inst.FD;
bool single = inst.OPCD == 4 || inst.OPCD == 59; bool single = inst.OPCD == 4 || inst.OPCD == 59;
bool round_input = single && !js.op->fprIsSingle[c]; bool round_input = single && !js.op->fprIsSingle[c];
bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] && bool packed =
inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]); js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
// While we don't know if any games are actually affected (replays seem to work with all the usual RCOpArg Ra;
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so RCOpArg Rb;
// be extra careful and don't use FMA, even if in theory it might be okay. RCOpArg Rc;
// Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared RCX64Reg Rd;
// to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin RCX64Reg scratch_guard;
// instances on different computers giving identical results. if (software_fma)
const bool use_fma = cpu_info.bFMA && !Core::WantsDeterminism(); {
scratch_guard = fpr.Scratch(XMM2);
// For use_fma == true: Ra = packed ? fpr.Bind(a, RCMode::Read) : fpr.Use(a, RCMode::Read);
Rb = packed ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
Rc = packed ? fpr.Bind(c, RCMode::Read) : fpr.Use(c, RCMode::Read);
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
RegCache::Realize(Ra, Rb, Rc, Rd, scratch_guard);
}
else
{
// For cpu_info.bFMA == true:
// Statistics suggests b is a lot less likely to be unbound in practice, so // Statistics suggests b is a lot less likely to be unbound in practice, so
// if we have to pick one of a or b to bind, let's make it b. // if we have to pick one of a or b to bind, let's make it b.
RCOpArg Ra = fpr.Use(a, RCMode::Read); Ra = fpr.Use(a, RCMode::Read);
RCOpArg Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read); Rb = cpu_info.bFMA ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
RCOpArg Rc = fpr.Use(c, RCMode::Read); Rc = fpr.Use(c, RCMode::Read);
RCX64Reg Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite); Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
RegCache::Realize(Ra, Rb, Rc, Rd); RegCache::Realize(Ra, Rb, Rc, Rd);
}
X64Reg result_reg = XMM0;
if (software_fma)
{
for (size_t i = (packed ? 1 : 0); i != std::numeric_limits<size_t>::max(); --i)
{
if ((i == 0 || inst.SUBOP5 == 14) && inst.SUBOP5 != 15) // (i == 0 || madds0) && !madds1
{
if (round_input)
Force25BitPrecision(XMM1, Rc, XMM2);
else
MOVSD(XMM1, Rc);
}
else
{
MOVHLPS(XMM1, Rc.GetSimpleReg());
if (round_input)
Force25BitPrecision(XMM1, R(XMM1), XMM2);
}
// Write the result from the previous loop iteration into Rd so we don't lose it.
// It's important that this is done after reading Rc above, in case we have madds1 and c == d.
if (packed && i == 0)
MOVLHPS(Rd, XMM0);
if (i == 0)
{
MOVSD(XMM0, Ra);
MOVSD(XMM2, Rb);
}
else
{
MOVHLPS(XMM0, Ra.GetSimpleReg());
MOVHLPS(XMM2, Rb.GetSimpleReg());
}
if (inst.SUBOP5 == 28 || inst.SUBOP5 == 30) // nsub, nmsub
XORPS(XMM2, MConst(psSignBits));
BitSet32 registers_in_use = CallerSavedRegistersInUse();
ABI_PushRegistersAndAdjustStack(registers_in_use, 0);
ABI_CallFunction(static_cast<double (*)(double, double, double)>(&std::fma));
ABI_PopRegistersAndAdjustStack(registers_in_use, 0);
}
if (packed)
{
MOVSD(Rd, XMM0);
result_reg = Rd;
}
if (inst.SUBOP5 == 30 || inst.SUBOP5 == 31) // nmsub, nmadd
XORPD(result_reg, MConst(packed ? psSignBits2 : psSignBits));
}
else
{
switch (inst.SUBOP5) switch (inst.SUBOP5)
{ {
case 14: case 14: // madds0
MOVDDUP(XMM1, Rc); MOVDDUP(XMM0, Rc);
if (round_input) if (round_input)
Force25BitPrecision(XMM1, R(XMM1), XMM0); Force25BitPrecision(XMM0, R(XMM0), XMM1);
break; break;
case 15: case 15: // madds1
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3); avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, Rc, Rc, 3);
if (round_input) if (round_input)
Force25BitPrecision(XMM1, R(XMM1), XMM0); Force25BitPrecision(XMM0, R(XMM0), XMM1);
break; break;
default: default:
bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::WantsDeterminism());
X64Reg tmp1 = special ? XMM0 : XMM1;
X64Reg tmp2 = special ? XMM1 : XMM0;
if (single && round_input) if (single && round_input)
Force25BitPrecision(tmp1, Rc, tmp2); Force25BitPrecision(XMM0, Rc, XMM1);
else else
MOVAPD(tmp1, Rc); MOVAPD(XMM0, Rc);
break; break;
} }
if (use_fma) if (cpu_info.bFMA)
{ {
switch (inst.SUBOP5) switch (inst.SUBOP5)
{ {
case 28: // msub case 28: // msub
if (packed) if (packed)
VFMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra); VFMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
else else
VFMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra); VFMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
break; break;
case 14: // madds0 case 14: // madds0
case 15: // madds1 case 15: // madds1
case 29: // madd case 29: // madd
if (packed) if (packed)
VFMADD132PD(XMM1, Rb.GetSimpleReg(), Ra); VFMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
else else
VFMADD132SD(XMM1, Rb.GetSimpleReg(), Ra); VFMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
break; break;
// PowerPC and x86 define NMADD/NMSUB differently // PowerPC and x86 define NMADD/NMSUB differently
// x86: D = -A*C (+/-) B // x86: D = -A*C (+/-) B
@ -312,22 +381,27 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
// so we have to swap them; the ADD/SUB here isn't a typo. // so we have to swap them; the ADD/SUB here isn't a typo.
case 30: // nmsub case 30: // nmsub
if (packed) if (packed)
VFNMADD132PD(XMM1, Rb.GetSimpleReg(), Ra); VFNMADD132PD(XMM0, Rb.GetSimpleReg(), Ra);
else else
VFNMADD132SD(XMM1, Rb.GetSimpleReg(), Ra); VFNMADD132SD(XMM0, Rb.GetSimpleReg(), Ra);
break; break;
case 31: // nmadd case 31: // nmadd
if (packed) if (packed)
VFNMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra); VFNMSUB132PD(XMM0, Rb.GetSimpleReg(), Ra);
else else
VFNMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra); VFNMSUB132SD(XMM0, Rb.GetSimpleReg(), Ra);
break; break;
} }
} }
else if (inst.SUBOP5 == 30) // nmsub else
{ {
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it // No hardware support for FMA, and determinism is not enabled. In this case we inaccurately
// separately. // do the multiplication and addition/subtraction in two separate operations for performance.
if (inst.SUBOP5 == 30) // nmsub
{
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)),
// so handle it separately.
MOVAPD(XMM1, Rb); MOVAPD(XMM1, Rb);
if (packed) if (packed)
{ {
@ -339,38 +413,41 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
MULSD(XMM0, Ra); MULSD(XMM0, Ra);
SUBSD(XMM1, R(XMM0)); SUBSD(XMM1, R(XMM0));
} }
result_reg = XMM1;
} }
else else
{ {
if (packed) if (packed)
{ {
MULPD(XMM1, Ra); MULPD(XMM0, Ra);
if (inst.SUBOP5 == 28) // msub if (inst.SUBOP5 == 28) // msub
SUBPD(XMM1, Rb); SUBPD(XMM0, Rb);
else //(n)madd(s[01]) else //(n)madd(s[01])
ADDPD(XMM1, Rb); ADDPD(XMM0, Rb);
} }
else else
{ {
MULSD(XMM1, Ra); MULSD(XMM0, Ra);
if (inst.SUBOP5 == 28) if (inst.SUBOP5 == 28)
SUBSD(XMM1, Rb); SUBSD(XMM0, Rb);
else else
ADDSD(XMM1, Rb); ADDSD(XMM0, Rb);
} }
if (inst.SUBOP5 == 31) // nmadd if (inst.SUBOP5 == 31) // nmadd
XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits)); XORPD(XMM0, MConst(packed ? psSignBits2 : psSignBits));
}
}
} }
if (single) if (single)
{ {
HandleNaNs(inst, Rd, XMM1); HandleNaNs(inst, result_reg, result_reg, result_reg == XMM1 ? XMM0 : XMM1);
ForceSinglePrecision(Rd, Rd, packed, true); ForceSinglePrecision(Rd, R(result_reg), packed, true);
} }
else else
{ {
HandleNaNs(inst, XMM1, XMM1); HandleNaNs(inst, result_reg, result_reg, XMM1);
MOVSD(Rd, R(XMM1)); MOVSD(Rd, R(result_reg));
} }
SetFPRFIfNeeded(Rd); SetFPRFIfNeeded(Rd);
} }

View File

@ -828,6 +828,7 @@ void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, const OpArg&,
else else
{ {
(this->*sseOp)(XMM0, arg2, imm); (this->*sseOp)(XMM0, arg2, imm);
if (regOp != XMM0)
MOVAPD(regOp, R(XMM0)); MOVAPD(regOp, R(XMM0));
} }
} }