mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-25 07:21:14 +01:00
Jit_FloatingPoint: fmaddXX
This commit is contained in:
parent
a26c9c4b74
commit
537eeb7ebf
@ -243,17 +243,32 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||||||
bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] &&
|
bool packed = inst.OPCD == 4 || (!cpu_info.bAtom && single && js.op->fprIsDuplicated[a] &&
|
||||||
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
|
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
|
||||||
|
|
||||||
fpr.Lock(a, b, c, d);
|
// While we don't know if any games are actually affected (replays seem to work with all the usual
|
||||||
|
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
|
||||||
|
// be extra careful and don't use FMA, even if in theory it might be okay.
|
||||||
|
// Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
|
||||||
|
// to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
|
||||||
|
// instances on different computers giving identical results.
|
||||||
|
const bool use_fma = cpu_info.bFMA && !Core::WantsDeterminism();
|
||||||
|
|
||||||
|
// For use_fma == true:
|
||||||
|
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
||||||
|
// if we have to pick one of a or b to bind, let's make it b.
|
||||||
|
RCOpArg Ra = fpr.Use(a, RCMode::Read);
|
||||||
|
RCOpArg Rb = use_fma ? fpr.Bind(b, RCMode::Read) : fpr.Use(b, RCMode::Read);
|
||||||
|
RCOpArg Rc = fpr.Use(c, RCMode::Read);
|
||||||
|
RCX64Reg Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
|
||||||
|
RegCache::Realize(Ra, Rb, Rc, Rd);
|
||||||
|
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 14:
|
case 14:
|
||||||
MOVDDUP(XMM1, fpr.R(c));
|
MOVDDUP(XMM1, Rc);
|
||||||
if (round_input)
|
if (round_input)
|
||||||
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
||||||
break;
|
break;
|
||||||
case 15:
|
case 15:
|
||||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, fpr.R(c), fpr.R(c), 3);
|
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM1, Rc, Rc, 3);
|
||||||
if (round_input)
|
if (round_input)
|
||||||
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
Force25BitPrecision(XMM1, R(XMM1), XMM0);
|
||||||
break;
|
break;
|
||||||
@ -262,38 +277,29 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||||||
X64Reg tmp1 = special ? XMM0 : XMM1;
|
X64Reg tmp1 = special ? XMM0 : XMM1;
|
||||||
X64Reg tmp2 = special ? XMM1 : XMM0;
|
X64Reg tmp2 = special ? XMM1 : XMM0;
|
||||||
if (single && round_input)
|
if (single && round_input)
|
||||||
Force25BitPrecision(tmp1, fpr.R(c), tmp2);
|
Force25BitPrecision(tmp1, Rc, tmp2);
|
||||||
else
|
else
|
||||||
MOVAPD(tmp1, fpr.R(c));
|
MOVAPD(tmp1, Rc);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// While we don't know if any games are actually affected (replays seem to work with all the usual
|
if (use_fma)
|
||||||
// suspects for desyncing), netplay and other applications need absolute perfect determinism, so
|
|
||||||
// be extra careful and don't use FMA, even if in theory it might be okay.
|
|
||||||
// Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared
|
|
||||||
// to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin
|
|
||||||
// instances on different computers giving identical results.
|
|
||||||
if (cpu_info.bFMA && !Core::WantsDeterminism())
|
|
||||||
{
|
{
|
||||||
// Statistics suggests b is a lot less likely to be unbound in practice, so
|
|
||||||
// if we have to pick one of a or b to bind, let's make it b.
|
|
||||||
fpr.BindToRegister(b, true, false);
|
|
||||||
switch (inst.SUBOP5)
|
switch (inst.SUBOP5)
|
||||||
{
|
{
|
||||||
case 28: // msub
|
case 28: // msub
|
||||||
if (packed)
|
if (packed)
|
||||||
VFMSUB132PD(XMM1, fpr.RX(b), fpr.R(a));
|
VFMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||||
else
|
else
|
||||||
VFMSUB132SD(XMM1, fpr.RX(b), fpr.R(a));
|
VFMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||||
break;
|
break;
|
||||||
case 14: // madds0
|
case 14: // madds0
|
||||||
case 15: // madds1
|
case 15: // madds1
|
||||||
case 29: // madd
|
case 29: // madd
|
||||||
if (packed)
|
if (packed)
|
||||||
VFMADD132PD(XMM1, fpr.RX(b), fpr.R(a));
|
VFMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||||
else
|
else
|
||||||
VFMADD132SD(XMM1, fpr.RX(b), fpr.R(a));
|
VFMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||||
break;
|
break;
|
||||||
// PowerPC and x86 define NMADD/NMSUB differently
|
// PowerPC and x86 define NMADD/NMSUB differently
|
||||||
// x86: D = -A*C (+/-) B
|
// x86: D = -A*C (+/-) B
|
||||||
@ -301,15 +307,15 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||||||
// so we have to swap them; the ADD/SUB here isn't a typo.
|
// so we have to swap them; the ADD/SUB here isn't a typo.
|
||||||
case 30: // nmsub
|
case 30: // nmsub
|
||||||
if (packed)
|
if (packed)
|
||||||
VFNMADD132PD(XMM1, fpr.RX(b), fpr.R(a));
|
VFNMADD132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||||
else
|
else
|
||||||
VFNMADD132SD(XMM1, fpr.RX(b), fpr.R(a));
|
VFNMADD132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||||
break;
|
break;
|
||||||
case 31: // nmadd
|
case 31: // nmadd
|
||||||
if (packed)
|
if (packed)
|
||||||
VFNMSUB132PD(XMM1, fpr.RX(b), fpr.R(a));
|
VFNMSUB132PD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||||
else
|
else
|
||||||
VFNMSUB132SD(XMM1, fpr.RX(b), fpr.R(a));
|
VFNMSUB132SD(XMM1, Rb.GetSimpleReg(), Ra);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -317,15 +323,15 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||||||
{
|
{
|
||||||
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it
|
// We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it
|
||||||
// separately.
|
// separately.
|
||||||
MOVAPD(XMM1, fpr.R(b));
|
MOVAPD(XMM1, Rb);
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
{
|
||||||
MULPD(XMM0, fpr.R(a));
|
MULPD(XMM0, Ra);
|
||||||
SUBPD(XMM1, R(XMM0));
|
SUBPD(XMM1, R(XMM0));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MULSD(XMM0, fpr.R(a));
|
MULSD(XMM0, Ra);
|
||||||
SUBSD(XMM1, R(XMM0));
|
SUBSD(XMM1, R(XMM0));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -333,36 +339,35 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||||||
{
|
{
|
||||||
if (packed)
|
if (packed)
|
||||||
{
|
{
|
||||||
MULPD(XMM1, fpr.R(a));
|
MULPD(XMM1, Ra);
|
||||||
if (inst.SUBOP5 == 28) // msub
|
if (inst.SUBOP5 == 28) // msub
|
||||||
SUBPD(XMM1, fpr.R(b));
|
SUBPD(XMM1, Rb);
|
||||||
else //(n)madd(s[01])
|
else //(n)madd(s[01])
|
||||||
ADDPD(XMM1, fpr.R(b));
|
ADDPD(XMM1, Rb);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MULSD(XMM1, fpr.R(a));
|
MULSD(XMM1, Ra);
|
||||||
if (inst.SUBOP5 == 28)
|
if (inst.SUBOP5 == 28)
|
||||||
SUBSD(XMM1, fpr.R(b));
|
SUBSD(XMM1, Rb);
|
||||||
else
|
else
|
||||||
ADDSD(XMM1, fpr.R(b));
|
ADDSD(XMM1, Rb);
|
||||||
}
|
}
|
||||||
if (inst.SUBOP5 == 31) // nmadd
|
if (inst.SUBOP5 == 31) // nmadd
|
||||||
XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
|
XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
|
||||||
}
|
}
|
||||||
fpr.BindToRegister(d, !single);
|
|
||||||
if (single)
|
if (single)
|
||||||
{
|
{
|
||||||
HandleNaNs(inst, fpr.RX(d), XMM1);
|
HandleNaNs(inst, Rd, XMM1);
|
||||||
ForceSinglePrecision(fpr.RX(d), fpr.R(d), packed, true);
|
ForceSinglePrecision(Rd, Rd, packed, true);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
HandleNaNs(inst, XMM1, XMM1);
|
HandleNaNs(inst, XMM1, XMM1);
|
||||||
MOVSD(fpr.RX(d), R(XMM1));
|
MOVSD(Rd, R(XMM1));
|
||||||
}
|
}
|
||||||
SetFPRFIfNeeded(fpr.RX(d));
|
SetFPRFIfNeeded(Rd);
|
||||||
fpr.UnlockAll();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Jit64::fsign(UGeckoInstruction inst)
|
void Jit64::fsign(UGeckoInstruction inst)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user