Jit64: Correctly handle NaNs for ps_maddsX

This commit is contained in:
JosJuice 2022-12-03 20:19:03 +01:00
parent af5596720f
commit eeef5363e4

View File

@ -350,11 +350,19 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] && inst.OPCD == 4 || (!cpu_info.bAtom && !software_fma && single && js.op->fprIsDuplicated[a] &&
js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]); js.op->fprIsDuplicated[b] && js.op->fprIsDuplicated[c]);
const bool subtract = inst.SUBOP5 == 28 || inst.SUBOP5 == 30; // msub, nmsub
const bool negate = inst.SUBOP5 == 30 || inst.SUBOP5 == 31; // nmsub, nmadd
const bool madds0 = inst.SUBOP5 == 14;
const bool madds1 = inst.SUBOP5 == 15;
const bool madds_accurate_nans = m_accurate_nans && (madds0 || madds1);
RCOpArg Ra; RCOpArg Ra;
RCOpArg Rb; RCOpArg Rb;
RCOpArg Rc; RCOpArg Rc;
RCX64Reg Rd; RCX64Reg Rd;
RCX64Reg scratch_guard; RCX64Reg scratch_guard;
RCX64Reg Rc_duplicated_guard;
X64Reg Rc_duplicated = XMM2;
if (software_fma) if (software_fma)
{ {
scratch_guard = fpr.Scratch(XMM2); scratch_guard = fpr.Scratch(XMM2);
@ -374,12 +382,14 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
Rc = fpr.Use(c, RCMode::Read); Rc = fpr.Use(c, RCMode::Read);
Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite); Rd = fpr.Bind(d, single ? RCMode::Write : RCMode::ReadWrite);
RegCache::Realize(Ra, Rb, Rc, Rd); RegCache::Realize(Ra, Rb, Rc, Rd);
}
const bool subtract = inst.SUBOP5 == 28 || inst.SUBOP5 == 30; // msub, nmsub if (madds_accurate_nans)
const bool negate = inst.SUBOP5 == 30 || inst.SUBOP5 == 31; // nmsub, nmadd {
const bool madds0 = inst.SUBOP5 == 14; Rc_duplicated_guard = fpr.Scratch();
const bool madds1 = inst.SUBOP5 == 15; RegCache::Realize(Rc_duplicated_guard);
Rc_duplicated = Rc_duplicated_guard;
}
}
X64Reg scratch_xmm = XMM0; X64Reg scratch_xmm = XMM0;
X64Reg result_xmm = XMM1; X64Reg result_xmm = XMM1;
@ -435,18 +445,30 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
{ {
result_xmm = XMM0; result_xmm = XMM0;
} }
if (madds_accurate_nans)
{
if (madds0)
MOVDDUP(Rc_duplicated, Rc);
else
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, Rc_duplicated, Rc, Rc, 3);
}
} }
else else
{ {
if (madds0) if (madds0)
{ {
MOVDDUP(result_xmm, Rc); MOVDDUP(result_xmm, Rc);
if (madds_accurate_nans)
MOVAPD(R(Rc_duplicated), result_xmm);
if (round_input) if (round_input)
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm); Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
} }
else if (madds1) else if (madds1)
{ {
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3); avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, result_xmm, Rc, Rc, 3);
if (madds_accurate_nans)
MOVAPD(R(Rc_duplicated), result_xmm);
if (round_input) if (round_input)
Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm); Force25BitPrecision(result_xmm, R(result_xmm), scratch_xmm);
} }
@ -510,7 +532,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
result_xmm = Rd; result_xmm = Rd;
} }
HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, Rc); // If packed, the clobber register must be XMM0. If not packed, the clobber register is unused.
HandleNaNs(inst, result_xmm, XMM0, Ra, Rb, madds_accurate_nans ? R(Rc_duplicated) : Rc);
if (single) if (single)
FinalizeSingleResult(Rd, R(result_xmm), packed, true); FinalizeSingleResult(Rd, R(result_xmm), packed, true);