From 36d6a165590ceed0ada72e956ea2de5c87145979 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 21 May 2015 12:33:37 +0200 Subject: [PATCH] Jit64: merge ps_maddXX into fmaddXX --- Source/Core/Core/PowerPC/Jit64/Jit.h | 1 - .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 12 +-- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 51 ++++++---- Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp | 94 ------------------- 4 files changed, 39 insertions(+), 119 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 0a367f8fa4..f068030b61 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -194,7 +194,6 @@ public: void ps_mr(UGeckoInstruction inst); void ps_sign(UGeckoInstruction inst); //aggregate void ps_mergeXX(UGeckoInstruction inst); - void ps_maddXX(UGeckoInstruction inst); void ps_res(UGeckoInstruction inst); void ps_rsqrte(UGeckoInstruction inst); void ps_sum(UGeckoInstruction inst); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 8b9dc5d3c2..8dd66132ca 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -122,8 +122,8 @@ static GekkoOPTemplate table4_2[] = {11, &Jit64::ps_sum}, // ps_sum1 {12, &Jit64::ps_muls}, // ps_muls0 {13, &Jit64::ps_muls}, // ps_muls1 - {14, &Jit64::ps_maddXX}, // ps_madds0 - {15, &Jit64::ps_maddXX}, // ps_madds1 + {14, &Jit64::fmaddXX}, // ps_madds0 + {15, &Jit64::fmaddXX}, // ps_madds1 {18, &Jit64::fp_arith}, // ps_div {20, &Jit64::fp_arith}, // ps_sub {21, &Jit64::fp_arith}, // ps_add @@ -131,10 +131,10 @@ static GekkoOPTemplate table4_2[] = {24, &Jit64::ps_res}, // ps_res {25, &Jit64::fp_arith}, // ps_mul {26, &Jit64::ps_rsqrte}, // ps_rsqrte - {28, &Jit64::ps_maddXX}, // ps_msub - {29, &Jit64::ps_maddXX}, // ps_madd - {30, &Jit64::ps_maddXX}, // ps_nmsub - {31, &Jit64::ps_maddXX}, // ps_nmadd + {28, &Jit64::fmaddXX}, // ps_msub + {29, &Jit64::fmaddXX}, // ps_madd + {30, &Jit64::fmaddXX}, // ps_nmsub + {31, &Jit64::fmaddXX}, // ps_nmadd }; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 5313a76da0..a848c72e6e 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -105,14 +105,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst) int b = inst.FB; int c = inst.FC; int d = inst.FD; - bool single = inst.OPCD == 59; + bool single = inst.OPCD == 4 || inst.OPCD == 59; bool round_input = single && !jit->js.op->fprIsSingle[c]; - bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c]; - if (cpu_info.bAtom) - packed = false; + bool packed = inst.OPCD == 4 || + (!cpu_info.bAtom && single && + jit->js.op->fprIsDuplicated[a] && + jit->js.op->fprIsDuplicated[b] && + jit->js.op->fprIsDuplicated[c]); fpr.Lock(a, b, c, d); + switch(inst.SUBOP5) + { + case 14: + MOVDDUP(XMM0, fpr.R(c)); + if (round_input) + Force25BitPrecision(XMM0, R(XMM0), XMM1); + break; + case 15: + avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); + if (round_input) + Force25BitPrecision(XMM0, R(XMM0), XMM1); + break; + default: + bool special = inst.SUBOP5 == 30 && (!cpu_info.bFMA || Core::g_want_determinism); + X64Reg tmp1 = special ? XMM1 : XMM0; + X64Reg tmp2 = special ? XMM0 : XMM1; + if (single && round_input) + Force25BitPrecision(tmp1, fpr.R(c), tmp2); + else + MOVAPD(tmp1, fpr.R(c)); + break; + } + // While we don't know if any games are actually affected (replays seem to work with all the usual // suspects for desyncing), netplay and other applications need absolute perfect determinism, so // be extra careful and don't use FMA, even if in theory it might be okay. @@ -121,10 +146,6 @@ void Jit64::fmaddXX(UGeckoInstruction inst) // instances on different computers giving identical results. if (cpu_info.bFMA && !Core::g_want_determinism) { - if (single && round_input) - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - else - MOVAPD(XMM0, fpr.R(c)); // Statistics suggests b is a lot less likely to be unbound in practice, so // if we have to pick one of a or b to bind, let's make it b. fpr.BindToRegister(b, true, false); @@ -136,6 +157,8 @@ void Jit64::fmaddXX(UGeckoInstruction inst) else VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); break; + case 14: //madds0 + case 15: //madds1 case 29: //madd if (packed) VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); @@ -162,11 +185,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst) } else if (inst.SUBOP5 == 30) //nmsub { - // nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately - if (single && round_input) - Force25BitPrecision(XMM1, fpr.R(c), XMM0); - else - MOVAPD(XMM1, fpr.R(c)); + // We implement nmsub a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately. MOVAPD(XMM0, fpr.R(b)); if (packed) { @@ -181,16 +200,12 @@ void Jit64::fmaddXX(UGeckoInstruction inst) } else { - if (single && round_input) - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - else - MOVAPD(XMM0, fpr.R(c)); if (packed) { MULPD(XMM0, fpr.R(a)); if (inst.SUBOP5 == 28) //msub SUBPD(XMM0, fpr.R(b)); - else //(n)madd + else //(n)madd(s[01]) ADDPD(XMM0, fpr.R(b)); } else diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index 8154151be5..9a4a6186bf 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -224,100 +224,6 @@ void Jit64::ps_res(UGeckoInstruction inst) gpr.UnlockAllX(); } -//TODO: add optimized cases -void Jit64::ps_maddXX(UGeckoInstruction inst) -{ - INSTRUCTION_START - JITDISABLE(bJITPairedOff); - FALLBACK_IF(inst.Rc); - - int a = inst.FA; - int b = inst.FB; - int c = inst.FC; - int d = inst.FD; - bool fma = cpu_info.bFMA && !Core::g_want_determinism; - bool round_input = !jit->js.op->fprIsSingle[c]; - fpr.Lock(a, b, c, d); - - if (fma) - fpr.BindToRegister(b, true, false); - - if (inst.SUBOP5 == 14) - { - MOVDDUP(XMM0, fpr.R(c)); - if (round_input) - Force25BitPrecision(XMM0, R(XMM0), XMM1); - } - else if (inst.SUBOP5 == 15) - { - avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); - if (round_input) - Force25BitPrecision(XMM0, R(XMM0), XMM1); - } - else - { - if (round_input) - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - else - MOVAPD(XMM0, fpr.R(c)); - } - - if (fma) - { - switch (inst.SUBOP5) - { - case 14: //madds0 - case 15: //madds1 - case 29: //madd - VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); - break; - case 28: //msub - VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); - break; - case 30: //nmsub - VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); - break; - case 31: //nmadd - VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); - break; - } - } - else - { - switch (inst.SUBOP5) - { - case 14: //madds0 - case 15: //madds1 - case 29: //madd - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - break; - case 28: //msub - MULPD(XMM0, fpr.R(a)); - SUBPD(XMM0, fpr.R(b)); - break; - case 30: //nmsub - MULPD(XMM0, fpr.R(a)); - SUBPD(XMM0, fpr.R(b)); - PXOR(XMM0, M(psSignBits)); - break; - case 31: //nmadd - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - PXOR(XMM0, M(psSignBits)); - break; - default: - _assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!"); - return; - } - } - - fpr.BindToRegister(d, false); - ForceSinglePrecision(fpr.RX(d), R(XMM0)); - SetFPRFIfNeeded(fpr.RX(d)); - fpr.UnlockAll(); -} - void Jit64::ps_cmpXX(UGeckoInstruction inst) { INSTRUCTION_START