From 72c96c20d31327f0885bbbc7bf3fc104cd1b6e99 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sat, 11 Oct 2014 14:22:44 -0700 Subject: [PATCH] JIT: more optimizing of float ops based on known input characteristics If the inputs are both float singles, and the top half is known to be identical to the bottom half, we can use packed arithmetic instead of scalar to skip the movddup. This is slower on a few rather old CPUs, plus the Atom+Silvermont, so detect Atom and disable it in that case. Also avoid PPC_FP on stores if we know that the output came from a float op. --- Source/Core/Common/CPUDetect.h | 2 +- Source/Core/Common/x64CPUDetect.cpp | 6 + Source/Core/Core/PowerPC/Jit64/Jit.h | 2 +- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 116 ++++++++++++++---- .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 11 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 10 +- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 2 +- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 39 +++++- Source/Core/Core/PowerPC/PPCAnalyst.h | 5 + 9 files changed, 154 insertions(+), 39 deletions(-) diff --git a/Source/Core/Common/CPUDetect.h b/Source/Core/Common/CPUDetect.h index 752d26afb2..c63076ff7b 100644 --- a/Source/Core/Common/CPUDetect.h +++ b/Source/Core/Common/CPUDetect.h @@ -50,10 +50,10 @@ struct CPUInfo bool bMOVBE; // This flag indicates that the hardware supports some mode // in which denormal inputs _and_ outputs are automatically set to (signed) zero. 
- // TODO: ARM bool bFlushToZero; bool bLAHFSAHF64; bool bLongMode; + bool bAtom; // ARM specific CPUInfo bool bSwp; diff --git a/Source/Core/Common/x64CPUDetect.cpp b/Source/Core/Common/x64CPUDetect.cpp index 31409685e8..8ad8046c8b 100644 --- a/Source/Core/Common/x64CPUDetect.cpp +++ b/Source/Core/Common/x64CPUDetect.cpp @@ -129,6 +129,12 @@ void CPUInfo::Detect() if (max_std_fn >= 1) { __cpuid(cpu_id, 0x00000001); + int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff); + int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0); + // Detect people unfortunate enough to be running Dolphin on an Atom + if (family == 6 && (model == 0x1C || model == 0x26 ||model == 0x27 || model == 0x35 || model == 0x36 || + model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D)) + bAtom = true; logical_cpu_count = (cpu_id[1] >> 16) & 0xFF; ht = (cpu_id[3] >> 28) & 1; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 5793d744c8..a2ec9f2a66 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -151,7 +151,7 @@ public: void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), - void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); + void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool packed = false, bool roundRHS = false); void FloatCompare(UGeckoInstruction inst, bool upper = false); // OPCODES diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 84e1ce4969..f404ccd88b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ 
b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -11,11 +11,12 @@ using namespace Gen; static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL}; +static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL}; static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000}; void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), - void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) + void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool packed, bool roundRHS) { fpr.Lock(d, a, b); fpr.BindToRegister(d, d == a || d == b || !single); @@ -34,12 +35,19 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X } else { - avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible); + avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible); } if (single) { - ForceSinglePrecisionS(fpr.RX(d)); - MOVDDUP(fpr.RX(d), fpr.R(d)); + if (packed) + { + ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); + } + else + { + ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d)); + MOVDDUP(fpr.RX(d), fpr.R(d)); + } } SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); @@ -63,14 +71,32 @@ void Jit64::fp_arith(UGeckoInstruction inst) JITDISABLE(bJITFloatingPointOff); FALLBACK_IF(inst.Rc); + int a = inst.FA; + int b = inst.FB; + int c = inst.FC; + int d = inst.FD; + int arg2 = inst.SUBOP5 == 25 ? c : b; + bool single = inst.OPCD == 59; bool round_input = single && !jit->js.op->fprIsSingle[inst.FC]; + // If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by + // using packed arithmetic instead. 
+ bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2]; + // Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case. + // Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones. + if (inst.SUBOP5 == 18 || cpu_info.bAtom) + packed = false; + switch (inst.SUBOP5) { - case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div - case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub - case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add - case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, round_input); break; //mul + case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD, + packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, inst, packed); break; + case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD, + packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, inst, packed); break; + case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD, + packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, inst, packed); break; + case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD, + packed ? 
&XEmitter::MULPD : &XEmitter::MULSD, inst, packed, round_input); break; default: _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!"); } @@ -88,6 +114,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst) int d = inst.FD; bool single = inst.OPCD == 59; bool round_input = single && !jit->js.op->fprIsSingle[c]; + bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c]; + if (cpu_info.bAtom) + packed = false; fpr.Lock(a, b, c, d); @@ -109,20 +138,32 @@ void Jit64::fmaddXX(UGeckoInstruction inst) switch (inst.SUBOP5) { case 28: //msub - VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); + if (packed) + VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); + else + VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); break; case 29: //madd - VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); + if (packed) + VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); + else + VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); break; // PowerPC and x86 define NMADD/NMSUB differently // x86: D = -A*C (+/-) B // PPC: D = -(A*C (+/-) B) // so we have to swap them; the ADD/SUB here isn't a typo. 
case 30: //nmsub - VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); + if (packed) + VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); + else + VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); break; case 31: //nmadd - VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); + if (packed) + VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); + else + VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); break; } } @@ -133,9 +174,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst) Force25BitPrecision(XMM1, fpr.R(c), XMM0); else MOVAPD(XMM1, fpr.R(c)); - MULSD(XMM1, fpr.R(a)); MOVAPD(XMM0, fpr.R(b)); - SUBSD(XMM0, R(XMM1)); + if (packed) + { + MULPD(XMM1, fpr.R(a)); + SUBPD(XMM0, R(XMM1)); + } + else + { + MULSD(XMM1, fpr.R(a)); + SUBSD(XMM0, R(XMM1)); + } } else { @@ -143,22 +192,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst) Force25BitPrecision(XMM0, fpr.R(c), XMM1); else MOVAPD(XMM0, fpr.R(c)); - MULSD(XMM0, fpr.R(a)); - if (inst.SUBOP5 == 28) //msub - SUBSD(XMM0, fpr.R(b)); - else //(n)madd - ADDSD(XMM0, fpr.R(b)); + if (packed) + { + MULPD(XMM0, fpr.R(a)); + if (inst.SUBOP5 == 28) //msub + SUBPD(XMM0, fpr.R(b)); + else //(n)madd + ADDPD(XMM0, fpr.R(b)); + } + else + { + MULSD(XMM0, fpr.R(a)); + if (inst.SUBOP5 == 28) + SUBSD(XMM0, fpr.R(b)); + else + ADDSD(XMM0, fpr.R(b)); + } if (inst.SUBOP5 == 31) //nmadd - PXOR(XMM0, M((void*)&psSignBits)); + PXOR(XMM0, M((void*)&(packed ? psSignBits2 : psSignBits))); } fpr.BindToRegister(d, !single); - //YES it is necessary to dupe the result :( - //TODO : analysis - does the top reg get used? If so, dupe, if not, don't. 
+ if (single) { - ForceSinglePrecisionS(XMM0); - MOVDDUP(fpr.RX(d), R(XMM0)); + if (packed) + { + ForceSinglePrecisionP(fpr.RX(d), XMM0); + } + else + { + ForceSinglePrecisionS(fpr.RX(d), XMM0); + MOVDDUP(fpr.RX(d), fpr.R(d)); + } } else { @@ -427,7 +493,7 @@ void Jit64::frspx(UGeckoInstruction inst) fpr.BindToRegister(d, d == b); if (b != d) MOVAPD(fpr.RX(d), fpr.R(b)); - ForceSinglePrecisionS(fpr.RX(d)); + ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d)); MOVDDUP(fpr.RX(d), fpr.R(d)); SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index a859a53ff9..2a246b3a0b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -108,8 +108,15 @@ void Jit64::stfXXX(UGeckoInstruction inst) if (single) { - fpr.BindToRegister(s, true, false); - ConvertDoubleToSingle(XMM0, fpr.RX(s)); + if (jit->js.op->fprIsStoreSafe[s]) + { + CVTSD2SS(XMM0, fpr.R(s)); + } + else + { + fpr.BindToRegister(s, true, false); + ConvertDoubleToSingle(XMM0, fpr.RX(s)); + } MOVD_xmm(R(RSCRATCH), XMM0); } else diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 30573246c0..a9808f7d07 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -667,13 +667,17 @@ void EmuCodeBlock::WriteToConstRamAddress(int accessSize, OpArg arg, u32 address MOV(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), R(reg)); } -void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm) +void EmuCodeBlock::ForceSinglePrecisionS(X64Reg output, X64Reg input) { // Most games don't need these. Zelda requires it though - some platforms get stuck without them. 
if (jit->jo.accurateSinglePrecision) { - CVTSD2SS(xmm, R(xmm)); - CVTSS2SD(xmm, R(xmm)); + CVTSD2SS(input, R(input)); + CVTSS2SD(output, R(input)); + } + else if (output != input) + { + MOVAPD(output, R(input)); } } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 3487fb374f..67a01249f2 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -130,7 +130,7 @@ public: void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8), Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm); - void ForceSinglePrecisionS(Gen::X64Reg xmm); + void ForceSinglePrecisionS(Gen::X64Reg output, Gen::X64Reg input); void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input); void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp); diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 0d72e8a5a4..b5a5c22716 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -830,18 +830,45 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 fprInUse[code[i].fregOut] = true; } - // Forward scan, for flags that need the other direction for calculation - BitSet32 fprIsSingle; + // Forward scan, for flags that need the other direction for calculation. 
+ BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe; for (u32 i = 0; i < block->m_num_instructions; i++) { code[i].fprIsSingle = fprIsSingle; + code[i].fprIsDuplicated = fprIsDuplicated; + code[i].fprIsStoreSafe = fprIsStoreSafe; if (code[i].fregOut >= 0) { - // This instruction outputs float, so we can omit the special rounding done in fmuls/fmadds - if (code[i].opinfo->type == OPTYPE_SINGLEFP || code[i].opinfo->type == OPTYPE_PS || strncmp(code[i].opinfo->opname, "lfs", 3)) + fprIsSingle[code[i].fregOut] = false; + fprIsDuplicated[code[i].fregOut] = false; + fprIsStoreSafe[code[i].fregOut] = false; + // Single, duplicated, and doesn't need PPC_FP. + if (code[i].opinfo->type == OPTYPE_SINGLEFP) + { fprIsSingle[code[i].fregOut] = true; - else - fprIsSingle[code[i].fregOut] = false; + fprIsDuplicated[code[i].fregOut] = true; + fprIsStoreSafe[code[i].fregOut] = true; + } + // Single and duplicated, but might be a denormal (not safe to skip PPC_FP). + // TODO: if we go directly from a load to store, skip conversion entirely? + // TODO: if we go directly from a load to a float instruction, and the value isn't used + // for anything else, we can skip PPC_FP on a load too. + if (!strncmp(code[i].opinfo->opname, "lfs", 3)) + { + fprIsSingle[code[i].fregOut] = true; + fprIsDuplicated[code[i].fregOut] = true; + } + // Paired are still floats, but the top/bottom halves may differ. + if (code[i].opinfo->type == OPTYPE_PS || code[i].opinfo->type == OPTYPE_LOADPS) + { + fprIsSingle[code[i].fregOut] = true; + fprIsStoreSafe[code[i].fregOut] = true; + } + // Careful: changing the float mode in a block breaks this optimization, since + // a previous float op might have had FTZ
+ if (!strncmp(code[i].opinfo->opname, "mtfs", 4)) + fprIsStoreSafe = BitSet32(0); } } return address; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index e68be7a5ee..59c637e5b2 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -53,6 +53,11 @@ struct CodeOp //16B BitSet32 fprInXmm; // whether an fpr is known to be an actual single-precision value at this point in the block. BitSet32 fprIsSingle; + // whether an fpr is known to have identical top and bottom halves (e.g. due to a single instruction) + BitSet32 fprIsDuplicated; + // whether an fpr is the output of a single-precision arithmetic instruction, i.e. whether we can safely + // skip PPC_FP. + BitSet32 fprIsStoreSafe; }; struct BlockStats