From 72c96c20d31327f0885bbbc7bf3fc104cd1b6e99 Mon Sep 17 00:00:00 2001 From: Fiora Date: Sat, 11 Oct 2014 14:22:44 -0700 Subject: [PATCH] JIT: more optimizing of float ops based on known input characteristics If the inputs are both float singles, and the top half is known to be identical to the bottom half, we can use packed arithmetic instead of scalar to skip the movddup. This is slower on a few rather old CPUs, plus the Atom+Silvermont, so detect Atom and disable it in that case. Also avoid PPC_FP on stores if we know that the output came from a float op. --- Source/Core/Common/CPUDetect.h | 2 +- Source/Core/Common/x64CPUDetect.cpp | 6 + Source/Core/Core/PowerPC/Jit64/Jit.h | 2 +- .../Core/PowerPC/Jit64/Jit_FloatingPoint.cpp | 116 ++++++++++++++---- .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 11 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 10 +- Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 2 +- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 39 +++++- Source/Core/Core/PowerPC/PPCAnalyst.h | 5 + 9 files changed, 154 insertions(+), 39 deletions(-) diff --git a/Source/Core/Common/CPUDetect.h b/Source/Core/Common/CPUDetect.h index 752d26afb2..c63076ff7b 100644 --- a/Source/Core/Common/CPUDetect.h +++ b/Source/Core/Common/CPUDetect.h @@ -50,10 +50,10 @@ struct CPUInfo bool bMOVBE; // This flag indicates that the hardware supports some mode // in which denormal inputs _and_ outputs are automatically set to (signed) zero. 
- // TODO: ARM bool bFlushToZero; bool bLAHFSAHF64; bool bLongMode; + bool bAtom; // ARM specific CPUInfo bool bSwp; diff --git a/Source/Core/Common/x64CPUDetect.cpp b/Source/Core/Common/x64CPUDetect.cpp index 31409685e8..8ad8046c8b 100644 --- a/Source/Core/Common/x64CPUDetect.cpp +++ b/Source/Core/Common/x64CPUDetect.cpp @@ -129,6 +129,12 @@ void CPUInfo::Detect() if (max_std_fn >= 1) { __cpuid(cpu_id, 0x00000001); + int family = ((cpu_id[0] >> 8) & 0xf) + ((cpu_id[0] >> 20) & 0xff); + int model = ((cpu_id[0] >> 4) & 0xf) + ((cpu_id[0] >> 12) & 0xf0); + // Detect people unfortunate enough to be running Dolphin on an Atom + if (family == 6 && (model == 0x1C || model == 0x26 ||model == 0x27 || model == 0x35 || model == 0x36 || + model == 0x37 || model == 0x4A || model == 0x4D || model == 0x5A || model == 0x5D)) + bAtom = true; logical_cpu_count = (cpu_id[1] >> 16) & 0xFF; ht = (cpu_id[3] >> 28) & 1; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index 5793d744c8..a2ec9f2a66 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -151,7 +151,7 @@ public: void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), - void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); + void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool packed = false, bool roundRHS = false); void FloatCompare(UGeckoInstruction inst, bool upper = false); // OPCODES diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index 84e1ce4969..f404ccd88b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ 
b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -11,11 +11,12 @@ using namespace Gen; static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL}; +static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL}; static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000}; void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), - void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS) + void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool packed, bool roundRHS) { fpr.Lock(d, a, b); fpr.BindToRegister(d, d == a || d == b || !single); @@ -34,12 +35,19 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X } else { - avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible); + avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), packed, reversible); } if (single) { - ForceSinglePrecisionS(fpr.RX(d)); - MOVDDUP(fpr.RX(d), fpr.R(d)); + if (packed) + { + ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d)); + } + else + { + ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d)); + MOVDDUP(fpr.RX(d), fpr.R(d)); + } } SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); @@ -63,14 +71,32 @@ void Jit64::fp_arith(UGeckoInstruction inst) JITDISABLE(bJITFloatingPointOff); FALLBACK_IF(inst.Rc); + int a = inst.FA; + int b = inst.FB; + int c = inst.FC; + int d = inst.FD; + int arg2 = inst.SUBOP5 == 25 ? c : b; + bool single = inst.OPCD == 59; bool round_input = single && !jit->js.op->fprIsSingle[inst.FC]; + // If both the inputs are known to have identical top and bottom halves, we can skip the MOVDDUP at the end by + // using packed arithmetic instead. 
+ bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[arg2]; + // Packed divides are slower than scalar divides on basically all x86, so this optimization isn't worth it in that case. + // Atoms (and a few really old CPUs) are also slower on packed operations than scalar ones. + if (inst.SUBOP5 == 18 || cpu_info.bAtom) + packed = false; + switch (inst.SUBOP5) { - case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div - case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub - case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add - case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, round_input); break; //mul + case 18: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VDIVPD : &XEmitter::VDIVSD, + packed ? &XEmitter::DIVPD : &XEmitter::DIVSD, inst, packed); break; + case 20: fp_tri_op(d, a, b, false, single, packed ? &XEmitter::VSUBPD : &XEmitter::VSUBSD, + packed ? &XEmitter::SUBPD : &XEmitter::SUBSD, inst, packed); break; + case 21: fp_tri_op(d, a, b, true, single, packed ? &XEmitter::VADDPD : &XEmitter::VADDSD, + packed ? &XEmitter::ADDPD : &XEmitter::ADDSD, inst, packed); break; + case 25: fp_tri_op(d, a, c, true, single, packed ? &XEmitter::VMULPD : &XEmitter::VMULSD, + packed ? 
&XEmitter::MULPD : &XEmitter::MULSD, inst, packed, round_input); break; default: _assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!"); } @@ -88,6 +114,9 @@ void Jit64::fmaddXX(UGeckoInstruction inst) int d = inst.FD; bool single = inst.OPCD == 59; bool round_input = single && !jit->js.op->fprIsSingle[c]; + bool packed = single && jit->js.op->fprIsDuplicated[a] && jit->js.op->fprIsDuplicated[b] && jit->js.op->fprIsDuplicated[c]; + if (cpu_info.bAtom) + packed = false; fpr.Lock(a, b, c, d); @@ -109,20 +138,32 @@ void Jit64::fmaddXX(UGeckoInstruction inst) switch (inst.SUBOP5) { case 28: //msub - VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); + if (packed) + VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); + else + VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); break; case 29: //madd - VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); + if (packed) + VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); + else + VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); break; // PowerPC and x86 define NMADD/NMSUB differently // x86: D = -A*C (+/-) B // PPC: D = -(A*C (+/-) B) // so we have to swap them; the ADD/SUB here isn't a typo. 
case 30: //nmsub - VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); + if (packed) + VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); + else + VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); break; case 31: //nmadd - VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); + if (packed) + VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); + else + VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); break; } } @@ -133,9 +174,17 @@ void Jit64::fmaddXX(UGeckoInstruction inst) Force25BitPrecision(XMM1, fpr.R(c), XMM0); else MOVAPD(XMM1, fpr.R(c)); - MULSD(XMM1, fpr.R(a)); MOVAPD(XMM0, fpr.R(b)); - SUBSD(XMM0, R(XMM1)); + if (packed) + { + MULPD(XMM1, fpr.R(a)); + SUBPD(XMM0, R(XMM1)); + } + else + { + MULSD(XMM1, fpr.R(a)); + SUBSD(XMM0, R(XMM1)); + } } else { @@ -143,22 +192,39 @@ void Jit64::fmaddXX(UGeckoInstruction inst) Force25BitPrecision(XMM0, fpr.R(c), XMM1); else MOVAPD(XMM0, fpr.R(c)); - MULSD(XMM0, fpr.R(a)); - if (inst.SUBOP5 == 28) //msub - SUBSD(XMM0, fpr.R(b)); - else //(n)madd - ADDSD(XMM0, fpr.R(b)); + if (packed) + { + MULPD(XMM0, fpr.R(a)); + if (inst.SUBOP5 == 28) //msub + SUBPD(XMM0, fpr.R(b)); + else //(n)madd + ADDPD(XMM0, fpr.R(b)); + } + else + { + MULSD(XMM0, fpr.R(a)); + if (inst.SUBOP5 == 28) + SUBSD(XMM0, fpr.R(b)); + else + ADDSD(XMM0, fpr.R(b)); + } if (inst.SUBOP5 == 31) //nmadd - PXOR(XMM0, M((void*)&psSignBits)); + PXOR(XMM0, M((void*)&(packed ? psSignBits2 : psSignBits))); } fpr.BindToRegister(d, !single); - //YES it is necessary to dupe the result :( - //TODO : analysis - does the top reg get used? If so, dupe, if not, don't. 
+ if (single) { - ForceSinglePrecisionS(XMM0); - MOVDDUP(fpr.RX(d), R(XMM0)); + if (packed) + { + ForceSinglePrecisionP(fpr.RX(d), XMM0); + } + else + { + ForceSinglePrecisionS(fpr.RX(d), XMM0); + MOVDDUP(fpr.RX(d), fpr.R(d)); + } } else { @@ -427,7 +493,7 @@ void Jit64::frspx(UGeckoInstruction inst) fpr.BindToRegister(d, d == b); if (b != d) MOVAPD(fpr.RX(d), fpr.R(b)); - ForceSinglePrecisionS(fpr.RX(d)); + ForceSinglePrecisionS(fpr.RX(d), fpr.RX(d)); MOVDDUP(fpr.RX(d), fpr.R(d)); SetFPRFIfNeeded(inst, fpr.RX(d)); fpr.UnlockAll(); diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index a859a53ff9..2a246b3a0b 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -108,8 +108,15 @@ void Jit64::stfXXX(UGeckoInstruction inst) if (single) { - fpr.BindToRegister(s, true, false); - ConvertDoubleToSingle(XMM0, fpr.RX(s)); + if (jit->js.op->fprIsStoreSafe[s]) + { + CVTSD2SS(XMM0, fpr.R(s)); + } + else + { + fpr.BindToRegister(s, true, false); + ConvertDoubleToSingle(XMM0, fpr.RX(s)); + } MOVD_xmm(R(RSCRATCH), XMM0); } else diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 30573246c0..a9808f7d07 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -667,13 +667,17 @@ void EmuCodeBlock::WriteToConstRamAddress(int accessSize, OpArg arg, u32 address MOV(accessSize, MDisp(RMEM, address & 0x3FFFFFFF), R(reg)); } -void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm) +void EmuCodeBlock::ForceSinglePrecisionS(X64Reg output, X64Reg input) { // Most games don't need these. Zelda requires it though - some platforms get stuck without them. 
if (jit->jo.accurateSinglePrecision) { - CVTSD2SS(xmm, R(xmm)); - CVTSS2SD(xmm, R(xmm)); + CVTSD2SS(input, R(input)); + CVTSS2SD(output, R(input)); + } + else if (output != input) + { + MOVAPD(output, R(input)); } } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 3487fb374f..67a01249f2 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -130,7 +130,7 @@ public: void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8), Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm); - void ForceSinglePrecisionS(Gen::X64Reg xmm); + void ForceSinglePrecisionS(Gen::X64Reg output, Gen::X64Reg input); void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input); void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp); diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 0d72e8a5a4..b5a5c22716 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -830,18 +830,45 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 fprInUse[code[i].fregOut] = true; } - // Forward scan, for flags that need the other direction for calculation - BitSet32 fprIsSingle; + // Forward scan, for flags that need the other direction for calculation. 
+ BitSet32 fprIsSingle, fprIsDuplicated, fprIsStoreSafe; for (u32 i = 0; i < block->m_num_instructions; i++) { code[i].fprIsSingle = fprIsSingle; + code[i].fprIsDuplicated = fprIsDuplicated; + code[i].fprIsStoreSafe = fprIsStoreSafe; if (code[i].fregOut >= 0) { - // This instruction outputs float, so we can omit the special rounding done in fmuls/fmadds - if (code[i].opinfo->type == OPTYPE_SINGLEFP || code[i].opinfo->type == OPTYPE_PS || strncmp(code[i].opinfo->opname, "lfs", 3)) + fprIsSingle[code[i].fregOut] = false; + fprIsDuplicated[code[i].fregOut] = false; + fprIsStoreSafe[code[i].fregOut] = false; + // Single, duplicated, and doesn't need PPC_FP. + if (code[i].opinfo->type == OPTYPE_SINGLEFP) + { fprIsSingle[code[i].fregOut] = true; - else - fprIsSingle[code[i].fregOut] = false; + fprIsDuplicated[code[i].fregOut] = true; + fprIsStoreSafe[code[i].fregOut] = true; + } + // Single and duplicated, but might be a denormal (not safe to skip PPC_FP). + // TODO: if we go directly from a load to store, skip conversion entirely? + // TODO: if we go directly from a load to a float instruction, and the value isn't used + // for anything else, we can skip PPC_FP on a load too. + if (!strncmp(code[i].opinfo->opname, "lfs", 3)) + { + fprIsSingle[code[i].fregOut] = true; + fprIsDuplicated[code[i].fregOut] = true; + } + // Paired are still floats, but the top/bottom halves may differ. + if (code[i].opinfo->type == OPTYPE_PS || code[i].opinfo->type == OPTYPE_LOADPS) + { + fprIsSingle[code[i].fregOut] = true; + fprIsStoreSafe[code[i].fregOut] = true; + } + // Careful: changing the float mode in a block breaks this optimization, since + // a previous float op might have had FTZ
+ if (!strncmp(code[i].opinfo->opname, "mtfs", 4)) + fprIsStoreSafe = BitSet32(0); } } return address; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index e68be7a5ee..59c637e5b2 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -53,6 +53,11 @@ struct CodeOp //16B BitSet32 fprInXmm; // whether an fpr is known to be an actual single-precision value at this point in the block. BitSet32 fprIsSingle; + // whether an fpr is known to have identical top and bottom halves (e.g. due to a single instruction) + BitSet32 fprIsDuplicated; + // whether an fpr is the output of a single-precision arithmetic instruction, i.e. whether we can safely + // skip PPC_FP. + BitSet32 fprIsStoreSafe; }; struct BlockStats