Merge pull request #753 from FioraAeterna/integeropts

JIT64: various integer optimizations
This commit is contained in:
Ryan Houdek 2014-09-09 04:10:30 -05:00
commit 09c1ad1631
12 changed files with 494 additions and 527 deletions

View File

@@ -175,16 +175,15 @@ struct Rectangle
} // namespace MathUtil } // namespace MathUtil
inline float pow2f(float x) {return x * x;}
inline double pow2(double x) {return x * x;}
float MathFloatVectorSum(const std::vector<float>&); float MathFloatVectorSum(const std::vector<float>&);
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1)) #define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))
#define ROUND_DOWN(x, a) ((x) & ~((a) - 1)) #define ROUND_DOWN(x, a) ((x) & ~((a) - 1))
inline bool IsPow2(u32 imm) {return (imm & (imm - 1)) == 0;}
// Rounds down. 0 -> undefined // Rounds down. 0 -> undefined
inline int Log2(u64 val) inline int IntLog2(u64 val)
{ {
#if defined(__GNUC__) #if defined(__GNUC__)
return 63 - __builtin_clzll(val); return 63 - __builtin_clzll(val);

View File

@@ -331,9 +331,12 @@ union UFPR
float f[2]; float f[2];
}; };
#define XER_CA_MASK 0x20000000 #define XER_CA_SHIFT 29
#define XER_OV_MASK 0x40000000 #define XER_OV_SHIFT 30
#define XER_SO_MASK 0x80000000 #define XER_SO_SHIFT 31
#define XER_CA_MASK (1U << XER_CA_SHIFT)
#define XER_OV_MASK (1U << XER_OV_SHIFT)
#define XER_SO_MASK (1U << XER_SO_SHIFT)
// XER // XER
union UReg_XER union UReg_XER
{ {

View File

@@ -101,12 +101,14 @@ public:
void GenerateConstantOverflow(s64 val); void GenerateConstantOverflow(s64 val);
void GenerateOverflow(); void GenerateOverflow();
void FinalizeCarryOverflow(bool oe, bool inv = false); void FinalizeCarryOverflow(bool oe, bool inv = false);
void GetCarryRSCRATCHAndClear();
void FinalizeCarryGenerateOverflowRSCRATCH(bool oe, bool inv = false);
void GenerateCarry();
void GenerateRC();
void ComputeRC(const Gen::OpArg & arg); void ComputeRC(const Gen::OpArg & arg);
// Use to extract bytes from a register using the regcache. offset is in bytes.
Gen::OpArg ExtractFromReg(int reg, int offset);
void AndWithMask(Gen::X64Reg reg, u32 mask);
bool CheckMergedBranch(int crf);
void DoMergedBranch();
// Reads a given bit of a given CR register part. // Reads a given bit of a given CR register part.
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false); void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
// Clobbers RDX. // Clobbers RDX.
@@ -117,6 +119,8 @@ public:
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm); void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm);
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
typedef u32 (*Operation)(u32 a, u32 b); typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
@@ -147,8 +151,7 @@ public:
void addmex(UGeckoInstruction inst); void addmex(UGeckoInstruction inst);
void addzex(UGeckoInstruction inst); void addzex(UGeckoInstruction inst);
void extsbx(UGeckoInstruction inst); void extsXx(UGeckoInstruction inst);
void extshx(UGeckoInstruction inst);
void sc(UGeckoInstruction _inst); void sc(UGeckoInstruction _inst);
void rfi(UGeckoInstruction _inst); void rfi(UGeckoInstruction _inst);

View File

@@ -190,8 +190,8 @@ static GekkoOPTemplate table31[] =
{0, &Jit64::cmpXX}, //"cmp", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, {0, &Jit64::cmpXX}, //"cmp", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}},
{32, &Jit64::cmpXX}, //"cmpl", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, {32, &Jit64::cmpXX}, //"cmpl", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}},
{26, &Jit64::cntlzwx}, //"cntlzwx",OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {26, &Jit64::cntlzwx}, //"cntlzwx",OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {922, &Jit64::extsXx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {954, &Jit64::extsXx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},

File diff suppressed because it is too large Load Diff

View File

@@ -1106,7 +1106,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
Jit->JitSetCA(); Jit->JitSetCA();
FixupBranch cont = Jit->J(); FixupBranch cont = Jit->J();
Jit->SetJumpTarget(nocarry); Jit->SetJumpTarget(nocarry);
Jit->JitClearCA(); Jit->JitClearCAOV(false);
Jit->SetJumpTarget(cont); Jit->SetJumpTarget(cont);
regNormalRegClear(RI, I); regNormalRegClear(RI, I);
break; break;

View File

@@ -803,10 +803,11 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
OR(32, PPCSTATE(fpscr), R(RSCRATCH)); OR(32, PPCSTATE(fpscr), R(RSCRATCH));
} }
void EmuCodeBlock::JitGetAndClearCAOV(bool oe)
void EmuCodeBlock::JitClearCA()
{ {
AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 if (oe)
AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_OV_MASK)); //XER.OV = 0
BTR(32, PPCSTATE(spr[SPR_XER]), Imm8(29)); //carry = XER.CA, XER.CA = 0
} }
void EmuCodeBlock::JitSetCA() void EmuCodeBlock::JitSetCA()
@@ -814,6 +815,16 @@ void EmuCodeBlock::JitSetCA()
OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1 OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1
} }
// Some testing shows CA is set roughly ~1/3 of the time (relative to clears), so
// branchless calculation of CA is probably faster in general.
void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
{
SETcc(conditionCode, R(RSCRATCH));
MOVZX(32, 8, RSCRATCH, R(RSCRATCH));
SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT));
OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); //XER.CA = 1
}
void EmuCodeBlock::JitClearCAOV(bool oe) void EmuCodeBlock::JitClearCAOV(bool oe)
{ {
if (oe) if (oe)

View File

@@ -71,8 +71,9 @@ public:
void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0); void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false); void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
void JitClearCA(); void JitGetAndClearCAOV(bool oe);
void JitSetCA(); void JitSetCA();
void JitSetCAIf(Gen::CCFlags conditionCode);
void JitClearCAOV(bool oe); void JitClearCAOV(bool oe);
void ForceSinglePrecisionS(Gen::X64Reg xmm); void ForceSinglePrecisionS(Gen::X64Reg xmm);

View File

@@ -397,7 +397,7 @@ static wxString NiceSizeFormat(u64 _size)
// Find largest power of 2 less than _size. // Find largest power of 2 less than _size.
// div 10 to get largest named unit less than _size // div 10 to get largest named unit less than _size
// 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc) // 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc)
const u64 unit = Log2(std::max<u64>(_size, 1)) / 10; const u64 unit = IntLog2(std::max<u64>(_size, 1)) / 10;
const u64 unit_size = (1 << (unit * 10)); const u64 unit_size = (1 << (unit * 10));
// mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places // mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places

View File

@@ -23,7 +23,7 @@ static u32 genBuffer()
} }
StreamBuffer::StreamBuffer(u32 type, u32 size) StreamBuffer::StreamBuffer(u32 type, u32 size)
: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS)) : m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS))
{ {
m_iterator = 0; m_iterator = 0;
m_used_iterator = 0; m_used_iterator = 0;

View File

@@ -91,8 +91,8 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)
WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1)); WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1));
WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1); WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1);
WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples)); WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", IntLog2(samples));
WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1)); WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", IntLog2(blkH), ~(blkW - 1));
if (samples == 1) if (samples == 1)
{ {
// 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments // 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments
@@ -100,7 +100,7 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)
WRITE(p, " x_virtual_position = x_virtual_position << 1;\n"); WRITE(p, " x_virtual_position = x_virtual_position << 1;\n");
} }
WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1); WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1);
WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1); WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", IntLog2(blkW), blkH - 1);
WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n"); WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n");
WRITE(p, " sampleUv.y = y_block_position + y_offset;\n"); WRITE(p, " sampleUv.y = y_block_position + y_offset;\n");

View File

@@ -44,17 +44,17 @@ TEST(MathUtil, IsSNAN)
EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits<double>::signaling_NaN())); EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits<double>::signaling_NaN()));
} }
TEST(MathUtil, Log2) TEST(MathUtil, IntLog2)
{ {
EXPECT_EQ(0, Log2(1)); EXPECT_EQ(0, IntLog2(1));
EXPECT_EQ(1, Log2(2)); EXPECT_EQ(1, IntLog2(2));
EXPECT_EQ(2, Log2(4)); EXPECT_EQ(2, IntLog2(4));
EXPECT_EQ(3, Log2(8)); EXPECT_EQ(3, IntLog2(8));
EXPECT_EQ(63, Log2(0x8000000000000000ull)); EXPECT_EQ(63, IntLog2(0x8000000000000000ull));
// Rounding behavior. // Rounding behavior.
EXPECT_EQ(3, Log2(15)); EXPECT_EQ(3, IntLog2(15));
EXPECT_EQ(63, Log2(0xFFFFFFFFFFFFFFFFull)); EXPECT_EQ(63, IntLog2(0xFFFFFFFFFFFFFFFFull));
} }
TEST(MathUtil, FlushToZero) TEST(MathUtil, FlushToZero)