Merge pull request #753 from FioraAeterna/integeropts

JIT64: various integer optimizations
This commit is contained in:
Ryan Houdek 2014-09-09 04:10:30 -05:00
commit 09c1ad1631
12 changed files with 494 additions and 527 deletions

View File

@@ -175,16 +175,15 @@ struct Rectangle
} // namespace MathUtil } // namespace MathUtil
inline float pow2f(float x) {return x * x;}
inline double pow2(double x) {return x * x;}
float MathFloatVectorSum(const std::vector<float>&); float MathFloatVectorSum(const std::vector<float>&);
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1)) #define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))
#define ROUND_DOWN(x, a) ((x) & ~((a) - 1)) #define ROUND_DOWN(x, a) ((x) & ~((a) - 1))
inline bool IsPow2(u32 imm) {return (imm & (imm - 1)) == 0;}
// Rounds down. 0 -> undefined // Rounds down. 0 -> undefined
inline int Log2(u64 val) inline int IntLog2(u64 val)
{ {
#if defined(__GNUC__) #if defined(__GNUC__)
return 63 - __builtin_clzll(val); return 63 - __builtin_clzll(val);

View File

@@ -331,9 +331,12 @@ union UFPR
float f[2]; float f[2];
}; };
#define XER_CA_MASK 0x20000000 #define XER_CA_SHIFT 29
#define XER_OV_MASK 0x40000000 #define XER_OV_SHIFT 30
#define XER_SO_MASK 0x80000000 #define XER_SO_SHIFT 31
#define XER_CA_MASK (1U << XER_CA_SHIFT)
#define XER_OV_MASK (1U << XER_OV_SHIFT)
#define XER_SO_MASK (1U << XER_SO_SHIFT)
// XER // XER
union UReg_XER union UReg_XER
{ {

View File

@@ -101,12 +101,14 @@ public:
void GenerateConstantOverflow(s64 val); void GenerateConstantOverflow(s64 val);
void GenerateOverflow(); void GenerateOverflow();
void FinalizeCarryOverflow(bool oe, bool inv = false); void FinalizeCarryOverflow(bool oe, bool inv = false);
void GetCarryRSCRATCHAndClear();
void FinalizeCarryGenerateOverflowRSCRATCH(bool oe, bool inv = false);
void GenerateCarry();
void GenerateRC();
void ComputeRC(const Gen::OpArg & arg); void ComputeRC(const Gen::OpArg & arg);
// Use to extract bytes from a register using the regcache. offset is in bytes.
Gen::OpArg ExtractFromReg(int reg, int offset);
void AndWithMask(Gen::X64Reg reg, u32 mask);
bool CheckMergedBranch(int crf);
void DoMergedBranch();
// Reads a given bit of a given CR register part. // Reads a given bit of a given CR register part.
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false); void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
// Clobbers RDX. // Clobbers RDX.
@@ -117,6 +119,8 @@ public:
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true); Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm); void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm);
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false); void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
typedef u32 (*Operation)(u32 a, u32 b); typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
@@ -147,8 +151,7 @@ public:
void addmex(UGeckoInstruction inst); void addmex(UGeckoInstruction inst);
void addzex(UGeckoInstruction inst); void addzex(UGeckoInstruction inst);
void extsbx(UGeckoInstruction inst); void extsXx(UGeckoInstruction inst);
void extshx(UGeckoInstruction inst);
void sc(UGeckoInstruction _inst); void sc(UGeckoInstruction _inst);
void rfi(UGeckoInstruction _inst); void rfi(UGeckoInstruction _inst);

View File

@@ -190,8 +190,8 @@ static GekkoOPTemplate table31[] =
{0, &Jit64::cmpXX}, //"cmp", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, {0, &Jit64::cmpXX}, //"cmp", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}},
{32, &Jit64::cmpXX}, //"cmpl", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}}, {32, &Jit64::cmpXX}, //"cmpl", OPTYPE_INTEGER, FL_IN_AB | FL_SET_CRn}},
{26, &Jit64::cntlzwx}, //"cntlzwx",OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {26, &Jit64::cntlzwx}, //"cntlzwx",OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {922, &Jit64::extsXx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}}, {954, &Jit64::extsXx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}}, {824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},

File diff suppressed because it is too large Load Diff

View File

@@ -1106,7 +1106,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
Jit->JitSetCA(); Jit->JitSetCA();
FixupBranch cont = Jit->J(); FixupBranch cont = Jit->J();
Jit->SetJumpTarget(nocarry); Jit->SetJumpTarget(nocarry);
Jit->JitClearCA(); Jit->JitClearCAOV(false);
Jit->SetJumpTarget(cont); Jit->SetJumpTarget(cont);
regNormalRegClear(RI, I); regNormalRegClear(RI, I);
break; break;

View File

@@ -803,10 +803,11 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
OR(32, PPCSTATE(fpscr), R(RSCRATCH)); OR(32, PPCSTATE(fpscr), R(RSCRATCH));
} }
void EmuCodeBlock::JitGetAndClearCAOV(bool oe)
void EmuCodeBlock::JitClearCA()
{ {
AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 if (oe)
AND(32, PPCSTATE(spr[SPR_XER]), Imm32(~XER_OV_MASK)); //XER.OV = 0
BTR(32, PPCSTATE(spr[SPR_XER]), Imm8(29)); //carry = XER.CA, XER.CA = 0
} }
void EmuCodeBlock::JitSetCA() void EmuCodeBlock::JitSetCA()
@@ -814,6 +815,16 @@ void EmuCodeBlock::JitSetCA()
OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1 OR(32, PPCSTATE(spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1
} }
// Some testing shows CA is set roughly ~1/3 of the time (relative to clears), so
// branchless calculation of CA is probably faster in general.
void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
{
SETcc(conditionCode, R(RSCRATCH));
MOVZX(32, 8, RSCRATCH, R(RSCRATCH));
SHL(32, R(RSCRATCH), Imm8(XER_CA_SHIFT));
OR(32, PPCSTATE(spr[SPR_XER]), R(RSCRATCH)); //XER.CA = 1
}
void EmuCodeBlock::JitClearCAOV(bool oe) void EmuCodeBlock::JitClearCAOV(bool oe)
{ {
if (oe) if (oe)

View File

@@ -71,8 +71,9 @@ public:
void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0); void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false); void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
void JitClearCA(); void JitGetAndClearCAOV(bool oe);
void JitSetCA(); void JitSetCA();
void JitSetCAIf(Gen::CCFlags conditionCode);
void JitClearCAOV(bool oe); void JitClearCAOV(bool oe);
void ForceSinglePrecisionS(Gen::X64Reg xmm); void ForceSinglePrecisionS(Gen::X64Reg xmm);

View File

@@ -397,7 +397,7 @@ static wxString NiceSizeFormat(u64 _size)
// Find largest power of 2 less than _size. // Find largest power of 2 less than _size.
// div 10 to get largest named unit less than _size // div 10 to get largest named unit less than _size
// 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc) // 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc)
const u64 unit = Log2(std::max<u64>(_size, 1)) / 10; const u64 unit = IntLog2(std::max<u64>(_size, 1)) / 10;
const u64 unit_size = (1 << (unit * 10)); const u64 unit_size = (1 << (unit * 10));
// mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places // mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places

View File

@@ -23,7 +23,7 @@ static u32 genBuffer()
} }
StreamBuffer::StreamBuffer(u32 type, u32 size) StreamBuffer::StreamBuffer(u32 type, u32 size)
: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS)) : m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS))
{ {
m_iterator = 0; m_iterator = 0;
m_used_iterator = 0; m_used_iterator = 0;

View File

@@ -91,8 +91,8 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)
WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1)); WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1));
WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1); WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1);
WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples)); WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", IntLog2(samples));
WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1)); WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", IntLog2(blkH), ~(blkW - 1));
if (samples == 1) if (samples == 1)
{ {
// 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments // 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments
@@ -100,7 +100,7 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)
WRITE(p, " x_virtual_position = x_virtual_position << 1;\n"); WRITE(p, " x_virtual_position = x_virtual_position << 1;\n");
} }
WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1); WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1);
WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1); WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", IntLog2(blkW), blkH - 1);
WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n"); WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n");
WRITE(p, " sampleUv.y = y_block_position + y_offset;\n"); WRITE(p, " sampleUv.y = y_block_position + y_offset;\n");

View File

@@ -44,17 +44,17 @@ TEST(MathUtil, IsSNAN)
EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits<double>::signaling_NaN())); EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits<double>::signaling_NaN()));
} }
TEST(MathUtil, Log2) TEST(MathUtil, IntLog2)
{ {
EXPECT_EQ(0, Log2(1)); EXPECT_EQ(0, IntLog2(1));
EXPECT_EQ(1, Log2(2)); EXPECT_EQ(1, IntLog2(2));
EXPECT_EQ(2, Log2(4)); EXPECT_EQ(2, IntLog2(4));
EXPECT_EQ(3, Log2(8)); EXPECT_EQ(3, IntLog2(8));
EXPECT_EQ(63, Log2(0x8000000000000000ull)); EXPECT_EQ(63, IntLog2(0x8000000000000000ull));
// Rounding behavior. // Rounding behavior.
EXPECT_EQ(3, Log2(15)); EXPECT_EQ(3, IntLog2(15));
EXPECT_EQ(63, Log2(0xFFFFFFFFFFFFFFFFull)); EXPECT_EQ(63, IntLog2(0xFFFFFFFFFFFFFFFFull));
} }
TEST(MathUtil, FlushToZero) TEST(MathUtil, FlushToZero)