Revert "JIT64: optimize CA calculations"

This commit is contained in:
Fiora 2014-09-05 10:26:30 -07:00
parent 97420c6ec6
commit 07e0c917c6
16 changed files with 534 additions and 530 deletions

View File

@ -175,15 +175,16 @@ struct Rectangle
} // namespace MathUtil
inline float pow2f(float x) {return x * x;}
inline double pow2(double x) {return x * x;}
float MathFloatVectorSum(const std::vector<float>&);
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))
#define ROUND_DOWN(x, a) ((x) & ~((a) - 1))
inline bool IsPow2(u32 imm) {return (imm & (imm - 1)) == 0;}
// Rounds down. 0 -> undefined
inline int IntLog2(u64 val)
inline int Log2(u64 val)
{
#if defined(__GNUC__)
return 63 - __builtin_clzll(val);

View File

@ -331,12 +331,9 @@ union UFPR
float f[2];
};
#define XER_CA_SHIFT 29
#define XER_OV_SHIFT 30
#define XER_SO_SHIFT 31
#define XER_CA_MASK (1U << XER_CA_SHIFT)
#define XER_OV_MASK (1U << XER_OV_SHIFT)
#define XER_SO_MASK (1U << XER_SO_SHIFT)
#define XER_CA_MASK 0x20000000
#define XER_OV_MASK 0x40000000
#define XER_SO_MASK 0x80000000
// XER
union UReg_XER
{

View File

@ -34,7 +34,7 @@ static GekkoOPTemplate primarytable[] =
{10, Interpreter::cmpli, {"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
{11, Interpreter::cmpi, {"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
{12, Interpreter::addic, {"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}},
{13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0, 1, 0, 0, 0}},
{13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0, 1, 0, 0, 0}},
{14, Interpreter::addi, {"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
{15, Interpreter::addis, {"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
@ -180,8 +180,8 @@ static GekkoOPTemplate table31[] =
{922, Interpreter::extshx, {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{954, Interpreter::extsbx, {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{536, Interpreter::srwx, {"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
{792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{24, Interpreter::slwx, {"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
{54, Interpreter::dcbst, {"dcbst", OPTYPE_DCACHE, 0, 5, 0, 0, 0}},
@ -260,7 +260,7 @@ static GekkoOPTemplate table31[] =
{339, Interpreter::mfspr, {"mfspr", OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}},
{467, Interpreter::mtspr, {"mtspr", OPTYPE_SPR, 0, 2, 0, 0, 0}},
{371, Interpreter::mftb, {"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}},
{512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA, 1, 0, 0, 0}},
{512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, 0, 1, 0, 0, 0}},
{595, Interpreter::mfsr, {"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},
{659, Interpreter::mfsrin, {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},

View File

@ -100,15 +100,13 @@ public:
void GenerateConstantOverflow(bool overflow);
void GenerateConstantOverflow(s64 val);
void GenerateOverflow();
void FinalizeCarryOverflow(bool ca, bool oe, bool inv = false);
void FinalizeCarryOverflow(bool oe, bool inv = false);
void GetCarryEAXAndClear();
void FinalizeCarryGenerateOverflowEAX(bool oe, bool inv = false);
void GenerateCarry();
void GenerateRC();
void ComputeRC(const Gen::OpArg & arg);
// use to extract bytes from a register using the regcache. offset is in bytes.
Gen::OpArg ExtractFromReg(int reg, int offset);
void AndWithMask(Gen::X64Reg reg, u32 mask);
bool CheckMergedBranch(int crf);
void DoMergedBranch();
// Reads a given bit of a given CR register part. Clobbers ABI_PARAM1,
// don't forget to xlock it before.
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
@ -120,8 +118,6 @@ public:
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm);
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
typedef u32 (*Operation)(u32 a, u32 b);
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);

View File

@ -193,8 +193,8 @@ static GekkoOPTemplate table31[] =
{922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
{536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
{824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
{792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{24, &Jit64::slwx}, //"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
{54, &Jit64::dcbst}, //"dcbst", OPTYPE_DCACHE, 0, 4}},

View File

@ -314,9 +314,6 @@ void RegCache::StoreFromRegister(size_t i, FlushMode mode)
void GPRRegCache::LoadRegister(size_t preg, X64Reg newLoc)
{
if (regs[preg].location.IsImm() && !regs[preg].location.offset)
emit->XOR(32, ::Gen::R(newLoc), ::Gen::R(newLoc));
else
emit->MOV(32, ::Gen::R(newLoc), regs[preg].location);
}

File diff suppressed because it is too large Load Diff

View File

@ -1104,7 +1104,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
Jit->JitSetCA();
FixupBranch cont = Jit->J();
Jit->SetJumpTarget(nocarry);
Jit->JitClearCAOV(true, false);
Jit->JitClearCA();
Jit->SetJumpTarget(cont);
regNormalRegClear(RI, I);
break;

View File

@ -802,11 +802,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
OR(32, M(&FPSCR), R(EAX));
}
void EmuCodeBlock::JitGetAndClearCAOV(bool oe)
void EmuCodeBlock::JitClearCA()
{
if (oe)
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_OV_MASK)); //XER.OV = 0
BTR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm8(29)); //carry = XER.CA, XER.CA = 0
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
}
void EmuCodeBlock::JitSetCA()
@ -814,20 +813,10 @@ void EmuCodeBlock::JitSetCA()
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1
}
// Some testing shows CA is set roughly ~1/3 of the time (relative to clears), so
// branchless calculation of CA is probably faster in general.
void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
void EmuCodeBlock::JitClearCAOV(bool oe)
{
SETcc(conditionCode, R(EAX));
MOVZX(32, 8, EAX, R(AL));
SHL(32, R(EAX), Imm8(XER_CA_SHIFT));
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); //XER.CA = 1
}
void EmuCodeBlock::JitClearCAOV(bool ca, bool oe)
{
u32 mask = (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF);
if (mask == 0xFFFFFFFF)
return;
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(mask));
if (oe)
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK & ~XER_OV_MASK)); //XER.CA, XER.OV = 0
else
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
}

View File

@ -50,10 +50,9 @@ public:
void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
void JitGetAndClearCAOV(bool oe);
void JitClearCA();
void JitSetCA();
void JitSetCAIf(Gen::CCFlags conditionCode);
void JitClearCAOV(bool ca, bool oe);
void JitClearCAOV(bool oe);
void ForceSinglePrecisionS(Gen::X64Reg xmm);
void ForceSinglePrecisionP(Gen::X64Reg xmm);

View File

@ -430,6 +430,7 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
{
code->wantsCR0 = false;
code->wantsCR1 = false;
code->wantsPS1 = false;
if (opinfo->flags & FL_USE_FPU)
block->m_fpa->any = true;
@ -457,15 +458,6 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false;
code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false;
code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false;
code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false;
// mfspr/mtspr can affect/use XER, so be super careful here
if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr
code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr
code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
int numOut = 0;
int numIn = 0;
if (opinfo->flags & FL_OUT_A)
@ -723,30 +715,26 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
block->m_broken = true;
}
// Scan for flag dependencies; assume the next block (or any branch that can leave the block)
// wants flags, to be safe.
// Scan for CR0 dependency
// assume next block wants flags to be safe
bool wantsCR0 = true;
bool wantsCR1 = true;
bool wantsPS1 = true;
bool wantsFPRF = true;
bool wantsCA = true;
for (int i = block->m_num_instructions - 1; i >= 0; i--)
{
bool opWantsCR0 = code[i].wantsCR0;
bool opWantsCR1 = code[i].wantsCR1;
bool opWantsFPRF = code[i].wantsFPRF;
bool opWantsCA = code[i].wantsCA;
wantsCR0 |= opWantsCR0 || code[i].canEndBlock;
wantsCR1 |= opWantsCR1 || code[i].canEndBlock;
wantsFPRF |= opWantsFPRF || code[i].canEndBlock;
wantsCA |= opWantsCA || code[i].canEndBlock;
wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock;
wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock;
wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock;
wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock;
code[i].wantsCR0 = wantsCR0;
code[i].wantsCR1 = wantsCR1;
code[i].wantsPS1 = wantsPS1;
code[i].wantsFPRF = wantsFPRF;
code[i].wantsCA = wantsCA;
wantsCR0 &= !code[i].outputCR0 || opWantsCR0;
wantsCR1 &= !code[i].outputCR1 || opWantsCR1;
wantsFPRF &= !code[i].outputFPRF || opWantsFPRF;
wantsCA &= !code[i].outputCA || opWantsCA;
wantsCR0 &= !code[i].outputCR0;
wantsCR1 &= !code[i].outputCR1;
wantsPS1 &= !code[i].outputPS1;
wantsFPRF &= !code[i].outputFPRF;
}
return address;
}

View File

@ -33,12 +33,12 @@ struct CodeOp //16B
bool isBranchTarget;
bool wantsCR0;
bool wantsCR1;
bool wantsPS1;
bool wantsFPRF;
bool wantsCA;
bool outputCR0;
bool outputCR1;
bool outputPS1;
bool outputFPRF;
bool outputCA;
bool canEndBlock;
bool skip; // followed BL-s for example
};

View File

@ -397,7 +397,7 @@ static wxString NiceSizeFormat(u64 _size)
// Find largest power of 2 less than _size.
// div 10 to get largest named unit less than _size
// 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc)
const u64 unit = IntLog2(std::max<u64>(_size, 1)) / 10;
const u64 unit = Log2(std::max<u64>(_size, 1)) / 10;
const u64 unit_size = (1 << (unit * 10));
// mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places

View File

@ -23,7 +23,7 @@ static u32 genBuffer()
}
StreamBuffer::StreamBuffer(u32 type, u32 size)
: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS))
: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS))
{
m_iterator = 0;
m_used_iterator = 0;

View File

@ -91,8 +91,8 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)
WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1));
WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1);
WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", IntLog2(samples));
WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", IntLog2(blkH), ~(blkW - 1));
WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples));
WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1));
if (samples == 1)
{
// 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments
@ -100,7 +100,7 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)
WRITE(p, " x_virtual_position = x_virtual_position << 1;\n");
}
WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1);
WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", IntLog2(blkW), blkH - 1);
WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1);
WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n");
WRITE(p, " sampleUv.y = y_block_position + y_offset;\n");

View File

@ -44,17 +44,17 @@ TEST(MathUtil, IsSNAN)
EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits<double>::signaling_NaN()));
}
TEST(MathUtil, IntLog2)
TEST(MathUtil, Log2)
{
EXPECT_EQ(0, IntLog2(1));
EXPECT_EQ(1, IntLog2(2));
EXPECT_EQ(2, IntLog2(4));
EXPECT_EQ(3, IntLog2(8));
EXPECT_EQ(63, IntLog2(0x8000000000000000ull));
EXPECT_EQ(0, Log2(1));
EXPECT_EQ(1, Log2(2));
EXPECT_EQ(2, Log2(4));
EXPECT_EQ(3, Log2(8));
EXPECT_EQ(63, Log2(0x8000000000000000ull));
// Rounding behavior.
EXPECT_EQ(3, IntLog2(15));
EXPECT_EQ(63, IntLog2(0xFFFFFFFFFFFFFFFFull));
EXPECT_EQ(3, Log2(15));
EXPECT_EQ(63, Log2(0xFFFFFFFFFFFFFFFFull));
}
TEST(MathUtil, FlushToZero)