mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-25 07:21:14 +01:00
Revert "JIT64: optimize CA calculations"
This commit is contained in:
parent
97420c6ec6
commit
07e0c917c6
@ -175,15 +175,16 @@ struct Rectangle
|
||||
|
||||
} // namespace MathUtil
|
||||
|
||||
inline float pow2f(float x) {return x * x;}
|
||||
inline double pow2(double x) {return x * x;}
|
||||
|
||||
float MathFloatVectorSum(const std::vector<float>&);
|
||||
|
||||
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))
|
||||
#define ROUND_DOWN(x, a) ((x) & ~((a) - 1))
|
||||
|
||||
inline bool IsPow2(u32 imm) {return (imm & (imm - 1)) == 0;}
|
||||
|
||||
// Rounds down. 0 -> undefined
|
||||
inline int IntLog2(u64 val)
|
||||
inline int Log2(u64 val)
|
||||
{
|
||||
#if defined(__GNUC__)
|
||||
return 63 - __builtin_clzll(val);
|
||||
|
@ -331,12 +331,9 @@ union UFPR
|
||||
float f[2];
|
||||
};
|
||||
|
||||
#define XER_CA_SHIFT 29
|
||||
#define XER_OV_SHIFT 30
|
||||
#define XER_SO_SHIFT 31
|
||||
#define XER_CA_MASK (1U << XER_CA_SHIFT)
|
||||
#define XER_OV_MASK (1U << XER_OV_SHIFT)
|
||||
#define XER_SO_MASK (1U << XER_SO_SHIFT)
|
||||
#define XER_CA_MASK 0x20000000
|
||||
#define XER_OV_MASK 0x40000000
|
||||
#define XER_SO_MASK 0x80000000
|
||||
// XER
|
||||
union UReg_XER
|
||||
{
|
||||
|
@ -34,7 +34,7 @@ static GekkoOPTemplate primarytable[] =
|
||||
{10, Interpreter::cmpli, {"cmpli", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
|
||||
{11, Interpreter::cmpi, {"cmpi", OPTYPE_INTEGER, FL_IN_A | FL_SET_CRn, 1, 0, 0, 0}},
|
||||
{12, Interpreter::addic, {"addic", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA, 1, 0, 0, 0}},
|
||||
{13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CA | FL_SET_CR0, 1, 0, 0, 0}},
|
||||
{13, Interpreter::addic_rc, {"addic_rc", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A | FL_SET_CR0, 1, 0, 0, 0}},
|
||||
{14, Interpreter::addi, {"addi", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
|
||||
{15, Interpreter::addis, {"addis", OPTYPE_INTEGER, FL_OUT_D | FL_IN_A0, 1, 0, 0, 0}},
|
||||
|
||||
@ -180,8 +180,8 @@ static GekkoOPTemplate table31[] =
|
||||
{922, Interpreter::extshx, {"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{954, Interpreter::extsbx, {"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{536, Interpreter::srwx, {"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{792, Interpreter::srawx, {"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{824, Interpreter::srawix, {"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
{24, Interpreter::slwx, {"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT, 1, 0, 0, 0}},
|
||||
|
||||
{54, Interpreter::dcbst, {"dcbst", OPTYPE_DCACHE, 0, 5, 0, 0, 0}},
|
||||
@ -260,7 +260,7 @@ static GekkoOPTemplate table31[] =
|
||||
{339, Interpreter::mfspr, {"mfspr", OPTYPE_SPR, FL_OUT_D, 1, 0, 0, 0}},
|
||||
{467, Interpreter::mtspr, {"mtspr", OPTYPE_SPR, 0, 2, 0, 0, 0}},
|
||||
{371, Interpreter::mftb, {"mftb", OPTYPE_SYSTEM, FL_OUT_D | FL_TIMER, 1, 0, 0, 0}},
|
||||
{512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, FL_READ_CA | FL_SET_CA, 1, 0, 0, 0}},
|
||||
{512, Interpreter::mcrxr, {"mcrxr", OPTYPE_SYSTEM, 0, 1, 0, 0, 0}},
|
||||
{595, Interpreter::mfsr, {"mfsr", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},
|
||||
{659, Interpreter::mfsrin, {"mfsrin", OPTYPE_SYSTEM, FL_OUT_D, 3, 0, 0, 0}},
|
||||
|
||||
|
@ -100,15 +100,13 @@ public:
|
||||
void GenerateConstantOverflow(bool overflow);
|
||||
void GenerateConstantOverflow(s64 val);
|
||||
void GenerateOverflow();
|
||||
void FinalizeCarryOverflow(bool ca, bool oe, bool inv = false);
|
||||
void FinalizeCarryOverflow(bool oe, bool inv = false);
|
||||
void GetCarryEAXAndClear();
|
||||
void FinalizeCarryGenerateOverflowEAX(bool oe, bool inv = false);
|
||||
void GenerateCarry();
|
||||
void GenerateRC();
|
||||
void ComputeRC(const Gen::OpArg & arg);
|
||||
|
||||
// use to extract bytes from a register using the regcache. offset is in bytes.
|
||||
Gen::OpArg ExtractFromReg(int reg, int offset);
|
||||
void AndWithMask(Gen::X64Reg reg, u32 mask);
|
||||
bool CheckMergedBranch(int crf);
|
||||
void DoMergedBranch();
|
||||
|
||||
// Reads a given bit of a given CR register part. Clobbers ABI_PARAM1,
|
||||
// don't forget to xlock it before.
|
||||
void GetCRFieldBit(int field, int bit, Gen::X64Reg out, bool negate = false);
|
||||
@ -120,8 +118,6 @@ public:
|
||||
Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
|
||||
void SetFPRFIfNeeded(UGeckoInstruction inst, Gen::X64Reg xmm);
|
||||
|
||||
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
|
||||
|
||||
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
|
||||
typedef u32 (*Operation)(u32 a, u32 b);
|
||||
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
|
||||
|
@ -193,8 +193,8 @@ static GekkoOPTemplate table31[] =
|
||||
{922, &Jit64::extshx}, //"extshx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
|
||||
{954, &Jit64::extsbx}, //"extsbx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_S | FL_RC_BIT}},
|
||||
{536, &Jit64::srwx}, //"srwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
|
||||
{792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
|
||||
{824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_SET_CA | FL_RC_BIT}},
|
||||
{792, &Jit64::srawx}, //"srawx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
|
||||
{824, &Jit64::srawix}, //"srawix", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
|
||||
{24, &Jit64::slwx}, //"slwx", OPTYPE_INTEGER, FL_OUT_A | FL_IN_B | FL_IN_S | FL_RC_BIT}},
|
||||
|
||||
{54, &Jit64::dcbst}, //"dcbst", OPTYPE_DCACHE, 0, 4}},
|
||||
|
@ -314,9 +314,6 @@ void RegCache::StoreFromRegister(size_t i, FlushMode mode)
|
||||
|
||||
void GPRRegCache::LoadRegister(size_t preg, X64Reg newLoc)
|
||||
{
|
||||
if (regs[preg].location.IsImm() && !regs[preg].location.offset)
|
||||
emit->XOR(32, ::Gen::R(newLoc), ::Gen::R(newLoc));
|
||||
else
|
||||
emit->MOV(32, ::Gen::R(newLoc), regs[preg].location);
|
||||
}
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1104,7 +1104,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
|
||||
Jit->JitSetCA();
|
||||
FixupBranch cont = Jit->J();
|
||||
Jit->SetJumpTarget(nocarry);
|
||||
Jit->JitClearCAOV(true, false);
|
||||
Jit->JitClearCA();
|
||||
Jit->SetJumpTarget(cont);
|
||||
regNormalRegClear(RI, I);
|
||||
break;
|
||||
|
@ -802,11 +802,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
|
||||
OR(32, M(&FPSCR), R(EAX));
|
||||
}
|
||||
|
||||
void EmuCodeBlock::JitGetAndClearCAOV(bool oe)
|
||||
|
||||
void EmuCodeBlock::JitClearCA()
|
||||
{
|
||||
if (oe)
|
||||
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_OV_MASK)); //XER.OV = 0
|
||||
BTR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm8(29)); //carry = XER.CA, XER.CA = 0
|
||||
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
|
||||
}
|
||||
|
||||
void EmuCodeBlock::JitSetCA()
|
||||
@ -814,20 +813,10 @@ void EmuCodeBlock::JitSetCA()
|
||||
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1
|
||||
}
|
||||
|
||||
// Some testing shows CA is set roughly ~1/3 of the time (relative to clears), so
|
||||
// branchless calculation of CA is probably faster in general.
|
||||
void EmuCodeBlock::JitSetCAIf(CCFlags conditionCode)
|
||||
void EmuCodeBlock::JitClearCAOV(bool oe)
|
||||
{
|
||||
SETcc(conditionCode, R(EAX));
|
||||
MOVZX(32, 8, EAX, R(AL));
|
||||
SHL(32, R(EAX), Imm8(XER_CA_SHIFT));
|
||||
OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), R(EAX)); //XER.CA = 1
|
||||
}
|
||||
|
||||
void EmuCodeBlock::JitClearCAOV(bool ca, bool oe)
|
||||
{
|
||||
u32 mask = (ca ? ~XER_CA_MASK : 0xFFFFFFFF) & (oe ? ~XER_OV_MASK : 0xFFFFFFFF);
|
||||
if (mask == 0xFFFFFFFF)
|
||||
return;
|
||||
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(mask));
|
||||
if (oe)
|
||||
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK & ~XER_OV_MASK)); //XER.CA, XER.OV = 0
|
||||
else
|
||||
AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
|
||||
}
|
||||
|
@ -50,10 +50,9 @@ public:
|
||||
void SafeWriteF32ToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr, s32 offset, u32 registersInUse, int flags = 0);
|
||||
|
||||
void WriteToConstRamAddress(int accessSize, Gen::X64Reg arg, u32 address, bool swap = false);
|
||||
void JitGetAndClearCAOV(bool oe);
|
||||
void JitClearCA();
|
||||
void JitSetCA();
|
||||
void JitSetCAIf(Gen::CCFlags conditionCode);
|
||||
void JitClearCAOV(bool ca, bool oe);
|
||||
void JitClearCAOV(bool oe);
|
||||
|
||||
void ForceSinglePrecisionS(Gen::X64Reg xmm);
|
||||
void ForceSinglePrecisionP(Gen::X64Reg xmm);
|
||||
|
@ -430,6 +430,7 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
|
||||
{
|
||||
code->wantsCR0 = false;
|
||||
code->wantsCR1 = false;
|
||||
code->wantsPS1 = false;
|
||||
|
||||
if (opinfo->flags & FL_USE_FPU)
|
||||
block->m_fpa->any = true;
|
||||
@ -457,15 +458,6 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf
|
||||
code->outputFPRF = (opinfo->flags & FL_SET_FPRF) ? true : false;
|
||||
code->canEndBlock = (opinfo->flags & FL_ENDBLOCK) ? true : false;
|
||||
|
||||
code->wantsCA = (opinfo->flags & FL_READ_CA) ? true : false;
|
||||
code->outputCA = (opinfo->flags & FL_SET_CA) ? true : false;
|
||||
|
||||
// mfspr/mtspr can affect/use XER, so be super careful here
|
||||
if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 339) // mfspr
|
||||
code->wantsCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
|
||||
if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr
|
||||
code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER;
|
||||
|
||||
int numOut = 0;
|
||||
int numIn = 0;
|
||||
if (opinfo->flags & FL_OUT_A)
|
||||
@ -723,30 +715,26 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
|
||||
block->m_broken = true;
|
||||
}
|
||||
|
||||
// Scan for flag dependencies; assume the next block (or any branch that can leave the block)
|
||||
// wants flags, to be safe.
|
||||
// Scan for CR0 dependency
|
||||
// assume next block wants flags to be safe
|
||||
bool wantsCR0 = true;
|
||||
bool wantsCR1 = true;
|
||||
bool wantsPS1 = true;
|
||||
bool wantsFPRF = true;
|
||||
bool wantsCA = true;
|
||||
for (int i = block->m_num_instructions - 1; i >= 0; i--)
|
||||
{
|
||||
bool opWantsCR0 = code[i].wantsCR0;
|
||||
bool opWantsCR1 = code[i].wantsCR1;
|
||||
bool opWantsFPRF = code[i].wantsFPRF;
|
||||
bool opWantsCA = code[i].wantsCA;
|
||||
wantsCR0 |= opWantsCR0 || code[i].canEndBlock;
|
||||
wantsCR1 |= opWantsCR1 || code[i].canEndBlock;
|
||||
wantsFPRF |= opWantsFPRF || code[i].canEndBlock;
|
||||
wantsCA |= opWantsCA || code[i].canEndBlock;
|
||||
wantsCR0 |= code[i].wantsCR0 || code[i].canEndBlock;
|
||||
wantsCR1 |= code[i].wantsCR1 || code[i].canEndBlock;
|
||||
wantsPS1 |= code[i].wantsPS1 || code[i].canEndBlock;
|
||||
wantsFPRF |= code[i].wantsFPRF || code[i].canEndBlock;
|
||||
code[i].wantsCR0 = wantsCR0;
|
||||
code[i].wantsCR1 = wantsCR1;
|
||||
code[i].wantsPS1 = wantsPS1;
|
||||
code[i].wantsFPRF = wantsFPRF;
|
||||
code[i].wantsCA = wantsCA;
|
||||
wantsCR0 &= !code[i].outputCR0 || opWantsCR0;
|
||||
wantsCR1 &= !code[i].outputCR1 || opWantsCR1;
|
||||
wantsFPRF &= !code[i].outputFPRF || opWantsFPRF;
|
||||
wantsCA &= !code[i].outputCA || opWantsCA;
|
||||
wantsCR0 &= !code[i].outputCR0;
|
||||
wantsCR1 &= !code[i].outputCR1;
|
||||
wantsPS1 &= !code[i].outputPS1;
|
||||
wantsFPRF &= !code[i].outputFPRF;
|
||||
}
|
||||
return address;
|
||||
}
|
||||
|
@ -33,12 +33,12 @@ struct CodeOp //16B
|
||||
bool isBranchTarget;
|
||||
bool wantsCR0;
|
||||
bool wantsCR1;
|
||||
bool wantsPS1;
|
||||
bool wantsFPRF;
|
||||
bool wantsCA;
|
||||
bool outputCR0;
|
||||
bool outputCR1;
|
||||
bool outputPS1;
|
||||
bool outputFPRF;
|
||||
bool outputCA;
|
||||
bool canEndBlock;
|
||||
bool skip; // followed BL-s for example
|
||||
};
|
||||
|
@ -397,7 +397,7 @@ static wxString NiceSizeFormat(u64 _size)
|
||||
// Find largest power of 2 less than _size.
|
||||
// div 10 to get largest named unit less than _size
|
||||
// 10 == log2(1024) (number of B in a KiB, KiB in a MiB, etc)
|
||||
const u64 unit = IntLog2(std::max<u64>(_size, 1)) / 10;
|
||||
const u64 unit = Log2(std::max<u64>(_size, 1)) / 10;
|
||||
const u64 unit_size = (1 << (unit * 10));
|
||||
|
||||
// mul 1000 for 3 decimal places, add 5 to round up, div 10 for 2 decimal places
|
||||
|
@ -23,7 +23,7 @@ static u32 genBuffer()
|
||||
}
|
||||
|
||||
StreamBuffer::StreamBuffer(u32 type, u32 size)
|
||||
: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(IntLog2(ROUND_UP_POW2(size) / SYNC_POINTS))
|
||||
: m_buffer(genBuffer()), m_buffertype(type), m_size(ROUND_UP_POW2(size)), m_bit_per_slot(Log2(ROUND_UP_POW2(size) / SYNC_POINTS))
|
||||
{
|
||||
m_iterator = 0;
|
||||
m_used_iterator = 0;
|
||||
|
@ -91,8 +91,8 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)
|
||||
|
||||
WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1));
|
||||
WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1);
|
||||
WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", IntLog2(samples));
|
||||
WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", IntLog2(blkH), ~(blkW - 1));
|
||||
WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples));
|
||||
WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1));
|
||||
if (samples == 1)
|
||||
{
|
||||
// 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments
|
||||
@ -100,7 +100,7 @@ static void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType)
|
||||
WRITE(p, " x_virtual_position = x_virtual_position << 1;\n");
|
||||
}
|
||||
WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1);
|
||||
WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", IntLog2(blkW), blkH - 1);
|
||||
WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1);
|
||||
|
||||
WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n");
|
||||
WRITE(p, " sampleUv.y = y_block_position + y_offset;\n");
|
||||
|
@ -44,17 +44,17 @@ TEST(MathUtil, IsSNAN)
|
||||
EXPECT_TRUE(MathUtil::IsSNAN(std::numeric_limits<double>::signaling_NaN()));
|
||||
}
|
||||
|
||||
TEST(MathUtil, IntLog2)
|
||||
TEST(MathUtil, Log2)
|
||||
{
|
||||
EXPECT_EQ(0, IntLog2(1));
|
||||
EXPECT_EQ(1, IntLog2(2));
|
||||
EXPECT_EQ(2, IntLog2(4));
|
||||
EXPECT_EQ(3, IntLog2(8));
|
||||
EXPECT_EQ(63, IntLog2(0x8000000000000000ull));
|
||||
EXPECT_EQ(0, Log2(1));
|
||||
EXPECT_EQ(1, Log2(2));
|
||||
EXPECT_EQ(2, Log2(4));
|
||||
EXPECT_EQ(3, Log2(8));
|
||||
EXPECT_EQ(63, Log2(0x8000000000000000ull));
|
||||
|
||||
// Rounding behavior.
|
||||
EXPECT_EQ(3, IntLog2(15));
|
||||
EXPECT_EQ(63, IntLog2(0xFFFFFFFFFFFFFFFFull));
|
||||
EXPECT_EQ(3, Log2(15));
|
||||
EXPECT_EQ(63, Log2(0xFFFFFFFFFFFFFFFFull));
|
||||
}
|
||||
|
||||
TEST(MathUtil, FlushToZero)
|
||||
|
Loading…
x
Reference in New Issue
Block a user