mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-02-19 02:36:27 +01:00
Merge pull request #4735 from degasus/jitcache
Jit64: Enable branch following.
This commit is contained in:
commit
5da565a1a1
@ -443,6 +443,16 @@ void XEmitter::CALL(const void* fnptr)
|
|||||||
Write32(u32(distance));
|
Write32(u32(distance));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits a 5-byte CALL (opcode 0xE8 followed by a 32-bit displacement) whose
// displacement is written as a zero placeholder, and returns a FixupBranch
// describing it so the caller can fill in the real target later (FakeBLCall
// in this commit does so via SetJumpTarget()).
FixupBranch XEmitter::CALL()
|
||||||
|
{
|
||||||
|
FixupBranch branch;
|
||||||
|
// NOTE(review): presumably 1 selects a 32-bit displacement fixup; confirm
// against the FixupBranch definition, which is not visible here.
branch.type = 1;
|
||||||
|
// Recorded before anything is written: current position plus the 1 opcode
// byte and 4 displacement bytes emitted below.
branch.ptr = code + 5;
|
||||||
|
// 0xE8 is the x86 near relative CALL opcode.
Write8(0xE8);
|
||||||
|
// Placeholder displacement, to be overwritten when the branch is fixed up.
Write32(0);
|
||||||
|
return branch;
|
||||||
|
}
|
||||||
|
|
||||||
FixupBranch XEmitter::J(bool force5bytes)
|
FixupBranch XEmitter::J(bool force5bytes)
|
||||||
{
|
{
|
||||||
FixupBranch branch;
|
FixupBranch branch;
|
||||||
|
@ -467,6 +467,7 @@ public:
|
|||||||
#undef CALL
|
#undef CALL
|
||||||
#endif
|
#endif
|
||||||
void CALL(const void* fnptr);
|
void CALL(const void* fnptr);
|
||||||
|
FixupBranch CALL();
|
||||||
void CALLptr(OpArg arg);
|
void CALLptr(OpArg arg);
|
||||||
|
|
||||||
FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
|
FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
|
||||||
|
@ -372,6 +372,21 @@ bool Jit64::Cleanup()
|
|||||||
return did_something;
|
return did_something;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits the host-stack bookkeeping for a branch-with-link that was followed
// inline instead of ending the block. Emits nothing when
// m_enable_blr_optimization is false.
//
// after: value that is pushed and also used as the destination of the exit
//        written below; the callers in this commit pass js.compilerPC + 4.
void Jit64::FakeBLCall(u32 after)
|
||||||
|
{
|
||||||
|
if (!m_enable_blr_optimization)
|
||||||
|
return;
|
||||||
|
|
||||||
|
// We may need to fake the BLR stack on inlined CALL instructions.
|
||||||
|
// Else we can't return to this location any more.
|
||||||
|
MOV(32, R(RSCRATCH2), Imm32(after));
|
||||||
|
PUSH(RSCRATCH2);
|
||||||
|
// CALL with a placeholder target; SetJumpTarget() at the end points it just
// past the exit written below, so the POP + exit are jumped over here.
FixupBranch skip_exit = CALL();
|
||||||
|
// NOTE(review): presumably only reached when a later return comes back to the
// address pushed by the CALL above; the POP then discards the value pushed
// before the CALL. Confirm against the BLR optimization's return path.
POP(RSCRATCH2);
|
||||||
|
JustWriteExit(after, false, 0);
|
||||||
|
SetJumpTarget(skip_exit);
|
||||||
|
}
|
||||||
|
|
||||||
void Jit64::WriteExit(u32 destination, bool bl, u32 after)
|
void Jit64::WriteExit(u32 destination, bool bl, u32 after)
|
||||||
{
|
{
|
||||||
if (!m_enable_blr_optimization)
|
if (!m_enable_blr_optimization)
|
||||||
@ -569,6 +584,7 @@ void Jit64::Jit(u32 em_address)
|
|||||||
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
|
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
|
||||||
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
|
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
|
||||||
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
|
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
|
||||||
|
analyzer.ClearOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
|
||||||
}
|
}
|
||||||
Trace();
|
Trace();
|
||||||
}
|
}
|
||||||
@ -973,6 +989,7 @@ void Jit64::EnableOptimization()
|
|||||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
|
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_MERGE);
|
||||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
|
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CROR_MERGE);
|
||||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
|
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
|
||||||
|
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Jit64::IntializeSpeculativeConstants()
|
void Jit64::IntializeSpeculativeConstants()
|
||||||
|
@ -85,6 +85,7 @@ public:
|
|||||||
|
|
||||||
// Utilities for use by opcodes
|
// Utilities for use by opcodes
|
||||||
|
|
||||||
|
void FakeBLCall(u32 after);
|
||||||
void WriteExit(u32 destination, bool bl = false, u32 after = 0);
|
void WriteExit(u32 destination, bool bl = false, u32 after = 0);
|
||||||
void JustWriteExit(u32 destination, bool bl, u32 after);
|
void JustWriteExit(u32 destination, bool bl, u32 after);
|
||||||
void WriteExitDestInRSCRATCH(bool bl = false, u32 after = 0);
|
void WriteExitDestInRSCRATCH(bool bl = false, u32 after = 0);
|
||||||
|
@ -74,6 +74,13 @@ void Jit64::bx(UGeckoInstruction inst)
|
|||||||
// Because PPCAnalyst::Flatten() merged the blocks.
|
// Because PPCAnalyst::Flatten() merged the blocks.
|
||||||
if (!js.isLastInstruction)
|
if (!js.isLastInstruction)
|
||||||
{
|
{
|
||||||
|
if (inst.LK && !js.op->skipLRStack)
|
||||||
|
{
|
||||||
|
// We have to fake the stack as the RET instruction was not
|
||||||
|
// found in the same block. This is a big overhead, but still
|
||||||
|
// better than calling the dispatcher.
|
||||||
|
FakeBLCall(js.compilerPC + 4);
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -131,6 +138,22 @@ void Jit64::bcx(UGeckoInstruction inst)
|
|||||||
if (inst.LK)
|
if (inst.LK)
|
||||||
MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));
|
MOV(32, PPCSTATE_LR, Imm32(js.compilerPC + 4));
|
||||||
|
|
||||||
|
// If this is not the last instruction of a block
|
||||||
|
// and it is an unconditional branch, we skip the rest of the process,
|
||||||
|
// Because PPCAnalyst::Flatten() merged the blocks.
|
||||||
|
if (!js.isLastInstruction && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
|
||||||
|
(inst.BO & BO_DONT_CHECK_CONDITION))
|
||||||
|
{
|
||||||
|
if (inst.LK && !js.op->skipLRStack)
|
||||||
|
{
|
||||||
|
// We have to fake the stack as the RET instruction was not
|
||||||
|
// found in the same block. This is a big overhead, but still
|
||||||
|
// better than calling the dispatcher.
|
||||||
|
FakeBLCall(js.compilerPC + 4);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
u32 destination;
|
u32 destination;
|
||||||
if (inst.AA)
|
if (inst.AA)
|
||||||
destination = SignExt16(inst.BD << 2);
|
destination = SignExt16(inst.BD << 2);
|
||||||
|
@ -55,6 +55,7 @@ void JitArm64::Init()
|
|||||||
code_block.m_fpa = &js.fpa;
|
code_block.m_fpa = &js.fpa;
|
||||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
|
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE);
|
||||||
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
|
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_CARRY_MERGE);
|
||||||
|
analyzer.SetOption(PPCAnalyst::PPCAnalyzer::OPTION_BRANCH_FOLLOW);
|
||||||
|
|
||||||
m_supports_cycle_counter = HasCycleCounters();
|
m_supports_cycle_counter = HasCycleCounters();
|
||||||
}
|
}
|
||||||
|
@ -76,9 +76,6 @@ void JitArm64::bx(UGeckoInstruction inst)
|
|||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
JITDISABLE(bJITBranchOff);
|
JITDISABLE(bJITBranchOff);
|
||||||
|
|
||||||
gpr.Flush(FlushMode::FLUSH_ALL);
|
|
||||||
fpr.Flush(FlushMode::FLUSH_ALL);
|
|
||||||
|
|
||||||
u32 destination;
|
u32 destination;
|
||||||
if (inst.AA)
|
if (inst.AA)
|
||||||
destination = SignExt26(inst.LI << 2);
|
destination = SignExt26(inst.LI << 2);
|
||||||
@ -93,6 +90,14 @@ void JitArm64::bx(UGeckoInstruction inst)
|
|||||||
gpr.Unlock(WA);
|
gpr.Unlock(WA);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!js.isLastInstruction)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
gpr.Flush(FlushMode::FLUSH_ALL);
|
||||||
|
fpr.Flush(FlushMode::FLUSH_ALL);
|
||||||
|
|
||||||
if (destination == js.compilerPC)
|
if (destination == js.compilerPC)
|
||||||
{
|
{
|
||||||
// make idle loops go faster
|
// make idle loops go faster
|
||||||
|
@ -32,8 +32,9 @@
|
|||||||
namespace PPCAnalyst
|
namespace PPCAnalyst
|
||||||
{
|
{
|
||||||
constexpr int CODEBUFFER_SIZE = 32000;
|
constexpr int CODEBUFFER_SIZE = 32000;
|
||||||
|
|
||||||
// 0 does not perform block merging
|
// 0 does not perform block merging
|
||||||
constexpr u32 FUNCTION_FOLLOWING_THRESHOLD = 16;
|
constexpr u32 BRANCH_FOLLOWING_THRESHOLD = 2;
|
||||||
|
|
||||||
constexpr u32 INVALID_BRANCH_TARGET = 0xFFFFFFFF;
|
constexpr u32 INVALID_BRANCH_TARGET = 0xFFFFFFFF;
|
||||||
|
|
||||||
@ -651,7 +652,8 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
|
|||||||
CodeOp* code = buffer->codebuffer;
|
CodeOp* code = buffer->codebuffer;
|
||||||
|
|
||||||
bool found_exit = false;
|
bool found_exit = false;
|
||||||
u32 return_address = 0;
|
bool found_call = false;
|
||||||
|
size_t caller = 0;
|
||||||
u32 numFollows = 0;
|
u32 numFollows = 0;
|
||||||
u32 num_inst = 0;
|
u32 num_inst = 0;
|
||||||
|
|
||||||
@ -686,50 +688,65 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
|
|||||||
|
|
||||||
bool conditional_continue = false;
|
bool conditional_continue = false;
|
||||||
|
|
||||||
// Do we inline leaf functions?
|
// TODO: Find the optimal value for BRANCH_FOLLOWING_THRESHOLD.
|
||||||
if (HasOption(OPTION_LEAF_INLINE))
|
// If it is small, the performance will be down.
|
||||||
|
// If it is big, the size of generated code will be big and
|
||||||
|
// cache clearing will happen many times.
|
||||||
|
if (HasOption(OPTION_BRANCH_FOLLOW) && numFollows < BRANCH_FOLLOWING_THRESHOLD)
|
||||||
{
|
{
|
||||||
if (inst.OPCD == 18 && blockSize > 1)
|
if (inst.OPCD == 18 && blockSize > 1)
|
||||||
{
|
{
|
||||||
// Is bx - should we inline? yes!
|
// Always follow BX instructions.
|
||||||
if (inst.AA)
|
// TODO: Loop unrolling might bloat the code size too much.
|
||||||
destination = SignExt26(inst.LI << 2);
|
// Enable it carefully.
|
||||||
else
|
follow = destination != block->m_address;
|
||||||
destination = address + SignExt26(inst.LI << 2);
|
destination = SignExt26(inst.LI << 2) + (inst.AA ? 0 : address);
|
||||||
if (destination != block->m_address)
|
if (inst.LK)
|
||||||
follow = true;
|
{
|
||||||
|
found_call = true;
|
||||||
|
caller = i;
|
||||||
}
|
}
|
||||||
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && (inst.BO & (1 << 4)) &&
|
}
|
||||||
(inst.BO & (1 << 2)) && return_address != 0)
|
else if (inst.OPCD == 16 && (inst.BO & BO_DONT_DECREMENT_FLAG) &&
|
||||||
|
(inst.BO & BO_DONT_CHECK_CONDITION) && blockSize > 1)
|
||||||
|
{
|
||||||
|
// Always follow unconditional BCX instructions, but they are very rare.
|
||||||
|
follow = true;
|
||||||
|
destination = SignExt16(inst.BD << 2) + (inst.AA ? 0 : address);
|
||||||
|
if (inst.LK)
|
||||||
|
{
|
||||||
|
found_call = true;
|
||||||
|
caller = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (inst.OPCD == 19 && inst.SUBOP10 == 16 && !inst.LK && found_call &&
|
||||||
|
(inst.BO & BO_DONT_DECREMENT_FLAG) && (inst.BO & BO_DONT_CHECK_CONDITION))
|
||||||
{
|
{
|
||||||
// bclrx with unconditional branch = return
|
// bclrx with unconditional branch = return
|
||||||
|
// Follow it if we can propagate the LR value of the last CALL instruction.
|
||||||
|
// Though it would be easy to track the upper level of call/return,
|
||||||
|
// we can't guarantee the LR value. The PPC ABI forces all functions to push
|
||||||
|
// the LR value on the stack as there are no spare registers. So we'd need
|
||||||
|
// to check that no store instruction aliases with the stack.
|
||||||
follow = true;
|
follow = true;
|
||||||
destination = return_address;
|
destination = code[caller].address + 4;
|
||||||
return_address = 0;
|
found_call = false;
|
||||||
|
code[i].skip = true;
|
||||||
|
|
||||||
if (inst.LK)
|
// Skip the RET, so also don't generate the stack entry for the BLR optimization.
|
||||||
return_address = address + 4;
|
code[caller].skipLRStack = true;
|
||||||
}
|
}
|
||||||
else if (inst.OPCD == 31 && inst.SUBOP10 == 467)
|
else if (inst.OPCD == 31 && inst.SUBOP10 == 467)
|
||||||
{
|
{
|
||||||
// mtspr
|
// mtspr, skip CALL/RET merging as LR is overwritten.
|
||||||
const u32 index = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
|
const u32 index = (inst.SPRU << 5) | (inst.SPRL & 0x1F);
|
||||||
if (index == SPR_LR)
|
if (index == SPR_LR)
|
||||||
{
|
{
|
||||||
// We give up to follow the return address
|
// We give up to follow the return address
|
||||||
// because we have to check the register usage.
|
// because we have to check the register usage.
|
||||||
return_address = 0;
|
found_call = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Find the optimal value for FUNCTION_FOLLOWING_THRESHOLD.
|
|
||||||
// If it is small, the performance will be down.
|
|
||||||
// If it is big, the size of generated code will be big and
|
|
||||||
// cache clearing will happen many times.
|
|
||||||
// TODO: Investigate the reason why
|
|
||||||
// "0" is fastest in some games, MP2 for example.
|
|
||||||
if (numFollows > FUNCTION_FOLLOWING_THRESHOLD)
|
|
||||||
follow = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (HasOption(OPTION_CONDITIONAL_CONTINUE))
|
if (HasOption(OPTION_CONDITIONAL_CONTINUE))
|
||||||
@ -759,27 +776,28 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!follow)
|
if (follow)
|
||||||
{
|
{
|
||||||
|
// Follow the unconditional branch.
|
||||||
|
numFollows++;
|
||||||
|
address = destination;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Just pick the next instruction
|
||||||
address += 4;
|
address += 4;
|
||||||
if (!conditional_continue && opinfo->flags & FL_ENDBLOCK) // right now we stop early
|
if (!conditional_continue && opinfo->flags & FL_ENDBLOCK) // right now we stop early
|
||||||
{
|
{
|
||||||
found_exit = true;
|
found_exit = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
if (conditional_continue)
|
||||||
// XXX: We don't support inlining yet.
|
|
||||||
#if 0
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
numFollows++;
|
// If we skip any conditional branch, we can't guarantee to get the matching CALL/RET pair.
|
||||||
// We don't "code[i].skip = true" here
|
// So we stop inlining the RET here and let the BLR optimization handle this case.
|
||||||
// because bx may store a certain value to the link register.
|
found_call = false;
|
||||||
// Instead, we skip a part of bx in Jit**::bx().
|
}
|
||||||
address = destination;
|
|
||||||
merged_addresses[size_of_merged_addresses++] = address;
|
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
block->m_num_instructions = num_inst;
|
block->m_num_instructions = num_inst;
|
||||||
|
@ -42,6 +42,7 @@ struct CodeOp // 16B
|
|||||||
bool outputFPRF;
|
bool outputFPRF;
|
||||||
bool outputCA;
|
bool outputCA;
|
||||||
bool canEndBlock;
|
bool canEndBlock;
|
||||||
|
bool skipLRStack;
|
||||||
bool skip; // followed BL-s for example
|
bool skip; // followed BL-s for example
|
||||||
// which registers are still needed after this instruction in this block
|
// which registers are still needed after this instruction in this block
|
||||||
BitSet32 fprInUse;
|
BitSet32 fprInUse;
|
||||||
@ -189,11 +190,11 @@ public:
|
|||||||
// Requires JIT support to be enabled.
|
// Requires JIT support to be enabled.
|
||||||
OPTION_CONDITIONAL_CONTINUE = (1 << 0),
|
OPTION_CONDITIONAL_CONTINUE = (1 << 0),
|
||||||
|
|
||||||
// If there is an unconditional branch that jumps to a leaf function then inline it.
|
// Try to inline unconditional branches/calls/returns.
|
||||||
|
// Also track the LR value to follow unconditional return instructions.
|
||||||
// Might require JIT intervention to support it correctly.
|
// Might require JIT intervention to support it correctly.
|
||||||
// Requires JITBLock support for inlined code
|
// Especially if the BLR optimization is used.
|
||||||
// XXX: NOT COMPLETE
|
OPTION_BRANCH_FOLLOW = (1 << 1),
|
||||||
OPTION_LEAF_INLINE = (1 << 1),
|
|
||||||
|
|
||||||
// Complex blocks support jumping backwards on to themselves.
|
// Complex blocks support jumping backwards on to themselves.
|
||||||
// Happens commonly in loops, pretty complex to support.
|
// Happens commonly in loops, pretty complex to support.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user