From d6987b98be7431921ee5bbf31bd201a02ec01ed0 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Tue, 29 Jun 2021 15:13:13 +0200 Subject: [PATCH 1/8] PPCAnalyst: Perform CR analysis for crXXX --- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 9cfbbdec6b..01feaffd45 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -572,11 +572,26 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock* block, CodeOp* code, code->wantsCR = BitSet8(0); if (opinfo->flags & FL_READ_ALL_CR) + { code->wantsCR = BitSet8(0xFF); + } else if (opinfo->flags & FL_READ_CRn) + { code->wantsCR[code->inst.CRFS] = true; + } else if (opinfo->flags & FL_READ_CR_BI) + { code->wantsCR[code->inst.BI] = true; + } + else if (opinfo->type == OpType::CR) + { + code->wantsCR[code->inst.CRBA >> 2] = true; + code->wantsCR[code->inst.CRBB >> 2] = true; + + // CR instructions only write to one bit of the destination CR, + // so treat the other three bits of the destination as inputs + code->wantsCR[code->inst.CRBD >> 2] = true; + } code->outputCR = BitSet8(0); if (opinfo->flags & FL_SET_ALL_CR) @@ -587,6 +602,8 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock* block, CodeOp* code, code->outputCR[0] = true; else if ((opinfo->flags & FL_SET_CR1) || ((opinfo->flags & FL_RC_BIT_F) && code->inst.Rc)) code->outputCR[1] = true; + else if (opinfo->type == OpType::CR) + code->outputCR[code->inst.CRBD >> 2] = true; code->wantsFPRF = (opinfo->flags & FL_READ_FPRF) != 0; code->outputFPRF = (opinfo->flags & FL_SET_FPRF) != 0; From 6cc4f593e54fbce4f9b8920ae4053549050e86a1 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Tue, 29 Jun 2021 15:18:55 +0200 Subject: [PATCH 2/8] PPCAnalyst: Add in-register/discard analysis for CR This brings the analysis done for condition registers more in line with the analysis done for GPRs and FPRs. This gets rid of the old wantsCR member, which wasn't actually used anyway. In case someone wants it again in the future, they can compute the bitwise inverse of crDiscardable. --- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 41 +++++++++++++------------ Source/Core/Core/PowerPC/PPCAnalyst.h | 6 +++- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 01feaffd45..2dd988d6c2 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -506,7 +506,7 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r // Reorder integer compares, rlwinm., and carry-affecting ops // (if we add more merged branch instructions, add them here!) if ((type == ReorderType::CROR && isCror(a)) || (type == ReorderType::Carry && isCarryOp(a)) || - (type == ReorderType::CMP && (isCmp(a) || a.outputCR[0]))) + (type == ReorderType::CMP && (isCmp(a) || a.crOut[0]))) { // once we're next to a carry instruction, don't move away! if (type == ReorderType::Carry && i != start) @@ -570,40 +570,40 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock* block, CodeOp* code, block->m_fpa->any = true; } - code->wantsCR = BitSet8(0); + code->crIn = BitSet8(0); if (opinfo->flags & FL_READ_ALL_CR) { - code->wantsCR = BitSet8(0xFF); + code->crIn = BitSet8(0xFF); } else if (opinfo->flags & FL_READ_CRn) { - code->wantsCR[code->inst.CRFS] = true; + code->crIn[code->inst.CRFS] = true; } else if (opinfo->flags & FL_READ_CR_BI) { - code->wantsCR[code->inst.BI] = true; + code->crIn[code->inst.BI] = true; } else if (opinfo->type == OpType::CR) { - code->wantsCR[code->inst.CRBA >> 2] = true; - code->wantsCR[code->inst.CRBB >> 2] = true; + code->crIn[code->inst.CRBA >> 2] = true; + code->crIn[code->inst.CRBB >> 2] = true; // CR instructions only write to one bit of the destination CR, // so treat the other three bits of the destination as inputs - code->wantsCR[code->inst.CRBD >> 2] = true; + code->crIn[code->inst.CRBD >> 2] = true; } - code->outputCR = BitSet8(0); + code->crOut = BitSet8(0); if (opinfo->flags & FL_SET_ALL_CR) - code->outputCR = BitSet8(0xFF); + code->crOut = BitSet8(0xFF); else if (opinfo->flags & FL_SET_CRn) - code->outputCR[code->inst.CRFD] = true; + code->crOut[code->inst.CRFD] = true; else if ((opinfo->flags & FL_SET_CR0) || ((opinfo->flags & FL_RC_BIT) && code->inst.Rc)) - code->outputCR[0] = true; + code->crOut[0] = true; else if ((opinfo->flags & FL_SET_CR1) || ((opinfo->flags & FL_RC_BIT_F) && code->inst.Rc)) - code->outputCR[1] = true; + code->crOut[1] = true; else if (opinfo->type == OpType::CR) - code->outputCR[code->inst.CRBD >> 2] = true; + code->crOut[code->inst.CRBD >> 2] = true; code->wantsFPRF = (opinfo->flags & FL_READ_FPRF) != 0; code->outputFPRF = (opinfo->flags & FL_SET_FPRF) != 0; @@ -972,9 +972,9 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, // Scan for flag dependencies; assume the next block (or any branch that can leave the block) // wants flags, to be safe. - BitSet8 wantsCR = BitSet8(0xFF); bool wantsFPRF = true; bool wantsCA = true; + BitSet8 crInUse, crDiscardable; BitSet32 gprBlockInputs, gprInUse, fprInUse, gprDiscardable, fprDiscardable, fprInXmm; for (int i = block->m_num_instructions - 1; i >= 0; i--) { @@ -991,27 +991,26 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, const bool hle = !!HLE::TryReplaceFunction(op.address); const bool may_exit_block = hle || op.canEndBlock || op.canCauseException; - const BitSet8 opWantsCR = op.wantsCR; const bool opWantsFPRF = op.wantsFPRF; const bool opWantsCA = op.wantsCA; - op.wantsCR = wantsCR | BitSet8(may_exit_block ? 0xFF : 0); op.wantsFPRF = wantsFPRF || may_exit_block; op.wantsCA = wantsCA || may_exit_block; - wantsCR |= opWantsCR | BitSet8(may_exit_block ? 0xFF : 0); wantsFPRF |= opWantsFPRF || may_exit_block; wantsCA |= opWantsCA || may_exit_block; - wantsCR &= ~op.outputCR | opWantsCR; wantsFPRF &= !op.outputFPRF || opWantsFPRF; wantsCA &= !op.outputCA || opWantsCA; op.gprInUse = gprInUse; op.fprInUse = fprInUse; + op.crInUse = crInUse; op.gprDiscardable = gprDiscardable; op.fprDiscardable = fprDiscardable; + op.crDiscardable = crDiscardable; op.fprInXmm = fprInXmm; gprBlockInputs &= ~op.regsOut; gprBlockInputs |= op.regsIn; gprInUse |= op.regsIn | op.regsOut; fprInUse |= op.fregsIn | op.GetFregsOut(); + crInUse |= op.crIn | op.crOut; if (strncmp(op.opinfo->opname, "stfd", 4)) fprInXmm |= op.fregsIn; @@ -1023,11 +1022,13 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, fprInXmm = BitSet32{}; gprDiscardable = BitSet32{}; fprDiscardable = BitSet32{}; + crDiscardable = BitSet8{}; } else if (op.canEndBlock || op.canCauseException) { gprDiscardable = BitSet32{}; fprDiscardable = BitSet32{}; + crDiscardable = BitSet8{}; } else { @@ -1035,6 +1036,8 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, gprDiscardable &= ~op.regsIn; fprDiscardable |= op.GetFregsOut(); fprDiscardable &= ~op.fregsIn; + crDiscardable |= op.crOut; + crDiscardable &= ~op.crIn; } } diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 1c24120cc9..1bd25ae476 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -32,10 +32,12 @@ struct CodeOp // 16B const GekkoOPInfo* opinfo = nullptr; u32 address = 0; u32 branchTo = 0; // if UINT32_MAX, not a branch - BitSet32 regsOut; BitSet32 regsIn; + BitSet32 regsOut; BitSet32 fregsIn; s8 fregOut = 0; + BitSet8 crIn; + BitSet8 crOut; bool isBranchTarget = false; bool branchUsesCtr = false; bool branchIsIdleLoop = false; @@ -50,6 +52,8 @@ struct CodeOp // 16B bool canCauseException = false; bool skipLRStack = false; bool skip = false; // followed BL-s for example + BitSet8 crInUse; + BitSet8 crDiscardable; // which registers are still needed after this instruction in this block BitSet32 fprInUse; BitSet32 gprInUse; From 8e9609df6e031568537550d9d193aaa650de3eed Mon Sep 17 00:00:00 2001 From: JosJuice Date: Tue, 29 Jun 2021 16:55:05 +0200 Subject: [PATCH 3/8] JitArm64: Add flush/discard support for condition registers By flushing the condition registers as soon as we no longer need them, we reduce the register pressure. --- Source/Core/Core/PowerPC/JitArm64/Jit.cpp | 3 +++ .../PowerPC/JitArm64/JitArm64_RegCache.cpp | 21 +++++++++++++++++++ .../Core/PowerPC/JitArm64/JitArm64_RegCache.h | 3 +++ 3 files changed, 27 insertions(+) diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp index a2dc1cb6b7..9433e48b49 100644 --- a/Source/Core/Core/PowerPC/JitArm64/Jit.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/Jit.cpp @@ -205,6 +205,7 @@ void JitArm64::FallBackToInterpreter(UGeckoInstruction inst) // we must mark them as no longer discarded gpr.ResetRegisters(js.op->regsOut); fpr.ResetRegisters(js.op->GetFregsOut()); + gpr.ResetCRRegisters(js.op->crOut); if (js.op->opinfo->flags & FL_ENDBLOCK) { @@ -1199,9 +1200,11 @@ bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) { gpr.DiscardRegisters(op.gprDiscardable); fpr.DiscardRegisters(op.fprDiscardable); + gpr.DiscardCRRegisters(op.crDiscardable); } gpr.StoreRegisters(~op.gprInUse & (op.regsIn | op.regsOut)); fpr.StoreRegisters(~op.fprInUse & (op.fregsIn | op.GetFregsOut())); + gpr.StoreCRRegisters(~op.crInUse & (op.crIn | op.crOut)); if (opinfo->flags & FL_LOADSTORE) ++js.numLoadStoreInst; diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index d4e933aaf3..5da455cb61 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -129,6 +129,8 @@ void Arm64RegCache::DiscardRegister(size_t preg) { OpArg& reg = m_guest_registers[preg]; ARM64Reg host_reg = reg.GetReg(); + if (!IsVector(host_reg)) + host_reg = EncodeRegTo32(host_reg); reg.Discard(); if (host_reg != ARM64Reg::INVALID_REG) @@ -288,6 +290,25 @@ void Arm64GPRCache::FlushCRRegisters(BitSet8 regs, bool maintain_state, ARM64Reg } } +void Arm64GPRCache::DiscardCRRegisters(BitSet8 regs) +{ + for (int i : regs) + DiscardRegister(GUEST_CR_OFFSET + i); +} + +void Arm64GPRCache::ResetCRRegisters(BitSet8 regs) +{ + for (int i : regs) + { + OpArg& reg = m_guest_registers[GUEST_CR_OFFSET + i]; + ARM64Reg host_reg = reg.GetReg(); + + ASSERT_MSG(DYNA_REC, host_reg == ARM64Reg::INVALID_REG, + "Attempted to reset a loaded register (did you mean to flush it?)"); + reg.Flush(); + } +} + void Arm64GPRCache::Flush(FlushMode mode, ARM64Reg tmp_reg) { FlushRegisters(BitSet32(0xFFFFFFFF), mode == FlushMode::MaintainState, tmp_reg); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h index fbe1f30134..dad43ce9ef 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h @@ -335,6 +335,9 @@ public: FlushCRRegisters(regs, false, tmp_reg); } + void DiscardCRRegisters(BitSet8 regs); + void ResetCRRegisters(BitSet8 regs); + protected: // Get the order of the host registers void GetAllocationOrder() override; From da63cee711de321d4f5eab1bb4f0e58493202398 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 16 Oct 2021 15:56:50 +0200 Subject: [PATCH 4/8] PPCAnalyst: More strict a_flags checks in CanSwapAdjacentOps If for instance instruction a sets OE and instruction b reads it, we shouldn't permit reordering. --- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 2dd988d6c2..1ea5f7dd3e 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -222,7 +222,7 @@ bool PPCAnalyzer::CanSwapAdjacentOps(const CodeOp& a, const CodeOp& b) const // [1] https://bugs.dolphin-emu.org/issues/5864#note-7 if (a.canCauseException || b.canCauseException) return false; - if (a_flags & FL_ENDBLOCK) + if (a_flags & (FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE)) return false; if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE)) return false; From 40e0dd93be50d8ff7301cf36ce3c3417a5ce690a Mon Sep 17 00:00:00 2001 From: JosJuice Date: Tue, 29 Jun 2021 15:30:12 +0200 Subject: [PATCH 5/8] PPCAnalyst: Allow more reordering of CR operations This is possible with the improved CR analysis implemented in the previous commits. --- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 20 +++++++++----------- Source/Core/Core/PowerPC/PPCTables.cpp | 18 +++++++++--------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 1ea5f7dd3e..e40076bcc8 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -224,9 +224,7 @@ bool PPCAnalyzer::CanSwapAdjacentOps(const CodeOp& a, const CodeOp& b) const return false; if (a_flags & (FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE)) return false; - if (b_flags & (FL_SET_CRx | FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE)) - return false; - if ((b_flags & (FL_RC_BIT | FL_RC_BIT_F)) && (b.inst.Rc)) + if (b_flags & (FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE)) return false; if ((a_flags & (FL_SET_CA | FL_READ_CA)) && (b_flags & (FL_SET_CA | FL_READ_CA))) return false; @@ -249,16 +247,22 @@ bool PPCAnalyzer::CanSwapAdjacentOps(const CodeOp& a, const CodeOp& b) const // Check that we have no register collisions. // That is, check that none of b's outputs matches any of a's inputs, // and that none of a's outputs matches any of b's inputs. - // The latter does not apply if a is a cmp, of course, but doesn't hurt to check. + // register collision: b outputs to one of a's inputs if (b.regsOut & a.regsIn) return false; + if (b.crOut & a.crIn) + return false; // register collision: a outputs to one of b's inputs if (a.regsOut & b.regsIn) return false; + if (a.crOut & b.crIn) + return false; // register collision: b outputs to one of a's outputs (overwriting it) if (b.regsOut & a.regsOut) return false; + if (b.crOut & a.crOut) + return false; return true; } @@ -451,12 +455,6 @@ void FindFunctions(const Core::CPUThreadGuard& guard, u32 startAddr, u32 endAddr unniceSize); } -static bool isCmp(const CodeOp& a) -{ - return (a.inst.OPCD == 10 || a.inst.OPCD == 11) || - (a.inst.OPCD == 31 && (a.inst.SUBOP10 == 0 || a.inst.SUBOP10 == 32)); -} - static bool isCarryOp(const CodeOp& a) { return (a.opinfo->flags & FL_SET_CA) && !(a.opinfo->flags & FL_SET_OE) && @@ -506,7 +504,7 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r // Reorder integer compares, rlwinm., and carry-affecting ops // (if we add more merged branch instructions, add them here!) if ((type == ReorderType::CROR && isCror(a)) || (type == ReorderType::Carry && isCarryOp(a)) || - (type == ReorderType::CMP && (isCmp(a) || a.crOut[0]))) + (type == ReorderType::CMP && (type == ReorderType::CMP && a.crOut))) { // once we're next to a carry instruction, don't move away! if (type == ReorderType::Carry && i != start) diff --git a/Source/Core/Core/PowerPC/PPCTables.cpp b/Source/Core/Core/PowerPC/PPCTables.cpp index 79ff5444a7..dc05b03979 100644 --- a/Source/Core/Core/PowerPC/PPCTables.cpp +++ b/Source/Core/Core/PowerPC/PPCTables.cpp @@ -224,17 +224,17 @@ constexpr std::array s_table4_3{{ constexpr std::array s_table19{{ {528, "bcctrx", OpType::Branch, 1, FL_ENDBLOCK | FL_READ_CR_BI}, {16, "bclrx", OpType::Branch, 1, FL_ENDBLOCK | FL_READ_CR_BI}, - {257, "crand", OpType::CR, 1, FL_EVIL}, - {129, "crandc", OpType::CR, 1, FL_EVIL}, - {289, "creqv", OpType::CR, 1, FL_EVIL}, - {225, "crnand", OpType::CR, 1, FL_EVIL}, - {33, "crnor", OpType::CR, 1, FL_EVIL}, - {449, "cror", OpType::CR, 1, FL_EVIL}, - {417, "crorc", OpType::CR, 1, FL_EVIL}, - {193, "crxor", OpType::CR, 1, FL_EVIL}, + {257, "crand", OpType::CR, 1, 0}, + {129, "crandc", OpType::CR, 1, 0}, + {289, "creqv", OpType::CR, 1, 0}, + {225, "crnand", OpType::CR, 1, 0}, + {33, "crnor", OpType::CR, 1, 0}, + {449, "cror", OpType::CR, 1, 0}, + {417, "crorc", OpType::CR, 1, 0}, + {193, "crxor", OpType::CR, 1, 0}, {150, "isync", OpType::InstructionCache, 1, FL_EVIL}, - {0, "mcrf", OpType::System, 1, FL_EVIL | FL_SET_CRn | FL_READ_CRn}, + {0, "mcrf", OpType::System, 1, FL_SET_CRn | FL_READ_CRn}, {50, "rfi", OpType::System, 2, FL_ENDBLOCK | FL_CHECKEXCEPTIONS | FL_PROGRAMEXCEPTION}, }}; From 96d622bb61171b01bc3644f7d06a3ecbd25753f7 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 16 Oct 2021 16:53:05 +0200 Subject: [PATCH 6/8] PPCAnalyst: Run cror reordering after cmp reordering We would rather have cror be close to the cmp than the branch. --- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index e40076bcc8..6c850ea297 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -542,11 +542,6 @@ void PPCAnalyzer::ReorderInstructionsCore(u32 instructions, CodeOp* code, bool r void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp* code) const { - // Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more - // picky about this, but cror seems to almost solely be used for this purpose in real code. - // Additionally, the other boolean ops seem to almost never be used. - if (HasOption(OPTION_CROR_MERGE)) - ReorderInstructionsCore(instructions, code, true, ReorderType::CROR); // For carry, bubble instructions *towards* each other; one direction often isn't enough // to get pairs like addc/adde next to each other. if (HasOption(OPTION_CARRY_MERGE)) @@ -554,8 +549,16 @@ void PPCAnalyzer::ReorderInstructions(u32 instructions, CodeOp* code) const ReorderInstructionsCore(instructions, code, false, ReorderType::Carry); ReorderInstructionsCore(instructions, code, true, ReorderType::Carry); } + + // Reorder instructions which write to CR (typically compare instructions) towards branches. if (HasOption(OPTION_BRANCH_MERGE)) ReorderInstructionsCore(instructions, code, false, ReorderType::CMP); + + // Reorder cror instructions upwards (e.g. towards an fcmp). Technically we should be more + // picky about this, but cror seems to almost solely be used for this purpose in real code. + // Additionally, the other boolean ops seem to almost never be used. + if (HasOption(OPTION_CROR_MERGE)) + ReorderInstructionsCore(instructions, code, true, ReorderType::CROR); } void PPCAnalyzer::SetInstructionStats(CodeBlock* block, CodeOp* code, From f494a3d9e8ec60fafdc600f5c4507d97b3053ec8 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Tue, 29 Jun 2021 15:41:13 +0200 Subject: [PATCH 7/8] PPCAnalyst: Remove CanSwapAdjacentOps's OPCD check Other than the CR instructions, which we now analyze properly, all the covered instructions are not integer operations and also have either FL_ENDBLOCK or FL_EVIL set, so there are two other checks in CanSwapAdjacentOps that will reject them. --- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 6c850ea297..e2a4fc519c 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -229,17 +229,6 @@ bool PPCAnalyzer::CanSwapAdjacentOps(const CodeOp& a, const CodeOp& b) const if ((a_flags & (FL_SET_CA | FL_READ_CA)) && (b_flags & (FL_SET_CA | FL_READ_CA))) return false; - switch (b.inst.OPCD) - { - case 16: - case 18: - // branches. Do not swap. - case 17: // sc - case 46: // lmw - case 19: // table19 - lots of tricky stuff - return false; - } - // For now, only integer ops are acceptable. if (b_info->type != OpType::Integer) return false; From 80171adf1ec33dc0199d471c5cde2987e1536637 Mon Sep 17 00:00:00 2001 From: JosJuice Date: Sat, 16 Oct 2021 16:42:39 +0200 Subject: [PATCH 8/8] PPCTables: Retire FL_EVIL FL_EVIL is only used for blocking instructions from being reordered. There are three types of instructions which have FL_EVIL set: 1. CR operations. The previous commits improved our CR analysis and removed FL_EVIL from these instructions. 2. Load/store operations. These are always blocked from reordering due to always having canCauseException set. 3. isync. I don't know if we actually need to prevent reordering around this one, since as far as I know we only do reorderings that are guaranteed to not change the behavior of the program. But just in case, I've renamed FL_EVIL to FL_NO_REORDER instead of removing it entirely, so that it can be used for this instruction. --- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 4 ++-- Source/Core/Core/PowerPC/PPCTables.cpp | 18 +++++++++--------- Source/Core/Core/PowerPC/PPCTables.h | 23 +++++++++++------------ 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index e2a4fc519c..d7c4a09a64 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -222,9 +222,9 @@ bool PPCAnalyzer::CanSwapAdjacentOps(const CodeOp& a, const CodeOp& b) const // [1] https://bugs.dolphin-emu.org/issues/5864#note-7 if (a.canCauseException || b.canCauseException) return false; - if (a_flags & (FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE)) + if (a_flags & (FL_ENDBLOCK | FL_TIMER | FL_NO_REORDER | FL_SET_OE)) return false; - if (b_flags & (FL_ENDBLOCK | FL_TIMER | FL_EVIL | FL_SET_OE)) + if (b_flags & (FL_ENDBLOCK | FL_TIMER | FL_NO_REORDER | FL_SET_OE)) return false; if ((a_flags & (FL_SET_CA | FL_READ_CA)) && (b_flags & (FL_SET_CA | FL_READ_CA))) return false; diff --git a/Source/Core/Core/PowerPC/PPCTables.cpp b/Source/Core/Core/PowerPC/PPCTables.cpp index dc05b03979..9d70fb662b 100644 --- a/Source/Core/Core/PowerPC/PPCTables.cpp +++ b/Source/Core/Core/PowerPC/PPCTables.cpp @@ -87,8 +87,8 @@ constexpr std::array s_primary_table{{ {38, "stb", OpType::Store, 1, FL_IN_A0 | FL_IN_S | FL_LOADSTORE}, {39, "stbu", OpType::Store, 1, FL_OUT_A | FL_IN_A | FL_IN_S | FL_LOADSTORE}, - {46, "lmw", OpType::System, 11, FL_EVIL | FL_IN_A0 | FL_LOADSTORE}, - {47, "stmw", OpType::System, 11, FL_EVIL | FL_IN_A0 | FL_LOADSTORE}, + {46, "lmw", OpType::System, 11, FL_IN_A0 | FL_LOADSTORE}, + {47, "stmw", OpType::System, 11, FL_IN_A0 | FL_LOADSTORE}, {48, "lfs", OpType::LoadFP, 1, FL_OUT_FLOAT_D | FL_IN_A | FL_USE_FPU | FL_LOADSTORE}, {49, "lfsu", OpType::LoadFP, 1, @@ -233,7 +233,7 @@ constexpr std::array s_table19{{ {417, "crorc", OpType::CR, 1, 0}, {193, "crxor", OpType::CR, 1, 0}, - {150, "isync", OpType::InstructionCache, 1, FL_EVIL}, + {150, "isync", OpType::InstructionCache, 1, FL_NO_REORDER}, {0, "mcrf", OpType::System, 1, FL_SET_CRn | FL_READ_CRn}, {50, "rfi", OpType::System, 2, FL_ENDBLOCK | FL_CHECKEXCEPTIONS | FL_PROGRAMEXCEPTION}, @@ -324,12 +324,12 @@ constexpr std::array s_table31{{ {790, "lhbrx", OpType::Load, 1, FL_OUT_D | FL_IN_A0B | FL_LOADSTORE}, // Conditional load/store (Wii SMP) - {150, "stwcxd", OpType::Store, 1, FL_EVIL | FL_IN_S | FL_IN_A0B | FL_SET_CR0 | FL_LOADSTORE}, - {20, "lwarx", OpType::Load, 1, FL_EVIL | FL_OUT_D | FL_IN_A0B | FL_SET_CR0 | FL_LOADSTORE}, + {150, "stwcxd", OpType::Store, 1, FL_IN_S | FL_IN_A0B | FL_SET_CR0 | FL_LOADSTORE}, + {20, "lwarx", OpType::Load, 1, FL_OUT_D | FL_IN_A0B | FL_SET_CR0 | FL_LOADSTORE}, // load string (Inst these) - {533, "lswx", OpType::Load, 1, FL_EVIL | FL_IN_A0B | FL_OUT_D | FL_LOADSTORE}, - {597, "lswi", OpType::Load, 1, FL_EVIL | FL_IN_A0 | FL_OUT_D | FL_LOADSTORE}, + {533, "lswx", OpType::Load, 1, FL_IN_A0B | FL_OUT_D | FL_LOADSTORE}, + {597, "lswi", OpType::Load, 1, FL_IN_A0 | FL_OUT_D | FL_LOADSTORE}, // store word {151, "stwx", OpType::Store, 1, FL_IN_S | FL_IN_A0B | FL_LOADSTORE}, @@ -347,8 +347,8 @@ constexpr std::array s_table31{{ {662, "stwbrx", OpType::Store, 1, FL_IN_S | FL_IN_A0B | FL_LOADSTORE}, {918, "sthbrx", OpType::Store, 1, FL_IN_S | FL_IN_A0B | FL_LOADSTORE}, - {661, "stswx", OpType::Store, 1, FL_EVIL | FL_IN_A0B | FL_LOADSTORE}, - {725, "stswi", OpType::Store, 1, FL_EVIL | FL_IN_A0 | FL_LOADSTORE}, + {661, "stswx", OpType::Store, 1, FL_IN_A0B | FL_LOADSTORE}, + {725, "stswi", OpType::Store, 1, FL_IN_A0 | FL_LOADSTORE}, // fp load/store {535, "lfsx", OpType::LoadFP, 1, FL_OUT_FLOAT_D | FL_IN_A0B | FL_USE_FPU | FL_LOADSTORE}, diff --git a/Source/Core/Core/PowerPC/PPCTables.h b/Source/Core/Core/PowerPC/PPCTables.h index 00d4baaf9f..afb265f840 100644 --- a/Source/Core/Core/PowerPC/PPCTables.h +++ b/Source/Core/Core/PowerPC/PPCTables.h @@ -36,18 +36,17 @@ enum InstructionFlags : u64 FL_OUT_AD = FL_OUT_A | FL_OUT_D, FL_TIMER = (1ull << 15), // Used only for mftb. FL_CHECKEXCEPTIONS = (1ull << 16), // Used with rfi/rfid. - FL_EVIL = - (1ull << 17), // Historically used to refer to instructions that messed up Super Monkey Ball. - FL_USE_FPU = (1ull << 18), // Used to indicate a floating point instruction. - FL_LOADSTORE = (1ull << 19), // Used to indicate a load/store instruction. - FL_SET_FPRF = (1ull << 20), // Sets bits in the FPRF. - FL_READ_FPRF = (1ull << 21), // Reads bits from the FPRF. - FL_SET_OE = (1ull << 22), // Sets the overflow flag. - FL_IN_FLOAT_A = (1ull << 23), // frA is used as an input. - FL_IN_FLOAT_B = (1ull << 24), // frB is used as an input. - FL_IN_FLOAT_C = (1ull << 25), // frC is used as an input. - FL_IN_FLOAT_S = (1ull << 26), // frS is used as an input. - FL_IN_FLOAT_D = (1ull << 27), // frD is used as an input. + FL_NO_REORDER = (1ull << 17), // Instruction should not be reordered by our optimizations. + FL_USE_FPU = (1ull << 18), // Used to indicate a floating point instruction. + FL_LOADSTORE = (1ull << 19), // Used to indicate a load/store instruction. + FL_SET_FPRF = (1ull << 20), // Sets bits in the FPRF. + FL_READ_FPRF = (1ull << 21), // Reads bits from the FPRF. + FL_SET_OE = (1ull << 22), // Sets the overflow flag. + FL_IN_FLOAT_A = (1ull << 23), // frA is used as an input. + FL_IN_FLOAT_B = (1ull << 24), // frB is used as an input. + FL_IN_FLOAT_C = (1ull << 25), // frC is used as an input. + FL_IN_FLOAT_S = (1ull << 26), // frS is used as an input. + FL_IN_FLOAT_D = (1ull << 27), // frD is used as an input. FL_IN_FLOAT_AB = FL_IN_FLOAT_A | FL_IN_FLOAT_B, FL_IN_FLOAT_AC = FL_IN_FLOAT_A | FL_IN_FLOAT_C, FL_IN_FLOAT_ABC = FL_IN_FLOAT_A | FL_IN_FLOAT_B | FL_IN_FLOAT_C,