From 856ff296ae4a8cbcc879e1b6477d15d64860ddf3 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Fri, 13 Aug 2021 14:21:27 +0200
Subject: [PATCH 1/3] JitArm64: Optimize dcbx being called in a loop over a
 large memory region

JitArm64 port of 8b2f5d5.
---
 .../PowerPC/JitArm64/JitArm64_LoadStore.cpp | 107 ++++++++++++++++--
 1 file changed, 99 insertions(+), 8 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
index 29aae5ac59..46ecb5485c 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
@@ -553,21 +553,97 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   INSTRUCTION_START
   JITDISABLE(bJITLoadStoreOff);
 
+  u32 a = inst.RA, b = inst.RB;
+
+  // Check if the next instructions match a known looping pattern:
+  // - dcbx rX
+  // - addi rX,rX,32
+  // - bdnz+ -8
+  const bool make_loop = a == 0 && b != 0 && CanMergeNextInstructions(2) &&
+                         (js.op[1].inst.hex & 0xfc00'ffff) == 0x38000020 &&
+                         js.op[1].inst.RA_6 == b && js.op[1].inst.RD_2 == b &&
+                         js.op[2].inst.hex == 0x4200fff8;
+
   gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
+  if (make_loop)
+    gpr.Lock(ARM64Reg::W1);
+
+  ARM64Reg WA = ARM64Reg::W30;
+
+  if (make_loop)
+    gpr.BindToRegister(b, true);
+
+  ARM64Reg loop_counter = ARM64Reg::INVALID_REG;
+  if (make_loop)
+  {
+    // We'll execute somewhere between one single cacheline invalidation and however
+    // many are needed to reduce the downcount to zero, never exceeding the amount
+    // requested by the game. To stay consistent with the rest of the code we adjust
+    // the involved registers (CTR and Rb) by the amount of cache lines we invalidate
+    // minus one -- since we'll run the regular addi and bdnz afterwards! So if we
+    // invalidate a single cache line, we don't adjust the registers at all, if we
+    // invalidate 2 cachelines we adjust the registers by one step, and so on.
+
+    ARM64Reg reg_cycle_count = gpr.GetReg();
+    ARM64Reg reg_downcount = gpr.GetReg();
+    loop_counter = ARM64Reg::W1;
+    ARM64Reg WB = ARM64Reg::W0;
+
+    // Figure out how many loops we want to do.
+    const u8 cycle_count_per_loop =
+        js.op[0].opinfo->numCycles + js.op[1].opinfo->numCycles + js.op[2].opinfo->numCycles;
+
+    LDR(IndexType::Unsigned, reg_downcount, PPC_REG, PPCSTATE_OFF(downcount));
+    MOVI2R(WA, 0);
+    CMP(reg_downcount, 0);                                          // if (downcount <= 0)
+    FixupBranch downcount_is_zero_or_negative = B(CCFlags::CC_LE);  // only do 1 invalidation; else:
+    LDR(IndexType::Unsigned, loop_counter, PPC_REG, PPCSTATE_OFF_SPR(SPR_CTR));
+    MOVI2R(reg_cycle_count, cycle_count_per_loop);
+    SDIV(WB, reg_downcount, reg_cycle_count);  // WB = downcount / cycle_count
+    SUB(WA, loop_counter, 1);                  // WA = CTR - 1
+    // ^ Note that this CTR-1 implicitly handles the CTR == 0 case correctly.
+    CMP(WB, WA);
+    CSEL(WA, WB, WA, CCFlags::CC_LO);  // WA = min(WB, WA)
+
+    // WA now holds the amount of loops to execute minus 1, which is the amount we
+    // need to adjust downcount, CTR, and Rb by to exit the loop construct with the
+    // right values in those registers.
+
+    // CTR -= WA
+    SUB(loop_counter, loop_counter, WA);
+    STR(IndexType::Unsigned, loop_counter, PPC_REG, PPCSTATE_OFF_SPR(SPR_CTR));
+
+    // downcount -= (WA * reg_cycle_count)
+    MUL(WB, WA, reg_cycle_count);
+    // ^ Note that this cannot overflow because it's limited by (downcount/cycle_count).
+    SUB(reg_downcount, reg_downcount, WB);
+    STR(IndexType::Unsigned, reg_downcount, PPC_REG, PPCSTATE_OFF(downcount));
+
+    SetJumpTarget(downcount_is_zero_or_negative);
+
+    // Load the loop_counter register with the amount of invalidations to execute.
+    ADD(loop_counter, WA, 1);
+
+    gpr.Unlock(reg_cycle_count, reg_downcount);
+  }
 
   ARM64Reg effective_addr = ARM64Reg::W0;
   ARM64Reg physical_addr = MSR.IR ? gpr.GetReg() : effective_addr;
   ARM64Reg value = gpr.GetReg();
-  ARM64Reg WA = ARM64Reg::W30;
-
-  u32 a = inst.RA, b = inst.RB;
 
   if (a)
     ADD(effective_addr, gpr.R(a), gpr.R(b));
   else
     MOV(effective_addr, gpr.R(b));
 
+  if (make_loop)
+  {
+    // This is the best place to adjust Rb to what it should be since WA still has the
+    // adjusted loop count and we're done reading from Rb.
+    ADD(gpr.R(b), gpr.R(b), WA, ArithOption(WA, ShiftType::LSL, 5));  // Rb += (WA * 32)
+  }
+
   // Translate effective address to physical address.
+  const u8* loop_start = GetCodePtr();
   FixupBranch bat_lookup_failed;
   if (MSR.IR)
   {
@@ -586,9 +662,18 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   LSRV(value, value, WA);  // move current bit to bit 0
 
   FixupBranch bit_not_set = TBZ(value, 0);
-  FixupBranch far_addr = B();
+  FixupBranch invalidate_needed = B();
+  SetJumpTarget(bit_not_set);
+
+  if (make_loop)
+  {
+    ADD(effective_addr, effective_addr, 32);
+    SUBS(loop_counter, loop_counter, 1);
+    B(CCFlags::CC_NEQ, loop_start);
+  }
+
   SwitchToFarCode();
-  SetJumpTarget(far_addr);
+  SetJumpTarget(invalidate_needed);
   if (MSR.IR)
     SetJumpTarget(bat_lookup_failed);
 
@@ -598,12 +683,17 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   gprs_to_push[DecodeReg(physical_addr)] = false;
   gprs_to_push[DecodeReg(value)] = false;
   gprs_to_push[DecodeReg(WA)] = false;
+  if (make_loop)
+    gprs_to_push[DecodeReg(loop_counter)] = false;
 
   ABI_PushRegisters(gprs_to_push);
   m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30);
 
-  // W0 (the function call argument) was already set earlier
-  MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICacheLine);
+  // The function call arguments are already in the correct registers
+  if (make_loop)
+    MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICacheLines);
+  else
+    MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICacheLine);
   BLR(ARM64Reg::X8);
 
   m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30);
@@ -611,12 +701,13 @@ void JitArm64::dcbx(UGeckoInstruction inst)
 
   FixupBranch near_addr = B();
   SwitchToNearCode();
-  SetJumpTarget(bit_not_set);
   SetJumpTarget(near_addr);
 
   gpr.Unlock(effective_addr, value, WA);
   if (MSR.IR)
     gpr.Unlock(physical_addr);
+  if (make_loop)
+    gpr.Unlock(loop_counter);
 }
 
 void JitArm64::dcbt(UGeckoInstruction inst)

From 62e7b34c88f707c6ed04d6b36f0693c2cc0a2773 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Fri, 13 Aug 2021 14:32:25 +0200
Subject: [PATCH 2/3] JitArm64: Don't lock W30 in dcbx

If W30 is in use and we don't lock it, it will be pushed to the stack
before the BLR, so there isn't really any reason to lock W30.
---
 Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
index 46ecb5485c..a107a775c3 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
@@ -564,11 +564,11 @@ void JitArm64::dcbx(UGeckoInstruction inst)
                          js.op[1].inst.RA_6 == b && js.op[1].inst.RD_2 == b &&
                          js.op[2].inst.hex == 0x4200fff8;
 
-  gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
+  gpr.Lock(ARM64Reg::W0);
   if (make_loop)
     gpr.Lock(ARM64Reg::W1);
 
-  ARM64Reg WA = ARM64Reg::W30;
+  ARM64Reg WA = gpr.GetReg();
 
   if (make_loop)
     gpr.BindToRegister(b, true);
@@ -687,7 +687,7 @@ void JitArm64::dcbx(UGeckoInstruction inst)
     gprs_to_push[DecodeReg(loop_counter)] = false;
 
   ABI_PushRegisters(gprs_to_push);
-  m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30);
+  m_float_emit.ABI_PushRegisters(fprs_to_push, EncodeRegTo64(WA));
 
   // The function call arguments are already in the correct registers
   if (make_loop)
@@ -696,7 +696,7 @@ void JitArm64::dcbx(UGeckoInstruction inst)
     MOVP2R(ARM64Reg::X8, &JitInterface::InvalidateICacheLine);
   BLR(ARM64Reg::X8);
 
-  m_float_emit.ABI_PopRegisters(fprs_to_push, ARM64Reg::X30);
+  m_float_emit.ABI_PopRegisters(fprs_to_push, EncodeRegTo64(WA));
   ABI_PopRegisters(gprs_to_push);
 
   FixupBranch near_addr = B();

From 90fcaf7e964853c8120fdb21157854541189bae2 Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Fri, 13 Aug 2021 14:59:47 +0200
Subject: [PATCH 3/3] Jit: Use one less register in dcbx

We were using a "value" register to avoid clobbering physical_addr, but
this isn't actually needed anymore. The only bits we need from
physical_addr after we start clobbering it are bits 5-9, and those bits
are identical in effective_addr and physical_addr, so we can read them
from effective_addr instead.
---
 .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 43 ++++++++++---------
 .../PowerPC/JitArm64/JitArm64_LoadStore.cpp   | 18 +++-----
 2 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
index e00f2a7065..a3a428c889 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp
@@ -245,9 +245,7 @@ void Jit64::dcbx(UGeckoInstruction inst)
 
   RCOpArg Ra = inst.RA ? gpr.Use(inst.RA, RCMode::Read) : RCOpArg::Imm32(0);
   RCX64Reg Rb = gpr.Bind(inst.RB, make_loop ? RCMode::ReadWrite : RCMode::Read);
-  RCX64Reg tmp = gpr.Scratch();
-  RCX64Reg effective_address = gpr.Scratch();
-  RegCache::Realize(Ra, Rb, tmp, effective_address);
+  RegCache::Realize(Ra, Rb);
 
   RCX64Reg loop_counter;
   if (make_loop)
@@ -259,10 +257,10 @@ void Jit64::dcbx(UGeckoInstruction inst)
     // bdnz afterwards! So if we invalidate a single cache line, we don't adjust the registers at
     // all, if we invalidate 2 cachelines we adjust the registers by one step, and so on.
 
-    RCX64Reg& reg_cycle_count = tmp;
-    RCX64Reg& reg_downcount = effective_address;
+    RCX64Reg reg_cycle_count = gpr.Scratch();
+    RCX64Reg reg_downcount = gpr.Scratch();
     loop_counter = gpr.Scratch();
-    RegCache::Realize(loop_counter);
+    RegCache::Realize(reg_cycle_count, reg_downcount, loop_counter);
 
     // This must be true in order for us to pick up the DIV results and not trash any data.
     static_assert(RSCRATCH == Gen::EAX && RSCRATCH2 == Gen::EDX);
@@ -304,8 +302,8 @@ void Jit64::dcbx(UGeckoInstruction inst)
     LEA(32, loop_counter, MDisp(RSCRATCH2, 1));
   }
 
-  X64Reg value = RSCRATCH;
-  MOV_sum(32, value, Ra, Rb);
+  X64Reg addr = RSCRATCH;
+  MOV_sum(32, addr, Ra, Rb);
 
   if (make_loop)
   {
@@ -315,33 +313,36 @@ void Jit64::dcbx(UGeckoInstruction inst)
     ADD(32, R(Rb), R(RSCRATCH2));  // Rb += (RSCRATCH2 * 32)
   }
 
-  X64Reg addr = RSCRATCH2;
+  X64Reg tmp = RSCRATCH2;
+  RCX64Reg effective_address = gpr.Scratch();
+  RegCache::Realize(effective_address);
+
   FixupBranch bat_lookup_failed;
-  MOV(32, R(effective_address), R(value));
+  MOV(32, R(effective_address), R(addr));
   const u8* loop_start = GetCodePtr();
   if (MSR.IR)
   {
     // Translate effective address to physical address.
-    bat_lookup_failed = BATAddressLookup(value, tmp, PowerPC::ibat_table.data());
-    MOV(32, R(addr), R(effective_address));
-    AND(32, R(addr), Imm32(0x0001ffff));
-    AND(32, R(value), Imm32(0xfffe0000));
-    OR(32, R(value), R(addr));
+    bat_lookup_failed = BATAddressLookup(addr, tmp, PowerPC::ibat_table.data());
+    MOV(32, R(tmp), R(effective_address));
+    AND(32, R(tmp), Imm32(0x0001ffff));
+    AND(32, R(addr), Imm32(0xfffe0000));
+    OR(32, R(addr), R(tmp));
   }
 
-  MOV(32, R(addr), R(value));
   // Check whether a JIT cache line needs to be invalidated.
-  SHR(32, R(value), Imm8(5 + 5));  // >> 5 for cache line size, >> 5 for width of bitset
+  SHR(32, R(addr), Imm8(5 + 5));  // >> 5 for cache line size, >> 5 for width of bitset
   MOV(64, R(tmp), ImmPtr(GetBlockCache()->GetBlockBitSet()));
-  MOV(32, R(value), MComplex(tmp, value, SCALE_4, 0));
-  SHR(32, R(addr), Imm8(5));
-  BT(32, R(value), R(addr));
+  MOV(32, R(addr), MComplex(tmp, addr, SCALE_4, 0));
+  MOV(32, R(tmp), R(effective_address));
+  SHR(32, R(tmp), Imm8(5));
+  BT(32, R(addr), R(tmp));
 
   FixupBranch invalidate_needed = J_CC(CC_C, true);
 
   if (make_loop)
   {
     ADD(32, R(effective_address), Imm8(32));
-    MOV(32, R(value), R(effective_address));
+    MOV(32, R(addr), R(effective_address));
     SUB(32, R(loop_counter), Imm8(1));
     J_CC(CC_NZ, loop_start);
   }

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
index a107a775c3..fc4603811f 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
@@ -627,8 +627,7 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   }
 
   ARM64Reg effective_addr = ARM64Reg::W0;
-  ARM64Reg physical_addr = MSR.IR ? gpr.GetReg() : effective_addr;
-  ARM64Reg value = gpr.GetReg();
+  ARM64Reg physical_addr = gpr.GetReg();
 
   if (a)
     ADD(effective_addr, gpr.R(a), gpr.R(b));
@@ -653,15 +652,15 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   }
 
   // Check whether a JIT cache line needs to be invalidated.
-  LSR(value, physical_addr, 5 + 5);  // >> 5 for cache line size, >> 5 for width of bitset
+  LSR(physical_addr, physical_addr, 5 + 5);  // >> 5 for cache line size, >> 5 for width of bitset
   MOVP2R(EncodeRegTo64(WA), GetBlockCache()->GetBlockBitSet());
-  LDR(value, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(value), true));
+  LDR(physical_addr, EncodeRegTo64(WA), ArithOption(EncodeRegTo64(physical_addr), true));
 
-  LSR(WA, physical_addr, 5);  // mask sizeof cacheline, & 0x1f is the position within the bitset
+  LSR(WA, effective_addr, 5);  // mask sizeof cacheline, & 0x1f is the position within the bitset
 
-  LSRV(value, value, WA);  // move current bit to bit 0
+  LSRV(physical_addr, physical_addr, WA);  // move current bit to bit 0
 
-  FixupBranch bit_not_set = TBZ(value, 0);
+  FixupBranch bit_not_set = TBZ(physical_addr, 0);
   FixupBranch invalidate_needed = B();
   SetJumpTarget(bit_not_set);
 
@@ -681,7 +680,6 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   BitSet32 fprs_to_push = fpr.GetCallerSavedUsed();
   gprs_to_push[DecodeReg(effective_addr)] = false;
   gprs_to_push[DecodeReg(physical_addr)] = false;
-  gprs_to_push[DecodeReg(value)] = false;
   gprs_to_push[DecodeReg(WA)] = false;
   if (make_loop)
     gprs_to_push[DecodeReg(loop_counter)] = false;
@@ -703,9 +701,7 @@ void JitArm64::dcbx(UGeckoInstruction inst)
   SwitchToNearCode();
   SetJumpTarget(near_addr);
 
-  gpr.Unlock(effective_addr, value, WA);
-  if (MSR.IR)
-    gpr.Unlock(physical_addr);
+  gpr.Unlock(effective_addr, physical_addr, WA);
   if (make_loop)
     gpr.Unlock(loop_counter);
 }
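
Series notes (plain-text afterword; the C++ sketches below are standalone
illustrations, not Dolphin code, and compile on their own):

The make_loop check in patch 1 recognizes the addi/bdnz pair by raw
instruction encoding. A minimal sketch of where the 0x38000020 and
0x4200fff8 constants come from, rebuilt from the PowerPC field layout
(register number 6 is an arbitrary example):

#include <cstdint>
#include <cstdio>

int main()
{
  // addi rD,rA,simm: primary opcode 14 in bits 31..26, rD in bits 25..21,
  // rA in bits 20..16, 16-bit immediate in the low bits. For "addi r6,r6,32":
  const std::uint32_t addi = (14u << 26) | (6u << 21) | (6u << 16) | 32u;
  // Masking rD and rA away with 0xfc00'ffff keeps only "addi _,_,32":
  std::printf("%08x\n", addi & 0xfc00ffffu);  // prints 38000020

  // bc: primary opcode 16, BO in bits 25..21. BO = 16 means "decrement CTR,
  // branch if CTR != 0" (bdnz); the branch displacement -8 encodes as 0xfff8.
  const std::uint32_t bdnz = (16u << 26) | (16u << 21) | 0xfff8u;
  std::printf("%08x\n", bdnz);  // prints 4200fff8
  return 0;
}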
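
The register adjustments in patch 1 follow the comment block in the diff:
do as many invalidations as the remaining downcount allows, never more than
CTR requests, and adjust CTR, downcount, and Rb by one less than that so
the still-emitted addi/bdnz pair finishes with the right values. The same
math in plain C++ (names are illustrative, not Dolphin's):

#include <algorithm>
#include <cstdint>

struct LoopPlan
{
  std::uint32_t invalidations;  // cache lines to invalidate this time
  std::uint32_t adjust;         // subtracted from CTR, scaled into downcount/Rb
};

LoopPlan PlanDcbxLoop(std::int32_t downcount, std::uint32_t ctr,
                      std::uint32_t cycles_per_loop)
{
  if (downcount <= 0)
    return {1, 0};  // no downcount left: one invalidation, no adjustment

  // ctr - 1 also handles CTR == 0, which the hardware treats as 2^32.
  const std::uint32_t adjust =
      std::min(static_cast<std::uint32_t>(downcount) / cycles_per_loop, ctr - 1);

  // The emitted code then performs adjust + 1 invalidations and does:
  //   CTR -= adjust;  downcount -= adjust * cycles_per_loop;  Rb += adjust * 32;
  return {adjust + 1, adjust};
}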
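
Patch 3's claim that bits 5-9 are identical in effective_addr and
physical_addr follows from how the BAT result is assembled in the same
function: the low 17 bits are always taken from the effective address, and
the bit index into a 32-bit word of the block bitset is (addr >> 5) & 0x1f,
i.e. bits 5-9, well inside those preserved low bits. A standalone check
(the BAT lookup result here is a made-up value):

#include <cassert>
#include <cstdint>

int main()
{
  const std::uint32_t effective = 0x80345678;   // example effective address
  const std::uint32_t bat_result = 0x01740000;  // hypothetical BAT translation
  const std::uint32_t physical =
      (bat_result & 0xfffe0000) | (effective & 0x0001ffff);

  // The bitset bit index is the same whichever address we read it from.
  assert(((physical >> 5) & 0x1f) == ((effective >> 5) & 0x1f));
  return 0;
}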