diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index c3cb492a5f..ce40123bf1 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include <array>
 #include

 #include "Common/Align.h"
@@ -1998,104 +1999,195 @@ void ARM64XEmitter::ADR(ARM64Reg Rd, s32 imm)
 {
   EncodeAddressInst(0, Rd, imm);
 }

-void ARM64XEmitter::ADRP(ARM64Reg Rd, s32 imm)
+void ARM64XEmitter::ADRP(ARM64Reg Rd, s64 imm)
 {
-  EncodeAddressInst(1, Rd, imm >> 12);
+  EncodeAddressInst(1, Rd, static_cast<s32>(imm >> 12));
 }

-// Wrapper around MOVZ+MOVK (and later MOVN)
-void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
+template <typename T, size_t max_size>
+class SmallVector final
 {
-  unsigned int parts = Is64Bit(Rd) ? 4 : 2;
-  BitSet32 upload_part(0);
+public:
+  SmallVector() = default;
+  explicit SmallVector(size_t size) : m_size(size) {}

-  // Always start with a movz! Kills the dependency on the register.
-  bool use_movz = true;
+  void push_back(const T& x) { m_array[m_size++] = x; }
+  void push_back(T&& x) { m_array[m_size++] = std::move(x); }

-  if (!imm)
+  template <typename... Args>
+  T& emplace_back(Args&&... args)
   {
-    // Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks
-    // clearer in disasm too.
-    MOVZ(Rd, 0, ShiftAmount::Shift0);
-    return;
+    return m_array[m_size++] = T{std::forward<Args>(args)...};
   }

-  if ((Is64Bit(Rd) && imm == std::numeric_limits<u64>::max()) ||
-      (!Is64Bit(Rd) && imm == std::numeric_limits<u32>::max()))
-  {
-    // Max unsigned value (or if signed, -1)
-    // Set to ~ZR
-    ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP;
-    ORN(Rd, ZR, ZR, ArithOption(ZR, ShiftType::LSL, 0));
-    return;
-  }
+  T& operator[](size_t i) { return m_array[i]; }
+  const T& operator[](size_t i) const { return m_array[i]; }

-  // TODO: Make some more systemic use of MOVN, but this will take care of most cases.
-  // Small negative integer. Use MOVN
-  if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm)
-  {
-    MOVN(Rd, ~imm, ShiftAmount::Shift0);
-    return;
-  }
+  size_t size() const { return m_size; }
+  bool empty() const { return m_size == 0; }

-  // XXX: Use MOVN when possible.
-  // XXX: Optimize more
-  // XXX: Support rotating immediates to save instructions
-  if (optimize)
+private:
+  std::array<T, max_size> m_array{};
+  size_t m_size = 0;
+};
+
+template <typename T>
+void ARM64XEmitter::MOVI2RImpl(ARM64Reg Rd, T imm)
+{
+  enum class Approach
   {
-    for (unsigned int i = 0; i < parts; ++i)
+    MOVZ,
+    MOVN,
+    ADR,
+    ADRP,
+    ORR,
+  };
+
+  struct Part
+  {
+    Part() = default;
+    Part(u16 imm_, ShiftAmount shift_) : imm(imm_), shift(shift_) {}
+
+    u16 imm;
+    ShiftAmount shift;
+  };
+
+  constexpr size_t max_parts = sizeof(T) / 2;
+
+  SmallVector<Part, max_parts> best_parts;
+  Approach best_approach;
+  u64 best_base;
+
+  const auto instructions_required = [](const SmallVector<Part, max_parts>& parts,
+                                        Approach approach) {
+    return parts.size() + (approach > Approach::MOVN);
+  };
+
+  const auto try_base = [&](T base, Approach approach, bool first_time) {
+    SmallVector<Part, max_parts> parts;
+
+    for (size_t i = 0; i < max_parts; ++i)
     {
-      if ((imm >> (i * 16)) & 0xFFFF)
-        upload_part[i] = 1;
+      const size_t shift = i * 16;
+      const u16 imm_shifted = static_cast<u16>(imm >> shift);
+      const u16 base_shifted = static_cast<u16>(base >> shift);
+      if (imm_shifted != base_shifted)
+        parts.emplace_back(imm_shifted, static_cast<ShiftAmount>(i));
     }
+
+    if (first_time ||
+        instructions_required(parts, approach) < instructions_required(best_parts, best_approach))
+    {
+      best_parts = std::move(parts);
+      best_approach = approach;
+      best_base = base;
+    }
+  };
+
+  // Try MOVZ/MOVN
+  try_base(T(0), Approach::MOVZ, true);
+  try_base(~T(0), Approach::MOVN, false);
+
+  // Try PC-relative approaches
+  const auto sext_21_bit = [](u64 x) {
+    return static_cast<s64>((x & 0x1FFFFF) | (x & 0x100000 ? ~0x1FFFFF : 0));
+  };
+  const u64 pc = reinterpret_cast<u64>(GetCodePtr());
+  const s64 adrp_offset = sext_21_bit((imm >> 12) - (pc >> 12)) << 12;
+  const s64 adr_offset = sext_21_bit(imm - pc);
+  const u64 adrp_base = (pc & ~0xFFF) + adrp_offset;
+  const u64 adr_base = pc + adr_offset;
+  if constexpr (sizeof(T) == 8)
+  {
+    try_base(adrp_base, Approach::ADRP, false);
+    try_base(adr_base, Approach::ADR, false);
   }
-  u64 aligned_pc = (u64)GetCodePtr() & ~0xFFF;
-  s64 aligned_offset = (s64)imm - (s64)aligned_pc;
-  // The offset for ADR/ADRP is an s32, so make sure it can be represented in that
-  if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL)
+
+  // Try ORR (or skip it if we already have a 1-instruction encoding - these tests are non-trivial)
+  if (instructions_required(best_parts, best_approach) > 1)
   {
-    // Immediate we are loading is within 4GB of our aligned range
-    // Most likely a address that we can load in one or two instructions
-    if (!(std::abs(aligned_offset) & 0xFFF))
+    if constexpr (sizeof(T) == 8)
     {
-      // Aligned ADR
-      ADRP(Rd, (s32)aligned_offset);
-      return;
+      for (u64 orr_imm : {(imm << 32) | (imm & 0x0000'0000'FFFF'FFFF),
+                          (imm & 0xFFFF'FFFF'0000'0000) | (imm >> 32),
+                          (imm << 48) | (imm & 0x0000'FFFF'FFFF'0000) | (imm >> 48)})
+      {
+        if (IsImmLogical(orr_imm, 64))
+          try_base(orr_imm, Approach::ORR, false);
+      }
     }
     else
     {
-      // If the address is within 1MB of PC we can load it in a single instruction still
-      s64 offset = (s64)imm - (s64)GetCodePtr();
-      if (offset >= -0xFFFFF && offset <= 0xFFFFF)
-      {
-        ADR(Rd, (s32)offset);
-        return;
-      }
-      else
-      {
-        ADRP(Rd, (s32)(aligned_offset & ~0xFFF));
-        ADD(Rd, Rd, imm & 0xFFF);
-        return;
-      }
+      if (IsImmLogical(imm, 32))
+        try_base(imm, Approach::ORR, false);
     }
   }
-  for (unsigned i = 0; i < parts; ++i)
+
+  size_t parts_uploaded = 0;
+
+  // To kill any dependencies, we start with an instruction that overwrites the entire register
+  switch (best_approach)
   {
-    if (use_movz && upload_part[i])
+  case Approach::MOVZ:
+    if (best_parts.empty())
+      best_parts.emplace_back(u16(0), ShiftAmount::Shift0);
+
+    MOVZ(Rd, best_parts[0].imm, best_parts[0].shift);
+    ++parts_uploaded;
+    break;
+
+  case Approach::MOVN:
+    if (best_parts.empty())
+      best_parts.emplace_back(u16(0xFFFF), ShiftAmount::Shift0);
+
+    MOVN(Rd, static_cast<u16>(~best_parts[0].imm), best_parts[0].shift);
+    ++parts_uploaded;
+    break;
+
+  case Approach::ADR:
+    ADR(Rd, adr_offset);
+    break;
+
+  case Approach::ADRP:
+    ADRP(Rd, adrp_offset);
+    break;
+
+  case Approach::ORR:
+    constexpr ARM64Reg zero_reg = sizeof(T) == 8 ? ZR : WZR;
+    const bool success = TryORRI2R(Rd, zero_reg, best_base);
+    ASSERT(success);
+    break;
+  }
+
+  // And then we use MOVK for the remaining parts
+  for (; parts_uploaded < best_parts.size(); ++parts_uploaded)
+  {
+    const Part& part = best_parts[parts_uploaded];
+
+    if (best_approach == Approach::ADRP && part.shift == ShiftAmount::Shift0)
     {
-      MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
-      use_movz = false;
+      // The combination of ADRP followed by ADD immediate is specifically optimized in hardware
+      ASSERT(part.imm == (adrp_base & 0xF000) + (part.imm & 0xFFF));
+      ADD(Rd, Rd, part.imm & 0xFFF);
     }
     else
     {
-      if (upload_part[i] || !optimize)
-        MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
+      MOVK(Rd, part.imm, part.shift);
     }
   }
 }
+
+template void ARM64XEmitter::MOVI2RImpl(ARM64Reg Rd, u64 imm);
+template void ARM64XEmitter::MOVI2RImpl(ARM64Reg Rd, u32 imm);
+
+void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm)
+{
+  if (Is64Bit(Rd))
+    MOVI2RImpl(Rd, imm);
+  else
+    MOVI2RImpl(Rd, static_cast<u32>(imm));
+}
+
 bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2)
 {
   // TODO: Also optimize for performance, not just for code size.
@@ -4271,7 +4363,7 @@ void ARM64XEmitter::CMPI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch)
   ADDI2R_internal(Is64Bit(Rn) ? ZR : WZR, Rn, imm, true, true, scratch);
 }

-bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm)
 {
   if (const auto result = IsImmArithmetic(imm))
   {
@@ -4283,7 +4375,7 @@ bool ARM64XEmitter::TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
   return false;
 }

-bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm)
 {
   if (const auto result = IsImmArithmetic(imm))
   {
@@ -4295,7 +4387,7 @@ bool ARM64XEmitter::TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
   return false;
 }

-bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm)
+bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u64 imm)
 {
   if (const auto result = IsImmArithmetic(imm))
   {
@@ -4307,9 +4399,9 @@ bool ARM64XEmitter::TryCMPI2R(ARM64Reg Rn, u32 imm)
   return false;
 }

-bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm)
 {
-  if (const auto result = IsImmLogical(imm, 32))
+  if (const auto result = IsImmLogical(imm, Is64Bit(Rd) ? 64 : 32))
   {
     const auto& [n, imm_s, imm_r] = *result;
     AND(Rd, Rn, imm_r, imm_s, n != 0);
@@ -4318,9 +4410,10 @@ bool ARM64XEmitter::TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
   return false;
 }

-bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+
+bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm)
 {
-  if (const auto result = IsImmLogical(imm, 32))
+  if (const auto result = IsImmLogical(imm, Is64Bit(Rd) ? 64 : 32))
   {
     const auto& [n, imm_s, imm_r] = *result;
     ORR(Rd, Rn, imm_r, imm_s, n != 0);
@@ -4329,9 +4422,10 @@ bool ARM64XEmitter::TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
   return false;
 }

-bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm)
+
+bool ARM64XEmitter::TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm)
 {
-  if (const auto result = IsImmLogical(imm, 32))
+  if (const auto result = IsImmLogical(imm, Is64Bit(Rd) ? 64 : 32))
   {
     const auto& [n, imm_s, imm_r] = *result;
     EOR(Rd, Rn, imm_r, imm_s, n != 0);
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index 7a43495aab..874f1d2fed 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -521,6 +521,9 @@ private:
   void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm);
   void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

+  template <typename T>
+  void MOVI2RImpl(ARM64Reg Rd, T imm);
+
 protected:
   void Write32(u32 value);

@@ -862,10 +865,10 @@ public:
   // Address of label/page PC-relative
   void ADR(ARM64Reg Rd, s32 imm);
-  void ADRP(ARM64Reg Rd, s32 imm);
+  void ADRP(ARM64Reg Rd, s64 imm);

-  // Wrapper around MOVZ+MOVK
-  void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true);
+  // Wrapper around ADR/ADRP/MOVZ/MOVN/MOVK
+  void MOVI2R(ARM64Reg Rd, u64 imm);
   bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2);
   template <class P>
   void MOVP2R(ARM64Reg Rd, P* ptr)
@@ -893,13 +896,13 @@ public:
   void SUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);
   void SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch = INVALID_REG);

-  bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
-  bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
-  bool TryCMPI2R(ARM64Reg Rn, u32 imm);
+  bool TryADDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
+  bool TrySUBI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
+  bool TryCMPI2R(ARM64Reg Rn, u64 imm);

-  bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
-  bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
-  bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
+  bool TryANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
+  bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);
+  bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm);

   // ABI related
   void ABI_PushRegisters(BitSet32 registers);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
index dbcf424aa2..6a4600d67c 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Integer.cpp
@@ -912,7 +912,7 @@ void JitArm64::subfex(UGeckoInstruction inst)
     ARM64Reg WA = gpr.GetReg();
     if (js.carryFlagSet)
     {
-      MOVI2R(WA, ~i + j, gpr.R(d));
+      MOVI2R(WA, ~i + j);
       ADC(gpr.R(d), WA, WZR);
     }
     else
diff --git a/Source/UnitTests/Core/PowerPC/JitArm64/MovI2R.cpp b/Source/UnitTests/Core/PowerPC/JitArm64/MovI2R.cpp
index 8f44259f8d..c21e47c03d 100644
--- a/Source/UnitTests/Core/PowerPC/JitArm64/MovI2R.cpp
+++ b/Source/UnitTests/Core/PowerPC/JitArm64/MovI2R.cpp
@@ -78,20 +78,44 @@ TEST(JitArm64, MovI2R_ADR)
 {
   TestMovI2R test;
   const u64 base = Common::BitCast<u64>(test.GetCodePtr());
+
+  // Test offsets around 0
   for (s64 i = -0x20000; i < 0x20000; i++)
   {
     const u64 offset = static_cast<u64>(i);
     test.Check64(base + offset);
   }
+
+  // Test offsets around the maximum
+  for (const s64 i : {-0x200000ll, 0x200000ll})
+  {
+    for (s64 j = -4; j < 4; j++)
+    {
+      const u64 offset = static_cast<u64>(i + j);
+      test.Check64(base + offset);
+    }
+  }
 }

 TEST(JitArm64, MovI2R_ADRP)
 {
   TestMovI2R test;
   const u64 base = Common::BitCast<u64>(test.GetCodePtr()) & ~0xFFF;
+
+  // Test offsets around 0
   for (s64 i = -0x20000; i < 0x20000; i++)
   {
     const u64 offset = static_cast<u64>(i) << 12;
     test.Check64(base + offset);
   }
+
+  // Test offsets around the maximum
+  for (const s64 i : {-0x100000000ll, -0x80000000ll, 0x80000000ll, 0x100000000ll})
+  {
+    for (s64 j = -4; j < 4; j++)
+    {
+      const u64 offset = static_cast<u64>(i + (j << 12));
+      test.Check64(base + offset);
+    }
+  }
 }
diff --git a/Source/UnitTests/UnitTests.vcxproj b/Source/UnitTests/UnitTests.vcxproj
index f758af5bed..230ac50412 100644
--- a/Source/UnitTests/UnitTests.vcxproj
+++ b/Source/UnitTests/UnitTests.vcxproj
@@ -78,6 +78,9 @@
+
+
+
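
Reviewer note: the heart of the new MOVI2RImpl is a single cost comparison. Each candidate "base" (all zeros for MOVZ, all ones for MOVN, a PC- or page-derived value for ADR/ADRP, a repeated logical pattern for ORR) is scored by how many 16-bit chunks of the immediate it fails to match, since every mismatched chunk costs one MOVK, and the cheapest base wins. The snippet below is a standalone, host-testable sketch of that chunk-counting idea for just the MOVZ/MOVN pair; it is not part of the patch and uses no Dolphin APIs, and MovCost plus the sample values are purely illustrative.

// Standalone sketch (not part of the patch): the chunk-counting idea behind
// MOVI2RImpl's MOVZ-vs-MOVN choice, written in plain C++ so it runs on any host.
#include <cstdint>
#include <cstdio>

// Number of MOV-family instructions emitted to build `imm` when starting from
// `base` (all zeros models MOVZ + MOVKs, all ones models MOVN + MOVKs). Each
// 16-bit chunk that differs from the base costs one instruction; if nothing
// differs, one instruction is still needed to overwrite the register.
static int MovCost(uint64_t imm, uint64_t base)
{
  int differing_chunks = 0;
  for (int shift = 0; shift < 64; shift += 16)
  {
    if (((imm >> shift) & 0xFFFF) != ((base >> shift) & 0xFFFF))
      ++differing_chunks;
  }
  return differing_chunks == 0 ? 1 : differing_chunks;
}

int main()
{
  const uint64_t values[] = {0x0000'0000'0000'0000, 0xFFFF'FFFF'FFFF'FFFF,
                             0xFFFF'FFFF'FFFF'0000, 0x0000'1234'0000'0000,
                             0x1234'5678'9ABC'DEF0};
  for (const uint64_t imm : values)
  {
    const int movz_cost = MovCost(imm, 0);             // MOVZ, then MOVKs
    const int movn_cost = MovCost(imm, ~uint64_t{0});  // MOVN, then MOVKs
    std::printf("%016llx: MOVZ path %d insn(s), MOVN path %d insn(s)\n",
                static_cast<unsigned long long>(imm), movz_cost, movn_cost);
  }
  return 0;
}

MOVN wins exactly when most chunks are 0xFFFF, which is why small negative constants now encode cheaply without the old 32-bit-only special case; in the patch the same comparison also covers the ADR, ADRP and ORR bases, charging those approaches one extra up-front instruction via instructions_required.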