Arm64Emitter: Improve MOVI2R

More or less a complete rewrite of the function, which aims
to be at least as good for every given input, without
relying on special cases like the old implementation did.

In particular, we now have more extensive support for
MOVN, as mentioned in a TODO comment.
JosJuice 2021-01-01 20:43:34 +01:00
parent 4e107935ac
commit 0d5ed06daf
2 changed files with 152 additions and 87 deletions


@@ -8,6 +8,7 @@
#include <cstring>
#include <optional>
#include <tuple>
#include <utility>
#include <vector>
#include "Common/Align.h"
@@ -2003,94 +2004,155 @@ void ARM64XEmitter::ADRP(ARM64Reg Rd, s64 imm)
EncodeAddressInst(1, Rd, static_cast<s32>(imm >> 12));
}
// Fixed-capacity vector with inline storage, providing the small subset of
// std::vector's interface that MOVI2RImpl needs.
template <typename T, size_t MaxSize>
class SmallVector final
{
public:
SmallVector() = default;
explicit SmallVector(size_t size) : m_size(size) {}
void push_back(const T& x) { m_array[m_size++] = x; }
void push_back(T&& x) { m_array[m_size++] = std::move(x); }
template <typename... Args>
T& emplace_back(Args&&... args)
{
return m_array[m_size++] = T{std::forward<Args>(args)...};
}
T& operator[](size_t i) { return m_array[i]; }
const T& operator[](size_t i) const { return m_array[i]; }
size_t size() const { return m_size; }
bool empty() const { return m_size == 0; }
private:
std::array<T, MaxSize> m_array{};
size_t m_size = 0;
};
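A minimal usage sketch (illustrative only, not part of the commit): within its fixed capacity, the class mirrors the std::vector subset used below.

#include <cstddef>
#include <cstdio>

int main()
{
  SmallVector<int, 4> v;  // capacity fixed at compile time; no heap allocation
  v.push_back(1);
  v.emplace_back(2);
  for (std::size_t i = 0; i < v.size(); ++i)
    std::printf("v[%zu] = %d\n", i, v[i]);
}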
template <typename T>
void ARM64XEmitter::MOVI2RImpl(ARM64Reg Rd, T imm)
{
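// Candidate strategies for the first instruction of the sequence.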
enum class Approach
{
MOVZ,
MOVN,
ADR,
ADRP,
};
struct Part
{
Part() = default;
Part(u16 imm_, ShiftAmount shift_) : imm(imm_), shift(shift_) {}
u16 imm;
ShiftAmount shift;
};
constexpr size_t max_parts = sizeof(T) / 2;
SmallVector<Part, max_parts> best_parts;
Approach best_approach;
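// Record which 16-bit parts of imm differ from the given base and would
// therefore need patching afterwards; keep the cheapest candidate found.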
const auto try_base = [&](T base, Approach approach, bool first_time) {
SmallVector<Part, max_parts> parts;
for (size_t i = 0; i < max_parts; ++i)
{
const size_t shift = i * 16;
const u16 imm_shifted = static_cast<u16>(imm >> shift);
const u16 base_shifted = static_cast<u16>(base >> shift);
if (imm_shifted != base_shifted)
parts.emplace_back(imm_shifted, static_cast<ShiftAmount>(i));
}
if (first_time || parts.size() < best_parts.size())
{
best_parts = std::move(parts);
best_approach = approach;
}
};
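// ADR reaches PC +/- 1 MiB and ADRP reaches PC +/- 4 GiB (in 4 KiB pages);
// both take 21-bit signed offsets, hence the sign extension from bit 20.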
const auto sext_21_bit = [](u64 x) {
return static_cast<s64>((x & 0x1FFFFF) | (x & 0x100000 ? ~0x1FFFFF : 0));
};
const u64 pc = reinterpret_cast<u64>(GetCodePtr());
const s64 adrp_offset = sext_21_bit((imm >> 12) - (pc >> 12)) << 12;
const s64 adr_offset = sext_21_bit(imm - pc);
const u64 adrp_base = (pc & ~0xFFF) + adrp_offset;
const u64 adr_base = pc + adr_offset;
// First: Try approaches for which instruction_count = max(parts.size(), 1)
try_base(T(0), Approach::MOVZ, true);
try_base(~T(0), Approach::MOVN, false);
// Second: Try approaches for which instruction_count = parts.size() + 1
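// (the extra instruction being the ADR/ADRP itself)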
if constexpr (sizeof(T) == 8)
{
try_base(adrp_base, Approach::ADRP, false);
try_base(adr_base, Approach::ADR, false);
}
size_t parts_uploaded = 0;
// To kill any dependencies, we start with an instruction that overwrites the entire register
switch (best_approach)
{
case Approach::MOVZ:
if (best_parts.empty())
best_parts.emplace_back(u16(0), ShiftAmount::Shift0);
MOVZ(Rd, best_parts[0].imm, best_parts[0].shift);
++parts_uploaded;
break;
case Approach::MOVN:
if (best_parts.empty())
best_parts.emplace_back(u16(0xFFFF), ShiftAmount::Shift0);
MOVN(Rd, static_cast<u16>(~best_parts[0].imm), best_parts[0].shift);
++parts_uploaded;
break;
case Approach::ADR:
ADR(Rd, adr_offset);
break;
case Approach::ADRP:
ADRP(Rd, adrp_offset);
break;
}
// And then we use MOVK for the remaining parts
for (; parts_uploaded < best_parts.size(); ++parts_uploaded)
{
const Part& part = best_parts[parts_uploaded];
if (best_approach == Approach::ADRP && part.shift == ShiftAmount::Shift0)
{
// The combination of ADRP followed by ADD immediate is specifically optimized in hardware
ASSERT(part.imm == (adrp_base & 0xF000) + (part.imm & 0xFFF));
ADD(Rd, Rd, part.imm & 0xFFF);
}
else
{
MOVK(Rd, part.imm, part.shift);
}
}
}
template void ARM64XEmitter::MOVI2RImpl(ARM64Reg Rd, u64 imm);
template void ARM64XEmitter::MOVI2RImpl(ARM64Reg Rd, u32 imm);
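To make the cost comparison concrete, here is a self-contained sketch (not part of the commit) of the part counting that decides between the MOVZ and MOVN bases:

#include <cstdint>
#include <cstdio>

// Mirrors try_base: count the 16-bit parts of imm that differ from base.
// This is the instruction count of the MOVZ (base 0) and MOVN (base ~0)
// approaches, subject to a minimum of one instruction.
static int CountParts(std::uint64_t imm, std::uint64_t base)
{
  int parts = 0;
  for (int i = 0; i < 4; ++i)
  {
    if (static_cast<std::uint16_t>(imm >> (i * 16)) !=
        static_cast<std::uint16_t>(base >> (i * 16)))
      ++parts;
  }
  return parts;
}

int main()
{
  const std::uint64_t imm = 0xFFFFFFFFFFFF1234;
  std::printf("MOVZ parts: %d\n", CountParts(imm, 0));                  // 4
  std::printf("MOVN parts: %d\n", CountParts(imm, ~std::uint64_t{0}));  // 1
}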
void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm)
{
unsigned int parts = Is64Bit(Rd) ? 4 : 2;
BitSet32 upload_part(0);
// Always start with a movz! Kills the dependency on the register.
bool use_movz = true;
if (!imm)
{
// Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks
// clearer in disasm too.
MOVZ(Rd, 0, ShiftAmount::Shift0);
return;
}
if ((Is64Bit(Rd) && imm == std::numeric_limits<u64>::max()) ||
(!Is64Bit(Rd) && imm == std::numeric_limits<u32>::max()))
{
// Max unsigned value (or if signed, -1)
// Set to ~ZR
ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP;
ORN(Rd, ZR, ZR, ArithOption(ZR, ShiftType::LSL, 0));
return;
}
// TODO: Make some more systematic use of MOVN, but this will take care of most cases.
// Small negative integer. Use MOVN
if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm)
{
MOVN(Rd, ~imm, ShiftAmount::Shift0);
return;
}
// XXX: Use MOVN when possible.
// XXX: Optimize more
// XXX: Support rotating immediates to save instructions
for (unsigned int i = 0; i < parts; ++i)
{
if ((imm >> (i * 16)) & 0xFFFF)
upload_part[i] = 1;
}
u64 aligned_pc = (u64)GetCodePtr() & ~0xFFF;
s64 aligned_offset = (s64)imm - (s64)aligned_pc;
// The offset for ADR/ADRP is an s32, so make sure it can be represented as one
if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL)
{
// Immediate we are loading is within 4GB of our aligned range
// Most likely an address that we can load in one or two instructions
if (!(std::abs(aligned_offset) & 0xFFF))
{
// The offset is page-aligned, so a single ADRP is enough
ADRP(Rd, (s32)aligned_offset);
return;
}
else
{
// If the address is within ±1 MiB of PC, we can still load it in a single instruction
s64 offset = (s64)imm - (s64)GetCodePtr();
if (offset >= -0xFFFFF && offset <= 0xFFFFF)
{
ADR(Rd, (s32)offset);
return;
}
else
{
ADRP(Rd, (s32)(aligned_offset & ~0xFFF));
ADD(Rd, Rd, imm & 0xFFF);
return;
}
}
}
for (unsigned i = 0; i < parts; ++i)
{
if (use_movz && upload_part[i])
{
MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
use_movz = false;
}
else
{
if (upload_part[i])
MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
}
}
if (Is64Bit(Rd))
MOVI2RImpl<u64>(Rd, imm);
else
MOVI2RImpl<u32>(Rd, static_cast<u32>(imm));
}
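As a usage note, these are plausible sequences for a few sample immediates (register name illustrative, encodings unverified):

MOVI2R(X0, 0);                   // MOVZ X0, #0
MOVI2R(X0, 0xFFFFFFFFFFFF1234);  // MOVN X0, #0xEDCB (one instruction)
MOVI2R(X0, 0x0001000200030004);  // typically MOVZ followed by 3x MOVK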
bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2)


@@ -521,6 +521,9 @@ private:
void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm);
void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
template <typename T>
void MOVI2RImpl(ARM64Reg Rd, T imm);
protected:
void Write32(u32 value);
@@ -864,7 +867,7 @@ public:
void ADR(ARM64Reg Rd, s32 imm);
void ADRP(ARM64Reg Rd, s64 imm);
// Wrapper around ADR/ADRP/MOVZ/MOVN/MOVK
void MOVI2R(ARM64Reg Rd, u64 imm);
bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2);
template <class P>