Arm64Emitter: Improve MOVI2R

More or less a complete rewrite of the function, which aims
to be at least as good for every given input, without
relying on special cases like the old implementation did.

In particular, we now have more extensive support for
MOVN, as mentioned in a TODO comment.
JosJuice 2021-01-01 20:43:34 +01:00
parent 4e107935ac
commit 0d5ed06daf
2 changed files with 152 additions and 87 deletions


@@ -8,6 +8,7 @@
#include <cstring>
#include <optional>
#include <tuple>
#include <utility>
#include <vector>
#include "Common/Align.h"
@@ -2003,94 +2004,155 @@ void ARM64XEmitter::ADRP(ARM64Reg Rd, s64 imm)
EncodeAddressInst(1, Rd, static_cast<s32>(imm >> 12));
}
// Fixed-capacity vector with inline storage, providing the small subset of
// std::vector's interface that MOVI2RImpl needs.
template <typename T, size_t MaxSize>
class SmallVector final
{
public:
SmallVector() = default;
explicit SmallVector(size_t size) : m_size(size) {}
void push_back(const T& x) { m_array[m_size++] = x; }
void push_back(T&& x) { m_array[m_size++] = std::move(x); }
template <typename... Args>
T& emplace_back(Args&&... args)
{
return m_array[m_size++] = T{std::forward<Args>(args)...};
}
T& operator[](size_t i) { return m_array[i]; }
const T& operator[](size_t i) const { return m_array[i]; }
size_t size() const { return m_size; }
bool empty() const { return m_size == 0; }
private:
std::array<T, MaxSize> m_array{};
size_t m_size = 0;
};
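A minimal usage sketch (illustrative only, not part of the commit): within its fixed capacity, the class mirrors the std::vector subset used below.

#include <cstddef>
#include <cstdio>

int main()
{
  SmallVector<int, 4> v;  // capacity fixed at compile time; no heap allocation
  v.push_back(1);
  v.emplace_back(2);
  for (std::size_t i = 0; i < v.size(); ++i)
    std::printf("v[%zu] = %d\n", i, v[i]);
}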
template <typename T>
void ARM64XEmitter::MOVI2RImpl(ARM64Reg Rd, T imm)
{
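// Candidate strategies for the first instruction of the sequence.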
enum class Approach
{
MOVZ,
MOVN,
ADR,
ADRP,
};
struct Part
{
Part() = default;
Part(u16 imm_, ShiftAmount shift_) : imm(imm_), shift(shift_) {}
u16 imm;
ShiftAmount shift;
};
constexpr size_t max_parts = sizeof(T) / 2;
SmallVector<Part, max_parts> best_parts;
Approach best_approach;
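// Record which 16-bit parts of imm differ from the given base and would
// therefore need patching afterwards; keep the cheapest candidate found.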
const auto try_base = [&](T base, Approach approach, bool first_time) {
SmallVector<Part, max_parts> parts;
for (size_t i = 0; i < max_parts; ++i)
{
const size_t shift = i * 16;
const u16 imm_shifted = static_cast<u16>(imm >> shift);
const u16 base_shifted = static_cast<u16>(base >> shift);
if (imm_shifted != base_shifted)
parts.emplace_back(imm_shifted, static_cast<ShiftAmount>(i));
}
if (first_time || parts.size() < best_parts.size())
{
best_parts = std::move(parts);
best_approach = approach;
}
};
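// ADR reaches PC +/- 1 MiB and ADRP reaches PC +/- 4 GiB (in 4 KiB pages);
// both take 21-bit signed offsets, hence the sign extension from bit 20.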
const auto sext_21_bit = [](u64 x) {
return static_cast<s64>((x & 0x1FFFFF) | (x & 0x100000 ? ~0x1FFFFF : 0));
};
const u64 pc = reinterpret_cast<u64>(GetCodePtr());
const s64 adrp_offset = sext_21_bit((imm >> 12) - (pc >> 12)) << 12;
const s64 adr_offset = sext_21_bit(imm - pc);
const u64 adrp_base = (pc & ~0xFFF) + adrp_offset;
const u64 adr_base = pc + adr_offset;
// First: Try approaches for which instruction_count = max(parts.size(), 1)
try_base(T(0), Approach::MOVZ, true);
try_base(~T(0), Approach::MOVN, false);
// Second: Try approaches for which instruction_count = parts.size() + 1
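// (the extra instruction being the ADR/ADRP itself)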
if constexpr (sizeof(T) == 8)
{
try_base(adrp_base, Approach::ADRP, false);
try_base(adr_base, Approach::ADR, false);
}
size_t parts_uploaded = 0;
// To kill any dependencies, we start with an instruction that overwrites the entire register
switch (best_approach)
{
case Approach::MOVZ:
if (best_parts.empty())
best_parts.emplace_back(u16(0), ShiftAmount::Shift0);
MOVZ(Rd, best_parts[0].imm, best_parts[0].shift);
++parts_uploaded;
break;
case Approach::MOVN:
if (best_parts.empty())
best_parts.emplace_back(u16(0xFFFF), ShiftAmount::Shift0);
MOVN(Rd, static_cast<u16>(~best_parts[0].imm), best_parts[0].shift);
++parts_uploaded;
break;
case Approach::ADR:
ADR(Rd, adr_offset);
break;
case Approach::ADRP:
ADRP(Rd, adrp_offset);
break;
}
// And then we use MOVK for the remaining parts
for (; parts_uploaded < best_parts.size(); ++parts_uploaded)
{
const Part& part = best_parts[parts_uploaded];
if (best_approach == Approach::ADRP && part.shift == ShiftAmount::Shift0)
{
// The combination of ADRP followed by ADD immediate is specifically optimized in hardware
ASSERT(part.imm == (adrp_base & 0xF000) + (part.imm & 0xFFF));
ADD(Rd, Rd, part.imm & 0xFFF);
}
else
{
MOVK(Rd, part.imm, part.shift);
}
}
}
template void ARM64XEmitter::MOVI2RImpl(ARM64Reg Rd, u64 imm);
template void ARM64XEmitter::MOVI2RImpl(ARM64Reg Rd, u32 imm);
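To make the cost comparison concrete, here is a self-contained sketch (not part of the commit) of the part counting that decides between the MOVZ and MOVN bases:

#include <cstdint>
#include <cstdio>

// Mirrors try_base: count the 16-bit parts of imm that differ from base.
// This is the instruction count of the MOVZ (base 0) and MOVN (base ~0)
// approaches, subject to a minimum of one instruction.
static int CountParts(std::uint64_t imm, std::uint64_t base)
{
  int parts = 0;
  for (int i = 0; i < 4; ++i)
  {
    if (static_cast<std::uint16_t>(imm >> (i * 16)) !=
        static_cast<std::uint16_t>(base >> (i * 16)))
      ++parts;
  }
  return parts;
}

int main()
{
  const std::uint64_t imm = 0xFFFFFFFFFFFF1234;
  std::printf("MOVZ parts: %d\n", CountParts(imm, 0));                  // 4
  std::printf("MOVN parts: %d\n", CountParts(imm, ~std::uint64_t{0}));  // 1
}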
void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm)
{
unsigned int parts = Is64Bit(Rd) ? 4 : 2;
BitSet32 upload_part(0);
// Always start with a movz! Kills the dependency on the register.
bool use_movz = true;
if (!imm)
{
// Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks
// clearer in disasm too.
MOVZ(Rd, 0, ShiftAmount::Shift0);
return;
}
if ((Is64Bit(Rd) && imm == std::numeric_limits<u64>::max()) ||
(!Is64Bit(Rd) && imm == std::numeric_limits<u32>::max()))
{
// Max unsigned value (or if signed, -1)
// Set to ~ZR
ARM64Reg ZR = Is64Bit(Rd) ? SP : WSP;
ORN(Rd, ZR, ZR, ArithOption(ZR, ShiftType::LSL, 0));
return;
}
// TODO: Make some more systematic use of MOVN, but this will take care of most cases.
// Small negative integer. Use MOVN
if (!Is64Bit(Rd) && (imm | 0xFFFF0000) == imm)
{
MOVN(Rd, ~imm, ShiftAmount::Shift0);
return;
}
// XXX: Use MOVN when possible.
// XXX: Optimize more
// XXX: Support rotating immediates to save instructions
for (unsigned int i = 0; i < parts; ++i)
{
if ((imm >> (i * 16)) & 0xFFFF)
upload_part[i] = 1;
}
u64 aligned_pc = (u64)GetCodePtr() & ~0xFFF;
s64 aligned_offset = (s64)imm - (s64)aligned_pc;
// The offset for ADR/ADRP is an s32, so make sure it can be represented as one
if (upload_part.Count() > 1 && std::abs(aligned_offset) < 0x7FFFFFFFLL)
{
// Immediate we are loading is within 4GB of our aligned range
// Most likely an address that we can load in one or two instructions
if (!(std::abs(aligned_offset) & 0xFFF))
{
// The offset is page-aligned, so a single ADRP is enough
ADRP(Rd, (s32)aligned_offset);
return;
}
else
{
// If the address is within ±1 MiB of PC, we can still load it in a single instruction
s64 offset = (s64)imm - (s64)GetCodePtr();
if (offset >= -0xFFFFF && offset <= 0xFFFFF)
{
ADR(Rd, (s32)offset);
return;
}
else
{
ADRP(Rd, (s32)(aligned_offset & ~0xFFF));
ADD(Rd, Rd, imm & 0xFFF);
return;
}
}
}
for (unsigned i = 0; i < parts; ++i)
{
if (use_movz && upload_part[i])
{
MOVZ(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
use_movz = false;
}
else
{
if (upload_part[i])
MOVK(Rd, (imm >> (i * 16)) & 0xFFFF, (ShiftAmount)i);
}
}
if (Is64Bit(Rd))
MOVI2RImpl<u64>(Rd, imm);
else
MOVI2RImpl<u32>(Rd, static_cast<u32>(imm));
}
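As a usage note, these are plausible sequences for a few sample immediates (register name illustrative, encodings unverified):

MOVI2R(X0, 0);                   // MOVZ X0, #0
MOVI2R(X0, 0xFFFFFFFFFFFF1234);  // MOVN X0, #0xEDCB (one instruction)
MOVI2R(X0, 0x0001000200030004);  // typically MOVZ followed by 3x MOVK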
bool ARM64XEmitter::MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2)


@@ -521,6 +521,9 @@ private:
void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm);
void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
template <typename T>
void MOVI2RImpl(ARM64Reg Rd, T imm);
protected:
void Write32(u32 value);
@@ -864,7 +867,7 @@ public:
void ADR(ARM64Reg Rd, s32 imm);
void ADRP(ARM64Reg Rd, s64 imm);
// Wrapper around ADR/ADRP/MOVZ/MOVN/MOVK
void MOVI2R(ARM64Reg Rd, u64 imm);
bool MOVI2R2(ARM64Reg Rd, u64 imm1, u64 imm2);
template <class P>