Jit: Move rlwinmx and rlwnmx to ConstantPropagation

This commit is contained in:
JosJuice 2024-05-24 19:49:53 +02:00
parent 4898fa72ab
commit 0b6b494b8b
4 changed files with 95 additions and 101 deletions

View File

@@ -1979,112 +1979,99 @@ void Jit64::rlwinmx(UGeckoInstruction inst)
int a = inst.RA;
int s = inst.RS;
if (gpr.IsImm(s))
const bool left_shift = inst.SH && inst.MB == 0 && inst.ME == 31 - inst.SH;
const bool right_shift = inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH;
const bool field_extract = inst.SH && inst.ME == 31 && inst.MB > 32 - inst.SH;
const u32 mask = MakeRotationMask(inst.MB, inst.ME);
const u32 prerotate_mask = std::rotr(mask, inst.SH);
const bool simple_mask = mask == 0xff || mask == 0xffff;
const bool simple_prerotate_mask = prerotate_mask == 0xff || prerotate_mask == 0xffff;
// In case of a merged branch, track whether or not we've set flags.
// If not, we need to do a test later to get them.
bool needs_test = true;
// If we know the high bit can't be set, we can avoid doing a sign extend for flag storage.
bool needs_sext = true;
int mask_size = inst.ME - inst.MB + 1;
if (simple_mask && !(inst.SH & (mask_size - 1)) && !gpr.IsBound(s) && !gpr.IsImm(s))
{
u32 result = gpr.Imm32(s);
if (inst.SH != 0)
result = std::rotl(result, inst.SH);
result &= MakeRotationMask(inst.MB, inst.ME);
gpr.SetImmediate32(a, result);
if (inst.Rc)
ComputeRC(a);
// optimized case: byte/word extract from m_ppc_state
// Note: If a == s, calling Realize(Ra) will allocate a host register for Rs,
// so we have to get mem_source from Rs before calling Realize(Ra)
RCOpArg Rs = gpr.Use(s, RCMode::Read);
RegCache::Realize(Rs);
OpArg mem_source = Rs.Location();
if (inst.SH)
mem_source.AddMemOffset((32 - inst.SH) >> 3);
Rs.Unlock();
RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
RegCache::Realize(Ra);
MOVZX(32, mask_size, Ra, mem_source);
needs_sext = false;
}
else
{
const bool left_shift = inst.SH && inst.MB == 0 && inst.ME == 31 - inst.SH;
const bool right_shift = inst.SH && inst.ME == 31 && inst.MB == 32 - inst.SH;
const bool field_extract = inst.SH && inst.ME == 31 && inst.MB > 32 - inst.SH;
const u32 mask = MakeRotationMask(inst.MB, inst.ME);
const u32 prerotate_mask = std::rotr(mask, inst.SH);
const bool simple_mask = mask == 0xff || mask == 0xffff;
const bool simple_prerotate_mask = prerotate_mask == 0xff || prerotate_mask == 0xffff;
// In case of a merged branch, track whether or not we've set flags.
// If not, we need to do a test later to get them.
bool needs_test = true;
// If we know the high bit can't be set, we can avoid doing a sign extend for flag storage.
bool needs_sext = true;
int mask_size = inst.ME - inst.MB + 1;
RCOpArg Rs = gpr.UseNoImm(s, RCMode::Read);
RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
RegCache::Realize(Rs, Ra);
if (simple_mask && !(inst.SH & (mask_size - 1)) && !gpr.IsBound(s))
if (a != s && left_shift && Rs.IsSimpleReg() && inst.SH <= 3)
{
// optimized case: byte/word extract from m_ppc_state
// Note: If a == s, calling Realize(Ra) will allocate a host register for Rs,
// so we have to get mem_source from Rs before calling Realize(Ra)
RCOpArg Rs = gpr.Use(s, RCMode::Read);
RegCache::Realize(Rs);
OpArg mem_source = Rs.Location();
LEA(32, Ra, MScaled(Rs.GetSimpleReg(), SCALE_1 << inst.SH, 0));
}
// optimized case: byte/word extract plus rotate
else if (simple_prerotate_mask && !left_shift)
{
MOVZX(32, prerotate_mask == 0xff ? 8 : 16, Ra, Rs);
if (inst.SH)
mem_source.AddMemOffset((32 - inst.SH) >> 3);
Rs.Unlock();
ROL(32, Ra, Imm8(inst.SH));
needs_sext = (mask & 0x80000000) != 0;
}
// Use BEXTR where possible: Only AMD implements this in one uop
else if (field_extract && cpu_info.bBMI1 && cpu_info.vendor == CPUVendor::AMD)
{
MOV(32, R(RSCRATCH), Imm32((mask_size << 8) | (32 - inst.SH)));
BEXTR(32, Ra, Rs, RSCRATCH);
needs_sext = false;
}
else if (left_shift)
{
if (a != s)
MOV(32, Ra, Rs);
RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
RegCache::Realize(Ra);
MOVZX(32, mask_size, Ra, mem_source);
SHL(32, Ra, Imm8(inst.SH));
}
else if (right_shift)
{
if (a != s)
MOV(32, Ra, Rs);
SHR(32, Ra, Imm8(inst.MB));
needs_sext = false;
}
else
{
RCOpArg Rs = gpr.Use(s, RCMode::Read);
RCX64Reg Ra = gpr.Bind(a, RCMode::Write);
RegCache::Realize(Rs, Ra);
RotateLeft(32, Ra, Rs, inst.SH);
if (a != s && left_shift && Rs.IsSimpleReg() && inst.SH <= 3)
if (!(inst.MB == 0 && inst.ME == 31))
{
LEA(32, Ra, MScaled(Rs.GetSimpleReg(), SCALE_1 << inst.SH, 0));
}
// optimized case: byte/word extract plus rotate
else if (simple_prerotate_mask && !left_shift)
{
MOVZX(32, prerotate_mask == 0xff ? 8 : 16, Ra, Rs);
if (inst.SH)
ROL(32, Ra, Imm8(inst.SH));
needs_sext = (mask & 0x80000000) != 0;
}
// Use BEXTR where possible: Only AMD implements this in one uop
else if (field_extract && cpu_info.bBMI1 && cpu_info.vendor == CPUVendor::AMD)
{
MOV(32, R(RSCRATCH), Imm32((mask_size << 8) | (32 - inst.SH)));
BEXTR(32, Ra, Rs, RSCRATCH);
needs_sext = false;
}
else if (left_shift)
{
if (a != s)
MOV(32, Ra, Rs);
SHL(32, Ra, Imm8(inst.SH));
}
else if (right_shift)
{
if (a != s)
MOV(32, Ra, Rs);
SHR(32, Ra, Imm8(inst.MB));
needs_sext = false;
}
else
{
RotateLeft(32, Ra, Rs, inst.SH);
if (!(inst.MB == 0 && inst.ME == 31))
{
// we need flags if we're merging the branch
if (inst.Rc && CheckMergedBranch(0))
AND(32, Ra, Imm32(mask));
else
AndWithMask(Ra, mask);
needs_sext = inst.MB == 0;
needs_test = false;
}
// we need flags if we're merging the branch
if (inst.Rc && CheckMergedBranch(0))
AND(32, Ra, Imm32(mask));
else
AndWithMask(Ra, mask);
needs_sext = inst.MB == 0;
needs_test = false;
}
}
if (inst.Rc)
ComputeRC(a, needs_test, needs_sext);
}
if (inst.Rc)
ComputeRC(a, needs_test, needs_sext);
}
void Jit64::rlwimix(UGeckoInstruction inst)
@@ -2233,11 +2220,7 @@ void Jit64::rlwnmx(UGeckoInstruction inst)
int a = inst.RA, b = inst.RB, s = inst.RS;
const u32 mask = MakeRotationMask(inst.MB, inst.ME);
if (gpr.IsImm(b, s))
{
gpr.SetImmediate32(a, std::rotl(gpr.Imm32(s), gpr.Imm32(b) & 0x1F) & mask);
}
else if (gpr.IsImm(b))
if (gpr.IsImm(b))
{
u32 amount = gpr.Imm32(b) & 0x1f;
RCX64Reg Ra = gpr.Bind(a, RCMode::Write);

View File

@@ -674,15 +674,7 @@ void JitArm64::cmpli(UGeckoInstruction inst)
void JitArm64::rlwinmx_internal(UGeckoInstruction inst, u32 sh)
{
u32 a = inst.RA, s = inst.RS;
const u32 mask = MakeRotationMask(inst.MB, inst.ME);
if (gpr.IsImm(inst.RS))
{
gpr.SetImmediate(a, std::rotl(gpr.GetImm(s), sh) & mask);
if (inst.Rc)
ComputeRC0(gpr.GetImm(a));
return;
}
gpr.BindToRegister(a, a == s);

View File

@@ -5,6 +5,7 @@
#include <bit>
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/PPCTables.h"
namespace JitCommon
@@ -32,6 +33,13 @@ ConstantPropagationResult ConstantPropagation::EvaluateInstruction(UGeckoInstruc
case 14: // addi
case 15: // addis
return EvaluateAddImm(inst);
case 21: // rlwinmx
return EvaluateRlwinmxRlwnmx(inst, inst.SH);
case 23: // rlwnmx
if (HasGPR(inst.RB))
return EvaluateRlwinmxRlwnmx(inst, GetGPR(inst.RB) & 0x1F);
else
return {};
case 24: // ori
case 25: // oris
return EvaluateBitwiseImm(inst, BitOR);
@@ -61,6 +69,16 @@ ConstantPropagationResult ConstantPropagation::EvaluateAddImm(UGeckoInstruction
return ConstantPropagationResult(inst.RD, m_gpr_values[inst.RA] + immediate);
}
// Constant-folds rlwinmx/rlwnmx: when the value of RS is known, computes
// rotl(RS, shift) masked by MakeRotationMask(MB, ME) at JIT time instead of
// emitting code. `shift` is the rotation amount — inst.SH for rlwinmx, and
// (per the caller at opcode 23) GetGPR(inst.RB) & 0x1F for rlwnmx.
ConstantPropagationResult ConstantPropagation::EvaluateRlwinmxRlwnmx(UGeckoInstruction inst,
u32 shift) const
{
// If RS's value isn't tracked, the instruction can't be folded; an empty
// result tells the caller to emit the instruction normally.
if (!HasGPR(inst.RS))
return {};
const u32 mask = MakeRotationMask(inst.MB, inst.ME);
// Third argument forwards inst.Rc — presumably so the record bit (CR0
// update) is still honored for the folded result; confirm against
// ConstantPropagationResult's definition.
return ConstantPropagationResult(inst.RA, std::rotl(GetGPR(inst.RS), shift) & mask, inst.Rc);
}
ConstantPropagationResult ConstantPropagation::EvaluateBitwiseImm(UGeckoInstruction inst,
u32 (*do_op)(u32, u32)) const
{

View File

@@ -78,6 +78,7 @@ public:
private:
ConstantPropagationResult EvaluateAddImm(UGeckoInstruction inst) const;
ConstantPropagationResult EvaluateRlwinmxRlwnmx(UGeckoInstruction inst, u32 shift) const;
ConstantPropagationResult EvaluateBitwiseImm(UGeckoInstruction inst,
u32 (*do_op)(u32, u32)) const;
ConstantPropagationResult EvaluateTable31(UGeckoInstruction inst, u64 flags) const;