From c3bcc67653513b3dae7dec4df78699202363cb57 Mon Sep 17 00:00:00 2001
From: JosJuice <josjuice@gmail.com>
Date: Tue, 17 Aug 2021 19:57:06 +0200
Subject: [PATCH] PowerPC: Update FEX on FPSCR store instead of FPSCR load

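FEX is a summary bit derived from the FPSCR exception bits and their
enable bits. Recompute it (together with VX) whenever FPSCR is modified,
instead of when FPSCR is read by mffs/mcrfs, so that the value stored
in FPSCR is always up to date.

This is needed not only for the next commit, but also for
correctly emulating float instructions that write to CR1.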
---
 .../PowerPC/Interpreter/Interpreter_FPUtils.h |   8 +-
 .../Interpreter_SystemRegisters.cpp           |  30 +----
 Source/Core/Core/PowerPC/Jit64/Jit.h          |   3 +-
 .../PowerPC/Jit64/Jit_SystemRegisters.cpp     | 126 ++++++++++++++----
 Source/Core/Core/PowerPC/JitArm64/Jit.h       |   1 +
 .../JitArm64/JitArm64_SystemRegisters.cpp     | 108 +++++++++++----
 6 files changed, 200 insertions(+), 76 deletions(-)

diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
index c3e1d40d4d..b8860eabf5 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h
@@ -24,6 +24,12 @@ enum class FPCC
   FU = 1,  // ?
 };
 
+inline void UpdateFPExceptionSummary(UReg_FPSCR* fpscr)
+{
+  fpscr->VX = (fpscr->Hex & FPSCR_VX_ANY) != 0;
+  fpscr->FEX = ((fpscr->Hex >> 22) & (fpscr->Hex & FPSCR_ANY_E)) != 0;
+}
+
 inline void SetFPException(UReg_FPSCR* fpscr, u32 mask)
 {
   if ((fpscr->Hex & mask) != mask)
@@ -32,7 +38,7 @@ inline void SetFPException(UReg_FPSCR* fpscr, u32 mask)
   }
 
   fpscr->Hex |= mask;
-  fpscr->VX = (fpscr->Hex & FPSCR_VX_ANY) != 0;
+  UpdateFPExceptionSummary(fpscr);
 }
 
 inline float ForceSingle(const UReg_FPSCR& fpscr, double value)
diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp
index 3f7b82717a..50d586efa5 100644
--- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp
+++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_SystemRegisters.cpp
@@ -25,22 +25,10 @@ mffsx: 80036650 (huh?)
 
 */
 
-static void FPSCRUpdated(UReg_FPSCR fp)
+static void FPSCRUpdated(UReg_FPSCR* fpscr)
 {
+  UpdateFPExceptionSummary(fpscr);
   PowerPC::RoundingModeUpdated();
-
-  if (fp.VE || fp.OE || fp.UE || fp.ZE || fp.XE)
-  {
-    // PanicAlert("FPSCR - exceptions enabled. Please report. VE=%i OE=%i UE=%i ZE=%i XE=%i",
-    // fp.VE, fp.OE, fp.UE, fp.ZE, fp.XE);
-    // Pokemon Colosseum does this. Gah.
-  }
-}
-
-static void UpdateFPSCR(UReg_FPSCR* fpscr)
-{
-  fpscr->VX = (fpscr->Hex & FPSCR_VX_ANY) != 0;
-  fpscr->FEX = ((fpscr->Hex >> 22) & (fpscr->Hex & FPSCR_ANY_E)) != 0;
 }
 
 void Interpreter::mtfsb0x(UGeckoInstruction inst)
@@ -48,7 +36,7 @@ void Interpreter::mtfsb0x(UGeckoInstruction inst)
   u32 b = 0x80000000 >> inst.CRBD;
 
   FPSCR.Hex &= ~b;
-  FPSCRUpdated(FPSCR);
+  FPSCRUpdated(&FPSCR);
 
   if (inst.Rc)
     PowerPC::ppcState.UpdateCR1();
@@ -65,7 +53,7 @@ void Interpreter::mtfsb1x(UGeckoInstruction inst)
   else
     FPSCR |= b;
 
-  FPSCRUpdated(FPSCR);
+  FPSCRUpdated(&FPSCR);
 
   if (inst.Rc)
     PowerPC::ppcState.UpdateCR1();
@@ -80,7 +68,7 @@ void Interpreter::mtfsfix(UGeckoInstruction inst)
 
   FPSCR = (FPSCR.Hex & ~mask) | (imm >> (4 * field));
 
-  FPSCRUpdated(FPSCR);
+  FPSCRUpdated(&FPSCR);
 
   if (inst.Rc)
     PowerPC::ppcState.UpdateCR1();
@@ -97,7 +85,7 @@ void Interpreter::mtfsfx(UGeckoInstruction inst)
   }
 
   FPSCR = (FPSCR.Hex & ~m) | (static_cast<u32>(rPS(inst.FB).PS0AsU64()) & m);
-  FPSCRUpdated(FPSCR);
+  FPSCRUpdated(&FPSCR);
 
   if (inst.Rc)
     PowerPC::ppcState.UpdateCR1();
@@ -563,22 +551,18 @@ void Interpreter::isync(UGeckoInstruction inst)
 
 void Interpreter::mcrfs(UGeckoInstruction inst)
 {
-  UpdateFPSCR(&FPSCR);
   const u32 shift = 4 * (7 - inst.CRFS);
   const u32 fpflags = (FPSCR.Hex >> shift) & 0xF;
 
   // If any exception bits were read, clear them
   FPSCR.Hex &= ~((0xF << shift) & (FPSCR_FX | FPSCR_ANY_X));
+  FPSCRUpdated(&FPSCR);
 
   PowerPC::ppcState.cr.SetField(inst.CRFD, fpflags);
 }
 
 void Interpreter::mffsx(UGeckoInstruction inst)
 {
-  // load from FPSCR
-  // TODO(ector): grab all overflow flags etc and set them in FPSCR
-
-  UpdateFPSCR(&FPSCR);
   rPS(inst.FD).SetPS0(UINT64_C(0xFFF8000000000000) | FPSCR.Hex);
 
   if (inst.Rc)
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h
index 35f198dc6b..70c53bd784 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit.h
+++ b/Source/Core/Core/PowerPC/Jit64/Jit.h
@@ -116,11 +116,12 @@ public:
   void ClearCRFieldBit(int field, int bit);
   void SetCRFieldBit(int field, int bit);
   void FixGTBeforeSettingCRFieldBit(Gen::X64Reg reg);
-
   // Generates a branch that will check if a given bit of a CR register part
   // is set or not.
   Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set = true);
 
+  void UpdateFPExceptionSummary(Gen::X64Reg fpscr, Gen::X64Reg tmp1, Gen::X64Reg tmp2);
+
   void SetFPRFIfNeeded(const Gen::OpArg& xmm, bool single);
   void FinalizeSingleResult(Gen::X64Reg output, const Gen::OpArg& input, bool packed = true,
                             bool duplicate = false);
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp
index 34fb820274..3117ef563f 100644
--- a/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp
+++ b/Source/Core/Core/PowerPC/Jit64/Jit_SystemRegisters.cpp
@@ -4,7 +4,9 @@
 #include "Common/BitSet.h"
 #include "Common/CPUDetect.h"
 #include "Common/CommonTypes.h"
+#include "Common/MathUtil.h"
 #include "Common/x64Emitter.h"
+
 #include "Core/CoreTiming.h"
 #include "Core/HW/ProcessorInterface.h"
 #include "Core/PowerPC/Jit64/Jit.h"
@@ -185,6 +187,33 @@ FixupBranch Jit64::JumpIfCRFieldBit(int field, int bit, bool jump_if_set)
   return FixupBranch();
 }
 
+// Emits code to recompute VX/FEX in fpscr, clobbering both temps (one would do, but two is faster)
+void Jit64::UpdateFPExceptionSummary(X64Reg fpscr, X64Reg tmp1, X64Reg tmp2)
+{
+  // Kill dependency on tmp1 (not required for correctness, since SHL will shift out upper bytes)
+  XOR(32, R(tmp1), R(tmp1));
+
+  // fpscr.VX = (fpscr & FPSCR_VX_ANY) != 0
+  TEST(32, R(fpscr), Imm32(FPSCR_VX_ANY));
+  SETcc(CC_NZ, R(tmp1));
+  SHL(32, R(tmp1), Imm8(IntLog2(FPSCR_VX)));
+  AND(32, R(fpscr), Imm32(~(FPSCR_VX | FPSCR_FEX)));
+  OR(32, R(fpscr), R(tmp1));
+
+  // fpscr.FEX = ((fpscr >> 22) & (fpscr & FPSCR_ANY_E)) != 0
+  MOV(32, R(tmp1), R(fpscr));
+  MOV(32, R(tmp2), R(fpscr));
+  SHR(32, R(tmp1), Imm8(22));
+  AND(32, R(tmp2), Imm32(FPSCR_ANY_E));
+  TEST(32, R(tmp1), R(tmp2));
+  // Unfortunately we eat a partial register stall below - we can't zero any of the registers before
+  // the TEST, and we can't use XOR right after the TEST since that would overwrite flags. However,
+  // there is no false dependency, since SETcc depends on TEST's flags and TEST depends on tmp1.
+  SETcc(CC_NZ, R(tmp1));
+  SHL(32, R(tmp1), Imm8(IntLog2(FPSCR_FEX)));
+  OR(32, R(fpscr), R(tmp1));
+}
+
 static void DoICacheReset()
 {
   PowerPC::ppcState.iCache.Reset();
@@ -637,6 +666,19 @@ void Jit64::mcrfs(UGeckoInstruction inst)
   // Only clear exception bits (but not FEX/VX).
   mask &= FPSCR_FX | FPSCR_ANY_X;
 
+  RCX64Reg scratch_guard;
+  X64Reg scratch;
+  if (mask != 0)
+  {
+    scratch_guard = gpr.Scratch();
+    RegCache::Realize(scratch_guard);
+    scratch = scratch_guard;
+  }
+  else
+  {
+    scratch = RSCRATCH;
+  }
+
   if (cpu_info.bBMI1)
   {
     MOV(32, R(RSCRATCH), PPCSTATE(fpscr));
@@ -652,14 +694,17 @@ void Jit64::mcrfs(UGeckoInstruction inst)
     SHR(32, R(RSCRATCH2), Imm8(shift));
     AND(32, R(RSCRATCH2), Imm32(0xF));
   }
+
+  LEA(64, scratch, MConst(PowerPC::ConditionRegister::s_crTable));
+  MOV(64, R(scratch), MComplex(scratch, RSCRATCH2, SCALE_8, 0));
+  MOV(64, CROffset(inst.CRFD), R(scratch));
+
   if (mask != 0)
   {
     AND(32, R(RSCRATCH), Imm32(~mask));
+    UpdateFPExceptionSummary(RSCRATCH, RSCRATCH2, scratch);
     MOV(32, PPCSTATE(fpscr), R(RSCRATCH));
   }
-  LEA(64, RSCRATCH, MConst(PowerPC::ConditionRegister::s_crTable));
-  MOV(64, R(RSCRATCH), MComplex(RSCRATCH, RSCRATCH2, SCALE_8, 0));
-  MOV(64, CROffset(inst.CRFD), R(RSCRATCH));
 }
 
 void Jit64::mffsx(UGeckoInstruction inst)
@@ -670,18 +715,6 @@ void Jit64::mffsx(UGeckoInstruction inst)
 
   MOV(32, R(RSCRATCH), PPCSTATE(fpscr));
 
-  // FPSCR.FEX = 0 (and VX for below)
-  AND(32, R(RSCRATCH), Imm32(~0x60000000));
-
-  // FPSCR.VX = (FPSCR.Hex & FPSCR_VX_ANY) != 0;
-  XOR(32, R(RSCRATCH2), R(RSCRATCH2));
-  TEST(32, R(RSCRATCH), Imm32(FPSCR_VX_ANY));
-  SETcc(CC_NZ, R(RSCRATCH2));
-  SHL(32, R(RSCRATCH2), Imm8(31 - 2));
-  OR(32, R(RSCRATCH), R(RSCRATCH2));
-
-  MOV(32, PPCSTATE(fpscr), R(RSCRATCH));
-
   int d = inst.FD;
   RCX64Reg Rd = fpr.Bind(d, RCMode::Write);
   RegCache::Realize(Rd);
@@ -710,17 +743,32 @@ void Jit64::mtfsb0x(UGeckoInstruction inst)
   JITDISABLE(bJITSystemRegistersOff);
   FALLBACK_IF(inst.Rc);
 
-  u32 mask = ~(0x80000000 >> inst.CRBD);
-  if (inst.CRBD < 29)
+  const u32 mask = 0x80000000 >> inst.CRBD;
+  const u32 inverted_mask = ~mask;
+
+  if (mask == FPSCR_FEX || mask == FPSCR_VX)
+    return;
+
+  if (inst.CRBD < 29 && (mask & (FPSCR_ANY_X | FPSCR_ANY_E)) == 0)
   {
-    AND(32, PPCSTATE(fpscr), Imm32(mask));
+    AND(32, PPCSTATE(fpscr), Imm32(inverted_mask));
   }
   else
   {
     MOV(32, R(RSCRATCH), PPCSTATE(fpscr));
-    AND(32, R(RSCRATCH), Imm32(mask));
+    AND(32, R(RSCRATCH), Imm32(inverted_mask));
+
+    if ((mask & (FPSCR_ANY_X | FPSCR_ANY_E)) != 0)
+    {
+      RCX64Reg scratch = gpr.Scratch();
+      RegCache::Realize(scratch);
+
+      UpdateFPExceptionSummary(RSCRATCH, RSCRATCH2, scratch);
+    }
+
     MOV(32, PPCSTATE(fpscr), R(RSCRATCH));
-    UpdateMXCSR();
+    if (inst.CRBD >= 29)
+      UpdateMXCSR();
   }
 }
 
@@ -730,9 +778,13 @@ void Jit64::mtfsb1x(UGeckoInstruction inst)
   JITDISABLE(bJITSystemRegistersOff);
   FALLBACK_IF(inst.Rc);
 
-  u32 mask = 0x80000000 >> inst.CRBD;
+  const u32 mask = 0x80000000 >> inst.CRBD;
+
+  if (mask == FPSCR_FEX || mask == FPSCR_VX)
+    return;
+
   MOV(32, R(RSCRATCH), PPCSTATE(fpscr));
-  if (mask & FPSCR_ANY_X)
+  if ((mask & FPSCR_ANY_X) != 0)
   {
     BTS(32, R(RSCRATCH), Imm32(31 - inst.CRBD));
     FixupBranch dont_set_fx = J_CC(CC_C);
@@ -743,6 +795,15 @@ void Jit64::mtfsb1x(UGeckoInstruction inst)
   {
     OR(32, R(RSCRATCH), Imm32(mask));
   }
+
+  if ((mask & (FPSCR_ANY_X | FPSCR_ANY_E)) != 0)
+  {
+    RCX64Reg scratch = gpr.Scratch();
+    RegCache::Realize(scratch);
+
+    UpdateFPExceptionSummary(RSCRATCH, RSCRATCH2, scratch);
+  }
+
   MOV(32, PPCSTATE(fpscr), R(RSCRATCH));
   if (inst.CRBD >= 29)
     UpdateMXCSR();
@@ -755,12 +816,22 @@ void Jit64::mtfsfix(UGeckoInstruction inst)
   FALLBACK_IF(inst.Rc);
 
   u8 imm = (inst.hex >> (31 - 19)) & 0xF;
+  u32 mask = 0xF0000000 >> (4 * inst.CRFD);
   u32 or_mask = imm << (28 - 4 * inst.CRFD);
-  u32 and_mask = ~(0xF0000000 >> (4 * inst.CRFD));
+  u32 and_mask = ~mask;
 
   MOV(32, R(RSCRATCH), PPCSTATE(fpscr));
   AND(32, R(RSCRATCH), Imm32(and_mask));
   OR(32, R(RSCRATCH), Imm32(or_mask));
+
+  if ((mask & (FPSCR_FEX | FPSCR_VX | FPSCR_ANY_X | FPSCR_ANY_E)) != 0)
+  {
+    RCX64Reg scratch = gpr.Scratch();
+    RegCache::Realize(scratch);
+
+    UpdateFPExceptionSummary(RSCRATCH, RSCRATCH2, scratch);
+  }
+
   MOV(32, PPCSTATE(fpscr), R(RSCRATCH));
 
   // Field 7 contains NI and RN.
@@ -798,6 +869,15 @@ void Jit64::mtfsfx(UGeckoInstruction inst)
     AND(32, R(RSCRATCH2), Imm32(~mask));
     OR(32, R(RSCRATCH), R(RSCRATCH2));
   }
+
+  if ((mask & (FPSCR_FEX | FPSCR_VX | FPSCR_ANY_X | FPSCR_ANY_E)) != 0)
+  {
+    RCX64Reg scratch = gpr.Scratch();
+    RegCache::Realize(scratch);
+
+    UpdateFPExceptionSummary(RSCRATCH, RSCRATCH2, scratch);
+  }
+
   MOV(32, PPCSTATE(fpscr), R(RSCRATCH));
 
   if (inst.FM & 1)
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index b029f545cc..f19bd33d55 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -273,6 +273,7 @@ protected:
 
   Arm64Gen::FixupBranch JumpIfCRFieldBit(int field, int bit, bool jump_if_set);
   void FixGTBeforeSettingCRFieldBit(Arm64Gen::ARM64Reg reg);
+  void UpdateFPExceptionSummary(Arm64Gen::ARM64Reg fpscr);
   void UpdateRoundingMode();
 
   void ComputeRC0(Arm64Gen::ARM64Reg reg);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
index d3de831872..568d3072f3 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_SystemRegisters.cpp
@@ -4,6 +4,7 @@
 #include "Common/Arm64Emitter.h"
 #include "Common/Assert.h"
 #include "Common/CommonTypes.h"
+#include "Common/MathUtil.h"
 
 #include "Core/Core.h"
 #include "Core/CoreTiming.h"
@@ -49,6 +50,25 @@ void JitArm64::FixGTBeforeSettingCRFieldBit(Arm64Gen::ARM64Reg reg)
   gpr.Unlock(WA);
 }
 
+void JitArm64::UpdateFPExceptionSummary(ARM64Reg fpscr)
+{
+  ARM64Reg WA = gpr.GetReg();
+
+  // fpscr.VX = (fpscr & FPSCR_VX_ANY) != 0
+  MOVI2R(WA, FPSCR_VX_ANY);
+  TST(WA, fpscr);
+  CSET(WA, CCFlags::CC_NEQ);
+  BFI(fpscr, WA, IntLog2(FPSCR_VX), 1);
+
+  // fpscr.FEX = ((fpscr >> 22) & (fpscr & FPSCR_ANY_E)) != 0
+  AND(WA, fpscr, LogicalImm(FPSCR_ANY_E, 32));
+  TST(WA, fpscr, ArithOption(fpscr, ShiftType::LSR, 22));
+  CSET(WA, CCFlags::CC_NEQ);
+  BFI(fpscr, WA, IntLog2(FPSCR_FEX), 1);
+
+  gpr.Unlock(WA);
+}
+
 void JitArm64::UpdateRoundingMode()
 {
   const BitSet32 gprs_to_save = gpr.GetCallerSavedUsed();
@@ -732,6 +752,8 @@ void JitArm64::mcrfs(UGeckoInstruction inst)
   {
     const u32 inverted_mask = ~mask;
     AND(WA, WA, LogicalImm(inverted_mask, 32));
+
+    UpdateFPExceptionSummary(WA);
     STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
   }
 
@@ -753,24 +775,11 @@ void JitArm64::mffsx(UGeckoInstruction inst)
   LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
 
   ARM64Reg VD = fpr.RW(inst.FD, RegType::LowerPair);
-  ARM64Reg WB = gpr.GetReg();
 
-  // FPSCR.FEX = 0;
-  // FPSCR.VX = (FPSCR.Hex & FPSCR_VX_ANY) != 0;
-  // (FEX is right next to VX, so we can set both using one BFI instruction)
-  MOVI2R(WB, FPSCR_VX_ANY);
-  TST(WA, WB);
-  CSET(WB, CCFlags::CC_NEQ);
-  BFI(WA, WB, 31 - 2, 2);
-
-  STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
-
-  // Vd = FPSCR.Hex | 0xFFF8'0000'0000'0000;
   ORR(XA, XA, LogicalImm(0xFFF8'0000'0000'0000, 64));
   m_float_emit.FMOV(EncodeRegToDouble(VD), XA);
 
   gpr.Unlock(WA);
-  gpr.Unlock(WB);
 }
 
 void JitArm64::mtfsb0x(UGeckoInstruction inst)
@@ -779,12 +788,20 @@ void JitArm64::mtfsb0x(UGeckoInstruction inst)
   JITDISABLE(bJITSystemRegistersOff);
   FALLBACK_IF(inst.Rc);
 
-  u32 mask = ~(0x80000000 >> inst.CRBD);
+  const u32 mask = 0x80000000 >> inst.CRBD;
+  const u32 inverted_mask = ~mask;
+
+  if (mask == FPSCR_FEX || mask == FPSCR_VX)
+    return;
 
   ARM64Reg WA = gpr.GetReg();
 
   LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
-  AND(WA, WA, LogicalImm(mask, 32));
+
+  AND(WA, WA, LogicalImm(inverted_mask, 32));
+
+  if ((mask & (FPSCR_ANY_X | FPSCR_ANY_E)) != 0)
+    UpdateFPExceptionSummary(WA);
   STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
 
   gpr.Unlock(WA);
@@ -799,12 +816,16 @@ void JitArm64::mtfsb1x(UGeckoInstruction inst)
   JITDISABLE(bJITSystemRegistersOff);
   FALLBACK_IF(inst.Rc);
 
-  u32 mask = 0x80000000 >> inst.CRBD;
+  const u32 mask = 0x80000000 >> inst.CRBD;
+
+  if (mask == FPSCR_FEX || mask == FPSCR_VX)
+    return;
 
   ARM64Reg WA = gpr.GetReg();
 
   LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
-  if (mask & FPSCR_ANY_X)
+
+  if ((mask & FPSCR_ANY_X) != 0)
   {
     ARM64Reg WB = gpr.GetReg();
     TST(WA, LogicalImm(mask, 32));
@@ -813,6 +834,9 @@ void JitArm64::mtfsb1x(UGeckoInstruction inst)
     gpr.Unlock(WB);
   }
   ORR(WA, WA, LogicalImm(mask, 32));
+
+  if ((mask & (FPSCR_ANY_X | FPSCR_ANY_E)) != 0)
+    UpdateFPExceptionSummary(WA);
   STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
 
   gpr.Unlock(WA);
@@ -829,13 +853,15 @@ void JitArm64::mtfsfix(UGeckoInstruction inst)
 
   u8 imm = (inst.hex >> (31 - 19)) & 0xF;
   u8 shift = 28 - 4 * inst.CRFD;
+  u32 mask = 0xF << shift;
 
   ARM64Reg WA = gpr.GetReg();
+
   LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
 
   if (imm == 0xF)
   {
-    ORR(WA, WA, LogicalImm(0xF << shift, 32));
+    ORR(WA, WA, LogicalImm(mask, 32));
   }
   else if (imm == 0x0)
   {
@@ -849,7 +875,10 @@ void JitArm64::mtfsfix(UGeckoInstruction inst)
     gpr.Unlock(WB);
   }
 
+  if ((mask & (FPSCR_FEX | FPSCR_VX | FPSCR_ANY_X | FPSCR_ANY_E)) != 0)
+    UpdateFPExceptionSummary(WA);
   STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
+
   gpr.Unlock(WA);
 
   // Field 7 contains NI and RN.
@@ -873,24 +902,47 @@ void JitArm64::mtfsfx(UGeckoInstruction inst)
   if (mask == 0xFFFFFFFF)
   {
     ARM64Reg VB = fpr.R(inst.FB, RegType::LowerPair);
+    ARM64Reg WA = gpr.GetReg();
 
-    m_float_emit.STR(32, IndexType::Unsigned, VB, PPC_REG, PPCSTATE_OFF(fpscr));
+    m_float_emit.FMOV(WA, EncodeRegToSingle(VB));
+
+    UpdateFPExceptionSummary(WA);
+    STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
+
+    gpr.Unlock(WA);
   }
   else if (mask != 0)
   {
     ARM64Reg VB = fpr.R(inst.FB, RegType::LowerPair);
-
-    ARM64Reg V0 = fpr.GetReg();
-    ARM64Reg V1 = fpr.GetReg();
     ARM64Reg WA = gpr.GetReg();
+    ARM64Reg WB = gpr.GetReg();
 
-    m_float_emit.LDR(32, IndexType::Unsigned, V0, PPC_REG, PPCSTATE_OFF(fpscr));
-    MOVI2R(WA, mask);
-    m_float_emit.FMOV(EncodeRegToSingle(V1), WA);
-    m_float_emit.BIT(EncodeRegToDouble(V0), EncodeRegToDouble(VB), EncodeRegToDouble(V1));
-    m_float_emit.STR(32, IndexType::Unsigned, V0, PPC_REG, PPCSTATE_OFF(fpscr));
+    LDR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
+    m_float_emit.FMOV(WB, EncodeRegToSingle(VB));
+
+    if (LogicalImm imm = LogicalImm(mask, 32))
+    {
+      AND(WA, WA, LogicalImm(~mask, 32));
+      AND(WB, WB, imm);
+    }
+    else
+    {
+      ARM64Reg WC = gpr.GetReg();
+
+      MOVI2R(WC, mask);
+      BIC(WA, WA, WC);
+      AND(WB, WB, WC);
+
+      gpr.Unlock(WC);
+    }
+    ORR(WA, WA, WB);
+
+    gpr.Unlock(WB);
+
+    if ((mask & (FPSCR_FEX | FPSCR_VX | FPSCR_ANY_X | FPSCR_ANY_E)) != 0)
+      UpdateFPExceptionSummary(WA);
+    STR(IndexType::Unsigned, WA, PPC_REG, PPCSTATE_OFF(fpscr));
 
-    fpr.Unlock(V0, V1);
     gpr.Unlock(WA);
   }