Merge pull request #3629 from degasus/arm

JitArm64: Single precision tracking.
2025-02-10 22:49:00 +01:00 · 2016-02-25 18:10:15 -05:00 · 2016-02-25 18:10:15 -05:00 · a0c51806ec
commit a0c51806ec
parent 2855fb39a4 3286bbd9bd
8 changed files with 412 additions and 263 deletions
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
@ -73,6 +73,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 				m_float_emit.REV32(8, D0, D0);
 				m_float_emit.STR(64, Q0, X28, addr);
 			}
 			else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I)
 			{
 				m_float_emit.REV32(8, D0, RS);
 				m_float_emit.STR(64, Q0, X28, addr);
 			}
 			else
 			{
 				m_float_emit.REV64(8, Q0, RS);
@ -86,7 +91,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 			{
 				m_float_emit.LDR(32, EncodeRegToDouble(RS), X28, addr);
 				m_float_emit.REV32(8, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
 				m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
 			}
 			else
 			{
@ -198,6 +202,13 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 				MOVI2R(X30, (u64)PowerPC::Write_U64);
 				BLR(X30);
 			}
 			else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I)
 			{
 				m_float_emit.UMOV(64, X0, RS, 0);
 				ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
 				MOVI2R(X30, (u64)PowerPC::Write_U64);
 				BLR(X30);
 			}
 			else
 			{
 				MOVI2R(X30, (u64)&PowerPC::Write_U64);
@ -214,7 +225,6 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode,
 				MOVI2R(X30, (u64)&PowerPC::Read_U32);
 				BLR(X30);
 				m_float_emit.INS(32, RS, 0, X0);
 				m_float_emit.FCVT(64, 32, EncodeRegToDouble(RS), EncodeRegToDouble(RS));
 			}
 			else
 			{
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@ -33,34 +33,44 @@ void JitArm64::fp_arith(UGeckoInstruction inst)
 	bool use_c = op5 >= 25; // fmul and all kind of fmaddXX
 	bool use_b = op5 != 25; // fmul uses no B
 	bool inputs_are_singles = fpr.IsSingle(a, !packed) && (!use_b || fpr.IsSingle(b, !packed)) && (!use_c || fpr.IsSingle(c, !packed));
 	ARM64Reg VA, VB, VC, VD;
 	if (packed)
 	{
-		VA = fpr.R(a, REG_REG);
+		RegType type = inputs_are_singles ? REG_REG_SINGLE : REG_REG;
 		u8 size = inputs_are_singles ? 32 : 64;
 		ARM64Reg (*reg_encoder)(ARM64Reg) = inputs_are_singles ? EncodeRegToDouble : EncodeRegToQuad;
 		VA = reg_encoder(fpr.R(a, type));
 		if (use_b)
-			VB = fpr.R(b, REG_REG);
+			VB = reg_encoder(fpr.R(b, type));
 		if (use_c)
-			VC = fpr.R(c, REG_REG);
+			VC = reg_encoder(fpr.R(c, type));
-		VD = fpr.RW(d, REG_REG);
+		VD = reg_encoder(fpr.RW(d, type));
 		switch (op5)
 		{
-		case 18: m_float_emit.FDIV(64, VD, VA, VB); break;
+		case 18: m_float_emit.FDIV(size, VD, VA, VB); break;
-		case 20: m_float_emit.FSUB(64, VD, VA, VB); break;
+		case 20: m_float_emit.FSUB(size, VD, VA, VB); break;
-		case 21: m_float_emit.FADD(64, VD, VA, VB); break;
+		case 21: m_float_emit.FADD(size, VD, VA, VB); break;
-		case 25: m_float_emit.FMUL(64, VD, VA, VC); break;
+		case 25: m_float_emit.FMUL(size, VD, VA, VC); break;
 		default: _assert_msg_(DYNA_REC, 0, "fp_arith"); break;
 		}
 	}
 	else
 	{
-		VA = EncodeRegToDouble(fpr.R(a, REG_IS_LOADED));
+		RegType type = (inputs_are_singles && single) ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR;
 		RegType type_out = single ? (inputs_are_singles ? REG_DUP_SINGLE : REG_DUP) : REG_LOWER_PAIR;
 		ARM64Reg (*reg_encoder)(ARM64Reg) = (inputs_are_singles && single) ? EncodeRegToSingle : EncodeRegToDouble;
 		VA = reg_encoder(fpr.R(a, type));
 		if (use_b)
-			VB = EncodeRegToDouble(fpr.R(b, REG_IS_LOADED));
+			VB = reg_encoder(fpr.R(b, type));
 		if (use_c)
-			VC = EncodeRegToDouble(fpr.R(c, REG_IS_LOADED));
+			VC = reg_encoder(fpr.R(c, type));
-		VD = EncodeRegToDouble(fpr.RW(d, single ? REG_DUP : REG_LOWER_PAIR));
+		VD = reg_encoder(fpr.RW(d, type_out));
 		switch (op5)
 		{
@ -95,33 +105,42 @@ void JitArm64::fp_logic(UGeckoInstruction inst)
 	if (op10 == 72 && b == d)
 		return;
 	bool single = fpr.IsSingle(b, !packed);
 	u8 size = single ? 32 : 64;
 	if (packed)
 	{
-		ARM64Reg VB = fpr.R(b, REG_REG);
+		RegType type = single ? REG_REG_SINGLE : REG_REG;
-		ARM64Reg VD = fpr.RW(d, REG_REG);
+		ARM64Reg (*reg_encoder)(ARM64Reg) = single ? EncodeRegToDouble : EncodeRegToQuad;
 		ARM64Reg VB = reg_encoder(fpr.R(b, type));
 		ARM64Reg VD = reg_encoder(fpr.RW(d, type));
 		switch (op10)
 		{
-		case  40: m_float_emit.FNEG(64, VD, VB); break;
+		case  40: m_float_emit.FNEG(size, VD, VB); break;
 		case  72: m_float_emit.ORR(VD, VB, VB); break;
-		case 136: m_float_emit.FABS(64, VD, VB);
+		case 136: m_float_emit.FABS(size, VD, VB);
-		          m_float_emit.FNEG(64, VD, VD); break;
+		          m_float_emit.FNEG(size, VD, VD); break;
-		case 264: m_float_emit.FABS(64, VD, VB); break;
+		case 264: m_float_emit.FABS(size, VD, VB); break;
 		default: _assert_msg_(DYNA_REC, 0, "fp_logic"); break;
 		}
 	}
 	else
 	{
-		ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
+		RegType type = single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR;
-		ARM64Reg VD = fpr.RW(d);
+		ARM64Reg (*reg_encoder)(ARM64Reg) = single ? EncodeRegToSingle : EncodeRegToDouble;
 		ARM64Reg VB = fpr.R(b, type);
 		ARM64Reg VD = fpr.RW(d, type);
 		switch (op10)
 		{
-		case  40: m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); break;
+		case  40: m_float_emit.FNEG(reg_encoder(VD), reg_encoder(VB)); break;
-		case  72: m_float_emit.INS(64, VD, 0, VB, 0); break;
+		case  72: m_float_emit.INS(size, VD, 0, VB, 0); break;
-		case 136: m_float_emit.FABS(EncodeRegToDouble(VD), EncodeRegToDouble(VB));
+		case 136: m_float_emit.FABS(reg_encoder(VD), reg_encoder(VB));
-		          m_float_emit.FNEG(EncodeRegToDouble(VD), EncodeRegToDouble(VD)); break;
+		          m_float_emit.FNEG(reg_encoder(VD), reg_encoder(VD)); break;
-		case 264: m_float_emit.FABS(EncodeRegToDouble(VD), EncodeRegToDouble(VB)); break;
+		case 264: m_float_emit.FABS(reg_encoder(VD), reg_encoder(VB)); break;
 		default: _assert_msg_(DYNA_REC, 0, "fp_logic"); break;
 		}
 	}
@ -135,13 +154,26 @@ void JitArm64::fselx(UGeckoInstruction inst)
 	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
-	ARM64Reg VA = fpr.R(a, REG_IS_LOADED);
+	if (fpr.IsSingle(a, true))
-	ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
+	{
-	ARM64Reg VC = fpr.R(c, REG_IS_LOADED);
+		ARM64Reg VA = fpr.R(a, REG_LOWER_PAIR_SINGLE);
-	ARM64Reg VD = fpr.RW(d);
+		m_float_emit.FCMPE(EncodeRegToSingle(VA));
 	}
 	else
 	{
 		ARM64Reg VA = fpr.R(a, REG_LOWER_PAIR);
 		m_float_emit.FCMPE(EncodeRegToDouble(VA));
 	}
-	m_float_emit.FCMPE(EncodeRegToDouble(VA));
+	bool single = fpr.IsSingle(b, true) && fpr.IsSingle(c, true);
-	m_float_emit.FCSEL(EncodeRegToDouble(VD), EncodeRegToDouble(VC), EncodeRegToDouble(VB), CC_GE);
+	RegType type = single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR;
 	ARM64Reg (*reg_encoder)(ARM64Reg) = single ? EncodeRegToSingle : EncodeRegToDouble;
 	ARM64Reg VB = fpr.R(b, type);
 	ARM64Reg VC = fpr.R(c, type);
 	ARM64Reg VD = fpr.RW(d, type);
 	m_float_emit.FCSEL(reg_encoder(VD), reg_encoder(VC), reg_encoder(VB), CC_GE);
 }
 void JitArm64::frspx(UGeckoInstruction inst)
@ -153,11 +185,22 @@ void JitArm64::frspx(UGeckoInstruction inst)
 	u32 b = inst.FB, d = inst.FD;
-	ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
+	if (fpr.IsSingle(b, true))
-	ARM64Reg VD = fpr.RW(d, REG_DUP);
+	{
 		// Source is already in single precision, so no need to do anything but to copy to PSR1.
 		ARM64Reg VB = fpr.R(b, REG_LOWER_PAIR_SINGLE);
 		ARM64Reg VD = fpr.RW(d, REG_DUP_SINGLE);
-	m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
+		if (b != d)
-	m_float_emit.FCVT(64, 32, EncodeRegToDouble(VD), EncodeRegToDouble(VD));
+			m_float_emit.FMOV(EncodeRegToSingle(VD), EncodeRegToSingle(VB));
 	}
 	else
 	{
 		ARM64Reg VB = fpr.R(b, REG_LOWER_PAIR);
 		ARM64Reg VD = fpr.RW(d, REG_DUP_SINGLE);
 		m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
 	}
 }
 void JitArm64::fcmpX(UGeckoInstruction inst)
@ -169,8 +212,12 @@ void JitArm64::fcmpX(UGeckoInstruction inst)
 	u32 a = inst.FA, b = inst.FB;
 	int crf = inst.CRFD;
-	ARM64Reg VA = fpr.R(a, REG_IS_LOADED);
+	bool singles = fpr.IsSingle(a, true) && fpr.IsSingle(b, true);
-	ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
+	RegType type = singles ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR;
 	ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToSingle : EncodeRegToDouble;
 	ARM64Reg VA = reg_encoder(fpr.R(a, type));
 	ARM64Reg VB = reg_encoder(fpr.R(b, type));
 	ARM64Reg WA = gpr.GetReg();
 	ARM64Reg XA = EncodeRegTo64(WA);
@ -179,7 +226,7 @@ void JitArm64::fcmpX(UGeckoInstruction inst)
 	FixupBranch continue1, continue2, continue3;
 	ORR(XA, ZR, 32, 0, true);
-	m_float_emit.FCMP(EncodeRegToDouble(VA), EncodeRegToDouble(VB));
+	m_float_emit.FCMP(VA, VB);
 	if (a != b)
 	{
@ -231,7 +278,9 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
 	u32 b = inst.FB, d = inst.FD;
-	ARM64Reg VB = fpr.R(b, REG_IS_LOADED);
+	bool single = fpr.IsSingle(b, true);
 	ARM64Reg VB = fpr.R(b, single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR);
 	ARM64Reg VD = fpr.RW(d);
 	ARM64Reg V0 = fpr.GetReg();
@ -240,8 +289,15 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
 	m_float_emit.MOVI(64, EncodeRegToDouble(V0), 0xFFFF000000000000ULL);
 	m_float_emit.BIC(16, EncodeRegToDouble(V0), 0x7);
-	m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
+	if (single)
-	m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VD), ROUND_Z);
+	{
 		m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VB), ROUND_Z);
 	}
 	else
 	{
 		m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
 		m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VD), ROUND_Z);
 	}
 	m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(VD), EncodeRegToDouble(V0));
 	fpr.Unlock(V0);
 }
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
@ -76,7 +76,7 @@ void JitArm64::lfXX(UGeckoInstruction inst)
 	u32 imm_addr = 0;
 	bool is_immediate = false;
-	RegType type = !!(flags & BackPatchInfo::FLAG_SIZE_F64) ? REG_LOWER_PAIR : REG_DUP;
+	RegType type = !!(flags & BackPatchInfo::FLAG_SIZE_F64) ? REG_LOWER_PAIR : REG_DUP_SINGLE;
 	gpr.Lock(W0, W30);
 	fpr.Lock(Q0);
@ -270,7 +270,16 @@ void JitArm64::stfXX(UGeckoInstruction inst)
 	gpr.Lock(W0, W1, W30);
 	fpr.Lock(Q0);
-	ARM64Reg V0 = fpr.R(inst.FS, REG_IS_LOADED);
+	bool single = (flags & BackPatchInfo::FLAG_SIZE_F32) && fpr.IsSingle(inst.FS, true);
 	ARM64Reg V0 = fpr.R(inst.FS, single ? REG_LOWER_PAIR_SINGLE : REG_LOWER_PAIR);
 	if (single)
 	{
 		flags &= ~BackPatchInfo::FLAG_SIZE_F32;
 		flags |= BackPatchInfo::FLAG_SIZE_F32I;
 	}
 	ARM64Reg addr_reg = W1;
 	if (update)
@ -407,24 +416,29 @@ void JitArm64::stfXX(UGeckoInstruction inst)
 				ADD(X1, X30, pipe_off);
 			LDR(INDEX_UNSIGNED, W0, X30, count_off);
-			if (accessSize == 64)
+			if (flags & BackPatchInfo::FLAG_SIZE_F64)
 			{
 				m_float_emit.REV64(8, Q0, V0);
 				if (pipe_off)
 					m_float_emit.STR(64, Q0, X1, ArithOption(X0));
 				else
 					m_float_emit.STR(64, Q0, X30, ArithOption(X0));
 			}
-			else if (accessSize == 32)
+			else if (flags & BackPatchInfo::FLAG_SIZE_F32)
 			{
 				m_float_emit.FCVT(32, 64, D0, EncodeRegToDouble(V0));
 				m_float_emit.REV32(8, D0, D0);
 				if (pipe_off)
 					m_float_emit.STR(32, D0, X1, ArithOption(X0));
 				else
 					m_float_emit.STR(32, D0, X30, ArithOption(X0));
 			}
 			else if (flags & BackPatchInfo::FLAG_SIZE_F32I)
 			{
 				m_float_emit.REV32(8, D0, V0);
 			}
 			if (pipe_off)
 			{
 				m_float_emit.STR(accessSize, accessSize == 64 ? Q0 : D0, X1, ArithOption(X0));
 			}
 			else
 			{
 				m_float_emit.STR(accessSize, accessSize == 64 ? Q0 : D0, X30, ArithOption(X0));
 			}
 			ADD(W0, W0, accessSize >> 3);
 			STR(INDEX_UNSIGNED, W0, X30, count_off);
 			js.fifoBytesThisBlock += accessSize >> 3;
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
@ -62,20 +62,17 @@ void JitArm64::psq_l(UGeckoInstruction inst)
 	if (js.assumeNoPairedQuantize)
 	{
-		VS = fpr.RW(inst.RS, REG_REG);
+		VS = fpr.RW(inst.RS, REG_REG_SINGLE);
 		if (!inst.W)
 		{
 			ADD(EncodeRegTo64(addr_reg), EncodeRegTo64(addr_reg), X28);
 			m_float_emit.LD1(32, 1, EncodeRegToDouble(VS), EncodeRegTo64(addr_reg));
 			m_float_emit.REV32(8, VS, VS);
 			m_float_emit.FCVTL(64, VS, VS);
 		}
 		else
 		{
 			m_float_emit.LDR(32, VS, EncodeRegTo64(addr_reg), X28);
 			m_float_emit.REV32(8, VS, VS);
 			m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), EncodeRegToDouble(VS));
 		}
 		m_float_emit.REV32(8, EncodeRegToDouble(VS), EncodeRegToDouble(VS));
 	}
 	else
 	{
@ -87,17 +84,14 @@ void JitArm64::psq_l(UGeckoInstruction inst)
 		LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
 		BLR(X30);
-		VS = fpr.RW(inst.RS, REG_REG);
+		VS = fpr.RW(inst.RS, REG_REG_SINGLE);
-		if (!inst.W)
+		m_float_emit.ORR(EncodeRegToDouble(VS), D0, D0);
 			m_float_emit.FCVTL(64, VS, D0);
 		else
 			m_float_emit.FCVT(64, 32, EncodeRegToDouble(VS), D0);
 	}
 	if (inst.W)
 	{
-		m_float_emit.FMOV(D0, 0x70); // 1.0 as a Double
+		m_float_emit.FMOV(S0, 0x70); // 1.0 as a Single
-		m_float_emit.INS(64, VS, 1, Q0, 0);
+		m_float_emit.INS(32, VS, 1, Q0, 0);
 	}
 	gpr.Unlock(W0, W1, W2, W30);
@ -121,8 +115,10 @@ void JitArm64::psq_st(UGeckoInstruction inst)
 	gpr.Lock(W0, W1, W2, W30);
 	fpr.Lock(Q0, Q1);
 	bool single = fpr.IsSingle(inst.RS);
 	ARM64Reg arm_addr = gpr.R(inst.RA);
-	ARM64Reg VS = fpr.R(inst.RS, REG_REG);
+	ARM64Reg VS = fpr.R(inst.RS, single ? REG_REG_SINGLE : REG_REG);
 	ARM64Reg scale_reg = W0;
 	ARM64Reg addr_reg = W1;
@ -156,7 +152,12 @@ void JitArm64::psq_st(UGeckoInstruction inst)
 	if (js.assumeNoPairedQuantize)
 	{
 		u32 flags = BackPatchInfo::FLAG_STORE;
-		flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);
+
 		if (single)
 			flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32I : BackPatchInfo::FLAG_SIZE_F32X2I);
 		else
 			flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);
 		EmitBackpatchRoutine(flags,
 			jo.fastmem,
 			jo.fastmem,
@ -166,10 +167,17 @@ void JitArm64::psq_st(UGeckoInstruction inst)
 	}
 	else
 	{
-		if (inst.W)
+		if (single)
-			m_float_emit.FCVT(32, 64, D0, VS);
+		{
 			m_float_emit.ORR(D0, VS, VS);
 		}
 		else
-			m_float_emit.FCVTN(32, D0, VS);
+		{
 			if (inst.W)
 				m_float_emit.FCVT(32, 64, D0, VS);
 			else
 				m_float_emit.FCVTN(32, D0, VS);
 		}
 		LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
 		UBFM(type_reg, scale_reg, 0, 2); // Type
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Paired.cpp
@ -25,36 +25,41 @@ void JitArm64::ps_mergeXX(UGeckoInstruction inst)
 	u32 a = inst.FA, b = inst.FB, d = inst.FD;
-	ARM64Reg VA = fpr.R(a, REG_REG);
+	bool singles = fpr.IsSingle(a) && fpr.IsSingle(b);
-	ARM64Reg VB = fpr.R(b, REG_REG);
+	RegType type = singles ? REG_REG_SINGLE : REG_REG;
-	ARM64Reg VD = fpr.RW(d, REG_REG);
+	u8 size = singles ? 32 : 64;
 	ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
 	ARM64Reg VA = fpr.R(a, type);
 	ARM64Reg VB = fpr.R(b, type);
 	ARM64Reg VD = fpr.RW(d, type);
 	switch (inst.SUBOP10)
 	{
 	case 528: //00
-		m_float_emit.TRN1(64, VD, VA, VB);
+		m_float_emit.TRN1(size, VD, VA, VB);
 		break;
 	case 560: //01
-		m_float_emit.INS(64, VD, 0, VA, 0);
+		m_float_emit.INS(size, VD, 0, VA, 0);
-		m_float_emit.INS(64, VD, 1, VB, 1);
+		m_float_emit.INS(size, VD, 1, VB, 1);
 		break;
 	case 592: //10
 		if (d != a && d != b)
 		{
-			m_float_emit.INS(64, VD, 0, VA, 1);
+			m_float_emit.INS(size, VD, 0, VA, 1);
-			m_float_emit.INS(64, VD, 1, VB, 0);
+			m_float_emit.INS(size, VD, 1, VB, 0);
 		}
 		else
 		{
 			ARM64Reg V0 = fpr.GetReg();
-			m_float_emit.INS(64, V0, 0, VA, 1);
+			m_float_emit.INS(size, V0, 0, VA, 1);
-			m_float_emit.INS(64, V0, 1, VB, 0);
+			m_float_emit.INS(size, V0, 1, VB, 0);
-			m_float_emit.ORR(VD, V0, V0);
+			m_float_emit.ORR(reg_encoder(VD), reg_encoder(V0), reg_encoder(V0));
 			fpr.Unlock(V0);
 		}
 		break;
 	case 624: //11
-		m_float_emit.TRN2(64, VD, VA, VB);
+		m_float_emit.TRN2(size, VD, VA, VB);
 		break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
@ -73,13 +78,19 @@ void JitArm64::ps_mulsX(UGeckoInstruction inst)
 	bool upper = inst.SUBOP5 == 13;
-	ARM64Reg VA = fpr.R(a, REG_REG);
+	bool singles = fpr.IsSingle(a) && fpr.IsSingle(c);
-	ARM64Reg VC = fpr.R(c, REG_REG);
+	RegType type = singles ? REG_REG_SINGLE : REG_REG;
-	ARM64Reg VD = fpr.RW(d, REG_REG);
+	u8 size = singles ? 32 : 64;
 	ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
 	ARM64Reg VA = fpr.R(a, type);
 	ARM64Reg VC = fpr.R(c, type);
 	ARM64Reg VD = fpr.RW(d, type);
 	ARM64Reg V0 = fpr.GetReg();
-	m_float_emit.DUP(64, V0, VC, upper ? 1 : 0);
+	m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(VC), upper ? 1 : 0);
-	m_float_emit.FMUL(64, VD, VA, V0);
+	m_float_emit.FMUL(size, reg_encoder(VD), reg_encoder(VA), reg_encoder(V0));
 	fpr.FixSinglePrecision(d);
 	fpr.Unlock(V0);
 }
@ -94,41 +105,49 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
 	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
 	u32 op5 = inst.SUBOP5;
-	ARM64Reg VA = fpr.R(a, REG_REG);
+	bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
-	ARM64Reg VB = fpr.R(b, REG_REG);
+	RegType type = singles ? REG_REG_SINGLE : REG_REG;
-	ARM64Reg VC = fpr.R(c, REG_REG);
+	u8 size = singles ? 32 : 64;
-	ARM64Reg VD = fpr.RW(d, REG_REG);
+	ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
-	ARM64Reg V0 = fpr.GetReg();
+
 	ARM64Reg VA = reg_encoder(fpr.R(a, type));
 	ARM64Reg VB = reg_encoder(fpr.R(b, type));
 	ARM64Reg VC = reg_encoder(fpr.R(c, type));
 	ARM64Reg VD = reg_encoder(fpr.RW(d, type));
 	ARM64Reg V0Q = fpr.GetReg();
 	ARM64Reg V0 = reg_encoder(V0Q);
 	// TODO: Do FMUL and FADD/FSUB in *one* host call to save accuracy.
 	switch (op5)
 	{
 	case 14: // ps_madds0
-		m_float_emit.DUP(64, V0, VC, 0);
+		m_float_emit.DUP(size, V0, VC, 0);
-		m_float_emit.FMUL(64, V0, V0, VA);
+		m_float_emit.FMUL(size, V0, V0, VA);
-		m_float_emit.FADD(64, VD, V0, VB);
+		m_float_emit.FADD(size, VD, V0, VB);
 		break;
 	case 15: // ps_madds1
-		m_float_emit.DUP(64, V0, VC, 1);
+		m_float_emit.DUP(size, V0, VC, 1);
-		m_float_emit.FMUL(64, V0, V0, VA);
+		m_float_emit.FMUL(size, V0, V0, VA);
-		m_float_emit.FADD(64, VD, V0, VB);
+		m_float_emit.FADD(size, VD, V0, VB);
 		break;
 	case 28: // ps_msub
-		m_float_emit.FMUL(64, V0, VA, VC);
+		m_float_emit.FMUL(size, V0, VA, VC);
-		m_float_emit.FSUB(64, VD, V0, VB);
+		m_float_emit.FSUB(size, VD, V0, VB);
 		break;
 	case 29: // ps_madd
-		m_float_emit.FMUL(64, V0, VA, VC);
+		m_float_emit.FMUL(size, V0, VA, VC);
-		m_float_emit.FADD(64, VD, V0, VB);
+		m_float_emit.FADD(size, VD, V0, VB);
 		break;
 	case 30: // ps_nmsub
-		m_float_emit.FMUL(64, V0, VA, VC);
+		m_float_emit.FMUL(size, V0, VA, VC);
-		m_float_emit.FSUB(64, VD, V0, VB);
+		m_float_emit.FSUB(size, VD, V0, VB);
-		m_float_emit.FNEG(64, VD, VD);
+		m_float_emit.FNEG(size, VD, VD);
 		break;
 	case 31: // ps_nmadd
-		m_float_emit.FMUL(64, V0, VA, VC);
+		m_float_emit.FMUL(size, V0, VA, VC);
-		m_float_emit.FADD(64, VD, V0, VB);
+		m_float_emit.FADD(size, VD, V0, VB);
-		m_float_emit.FNEG(64, VD, VD);
+		m_float_emit.FNEG(size, VD, VD);
 		break;
 	default:
 		_assert_msg_(DYNA_REC, 0, "ps_madd - invalid op");
@ -136,7 +155,7 @@ void JitArm64::ps_maddXX(UGeckoInstruction inst)
 	}
 	fpr.FixSinglePrecision(d);
-	fpr.Unlock(V0);
+	fpr.Unlock(V0Q);
 }
 void JitArm64::ps_res(UGeckoInstruction inst)
@ -148,10 +167,16 @@ void JitArm64::ps_res(UGeckoInstruction inst)
 	u32 b = inst.FB, d = inst.FD;
-	ARM64Reg VB = fpr.R(b, REG_REG);
+	bool singles = fpr.IsSingle(b);
-	ARM64Reg VD = fpr.RW(d, REG_REG);
+	RegType type = singles ? REG_REG_SINGLE : REG_REG;
 	u8 size = singles ? 32 : 64;
 	ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
 	ARM64Reg VB = fpr.R(b, type);
 	ARM64Reg VD = fpr.RW(d, type);
 	m_float_emit.FRSQRTE(size, reg_encoder(VD), reg_encoder(VB));
 	m_float_emit.FRSQRTE(64, VD, VB);
 	fpr.FixSinglePrecision(d);
 }
@ -163,23 +188,29 @@ void JitArm64::ps_sel(UGeckoInstruction inst)
 	u32 a = inst.FA, b = inst.FB, c = inst.FC, d = inst.FD;
-	ARM64Reg VA = fpr.R(a, REG_REG);
+	bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
-	ARM64Reg VB = fpr.R(b, REG_REG);
+	RegType type = singles ? REG_REG_SINGLE : REG_REG;
-	ARM64Reg VC = fpr.R(c, REG_REG);
+	u8 size = singles ? 32 : 64;
-	ARM64Reg VD = fpr.RW(d, REG_REG);
+	ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
-	if (d != a && d != b && d != c)
+	ARM64Reg VA = reg_encoder(fpr.R(a, type));
 	ARM64Reg VB = reg_encoder(fpr.R(b, type));
 	ARM64Reg VC = reg_encoder(fpr.R(c, type));
 	ARM64Reg VD = reg_encoder(fpr.RW(d, type));
 	if (d != b && d != c)
 	{
-		m_float_emit.FCMGE(64, VD, VA);
+		m_float_emit.FCMGE(size, VD, VA);
 		m_float_emit.BSL(VD, VC, VB);
 	}
 	else
 	{
-		ARM64Reg V0 = fpr.GetReg();
+		ARM64Reg V0Q = fpr.GetReg();
-		m_float_emit.FCMGE(64, V0, VA);
+		ARM64Reg V0 = reg_encoder(V0Q);
 		m_float_emit.FCMGE(size, V0, VA);
 		m_float_emit.BSL(V0, VC, VB);
 		m_float_emit.ORR(VD, V0, V0);
-		fpr.Unlock(V0);
+		fpr.Unlock(V0Q);
 	}
 }
@ -194,23 +225,29 @@ void JitArm64::ps_sumX(UGeckoInstruction inst)
 	bool upper = inst.SUBOP5 == 11;
-	ARM64Reg VA = fpr.R(a, REG_REG);
+	bool singles = fpr.IsSingle(a) && fpr.IsSingle(b) && fpr.IsSingle(c);
-	ARM64Reg VB = fpr.R(b, REG_REG);
+	RegType type = singles ? REG_REG_SINGLE : REG_REG;
-	ARM64Reg VC = fpr.R(c, REG_REG);
+	u8 size = singles ? 32 : 64;
-	ARM64Reg VD = fpr.RW(d, REG_REG);
+	ARM64Reg (*reg_encoder)(ARM64Reg) = singles ? EncodeRegToDouble : EncodeRegToQuad;
 	ARM64Reg VA = fpr.R(a, type);
 	ARM64Reg VB = fpr.R(b, type);
 	ARM64Reg VC = fpr.R(c, type);
 	ARM64Reg VD = fpr.RW(d, type);
 	ARM64Reg V0 = fpr.GetReg();
-	m_float_emit.DUP(64, V0, upper ? VA : VB, upper ? 0 : 1);
+	m_float_emit.DUP(size, reg_encoder(V0), reg_encoder(upper ? VA : VB), upper ? 0 : 1);
 	if (d != c)
 	{
-		m_float_emit.FADD(64, VD, V0, upper ? VB : VA);
+		m_float_emit.FADD(size, reg_encoder(VD), reg_encoder(V0), reg_encoder(upper ? VB : VA));
-		m_float_emit.INS(64, VD, upper ? 0 : 1, VC, upper ? 0 : 1);
+		m_float_emit.INS(size, VD, upper ? 0 : 1, VC, upper ? 0 : 1);
 	}
 	else
 	{
-		m_float_emit.FADD(64, V0, V0, upper ? VB : VA);
+		m_float_emit.FADD(size, reg_encoder(V0), reg_encoder(V0), reg_encoder(upper ? VB : VA));
-		m_float_emit.INS(64, VD, upper ? 1 : 0, V0, upper ? 1 : 0);
+		m_float_emit.INS(size, VD, upper ? 1 : 0, V0, upper ? 1 : 0);
 	}
 	fpr.FixSinglePrecision(d);
 	fpr.Unlock(V0);
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp
@ -198,7 +198,7 @@ ARM64Reg Arm64GPRCache::R(u32 preg)
 	{
 		ARM64Reg host_reg = GetReg();
 		m_emit->MOVI2R(host_reg, reg.GetImm());
-		reg.LoadToReg(host_reg);
+		reg.Load(host_reg);
 		reg.SetDirty(true);
 		return host_reg;
 	}
@ -208,7 +208,7 @@ ARM64Reg Arm64GPRCache::R(u32 preg)
 		// This is a bit annoying. We try to keep these preloaded as much as possible
 		// This can also happen in cases where PPCAnalyst isn't feeding us proper register usage statistics
 		ARM64Reg host_reg = GetReg();
-		reg.LoadToReg(host_reg);
+		reg.Load(host_reg);
 		reg.SetDirty(false);
 		m_emit->LDR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[preg]));
 		return host_reg;
@ -240,7 +240,7 @@ void Arm64GPRCache::BindToRegister(u32 preg, bool do_load)
 	if (reg.GetType() == REG_NOTLOADED)
 	{
 		ARM64Reg host_reg = GetReg();
-		reg.LoadToReg(host_reg);
+		reg.Load(host_reg);
 		if (do_load)
 			m_emit->LDR(INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(gpr[preg]));
 	}
@ -307,12 +307,38 @@ ARM64Reg Arm64FPRCache::R(u32 preg, RegType type)
 	OpArg& reg = m_guest_registers[preg];
 	IncrementAllUsed();
 	reg.ResetLastUsed();
 	ARM64Reg host_reg = reg.GetReg();
 	switch (reg.GetType())
 	{
 	case REG_REG_SINGLE:
 	{
 		// We're asked for singles, so just return the register.
 		if (type == REG_REG_SINGLE || type == REG_LOWER_PAIR_SINGLE)
 			return host_reg;
 		// Else convert this register back to doubles.
 		m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
 		reg.Load(host_reg, REG_REG);
 		// fall through
 	}
 	case REG_REG: // already in a reg
-		return reg.GetReg();
+	{
-	break;
+		return host_reg;
 	}
 	case REG_LOWER_PAIR_SINGLE:
 	{
 		// We're asked for the lower single, so just return the register.
 		if (type == REG_LOWER_PAIR_SINGLE)
 			return host_reg;
 		// Else convert this register back to a double.
 		m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
 		reg.Load(host_reg, REG_LOWER_PAIR);
 		// fall through
 	}
 	case REG_LOWER_PAIR:
 	{
 		if (type == REG_REG)
@ -320,48 +346,62 @@ ARM64Reg Arm64FPRCache::R(u32 preg, RegType type)
 			// Load the high 64bits from the file and insert them in to the high 64bits of the host register
 			ARM64Reg tmp_reg = GetReg();
 			m_float_emit->LDR(64, INDEX_UNSIGNED, tmp_reg, X29, PPCSTATE_OFF(ps[preg][1]));
-			m_float_emit->INS(64, reg.GetReg(), 1, tmp_reg, 0);
+			m_float_emit->INS(64, host_reg, 1, tmp_reg, 0);
 			UnlockRegister(tmp_reg);
 			// Change it over to a full 128bit register
-			reg.LoadToReg(reg.GetReg());
+			reg.Load(host_reg, REG_REG);
 		}
-		return reg.GetReg();
+		return host_reg;
 	}
 	case REG_DUP_SINGLE:
 	{
 		if (type == REG_LOWER_PAIR_SINGLE)
 			return host_reg;
 		if (type == REG_REG_SINGLE)
 		{
 			// Duplicate to the top and change over
 			m_float_emit->INS(32, host_reg, 1, host_reg, 0);
 			reg.Load(host_reg, REG_REG_SINGLE);
 			return host_reg;
 		}
 		m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
 		reg.Load(host_reg, REG_DUP);
 		// fall through
 	}
 	break;
 	case REG_DUP:
 	{
 		ARM64Reg host_reg = reg.GetReg();
 		if (type == REG_REG)
 		{
 			// We are requesting a full 128bit register
 			// but we are only available in the lower 64bits
 			// Duplicate to the top and change over
 			m_float_emit->INS(64, host_reg, 1, host_reg, 0);
-			reg.LoadToReg(host_reg);
+			reg.Load(host_reg, REG_REG);
 		}
 		return host_reg;
 	}
 	break;
 	case REG_NOTLOADED: // Register isn't loaded at /all/
 	{
-		ARM64Reg host_reg = GetReg();
+		host_reg = GetReg();
 		u32 load_size;
 		if (type == REG_REG)
 		{
 			load_size = 128;
-			reg.LoadToReg(host_reg);
+			reg.Load(host_reg, REG_REG);
 		}
 		else
 		{
 			load_size = 64;
-			reg.LoadLowerReg(host_reg);
+			reg.Load(host_reg, REG_LOWER_PAIR);
 		}
 		reg.SetDirty(false);
 		m_float_emit->LDR(load_size, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0]));
 		return host_reg;
 	}
 	break;
 	default:
 		_dbg_assert_msg_(DYNA_REC, false, "Invalid OpArg Type!");
 	break;
@ -380,90 +420,52 @@ ARM64Reg Arm64FPRCache::RW(u32 preg, RegType type)
 	reg.ResetLastUsed();
 	reg.SetDirty(true);
-	switch (reg.GetType())
+
 	// If not loaded at all, just alloc a new one.
 	if (reg.GetType() == REG_NOTLOADED)
 	{
-	case REG_NOTLOADED:
+		reg.Load(GetReg(), type);
-	{
+		return reg.GetReg();
 		ARM64Reg host_reg = GetReg();
 		if (type == REG_LOWER_PAIR)
 		{
 			reg.LoadLowerReg(host_reg);
 		}
 		else if (type == REG_DUP)
 		{
 			reg.LoadDup(host_reg);
 		}
 		else
 		{
 			reg.LoadToReg(host_reg);
 		}
 	}
-	break;
+
-	case REG_LOWER_PAIR:
+	// Only the lower value will be overwritten, so we must be extra careful to store PSR1 if dirty.
 	if ((type == REG_LOWER_PAIR || type == REG_LOWER_PAIR_SINGLE) && was_dirty)
 	{
 		// We must *not* change host_reg as this register might still be in use. So it's fine to
 		// store this register, but it's *not* fine to convert it to double. So for double conversion,
 		// a temporary register needs to be used.
 		ARM64Reg host_reg = reg.GetReg();
-		if (type == REG_REG)
+		ARM64Reg flush_reg = host_reg;
 		switch (reg.GetType())
 		{
-			// Change it over to a full 128bit register
+		case REG_REG_SINGLE:
-			reg.LoadToReg(host_reg);
+			flush_reg = GetReg();
-		}
+			m_float_emit->FCVTL(64, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg));
-		else if (type == REG_DUP)
+			// fall through
-		{
+		case REG_REG:
 			// Register is already the lower pair
 			// Just convert it over to a dup
 			reg.LoadDup(host_reg);
 		}
 	}
 	break;
 	case REG_REG:
 	{
 		ARM64Reg host_reg = reg.GetReg();
 		if (type == REG_LOWER_PAIR)
 		{
 			// If we only want the lower bits, let's store away the high bits and drop to a lower only register
 			// We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit store.
 			// It would take longer to do an insert to a temporary and a 64bit store than to just do this.
-			if (was_dirty)
+			m_float_emit->STR(128, INDEX_UNSIGNED, flush_reg, X29, PPCSTATE_OFF(ps[preg][0]));
-				m_float_emit->STR(128, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0]));
+			break;
-			reg.LoadLowerReg(host_reg);
+		case REG_DUP_SINGLE:
 			flush_reg = GetReg();
 			m_float_emit->FCVT(64, 32, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg));
 			// fall through
 		case REG_DUP:
 			// Store PSR1 (which is equal to PSR0) in memory.
 			m_float_emit->STR(64, INDEX_UNSIGNED, flush_reg, X29, PPCSTATE_OFF(ps[preg][1]));
 			break;
 		default:
 			// None of the other types store anything in PSR1.
 			break;
 		}
 		else if (type == REG_DUP)
 		{
 			// If we are going from a full 128bit register to a duplicate
 			// then we can just change over
 			reg.LoadDup(host_reg);
 		}
 	}
 	break;
 	case REG_DUP:
 	{
 		ARM64Reg host_reg = reg.GetReg();
 		if (type == REG_REG)
 		{
 			// We are a duplicated register going to a full 128bit register
 			// Do an insert of our lower 64bits to the higher 64bits
 			m_float_emit->INS(64, host_reg, 1, host_reg, 0);
-			// Change over to the full 128bit register
+		if (host_reg != flush_reg)
-			reg.LoadToReg(host_reg);
+			Unlock(flush_reg);
 		}
 		else if (type == REG_LOWER_PAIR)
 		{
 			// We are duplicated changing over to a lower register
 			// We've got to be careful in this instance and do a store of our lower 64bits
 			// to the upper 64bits in the PowerPC state
 			// That way, in case we hit the path DUP->LOWER->REG, we get the correct bits back
 			if (was_dirty)
 				m_float_emit->STR(64, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][1]));
 			reg.LoadLowerReg(host_reg);
 		}
 	}
 	break;
 	default:
 		// Do nothing
 	break;
 	}
 	reg.Load(reg.GetReg(), type);
 	return reg.GetReg();
 }
@ -510,17 +512,37 @@ bool Arm64FPRCache::IsCalleeSaved(ARM64Reg reg)
 void Arm64FPRCache::FlushRegister(u32 preg, bool maintain_state)
 {
 	OpArg& reg = m_guest_registers[preg];
-	if (reg.GetType() == REG_REG ||
+	ARM64Reg host_reg = reg.GetReg();
-	    reg.GetType() == REG_LOWER_PAIR)
+	RegType type = reg.GetType();
 	bool dirty = reg.IsDirty();
 	// If we're in single mode, just convert it back to a double.
 	if (type == REG_REG_SINGLE)
 	{
 		if (dirty)
 			m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
 		type = REG_REG;
 	}
 	if (type == REG_DUP_SINGLE || type == REG_LOWER_PAIR_SINGLE)
 	{
 		if (dirty)
 			m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
 		if (type == REG_DUP_SINGLE)
 			type = REG_DUP;
 		else
 			type = REG_LOWER_PAIR;
 	}
 	if (type == REG_REG || type == REG_LOWER_PAIR)
 	{
 		ARM64Reg host_reg = reg.GetReg();
 		u32 store_size;
-		if (reg.GetType() == REG_REG)
+		if (type == REG_REG)
 			store_size = 128;
 		else
 			store_size = 64;
-		if (reg.IsDirty())
+		if (dirty)
 			m_float_emit->STR(store_size, INDEX_UNSIGNED, host_reg, X29, PPCSTATE_OFF(ps[preg][0]));
 		if (!maintain_state)
@ -529,10 +551,9 @@ void Arm64FPRCache::FlushRegister(u32 preg, bool maintain_state)
 			reg.Flush();
 		}
 	}
-	else if (reg.GetType() == REG_DUP)
+	else if (type == REG_DUP)
 	{
-		ARM64Reg host_reg = reg.GetReg();
+		if (dirty)
 		if (reg.IsDirty())
 		{
 			// If the paired registers were at the start of ppcState we could do an STP here.
 			// Too bad moving them would break savestate compatibility between x86_64 and AArch64
@ -564,18 +585,25 @@ BitSet32 Arm64FPRCache::GetCallerSavedUsed()
 	return registers;
 }
 // Reports whether guest register `preg` is currently cached with one of the
 // single-precision register types.
 // REG_REG_SINGLE and REG_DUP_SINGLE always count as single; REG_LOWER_PAIR_SINGLE
 // counts only when `lower_only` is true.
 bool Arm64FPRCache::IsSingle(u32 preg, bool lower_only)
 {
 	RegType type = m_guest_registers[preg].GetType();
 	return type == REG_REG_SINGLE || type == REG_DUP_SINGLE || (lower_only && type == REG_LOWER_PAIR_SINGLE);
 }
 void Arm64FPRCache::FixSinglePrecision(u32 preg)
 {
-	ARM64Reg host_reg = m_guest_registers[preg].GetReg();
+	OpArg& reg = m_guest_registers[preg];
-	switch (m_guest_registers[preg].GetType())
+	ARM64Reg host_reg = reg.GetReg();
 	switch (reg.GetType())
 	{
 	case REG_DUP: // only PS0 needs to be converted
 		m_float_emit->FCVT(32, 64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
-		m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
+		reg.Load(host_reg, REG_DUP_SINGLE);
 		break;
 	case REG_REG: // PS0 and PS1 needs to be converted
 		m_float_emit->FCVTN(32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
-		m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
+		reg.Load(host_reg, REG_REG_SINGLE);
 		break;
 	default:
 		break;
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.h
@ -22,7 +22,9 @@ enum RegType
 	REG_IMM, // Reg is really a IMM
 	REG_LOWER_PAIR, // Only the lower pair of a paired register
 	REG_DUP, // The lower reg is the same as the upper one (physical upper doesn't actually have the duplicated value)
-	REG_IS_LOADED, // We don't care what type it is, as long as the lower 64bits are loaded
+	REG_REG_SINGLE, // Both registers are loaded as single
 	REG_LOWER_PAIR_SINGLE, // Only the lower pair of a paired register, as single
 	REG_DUP_SINGLE, // The lower one contains both registers, as single
 };
 enum FlushMode
@ -56,19 +58,9 @@ public:
 	{
 		return m_value;
 	}
-	void LoadToReg(ARM64Reg reg)
+	void Load(ARM64Reg reg, RegType type = REG_REG)
 	{
 		// Record the register type this entry now holds (REG_REG unless the caller
 		// passes another RegType) together with the register it is bound to.
-		m_type = REG_REG;
+		m_type = type;
 		m_reg = reg;
 	}
 	// Marks this entry as REG_LOWER_PAIR (only the lower pair of a paired register)
 	// and records `reg` as the register it is bound to.
 	void LoadLowerReg(ARM64Reg reg)
 	{
 		m_type = REG_LOWER_PAIR;
 		m_reg = reg;
 	}
 	// Marks this entry as REG_DUP (the lower value stands in for the upper one as well)
 	// and records `reg` as the register it is bound to.
 	void LoadDup(ARM64Reg reg)
 	{
 		m_type = REG_DUP;
 		m_reg = reg;
 	}
 	void LoadToImm(u32 imm)
@ -278,6 +270,8 @@ public:
 	BitSet32 GetCallerSavedUsed() override;
 	bool IsSingle(u32 preg, bool lower_only = false);
 	void FixSinglePrecision(u32 preg);
 protected:
--- a/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
+++ b/Source/Core/Core/PowerPC/JitArmCommon/BackPatch.h
@ -9,22 +9,24 @@ struct BackPatchInfo
 {
 	// Bit flags that are tested with `flags & BackPatchInfo::FLAG_*`.
 	// Every FLAG_* value other than FLAG_MASK_FLOAT is a single distinct bit, so flags
 	// can be OR-ed together; FLAG_MASK_FLOAT is the union of the FLAG_SIZE_F* flags.
 	// This change inserts FLAG_SIZE_F32X2I at bit 7 and shifts every later flag up by one bit.
 	enum
 	{
-		FLAG_STORE      = (1 << 0),
+		FLAG_STORE       = (1 << 0),
-		FLAG_LOAD       = (1 << 1),
+		FLAG_LOAD        = (1 << 1),
-		FLAG_SIZE_8     = (1 << 2),
+		FLAG_SIZE_8      = (1 << 2),
-		FLAG_SIZE_16    = (1 << 3),
+		FLAG_SIZE_16     = (1 << 3),
-		FLAG_SIZE_32    = (1 << 4),
+		FLAG_SIZE_32     = (1 << 4),
-		FLAG_SIZE_F32   = (1 << 5),
+		FLAG_SIZE_F32    = (1 << 5),
-		FLAG_SIZE_F32X2 = (1 << 6),
+		FLAG_SIZE_F32X2  = (1 << 6),
-		FLAG_SIZE_F64   = (1 << 7),
+		FLAG_SIZE_F32X2I = (1 << 7),
-		FLAG_REVERSE    = (1 << 8),
+		FLAG_SIZE_F64    = (1 << 8),
-		FLAG_EXTEND     = (1 << 9),
+		FLAG_REVERSE     = (1 << 9),
-		FLAG_SIZE_F32I  = (1 << 10),
+		FLAG_EXTEND      = (1 << 10),
-		FLAG_ZERO_256   = (1 << 11),
+		FLAG_SIZE_F32I   = (1 << 11),
-		FLAG_MASK_FLOAT = FLAG_SIZE_F32 |
+		FLAG_ZERO_256    = (1 << 12),
-		                  FLAG_SIZE_F32X2 |
+		FLAG_MASK_FLOAT  = FLAG_SIZE_F32 |
-		                  FLAG_SIZE_F64 |
+		                   FLAG_SIZE_F32X2 |
-		                  FLAG_SIZE_F32I,
+		                   FLAG_SIZE_F32X2I |
 		                   FLAG_SIZE_F64 |
 		                   FLAG_SIZE_F32I,
 	};
 	static u32 GetFlagSize(u32 flags)