From c25c4a6e20fb2c776f112cac5bcd4351b0a73703 Mon Sep 17 00:00:00 2001
From: Tillmann Karras <tilkax@gmail.com>
Date: Sat, 25 Jan 2014 18:38:06 +0100
Subject: [PATCH 1/8] x64: add support for some x87 instructions

---
 Source/Core/Common/x64Emitter.cpp | 21 +++++++++++++++++++--
 Source/Core/Common/x64Emitter.h | 12 ++++++++++++
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 76e235ebcd..09f9097d7f 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -204,7 +204,7 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 {
 // Oh, RIP addressing.
 _offsetOrBaseReg = 5;
- emit->WriteModRM(0, _operandReg&7, 5);
+ emit->WriteModRM(0, _operandReg, _offsetOrBaseReg);
 //TODO : add some checks
 #ifdef _M_X64
 u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes;
@@ -328,7 +328,6 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 }
 }
-

 // W = operand extended width (1 if 64-bit)
 // R = register# upper bit
 // X = scale amnt upper bit
@@ -1510,6 +1509,24 @@ void XEmitter::FWAIT()
 {
 Write8(0x9B);
 }
+// TODO: make this more generic
+void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, OpArg arg)
+{
+ int mf = 0;
+ switch (bits) {
+ case 32: mf = 0; break;
+ case 64: mf = 2; break;
+ default: _assert_msg_(DYNA_REC, 0, "WriteFloatLoadStore: bits is not 32 or 64");
+ }
+ Write8(0xd9 | (mf << 1));
+ // x87 instructions use the reg field of the ModR/M byte as opcode:
+ arg.WriteRest(this, 0, (X64Reg) op);
+}
+
+void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, src);}
+void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, dest);}
+void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, dest);}
+
 void XEmitter::RTDSC() { Write8(0x0F); Write8(0x31); }

 // helper routines for setting pointers
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index bdf82442b4..fee21a24db 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -97,6 +97,12 @@ enum NormalOp {
 nrmXCHG,
 };

+enum FloatOp {
+ floatLD = 0,
+ floatST = 2,
+ floatSTP = 3,
+};
+
 class XEmitter;

 // RIP addressing does not benefit from micro op fusion on Core arch
@@ -115,6 +121,7 @@ struct OpArg
 void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
 void WriteVex(XEmitter* emit, int size, int packed, Gen::X64Reg regOp1, X64Reg regOp2) const;
 void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF, bool warn_64bit_offset = true) const;
+ void WriteFloatModRM(XEmitter *emit, FloatOp op);
 void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
 // This one is public - must be written to
 u64 offset; // use RIP-relative as much as possible - 64-bit immediates are not available.
@@ -244,6 +251,7 @@ private:
 void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
 void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
 void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+ void WriteFloatLoadStore(int bits, FloatOp op, OpArg arg);
 void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);

 protected:
@@ -424,6 +432,10 @@ public:
 void REP();
 void REPNE();

+ // x87
+ void FLD(int bits, OpArg src);
+ void FST(int bits, OpArg dest);
+ void FSTP(int bits, OpArg dest);
 void FWAIT();

 // SSE/SSE2: Floating point arithmetic

From db196d8c5beb12c36bf4ee22ea2886df8be123dd Mon Sep 17 00:00:00 2001
From: Tillmann Karras <tilkax@gmail.com>
Date: Mon, 3 Feb 2014 23:56:11 +0100
Subject: [PATCH 2/8] Jit64[IL]: fix float conversions

Floating-point is complicated...

Some background: Denormals are floats that are too close to zero to be
stored in a normalized way (their exponent would need more bits). Since
they are stored unnormalized, they are hard to work with, even in
hardware. That's why both PowerPC and SSE can be configured to operate
in faster but non-standard-compliant modes in which these numbers are
simply rounded ('flushed') to zero.

Internally, we do the same as the PowerPC CPU and store all floats in
double format. This means that for loading and storing singles we need
a conversion. The PowerPC CPU does this in hardware. We previously did
this using CVTSS2SD/CVTSD2SS. Unfortunately, these instructions are
considered arithmetic and therefore flush denormals to zero if non-IEEE
mode is active. This normally wouldn't be a problem, since the next
arithmetic floating-point instruction would do the same anyway, but as
it turns out some games actually use floating-point instructions for
copying arbitrary data.

My idea for fixing this problem was to use x87 instructions, since the
x87 FPU never supported flush-to-zero and thus doesn't mangle denormals.
However, there is one more problem to deal with: SNaNs are automatically
converted to QNaNs (by setting the most-significant bit of the
fraction). I opted to fix this by manually resetting the QNaN bit of all
values with an all-1s exponent.
---
 Source/Core/Common/x64Emitter.cpp | 17 +++-
 Source/Core/Common/x64Emitter.h | 21 ++++
 Source/Core/Common/x64FPURoundMode.cpp | 6 +-
 .../Core/Core/PowerPC/Jit64/JitRegCache.cpp | 2 +-
 .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 46 ++++-----
 Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp | 19 ++--
 .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 95 +++++++++++++++++++
 Source/Core/Core/PowerPC/JitCommon/Jit_Util.h | 4 +
 8 files changed, 173 insertions(+), 37 deletions(-)

diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 09f9097d7f..12985e8a0c 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -1437,7 +1437,19 @@ void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {
 Write8(0x0f);
 Write8(0x38);
 Write8(0x00);
- arg.WriteRest(this, 0);
+ arg.WriteRest(this);
+}
+
+void XEmitter::PTEST(X64Reg dest, OpArg arg) {
+ if (!cpu_info.bSSE4_1) {
+ PanicAlert("Trying to use PTEST on a system that doesn't support it. 
Nobody hears your screams."); + } + Write8(0x66); + Write8(0x0f); + Write8(0x38); + Write8(0x17); + arg.operandReg = dest; + arg.WriteRest(this); } void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);} @@ -1497,6 +1509,8 @@ void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseMUL, false, regOp1, regOp2, arg);} void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseDIV, false, regOp1, regOp2, arg);} void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseSQRT, false, regOp1, regOp2, arg);} +void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseAND, false, regOp1, regOp2, arg);} +void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseANDN, false, regOp1, regOp2, arg);} // Prefixes @@ -1526,6 +1540,7 @@ void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, OpArg arg) void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, src);} void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, dest);} void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, dest);} +void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); } void XEmitter::RTDSC() { Write8(0x0F); Write8(0x31); } diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index fee21a24db..19edf3b822 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -433,9 +433,27 @@ public: void REPNE(); // x87 + enum x87StatusWordBits { + x87_InvalidOperation = 0x1, + x87_DenormalizedOperand = 0x2, + x87_DivisionByZero = 0x4, + x87_Overflow = 0x8, + x87_Underflow = 0x10, + x87_Precision = 0x20, + x87_StackFault = 0x40, + x87_ErrorSummary = 0x80, + x87_C0 = 0x100, + x87_C1 = 0x200, + x87_C2 = 0x400, + x87_TopOfStack = 0x2000 | 0x1000 | 0x800, + x87_C3 = 0x4000, + x87_FPUBusy = 0x8000, + }; + void FLD(int bits, OpArg src); void FST(int bits, OpArg dest); void FSTP(int bits, OpArg dest); + void FNSTSW_AX(); void FWAIT(); // SSE/SSE2: Floating point arithmetic @@ -562,6 +580,7 @@ public: void PUNPCKLWD(X64Reg dest, const OpArg &arg); void PUNPCKLDQ(X64Reg dest, const OpArg &arg); + void PTEST(X64Reg dest, OpArg arg); void PAND(X64Reg dest, OpArg arg); void PANDN(X64Reg dest, OpArg arg); void PXOR(X64Reg dest, OpArg arg); @@ -631,6 +650,8 @@ public: void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg); void RTDSC(); diff --git a/Source/Core/Common/x64FPURoundMode.cpp b/Source/Core/Common/x64FPURoundMode.cpp index 34438d12b2..a336859143 100644 --- a/Source/Core/Common/x64FPURoundMode.cpp +++ b/Source/Core/Common/x64FPURoundMode.cpp @@ -16,11 +16,11 @@ static const unsigned short FPU_ROUND_MASK = 3 << 10; #endif // OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register) -const u32 EXCEPTION_MASK = 0x1F80; +static const u32 EXCEPTION_MASK = 0x1F80; // Denormals-Are-Zero (non-IEEE mode: denormal inputs are set to +/- 0) -const u32 DAZ = 0x40; +static const u32 DAZ = 0x40; // Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0) -const u32 FTZ = 0x8000; +static const u32 FTZ = 0x8000; namespace FPURoundMode { diff --git 
a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index 1be04a0d5d..c4b063e1a6 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -378,7 +378,7 @@ void RegCache::Flush(FlushMode mode) { if (locks[i]) { - PanicAlert("Someone forgot to unlock PPC reg %i.", i); + PanicAlert("Someone forgot to unlock PPC reg %i (X64 reg %i).", i, RX(i)); } if (regs[i].away) { diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index bc056e6bd1..0aac678151 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -12,6 +12,8 @@ #include "JitAsm.h" #include "JitRegCache.h" +namespace { + // pshufb todo: MOVQ const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const u8 GC_ALIGNED16(bswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15}; @@ -19,11 +21,10 @@ const u8 GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10, const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0}; const u8 GC_ALIGNED16(bswapShuffle2x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; -namespace { - u64 GC_ALIGNED16(temp64); -u32 GC_ALIGNED16(temp32); + } + // TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common, // and pshufb could help a lot. // Also add hacks for things like lfs/stfs the same reg consecutively, that is, simple memory moves. @@ -46,11 +47,9 @@ void Jit64::lfs(UGeckoInstruction inst) MEMCHECK_START - MOV(32, M(&temp32), R(EAX)); fpr.Lock(d); fpr.BindToRegister(d, false); - CVTSS2SD(fpr.RX(d), M(&temp32)); - MOVDDUP(fpr.RX(d), fpr.R(d)); + ConvertSingleToDouble(fpr.RX(d), EAX, true); MEMCHECK_END @@ -235,13 +234,15 @@ void Jit64::stfs(UGeckoInstruction inst) return; } + fpr.BindToRegister(s, true, false); + ConvertDoubleToSingle(XMM0, fpr.RX(s)); + if (gpr.R(a).IsImm()) { u32 addr = (u32)(gpr.R(a).offset + offset); if (Memory::IsRAMAddress(addr)) { if (cpu_info.bSSSE3) { - CVTSD2SS(XMM0, fpr.R(s)); PSHUFB(XMM0, M((void *)bswapShuffle1x4)); WriteFloatToConstRamAddress(XMM0, addr); return; @@ -250,7 +251,6 @@ void Jit64::stfs(UGeckoInstruction inst) else if (addr == 0xCC008000) { // Float directly to write gather pipe! Fun! - CVTSD2SS(XMM0, fpr.R(s)); CALL((void*)asm_routines.fifoDirectWriteFloat); // TODO js.fifoBytesThisBlock += 4; @@ -260,7 +260,6 @@ void Jit64::stfs(UGeckoInstruction inst) gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2); gpr.Lock(a); - fpr.Lock(s); MOV(32, R(ABI_PARAM2), gpr.R(a)); ADD(32, R(ABI_PARAM2), Imm32(offset)); if (update && offset) @@ -275,7 +274,6 @@ void Jit64::stfs(UGeckoInstruction inst) MEMCHECK_END } - CVTSD2SS(XMM0, fpr.R(s)); SafeWriteFloatToReg(XMM0, ABI_PARAM2, RegistersInUse()); gpr.UnlockAll(); gpr.UnlockAllX(); @@ -290,11 +288,14 @@ void Jit64::stfsx(UGeckoInstruction inst) // We can take a shortcut here - it's not likely that a hardware access would use this instruction. 
gpr.FlushLockX(ABI_PARAM1); - fpr.Lock(inst.RS); MOV(32, R(ABI_PARAM1), gpr.R(inst.RB)); if (inst.RA) ADD(32, R(ABI_PARAM1), gpr.R(inst.RA)); - CVTSD2SS(XMM0, fpr.R(inst.RS)); + + int s = inst.RS; + fpr.Lock(s); + fpr.BindToRegister(s, true, false); + ConvertDoubleToSingle(XMM0, fpr.RX(s)); MOVD_xmm(R(EAX), XMM0); SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse()); @@ -313,21 +314,20 @@ void Jit64::lfsx(UGeckoInstruction inst) { ADD(32, R(EAX), gpr.R(inst.RA)); } + fpr.Lock(inst.RS); + fpr.BindToRegister(inst.RS, false); + X64Reg s = fpr.RX(inst.RS); if (cpu_info.bSSSE3 && !js.memcheck) { - fpr.Lock(inst.RS); - fpr.BindToRegister(inst.RS, false, true); - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); #ifdef _M_IX86 AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); - MOVD_xmm(r, MDisp(EAX, (u32)Memory::base)); + MOVD_xmm(XMM0, MDisp(EAX, (u32)Memory::base)); #else - MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0)); + MOVD_xmm(XMM0, MComplex(RBX, EAX, SCALE_1, 0)); #endif MEMCHECK_START - PSHUFB(r, M((void *)bswapShuffle1x4)); - CVTSS2SD(r, R(r)); - MOVDDUP(r, R(r)); + PSHUFB(XMM0, M((void *)bswapShuffle1x4)); + ConvertSingleToDouble(s, XMM0); MEMCHECK_END } else { @@ -335,11 +335,7 @@ void Jit64::lfsx(UGeckoInstruction inst) MEMCHECK_START - MOV(32, M(&temp32), R(EAX)); - CVTSS2SD(XMM0, M(&temp32)); - fpr.Lock(inst.RS); - fpr.BindToRegister(inst.RS, false, true); - MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0)); + ConvertSingleToDouble(s, EAX, true); MEMCHECK_END } diff --git a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp index 5c9d2075a8..be87a77890 100644 --- a/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp +++ b/Source/Core/Core/PowerPC/Jit64IL/IR_X86.cpp @@ -1288,10 +1288,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit) { } case DupSingleToMReg: { if (!thisUsed) break; - X64Reg reg = fregURegWithoutMov(RI, I); - Jit->CVTSS2SD(reg, fregLocForInst(RI, getOp1(I))); - Jit->MOVDDUP(reg, R(reg)); - RI.fregs[reg] = I; + + X64Reg input = fregEnsureInReg(RI, getOp1(I)); + X64Reg output = fregURegWithoutMov(RI, I); + Jit->ConvertSingleToDouble(output, input); + + RI.fregs[output] = I; fregNormalRegClear(RI, I); break; } @@ -1412,9 +1414,12 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit) { } case DoubleToSingle: { if (!thisUsed) break; - X64Reg reg = fregURegWithoutMov(RI, I); - Jit->CVTSD2SS(reg, fregLocForInst(RI, getOp1(I))); - RI.fregs[reg] = I; + + X64Reg input = fregEnsureInReg(RI, getOp1(I)); + X64Reg output = fregURegWithoutMov(RI, I); + Jit->ConvertDoubleToSingle(output, input); + + RI.fregs[output] = I; fregNormalRegClear(RI, I); break; } diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 49a83e1831..814dbd3cf1 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -416,6 +416,101 @@ void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm) { } } +static u32 GC_ALIGNED16(temp32); +static u64 GC_ALIGNED16(temp64); +#ifdef _WIN32 +#include <intrin.h> +#ifdef _M_X64 +static const __m128i GC_ALIGNED16(single_qnan_bit) = _mm_set_epi64x(0, 0x0000000000400000); +static const __m128i GC_ALIGNED16(single_exponent) = _mm_set_epi64x(0, 0x000000007f800000); +static const __m128i GC_ALIGNED16(double_qnan_bit) = _mm_set_epi64x(0, 0x0008000000000000); +static const __m128i GC_ALIGNED16(double_exponent) = _mm_set_epi64x(0, 0x7ff0000000000000); +#else +static const __m128i GC_ALIGNED16(single_qnan_bit) = _mm_set_epi32(0, 
0, 0x00000000, 0x00400000); +static const __m128i GC_ALIGNED16(single_exponent) = _mm_set_epi32(0, 0, 0x00000000, 0x7f800000); +static const __m128i GC_ALIGNED16(double_qnan_bit) = _mm_set_epi32(0, 0, 0x00080000, 0x00000000); +static const __m128i GC_ALIGNED16(double_exponent) = _mm_set_epi32(0, 0, 0x7ff00000, 0x00000000); +#endif +#else +static const __uint128_t GC_ALIGNED16(single_qnan_bit) = 0x0000000000400000; +static const __uint128_t GC_ALIGNED16(single_exponent) = 0x000000007f800000; +static const __uint128_t GC_ALIGNED16(double_qnan_bit) = 0x0008000000000000; +static const __uint128_t GC_ALIGNED16(double_exponent) = 0x7ff0000000000000; +#endif + +// Since the following two functions are used in non-arithmetic PPC float instructions, +// they must convert floats bitexact and never flush denormals to zero or turn SNaNs into QNaNs. +// This means we can't use CVTSS2SD/CVTSD2SS :( +// The x87 FPU doesn't even support flush-to-zero so we can use FLD+FSTP even on denormals. +// If the number is a NaN, make sure to set the QNaN bit back to its original value. + +void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr) +{ + if (src_is_gpr) { + MOV(32, M(&temp32), R(src)); + MOVD_xmm(XMM1, R(src)); + } else { + MOVSS(M(&temp32), src); + MOVSS(R(XMM1), src); + } + FLD(32, M(&temp32)); + CCFlags cond; + if (cpu_info.bSSE4_1) { + PTEST(XMM1, M((void *)&single_exponent)); + cond = CC_NC; + } else { + FNSTSW_AX(); + TEST(16, R(AX), Imm16(x87_InvalidOperation)); + cond = CC_Z; + } + FSTP(64, M(&temp64)); + MOVSD(dst, M(&temp64)); + FixupBranch dont_reset_qnan_bit = J_CC(cond); + + PANDN(XMM1, M((void *)&single_qnan_bit)); + PSLLQ(XMM1, 29); + if (cpu_info.bAVX) { + VPANDN(dst, XMM1, R(dst)); + } else { + PANDN(XMM1, R(dst)); + MOVSD(dst, R(XMM1)); + } + + SetJumpTarget(dont_reset_qnan_bit); + MOVDDUP(dst, R(dst)); +} + +void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src) +{ + MOVSD(M(&temp64), src); + MOVSD(XMM1, R(src)); + FLD(64, M(&temp64)); + CCFlags cond; + if (cpu_info.bSSE4_1) { + PTEST(XMM1, M((void *)&double_exponent)); + cond = CC_NC; + } else { + FNSTSW_AX(); + TEST(16, R(AX), Imm16(x87_InvalidOperation)); + cond = CC_Z; + } + FSTP(32, M(&temp32)); + MOVSS(XMM0, M(&temp32)); + FixupBranch dont_reset_qnan_bit = J_CC(cond); + + PANDN(XMM1, M((void *)&double_qnan_bit)); + PSRLQ(XMM1, 29); + if (cpu_info.bAVX) { + VPANDN(XMM0, XMM1, R(XMM0)); + } else { + PANDN(XMM1, R(XMM0)); + MOVSS(XMM0, R(XMM1)); + } + + SetJumpTarget(dont_reset_qnan_bit); + MOVDDUP(dst, R(XMM0)); +} + void EmuCodeBlock::JitClearCA() { AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h index 278b9d7352..bd7af7e19d 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h @@ -47,6 +47,10 @@ public: void ForceSinglePrecisionS(Gen::X64Reg xmm); void ForceSinglePrecisionP(Gen::X64Reg xmm); + + // AX might get trashed + void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false); + void ConvertDoubleToSingle(Gen::X64Reg dst, Gen::X64Reg src); protected: std::unordered_map<u8 *, u32> registersInUseAtLoc; }; From f6897039c7a4c71aedf30ad9bc9085c3197839c0 Mon Sep 17 00:00:00 2001 From: Tillmann Karras <tilkax@gmail.com> Date: Mon, 3 Feb 2014 23:48:31 +0100 Subject: [PATCH 3/8] Interpreter: fix float conversions Can't use simple casting, otherwise we get the same problems as in 
Jit64. --- .../PowerPC/Interpreter/Interpreter_FPUtils.h | 30 +++++++++++++++++++ .../Interpreter/Interpreter_LoadStore.cpp | 27 ++++++++--------- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h index 2c1b0f7076..66ce49a8b0 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h @@ -231,3 +231,33 @@ inline u32 ConvertToSingleFTZ(u64 x) return (x >> 32) & 0x80000000; } } + +inline u64 ConvertToDouble(u32 _x) +{ + u64 x = _x; + u64 exp = (x >> 23) & 0xff; + u64 frac = x & 0x007fffff; + if (exp || frac == 0) + { + // not denormalized + u64 y = exp & 0x80; + u64 z = y << 54 | y << 53 | y << 52; + if (exp > 0 && exp < 255) + { + // not inf/nan/zero + z = ~z; + } + return ((x & 0xc0000000) << 32) | z | ((x & 0x3fffffff) << 29); + } + else + { + // denormalized + exp = 1023 - 126; + do + { + frac <<= 1; + exp -= 1; + } while ((frac & 0x00800000) == 0); + return ((x & 0x80000000) << 32) | (exp << 52) | ((frac & 0x007fffff) << 29); + } +} diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp index 3fb441f5b9..0356bd4247 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp @@ -93,9 +93,9 @@ void Interpreter::lfs(UGeckoInstruction _inst) u32 uTemp = Memory::Read_U32(Helper_Get_EA(_inst)); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { - double value = *(float*)&uTemp; - rPS0(_inst.FD) = value; - rPS1(_inst.FD) = value; + u64 value = ConvertToDouble(uTemp); + riPS0(_inst.FD) = value; + riPS1(_inst.FD) = value; } } @@ -105,9 +105,9 @@ void Interpreter::lfsu(UGeckoInstruction _inst) u32 uTemp = Memory::Read_U32(uAddress); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { - double value = *(float*)&uTemp; - rPS0(_inst.FD) = value; - rPS1(_inst.FD) = value; + u64 value = ConvertToDouble(uTemp); + riPS0(_inst.FD) = value; + riPS1(_inst.FD) = value; m_GPR[_inst.RA] = uAddress; } @@ -119,9 +119,9 @@ void Interpreter::lfsux(UGeckoInstruction _inst) u32 uTemp = Memory::Read_U32(uAddress); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { - double value = *(float*)&uTemp; - rPS0(_inst.FD) = value; - rPS1(_inst.FD) = value; + u64 value = ConvertToDouble(uTemp); + riPS0(_inst.FD) = value; + riPS1(_inst.FD) = value; m_GPR[_inst.RA] = uAddress; } } @@ -131,9 +131,9 @@ void Interpreter::lfsx(UGeckoInstruction _inst) u32 uTemp = Memory::Read_U32(Helper_Get_EA_X(_inst)); if (!(PowerPC::ppcState.Exceptions & EXCEPTION_DSI)) { - double value = *(float*)&uTemp; - rPS0(_inst.FD) = value; - rPS1(_inst.FD) = value; + u64 value = ConvertToDouble(uTemp); + riPS0(_inst.FD) = value; + riPS1(_inst.FD) = value; } } @@ -282,9 +282,6 @@ void Interpreter::stfdu(UGeckoInstruction _inst) void Interpreter::stfs(UGeckoInstruction _inst) { - //double value = rPS0(_inst.FS); - //float fTemp = (float)value; - //Memory::Write_U32(*(u32*)&fTemp, Helper_Get_EA(_inst)); Memory::Write_U32(ConvertToSingle(riPS0(_inst.FS)), Helper_Get_EA(_inst)); } From 1f34ed2c25be0b1770cc2699681cff4aa4994212 Mon Sep 17 00:00:00 2001 From: Tillmann Karras <tilkax@gmail.com> Date: Mon, 3 Feb 2014 23:58:54 +0100 Subject: [PATCH 4/8] Re-enable non-IEEE mode support --- Source/Core/Common/x64FPURoundMode.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 
deletions(-)

diff --git a/Source/Core/Common/x64FPURoundMode.cpp b/Source/Core/Common/x64FPURoundMode.cpp
index a336859143..e695ca5650 100644
--- a/Source/Core/Common/x64FPURoundMode.cpp
+++ b/Source/Core/Common/x64FPURoundMode.cpp
@@ -101,8 +101,7 @@ namespace FPURoundMode
 FTZ, // flush-to-zero only
 FTZ | DAZ, // flush-to-zero and denormals-are-zero (may not be supported)
 };
- // FIXME: proper (?) non-IEEE mode emulation causes issues in lots of games
- if (nonIEEEMode && false)
+ if (nonIEEEMode)
 {
 csr |= denormalLUT[cpu_info.bFlushToZero];
 }

From 1eb8168488d14be4f982caef2c0abfaf2a0e640c Mon Sep 17 00:00:00 2001
From: Scott Mansell <phiren@gmail.com>
Date: Thu, 6 Feb 2014 20:39:57 +1300
Subject: [PATCH 5/8] x64Emitter: Add the xmm, xmm form of PSRLQ instruction.

---
 Source/Core/Common/x64Emitter.cpp | 4 ++++
 Source/Core/Common/x64Emitter.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 12985e8a0c..5cba748727 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -1390,6 +1390,10 @@ void XEmitter::PSRLQ(X64Reg reg, int shift) {
 Write8(shift);
 }

+void XEmitter::PSRLQ(X64Reg reg, OpArg arg) {
+ WriteSSEOp(64, 0xd3, true, reg, arg);
+}
+
 void XEmitter::PSLLW(X64Reg reg, int shift) {
 WriteSSEOp(64, 0x71, true, (X64Reg)6, R(reg));
 Write8(shift);
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index 19edf3b822..8f68bdb6f7 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -636,6 +636,7 @@ public:
 void PSRLW(X64Reg reg, int shift);
 void PSRLD(X64Reg reg, int shift);
 void PSRLQ(X64Reg reg, int shift);
+ void PSRLQ(X64Reg reg, OpArg arg);

 void PSLLW(X64Reg reg, int shift);
 void PSLLD(X64Reg reg, int shift);

From cf5938c4df3ccaf67f731d4487dc3b5d13e48813 Mon Sep 17 00:00:00 2001
From: Scott Mansell <phiren@gmail.com>
Date: Thu, 6 Feb 2014 20:41:13 +1300
Subject: [PATCH 6/8] x64Emitter: Fix the PSUBQ instruction's opcode

---
 Source/Core/Common/x64Emitter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index 5cba748727..75fe76f893 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -1474,7 +1474,7 @@ void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDD, true, dest

 void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF8, true, dest, arg);}
 void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF9, true, dest, arg);}
 void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFA, true, dest, arg);}
-void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);}
+void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFB, true, dest, arg);}
 void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE8, true, dest, arg);}
 void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE9, true, dest, arg);}

From 7062cf86577f4a4a017e526aec40ea5a1ebd91cb Mon Sep 17 00:00:00 2001
From: Scott Mansell <phiren@gmail.com>
Date: Thu, 6 Feb 2014 20:44:42 +1300
Subject: [PATCH 7/8] Interpreter: Fixed ConvertToDouble to match the manual.

Also added some documentation comments.
--- .../PowerPC/Interpreter/Interpreter_FPUtils.h | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h index 66ce49a8b0..d4c6632453 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h @@ -234,24 +234,23 @@ inline u32 ConvertToSingleFTZ(u64 x) inline u64 ConvertToDouble(u32 _x) { + // This is a little-endian re-implementation of the algrothm described in + // the Power PC Programming Enviroments Manual for Loading single + // percision floating point numbers. + // See page 566 of http://www.freescale.com/files/product/doc/MPCFPE32B.pdf + u64 x = _x; u64 exp = (x >> 23) & 0xff; u64 frac = x & 0x007fffff; - if (exp || frac == 0) + + if (exp > 0 && exp < 255) // Normal number { - // not denormalized - u64 y = exp & 0x80; - u64 z = y << 54 | y << 53 | y << 52; - if (exp > 0 && exp < 255) - { - // not inf/nan/zero - z = ~z; - } + u64 y = !(exp >> 7); + u64 z = y << 61 | y << 60 | y << 59; return ((x & 0xc0000000) << 32) | z | ((x & 0x3fffffff) << 29); } - else + else if (exp == 0 && frac != 0) // Subnormal number { - // denormalized exp = 1023 - 126; do { @@ -260,4 +259,10 @@ inline u64 ConvertToDouble(u32 _x) } while ((frac & 0x00800000) == 0); return ((x & 0x80000000) << 32) | (exp << 52) | ((frac & 0x007fffff) << 29); } + else // QNaN, SNaN or Zero + { + u64 y = exp >> 7; + u64 z = y << 61 | y << 60 | y << 59; + return ((x & 0xc0000000) << 32) | z | ((x & 0x3fffffff) << 29); + } } From ee21cbe2d181b8ea2565dfde0f882cbbe3df02d9 Mon Sep 17 00:00:00 2001 From: Tillmann Karras <tilkax@gmail.com> Date: Wed, 12 Feb 2014 23:26:15 +0100 Subject: [PATCH 8/8] Add phire's more accurate DoubleToSingle version This method doesn't involve messing around with the quirks of the x87 FPU and should be reasonably fast. As a bonus, it does the correct thing for out-of-range doubles. However, it is also a little slower and only benefits programs that rely on undefined behavior so it is disabled for now. --- .../PowerPC/Interpreter/Interpreter_FPUtils.h | 6 +- .../Core/Core/PowerPC/JitCommon/Jit_Util.cpp | 163 ++++++++++++++---- 2 files changed, 134 insertions(+), 35 deletions(-) diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h index d4c6632453..4063c19d30 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_FPUtils.h @@ -234,9 +234,9 @@ inline u32 ConvertToSingleFTZ(u64 x) inline u64 ConvertToDouble(u32 _x) { - // This is a little-endian re-implementation of the algrothm described in - // the Power PC Programming Enviroments Manual for Loading single - // percision floating point numbers. + // This is a little-endian re-implementation of the algorithm described in + // the PowerPC Programming Environments Manual for loading single + // precision floating point numbers. 
// See page 566 of http://www.freescale.com/files/product/doc/MPCFPE32B.pdf u64 x = _x; diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp index 814dbd3cf1..a5a022be6c 100644 --- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp @@ -438,12 +438,142 @@ static const __uint128_t GC_ALIGNED16(double_qnan_bit) = 0x0008000000000000; static const __uint128_t GC_ALIGNED16(double_exponent) = 0x7ff0000000000000; #endif -// Since the following two functions are used in non-arithmetic PPC float instructions, +// Since the following float conversion functions are used in non-arithmetic PPC float instructions, // they must convert floats bitexact and never flush denormals to zero or turn SNaNs into QNaNs. // This means we can't use CVTSS2SD/CVTSD2SS :( // The x87 FPU doesn't even support flush-to-zero so we can use FLD+FSTP even on denormals. // If the number is a NaN, make sure to set the QNaN bit back to its original value. +// Another problem is that officially, converting doubles to single format results in undefined behavior. +// Relying on undefined behavior is a bug so no software should ever do this. +// In case it does happen, phire's more accurate implementation of ConvertDoubleToSingle() is reproduced below. + +//#define MORE_ACCURATE_DOUBLETOSINGLE +#ifdef MORE_ACCURATE_DOUBLETOSINGLE + +#ifdef _WIN32 +#ifdef _M_X64 +static const __m128i GC_ALIGNED16(double_fraction) = _mm_set_epi64x(0, 0x000fffffffffffff); +static const __m128i GC_ALIGNED16(double_sign_bit) = _mm_set_epi64x(0, 0x8000000000000000); +static const __m128i GC_ALIGNED16(double_explicit_top_bit) = _mm_set_epi64x(0, 0x0010000000000000); +static const __m128i GC_ALIGNED16(double_top_two_bits) = _mm_set_epi64x(0, 0xc000000000000000); +static const __m128i GC_ALIGNED16(double_bottom_bits) = _mm_set_epi64x(0, 0x07ffffffe0000000); +#else +static const __m128i GC_ALIGNED16(double_fraction) = _mm_set_epi32(0, 0, 0x000fffff, 0xffffffff); +static const __m128i GC_ALIGNED16(double_sign_bit) = _mm_set_epi32(0, 0, 0x80000000, 0x00000000); +static const __m128i GC_ALIGNED16(double_explicit_top_bit) = _mm_set_epi32(0, 0, 0x00100000, 0x00000000); +static const __m128i GC_ALIGNED16(double_top_two_bits) = _mm_set_epi32(0, 0, 0xc0000000, 0x00000000); +static const __m128i GC_ALIGNED16(double_bottom_bits) = _mm_set_epi32(0, 0, 0x07ffffff, 0xe0000000); +#endif +#else +static const __uint128_t GC_ALIGNED16(double_fraction) = 0x000fffffffffffff; +static const __uint128_t GC_ALIGNED16(double_sign_bit) = 0x8000000000000000; +static const __uint128_t GC_ALIGNED16(double_explicit_top_bit) = 0x0010000000000000; +static const __uint128_t GC_ALIGNED16(double_top_two_bits) = 0xc000000000000000; +static const __uint128_t GC_ALIGNED16(double_bottom_bits) = 0x07ffffffe0000000; +#endif + +// This is the same algorithm used in the interpreter (and actual hardware) +// The documentation states that the conversion of a double with an outside the +// valid range for a single (or a single denormal) is undefined. +// But testing on actual hardware shows it always picks bits 0..1 and 5..34 +// unless the exponent is in the range of 874 to 896. 
+void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src) +{ + MOVSD(XMM1, R(src)); + + // Grab Exponent + PAND(XMM1, M((void *)&double_exponent)); + PSRLQ(XMM1, 52); + MOVD_xmm(R(EAX), XMM1); + + + // Check if the double is in the range of valid single subnormal + CMP(16, R(EAX), Imm16(896)); + FixupBranch NoDenormalize = J_CC(CC_G); + CMP(16, R(EAX), Imm16(874)); + FixupBranch NoDenormalize2 = J_CC(CC_L); + + // Denormalise + + // shift = (905 - Exponent) plus the 21 bit double to single shift + MOV(16, R(EAX), Imm16(905 + 21)); + MOVD_xmm(XMM0, R(EAX)); + PSUBQ(XMM0, R(XMM1)); + + // xmm1 = fraction | 0x0010000000000000 + MOVSD(XMM1, R(src)); + PAND(XMM1, M((void *)&double_fraction)); + POR(XMM1, M((void *)&double_explicit_top_bit)); + + // fraction >> shift + PSRLQ(XMM1, R(XMM0)); + + // OR the sign bit in. + MOVSD(XMM0, R(src)); + PAND(XMM0, M((void *)&double_sign_bit)); + PSRLQ(XMM0, 32); + POR(XMM1, R(XMM0)); + + FixupBranch end = J(false); // Goto end + + SetJumpTarget(NoDenormalize); + SetJumpTarget(NoDenormalize2); + + // Don't Denormalize + + // We want bits 0, 1 + MOVSD(XMM1, R(src)); + PAND(XMM1, M((void *)&double_top_two_bits)); + PSRLQ(XMM1, 32); + + // And 5 through to 34 + MOVSD(XMM0, R(src)); + PAND(XMM0, M((void *)&double_bottom_bits)); + PSRLQ(XMM0, 29); + + // OR them togther + POR(XMM1, R(XMM0)); + + // End + SetJumpTarget(end); + MOVDDUP(dst, R(XMM1)); +} + +#else // MORE_ACCURATE_DOUBLETOSINGLE + +void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src) +{ + MOVSD(M(&temp64), src); + MOVSD(XMM1, R(src)); + FLD(64, M(&temp64)); + CCFlags cond; + if (cpu_info.bSSE4_1) { + PTEST(XMM1, M((void *)&double_exponent)); + cond = CC_NC; + } else { + FNSTSW_AX(); + TEST(16, R(AX), Imm16(x87_InvalidOperation)); + cond = CC_Z; + } + FSTP(32, M(&temp32)); + MOVSS(XMM0, M(&temp32)); + FixupBranch dont_reset_qnan_bit = J_CC(cond); + + PANDN(XMM1, M((void *)&double_qnan_bit)); + PSRLQ(XMM1, 29); + if (cpu_info.bAVX) { + VPANDN(XMM0, XMM1, R(XMM0)); + } else { + PANDN(XMM1, R(XMM0)); + MOVSS(XMM0, R(XMM1)); + } + + SetJumpTarget(dont_reset_qnan_bit); + MOVDDUP(dst, R(XMM0)); +} +#endif // MORE_ACCURATE_DOUBLETOSINGLE + void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr) { if (src_is_gpr) { @@ -480,37 +610,6 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr MOVDDUP(dst, R(dst)); } -void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src) -{ - MOVSD(M(&temp64), src); - MOVSD(XMM1, R(src)); - FLD(64, M(&temp64)); - CCFlags cond; - if (cpu_info.bSSE4_1) { - PTEST(XMM1, M((void *)&double_exponent)); - cond = CC_NC; - } else { - FNSTSW_AX(); - TEST(16, R(AX), Imm16(x87_InvalidOperation)); - cond = CC_Z; - } - FSTP(32, M(&temp32)); - MOVSS(XMM0, M(&temp32)); - FixupBranch dont_reset_qnan_bit = J_CC(cond); - - PANDN(XMM1, M((void *)&double_qnan_bit)); - PSRLQ(XMM1, 29); - if (cpu_info.bAVX) { - VPANDN(XMM0, XMM1, R(XMM0)); - } else { - PANDN(XMM1, R(XMM0)); - MOVSS(XMM0, R(XMM1)); - } - - SetJumpTarget(dont_reset_qnan_bit); - MOVDDUP(dst, R(XMM0)); -} - void EmuCodeBlock::JitClearCA() { AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0
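
As a companion to the rationale in patch 2, here is a minimal host-side C++ sketch (not taken from the series; the test value and program structure are illustrative, and it assumes an SSE2-capable x86/x64 host) showing why CVTSS2SD cannot be used for bit-exact copies: with the MXCSR DAZ bit set, as Dolphin's non-IEEE mode does via the DAZ constant in x64FPURoundMode.cpp, the SSE conversion flushes a denormal single to zero.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <emmintrin.h>  // _mm_cvtss_sd, _mm_cvtsd_f64 and MXCSR access

int main()
{
	volatile uint32_t denormal_bits = 0x00000001;  // smallest positive single denormal
	uint32_t bits = denormal_bits;
	float denormal;
	std::memcpy(&denormal, &bits, sizeof(denormal));

	// Set Denormals-Are-Zero (0x40), mirroring the DAZ constant in x64FPURoundMode.cpp.
	const unsigned int saved_csr = _mm_getcsr();
	_mm_setcsr(saved_csr | 0x0040);

	// CVTSS2SD is an arithmetic instruction: under DAZ the denormal input is treated as +0.0.
	const double via_cvtss2sd = _mm_cvtsd_f64(_mm_cvtss_sd(_mm_setzero_pd(), _mm_set_ss(denormal)));

	_mm_setcsr(saved_csr);

	// The same widening performed after restoring IEEE behaviour keeps the tiny value.
	const double via_ieee = static_cast<double>(denormal);

	std::printf("CVTSS2SD under DAZ: %g\n", via_cvtss2sd);  // expected: 0
	std::printf("IEEE conversion:    %g\n", via_ieee);      // expected: ~1.4e-45
	return 0;
}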
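
The QNaN-bit restoration that ConvertSingleToDouble performs with PANDN followed by PSLLQ(29) (and the double-to-single path with PSRLQ(29)) can be hard to read in emitter form. The scalar C++ restatement below is a readability aid only, written against the single_qnan_bit/single_exponent constants from Jit_Util.cpp; the helper name is invented here. The intent: after the x87 round trip, clear the quiet bit again exactly when the source had an all-ones exponent and its quiet bit was originally clear, so SNaNs survive the conversion unchanged.

#include <cstdint>

// Undo the SNaN -> QNaN quieting done by FLD/FSTP, given the original single bits
// and the widened double bits. Bit 22 of a single maps to bit 51 of a double,
// which is why the JIT shifts the mask left by 29.
inline uint64_t RestoreSNaN(uint32_t single_bits, uint64_t double_bits)
{
	const uint32_t single_exponent = 0x7f800000;  // all-ones exponent: NaN or infinity
	const uint32_t single_qnan_bit = 0x00400000;  // quiet bit of a single-precision NaN

	if ((single_bits & single_exponent) == single_exponent)
	{
		// PANDN(XMM1, single_qnan_bit): the mask is non-zero only if the quiet bit was clear...
		const uint64_t clear_mask = static_cast<uint64_t>(~single_bits & single_qnan_bit) << 29;
		// ...PSLLQ(XMM1, 29) + PANDN against the destination: clear that bit in the double again.
		double_bits &= ~clear_mask;
	}
	return double_bits;
}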