mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-26 15:55:31 +01:00
Merge branch 'ppc_fp'
This commit is contained in:
commit
b863e40677
@ -43,6 +43,12 @@ struct CPUInfo
|
|||||||
bool bAVX;
|
bool bAVX;
|
||||||
bool bFMA;
|
bool bFMA;
|
||||||
bool bAES;
|
bool bAES;
|
||||||
|
// FXSAVE/FXRSTOR
|
||||||
|
bool bFXSR;
|
||||||
|
// This flag indicates that the hardware supports some mode
|
||||||
|
// in which denormal inputs _and_ outputs are automatically set to (signed) zero.
|
||||||
|
// TODO: ARM
|
||||||
|
bool bFlushToZero;
|
||||||
bool bLAHFSAHF64;
|
bool bLAHFSAHF64;
|
||||||
bool bLongMode;
|
bool bLongMode;
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ namespace FPURoundMode
|
|||||||
|
|
||||||
void SetPrecisionMode(u32 mode);
|
void SetPrecisionMode(u32 mode);
|
||||||
|
|
||||||
void SetSIMDMode(u32 mode);
|
void SetSIMDMode(u32 roundingMode, u32 nonIEEEMode);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* There are two different flavors of float to int conversion:
|
* There are two different flavors of float to int conversion:
|
||||||
|
@ -26,7 +26,7 @@ namespace FPURoundMode
|
|||||||
void SetPrecisionMode(u32 mode)
|
void SetPrecisionMode(u32 mode)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
void SetSIMDMode(u32 mode)
|
void SetSIMDMode(u32 mode, u32 nonIEEEMode)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
void SaveSIMDState()
|
void SaveSIMDState()
|
||||||
|
@ -64,10 +64,10 @@ inline float FlushToZero(float f)
|
|||||||
return x.f;
|
return x.f;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline double FlushToZeroAsFloat(double d)
|
inline double FlushToZero(double d)
|
||||||
{
|
{
|
||||||
IntDouble x; x.d = d;
|
IntDouble x; x.d = d;
|
||||||
if ((x.i & DOUBLE_EXP) < 0x3800000000000000ULL)
|
if ((x.i & DOUBLE_EXP) == 0)
|
||||||
x.i &= DOUBLE_SIGN; // turn into signed zero
|
x.i &= DOUBLE_SIGN; // turn into signed zero
|
||||||
return x.d;
|
return x.d;
|
||||||
}
|
}
|
||||||
|
@ -162,6 +162,34 @@ void CPUInfo::Detect()
|
|||||||
if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true;
|
if ((cpu_id[2] >> 20) & 1) bSSE4_2 = true;
|
||||||
if ((cpu_id[2] >> 25) & 1) bAES = true;
|
if ((cpu_id[2] >> 25) & 1) bAES = true;
|
||||||
|
|
||||||
|
// To check DAZ support, we first need to check FXSAVE support.
|
||||||
|
if ((cpu_id[3] >> 24) & 1)
|
||||||
|
{
|
||||||
|
// We can use FXSAVE.
|
||||||
|
bFXSR = true;
|
||||||
|
|
||||||
|
GC_ALIGNED16(u8 fx_state[512]);
|
||||||
|
memset(fx_state, 0, sizeof(fx_state));
|
||||||
|
#ifdef _WIN32
|
||||||
|
#ifdef _M_IX86
|
||||||
|
_fxsave(fx_state);
|
||||||
|
#elif defined (_M_X64)
|
||||||
|
_fxsave64(fx_state);
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
__asm__("fxsave %0" : "=m" (fx_state));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// lowest byte of MXCSR_MASK
|
||||||
|
if ((fx_state[0x1C] >> 6) & 1)
|
||||||
|
{
|
||||||
|
// On x86, the FTZ field (supported since SSE1) only flushes denormal _outputs_ to zero,
|
||||||
|
// now that we checked DAZ support (flushing denormal _inputs_ to zero),
|
||||||
|
// we can set our generic flag.
|
||||||
|
bFlushToZero = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// AVX support requires 3 separate checks:
|
// AVX support requires 3 separate checks:
|
||||||
// - Is the AVX bit set in CPUID?
|
// - Is the AVX bit set in CPUID?
|
||||||
// - Is the XSAVE bit set in CPUID?
|
// - Is the XSAVE bit set in CPUID?
|
||||||
@ -222,7 +250,12 @@ std::string CPUInfo::Summarize()
|
|||||||
{
|
{
|
||||||
std::string sum(cpu_string);
|
std::string sum(cpu_string);
|
||||||
if (bSSE) sum += ", SSE";
|
if (bSSE) sum += ", SSE";
|
||||||
if (bSSE2) sum += ", SSE2";
|
if (bSSE2)
|
||||||
|
{
|
||||||
|
sum += ", SSE2";
|
||||||
|
if (!bFlushToZero)
|
||||||
|
sum += " (but not DAZ!)";
|
||||||
|
}
|
||||||
if (bSSE3) sum += ", SSE3";
|
if (bSSE3) sum += ", SSE3";
|
||||||
if (bSSSE3) sum += ", SSSE3";
|
if (bSSSE3) sum += ", SSSE3";
|
||||||
if (bSSE4_1) sum += ", SSE4.1";
|
if (bSSE4_1) sum += ", SSE4.1";
|
||||||
|
@ -4,6 +4,7 @@
|
|||||||
|
|
||||||
#include "Common.h"
|
#include "Common.h"
|
||||||
#include "FPURoundMode.h"
|
#include "FPURoundMode.h"
|
||||||
|
#include "CPUDetect.h"
|
||||||
|
|
||||||
#ifndef _WIN32
|
#ifndef _WIN32
|
||||||
static const unsigned short FPU_ROUND_NEAR = 0 << 10;
|
static const unsigned short FPU_ROUND_NEAR = 0 << 10;
|
||||||
@ -14,8 +15,11 @@ static const unsigned short FPU_ROUND_MASK = 3 << 10;
|
|||||||
#include <xmmintrin.h>
|
#include <xmmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const u32 MASKS = 0x1F80; // mask away the interrupts.
|
// OR-mask for disabling FPU exceptions (bits 7-12 in the MXCSR register)
|
||||||
|
const u32 EXCEPTION_MASK = 0x1F80;
|
||||||
|
// Denormals-Are-Zero (non-IEEE mode: denormal inputs are set to +/- 0)
|
||||||
const u32 DAZ = 0x40;
|
const u32 DAZ = 0x40;
|
||||||
|
// Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
|
||||||
const u32 FTZ = 0x8000;
|
const u32 FTZ = 0x8000;
|
||||||
|
|
||||||
namespace FPURoundMode
|
namespace FPURoundMode
|
||||||
@ -79,16 +83,28 @@ namespace FPURoundMode
|
|||||||
//but still - set any useful sse options here
|
//but still - set any useful sse options here
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
void SetSIMDMode(u32 mode)
|
|
||||||
|
void SetSIMDMode(u32 roundingMode, u32 nonIEEEMode)
|
||||||
{
|
{
|
||||||
static const u32 ssetable[4] =
|
// lookup table for FPSCR.RN-to-MXCSR.RC translation
|
||||||
|
static const u32 roundingModeLUT[4] =
|
||||||
{
|
{
|
||||||
(0 << 13) | MASKS,
|
(0 << 13) | EXCEPTION_MASK, // nearest
|
||||||
(3 << 13) | MASKS,
|
(3 << 13) | EXCEPTION_MASK, // -inf
|
||||||
(2 << 13) | MASKS,
|
(2 << 13) | EXCEPTION_MASK, // +inf
|
||||||
(1 << 13) | MASKS,
|
(1 << 13) | EXCEPTION_MASK, // zero
|
||||||
};
|
};
|
||||||
u32 csr = ssetable[mode];
|
u32 csr = roundingModeLUT[roundingMode];
|
||||||
|
|
||||||
|
static const u32 denormalLUT[2] =
|
||||||
|
{
|
||||||
|
FTZ, // flush-to-zero only
|
||||||
|
FTZ | DAZ, // flush-to-zero and denormals-are-zero (may not be supported)
|
||||||
|
};
|
||||||
|
if (nonIEEEMode)
|
||||||
|
{
|
||||||
|
csr |= denormalLUT[cpu_info.bFlushToZero];
|
||||||
|
}
|
||||||
_mm_setcsr(csr);
|
_mm_setcsr(csr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#ifndef _INTERPRETER_FPUTILS_H
|
#ifndef _INTERPRETER_FPUTILS_H
|
||||||
#define _INTERPRETER_FPUTILS_H
|
#define _INTERPRETER_FPUTILS_H
|
||||||
|
|
||||||
|
#include "CPUDetect.h"
|
||||||
#include "Interpreter.h"
|
#include "Interpreter.h"
|
||||||
#include "MathUtil.h"
|
#include "MathUtil.h"
|
||||||
|
|
||||||
@ -69,28 +70,22 @@ inline void UpdateFPSCR()
|
|||||||
|
|
||||||
inline double ForceSingle(double _x)
|
inline double ForceSingle(double _x)
|
||||||
{
|
{
|
||||||
//if (FPSCR.RN != 0)
|
// convert to float...
|
||||||
// PanicAlert("RN = %d at %x", (int)FPSCR.RN, PC);
|
float x = _x;
|
||||||
if (FPSCR.NI)
|
if (!cpu_info.bFlushToZero && FPSCR.NI)
|
||||||
_x = FlushToZeroAsFloat(_x);
|
{
|
||||||
|
x = FlushToZero(x);
|
||||||
double x = static_cast<float>(_x);
|
}
|
||||||
|
// ...and back to double:
|
||||||
return x;
|
return x;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline double ForceDouble(double d)
|
inline double ForceDouble(double d)
|
||||||
{
|
{
|
||||||
//if (FPSCR.RN != 0)
|
if (!cpu_info.bFlushToZero && FPSCR.NI)
|
||||||
// PanicAlert("RN = %d at %x", (int)FPSCR.RN, PC);
|
{
|
||||||
|
d = FlushToZero(d);
|
||||||
//if (FPSCR.NI)
|
}
|
||||||
//{
|
|
||||||
// IntDouble x; x.d = d;
|
|
||||||
//if ((x.i & DOUBLE_EXP) == 0)
|
|
||||||
// x.i &= DOUBLE_SIGN; // turn into signed zero
|
|
||||||
// return x.d;
|
|
||||||
//}
|
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -48,15 +48,8 @@ static void FPSCRtoFPUSettings(UReg_FPSCR fp)
|
|||||||
// Pokemon Colosseum does this. Gah.
|
// Pokemon Colosseum does this. Gah.
|
||||||
}
|
}
|
||||||
|
|
||||||
// Also corresponding SSE rounding mode setting
|
// Set SSE rounding mode and denormal handling
|
||||||
if (FPSCR.NI)
|
FPURoundMode::SetSIMDMode(FPSCR.RN, FPSCR.NI);
|
||||||
{
|
|
||||||
// Either one of these two breaks Beyond Good & Evil.
|
|
||||||
// if (cpu_info.bSSSE3)
|
|
||||||
// csr |= DAZ;
|
|
||||||
// csr |= FTZ;
|
|
||||||
}
|
|
||||||
FPURoundMode::SetSIMDMode(FPSCR.RN);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Interpreter::mtfsb0x(UGeckoInstruction _inst)
|
void Interpreter::mtfsb0x(UGeckoInstruction _inst)
|
||||||
|
@ -182,7 +182,7 @@ public:
|
|||||||
void ps_sum(UGeckoInstruction inst);
|
void ps_sum(UGeckoInstruction inst);
|
||||||
void ps_muls(UGeckoInstruction inst);
|
void ps_muls(UGeckoInstruction inst);
|
||||||
|
|
||||||
void fp_arith_s(UGeckoInstruction inst);
|
void fp_arith(UGeckoInstruction inst);
|
||||||
void frsqrtex(UGeckoInstruction inst);
|
void frsqrtex(UGeckoInstruction inst);
|
||||||
|
|
||||||
void fcmpx(UGeckoInstruction inst);
|
void fcmpx(UGeckoInstruction inst);
|
||||||
|
@ -320,12 +320,12 @@ static GekkoOPTemplate table31_2[] =
|
|||||||
|
|
||||||
static GekkoOPTemplate table59[] =
|
static GekkoOPTemplate table59[] =
|
||||||
{
|
{
|
||||||
{18, &Jit64::Default}, //{"fdivsx", OPTYPE_FPU, FL_RC_BIT_F, 16}},
|
{18, &Jit64::fp_arith}, //{"fdivsx", OPTYPE_FPU, FL_RC_BIT_F, 16}},
|
||||||
{20, &Jit64::fp_arith_s}, //"fsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{20, &Jit64::fp_arith}, //"fsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{21, &Jit64::fp_arith_s}, //"faddsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{21, &Jit64::fp_arith}, //"faddsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
// {22, &Jit64::Default}, //"fsqrtsx", OPTYPE_FPU, FL_RC_BIT_F}}, // Not implemented on gekko
|
// {22, &Jit64::Default}, //"fsqrtsx", OPTYPE_FPU, FL_RC_BIT_F}}, // Not implemented on gekko
|
||||||
{24, &Jit64::Default}, //"fresx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{24, &Jit64::Default}, //"fresx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{25, &Jit64::fp_arith_s}, //"fmulsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{25, &Jit64::fp_arith}, //"fmulsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{28, &Jit64::fmaddXX}, //"fmsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{28, &Jit64::fmaddXX}, //"fmsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{29, &Jit64::fmaddXX}, //"fmaddsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{29, &Jit64::fmaddXX}, //"fmaddsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{30, &Jit64::fmaddXX}, //"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{30, &Jit64::fmaddXX}, //"fnmsubsx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
@ -354,12 +354,12 @@ static GekkoOPTemplate table63[] =
|
|||||||
|
|
||||||
static GekkoOPTemplate table63_2[] =
|
static GekkoOPTemplate table63_2[] =
|
||||||
{
|
{
|
||||||
{18, &Jit64::Default}, //"fdivx", OPTYPE_FPU, FL_RC_BIT_F, 30}},
|
{18, &Jit64::fp_arith}, //"fdivx", OPTYPE_FPU, FL_RC_BIT_F, 30}},
|
||||||
{20, &Jit64::Default}, //"fsubx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{20, &Jit64::fp_arith}, //"fsubx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{21, &Jit64::Default}, //"faddx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{21, &Jit64::fp_arith}, //"faddx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{22, &Jit64::Default}, //"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{22, &Jit64::Default}, //"fsqrtx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{23, &Jit64::Default}, //"fselx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{23, &Jit64::Default}, //"fselx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{25, &Jit64::fp_arith_s}, //"fmulx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{25, &Jit64::fp_arith}, //"fmulx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{26, &Jit64::frsqrtex}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}},
|
{26, &Jit64::frsqrtex}, //"frsqrtex", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{28, &Jit64::fmaddXX}, //"fmsubx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{28, &Jit64::fmaddXX}, //"fmsubx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
{29, &Jit64::fmaddXX}, //"fmaddx", OPTYPE_FPU, FL_RC_BIT_F}},
|
{29, &Jit64::fmaddXX}, //"fmaddx", OPTYPE_FPU, FL_RC_BIT_F}},
|
||||||
|
@ -85,7 +85,7 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single,
|
|||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
void Jit64::fp_arith_s(UGeckoInstruction inst)
|
void Jit64::fp_arith(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
JITDISABLE(bJITFloatingPointOff)
|
JITDISABLE(bJITFloatingPointOff)
|
||||||
@ -106,7 +106,7 @@ void Jit64::fp_arith_s(UGeckoInstruction inst)
|
|||||||
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, &XEmitter::VADDSD); break; //add
|
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, &XEmitter::VADDSD); break; //add
|
||||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, &XEmitter::VMULSD); break; //mul
|
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, &XEmitter::VMULSD); break; //mul
|
||||||
default:
|
default:
|
||||||
_assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!");
|
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user