mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-25 15:31:17 +01:00
JIT: various float optimizations
This commit is contained in:
parent
34287b8042
commit
7b0f559ae1
@ -10,8 +10,8 @@
|
|||||||
|
|
||||||
using namespace Gen;
|
using namespace Gen;
|
||||||
|
|
||||||
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL};
|
static const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x0000000000000000ULL};
|
||||||
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL};
|
static const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
|
||||||
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
|
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
|
||||||
|
|
||||||
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS)
|
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS)
|
||||||
@ -77,16 +77,7 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (X
|
|||||||
if (single)
|
if (single)
|
||||||
{
|
{
|
||||||
ForceSinglePrecisionS(fpr.RX(d));
|
ForceSinglePrecisionS(fpr.RX(d));
|
||||||
if (cpu_info.bSSE3)
|
MOVDDUP(fpr.RX(d), fpr.R(d));
|
||||||
{
|
|
||||||
MOVDDUP(fpr.RX(d), fpr.R(d));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (!fpr.R(d).IsSimpleReg(fpr.RX(d)))
|
|
||||||
MOVQ_xmm(fpr.RX(d), fpr.R(d));
|
|
||||||
UNPCKLPD(fpr.RX(d), R(fpr.RX(d)));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
@ -136,29 +127,29 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
|||||||
int d = inst.FD;
|
int d = inst.FD;
|
||||||
|
|
||||||
fpr.Lock(a, b, c, d);
|
fpr.Lock(a, b, c, d);
|
||||||
MOVSD(XMM0, fpr.R(c));
|
|
||||||
if (single_precision)
|
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
||||||
Force25BitPrecision(XMM0, XMM1);
|
if (inst.SUBOP5 == 30) //nmsub
|
||||||
switch (inst.SUBOP5)
|
|
||||||
{
|
{
|
||||||
case 28: //msub
|
MOVSD(XMM1, fpr.R(c));
|
||||||
|
if (single_precision)
|
||||||
|
Force25BitPrecision(XMM1, XMM0);
|
||||||
|
MULSD(XMM1, fpr.R(a));
|
||||||
|
MOVSD(XMM0, fpr.R(b));
|
||||||
|
SUBSD(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MOVSD(XMM0, fpr.R(c));
|
||||||
|
if (single_precision)
|
||||||
|
Force25BitPrecision(XMM0, XMM1);
|
||||||
MULSD(XMM0, fpr.R(a));
|
MULSD(XMM0, fpr.R(a));
|
||||||
SUBSD(XMM0, fpr.R(b));
|
if (inst.SUBOP5 == 28) //msub
|
||||||
break;
|
SUBSD(XMM0, fpr.R(b));
|
||||||
case 29: //madd
|
else //(n)madd
|
||||||
MULSD(XMM0, fpr.R(a));
|
ADDSD(XMM0, fpr.R(b));
|
||||||
ADDSD(XMM0, fpr.R(b));
|
if (inst.SUBOP5 == 31) //nmadd
|
||||||
break;
|
PXOR(XMM0, M((void*)&psSignBits2));
|
||||||
case 30: //nmsub
|
|
||||||
MULSD(XMM0, fpr.R(a));
|
|
||||||
SUBSD(XMM0, fpr.R(b));
|
|
||||||
PXOR(XMM0, M((void*)&psSignBits2));
|
|
||||||
break;
|
|
||||||
case 31: //nmadd
|
|
||||||
MULSD(XMM0, fpr.R(a));
|
|
||||||
ADDSD(XMM0, fpr.R(b));
|
|
||||||
PXOR(XMM0, M((void*)&psSignBits2));
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
fpr.BindToRegister(d, false);
|
fpr.BindToRegister(d, false);
|
||||||
//YES it is necessary to dupe the result :(
|
//YES it is necessary to dupe the result :(
|
||||||
@ -186,23 +177,26 @@ void Jit64::fsign(UGeckoInstruction inst)
|
|||||||
int b = inst.FB;
|
int b = inst.FB;
|
||||||
fpr.Lock(b, d);
|
fpr.Lock(b, d);
|
||||||
fpr.BindToRegister(d, true, true);
|
fpr.BindToRegister(d, true, true);
|
||||||
MOVSD(XMM0, fpr.R(b));
|
|
||||||
|
if (d != b)
|
||||||
|
MOVSD(fpr.RX(d), fpr.R(b));
|
||||||
switch (inst.SUBOP10)
|
switch (inst.SUBOP10)
|
||||||
{
|
{
|
||||||
case 40: // fnegx
|
case 40: // fnegx
|
||||||
PXOR(XMM0, M((void*)&psSignBits2));
|
// We can cheat and not worry about clobbering the top half by using masks
|
||||||
|
// that don't modify the top half.
|
||||||
|
PXOR(fpr.RX(d), M((void*)&psSignBits2));
|
||||||
break;
|
break;
|
||||||
case 264: // fabsx
|
case 264: // fabsx
|
||||||
PAND(XMM0, M((void*)&psAbsMask2));
|
PAND(fpr.RX(d), M((void*)&psAbsMask2));
|
||||||
break;
|
break;
|
||||||
case 136: // fnabs
|
case 136: // fnabs
|
||||||
POR(XMM0, M((void*)&psSignBits2));
|
POR(fpr.RX(d), M((void*)&psSignBits2));
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
PanicAlert("fsign bleh");
|
PanicAlert("fsign bleh");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
MOVSD(fpr.R(d), XMM0);
|
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -220,14 +214,22 @@ void Jit64::fmrx(UGeckoInstruction inst)
|
|||||||
|
|
||||||
fpr.Lock(b, d);
|
fpr.Lock(b, d);
|
||||||
|
|
||||||
// We don't need to load d, but if it is loaded, we need to mark it as dirty.
|
|
||||||
if (fpr.IsBound(d))
|
if (fpr.IsBound(d))
|
||||||
|
{
|
||||||
|
// We don't need to load d, but if it is loaded, we need to mark it as dirty.
|
||||||
fpr.BindToRegister(d);
|
fpr.BindToRegister(d);
|
||||||
|
// We have to use MOVLPD if b isn't loaded because "MOVSD reg, mem" sets the upper bits (64+)
|
||||||
// b needs to be in a register because "MOVSD reg, mem" sets the upper bits (64+) to zero and we don't want that.
|
// to zero and we don't want that.
|
||||||
fpr.BindToRegister(b, true, false);
|
if (!fpr.R(b).IsSimpleReg())
|
||||||
|
MOVLPD(fpr.RX(d), fpr.R(b));
|
||||||
MOVSD(fpr.R(d), fpr.RX(b));
|
else
|
||||||
|
MOVSD(fpr.R(d), fpr.RX(b));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fpr.BindToRegister(b, true, false);
|
||||||
|
MOVSD(fpr.R(d), fpr.RX(b));
|
||||||
|
}
|
||||||
|
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user