mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-25 15:31:17 +01:00
Merge pull request #1216 from FioraAeterna/movoptimizations
Add more AVX support, refactor emitter, reduce redundant XMM moves
This commit is contained in:
commit
8fdf43109f
@ -1340,41 +1340,56 @@ void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a)
|
||||
}
|
||||
|
||||
|
||||
// Emits one legacy-encoded (non-VEX) SSE instruction.
// NOTE(review): this span is a rendered diff hunk, so it shows both revisions
// of the function at once: the old (size, sseOp, packed) form and the new
// (opPrefix, op) form. Each revision's own lines are called out below.
// Bytes written by the new form, in order: opPrefix (skipped when 0), whatever
// arg.WriteRex emits, 0x0F, the high byte of op when op > 0xFF, the low byte
// of op, then whatever arg.WriteRest emits.
void XEmitter::WriteSSEOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
|
||||
void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
|
||||
{
|
||||
// Old form: the prefix is derived from (size, packed) - 0x66 when packed and
// size == 64, 0xF2 / 0xF3 when not packed (size == 64 / any other size),
// and no prefix at all when packed with any other size.
if (size == 64 && packed)
|
||||
Write8(0x66); //this time, override goes upwards
|
||||
if (!packed)
|
||||
Write8(size == 64 ? 0xF2 : 0xF3);
|
||||
// New form: the caller passes the prefix byte itself; 0 means "no prefix".
if (opPrefix)
|
||||
Write8(opPrefix);
|
||||
// regOp is stored into the operand before WriteRex/WriteRest run
// (presumably so they can encode it - their bodies are not visible here).
arg.operandReg = regOp;
|
||||
arg.WriteRex(this, 0, 0);
|
||||
Write8(0x0F);
|
||||
// Old form: opcodes wider than one byte contribute their high byte first.
if (sseOp > 0xFF)
|
||||
Write8((sseOp >> 8) & 0xFF);
|
||||
Write8(sseOp & 0xFF);
|
||||
// New form: same two-step opcode emission, using the renamed parameter.
if (op > 0xFF)
|
||||
Write8((op >> 8) & 0xFF);
|
||||
Write8(op & 0xFF);
|
||||
arg.WriteRest(this, extrabytes);
|
||||
}
|
||||
|
||||
// Convenience overload without a second register operand: forwards to the
// longer WriteAVXOp overload, passing INVALID_REG in the regOp2 position.
// NOTE(review): rendered diff hunk - the old (size, sseOp, packed) signature
// and forwarding call appear next to the new (opPrefix, op) ones.
void XEmitter::WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
|
||||
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
|
||||
{
|
||||
// Old forwarding call.
WriteAVXOp(size, sseOp, packed, regOp, X64Reg::INVALID_REG, arg, extrabytes);
|
||||
// New forwarding call.
WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes);
|
||||
}
|
||||
|
||||
void XEmitter::WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
|
||||
// Maps the high byte of a 16-bit opcode to the value passed as the "mmmmm"
// argument of OpArg::WriteVex: 0x3A -> 3, 0x38 -> 2, anything else -> 1.
// Only 0x38 and 0x3A are currently used as secondary escape bytes.
static int GetVEXmmmmm(u16 op)
{
	switch (op >> 8)
	{
	case 0x3A:
		return 3;
	case 0x38:
		return 2;
	default:
		return 1;
	}
}
|
||||
|
||||
// Maps a legacy prefix byte to the value passed as the "pp" argument of
// OpArg::WriteVex: 0x66 -> 1, 0xF3 -> 2, 0xF2 -> 3, any other value
// (including 0x00, i.e. no prefix) -> 0.
static int GetVEXpp(u8 opPrefix)
{
	switch (opPrefix)
	{
	case 0x66:
		return 1;
	case 0xF3:
		return 2;
	case 0xF2:
		return 3;
	default:
		return 0;
	}
}
|
||||
|
||||
// Emits one VEX-encoded instruction with two register operands plus arg:
// a VEX prefix via arg.WriteVex, the low byte of the opcode, then whatever
// arg.WriteRest emits.
// NOTE(review): rendered diff hunk - the old inline mmmmm computation and the
// old WriteVex/Write8 calls (which still reference sseOp, packed and size)
// appear next to their replacements, so `mmmmm` is declared twice in this span.
// NOTE(review): no return follows the PanicAlert below, so in the visible code
// emission continues even when cpu_info.bAVX is false.
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
|
||||
{
|
||||
if (!cpu_info.bAVX)
|
||||
PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer.");
|
||||
// Old form: currently, only 0x38 and 0x3A are used as secondary escape bytes.
|
||||
int mmmmm;
|
||||
if ((sseOp >> 8) == 0x3A)
|
||||
mmmmm = 3;
|
||||
else if ((sseOp >> 8) == 0x38)
|
||||
mmmmm = 2;
|
||||
else
|
||||
mmmmm = 1;
|
||||
// New form: both VEX fields come from the shared static helpers.
int mmmmm = GetVEXmmmmm(op);
|
||||
int pp = GetVEXpp(opPrefix);
|
||||
// FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here
|
||||
// Old form: pp was computed inline from (packed, size).
arg.WriteVex(this, regOp1, regOp2, 0, (packed << 1) | (size == 64), mmmmm);
|
||||
Write8(sseOp & 0xFF);
|
||||
// New form: pp comes from the prefix byte; only the low opcode byte is
// written here, the high byte having been consumed by GetVEXmmmmm.
arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm);
|
||||
Write8(op & 0xFF);
|
||||
arg.WriteRest(this, extrabytes, regOp1);
|
||||
}
|
||||
|
||||
@ -1383,21 +1398,8 @@ void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg r
|
||||
{
|
||||
if (size != 32 && size != 64)
|
||||
PanicAlert("VEX GPR instructions only support 32-bit and 64-bit modes!");
|
||||
int mmmmm, pp;
|
||||
if ((op >> 8) == 0x3A)
|
||||
mmmmm = 3;
|
||||
else if ((op >> 8) == 0x38)
|
||||
mmmmm = 2;
|
||||
else
|
||||
mmmmm = 1;
|
||||
if (opPrefix == 0x66)
|
||||
pp = 1;
|
||||
else if (opPrefix == 0xF3)
|
||||
pp = 2;
|
||||
else if (opPrefix == 0xF2)
|
||||
pp = 3;
|
||||
else
|
||||
pp = 0;
|
||||
int mmmmm = GetVEXmmmmm(op);
|
||||
int pp = GetVEXpp(opPrefix);
|
||||
arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm, size == 64);
|
||||
Write8(op & 0xFF);
|
||||
arg.WriteRest(this, extrabytes, regOp1);
|
||||
@ -1419,8 +1421,8 @@ void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg
|
||||
WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
|
||||
}
|
||||
|
||||
void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x6E, true, dest, arg, 0);}
|
||||
void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(64, 0x7E, true, src, arg, 0);}
|
||||
void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6E, dest, arg, 0);}
|
||||
void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(0x66, 0x7E, src, arg, 0);}
|
||||
|
||||
void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg)
|
||||
{
|
||||
@ -1473,123 +1475,123 @@ void XEmitter::WriteMXCSR(OpArg arg, int ext)
|
||||
void XEmitter::STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);}
|
||||
void XEmitter::LDMXCSR(OpArg memloc) {WriteMXCSR(memloc, 2);}
|
||||
|
||||
void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVNTDQ, true, regOp, arg);}
|
||||
void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVNTP, true, regOp, arg);}
|
||||
void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVNTP, true, regOp, arg);}
|
||||
void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);}
|
||||
void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);}
|
||||
void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);}
|
||||
|
||||
void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseADD, false, regOp, arg);}
|
||||
void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseADD, false, regOp, arg);}
|
||||
void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSUB, false, regOp, arg);}
|
||||
void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSUB, false, regOp, arg);}
|
||||
void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(32, sseCMP, false, regOp, arg,1); Write8(compare);}
|
||||
void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(64, sseCMP, false, regOp, arg,1); Write8(compare);}
|
||||
void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMUL, false, regOp, arg);}
|
||||
void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMUL, false, regOp, arg);}
|
||||
void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseDIV, false, regOp, arg);}
|
||||
void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseDIV, false, regOp, arg);}
|
||||
void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMIN, false, regOp, arg);}
|
||||
void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMIN, false, regOp, arg);}
|
||||
void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMAX, false, regOp, arg);}
|
||||
void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMAX, false, regOp, arg);}
|
||||
void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSQRT, false, regOp, arg);}
|
||||
void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSQRT, false, regOp, arg);}
|
||||
void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, false, regOp, arg);}
|
||||
void XEmitter::ADDSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseADD, regOp, arg);}
|
||||
void XEmitter::ADDSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseADD, regOp, arg);}
|
||||
void XEmitter::SUBSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSUB, regOp, arg);}
|
||||
void XEmitter::SUBSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSUB, regOp, arg);}
|
||||
void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);}
|
||||
void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);}
|
||||
void XEmitter::MULSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMUL, regOp, arg);}
|
||||
void XEmitter::MULSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMUL, regOp, arg);}
|
||||
void XEmitter::DIVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseDIV, regOp, arg);}
|
||||
void XEmitter::DIVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseDIV, regOp, arg);}
|
||||
void XEmitter::MINSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMIN, regOp, arg);}
|
||||
void XEmitter::MINSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMIN, regOp, arg);}
|
||||
void XEmitter::MAXSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMAX, regOp, arg);}
|
||||
void XEmitter::MAXSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMAX, regOp, arg);}
|
||||
void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseSQRT, regOp, arg);}
|
||||
void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseSQRT, regOp, arg);}
|
||||
void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);}
|
||||
|
||||
void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseADD, true, regOp, arg);}
|
||||
void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseADD, true, regOp, arg);}
|
||||
void XEmitter::SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSUB, true, regOp, arg);}
|
||||
void XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSUB, true, regOp, arg);}
|
||||
void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(32, sseCMP, true, regOp, arg,1); Write8(compare);}
|
||||
void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(64, sseCMP, true, regOp, arg,1); Write8(compare);}
|
||||
void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseAND, true, regOp, arg);}
|
||||
void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseAND, true, regOp, arg);}
|
||||
void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseANDN, true, regOp, arg);}
|
||||
void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseANDN, true, regOp, arg);}
|
||||
void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseOR, true, regOp, arg);}
|
||||
void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseOR, true, regOp, arg);}
|
||||
void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseXOR, true, regOp, arg);}
|
||||
void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseXOR, true, regOp, arg);}
|
||||
void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMUL, true, regOp, arg);}
|
||||
void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMUL, true, regOp, arg);}
|
||||
void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseDIV, true, regOp, arg);}
|
||||
void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseDIV, true, regOp, arg);}
|
||||
void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMIN, true, regOp, arg);}
|
||||
void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMIN, true, regOp, arg);}
|
||||
void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMAX, true, regOp, arg);}
|
||||
void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMAX, true, regOp, arg);}
|
||||
void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseSQRT, true, regOp, arg);}
|
||||
void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseSQRT, true, regOp, arg);}
|
||||
void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, true, regOp, arg);}
|
||||
void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(32, sseSHUF, true, regOp, arg,1); Write8(shuffle);}
|
||||
void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, sseSHUF, true, regOp, arg,1); Write8(shuffle);}
|
||||
void XEmitter::ADDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseADD, regOp, arg);}
|
||||
void XEmitter::ADDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseADD, regOp, arg);}
|
||||
void XEmitter::SUBPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSUB, regOp, arg);}
|
||||
void XEmitter::SUBPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSUB, regOp, arg);}
|
||||
void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);}
|
||||
void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);}
|
||||
void XEmitter::ANDPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseAND, regOp, arg);}
|
||||
void XEmitter::ANDPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseAND, regOp, arg);}
|
||||
void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseANDN, regOp, arg);}
|
||||
void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseANDN, regOp, arg);}
|
||||
void XEmitter::ORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseOR, regOp, arg);}
|
||||
void XEmitter::ORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseOR, regOp, arg);}
|
||||
void XEmitter::XORPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseXOR, regOp, arg);}
|
||||
void XEmitter::XORPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseXOR, regOp, arg);}
|
||||
void XEmitter::MULPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMUL, regOp, arg);}
|
||||
void XEmitter::MULPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMUL, regOp, arg);}
|
||||
void XEmitter::DIVPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseDIV, regOp, arg);}
|
||||
void XEmitter::DIVPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseDIV, regOp, arg);}
|
||||
void XEmitter::MINPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMIN, regOp, arg);}
|
||||
void XEmitter::MINPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMIN, regOp, arg);}
|
||||
void XEmitter::MAXPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMAX, regOp, arg);}
|
||||
void XEmitter::MAXPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMAX, regOp, arg);}
|
||||
void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseSQRT, regOp, arg);}
|
||||
void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseSQRT, regOp, arg);}
|
||||
void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);}
|
||||
void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);}
|
||||
void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);}
|
||||
|
||||
void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseCOMIS, true, regOp, arg);} //weird that these should be packed
|
||||
void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseCOMIS, true, regOp, arg);} //ordered
|
||||
void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseUCOMIS, true, regOp, arg);} //unordered
|
||||
void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseUCOMIS, true, regOp, arg);}
|
||||
// Ordered (COMIS*) and unordered (UCOMIS*) compares. The *SS forms pass 0x00
// (no prefix byte is written) and the *SD forms pass 0x66, i.e. the same
// prefixes the packed PS/PD wrappers in this file use.
void XEmitter::COMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} //no prefix, like the packed-single ops
|
||||
void XEmitter::COMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} //ordered
|
||||
void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} //unordered
|
||||
void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);}
|
||||
|
||||
void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVAPfromRM, true, regOp, arg);}
|
||||
void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVAPfromRM, true, regOp, arg);}
|
||||
void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVAPtoRM, true, regOp, arg);}
|
||||
void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVAPtoRM, true, regOp, arg);}
|
||||
void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);}
|
||||
void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);}
|
||||
void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);}
|
||||
void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);}
|
||||
|
||||
void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVUPfromRM, true, regOp, arg);}
|
||||
void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVUPfromRM, true, regOp, arg);}
|
||||
void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVUPtoRM, true, regOp, arg);}
|
||||
void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVUPtoRM, true, regOp, arg);}
|
||||
void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);}
|
||||
void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);}
|
||||
void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);}
|
||||
void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);}
|
||||
|
||||
void XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseMOVUPfromRM, false, regOp, arg);}
|
||||
void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVUPfromRM, false, regOp, arg);}
|
||||
void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(32, sseMOVUPtoRM, false, regOp, arg);}
|
||||
void XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVUPtoRM, false, regOp, arg);}
|
||||
void XEmitter::MOVSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);}
|
||||
void XEmitter::MOVSD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);}
|
||||
void XEmitter::MOVSS(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);}
|
||||
void XEmitter::MOVSD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);}
|
||||
|
||||
void XEmitter::MOVLPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVLPDfromRM, false, regOp, arg);}
|
||||
void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseMOVHPDfromRM, false, regOp, arg);}
|
||||
void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVLPDtoRM, false, regOp, arg);}
|
||||
void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) {WriteSSEOp(64, sseMOVHPDtoRM, false, regOp, arg);}
|
||||
void XEmitter::MOVLPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVLPDfromRM, regOp, arg);}
|
||||
void XEmitter::MOVHPD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, sseMOVHPDfromRM, regOp, arg);}
|
||||
void XEmitter::MOVLPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVLPDtoRM, regOp, arg);}
|
||||
void XEmitter::MOVHPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0xF2, sseMOVHPDtoRM, regOp, arg);}
|
||||
|
||||
void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(32, sseMOVHLPS, true, regOp1, R(regOp2));}
|
||||
void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(32, sseMOVLHPS, true, regOp1, R(regOp2));}
|
||||
void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));}
|
||||
void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));}
|
||||
|
||||
void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, true, regOp, arg);}
|
||||
void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, true, regOp, arg);}
|
||||
void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);}
|
||||
void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);}
|
||||
|
||||
void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, false, regOp, arg);}
|
||||
void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, false, regOp, arg);}
|
||||
void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x2D, false, regOp, arg);}
|
||||
void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x2D, false, regOp, arg);}
|
||||
void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x2A, false, regOp, arg);}
|
||||
void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x2A, false, regOp, arg);}
|
||||
void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);}
|
||||
void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);}
|
||||
void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);}
|
||||
void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);}
|
||||
void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2A, regOp, arg);}
|
||||
void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);}
|
||||
|
||||
void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0xE6, false, regOp, arg);}
|
||||
void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5B, true, regOp, arg);}
|
||||
void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0xE6, false, regOp, arg);}
|
||||
void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5B, true, regOp, arg);}
|
||||
void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);}
|
||||
void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);}
|
||||
void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);}
|
||||
void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);}
|
||||
|
||||
void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x2C, false, regOp, arg);}
|
||||
void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x2C, false, regOp, arg);}
|
||||
void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5B, false, regOp, arg);}
|
||||
void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0xE6, true, regOp, arg);}
|
||||
void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);}
|
||||
void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2C, regOp, arg);}
|
||||
void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);}
|
||||
void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);}
|
||||
|
||||
void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(64, sseMASKMOVDQU, true, dest, R(src));}
|
||||
void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));}
|
||||
|
||||
void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x50, true, dest, arg);}
|
||||
void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x50, true, dest, arg);}
|
||||
void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);}
|
||||
void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);}
|
||||
|
||||
void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(64, sseLDDQU, false, dest, arg);} // For integer data only
|
||||
void XEmitter::LDDQU(X64Reg dest, OpArg arg) {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
|
||||
|
||||
// THESE TWO ARE UNTESTED.
|
||||
void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x14, true, dest, arg);}
|
||||
void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x15, true, dest, arg);}
|
||||
void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
|
||||
void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
|
||||
|
||||
void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x14, true, dest, arg);}
|
||||
void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x15, true, dest, arg);}
|
||||
void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x14, dest, arg);}
|
||||
void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x15, dest, arg);}
|
||||
|
||||
void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
|
||||
{
|
||||
if (cpu_info.bSSE3)
|
||||
{
|
||||
WriteSSEOp(64, 0x12, false, regOp, arg); //SSE3 movddup
|
||||
WriteSSEOp(0xF2, 0x12, regOp, arg); //SSE3 movddup
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1603,53 +1605,52 @@ void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
|
||||
//There are a few more left
|
||||
|
||||
// Also some integer instructions are missing
|
||||
void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x6B, true, dest, arg);}
|
||||
void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x63, true, dest, arg);}
|
||||
void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x67, true, dest, arg);}
|
||||
void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x6B, dest, arg);}
|
||||
void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x63, dest, arg);}
|
||||
void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, arg);}
|
||||
|
||||
void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x60, true, dest, arg);}
|
||||
void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x61, true, dest, arg);}
|
||||
void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x62, true, dest, arg);}
|
||||
//void PUNPCKLQDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x60, true, dest, arg);}
|
||||
void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);}
|
||||
void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);}
|
||||
void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x62, dest, arg);}
|
||||
|
||||
// PSRLW with an immediate count: emits prefix 0x66, opcode 0x71 with the
// constant 2 in the register-operand slot and `reg` as the r/m operand,
// followed by `shift` as one trailing byte.
// NOTE(review): the 2 is presumably the opcode-extension digit rather than a
// real register - confirm against OpArg::WriteRest, which is not visible here.
// The first WriteSSEOp call below is the old revision, the second the new one.
void XEmitter::PSRLW(X64Reg reg, int shift)
|
||||
{
|
||||
WriteSSEOp(64, 0x71, true, (X64Reg)2, R(reg));
|
||||
WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
// PSRLD with an immediate count: emits prefix 0x66, opcode 0x72 with the
// constant 2 in the register-operand slot and `reg` as the r/m operand,
// followed by `shift` as one trailing byte.
// NOTE(review): the 2 is presumably the opcode-extension digit rather than a
// real register - confirm against OpArg::WriteRest, which is not visible here.
// The first WriteSSEOp call below is the old revision, the second the new one.
void XEmitter::PSRLD(X64Reg reg, int shift)
|
||||
{
|
||||
WriteSSEOp(64, 0x72, true, (X64Reg)2, R(reg));
|
||||
WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
// PSRLQ with an immediate count: emits prefix 0x66, opcode 0x73 with the
// constant 2 in the register-operand slot and `reg` as the r/m operand,
// followed by `shift` as one trailing byte.
// NOTE(review): the 2 is presumably the opcode-extension digit rather than a
// real register - confirm against OpArg::WriteRest, which is not visible here.
// The first WriteSSEOp call below is the old revision, the second the new one.
void XEmitter::PSRLQ(X64Reg reg, int shift)
|
||||
{
|
||||
WriteSSEOp(64, 0x73, true, (X64Reg)2, R(reg));
|
||||
WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
// PSRLQ with the count taken from `arg` instead of an immediate: emits
// prefix 0x66, opcode 0xd3, with `reg` as the register operand. No trailing
// byte is written here.
// The first WriteSSEOp call below is the old revision, the second the new one.
void XEmitter::PSRLQ(X64Reg reg, OpArg arg)
|
||||
{
|
||||
WriteSSEOp(64, 0xd3, true, reg, arg);
|
||||
WriteSSEOp(0x66, 0xd3, reg, arg);
|
||||
}
|
||||
|
||||
// PSLLW with an immediate count: emits prefix 0x66, opcode 0x71 with the
// constant 6 in the register-operand slot and `reg` as the r/m operand,
// followed by `shift` as one trailing byte.
// NOTE(review): the 6 is presumably the opcode-extension digit rather than a
// real register - confirm against OpArg::WriteRest, which is not visible here.
// The first WriteSSEOp call below is the old revision, the second the new one.
void XEmitter::PSLLW(X64Reg reg, int shift)
|
||||
{
|
||||
WriteSSEOp(64, 0x71, true, (X64Reg)6, R(reg));
|
||||
WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
// PSLLD with an immediate count: emits prefix 0x66, opcode 0x72 with the
// constant 6 in the register-operand slot and `reg` as the r/m operand,
// followed by `shift` as one trailing byte.
// NOTE(review): the 6 is presumably the opcode-extension digit rather than a
// real register - confirm against OpArg::WriteRest, which is not visible here.
// The first WriteSSEOp call below is the old revision, the second the new one.
void XEmitter::PSLLD(X64Reg reg, int shift)
|
||||
{
|
||||
WriteSSEOp(64, 0x72, true, (X64Reg)6, R(reg));
|
||||
WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
// PSLLQ with an immediate count: emits prefix 0x66, opcode 0x73 with the
// constant 6 in the register-operand slot and `reg` as the r/m operand,
// followed by `shift` as one trailing byte.
// NOTE(review): the 6 is presumably the opcode-extension digit rather than a
// real register - confirm against OpArg::WriteRest, which is not visible here.
// The first WriteSSEOp call below is the old revision, the second the new one.
void XEmitter::PSLLQ(X64Reg reg, int shift)
|
||||
{
|
||||
WriteSSEOp(64, 0x73, true, (X64Reg)6, R(reg));
|
||||
WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
@ -1677,100 +1678,109 @@ void XEmitter::PSRAD(X64Reg reg, int shift)
|
||||
Write8(shift);
|
||||
}
|
||||
|
||||
// Guarded wrapper around WriteSSEOp for SSSE3 instructions: raises a
// PanicAlert when cpu_info.bSSSE3 is false, then forwards all arguments.
// NOTE(review): no return follows the PanicAlert, so in the visible code the
// instruction is still emitted on unsupported CPUs.
// NOTE(review): rendered diff hunk - the old (size, sseOp, packed) signature
// and forwarding call appear next to the new (opPrefix, op) ones.
void XEmitter::WriteSSSE3Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
|
||||
void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
|
||||
{
|
||||
if (!cpu_info.bSSSE3)
|
||||
PanicAlert("Trying to use SSSE3 on a system that doesn't support it. Bad programmer.");
|
||||
// Old forwarding call.
WriteSSEOp(size, sseOp, packed, regOp, arg, extrabytes);
|
||||
// New forwarding call.
WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
|
||||
}
|
||||
|
||||
// Guarded wrapper around WriteSSEOp for SSE4.1 instructions: raises a
// PanicAlert when cpu_info.bSSE4_1 is false, then forwards all arguments.
// NOTE(review): no return follows the PanicAlert, so in the visible code the
// instruction is still emitted on unsupported CPUs.
// NOTE(review): rendered diff hunk - the old (size, sseOp, packed) signature
// and forwarding call appear next to the new (opPrefix, op) ones.
void XEmitter::WriteSSE41Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
|
||||
void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
|
||||
{
|
||||
if (!cpu_info.bSSE4_1)
|
||||
PanicAlert("Trying to use SSE4.1 on a system that doesn't support it. Bad programmer.");
|
||||
// Old forwarding call.
WriteSSEOp(size, sseOp, packed, regOp, arg, extrabytes);
|
||||
// New forwarding call.
WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
|
||||
}
|
||||
|
||||
void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(64, 0x3800, true, dest, arg);}
|
||||
void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3817, true, dest, arg);}
|
||||
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x382b, true, dest, arg);}
|
||||
void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest, arg);}
|
||||
void XEmitter::PTEST(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3817, dest, arg);}
|
||||
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
|
||||
|
||||
void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3820, true, dest, arg);}
|
||||
void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3821, true, dest, arg);}
|
||||
void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3822, true, dest, arg);}
|
||||
void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3823, true, dest, arg);}
|
||||
void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3824, true, dest, arg);}
|
||||
void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3825, true, dest, arg);}
|
||||
void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3830, true, dest, arg);}
|
||||
void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3831, true, dest, arg);}
|
||||
void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3832, true, dest, arg);}
|
||||
void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3833, true, dest, arg);}
|
||||
void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3834, true, dest, arg);}
|
||||
void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3835, true, dest, arg);}
|
||||
void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);}
|
||||
void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);}
|
||||
void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);}
|
||||
void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);}
|
||||
void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);}
|
||||
void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);}
|
||||
void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);}
|
||||
void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);}
|
||||
void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);}
|
||||
void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);}
|
||||
void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);}
|
||||
void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);}
|
||||
|
||||
void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3810, true, dest, arg);}
|
||||
void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3814, true, dest, arg);}
|
||||
void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(64, 0x3815, true, dest, arg);}
|
||||
void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);}
|
||||
void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);}
|
||||
void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);}
|
||||
|
||||
void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDB, true, dest, arg);}
|
||||
void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDF, true, dest, arg);}
|
||||
void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEF, true, dest, arg);}
|
||||
void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEB, true, dest, arg);}
|
||||
void XEmitter::PAND(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDB, dest, arg);}
|
||||
void XEmitter::PANDN(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDF, dest, arg);}
|
||||
void XEmitter::PXOR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEF, dest, arg);}
|
||||
void XEmitter::POR(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEB, dest, arg);}
|
||||
|
||||
void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFC, true, dest, arg);}
|
||||
void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFD, true, dest, arg);}
|
||||
void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFE, true, dest, arg);}
|
||||
void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD4, true, dest, arg);}
|
||||
void XEmitter::PADDB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFC, dest, arg);}
|
||||
void XEmitter::PADDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFD, dest, arg);}
|
||||
void XEmitter::PADDD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFE, dest, arg);}
|
||||
void XEmitter::PADDQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD4, dest, arg);}
|
||||
|
||||
void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEC, true, dest, arg);}
|
||||
void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xED, true, dest, arg);}
|
||||
void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDC, true, dest, arg);}
|
||||
void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDD, true, dest, arg);}
|
||||
void XEmitter::PADDSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEC, dest, arg);}
|
||||
void XEmitter::PADDSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xED, dest, arg);}
|
||||
void XEmitter::PADDUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDC, dest, arg);}
|
||||
void XEmitter::PADDUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDD, dest, arg);}
|
||||
|
||||
void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF8, true, dest, arg);}
|
||||
void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF9, true, dest, arg);}
|
||||
void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFA, true, dest, arg);}
|
||||
void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xFB, true, dest, arg);}
|
||||
void XEmitter::PSUBB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF8, dest, arg);}
|
||||
void XEmitter::PSUBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF9, dest, arg);}
|
||||
void XEmitter::PSUBD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFA, dest, arg);}
|
||||
void XEmitter::PSUBQ(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xFB, dest, arg);}
|
||||
|
||||
void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE8, true, dest, arg);}
|
||||
void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE9, true, dest, arg);}
|
||||
void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD8, true, dest, arg);}
|
||||
void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD9, true, dest, arg);}
|
||||
void XEmitter::PSUBSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE8, dest, arg);}
|
||||
void XEmitter::PSUBSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE9, dest, arg);}
|
||||
void XEmitter::PSUBUSB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD8, dest, arg);}
|
||||
void XEmitter::PSUBUSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xD9, dest, arg);}
|
||||
|
||||
void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE0, true, dest, arg);}
|
||||
void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xE3, true, dest, arg);}
|
||||
void XEmitter::PAVGB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE0, dest, arg);}
|
||||
void XEmitter::PAVGW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xE3, dest, arg);}
|
||||
|
||||
void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x74, true, dest, arg);}
|
||||
void XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x75, true, dest, arg);}
|
||||
void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x76, true, dest, arg);}
|
||||
void XEmitter::PCMPEQB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x74, dest, arg);}
|
||||
void XEmitter::PCMPEQW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x75, dest, arg);}
|
||||
void XEmitter::PCMPEQD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x76, dest, arg);}
|
||||
|
||||
void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x64, true, dest, arg);}
|
||||
void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x65, true, dest, arg);}
|
||||
void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x66, true, dest, arg);}
|
||||
void XEmitter::PCMPGTB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x64, dest, arg);}
|
||||
void XEmitter::PCMPGTW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x65, dest, arg);}
|
||||
void XEmitter::PCMPGTD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x66, dest, arg);}
|
||||
|
||||
void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(64, 0xC5, true, dest, arg); Write8(subreg);}
|
||||
void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(64, 0xC4, true, dest, arg); Write8(subreg);}
|
||||
// PEXTRW: 0x66 prefix, opcode 0xC5, followed by the one-byte element
// selector `subreg` as an immediate.
void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg)
{
	WriteSSEOp(0x66, 0xC5, dest, arg);
	Write8(subreg);
}
|
||||
// PINSRW: 0x66 prefix, opcode 0xC4, followed by the one-byte element selector
// `subreg` as an immediate.
//
// extrabytes is passed as 1 because one more byte (the immediate) is written
// after the operand encoding, matching what PSHUFLW and VSHUFPD already do.
// WriteSSEOp forwards this count to OpArg::WriteRest.
// NOTE(review): presumably WriteRest uses it to compute RIP-relative
// displacements from the true end of the instruction -- confirm there.
void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg)
{
	WriteSSEOp(0x66, 0xC4, dest, arg, 1);
	Write8(subreg);
}
|
||||
|
||||
void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF5, true, dest, arg); }
|
||||
void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xF6, true, dest, arg);}
|
||||
void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF5, dest, arg); }
|
||||
void XEmitter::PSADBW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF6, dest, arg);}
|
||||
|
||||
void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEE, true, dest, arg); }
|
||||
void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDE, true, dest, arg); }
|
||||
void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xEA, true, dest, arg); }
|
||||
void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xDA, true, dest, arg); }
|
||||
void XEmitter::PMAXSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEE, dest, arg); }
|
||||
void XEmitter::PMAXUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDE, dest, arg); }
|
||||
void XEmitter::PMINSW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xEA, dest, arg); }
|
||||
void XEmitter::PMINUB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xDA, dest, arg); }
|
||||
|
||||
void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0xD7, true, dest, arg); }
|
||||
// PMOVMSKB: 0x66 prefix, opcode 0xD7.
void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg)
{
	WriteSSEOp(0x66, 0xD7, dest, arg);
}
|
||||
|
||||
void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, 0x70, false, regOp, arg, 1); Write8(shuffle);}
|
||||
// PSHUFLW: 0xF2 prefix, opcode 0x70, followed by the 8-bit shuffle control.
// extrabytes is 1, matching the single immediate byte written afterwards.
void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle)
{
	WriteSSEOp(0xF2, 0x70, regOp, arg, 1);
	Write8(shuffle);
}
|
||||
|
||||
// VEX
|
||||
void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseADD, false, regOp1, regOp2, arg);}
|
||||
void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseSUB, false, regOp1, regOp2, arg);}
|
||||
void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseMUL, false, regOp1, regOp2, arg);}
|
||||
void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseDIV, false, regOp1, regOp2, arg);}
|
||||
void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseSQRT, false, regOp1, regOp2, arg);}
|
||||
void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseAND, false, regOp1, regOp2, arg);}
|
||||
void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(64, sseANDN, false, regOp1, regOp2, arg);}
|
||||
void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);}
|
||||
void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);}
|
||||
void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);}
|
||||
void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);}
|
||||
void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);}
|
||||
void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);}
|
||||
void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);}
|
||||
void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);}
|
||||
void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);}
|
||||
void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg);}
|
||||
void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg);}
|
||||
void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg);}
|
||||
void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg);}
|
||||
void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);}
|
||||
void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);}
|
||||
void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);}
|
||||
|
||||
// SARX / SHLX share opcode 0x38F7 and differ only in the prefix byte
// (0xF3 vs 0x66). The public parameter order is (regOp1, arg, regOp2), while
// WriteBMI2Op takes (regOp1, regOp2, arg), so the last two are swapped when
// forwarding.
void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2)
{
	WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);
}

void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2)
{
	WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);
}
|
||||
|
@ -126,6 +126,11 @@ struct OpArg
|
||||
//if scale == 0 never mind offsetting
|
||||
offset = _offset;
|
||||
}
|
||||
bool operator==(OpArg b)
|
||||
{
|
||||
return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg &&
|
||||
indexReg == b.indexReg && offset == b.offset;
|
||||
}
|
||||
void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
|
||||
void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const;
|
||||
void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const;
|
||||
@ -273,11 +278,11 @@ private:
|
||||
void WriteShift(int bits, OpArg dest, OpArg &shift, int ext);
|
||||
void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext);
|
||||
void WriteMXCSR(OpArg arg, int ext);
|
||||
void WriteSSEOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteSSSE3Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteSSE41Op(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteAVXOp(int size, u16 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
|
||||
void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
|
||||
void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
|
||||
void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
|
||||
void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
|
||||
void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
|
||||
@ -725,9 +730,18 @@ public:
|
||||
void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle);
|
||||
void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
|
||||
|
||||
// VEX GPR instructions
|
||||
void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
|
||||
|
@ -728,6 +728,26 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
|
||||
SetJumpTarget(noBreakpoint);
|
||||
}
|
||||
|
||||
// If we have an input register that is going to be used again, load it pre-emptively,
|
||||
// even if the instruction doesn't strictly need it in a register, to avoid redundant
|
||||
// loads later. Of course, don't do this if we're already out of registers.
|
||||
// As a bit of a heuristic, make sure we have at least one register left over for the
|
||||
// output, which needs to be bound in the actual instruction compilation.
|
||||
// TODO: make this smarter in the case that we're actually register-starved, i.e.
|
||||
// prioritize the more important registers.
|
||||
for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++)
|
||||
{
|
||||
int reg = ops[i].regsIn[k];
|
||||
if (reg >= 0 && (ops[i].gprInUse & (1 << reg)) && !gpr.R(reg).IsImm())
|
||||
gpr.BindToRegister(reg, true, false);
|
||||
}
|
||||
for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++)
|
||||
{
|
||||
int reg = ops[i].fregsIn[k];
|
||||
if (reg >= 0 && (ops[i].fprInXmm & (1 << reg)))
|
||||
fpr.BindToRegister(reg, true, false);
|
||||
}
|
||||
|
||||
Jit64Tables::CompileInstruction(ops[i]);
|
||||
|
||||
// If we have a register that will never be used again, flush it.
|
||||
|
@ -140,10 +140,13 @@ public:
|
||||
|
||||
void MultiplyImmediate(u32 imm, int a, int d, bool overflow);
|
||||
|
||||
void tri_op(int d, int a, int b, bool reversible, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
|
||||
void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
|
||||
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
|
||||
typedef u32 (*Operation)(u32 a, u32 b);
|
||||
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false);
|
||||
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
|
||||
void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (Gen::XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&),
|
||||
bool Rc = false, bool carry = false);
|
||||
void fp_tri_op(int d, int a, int b, bool reversible, bool single, void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg),
|
||||
void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS = false);
|
||||
void FloatCompare(UGeckoInstruction inst, bool upper = false);
|
||||
|
||||
// OPCODES
|
||||
|
@ -108,7 +108,22 @@ X64Reg RegCache::GetFreeXReg()
|
||||
return (X64Reg)xr;
|
||||
}
|
||||
}
|
||||
//Okay, not found :( Force grab one
|
||||
// Okay, not found :( Force grab one!
|
||||
|
||||
// First, see if we have any registers that are only going to be used for a float store.
|
||||
// These go through GPRs, so the cost of tossing them back into memory is lower than anything else.
|
||||
for (size_t i = 0; i < aCount; i++)
|
||||
{
|
||||
X64Reg xr = (X64Reg)aOrder[i];
|
||||
if (xregs[xr].locked)
|
||||
continue;
|
||||
size_t preg = xregs[xr].ppcReg;
|
||||
if (!regs[preg].locked && !(jit->js.op->fprInXmm & (1 << preg)))
|
||||
{
|
||||
StoreFromRegister(preg);
|
||||
return xr;
|
||||
}
|
||||
}
|
||||
|
||||
//TODO - add a pass to grab xregs whose ppcreg is not used in the next 3 instructions
|
||||
u32 last_used = 0xFFFFFFFF;
|
||||
@ -366,3 +381,14 @@ void RegCache::Flush(FlushMode mode)
|
||||
|
||||
cur_use_quantum = 0;
|
||||
}
|
||||
|
||||
int RegCache::NumFreeRegisters()
|
||||
{
|
||||
int count = 0;
|
||||
size_t aCount;
|
||||
const int* aOrder = GetAllocationOrder(aCount);
|
||||
for (size_t i = 0; i < aCount; i++)
|
||||
if (!xregs[aOrder[i]].locked && xregs[aOrder[i]].free)
|
||||
count++;
|
||||
return count;
|
||||
}
|
||||
|
@ -123,6 +123,7 @@ public:
|
||||
|
||||
|
||||
Gen::X64Reg GetFreeXReg();
|
||||
int NumFreeRegisters();
|
||||
};
|
||||
|
||||
class GPRRegCache : public RegCache
|
||||
|
@ -14,65 +14,27 @@ static const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x0000000
|
||||
static const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
|
||||
static const double GC_ALIGNED16(half_qnan_and_s32_max[2]) = {0x7FFFFFFF, -0x80000};
|
||||
|
||||
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg), UGeckoInstruction inst, bool roundRHS)
|
||||
void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool single, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg),
|
||||
void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
|
||||
{
|
||||
fpr.Lock(d, a, b);
|
||||
fpr.BindToRegister(d, d == a || d == b || !single);
|
||||
if (roundRHS)
|
||||
{
|
||||
if (d == a)
|
||||
{
|
||||
fpr.BindToRegister(d, true);
|
||||
MOVSD(XMM0, fpr.R(b));
|
||||
Force25BitPrecision(XMM0, XMM1);
|
||||
(this->*op)(fpr.RX(d), R(XMM0));
|
||||
Force25BitPrecision(XMM0, fpr.R(b), XMM1);
|
||||
(this->*sseOp)(fpr.RX(d), R(XMM0));
|
||||
}
|
||||
else
|
||||
{
|
||||
fpr.BindToRegister(d, d == b);
|
||||
if (d != b)
|
||||
MOVSD(fpr.RX(d), fpr.R(b));
|
||||
Force25BitPrecision(fpr.RX(d), XMM0);
|
||||
(this->*op)(fpr.RX(d), fpr.R(a));
|
||||
}
|
||||
}
|
||||
else if (d == a)
|
||||
{
|
||||
fpr.BindToRegister(d, true);
|
||||
if (!single)
|
||||
{
|
||||
fpr.BindToRegister(b, true, false);
|
||||
}
|
||||
(this->*op)(fpr.RX(d), fpr.R(b));
|
||||
}
|
||||
else if (d == b)
|
||||
{
|
||||
if (reversible)
|
||||
{
|
||||
fpr.BindToRegister(d, true);
|
||||
if (!single)
|
||||
{
|
||||
fpr.BindToRegister(a, true, false);
|
||||
}
|
||||
(this->*op)(fpr.RX(d), fpr.R(a));
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVSD(XMM0, fpr.R(b));
|
||||
fpr.BindToRegister(d, !single);
|
||||
MOVSD(fpr.RX(d), fpr.R(a));
|
||||
(this->*op)(fpr.RX(d), Gen::R(XMM0));
|
||||
Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0);
|
||||
(this->*sseOp)(fpr.RX(d), fpr.R(a));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Sources different from d, can use rather quick solution
|
||||
fpr.BindToRegister(d, !single);
|
||||
if (!single)
|
||||
{
|
||||
fpr.BindToRegister(b, true, false);
|
||||
}
|
||||
MOVSD(fpr.RX(d), fpr.R(a));
|
||||
(this->*op)(fpr.RX(d), fpr.R(b));
|
||||
avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), false, reversible);
|
||||
}
|
||||
if (single)
|
||||
{
|
||||
@ -104,10 +66,10 @@ void Jit64::fp_arith(UGeckoInstruction inst)
|
||||
bool single = inst.OPCD == 59;
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::DIVSD, inst); break; //div
|
||||
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::SUBSD, inst); break; //sub
|
||||
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::ADDSD, inst); break; //add
|
||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::MULSD, inst, single); break; //mul
|
||||
case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VDIVSD, &XEmitter::DIVSD, inst); break; //div
|
||||
case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, single, &XEmitter::VSUBSD, &XEmitter::SUBSD, inst); break; //sub
|
||||
case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, single, &XEmitter::VADDSD, &XEmitter::ADDSD, inst); break; //add
|
||||
case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, single, &XEmitter::VMULSD, &XEmitter::MULSD, inst, single); break; //mul
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "fp_arith WTF!!!");
|
||||
}
|
||||
@ -131,18 +93,20 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||
// nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately
|
||||
if (inst.SUBOP5 == 30) //nmsub
|
||||
{
|
||||
MOVSD(XMM1, fpr.R(c));
|
||||
if (single_precision)
|
||||
Force25BitPrecision(XMM1, XMM0);
|
||||
Force25BitPrecision(XMM1, fpr.R(c), XMM0);
|
||||
else
|
||||
MOVSD(XMM1, fpr.R(c));
|
||||
MULSD(XMM1, fpr.R(a));
|
||||
MOVSD(XMM0, fpr.R(b));
|
||||
SUBSD(XMM0, R(XMM1));
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVSD(XMM0, fpr.R(c));
|
||||
if (single_precision)
|
||||
Force25BitPrecision(XMM0, XMM1);
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
else
|
||||
MOVSD(XMM0, fpr.R(c));
|
||||
MULSD(XMM0, fpr.R(a));
|
||||
if (inst.SUBOP5 == 28) //msub
|
||||
SUBSD(XMM0, fpr.R(b));
|
||||
|
@ -43,17 +43,15 @@ void Jit64::ps_sel(UGeckoInstruction inst)
|
||||
|
||||
if (cpu_info.bSSE4_1)
|
||||
{
|
||||
MOVAPD(XMM1, fpr.R(a));
|
||||
PXOR(XMM0, R(XMM0));
|
||||
CMPPD(XMM0, R(XMM1), NLE);
|
||||
CMPPD(XMM0, fpr.R(a), NLE);
|
||||
MOVAPD(XMM1, fpr.R(c));
|
||||
BLENDVPD(XMM1, fpr.R(b));
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVAPD(XMM0, fpr.R(a));
|
||||
PXOR(XMM1, R(XMM1));
|
||||
CMPPD(XMM1, R(XMM0), NLE);
|
||||
CMPPD(XMM1, fpr.R(a), NLE);
|
||||
MOVAPD(XMM0, R(XMM1));
|
||||
PAND(XMM1, fpr.R(b));
|
||||
PANDN(XMM0, fpr.R(c));
|
||||
@ -74,26 +72,18 @@ void Jit64::ps_sign(UGeckoInstruction inst)
|
||||
int b = inst.FB;
|
||||
|
||||
fpr.Lock(d, b);
|
||||
if (d != b)
|
||||
{
|
||||
fpr.BindToRegister(d, false);
|
||||
MOVAPD(fpr.RX(d), fpr.R(b));
|
||||
}
|
||||
else
|
||||
{
|
||||
fpr.BindToRegister(d, true);
|
||||
}
|
||||
fpr.BindToRegister(d, d == b);
|
||||
|
||||
switch (inst.SUBOP10)
|
||||
{
|
||||
case 40: //neg
|
||||
PXOR(fpr.RX(d), M((void*)&psSignBits));
|
||||
avx_op(&XEmitter::VPXOR, &XEmitter::PXOR, fpr.RX(d), fpr.R(b), M((void*)&psSignBits));
|
||||
break;
|
||||
case 136: //nabs
|
||||
POR(fpr.RX(d), M((void*)&psSignBits));
|
||||
avx_op(&XEmitter::VPOR, &XEmitter::POR, fpr.RX(d), fpr.R(b), M((void*)&psSignBits));
|
||||
break;
|
||||
case 264: //abs
|
||||
PAND(fpr.RX(d), M((void*)&psAbsMask));
|
||||
avx_op(&XEmitter::VPAND, &XEmitter::PAND, fpr.RX(d), fpr.R(b), M((void*)&psAbsMask));
|
||||
break;
|
||||
}
|
||||
|
||||
@ -101,56 +91,29 @@ void Jit64::ps_sign(UGeckoInstruction inst)
|
||||
}
|
||||
|
||||
//There's still a little bit more optimization that can be squeezed out of this
|
||||
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
|
||||
void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg), UGeckoInstruction inst, bool roundRHS)
|
||||
{
|
||||
fpr.Lock(d, a, b);
|
||||
fpr.BindToRegister(d, d == a || d == b);
|
||||
|
||||
if (roundRHS)
|
||||
{
|
||||
if (d == a)
|
||||
{
|
||||
fpr.BindToRegister(d, true);
|
||||
MOVAPD(XMM0, fpr.R(b));
|
||||
Force25BitPrecision(XMM0, XMM1);
|
||||
(this->*op)(fpr.RX(d), R(XMM0));
|
||||
Force25BitPrecision(XMM0, fpr.R(b), XMM1);
|
||||
(this->*sseOp)(fpr.RX(d), R(XMM0));
|
||||
}
|
||||
else
|
||||
{
|
||||
fpr.BindToRegister(d, d == b);
|
||||
if (d != b)
|
||||
MOVAPD(fpr.RX(d), fpr.R(b));
|
||||
Force25BitPrecision(fpr.RX(d), XMM0);
|
||||
(this->*op)(fpr.RX(d), fpr.R(a));
|
||||
}
|
||||
}
|
||||
else if (d == a)
|
||||
{
|
||||
fpr.BindToRegister(d, true);
|
||||
(this->*op)(fpr.RX(d), fpr.R(b));
|
||||
}
|
||||
else if (d == b)
|
||||
{
|
||||
if (reversible)
|
||||
{
|
||||
fpr.BindToRegister(d, true);
|
||||
(this->*op)(fpr.RX(d), fpr.R(a));
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVAPD(XMM0, fpr.R(b));
|
||||
fpr.BindToRegister(d, false);
|
||||
MOVAPD(fpr.RX(d), fpr.R(a));
|
||||
(this->*op)(fpr.RX(d), R(XMM0));
|
||||
Force25BitPrecision(fpr.RX(d), fpr.R(b), XMM0);
|
||||
(this->*sseOp)(fpr.RX(d), fpr.R(a));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//sources different from d, can use rather quick solution
|
||||
fpr.BindToRegister(d, false);
|
||||
MOVAPD(fpr.RX(d), fpr.R(a));
|
||||
(this->*op)(fpr.RX(d), fpr.R(b));
|
||||
avx_op(avxOp, sseOp, fpr.RX(d), fpr.R(a), fpr.R(b), true, reversible);
|
||||
}
|
||||
ForceSinglePrecisionP(fpr.RX(d));
|
||||
ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
|
||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||
fpr.UnlockAll();
|
||||
}
|
||||
@ -164,16 +127,16 @@ void Jit64::ps_arith(UGeckoInstruction inst)
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 18: // div
|
||||
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD, inst);
|
||||
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VDIVPD, &XEmitter::DIVPD, inst);
|
||||
break;
|
||||
case 20: // sub
|
||||
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD, inst);
|
||||
tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::VSUBPD, &XEmitter::SUBPD, inst);
|
||||
break;
|
||||
case 21: // add
|
||||
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD, inst);
|
||||
tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::VADDPD, &XEmitter::ADDPD, inst);
|
||||
break;
|
||||
case 25: // mul
|
||||
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD, inst, true);
|
||||
tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::VMULPD, &XEmitter::MULPD, inst, true);
|
||||
break;
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!");
|
||||
@ -208,10 +171,9 @@ void Jit64::ps_sum(UGeckoInstruction inst)
|
||||
default:
|
||||
PanicAlert("ps_sum WTF!!!");
|
||||
}
|
||||
ForceSinglePrecisionP(XMM0);
|
||||
SetFPRFIfNeeded(inst, XMM0);
|
||||
fpr.BindToRegister(d, false);
|
||||
MOVAPD(fpr.RX(d), R(XMM0));
|
||||
ForceSinglePrecisionP(fpr.RX(d), XMM0);
|
||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||
fpr.UnlockAll();
|
||||
}
|
||||
|
||||
@ -232,18 +194,16 @@ void Jit64::ps_muls(UGeckoInstruction inst)
|
||||
MOVDDUP(XMM0, fpr.R(c));
|
||||
break;
|
||||
case 13:
|
||||
MOVAPD(XMM0, fpr.R(c));
|
||||
SHUFPD(XMM0, R(XMM0), 3);
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
||||
break;
|
||||
default:
|
||||
PanicAlert("ps_muls WTF!!!");
|
||||
}
|
||||
Force25BitPrecision(XMM0, XMM1);
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ForceSinglePrecisionP(XMM0);
|
||||
SetFPRFIfNeeded(inst, XMM0);
|
||||
fpr.BindToRegister(d, false);
|
||||
MOVAPD(fpr.RX(d), R(XMM0));
|
||||
ForceSinglePrecisionP(fpr.RX(d), XMM0);
|
||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||
fpr.UnlockAll();
|
||||
}
|
||||
|
||||
@ -258,27 +218,25 @@ void Jit64::ps_mergeXX(UGeckoInstruction inst)
|
||||
int a = inst.FA;
|
||||
int b = inst.FB;
|
||||
fpr.Lock(a,b,d);
|
||||
fpr.BindToRegister(d, d == a || d == b);
|
||||
|
||||
MOVAPD(XMM0, fpr.R(a));
|
||||
switch (inst.SUBOP10)
|
||||
{
|
||||
case 528:
|
||||
UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf
|
||||
avx_op(&XEmitter::VUNPCKLPD, &XEmitter::UNPCKLPD, fpr.RX(d), fpr.R(a), fpr.R(b));
|
||||
break; //00
|
||||
case 560:
|
||||
SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, fpr.RX(d), fpr.R(a), fpr.R(b), 2);
|
||||
break; //01
|
||||
case 592:
|
||||
SHUFPD(XMM0, fpr.R(b), 1);
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, fpr.RX(d), fpr.R(a), fpr.R(b), 1);
|
||||
break; //10
|
||||
case 624:
|
||||
UNPCKHPD(XMM0, fpr.R(b));
|
||||
avx_op(&XEmitter::VUNPCKHPD, &XEmitter::UNPCKHPD, fpr.RX(d), fpr.R(a), fpr.R(b));
|
||||
break; //11
|
||||
default:
|
||||
_assert_msg_(DYNA_REC, 0, "ps_merge - invalid op");
|
||||
}
|
||||
fpr.BindToRegister(d, false);
|
||||
MOVAPD(fpr.RX(d), R(XMM0));
|
||||
fpr.UnlockAll();
|
||||
}
|
||||
|
||||
@ -303,7 +261,7 @@ void Jit64::ps_rsqrte(UGeckoInstruction inst)
|
||||
CALL((void *)asm_routines.frsqrte);
|
||||
MOVLHPS(fpr.RX(d), XMM0);
|
||||
|
||||
ForceSinglePrecisionP(fpr.RX(d));
|
||||
ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
|
||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||
fpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
@ -330,7 +288,7 @@ void Jit64::ps_res(UGeckoInstruction inst)
|
||||
CALL((void *)asm_routines.fres);
|
||||
MOVLHPS(fpr.RX(d), XMM0);
|
||||
|
||||
ForceSinglePrecisionP(fpr.RX(d));
|
||||
ForceSinglePrecisionP(fpr.RX(d), fpr.RX(d));
|
||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||
fpr.UnlockAll();
|
||||
gpr.UnlockAllX();
|
||||
@ -352,42 +310,35 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
||||
switch (inst.SUBOP5)
|
||||
{
|
||||
case 14: //madds0
|
||||
MOVDDUP(XMM1, fpr.R(c));
|
||||
Force25BitPrecision(XMM1, XMM0);
|
||||
MOVAPD(XMM0, fpr.R(a));
|
||||
MULPD(XMM0, R(XMM1));
|
||||
MOVDDUP(XMM0, fpr.R(c));
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 15: //madds1
|
||||
MOVAPD(XMM1, fpr.R(c));
|
||||
SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
|
||||
Force25BitPrecision(XMM1, XMM0);
|
||||
MOVAPD(XMM0, fpr.R(a));
|
||||
MULPD(XMM0, R(XMM1));
|
||||
avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3);
|
||||
Force25BitPrecision(XMM0, R(XMM0), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 28: //msub
|
||||
MOVAPD(XMM0, fpr.R(c));
|
||||
Force25BitPrecision(XMM0, XMM1);
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
SUBPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 29: //madd
|
||||
MOVAPD(XMM0, fpr.R(c));
|
||||
Force25BitPrecision(XMM0, XMM1);
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
break;
|
||||
case 30: //nmsub
|
||||
MOVAPD(XMM0, fpr.R(c));
|
||||
Force25BitPrecision(XMM0, XMM1);
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
SUBPD(XMM0, fpr.R(b));
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
break;
|
||||
case 31: //nmadd
|
||||
MOVAPD(XMM0, fpr.R(c));
|
||||
Force25BitPrecision(XMM0, XMM1);
|
||||
Force25BitPrecision(XMM0, fpr.R(c), XMM1);
|
||||
MULPD(XMM0, fpr.R(a));
|
||||
ADDPD(XMM0, fpr.R(b));
|
||||
PXOR(XMM0, M((void*)&psSignBits));
|
||||
@ -399,9 +350,8 @@ void Jit64::ps_maddXX(UGeckoInstruction inst)
|
||||
return;
|
||||
}
|
||||
fpr.BindToRegister(d, false);
|
||||
ForceSinglePrecisionP(XMM0);
|
||||
SetFPRFIfNeeded(inst, XMM0);
|
||||
MOVAPD(fpr.RX(d), R(XMM0));
|
||||
ForceSinglePrecisionP(fpr.RX(d), XMM0);
|
||||
SetFPRFIfNeeded(inst, fpr.RX(d));
|
||||
fpr.UnlockAll();
|
||||
}
|
||||
|
||||
|
@ -409,8 +409,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||
|
||||
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||
XORPS(XMM1, R(XMM1));
|
||||
MAXSS(XMM0, R(XMM1));
|
||||
MINSS(XMM0, M((void *)&m_255));
|
||||
@ -420,8 +419,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||
|
||||
const u8* storeSingleS8 = AlignCode4();
|
||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||
MAXSS(XMM0, M((void *)&m_m128));
|
||||
MINSS(XMM0, M((void *)&m_127));
|
||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
||||
@ -430,8 +428,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||
|
||||
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
|
||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||
XORPS(XMM1, R(XMM1));
|
||||
MAXSS(XMM0, R(XMM1));
|
||||
MINSS(XMM0, M((void *)&m_65535));
|
||||
@ -441,8 +438,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||
|
||||
const u8* storeSingleS16 = AlignCode4();
|
||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||
MAXSS(XMM0, M((void *)&m_m32768));
|
||||
MINSS(XMM0, M((void *)&m_32767));
|
||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
||||
@ -543,8 +539,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
|
||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||
UNPCKLPS(XMM0, M((void*)m_one));
|
||||
RET();
|
||||
|
||||
@ -583,8 +578,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
|
||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||
UNPCKLPS(XMM0, M((void*)m_one));
|
||||
RET();
|
||||
|
||||
@ -618,8 +612,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
|
||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||
UNPCKLPS(XMM0, M((void*)m_one));
|
||||
RET();
|
||||
|
||||
@ -652,8 +645,7 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
|
||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||
MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||
MULSS(XMM0, R(XMM1));
|
||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||
UNPCKLPS(XMM0, M((void*)m_one));
|
||||
RET();
|
||||
|
||||
|
@ -608,13 +608,98 @@ void EmuCodeBlock::ForceSinglePrecisionS(X64Reg xmm)
|
||||
}
|
||||
}
|
||||
|
||||
void EmuCodeBlock::ForceSinglePrecisionP(X64Reg xmm)
|
||||
void EmuCodeBlock::ForceSinglePrecisionP(X64Reg output, X64Reg input)
|
||||
{
|
||||
// Most games don't need these. Zelda requires it though - some platforms get stuck without them.
|
||||
if (jit->jo.accurateSinglePrecision)
|
||||
{
|
||||
CVTPD2PS(xmm, R(xmm));
|
||||
CVTPS2PD(xmm, R(xmm));
|
||||
CVTPD2PS(input, R(input));
|
||||
CVTPS2PD(output, R(input));
|
||||
}
|
||||
else if (output != input)
|
||||
{
|
||||
MOVAPD(output, R(input));
|
||||
}
|
||||
}
|
||||
|
||||
// Abstract between AVX and SSE: automatically handle 3-operand instructions
|
||||
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg), void (XEmitter::*sseOp)(X64Reg, OpArg),
|
||||
X64Reg regOp, OpArg arg1, OpArg arg2, bool packed, bool reversible)
|
||||
{
|
||||
if (arg1.IsSimpleReg() && regOp == arg1.GetSimpleReg())
|
||||
{
|
||||
(this->*sseOp)(regOp, arg2);
|
||||
}
|
||||
else if (arg1.IsSimpleReg() && cpu_info.bAVX)
|
||||
{
|
||||
(this->*avxOp)(regOp, arg1.GetSimpleReg(), arg2);
|
||||
}
|
||||
else if (arg2.IsSimpleReg() && arg2.GetSimpleReg() == regOp)
|
||||
{
|
||||
if (reversible)
|
||||
{
|
||||
(this->*sseOp)(regOp, arg1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// The ugly case: regOp == arg2 without AVX, or with arg1 == memory
|
||||
if (!arg1.IsSimpleReg() || arg1.GetSimpleReg() != XMM0)
|
||||
MOVAPD(XMM0, arg1);
|
||||
if (cpu_info.bAVX)
|
||||
{
|
||||
(this->*avxOp)(regOp, XMM0, arg2);
|
||||
}
|
||||
else
|
||||
{
|
||||
(this->*sseOp)(XMM0, arg2);
|
||||
if (packed)
|
||||
MOVAPD(regOp, R(XMM0));
|
||||
else
|
||||
MOVSD(regOp, R(XMM0));
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (packed)
|
||||
MOVAPD(regOp, arg1);
|
||||
else
|
||||
MOVSD(regOp, arg1);
|
||||
(this->*sseOp)(regOp, arg1 == arg2 ? R(regOp) : arg2);
|
||||
}
|
||||
}
|
||||
|
||||
// Abstract between AVX and SSE: automatically handle 3-operand instructions
|
||||
void EmuCodeBlock::avx_op(void (XEmitter::*avxOp)(X64Reg, X64Reg, OpArg, u8), void (XEmitter::*sseOp)(X64Reg, OpArg, u8),
|
||||
X64Reg regOp, OpArg arg1, OpArg arg2, u8 imm)
|
||||
{
|
||||
if (arg1.IsSimpleReg() && regOp == arg1.GetSimpleReg())
|
||||
{
|
||||
(this->*sseOp)(regOp, arg2, imm);
|
||||
}
|
||||
else if (arg1.IsSimpleReg() && cpu_info.bAVX)
|
||||
{
|
||||
(this->*avxOp)(regOp, arg1.GetSimpleReg(), arg2, imm);
|
||||
}
|
||||
else if (arg2.IsSimpleReg() && arg2.GetSimpleReg() == regOp)
|
||||
{
|
||||
// The ugly case: regOp == arg2 without AVX, or with arg1 == memory
|
||||
if (!arg1.IsSimpleReg() || arg1.GetSimpleReg() != XMM0)
|
||||
MOVAPD(XMM0, arg1);
|
||||
if (cpu_info.bAVX)
|
||||
{
|
||||
(this->*avxOp)(regOp, XMM0, arg2, imm);
|
||||
}
|
||||
else
|
||||
{
|
||||
(this->*sseOp)(XMM0, arg2, imm);
|
||||
MOVAPD(regOp, R(XMM0));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
MOVAPD(regOp, arg1);
|
||||
(this->*sseOp)(regOp, arg1 == arg2 ? R(regOp) : arg2, imm);
|
||||
}
|
||||
}
|
||||
|
||||
@ -625,15 +710,25 @@ static const u64 GC_ALIGNED16(psRoundBit[2]) = {0x8000000, 0x8000000};
|
||||
// a single precision multiply. To be precise, it drops the low 28 bits of the mantissa,
|
||||
// rounding to nearest as it does.
|
||||
// It needs a temp, so let the caller pass that in.
|
||||
void EmuCodeBlock::Force25BitPrecision(X64Reg xmm, X64Reg tmp)
|
||||
void EmuCodeBlock::Force25BitPrecision(X64Reg output, OpArg input, X64Reg tmp)
|
||||
{
|
||||
if (jit->jo.accurateSinglePrecision)
|
||||
{
|
||||
// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
|
||||
MOVAPD(tmp, R(xmm));
|
||||
PAND(xmm, M((void*)&psMantissaTruncate));
|
||||
PAND(tmp, M((void*)&psRoundBit));
|
||||
PADDQ(xmm, R(tmp));
|
||||
if (input.IsSimpleReg() && cpu_info.bAVX)
|
||||
{
|
||||
VPAND(tmp, input.GetSimpleReg(), M((void*)&psRoundBit));
|
||||
VPAND(output, input.GetSimpleReg(), M((void*)&psMantissaTruncate));
|
||||
PADDQ(output, R(tmp));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!input.IsSimpleReg() || input.GetSimpleReg() != output)
|
||||
MOVAPD(output, input);
|
||||
avx_op(&XEmitter::VPAND, &XEmitter::PAND, tmp, R(output), M((void*)&psRoundBit), true, true);
|
||||
PAND(output, M((void*)&psMantissaTruncate));
|
||||
PADDQ(output, R(tmp));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -123,9 +123,14 @@ public:
|
||||
void JitSetCAIf(Gen::CCFlags conditionCode);
|
||||
void JitClearCA();
|
||||
|
||||
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg),
|
||||
Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, bool packed = true, bool reversible = false);
|
||||
void avx_op(void (Gen::XEmitter::*avxOp)(Gen::X64Reg, Gen::X64Reg, Gen::OpArg, u8), void (Gen::XEmitter::*sseOp)(Gen::X64Reg, Gen::OpArg, u8),
|
||||
Gen::X64Reg regOp, Gen::OpArg arg1, Gen::OpArg arg2, u8 imm);
|
||||
|
||||
void ForceSinglePrecisionS(Gen::X64Reg xmm);
|
||||
void ForceSinglePrecisionP(Gen::X64Reg xmm);
|
||||
void Force25BitPrecision(Gen::X64Reg xmm, Gen::X64Reg tmp);
|
||||
void ForceSinglePrecisionP(Gen::X64Reg output, Gen::X64Reg input);
|
||||
void Force25BitPrecision(Gen::X64Reg output, Gen::OpArg input, Gen::X64Reg tmp);
|
||||
|
||||
// RSCRATCH might get trashed
|
||||
void ConvertSingleToDouble(Gen::X64Reg dst, Gen::X64Reg src, bool src_is_gpr = false);
|
||||
|
@ -802,6 +802,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
|
||||
bool wantsCA = true;
|
||||
u32 fregInUse = 0;
|
||||
u32 regInUse = 0;
|
||||
u32 fregInXmm = 0;
|
||||
for (int i = block->m_num_instructions - 1; i >= 0; i--)
|
||||
{
|
||||
bool opWantsCR0 = code[i].wantsCR0;
|
||||
@ -822,6 +823,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
|
||||
wantsCA &= !code[i].outputCA || opWantsCA;
|
||||
code[i].gprInUse = regInUse;
|
||||
code[i].fprInUse = fregInUse;
|
||||
code[i].fprInXmm = fregInXmm;
|
||||
// TODO: if there's no possible endblocks or exceptions in between, tell the regcache
|
||||
// we can throw away a register if it's going to be overwritten later.
|
||||
for (int j = 0; j < 3; j++)
|
||||
@ -829,7 +831,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
|
||||
regInUse |= 1 << code[i].regsIn[j];
|
||||
for (int j = 0; j < 4; j++)
|
||||
if (code[i].fregsIn[j] >= 0)
|
||||
{
|
||||
fregInUse |= 1 << code[i].fregsIn[j];
|
||||
if (strncmp(code[i].opinfo->opname, "stfd", 4))
|
||||
fregInXmm |= 1 << code[i].fregsIn[j];
|
||||
}
|
||||
// For now, we need to count output registers as "used" though; otherwise the flush
|
||||
// will result in a redundant store (e.g. store to regcache, then store again to
|
||||
// the same location later).
|
||||
@ -837,7 +843,11 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32
|
||||
if (code[i].regsOut[j] >= 0)
|
||||
regInUse |= 1 << code[i].regsOut[j];
|
||||
if (code[i].fregOut >= 0)
|
||||
{
|
||||
fregInUse |= 1 << code[i].fregOut;
|
||||
if (strncmp(code[i].opinfo->opname, "stfd", 4))
|
||||
fregInXmm |= 1 << code[i].fregOut;
|
||||
}
|
||||
}
|
||||
return address;
|
||||
}
|
||||
|
@ -45,6 +45,9 @@ struct CodeOp //16B
|
||||
// which registers are still needed after this instruction in this block
|
||||
u32 gprInUse;
|
||||
u32 fprInUse;
|
||||
// we do double stores from GPRs, so we don't want to load a PowerPC floating point register into
|
||||
// an XMM only to move it again to a GPR afterwards.
|
||||
u32 fprInXmm;
|
||||
};
|
||||
|
||||
struct BlockStats
|
||||
|
Loading…
x
Reference in New Issue
Block a user