diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp index bda0cc0faf..3716d922aa 100644 --- a/Source/Core/Common/x64Emitter.cpp +++ b/Source/Core/Common/x64Emitter.cpp @@ -1355,9 +1355,9 @@ void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extr arg.WriteRest(this, extrabytes); } -void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes) +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int W, int extrabytes) { - WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes); + WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, W, extrabytes); } static int GetVEXmmmmm(u16 op) @@ -1383,14 +1383,14 @@ static int GetVEXpp(u8 opPrefix) return 0; } -void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes) +void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int W, int extrabytes) { if (!cpu_info.bAVX) PanicAlert("Trying to use AVX on a system that doesn't support it. 
Bad programmer."); int mmmmm = GetVEXmmmmm(op); int pp = GetVEXpp(opPrefix); // FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here - arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm); + arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm, W); Write8(op & 0xFF); arg.WriteRest(this, extrabytes, regOp1); } @@ -1799,10 +1799,71 @@ void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x6 void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg);} void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg);} void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg);} -void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);} +void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 0, 1); Write8(shuffle);} void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);} void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);} +void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg);} +void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg);} +void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg);} +void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg, 1);} +void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg, 1);} +void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B8, regOp1, 
regOp2, arg, 1);} +void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg);} +void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg);} +void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg);} +void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg, 1);} +void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg, 1);} +void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg, 1);} +void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg);} +void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg);} +void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg);} +void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg, 1);} +void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg, 1);} +void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg, 1);} +void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg);} +void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg);} +void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg);} +void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg, 1);} +void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) 
{WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg, 1);} +void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg);} +void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg);} +void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg);} +void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg);} +void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg);} +void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg);} +void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg);} +void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg);} +void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg);} +void 
XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg);} +void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg);} +void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg);} +void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg, 1);} +void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg, 1);} +void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg);} +void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg);} +void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg);} +void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg, 1);} +void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg, 1);} +void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg, 1);} +void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg);} +void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, 
OpArg arg) {WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg);} +void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg);} +void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg, 1);} +void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg, 1);} +void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg, 1);} + void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);} void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);} void XEmitter::SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);} diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index 8af38431a1..ed0250e8d0 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -294,8 +294,8 @@ private: void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); - void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0); - void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int W = 0, int extrabytes = 0); + void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int W = 0, int extrabytes = 0); void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg 
arg, int extrabytes = 0); void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0); @@ -773,6 +773,68 @@ public: void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + // FMA + void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); 
+ void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg); + void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, 
OpArg arg); + // VEX GPR instructions void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); void SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2); diff --git a/Source/Core/Core/Core.cpp b/Source/Core/Core/Core.cpp index 8bbd97f0be..2b36fa6f3f 100644 --- a/Source/Core/Core/Core.cpp +++ b/Source/Core/Core/Core.cpp @@ -49,6 +49,7 @@ #include "Core/HW/Wiimote.h" #include "Core/IPC_HLE/WII_IPC_HLE_Device_usb.h" #include "Core/IPC_HLE/WII_Socket.h" +#include "Core/PowerPC/JitInterface.h" #include "Core/PowerPC/PowerPC.h" #ifdef USE_GDBSTUB @@ -728,6 +729,8 @@ void UpdateWantDeterminism(bool initial) g_want_determinism = new_want_determinism; WiiSockMan::GetInstance().UpdateWantDeterminism(new_want_determinism); g_video_backend->UpdateWantDeterminism(new_want_determinism); + // We need to clear the cache because some parts of the JIT depend on want_determinism, e.g. use of FMA. + JitInterface::ClearCache(); Core::PauseAndLock(false, was_unpaused); } diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp index e7086b59b9..0b58eb03cb 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -90,9 +90,44 @@ void Jit64::fmaddXX(UGeckoInstruction inst) fpr.Lock(a, b, c, d); - // nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately - if (inst.SUBOP5 == 30) //nmsub + // While we don't know if any games are actually affected (replays seem to work with all the usual + // suspects for desyncing), netplay and other applications need absolutely perfect determinism, so + // be extra careful and don't use FMA, even if in theory it might be okay. 
+ // Note that FMA isn't necessarily less correct (it may actually be closer to correct) compared + // to what the Gekko does here; in deterministic mode, the important thing is multiple Dolphin + // instances on different computers giving identical results. + if (cpu_info.bFMA && !Core::g_want_determinism) { + if (single_precision) + Force25BitPrecision(XMM0, fpr.R(c), XMM1); + else + MOVSD(XMM0, fpr.R(c)); + // Statistics suggest b is a lot less likely to be unbound in practice, so + // if we have to pick one of a or b to bind, let's make it b. + fpr.BindToRegister(b, true, false); + switch (inst.SUBOP5) + { + case 28: //msub + VFMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); + break; + case 29: //madd + VFMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); + break; + // PowerPC and x86 define NMADD/NMSUB differently + // x86: D = -A*C (+/-) B + // PPC: D = -(A*C (+/-) B) + // so we have to swap them; the ADD/SUB here isn't a typo. + case 30: //nmsub + VFNMADD132SD(XMM0, fpr.RX(b), fpr.R(a)); + break; + case 31: //nmadd + VFNMSUB132SD(XMM0, fpr.RX(b), fpr.R(a)); + break; + } + } + else if (inst.SUBOP5 == 30) //nmsub + { + // nmsub is implemented a little differently ((b - a*c) instead of -(a*c - b)), so handle it separately if (single_precision) Force25BitPrecision(XMM1, fpr.R(c), XMM0); else @@ -115,6 +150,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst) if (inst.SUBOP5 == 31) //nmadd PXOR(XMM0, M((void*)&psSignBits)); } + fpr.BindToRegister(d, false); //YES it is necessary to dupe the result :( //TODO : analysis - does the top reg get used? If so, dupe, if not, don't. 
diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp index cd069bb9fc..6cb6a20cba 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_Paired.cpp @@ -305,50 +305,77 @@ void Jit64::ps_maddXX(UGeckoInstruction inst) int b = inst.FB; int c = inst.FC; int d = inst.FD; + bool fma = cpu_info.bFMA && !Core::g_want_determinism; fpr.Lock(a,b,c,d); - switch (inst.SUBOP5) + if (fma) + fpr.BindToRegister(b, true, false); + + if (inst.SUBOP5 == 14) { - case 14: //madds0 MOVDDUP(XMM0, fpr.R(c)); Force25BitPrecision(XMM0, R(XMM0), XMM1); - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - break; - case 15: //madds1 + } + else if (inst.SUBOP5 == 15) + { avx_op(&XEmitter::VSHUFPD, &XEmitter::SHUFPD, XMM0, fpr.R(c), fpr.R(c), 3); Force25BitPrecision(XMM0, R(XMM0), XMM1); - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - break; - case 28: //msub - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - MULPD(XMM0, fpr.R(a)); - SUBPD(XMM0, fpr.R(b)); - break; - case 29: //madd - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - break; - case 30: //nmsub - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - MULPD(XMM0, fpr.R(a)); - SUBPD(XMM0, fpr.R(b)); - PXOR(XMM0, M((void*)&psSignBits)); - break; - case 31: //nmadd - Force25BitPrecision(XMM0, fpr.R(c), XMM1); - MULPD(XMM0, fpr.R(a)); - ADDPD(XMM0, fpr.R(b)); - PXOR(XMM0, M((void*)&psSignBits)); - break; - default: - _assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!"); - //FallBackToInterpreter(inst); - //fpr.UnlockAll(); - return; } + else + { + Force25BitPrecision(XMM0, fpr.R(c), XMM1); + } + + if (fma) + { + switch (inst.SUBOP5) + { + case 14: //madds0 + case 15: //madds1 + case 29: //madd + VFMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); + break; + case 28: //msub + VFMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); + break; + case 30: //nmsub + VFNMADD132PD(XMM0, fpr.RX(b), fpr.R(a)); + break; + case 31: //nmadd + 
VFNMSUB132PD(XMM0, fpr.RX(b), fpr.R(a)); + break; + } + } + else + { + switch (inst.SUBOP5) + { + case 14: //madds0 + case 15: //madds1 + case 29: //madd + MULPD(XMM0, fpr.R(a)); + ADDPD(XMM0, fpr.R(b)); + break; + case 28: //msub + MULPD(XMM0, fpr.R(a)); + SUBPD(XMM0, fpr.R(b)); + break; + case 30: //nmsub + MULPD(XMM0, fpr.R(a)); + SUBPD(XMM0, fpr.R(b)); + PXOR(XMM0, M((void*)&psSignBits)); + break; + case 31: //nmadd + MULPD(XMM0, fpr.R(a)); + ADDPD(XMM0, fpr.R(b)); + PXOR(XMM0, M((void*)&psSignBits)); + break; + default: + _assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!"); + return; + } + } + fpr.BindToRegister(d, false); ForceSinglePrecisionP(fpr.RX(d), XMM0); SetFPRFIfNeeded(inst, fpr.RX(d)); diff --git a/Source/UnitTests/Common/x64EmitterTest.cpp b/Source/UnitTests/Common/x64EmitterTest.cpp index 2977ad1b92..22164b8120 100644 --- a/Source/UnitTests/Common/x64EmitterTest.cpp +++ b/Source/UnitTests/Common/x64EmitterTest.cpp @@ -948,4 +948,47 @@ VEX_RM_TEST(BLSI) VEX_RMI_TEST(RORX) +// for AVX instructions that take the form op reg, reg, r/m +#define AVX_RRM_TEST(Name, sizename) \ + TEST_F(x64EmitterTest, Name) \ + { \ + struct { \ + int bits; \ + std::vector<NamedReg> regs; \ + std::string out_name; \ + std::string size; \ + } regsets[] = { \ + { 64, xmmnames, "xmm0", sizename }, \ + }; \ + for (const auto& regset : regsets) \ + for (const auto& r : regset.regs) \ + { \ + emitter->Name(r.reg, RAX, R(RAX)); \ + emitter->Name(RAX, RAX, R(r.reg)); \ + emitter->Name(RAX, r.reg, MatR(R12)); \ + ExpectDisassembly(#Name " " + r.name+ ", " + regset.out_name + ", " + regset.out_name + " " \ + #Name " " + regset.out_name + ", " + regset.out_name + ", " + r.name + " " \ + #Name " " + regset.out_name + ", " + r.name + ", " + regset.size + " ptr ds:[r12] "); \ + } \ + } + +#define FMA_TEST(Name, P, packed) \ + AVX_RRM_TEST(Name ## 132 ## P ## S, packed ? "dqword" : "dword") \ + AVX_RRM_TEST(Name ## 213 ## P ## S, packed ? 
"dqword" : "dword") \ + AVX_RRM_TEST(Name ## 231 ## P ## S, packed ? "dqword" : "dword") \ + AVX_RRM_TEST(Name ## 132 ## P ## D, packed ? "dqword" : "qword") \ + AVX_RRM_TEST(Name ## 213 ## P ## D, packed ? "dqword" : "qword") \ + AVX_RRM_TEST(Name ## 231 ## P ## D, packed ? "dqword" : "qword") + +FMA_TEST(VFMADD, P, true) +FMA_TEST(VFMADD, S, false) +FMA_TEST(VFMSUB, P, true) +FMA_TEST(VFMSUB, S, false) +FMA_TEST(VFNMADD, P, true) +FMA_TEST(VFNMADD, S, false) +FMA_TEST(VFNMSUB, P, true) +FMA_TEST(VFNMSUB, S, false) +FMA_TEST(VFMADDSUB, P, true) +FMA_TEST(VFMSUBADD, P, true) + } // namespace Gen