From 29102ecbc6c7d012da281350d32542e15d8e4c00 Mon Sep 17 00:00:00 2001 From: hrydgard Date: Mon, 11 Aug 2008 19:35:38 +0000 Subject: [PATCH] For unknown reasons, this patch fixes Beyond Good and Evil and Metroid intro in 32-bit mode only. Yeah, I have some work to do on the JIT. Also adds some minor stuff like memory card write notification, plus some minor SSSE3 optimizations. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@179 8ced0084-cf51-0410-be5f-012b33b47a6e --- Binary/win32/Dolphin.ini | 15 ++++ Binary/x64/Dolphin.ini | 15 ++++ Source/Core/Common/Src/Common.h | 2 + Source/Core/Common/Src/x64Emitter.cpp | 8 +-- Source/Core/Core/Src/Core.cpp | 15 +++- Source/Core/Core/Src/Core.h | 2 + Source/Core/Core/Src/HW/EXI_Channel.cpp | 2 +- .../Core/Core/Src/HW/EXI_DeviceMemoryCard.cpp | 2 + Source/Core/Core/Src/HW/GPFifo.cpp | 4 +- Source/Core/Core/Src/HW/GPFifo.h | 5 ++ Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp | 14 ++++ .../Src/PowerPC/Jit64/Jit_FloatingPoint.cpp | 9 +++ .../PowerPC/Jit64/Jit_LoadStoreFloating.cpp | 69 +++++++++++++++---- .../Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp | 69 ++++++++++++------- .../Plugins/Plugin_VideoOGL/Src/TextureMngr.h | 2 +- 15 files changed, 183 insertions(+), 50 deletions(-) diff --git a/Binary/win32/Dolphin.ini b/Binary/win32/Dolphin.ini index 6a067147fc..f6dbb144c7 100644 --- a/Binary/win32/Dolphin.ini +++ b/Binary/win32/Dolphin.ini @@ -2,3 +2,18 @@ GFXPlugin = Plugins\Plugin_VideoOGL.dll DSPPlugin = Plugins\Plugin_DSP_NULL.dll PadPlugin = Plugins\Plugin_PadSimple.dll +[General] +LastFilename = +GCMPathes = 1 +GCMPath0 = E:\GCM +[Core] +GFXPlugin = Plugins\Plugin_VideoOGL.dll +DSPPlugin = Plugins\Plugin_DSP.dll +PadPlugin = Plugins\Plugin_PadSimple.dll +HLEBios = True +UseDynarec = True +UseDualCore = True +Throttle = False +LockThreads = True +DefaultGCM = +OptimizeQuantizers = True diff --git a/Binary/x64/Dolphin.ini b/Binary/x64/Dolphin.ini index 6a067147fc..44ededdcc1 100644 --- a/Binary/x64/Dolphin.ini +++ 
b/Binary/x64/Dolphin.ini @@ -2,3 +2,18 @@ GFXPlugin = Plugins\Plugin_VideoOGL.dll DSPPlugin = Plugins\Plugin_DSP_NULL.dll PadPlugin = Plugins\Plugin_PadSimple.dll +[General] +LastFilename = +GCMPathes = 1 +GCMPath0 = E:\GCM +[Core] +GFXPlugin = Plugins\Plugin_VideoOGL.dll +DSPPlugin = Plugins\Plugin_DSP.dll +PadPlugin = Plugins\Plugin_PadSimple.dll +HLEBios = True +UseDynarec = True +UseDualCore = False +Throttle = False +LockThreads = True +DefaultGCM = +OptimizeQuantizers = True diff --git a/Source/Core/Common/Src/Common.h b/Source/Core/Common/Src/Common.h index 043d051433..8421fd1225 100644 --- a/Source/Core/Common/Src/Common.h +++ b/Source/Core/Common/Src/Common.h @@ -71,6 +71,7 @@ typedef signed __int16 s16; typedef signed __int8 s8; #define GC_ALIGNED16(x) __declspec(align(16)) x +#define GC_ALIGNED32(x) __declspec(align(32)) x #define GC_ALIGNED64(x) __declspec(align(64)) x #define GC_ALIGNED16_DECL(x) __declspec(align(16)) x #define GC_ALIGNED64_DECL(x) __declspec(align(64)) x @@ -101,6 +102,7 @@ typedef union _LARGE_INTEGER #endif #define GC_ALIGNED16(x) __attribute((aligned(16))) x +#define GC_ALIGNED32(x) __attribute((aligned(32))) x #define GC_ALIGNED64(x) __attribute((aligned(64))) x #define GC_ALIGNED16_DECL(x) __attribute((aligned(16))) x #define GC_ALIGNED64_DECL(x) __attribute((aligned(64))) x diff --git a/Source/Core/Common/Src/x64Emitter.cpp b/Source/Core/Common/Src/x64Emitter.cpp index 2b38430e40..27c353b3a1 100644 --- a/Source/Core/Common/Src/x64Emitter.cpp +++ b/Source/Core/Common/Src/x64Emitter.cpp @@ -974,8 +974,7 @@ namespace Gen void MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x6E, true, dest, arg, 0);} void MOVQ_xmm(X64Reg dest, OpArg arg) { - if (dest > 7) - { +#ifdef _M_X64 // Alternate encoding // This does not display correctly in MSVC's debugger, it thinks it's a MOVD arg.operandReg = dest; @@ -984,14 +983,13 @@ namespace Gen Write8(0x0f); Write8(0x6E); arg.WriteRest(0); - } else { +#else arg.operandReg = dest; - 
arg.WriteRex(false); Write8(0xF3); Write8(0x0f); Write8(0x7E); arg.WriteRest(0); - } +#endif } void MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(64, 0x7E, true, src, arg, 0);} diff --git a/Source/Core/Core/Src/Core.cpp b/Source/Core/Core/Src/Core.cpp index 4c5905e5d1..e2d3397b9f 100644 --- a/Source/Core/Core/Src/Core.cpp +++ b/Source/Core/Core/Src/Core.cpp @@ -93,7 +93,7 @@ Common::Event emuThreadGoing; bool PanicAlertToVideo(const char* text, bool yes_no) { - PluginVideo::Video_AddMessage(text,3000); + DisplayMessage(text, 3000); return true; } @@ -140,13 +140,24 @@ bool Init(const SCoreStartupParameter _CoreParameter) // all right ... here we go Host_SetWaitCursor(false); - PluginVideo::Video_AddMessage("Emulation started.",3000); + DisplayMessage("Emulation started.", 3000); //RegisterPanicAlertHandler(PanicAlertToVideo); return true; } +void DisplayMessage(const std::string &message, int time_in_ms) +{ + PluginVideo::Video_AddMessage(message.c_str(), time_in_ms); +} + +void DisplayMessage(const char *message, int time_in_ms) +{ + PluginVideo::Video_AddMessage(message, time_in_ms); +} + + // Called from GUI thread or VI thread void Stop() // - Hammertime! { diff --git a/Source/Core/Core/Src/Core.h b/Source/Core/Core/Src/Core.h index 68f8154faa..a8ea7ac88d 100644 --- a/Source/Core/Core/Src/Core.h +++ b/Source/Core/Core/Src/Core.h @@ -54,6 +54,8 @@ namespace Core extern bool bWriteTrace; void StartTrace(bool write); + void DisplayMessage(const std::string &message, int time_in_ms); // This displays messages in a user-visible way. + void DisplayMessage(const char *message, int time_in_ms); // This displays messages in a user-visible way. 
int SyncTrace(); void SetBlockStart(u32 addr); diff --git a/Source/Core/Core/Src/HW/EXI_Channel.cpp b/Source/Core/Core/Src/HW/EXI_Channel.cpp index 8da176bffa..ea4b1df078 100644 --- a/Source/Core/Core/Src/HW/EXI_Channel.cpp +++ b/Source/Core/Core/Src/HW/EXI_Channel.cpp @@ -32,7 +32,7 @@ CEXIChannel::CEXIChannel() : m_Status.CHIP_SELECT = 1; - for (int i=0; i CPeripheralInterface::Fifo_CPUEnd) - _assert_msg_(DYNA_REC,0,"ARGH"); + _assert_msg_(DYNA_REC, 0, "ARGH"); if (CPeripheralInterface::Fifo_CPUWritePointer >= CPeripheralInterface::Fifo_CPUEnd) CPeripheralInterface::Fifo_CPUWritePointer = CPeripheralInterface::Fifo_CPUBase; diff --git a/Source/Core/Core/Src/HW/GPFifo.h b/Source/Core/Core/Src/HW/GPFifo.h index 2cfa0300e1..a91767edce 100644 --- a/Source/Core/Core/Src/HW/GPFifo.h +++ b/Source/Core/Core/Src/HW/GPFifo.h @@ -28,6 +28,11 @@ enum GATHER_PIPE_SIZE = 32 }; +extern u8 m_gatherPipe[GATHER_PIPE_SIZE*16]; //more room, for the fastmodes + +// pipe counter +extern u32 m_gatherPipeCount; + // Init void Init(); diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp index 166864f3ee..bc180a6512 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitAsm.cpp @@ -183,6 +183,20 @@ void Generate() SetJumpTarget(pLesser); OR(32, M(&CR), Imm32(0x80000000)); // _x86Reg < 0 RET(); + + // Fast write routines - special case the most common hardware write + // TODO: use this. + // Even in x86, the param values will be in the right registers. 
+ /* + const u8 *fastMemWrite8 = AlignCode16(); + CMP(32, R(ABI_PARAM2), Imm32(0xCC008000)); + FixupBranch skip_fast_write = J_CC(CC_NE, false); + MOV(32, EAX, M(&m_gatherPipeCount)); + MOV(8, MDisp(EAX, (u32)&m_gatherPipe), ABI_PARAM1); + ADD(32, 1, M(&m_gatherPipeCount)); + RET(); + SetJumpTarget(skip_fast_write); + CALL((void *)&Memory::Write_U8);*/ } #elif defined(_M_X64) diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp index 2de2fc894e..9eca6538d0 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_FloatingPoint.cpp @@ -139,6 +139,15 @@ namespace Jit64 fpr.UnlockAll(); } + + void fmrx(UGeckoInstruction inst) + { + INSTRUCTION_START; + Default(inst); return; + + + } + void fcmpx(UGeckoInstruction inst) { INSTRUCTION_START; diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp index d7e437dd08..95580bf76b 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStoreFloating.cpp @@ -27,6 +27,7 @@ #include "../../HW/PixelEngine.h" #include "../../HW/Memmap.h" #include "../PPCTables.h" +#include "CPUDetect.h" #include "x64Emitter.h" #include "ABI.h" @@ -51,6 +52,7 @@ namespace Jit64 const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const u8 GC_ALIGNED16(bswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15}; const u8 GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15}; +const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0}; const u8 GC_ALIGNED16(bswapShuffle2x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; static u64 GC_ALIGNED16(temp64); @@ -115,12 +117,18 @@ void lfd(UGeckoInstruction inst) s32 offset = 
(s32)(s16)inst.SIMM_16; gpr.Lock(a); MOV(32, R(ABI_PARAM1), gpr.R(a)); - MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset)); - BSWAP(64, EAX); - MOV(64, M(&temp64), R(EAX)); - fpr.Lock(d); fpr.LoadToX64(d, false); - MOVDDUP(fpr.RX(d), M(&temp64)); + fpr.Lock(d); + if (cpu_info.bSSSE3NewInstructions) { + X64Reg xd = fpr.RX(d); + MOVQ_xmm(xd, MComplex(RBX, ABI_PARAM1, SCALE_1, offset)); + PSHUFB(xd, M((void *)bswapShuffle1x8Dupe)); + } else { + MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset)); + BSWAP(64, EAX); + MOV(64, M(&temp64), R(EAX)); + MOVDDUP(fpr.RX(d), M(&temp64)); + } gpr.UnlockAll(); fpr.UnlockAll(); } @@ -128,7 +136,10 @@ void lfd(UGeckoInstruction inst) void stfd(UGeckoInstruction inst) { INSTRUCTION_START; - DISABLE_32BIT; + if (!cpu_info.bSSSE3NewInstructions) + { + DISABLE_32BIT; + } int s = inst.RS; int a = inst.RA; if (!a) @@ -140,12 +151,25 @@ void stfd(UGeckoInstruction inst) gpr.Lock(a); fpr.Lock(s); gpr.FlushLockX(ABI_PARAM1); - fpr.LoadToX64(s, true, false); - MOVSD(M(&temp64), fpr.RX(s)); MOV(32, R(ABI_PARAM1), gpr.R(a)); - MOV(64, R(EAX), M(&temp64)); - BSWAP(64, EAX); - MOV(64, MComplex(RBX, ABI_PARAM1, SCALE_1, offset), R(EAX)); +#ifdef _M_IX86 + AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK)); +#endif + if (cpu_info.bSSSE3NewInstructions) { + MOVAPS(XMM0, fpr.R(s)); + PSHUFB(XMM0, M((void *)bswapShuffle1x8)); +#ifdef _M_X64 + MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, offset), XMM0); +#else + MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base + offset), XMM0); +#endif + } else { + fpr.LoadToX64(s, true, false); + MOVSD(M(&temp64), fpr.RX(s)); + MOV(64, R(EAX), M(&temp64)); + BSWAP(64, EAX); + MOV(64, MComplex(RBX, ABI_PARAM1, SCALE_1, offset), R(EAX)); + } gpr.UnlockAll(); gpr.UnlockAllX(); fpr.UnlockAll(); @@ -154,6 +178,7 @@ void stfd(UGeckoInstruction inst) void stfs(UGeckoInstruction inst) { INSTRUCTION_START; + DISABLE_32BIT; bool update = inst.OPCD & 1; int s = inst.RS; int a = inst.RA; @@ -192,10 +217,24 @@ void 
lfsx(UGeckoInstruction inst) MOV(32, R(EAX), gpr.R(inst.RB)); if (inst.RA) ADD(32, R(EAX), gpr.R(inst.RA)); - UnsafeLoadRegToReg(EAX, EAX, 32, false); - MOV(32, M(&temp32), R(EAX)); - CVTSS2SD(XMM0, M(&temp32)); - MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0)); + if (cpu_info.bSSSE3NewInstructions) { + // PanicAlert("SSE3 supported!"); + X64Reg r = fpr.R(inst.RS).GetSimpleReg(); +#ifdef _M_IX86 + AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); + MOVD_xmm(r, MDisp(EAX, (u32)Memory::base)); +#else + MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0)); +#endif + PSHUFB(r, M((void *)bswapShuffle1x4)); + CVTSS2SD(r, R(r)); + MOVDDUP(r, R(r)); + } else { + UnsafeLoadRegToReg(EAX, EAX, 32, false); + MOV(32, M(&temp32), R(EAX)); + CVTSS2SD(XMM0, M(&temp32)); + MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0)); + } fpr.UnlockAll(); } diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp index d900640171..281f2debcd 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -27,6 +27,7 @@ #include "../../HW/PixelEngine.h" #include "../../HW/Memmap.h" #include "../PPCTables.h" +#include "CPUDetect.h" #include "x64Emitter.h" #include "ABI.h" @@ -225,6 +226,8 @@ void psq_st(UGeckoInstruction inst) } } +const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15}; +const u8 GC_ALIGNED16(pbswapShuffleNoop[16]) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; void psq_l(UGeckoInstruction inst) { @@ -247,39 +250,57 @@ void psq_l(UGeckoInstruction inst) } int offset = inst.SIMM_12; switch (ldType) { - case QUANTIZE_FLOAT: + case QUANTIZE_FLOAT: // We know this is from RAM, so we don't need to check the address. 
{ #ifdef _M_X64 - gpr.LoadToX64(inst.RA); - MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); - BSWAP(64, RAX); - MOV(64, M(&psTemp[0]), R(RAX)); + gpr.LoadToX64(inst.RA, true, update); fpr.LoadToX64(inst.RS, false); - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); - CVTPS2PD(r, M(&psTemp[0])); - SHUFPD(r, R(r), 1); + if (cpu_info.bSSSE3NewInstructions) { + X64Reg xd = fpr.R(inst.RS).GetSimpleReg(); + MOVQ_xmm(xd, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); + PSHUFB(xd, M((void *)pbswapShuffle2x4)); + CVTPS2PD(xd, R(xd)); + } else { + MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); + BSWAP(64, RAX); + MOV(64, M(&psTemp[0]), R(RAX)); + X64Reg r = fpr.R(inst.RS).GetSimpleReg(); + CVTPS2PD(r, M(&psTemp[0])); + SHUFPD(r, R(r), 1); + } if (update) ADD(32, gpr.R(inst.RA), Imm32(offset)); break; #else - gpr.FlushR(ECX); - gpr.LockX(ECX); - gpr.LoadToX64(inst.RA); - // This can probably be optimized somewhat. - LEA(32, ECX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset)); - AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); - MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base)); - BSWAP(32, RAX); - MOV(32, M(&psTemp[0]), R(RAX)); - MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4)); - BSWAP(32, RAX); - MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX)); - fpr.LoadToX64(inst.RS, false); - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); - CVTPS2PD(r, M(&psTemp[0])); + if (cpu_info.bSSSE3NewInstructions) { + gpr.LoadToX64(inst.RA, true, update); + fpr.LoadToX64(inst.RS, false); + X64Reg xd = fpr.R(inst.RS).GetSimpleReg(); + MOV(32, R(EAX), gpr.R(inst.RA)); + AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); + MOVQ_xmm(xd, MDisp(EAX, (u32)Memory::base + offset)); + PSHUFB(xd, M((void *)pbswapShuffle2x4)); + CVTPS2PD(xd, R(xd)); + } else { + gpr.FlushR(ECX); + gpr.LockX(ECX); + gpr.LoadToX64(inst.RA); + // This can probably be optimized somewhat. 
+ LEA(32, ECX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset)); + AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base)); + BSWAP(32, RAX); + MOV(32, M(&psTemp[0]), R(RAX)); + MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4)); + BSWAP(32, RAX); + MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX)); + fpr.LoadToX64(inst.RS, false); + X64Reg r = fpr.R(inst.RS).GetSimpleReg(); + CVTPS2PD(r, M(&psTemp[0])); + gpr.UnlockAllX(); + } if (update) ADD(32, gpr.R(inst.RA), Imm32(offset)); - gpr.UnlockAllX(); break; #endif } diff --git a/Source/Plugins/Plugin_VideoOGL/Src/TextureMngr.h b/Source/Plugins/Plugin_VideoOGL/Src/TextureMngr.h index f9788d060e..be82e70abf 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/TextureMngr.h +++ b/Source/Plugins/Plugin_VideoOGL/Src/TextureMngr.h @@ -33,7 +33,7 @@ public: { TCacheEntry() : texture(0), addr(0), hash(0), w(0), h(0), isRenderTarget(false), isUpsideDown(false), isNonPow2(true), bHaveMipMaps(false) { mode.hex = 0xFCFCFCFC; } - u32 texture; + GLuint texture; u32 addr; u32 hash; u32 paletteHash;