From 3679f9ba60b6d833b710faf41bc9feef607429d2 Mon Sep 17 00:00:00 2001 From: comex Date: Fri, 4 Oct 2013 15:26:20 -0400 Subject: [PATCH] Don't push registers before pairedStoreQuantized, that's dumb. And fix some stuff up. It would probably be good to unify the stack handling some more rather than having ABI_PushRegistersAndAdjustStack do part of it and ABI_AlignStack the rest, causing unnecessary subtract instructions on Linux x86 (only). --- Source/Core/Common/Src/x64ABI.h | 13 ++++- .../Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp | 3 - .../Src/PowerPC/JitCommon/JitAsmCommon.cpp | 57 +++++++++---------- .../Core/Src/PowerPC/JitCommon/Jit_Util.cpp | 10 ++-- 4 files changed, 45 insertions(+), 38 deletions(-) diff --git a/Source/Core/Common/Src/x64ABI.h b/Source/Core/Common/Src/x64ABI.h index 837e4ec3d8..4b10d11e54 100644 --- a/Source/Core/Common/Src/x64ABI.h +++ b/Source/Core/Common/Src/x64ABI.h @@ -43,6 +43,8 @@ // 32-bit bog standard cdecl, shared between linux and windows // MacOSX 32-bit is same as System V with a few exceptions that we probably don't care much about. 
+#define ABI_ALL_CALLEE_SAVED ((1 << EAX) | (1 << ECX) | (1 << EDX)) + #else // 64 bit calling convention #ifdef _WIN32 // 64-bit Windows - the really exotic calling convention @@ -52,7 +54,12 @@ #define ABI_PARAM3 R8 #define ABI_PARAM4 R9 -#else //64-bit Unix (hopefully MacOSX too) +#define ABI_ALL_CALLEE_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << R8) | \ + (1 << R9) | (1 << R10) | (1 << R11) | \ + (1 << XMM0) | (1 << XMM1) | (1 << XMM2) | (1 << XMM3) | \ + (1 << XMM4) | (1 << XMM5)) + +#else //64-bit Unix / OS X #define ABI_PARAM1 RDI #define ABI_PARAM2 RSI @@ -61,6 +68,10 @@ #define ABI_PARAM5 R8 #define ABI_PARAM6 R9 +#define ABI_ALL_CALLEE_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << RDI) | \ + (1 << RSI) | (1 << R8) | (1 << R9) | (1 << R10) | (1 << R11) | \ + 0xffff0000 /* xmm0..15 */) + #endif // WIN32 #endif // X86 diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp index 876268b90a..4548890e2c 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -101,8 +101,6 @@ void Jit64::psq_st(UGeckoInstruction inst) #else int addr_scale = SCALE_8; #endif - u32 registersInUse = RegistersInUse(); - ABI_PushRegistersAndAdjustStack(registersInUse, false); if (inst.W) { // One value XORPS(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions. 
@@ -113,7 +111,6 @@ void Jit64::psq_st(UGeckoInstruction inst) CVTPD2PS(XMM0, fpr.R(s)); CALLptr(MScaled(EDX, addr_scale, (u32)(u64)asm_routines.pairedStoreQuantized)); } - ABI_PopRegistersAndAdjustStack(registersInUse, false); gpr.UnlockAll(); gpr.UnlockAllX(); } diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp index 8d1f6fa06b..cb763db179 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp +++ b/Source/Core/Core/Src/PowerPC/JitCommon/JitAsmCommon.cpp @@ -20,6 +20,9 @@ #include "JitAsmCommon.h" #include "JitBase.h" +#define QUANTIZED_REGS_TO_SAVE (ABI_ALL_CALLEE_SAVED & ~((1 << RAX) | (1 << RCX) | (1 << RDX) | \ + (1 << XMM0) | (1 << XMM1))) + using namespace Gen; static int temp32; @@ -141,14 +144,10 @@ static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f}; // I don't know whether the overflow actually happens in any games // but it potentially can cause problems, so we need some clamping -#ifdef _M_X64 -// TODO(ector): Improve 64-bit version -static void WriteDual32(u64 value, u32 address) +static void WriteDual32(u32 address) { - Memory::Write_U32((u32)(value >> 32), address); - Memory::Write_U32((u32)value, address + 4); + Memory::Write_U64(*(u64 *) psTemp, address); } -#endif // See comment in header for in/outs. 
void CommonAsmRoutines::GenQuantizedStores() { @@ -161,18 +160,20 @@ void CommonAsmRoutines::GenQuantizedStores() { MOVQ_xmm(M(&psTemp[0]), XMM0); MOV(64, R(RAX), M(&psTemp[0])); TEST(32, R(ECX), Imm32(0x0C000000)); - FixupBranch too_complex = J_CC(CC_NZ); + FixupBranch too_complex = J_CC(CC_NZ, true); BSWAP(64, RAX); MOV(64, MComplex(RBX, RCX, SCALE_1, 0), R(RAX)); - FixupBranch skip_complex = J(); + FixupBranch skip_complex = J(true); SetJumpTarget(too_complex); - ABI_CallFunctionRR((void *)&WriteDual32, RAX, RCX, /* noProlog = */ true); + ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true); + ABI_CallFunctionR((void *)&WriteDual32, RCX); + ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true); SetJumpTarget(skip_complex); RET(); #else MOVQ_xmm(M(&psTemp[0]), XMM0); TEST(32, R(ECX), Imm32(0x0C000000)); - FixupBranch argh = J_CC(CC_NZ); + FixupBranch argh = J_CC(CC_NZ, true); MOV(32, R(EAX), M(&psTemp)); BSWAP(32, EAX); AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK)); @@ -180,13 +181,11 @@ void CommonAsmRoutines::GenQuantizedStores() { MOV(32, R(EAX), M(((char*)&psTemp) + 4)); BSWAP(32, EAX); MOV(32, MDisp(ECX, 4+(u32)Memory::base), R(EAX)); - FixupBranch arg2 = J(); + FixupBranch arg2 = J(true); SetJumpTarget(argh); - MOV(32, R(EAX), M(((char*)&psTemp))); - ABI_CallFunctionRR((void *)&Memory::Write_U32, EAX, ECX, /* noProlog = */ true); - MOV(32, R(EAX), M(((char*)&psTemp)+4)); - ADD(32, R(ECX), Imm32(4)); - ABI_CallFunctionRR((void *)&Memory::Write_U32, EAX, ECX, /* noProlog = */ true); + ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true); + ABI_CallFunctionR((void *)&WriteDual32, ECX); + ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true); SetJumpTarget(arg2); RET(); #endif @@ -205,8 +204,8 @@ void CommonAsmRoutines::GenQuantizedStores() { PACKSSDW(XMM0, R(XMM0)); PACKUSWB(XMM0, R(XMM0)); MOVD_xmm(R(EAX), XMM0); - SafeWriteRegToReg(AX, ECX, 16, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); - + 
SafeWriteRegToReg(AX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); + RET(); const u8* storePairedS8 = AlignCode4(); @@ -224,8 +223,8 @@ void CommonAsmRoutines::GenQuantizedStores() { PACKSSWB(XMM0, R(XMM0)); MOVD_xmm(R(EAX), XMM0); - SafeWriteRegToReg(AX, ECX, 16, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); - + SafeWriteRegToReg(AX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); + RET(); const u8* storePairedU16 = AlignCode4(); @@ -250,8 +249,8 @@ void CommonAsmRoutines::GenQuantizedStores() { MOV(16, R(AX), M((char*)psTemp + 4)); BSWAP(32, EAX); - SafeWriteRegToReg(EAX, ECX, 32, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); - + SafeWriteRegToReg(EAX, ECX, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); + RET(); const u8* storePairedS16 = AlignCode4(); @@ -270,8 +269,8 @@ void CommonAsmRoutines::GenQuantizedStores() { MOVD_xmm(R(EAX), XMM0); BSWAP(32, EAX); ROL(32, R(EAX), Imm8(16)); - SafeWriteRegToReg(EAX, ECX, 32, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); - + SafeWriteRegToReg(EAX, ECX, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); + RET(); pairedStoreQuantized = reinterpret_cast(const_cast(AlignCode16())); @@ -294,7 +293,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { // Easy! 
const u8* storeSingleFloat = AlignCode4(); - SafeWriteFloatToReg(XMM0, ECX, 0, SAFE_WRITE_NO_FASTMEM); + SafeWriteFloatToReg(XMM0, ECX, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_FASTMEM); RET(); /* if (cpu_info.bSSSE3) { @@ -317,7 +316,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { MAXSS(XMM0, R(XMM1)); MINSS(XMM0, M((void *)&m_255)); CVTTSS2SI(EAX, R(XMM0)); - SafeWriteRegToReg(AL, ECX, 8, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); + SafeWriteRegToReg(AL, ECX, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); RET(); const u8* storeSingleS8 = AlignCode4(); @@ -327,7 +326,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { MAXSS(XMM0, M((void *)&m_m128)); MINSS(XMM0, M((void *)&m_127)); CVTTSS2SI(EAX, R(XMM0)); - SafeWriteRegToReg(AL, ECX, 8, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); + SafeWriteRegToReg(AL, ECX, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); RET(); const u8* storeSingleU16 = AlignCode4(); // Used by MKWii @@ -338,7 +337,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { MAXSS(XMM0, R(XMM1)); MINSS(XMM0, M((void *)&m_65535)); CVTTSS2SI(EAX, R(XMM0)); - SafeWriteRegToReg(EAX, ECX, 16, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); + SafeWriteRegToReg(EAX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); RET(); const u8* storeSingleS16 = AlignCode4(); @@ -348,7 +347,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() { MAXSS(XMM0, M((void *)&m_m32768)); MINSS(XMM0, M((void *)&m_32767)); CVTTSS2SI(EAX, R(XMM0)); - SafeWriteRegToReg(EAX, ECX, 16, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); + SafeWriteRegToReg(EAX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM); RET(); singleStoreQuantized = reinterpret_cast(const_cast(AlignCode16())); diff --git a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp index 
7fbc565c23..69cfdc8bd3 100644 --- a/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp +++ b/Source/Core/Core/Src/PowerPC/JitCommon/Jit_Util.cpp @@ -311,14 +311,14 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce MOV(32, M(&PC), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write bool noProlog = flags & SAFE_WRITE_NO_PROLOG; bool swap = !(flags & SAFE_WRITE_NO_SWAP); - ABI_PushRegistersAndAdjustStack(registersInUse, false); + ABI_PushRegistersAndAdjustStack(registersInUse, noProlog); switch (accessSize) { - case 32: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U32) : ((void *)&Memory::Write_U32_Swap), reg_value, reg_addr, noProlog); break; - case 16: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U16) : ((void *)&Memory::Write_U16_Swap), reg_value, reg_addr, noProlog); break; - case 8: ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, noProlog); break; + case 32: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U32) : ((void *)&Memory::Write_U32_Swap), reg_value, reg_addr, false); break; + case 16: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U16) : ((void *)&Memory::Write_U16_Swap), reg_value, reg_addr, false); break; + case 8: ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, false); break; } - ABI_PopRegistersAndAdjustStack(registersInUse, false); + ABI_PopRegistersAndAdjustStack(registersInUse, noProlog); FixupBranch exit = J(); SetJumpTarget(fast); UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap);