Don't push registers before calling pairedStoreQuantized; it is unnecessary.

Also clean up some related code.  It would probably be good to unify the
stack handling further: at the moment ABI_PushRegistersAndAdjustStack does
part of it and ABI_AlignStack does the rest, which results in unnecessary
subtract instructions, but only on Linux x86.
This commit is contained in:
comex 2013-10-04 15:26:20 -04:00
parent a91469ffa5
commit 3679f9ba60
4 changed files with 45 additions and 38 deletions

View File

@ -43,6 +43,8 @@
// 32-bit bog standard cdecl, shared between linux and windows
// MacOSX 32-bit is same as System V with a few exceptions that we probably don't care much about.
// Bitmask (bit index = register number) of EAX, ECX and EDX for the 32-bit
// cdecl convention described above.
// The 64-bit branches and QUANTIZED_REGS_TO_SAVE all use the ABI_-prefixed
// name, so define that spelling here as well; otherwise 32-bit builds have no
// definition for the name those users expand to.
#define ABI_ALL_CALLEE_SAVED ((1 << EAX) | (1 << ECX) | (1 << EDX))
// Previous un-prefixed spelling, kept as an alias so existing references keep compiling.
#define ALL_CALLEE_SAVED ABI_ALL_CALLEE_SAVED
#else // 64 bit calling convention
#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention
@ -52,7 +54,12 @@
#define ABI_PARAM3 R8
#define ABI_PARAM4 R9
#else //64-bit Unix (hopefully MacOSX too)
// Register bitmask for 64-bit Windows: RAX, RCX, RDX, R8-R11 and XMM0-XMM5.
// NOTE(review): the 64-bit Unix definition of this macro encodes xmm0..15 as
// 0xffff0000 (bits 16-31), whereas here the XMM registers are shifted by their
// raw enum value. If XMMn is numbered from 0 like the general-purpose
// registers, these terms alias GPR bits instead of selecting XMM registers --
// confirm against the register enum and the mask consumer; (1 << (16 + XMMn))
// may be what is intended.
#define ABI_ALL_CALLEE_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << R8) | \
(1 << R9) | (1 << R10) | (1 << R11) | \
(1 << XMM0) | (1 << XMM1) | (1 << XMM2) | (1 << XMM3) | \
(1 << XMM4) | (1 << XMM5))
#else //64-bit Unix / OS X
#define ABI_PARAM1 RDI
#define ABI_PARAM2 RSI
@ -61,6 +68,10 @@
#define ABI_PARAM5 R8
#define ABI_PARAM6 R9
// Register bitmask for 64-bit Unix / OS X: RAX, RCX, RDX, RDI, RSI and R8-R11
// in the low bits (bit index = register number), plus all of bits 16-31,
// which stand for xmm0..15.
#define ABI_ALL_CALLEE_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << RDI) | \
(1 << RSI) | (1 << R8) | (1 << R9) | (1 << R10) | (1 << R11) | \
0xffff0000 /* xmm0..15 */)
#endif // WIN32
#endif // X86

View File

@ -101,8 +101,6 @@ void Jit64::psq_st(UGeckoInstruction inst)
#else
int addr_scale = SCALE_8;
#endif
u32 registersInUse = RegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, false);
if (inst.W) {
// One value
XORPS(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions.
@ -113,7 +111,6 @@ void Jit64::psq_st(UGeckoInstruction inst)
CVTPD2PS(XMM0, fpr.R(s));
CALLptr(MScaled(EDX, addr_scale, (u32)(u64)asm_routines.pairedStoreQuantized));
}
ABI_PopRegistersAndAdjustStack(registersInUse, false);
gpr.UnlockAll();
gpr.UnlockAllX();
}

View File

@ -20,6 +20,9 @@
#include "JitAsmCommon.h"
#include "JitBase.h"
// Register mask passed to the push/pop helpers and SafeWrite* calls in the
// quantized store routines: everything in ABI_ALL_CALLEE_SAVED except RAX,
// RCX, RDX, XMM0 and XMM1 (presumably because the routines use those as
// inputs/scratch themselves -- verify against the routine contract in the header).
// NOTE(review): the XMM terms are shifted by the raw enum value; the 64-bit
// Unix ABI_ALL_CALLEE_SAVED keeps xmm0..15 in bits 16-31. If XMM0/XMM1 are
// numbered 0/1, these two terms only repeat the RAX/RCX bits and XMM0/XMM1
// are not actually excluded -- confirm against the register enum.
#define QUANTIZED_REGS_TO_SAVE (ABI_ALL_CALLEE_SAVED & ~((1 << RAX) | (1 << RCX) | (1 << RDX) | \
(1 << XMM0) | (1 << XMM1)))
using namespace Gen;
static int temp32;
@ -141,14 +144,10 @@ static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
// I don't know whether the overflow actually happens in any games
// but it potentially can cause problems, so we need some clamping
#ifdef _M_X64
// TODO(ector): Improve 64-bit version
static void WriteDual32(u64 value, u32 address)
static void WriteDual32(u32 address)
{
Memory::Write_U32((u32)(value >> 32), address);
Memory::Write_U32((u32)value, address + 4);
Memory::Write_U64(*(u64 *) psTemp, address);
}
#endif
// See comment in header for in/outs.
void CommonAsmRoutines::GenQuantizedStores() {
@ -161,18 +160,20 @@ void CommonAsmRoutines::GenQuantizedStores() {
MOVQ_xmm(M(&psTemp[0]), XMM0);
MOV(64, R(RAX), M(&psTemp[0]));
TEST(32, R(ECX), Imm32(0x0C000000));
FixupBranch too_complex = J_CC(CC_NZ);
FixupBranch too_complex = J_CC(CC_NZ, true);
BSWAP(64, RAX);
MOV(64, MComplex(RBX, RCX, SCALE_1, 0), R(RAX));
FixupBranch skip_complex = J();
FixupBranch skip_complex = J(true);
SetJumpTarget(too_complex);
ABI_CallFunctionRR((void *)&WriteDual32, RAX, RCX, /* noProlog = */ true);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
ABI_CallFunctionR((void *)&WriteDual32, RCX);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
SetJumpTarget(skip_complex);
RET();
#else
MOVQ_xmm(M(&psTemp[0]), XMM0);
TEST(32, R(ECX), Imm32(0x0C000000));
FixupBranch argh = J_CC(CC_NZ);
FixupBranch argh = J_CC(CC_NZ, true);
MOV(32, R(EAX), M(&psTemp));
BSWAP(32, EAX);
AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
@ -180,13 +181,11 @@ void CommonAsmRoutines::GenQuantizedStores() {
MOV(32, R(EAX), M(((char*)&psTemp) + 4));
BSWAP(32, EAX);
MOV(32, MDisp(ECX, 4+(u32)Memory::base), R(EAX));
FixupBranch arg2 = J();
FixupBranch arg2 = J(true);
SetJumpTarget(argh);
MOV(32, R(EAX), M(((char*)&psTemp)));
ABI_CallFunctionRR((void *)&Memory::Write_U32, EAX, ECX, /* noProlog = */ true);
MOV(32, R(EAX), M(((char*)&psTemp)+4));
ADD(32, R(ECX), Imm32(4));
ABI_CallFunctionRR((void *)&Memory::Write_U32, EAX, ECX, /* noProlog = */ true);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
ABI_CallFunctionR((void *)&WriteDual32, ECX);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
SetJumpTarget(arg2);
RET();
#endif
@ -205,8 +204,8 @@ void CommonAsmRoutines::GenQuantizedStores() {
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(EAX), XMM0);
SafeWriteRegToReg(AX, ECX, 16, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
SafeWriteRegToReg(AX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
RET();
const u8* storePairedS8 = AlignCode4();
@ -224,8 +223,8 @@ void CommonAsmRoutines::GenQuantizedStores() {
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(EAX), XMM0);
SafeWriteRegToReg(AX, ECX, 16, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
SafeWriteRegToReg(AX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
RET();
const u8* storePairedU16 = AlignCode4();
@ -250,8 +249,8 @@ void CommonAsmRoutines::GenQuantizedStores() {
MOV(16, R(AX), M((char*)psTemp + 4));
BSWAP(32, EAX);
SafeWriteRegToReg(EAX, ECX, 32, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
SafeWriteRegToReg(EAX, ECX, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
RET();
const u8* storePairedS16 = AlignCode4();
@ -270,8 +269,8 @@ void CommonAsmRoutines::GenQuantizedStores() {
MOVD_xmm(R(EAX), XMM0);
BSWAP(32, EAX);
ROL(32, R(EAX), Imm8(16));
SafeWriteRegToReg(EAX, ECX, 32, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
SafeWriteRegToReg(EAX, ECX, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
RET();
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
@ -294,7 +293,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
// Easy!
const u8* storeSingleFloat = AlignCode4();
SafeWriteFloatToReg(XMM0, ECX, 0, SAFE_WRITE_NO_FASTMEM);
SafeWriteFloatToReg(XMM0, ECX, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_FASTMEM);
RET();
/*
if (cpu_info.bSSSE3) {
@ -317,7 +316,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M((void *)&m_255));
CVTTSS2SI(EAX, R(XMM0));
SafeWriteRegToReg(AL, ECX, 8, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
SafeWriteRegToReg(AL, ECX, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
RET();
const u8* storeSingleS8 = AlignCode4();
@ -327,7 +326,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
MAXSS(XMM0, M((void *)&m_m128));
MINSS(XMM0, M((void *)&m_127));
CVTTSS2SI(EAX, R(XMM0));
SafeWriteRegToReg(AL, ECX, 8, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
SafeWriteRegToReg(AL, ECX, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
RET();
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
@ -338,7 +337,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M((void *)&m_65535));
CVTTSS2SI(EAX, R(XMM0));
SafeWriteRegToReg(EAX, ECX, 16, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
SafeWriteRegToReg(EAX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
RET();
const u8* storeSingleS16 = AlignCode4();
@ -348,7 +347,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
MAXSS(XMM0, M((void *)&m_m32768));
MINSS(XMM0, M((void *)&m_32767));
CVTTSS2SI(EAX, R(XMM0));
SafeWriteRegToReg(EAX, ECX, 16, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
SafeWriteRegToReg(EAX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
RET();
singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));

View File

@ -311,14 +311,14 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
MOV(32, M(&PC), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write
bool noProlog = flags & SAFE_WRITE_NO_PROLOG;
bool swap = !(flags & SAFE_WRITE_NO_SWAP);
ABI_PushRegistersAndAdjustStack(registersInUse, false);
ABI_PushRegistersAndAdjustStack(registersInUse, noProlog);
switch (accessSize)
{
case 32: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U32) : ((void *)&Memory::Write_U32_Swap), reg_value, reg_addr, noProlog); break;
case 16: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U16) : ((void *)&Memory::Write_U16_Swap), reg_value, reg_addr, noProlog); break;
case 8: ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, noProlog); break;
case 32: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U32) : ((void *)&Memory::Write_U32_Swap), reg_value, reg_addr, false); break;
case 16: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U16) : ((void *)&Memory::Write_U16_Swap), reg_value, reg_addr, false); break;
case 8: ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, false); break;
}
ABI_PopRegistersAndAdjustStack(registersInUse, false);
ABI_PopRegistersAndAdjustStack(registersInUse, noProlog);
FixupBranch exit = J();
SetJumpTarget(fast);
UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap);