Mirror of https://github.com/dolphin-emu/dolphin.git, synced 2025-02-13 07:49:19 +01:00
Don't push registers before pairedStoreQuantized, that's dumb.
And fix up some related code. It would probably be good to unify the stack handling further, rather than having ABI_PushRegistersAndAdjustStack do part of it and ABI_AlignStack the rest, which causes unnecessary subtract instructions on Linux x86 (only).
parent a91469ffa5, commit 3679f9ba60
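To see the shape of the change before reading the hunks, here is a rough before/after sketch of the psq_st call site, built only from emitter calls that appear in the diff below (illustrative, not a compilable excerpt of Dolphin):

// Before: every psq_st blanket-saved all live registers around the call,
// even on the fast path that never leaves JIT code.
u32 registersInUse = RegistersInUse();
ABI_PushRegistersAndAdjustStack(registersInUse, false);
CALLptr(MScaled(EDX, addr_scale, (u32)(u64)asm_routines.pairedStoreQuantized));
ABI_PopRegistersAndAdjustStack(registersInUse, false);

// After: the call site just calls the routine. The common routine saves
// QUANTIZED_REGS_TO_SAVE itself, and only on the slow path that has to
// call back into C++ (WriteDual32 / Memory::Write_*).
CALLptr(MScaled(EDX, addr_scale, (u32)(u64)asm_routines.pairedStoreQuantized));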
@@ -43,6 +43,8 @@
 // 32-bit bog standard cdecl, shared between linux and windows
 // MacOSX 32-bit is same as System V with a few exceptions that we probably don't care much about.

+#define ABI_ALL_CALLEE_SAVED ((1 << EAX) | (1 << ECX) | (1 << EDX))
+
 #else // 64 bit calling convention

 #ifdef _WIN32 // 64-bit Windows - the really exotic calling convention

@@ -52,7 +54,12 @@
 #define ABI_PARAM3 R8
 #define ABI_PARAM4 R9

-#else //64-bit Unix (hopefully MacOSX too)
+#define ABI_ALL_CALLEE_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << R8) | \
+                              (1 << R9) | (1 << R10) | (1 << R11) | \
+                              (1 << XMM0) | (1 << XMM1) | (1 << XMM2) | (1 << XMM3) | \
+                              (1 << XMM4) | (1 << XMM5))
+
+#else //64-bit Unix / OS X

 #define ABI_PARAM1 RDI
 #define ABI_PARAM2 RSI

@@ -61,6 +68,10 @@
 #define ABI_PARAM5 R8
 #define ABI_PARAM6 R9

+#define ABI_ALL_CALLEE_SAVED ((1 << RAX) | (1 << RCX) | (1 << RDX) | (1 << RDI) | \
+                              (1 << RSI) | (1 << R8) | (1 << R9) | (1 << R10) | (1 << R11) | \
+                              0xffff0000 /* xmm0..15 */)
+
 #endif // WIN32

 #endif // X86
@@ -101,8 +101,6 @@ void Jit64::psq_st(UGeckoInstruction inst)
 #else
     int addr_scale = SCALE_8;
 #endif
-    u32 registersInUse = RegistersInUse();
-    ABI_PushRegistersAndAdjustStack(registersInUse, false);
     if (inst.W) {
         // One value
         XORPS(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions.

@@ -113,7 +111,6 @@ void Jit64::psq_st(UGeckoInstruction inst)
         CVTPD2PS(XMM0, fpr.R(s));
         CALLptr(MScaled(EDX, addr_scale, (u32)(u64)asm_routines.pairedStoreQuantized));
     }
-    ABI_PopRegistersAndAdjustStack(registersInUse, false);
     gpr.UnlockAll();
     gpr.UnlockAllX();
 }
@@ -20,6 +20,9 @@
 #include "JitAsmCommon.h"
 #include "JitBase.h"

+#define QUANTIZED_REGS_TO_SAVE (ABI_ALL_CALLEE_SAVED & ~((1 << RAX) | (1 << RCX) | (1 << RDX) | \
+                                                         (1 << XMM0) | (1 << XMM1)))
+
 using namespace Gen;

 static int temp32;
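As a minimal, self-contained sketch of how these masks compose, assuming GPRs occupy bits 0-15 and XMM0-XMM15 bits 16-31 of the mask (as the 0xffff0000 /* xmm0..15 */ comment in the ABI hunk suggests), with illustrative enum values rather than Dolphin's real emitter enum:

#include <cstdint>
#include <cstdio>

enum : int { RAX = 0, RCX = 1, RDX = 2, RSI = 6, RDI = 7,
             R8 = 8, R9 = 9, R10 = 10, R11 = 11,
             XMM0 = 16, XMM1 = 17 };   // assumed numbering, see note above

int main()
{
    // SysV flavour of ABI_ALL_CALLEE_SAVED from the ABI hunk.
    const uint32_t all_saved =
        (1u << RAX) | (1u << RCX) | (1u << RDX) | (1u << RDI) | (1u << RSI) |
        (1u << R8)  | (1u << R9)  | (1u << R10) | (1u << R11) |
        0xffff0000u; /* xmm0..15 */

    // QUANTIZED_REGS_TO_SAVE drops the registers the store routines
    // clobber anyway (RAX/RCX/RDX and XMM0/XMM1).
    const uint32_t quantized_regs_to_save =
        all_saved & ~((1u << RAX) | (1u << RCX) | (1u << RDX) |
                      (1u << XMM0) | (1u << XMM1));

    std::printf("all_saved = %08x, quantized = %08x\n",
                all_saved, quantized_regs_to_save);
    return 0;
}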
@@ -141,14 +144,10 @@ static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
 // I don't know whether the overflow actually happens in any games
 // but it potentially can cause problems, so we need some clamping

-#ifdef _M_X64
-// TODO(ector): Improve 64-bit version
-static void WriteDual32(u64 value, u32 address)
+static void WriteDual32(u32 address)
 {
-    Memory::Write_U32((u32)(value >> 32), address);
-    Memory::Write_U32((u32)value, address + 4);
+    Memory::Write_U64(*(u64 *) psTemp, address);
 }
-#endif

 // See comment in header for in/outs.
 void CommonAsmRoutines::GenQuantizedStores() {
@@ -161,18 +160,20 @@ void CommonAsmRoutines::GenQuantizedStores() {
     MOVQ_xmm(M(&psTemp[0]), XMM0);
     MOV(64, R(RAX), M(&psTemp[0]));
     TEST(32, R(ECX), Imm32(0x0C000000));
-    FixupBranch too_complex = J_CC(CC_NZ);
+    FixupBranch too_complex = J_CC(CC_NZ, true);
     BSWAP(64, RAX);
     MOV(64, MComplex(RBX, RCX, SCALE_1, 0), R(RAX));
-    FixupBranch skip_complex = J();
+    FixupBranch skip_complex = J(true);
     SetJumpTarget(too_complex);
-    ABI_CallFunctionRR((void *)&WriteDual32, RAX, RCX, /* noProlog = */ true);
+    ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
+    ABI_CallFunctionR((void *)&WriteDual32, RCX);
+    ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
     SetJumpTarget(skip_complex);
     RET();
 #else
     MOVQ_xmm(M(&psTemp[0]), XMM0);
     TEST(32, R(ECX), Imm32(0x0C000000));
-    FixupBranch argh = J_CC(CC_NZ);
+    FixupBranch argh = J_CC(CC_NZ, true);
     MOV(32, R(EAX), M(&psTemp));
     BSWAP(32, EAX);
     AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
@@ -180,13 +181,11 @@ void CommonAsmRoutines::GenQuantizedStores() {
     MOV(32, R(EAX), M(((char*)&psTemp) + 4));
     BSWAP(32, EAX);
     MOV(32, MDisp(ECX, 4+(u32)Memory::base), R(EAX));
-    FixupBranch arg2 = J();
+    FixupBranch arg2 = J(true);
     SetJumpTarget(argh);
-    MOV(32, R(EAX), M(((char*)&psTemp)));
-    ABI_CallFunctionRR((void *)&Memory::Write_U32, EAX, ECX, /* noProlog = */ true);
-    MOV(32, R(EAX), M(((char*)&psTemp)+4));
-    ADD(32, R(ECX), Imm32(4));
-    ABI_CallFunctionRR((void *)&Memory::Write_U32, EAX, ECX, /* noProlog = */ true);
+    ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
+    ABI_CallFunctionR((void *)&WriteDual32, ECX);
+    ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, true);
     SetJumpTarget(arg2);
     RET();
 #endif
@@ -205,8 +204,8 @@ void CommonAsmRoutines::GenQuantizedStores() {
     PACKSSDW(XMM0, R(XMM0));
     PACKUSWB(XMM0, R(XMM0));
     MOVD_xmm(R(EAX), XMM0);
-    SafeWriteRegToReg(AX, ECX, 16, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+    SafeWriteRegToReg(AX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);

     RET();

     const u8* storePairedS8 = AlignCode4();

@@ -224,8 +223,8 @@ void CommonAsmRoutines::GenQuantizedStores() {
     PACKSSWB(XMM0, R(XMM0));
     MOVD_xmm(R(EAX), XMM0);

-    SafeWriteRegToReg(AX, ECX, 16, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+    SafeWriteRegToReg(AX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);

     RET();

     const u8* storePairedU16 = AlignCode4();

@@ -250,8 +249,8 @@ void CommonAsmRoutines::GenQuantizedStores() {
     MOV(16, R(AX), M((char*)psTemp + 4));

     BSWAP(32, EAX);
-    SafeWriteRegToReg(EAX, ECX, 32, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+    SafeWriteRegToReg(EAX, ECX, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);

     RET();

     const u8* storePairedS16 = AlignCode4();

@@ -270,8 +269,8 @@ void CommonAsmRoutines::GenQuantizedStores() {
     MOVD_xmm(R(EAX), XMM0);
     BSWAP(32, EAX);
     ROL(32, R(EAX), Imm8(16));
-    SafeWriteRegToReg(EAX, ECX, 32, 0, 0, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+    SafeWriteRegToReg(EAX, ECX, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_SWAP | SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);

     RET();

     pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
@@ -294,7 +293,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {

     // Easy!
     const u8* storeSingleFloat = AlignCode4();
-    SafeWriteFloatToReg(XMM0, ECX, 0, SAFE_WRITE_NO_FASTMEM);
+    SafeWriteFloatToReg(XMM0, ECX, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_FASTMEM);
     RET();
     /*
     if (cpu_info.bSSSE3) {

@@ -317,7 +316,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
     MAXSS(XMM0, R(XMM1));
     MINSS(XMM0, M((void *)&m_255));
     CVTTSS2SI(EAX, R(XMM0));
-    SafeWriteRegToReg(AL, ECX, 8, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+    SafeWriteRegToReg(AL, ECX, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
     RET();

     const u8* storeSingleS8 = AlignCode4();

@@ -327,7 +326,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
     MAXSS(XMM0, M((void *)&m_m128));
     MINSS(XMM0, M((void *)&m_127));
     CVTTSS2SI(EAX, R(XMM0));
-    SafeWriteRegToReg(AL, ECX, 8, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+    SafeWriteRegToReg(AL, ECX, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
     RET();

     const u8* storeSingleU16 = AlignCode4(); // Used by MKWii

@@ -338,7 +337,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
     MAXSS(XMM0, R(XMM1));
     MINSS(XMM0, M((void *)&m_65535));
     CVTTSS2SI(EAX, R(XMM0));
-    SafeWriteRegToReg(EAX, ECX, 16, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+    SafeWriteRegToReg(EAX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
     RET();

     const u8* storeSingleS16 = AlignCode4();

@@ -348,7 +347,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
     MAXSS(XMM0, M((void *)&m_m32768));
     MINSS(XMM0, M((void *)&m_32767));
     CVTTSS2SI(EAX, R(XMM0));
-    SafeWriteRegToReg(EAX, ECX, 16, 0, 0, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
+    SafeWriteRegToReg(EAX, ECX, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_WRITE_NO_PROLOG | SAFE_WRITE_NO_FASTMEM);
     RET();

     singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
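The storeSingle* sequences above, together with the clamping comment earlier in the file, boil down to clamp-then-truncate. A rough scalar equivalent of the U8 case, assuming the register MAXSS compares against holds the lower bound (zero for the unsigned variants); illustrative only, not Dolphin code:

#include <cstdint>

static uint8_t QuantizeToU8(float x)
{
    if (x < 0.0f)   x = 0.0f;     // MAXSS(XMM0, XMM1): clamp from below
    if (x > 255.0f) x = 255.0f;   // MINSS(XMM0, m_255): clamp from above
    return (uint8_t)(int32_t)x;   // CVTTSS2SI: truncate toward zero
}

The S8, U16 and S16 variants only change the bounds (m_m128/m_127, m_65535, m_m32768/m_32767).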
@@ -311,14 +311,14 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
     MOV(32, M(&PC), Imm32(jit->js.compilerPC)); // Helps external systems know which instruction triggered the write
     bool noProlog = flags & SAFE_WRITE_NO_PROLOG;
     bool swap = !(flags & SAFE_WRITE_NO_SWAP);
-    ABI_PushRegistersAndAdjustStack(registersInUse, false);
+    ABI_PushRegistersAndAdjustStack(registersInUse, noProlog);
     switch (accessSize)
     {
-    case 32: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U32) : ((void *)&Memory::Write_U32_Swap), reg_value, reg_addr, noProlog); break;
-    case 16: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U16) : ((void *)&Memory::Write_U16_Swap), reg_value, reg_addr, noProlog); break;
-    case 8: ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, noProlog); break;
+    case 32: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U32) : ((void *)&Memory::Write_U32_Swap), reg_value, reg_addr, false); break;
+    case 16: ABI_CallFunctionRR(swap ? ((void *)&Memory::Write_U16) : ((void *)&Memory::Write_U16_Swap), reg_value, reg_addr, false); break;
+    case 8: ABI_CallFunctionRR((void *)&Memory::Write_U8, reg_value, reg_addr, false); break;
     }
-    ABI_PopRegistersAndAdjustStack(registersInUse, false);
+    ABI_PopRegistersAndAdjustStack(registersInUse, noProlog);
     FixupBranch exit = J();
     SetJumpTarget(fast);
     UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0, swap);