Hopefully fix all remaining quantizer issues in Mario Kart Wii:

  * must use a truncating float-to-int conversion, for example.
  * introduce optimized variants of the single value psq_st operation (JIT only).
  * fix bug in SafeWriteRegToReg when swap = false

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@4861 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
hrydgard 2010-01-16 22:44:49 +00:00
parent 734b0f5dd4
commit b84a1823b2
13 changed files with 131 additions and 103 deletions

View File

@ -1150,6 +1150,9 @@ void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5B, true, reg
// Packed double -> packed int32 conversion.
void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg)
{
	WriteSSEOp(64, 0xE6, false, regOp, arg);
}

// Packed float -> packed int32 conversion (rounds according to MXCSR).
void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg)
{
	WriteSSEOp(64, 0x5B, true, regOp, arg);
}

// Scalar float -> int32 conversion with truncation; the destination is a GPR.
void XEmitter::CVTTSS2SI(X64Reg xregdest, OpArg arg)
{
	WriteSSEOp(32, 0x2C, false, xregdest, arg);
}

// Packed float -> packed int32 conversion with truncation.
void XEmitter::CVTTPS2DQ(X64Reg xregdest, OpArg arg)
{
	WriteSSEOp(32, 0x5B, false, xregdest, arg);
}

// Byte-masked store; the second register operand is passed on as R(src).
void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src)
{
	WriteSSEOp(64, sseMASKMOVDQU, true, dest, R(src));
}

// Collects the sign bits of the packed floats in arg into the GPR dest.
void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg)
{
	WriteSSEOp(32, 0x50, true, dest, arg);
}

View File

@ -512,6 +512,9 @@ public:
// Packed int32 -> packed float.
void CVTDQ2PS(X64Reg regOp, OpArg arg);
// Packed float -> packed int32, rounding according to MXCSR.
void CVTPS2DQ(X64Reg regOp, OpArg arg);
// Scalar float -> int32 with truncation.
void CVTTSS2SI(X64Reg xregdest, OpArg arg); // Yeah, destination really is a GPR like EAX!
// Packed float -> packed int32 with truncation (destination is an XMM register).
void CVTTPS2DQ(X64Reg regOp, OpArg arg);
// SSE2: Packed integer instructions
void PACKSSDW(X64Reg dest, OpArg arg);
void PACKSSWB(X64Reg dest, OpArg arg);

View File

@ -324,7 +324,7 @@ THREAD_RETURN EmuThread(void *pArg)
VideoInitialize.Fifo_CPUBase = &ProcessorInterface::Fifo_CPUBase;
VideoInitialize.Fifo_CPUEnd = &ProcessorInterface::Fifo_CPUEnd;
VideoInitialize.Fifo_CPUWritePointer = &ProcessorInterface::Fifo_CPUWritePointer;
VideoInitialize.bAutoAspectIs16_9 = _CoreParameter.bWii ? SConfig::GetInstance().m_SYSCONF->GetData<u8>("IPL.AR") : false;
VideoInitialize.bAutoAspectIs16_9 = _CoreParameter.bWii ? (SConfig::GetInstance().m_SYSCONF->GetData<u8>("IPL.AR") ? true : false) : false;
Plugins.GetVideo()->Initialize(&VideoInitialize); // Call the dll

View File

@ -48,10 +48,6 @@ may be redirected here (for example to Read_U32()).
#include "../Debugger/Debugger_SymbolMap.h"
#include "../PluginManager.h"
// Declarations and definitions
// ----------------
namespace Memory
{
@ -75,10 +71,9 @@ u8* base = NULL;
MemArena g_arena;
// ==============
// STATE_TO_SAVE (applies to a lot of things in this file)
// STATE_TO_SAVE
bool m_IsInitialized = false; // Save the Init(), Shutdown() state
// END STATE_TO_SAVE
// 64-bit: Pointers to low-mem (sub-0x10000000) mirror
// 32-bit: Same as the corresponding physical/virtual pointers.
@ -130,8 +125,6 @@ void HW_Default_Write(const T _Data, const u32 _Address){ ERROR_LOG(MASTER_LOG,
// Fallback handler for reads from an unmapped hardware register: logs the
// access width (in bits) and the address, then trips a debug assert.
// _Data is taken by value, so nothing is returned to the caller through it.
template <class T>
void HW_Default_Read(T _Data, const u32 _Address)
{
	// sizeof yields size_t; "%i" consumes an int, so narrow explicitly to keep
	// the variadic argument list in step with the format string.
	ERROR_LOG(MASTER_LOG, "Illegal HW Read%i %08x", static_cast<int>(sizeof(T) * 8), _Address);
	_dbg_assert_(MEMMAP, 0);
}
u32 CheckDTLB(u32 _Address, XCheckTLBFlag _Flag);
// Page geometry: a page spans (1 << PAGE_SHIFT) bytes.
#define PAGE_SHIFT 10
#define PAGE_SIZE (1 << PAGE_SHIFT)
// Low-bit mask covering one page. This used to be (PAGE_SHIFT - 1), which
// evaluates to 9 (binary 1001) and is not a contiguous mask.
// NOTE(review): the users of PAGE_MASK are not visible from here; confirm that
// none of them relied on the old value.
#define PAGE_MASK (PAGE_SIZE - 1)
@ -606,12 +599,10 @@ void CheckForBadAddresses(u32 Address, u32 Data, bool Read, int Bits)
if(Read)
{
WARN_LOG(CONSOLE, "Read%i: Program tried to read [%08x] from [%08x]", Bits, Address);
//PanicAlert("Write_U32: Program tried to write [%08x] to [%08x]", _Address);
}
else
{
ERROR_LOG(CONSOLE, "Write%i: Program tried to write [%08x] to [%08x]", Bits, Data, Address);
//PanicAlert("Read: Program tried to write [%08x] to [%08x]", Data, Address);
}
}
@ -620,16 +611,14 @@ void CheckForBadAddresses(u32 Address, u32 Data, bool Read, int Bits)
if(Read)
{
WARN_LOG(CONSOLE, "Read%i: Program read [0x%08x] from [0x%08x] * * * 0 * * *", Bits, Data, Address);
//PanicAlert("Read: Program read [%08x] from [%08x]", Data, Address);
}
else
{
WARN_LOG(CONSOLE, "Write%i: Program wrote [0x%08x] to [0x%08x] * * * 0 * * *", Bits, Data, Address);
//PanicAlert("Read: Program wrote [%08x] to [%08x]", Data, Address);
}
}
/* Try to figure out where the dev/di Ioctl arguments are stored (including buffer out), so we can
find the bad one */
// Try to figure out where the dev/di Ioctl arguments are stored (including buffer out), so we can
// find the bad one
if(
Data == 0x1090f4c0 // good out buffer right before it, for sound/smashbros_sound.brsar
|| Data == 0x10913b00 // second one
@ -646,12 +635,10 @@ void CheckForBadAddresses(u32 Address, u32 Data, bool Read, int Bits)
if(Read)
{
ERROR_LOG(CONSOLE, "Read%i: Program read [0x%08x] from [0x%08x] * * * * * * * * * * * *", Bits, Data, Address);
//PanicAlert("Read%i: Program read [%08x] from [%08x]", Bits, Data, Address);
}
else
{
ERROR_LOG(CONSOLE, "Write%i: Program wrote [0x%08x] to [0x%08x] * * * * * * * * * * * *", Bits,Data, Address);
//PanicAlert("Write%i: Program wrote [0x%08x] to [0x%08x]", Bits, Data, Address);
}
}
}
@ -683,9 +670,6 @@ void Memset(const u32 _Address, const u8 _iValue, const u32 _iLength)
}
else
{
// (comment for old implementation) : F|RES: rogue squadron and other games use the TLB ... so this cant work
// fixed implementation:
for (u32 i = 0; i < _iLength; i++)
Write_U8(_iValue, _Address + i);
}
@ -839,12 +823,9 @@ bool IsRAMAddress(const u32 addr, bool allow_locked_cache)
return true;
else
return false;
default:
return false;
}
}
} // namespace

View File

@ -142,6 +142,10 @@ void Write_U16(const u16 _Data, const u32 _Address);
void Write_U32(const u32 _Data, const u32 _Address);
void Write_U64(const u64 _Data, const u32 _Address);
// Variants that byte-swap _Data (Common::swap16/32/64) and then forward to the
// matching Write_U16 / Write_U32 / Write_U64.
void Write_U16_Swap(const u16 _Data, const u32 _Address);
void Write_U32_Swap(const u32 _Data, const u32 _Address);
void Write_U64_Swap(const u64 _Data, const u32 _Address);
void WriteHW_U32(const u32 _Data, const u32 _Address);
void GetString(std::string& _string, const u32 _Address);

View File

@ -409,6 +409,9 @@ void Write_U16(const u16 _Data, const u32 _Address)
WriteToHardware<u16>(_Address, _Data, _Address, FLAG_WRITE);
}
void Write_U16_Swap(const u16 _Data, const u32 _Address) {
Write_U16(Common::swap16(_Data), _Address);
}
void Write_U32(const u32 _Data, const u32 _Address)
@ -423,7 +426,9 @@ void Write_U32(const u32 _Data, const u32 _Address)
#endif
WriteToHardware<u32>(_Address, _Data, _Address, FLAG_WRITE);
}
void Write_U32_Swap(const u32 _Data, const u32 _Address) {
Write_U32(Common::swap32(_Data), _Address);
}
void Write_U64(const u64 _Data, const u32 _Address)
{
@ -438,6 +443,9 @@ void Write_U64(const u64 _Data, const u32 _Address)
WriteToHardware<u64>(_Address, _Data, _Address + 4, FLAG_WRITE);
}
void Write_U64_Swap(const u32 _Data, const u32 _Address) {
Write_U64(Common::swap64(_Data), _Address);
}
u8 ReadUnchecked_U8(const u32 _Address)
{

View File

@ -76,7 +76,7 @@ inline T CLAMP(T a, T bottom, T top) {
void Helper_Quantize(const u32 _Addr, const double _fValue,
const EQuantizeType _quantizeType, const unsigned int _uScale)
{
switch(_quantizeType)
switch (_quantizeType)
{
case QUANTIZE_FLOAT:
Memory::Write_U32( ConvertToSingleFTZ( *(u64*)&_fValue ), _Addr );
@ -222,7 +222,7 @@ void psq_st(UGeckoInstruction _inst)
}
else
{
Helper_Quantize( EA, (float)rPS0(_inst.RS), stType, stScale );
Helper_Quantize( EA, rPS0(_inst.RS), stType, stScale );
}
}

View File

@ -245,6 +245,7 @@ void AsmRoutineManager::GenerateCommon()
GenQuantizedLoads();
GenQuantizedStores();
GenQuantizedSingleStores();
//CMPSD(R(XMM0), M(&zero),
// TODO

View File

@ -72,71 +72,6 @@ void Jit64::psq_st(UGeckoInstruction inst)
const EQuantizeType stType = static_cast<EQuantizeType>(gqr.ST_TYPE);
int stScale = gqr.ST_SCALE;
if (inst.W) {
Default(inst);
return;
// PanicAlert("W=1: stType %i stScale %i update %i", (int)stType, (int)stScale, (int)update);
// It's fairly common that games write stuff to the pipe using this. Then, it's pretty much only
// floats so that's what we'll work on.
switch (stType)
{
case QUANTIZE_FLOAT:
{
// This one has quite a bit of optimization potential.
if (gpr.R(a).IsImm())
{
PanicAlert("Imm: %08x", gpr.R(a).offset);
}
gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
gpr.Lock(a);
fpr.Lock(s);
// Check that the quantizer is set the way we expect.
INT3();
CMP(16, M(&rSPR(SPR_GQR0 + inst.I)), Imm16(store_gqr));
FixupBranch skip_opt = J_CC(CC_NE);
if (update)
gpr.LoadToX64(a, true, true);
MOV(32, R(ABI_PARAM2), gpr.R(a));
if (offset)
ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
if (update && offset)
MOV(32, gpr.R(a), R(ABI_PARAM2));
CVTSD2SS(XMM0, fpr.R(s));
MOVD_xmm(M(&temp64), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp64));
FixupBranch argh = J_CC(CC_NZ);
BSWAP(32, ABI_PARAM1);
#ifdef _M_X64
MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
#else
MOV(32, R(EAX), R(ABI_PARAM2));
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
#endif
FixupBranch skip_call = J();
SetJumpTarget(argh);
ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
SetJumpTarget(skip_call);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();
FixupBranch skip_slow = J();
SetJumpTarget(skip_opt);
Default(inst);
SetJumpTarget(skip_slow);
return;
}
default:
Default(inst);
return;
}
}
#if 0
// Is this specialization still worth it? Let's keep it for now. It's probably
// not very risky since a game most likely wouldn't use the same code to process
@ -176,8 +111,16 @@ void Jit64::psq_st(UGeckoInstruction inst)
#else
SHL(32, R(EDX), Imm8(3));
#endif
if (inst.W) {
// One value
XORPS(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions.
CVTSD2SS(XMM0, fpr.R(s));
CALLptr(MDisp(EDX, (u32)(u64)asm_routines.singleStoreQuantized));
} else {
// Pair of values
CVTPD2PS(XMM0, fpr.R(s));
CALLptr(MDisp(EDX, (u32)(u64)asm_routines.pairedStoreQuantized));
}
gpr.UnlockAll();
gpr.UnlockAllX();
}

View File

@ -251,6 +251,7 @@ void AsmRoutineManager::GenerateCommon()
GenQuantizedLoads();
GenQuantizedStores();
GenQuantizedSingleStores();
//CMPSD(R(XMM0), M(&zero),
// TODO

View File

@ -137,8 +137,12 @@ static const float GC_ALIGNED16(m_dequantizeTableS[]) =
static float GC_ALIGNED16(psTemp[4]);
static const float m_65535 = 65535.0f;
// Upper/lower clamp bounds for the integer store formats. They are used as
// memory operands of the MINSS/MAXSS clamps in GenQuantizedSingleStores
// (unsigned formats take their lower bound of zero from a cleared register instead).
static const float GC_ALIGNED16(m_65535) = 65535.0f;
static const float GC_ALIGNED16(m_32767) = 32767.0f;
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
static const float GC_ALIGNED16(m_255) = 255.0f;
static const float GC_ALIGNED16(m_127) = 127.0f;
static const float GC_ALIGNED16(m_m128) = -128.0f;
#define QUANTIZE_OVERFLOW_SAFE
@ -205,7 +209,7 @@ void CommonAsmRoutines::GenQuantizedStores() {
PUNPCKLDQ(XMM1, R(XMM1));
MINPS(XMM0, R(XMM1));
#endif
CVTPS2DQ(XMM0, R(XMM0));
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKUSWB(XMM0, R(XMM0));
MOVD_xmm(R(EAX), XMM0);
@ -223,7 +227,7 @@ void CommonAsmRoutines::GenQuantizedStores() {
PUNPCKLDQ(XMM1, R(XMM1));
MINPS(XMM0, R(XMM1));
#endif
CVTPS2DQ(XMM0, R(XMM0));
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
PACKSSWB(XMM0, R(XMM0));
MOVD_xmm(R(EAX), XMM0);
@ -245,7 +249,7 @@ void CommonAsmRoutines::GenQuantizedStores() {
PUNPCKLDQ(XMM1, R(XMM1));
MINPS(XMM0, R(XMM1));
CVTPS2DQ(XMM0, R(XMM0));
CVTTPS2DQ(XMM0, R(XMM0));
MOVQ_xmm(M(psTemp), XMM0);
// place ps[0] into the higher word, ps[1] into the lower
// so no need in ROL after BSWAP
@ -269,7 +273,7 @@ void CommonAsmRoutines::GenQuantizedStores() {
PUNPCKLDQ(XMM1, R(XMM1));
MINPS(XMM0, R(XMM1));
#endif
CVTPS2DQ(XMM0, R(XMM0));
CVTTPS2DQ(XMM0, R(XMM0));
PACKSSDW(XMM0, R(XMM0));
MOVD_xmm(R(EAX), XMM0);
BSWAP(32, EAX);
@ -288,6 +292,79 @@ void CommonAsmRoutines::GenQuantizedStores() {
pairedStoreQuantized[7] = storePairedS16;
}
// See comment in header for in/outs.
// Emits one store routine per single-value store type and records the entry
// points in singleStoreQuantized[]. Every routine ends in RET.
void CommonAsmRoutines::GenQuantizedSingleStores() {
// Shared target for the table slots that have no routine; it is just a UD2.
const u8* storeSingleIllegal = AlignCode4();
UD2();
// Easy!
// Float: the low 32 bits of XMM0 go through psTemp[0] into EAX and are written
// to the address in ECX.
const u8* storeSingleFloat = AlignCode4();
if (cpu_info.bSSSE3) {
// The bytes are shuffled inside XMM0 first (presumably a byte swap, going by
// the name pbswapShuffle2x4), so the write is issued with swap = false.
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
// TODO: SafeWriteFloat
MOVSS(M(&psTemp[0]), XMM0);
MOV(32, R(EAX), M(&psTemp[0]));
SafeWriteRegToReg(EAX, ECX, 32, 0, false);
} else {
// No PSHUFB available: leave the swapping to the write (swap = true).
MOVSS(M(&psTemp[0]), XMM0);
MOV(32, R(EAX), M(&psTemp[0]));
SafeWriteRegToReg(EAX, ECX, 32, 0, true);
}
RET();
// U8: scale, clamp to [0, 255], convert with truncation, write 8 bits.
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
// EAX >> 6 is used as the offset of the scale factor in m_quantizeTableS.
// NOTE(review): what the caller leaves in EAX is not visible in this function - confirm against the psq_st call site.
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_quantizeTableS));
MULSS(XMM0, R(XMM1));
// XMM1 = 0.0f serves as the lower clamp bound.
PXOR(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M((void *)&m_255));
CVTTSS2SI(EAX, R(XMM0));
SafeWriteRegToReg(AL, ECX, 8, 0, true);
RET();
// S8: scale, clamp to [-128, 127], convert with truncation, write 8 bits.
const u8* storeSingleS8 = AlignCode4();
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_quantizeTableS));
MULSS(XMM0, R(XMM1));
MAXSS(XMM0, M((void *)&m_m128));
MINSS(XMM0, M((void *)&m_127));
CVTTSS2SI(EAX, R(XMM0));
SafeWriteRegToReg(AL, ECX, 8, 0, true);
RET();
// U16: scale, clamp to [0, 65535], convert with truncation, write 16 bits.
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_quantizeTableS));
// NOTE(review): this routine duplicates the scale and multiplies with MULPS,
// while the other three routines use MULSS. Only the low lane is consumed
// below, so MULSS would presumably suffice here as well.
PUNPCKLDQ(XMM1, R(XMM1));
MULPS(XMM0, R(XMM1));
// XMM1 = 0.0f serves as the lower clamp bound.
PXOR(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M((void *)&m_65535));
CVTTSS2SI(EAX, R(XMM0));
SafeWriteRegToReg(EAX, ECX, 16, 0, true);
RET();
// S16: scale, clamp to [-32768, 32767], convert with truncation, write 16 bits.
const u8* storeSingleS16 = AlignCode4();
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_quantizeTableS));
MULSS(XMM0, R(XMM1));
MAXSS(XMM0, M((void *)&m_m32768));
MINSS(XMM0, M((void *)&m_32767));
CVTTSS2SI(EAX, R(XMM0));
SafeWriteRegToReg(EAX, ECX, 16, 0, true);
RET();
// Dispatch table: slot 0 is the float store, slots 1-3 trap, slots 4-7 are
// U8, U16, S8 and S16 in that order.
singleStoreQuantized[0] = storeSingleFloat;
singleStoreQuantized[1] = storeSingleIllegal;
singleStoreQuantized[2] = storeSingleIllegal;
singleStoreQuantized[3] = storeSingleIllegal;
singleStoreQuantized[4] = storeSingleU8;
singleStoreQuantized[5] = storeSingleU16;
singleStoreQuantized[6] = storeSingleS8;
singleStoreQuantized[7] = storeSingleS16;
}
void CommonAsmRoutines::GenQuantizedLoads() {
const u8* loadPairedIllegal = AlignCode4();
UD2();

View File

@ -24,6 +24,8 @@ class CommonAsmRoutines : public EmuCodeBlock {
protected:
void GenQuantizedLoads();
void GenQuantizedStores();
void GenQuantizedSingleStores();
public:
void GenFifoWrite(int size);
void GenFifoXmm64Write();
@ -42,6 +44,11 @@ public:
// Out: Nothing.
// Trashes: EAX ECX EDX
const u8 GC_ALIGNED16(*pairedStoreQuantized[8]);
// Entry points emitted by GenQuantizedSingleStores.
// In: array index: store type (0 = float, 4 = U8, 5 = U16, 6 = S8, 7 = S16; slots 1-3 point at a UD2).
// In: ECX: Address to write to.
// In: XMM0: Bottom 32-bit slot holds the float to be written.
// In: EAX: for the integer types, EAX >> 6 is used as the offset of the scale
//     factor in the quantize table (NOTE(review): confirm what the caller loads into EAX).
// Trashes: at least EAX and XMM1 (XMM1 only in the integer routines); whatever
//     SafeWriteRegToReg clobbers in addition is not visible from here.
const u8 GC_ALIGNED16(*singleStoreQuantized[8]);
};
#endif

View File

@ -129,8 +129,8 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
FixupBranch argh = J_CC(CC_Z);
switch (accessSize)
{
case 32: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), reg_value, reg_addr); break;
case 16: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U16, 2), reg_value, reg_addr); break;
case 32: ABI_CallFunctionRR(thunks.ProtectFunction(swap ? ((void *)&Memory::Write_U32) : ((void *)&Memory::Write_U32_Swap), 2), reg_value, reg_addr); break;
case 16: ABI_CallFunctionRR(thunks.ProtectFunction(swap ? ((void *)&Memory::Write_U16) : ((void *)&Memory::Write_U16_Swap), 2), reg_value, reg_addr); break;
case 8: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U8, 2), reg_value, reg_addr); break;
}
FixupBranch arg2 = J();