Merge pull request #6197 from degasus/GP

Jit: Optimize gather pipe usage.
This commit is contained in:
Markus Wick 2017-11-19 11:52:20 +01:00 committed by GitHub
commit 9178a6e636
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 53 additions and 95 deletions

View File

@@ -13,6 +13,7 @@
#include "Core/HW/Memmap.h" #include "Core/HW/Memmap.h"
#include "Core/HW/ProcessorInterface.h" #include "Core/HW/ProcessorInterface.h"
#include "Core/PowerPC/JitInterface.h" #include "Core/PowerPC/JitInterface.h"
#include "Core/PowerPC/PowerPC.h"
#include "VideoCommon/CommandProcessor.h" #include "VideoCommon/CommandProcessor.h"
namespace GPFifo namespace GPFifo
@@ -31,17 +32,14 @@ namespace GPFifo
// More room for the fastmodes // More room for the fastmodes
alignas(32) static u8 s_gather_pipe[GATHER_PIPE_SIZE * 16]; alignas(32) static u8 s_gather_pipe[GATHER_PIPE_SIZE * 16];
// pipe pointer
u8* g_gather_pipe_ptr = s_gather_pipe;
static size_t GetGatherPipeCount() static size_t GetGatherPipeCount()
{ {
return g_gather_pipe_ptr - s_gather_pipe; return PowerPC::ppcState.gather_pipe_ptr - s_gather_pipe;
} }
static void SetGatherPipeCount(size_t size) static void SetGatherPipeCount(size_t size)
{ {
g_gather_pipe_ptr = s_gather_pipe + size; PowerPC::ppcState.gather_pipe_ptr = s_gather_pipe + size;
} }
void DoState(PointerWrap& p) void DoState(PointerWrap& p)
@@ -55,6 +53,7 @@ void DoState(PointerWrap& p)
void Init() void Init()
{ {
ResetGatherPipe(); ResetGatherPipe();
PowerPC::ppcState.gather_pipe_base_ptr = s_gather_pipe;
memset(s_gather_pipe, 0, sizeof(s_gather_pipe)); memset(s_gather_pipe, 0, sizeof(s_gather_pipe));
} }
@@ -68,7 +67,7 @@ void ResetGatherPipe()
SetGatherPipeCount(0); SetGatherPipeCount(0);
} }
static void UpdateGatherPipe() void UpdateGatherPipe()
{ {
size_t pipe_count = GetGatherPipeCount(); size_t pipe_count = GetGatherPipeCount();
size_t processed; size_t processed;
@@ -144,29 +143,29 @@ void Write64(const u64 value)
void FastWrite8(const u8 value) void FastWrite8(const u8 value)
{ {
*g_gather_pipe_ptr = value; *PowerPC::ppcState.gather_pipe_ptr = value;
g_gather_pipe_ptr += sizeof(u8); PowerPC::ppcState.gather_pipe_ptr += sizeof(u8);
} }
void FastWrite16(u16 value) void FastWrite16(u16 value)
{ {
value = Common::swap16(value); value = Common::swap16(value);
std::memcpy(g_gather_pipe_ptr, &value, sizeof(u16)); std::memcpy(PowerPC::ppcState.gather_pipe_ptr, &value, sizeof(u16));
g_gather_pipe_ptr += sizeof(u16); PowerPC::ppcState.gather_pipe_ptr += sizeof(u16);
} }
void FastWrite32(u32 value) void FastWrite32(u32 value)
{ {
value = Common::swap32(value); value = Common::swap32(value);
std::memcpy(g_gather_pipe_ptr, &value, sizeof(u32)); std::memcpy(PowerPC::ppcState.gather_pipe_ptr, &value, sizeof(u32));
g_gather_pipe_ptr += sizeof(u32); PowerPC::ppcState.gather_pipe_ptr += sizeof(u32);
} }
void FastWrite64(u64 value) void FastWrite64(u64 value)
{ {
value = Common::swap64(value); value = Common::swap64(value);
std::memcpy(g_gather_pipe_ptr, &value, sizeof(u64)); std::memcpy(PowerPC::ppcState.gather_pipe_ptr, &value, sizeof(u64));
g_gather_pipe_ptr += sizeof(u64); PowerPC::ppcState.gather_pipe_ptr += sizeof(u64);
} }
} // end of namespace GPFifo } // end of namespace GPFifo

View File

@@ -15,15 +15,13 @@ enum
GATHER_PIPE_SIZE = 32 GATHER_PIPE_SIZE = 32
}; };
// pipe pointer for JIT access
extern u8* g_gather_pipe_ptr;
// Init // Init
void Init(); void Init();
void DoState(PointerWrap& p); void DoState(PointerWrap& p);
// ResetGatherPipe // ResetGatherPipe
void ResetGatherPipe(); void ResetGatherPipe();
void UpdateGatherPipe();
void CheckGatherPipe(); void CheckGatherPipe();
void FastCheckGatherPipe(); void FastCheckGatherPipe();

View File

@@ -355,9 +355,14 @@ bool Jit64::Cleanup()
if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
{ {
MOV(64, R(RSCRATCH), PPCSTATE(gather_pipe_ptr));
SUB(64, R(RSCRATCH), PPCSTATE(gather_pipe_base_ptr));
CMP(64, R(RSCRATCH), Imm32(GPFifo::GATHER_PIPE_SIZE));
FixupBranch exit = J_CC(CC_L);
ABI_PushRegistersAndAdjustStack({}, 0); ABI_PushRegistersAndAdjustStack({}, 0);
ABI_CallFunction(GPFifo::FastCheckGatherPipe); ABI_CallFunction(GPFifo::UpdateGatherPipe);
ABI_PopRegistersAndAdjustStack({}, 0); ABI_PopRegistersAndAdjustStack({}, 0);
SetJumpTarget(exit);
did_something = true; did_something = true;
} }

View File

@@ -222,14 +222,6 @@ void Jit64AsmRoutineManager::ResetStack(X64CodeBlock& emitter)
void Jit64AsmRoutineManager::GenerateCommon() void Jit64AsmRoutineManager::GenerateCommon()
{ {
fifoDirectWrite8 = AlignCode4();
GenFifoWrite(8);
fifoDirectWrite16 = AlignCode4();
GenFifoWrite(16);
fifoDirectWrite32 = AlignCode4();
GenFifoWrite(32);
fifoDirectWrite64 = AlignCode4();
GenFifoWrite(64);
frsqrte = AlignCode4(); frsqrte = AlignCode4();
GenFrsqrte(); GenFrsqrte();
fres = AlignCode4(); fres = AlignCode4();

View File

@@ -203,28 +203,6 @@ bool EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int access
return offsetAddedToAddress; return offsetAddedToAddress;
} }
void EmuCodeBlock::UnsafeWriteGatherPipe(int accessSize)
{
// No need to protect these, they don't touch any state
// question - should we inline them instead? Pro: Lose a CALL Con: Code bloat
switch (accessSize)
{
case 8:
CALL(g_jit->GetAsmRoutines()->fifoDirectWrite8);
break;
case 16:
CALL(g_jit->GetAsmRoutines()->fifoDirectWrite16);
break;
case 32:
CALL(g_jit->GetAsmRoutines()->fifoDirectWrite32);
break;
case 64:
CALL(g_jit->GetAsmRoutines()->fifoDirectWrite64);
break;
}
g_jit->js.fifoBytesSinceCheck += accessSize >> 3;
}
// Visitor that generates code to read a MMIO value. // Visitor that generates code to read a MMIO value.
template <typename T> template <typename T>
class MMIOReadCodeGenerator : public MMIO::ReadHandlingMethodVisitor<T> class MMIOReadCodeGenerator : public MMIO::ReadHandlingMethodVisitor<T>
@@ -622,10 +600,22 @@ bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address,
// fun tricks... // fun tricks...
if (g_jit->jo.optimizeGatherPipe && PowerPC::IsOptimizableGatherPipeWrite(address)) if (g_jit->jo.optimizeGatherPipe && PowerPC::IsOptimizableGatherPipeWrite(address))
{ {
if (!arg.IsSimpleReg(RSCRATCH)) X64Reg arg_reg = RSCRATCH;
MOV(accessSize, R(RSCRATCH), arg);
UnsafeWriteGatherPipe(accessSize); // With movbe, we can store inplace without temporary register
if (arg.IsSimpleReg() && cpu_info.bMOVBE)
arg_reg = arg.GetSimpleReg();
if (!arg.IsSimpleReg(arg_reg))
MOV(accessSize, R(arg_reg), arg);
// And store it in the gather pipe
MOV(64, R(RSCRATCH2), PPCSTATE(gather_pipe_ptr));
SwapAndStore(accessSize, MatR(RSCRATCH2), arg_reg);
ADD(64, R(RSCRATCH2), Imm8(accessSize >> 3));
MOV(64, PPCSTATE(gather_pipe_ptr), R(RSCRATCH2));
g_jit->js.fifoBytesSinceCheck += accessSize >> 3;
return false; return false;
} }
else if (PowerPC::IsOptimizableRAMAddress(address)) else if (PowerPC::IsOptimizableRAMAddress(address))

View File

@@ -61,7 +61,6 @@ public:
bool UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset, bool UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset,
bool signExtend, Gen::MovInfo* info = nullptr); bool signExtend, Gen::MovInfo* info = nullptr);
void UnsafeWriteGatherPipe(int accessSize);
// Generate a load/write from the MMIO handler for a given address. Only // Generate a load/write from the MMIO handler for a given address. Only
// call for known addresses in MMIO range (MMIO::IsMMIOAddress). // call for known addresses in MMIO range (MMIO::IsMMIOAddress).

View File

@@ -12,7 +12,6 @@
#include "Common/MathUtil.h" #include "Common/MathUtil.h"
#include "Common/x64ABI.h" #include "Common/x64ABI.h"
#include "Common/x64Emitter.h" #include "Common/x64Emitter.h"
#include "Core/HW/GPFifo.h"
#include "Core/PowerPC/Gekko.h" #include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/Jit64Common/Jit64Base.h" #include "Core/PowerPC/Jit64Common/Jit64Base.h"
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h" #include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
@@ -25,22 +24,6 @@
using namespace Gen; using namespace Gen;
void CommonAsmRoutines::GenFifoWrite(int size)
{
const void* start = GetCodePtr();
// Assume value in RSCRATCH
MOV(64, R(RSCRATCH2), ImmPtr(&GPFifo::g_gather_pipe_ptr));
MOV(64, R(RSCRATCH2), MatR(RSCRATCH2));
SwapAndStore(size, MatR(RSCRATCH2), RSCRATCH);
MOV(64, R(RSCRATCH), ImmPtr(&GPFifo::g_gather_pipe_ptr));
ADD(64, R(RSCRATCH2), Imm8(size >> 3));
MOV(64, MatR(RSCRATCH), R(RSCRATCH2));
RET();
JitRegister::Register(start, GetCodePtr(), "JIT_FifoWrite_%i", size);
}
void CommonAsmRoutines::GenFrsqrte() void CommonAsmRoutines::GenFrsqrte()
{ {
const void* start = GetCodePtr(); const void* start = GetCodePtr();

View File

@@ -24,7 +24,6 @@ private:
class CommonAsmRoutines : public CommonAsmRoutinesBase, public QuantizedMemoryRoutines class CommonAsmRoutines : public CommonAsmRoutinesBase, public QuantizedMemoryRoutines
{ {
public: public:
void GenFifoWrite(int size);
void GenFrsqrte(); void GenFrsqrte();
void GenFres(); void GenFres();
void GenMfcr(); void GenMfcr();

View File

@@ -231,8 +231,13 @@ void JitArm64::Cleanup()
{ {
if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0) if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
{ {
MOVP2R(X0, &GPFifo::FastCheckGatherPipe); LDP(INDEX_SIGNED, X0, X1, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
SUB(X0, X0, X1);
CMP(X0, GPFifo::GATHER_PIPE_SIZE);
FixupBranch exit = B(CC_LT);
MOVP2R(X0, &GPFifo::UpdateGatherPipe);
BLR(X0); BLR(X0);
SetJumpTarget(exit);
} }
} }

View File

@@ -10,7 +10,6 @@
#include "Core/Core.h" #include "Core/Core.h"
#include "Core/CoreTiming.h" #include "Core/CoreTiming.h"
#include "Core/HW/DSP.h" #include "Core/HW/DSP.h"
#include "Core/HW/GPFifo.h"
#include "Core/HW/MMIO.h" #include "Core/HW/MMIO.h"
#include "Core/HW/Memmap.h" #include "Core/HW/Memmap.h"
#include "Core/PowerPC/JitArm64/Jit.h" #include "Core/PowerPC/JitArm64/Jit.h"
@@ -230,7 +229,6 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
if (is_immediate && jo.optimizeGatherPipe && PowerPC::IsOptimizableGatherPipeWrite(imm_addr)) if (is_immediate && jo.optimizeGatherPipe && PowerPC::IsOptimizableGatherPipeWrite(imm_addr))
{ {
ARM64Reg WA = INVALID_REG;
int accessSize; int accessSize;
if (flags & BackPatchInfo::FLAG_SIZE_32) if (flags & BackPatchInfo::FLAG_SIZE_32)
accessSize = 32; accessSize = 32;
@@ -239,30 +237,23 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
else else
accessSize = 8; accessSize = 8;
if (accessSize != 8) LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
WA = gpr.GetReg();
MOVP2R(X1, &GPFifo::g_gather_pipe_ptr);
LDR(INDEX_UNSIGNED, X0, X1, 0);
if (accessSize == 32) if (accessSize == 32)
{ {
REV32(WA, RS); REV32(W1, RS);
STR(INDEX_POST, WA, X0, 4); STR(INDEX_POST, W1, X0, 4);
} }
else if (accessSize == 16) else if (accessSize == 16)
{ {
REV16(WA, RS); REV16(W1, RS);
STRH(INDEX_POST, WA, X0, 2); STRH(INDEX_POST, W1, X0, 2);
} }
else else
{ {
STRB(INDEX_POST, RS, X0, 1); STRB(INDEX_POST, RS, X0, 1);
} }
STR(INDEX_UNSIGNED, X0, X1, 0); STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
js.fifoBytesSinceCheck += accessSize >> 3; js.fifoBytesSinceCheck += accessSize >> 3;
if (accessSize != 8)
gpr.Unlock(WA);
} }
else if (is_immediate && PowerPC::IsOptimizableRAMAddress(imm_addr)) else if (is_immediate && PowerPC::IsOptimizableRAMAddress(imm_addr))
{ {

View File

@@ -10,7 +10,6 @@
#include "Core/Core.h" #include "Core/Core.h"
#include "Core/CoreTiming.h" #include "Core/CoreTiming.h"
#include "Core/HW/GPFifo.h"
#include "Core/PowerPC/JitArm64/Jit.h" #include "Core/PowerPC/JitArm64/Jit.h"
#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h" #include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
#include "Core/PowerPC/PPCTables.h" #include "Core/PowerPC/PPCTables.h"
@@ -357,8 +356,7 @@ void JitArm64::stfXX(UGeckoInstruction inst)
else else
accessSize = 32; accessSize = 32;
MOVP2R(X1, &GPFifo::g_gather_pipe_ptr); LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
LDR(INDEX_UNSIGNED, X0, X1, 0);
if (flags & BackPatchInfo::FLAG_SIZE_F64) if (flags & BackPatchInfo::FLAG_SIZE_F64)
{ {
m_float_emit.REV64(8, Q0, V0); m_float_emit.REV64(8, Q0, V0);
@@ -375,7 +373,7 @@ void JitArm64::stfXX(UGeckoInstruction inst)
m_float_emit.STR(accessSize, INDEX_POST, accessSize == 64 ? Q0 : D0, X0, accessSize >> 3); m_float_emit.STR(accessSize, INDEX_POST, accessSize == 64 ? Q0 : D0, X0, accessSize >> 3);
STR(INDEX_UNSIGNED, X0, X1, 0); STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
js.fifoBytesSinceCheck += accessSize >> 3; js.fifoBytesSinceCheck += accessSize >> 3;
if (update) if (update)

View File

@@ -15,11 +15,6 @@ alignas(16) extern const float m_dequantizeTableS[128];
class CommonAsmRoutinesBase class CommonAsmRoutinesBase
{ {
public: public:
const u8* fifoDirectWrite8;
const u8* fifoDirectWrite16;
const u8* fifoDirectWrite32;
const u8* fifoDirectWrite64;
const u8* enterCode; const u8* enterCode;
const u8* dispatcherMispredictedBLR; const u8* dispatcherMispredictedBLR;

View File

@@ -93,6 +93,10 @@ struct PowerPCState
// lscbx // lscbx
u16 xer_stringctrl; u16 xer_stringctrl;
// gather pipe pointer for JIT access
u8* gather_pipe_ptr;
u8* gather_pipe_base_ptr;
#if _M_X86_64 #if _M_X86_64
// This member exists for the purpose of an assertion in x86 JitBase.cpp // This member exists for the purpose of an assertion in x86 JitBase.cpp
// that its offset <= 0x100. To minimize code size on x86, we want as much // that its offset <= 0x100. To minimize code size on x86, we want as much