Merge pull request #6197 from degasus/GP

Jit: Optimize gather pipe usage.
This commit is contained in:
Markus Wick 2017-11-19 11:52:20 +01:00 committed by GitHub
commit 9178a6e636
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 53 additions and 95 deletions

View File

@ -13,6 +13,7 @@
#include "Core/HW/Memmap.h"
#include "Core/HW/ProcessorInterface.h"
#include "Core/PowerPC/JitInterface.h"
#include "Core/PowerPC/PowerPC.h"
#include "VideoCommon/CommandProcessor.h"
namespace GPFifo
@ -31,17 +32,14 @@ namespace GPFifo
// More room for the fastmodes
alignas(32) static u8 s_gather_pipe[GATHER_PIPE_SIZE * 16];
// pipe pointer
u8* g_gather_pipe_ptr = s_gather_pipe;
// Returns the number of bytes currently queued in the gather pipe, computed
// as the distance between the write pointer and the start of s_gather_pipe.
// NOTE(review): two return statements appear back to back, so the second is
// unreachable as written. This listing looks like a diff whose +/- markers
// were stripped; presumably the first return is the removed line and the
// second (reading PowerPC::ppcState.gather_pipe_ptr) its replacement - confirm.
static size_t GetGatherPipeCount()
{
return g_gather_pipe_ptr - s_gather_pipe;
return PowerPC::ppcState.gather_pipe_ptr - s_gather_pipe;
}
// Sets the gather pipe fill level by pointing the write pointer `size` bytes
// past the start of s_gather_pipe.
// NOTE(review): both the file-level g_gather_pipe_ptr and
// PowerPC::ppcState.gather_pipe_ptr are assigned. Presumably the first
// assignment is the pre-change line of a marker-less diff and the second its
// replacement - confirm which pointer is the live one.
static void SetGatherPipeCount(size_t size)
{
g_gather_pipe_ptr = s_gather_pipe + size;
PowerPC::ppcState.gather_pipe_ptr = s_gather_pipe + size;
}
void DoState(PointerWrap& p)
@ -55,6 +53,7 @@ void DoState(PointerWrap& p)
// Initializes the gather pipe: resets the fill level, records the start of
// the buffer in PowerPC::ppcState and zero-fills the whole buffer.
void Init()
{
ResetGatherPipe();
// Publish the buffer start so code that only has access to ppcState can
// derive the fill level as gather_pipe_ptr - gather_pipe_base_ptr.
PowerPC::ppcState.gather_pipe_base_ptr = s_gather_pipe;
memset(s_gather_pipe, 0, sizeof(s_gather_pipe));
}
@ -68,7 +67,7 @@ void ResetGatherPipe()
SetGatherPipeCount(0);
}
static void UpdateGatherPipe()
void UpdateGatherPipe()
{
size_t pipe_count = GetGatherPipeCount();
size_t processed;
@ -144,29 +143,29 @@ void Write64(const u64 value)
// Appends one byte to the gather pipe at the current write pointer and
// advances that pointer by one byte. No fill-level check or flush happens
// in this function.
// NOTE(review): the store/advance pair appears twice, once through
// g_gather_pipe_ptr and once through PowerPC::ppcState.gather_pipe_ptr.
// Presumably these are the removed and added lines of a diff whose +/-
// markers were stripped, not an intentional double write - confirm.
void FastWrite8(const u8 value)
{
*g_gather_pipe_ptr = value;
g_gather_pipe_ptr += sizeof(u8);
*PowerPC::ppcState.gather_pipe_ptr = value;
PowerPC::ppcState.gather_pipe_ptr += sizeof(u8);
}
// Appends a 16-bit value to the gather pipe: the value is byte-swapped with
// Common::swap16, copied to the current write pointer and the pointer is
// advanced by two bytes. No fill-level check or flush happens here.
// NOTE(review): the copy/advance pair appears twice (g_gather_pipe_ptr, then
// PowerPC::ppcState.gather_pipe_ptr). Presumably old and new lines of a
// marker-less diff rather than an intentional double write - confirm.
void FastWrite16(u16 value)
{
value = Common::swap16(value);
std::memcpy(g_gather_pipe_ptr, &value, sizeof(u16));
g_gather_pipe_ptr += sizeof(u16);
std::memcpy(PowerPC::ppcState.gather_pipe_ptr, &value, sizeof(u16));
PowerPC::ppcState.gather_pipe_ptr += sizeof(u16);
}
// Appends a 32-bit value to the gather pipe: the value is byte-swapped with
// Common::swap32, copied to the current write pointer and the pointer is
// advanced by four bytes. No fill-level check or flush happens here.
// NOTE(review): the copy/advance pair appears twice (g_gather_pipe_ptr, then
// PowerPC::ppcState.gather_pipe_ptr). Presumably old and new lines of a
// marker-less diff rather than an intentional double write - confirm.
void FastWrite32(u32 value)
{
value = Common::swap32(value);
std::memcpy(g_gather_pipe_ptr, &value, sizeof(u32));
g_gather_pipe_ptr += sizeof(u32);
std::memcpy(PowerPC::ppcState.gather_pipe_ptr, &value, sizeof(u32));
PowerPC::ppcState.gather_pipe_ptr += sizeof(u32);
}
// Appends a 64-bit value to the gather pipe: the value is byte-swapped with
// Common::swap64, copied to the current write pointer and the pointer is
// advanced by eight bytes. No fill-level check or flush happens here.
// NOTE(review): the copy/advance pair appears twice (g_gather_pipe_ptr, then
// PowerPC::ppcState.gather_pipe_ptr). Presumably old and new lines of a
// marker-less diff rather than an intentional double write - confirm.
void FastWrite64(u64 value)
{
value = Common::swap64(value);
std::memcpy(g_gather_pipe_ptr, &value, sizeof(u64));
g_gather_pipe_ptr += sizeof(u64);
std::memcpy(PowerPC::ppcState.gather_pipe_ptr, &value, sizeof(u64));
PowerPC::ppcState.gather_pipe_ptr += sizeof(u64);
}
} // end of namespace GPFifo

View File

@ -15,15 +15,13 @@ enum
GATHER_PIPE_SIZE = 32
};
// pipe pointer for JIT access
extern u8* g_gather_pipe_ptr;
// Init
void Init();
void DoState(PointerWrap& p);
// ResetGatherPipe
void ResetGatherPipe();
void UpdateGatherPipe();
void CheckGatherPipe();
void FastCheckGatherPipe();

View File

@ -355,9 +355,14 @@ bool Jit64::Cleanup()
// Only emit the gather pipe check when the optimization is enabled and this
// block has emitted at least one direct gather pipe write.
if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
{
// Emitted code: RSCRATCH = gather_pipe_ptr - gather_pipe_base_ptr, i.e. the
// number of bytes currently in the pipe.
MOV(64, R(RSCRATCH), PPCSTATE(gather_pipe_ptr));
SUB(64, R(RSCRATCH), PPCSTATE(gather_pipe_base_ptr));
// Skip the call below while fewer than GATHER_PIPE_SIZE bytes are queued.
CMP(64, R(RSCRATCH), Imm32(GPFifo::GATHER_PIPE_SIZE));
FixupBranch exit = J_CC(CC_L);
ABI_PushRegistersAndAdjustStack({}, 0);
// NOTE(review): two calls are listed back to back. Presumably
// FastCheckGatherPipe is the removed line of a marker-less diff and
// UpdateGatherPipe its replacement - confirm only one is intended.
ABI_CallFunction(GPFifo::FastCheckGatherPipe);
ABI_CallFunction(GPFifo::UpdateGatherPipe);
ABI_PopRegistersAndAdjustStack({}, 0);
SetJumpTarget(exit);
did_something = true;
}

View File

@ -222,14 +222,6 @@ void Jit64AsmRoutineManager::ResetStack(X64CodeBlock& emitter)
void Jit64AsmRoutineManager::GenerateCommon()
{
fifoDirectWrite8 = AlignCode4();
GenFifoWrite(8);
fifoDirectWrite16 = AlignCode4();
GenFifoWrite(16);
fifoDirectWrite32 = AlignCode4();
GenFifoWrite(32);
fifoDirectWrite64 = AlignCode4();
GenFifoWrite(64);
frsqrte = AlignCode4();
GenFrsqrte();
fres = AlignCode4();

View File

@ -203,28 +203,6 @@ bool EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int access
return offsetAddedToAddress;
}
// Emits a CALL to the shared fifoDirectWrite routine matching `accessSize`
// (given in bits: 8, 16, 32 or 64) and adds the corresponding byte count to
// js.fifoBytesSinceCheck.
// Any other accessSize emits no call, but the counter is still increased.
// NOTE(review): the hunk header above this function (-203,28 +203,6)
// suggests the whole function is removed by this commit - confirm before
// adding new callers.
void EmuCodeBlock::UnsafeWriteGatherPipe(int accessSize)
{
// No need to protect these, they don't touch any state
// question - should we inline them instead? Pro: Lose a CALL Con: Code bloat
switch (accessSize)
{
case 8:
CALL(g_jit->GetAsmRoutines()->fifoDirectWrite8);
break;
case 16:
CALL(g_jit->GetAsmRoutines()->fifoDirectWrite16);
break;
case 32:
CALL(g_jit->GetAsmRoutines()->fifoDirectWrite32);
break;
case 64:
CALL(g_jit->GetAsmRoutines()->fifoDirectWrite64);
break;
}
// accessSize is in bits; the counter is kept in bytes.
g_jit->js.fifoBytesSinceCheck += accessSize >> 3;
}
// Visitor that generates code to read a MMIO value.
template <typename T>
class MMIOReadCodeGenerator : public MMIO::ReadHandlingMethodVisitor<T>
@ -622,10 +600,22 @@ bool EmuCodeBlock::WriteToConstAddress(int accessSize, OpArg arg, u32 address,
// fun tricks...
// Fast path for a store to a constant address that qualifies as an
// optimizable gather pipe write: the value is written straight into the
// gather pipe buffer and the pipe pointer in ppcState is advanced.
// NOTE(review): this listing interleaves what look like the removed lines
// (move into RSCRATCH followed by UnsafeWriteGatherPipe) with the added
// inline sequence; as written the value would be emitted twice. Presumably
// only the inline sequence is live after the change - confirm.
if (g_jit->jo.optimizeGatherPipe && PowerPC::IsOptimizableGatherPipeWrite(address))
{
if (!arg.IsSimpleReg(RSCRATCH))
MOV(accessSize, R(RSCRATCH), arg);
X64Reg arg_reg = RSCRATCH;
UnsafeWriteGatherPipe(accessSize);
// With movbe, we can store inplace without temporary register
if (arg.IsSimpleReg() && cpu_info.bMOVBE)
arg_reg = arg.GetSimpleReg();
// Otherwise bring the value into arg_reg (RSCRATCH unless changed above).
if (!arg.IsSimpleReg(arg_reg))
MOV(accessSize, R(arg_reg), arg);
// And store it in the gather pipe
MOV(64, R(RSCRATCH2), PPCSTATE(gather_pipe_ptr));
SwapAndStore(accessSize, MatR(RSCRATCH2), arg_reg);
// Advance the pipe pointer by the access width in bytes and write it back.
ADD(64, R(RSCRATCH2), Imm8(accessSize >> 3));
MOV(64, PPCSTATE(gather_pipe_ptr), R(RSCRATCH2));
// Track bytes written so Cleanup() knows a fill-level check must be emitted.
g_jit->js.fifoBytesSinceCheck += accessSize >> 3;
return false;
}
else if (PowerPC::IsOptimizableRAMAddress(address))

View File

@ -61,7 +61,6 @@ public:
bool UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset,
bool signExtend, Gen::MovInfo* info = nullptr);
void UnsafeWriteGatherPipe(int accessSize);
// Generate a load/write from the MMIO handler for a given address. Only
// call for known addresses in MMIO range (MMIO::IsMMIOAddress).

View File

@ -12,7 +12,6 @@
#include "Common/MathUtil.h"
#include "Common/x64ABI.h"
#include "Common/x64Emitter.h"
#include "Core/HW/GPFifo.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/Jit64Common/Jit64Base.h"
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
@ -25,22 +24,6 @@
using namespace Gen;
// Emits a callable routine that writes the value held in RSCRATCH to the
// gather pipe and advances GPFifo::g_gather_pipe_ptr by `size` / 8 bytes.
// `size` is the access width in bits. The routine ends in RET and is
// registered with JitRegister under the name "JIT_FifoWrite_<size>".
// The emitted code overwrites both RSCRATCH and RSCRATCH2.
// NOTE(review): the hunk header above (-25,22 +24,6) suggests this function
// is removed by the commit - confirm before relying on it.
void CommonAsmRoutines::GenFifoWrite(int size)
{
const void* start = GetCodePtr();
// Assume value in RSCRATCH
// Load the current pipe write pointer through the address of the global.
MOV(64, R(RSCRATCH2), ImmPtr(&GPFifo::g_gather_pipe_ptr));
MOV(64, R(RSCRATCH2), MatR(RSCRATCH2));
SwapAndStore(size, MatR(RSCRATCH2), RSCRATCH);
// RSCRATCH is free after the store; reuse it for the global's address so
// the advanced pointer can be written back.
MOV(64, R(RSCRATCH), ImmPtr(&GPFifo::g_gather_pipe_ptr));
ADD(64, R(RSCRATCH2), Imm8(size >> 3));
MOV(64, MatR(RSCRATCH), R(RSCRATCH2));
RET();
JitRegister::Register(start, GetCodePtr(), "JIT_FifoWrite_%i", size);
}
void CommonAsmRoutines::GenFrsqrte()
{
const void* start = GetCodePtr();

View File

@ -24,7 +24,6 @@ private:
class CommonAsmRoutines : public CommonAsmRoutinesBase, public QuantizedMemoryRoutines
{
public:
void GenFifoWrite(int size);
void GenFrsqrte();
void GenFres();
void GenMfcr();

View File

@ -231,8 +231,13 @@ void JitArm64::Cleanup()
{
// Only emit the gather pipe check when the optimization is enabled and this
// block has emitted at least one direct gather pipe write.
if (jo.optimizeGatherPipe && js.fifoBytesSinceCheck > 0)
{
// NOTE(review): X0 is loaded with FastCheckGatherPipe and then immediately
// overwritten by the LDP below. Presumably this is the removed line of a
// diff whose +/- markers were stripped - confirm.
MOVP2R(X0, &GPFifo::FastCheckGatherPipe);
// Load two consecutive 64-bit values starting at gather_pipe_ptr into X0 and
// X1. This relies on gather_pipe_base_ptr being declared directly after
// gather_pipe_ptr in PowerPCState.
LDP(INDEX_SIGNED, X0, X1, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
// X0 = bytes currently in the pipe; skip the call while that is below
// GATHER_PIPE_SIZE.
SUB(X0, X0, X1);
CMP(X0, GPFifo::GATHER_PIPE_SIZE);
FixupBranch exit = B(CC_LT);
MOVP2R(X0, &GPFifo::UpdateGatherPipe);
BLR(X0);
SetJumpTarget(exit);
}
}

View File

@ -10,7 +10,6 @@
#include "Core/Core.h"
#include "Core/CoreTiming.h"
#include "Core/HW/DSP.h"
#include "Core/HW/GPFifo.h"
#include "Core/HW/MMIO.h"
#include "Core/HW/Memmap.h"
#include "Core/PowerPC/JitArm64/Jit.h"
@ -230,7 +229,6 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
if (is_immediate && jo.optimizeGatherPipe && PowerPC::IsOptimizableGatherPipeWrite(imm_addr))
{
ARM64Reg WA = INVALID_REG;
int accessSize;
if (flags & BackPatchInfo::FLAG_SIZE_32)
accessSize = 32;
@ -239,30 +237,23 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
else
accessSize = 8;
if (accessSize != 8)
WA = gpr.GetReg();
MOVP2R(X1, &GPFifo::g_gather_pipe_ptr);
LDR(INDEX_UNSIGNED, X0, X1, 0);
LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
if (accessSize == 32)
{
REV32(WA, RS);
STR(INDEX_POST, WA, X0, 4);
REV32(W1, RS);
STR(INDEX_POST, W1, X0, 4);
}
else if (accessSize == 16)
{
REV16(WA, RS);
STRH(INDEX_POST, WA, X0, 2);
REV16(W1, RS);
STRH(INDEX_POST, W1, X0, 2);
}
else
{
STRB(INDEX_POST, RS, X0, 1);
}
STR(INDEX_UNSIGNED, X0, X1, 0);
STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
js.fifoBytesSinceCheck += accessSize >> 3;
if (accessSize != 8)
gpr.Unlock(WA);
}
else if (is_immediate && PowerPC::IsOptimizableRAMAddress(imm_addr))
{

View File

@ -10,7 +10,6 @@
#include "Core/Core.h"
#include "Core/CoreTiming.h"
#include "Core/HW/GPFifo.h"
#include "Core/PowerPC/JitArm64/Jit.h"
#include "Core/PowerPC/JitArm64/JitArm64_RegCache.h"
#include "Core/PowerPC/PPCTables.h"
@ -357,8 +356,7 @@ void JitArm64::stfXX(UGeckoInstruction inst)
else
accessSize = 32;
MOVP2R(X1, &GPFifo::g_gather_pipe_ptr);
LDR(INDEX_UNSIGNED, X0, X1, 0);
LDR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
if (flags & BackPatchInfo::FLAG_SIZE_F64)
{
m_float_emit.REV64(8, Q0, V0);
@ -375,7 +373,7 @@ void JitArm64::stfXX(UGeckoInstruction inst)
m_float_emit.STR(accessSize, INDEX_POST, accessSize == 64 ? Q0 : D0, X0, accessSize >> 3);
STR(INDEX_UNSIGNED, X0, X1, 0);
STR(INDEX_UNSIGNED, X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
js.fifoBytesSinceCheck += accessSize >> 3;
if (update)

View File

@ -15,11 +15,6 @@ alignas(16) extern const float m_dequantizeTableS[128];
class CommonAsmRoutinesBase
{
public:
const u8* fifoDirectWrite8;
const u8* fifoDirectWrite16;
const u8* fifoDirectWrite32;
const u8* fifoDirectWrite64;
const u8* enterCode;
const u8* dispatcherMispredictedBLR;

View File

@ -93,6 +93,10 @@ struct PowerPCState
// lscbx
u16 xer_stringctrl;
// gather pipe pointer for JIT access
u8* gather_pipe_ptr;
u8* gather_pipe_base_ptr;
#if _M_X86_64
// This member exists for the purpose of an assertion in x86 JitBase.cpp
// that its offset <= 0x100. To minimize code size on x86, we want as much