Merge pull request #12155 from krnlyng/jit_block_map_msr_fast

JitArm64/Jit64: Extend the fast lookup mmap-ed segment further to avoid needing to check the msr bits.
This commit is contained in:
JosJuice 2023-09-17 15:13:53 +02:00 committed by GitHub
commit 9862ba4548
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 149 additions and 93 deletions

View File

@@ -113,15 +113,22 @@ void Jit64AsmRoutineManager::Generate()
const bool assembly_dispatcher = true; const bool assembly_dispatcher = true;
if (assembly_dispatcher) if (assembly_dispatcher)
{ {
if (m_jit.GetBlockCache()->GetFastBlockMap()) if (m_jit.GetBlockCache()->GetEntryPoints())
{ {
u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetFastBlockMap()); MOV(32, R(RSCRATCH2), PPCSTATE(msr));
MOV(32, R(RSCRATCH), PPCSTATE(pc)); AND(32, R(RSCRATCH2), Imm32(JitBaseBlockCache::JIT_CACHE_MSR_MASK));
SHL(64, R(RSCRATCH2), Imm8(28));
MOV(32, R(RSCRATCH_EXTRA), PPCSTATE(pc));
OR(64, R(RSCRATCH_EXTRA), R(RSCRATCH2));
u64 icache = reinterpret_cast<u64>(m_jit.GetBlockCache()->GetEntryPoints());
MOV(64, R(RSCRATCH2), Imm64(icache)); MOV(64, R(RSCRATCH2), Imm64(icache));
// Each 4-byte offset of the PC register corresponds to a 8-byte offset // The entry points map is indexed by ((msrBits << 26) | (address >> 2)).
// in the lookup table due to host pointers being 8-bytes long. // The map contains 8 byte 64-bit pointers and that means we need to shift
MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH, SCALE_2, 0)); // msr left by 29 bits and address left by 1 bit to get the correct offset
// in the map.
MOV(64, R(RSCRATCH), MComplex(RSCRATCH2, RSCRATCH_EXTRA, SCALE_2, 0));
} }
else else
{ {
@@ -146,49 +153,57 @@ void Jit64AsmRoutineManager::Generate()
// Check if we found a block. // Check if we found a block.
TEST(64, R(RSCRATCH), R(RSCRATCH)); TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch not_found = J_CC(CC_Z); FixupBranch not_found = J_CC(CC_Z);
FixupBranch state_mismatch;
// Check block.msrBits. if (!m_jit.GetBlockCache()->GetEntryPoints())
MOV(32, R(RSCRATCH2), PPCSTATE(msr));
AND(32, R(RSCRATCH2), Imm32(JitBaseBlockCache::JIT_CACHE_MSR_MASK));
if (m_jit.GetBlockCache()->GetFastBlockMap())
{
CMP(32, R(RSCRATCH2), MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, msrBits))));
}
else
{ {
// Check block.msrBits.
MOV(32, R(RSCRATCH2), PPCSTATE(msr));
AND(32, R(RSCRATCH2), Imm32(JitBaseBlockCache::JIT_CACHE_MSR_MASK));
// Also check the block.effectiveAddress // Also check the block.effectiveAddress
SHL(64, R(RSCRATCH2), Imm8(32)); SHL(64, R(RSCRATCH2), Imm8(32));
// RSCRATCH_EXTRA still has the PC. // RSCRATCH_EXTRA still has the PC.
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA));
CMP(64, R(RSCRATCH2), CMP(64, R(RSCRATCH2),
MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress)))); MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, effectiveAddress))));
state_mismatch = J_CC(CC_NE);
// Success; branch to the block we found.
JMPptr(MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, normalEntry))));
}
else
{
// Success; branch to the block we found.
JMPptr(R(RSCRATCH));
} }
FixupBranch state_mismatch = J_CC(CC_NE);
// Success; branch to the block we found.
JMPptr(MDisp(RSCRATCH, static_cast<s32>(offsetof(JitBlockData, normalEntry))));
SetJumpTarget(not_found); SetJumpTarget(not_found);
SetJumpTarget(state_mismatch); if (!m_jit.GetBlockCache()->GetEntryPoints())
{
SetJumpTarget(state_mismatch);
}
// Failure, fallback to the C++ dispatcher for calling the JIT. // Failure, fallback to the C++ dispatcher for calling the JIT.
} }
// Ok, no block, let's call the slow dispatcher // There is no point in calling the dispatcher in the fast lookup table
ABI_PushRegistersAndAdjustStack({}, 0); // case, since the assembly dispatcher would already have found a block.
MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(&m_jit))); if (!assembly_dispatcher || !m_jit.GetBlockCache()->GetEntryPoints())
ABI_CallFunction(JitBase::Dispatch); {
ABI_PopRegistersAndAdjustStack({}, 0); // Ok, no block, let's call the slow dispatcher
ABI_PushRegistersAndAdjustStack({}, 0);
MOV(64, R(ABI_PARAM1), Imm64(reinterpret_cast<u64>(&m_jit)));
ABI_CallFunction(JitBase::Dispatch);
ABI_PopRegistersAndAdjustStack({}, 0);
TEST(64, R(ABI_RETURN), R(ABI_RETURN)); TEST(64, R(ABI_RETURN), R(ABI_RETURN));
FixupBranch no_block_available = J_CC(CC_Z); FixupBranch no_block_available = J_CC(CC_Z);
// Jump to the block // Jump to the block
JMPptr(R(ABI_RETURN)); JMPptr(R(ABI_RETURN));
SetJumpTarget(no_block_available); SetJumpTarget(no_block_available);
}
// We reset the stack because Jit might clear the code cache. // We reset the stack because Jit might clear the code cache.
// Also if we are in the middle of disabling BLR optimization on windows // Also if we are in the middle of disabling BLR optimization on windows

View File

@@ -97,32 +97,21 @@ void JitArm64::GenerateAsm()
if (assembly_dispatcher) if (assembly_dispatcher)
{ {
if (GetBlockCache()->GetFastBlockMap()) if (GetBlockCache()->GetEntryPoints())
{ {
// Check if there is a block // Check if there is a block
ARM64Reg pc_masked = ARM64Reg::X25; ARM64Reg pc_and_msr = ARM64Reg::X25;
ARM64Reg cache_base = ARM64Reg::X24; ARM64Reg cache_base = ARM64Reg::X27;
ARM64Reg block = ARM64Reg::X30; ARM64Reg block = ARM64Reg::X30;
LSL(pc_masked, DISPATCHER_PC, 1); LDR(IndexType::Unsigned, EncodeRegTo32(pc_and_msr), PPC_REG, PPCSTATE_OFF(msr));
MOVP2R(cache_base, GetBlockCache()->GetFastBlockMap()); MOVP2R(cache_base, GetBlockCache()->GetEntryPoints());
LDR(block, cache_base, pc_masked); // The entry points map is indexed by ((msrBits << 26) | (address >> 2)).
UBFIZ(pc_and_msr, pc_and_msr, 26, 6);
BFXIL(pc_and_msr, EncodeRegTo64(DISPATCHER_PC), 2, 30);
LDR(block, cache_base, ArithOption(pc_and_msr, true));
FixupBranch not_found = CBZ(block); FixupBranch not_found = CBZ(block);
// b.msrBits != msr
ARM64Reg msr = ARM64Reg::W27;
ARM64Reg msr2 = ARM64Reg::W24;
LDR(IndexType::Unsigned, msr, PPC_REG, PPCSTATE_OFF(msr));
AND(msr, msr, LogicalImm(JitBaseBlockCache::JIT_CACHE_MSR_MASK, 32));
LDR(IndexType::Unsigned, msr2, block, offsetof(JitBlockData, msrBits));
CMP(msr, msr2);
FixupBranch msr_missmatch = B(CC_NEQ);
// return blocks[block_num].normalEntry;
LDR(IndexType::Unsigned, block, block, offsetof(JitBlockData, normalEntry));
BR(block); BR(block);
SetJumpTarget(not_found); SetJumpTarget(not_found);
SetJumpTarget(msr_missmatch);
} }
else else
{ {
@@ -160,18 +149,25 @@ void JitArm64::GenerateAsm()
} }
} }
// Call C version of Dispatch().
STR(IndexType::Unsigned, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(pc)); STR(IndexType::Unsigned, DISPATCHER_PC, PPC_REG, PPCSTATE_OFF(pc));
MOVP2R(ARM64Reg::X8, reinterpret_cast<void*>(&JitBase::Dispatch));
MOVP2R(ARM64Reg::X0, this);
BLR(ARM64Reg::X8);
FixupBranch no_block_available = CBZ(ARM64Reg::X0); // There is no point in calling the dispatcher in the fast lookup table
// case, since the assembly dispatcher would already have found a block.
if (!assembly_dispatcher || !GetBlockCache()->GetEntryPoints())
{
// Call C version of Dispatch().
MOVP2R(ARM64Reg::X8, reinterpret_cast<void*>(&JitBase::Dispatch));
MOVP2R(ARM64Reg::X0, this);
BLR(ARM64Reg::X8);
BR(ARM64Reg::X0); FixupBranch no_block_available = CBZ(ARM64Reg::X0);
BR(ARM64Reg::X0);
SetJumpTarget(no_block_available);
}
// Call JIT // Call JIT
SetJumpTarget(no_block_available);
ResetStack(); ResetStack();
MOVP2R(ARM64Reg::X0, this); MOVP2R(ARM64Reg::X0, this);
MOV(ARM64Reg::W1, DISPATCHER_PC); MOV(ARM64Reg::W1, DISPATCHER_PC);

View File

@@ -43,14 +43,10 @@ void JitBaseBlockCache::Init()
Common::JitRegister::Init(Config::Get(Config::MAIN_PERF_MAP_DIR)); Common::JitRegister::Init(Config::Get(Config::MAIN_PERF_MAP_DIR));
#ifdef _ARCH_64 #ifdef _ARCH_64
m_fast_block_map = reinterpret_cast<JitBlock**>(m_block_map_arena.Create(FAST_BLOCK_MAP_SIZE)); m_entry_points_ptr = reinterpret_cast<u8**>(m_entry_points_arena.Create(FAST_BLOCK_MAP_SIZE));
#else #else
m_fast_block_map = nullptr; m_entry_points_ptr = nullptr;
#endif #endif
if (m_fast_block_map)
m_fast_block_map_ptr = m_fast_block_map;
else
m_fast_block_map_ptr = m_fast_block_map_fallback.data();
Clear(); Clear();
} }
@@ -59,7 +55,7 @@ void JitBaseBlockCache::Shutdown()
{ {
Common::JitRegister::Shutdown(); Common::JitRegister::Shutdown();
m_block_map_arena.Release(); m_entry_points_arena.Release();
} }
// This clears the JIT cache. It's called from JitCache.cpp when the JIT cache // This clears the JIT cache. It's called from JitCache.cpp when the JIT cache
@@ -82,8 +78,8 @@ void JitBaseBlockCache::Clear()
valid_block.ClearAll(); valid_block.ClearAll();
if (m_fast_block_map) if (m_entry_points_ptr)
m_block_map_arena.Clear(); m_entry_points_arena.Clear();
} }
void JitBaseBlockCache::Reset() void JitBaseBlockCache::Reset()
@@ -92,9 +88,9 @@ void JitBaseBlockCache::Reset()
Init(); Init();
} }
JitBlock** JitBaseBlockCache::GetFastBlockMap() u8** JitBaseBlockCache::GetEntryPoints()
{ {
return m_fast_block_map; return m_entry_points_ptr;
} }
JitBlock** JitBaseBlockCache::GetFastBlockMapFallback() JitBlock** JitBaseBlockCache::GetFastBlockMapFallback()
@@ -123,8 +119,11 @@ JitBlock* JitBaseBlockCache::AllocateBlock(u32 em_address)
void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link, void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link,
const std::set<u32>& physical_addresses) const std::set<u32>& physical_addresses)
{ {
size_t index = FastLookupIndexForAddress(block.effectiveAddress); size_t index = FastLookupIndexForAddress(block.effectiveAddress, block.msrBits);
m_fast_block_map_ptr[index] = &block; if (m_entry_points_ptr)
m_entry_points_ptr[index] = block.normalEntry;
else
m_fast_block_map_fallback[index] = &block;
block.fast_block_map_index = index; block.fast_block_map_index = index;
block.physical_addresses = physical_addresses; block.physical_addresses = physical_addresses;
@@ -187,7 +186,28 @@ JitBlock* JitBaseBlockCache::GetBlockFromStartAddress(u32 addr, u32 msr)
const u8* JitBaseBlockCache::Dispatch() const u8* JitBaseBlockCache::Dispatch()
{ {
const auto& ppc_state = m_jit.m_ppc_state; const auto& ppc_state = m_jit.m_ppc_state;
JitBlock* block = m_fast_block_map_ptr[FastLookupIndexForAddress(ppc_state.pc)]; if (m_entry_points_ptr)
{
u8* entry_point =
m_entry_points_ptr[FastLookupIndexForAddress(ppc_state.pc, ppc_state.msr.Hex)];
if (entry_point)
{
return entry_point;
}
else
{
JitBlock* block =
MoveBlockIntoFastCache(ppc_state.pc, ppc_state.msr.Hex & JIT_CACHE_MSR_MASK);
if (!block)
return nullptr;
return block->normalEntry;
}
}
JitBlock* block =
m_fast_block_map_fallback[FastLookupIndexForAddress(ppc_state.pc, ppc_state.msr.Hex)];
if (!block || block->effectiveAddress != ppc_state.pc || if (!block || block->effectiveAddress != ppc_state.pc ||
block->msrBits != (ppc_state.msr.Hex & JIT_CACHE_MSR_MASK)) block->msrBits != (ppc_state.msr.Hex & JIT_CACHE_MSR_MASK))
@@ -408,8 +428,20 @@ void JitBaseBlockCache::UnlinkBlock(const JitBlock& block)
void JitBaseBlockCache::DestroyBlock(JitBlock& block) void JitBaseBlockCache::DestroyBlock(JitBlock& block)
{ {
if (m_fast_block_map_ptr[block.fast_block_map_index] == &block) if (m_entry_points_ptr)
m_fast_block_map_ptr[block.fast_block_map_index] = nullptr; {
if (m_entry_points_ptr[block.fast_block_map_index] == block.normalEntry)
{
m_entry_points_ptr[block.fast_block_map_index] = nullptr;
}
}
else
{
if (m_fast_block_map_fallback[block.fast_block_map_index] == &block)
{
m_fast_block_map_fallback[block.fast_block_map_index] = nullptr;
}
}
UnlinkBlock(block); UnlinkBlock(block);
@@ -436,22 +468,37 @@ JitBlock* JitBaseBlockCache::MoveBlockIntoFastCache(u32 addr, u32 msr)
return nullptr; return nullptr;
// Drop old fast block map entry // Drop old fast block map entry
if (m_fast_block_map_ptr[block->fast_block_map_index] == block) if (m_entry_points_ptr)
m_fast_block_map_ptr[block->fast_block_map_index] = nullptr; {
if (m_entry_points_ptr[block->fast_block_map_index] == block->normalEntry)
{
m_entry_points_ptr[block->fast_block_map_index] = nullptr;
}
}
else
{
if (m_fast_block_map_fallback[block->fast_block_map_index] == block)
{
m_fast_block_map_fallback[block->fast_block_map_index] = nullptr;
}
}
// And create a new one // And create a new one
size_t index = FastLookupIndexForAddress(addr); size_t index = FastLookupIndexForAddress(addr, msr);
m_fast_block_map_ptr[index] = block; if (m_entry_points_ptr)
m_entry_points_ptr[index] = block->normalEntry;
else
m_fast_block_map_fallback[index] = block;
block->fast_block_map_index = index; block->fast_block_map_index = index;
return block; return block;
} }
size_t JitBaseBlockCache::FastLookupIndexForAddress(u32 address) size_t JitBaseBlockCache::FastLookupIndexForAddress(u32 address, u32 msr)
{ {
if (m_fast_block_map) if (m_entry_points_ptr)
{ {
return address >> 2; return ((msr & JIT_CACHE_MSR_MASK) << 26) | (address >> 2);
} }
else else
{ {

View File

@@ -133,8 +133,8 @@ public:
static constexpr u32 JIT_CACHE_MSR_MASK = 0x30; static constexpr u32 JIT_CACHE_MSR_MASK = 0x30;
// The value for the map is determined like this: // The value for the map is determined like this:
// ((4 GB guest memory space) / (4 bytes per address)) * sizeof(JitBlock*) // ((4 GB guest memory space) / (4 bytes per address) * sizeof(JitBlock*)) * (4 for 2 bits of msr)
static constexpr u64 FAST_BLOCK_MAP_SIZE = 0x2'0000'0000; static constexpr u64 FAST_BLOCK_MAP_SIZE = 0x8'0000'0000;
static constexpr u32 FAST_BLOCK_MAP_FALLBACK_ELEMENTS = 0x10000; static constexpr u32 FAST_BLOCK_MAP_FALLBACK_ELEMENTS = 0x10000;
static constexpr u32 FAST_BLOCK_MAP_FALLBACK_MASK = FAST_BLOCK_MAP_FALLBACK_ELEMENTS - 1; static constexpr u32 FAST_BLOCK_MAP_FALLBACK_MASK = FAST_BLOCK_MAP_FALLBACK_ELEMENTS - 1;
@@ -147,7 +147,7 @@ public:
void Reset(); void Reset();
// Code Cache // Code Cache
JitBlock** GetFastBlockMap(); u8** GetEntryPoints();
JitBlock** GetFastBlockMapFallback(); JitBlock** GetFastBlockMapFallback();
void RunOnBlocks(std::function<void(const JitBlock&)> f); void RunOnBlocks(std::function<void(const JitBlock&)> f);
@@ -188,7 +188,7 @@ private:
JitBlock* MoveBlockIntoFastCache(u32 em_address, u32 msr); JitBlock* MoveBlockIntoFastCache(u32 em_address, u32 msr);
// Fast but risky block lookup based on fast_block_map. // Fast but risky block lookup based on fast_block_map.
size_t FastLookupIndexForAddress(u32 address); size_t FastLookupIndexForAddress(u32 address, u32 msr);
// links_to hold all exit points of all valid blocks in a reverse way. // links_to hold all exit points of all valid blocks in a reverse way.
// It is used to query all blocks which links to an address. // It is used to query all blocks which links to an address.
@@ -208,16 +208,14 @@ private:
// It is used to provide a fast way to query if no icache invalidation is needed. // It is used to provide a fast way to query if no icache invalidation is needed.
ValidBlockBitSet valid_block; ValidBlockBitSet valid_block;
// This array is indexed with the shifted PC and likely holds the correct block id. // This contains the entry points for each block.
// This is used as a fast cache of block_map used in the assembly dispatcher. // It is used by the assembly dispatcher to quickly
// It is implemented via a shm segment using m_block_map_arena. // know where to jump based on pc and msr bits.
JitBlock** m_fast_block_map = 0; Common::LazyMemoryRegion m_entry_points_arena;
Common::LazyMemoryRegion m_block_map_arena; u8** m_entry_points_ptr = 0;
// An alternative for the above fast_block_map but without a shm segment // An alternative for the above but without a shm segment
// in case the shm memory region couldn't be allocated. // in case the shm memory region couldn't be allocated.
std::array<JitBlock*, FAST_BLOCK_MAP_FALLBACK_ELEMENTS> std::array<JitBlock*, FAST_BLOCK_MAP_FALLBACK_ELEMENTS>
m_fast_block_map_fallback{}; // start_addr & mask -> number m_fast_block_map_fallback{}; // start_addr & mask -> number
JitBlock** m_fast_block_map_ptr = 0;
}; };