From cf0752f93715bc19653ee93fc93a8c51257b4e6a Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Wed, 14 Sep 2022 21:58:05 +0100 Subject: [PATCH] Use NCE memory tracking for guest shaders Prevents needing to hash them for every single pipeline state update, without this just hashing shaders takes up a significant amount of time. --- .../gpu/interconnect/maxwell_3d/common.h | 6 ++ .../interconnect/maxwell_3d/maxwell_3d.cpp | 4 +- .../gpu/interconnect/maxwell_3d/maxwell_3d.h | 2 + .../maxwell_3d/pipeline_state.cpp | 64 +++++++++++++++++-- .../interconnect/maxwell_3d/pipeline_state.h | 27 +++++++- .../skyline/soc/gm20b/engines/maxwell_3d.cpp | 2 +- 6 files changed, 94 insertions(+), 11 deletions(-) diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/common.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/common.h index 8593f0bc..85b5a82f 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/common.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/common.h @@ -8,6 +8,10 @@ #include #include +namespace skyline::kernel { + class MemoryManager; +} + namespace skyline::soc::gm20b { struct ChannelContext; } @@ -26,6 +30,8 @@ namespace skyline::gpu::interconnect::maxwell3d { soc::gm20b::ChannelContext &channelCtx; CommandExecutor &executor; GPU &gpu; + nce::NCE &nce; + kernel::MemoryManager &memory; }; /** diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp index 1d6325dc..f2fa8b8b 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp @@ -14,9 +14,11 @@ namespace skyline::gpu::interconnect::maxwell3d { Maxwell3D::Maxwell3D(GPU &gpu, soc::gm20b::ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor, + nce::NCE &nce, + skyline::kernel::MemoryManager &memoryManager, DirtyManager &manager, const EngineRegisterBundle 
&registerBundle) - : ctx{channelCtx, executor, gpu}, + : ctx{channelCtx, executor, gpu, nce, memoryManager}, activeState{manager, registerBundle.activeStateRegisters}, clearEngineRegisters{registerBundle.clearRegisters}, constantBuffers{manager, registerBundle.constantBufferSelectorRegisters}, diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h index 39692189..0ab3ac3b 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h @@ -47,6 +47,8 @@ namespace skyline::gpu::interconnect::maxwell3d { Maxwell3D(GPU &gpu, soc::gm20b::ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor, + nce::NCE &nce, + kernel::MemoryManager &memoryManager, DirtyManager &manager, const EngineRegisterBundle &registerBundle); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp index 737a2728..9299a6e6 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp @@ -4,6 +4,8 @@ // Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) #include +#include +#include #include #include #include @@ -228,24 +230,74 @@ namespace skyline::gpu::interconnect::maxwell3d { return; } - binary.binary = ctx.channelCtx.asCtx->gmmu.ReadTill(shaderBacking, engine->programRegion + engine->pipeline.programOffset, [](span data) -> std::optional { + auto [blockMapping, blockOffset]{ctx.channelCtx.asCtx->gmmu.LookupBlock(engine->programRegion + engine->pipeline.programOffset)}; + + // Skip looking up the mirror if it is the same as the one used for the previous update + if (!mirrorBlock.valid() || !mirrorBlock.contains(blockMapping)) { + auto 
mirrorIt{mirrorMap.find(blockMapping.data())}; + if (mirrorIt == mirrorMap.end()) { + // Allocate a host mirror for the mapping and trap the guest region + auto newIt{mirrorMap.emplace(blockMapping.data(), std::make_unique(ctx.memory.CreateMirror(blockMapping)))}; + + // We need to create the trap after allocating the entry so that we have an `invalid` pointer we can pass in + auto trapHandle{ctx.nce.CreateTrap(blockMapping, [](){}, [](){ return true; }, [dirty = &newIt.first->second->dirty, mutex = &trapMutex](){ + std::scoped_lock lock{*mutex}; // Don't use lock callback here since we need trapMutex to be always locked on accesses to prevent UAFs + *dirty = true; + return true; + })}; + + // Write only trap + ctx.nce.TrapRegions(trapHandle, true); + + entry = newIt.first->second.get(); + entry->trap = trapHandle; + } else { + entry = mirrorIt->second.get(); + } + + mirrorBlock = blockMapping; + } + + // If the mirror entry has been written to, clear its shader binary cache and retrap to catch any future writes + if (entry->dirty) { + entry->cache.clear(); + entry->dirty = false; + ctx.nce.TrapRegions(*entry->trap, true); + } else if (auto it{entry->cache.find(blockMapping.data() + blockOffset)}; it != entry->cache.end()) { + binary = it->second.binary; + hash = it->second.hash; + return; + } + + // If nothing was in the cache then do a full shader parse + auto guest{[](span mapping) { // We attempt to find the shader size by looking for "BRA $" (Infinite Loop) which is used as padding at the end of the shader // UAM Shader Compiler Reference: https://github.com/devkitPro/uam/blob/5a5afc2bae8b55409ab36ba45be63fcb73f68993/source/compiler_iface.cpp#L319-L351 constexpr u64 BraSelf1{0xE2400FFFFF87000F}, BraSelf2{0xE2400FFFFF07000F}; - span shaderInstructions{data.cast()}; + span shaderInstructions{mapping.cast()}; for (auto it{shaderInstructions.begin()}; it != shaderInstructions.end(); it++) { auto instruction{*it}; if (instruction == BraSelf1 || instruction == 
BraSelf2) [[unlikely]] // It is far more likely that the instruction doesn't match so this is an unlikely case - return static_cast(std::distance(shaderInstructions.begin(), it)) * sizeof(u64); + return span{shaderInstructions.begin(), it}.cast(); } - return std::nullopt; - }); + + return span{}; + }(blockMapping.subspan(blockOffset))}; binary.baseOffset = engine->pipeline.programOffset; + hash = XXH64(guest.data(), guest.size_bytes(), 0); - hash = XXH64(binary.binary.data(), binary.binary.size_bytes(), 0); + binary.binary = {guest.data() - mirrorBlock.data() + entry->mirror.data(), guest.size()}; + + entry->cache.insert({blockMapping.data() + blockOffset, CacheEntry{binary, hash}}); + } + + PipelineStageState::~PipelineStageState() { + std::scoped_lock lock{trapMutex}; + //for (const auto &mirror : mirrorMap) + // ctx.nce.DestroyTrap(*mirror.second->trap); } /* Vertex Input State */ diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h index 8627985e..9ba6d6d7 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h @@ -65,12 +65,31 @@ namespace skyline::gpu::interconnect::maxwell3d { }; private: + struct CacheEntry { + ShaderBinary binary; + u64 hash; + + CacheEntry(ShaderBinary binary, u64 hash) : binary{binary}, hash{hash} {} + }; + + /** + * @brief Holds mirror state for a single GPU mapped block + */ + struct MirrorEntry { + span mirror; + tsl::robin_map cache; + std::optional trap; + bool dirty{}; + MirrorEntry(span alignedMirror) : mirror{alignedMirror} {} + }; + dirty::BoundSubresource engine; engine::Pipeline::Shader::Type shaderType; - constexpr static size_t MaxShaderBytecodeSize{1 * 1024 * 1024}; //!< The largest shader binary that we support (1 MiB) - - std::array shaderBacking; + tsl::robin_map> mirrorMap; + std::mutex trapMutex; //!< Protects 
accesses from trap handlers to the mirror map + MirrorEntry *entry{}; + span mirrorBlock{}; //!< Guest mapped memory block corresponding to `entry` public: ShaderBinary binary; @@ -78,6 +97,8 @@ namespace skyline::gpu::interconnect::maxwell3d { PipelineStageState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine, u8 shaderType); + ~PipelineStageState(); + void Flush(InterconnectContext &ctx); }; diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp index 1c16faef..17b1d590 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp @@ -62,7 +62,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d { syncpoints{state.soc->host1x.syncpoints}, i2m{channelCtx}, dirtyManager{registers}, - interconnect{*state.gpu, channelCtx, executor, dirtyManager, MakeEngineRegisters(registers)}, + interconnect{*state.gpu, channelCtx, executor, *state.nce, state.process->memory, dirtyManager, MakeEngineRegisters(registers)}, channelCtx{channelCtx} { executor.AddFlushCallback([this]() { FlushEngineState(); }); InitializeRegisters();