Use NCE memory tracking for guest shaders

Prevents needing to hash them for every single pipeline state update; without this, just hashing shaders takes up a significant amount of time.
2025-01-23 02:11:12 +01:00 · 2022-09-14 21:58:05 +01:00 · 2022-09-14 21:58:05 +01:00 · cf0752f937
commit cf0752f937
parent 19a75c3f65
6 changed files with 94 additions and 11 deletions
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/common.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/common.h
@ -8,6 +8,10 @@
 #include <soc/gm20b/engines/maxwell/types.h>
 #include <gpu/buffer.h>

+namespace skyline::kernel {
+    class MemoryManager; // Forward declaration, this header only holds a reference to it
+}
+
 namespace skyline::soc::gm20b {
    struct ChannelContext;
 }
@ -26,6 +30,8 @@ namespace skyline::gpu::interconnect::maxwell3d {
        soc::gm20b::ChannelContext &channelCtx;
        CommandExecutor &executor;
        GPU &gpu;
+        nce::NCE &nce;
+        kernel::MemoryManager &memory;
    };

    /**
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
@ -14,9 +14,11 @@ namespace skyline::gpu::interconnect::maxwell3d {
    Maxwell3D::Maxwell3D(GPU &gpu,
                         soc::gm20b::ChannelContext &channelCtx,
                         gpu::interconnect::CommandExecutor &executor,
+                         nce::NCE &nce,
+                         skyline::kernel::MemoryManager &memoryManager,
                         DirtyManager &manager,
                         const EngineRegisterBundle &registerBundle)
-        : ctx{channelCtx, executor, gpu},
+        : ctx{channelCtx, executor, gpu, nce, memoryManager},
          activeState{manager, registerBundle.activeStateRegisters},
          clearEngineRegisters{registerBundle.clearRegisters},
          constantBuffers{manager, registerBundle.constantBufferSelectorRegisters},
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h
@ -47,6 +47,8 @@ namespace skyline::gpu::interconnect::maxwell3d {
        Maxwell3D(GPU &gpu,
                  soc::gm20b::ChannelContext &channelCtx,
                  gpu::interconnect::CommandExecutor &executor,
+                  nce::NCE &nce,
+                  kernel::MemoryManager &memoryManager,
                  DirtyManager &manager,
                  const EngineRegisterBundle &registerBundle);

--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp
@ -4,6 +4,8 @@
 // Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)

 #include <range/v3/algorithm/for_each.hpp>
+#include <nce.h>
+#include <kernel/memory.h>
 #include <soc/gm20b/channel.h>
 #include <soc/gm20b/gmmu.h>
 #include <gpu/texture/format.h>
@ -228,24 +230,74 @@ namespace skyline::gpu::interconnect::maxwell3d {
            return;
        }

-        binary.binary = ctx.channelCtx.asCtx->gmmu.ReadTill(shaderBacking, engine->programRegion + engine->pipeline.programOffset, [](span<u8> data) -> std::optional<size_t> {
+        auto [blockMapping, blockOffset]{ctx.channelCtx.asCtx->gmmu.LookupBlock(engine->programRegion + engine->pipeline.programOffset)};
+
+        // Skip looking up the mirror if it is the same as the one used for the previous update
+        if (!mirrorBlock.valid() || !mirrorBlock.contains(blockMapping)) {
+            auto mirrorIt{mirrorMap.find(blockMapping.data())};
+            if (mirrorIt == mirrorMap.end()) {
+                // Allocate a host mirror for the mapping and trap the guest region
+                auto newIt{mirrorMap.emplace(blockMapping.data(), std::make_unique<MirrorEntry>(ctx.memory.CreateMirror(blockMapping)))};
+
+                // We need to create the trap after allocating the entry so that we have a `dirty` pointer we can pass in
+                auto trapHandle{ctx.nce.CreateTrap(blockMapping, [](){}, [](){ return true; }, [dirty = &newIt.first->second->dirty, mutex = &trapMutex](){
+                    std::scoped_lock lock{*mutex}; // Don't use lock callback here since we need trapMutex to be always locked on accesses to prevent UAFs
+                    *dirty = true;
+                    return true;
+                })};
+
+                // Write only trap
+                ctx.nce.TrapRegions(trapHandle, true);
+
+                entry = newIt.first->second.get();
+                entry->trap = trapHandle;
+            } else {
+                entry = mirrorIt->second.get();
+            }
+
+            mirrorBlock = blockMapping;
+        }
+
+        // If the mirror entry has been written to, clear its shader binary cache and retrap to catch any future writes
+        if (entry->dirty) {
+            entry->cache.clear();
+            entry->dirty = false;
+            ctx.nce.TrapRegions(*entry->trap, true);
+        } else if (auto it{entry->cache.find(blockMapping.data() + blockOffset)}; it != entry->cache.end()) {
+            binary = it->second.binary;
+            hash = it->second.hash;
+            return;
+        }
+
+        // If nothing was in the cache then do a full shader parse
+        auto guest{[](span<u8> mapping) {
            // We attempt to find the shader size by looking for "BRA $" (Infinite Loop) which is used as padding at the end of the shader
            // UAM Shader Compiler Reference: https://github.com/devkitPro/uam/blob/5a5afc2bae8b55409ab36ba45be63fcb73f68993/source/compiler_iface.cpp#L319-L351
            constexpr u64 BraSelf1{0xE2400FFFFF87000F}, BraSelf2{0xE2400FFFFF07000F};

-            span<u64> shaderInstructions{data.cast<u64, std::dynamic_extent, true>()};
+            span<u64> shaderInstructions{mapping.cast<u64, std::dynamic_extent, true>()};
            for (auto it{shaderInstructions.begin()}; it != shaderInstructions.end(); it++) {
                auto instruction{*it};
                if (instruction == BraSelf1 || instruction == BraSelf2) [[unlikely]]
                    // It is far more likely that the instruction doesn't match so this is an unlikely case
-                    return static_cast<size_t>(std::distance(shaderInstructions.begin(), it)) * sizeof(u64);
+                    return span{shaderInstructions.begin(), it}.cast<u8>();
            }
-            return std::nullopt;
-        });
+
+            return span<u8>{};
+        }(blockMapping.subspan(blockOffset))};

        binary.baseOffset = engine->pipeline.programOffset;
+        hash = XXH64(guest.data(), guest.size_bytes(), 0);

-        hash = XXH64(binary.binary.data(), binary.binary.size_bytes(), 0);
+        binary.binary = {guest.data() - mirrorBlock.data() + entry->mirror.data(), guest.size()};
+
+        entry->cache.insert({blockMapping.data() + blockOffset, CacheEntry{binary, hash}});
+    }
+
+    PipelineStageState::~PipelineStageState() {
+        std::scoped_lock lock{trapMutex}; // Trap callbacks lock this same mutex, so this blocks until any callback currently holding it has finished
+        //for (const auto &mirror : mirrorMap) // NOTE(review): trap destruction is disabled, the traps created in Flush still hold pointers to `dirty` and `trapMutex` - confirm they cannot fire after this object is destroyed
+        //    ctx.nce.DestroyTrap(*mirror.second->trap);
     }

    /* Vertex Input State */
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h
@ -65,12 +65,31 @@ namespace skyline::gpu::interconnect::maxwell3d {
        };

      private:
+        struct CacheEntry { // A previously parsed shader, stored in MirrorEntry::cache keyed by the shader's guest address
+            ShaderBinary binary; //!< The shader binary as it was set by Flush, its data points into the host mirror rather than the guest mapping
+            u64 hash; //!< XXH64 hash (seed 0) computed over the guest shader bytes
+
+            CacheEntry(ShaderBinary binary, u64 hash) : binary{binary}, hash{hash} {}
+        };
+
+        /**
+         * @brief Holds mirror state for a single GPU mapped block
+         */
+        struct MirrorEntry {
+            span<u8> mirror; //!< Host mirror of the guest block, obtained from MemoryManager::CreateMirror when the entry is allocated
+            tsl::robin_map<u8 *, CacheEntry> cache; //!< Parsed shaders keyed by their guest address inside the block, cleared whenever `dirty` is observed set
+            std::optional<nce::NCE::TrapHandle> trap; //!< Handle of the trap over the guest block, assigned right after the trap has been created
+            bool dirty{}; //!< Set by the trap callback while holding trapMutex, reset by Flush before retrapping (NOTE(review): Flush reads and resets this without taking trapMutex in the visible code - confirm synchronization)
+            MirrorEntry(span<u8> alignedMirror) : mirror{alignedMirror} {}
+        };
+
        dirty::BoundSubresource<EngineRegisters> engine;
        engine::Pipeline::Shader::Type shaderType;

-        constexpr static size_t MaxShaderBytecodeSize{1 * 1024 * 1024}; //!< The largest shader binary that we support (1 MiB)
-
-        std::array<u8, MaxShaderBytecodeSize> shaderBacking;
+        tsl::robin_map<u8 *, std::unique_ptr<MirrorEntry>> mirrorMap;
+        std::mutex trapMutex; //!< Protects accesses from trap handlers to the mirror map
+        MirrorEntry *entry{};
+        span<u8> mirrorBlock{}; //!< Guest mapped memory block corresponding to `entry`

      public:
        ShaderBinary binary;
@ -78,6 +97,8 @@ namespace skyline::gpu::interconnect::maxwell3d {

        PipelineStageState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine, u8 shaderType);

+        ~PipelineStageState();
+
        void Flush(InterconnectContext &ctx);
    };

--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
@ -62,7 +62,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
          syncpoints{state.soc->host1x.syncpoints},
          i2m{channelCtx},
          dirtyManager{registers},
-          interconnect{*state.gpu, channelCtx, executor, dirtyManager, MakeEngineRegisters(registers)},
+          interconnect{*state.gpu, channelCtx, executor, *state.nce, state.process->memory, dirtyManager, MakeEngineRegisters(registers)},
          channelCtx{channelCtx} {
        executor.AddFlushCallback([this]() { FlushEngineState(); });
        InitializeRegisters();