From 23a7f70a8e109b16f443d5d4a45ba945b85d272d Mon Sep 17 00:00:00 2001
From: Billy Laws
Date: Fri, 18 Nov 2022 21:28:04 +0000
Subject: [PATCH] Commonise maxwell3d guest shader caching code

---
 .../gpu/interconnect/common/shader_cache.cpp  | 119 ++++++++++++++++++
 .../gpu/interconnect/common/shader_cache.h    |  42 +++++++
 .../maxwell_3d/pipeline_manager.h             |   5 -
 .../maxwell_3d/pipeline_state.cpp             | 103 +--------------
 .../interconnect/maxwell_3d/pipeline_state.h  |  33 +----
 5 files changed, 168 insertions(+), 134 deletions(-)
 create mode 100644 app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.cpp
 create mode 100644 app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.h

diff --git a/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.cpp b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.cpp
new file mode 100644
index 00000000..2820c67f
--- /dev/null
+++ b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.cpp
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/Ryujinx/)
+// Copyright © 2022 yuzu Team and Contributors (https://github.com/yuzu-emu/)
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#include
+#include
+#include
+#include
+#include
+#include "shader_cache.h"
+
+namespace skyline::gpu::interconnect {
+    /* Pipeline Stage */
+    ShaderBinary ShaderCache::Lookup(InterconnectContext &ctx, u64 programBase, u32 programOffset) {
+        lastProgramBase = programBase;
+        lastProgramOffset = programOffset;
+        auto[blockMapping, blockOffset]{ctx.channelCtx.asCtx->gmmu.LookupBlock(programBase + programOffset)};
+
+        if (!trapExecutionLock)
+            trapExecutionLock.emplace(trapMutex);
+
+        // Skip looking up the mirror if it is the same as the one used for the previous update
+        if (!mirrorBlock.valid() || !mirrorBlock.contains(blockMapping)) {
+            auto mirrorIt{mirrorMap.find(blockMapping.data())};
+            if (mirrorIt == mirrorMap.end()) {
+                // Allocate a host mirror for the mapping and trap the guest region
+                auto newIt{mirrorMap.emplace(blockMapping.data(), std::make_unique<MirrorEntry>(ctx.memory.CreateMirror(blockMapping)))};
+
+                // We need to create the trap after allocating the entry so that we have a valid `entry` pointer we can pass in
+                auto trapHandle{ctx.nce.CreateTrap(blockMapping, [mutex = &trapMutex]() {
+                    std::scoped_lock lock{*mutex};
+                    return;
+                }, []() { return true; }, [entry = newIt.first->second.get(), mutex = &trapMutex]() {
+                    std::unique_lock lock{*mutex, std::try_to_lock};
+                    if (!lock)
+                        return false;
+
+                    if (++entry->trapCount <= MirrorEntry::SkipTrapThreshold)
+                        entry->dirty = true;
+                    return true;
+                })};
+
+                // Write only trap
+                ctx.nce.TrapRegions(trapHandle, true);
+
+                entry = newIt.first->second.get();
+                entry->trap = trapHandle;
+            } else {
+                entry = mirrorIt->second.get();
+            }
+
+            mirrorBlock = blockMapping;
+        }
+
+        if (entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber) {
+            entry->channelSequenceNumber = ctx.channelCtx.channelSequenceNumber;
+            entry->dirty = true;
+        }
+
+        // If the mirror entry has been written to, clear its shader binary cache and retrap to catch any future writes
+        if (entry->dirty) {
+            entry->cache.clear();
+            entry->dirty = false;
+
+            if (entry->trapCount <= MirrorEntry::SkipTrapThreshold)
+                ctx.nce.TrapRegions(*entry->trap, true);
+        } else if (auto it{entry->cache.find(blockMapping.data() + blockOffset)}; it != entry->cache.end()) {
+            return it->second;
+        }
+
+        // entry->mirror may not be a direct mirror of blockMapping and may just contain it as a subregion, so we need to explicitly calculate the offset
+        span<u8> blockMappingMirror{blockMapping.data() - mirrorBlock.data() + entry->mirror.data(), blockMapping.size()};
+
+        ShaderBinary binary{};
+        // If nothing was in the cache then do a full shader parse
+        binary.binary = [](span<u8> mapping) {
+            // We attempt to find the shader size by looking for "BRA $" (Infinite Loop) which is used as padding at the end of the shader
+            // UAM Shader Compiler Reference: https://github.com/devkitPro/uam/blob/5a5afc2bae8b55409ab36ba45be63fcb73f68993/source/compiler_iface.cpp#L319-L351
+            constexpr u64 BraSelf1{0xE2400FFFFF87000F}, BraSelf2{0xE2400FFFFF07000F};
+
+            span<u64> shaderInstructions{mapping.cast<u64>()};
+            for (auto it{shaderInstructions.begin()}; it != shaderInstructions.end(); it++) {
+                auto instruction{*it};
+                if (instruction == BraSelf1 || instruction == BraSelf2) [[unlikely]]
+                    // It is far more likely that the instruction doesn't match so this is an unlikely case
+                    return span{shaderInstructions.begin(), it}.cast<u8>();
+            }
+
+            return span<u8>{};
+        }(blockMappingMirror.subspan(blockOffset));
+
+        binary.baseOffset = programOffset;
+        binary.hash = XXH64(binary.binary.data(), binary.binary.size_bytes(), 0);
+
+        entry->cache.insert({blockMapping.data() + blockOffset, binary});
+
+        return binary;
+    }
+
+    bool ShaderCache::Refresh(InterconnectContext &ctx, u64 programBase, u32 programOffset) {
+        if (!trapExecutionLock)
+            trapExecutionLock.emplace(trapMutex);
+
+        if (programBase != lastProgramBase || programOffset != lastProgramOffset)
+            return true;
+
+        if (entry && entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber)
+            return true;
+        else if (entry && entry->dirty)
+            return true;
+
+        return false;
+    }
+
+    void ShaderCache::PurgeCaches() {
+        trapExecutionLock.reset();
+    }
+}
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.h b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.h
new file mode 100644
index 00000000..82f914e0
--- /dev/null
+++ b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.h
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#pragma once
+
+#include "common.h"
+
+namespace skyline::gpu::interconnect {
+    class ShaderCache {
+      private:
+        /**
+         * @brief Holds mirror state for a single GPU mapped block
+         */
+        struct MirrorEntry {
+            span<u8> mirror;
+            tsl::robin_map<u8 *, ShaderBinary> cache;
+            std::optional<nce::NCE::TrapHandle> trap;
+
+            static constexpr u32 SkipTrapThreshold{20}; //!< Threshold for the number of times a mirror trap needs to be hit before we fall back to always hashing
+            u32 trapCount{}; //!< The number of times the trap has been hit, used to avoid trapping in cases where the constant retraps would harm performance
+            size_t channelSequenceNumber{}; //!< For the case where `trapCount > SkipTrapThreshold`, the memory sequence number used to clear the cache after every access
+            bool dirty{}; //!< If the trap has been hit and the cache needs to be cleared
+
+            MirrorEntry(span<u8> alignedMirror) : mirror{alignedMirror} {}
+        };
+
+        tsl::robin_map<u8 *, std::unique_ptr<MirrorEntry>> mirrorMap;
+        std::mutex trapMutex; //!< Protects accesses from trap handlers to the mirror map
+        std::optional<std::scoped_lock<std::mutex>> trapExecutionLock; //!< Persistently held lock over an execution to avoid frequent relocking
+        MirrorEntry *entry{};
+        span<u8> mirrorBlock{}; //!< Guest mapped memory block corresponding to `entry`
+        u64 lastProgramBase{};
+        u32 lastProgramOffset{};
+
+      public:
+        ShaderBinary Lookup(InterconnectContext &ctx, u64 programBase, u32 programOffset);
+
+        bool Refresh(InterconnectContext &ctx, u64 programBase, u32 programOffset);
+
+        void PurgeCaches();
+    };
+}
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h
index ba4cb634..8aaf81ea 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h
@@ -17,11 +17,6 @@ namespace skyline::gpu {
 }
 
 namespace skyline::gpu::interconnect::maxwell3d {
-    struct ShaderBinary {
-        span<u8> binary;
-        u32 baseOffset;
-    };
-
     class Pipeline {
       public:
         struct ShaderStage {
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp
index a641b88d..5e3a3f81 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp
@@ -255,112 +255,19 @@ namespace skyline::gpu::interconnect::maxwell3d {
             throw exception("Shader type mismatch: {} != {}!", engine->pipeline.shader.type, static_cast<u8>(shaderType));
 
         if (!engine->pipeline.shader.enable && shaderType != engine::Pipeline::Shader::Type::Vertex) {
-            hash = 0;
+            binary.hash = 0;
             return;
         }
 
-        auto[blockMapping, blockOffset]{ctx.channelCtx.asCtx->gmmu.LookupBlock(engine->programRegion + engine->pipeline.programOffset)};
-
-        if (!trapExecutionLock)
-            trapExecutionLock.emplace(trapMutex);
-
-        // Skip looking up the mirror if it is the same as the one used for the previous update
-        if (!mirrorBlock.valid() || !mirrorBlock.contains(blockMapping)) {
-            auto mirrorIt{mirrorMap.find(blockMapping.data())};
-            if (mirrorIt == mirrorMap.end()) {
-                // Allocate a host mirror for the mapping and trap the guest region
-                auto newIt{mirrorMap.emplace(blockMapping.data(), std::make_unique<MirrorEntry>(ctx.memory.CreateMirror(blockMapping)))};
-
-                // We need to create the trap after allocating the entry so that we have a valid `entry` pointer we can pass in
-                auto trapHandle{ctx.nce.CreateTrap(blockMapping, [mutex = &trapMutex]() {
-                    std::scoped_lock lock{*mutex};
-                    return;
-                }, []() { return true; }, [entry = newIt.first->second.get(), mutex = &trapMutex]() {
-                    std::unique_lock lock{*mutex, std::try_to_lock};
-                    if (!lock)
-                        return false;
-
-                    if (++entry->trapCount <= MirrorEntry::SkipTrapThreshold)
-                        entry->dirty = true;
-                    return true;
-                })};
-
-                // Write only trap
-                ctx.nce.TrapRegions(trapHandle, true);
-
-                entry = newIt.first->second.get();
-                entry->trap = trapHandle;
-            } else {
-                entry = mirrorIt->second.get();
-            }
-
-            mirrorBlock = blockMapping;
-        }
-
-        if (entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber) {
-            entry->channelSequenceNumber = ctx.channelCtx.channelSequenceNumber;
-            entry->dirty = true;
-        }
-
-        // If the mirror entry has been written to, clear its shader binary cache and retrap to catch any future writes
-        if (entry->dirty) {
-            entry->cache.clear();
-            entry->dirty = false;
-
-            if (entry->trapCount <= MirrorEntry::SkipTrapThreshold)
-                ctx.nce.TrapRegions(*entry->trap, true);
-        } else if (auto it{entry->cache.find(blockMapping.data() + blockOffset)}; it != entry->cache.end()) {
-            binary = it->second.binary;
-            hash = it->second.hash;
-            return;
-        }
-
-        // entry->mirror may not be a direct mirror of blockMapping and may just contain it as a subregion, so we need to explicitly calculate the offset
-        span<u8> blockMappingMirror{blockMapping.data() - mirrorBlock.data() + entry->mirror.data(), blockMapping.size()};
-
-        // If nothing was in the cache then do a full shader parse
-        binary.binary = [](span<u8> mapping) {
-            // We attempt to find the shader size by looking for "BRA $" (Infinite Loop) which is used as padding at the end of the shader
-            // UAM Shader Compiler Reference: https://github.com/devkitPro/uam/blob/5a5afc2bae8b55409ab36ba45be63fcb73f68993/source/compiler_iface.cpp#L319-L351
-            constexpr u64 BraSelf1{0xE2400FFFFF87000F}, BraSelf2{0xE2400FFFFF07000F};
-
-            span<u64> shaderInstructions{mapping.cast<u64>()};
-            for (auto it{shaderInstructions.begin()}; it != shaderInstructions.end(); it++) {
-                auto instruction{*it};
-                if (instruction == BraSelf1 || instruction == BraSelf2) [[unlikely]]
-                    // It is far more likely that the instruction doesn't match so this is an unlikely case
-                    return span{shaderInstructions.begin(), it}.cast<u8>();
-            }
-
-            return span<u8>{};
-        }(blockMappingMirror.subspan(blockOffset));
-
-        binary.baseOffset = engine->pipeline.programOffset;
-        hash = XXH64(binary.binary.data(), binary.binary.size_bytes(), 0);
-
-        entry->cache.insert({blockMapping.data() + blockOffset, CacheEntry{binary, hash}});
+        binary = cache.Lookup(ctx, engine->programRegion, engine->pipeline.programOffset);
     }
 
     bool PipelineStageState::Refresh(InterconnectContext &ctx) {
-        if (!trapExecutionLock)
-            trapExecutionLock.emplace(trapMutex);
-
-        if (entry && entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber)
-            return true;
-        else if (entry && entry->dirty)
-            return true;
-
-        return false;
+        return cache.Refresh(ctx, engine->programRegion, engine->pipeline.programOffset);
     }
 
     void PipelineStageState::PurgeCaches() {
-        trapExecutionLock.reset();
-    }
-
-    PipelineStageState::~PipelineStageState() {
-        std::scoped_lock lock{trapMutex};
-        //for (const auto &mirror : mirrorMap)
-        //    ctx.nce.DestroyTrap(*mirror.second->trap);
+        cache.PurgeCaches();
     }
 
     /* Vertex Input State */
@@ -584,7 +491,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
         std::array<ShaderBinary, engine::PipelineCount> shaderBinaries;
         for (size_t i{}; i < engine::PipelineCount; i++) {
             const auto &stage{pipelineStages[i].UpdateGet(ctx)};
-            packedState.shaderHashes[i] = stage.hash;
+            packedState.shaderHashes[i] = stage.binary.hash;
             shaderBinaries[i] = stage.binary;
         }
 
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h
index d6ae11ab..07462f4f 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_state.h
@@ -5,6 +5,7 @@
 
 #include
 #include
+#include
 #include "common.h"
 #include "packed_pipeline_state.h"
 #include "pipeline_manager.h"
@@ -67,46 +68,16 @@ namespace skyline::gpu::interconnect::maxwell3d {
         };
 
      private:
-        struct CacheEntry {
-            ShaderBinary binary;
-            u64 hash;
-
-            CacheEntry(ShaderBinary binary, u64 hash) : binary{binary}, hash{hash} {}
-        };
-
-        /**
-         * @brief Holds mirror state for a single GPU mapped block
-         */
-        struct MirrorEntry {
-            span<u8> mirror;
-            tsl::robin_map<u8 *, CacheEntry> cache;
-            std::optional<nce::NCE::TrapHandle> trap;
-
-            static constexpr u32 SkipTrapThreshold{20}; //!< Threshold for the number of times a mirror trap needs to be hit before we fall back to always hashing
-            u32 trapCount{}; //!< The number of times the trap has been hit, used to avoid trapping in cases where the constant retraps would harm performance
-            size_t channelSequenceNumber{}; //!< For the case where `trapCount > SkipTrapThreshold`, the memory sequence number used to clear the cache after every access
-            bool dirty{}; //!< If the trap has been hit and the cache needs to be cleared
-
-            MirrorEntry(span<u8> alignedMirror) : mirror{alignedMirror} {}
-        };
-
         dirty::BoundSubresource<EngineRegisters> engine;
         engine::Pipeline::Shader::Type shaderType;
 
-        tsl::robin_map<u8 *, std::unique_ptr<MirrorEntry>> mirrorMap;
-        std::mutex trapMutex; //!< Protects accesses from trap handlers to the mirror map
-        std::optional<std::scoped_lock<std::mutex>> trapExecutionLock;
-        MirrorEntry *entry{};
-        span<u8> mirrorBlock{}; //!< Guest mapped memory block corresponding to `entry`
+        ShaderCache cache;
 
      public:
         ShaderBinary binary;
-        u64 hash;
 
         PipelineStageState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine, u8 shaderType);
 
-        ~PipelineStageState();
-
        void Flush(InterconnectContext &ctx);
 
        bool Refresh(InterconnectContext &ctx);
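
Below is a minimal usage sketch of the commonised ShaderCache for reference, mirroring how PipelineStageState drives it after this change. The HypotheticalStageState wrapper, its member names, and the assumption that shader_cache.h plus the interconnect common types (InterconnectContext, ShaderBinary) are in scope are illustrative only; just the Lookup/Refresh/PurgeCaches interface comes from the patch itself.

    // Hypothetical consumer of the commonised ShaderCache (not part of this patch);
    // assumes shader_cache.h and the interconnect common types are already included
    namespace skyline::gpu::interconnect {
        struct HypotheticalStageState {
            ShaderCache cache;     //!< The commonised guest shader cache introduced by this patch
            ShaderBinary binary{}; //!< The last binary returned by the cache

            // Parses (or fetches from the cache) the guest shader at programBase + programOffset
            void Flush(InterconnectContext &ctx, u64 programBase, u32 programOffset) {
                binary = cache.Lookup(ctx, programBase, programOffset);
            }

            // Reports whether the previously returned binary may be stale, e.g. because a different
            // program address is now bound or the backing guest memory was written to (trap hit)
            bool Refresh(InterconnectContext &ctx, u64 programBase, u32 programOffset) {
                return cache.Refresh(ctx, programBase, programOffset);
            }

            // Called at the end of an execution so the persistently held trap lock is released
            void PurgeCaches() {
                cache.PurgeCaches();
            }
        };
    }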