From 25a29f90445082fae880224b0cc0510626142512 Mon Sep 17 00:00:00 2001
From: PixelyIon
Date: Sun, 1 May 2022 18:15:13 +0530
Subject: [PATCH] Skip zero-initializing shader bytecode backing

The backing for shader data would implicitly be zero-initialized due to
a `resize` on every shader parse; this was entirely unnecessary, as we
would overwrite the entire range regardless. We avoid this by using
statically allocated storage and a span over it containing the shader
bytecode, which eliminates any unnecessary clear semantics without
resorting to more complex solutions such as a custom allocator.
---
 .../main/cpp/skyline/common/address_space.h     | 16 +++++++---------
 .../gpu/interconnect/graphics_context.h         | 20 ++++++++++----------
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/app/src/main/cpp/skyline/common/address_space.h b/app/src/main/cpp/skyline/common/address_space.h
index d816c1e0..3a64a44f 100644
--- a/app/src/main/cpp/skyline/common/address_space.h
+++ b/app/src/main/cpp/skyline/common/address_space.h
@@ -135,11 +135,12 @@ namespace skyline {
         /**
          * @brief Writes contents starting from the virtual address till the end of the span or an unmapped block has been hit or when `function` returns a non-nullopt value
          * @param function A function that is called on every block where it should return an end offset into the block when it wants to end reading or std::nullopt when it wants to continue reading
-         * @return If returning was caused by the supplied function returning a non-nullopt value or other conditions
+         * @return A span into the supplied container with the contents of the memory region
          * @note The function will **NOT** be run on any sparse block
+         * @note The function will provide no feedback on whether the end has been reached or there was an early exit
          */
         template<typename Function, typename Container>
-        bool ReadTill(Container& destination, VaType virt, Function function) {
+        span<u8> ReadTill(Container& destination, VaType virt, Function function) {
             //TRACE_EVENT("containers", "FlatMemoryManager::ReadTill");

             std::scoped_lock lock(this->blockMutex);
@@ -158,18 +159,15 @@ namespace skyline {

             while (remainingSize) {
                 if (predecessor->phys == nullptr) {
-                    destination.resize(destination.size() - remainingSize);
-                    return false;
+                    return {destination.data(), destination.size() - remainingSize};
                 } else {
                     if (predecessor->extraInfo.sparseMapped) {
                         std::memset(pointer, 0, blockReadSize);
                     } else {
                         auto end{function(span(blockPhys, blockReadSize))};
                         std::memcpy(pointer, blockPhys, end ? *end : blockReadSize);
-                        if (end) {
-                            destination.resize((destination.size() - remainingSize) + *end);
-                            return true;
-                        }
+                        if (end)
+                            return {destination.data(), (destination.size() - remainingSize) + *end};
                     }
                 }

@@ -183,7 +181,7 @@ namespace skyline {
                 }
             }

-            return false;
+            return {destination.data(), destination.size()};
         }

         void Write(VaType virt, u8 *source, VaType size);

diff --git a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
index 3462ca83..5d91a5c4 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
@@ -747,7 +747,8 @@ namespace skyline::gpu::interconnect {
             bool invalidated{true}; //!< If the shader that existed earlier has been invalidated
             bool shouldCheckSame{false}; //!< If we should do a check for the shader being the same as before
             u32 offset{}; //!< Offset of the shader from the base IOVA
-            boost::container::static_vector<u8, MaxShaderBytecodeSize> data; //!< The shader bytecode in a statically allocated vector
+            std::array<u8, MaxShaderBytecodeSize> backing; //!< The backing storage for shader bytecode in a statically allocated array
+            span<u8> bytecode{}; //!< A span of the shader bytecode inside the backing storage
             std::shared_ptr<ShaderManager::ShaderProgram> program{};

             Shader(ShaderCompiler::Stage stage) : stage(stage) {}
@@ -914,10 +915,10 @@ namespace skyline::gpu::interconnect {

                 // If a shader is invalidated, we need to reparse the program (given that it has changed)
                 bool shouldParseShader{[&]() {
-                    if (!shader.data.empty() && shader.shouldCheckSame) {
+                    if (shader.bytecode.valid() && shader.shouldCheckSame) {
                         // A fast path to check if the shader is the same as before to avoid reparsing the shader
-                        auto newIovaRanges{channelCtx.asCtx->gmmu.TranslateRange(shaderBaseIova + shader.offset, shader.data.size())};
-                        auto originalShader{shader.data.data()};
+                        auto newIovaRanges{channelCtx.asCtx->gmmu.TranslateRange(shaderBaseIova + shader.offset, shader.bytecode.size())};
+                        auto originalShader{shader.bytecode.data()};

                         for (auto &range : newIovaRanges) {
                             if (range.data() && std::memcmp(range.data(), originalShader, range.size()) == 0) {
@@ -936,8 +937,7 @@ namespace skyline::gpu::interconnect {

                 if (shouldParseShader) {
                     // A pass to check if the shader has a BRA infloop opcode ending (On most commercial games)
-                    shader.data.resize(MaxShaderBytecodeSize);
-                    auto foundEnd{channelCtx.asCtx->gmmu.ReadTill(shader.data, shaderBaseIova + shader.offset, [](span<u8> data) -> std::optional<size_t> {
+                    shader.bytecode = channelCtx.asCtx->gmmu.ReadTill(shader.backing, shaderBaseIova + shader.offset, [](span<u8> data) -> std::optional<size_t> {
                         // We attempt to find the shader size by looking for "BRA $" (Infinite Loop) which is used as padding at the end of the shader
                         // UAM Shader Compiler Reference: https://github.com/devkitPro/uam/blob/5a5afc2bae8b55409ab36ba45be63fcb73f68993/source/compiler_iface.cpp#L319-L351
                         constexpr u64 BraSelf1{0xE2400FFFFF87000F}, BraSelf2{0xE2400FFFFF07000F};
@@ -950,9 +950,9 @@ namespace skyline::gpu::interconnect {
                             return static_cast<size_t>(std::distance(shaderInstructions.begin(), it)) * sizeof(u64);
                         }
                         return std::nullopt;
-                    })};
+                    });

-                    shader.program = gpu.shader.ParseGraphicsShader(shader.stage, shader.data, shader.offset, bindlessTextureConstantBufferIndex);
+                    shader.program = gpu.shader.ParseGraphicsShader(shader.stage, shader.bytecode, shader.offset, bindlessTextureConstantBufferIndex);

                     if (shader.stage != ShaderCompiler::Stage::VertexA && shader.stage != ShaderCompiler::Stage::VertexB) {
                        pipelineStage.program = shader.program;
@@ -963,13 +963,13 @@ namespace skyline::gpu::interconnect {
                        if (!vertexB.enabled)
                            throw exception("Enabling VertexA without VertexB is not supported");
                        else if (!vertexB.invalidated)
                            // If only VertexA is invalidated, we need to recombine here but we can defer it otherwise
-                            pipelineStage.program = gpu.shader.CombineVertexShaders(shader.program, vertexB.program, vertexB.data);
+                            pipelineStage.program = gpu.shader.CombineVertexShaders(shader.program, vertexB.program, vertexB.bytecode);
                    } else if (shader.stage == ShaderCompiler::Stage::VertexB) {
                        auto &vertexA{shaders[maxwell3d::ShaderStage::VertexA]};
                        if (vertexA.enabled)
                            // We need to combine the vertex shader stages if VertexA is enabled
-                            pipelineStage.program = gpu.shader.CombineVertexShaders(vertexA.program, shader.program, shader.data);
+                            pipelineStage.program = gpu.shader.CombineVertexShaders(vertexA.program, shader.program, shader.bytecode);
                        else
                            pipelineStage.program = shader.program;
                    }
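
As a companion to the diff above, the following is a minimal, self-contained
C++20 sketch of the pattern the patch adopts: a statically allocated backing
that is never cleared between parses, with a ReadTill-style helper returning a
span over only the bytes it actually wrote. This is an illustration, not the
emulator's implementation: the ReadTill below is a hypothetical single-buffer
stand-in (no GMMU, no block walking), it uses std::span rather than Skyline's
span type, and MaxShaderBytecodeSize, BraSelf, and the guest buffer are
made-up values.

#include <algorithm>
#include <array>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <optional>
#include <span>

using u8 = std::uint8_t;
using u64 = std::uint64_t;

constexpr std::size_t MaxShaderBytecodeSize{0x10000}; // Hypothetical cap, not Skyline's real constant
constexpr u64 BraSelf{0xE2400FFFFF87000F};            // One of the two "BRA $" encodings the patch matches

// Copies from `source` into the caller-owned backing until `function` reports an
// end offset, then returns a span over only the bytes actually written; the backing
// is never cleared, mirroring the patch's removal of the zero-initializing resize
template<typename Function>
std::span<u8> ReadTill(std::span<u8> backing, std::span<const u8> source, Function function) {
    std::size_t readSize{std::min(backing.size(), source.size())};
    auto end{function(source.first(readSize))}; // std::nullopt means "no terminator found"
    std::size_t copySize{end ? *end : readSize};
    std::memcpy(backing.data(), source.data(), copySize);
    return backing.first(copySize); // Only the valid prefix is ever exposed to callers
}

int main() {
    // Fake guest memory: two payload "instructions" followed by "BRA $" padding
    std::array<u64, 4> guest{0x1111111111111111, 0x2222222222222222, BraSelf, BraSelf};
    std::span<const u8> guestBytes{reinterpret_cast<const u8 *>(guest.data()), guest.size() * sizeof(u64)};

    static std::array<u8, MaxShaderBytecodeSize> backing; // Reused across parses, never zeroed per read

    auto bytecode{ReadTill(std::span<u8>{backing}, guestBytes, [](std::span<const u8> data) -> std::optional<std::size_t> {
        // End detection in the style of the patch: scan u64 instructions for the self-branch padding
        for (std::size_t offset{}; offset + sizeof(u64) <= data.size(); offset += sizeof(u64)) {
            u64 instruction;
            std::memcpy(&instruction, data.data() + offset, sizeof(u64));
            if (instruction == BraSelf)
                return offset; // The bytecode ends where the padding begins
        }
        return std::nullopt;
    })};

    std::cout << "bytecode size: " << bytecode.size() << " bytes\n"; // 16: the padding is excluded
}

The key property matches the commit message: because the callback reports where
the bytecode ends, only that prefix of the backing is ever written and read, so
the clear semantics that the per-parse resize imposed are simply unnecessary.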