Implement thread pool based async pipeline compilation with futures

By distributing the load of shader compilation across multiple threads and waiting for completion only when absolutely necessary, we can reduce compilation stutters significantly.
2025-02-16 21:49:14 +01:00 · 2022-12-03 19:57:00 +00:00 · 2022-12-03 19:57:00 +00:00 · 072b8193a1
commit 072b8193a1
parent 186549748d
6 changed files with 78 additions and 54 deletions
--- a/app/src/main/cpp/skyline/gpu/cache/graphics_pipeline_cache.cpp
+++ b/app/src/main/cpp/skyline/gpu/cache/graphics_pipeline_cache.cpp
@ -296,32 +296,9 @@ namespace skyline::gpu::cache {
        return lhs == rhs;
    }

-    GraphicsPipelineCache::PipelineCacheEntry::PipelineCacheEntry(vk::raii::DescriptorSetLayout &&descriptorSetLayout, vk::raii::PipelineLayout &&pipelineLayout, vk::raii::Pipeline &&pipeline) : descriptorSetLayout(std::move(descriptorSetLayout)), pipelineLayout(std::move(pipelineLayout)), pipeline(std::move(pipeline)) {}
-
-    GraphicsPipelineCache::CompiledPipeline::CompiledPipeline(const PipelineCacheEntry &entry) : descriptorSetLayout(*entry.descriptorSetLayout), pipelineLayout(*entry.pipelineLayout), pipeline(*entry.pipeline) {}
-
-    GraphicsPipelineCache::CompiledPipeline GraphicsPipelineCache::GetCompiledPipeline(const PipelineState &state, span<const vk::DescriptorSetLayoutBinding> layoutBindings, span<const vk::PushConstantRange> pushConstantRanges, bool noPushDescriptors) {
-        std::unique_lock lock(mutex);
-
-        auto it{pipelineCache.find(state)};
-        if (it != pipelineCache.end())
-            return CompiledPipeline{it->second};
-
-        lock.unlock();
-
-        vk::raii::DescriptorSetLayout descriptorSetLayout{gpu.vkDevice, vk::DescriptorSetLayoutCreateInfo{
-            .flags = vk::DescriptorSetLayoutCreateFlags{(!noPushDescriptors && gpu.traits.supportsPushDescriptors) ? vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR : vk::DescriptorSetLayoutCreateFlags{}},
-            .pBindings = layoutBindings.data(),
-            .bindingCount = static_cast<u32>(layoutBindings.size()),
-        }};
-
-        vk::raii::PipelineLayout pipelineLayout{gpu.vkDevice, vk::PipelineLayoutCreateInfo{
-            .pSetLayouts = &*descriptorSetLayout,
-            .setLayoutCount = 1,
-            .pPushConstantRanges = pushConstantRanges.data(),
-            .pushConstantRangeCount = static_cast<u32>(pushConstantRanges.size()),
-        }};
+    GraphicsPipelineCache::PipelineCacheEntry::PipelineCacheEntry(vk::raii::DescriptorSetLayout &&descriptorSetLayout, vk::raii::PipelineLayout &&pipelineLayout) : descriptorSetLayout{std::move(descriptorSetLayout)}, pipelineLayout{std::move(pipelineLayout)} {}

+    vk::raii::Pipeline GraphicsPipelineCache::BuildPipeline(const PipelineCacheKey &key, vk::PipelineLayout pipelineLayout) {
        boost::container::small_vector<vk::AttachmentDescription, 8> attachmentDescriptions;
        boost::container::small_vector<vk::AttachmentReference, 8> attachmentReferences;

@ -329,7 +306,7 @@ namespace skyline::gpu::cache {
            if (format != vk::Format::eUndefined) {
                attachmentDescriptions.push_back(vk::AttachmentDescription{
                    .format = format,
-                    .samples = state.sampleCount,
+                    .samples = key.sampleCount,
                    .loadOp = vk::AttachmentLoadOp::eLoad,
                    .storeOp = vk::AttachmentStoreOp::eStore,
                    .stencilLoadOp = vk::AttachmentLoadOp::eLoad,
@ -354,11 +331,11 @@ namespace skyline::gpu::cache {
            .pipelineBindPoint = vk::PipelineBindPoint::eGraphics,
        };

-        for (auto &colorAttachment : state.colorFormats)
+        for (auto &colorAttachment : key.colorFormats)
            pushAttachment(colorAttachment);

-        if (state.depthStencilFormat != vk::Format::eUndefined) {
-            pushAttachment(state.depthStencilFormat);
+        if (key.depthStencilFormat != vk::Format::eUndefined) {
+            pushAttachment(key.depthStencilFormat);

            subpassDescription.pColorAttachments = attachmentReferences.data();
            subpassDescription.colorAttachmentCount = static_cast<u32>(attachmentReferences.size() - 1);
@ -375,25 +352,48 @@ namespace skyline::gpu::cache {
            .pSubpasses = &subpassDescription,
        }};

-        auto pipeline{gpu.vkDevice.createGraphicsPipeline(vkPipelineCache, vk::GraphicsPipelineCreateInfo{
-            .pStages = state.shaderStages.data(),
-            .stageCount = static_cast<u32>(state.shaderStages.size()),
-            .pVertexInputState = &state.vertexState.get<vk::PipelineVertexInputStateCreateInfo>(),
-            .pInputAssemblyState = &state.inputAssemblyState,
-            .pViewportState = &state.viewportState,
-            .pRasterizationState = &state.rasterizationState.get<vk::PipelineRasterizationStateCreateInfo>(),
-            .pMultisampleState = &state.multisampleState,
-            .pDepthStencilState = &state.depthStencilState,
-            .pColorBlendState = &state.colorBlendState,
-            .pDynamicState = &state.dynamicState,
-            .layout = *pipelineLayout,
+        return gpu.vkDevice.createGraphicsPipeline(vkPipelineCache, vk::GraphicsPipelineCreateInfo{
+            .pStages = key.shaderStages.data(),
+            .stageCount = static_cast<u32>(key.shaderStages.size()),
+            .pVertexInputState = &key.vertexState.get<vk::PipelineVertexInputStateCreateInfo>(),
+            .pInputAssemblyState = &key.inputAssemblyState,
+            .pViewportState = &key.viewportState,
+            .pRasterizationState = &key.rasterizationState.get<vk::PipelineRasterizationStateCreateInfo>(),
+            .pMultisampleState = &key.multisampleState,
+            .pDepthStencilState = &key.depthStencilState,
+            .pColorBlendState = &key.colorBlendState,
+            .pDynamicState = &key.dynamicState,
+            .layout = pipelineLayout,
            .renderPass = *renderPass,
            .subpass = 0,
-        })};
+        });
+    }

-        lock.lock();
+    GraphicsPipelineCache::CompiledPipeline::CompiledPipeline(const PipelineCacheEntry &entry) : descriptorSetLayout{*entry.descriptorSetLayout}, pipelineLayout{*entry.pipelineLayout}, pipeline{*entry.pipeline} {}

-        auto pipelineEntryIt{pipelineCache.try_emplace(PipelineCacheKey{state}, std::move(descriptorSetLayout), std::move(pipelineLayout), std::move(pipeline))};
+    GraphicsPipelineCache::CompiledPipeline GraphicsPipelineCache::GetCompiledPipeline(const PipelineState &state, span<const vk::DescriptorSetLayoutBinding> layoutBindings, span<const vk::PushConstantRange> pushConstantRanges, bool noPushDescriptors) {
+        std::unique_lock lock(mutex);
+
+        auto it{pipelineCache.find(state)};
+        if (it != pipelineCache.end())
+            return CompiledPipeline{it->second};
+
+        vk::raii::DescriptorSetLayout descriptorSetLayout{gpu.vkDevice, vk::DescriptorSetLayoutCreateInfo{
+            .flags = vk::DescriptorSetLayoutCreateFlags{(!noPushDescriptors && gpu.traits.supportsPushDescriptors) ? vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR : vk::DescriptorSetLayoutCreateFlags{}},
+            .pBindings = layoutBindings.data(),
+            .bindingCount = static_cast<u32>(layoutBindings.size()),
+        }};
+
+        vk::raii::PipelineLayout pipelineLayout{gpu.vkDevice, vk::PipelineLayoutCreateInfo{
+            .pSetLayouts = &*descriptorSetLayout,
+            .setLayoutCount = 1,
+            .pPushConstantRanges = pushConstantRanges.data(),
+            .pushConstantRangeCount = static_cast<u32>(pushConstantRanges.size()),
+        }};
+
+        auto pipelineEntryIt{pipelineCache.try_emplace(PipelineCacheKey{state}, std::move(descriptorSetLayout), std::move(pipelineLayout))};
+        auto pipelineFuture{pool.submit(&GraphicsPipelineCache::BuildPipeline, this, std::ref(pipelineEntryIt.first->first), std::ref(*pipelineEntryIt.first->second.pipelineLayout))};
+        pipelineEntryIt.first->second.pipeline = pipelineFuture.share();
        return CompiledPipeline{pipelineEntryIt.first->second};
    }
 }
--- a/app/src/main/cpp/skyline/gpu/cache/graphics_pipeline_cache.h
+++ b/app/src/main/cpp/skyline/gpu/cache/graphics_pipeline_cache.h
@ -3,6 +3,8 @@

 #pragma once

+#include <future>
+#include <BS_thread_pool.hpp>
 #include <vulkan/vulkan_raii.hpp>

 namespace skyline::gpu {
@ -136,20 +138,23 @@ namespace skyline::gpu::cache {
        struct PipelineCacheEntry {
            vk::raii::DescriptorSetLayout descriptorSetLayout;
            vk::raii::PipelineLayout pipelineLayout;
-            vk::raii::Pipeline pipeline;
+            std::optional<std::shared_future<vk::raii::Pipeline>> pipeline;

-            PipelineCacheEntry(vk::raii::DescriptorSetLayout&& descriptorSetLayout, vk::raii::PipelineLayout &&layout, vk::raii::Pipeline &&pipeline);
+            PipelineCacheEntry(vk::raii::DescriptorSetLayout&& descriptorSetLayout, vk::raii::PipelineLayout &&layout);
        };

+        BS::thread_pool pool;
        std::unordered_map<PipelineCacheKey, PipelineCacheEntry, PipelineStateHash, PipelineCacheEqual> pipelineCache;

+        vk::raii::Pipeline BuildPipeline(const PipelineCacheKey &key, vk::PipelineLayout pipelineLayout);
+
      public:
        GraphicsPipelineCache(GPU &gpu);

        struct CompiledPipeline {
            vk::DescriptorSetLayout descriptorSetLayout;
            vk::PipelineLayout pipelineLayout;
-            vk::Pipeline pipeline;
+            std::shared_future<vk::raii::Pipeline> pipeline;

            CompiledPipeline(const PipelineCacheEntry &entry);
        };
--- a/app/src/main/cpp/skyline/gpu/interconnect/common/state_updater.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/common/state_updater.h
@ -3,6 +3,7 @@

 #pragma once

+#include <future>
 #include <gpu/interconnect/command_executor.h>
 #include "common.h"

@ -261,6 +262,16 @@ namespace skyline::gpu::interconnect {
    };
    using SetPipelineCmd = CmdHolder<SetPipelineCmdImpl>;

+    struct SetPipelineFutureCmdImpl {
+        void Record(GPU &gpu, vk::raii::CommandBuffer &commandBuffer) {
+            commandBuffer.bindPipeline(bindPoint, *pipeline.get());
+        }
+
+        std::shared_future<vk::raii::Pipeline> pipeline;
+        vk::PipelineBindPoint bindPoint;
+    };
+    using SetPipelineFutureCmd = CmdHolder<SetPipelineFutureCmdImpl>;
+
    /**
     * @brief Single-use helper for recording a batch of state updates into a command buffer
     */
@ -471,6 +482,14 @@ namespace skyline::gpu::interconnect {
                });
        }

+        void SetPipeline(const std::shared_future<vk::raii::Pipeline> &pipeline, vk::PipelineBindPoint bindPoint) {
+            AppendCmd<SetPipelineFutureCmd>(
+                {
+                    .pipeline = pipeline,
+                    .bindPoint = bindPoint,
+                });
+        }
+
        void SetDescriptorSetWithPush(DescriptorUpdateInfo *updateInfo) {
            AppendCmd<SetDescriptorSetWithPushCmd>(
                {
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/packed_pipeline_state.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/packed_pipeline_state.cpp
@ -205,7 +205,7 @@ namespace skyline::gpu::interconnect::maxwell3d {

    size_t PackedPipelineState::GetColorRenderTargetCount() const {
        for (size_t i{engine::ColorTargetCount}; i > 0 ; i--)
-            if (IsColorRenderTargetEnabled(i - 1))
+            if (IsColorRenderTargetEnabled(ctSelect[i - 1]))
                return i;

        return 0;
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp
@ -530,7 +530,7 @@ namespace skyline::gpu::interconnect::maxwell3d {

        for (u32 i{}; i < packedState.GetColorRenderTargetCount(); i++) {
            attachmentBlendStates.push_back(packedState.GetAttachmentBlendState(i));
-            texture::Format format{packedState.GetColorRenderTargetFormat(i)};
+            texture::Format format{packedState.GetColorRenderTargetFormat(packedState.ctSelect[i])};
            colorAttachmentFormats.push_back(format ? format->vkFormat : vk::Format::eUndefined);
        }

@ -595,10 +595,10 @@ namespace skyline::gpu::interconnect::maxwell3d {
    }

    Pipeline::Pipeline(InterconnectContext &ctx, const PipelineStateAccessor &accessor, const PackedPipelineState &packedState)
-        : shaderStages{MakePipelineShaders(ctx, accessor, packedState)},
+        : sourcePackedState{packedState},
+          shaderStages{MakePipelineShaders(ctx, accessor, sourcePackedState)},
          descriptorInfo{MakePipelineDescriptorInfo(shaderStages, ctx.gpu.traits.quirks.needsIndividualTextureBindingWrites)},
-          compiledPipeline{MakeCompiledPipeline(ctx, packedState, shaderStages, descriptorInfo.descriptorSetLayoutBindings)},
-          sourcePackedState{packedState} {
+          compiledPipeline{MakeCompiledPipeline(ctx, sourcePackedState, shaderStages, descriptorInfo.descriptorSetLayoutBindings)} {
        storageBufferViews.resize(descriptorInfo.totalStorageBufferCount);
    }

--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h
@ -82,6 +82,8 @@ namespace skyline::gpu::interconnect::maxwell3d {
            u32 totalImageDescCount;
        };

+        PackedPipelineState sourcePackedState;
+
      private:
        std::vector<CachedMappedBufferView> storageBufferViews;
        u32 lastExecutionNumber{}; //!< The last execution number this pipeline was used at
@ -99,8 +101,6 @@ namespace skyline::gpu::interconnect::maxwell3d {
        cache::GraphicsPipelineCache::CompiledPipeline compiledPipeline;
        size_t sampledImageCount{};

-        PackedPipelineState sourcePackedState;
-
        Pipeline(InterconnectContext &ctx, const PipelineStateAccessor &accessor, const PackedPipelineState &packedState);

        Pipeline *LookupNext(const PackedPipelineState &packedState);