Implement the Kepler compute engine

This reuses a fair bit of the now-commonised Maxwell 3D code and mostly consists of compute-specific pipeline code that was deemed unsuitable for commonisation (e.g. the descriptor update code is somewhat duplicated). Of note is how compute lacks any active state at all due to its use of QMDs, which bundle up all state into a single object in memory (see the sketch below the file stats).
This commit is contained in:
Billy Laws 2022-11-18 21:51:00 +00:00
parent 4bc81f007f
commit bf03f945ee
13 changed files with 602 additions and 2 deletions
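To make the QMD point above concrete, here is a rough stand-in sketch of the kind of state a QMD carries. The field names follow what the new interconnect code reads out of the QMD (programOffset, ctaThreadDimension*, ctaRaster*, constantBufferValid and friends), but the types, widths and layout here are illustrative guesses; the real definition is the packed bitfield struct in soc/gm20b/engines/kepler_compute/qmd.h.

#include <array>
#include <cstdint>

using u32 = std::uint32_t;
using u64 = std::uint64_t;

// Illustrative stand-in for the real QMD bitfield struct; field names mirror what
// Dispatch() and PipelineState::Update() read in the diffs below, while the types
// and layout are simplified guesses rather than the actual packed definition.
struct SimplifiedQmd {
    u64 programOffset;                           // Compute shader offset within the program region
    std::array<u32, 3> ctaThreadDimension;       // Workgroup (CTA) size
    std::array<u32, 3> ctaRasterSize;            // Dispatch grid size (width/height/depth)
    u32 shaderLocalMemoryLowSize, shaderLocalMemoryHighSize;
    u32 sharedMemorySize;
    u32 constantBufferValid;                     // Bitmask of valid constant buffer slots
    u32 bindlessTextureConstantBufferSlotSelect;
};

// The engine hands the whole QMD to the interconnect in one call; there is no
// persistent register state that needs to be kept in sync between dispatches.
void Dispatch(const SimplifiedQmd &qmd);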

View File

@@ -3,6 +3,7 @@
#pragma once
#include <tsl/robin_map.h>
#include "common.h"
namespace skyline::gpu::interconnect {

View File

@@ -0,0 +1,11 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <soc/gm20b/engines/kepler_compute/qmd.h>
#include <gpu/interconnect/common/common.h>
namespace skyline::gpu::interconnect::kepler_compute {
using QMD = skyline::soc::gm20b::engine::kepler_compute::QMD;
}

View File

@@ -0,0 +1,22 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include "constant_buffers.h"
namespace skyline::gpu::interconnect::kepler_compute {
void ConstantBuffers::Update(InterconnectContext &ctx, const QMD &qmd) {
for (u32 i{}; i < QMD::ConstantBufferCount; i++) {
if (qmd.constantBufferValid & (1U << i)) {
auto &buffer{cachedBuffers[i]};
const auto &qmdBuffer{qmd.constantBuffer[i]};
buffer.Update(ctx, qmdBuffer.Address(), qmdBuffer.size);
boundConstantBuffers[i] = {*buffer};
}
}
}
void ConstantBuffers::MarkAllDirty() {
for (auto &buffer : cachedBuffers)
buffer.PurgeCaches();
}
}

View File

@@ -0,0 +1,26 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <soc/gm20b/engines/kepler_compute/qmd.h>
#include "common.h"
namespace skyline::gpu::interconnect::kepler_compute {
using ConstantBufferSet = std::array<ConstantBuffer, QMD::ConstantBufferCount>;
/**
* @brief Abstracts out QMD constant buffer creation
*/
struct ConstantBuffers {
private:
std::array<CachedMappedBufferView, QMD::ConstantBufferCount> cachedBuffers;
public:
ConstantBufferSet boundConstantBuffers{}; //!< The currently active set of constant buffers from the QMD
void Update(InterconnectContext &ctx, const QMD &qmd);
void MarkAllDirty();
};
}

View File

@@ -0,0 +1,67 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/Ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu/interconnect/command_executor.h>
#include <gpu/interconnect/common/state_updater.h>
#include <soc/gm20b/channel.h>
#include "pipeline_state.h"
#include "kepler_compute.h"
namespace skyline::gpu::interconnect::kepler_compute {
KeplerCompute::KeplerCompute(GPU &gpu,
soc::gm20b::ChannelContext &channelCtx,
nce::NCE &nce,
kernel::MemoryManager &memoryManager,
DirtyManager &manager,
const EngineRegisterBundle &registerBundle)
: ctx{channelCtx, channelCtx.executor, gpu, nce, memoryManager},
pipelineState{manager, registerBundle.pipelineStateRegisters},
samplers{manager, registerBundle.samplerPoolRegisters},
textures{manager, registerBundle.texturePoolRegisters} {
ctx.executor.AddFlushCallback([this] {
pipelineState.PurgeCaches();
constantBuffers.MarkAllDirty();
samplers.MarkAllDirty();
textures.MarkAllDirty();
});
}
void KeplerCompute::Dispatch(const QMD &qmd) {
StateUpdateBuilder builder{*ctx.executor.allocator};
constantBuffers.Update(ctx, qmd);
samplers.Update(ctx, qmd.samplerIndex == soc::gm20b::engine::kepler_compute::QMD::SamplerIndex::ViaHeaderIndex);
auto *pipeline{pipelineState.Update(ctx, builder, textures, constantBuffers.boundConstantBuffers, qmd)};
auto *descUpdateInfo{pipeline->SyncDescriptors(ctx, constantBuffers.boundConstantBuffers, samplers, textures)};
builder.SetPipeline(*pipeline->compiledPipeline.pipeline, vk::PipelineBindPoint::eCompute);
if (ctx.gpu.traits.supportsPushDescriptors) {
builder.SetDescriptorSetWithPush(descUpdateInfo);
} else {
auto set{std::make_shared<DescriptorAllocator::ActiveDescriptorSet>(ctx.gpu.descriptor.AllocateSet(descUpdateInfo->descriptorSetLayout))};
builder.SetDescriptorSetWithUpdate(descUpdateInfo, set.get(), nullptr);
ctx.executor.AttachDependency(set);
}
auto stateUpdater{builder.Build()};
/**
* @brief Struct that can be linearly allocated, holding all state for the dispatch to avoid a dynamic allocation with lambda captures
*/
struct DrawParams {
StateUpdater stateUpdater;
std::array<u32, 3> dimensions;
};
auto *drawParams{ctx.executor.allocator->EmplaceUntracked<DrawParams>(DrawParams{stateUpdater, {qmd.ctaRasterWidth, qmd.ctaRasterHeight, qmd.ctaRasterDepth}})};
ctx.executor.AddOutsideRpCommand([drawParams](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu) {
drawParams->stateUpdater.RecordAll(gpu, commandBuffer);
commandBuffer.dispatch(drawParams->dimensions[0], drawParams->dimensions[1], drawParams->dimensions[2]);
});
}
}
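A side note on the DrawParams pattern used in Dispatch above: the per-dispatch state is placed in the executor's linear allocator and only a raw pointer is captured by the recorded callback, so queueing a dispatch never needs a heap allocation for captured state. Below is a minimal, self-contained sketch of that pattern; LinearAllocator, DispatchParams and RecordDispatch are stand-ins invented for illustration, not Skyline types.

#include <array>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <new>
#include <utility>
#include <vector>

using u32 = std::uint32_t;

// Stand-in for the executor's per-execution linear allocator (the real one is
// ctx.executor.allocator): memory is handed out by bumping an offset and is
// reclaimed wholesale once the execution has been submitted.
struct LinearAllocator {
    std::vector<std::byte> arena = std::vector<std::byte>(1U << 20);
    std::size_t offset{};

    template<typename T, typename... Args>
    T *EmplaceUntracked(Args &&...args) {
        auto base{reinterpret_cast<std::uintptr_t>(arena.data()) + offset};
        auto aligned{(base + alignof(T) - 1) & ~static_cast<std::uintptr_t>(alignof(T) - 1)};
        offset = aligned - reinterpret_cast<std::uintptr_t>(arena.data()) + sizeof(T);
        // Suitable for trivially destructible payloads only; nothing is ever destroyed individually
        return new (reinterpret_cast<void *>(aligned)) T{std::forward<Args>(args)...};
    }
};

struct DispatchParams {
    std::array<u32, 3> dimensions;
};

// Capturing just the pointer keeps the callback's capture trivially small, and the
// pointed-to state outlives the callback because it lives in the execution-scoped arena.
void RecordDispatch(LinearAllocator &allocator, std::vector<std::function<void()>> &commands) {
    auto *params{allocator.EmplaceUntracked<DispatchParams>(DispatchParams{{8, 8, 1}})};
    commands.emplace_back([params] {
        // The real callback records: commandBuffer.dispatch(params->dimensions[0], [1], [2])
    });
}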

View File

@@ -0,0 +1,47 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <gpu/descriptor_allocator.h>
#include <gpu/interconnect/common/samplers.h>
#include <gpu/interconnect/common/textures.h>
#include "constant_buffers.h"
#include "pipeline_state.h"
namespace skyline::gpu::interconnect::kepler_compute {
/**
* @brief The core Kepler Compute interconnect object, directly accessed by the engine code to perform compute dispatches
*/
class KeplerCompute {
public:
/**
* @brief The full set of register state used by the GPU interconnect
*/
struct EngineRegisterBundle {
PipelineState::EngineRegisters pipelineStateRegisters;
SamplerPoolState::EngineRegisters samplerPoolRegisters;
TexturePoolState::EngineRegisters texturePoolRegisters;
};
private:
InterconnectContext ctx;
PipelineState pipelineState;
ConstantBuffers constantBuffers;
Samplers samplers;
Textures textures;
public:
KeplerCompute(GPU &gpu,
soc::gm20b::ChannelContext &channelCtx,
nce::NCE &nce,
kernel::MemoryManager &memoryManager,
DirtyManager &manager,
const EngineRegisterBundle &registerBundle);
/**
* @brief Performs a compute dispatch using the given QMD
*/
void Dispatch(const QMD &qmd);
};
}

View File

@@ -0,0 +1,21 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <common.h>
namespace skyline::gpu::interconnect::kepler_compute {
/**
* @brief Packed struct of pipeline state suitable for use as a map key
*/
struct PackedPipelineState {
u64 shaderHash;
std::array<u32, 3> dimensions;
u32 localMemorySize;
u32 sharedMemorySize;
u32 bindlessTextureConstantBufferSlotSelect;
bool operator==(const PackedPipelineState &) const = default;
};
}
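This struct doubles as the key of the pipeline cache's tsl::robin_map (hashed with util::ObjectHash, declared later in this commit), which is why it has to be tightly packed: a byte-wise hash over the object representation must not pick up indeterminate padding bytes. ObjectHash itself isn't part of this diff, so the hasher below is only an illustrative sketch of that style of hash, not Skyline's actual implementation.

#include <cstddef>
#include <functional>
#include <string_view>
#include <type_traits>

// Illustrative byte-wise hasher for a trivially-copyable, padding-free key struct;
// util::ObjectHash may well differ, this only demonstrates why the key needs a
// unique object representation (i.e. no padding bytes leaking into the hash).
template<typename T>
struct ByteHash {
    static_assert(std::has_unique_object_representations_v<T>, "key must not contain padding");

    std::size_t operator()(const T &object) const noexcept {
        return std::hash<std::string_view>{}(
            std::string_view{reinterpret_cast<const char *>(&object), sizeof(T)});
    }
};

// Usage analogous to the PipelineManager map declared later in this commit:
//   tsl::robin_map<PackedPipelineState, std::unique_ptr<Pipeline>, ByteHash<PackedPipelineState>> map;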

View File

@@ -0,0 +1,206 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu/texture/texture.h>
#include <gpu/interconnect/command_executor.h>
#include <gpu/interconnect/common/pipeline.inc>
#include <gpu/shader_manager.h>
#include <gpu.h>
#include "pipeline_manager.h"
namespace skyline::gpu::interconnect::kepler_compute {
static Pipeline::ShaderStage MakePipelineShader(InterconnectContext &ctx, Textures &textures, ConstantBufferSet &constantBuffers, const PackedPipelineState &packedState, const ShaderBinary &shaderBinary) {
ctx.gpu.shader.ResetPools();
auto program{ctx.gpu.shader.ParseComputeShader(
shaderBinary.binary, shaderBinary.baseOffset,
packedState.bindlessTextureConstantBufferSlotSelect,
packedState.localMemorySize, packedState.sharedMemorySize,
packedState.dimensions,
[&](u32 index, u32 offset) {
return constantBuffers[index].Read<int>(ctx.executor, offset);
}, [&](u32 index) {
return textures.GetTextureType(ctx, BindlessHandle{ .raw = index }.textureIndex);
})};
Shader::Backend::Bindings bindings{};
return {ctx.gpu.shader.CompileShader({}, program, bindings), program.info};
}
static Pipeline::DescriptorInfo MakePipelineDescriptorInfo(const Pipeline::ShaderStage &stage) {
Pipeline::DescriptorInfo descriptorInfo{};
u32 bindingIndex{};
auto pushBindings{[&](vk::DescriptorType type, const auto &descs, u32 &count) {
descriptorInfo.totalWriteDescCount += descs.size();
for (u32 descIdx{}; descIdx < descs.size(); descIdx++) {
const auto &desc{descs[descIdx]};
count += desc.count;
descriptorInfo.descriptorSetLayoutBindings.push_back(vk::DescriptorSetLayoutBinding{
.binding = bindingIndex++,
.descriptorType = type,
.descriptorCount = desc.count,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
});
}
}};
pushBindings(vk::DescriptorType::eUniformBuffer, stage.info.constant_buffer_descriptors, descriptorInfo.totalBufferDescCount);
pushBindings(vk::DescriptorType::eStorageBuffer, stage.info.storage_buffers_descriptors, descriptorInfo.totalBufferDescCount);
pushBindings(vk::DescriptorType::eUniformTexelBuffer, stage.info.texture_buffer_descriptors, descriptorInfo.totalTexelBufferDescCount);
pushBindings(vk::DescriptorType::eStorageTexelBuffer, stage.info.image_buffer_descriptors, descriptorInfo.totalTexelBufferDescCount);
if (descriptorInfo.totalTexelBufferDescCount > 0)
Logger::Warn("Image buffer descriptors are not supported");
pushBindings(vk::DescriptorType::eCombinedImageSampler, stage.info.texture_descriptors, descriptorInfo.totalImageDescCount);
pushBindings(vk::DescriptorType::eStorageImage, stage.info.image_descriptors, descriptorInfo.totalImageDescCount);
if (stage.info.image_descriptors.size() > 0)
Logger::Warn("Image descriptors are not supported");
return descriptorInfo;
}
static Pipeline::CompiledPipeline MakeCompiledPipeline(InterconnectContext &ctx,
const PackedPipelineState &packedState,
const Pipeline::ShaderStage &shaderStage,
span<vk::DescriptorSetLayoutBinding> layoutBindings) {
vk::raii::DescriptorSetLayout descriptorSetLayout{ctx.gpu.vkDevice, vk::DescriptorSetLayoutCreateInfo{
.flags = vk::DescriptorSetLayoutCreateFlags{ctx.gpu.traits.supportsPushDescriptors ? vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR : vk::DescriptorSetLayoutCreateFlags{}},
.pBindings = layoutBindings.data(),
.bindingCount = static_cast<u32>(layoutBindings.size()),
}};
vk::raii::PipelineLayout pipelineLayout{ctx.gpu.vkDevice, vk::PipelineLayoutCreateInfo{
.pSetLayouts = &*descriptorSetLayout,
.setLayoutCount = 1,
}};
vk::PipelineShaderStageCreateInfo shaderStageInfo{
.stage = vk::ShaderStageFlagBits::eCompute,
.module = &*shaderStage.module,
.pName = "main"
};
vk::ComputePipelineCreateInfo pipelineInfo{
.stage = shaderStageInfo,
.layout = *pipelineLayout,
};
vk::raii::Pipeline pipeline{ctx.gpu.vkDevice, nullptr, pipelineInfo};
return Pipeline::CompiledPipeline{
.pipeline = std::move(pipeline),
.pipelineLayout = std::move(pipelineLayout),
.descriptorSetLayout = std::move(descriptorSetLayout),
};
}
Pipeline::Pipeline(InterconnectContext &ctx, Textures &textures, ConstantBufferSet &constantBuffers, const PackedPipelineState &packedState, const ShaderBinary &shaderBinary)
: shaderStage{MakePipelineShader(ctx, textures, constantBuffers, packedState, shaderBinary)},
descriptorInfo{MakePipelineDescriptorInfo(shaderStage)},
compiledPipeline{MakeCompiledPipeline(ctx, packedState, shaderStage, descriptorInfo.descriptorSetLayoutBindings)},
sourcePackedState{packedState} {
storageBufferViews.resize(shaderStage.info.storage_buffers_descriptors.size());
}
void Pipeline::SyncCachedStorageBufferViews(u32 executionNumber) {
if (lastExecutionNumber != executionNumber) {
for (auto &view : storageBufferViews)
view.PurgeCaches();
lastExecutionNumber = executionNumber;
}
}
DescriptorUpdateInfo *Pipeline::SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures) {
SyncCachedStorageBufferViews(ctx.executor.executionNumber);
u32 writeIdx{};
auto writes{ctx.executor.allocator->AllocateUntracked<vk::WriteDescriptorSet>(descriptorInfo.totalWriteDescCount)};
u32 bufferIdx{};
auto bufferDescs{ctx.executor.allocator->AllocateUntracked<vk::DescriptorBufferInfo>(descriptorInfo.totalBufferDescCount)};
auto bufferDescDynamicBindings{ctx.executor.allocator->AllocateUntracked<DynamicBufferBinding>(descriptorInfo.totalBufferDescCount)};
u32 imageIdx{};
auto imageDescs{ctx.executor.allocator->AllocateUntracked<vk::DescriptorImageInfo>(descriptorInfo.totalImageDescCount)};
u32 storageBufferIdx{};
u32 bindingIdx{};
/**
* @brief Adds descriptor writes for a single Vulkan descriptor type that uses buffer descriptors
* @param getBufferCb Callback returning the buffer binding for a given descriptor and array index
*/
auto writeBufferDescs{[&](vk::DescriptorType type, const auto &descs, auto getBufferCb) {
if (!descs.empty()) {
// The underlying buffer bindings will be resolved from the dynamic ones during recording
for (const auto &desc : descs) {
writes[writeIdx++] = {
.dstBinding = bindingIdx++,
.descriptorCount = desc.count,
.descriptorType = type,
.pBufferInfo = &bufferDescs[bufferIdx],
};
for (u32 arrayIdx{}; arrayIdx < desc.count; arrayIdx++)
bufferDescDynamicBindings[bufferIdx++] = getBufferCb(desc, arrayIdx);
}
}
}};
auto writeImageDescs{[&](vk::DescriptorType type, const auto &descs, auto getTextureCb) {
if (!descs.empty()) {
for (const auto &desc : descs) {
writes[writeIdx++] = {
.dstBinding = bindingIdx++,
.descriptorCount = desc.count,
.descriptorType = type,
.pImageInfo = &imageDescs[imageIdx],
};
for (u32 arrayIdx{}; arrayIdx < desc.count; arrayIdx++)
imageDescs[imageIdx++] = getTextureCb(desc, arrayIdx);
}
}
}};
writeBufferDescs(vk::DescriptorType::eUniformBuffer, shaderStage.info.constant_buffer_descriptors,
[&](const Shader::ConstantBufferDescriptor &desc, size_t arrayIdx) {
size_t cbufIdx{desc.index + arrayIdx};
return GetConstantBufferBinding(ctx, shaderStage.info, constantBuffers[cbufIdx].view, cbufIdx);
});
writeBufferDescs(vk::DescriptorType::eStorageBuffer, shaderStage.info.storage_buffers_descriptors,
[&](const Shader::StorageBufferDescriptor &desc, size_t arrayIdx) {
auto binding{GetStorageBufferBinding(ctx, desc, constantBuffers[desc.cbuf_index], storageBufferViews[storageBufferIdx])};
storageBufferIdx += arrayIdx ? 0 : 1;
return binding;
});
writeImageDescs(vk::DescriptorType::eCombinedImageSampler, shaderStage.info.texture_descriptors,
[&](const Shader::TextureDescriptor &desc, size_t arrayIdx) {
BindlessHandle handle{ReadBindlessHandle(ctx, constantBuffers, desc, arrayIdx)};
auto binding{GetTextureBinding(ctx, desc, samplers, textures, handle)};
return binding.first;
});
// Since we don't implement all descriptor types the number of writes might not match what's expected
if (!writeIdx)
return nullptr;
return ctx.executor.allocator->EmplaceUntracked<DescriptorUpdateInfo>(DescriptorUpdateInfo{
.writes = writes.first(writeIdx),
.bufferDescs = bufferDescs.first(bufferIdx),
.bufferDescDynamicBindings = bufferDescDynamicBindings.first(bufferIdx),
.pipelineLayout = *compiledPipeline.pipelineLayout,
.descriptorSetLayout = *compiledPipeline.descriptorSetLayout,
.bindPoint = vk::PipelineBindPoint::eCompute,
.descriptorSetIndex = 0,
});
}
}

View File

@@ -0,0 +1,75 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <tsl/robin_map.h>
#include <shader_compiler/frontend/ir/program.h>
#include <gpu/interconnect/common/samplers.h>
#include <gpu/interconnect/common/textures.h>
#include "packed_pipeline_state.h"
#include "constant_buffers.h"
namespace skyline::gpu {
class TextureView;
}
namespace skyline::gpu::interconnect::kepler_compute {
class Pipeline {
public:
struct ShaderStage {
vk::ShaderModule module;
Shader::Info info;
};
struct DescriptorInfo {
std::vector<vk::DescriptorSetLayoutBinding> descriptorSetLayoutBindings;
u32 totalWriteDescCount;
u32 totalBufferDescCount;
u32 totalTexelBufferDescCount;
u32 totalImageDescCount;
};
struct CompiledPipeline {
vk::raii::DescriptorSetLayout descriptorSetLayout;
vk::raii::PipelineLayout pipelineLayout;
vk::raii::Pipeline pipeline;
};
private:
ShaderStage shaderStage;
DescriptorInfo descriptorInfo;
std::vector<CachedMappedBufferView> storageBufferViews;
u32 lastExecutionNumber{}; //!< The last execution number this pipeline was used at
void SyncCachedStorageBufferViews(u32 executionNumber);
public:
CompiledPipeline compiledPipeline;
PackedPipelineState sourcePackedState;
Pipeline(InterconnectContext &ctx, Textures &textures, ConstantBufferSet &constantBuffers, const PackedPipelineState &packedState, const ShaderBinary &shaderBinary);
/**
* @brief Creates a descriptor set update from the current GPU state
*/
DescriptorUpdateInfo *SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures);
};
class PipelineManager {
private:
tsl::robin_map<PackedPipelineState, std::unique_ptr<Pipeline>, util::ObjectHash<PackedPipelineState>> map;
public:
Pipeline *FindOrCreate(InterconnectContext &ctx, Textures &textures, ConstantBufferSet &constantBuffers, const PackedPipelineState &packedState, const ShaderBinary &shaderBinary) {
auto it{map.find(packedState)};
if (it != map.end())
return it->second.get();
return map.emplace(packedState, std::make_unique<Pipeline>(ctx, textures, constantBuffers, packedState, shaderBinary)).first->second.get();
}
};
}

View File

@@ -0,0 +1,47 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu.h>
#include "pipeline_state.h"
namespace skyline::gpu::interconnect::kepler_compute {
/* Pipeline Stage */
void PipelineStageState::EngineRegisters::DirtyBind(DirtyManager &manager, dirty::Handle handle) const {
manager.Bind(handle, programRegion);
}
PipelineStageState::PipelineStageState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine)
: engine{manager, dirtyHandle, engine} {}
void PipelineStageState::Flush(InterconnectContext &ctx, u32 programOffset) {
binary = cache.Lookup(ctx, engine->programRegion, programOffset);
}
bool PipelineStageState::Refresh(InterconnectContext &ctx, u32 programOffset) {
return cache.Refresh(ctx, engine->programRegion, programOffset);
}
void PipelineStageState::PurgeCaches() {
cache.PurgeCaches();
}
/* Pipeline State */
PipelineState::PipelineState(DirtyManager &manager, const EngineRegisters &engine)
: pipelineStage{manager, engine.pipelineStageRegisters},
bindlessTexture{engine.bindlessTexture} {}
Pipeline *PipelineState::Update(InterconnectContext &ctx, StateUpdateBuilder &builder, Textures &textures, ConstantBufferSet &constantBuffers, const QMD &qmd) {
const auto &stage{pipelineStage.UpdateGet(ctx, qmd.programOffset)};
packedState.shaderHash = stage.binary.hash;
packedState.dimensions = {qmd.ctaThreadDimension0, qmd.ctaThreadDimension1, qmd.ctaThreadDimension2};
packedState.localMemorySize = qmd.shaderLocalMemoryLowSize + qmd.shaderLocalMemoryHighSize;
packedState.sharedMemorySize = qmd.sharedMemorySize;
packedState.bindlessTextureConstantBufferSlotSelect = bindlessTexture.constantBufferSlotSelect;
return pipelineManager.FindOrCreate(ctx, textures, constantBuffers, packedState, stage.binary);
}
void PipelineState::PurgeCaches() {
pipelineStage.MarkDirty(true);
}
}

View File

@ -0,0 +1,61 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <common.h>
#include <gpu/texture/texture.h>
#include <gpu/interconnect/common/shader_cache.h>
#include "common.h"
#include "packed_pipeline_state.h"
#include "pipeline_manager.h"
namespace skyline::gpu::interconnect::kepler_compute {
class PipelineStageState : dirty::RefreshableManualDirty, dirty::CachedManualDirty {
public:
struct EngineRegisters {
const soc::gm20b::engine::Address &programRegion;
void DirtyBind(DirtyManager &manager, dirty::Handle handle) const;
};
private:
dirty::BoundSubresource<EngineRegisters> engine;
ShaderCache cache;
public:
ShaderBinary binary;
PipelineStageState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine);
void Flush(InterconnectContext &ctx, u32 programOffset);
bool Refresh(InterconnectContext &ctx, u32 programOffset);
void PurgeCaches();
};
class PipelineState {
public:
struct EngineRegisters {
PipelineStageState::EngineRegisters pipelineStageRegisters;
const engine_common::BindlessTexture &bindlessTexture;
};
private:
dirty::ManualDirtyState<PipelineStageState> pipelineStage;
const engine_common::BindlessTexture &bindlessTexture;
PackedPipelineState packedState{};
PipelineManager pipelineManager;
public:
PipelineState(DirtyManager &manager, const EngineRegisters &engine);
Pipeline *Update(InterconnectContext &ctx, StateUpdateBuilder &builder, Textures &textures, ConstantBufferSet &constantBuffers, const QMD &qmd);
void PurgeCaches();
};
}

View File

@@ -7,8 +7,20 @@
#include "kepler_compute.h"
namespace skyline::soc::gm20b::engine {
static gpu::interconnect::kepler_compute::KeplerCompute::EngineRegisterBundle MakeEngineRegisters(const KeplerCompute::Registers &registers) {
return {
.pipelineStateRegisters = {*registers.programRegion, *registers.bindlessTexture},
.samplerPoolRegisters = {*registers.texSamplerPool, *registers.texHeaderPool},
.texturePoolRegisters = {*registers.texHeaderPool}
};
}
KeplerCompute::KeplerCompute(const DeviceState &state, ChannelContext &channelCtx)
: syncpoints{state.soc->host1x.syncpoints}, i2m{state, channelCtx} {}
: syncpoints{state.soc->host1x.syncpoints},
channelCtx{channelCtx},
i2m{state, channelCtx},
dirtyManager{registers},
interconnect{*state.gpu, channelCtx, *state.nce, state.process->memory, dirtyManager, MakeEngineRegisters(registers)} {}
__attribute__((always_inline)) void KeplerCompute::CallMethod(u32 method, u32 argument) {
Logger::Verbose("Called method in Kepler compute: 0x{:X} args: 0x{:X}", method, argument);
@@ -27,7 +39,7 @@ namespace skyline::soc::gm20b::engine {
i2m.LoadInlineData(*registers.i2m, argument);
})
ENGINE_CASE(sendSignalingPcasB, {
Logger::Warn("Attempted to execute compute kernel!");
interconnect.Dispatch(channelCtx.asCtx->gmmu.Read<kepler_compute::QMD>(registers.sendPcas->QmdAddress()));
})
ENGINE_STRUCT_CASE(reportSemaphore, action, {
throw exception("Compute semaphores are unimplemented!");

View File

@@ -4,6 +4,7 @@
#pragma once
#include <gpu/interconnect/kepler_compute/kepler_compute.h>
#include "engine.h"
#include "inline2memory.h"
@@ -18,7 +19,10 @@ namespace skyline::soc::gm20b::engine {
class KeplerCompute {
private:
host1x::SyncpointSet &syncpoints;
ChannelContext &channelCtx;
Inline2MemoryBackend i2m;
gpu::interconnect::DirtyManager dirtyManager;
gpu::interconnect::kepler_compute::KeplerCompute interconnect;
void HandleMethod(u32 method, u32 argument);