skyline/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp

// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/Ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu/interconnect/command_executor.h>
#include <gpu/interconnect/common/state_updater.h>
#include <soc/gm20b/channel.h>
#include "pipeline_state.h"
#include "kepler_compute.h"

namespace skyline::gpu::interconnect::kepler_compute {
    KeplerCompute::KeplerCompute(GPU &gpu,
                                 soc::gm20b::ChannelContext &channelCtx,
                                 nce::NCE &nce,
                                 kernel::MemoryManager &memoryManager,
                                 DirtyManager &manager,
                                 const EngineRegisterBundle &registerBundle)
        : ctx{channelCtx, channelCtx.executor, gpu, nce, memoryManager},
          pipelineState{manager, registerBundle.pipelineStateRegisters},
          samplers{manager, registerBundle.samplerPoolRegisters},
          textures{manager, registerBundle.texturePoolRegisters} {
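        // After an executor flush, previously cached state may refer to stale resources, so
        // purge the pipeline caches and mark all tracked state dirty to force revalidation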
        ctx.executor.AddFlushCallback([this] {
            pipelineState.PurgeCaches();
            constantBuffers.MarkAllDirty();
            samplers.MarkAllDirty();
            textures.MarkAllDirty();
        });
    }

    void KeplerCompute::Dispatch(const QMD &qmd) {
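        // Skip the dispatch entirely on devices whose compute shader support is known to be broken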
        if (ctx.gpu.traits.quirks.brokenComputeShaders)
            return;

        TRACE_EVENT("gpu", "KeplerCompute::Dispatch");

        StateUpdateBuilder builder{*ctx.executor.allocator};

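        // Flush dirty engine state: constant buffers, samplers (whose index source depends on
        // the QMD's samplerIndex field), then the compute pipeline itself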
        constantBuffers.Update(ctx, qmd);
        samplers.Update(ctx, qmd.samplerIndex == soc::gm20b::engine::kepler_compute::QMD::SamplerIndex::ViaHeaderIndex);
        auto *pipeline{pipelineState.Update(ctx, builder, textures, constantBuffers.boundConstantBuffers, qmd)};

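        // Syncing descriptors accumulates the stage masks of any memory barrier required to
        // order prior GPU work against the resources this dispatch reads and writes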
        vk::PipelineStageFlags srcStageMask{}, dstStageMask{};
        auto *descUpdateInfo{pipeline->SyncDescriptors(ctx, constantBuffers.boundConstantBuffers, samplers, textures, srcStageMask, dstStageMask)};

        builder.SetPipeline(*pipeline->compiledPipeline.pipeline, vk::PipelineBindPoint::eCompute);

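        // Prefer push descriptors where supported to avoid allocating a descriptor set per
        // dispatch; otherwise allocate a set and keep it alive as an executor dependency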
        if (ctx.gpu.traits.supportsPushDescriptors) {
            builder.SetDescriptorSetWithPush(descUpdateInfo);
        } else {
            auto set{std::make_shared<DescriptorAllocator::ActiveDescriptorSet>(ctx.gpu.descriptor.AllocateSet(descUpdateInfo->descriptorSetLayout))};
            builder.SetDescriptorSetWithUpdate(descUpdateInfo, set.get(), nullptr);
            ctx.executor.AttachDependency(set);
        }

        auto stateUpdater{builder.Build()};

        /**
         * @brief Struct that can be linearly allocated, holding all state for the dispatch to avoid a dynamic allocation with lambda captures
         */
        struct DrawParams {
            StateUpdater stateUpdater;
            std::array<u32, 3> dimensions;
            vk::PipelineStageFlags srcStageMask, dstStageMask;
        };
        auto *drawParams{ctx.executor.allocator->EmplaceUntracked<DrawParams>(DrawParams{stateUpdater, {qmd.ctaRasterWidth, qmd.ctaRasterHeight, qmd.ctaRasterDepth}, srcStageMask, dstStageMask})};

        ctx.executor.AddCheckpoint("Before dispatch");

        ctx.executor.AddOutsideRpCommand([drawParams](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu) {
            drawParams->stateUpdater.RecordAll(gpu, commandBuffer);

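            // Record a global memory barrier if descriptor syncing determined one is necessary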
            if (drawParams->srcStageMask && drawParams->dstStageMask)
                commandBuffer.pipelineBarrier(drawParams->srcStageMask, drawParams->dstStageMask, {}, {vk::MemoryBarrier{
                    .srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
                    .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
                }}, {}, {});

            commandBuffer.dispatch(drawParams->dimensions[0], drawParams->dimensions[1], drawParams->dimensions[2]);
        });
        ctx.executor.AddCheckpoint("After dispatch");
    }
}