Introduce GPU checkpoints for crash debugging

When GPU crashes aren't reproducible in RenderDoc, it helps to have some way to figure out exactly what is going on when a crash happens, or which operation caused it. Add a checkpoint system that reports the GPU execution state in Perfetto in time with actual GPU execution, and use flow events to show each event's path through the execution, Vulkan record and executor record stages.
2024-12-26 15:51:51 +01:00 · 2023-02-04 21:10:36 +00:00 · 2023-02-04 21:10:36 +00:00 · 49cd2a71cc
commit 49cd2a71cc
parent d5b6c68ae4
9 changed files with 161 additions and 23 deletions
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@ -31,6 +31,8 @@ namespace skyline::gpu {

        BufferBinding(vk::Buffer buffer, vk::DeviceSize offset = 0, vk::DeviceSize size = 0) : buffer{buffer}, offset{offset}, size{size} {}

+        BufferBinding(MegaBufferAllocator::Allocation allocation) : buffer{allocation.buffer}, offset{allocation.offset}, size{allocation.region.size()} {} // Builds a binding from a megabuffer allocation: its buffer, its offset and the size of its region
+
        operator bool() const {
            return buffer;
        }
--- a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
+++ b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
@ -3,7 +3,9 @@

 #include <gpu.h>
 #include <loader/loader.h>
+#include <vulkan/vulkan.hpp>
 #include "command_scheduler.h"
+#include "common/exception.h"

 namespace skyline::gpu {
    void CommandScheduler::WaiterThread() {
@ -91,16 +93,22 @@ namespace skyline::gpu {
        fullSignalSemaphores.push_back(cycle->semaphore);

        {
-            std::scoped_lock lock{gpu.queueMutex};
-            gpu.vkQueue.submit(vk::SubmitInfo{
-                .commandBufferCount = 1,
-                .pCommandBuffers = &*commandBuffer,
-                .waitSemaphoreCount = static_cast<u32>(fullWaitSemaphores.size()),
-                .pWaitSemaphores = fullWaitSemaphores.data(),
-                .pWaitDstStageMask = fullWaitStages.data(),
-                .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
-                .pSignalSemaphores = fullSignalSemaphores.data(),
-            }, cycle->fence);
+            try {
+                std::scoped_lock lock{gpu.queueMutex};
+                gpu.vkQueue.submit(vk::SubmitInfo{
+                    .commandBufferCount = 1,
+                    .pCommandBuffers = &*commandBuffer,
+                    .waitSemaphoreCount = static_cast<u32>(fullWaitSemaphores.size()),
+                    .pWaitSemaphores = fullWaitSemaphores.data(),
+                    .pWaitDstStageMask = fullWaitStages.data(),
+                    .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
+                    .pSignalSemaphores = fullSignalSemaphores.data(),
+                }, cycle->fence);
+            } catch (const vk::DeviceLostError &e) {
+                // Wait 5 seconds to give traces etc. time to settle
+                std::this_thread::sleep_for(std::chrono::seconds(5));
+                throw exception("Vulkan device lost!");
+            }
        }

        cycle->NotifySubmitted();
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@ -1,6 +1,7 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)

+#include <chrono>
 #include <condition_variable>
 #include <mutex>
 #include <range/v3/view.hpp>
@ -97,31 +98,65 @@ namespace skyline::gpu::interconnect {
    }

    void CommandRecordThread::ProcessSlot(Slot *slot) {
-        TRACE_EVENT_FMT("gpu", "ProcessSlot: 0x{:X}, execution: {}", slot, slot->executionTag);
+        TRACE_EVENT_FMT("gpu", "ProcessSlot: 0x{:X}, execution: {}", slot, u64{slot->executionTag});
        auto &gpu{*state.gpu};
+        std::scoped_lock lock{gpu.buffer.recreationMutex};

        vk::RenderPass lRenderPass;
        u32 subpassIndex;

        using namespace node;
        for (NodeVariant &node : slot->nodes) {
-            #define NODE(name) [&](name& node) { node(slot->commandBuffer, slot->cycle, gpu); }
            std::visit(VariantVisitor{
-                NODE(FunctionNode),
+                [&](FunctionNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "FunctionNode");
+                    node(slot->commandBuffer, slot->cycle, gpu);
+                },
+
+                [&](CheckpointNode &node) {
+                    RecordFullBarrier(slot->commandBuffer);
+
+                    TRACE_EVENT_INSTANT("gpu", "CheckpointNode", "id", node.id, [&](perfetto::EventContext ctx) {
+                        ctx.event()->add_flow_ids(node.id);
+                    });
+
+                    std::array<vk::BufferCopy, 1> copy{vk::BufferCopy{
+                        .size = node.binding.size,
+                        .srcOffset = node.binding.offset,
+                        .dstOffset = 0,
+                    }};
+
+                    slot->commandBuffer.copyBuffer(node.binding.buffer, gpu.debugTracingBuffer.vkBuffer, copy);
+
+                    RecordFullBarrier(slot->commandBuffer);
+                },

                [&](RenderPassNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "RenderPassNode");
                    lRenderPass = node(slot->commandBuffer, slot->cycle, gpu);
                    subpassIndex = 0;
                },

                [&](NextSubpassNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "NextSubpassNode");
                    node(slot->commandBuffer, slot->cycle, gpu);
                    ++subpassIndex;
                },
-                [&](SubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex); },
-                [&](NextSubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex); },

-                NODE(RenderPassEndNode),
+                [&](SubpassFunctionNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "SubpassFunctionNode");
+                    node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex);
+                },
+
+                [&](NextSubpassFunctionNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "NextSubpassFunctionNode");
+                    node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex);
+                },
+
+                [&](RenderPassEndNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "RenderPassEndNode");
+                    node(slot->commandBuffer, slot->cycle, gpu);
+                },
            }, node);
            #undef NODE
        }
@ -258,11 +293,35 @@ namespace skyline::gpu::interconnect {
        condition.notify_all();
    }

+    void CheckpointPollerThread::Run() { // Thread entry point: polls the debug tracing buffer in an endless loop and emits a perfetto event for every checkpoint ID that has been passed
+        u32 prevCheckpoint{}; // The last checkpoint ID that was reported to perfetto
+        for (size_t iteration{}; true; iteration++) {
+            u32 curCheckpoint{state.gpu->debugTracingBuffer.as<u32>()}; // Current value of the debug tracing buffer, which is the destination of the copy recorded for each CheckpointNode
+
+            if ((iteration % 1024) == 0) // Only log on every 1024th poll
+                Logger::Info("Current Checkpoint: {}", curCheckpoint);
+
+            while (prevCheckpoint != curCheckpoint) { // NOTE(review): counts up one ID at a time until equal, so a value lower than the previous one would wrap through the whole u32 range - confirm the polled value only ever increases
+                // Make sure to report an event for every checkpoint in between the previous and current values, to ensure the perfetto trace is consistent
+                prevCheckpoint++;
+                TRACE_EVENT_INSTANT("gpu", "Checkpoint", "id", prevCheckpoint, [&](perfetto::EventContext ctx) {
+                    ctx.event()->add_terminating_flow_ids(prevCheckpoint); // Terminates the flow whose ID is this checkpoint's ID
+                });
+            }
+
+            prevCheckpoint = curCheckpoint; // Both values are already equal once the loop above has exited
+            std::this_thread::sleep_for(std::chrono::microseconds(5)); // Short pause between polls
+        }
+    }
+
+    CheckpointPollerThread::CheckpointPollerThread(const DeviceState &state) : state{state}, thread{&CheckpointPollerThread::Run, this} {} // Stores the device state and immediately starts a thread executing Run() on this object
+
    CommandExecutor::CommandExecutor(const DeviceState &state)
        : state{state},
          gpu{*state.gpu},
          recordThread{state},
          waiterThread{state},
+          checkpointPollerThread{EnableGpuCheckpoints ? std::optional<CheckpointPollerThread>{state} : std::optional<CheckpointPollerThread>{}}, // The poller (and therefore its thread) is only created when EnableGpuCheckpoints is true, otherwise the optional stays empty
          tag{AllocateTag()} {
        RotateRecordSlot();
    }
@ -512,6 +571,21 @@ namespace skyline::gpu::interconnect {
            callback();
    }

+    u32 CommandExecutor::AddCheckpointImpl(std::string_view annotation) { // Appends a CheckpointNode carrying the next checkpoint ID to the current slot, emits a "Mark Checkpoint" perfetto event with the annotation and returns that ID
+        if (renderPass) // Finish the active render pass (if any) before the checkpoint node is added
+            FinishRenderPass();
+
+        slot->nodes.emplace_back(node::CheckpointNode{gpu.megaBufferAllocator.Push(cycle, span<u32>(&nextCheckpointId, 1).cast<u8>()), nextCheckpointId}); // The ID's bytes are pushed into the megabuffer, the resulting allocation becomes the node's binding
+
+        TRACE_EVENT_INSTANT("gpu", "Mark Checkpoint", "id", nextCheckpointId, "annotation", [&annotation](perfetto::TracedValue context) {
+            std::move(context).WriteString(annotation.data(), annotation.size()); // Pass the length explicitly, a string_view isn't guaranteed to be null-terminated
+        }, [&](perfetto::EventContext ctx) {
+            ctx.event()->add_flow_ids(nextCheckpointId); // The flow ID is the checkpoint ID, the same ID is attached to the CheckpointNode and poller events
+        });
+
+        return nextCheckpointId++; // Return the ID that was just used, then advance the counter for the next checkpoint
+    }
+
    void CommandExecutor::SubmitInternal() {
        if (renderPass)
            FinishRenderPass();
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@ -12,6 +12,8 @@
 #include "common/spin_lock.h"

 namespace skyline::gpu::interconnect {
+    constexpr bool EnableGpuCheckpoints{false}; //!< Whether to enable GPU debugging checkpoints (WILL DECREASE PERF SIGNIFICANTLY)
+
    /*
     * @brief Thread responsible for recording Vulkan commands from the execution nodes and submitting them
     */
@ -119,6 +121,20 @@ namespace skyline::gpu::interconnect {
        void Queue(std::shared_ptr<FenceCycle> cycle, std::function<void()> &&callback);
    };

+    /**
+     * @brief Polls the debug buffer for checkpoint updates and reports them to perfetto
+     * @note The polling thread is started by the constructor and Run() never returns
+     * @note NOTE(review): no destructor is declared to join or detach `thread`, and std::thread's destructor calls std::terminate() while the thread is still joinable - confirm instances are never destroyed
+     */
+    class CheckpointPollerThread {
+      private:
+        const DeviceState &state;
+        std::thread thread; //!< The thread executing Run(), declared after `state` so that `state` is initialised before the thread starts
+
+        void Run(); //!< Entry point of the polling thread
+
+      public:
+        CheckpointPollerThread(const DeviceState &state);
+    };
+
    /**
     * @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
     * @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
@ -130,6 +146,7 @@ namespace skyline::gpu::interconnect {
        CommandRecordThread recordThread;
        CommandRecordThread::Slot *slot{};
        ExecutionWaiterThread waiterThread;
+        std::optional<CheckpointPollerThread> checkpointPollerThread;
        node::RenderPassNode *renderPass{};
        size_t subpassCount{}; //!< The number of subpasses in the current render pass
        u32 renderPassIndex{};
@ -183,6 +200,8 @@ namespace skyline::gpu::interconnect {
        std::vector<std::function<void()>> flushCallbacks; //!< Set of persistent callbacks that will be called at the start of Execute in order to flush data required for recording
        std::vector<std::function<void()>> pipelineChangeCallbacks; //!< Set of persistent callbacks that will be called after any non-Maxwell 3D engine changes the active pipeline

+        u32 nextCheckpointId{}; //!< The ID of the next debug checkpoint to be allocated
+
        void RotateRecordSlot();

        /**
@ -211,6 +230,11 @@ namespace skyline::gpu::interconnect {

        void AttachBufferBase(std::shared_ptr<Buffer> buffer);

+        /**
+         * @brief Non-gated implementation of `AddCheckpoint`
+         */
+        u32 AddCheckpointImpl(std::string_view annotation);
+
      public:
        std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
        LinearAllocatorState<> *allocator;
@ -305,6 +329,18 @@ namespace skyline::gpu::interconnect {
         */
        void NotifyPipelineChange();

+        /**
+         * @brief Records a checkpoint into the GPU command stream at the current position
+         * @param annotation A string annotation to display in perfetto for this checkpoint
+         * @return The checkpoint ID
+         * @note When `EnableGpuCheckpoints` is false nothing is recorded and 0 is always returned
+         */
+        u32 AddCheckpoint(std::string_view annotation) {
+            if constexpr (EnableGpuCheckpoints)
+                return AddCheckpointImpl(annotation);
+            else
+                return 0;
+        }
+
        /**
         * @brief Execute all the nodes and submit the resulting command buffer to the GPU
         * @param callback A function to call upon GPU completion of the submission
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h
@ -122,5 +122,13 @@ namespace skyline::gpu::interconnect::node {
        }
    };

-    using NodeVariant = std::variant<FunctionNode, RenderPassNode, NextSubpassNode, SubpassFunctionNode, NextSubpassFunctionNode, RenderPassEndNode>; //!< A variant encompassing all command nodes types
+    /**
+     * @brief A node which copies the contained ID value to the debug tracing buffer
+     */
+    struct CheckpointNode {
+        BufferBinding binding; //!< Binding for a GPU-side buffer containing the checkpoint ID
+        u32 id; //!< The checkpoint ID, attached to the perfetto event emitted when this node is recorded
+    };
+
+    using NodeVariant = std::variant<FunctionNode, CheckpointNode, RenderPassNode, NextSubpassNode, SubpassFunctionNode, NextSubpassFunctionNode, RenderPassEndNode>; //!< A variant encompassing all command nodes types
 }
--- a/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp
@ -158,6 +158,8 @@ namespace skyline::gpu::interconnect {
                                    vk::PipelineStageFlagBits::eAllGraphics, vk::PipelineStageFlagBits::eAllGraphics);
            }
        );
+        executor.AddCheckpoint("After blit");
+

        executor.NotifyPipelineChange();
    }
--- a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp
@ -65,6 +65,7 @@ namespace skyline::gpu::interconnect::kepler_compute {
        auto *drawParams{ctx.executor.allocator->EmplaceUntracked<DrawParams>(DrawParams{stateUpdater, {qmd.ctaRasterWidth, qmd.ctaRasterHeight, qmd.ctaRasterDepth}, srcStageMask, dstStageMask})};


+        ctx.executor.AddCheckpoint("Before dispatch");
        ctx.executor.AddOutsideRpCommand([drawParams](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu) {
            drawParams->stateUpdater.RecordAll(gpu, commandBuffer);

@ -76,5 +77,6 @@ namespace skyline::gpu::interconnect::kepler_compute {

            commandBuffer.dispatch(drawParams->dimensions[0], drawParams->dimensions[1], drawParams->dimensions[2]);
        });
+        ctx.executor.AddCheckpoint("After dispatch");
    }
 }
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
@ -64,6 +64,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
                callbackData.view.GetBuffer()->BlockAllCpuBackingWrites();

                auto srcGpuAllocation{callbackData.ctx.gpu.megaBufferAllocator.Push(callbackData.ctx.executor.cycle, callbackData.srcCpuBuf)};
+                callbackData.ctx.executor.AddCheckpoint("Before constant buffer load");
                callbackData.ctx.executor.AddOutsideRpCommand([=, srcCpuBuf = callbackData.srcCpuBuf, view = callbackData.view, offset = callbackData.offset](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu) {
                    auto binding{view.GetBinding(gpu)};
                    vk::BufferCopy copyRegion{
@ -77,6 +78,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
                        .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
                    }, {}, {});
                });
+                callbackData.ctx.executor.AddCheckpoint("After constant buffer load");
            });
        }
    }
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
@ -207,6 +207,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
            return;

        TRACE_EVENT("gpu", "Maxwell3D::Clear");
+        ctx.executor.AddCheckpoint("Before clear");

        auto needsAttachmentClearCmd{[&](auto &view) {
            return scissor.offset.x != 0 || scissor.offset.y != 0 ||
@ -281,13 +282,14 @@ namespace skyline::gpu::interconnect::maxwell3d {
            }
        }

-        if (clearAttachments.empty())
-            return;
+        if (!clearAttachments.empty()) {
+            std::array<TextureView *, 1> colorAttachments{colorView ? &*colorView : nullptr};
+            ctx.executor.AddSubpass([clearAttachments, clearRects](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
+                commandBuffer.clearAttachments(clearAttachments, span(clearRects).first(clearAttachments.size()));
+            }, renderArea, {}, {}, colorView ? colorAttachments : span<TextureView *>{}, depthStencilView ? &*depthStencilView : nullptr);
+        }

-        std::array<TextureView *, 1> colorAttachments{colorView ? &*colorView : nullptr};
-        ctx.executor.AddSubpass([clearAttachments, clearRects](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
-            commandBuffer.clearAttachments(clearAttachments, span(clearRects).first(clearAttachments.size()));
-        }, renderArea, {}, {}, colorView ? colorAttachments : span<TextureView *>{}, depthStencilView ? &*depthStencilView : nullptr);
+        ctx.executor.AddCheckpoint("After clear");
    }

    void Maxwell3D::Draw(engine::DrawTopology topology, bool transformFeedbackEnable, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance) {
@ -333,6 +335,7 @@ namespace skyline::gpu::interconnect::maxwell3d {


        constantBuffers.ResetQuickBind();
+        ctx.executor.AddCheckpoint("Before draw");
        ctx.executor.AddSubpass([drawParams](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu, vk::RenderPass, u32) {
            drawParams->stateUpdater.RecordAll(gpu, commandBuffer);

@ -347,5 +350,6 @@ namespace skyline::gpu::interconnect::maxwell3d {
            if (drawParams->transformFeedbackEnable)
                commandBuffer.endTransformFeedbackEXT(0, {}, {});
        }, scissor, activeDescriptorSetSampledImages, {}, activeState.GetColorAttachments(), activeState.GetDepthAttachment(), !ctx.gpu.traits.quirks.relaxedRenderPassCompatibility, srcStageMask, dstStageMask);
+        ctx.executor.AddCheckpoint("After draw");
    }
 }