From 49cd2a71cca5488bdce217ae3140276ad6dbebe3 Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Sat, 4 Feb 2023 21:10:36 +0000
Subject: [PATCH] Introduce GPU checkpoints for crash debugging

When GPU crashes aren't reproducible in RenderDoc, it helps to have some way to figure out exactly what is going on when a crash happens, or which operation caused it. Add a checkpoint system that reports the GPU execution state in perfetto in time with actual GPU execution, and use flow events to show each event's path through the execution, Vulkan record and executor record stages.
---
 app/src/main/cpp/skyline/gpu/buffer.h         |  2 +
 .../cpp/skyline/gpu/command_scheduler.cpp     | 28 +++---
 .../gpu/interconnect/command_executor.cpp     | 86 +++++++++++++++++--
 .../gpu/interconnect/command_executor.h       | 36 ++++++++
 .../skyline/gpu/interconnect/command_nodes.h  | 10 ++-
 .../cpp/skyline/gpu/interconnect/fermi_2d.cpp |  2 +
 .../kepler_compute/kepler_compute.cpp         |  2 +
 .../maxwell_3d/constant_buffers.cpp           |  2 +
 .../interconnect/maxwell_3d/maxwell_3d.cpp    | 16 ++--
 9 files changed, 161 insertions(+), 23 deletions(-)
diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h
index 2b2db8f5..940703db 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@@ -31,6 +31,8 @@ namespace skyline::gpu {
 
         BufferBinding(vk::Buffer buffer, vk::DeviceSize offset = 0, vk::DeviceSize size = 0) : buffer{buffer}, offset{offset}, size{size} {}
 
+        BufferBinding(MegaBufferAllocator::Allocation allocation) : buffer{allocation.buffer}, offset{allocation.offset}, size{allocation.region.size()} {} // Builds a binding from a megabuffer allocation: same buffer and offset, sized to the allocated region
+
         operator bool() const {
             return buffer;
         }
diff --git a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
index cbe45632..a80602e7 100644
--- a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
+++ b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
@@ -3,7 +3,9 @@
 
 #include <gpu.h>
 #include <loader/loader.h>
+#include <vulkan/vulkan.hpp>
 #include "command_scheduler.h"
+#include "common/exception.h"
 
 namespace skyline::gpu {
     void CommandScheduler::WaiterThread() {
@@ -91,16 +93,22 @@ namespace skyline::gpu {
         fullSignalSemaphores.push_back(cycle->semaphore);
 
         {
-            std::scoped_lock lock{gpu.queueMutex};
-            gpu.vkQueue.submit(vk::SubmitInfo{
-                .commandBufferCount = 1,
-                .pCommandBuffers = &*commandBuffer,
-                .waitSemaphoreCount = static_cast<u32>(fullWaitSemaphores.size()),
-                .pWaitSemaphores = fullWaitSemaphores.data(),
-                .pWaitDstStageMask = fullWaitStages.data(),
-                .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
-                .pSignalSemaphores = fullSignalSemaphores.data(),
-            }, cycle->fence);
+            try {
+                std::scoped_lock lock{gpu.queueMutex};
+                gpu.vkQueue.submit(vk::SubmitInfo{
+                    .commandBufferCount = 1,
+                    .pCommandBuffers = &*commandBuffer,
+                    .waitSemaphoreCount = static_cast<u32>(fullWaitSemaphores.size()),
+                    .pWaitSemaphores = fullWaitSemaphores.data(),
+                    .pWaitDstStageMask = fullWaitStages.data(),
+                    .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
+                    .pSignalSemaphores = fullSignalSemaphores.data(),
+                }, cycle->fence);
+            } catch (const vk::DeviceLostError &e) {
+                // Wait 5 seconds to give traces etc. time to settle
+                std::this_thread::sleep_for(std::chrono::seconds(5));
+                throw exception("Vulkan device lost!");
+            }
         }
 
         cycle->NotifySubmitted();
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index 5d286751..02dcdfcc 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
 
+#include <chrono>
 #include <condition_variable>
 #include <mutex>
 #include <range/v3/view.hpp>
@@ -97,31 +98,65 @@ namespace skyline::gpu::interconnect {
     }
 
     void CommandRecordThread::ProcessSlot(Slot *slot) {
-        TRACE_EVENT_FMT("gpu", "ProcessSlot: 0x{:X}, execution: {}", slot, slot->executionTag);
+        TRACE_EVENT_FMT("gpu", "ProcessSlot: 0x{:X}, execution: {}", slot, u64{slot->executionTag});
         auto &gpu{*state.gpu};
+        std::scoped_lock lock{gpu.buffer.recreationMutex};
 
         vk::RenderPass lRenderPass;
         u32 subpassIndex;
 
         using namespace node;
         for (NodeVariant &node : slot->nodes) {
-            #define NODE(name) [&](name& node) { node(slot->commandBuffer, slot->cycle, gpu); }
             std::visit(VariantVisitor{
-                NODE(FunctionNode),
+                [&](FunctionNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "FunctionNode");
+                    node(slot->commandBuffer, slot->cycle, gpu);
+                },
+
+                [&](CheckpointNode &node) {
+                    RecordFullBarrier(slot->commandBuffer);
+
+                    TRACE_EVENT_INSTANT("gpu", "CheckpointNode", "id", node.id, [&](perfetto::EventContext ctx) {
+                        ctx.event()->add_flow_ids(node.id);
+                    });
+
+                    std::array<vk::BufferCopy, 1> copy{vk::BufferCopy{
+                        .size = node.binding.size,
+                        .srcOffset = node.binding.offset,
+                        .dstOffset = 0,
+                    }};
+
+                    slot->commandBuffer.copyBuffer(node.binding.buffer, gpu.debugTracingBuffer.vkBuffer, copy);
+
+                    RecordFullBarrier(slot->commandBuffer);
+                },
 
                 [&](RenderPassNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "RenderPassNode");
                     lRenderPass = node(slot->commandBuffer, slot->cycle, gpu);
                     subpassIndex = 0;
                 },
 
                 [&](NextSubpassNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "NextSubpassNode");
                     node(slot->commandBuffer, slot->cycle, gpu);
                     ++subpassIndex;
                 },
-                [&](SubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex); },
-                [&](NextSubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex); },
 
-                NODE(RenderPassEndNode),
+                [&](SubpassFunctionNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "SubpassFunctionNode");
+                    node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex);
+                },
+
+                [&](NextSubpassFunctionNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "NextSubpassFunctionNode");
+                    node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex);
+                },
+
+                [&](RenderPassEndNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "RenderPassEndNode");
+                    node(slot->commandBuffer, slot->cycle, gpu);
+                },
             }, node);
             #undef NODE
         }
@@ -258,11 +293,35 @@ namespace skyline::gpu::interconnect {
         condition.notify_all();
     }
 
+    void CheckpointPollerThread::Run() { // Thread entry point: polls the debug tracing buffer and reports checkpoint progress to perfetto; the loop has no exit condition
+        u32 prevCheckpoint{}; // Last checkpoint ID that has already been reported
+        for (size_t iteration{}; true; iteration++) {
+            u32 curCheckpoint{state.gpu->debugTracingBuffer.as<u32>()}; // Current value of the debug tracing buffer, which CheckpointNode copies update with the checkpoint ID
+
+            if ((iteration % 1024) == 0) // Only log on every 1024th poll
+                Logger::Info("Current Checkpoint: {}", curCheckpoint);
+
+            while (prevCheckpoint != curCheckpoint) {
+                // Make sure to report an event for every checkpoint in between the previous and current values, to ensure the perfetto trace is consistent
+                prevCheckpoint++;
+                TRACE_EVENT_INSTANT("gpu", "Checkpoint", "id", prevCheckpoint, [&](perfetto::EventContext ctx) {
+                    ctx.event()->add_terminating_flow_ids(prevCheckpoint);
+                });
+            }
+
+            prevCheckpoint = curCheckpoint; // NOTE(review): redundant, the loop above only exits once both values are already equal
+            std::this_thread::sleep_for(std::chrono::microseconds(5)); // Short pause between polls
+        }
+    }
+
+    CheckpointPollerThread::CheckpointPollerThread(const DeviceState &state) : state{state}, thread{&CheckpointPollerThread::Run, this} {} // Constructing the thread member starts `Run` on a new thread immediately
+
     CommandExecutor::CommandExecutor(const DeviceState &state)
         : state{state},
           gpu{*state.gpu},
           recordThread{state},
           waiterThread{state},
+          checkpointPollerThread{EnableGpuCheckpoints ? std::optional<CheckpointPollerThread>{state} : std::optional<CheckpointPollerThread>{}},
           tag{AllocateTag()} {
         RotateRecordSlot();
     }
@@ -512,6 +571,21 @@ namespace skyline::gpu::interconnect {
             callback();
     }
 
+    u32 CommandExecutor::AddCheckpointImpl(std::string_view annotation) {
+        if (renderPass) // End any render pass in progress first: the checkpoint is recorded as a buffer copy, which Vulkan only permits outside a render pass
+            FinishRenderPass();
+
+        slot->nodes.emplace_back(node::CheckpointNode{gpu.megaBufferAllocator.Push(cycle, span<u32>(&nextCheckpointId, 1).cast<u8>()), nextCheckpointId}); // Push the ID's bytes into the megabuffer and queue a node that references the allocation
+
+        TRACE_EVENT_INSTANT("gpu", "Mark Checkpoint", "id", nextCheckpointId, "annotation", [&annotation](perfetto::TracedValue context) {
+            std::move(context).WriteString(annotation.data(), annotation.size());
+        }, [&](perfetto::EventContext ctx) {
+            ctx.event()->add_flow_ids(nextCheckpointId);
+        });
+
+        return nextCheckpointId++; // Return the ID that was just recorded, then advance to the next one
+    }
+
     void CommandExecutor::SubmitInternal() {
         if (renderPass)
             FinishRenderPass();
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index 9eaf5997..024fe133 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -12,6 +12,8 @@
 #include "common/spin_lock.h"
 
 namespace skyline::gpu::interconnect {
+    constexpr bool EnableGpuCheckpoints{false}; //!< Whether to enable GPU debugging checkpoints (WILL DECREASE PERF SIGNIFICANTLY)
+
     /*
      * @brief Thread responsible for recording Vulkan commands from the execution nodes and submitting them
      */
@@ -119,6 +121,20 @@ namespace skyline::gpu::interconnect {
         void Queue(std::shared_ptr<FenceCycle> cycle, std::function<void()> &&callback);
     };
 
+    /**
+     * @brief Polls the debug buffer for checkpoint updates and reports them to perfetto
+     */
+    class CheckpointPollerThread {
+      private:
+        const DeviceState &state; //!< Used to reach the GPU's debug tracing buffer
+        std::thread thread; //!< Runs `Run`, started by the constructor; NOTE(review): nothing joins or detaches it, so destroying a live instance would call std::terminate — confirm intended lifetime
+
+        void Run(); //!< Thread entry point, polls in an endless loop
+
+      public:
+        CheckpointPollerThread(const DeviceState &state);
+    };
+
     /**
      * @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
      * @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
@@ -130,6 +146,7 @@ namespace skyline::gpu::interconnect {
         CommandRecordThread recordThread;
         CommandRecordThread::Slot *slot{};
         ExecutionWaiterThread waiterThread;
+        std::optional<CheckpointPollerThread> checkpointPollerThread;
         node::RenderPassNode *renderPass{};
         size_t subpassCount{}; //!< The number of subpasses in the current render pass
         u32 renderPassIndex{};
@@ -183,6 +200,8 @@ namespace skyline::gpu::interconnect {
         std::vector<std::function<void()>> flushCallbacks; //!< Set of persistent callbacks that will be called at the start of Execute in order to flush data required for recording
         std::vector<std::function<void()>> pipelineChangeCallbacks; //!< Set of persistent callbacks that will be called after any non-Maxwell 3D engine changes the active pipeline
 
+        u32 nextCheckpointId{}; //!< The ID of the next debug checkpoint to be allocated
+
         void RotateRecordSlot();
 
         /**
@@ -211,6 +230,11 @@ namespace skyline::gpu::interconnect {
 
         void AttachBufferBase(std::shared_ptr<Buffer> buffer);
 
+        /**
+         * @brief Non-gated implementation of `AddCheckpoint`
+         */
+        u32 AddCheckpointImpl(std::string_view annotation);
+
       public:
         std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
         LinearAllocatorState<> *allocator;
@@ -305,6 +329,18 @@ namespace skyline::gpu::interconnect {
          */
         void NotifyPipelineChange();
 
+        /**
+         * @brief Records a checkpoint into the GPU command stream at the current position
+         * @param annotation A string annotation to display in perfetto for this checkpoint
+         * @return The checkpoint ID, or 0 when `EnableGpuCheckpoints` is false (in which case nothing is recorded)
+         */
+        u32 AddCheckpoint(std::string_view annotation) {
+            if constexpr (EnableGpuCheckpoints)
+                return AddCheckpointImpl(annotation);
+            else
+                return 0;
+        }
+
         /**
          * @brief Execute all the nodes and submit the resulting command buffer to the GPU
          * @param callback A function to call upon GPU completion of the submission
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h
index 6507feae..1f102009 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h
@@ -122,5 +122,13 @@ namespace skyline::gpu::interconnect::node {
         }
     };
 
-    using NodeVariant = std::variant<FunctionNode, RenderPassNode, NextSubpassNode, SubpassFunctionNode, NextSubpassFunctionNode, RenderPassEndNode>; //!< A variant encompassing all command nodes types
+    /**
+     * @brief A node which copies the contained ID value to the debug tracking buffer
+     */
+    struct CheckpointNode {
+        BufferBinding binding; //!< Binding for a GPU-side buffer containing the checkpoint ID
+        u32 id; //!< The checkpoint ID, used to tag the perfetto event emitted when the node is recorded
+    };
+
+    using NodeVariant = std::variant<FunctionNode, CheckpointNode, RenderPassNode, NextSubpassNode, SubpassFunctionNode, NextSubpassFunctionNode, RenderPassEndNode>; //!< A variant encompassing all command nodes types
 }
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp
index 51c841eb..da618587 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp
@@ -158,6 +158,8 @@ namespace skyline::gpu::interconnect {
                                     vk::PipelineStageFlagBits::eAllGraphics, vk::PipelineStageFlagBits::eAllGraphics);
             }
         );
+        executor.AddCheckpoint("After blit");
+
 
         executor.NotifyPipelineChange();
     }
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp
index ecf391bc..d428a67f 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp
@@ -65,6 +65,7 @@ namespace skyline::gpu::interconnect::kepler_compute {
         auto *drawParams{ctx.executor.allocator->EmplaceUntracked<DrawParams>(DrawParams{stateUpdater, {qmd.ctaRasterWidth, qmd.ctaRasterHeight, qmd.ctaRasterDepth}, srcStageMask, dstStageMask})};
 
 
+        ctx.executor.AddCheckpoint("Before dispatch");
         ctx.executor.AddOutsideRpCommand([drawParams](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu) {
             drawParams->stateUpdater.RecordAll(gpu, commandBuffer);
 
@@ -76,5 +77,6 @@ namespace skyline::gpu::interconnect::kepler_compute {
 
             commandBuffer.dispatch(drawParams->dimensions[0], drawParams->dimensions[1], drawParams->dimensions[2]);
         });
+        ctx.executor.AddCheckpoint("After dispatch");
     }
 }
\ No newline at end of file
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
index ee51b379..289f0de6 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
@@ -64,6 +64,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
                 callbackData.view.GetBuffer()->BlockAllCpuBackingWrites();
 
                 auto srcGpuAllocation{callbackData.ctx.gpu.megaBufferAllocator.Push(callbackData.ctx.executor.cycle, callbackData.srcCpuBuf)};
+                callbackData.ctx.executor.AddCheckpoint("Before constant buffer load");
                 callbackData.ctx.executor.AddOutsideRpCommand([=, srcCpuBuf = callbackData.srcCpuBuf, view = callbackData.view, offset = callbackData.offset](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu) {
                     auto binding{view.GetBinding(gpu)};
                     vk::BufferCopy copyRegion{
@@ -77,6 +78,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
                         .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
                     }, {}, {});
                 });
+                callbackData.ctx.executor.AddCheckpoint("After constant buffer load");
             });
         }
     }
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
index f8074bb9..3dfc68a5 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
@@ -207,6 +207,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
             return;
 
         TRACE_EVENT("gpu", "Maxwell3D::Clear");
+        ctx.executor.AddCheckpoint("Before clear");
 
         auto needsAttachmentClearCmd{[&](auto &view) {
             return scissor.offset.x != 0 || scissor.offset.y != 0 ||
@@ -281,13 +282,14 @@ namespace skyline::gpu::interconnect::maxwell3d {
             }
         }
 
-        if (clearAttachments.empty())
-            return;
+        if (!clearAttachments.empty()) {
+            std::array<TextureView *, 1> colorAttachments{colorView ? &*colorView : nullptr};
+            ctx.executor.AddSubpass([clearAttachments, clearRects](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
+                commandBuffer.clearAttachments(clearAttachments, span(clearRects).first(clearAttachments.size()));
+            }, renderArea, {}, {}, colorView ? colorAttachments : span<TextureView *>{}, depthStencilView ? &*depthStencilView : nullptr);
+        }
 
-        std::array<TextureView *, 1> colorAttachments{colorView ? &*colorView : nullptr};
-        ctx.executor.AddSubpass([clearAttachments, clearRects](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
-            commandBuffer.clearAttachments(clearAttachments, span(clearRects).first(clearAttachments.size()));
-        }, renderArea, {}, {}, colorView ? colorAttachments : span<TextureView *>{}, depthStencilView ? &*depthStencilView : nullptr);
+        ctx.executor.AddCheckpoint("After clear");
     }
 
     void Maxwell3D::Draw(engine::DrawTopology topology, bool transformFeedbackEnable, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance) {
@@ -333,6 +335,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
 
 
         constantBuffers.ResetQuickBind();
+        ctx.executor.AddCheckpoint("Before draw");
         ctx.executor.AddSubpass([drawParams](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu, vk::RenderPass, u32) {
             drawParams->stateUpdater.RecordAll(gpu, commandBuffer);
 
@@ -347,5 +350,6 @@ namespace skyline::gpu::interconnect::maxwell3d {
             if (drawParams->transformFeedbackEnable)
                 commandBuffer.endTransformFeedbackEXT(0, {}, {});
         }, scissor, activeDescriptorSetSampledImages, {}, activeState.GetColorAttachments(), activeState.GetDepthAttachment(), !ctx.gpu.traits.quirks.relaxedRenderPassCompatibility, srcStageMask, dstStageMask);
+        ctx.executor.AddCheckpoint("After draw");
     }
 }
\ No newline at end of file