diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h
index 2b2db8f5..940703db 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@@ -31,6 +31,8 @@ namespace skyline::gpu {
 
         BufferBinding(vk::Buffer buffer, vk::DeviceSize offset = 0, vk::DeviceSize size = 0) : buffer{buffer}, offset{offset}, size{size} {}
 
+        BufferBinding(MegaBufferAllocator::Allocation allocation) : buffer{allocation.buffer}, offset{allocation.offset}, size{allocation.region.size()} {} //!< Builds a binding from a megabuffer allocation: same buffer and offset, with the size taken from the allocation's region
+
         operator bool() const {
             return buffer;
         }
diff --git a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
index cbe45632..a80602e7 100644
--- a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
+++ b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
@@ -3,7 +3,9 @@
 
 #include <gpu.h>
 #include <loader/loader.h>
+#include <vulkan/vulkan.hpp>
 #include "command_scheduler.h"
+#include "common/exception.h"
 
 namespace skyline::gpu {
     void CommandScheduler::WaiterThread() {
@@ -91,16 +93,22 @@ namespace skyline::gpu {
         fullSignalSemaphores.push_back(cycle->semaphore);
 
         {
-            std::scoped_lock lock{gpu.queueMutex};
-            gpu.vkQueue.submit(vk::SubmitInfo{
-                .commandBufferCount = 1,
-                .pCommandBuffers = &*commandBuffer,
-                .waitSemaphoreCount = static_cast<u32>(fullWaitSemaphores.size()),
-                .pWaitSemaphores = fullWaitSemaphores.data(),
-                .pWaitDstStageMask = fullWaitStages.data(),
-                .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
-                .pSignalSemaphores = fullSignalSemaphores.data(),
-            }, cycle->fence);
+            try {
+                std::scoped_lock lock{gpu.queueMutex};
+                gpu.vkQueue.submit(vk::SubmitInfo{
+                    .commandBufferCount = 1,
+                    .pCommandBuffers = &*commandBuffer,
+                    .waitSemaphoreCount = static_cast<u32>(fullWaitSemaphores.size()),
+                    .pWaitSemaphores = fullWaitSemaphores.data(),
+                    .pWaitDstStageMask = fullWaitStages.data(),
+                    .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
+                    .pSignalSemaphores = fullSignalSemaphores.data(),
+                }, cycle->fence);
+            } catch (const vk::DeviceLostError &e) {
+                // Wait 5 seconds to give traces etc. time to settle
+                std::this_thread::sleep_for(std::chrono::seconds(5));
+                throw exception("Vulkan device lost!");
+            }
         }
 
         cycle->NotifySubmitted();
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index 5d286751..02dcdfcc 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
 
+#include <chrono>
 #include <condition_variable>
 #include <mutex>
 #include <range/v3/view.hpp>
@@ -97,31 +98,65 @@ namespace skyline::gpu::interconnect {
     }
 
     void CommandRecordThread::ProcessSlot(Slot *slot) {
-        TRACE_EVENT_FMT("gpu", "ProcessSlot: 0x{:X}, execution: {}", slot, slot->executionTag);
+        TRACE_EVENT_FMT("gpu", "ProcessSlot: 0x{:X}, execution: {}", slot, u64{slot->executionTag});
         auto &gpu{*state.gpu};
+        std::scoped_lock lock{gpu.buffer.recreationMutex};
 
         vk::RenderPass lRenderPass;
         u32 subpassIndex;
 
         using namespace node;
         for (NodeVariant &node : slot->nodes) {
-            #define NODE(name) [&](name& node) { node(slot->commandBuffer, slot->cycle, gpu); }
             std::visit(VariantVisitor{
-                NODE(FunctionNode),
+                [&](FunctionNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "FunctionNode");
+                    node(slot->commandBuffer, slot->cycle, gpu);
+                },
+
+                [&](CheckpointNode &node) {
+                    RecordFullBarrier(slot->commandBuffer);
+                    // Emit an instant trace event that carries this checkpoint's ID both as an argument and as a flow ID
+                    TRACE_EVENT_INSTANT("gpu", "CheckpointNode", "id", node.id, [&](perfetto::EventContext ctx) {
+                        ctx.event()->add_flow_ids(node.id);
+                    });
+                    // Copy the ID from its source binding to the start of the debug tracing buffer; designators follow vk::BufferCopy's declaration order
+                    std::array<vk::BufferCopy, 1> copy{vk::BufferCopy{
+                        .srcOffset = node.binding.offset,
+                        .dstOffset = 0,
+                        .size = node.binding.size,
+                    }};
+
+                    slot->commandBuffer.copyBuffer(node.binding.buffer, gpu.debugTracingBuffer.vkBuffer, copy);
+                    // Second full barrier: together with the one at the top it brackets the copy (presumably to order it against surrounding work — confirm against RecordFullBarrier)
+                    RecordFullBarrier(slot->commandBuffer);
+                },
 
                 [&](RenderPassNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "RenderPassNode");
                     lRenderPass = node(slot->commandBuffer, slot->cycle, gpu);
                     subpassIndex = 0;
                 },
 
                 [&](NextSubpassNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "NextSubpassNode");
                     node(slot->commandBuffer, slot->cycle, gpu);
                     ++subpassIndex;
                 },
-                [&](SubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex); },
-                [&](NextSubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex); },
 
-                NODE(RenderPassEndNode),
+                [&](SubpassFunctionNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "SubpassFunctionNode");
+                    node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex);
+                },
+
+                [&](NextSubpassFunctionNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "NextSubpassFunctionNode");
+                    node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex);
+                },
+
+                [&](RenderPassEndNode &node) {
+                    TRACE_EVENT_INSTANT("gpu", "RenderPassEndNode");
+                    node(slot->commandBuffer, slot->cycle, gpu);
+                },
             }, node);
             #undef NODE
         }
@@ -258,11 +293,35 @@ namespace skyline::gpu::interconnect {
         condition.notify_all();
     }
 
+    // Thread body: repeatedly reads the u32 at the start of the debug tracing buffer and emits a trace event per new checkpoint ID; the loop has no exit condition
+    void CheckpointPollerThread::Run() {
+        u32 prevCheckpoint{};
+        for (size_t iteration{}; true; iteration++) {
+            u32 curCheckpoint{state.gpu->debugTracingBuffer.as<u32>()};
+
+            if ((iteration % 1024) == 0)
+                Logger::Info("Current Checkpoint: {}", curCheckpoint);
+
+            while (prevCheckpoint != curCheckpoint) {
+                // Make sure to report an event for every checkpoint in between the previous and current values, to ensure the perfetto trace is consistent
+                prevCheckpoint++;
+                TRACE_EVENT_INSTANT("gpu", "Checkpoint", "id", prevCheckpoint, [&](perfetto::EventContext ctx) {
+                    ctx.event()->add_terminating_flow_ids(prevCheckpoint);
+                });
+            }
+            // The loop above only exits once prevCheckpoint == curCheckpoint, so no explicit re-sync assignment is needed here
+            std::this_thread::sleep_for(std::chrono::microseconds(5));
+        }
+    }
+
+    CheckpointPollerThread::CheckpointPollerThread(const DeviceState &state) : state{state}, thread{&CheckpointPollerThread::Run, this} {} // Stores the device state reference and immediately starts a thread executing Run() on this object
+
     CommandExecutor::CommandExecutor(const DeviceState &state)
         : state{state},
           gpu{*state.gpu},
           recordThread{state},
           waiterThread{state},
+          checkpointPollerThread{EnableGpuCheckpoints ? std::optional<CheckpointPollerThread>{state} : std::optional<CheckpointPollerThread>{}}, // The poller is only constructed when EnableGpuCheckpoints is true; otherwise the optional stays empty
           tag{AllocateTag()} {
         RotateRecordSlot();
     }
@@ -512,6 +571,21 @@ namespace skyline::gpu::interconnect {
             callback();
     }
 
+    u32 CommandExecutor::AddCheckpointImpl(std::string_view annotation) {
+        if (renderPass)
+            FinishRenderPass();
+        // Any active render pass was finished above; pass the ID's bytes to megaBufferAllocator.Push and queue a CheckpointNode built from the result and the ID
+        slot->nodes.emplace_back(node::CheckpointNode{gpu.megaBufferAllocator.Push(cycle, span<u32>(&nextCheckpointId, 1).cast<u8>()), nextCheckpointId});
+        // Emit a trace event tagged with the ID (as an argument and as a flow ID) plus the caller's annotation string
+        TRACE_EVENT_INSTANT("gpu", "Mark Checkpoint", "id", nextCheckpointId, "annotation", [&annotation](perfetto::TracedValue context) {
+            std::move(context).WriteString(annotation.data(), annotation.size());
+        }, [&](perfetto::EventContext ctx) {
+            ctx.event()->add_flow_ids(nextCheckpointId);
+        });
+        // Post-increment: return the ID that was just recorded and advance the counter for the next checkpoint
+        return nextCheckpointId++;
+    }
+
     void CommandExecutor::SubmitInternal() {
         if (renderPass)
             FinishRenderPass();
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index 9eaf5997..024fe133 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -12,6 +12,8 @@
 #include "common/spin_lock.h"
 
 namespace skyline::gpu::interconnect {
+    constexpr bool EnableGpuCheckpoints{false}; //!< Whether to enable GPU debugging checkpoints (WILL DECREASE PERF SIGNIFICANTLY)
+
     /*
      * @brief Thread responsible for recording Vulkan commands from the execution nodes and submitting them
      */
@@ -119,6 +121,20 @@ namespace skyline::gpu::interconnect {
         void Queue(std::shared_ptr<FenceCycle> cycle, std::function<void()> &&callback);
     };
 
+    /**
+     * @brief Polls the debug buffer for checkpoint updates and reports them to perfetto
+     */
+    class CheckpointPollerThread {
+      private:
+        const DeviceState &state; //!< Device state captured by reference at construction
+        std::thread thread; //!< Thread executing Run(); NOTE(review): this class declares no destructor that joins or detaches it, so destroying the object while the thread is joinable would call std::terminate — confirm this is acceptable
+
+        void Run(); //!< Entry point of the polling thread
+
+      public:
+        CheckpointPollerThread(const DeviceState &state); //!< Starts the polling thread
+    };
+
     /**
      * @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
      * @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
@@ -130,6 +146,7 @@ namespace skyline::gpu::interconnect {
         CommandRecordThread recordThread;
         CommandRecordThread::Slot *slot{};
         ExecutionWaiterThread waiterThread;
+        std::optional<CheckpointPollerThread> checkpointPollerThread;
         node::RenderPassNode *renderPass{};
         size_t subpassCount{}; //!< The number of subpasses in the current render pass
         u32 renderPassIndex{};
@@ -183,6 +200,8 @@ namespace skyline::gpu::interconnect {
         std::vector<std::function<void()>> flushCallbacks; //!< Set of persistent callbacks that will be called at the start of Execute in order to flush data required for recording
         std::vector<std::function<void()>> pipelineChangeCallbacks; //!< Set of persistent callbacks that will be called after any non-Maxwell 3D engine changes the active pipeline
 
+        u32 nextCheckpointId{}; //!< The ID of the next debug checkpoint to be allocated
+
         void RotateRecordSlot();
 
         /**
@@ -211,6 +230,11 @@ namespace skyline::gpu::interconnect {
 
         void AttachBufferBase(std::shared_ptr<Buffer> buffer);
 
+        /**
+         * @brief Non-gated implementation of `AddCheckpoint`
+         */
+        u32 AddCheckpointImpl(std::string_view annotation);
+
       public:
         std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
         LinearAllocatorState<> *allocator;
@@ -305,6 +329,18 @@ namespace skyline::gpu::interconnect {
          */
         void NotifyPipelineChange();
 
+        /**
+         * @brief Records a checkpoint into the GPU command stream at the current position
+         * @param annotation A string annotation to display in perfetto for this checkpoint
+         * @return The checkpoint ID, or 0 when `EnableGpuCheckpoints` is false (note that IDs are also allocated starting from 0)
+         */
+        u32 AddCheckpoint(std::string_view annotation) {
+            if constexpr (EnableGpuCheckpoints)
+                return AddCheckpointImpl(annotation);
+            else
+                return 0; // Checkpoints are compiled out: nothing is recorded
+        }
+
         /**
          * @brief Execute all the nodes and submit the resulting command buffer to the GPU
          * @param callback A function to call upon GPU completion of the submission
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h
index 6507feae..1f102009 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h
@@ -122,5 +122,13 @@ namespace skyline::gpu::interconnect::node {
         }
     };
 
-    using NodeVariant = std::variant<FunctionNode, RenderPassNode, NextSubpassNode, SubpassFunctionNode, NextSubpassFunctionNode, RenderPassEndNode>; //!< A variant encompassing all command nodes types
+    /**
+     * @brief A node which copies the contained ID value to the debug tracing buffer
+     */
+    struct CheckpointNode {
+        BufferBinding binding; //!< Binding for a GPU-side buffer containing the checkpoint ID
+        u32 id; //!< The checkpoint ID, attached to the trace event emitted when this node is recorded
+    };
+
+    using NodeVariant = std::variant<FunctionNode, CheckpointNode, RenderPassNode, NextSubpassNode, SubpassFunctionNode, NextSubpassFunctionNode, RenderPassEndNode>; //!< A variant encompassing all command nodes types
 }
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp
index 51c841eb..da618587 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp
@@ -158,6 +158,8 @@ namespace skyline::gpu::interconnect {
                                     vk::PipelineStageFlagBits::eAllGraphics, vk::PipelineStageFlagBits::eAllGraphics);
             }
         );
+        executor.AddCheckpoint("After blit");
+
 
         executor.NotifyPipelineChange();
     }
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp
index ecf391bc..d428a67f 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp
@@ -65,6 +65,7 @@ namespace skyline::gpu::interconnect::kepler_compute {
         auto *drawParams{ctx.executor.allocator->EmplaceUntracked<DrawParams>(DrawParams{stateUpdater, {qmd.ctaRasterWidth, qmd.ctaRasterHeight, qmd.ctaRasterDepth}, srcStageMask, dstStageMask})};
 
 
+        ctx.executor.AddCheckpoint("Before dispatch");
         ctx.executor.AddOutsideRpCommand([drawParams](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu) {
             drawParams->stateUpdater.RecordAll(gpu, commandBuffer);
 
@@ -76,5 +77,6 @@ namespace skyline::gpu::interconnect::kepler_compute {
 
             commandBuffer.dispatch(drawParams->dimensions[0], drawParams->dimensions[1], drawParams->dimensions[2]);
         });
+        ctx.executor.AddCheckpoint("After dispatch");
     }
 }
\ No newline at end of file
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
index ee51b379..289f0de6 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
@@ -64,6 +64,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
                 callbackData.view.GetBuffer()->BlockAllCpuBackingWrites();
 
                 auto srcGpuAllocation{callbackData.ctx.gpu.megaBufferAllocator.Push(callbackData.ctx.executor.cycle, callbackData.srcCpuBuf)};
+                callbackData.ctx.executor.AddCheckpoint("Before constant buffer load");
                 callbackData.ctx.executor.AddOutsideRpCommand([=, srcCpuBuf = callbackData.srcCpuBuf, view = callbackData.view, offset = callbackData.offset](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu) {
                     auto binding{view.GetBinding(gpu)};
                     vk::BufferCopy copyRegion{
@@ -77,6 +78,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
                         .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
                     }, {}, {});
                 });
+                callbackData.ctx.executor.AddCheckpoint("After constant buffer load");
             });
         }
     }
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
index f8074bb9..3dfc68a5 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp
@@ -207,6 +207,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
             return;
 
         TRACE_EVENT("gpu", "Maxwell3D::Clear");
+        ctx.executor.AddCheckpoint("Before clear");
 
         auto needsAttachmentClearCmd{[&](auto &view) {
             return scissor.offset.x != 0 || scissor.offset.y != 0 ||
@@ -281,13 +282,14 @@ namespace skyline::gpu::interconnect::maxwell3d {
             }
         }
 
-        if (clearAttachments.empty())
-            return;
+        if (!clearAttachments.empty()) {
+            std::array<TextureView *, 1> colorAttachments{colorView ? &*colorView : nullptr};
+            ctx.executor.AddSubpass([clearAttachments, clearRects](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
+                commandBuffer.clearAttachments(clearAttachments, span(clearRects).first(clearAttachments.size()));
+            }, renderArea, {}, {}, colorView ? colorAttachments : span<TextureView *>{}, depthStencilView ? &*depthStencilView : nullptr);
+        }
 
-        std::array<TextureView *, 1> colorAttachments{colorView ? &*colorView : nullptr};
-        ctx.executor.AddSubpass([clearAttachments, clearRects](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
-            commandBuffer.clearAttachments(clearAttachments, span(clearRects).first(clearAttachments.size()));
-        }, renderArea, {}, {}, colorView ? colorAttachments : span<TextureView *>{}, depthStencilView ? &*depthStencilView : nullptr);
+        ctx.executor.AddCheckpoint("After clear");
     }
 
     void Maxwell3D::Draw(engine::DrawTopology topology, bool transformFeedbackEnable, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance) {
@@ -333,6 +335,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
 
 
         constantBuffers.ResetQuickBind();
+        ctx.executor.AddCheckpoint("Before draw");
         ctx.executor.AddSubpass([drawParams](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &gpu, vk::RenderPass, u32) {
             drawParams->stateUpdater.RecordAll(gpu, commandBuffer);
 
@@ -347,5 +350,6 @@ namespace skyline::gpu::interconnect::maxwell3d {
             if (drawParams->transformFeedbackEnable)
                 commandBuffer.endTransformFeedbackEXT(0, {}, {});
         }, scissor, activeDescriptorSetSampledImages, {}, activeState.GetColorAttachments(), activeState.GetDepthAttachment(), !ctx.gpu.traits.quirks.relaxedRenderPassCompatibility, srcStageMask, dstStageMask);
+        ctx.executor.AddCheckpoint("After draw");
     }
 }
\ No newline at end of file