diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h index 2b2db8f5..940703db 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.h +++ b/app/src/main/cpp/skyline/gpu/buffer.h @@ -31,6 +31,8 @@ namespace skyline::gpu { BufferBinding(vk::Buffer buffer, vk::DeviceSize offset = 0, vk::DeviceSize size = 0) : buffer{buffer}, offset{offset}, size{size} {} + BufferBinding(MegaBufferAllocator::Allocation allocation) : buffer{allocation.buffer}, offset{allocation.offset}, size{allocation.region.size()} {} + operator bool() const { return buffer; } diff --git a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp index cbe45632..a80602e7 100644 --- a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp +++ b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp @@ -3,7 +3,9 @@ #include #include +#include #include "command_scheduler.h" +#include "common/exception.h" namespace skyline::gpu { void CommandScheduler::WaiterThread() { @@ -91,16 +93,22 @@ namespace skyline::gpu { fullSignalSemaphores.push_back(cycle->semaphore); { - std::scoped_lock lock{gpu.queueMutex}; - gpu.vkQueue.submit(vk::SubmitInfo{ - .commandBufferCount = 1, - .pCommandBuffers = &*commandBuffer, - .waitSemaphoreCount = static_cast(fullWaitSemaphores.size()), - .pWaitSemaphores = fullWaitSemaphores.data(), - .pWaitDstStageMask = fullWaitStages.data(), - .signalSemaphoreCount = static_cast(fullSignalSemaphores.size()), - .pSignalSemaphores = fullSignalSemaphores.data(), - }, cycle->fence); + try { + std::scoped_lock lock{gpu.queueMutex}; + gpu.vkQueue.submit(vk::SubmitInfo{ + .commandBufferCount = 1, + .pCommandBuffers = &*commandBuffer, + .waitSemaphoreCount = static_cast(fullWaitSemaphores.size()), + .pWaitSemaphores = fullWaitSemaphores.data(), + .pWaitDstStageMask = fullWaitStages.data(), + .signalSemaphoreCount = static_cast(fullSignalSemaphores.size()), + .pSignalSemaphores = fullSignalSemaphores.data(), + }, 
cycle->fence); + } catch (const vk::DeviceLostError &e) { + // Wait 5 seconds to give traces etc. time to settle + std::this_thread::sleep_for(std::chrono::seconds(5)); + throw exception("Vulkan device lost!"); + } } cycle->NotifySubmitted(); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp index 5d286751..02dcdfcc 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MPL-2.0 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/) +#include #include #include #include @@ -97,31 +98,65 @@ namespace skyline::gpu::interconnect { } void CommandRecordThread::ProcessSlot(Slot *slot) { - TRACE_EVENT_FMT("gpu", "ProcessSlot: 0x{:X}, execution: {}", slot, slot->executionTag); + TRACE_EVENT_FMT("gpu", "ProcessSlot: 0x{:X}, execution: {}", slot, u64{slot->executionTag}); auto &gpu{*state.gpu}; + std::scoped_lock lock{gpu.buffer.recreationMutex}; vk::RenderPass lRenderPass; u32 subpassIndex; using namespace node; for (NodeVariant &node : slot->nodes) { - #define NODE(name) [&](name& node) { node(slot->commandBuffer, slot->cycle, gpu); } std::visit(VariantVisitor{ - NODE(FunctionNode), + [&](FunctionNode &node) { + TRACE_EVENT_INSTANT("gpu", "FunctionNode"); + node(slot->commandBuffer, slot->cycle, gpu); + }, + + [&](CheckpointNode &node) { + RecordFullBarrier(slot->commandBuffer); + + TRACE_EVENT_INSTANT("gpu", "CheckpointNode", "id", node.id, [&](perfetto::EventContext ctx) { + ctx.event()->add_flow_ids(node.id); + }); + + std::array copy{vk::BufferCopy{ + .size = node.binding.size, + .srcOffset = node.binding.offset, + .dstOffset = 0, + }}; + + slot->commandBuffer.copyBuffer(node.binding.buffer, gpu.debugTracingBuffer.vkBuffer, copy); + + RecordFullBarrier(slot->commandBuffer); + }, [&](RenderPassNode &node) { + 
TRACE_EVENT_INSTANT("gpu", "RenderPassNode"); lRenderPass = node(slot->commandBuffer, slot->cycle, gpu); subpassIndex = 0; }, [&](NextSubpassNode &node) { + TRACE_EVENT_INSTANT("gpu", "NextSubpassNode"); node(slot->commandBuffer, slot->cycle, gpu); ++subpassIndex; }, - [&](SubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex); }, - [&](NextSubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex); }, - NODE(RenderPassEndNode), + [&](SubpassFunctionNode &node) { + TRACE_EVENT_INSTANT("gpu", "SubpassFunctionNode"); + node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex); + }, + + [&](NextSubpassFunctionNode &node) { + TRACE_EVENT_INSTANT("gpu", "NextSubpassFunctionNode"); + node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex); + }, + + [&](RenderPassEndNode &node) { + TRACE_EVENT_INSTANT("gpu", "RenderPassEndNode"); + node(slot->commandBuffer, slot->cycle, gpu); + }, }, node); #undef NODE } @@ -258,11 +293,35 @@ namespace skyline::gpu::interconnect { condition.notify_all(); } + void CheckpointPollerThread::Run() { + u32 prevCheckpoint{}; + for (size_t iteration{}; true; iteration++) { + u32 curCheckpoint{state.gpu->debugTracingBuffer.as()}; + + if ((iteration % 1024) == 0) + Logger::Info("Current Checkpoint: {}", curCheckpoint); + + while (prevCheckpoint != curCheckpoint) { + // Make sure to report an event for every checkpoint in between the previous and current values, to ensure the perfetto trace is consistent + prevCheckpoint++; + TRACE_EVENT_INSTANT("gpu", "Checkpoint", "id", prevCheckpoint, [&](perfetto::EventContext ctx) { + ctx.event()->add_terminating_flow_ids(prevCheckpoint); + }); + } + + prevCheckpoint = curCheckpoint; + std::this_thread::sleep_for(std::chrono::microseconds(5)); + } + } + + CheckpointPollerThread::CheckpointPollerThread(const DeviceState &state) : state{state}, thread{&CheckpointPollerThread::Run, this} {} + 
CommandExecutor::CommandExecutor(const DeviceState &state) : state{state}, gpu{*state.gpu}, recordThread{state}, waiterThread{state}, + checkpointPollerThread{EnableGpuCheckpoints ? std::optional{state} : std::optional{}}, tag{AllocateTag()} { RotateRecordSlot(); } @@ -512,6 +571,21 @@ namespace skyline::gpu::interconnect { callback(); } + u32 CommandExecutor::AddCheckpointImpl(std::string_view annotation) { + if (renderPass) + FinishRenderPass(); + + slot->nodes.emplace_back(node::CheckpointNode{gpu.megaBufferAllocator.Push(cycle, span(&nextCheckpointId, 1).cast()), nextCheckpointId}); + + TRACE_EVENT_INSTANT("gpu", "Mark Checkpoint", "id", nextCheckpointId, "annotation", [&annotation](perfetto::TracedValue context) { + std::move(context).WriteString(annotation.data(), annotation.size()); + }, [&](perfetto::EventContext ctx) { + ctx.event()->add_flow_ids(nextCheckpointId); + }); + + return nextCheckpointId++; + } + void CommandExecutor::SubmitInternal() { if (renderPass) FinishRenderPass(); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h index 9eaf5997..024fe133 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h @@ -12,6 +12,8 @@ #include "common/spin_lock.h" namespace skyline::gpu::interconnect { + constexpr bool EnableGpuCheckpoints{false}; //!< Whether to enable GPU debugging checkpoints (WILL DECREASE PERF SIGNIFICANTLY) + /* * @brief Thread responsible for recording Vulkan commands from the execution nodes and submitting them */ @@ -119,6 +121,20 @@ namespace skyline::gpu::interconnect { void Queue(std::shared_ptr cycle, std::function &&callback); }; + /** + * @brief Polls the debug buffer for checkpoint updates and reports them to perfetto + */ + class CheckpointPollerThread { + private: + const DeviceState &state; + std::thread thread; + + void Run(); + + public: + 
CheckpointPollerThread(const DeviceState &state); + }; + /** * @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph * @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread @@ -130,6 +146,7 @@ namespace skyline::gpu::interconnect { CommandRecordThread recordThread; CommandRecordThread::Slot *slot{}; ExecutionWaiterThread waiterThread; + std::optional checkpointPollerThread; node::RenderPassNode *renderPass{}; size_t subpassCount{}; //!< The number of subpasses in the current render pass u32 renderPassIndex{}; @@ -183,6 +200,8 @@ namespace skyline::gpu::interconnect { std::vector> flushCallbacks; //!< Set of persistent callbacks that will be called at the start of Execute in order to flush data required for recording std::vector> pipelineChangeCallbacks; //!< Set of persistent callbacks that will be called after any non-Maxwell 3D engine changes the active pipeline + u32 nextCheckpointId{}; //!< The ID of the next debug checkpoint to be allocated + void RotateRecordSlot(); /** @@ -211,6 +230,11 @@ namespace skyline::gpu::interconnect { void AttachBufferBase(std::shared_ptr buffer); + /** + * @brief Non-gated implementation of `AddCheckpoint` + */ + u32 AddCheckpointImpl(std::string_view annotation); + public: std::shared_ptr cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands LinearAllocatorState<> *allocator; @@ -305,6 +329,18 @@ namespace skyline::gpu::interconnect { */ void NotifyPipelineChange(); + /** + * @brief Records a checkpoint into the GPU command stream at the current position + * @param annotation A string annotation to display in perfetto for this checkpoint + * @return The checkpoint ID + */ + u32 AddCheckpoint(std::string_view annotation) { + if constexpr (EnableGpuCheckpoints) + return AddCheckpointImpl(annotation); + else + return 0; + } + /** * @brief Execute all the nodes and submit the resulting command buffer to 
the GPU * @param callback A function to call upon GPU completion of the submission diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h index 6507feae..1f102009 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h @@ -122,5 +122,13 @@ namespace skyline::gpu::interconnect::node { } }; - using NodeVariant = std::variant; //!< A variant encompassing all command nodes types + /** + * @brief A node which copies the contained ID value to the debug tracking buffer + */ + struct CheckpointNode { + BufferBinding binding; //!< Binding for a GPU-side buffer containing the checkpoint ID + u32 id; + }; + + using NodeVariant = std::variant; //!< A variant encompassing all command nodes types } diff --git a/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp index 51c841eb..da618587 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp @@ -158,6 +158,8 @@ namespace skyline::gpu::interconnect { vk::PipelineStageFlagBits::eAllGraphics, vk::PipelineStageFlagBits::eAllGraphics); } ); + executor.AddCheckpoint("After blit"); + executor.NotifyPipelineChange(); } diff --git a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp index ecf391bc..d428a67f 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp @@ -65,6 +65,7 @@ namespace skyline::gpu::interconnect::kepler_compute { auto *drawParams{ctx.executor.allocator->EmplaceUntracked(DrawParams{stateUpdater, {qmd.ctaRasterWidth, qmd.ctaRasterHeight, qmd.ctaRasterDepth}, srcStageMask, dstStageMask})}; + 
ctx.executor.AddCheckpoint("Before dispatch"); ctx.executor.AddOutsideRpCommand([drawParams](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr &, GPU &gpu) { drawParams->stateUpdater.RecordAll(gpu, commandBuffer); @@ -76,5 +77,6 @@ namespace skyline::gpu::interconnect::kepler_compute { commandBuffer.dispatch(drawParams->dimensions[0], drawParams->dimensions[1], drawParams->dimensions[2]); }); + ctx.executor.AddCheckpoint("After dispatch"); } } \ No newline at end of file diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp index ee51b379..289f0de6 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp @@ -64,6 +64,7 @@ namespace skyline::gpu::interconnect::maxwell3d { callbackData.view.GetBuffer()->BlockAllCpuBackingWrites(); auto srcGpuAllocation{callbackData.ctx.gpu.megaBufferAllocator.Push(callbackData.ctx.executor.cycle, callbackData.srcCpuBuf)}; + callbackData.ctx.executor.AddCheckpoint("Before constant buffer load"); callbackData.ctx.executor.AddOutsideRpCommand([=, srcCpuBuf = callbackData.srcCpuBuf, view = callbackData.view, offset = callbackData.offset](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr &, GPU &gpu) { auto binding{view.GetBinding(gpu)}; vk::BufferCopy copyRegion{ @@ -77,6 +78,7 @@ namespace skyline::gpu::interconnect::maxwell3d { .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, {}, {}); }); + callbackData.ctx.executor.AddCheckpoint("After constant buffer load"); }); } } diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp index f8074bb9..3dfc68a5 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp +++ 
b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp @@ -207,6 +207,7 @@ namespace skyline::gpu::interconnect::maxwell3d { return; TRACE_EVENT("gpu", "Maxwell3D::Clear"); + ctx.executor.AddCheckpoint("Before clear"); auto needsAttachmentClearCmd{[&](auto &view) { return scissor.offset.x != 0 || scissor.offset.y != 0 || @@ -281,13 +282,14 @@ namespace skyline::gpu::interconnect::maxwell3d { } } - if (clearAttachments.empty()) - return; + if (!clearAttachments.empty()) { + std::array colorAttachments{colorView ? &*colorView : nullptr}; + ctx.executor.AddSubpass([clearAttachments, clearRects](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr &, GPU &, vk::RenderPass, u32) { + commandBuffer.clearAttachments(clearAttachments, span(clearRects).first(clearAttachments.size())); + }, renderArea, {}, {}, colorView ? colorAttachments : span{}, depthStencilView ? &*depthStencilView : nullptr); + } - std::array colorAttachments{colorView ? &*colorView : nullptr}; - ctx.executor.AddSubpass([clearAttachments, clearRects](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr &, GPU &, vk::RenderPass, u32) { - commandBuffer.clearAttachments(clearAttachments, span(clearRects).first(clearAttachments.size())); - }, renderArea, {}, {}, colorView ? colorAttachments : span{}, depthStencilView ? 
&*depthStencilView : nullptr); + ctx.executor.AddCheckpoint("After clear"); } void Maxwell3D::Draw(engine::DrawTopology topology, bool transformFeedbackEnable, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance) { @@ -333,6 +335,7 @@ namespace skyline::gpu::interconnect::maxwell3d { constantBuffers.ResetQuickBind(); + ctx.executor.AddCheckpoint("Before draw"); ctx.executor.AddSubpass([drawParams](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr &, GPU &gpu, vk::RenderPass, u32) { drawParams->stateUpdater.RecordAll(gpu, commandBuffer); @@ -347,5 +350,6 @@ namespace skyline::gpu::interconnect::maxwell3d { if (drawParams->transformFeedbackEnable) commandBuffer.endTransformFeedbackEXT(0, {}, {}); }, scissor, activeDescriptorSetSampledImages, {}, activeState.GetColorAttachments(), activeState.GetDepthAttachment(), !ctx.gpu.traits.quirks.relaxedRenderPassCompatibility, srcStageMask, dstStageMask); + ctx.executor.AddCheckpoint("After draw"); } } \ No newline at end of file