From 7c9212743ce2c7b90c10da42b28251b058cedc23 Mon Sep 17 00:00:00 2001
From: Billy Laws
Date: Mon, 19 Sep 2022 14:38:36 +0100
Subject: [PATCH] Implement asynchronous command recording

Recording of command nodes into Vulkan command buffers is very easily
parallelisable as it can effectively be treated as part of the GPU execution,
which is inherently async. By moving it to a separate thread we can shave off
about 20% of GPFIFO execution time.

It should be noted that the command scheduler command buffer infra is no longer
used, since we need to record texture updates on the GPFIFO thread (while
another slot is being recorded on the record thread) and then use the same
command buffer on the record thread later. This ends up requiring a pool per
slot, which is reasonable considering we only have four slots by default.
---
 .../gpu/interconnect/command_executor.cpp   | 234 ++++++++++++------
 .../gpu/interconnect/command_executor.h     |  65 ++++-
 .../interconnect/maxwell_3d/maxwell_3d.cpp  |   2 +-
 .../maxwell_3d/pipeline_manager.cpp         |  14 +-
 4 files changed, 218 insertions(+), 97 deletions(-)

diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index 213fab86..e8214fa6 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -1,16 +1,137 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)

+#include 
 #include 
 #include "command_executor.h"

 namespace skyline::gpu::interconnect {
- CommandExecutor::CommandExecutor(const DeviceState &state) : gpu{*state.gpu}, activeCommandBuffer{gpu.scheduler.AllocateCommandBuffer()}, cycle{activeCommandBuffer.GetFenceCycle()}, tag{AllocateTag()} {}
+ CommandRecordThread::CommandRecordThread(const DeviceState &state) : state{state}, thread{&CommandRecordThread::Run, this} {}
+
+ static vk::raii::CommandBuffer AllocateRaiiCommandBuffer(GPU &gpu, vk::raii::CommandPool &pool) {
+ return {gpu.vkDevice, (*gpu.vkDevice).allocateCommandBuffers(
+ {
+ .commandPool = *pool,
+ .level = vk::CommandBufferLevel::ePrimary,
+ .commandBufferCount = 1
+ }, *gpu.vkDevice.getDispatcher()).front(),
+ *pool};
+ }
+
+ CommandRecordThread::Slot::Slot(GPU &gpu)
+ : commandPool{gpu.vkDevice,
+ vk::CommandPoolCreateInfo{
+ .flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer | vk::CommandPoolCreateFlagBits::eTransient,
+ .queueFamilyIndex = gpu.vkQueueFamilyIndex
+ }
+ },
+ commandBuffer{AllocateRaiiCommandBuffer(gpu, commandPool)},
+ fence{gpu.vkDevice, vk::FenceCreateInfo{ .flags = vk::FenceCreateFlagBits::eSignaled }},
+ cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, true)} {}
+
+ std::shared_ptr<FenceCycle> CommandRecordThread::Slot::Reset(GPU &gpu) {
+ cycle->Wait();
+ cycle = std::make_shared<FenceCycle>(gpu.vkDevice, *fence);
+ commandBuffer.reset();
+ return cycle;
+ }
+
+ void CommandRecordThread::ProcessSlot(Slot *slot) {
+ auto &gpu{*state.gpu};
+
+ vk::RenderPass lRenderPass;
+ u32 subpassIndex;
+
+ std::scoped_lock bufferLock{gpu.buffer.recreationMutex};
+ using namespace node;
+ for (NodeVariant &node : slot->nodes) {
+ #define NODE(name) [&](name& node) { node(slot->commandBuffer, slot->cycle, gpu); }
+ std::visit(VariantVisitor{
+ NODE(FunctionNode),
+
+ [&](RenderPassNode &node) {
+ lRenderPass = node(slot->commandBuffer, slot->cycle, gpu);
+ subpassIndex = 0;
+ },
+
+ [&](NextSubpassNode &node) {
+ node(slot->commandBuffer, slot->cycle, gpu);
++subpassIndex; + }, + [&](SubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex); }, + [&](NextSubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex); }, + + NODE(RenderPassEndNode), + }, node); + #undef NODE + } + + slot->commandBuffer.end(); + + gpu.scheduler.SubmitCommandBuffer(slot->commandBuffer, slot->cycle); + + slot->nodes.clear(); + slot->allocator.Reset(); + } + + void CommandRecordThread::Run() { + auto &gpu{*state.gpu}; + std::array slots{{gpu, gpu, gpu, gpu}}; + outgoing.AppendTranform(span(slots), [](auto &slot) { return &slot; }); + + if (int result{pthread_setname_np(pthread_self(), "Sky-CmdRecord")}) + Logger::Warn("Failed to set the thread name: {}", strerror(result)); + + try { + signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler); + + incoming.Process([this](Slot *slot) { + ProcessSlot(slot); + outgoing.Push(slot); + }, [] {}); + } catch (const signal::SignalException &e) { + Logger::Error("{}\nStack Trace:{}", e.what(), state.loader->GetStackTrace(e.frames)); + if (state.process) + state.process->Kill(false); + else + std::rethrow_exception(std::current_exception()); + } catch (const std::exception &e) { + Logger::Error(e.what()); + if (state.process) + state.process->Kill(false); + else + std::rethrow_exception(std::current_exception()); + } + } + + CommandRecordThread::Slot *CommandRecordThread::AcquireSlot() { + return outgoing.Pop(); + } + + void CommandRecordThread::ReleaseSlot(Slot *slot) { + incoming.Push(slot); + } + + CommandExecutor::CommandExecutor(const DeviceState &state) + : gpu{*state.gpu}, + recordThread{state}, + tag{AllocateTag()} { + RotateRecordSlot(); + } CommandExecutor::~CommandExecutor() { cycle->Cancel(); } + void CommandExecutor::RotateRecordSlot() { + if (slot) + recordThread.ReleaseSlot(slot); + + slot = recordThread.AcquireSlot(); + cycle = slot->Reset(gpu); + allocator = &slot->allocator; + } + TextureManager &CommandExecutor::AcquireTextureManager() { if (!textureManagerLock) textureManagerLock.emplace(gpu.texture); @@ -55,8 +176,8 @@ namespace skyline::gpu::interconnect { if (renderPass == nullptr || renderPass->renderArea != renderArea || subpassCount >= gpu.traits.quirks.maxSubpassCount) { // We need to create a render pass if one doesn't already exist or the current one isn't compatible if (renderPass != nullptr) - nodes.emplace_back(std::in_place_type_t()); - renderPass = &std::get(nodes.emplace_back(std::in_place_type_t(), renderArea)); + slot->nodes.emplace_back(std::in_place_type_t()); + renderPass = &std::get(slot->nodes.emplace_back(std::in_place_type_t(), renderArea)); addSubpass(); subpassCount = 1; return false; @@ -77,7 +198,7 @@ namespace skyline::gpu::interconnect { void CommandExecutor::FinishRenderPass() { if (renderPass) { - nodes.emplace_back(std::in_place_type_t()); + slot->nodes.emplace_back(std::in_place_type_t()); renderPass = nullptr; subpassCount = 0; @@ -168,9 +289,9 @@ namespace skyline::gpu::interconnect { bool gotoNext{CreateRenderPassWithSubpass(renderArea, inputAttachments, colorAttachments, depthStencilAttachment ? 
&*depthStencilAttachment : nullptr)}; if (gotoNext) - nodes.emplace_back(std::in_place_type_t(), std::forward(function)); + slot->nodes.emplace_back(std::in_place_type_t(), std::forward(function)); else - nodes.emplace_back(std::in_place_type_t(), std::forward(function)); + slot->nodes.emplace_back(std::in_place_type_t(), std::forward(function)); if (exclusiveSubpass) FinishRenderPass(); @@ -180,14 +301,14 @@ namespace skyline::gpu::interconnect { if (renderPass) FinishRenderPass(); - nodes.emplace_back(std::in_place_type_t(), std::forward(function)); + slot->nodes.emplace_back(std::in_place_type_t(), std::forward(function)); } void CommandExecutor::AddClearColorSubpass(TextureView *attachment, const vk::ClearColorValue &value) { bool gotoNext{CreateRenderPassWithSubpass(vk::Rect2D{.extent = attachment->texture->dimensions}, {}, attachment, nullptr)}; if (renderPass->ClearColorAttachment(0, value, gpu)) { if (gotoNext) - nodes.emplace_back(std::in_place_type_t()); + slot->nodes.emplace_back(std::in_place_type_t()); } else { auto function{[scissor = attachment->texture->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr &, GPU &, vk::RenderPass, u32) { commandBuffer.clearAttachments(vk::ClearAttachment{ @@ -202,9 +323,9 @@ namespace skyline::gpu::interconnect { }}; if (gotoNext) - nodes.emplace_back(std::in_place_type_t(), function); + slot->nodes.emplace_back(std::in_place_type_t(), function); else - nodes.emplace_back(std::in_place_type_t(), function); + slot->nodes.emplace_back(std::in_place_type_t(), function); } } @@ -212,7 +333,7 @@ namespace skyline::gpu::interconnect { bool gotoNext{CreateRenderPassWithSubpass(vk::Rect2D{.extent = attachment->texture->dimensions}, {}, {}, attachment)}; if (renderPass->ClearDepthStencilAttachment(value, gpu)) { if (gotoNext) - nodes.emplace_back(std::in_place_type_t()); + slot->nodes.emplace_back(std::in_place_type_t()); } else { auto function{[aspect = attachment->format->vkAspect, extent = attachment->texture->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr &, GPU &, vk::RenderPass, u32) { commandBuffer.clearAttachments(vk::ClearAttachment{ @@ -226,9 +347,9 @@ namespace skyline::gpu::interconnect { }}; if (gotoNext) - nodes.emplace_back(std::in_place_type_t(), function); + slot->nodes.emplace_back(std::in_place_type_t(), function); else - nodes.emplace_back(std::in_place_type_t(), function); + slot->nodes.emplace_back(std::in_place_type_t(), function); } } @@ -241,13 +362,12 @@ namespace skyline::gpu::interconnect { FinishRenderPass(); { - auto &commandBuffer{*activeCommandBuffer}; - commandBuffer.begin(vk::CommandBufferBeginInfo{ + slot->commandBuffer.begin(vk::CommandBufferBeginInfo{ .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit, }); // We need this barrier here to ensure that resources are in the state we expect them to be in, we shouldn't overwrite resources while prior commands might still be using them or read from them while they might be modified by prior commands - commandBuffer.pipelineBarrier( + slot->commandBuffer.pipelineBarrier( vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{ .srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite, .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite, @@ -255,57 +375,27 @@ namespace skyline::gpu::interconnect { ); for (const auto &texture : attachedTextures) - texture->SynchronizeHostInline(commandBuffer, cycle, 
true); + texture->SynchronizeHostInline(slot->commandBuffer, cycle, true); + } - vk::RenderPass lRenderPass; - u32 subpassIndex; + for (const auto &attachedBuffer : attachedBuffers) + if (attachedBuffer->SequencedCpuBackingWritesBlocked()) + attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer - using namespace node; - for (NodeVariant &node : nodes) { - #define NODE(name) [&](name& node) { node(commandBuffer, cycle, gpu); } - std::visit(VariantVisitor{ - NODE(FunctionNode), + for (const auto &attachedTexture : attachedTextures) { + // We don't need to attach the Texture to the cycle as a TextureView will already be attached + cycle->ChainCycle(attachedTexture->cycle); + attachedTexture->cycle = cycle; + } - [&](RenderPassNode &node) { - lRenderPass = node(commandBuffer, cycle, gpu); - subpassIndex = 0; - }, - - [&](NextSubpassNode &node) { - node(commandBuffer, cycle, gpu); - ++subpassIndex; - }, - [&](SubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, subpassIndex); }, - [&](NextSubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, ++subpassIndex); }, - - NODE(RenderPassEndNode), - }, node); - #undef NODE - } - - commandBuffer.end(); - - for (const auto &attachedBuffer : attachedBuffers) - if (attachedBuffer->SequencedCpuBackingWritesBlocked()) - attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer, this is done directly prior to submission to prevent stalls - - gpu.scheduler.SubmitCommandBuffer(commandBuffer, cycle); - - nodes.clear(); - - for (const auto &attachedTexture : attachedTextures) { - // We don't need to attach the Texture to the cycle as a TextureView will already be attached - cycle->ChainCycle(attachedTexture->cycle); - attachedTexture->cycle = cycle; - } - - for (const auto &attachedBuffer : attachedBuffers) { - if (attachedBuffer->RequiresCycleAttach() ) { - cycle->AttachObject(attachedBuffer.buffer); - attachedBuffer->UpdateCycle(cycle); - } + for (const auto &attachedBuffer : attachedBuffers) { + if (attachedBuffer->RequiresCycleAttach() ) { + cycle->AttachObject(attachedBuffer.buffer); + attachedBuffer->UpdateCycle(cycle); } } + + RotateRecordSlot(); } void CommandExecutor::ResetInternal() { @@ -314,32 +404,16 @@ namespace skyline::gpu::interconnect { attachedBuffers.clear(); bufferManagerLock.reset(); megaBufferAllocatorLock.reset(); - allocator.Reset(); + allocator->Reset(); } void CommandExecutor::Submit() { for (const auto &callback : flushCallbacks) callback(); - if (!nodes.empty()) { + if (!slot->nodes.empty()) { TRACE_EVENT("gpu", "CommandExecutor::Submit"); SubmitInternal(); - activeCommandBuffer = gpu.scheduler.AllocateCommandBuffer(); - cycle = activeCommandBuffer.GetFenceCycle(); - } - ResetInternal(); - - executionNumber++; - } - - void CommandExecutor::SubmitWithFlush() { - for (const auto &callback : flushCallbacks) - callback(); - - if (!nodes.empty()) { - TRACE_EVENT("gpu", "CommandExecutor::SubmitWithFlush"); - SubmitInternal(); - cycle = activeCommandBuffer.Reset(); } ResetInternal(); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h index 4b6f3a54..e8bf0ee8 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h @@ -10,6 +10,57 @@ #include "command_nodes.h" namespace skyline::gpu::interconnect { + /* + * @brief Thread 
responsible for recording Vulkan commands from the execution nodes and submitting them
+ */
+ class CommandRecordThread {
+ public:
+ /**
+ * @brief Single execution slot, buffered back and forth between the GPFIFO thread and the record thread
+ */
+ struct Slot {
+ vk::raii::CommandPool commandPool; //!< Use one command pool per slot since command buffers from different slots may be recorded into on multiple threads at the same time
+ vk::raii::CommandBuffer commandBuffer;
+ vk::raii::Fence fence;
+ std::shared_ptr<FenceCycle> cycle;
+ boost::container::stable_vector<node::NodeVariant> nodes;
+ LinearAllocatorState<> allocator;
+
+ Slot(GPU &gpu);
+
+ /**
+ * @brief Waits on the fence and resets the command buffer
+ * @return A new fence cycle for the reset command buffer
+ */
+ std::shared_ptr<FenceCycle> Reset(GPU &gpu);
+ };
+
+ private:
+ const DeviceState &state;
+ std::thread thread;
+
+ static constexpr size_t ActiveRecordSlots{4}; //!< Maximum number of simultaneously active slots
+ CircularQueue<Slot *> incoming{ActiveRecordSlots}; //!< Slots pending recording
+ CircularQueue<Slot *> outgoing{ActiveRecordSlots}; //!< Slots that have been submitted; they may still be active on the GPU
+
+ void ProcessSlot(Slot *slot);
+
+ void Run();
+
+ public:
+ CommandRecordThread(const DeviceState &state);
+
+ /**
+ * @return A free slot; `Reset` needs to be called before accessing it
+ */
+ Slot *AcquireSlot();
+
+ /**
+ * @brief Submits a slot to be recorded
+ */
+ void ReleaseSlot(Slot *slot);
+ };
+
 /**
 * @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
 * @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
@@ -17,11 +68,10 @@ namespace skyline::gpu::interconnect {
 class CommandExecutor {
 private:
 GPU &gpu;
- CommandScheduler::ActiveCommandBuffer activeCommandBuffer;
- boost::container::stable_vector<node::NodeVariant> nodes;
+ CommandRecordThread recordThread;
+ CommandRecordThread::Slot *slot{};
 node::RenderPassNode *renderPass{};
 size_t subpassCount{}; //!< The number of subpasses in the current render pass
-
 std::optional> textureManagerLock; //!< The lock on the texture manager; this is locked for the duration of the command execution from the first usage inside an execution to the submission
 std::optional> bufferManagerLock; //!< The lock on the buffer manager, see above for details
 std::optional> megaBufferAllocatorLock; //!< The lock on the megabuffer allocator, see above for details
@@ -72,6 +122,8 @@ namespace skyline::gpu::interconnect {
 std::vector<std::function<void()>> flushCallbacks; //!< Set of persistent callbacks that will be called at the start of Execute in order to flush data required for recording
+ void RotateRecordSlot();
+
 /**
 * @brief Create a new render pass and subpass with the specified attachments, if one doesn't already exist or the current one isn't compatible
 * @note This also checks for subpass coalescing and will merge the new subpass with the previous one when possible
@@ -97,7 +149,7 @@ namespace skyline::gpu::interconnect {
 public:
 std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
- LinearAllocatorState<> allocator;
+ LinearAllocatorState<> *allocator;
 ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag
 size_t executionNumber{};
@@ -193,10 +245,5 @@ namespace skyline::gpu::interconnect {
 * @brief Execute all the nodes and submit the resulting command buffer to the GPU
 */
 void Submit();
-
- /**
- * @brief Execute all the nodes
and submit the resulting command buffer to the GPU then wait for the completion of the command buffer - */ - void SubmitWithFlush(); }; } diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp index b05b9ff2..16704615 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp @@ -163,7 +163,7 @@ namespace skyline::gpu::interconnect::maxwell3d { } void Maxwell3D::Draw(engine::DrawTopology topology, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance) { - StateUpdateBuilder builder{ctx.executor.allocator}; + StateUpdateBuilder builder{*ctx.executor.allocator}; Pipeline *oldPipeline{activeState.GetPipeline()}; activeState.Update(ctx, builder, indexed, topology, count); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp index e60c63ee..d9015c89 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp @@ -615,9 +615,9 @@ namespace skyline::gpu::interconnect::maxwell3d { u32 bufferIdx{}; u32 imageIdx{}; - auto writes{ctx.executor.allocator.AllocateUntracked(descriptorInfo.writeDescCount)}; - auto bufferDescs{ctx.executor.allocator.AllocateUntracked(descriptorInfo.totalBufferDescCount)}; - auto bufferDescViews{ctx.executor.allocator.AllocateUntracked(descriptorInfo.totalBufferDescCount)}; + auto writes{ctx.executor.allocator->AllocateUntracked(descriptorInfo.writeDescCount)}; + auto bufferDescs{ctx.executor.allocator->AllocateUntracked(descriptorInfo.totalBufferDescCount)}; + auto bufferDescViews{ctx.executor.allocator->AllocateUntracked(descriptorInfo.totalBufferDescCount)}; auto writeBufferDescs{[&](vk::DescriptorType type, const auto &descs, u32 count, auto getBufferCb) { if (!descs.empty()) { @@ -658,13 +658,13 @@ namespace skyline::gpu::interconnect::maxwell3d { const auto &cbufUsageInfo{descriptorInfo.cbufUsages[static_cast(quickBind.stage)][quickBind.index]}; const auto &shaderInfo{shaderStages[static_cast(quickBind.stage)].info}; auto &stageConstantBuffers{constantBuffers[static_cast(quickBind.stage)]}; - auto copy{ctx.executor.allocator.AllocateUntracked()}; - auto writes{ctx.executor.allocator.AllocateUntracked(cbufUsageInfo.writeDescCount)}; + auto copy{ctx.executor.allocator->AllocateUntracked()}; + auto writes{ctx.executor.allocator->AllocateUntracked(cbufUsageInfo.writeDescCount)}; size_t writeIdx{}; size_t bufferIdx{}; - auto bufferDescs{ctx.executor.allocator.AllocateUntracked(cbufUsageInfo.totalBufferDescCount)}; - auto bufferDescViews{ctx.executor.allocator.AllocateUntracked(cbufUsageInfo.totalBufferDescCount)}; + auto bufferDescs{ctx.executor.allocator->AllocateUntracked(cbufUsageInfo.totalBufferDescCount)}; + auto bufferDescViews{ctx.executor.allocator->AllocateUntracked(cbufUsageInfo.totalBufferDescCount)}; // TODO: opt this to do partial copy *copy = vk::CopyDescriptorSet{
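The core idea of the patch — a fixed set of slots rotated between the GPFIFO thread and the record thread through an incoming and an outgoing queue — can be pictured with the standalone C++17 sketch below. The BlockingQueue, Slot and thread bodies here are simplified, hypothetical stand-ins written purely for illustration; they are not the CircularQueue/CommandRecordThread types from the patch, and fence/FenceCycle handling is omitted entirely.

// Standalone illustration of the incoming/outgoing slot rotation (C++17).
// BlockingQueue and Slot are hypothetical stand-ins, not the patch's types.
#include <array>
#include <condition_variable>
#include <deque>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

template<typename T>
class BlockingQueue {
  public:
    void Push(T item) {
        {
            std::scoped_lock lock{mutex};
            queue.push_back(std::move(item));
        }
        condition.notify_one();
    }

    T Pop() {
        std::unique_lock lock{mutex};
        condition.wait(lock, [this] { return !queue.empty(); });
        T item{std::move(queue.front())};
        queue.pop_front();
        return item;
    }

  private:
    std::mutex mutex;
    std::condition_variable condition;
    std::deque<T> queue;
};

struct Slot {
    std::vector<std::function<void()>> nodes; // Stand-in for the recorded command node graph
};

int main() {
    constexpr size_t SlotCount{4}; // Mirrors the four default slots mentioned in the commit message
    constexpr size_t Executions{16};

    std::array<Slot, SlotCount> slots{};
    BlockingQueue<Slot *> incoming; // Slots filled by the GPFIFO side, pending recording
    BlockingQueue<Slot *> outgoing; // Slots the record thread has finished with, free for reuse
    for (Slot &slot : slots)
        outgoing.Push(&slot);

    // Record thread: drains one slot at a time, "records" its nodes and hands it back,
    // playing the role that CommandRecordThread::Run/ProcessSlot play in the patch
    std::thread recordThread{[&] {
        for (size_t i{}; i < Executions; i++) {
            Slot *slot{incoming.Pop()};
            for (const auto &node : slot->nodes)
                node(); // The real implementation records into slot->commandBuffer here
            slot->nodes.clear();
            outgoing.Push(slot);
        }
    }};

    // GPFIFO side: acquire a free slot, fill it with work, then release it for recording
    for (size_t execution{}; execution < Executions; execution++) {
        Slot *slot{outgoing.Pop()};
        slot->nodes.emplace_back([] { /* pretend to record one command node */ });
        incoming.Push(slot);
    }

    recordThread.join();
}

Bounding the slot count (four by default, per the commit message) is what keeps the GPFIFO thread from running arbitrarily far ahead of recording: once every slot is in flight, the next acquire blocks until the record thread hands one back.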