Implement asynchronous command recording

Recording command nodes into Vulkan command buffers is easily parallelisable, as it can effectively be treated as part of GPU execution, which is inherently asynchronous. Moving it to a separate thread shaves off about 20% of GPFIFO execution time. Note that the command scheduler's command buffer infrastructure is no longer used: texture updates need to be recorded on the GPFIFO thread (while another slot is being recorded on the record thread), and the same command buffer must then be used on the record thread later. This ends up requiring a command pool per slot, which is reasonable given there are only four slots by default.
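
The rotation described above is essentially a small producer-consumer loop: the GPFIFO thread acquires a free slot, fills it with nodes, and hands it to the record thread, which records and submits it before returning the slot to the free queue. Below is a minimal, self-contained sketch of that pattern; the Slot, BlockingQueue, and thread names are illustrative stand-ins rather than the actual Skyline types (the real implementation uses CircularQueue, per-slot vk::CommandPool/command buffer/fence, and FenceCycle).

#include <array>
#include <condition_variable>
#include <cstddef>
#include <cstdio>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

// Blocking queue standing in for Skyline's CircularQueue (illustrative only)
template <typename T>
class BlockingQueue {
  public:
    void Push(T item) {
        {
            std::lock_guard lock{mutex};
            queue.push(std::move(item));
        }
        cv.notify_one();
    }

    T Pop() {
        std::unique_lock lock{mutex};
        cv.wait(lock, [this] { return !queue.empty(); });
        T item{std::move(queue.front())};
        queue.pop();
        return item;
    }

  private:
    std::mutex mutex;
    std::condition_variable cv;
    std::queue<T> queue;
};

// Stand-in for a record slot; the real Slot also owns a command pool, command buffer, fence and allocator
struct Slot {
    std::vector<int> nodes;
};

int main() {
    constexpr std::size_t SlotCount{4}; // Mirrors the four default slots mentioned above
    constexpr int Executions{16};

    std::array<Slot, SlotCount> slots{};
    BlockingQueue<Slot *> incoming; // Slots pending recording
    BlockingQueue<Slot *> outgoing; // Slots free to be filled with nodes again
    for (Slot &slot : slots)
        outgoing.Push(&slot);

    // Record thread: drains `incoming`, "records" each slot's nodes, then recycles the slot
    std::thread recordThread{[&] {
        for (int i{}; i < Executions; i++) {
            Slot *slot{incoming.Pop()};
            std::printf("recorded %zu nodes\n", slot->nodes.size());
            slot->nodes.clear();
            outgoing.Push(slot);
        }
    }};

    // GPFIFO thread (here just main): acquire a free slot, fill it with work, release it for recording
    for (int i{}; i < Executions; i++) {
        Slot *slot{outgoing.Pop()};
        slot->nodes.assign(static_cast<std::size_t>(i % 8 + 1), 0);
        incoming.Push(slot);
    }

    recordThread.join();
}
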
Billy Laws 2022-09-19 14:38:36 +01:00
parent a197dd2b28
commit 7c9212743c
4 changed files with 218 additions and 97 deletions


@@ -1,16 +1,137 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <loader/loader.h>
#include <gpu.h>
#include "command_executor.h"
namespace skyline::gpu::interconnect {
CommandExecutor::CommandExecutor(const DeviceState &state) : gpu{*state.gpu}, activeCommandBuffer{gpu.scheduler.AllocateCommandBuffer()}, cycle{activeCommandBuffer.GetFenceCycle()}, tag{AllocateTag()} {}
CommandRecordThread::CommandRecordThread(const DeviceState &state) : state{state}, thread{&CommandRecordThread::Run, this} {}
static vk::raii::CommandBuffer AllocateRaiiCommandBuffer(GPU &gpu, vk::raii::CommandPool &pool) {
return {gpu.vkDevice, (*gpu.vkDevice).allocateCommandBuffers(
{
.commandPool = *pool,
.level = vk::CommandBufferLevel::ePrimary,
.commandBufferCount = 1
}, *gpu.vkDevice.getDispatcher()).front(),
*pool};
}
CommandRecordThread::Slot::Slot(GPU &gpu)
: commandPool{gpu.vkDevice,
vk::CommandPoolCreateInfo{
.flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer | vk::CommandPoolCreateFlagBits::eTransient,
.queueFamilyIndex = gpu.vkQueueFamilyIndex
}
},
commandBuffer{AllocateRaiiCommandBuffer(gpu, commandPool)},
fence{gpu.vkDevice, vk::FenceCreateInfo{ .flags = vk::FenceCreateFlagBits::eSignaled }},
cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, true)} {}
std::shared_ptr<FenceCycle> CommandRecordThread::Slot::Reset(GPU &gpu) {
cycle->Wait();
cycle = std::make_shared<FenceCycle>(gpu.vkDevice, *fence);
commandBuffer.reset();
return cycle;
}
void CommandRecordThread::ProcessSlot(Slot *slot) {
auto &gpu{*state.gpu};
vk::RenderPass lRenderPass;
u32 subpassIndex;
std::scoped_lock bufferLock{gpu.buffer.recreationMutex};
using namespace node;
for (NodeVariant &node : slot->nodes) {
#define NODE(name) [&](name& node) { node(slot->commandBuffer, slot->cycle, gpu); }
std::visit(VariantVisitor{
NODE(FunctionNode),
[&](RenderPassNode &node) {
lRenderPass = node(slot->commandBuffer, slot->cycle, gpu);
subpassIndex = 0;
},
[&](NextSubpassNode &node) {
node(slot->commandBuffer, slot->cycle, gpu);
++subpassIndex;
},
[&](SubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, subpassIndex); },
[&](NextSubpassFunctionNode &node) { node(slot->commandBuffer, slot->cycle, gpu, lRenderPass, ++subpassIndex); },
NODE(RenderPassEndNode),
}, node);
#undef NODE
}
slot->commandBuffer.end();
gpu.scheduler.SubmitCommandBuffer(slot->commandBuffer, slot->cycle);
slot->nodes.clear();
slot->allocator.Reset();
}
void CommandRecordThread::Run() {
auto &gpu{*state.gpu};
std::array<Slot, ActiveRecordSlots> slots{{gpu, gpu, gpu, gpu}};
outgoing.AppendTranform(span<Slot>(slots), [](auto &slot) { return &slot; });
if (int result{pthread_setname_np(pthread_self(), "Sky-CmdRecord")})
Logger::Warn("Failed to set the thread name: {}", strerror(result));
try {
signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);
incoming.Process([this](Slot *slot) {
ProcessSlot(slot);
outgoing.Push(slot);
}, [] {});
} catch (const signal::SignalException &e) {
Logger::Error("{}\nStack Trace:{}", e.what(), state.loader->GetStackTrace(e.frames));
if (state.process)
state.process->Kill(false);
else
std::rethrow_exception(std::current_exception());
} catch (const std::exception &e) {
Logger::Error(e.what());
if (state.process)
state.process->Kill(false);
else
std::rethrow_exception(std::current_exception());
}
}
CommandRecordThread::Slot *CommandRecordThread::AcquireSlot() {
return outgoing.Pop();
}
void CommandRecordThread::ReleaseSlot(Slot *slot) {
incoming.Push(slot);
}
CommandExecutor::CommandExecutor(const DeviceState &state)
: gpu{*state.gpu},
recordThread{state},
tag{AllocateTag()} {
RotateRecordSlot();
}
CommandExecutor::~CommandExecutor() {
cycle->Cancel();
}
void CommandExecutor::RotateRecordSlot() {
if (slot)
recordThread.ReleaseSlot(slot);
slot = recordThread.AcquireSlot();
cycle = slot->Reset(gpu);
allocator = &slot->allocator;
}
TextureManager &CommandExecutor::AcquireTextureManager() {
if (!textureManagerLock)
textureManagerLock.emplace(gpu.texture);
@@ -55,8 +176,8 @@ namespace skyline::gpu::interconnect {
if (renderPass == nullptr || renderPass->renderArea != renderArea || subpassCount >= gpu.traits.quirks.maxSubpassCount) {
// We need to create a render pass if one doesn't already exist or the current one isn't compatible
if (renderPass != nullptr)
nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());
renderPass = &std::get<node::RenderPassNode>(nodes.emplace_back(std::in_place_type_t<node::RenderPassNode>(), renderArea));
slot->nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());
renderPass = &std::get<node::RenderPassNode>(slot->nodes.emplace_back(std::in_place_type_t<node::RenderPassNode>(), renderArea));
addSubpass();
subpassCount = 1;
return false;
@@ -77,7 +198,7 @@ namespace skyline::gpu::interconnect {
void CommandExecutor::FinishRenderPass() {
if (renderPass) {
nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());
slot->nodes.emplace_back(std::in_place_type_t<node::RenderPassEndNode>());
renderPass = nullptr;
subpassCount = 0;
@@ -168,9 +289,9 @@ namespace skyline::gpu::interconnect {
bool gotoNext{CreateRenderPassWithSubpass(renderArea, inputAttachments, colorAttachments, depthStencilAttachment ? &*depthStencilAttachment : nullptr)};
if (gotoNext)
nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), std::forward<decltype(function)>(function));
slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), std::forward<decltype(function)>(function));
else
nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), std::forward<decltype(function)>(function));
slot->nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), std::forward<decltype(function)>(function));
if (exclusiveSubpass)
FinishRenderPass();
@@ -180,14 +301,14 @@ namespace skyline::gpu::interconnect {
if (renderPass)
FinishRenderPass();
nodes.emplace_back(std::in_place_type_t<node::FunctionNode>(), std::forward<decltype(function)>(function));
slot->nodes.emplace_back(std::in_place_type_t<node::FunctionNode>(), std::forward<decltype(function)>(function));
}
void CommandExecutor::AddClearColorSubpass(TextureView *attachment, const vk::ClearColorValue &value) {
bool gotoNext{CreateRenderPassWithSubpass(vk::Rect2D{.extent = attachment->texture->dimensions}, {}, attachment, nullptr)};
if (renderPass->ClearColorAttachment(0, value, gpu)) {
if (gotoNext)
nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
} else {
auto function{[scissor = attachment->texture->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
commandBuffer.clearAttachments(vk::ClearAttachment{
@@ -202,9 +323,9 @@ namespace skyline::gpu::interconnect {
}};
if (gotoNext)
nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
else
nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
slot->nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
}
}
@@ -212,7 +333,7 @@ namespace skyline::gpu::interconnect {
bool gotoNext{CreateRenderPassWithSubpass(vk::Rect2D{.extent = attachment->texture->dimensions}, {}, {}, attachment)};
if (renderPass->ClearDepthStencilAttachment(value, gpu)) {
if (gotoNext)
nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
} else {
auto function{[aspect = attachment->format->vkAspect, extent = attachment->texture->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &, vk::RenderPass, u32) {
commandBuffer.clearAttachments(vk::ClearAttachment{
@@ -226,9 +347,9 @@ namespace skyline::gpu::interconnect {
}};
if (gotoNext)
nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
slot->nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
else
nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
slot->nodes.emplace_back(std::in_place_type_t<node::SubpassFunctionNode>(), function);
}
}
@@ -241,13 +362,12 @@ namespace skyline::gpu::interconnect {
FinishRenderPass();
{
auto &commandBuffer{*activeCommandBuffer};
commandBuffer.begin(vk::CommandBufferBeginInfo{
slot->commandBuffer.begin(vk::CommandBufferBeginInfo{
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
});
// We need this barrier here to ensure that resources are in the state we expect them to be in, we shouldn't overwrite resources while prior commands might still be using them or read from them while they might be modified by prior commands
commandBuffer.pipelineBarrier(
slot->commandBuffer.pipelineBarrier(
vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
.srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
.dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
@@ -255,57 +375,27 @@ namespace skyline::gpu::interconnect {
);
for (const auto &texture : attachedTextures)
texture->SynchronizeHostInline(commandBuffer, cycle, true);
texture->SynchronizeHostInline(slot->commandBuffer, cycle, true);
}
vk::RenderPass lRenderPass;
u32 subpassIndex;
for (const auto &attachedBuffer : attachedBuffers)
if (attachedBuffer->SequencedCpuBackingWritesBlocked())
attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer
using namespace node;
for (NodeVariant &node : nodes) {
#define NODE(name) [&](name& node) { node(commandBuffer, cycle, gpu); }
std::visit(VariantVisitor{
NODE(FunctionNode),
for (const auto &attachedTexture : attachedTextures) {
// We don't need to attach the Texture to the cycle as a TextureView will already be attached
cycle->ChainCycle(attachedTexture->cycle);
attachedTexture->cycle = cycle;
}
[&](RenderPassNode &node) {
lRenderPass = node(commandBuffer, cycle, gpu);
subpassIndex = 0;
},
[&](NextSubpassNode &node) {
node(commandBuffer, cycle, gpu);
++subpassIndex;
},
[&](SubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, subpassIndex); },
[&](NextSubpassFunctionNode &node) { node(commandBuffer, cycle, gpu, lRenderPass, ++subpassIndex); },
NODE(RenderPassEndNode),
}, node);
#undef NODE
}
commandBuffer.end();
for (const auto &attachedBuffer : attachedBuffers)
if (attachedBuffer->SequencedCpuBackingWritesBlocked())
attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer, this is done directly prior to submission to prevent stalls
gpu.scheduler.SubmitCommandBuffer(commandBuffer, cycle);
nodes.clear();
for (const auto &attachedTexture : attachedTextures) {
// We don't need to attach the Texture to the cycle as a TextureView will already be attached
cycle->ChainCycle(attachedTexture->cycle);
attachedTexture->cycle = cycle;
}
for (const auto &attachedBuffer : attachedBuffers) {
if (attachedBuffer->RequiresCycleAttach() ) {
cycle->AttachObject(attachedBuffer.buffer);
attachedBuffer->UpdateCycle(cycle);
}
for (const auto &attachedBuffer : attachedBuffers) {
if (attachedBuffer->RequiresCycleAttach() ) {
cycle->AttachObject(attachedBuffer.buffer);
attachedBuffer->UpdateCycle(cycle);
}
}
RotateRecordSlot();
}
void CommandExecutor::ResetInternal() {
@@ -314,32 +404,16 @@ namespace skyline::gpu::interconnect {
attachedBuffers.clear();
bufferManagerLock.reset();
megaBufferAllocatorLock.reset();
allocator.Reset();
allocator->Reset();
}
void CommandExecutor::Submit() {
for (const auto &callback : flushCallbacks)
callback();
if (!nodes.empty()) {
if (!slot->nodes.empty()) {
TRACE_EVENT("gpu", "CommandExecutor::Submit");
SubmitInternal();
activeCommandBuffer = gpu.scheduler.AllocateCommandBuffer();
cycle = activeCommandBuffer.GetFenceCycle();
}
ResetInternal();
executionNumber++;
}
void CommandExecutor::SubmitWithFlush() {
for (const auto &callback : flushCallbacks)
callback();
if (!nodes.empty()) {
TRACE_EVENT("gpu", "CommandExecutor::SubmitWithFlush");
SubmitInternal();
cycle = activeCommandBuffer.Reset();
}
ResetInternal();


@@ -10,6 +10,57 @@
#include "command_nodes.h"
namespace skyline::gpu::interconnect {
/**
* @brief Thread responsible for recording Vulkan commands from the execution nodes and submitting them
*/
class CommandRecordThread {
public:
/**
* @brief Single execution slot, buffered back and forth between the GPFIFO thread and the record thread
*/
struct Slot {
vk::raii::CommandPool commandPool; //!< Use one command pool per slot since command buffers from different slots may be recorded into on multiple threads at the same time
vk::raii::CommandBuffer commandBuffer;
vk::raii::Fence fence;
std::shared_ptr<FenceCycle> cycle;
boost::container::stable_vector<node::NodeVariant> nodes;
LinearAllocatorState<> allocator;
Slot(GPU &gpu);
/**
* @brief Waits on the fence and resets the command buffer
* @return A new fence cycle for the reset command buffer
*/
std::shared_ptr<FenceCycle> Reset(GPU &gpu);
};
private:
const DeviceState &state;
std::thread thread;
static constexpr size_t ActiveRecordSlots{4}; //!< Maximum number of simultaneously active slots
CircularQueue<Slot *> incoming{ActiveRecordSlots}; //!< Slots pending recording
CircularQueue<Slot *> outgoing{ActiveRecordSlots}; //!< Slots that have been submitted, may still be active on the GPU
void ProcessSlot(Slot *slot);
void Run();
public:
CommandRecordThread(const DeviceState &state);
/**
* @return A free slot, `Reset` needs to be called before accessing it
*/
Slot *AcquireSlot();
/**
* @brief Submit a slot to be recorded
*/
void ReleaseSlot(Slot *slot);
};
/**
* @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
* @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
@@ -17,11 +68,10 @@ namespace skyline::gpu::interconnect {
class CommandExecutor {
private:
GPU &gpu;
CommandScheduler::ActiveCommandBuffer activeCommandBuffer;
boost::container::stable_vector<node::NodeVariant> nodes;
CommandRecordThread recordThread;
CommandRecordThread::Slot *slot{};
node::RenderPassNode *renderPass{};
size_t subpassCount{}; //!< The number of subpasses in the current render pass
std::optional<std::scoped_lock<TextureManager>> textureManagerLock; //!< The lock on the texture manager, this is locked for the duration of the command execution from the first usage inside an execution to the submission
std::optional<std::scoped_lock<BufferManager>> bufferManagerLock; //!< The lock on the buffer manager, see above for details
std::optional<std::scoped_lock<MegaBufferAllocator>> megaBufferAllocatorLock; //!< The lock on the megabuffer allocator, see above for details
@@ -72,6 +122,8 @@ namespace skyline::gpu::interconnect {
std::vector<std::function<void()>> flushCallbacks; //!< Set of persistent callbacks that will be called at the start of Execute in order to flush data required for recording
void RotateRecordSlot();
/**
* @brief Create a new render pass and subpass with the specified attachments, if one doesn't already exist or the current one isn't compatible
* @note This also checks for subpass coalescing and will merge the new subpass with the previous one when possible
@@ -97,7 +149,7 @@ namespace skyline::gpu::interconnect {
public:
std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
LinearAllocatorState<> allocator;
LinearAllocatorState<> *allocator;
ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag
size_t executionNumber{};
@@ -193,10 +245,5 @@ namespace skyline::gpu::interconnect {
* @brief Execute all the nodes and submit the resulting command buffer to the GPU
*/
void Submit();
/**
* @brief Execute all the nodes and submit the resulting command buffer to the GPU then wait for the completion of the command buffer
*/
void SubmitWithFlush();
};
}


@@ -163,7 +163,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
}
void Maxwell3D::Draw(engine::DrawTopology topology, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance) {
StateUpdateBuilder builder{ctx.executor.allocator};
StateUpdateBuilder builder{*ctx.executor.allocator};
Pipeline *oldPipeline{activeState.GetPipeline()};
activeState.Update(ctx, builder, indexed, topology, count);


@@ -615,9 +615,9 @@ namespace skyline::gpu::interconnect::maxwell3d {
u32 bufferIdx{};
u32 imageIdx{};
auto writes{ctx.executor.allocator.AllocateUntracked<vk::WriteDescriptorSet>(descriptorInfo.writeDescCount)};
auto bufferDescs{ctx.executor.allocator.AllocateUntracked<vk::DescriptorBufferInfo>(descriptorInfo.totalBufferDescCount)};
auto bufferDescViews{ctx.executor.allocator.AllocateUntracked<DynamicBufferBinding>(descriptorInfo.totalBufferDescCount)};
auto writes{ctx.executor.allocator->AllocateUntracked<vk::WriteDescriptorSet>(descriptorInfo.writeDescCount)};
auto bufferDescs{ctx.executor.allocator->AllocateUntracked<vk::DescriptorBufferInfo>(descriptorInfo.totalBufferDescCount)};
auto bufferDescViews{ctx.executor.allocator->AllocateUntracked<DynamicBufferBinding>(descriptorInfo.totalBufferDescCount)};
auto writeBufferDescs{[&](vk::DescriptorType type, const auto &descs, u32 count, auto getBufferCb) {
if (!descs.empty()) {
@@ -658,13 +658,13 @@ namespace skyline::gpu::interconnect::maxwell3d {
const auto &cbufUsageInfo{descriptorInfo.cbufUsages[static_cast<size_t>(quickBind.stage)][quickBind.index]};
const auto &shaderInfo{shaderStages[static_cast<size_t>(quickBind.stage)].info};
auto &stageConstantBuffers{constantBuffers[static_cast<size_t>(quickBind.stage)]};
auto copy{ctx.executor.allocator.AllocateUntracked<vk::CopyDescriptorSet>()};
auto writes{ctx.executor.allocator.AllocateUntracked<vk::WriteDescriptorSet>(cbufUsageInfo.writeDescCount)};
auto copy{ctx.executor.allocator->AllocateUntracked<vk::CopyDescriptorSet>()};
auto writes{ctx.executor.allocator->AllocateUntracked<vk::WriteDescriptorSet>(cbufUsageInfo.writeDescCount)};
size_t writeIdx{};
size_t bufferIdx{};
auto bufferDescs{ctx.executor.allocator.AllocateUntracked<vk::DescriptorBufferInfo>(cbufUsageInfo.totalBufferDescCount)};
auto bufferDescViews{ctx.executor.allocator.AllocateUntracked<DynamicBufferBinding>(cbufUsageInfo.totalBufferDescCount)};
auto bufferDescs{ctx.executor.allocator->AllocateUntracked<vk::DescriptorBufferInfo>(cbufUsageInfo.totalBufferDescCount)};
auto bufferDescViews{ctx.executor.allocator->AllocateUntracked<DynamicBufferBinding>(cbufUsageInfo.totalBufferDescCount)};
// TODO: opt this to do partial copy
*copy = vk::CopyDescriptorSet{