From 1f9de17e9824c69c817da090a816bd4aea959f28 Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Sun, 6 Nov 2022 19:18:36 +0000
Subject: [PATCH] Begin command buffers asynchronously in command executor

vkBeginCommandBuffer can take quite some time on adreno, move it to the
cycle waiter thread where it won't block GPFIFO.
---
 .../gpu/interconnect/command_executor.cpp     | 33 ++++++++++++++++---
 .../gpu/interconnect/command_executor.h       | 21 ++++++++++++
 2 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index 726782a6..6b8485f8 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -15,6 +15,12 @@ namespace skyline::gpu::interconnect {
           outgoing{*state.settings->executorSlotCount},
           thread{&CommandRecordThread::Run, this} {}
 
+    CommandRecordThread::Slot::ScopedBegin::ScopedBegin(CommandRecordThread::Slot &slot) : slot{slot} {}
+
+    CommandRecordThread::Slot::ScopedBegin::~ScopedBegin() {
+        slot.Begin();
+    }
+
     static vk::raii::CommandBuffer AllocateRaiiCommandBuffer(GPU &gpu, vk::raii::CommandPool &pool) {
         return {gpu.vkDevice, (*gpu.vkDevice).allocateCommandBuffers(
             {
@@ -35,14 +41,17 @@ namespace skyline::gpu::interconnect {
           commandBuffer{AllocateRaiiCommandBuffer(gpu, commandPool)},
           fence{gpu.vkDevice, vk::FenceCreateInfo{ .flags = vk::FenceCreateFlagBits::eSignaled }},
           semaphore{gpu.vkDevice, vk::SemaphoreCreateInfo{}},
-          cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, *semaphore, true)} {}
+          cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, *semaphore, true)} {
+        Begin();
+    }
 
     CommandRecordThread::Slot::Slot(Slot &&other)
         : commandPool{std::move(other.commandPool)},
           commandBuffer{std::move(other.commandBuffer)},
           fence{std::move(other.fence)},
           semaphore{std::move(other.semaphore)},
-          cycle{std::move(other.cycle)} {}
+          cycle{std::move(other.cycle)},
+          ready{other.ready} {}
 
     std::shared_ptr<FenceCycle> CommandRecordThread::Slot::Reset(GPU &gpu) {
         cycle->Wait();
@@ -51,6 +60,21 @@ namespace skyline::gpu::interconnect {
         return cycle;
     }
 
+    void CommandRecordThread::Slot::WaitReady() {
+        std::unique_lock lock{beginLock};
+        beginCondition.wait(lock, [this] { return ready; });
+        cycle->AttachObject(std::make_shared<ScopedBegin>(*this));
+    }
+
+    void CommandRecordThread::Slot::Begin() {
+        std::unique_lock lock{beginLock};
+        commandBuffer.begin(vk::CommandBufferBeginInfo{
+            .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
+        });
+        ready = true;
+        beginCondition.notify_all();
+    }
+
     void CommandRecordThread::ProcessSlot(Slot *slot) {
         TRACE_EVENT_FMT("gpu", "ProcessSlot: 0x{:X}, execution: {}", slot, slot->executionNumber);
         auto &gpu{*state.gpu};
@@ -83,6 +107,7 @@ namespace skyline::gpu::interconnect {
         }
 
         slot->commandBuffer.end();
+        slot->ready = false;
 
         gpu.scheduler.SubmitCommandBuffer(slot->commandBuffer, slot->cycle);
 
@@ -404,9 +429,7 @@ namespace skyline::gpu::interconnect {
             FinishRenderPass();
 
         {
-            slot->commandBuffer.begin(vk::CommandBufferBeginInfo{
-                .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
-            });
+            slot->WaitReady();
 
             // We need this barrier here to ensure that resources are in the state we expect them to be in, we shouldn't overwrite resources while prior commands might still be using them or read from them while they might be modified by prior commands
             slot->commandBuffer.pipelineBarrier(
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index 6371ac55..521a53ef 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -19,6 +19,17 @@ namespace skyline::gpu::interconnect {
          * @brief Single execution slot, buffered back and forth between the GPFIFO thread and the record thread
          */
         struct Slot {
+            /**
+             * @brief Helper to begin the slot command buffer on the cycle waiter thread
+             */
+            struct ScopedBegin {
+                Slot &slot;
+
+                ScopedBegin(Slot &slot);
+
+                ~ScopedBegin();
+            };
+
             vk::raii::CommandPool commandPool; //!< Use one command pool per slot since command buffers from different slots may be recorded into on multiple threads at the same time
             vk::raii::CommandBuffer commandBuffer;
             vk::raii::Fence fence;
@@ -26,7 +37,10 @@ namespace skyline::gpu::interconnect {
             std::shared_ptr<FenceCycle> cycle;
             boost::container::stable_vector<node::NodeVariant> nodes;
             LinearAllocatorState<> allocator;
+            std::mutex beginLock;
+            std::condition_variable beginCondition;
             u32 executionNumber;
+            bool ready{}; //!< If this slot's command buffer has had 'beginCommandBuffer' called and is ready to have commands recorded into it
             bool capture{}; //!< If this slot's Vulkan commands should be captured using the renderdoc API
 
             Slot(GPU &gpu);
@@ -38,6 +52,13 @@ namespace skyline::gpu::interconnect {
              * @note A new fence cycle for the reset command buffer
              */
             std::shared_ptr<FenceCycle> Reset(GPU &gpu);
+
+            /**
+             * @brief Waits for the command buffer to be began so it can be recorded into
+             */
+            void WaitReady();
+
+            void Begin();
         };
 
     private: