From 0670e0e0dc4a7d712fa255e9dd5cbc5d379b961f Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Sun, 16 Oct 2022 20:31:36 +0100 Subject: [PATCH] Support using Vulkan semaphores with fence cycles In some cases like presentation, it may be possible to avoid waiting on the CPU by using a semaphore to indicate GPU completion. Due to the binary nature of Vulkan semaphores this requires a fair bit of code as we need to ensure semaphores are always unsignalled before they are waited on and signalled again. This is achieved with a special kind of chained cycle that can be added even after guest GPFIFO processing for a given cycle, the main cycle's semaphore can be waited and then the cycle for the wait attached to the main cycle and it will be waited on before signalling. --- .../cpp/skyline/gpu/command_scheduler.cpp | 32 ++++- .../main/cpp/skyline/gpu/command_scheduler.h | 33 +---- app/src/main/cpp/skyline/gpu/fence_cycle.h | 130 +++++++++--------- .../gpu/interconnect/command_executor.cpp | 5 +- .../gpu/interconnect/command_executor.h | 1 + 5 files changed, 101 insertions(+), 100 deletions(-) diff --git a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp index 0092db84..0c78c7fb 100644 --- a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp +++ b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp @@ -32,10 +32,11 @@ namespace skyline::gpu { } CommandScheduler::CommandBufferSlot::CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool) - : device(device), - commandBuffer(device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)), - fence(device, vk::FenceCreateInfo{}), - cycle(std::make_shared<FenceCycle>(device, *fence)) {} + : device{device}, + commandBuffer{device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)}, + fence{device, vk::FenceCreateInfo{}}, + semaphore{device, vk::SemaphoreCreateInfo{}}, + cycle{std::make_shared<FenceCycle>(device, *fence, *semaphore)} {} 
CommandScheduler::CommandScheduler(const DeviceState &state, GPU &pGpu) : state{state}, @@ -55,7 +56,7 @@ namespace skyline::gpu { if (!slot.active.test_and_set(std::memory_order_acq_rel)) { if (slot.cycle->Poll()) { slot.commandBuffer.reset(); - slot.cycle = std::make_shared<FenceCycle>(slot.device, *slot.fence); + slot.cycle = std::make_shared<FenceCycle>(*slot.cycle); return {slot}; } else { slot.active.clear(std::memory_order_release); @@ -76,12 +77,29 @@ namespace skyline::gpu { return {pool->buffers.emplace_back(gpu.vkDevice, commandBuffer, pool->vkCommandPool)}; } - void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle) { + void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle, span<vk::Semaphore> waitSemaphores, span<vk::Semaphore> signalSemaphores) { + boost::container::small_vector<vk::Semaphore, 3> fullWaitSemaphores{waitSemaphores.begin(), waitSemaphores.end()}; + boost::container::small_vector<vk::PipelineStageFlags, 3> fullWaitStages{waitSemaphores.size(), vk::PipelineStageFlagBits::eAllCommands}; + + if (cycle->semaphoreSubmitWait) { + fullWaitSemaphores.push_back(cycle->semaphore); + // We don't need a full barrier since this is only done to ensure the semaphore is unsignalled + fullWaitStages.push_back(vk::PipelineStageFlagBits::eTopOfPipe); + } + + boost::container::small_vector<vk::Semaphore, 3> fullSignalSemaphores{signalSemaphores.begin(), signalSemaphores.end()}; + fullSignalSemaphores.push_back(cycle->semaphore); + { - std::scoped_lock lock(gpu.queueMutex); + std::scoped_lock lock{gpu.queueMutex}; gpu.vkQueue.submit(vk::SubmitInfo{ .commandBufferCount = 1, .pCommandBuffers = &*commandBuffer, + .waitSemaphoreCount = static_cast<u32>(waitSemaphores.size()), + .pWaitSemaphores = fullWaitSemaphores.data(), + .pWaitDstStageMask = fullWaitStages.data(), + .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()), + .pSignalSemaphores = fullSignalSemaphores.data(), }, cycle->fence); } diff --git a/app/src/main/cpp/skyline/gpu/command_scheduler.h 
b/app/src/main/cpp/skyline/gpu/command_scheduler.h index 02f19290..6d3c3e06 100644 --- a/app/src/main/cpp/skyline/gpu/command_scheduler.h +++ b/app/src/main/cpp/skyline/gpu/command_scheduler.h @@ -21,6 +21,7 @@ namespace skyline::gpu { const vk::raii::Device &device; vk::raii::CommandBuffer commandBuffer; vk::raii::Fence fence; //!< A fence used for tracking all submits of a buffer + vk::raii::Semaphore semaphore; //!< A semaphore used for tracking work status on the GPU std::shared_ptr<FenceCycle> cycle; //!< The latest cycle on the fence, all waits must be performed through this CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool); @@ -94,7 +95,7 @@ namespace skyline::gpu { */ std::shared_ptr<FenceCycle> Reset() { slot->cycle->Wait(); - slot->cycle = std::make_shared<FenceCycle>(slot->device, *slot->fence); + slot->cycle = std::make_shared<FenceCycle>(*slot->cycle); slot->commandBuffer.reset(); return slot->cycle; } @@ -114,13 +115,15 @@ namespace skyline::gpu { * @note The supplied command buffer and cycle **must** be from AllocateCommandBuffer() * @note Any cycle submitted via this method does not need to destroy dependencies manually, the waiter thread will handle this */ - void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle); + void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle, span<vk::Semaphore> waitSemaphores = {}, span<vk::Semaphore> signalSemaphore = {}); /** * @brief Submits a command buffer recorded with the supplied function synchronously + * @param waitSemaphores A span of all (excl fence cycle) semaphores that should be waited on by the GPU before executing the command buffer + * @param signalSemaphore A span of all semaphores that should be signalled by the GPU after executing the command buffer */ template<typename RecordFunction> - std::shared_ptr<FenceCycle> Submit(RecordFunction recordFunction) { + std::shared_ptr<FenceCycle> Submit(RecordFunction recordFunction, span<vk::Semaphore> waitSemaphores = {}, span<vk::Semaphore> signalSemaphores = {}) { auto 
commandBuffer{AllocateCommandBuffer()}; try { commandBuffer->begin(vk::CommandBufferBeginInfo{ @@ -130,29 +133,7 @@ namespace skyline::gpu { commandBuffer->end(); auto cycle{commandBuffer.GetFenceCycle()}; - SubmitCommandBuffer(*commandBuffer, cycle); - return cycle; - } catch (...) { - commandBuffer.GetFenceCycle()->Cancel(); - std::rethrow_exception(std::current_exception()); - } - } - - /** - * @note Same as Submit but with FenceCycle as an argument rather than return value - */ - template<typename RecordFunction> - std::shared_ptr<FenceCycle> SubmitWithCycle(RecordFunction recordFunction) { - auto commandBuffer{AllocateCommandBuffer()}; - auto cycle{commandBuffer.GetFenceCycle()}; - try { - commandBuffer->begin(vk::CommandBufferBeginInfo{ - .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit, - }); - recordFunction(*commandBuffer, cycle); - commandBuffer->end(); - - SubmitCommandBuffer(*commandBuffer, cycle); + SubmitCommandBuffer(*commandBuffer, cycle, waitSemaphores, signalSemaphores); return cycle; } catch (...) { commandBuffer.GetFenceCycle()->Cancel(); diff --git a/app/src/main/cpp/skyline/gpu/fence_cycle.h b/app/src/main/cpp/skyline/gpu/fence_cycle.h index 98a3c035..c38a30f2 100644 --- a/app/src/main/cpp/skyline/gpu/fence_cycle.h +++ b/app/src/main/cpp/skyline/gpu/fence_cycle.h @@ -25,6 +25,10 @@ namespace skyline::gpu { std::condition_variable_any submitCondition; bool submitted{}; //!< If the fence has been submitted to the GPU vk::Fence fence; + vk::Semaphore semaphore; //!< Semaphore that will be signalled upon GPU completion of the fence + bool semaphoreSubmitWait{}; //!< If the semaphore needs to be waited on (on GPU) before the fence's command buffer begins. 
Used to ensure fences that wouldn't otherwise be unsignalled are unsignalled + bool nextSemaphoreSubmitWait{true}; //!< If the next fence cycle created from this one after it's signalled should wait on the semaphore to unsignal it + std::shared_ptr<FenceCycle> semaphoreUnsignalCycle{}; //!< If the semaphore is used on the GPU, the cycle for the submission that uses it, so it can be waited on before the fence is signalled to ensure the semaphore is unsignalled friend CommandScheduler; @@ -41,11 +45,15 @@ namespace skyline::gpu { } public: - FenceCycle(const vk::raii::Device &device, vk::Fence fence, bool signalled = false) : signalled{signalled}, device{device}, fence{fence} { + FenceCycle(const vk::raii::Device &device, vk::Fence fence, vk::Semaphore semaphore, bool signalled = false) : signalled{signalled}, device{device}, fence{fence}, semaphore{semaphore}, nextSemaphoreSubmitWait{!signalled} { if (!signalled) device.resetFences(fence); } + explicit FenceCycle(const FenceCycle &cycle) : signalled{false}, device{cycle.device}, fence{cycle.fence}, semaphore{cycle.semaphore}, semaphoreSubmitWait{cycle.nextSemaphoreSubmitWait} { + device.resetFences(fence); + } + ~FenceCycle() { Wait(); } @@ -58,6 +66,33 @@ namespace skyline::gpu { DestroyDependencies(); } + /** + * @brief Executes a function with the fence locked to record a usage of its semaphore, if no semaphore can be provided then a CPU-side wait will be performed instead + */ + std::shared_ptr<FenceCycle> RecordSemaphoreWaitUsage(std::function<std::shared_ptr<FenceCycle>(vk::Semaphore sema)> &&func) { + // We can't submit any semaphore waits until the signal has been submitted, so do that first + WaitSubmit(); + + std::unique_lock lock{mutex}; + + // If we already have a semaphore usage, just wait on the fence since we can't wait on it twice and have no way to add one after the fact + if (semaphoreUnsignalCycle) { + // Safe to unlock since semaphoreUnsignalCycle can never be reset + lock.unlock(); + + Wait(); + return func({}); + } + + // If we're already 
signalled then there's no need to wait on the semaphore + if (signalled.test(std::memory_order_relaxed)) + return func({}); + + semaphoreUnsignalCycle = func(semaphore); + nextSemaphoreSubmitWait = false; // We don't need a semaphore wait on the next fence cycle to unsignal the semaphore anymore as the usage will do that + return semaphoreUnsignalCycle; + } + /** * @brief Waits for submission of the command buffer associated with this cycle to the GPU */ @@ -65,11 +100,21 @@ namespace skyline::gpu { if (signalled.test(std::memory_order_consume)) return; + std::unique_lock lock{mutex}; + if (submitted) + return; + + if (signalled.test(std::memory_order_consume)) + return; + + lock.unlock(); chainedCycles.Iterate([&](const auto &cycle) { + if (!cycle->Find(this)) + raise(SIGTRAP); cycle->WaitSubmit(); }); + lock.lock(); - std::unique_lock lock{mutex}; submitCondition.wait(lock, [this] { return submitted; }); } @@ -84,19 +129,19 @@ namespace skyline::gpu { return; } - chainedCycles.Iterate([shouldDestroy](auto &cycle) { + chainedCycles.Iterate([shouldDestroy, this](auto &cycle) { cycle->Wait(shouldDestroy); }); std::unique_lock lock{mutex}; - submitCondition.wait(lock, [&] { return submitted; }); - - if (signalled.test(std::memory_order_consume)) { + if (signalled.test(std::memory_order_relaxed)) { if (shouldDestroy) DestroyDependencies(); return; } + submitCondition.wait(lock, [&] { return submitted; }); + vk::Result waitResult; while ((waitResult = (*device).waitForFences(1, &fence, false, std::numeric_limits<u64>::max(), *device.getDispatcher())) != vk::Result::eSuccess) { if (waitResult == vk::Result::eTimeout) @@ -110,68 +155,14 @@ namespace skyline::gpu { throw exception("An error occurred while waiting for fence 0x{:X}: {}", static_cast<VkFence>(fence), vk::to_string(waitResult)); } - signalled.test_and_set(std::memory_order_release); + if (semaphoreUnsignalCycle) + semaphoreUnsignalCycle->Wait(); + + signalled.test_and_set(std::memory_order_relaxed); if (shouldDestroy) 
DestroyDependencies(); } - /** - * @brief Wait on a fence cycle with a timeout in nanoseconds - * @param shouldDestroy If true, the dependencies of this cycle will be destroyed after the fence is signalled - * @return If the wait was successful or timed out - */ - bool Wait(i64 timeoutNs, bool shouldDestroy = false) { - if (signalled.test(std::memory_order_consume)) { - if (shouldDestroy) - DestroyDependencies(); - return true; - } - - i64 startTime{util::GetTimeNs()}, initialTimeout{timeoutNs}; - if (!chainedCycles.AllOf([&](auto &cycle) { - if (!cycle->Wait(timeoutNs, shouldDestroy)) - return false; - timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime)); - return true; - })) - return false; - - std::unique_lock lock{mutex, std::defer_lock}; - if (!lock.try_lock_for(std::chrono::nanoseconds{timeoutNs})) - return false; - - if (!submitCondition.wait_for(lock, std::chrono::nanoseconds(timeoutNs), [&] { return submitted; })) - return false; - - timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime)); - - vk::Result waitResult; - while ((waitResult = (*device).waitForFences(1, &fence, false, static_cast<u64>(timeoutNs), *device.getDispatcher())) != vk::Result::eSuccess) { - if (waitResult == vk::Result::eTimeout) - break; - - if (waitResult == vk::Result::eErrorInitializationFailed) { - timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime)); - continue; - } - - throw exception("An error occurred while waiting for fence 0x{:X}: {}", static_cast<VkFence>(fence), vk::to_string(waitResult)); - } - - if (waitResult == vk::Result::eSuccess) { - signalled.test_and_set(std::memory_order_release); - if (shouldDestroy) - DestroyDependencies(); - return true; - } else { - return false; - } - } - - bool Wait(std::chrono::duration<i64, std::nano> timeout, bool shouldDestroy = false) { - return Wait(timeout.count(), shouldDestroy); - } - /** * @param quick Skips the call to check the fence's status, just checking the signalled flag * @return If the fence 
is signalled currently or not @@ -193,12 +184,21 @@ namespace skyline::gpu { if (!lock) return false; + if (signalled.test(std::memory_order_relaxed)) { + if (shouldDestroy) + DestroyDependencies(); + return true; + } + if (!submitted) return false; auto status{(*device).getFenceStatus(fence, *device.getDispatcher())}; if (status == vk::Result::eSuccess) { - signalled.test_and_set(std::memory_order_release); + if (semaphoreUnsignalCycle && !semaphoreUnsignalCycle->Poll()) + return false; + + signalled.test_and_set(std::memory_order_relaxed); if (shouldDestroy) DestroyDependencies(); return true; diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp index b43c62ff..0cda42d9 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp @@ -28,11 +28,12 @@ namespace skyline::gpu::interconnect { }, commandBuffer{AllocateRaiiCommandBuffer(gpu, commandPool)}, fence{gpu.vkDevice, vk::FenceCreateInfo{ .flags = vk::FenceCreateFlagBits::eSignaled }}, - cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, true)} {} + semaphore{gpu.vkDevice, vk::SemaphoreCreateInfo{}}, + cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, *semaphore, true)} {} std::shared_ptr<FenceCycle> CommandRecordThread::Slot::Reset(GPU &gpu) { cycle->Wait(); - cycle = std::make_shared<FenceCycle>(gpu.vkDevice, *fence); + cycle = std::make_shared<FenceCycle>(*cycle); // Command buffer doesn't need to be reset since that's done implicitly by begin return cycle; } diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h index 72b3df0e..92da3c47 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h @@ -24,6 +24,7 @@ namespace skyline::gpu::interconnect { vk::raii::CommandPool commandPool; //!< Use one command pool per slot since 
command buffers from different slots may be recorded into on multiple threads at the same time vk::raii::CommandBuffer commandBuffer; vk::raii::Fence fence; + vk::raii::Semaphore semaphore; std::shared_ptr<FenceCycle> cycle; boost::container::stable_vector<node::NodeVariant> nodes; LinearAllocatorState<> allocator;