Support using Vulkan semaphores with fence cycles
In some cases, such as presentation, it may be possible to avoid waiting on the CPU by using a semaphore to indicate GPU completion. Due to the binary nature of Vulkan semaphores this requires a fair bit of code, as we need to ensure semaphores are always unsignalled before they are waited on and signalled again. This is achieved with a special kind of chained cycle that can be added even after guest GPFIFO processing for a given cycle: the main cycle's semaphore can be waited on, and the cycle for that wait is then attached to the main cycle so it will be waited on before the main cycle signals.
This commit is contained in:
Parent: 5b72be88c3
Commit: 0670e0e0dc
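
To illustrate the mechanism the commit message describes, here is a hedged sketch of the chained-cycle idea under a simplified model; the struct and member names are illustrative and do not match the actual FenceCycle implementation in the diff below:

// Illustrative sketch only: a cycle pairs a fence with a binary semaphore, and the later
// submission that waits on (and therefore unsignals) the semaphore is chained onto it so
// the semaphore is known to be reusable before the cycle counts as done.
#include <memory>
#include <vulkan/vulkan.hpp>

struct Cycle {
    vk::Fence fence;                      // Signalled by the submission this cycle tracks
    vk::Semaphore semaphore;              // Binary semaphore signalled by the same submission
    std::shared_ptr<Cycle> unsignalCycle; // Later submission that waits on, and so unsignals, the semaphore

    // Only report completion once the fence has signalled and any chained unsignal
    // submission has also completed, otherwise the binary semaphore may still be signalled
    bool Complete(const vk::Device &device) const {
        if (device.getFenceStatus(fence) != vk::Result::eSuccess)
            return false;
        return !unsignalCycle || unsignalCycle->Complete(device);
    }
};

This mirrors why the diff below tracks semaphoreUnsignalCycle and waits on it in Wait()/Poll() before marking the cycle signalled. The diff itself follows.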
@@ -32,10 +32,11 @@ namespace skyline::gpu {
     }
 
     CommandScheduler::CommandBufferSlot::CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool)
-        : device(device),
-          commandBuffer(device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)),
-          fence(device, vk::FenceCreateInfo{}),
-          cycle(std::make_shared<FenceCycle>(device, *fence)) {}
+        : device{device},
+          commandBuffer{device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)},
+          fence{device, vk::FenceCreateInfo{}},
+          semaphore{device, vk::SemaphoreCreateInfo{}},
+          cycle{std::make_shared<FenceCycle>(device, *fence, *semaphore)} {}
 
     CommandScheduler::CommandScheduler(const DeviceState &state, GPU &pGpu)
         : state{state},
@@ -55,7 +56,7 @@ namespace skyline::gpu {
             if (!slot.active.test_and_set(std::memory_order_acq_rel)) {
                 if (slot.cycle->Poll()) {
                     slot.commandBuffer.reset();
-                    slot.cycle = std::make_shared<FenceCycle>(slot.device, *slot.fence);
+                    slot.cycle = std::make_shared<FenceCycle>(*slot.cycle);
                     return {slot};
                 } else {
                     slot.active.clear(std::memory_order_release);
@@ -76,12 +77,29 @@ namespace skyline::gpu {
         return {pool->buffers.emplace_back(gpu.vkDevice, commandBuffer, pool->vkCommandPool)};
     }
 
-    void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle) {
+    void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle, span<vk::Semaphore> waitSemaphores, span<vk::Semaphore> signalSemaphores) {
+        boost::container::small_vector<vk::Semaphore, 3> fullWaitSemaphores{waitSemaphores.begin(), waitSemaphores.end()};
+        boost::container::small_vector<vk::PipelineStageFlags, 3> fullWaitStages{waitSemaphores.size(), vk::PipelineStageFlagBits::eAllCommands};
+
+        if (cycle->semaphoreSubmitWait) {
+            fullWaitSemaphores.push_back(cycle->semaphore);
+            // We don't need a full barrier since this is only done to ensure the semaphore is unsignalled
+            fullWaitStages.push_back(vk::PipelineStageFlagBits::eTopOfPipe);
+        }
+
+        boost::container::small_vector<vk::Semaphore, 2> fullSignalSemaphores{signalSemaphores.begin(), signalSemaphores.end()};
+        fullSignalSemaphores.push_back(cycle->semaphore);
+
         {
-            std::scoped_lock lock(gpu.queueMutex);
+            std::scoped_lock lock{gpu.queueMutex};
             gpu.vkQueue.submit(vk::SubmitInfo{
                 .commandBufferCount = 1,
                 .pCommandBuffers = &*commandBuffer,
+                .waitSemaphoreCount = static_cast<u32>(waitSemaphores.size()),
+                .pWaitSemaphores = fullWaitSemaphores.data(),
+                .pWaitDstStageMask = fullWaitStages.data(),
+                .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
+                .pSignalSemaphores = fullSignalSemaphores.data(),
             }, cycle->fence);
         }
 
@@ -21,6 +21,7 @@ namespace skyline::gpu {
             const vk::raii::Device &device;
             vk::raii::CommandBuffer commandBuffer;
             vk::raii::Fence fence; //!< A fence used for tracking all submits of a buffer
+            vk::raii::Semaphore semaphore; //!< A semaphore used for tracking work status on the GPU
             std::shared_ptr<FenceCycle> cycle; //!< The latest cycle on the fence, all waits must be performed through this
 
             CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool);
@@ -94,7 +95,7 @@ namespace skyline::gpu {
              */
             std::shared_ptr<FenceCycle> Reset() {
                 slot->cycle->Wait();
-                slot->cycle = std::make_shared<FenceCycle>(slot->device, *slot->fence);
+                slot->cycle = std::make_shared<FenceCycle>(*slot->cycle);
                 slot->commandBuffer.reset();
                 return slot->cycle;
             }
@@ -114,13 +115,15 @@ namespace skyline::gpu {
          * @note The supplied command buffer and cycle **must** be from AllocateCommandBuffer()
          * @note Any cycle submitted via this method does not need to destroy dependencies manually, the waiter thread will handle this
          */
-        void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle);
+        void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle, span<vk::Semaphore> waitSemaphores = {}, span<vk::Semaphore> signalSemaphore = {});
 
         /**
          * @brief Submits a command buffer recorded with the supplied function synchronously
+         * @param waitSemaphores A span of all (excl fence cycle) semaphores that should be waited on by the GPU before executing the command buffer
+         * @param signalSemaphore A span of all semaphores that should be signalled by the GPU after executing the command buffer
          */
         template<typename RecordFunction>
-        std::shared_ptr<FenceCycle> Submit(RecordFunction recordFunction) {
+        std::shared_ptr<FenceCycle> Submit(RecordFunction recordFunction, span<vk::Semaphore> waitSemaphores = {}, span<vk::Semaphore> signalSemaphores = {}) {
             auto commandBuffer{AllocateCommandBuffer()};
             try {
                 commandBuffer->begin(vk::CommandBufferBeginInfo{
@@ -130,29 +133,7 @@ namespace skyline::gpu {
                 commandBuffer->end();
 
                 auto cycle{commandBuffer.GetFenceCycle()};
-                SubmitCommandBuffer(*commandBuffer, cycle);
-                return cycle;
-            } catch (...) {
-                commandBuffer.GetFenceCycle()->Cancel();
-                std::rethrow_exception(std::current_exception());
-            }
-        }
-
-        /**
-         * @note Same as Submit but with FenceCycle as an argument rather than return value
-         */
-        template<typename RecordFunction>
-        std::shared_ptr<FenceCycle> SubmitWithCycle(RecordFunction recordFunction) {
-            auto commandBuffer{AllocateCommandBuffer()};
-            auto cycle{commandBuffer.GetFenceCycle()};
-            try {
-                commandBuffer->begin(vk::CommandBufferBeginInfo{
-                    .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
-                });
-                recordFunction(*commandBuffer, cycle);
-                commandBuffer->end();
-
-                SubmitCommandBuffer(*commandBuffer, cycle);
+                SubmitCommandBuffer(*commandBuffer, cycle, waitSemaphores, signalSemaphores);
                 return cycle;
             } catch (...) {
                 commandBuffer.GetFenceCycle()->Cancel();
@@ -25,6 +25,10 @@ namespace skyline::gpu {
         std::condition_variable_any submitCondition;
         bool submitted{}; //!< If the fence has been submitted to the GPU
         vk::Fence fence;
+        vk::Semaphore semaphore; //!< Semaphore that will be signalled upon GPU completion of the fence
+        bool semaphoreSubmitWait{}; //!< If the semaphore needs to be waited on (on GPU) before the fence's command buffer begins. Used to ensure fences that wouldn't otherwise be unsignalled are unsignalled
+        bool nextSemaphoreSubmitWait{true}; //!< If the next fence cycle created from this one after it's signalled should wait on the semaphore to unsignal it
+        std::shared_ptr<FenceCycle> semaphoreUnsignalCycle{}; //!< If the semaphore is used on the GPU, the cycle for the submission that uses it, so it can be waited on before the fence is signalled to ensure the semaphore is unsignalled
 
         friend CommandScheduler;
 
@@ -41,11 +45,15 @@ namespace skyline::gpu {
         }
 
       public:
-        FenceCycle(const vk::raii::Device &device, vk::Fence fence, bool signalled = false) : signalled{signalled}, device{device}, fence{fence} {
+        FenceCycle(const vk::raii::Device &device, vk::Fence fence, vk::Semaphore semaphore, bool signalled = false) : signalled{signalled}, device{device}, fence{fence}, semaphore{semaphore}, nextSemaphoreSubmitWait{!signalled} {
             if (!signalled)
                 device.resetFences(fence);
         }
 
+        explicit FenceCycle(const FenceCycle &cycle) : signalled{false}, device{cycle.device}, fence{cycle.fence}, semaphore{cycle.semaphore}, semaphoreSubmitWait{cycle.nextSemaphoreSubmitWait} {
+            device.resetFences(fence);
+        }
+
         ~FenceCycle() {
             Wait();
         }
@@ -58,6 +66,33 @@ namespace skyline::gpu {
                 DestroyDependencies();
         }
 
+        /**
+         * @brief Executes a function with the fence locked to record a usage of its semaphore, if no semaphore can be provided then a CPU-side wait will be performed instead
+         */
+        std::shared_ptr<FenceCycle> RecordSemaphoreWaitUsage(std::function<std::shared_ptr<FenceCycle>(vk::Semaphore sema)> &&func) {
+            // We can't submit any semaphore waits until the signal has been submitted, so do that first
+            WaitSubmit();
+
+            std::unique_lock lock{mutex};
+
+            // If we already have a semaphore usage, just wait on the fence since we can't wait on it twice and have no way to add one after the fact
+            if (semaphoreUnsignalCycle) {
+                // Safe to unlock since semaphoreUnsignalCycle can never be reset
+                lock.unlock();
+
+                Wait();
+                return func({});
+            }
+
+            // If we're already signalled then there's no need to wait on the semaphore
+            if (signalled.test(std::memory_order_relaxed))
+                return func({});
+
+            semaphoreUnsignalCycle = func(semaphore);
+            nextSemaphoreSubmitWait = false; // We don't need a semaphore wait on the next fence cycle to unsignal the semaphore anymore as the usage will do that
+            return semaphoreUnsignalCycle;
+        }
+
         /**
          * @brief Waits for submission of the command buffer associated with this cycle to the GPU
         */
@@ -65,11 +100,21 @@ namespace skyline::gpu {
             if (signalled.test(std::memory_order_consume))
                 return;
 
+            std::unique_lock lock{mutex};
+            if (submitted)
+                return;
+
+            if (signalled.test(std::memory_order_consume))
+                return;
+
+            lock.unlock();
             chainedCycles.Iterate([&](const auto &cycle) {
                 if (!cycle->Find(this))
                     raise(SIGTRAP);
                 cycle->WaitSubmit();
             });
+            lock.lock();
+
-            std::unique_lock lock{mutex};
             submitCondition.wait(lock, [this] { return submitted; });
         }
 
@@ -84,19 +129,19 @@ namespace skyline::gpu {
                 return;
             }
 
-            chainedCycles.Iterate([shouldDestroy](auto &cycle) {
+            chainedCycles.Iterate([shouldDestroy, this](auto &cycle) {
                 cycle->Wait(shouldDestroy);
             });
 
             std::unique_lock lock{mutex};
-            submitCondition.wait(lock, [&] { return submitted; });
-
-            if (signalled.test(std::memory_order_consume)) {
+            if (signalled.test(std::memory_order_relaxed)) {
                 if (shouldDestroy)
                     DestroyDependencies();
                 return;
             }
 
+            submitCondition.wait(lock, [&] { return submitted; });
+
             vk::Result waitResult;
             while ((waitResult = (*device).waitForFences(1, &fence, false, std::numeric_limits<u64>::max(), *device.getDispatcher())) != vk::Result::eSuccess) {
                 if (waitResult == vk::Result::eTimeout)
@@ -110,68 +155,14 @@ namespace skyline::gpu {
                 throw exception("An error occurred while waiting for fence 0x{:X}: {}", static_cast<VkFence>(fence), vk::to_string(waitResult));
             }
 
-            signalled.test_and_set(std::memory_order_release);
+            if (semaphoreUnsignalCycle)
+                semaphoreUnsignalCycle->Wait();
+
+            signalled.test_and_set(std::memory_order_relaxed);
             if (shouldDestroy)
                 DestroyDependencies();
         }
 
-        /**
-         * @brief Wait on a fence cycle with a timeout in nanoseconds
-         * @param shouldDestroy If true, the dependencies of this cycle will be destroyed after the fence is signalled
-         * @return If the wait was successful or timed out
-         */
-        bool Wait(i64 timeoutNs, bool shouldDestroy = false) {
-            if (signalled.test(std::memory_order_consume)) {
-                if (shouldDestroy)
-                    DestroyDependencies();
-                return true;
-            }
-
-            i64 startTime{util::GetTimeNs()}, initialTimeout{timeoutNs};
-            if (!chainedCycles.AllOf([&](auto &cycle) {
-                if (!cycle->Wait(timeoutNs, shouldDestroy))
-                    return false;
-                timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
-                return true;
-            }))
-                return false;
-
-            std::unique_lock lock{mutex, std::defer_lock};
-            if (!lock.try_lock_for(std::chrono::nanoseconds{timeoutNs}))
-                return false;
-
-            if (!submitCondition.wait_for(lock, std::chrono::nanoseconds(timeoutNs), [&] { return submitted; }))
-                return false;
-
-            timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
-
-            vk::Result waitResult;
-            while ((waitResult = (*device).waitForFences(1, &fence, false, static_cast<u64>(timeoutNs), *device.getDispatcher())) != vk::Result::eSuccess) {
-                if (waitResult == vk::Result::eTimeout)
-                    break;
-
-                if (waitResult == vk::Result::eErrorInitializationFailed) {
-                    timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
-                    continue;
-                }
-
-                throw exception("An error occurred while waiting for fence 0x{:X}: {}", static_cast<VkFence>(fence), vk::to_string(waitResult));
-            }
-
-            if (waitResult == vk::Result::eSuccess) {
-                signalled.test_and_set(std::memory_order_release);
-                if (shouldDestroy)
-                    DestroyDependencies();
-                return true;
-            } else {
-                return false;
-            }
-        }
-
-        bool Wait(std::chrono::duration<i64, std::nano> timeout, bool shouldDestroy = false) {
-            return Wait(timeout.count(), shouldDestroy);
-        }
-
         /**
          * @param quick Skips the call to check the fence's status, just checking the signalled flag
          * @return If the fence is signalled currently or not
@@ -193,12 +184,21 @@ namespace skyline::gpu {
             if (!lock)
                 return false;
 
+            if (signalled.test(std::memory_order_relaxed)) {
+                if (shouldDestroy)
+                    DestroyDependencies();
+                return true;
+            }
+
+            if (!submitted)
+                return false;
+
             auto status{(*device).getFenceStatus(fence, *device.getDispatcher())};
             if (status == vk::Result::eSuccess) {
-                signalled.test_and_set(std::memory_order_release);
+                if (semaphoreUnsignalCycle && !semaphoreUnsignalCycle->Poll())
+                    return false;
+
+                signalled.test_and_set(std::memory_order_relaxed);
                 if (shouldDestroy)
                     DestroyDependencies();
                 return true;
@@ -28,11 +28,12 @@ namespace skyline::gpu::interconnect {
          },
          commandBuffer{AllocateRaiiCommandBuffer(gpu, commandPool)},
          fence{gpu.vkDevice, vk::FenceCreateInfo{ .flags = vk::FenceCreateFlagBits::eSignaled }},
-          cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, true)} {}
+          semaphore{gpu.vkDevice, vk::SemaphoreCreateInfo{}},
+          cycle{std::make_shared<FenceCycle>(gpu.vkDevice, *fence, *semaphore, true)} {}
 
     std::shared_ptr<FenceCycle> CommandRecordThread::Slot::Reset(GPU &gpu) {
         cycle->Wait();
-        cycle = std::make_shared<FenceCycle>(gpu.vkDevice, *fence);
+        cycle = std::make_shared<FenceCycle>(*cycle);
         // Command buffer doesn't need to be reset since that's done implicitly by begin
         return cycle;
     }
@@ -24,6 +24,7 @@ namespace skyline::gpu::interconnect {
             vk::raii::CommandPool commandPool; //!< Use one command pool per slot since command buffers from different slots may be recorded into on multiple threads at the same time
             vk::raii::CommandBuffer commandBuffer;
             vk::raii::Fence fence;
+            vk::raii::Semaphore semaphore;
             std::shared_ptr<FenceCycle> cycle;
             boost::container::stable_vector<node::NodeVariant> nodes;
             LinearAllocatorState<> allocator;
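
As a closing aside (not part of the commit), here is a minimal standalone sketch of how a binary semaphore left signalled by one submission can be unsignalled by an empty wait-only submission, which is the situation the semaphoreSubmitWait/eTopOfPipe handling above addresses. This assumes plain Vulkan-Hpp with VULKAN_HPP_NO_STRUCT_CONSTRUCTORS defined (so designated initializers work, as in the diff) and uses a hypothetical helper name:

// Hypothetical helper, not skyline code: submit an empty batch that waits on the
// semaphore purely to return it to the unsignalled state; the returned fence tells
// the caller when the semaphore is safe to signal again.
#include <vulkan/vulkan.hpp>

vk::Fence UnsignalSemaphore(vk::Device device, vk::Queue queue, vk::Semaphore semaphore) {
    vk::Fence fence{device.createFence(vk::FenceCreateInfo{})};
    // No real work follows the wait, so eTopOfPipe is enough and no full barrier is needed
    vk::PipelineStageFlags waitStage{vk::PipelineStageFlagBits::eTopOfPipe};
    queue.submit(vk::SubmitInfo{
        .waitSemaphoreCount = 1,
        .pWaitSemaphores = &semaphore,
        .pWaitDstStageMask = &waitStage,
    }, fence);
    return fence;
}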