Introduce FenceCycle Chaining

If we want to allow submitting multiple pieces of work to the GPU at once while still requiring CPU synchronization, we'll need to track all past fence cycles associated with a resource alongside the current one. To solve this, the concept of chaining fences has been introduced: fences from past usages can be chained to the latest fence, which will then recursively forward operations to the chained fences.

This change also ends up mandating a move away from `FenceCycleDependency`, as it would prevent fences from concurrently locking the same resources, which chaining requires: two fences being chained fundamentally means they're locking the same resources. The `AtomicForwardList` is therefore used as the new container.
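For reference, a minimal sketch of the shape such a container could take — this is an illustration based only on the operations this commit uses (`Append`, `Iterate`, `AllOf`, `Clear`), not the actual implementation in `common/atomic_forward_list.h`; the real class also accepts parameter packs in `Append`, and a production version would need to handle `Clear` racing with `Iterate`:

#include <atomic>
#include <utility>

template<typename Type>
class AtomicForwardList {
  private:
    struct Node {
        Type item;
        Node *next;
    };
    std::atomic<Node *> head{nullptr};

  public:
    // Lock-free prepend (a Treiber-stack push), safe to call from multiple threads
    void Append(Type item) {
        Node *node{new Node{std::move(item), head.load(std::memory_order_relaxed)}};
        while (!head.compare_exchange_weak(node->next, node, std::memory_order_release, std::memory_order_relaxed));
    }

    // Invoke a callback on every item currently in the list
    template<typename Callback>
    void Iterate(Callback callback) {
        for (Node *node{head.load(std::memory_order_acquire)}; node; node = node->next)
            callback(node->item);
    }

    // Check that a predicate holds for every item, short-circuiting on failure
    template<typename Predicate>
    bool AllOf(Predicate predicate) {
        for (Node *node{head.load(std::memory_order_acquire)}; node; node = node->next)
            if (!predicate(node->item))
                return false;
        return true;
    }

    // Atomically detach the entire list, then free it
    void Clear() {
        Node *node{head.exchange(nullptr, std::memory_order_acquire)};
        while (node) {
            Node *next{node->next};
            delete node;
            node = next;
        }
    }
};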
PixelyIon 2022-06-26 12:26:18 +05:30
parent cf9e31c1eb
commit 07d45ee504
8 changed files with 71 additions and 88 deletions

View File

@@ -198,7 +198,7 @@ namespace skyline::gpu {
/**
* @brief A FenceCycleDependency that synchronizes the contents of a host buffer with the guest buffer
*/
struct BufferGuestSync : public FenceCycleDependency {
struct BufferGuestSync {
std::shared_ptr<Buffer> buffer;
explicit BufferGuestSync(std::shared_ptr<Buffer> buffer) : buffer(std::move(buffer)) {}
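Dropping the `FenceCycleDependency` base here (and in the classes below) is safe because dependencies are now held as `std::shared_ptr<void>`: the deleter captured in a `shared_ptr`'s control block survives conversion to `shared_ptr<void>`, so the correct destructor still runs when the fence cycle drops its references. A self-contained illustration of that property (the `Dependency` type is hypothetical, purely for demonstration):

#include <cstdio>
#include <memory>
#include <vector>

struct Dependency {
    ~Dependency() { std::puts("~Dependency"); }
};

int main() {
    std::vector<std::shared_ptr<void>> dependencies; // No common base class required
    dependencies.emplace_back(std::make_shared<Dependency>());
    dependencies.clear(); // Prints "~Dependency": the control block remembers the real type's deleter
}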

View File

@@ -20,7 +20,7 @@ namespace skyline::gpu {
* @brief A buffer which is backed by host constructs while being synchronized with the underlying guest buffer
* @note This class conforms to the Lockable and BasicLockable C++ named requirements
*/
class Buffer : public std::enable_shared_from_this<Buffer>, public FenceCycleDependency {
class Buffer : public std::enable_shared_from_this<Buffer> {
private:
GPU &gpu;
std::mutex mutex; //!< Synchronizes any mutations to the buffer or its backing
@@ -93,7 +93,7 @@ namespace skyline::gpu {
* @brief A delegate for a strong reference to a Buffer by a BufferView which can be changed to another Buffer transparently
* @note This class conforms to the Lockable and BasicLockable C++ named requirements
*/
struct BufferDelegate : public FenceCycleDependency {
struct BufferDelegate {
std::shared_ptr<Buffer> buffer;
const Buffer::BufferViewStorage *view;
std::function<void(const BufferViewStorage &, const std::shared_ptr<Buffer> &)> usageCallback;

View File

@@ -3,23 +3,13 @@
#pragma once
#include <forward_list>
#include <vulkan/vulkan_raii.hpp>
#include <common.h>
#include <common/atomic_forward_list.h>
namespace skyline::gpu {
struct FenceCycle;
/**
* @brief Any object whose lifetime can be attached to a fence cycle needs to inherit this class
*/
struct FenceCycleDependency {
private:
std::shared_ptr<FenceCycleDependency> next{}; //!< A shared pointer to the next dependency to form a linked list
FenceCycle *cycle{}; //!< A pointer to the fence cycle owning this, used to prevent attaching to the same cycle multiple times but should never be used to access the object due to not being an owning reference nor nullified on the destruction of the owning cycle
friend FenceCycle;
};
/**
* @brief A wrapper around a Vulkan Fence which only tracks a single reset -> signal cycle with the ability to attach lifetimes of objects to it
* @note This provides the guarantee that the fence must be signalled prior to destruction when objects are to be destroyed
@@ -30,23 +20,20 @@ namespace skyline::gpu {
std::atomic_flag signalled;
const vk::raii::Device &device;
vk::Fence fence;
std::shared_ptr<FenceCycleDependency> list{};
AtomicForwardList<std::shared_ptr<void>> dependencies; //!< A list of all dependencies on this fence cycle
AtomicForwardList<std::shared_ptr<FenceCycle>> chainedCycles; //!< A list of all chained FenceCycles, this is used to express multi-fence dependencies
/**
* @brief Sequentially iterate through the shared_ptr linked list of dependencies and reset all pointers in a thread-safe atomic manner
* @note We cannot simply nullify the base pointer of the list as a false dependency chain is maintained between the objects when retained exteranlly
* @note We cannot simply nullify the base pointer of the list as a false dependency chain is maintained between the objects when retained externally
*/
void DestroyDependencies() {
auto current{std::atomic_exchange_explicit(&list, std::shared_ptr<FenceCycleDependency>{}, std::memory_order_acquire)};
while (current) {
std::shared_ptr<FenceCycleDependency> next{};
next.swap(current->next);
current.swap(next);
}
dependencies.Clear();
}
public:
FenceCycle(const vk::raii::Device &device, vk::Fence fence) : signalled(false), device(device), fence(fence) {
FenceCycle(const vk::raii::Device &device, vk::Fence fence) : signalled{false}, device{device}, fence{fence} {
device.resetFences(fence);
}
@@ -69,6 +56,10 @@ namespace skyline::gpu {
if (signalled.test(std::memory_order_consume))
return;
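// Wait on all chained cycles before waiting on this cycle's own fence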
chainedCycles.Iterate([](auto &cycle) {
cycle->Wait();
});
vk::Result waitResult;
while ((waitResult = (*device).waitForFences(1, &fence, false, std::numeric_limits<u64>::max(), *device.getDispatcher())) != vk::Result::eSuccess) {
if (waitResult == vk::Result::eTimeout)
@@ -90,10 +81,33 @@ namespace skyline::gpu {
* @brief Wait on a fence cycle with a timeout in nanoseconds
* @return If the wait was successful or timed out
*/
bool Wait(std::chrono::duration<u64, std::nano> timeout) {
bool Wait(i64 timeoutNs) {
if (signalled.test(std::memory_order_consume))
return true;
if ((*device).waitForFences(1, &fence, false, std::numeric_limits<u64>::max(), *device.getDispatcher()) == vk::Result::eSuccess) {
i64 startTime{util::GetTimeNs()}, initialTimeout{timeoutNs};
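// Wait on every chained cycle first, deducting the time each wait consumed from the remaining timeout budget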
if (!chainedCycles.AllOf([&](auto &cycle) {
if (!cycle->Wait(timeoutNs))
return false;
timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
return true;
}))
return false;
vk::Result waitResult;
while ((waitResult = (*device).waitForFences(1, &fence, false, static_cast<u64>(timeoutNs), *device.getDispatcher())) != vk::Result::eSuccess) {
if (waitResult == vk::Result::eTimeout)
break;
if (waitResult == vk::Result::eErrorInitializationFailed) {
timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
continue;
}
throw exception("An error occurred while waiting for fence 0x{:X}: {}", static_cast<VkFence>(fence), vk::to_string(waitResult));
}
if (waitResult == vk::Result::eSuccess) {
if (!signalled.test_and_set(std::memory_order_release))
DestroyDependencies();
return true;
@@ -102,12 +116,20 @@ namespace skyline::gpu {
}
}
bool Wait(std::chrono::duration<i64, std::nano> timeout) {
return Wait(timeout.count());
}
/**
* @return If the fence is signalled currently or not
*/
bool Poll() {
if (signalled.test(std::memory_order_consume))
return true;
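// If any chained cycle hasn't signalled yet, this cycle cannot have signalled either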
if (!chainedCycles.AllOf([](auto &cycle) { return cycle->Poll(); }))
return false;
auto status{(*device).getFenceStatus(fence, *device.getDispatcher())};
if (status == vk::Result::eSuccess) {
if (!signalled.test_and_set(std::memory_order_release))
@@ -121,66 +143,27 @@ namespace skyline::gpu {
/**
* @brief Attach the lifetime of an object to the fence being signalled
*/
void AttachObject(const std::shared_ptr<FenceCycleDependency> &dependency) {
if (dependency->cycle != this && !signalled.test(std::memory_order_consume)) {
// Only try to insert if the object isn't already owned by this fence cycle and it hasn't been signalled yet
dependency->next = std::atomic_load_explicit(&list, std::memory_order_acquire);
do {
if (dependency->next == nullptr && signalled.test(std::memory_order_consume)) {
// `list` will be nullptr after being signalled, dependencies will not be destroyed and we need to do so ourselves
dependency->next.reset(); // We need to reset the dependency so it doesn't incorrectly extend the lifetime of any resources
return;
}
} while (!std::atomic_compare_exchange_strong_explicit(&list, &dependency->next, dependency, std::memory_order_release, std::memory_order_acquire));
dependency->cycle = this;
}
void AttachObject(const std::shared_ptr<void> &dependency) {
if (!signalled.test(std::memory_order_consume))
dependencies.Append(dependency);
}
/**
* @brief A version of AttachObject optimized for several objects being attached at once
*/
void AttachObjects(std::initializer_list<std::shared_ptr<FenceCycleDependency>> dependencies) {
if (!signalled.test(std::memory_order_consume)) {
{
auto it{dependencies.begin()}, next{std::next(it)};
if (it != dependencies.end()) {
// Only try to insert if the object isn't already owned by this fence cycle
for (; next != dependencies.end(); next++) {
if ((*it)->cycle != this) {
(*it)->next = *next;
(*it)->cycle = this;
}
it = next;
}
}
}
auto &dependency{*dependencies.begin()};
auto &lastDependency{*std::prev(dependencies.end())};
lastDependency->next = std::atomic_load_explicit(&list, std::memory_order_acquire);
do {
if (lastDependency->next == nullptr && signalled.test(std::memory_order_consume)) {
// `list` will be nullptr after being signalled, dependencies will not be destroyed and we need to do so ourselves
auto current{dependency->next}; // We need to reset any chained dependencies before exiting
while (current) {
std::shared_ptr<FenceCycleDependency> next{};
next.swap(current->next);
current.swap(next);
}
return;
}
} while (!std::atomic_compare_exchange_strong_explicit(&list, &dependency->next, dependency, std::memory_order_release, std::memory_order_acquire));
}
template<typename... Dependencies>
void AttachObjects(Dependencies &&... pDependencies) {
if (!signalled.test(std::memory_order_consume))
dependencies.Append(pDependencies...);
}
template<typename... Dependencies>
void AttachObjects(Dependencies &&... dependencies) {
AttachObjects(std::initializer_list<std::shared_ptr<FenceCycleDependency>>{std::forward<Dependencies>(dependencies)...});
/**
* @brief Chains another cycle to this cycle, this cycle will not be signalled till the supplied cycle is signalled
* @param cycle The cycle to chain to this one, this is nullable and this function will be a no-op if this is nullptr
*/
void ChainCycle(const std::shared_ptr<FenceCycle> &cycle) {
if (cycle && !signalled.test(std::memory_order_consume) && cycle.get() != this && !cycle->Poll())
chainedCycles.Append(cycle); // If the cycle isn't the current cycle or already signalled, we need to chain it
}
};
}
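One subtlety in the timed `Wait` above: `timeoutNs` acts as a single budget shared across the whole chain, with each chained wait (and each `eErrorInitializationFailed` retry) deducting the time it consumed before the remainder is handed to `waitForFences`. The same deadline pattern in isolation (a hypothetical helper, sketch only — the commit itself uses `util::GetTimeNs()` rather than `std::chrono`):

#include <algorithm>
#include <chrono>
#include <cstdint>

// Nanoseconds left of an overall budget, clamped at zero, mirroring how the
// timed Wait() rebudgets its timeout after each chained wait
int64_t RemainingNs(std::chrono::steady_clock::time_point start, int64_t budgetNs) {
    auto elapsed{std::chrono::steady_clock::now() - start};
    return std::max<int64_t>(0, budgetNs - std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count());
}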

View File

@@ -115,7 +115,7 @@ namespace skyline::gpu::interconnect {
return didLock;
}
void CommandExecutor::AttachDependency(const std::shared_ptr<FenceCycleDependency> &dependency) {
void CommandExecutor::AttachDependency(const std::shared_ptr<void> &dependency) {
cycle->AttachObject(dependency);
}

View File

@@ -114,7 +114,7 @@ namespace skyline::gpu::interconnect {
/**
* @brief Attach the lifetime of the fence cycle dependency to the command buffer
*/
void AttachDependency(const std::shared_ptr<FenceCycleDependency> &dependency);
void AttachDependency(const std::shared_ptr<void> &dependency);
/**
* @brief Adds a command that needs to be executed inside a subpass configured with certain attachments

View File

@@ -2043,7 +2043,7 @@ namespace skyline::gpu::interconnect {
u32 bindlessTextureConstantBufferIndex{};
std::shared_ptr<TextureView> nullTextureView; //!< View used instead of a null descriptor when an empty TIC is encountered, this avoids the need for the nullDescriptor VK feature
struct PoolTexture : public FenceCycleDependency {
struct PoolTexture {
GuestTexture guest;
std::weak_ptr<TextureView> view;
};
@@ -2338,7 +2338,7 @@ namespace skyline::gpu::interconnect {
private:
bool tscIndexLinked{}; //!< If the TSC index in bindless texture handles is the same as the TIC index or if it's independent from the TIC index
struct Sampler : public vk::raii::Sampler, public FenceCycleDependency {
struct Sampler : public vk::raii::Sampler {
using vk::raii::Sampler::Sampler;
};
@@ -2937,7 +2937,7 @@ namespace skyline::gpu::interconnect {
}, programState.descriptorSetBindings)};
// Draw Persistent Storage
struct DrawStorage : FenceCycleDependency {
struct DrawStorage {
ShaderProgramState::DescriptorSetWrites descriptorSetWrites;
std::optional<DescriptorAllocator::ActiveDescriptorSet> descriptorSet;

View File

@@ -39,7 +39,7 @@ namespace skyline::gpu::memory {
/**
* @brief A Buffer that can be independently attached to a fence cycle
*/
class StagingBuffer : public Buffer, public FenceCycleDependency {
class StagingBuffer : public Buffer {
using Buffer::Buffer;
};

View File

@@ -293,7 +293,7 @@ namespace skyline::gpu {
* @note The object **must** be locked prior to accessing any members as values will be mutated
* @note This class conforms to the Lockable and BasicLockable C++ named requirements
*/
class TextureView : public FenceCycleDependency, public std::enable_shared_from_this<TextureView> {
class TextureView : public std::enable_shared_from_this<TextureView> {
private:
vk::ImageView vkView{};
@@ -350,7 +350,7 @@ namespace skyline::gpu {
* @brief A texture which is backed by host constructs while being synchronized with the underlying guest texture
* @note This class conforms to the Lockable and BasicLockable C++ named requirements
*/
class Texture : public std::enable_shared_from_this<Texture>, public FenceCycleDependency {
class Texture : public std::enable_shared_from_this<Texture> {
private:
GPU &gpu;
std::mutex mutex; //!< Synchronizes any mutations to the texture or its backing
@@ -415,9 +415,9 @@ namespace skyline::gpu {
void CopyToGuest(u8 *hostBuffer);
/**
* @brief A FenceCycleDependency that copies the contents of a staging buffer or mapped image backing the texture to the guest texture
* @brief A fence cycle dependency that copies the contents of a staging buffer or mapped image backing the texture to the guest texture
*/
struct TextureBufferCopy : public FenceCycleDependency {
struct TextureBufferCopy {
std::shared_ptr<Texture> texture;
std::shared_ptr<memory::StagingBuffer> stagingBuffer;