From e1a4325137c6efce48555beb22df540cdfbb6d81 Mon Sep 17 00:00:00 2001
From: PixelyIon <pixelyion@protonmail.com>
Date: Mon, 1 Aug 2022 22:08:32 +0530
Subject: [PATCH] Introduce `FenceCycle` Waiter Thread

A substantial amount of time is spent destroying dependencies for any threads waiting or polling `FenceCycle`s, this is not optimal as it blocks them from moving onto other tasks while destruction is a fundamentally async task and can be delayed.

This commit solves this by introducing a thread that is dedicated to waiting on every `FenceCycle` then signalling and destroying all dependencies which entirely fixes the issue of destruction blocking on more important threads.
---
 app/src/main/cpp/skyline/gpu.cpp              |  2 +-
 .../cpp/skyline/gpu/command_scheduler.cpp     | 58 +++++++++++++---
 .../main/cpp/skyline/gpu/command_scheduler.h  | 32 ++++++---
 app/src/main/cpp/skyline/gpu/fence_cycle.h    | 66 ++++++++++++-------
 .../gpu/interconnect/command_executor.cpp     |  2 +-
 5 files changed, 118 insertions(+), 42 deletions(-)
diff --git a/app/src/main/cpp/skyline/gpu.cpp b/app/src/main/cpp/skyline/gpu.cpp
index 3f5bd5ad..3db6f287 100644
--- a/app/src/main/cpp/skyline/gpu.cpp
+++ b/app/src/main/cpp/skyline/gpu.cpp
@@ -368,7 +368,7 @@ namespace skyline::gpu {
           vkDevice(CreateDevice(vkContext, vkPhysicalDevice, vkQueueFamilyIndex, traits)),
           vkQueue(vkDevice, vkQueueFamilyIndex, 0),
           memory(*this),
-          scheduler(*this),
+          scheduler(state, *this),
           presentation(state, *this),
           texture(*this),
           buffer(*this),
diff --git a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
index f41e9187..36f90043 100644
--- a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
+++ b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
@@ -2,19 +2,53 @@
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
 
 #include <gpu.h>
+#include <loader/loader.h>
 #include "command_scheduler.h"
 
 namespace skyline::gpu {
+    void CommandScheduler::WaiterThread() {
+        if (int result{pthread_setname_np(pthread_self(), "Sky-CycleWaiter")})
+            Logger::Warn("Failed to set the thread name: {}", strerror(result));
+
+        try {
+            signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);
+
+            cycleQueue.Process([](const std::shared_ptr<FenceCycle> &cycle) {
+                cycle->Wait(true);
+            }, [] {});
+        } catch (const signal::SignalException &e) {
+            Logger::Error("{}\nStack Trace:{}", e.what(), state.loader->GetStackTrace(e.frames));
+            if (state.process)
+                state.process->Kill(false);
+            else
+                std::rethrow_exception(std::current_exception());
+        } catch (const std::exception &e) {
+            Logger::Error(e.what());
+            if (state.process)
+                state.process->Kill(false);
+            else
+                std::rethrow_exception(std::current_exception());
+        }
+    }
+
     CommandScheduler::CommandBufferSlot::CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool)
         : device(device),
           commandBuffer(device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)),
           fence(device, vk::FenceCreateInfo{}),
           cycle(std::make_shared<FenceCycle>(device, *fence)) {}
 
-    CommandScheduler::CommandScheduler(GPU &pGpu) : gpu(pGpu), pool(std::ref(pGpu.vkDevice), vk::CommandPoolCreateInfo{
-        .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
-        .queueFamilyIndex = pGpu.vkQueueFamilyIndex,
-    }) {}
+    CommandScheduler::CommandScheduler(const DeviceState &state, GPU &pGpu)
+        : state{state},
+          gpu{pGpu},
+          waiterThread{&CommandScheduler::WaiterThread, this},
+          pool{std::ref(pGpu.vkDevice), vk::CommandPoolCreateInfo{
+              .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
+              .queueFamilyIndex = pGpu.vkQueueFamilyIndex,
+          }} {}
+
+    CommandScheduler::~CommandScheduler() {
+        waiterThread.join();
+    }
 
     CommandScheduler::ActiveCommandBuffer CommandScheduler::AllocateCommandBuffer() {
         for (auto &slot : pool->buffers) {
@@ -42,11 +76,15 @@ namespace skyline::gpu {
         return {pool->buffers.emplace_back(gpu.vkDevice, commandBuffer, pool->vkCommandPool)};
     }
 
-    void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, vk::Fence fence) {
-        std::scoped_lock lock(gpu.queueMutex);
-        gpu.vkQueue.submit(vk::SubmitInfo{
-            .commandBufferCount = 1,
-            .pCommandBuffers = &*commandBuffer,
-        }, fence);
+    void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle) {
+        {
+            std::scoped_lock lock(gpu.queueMutex);
+            gpu.vkQueue.submit(vk::SubmitInfo{
+                .commandBufferCount = 1,
+                .pCommandBuffers = &*commandBuffer,
+            }, cycle->fence);
+        }
+
+        cycleQueue.Push(cycle);
     }
 }
diff --git a/app/src/main/cpp/skyline/gpu/command_scheduler.h b/app/src/main/cpp/skyline/gpu/command_scheduler.h
index 816112f1..050d6b0e 100644
--- a/app/src/main/cpp/skyline/gpu/command_scheduler.h
+++ b/app/src/main/cpp/skyline/gpu/command_scheduler.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <common/thread_local.h>
+#include <common/circular_queue.h>
 #include "fence_cycle.h"
 
 namespace skyline::gpu {
@@ -25,6 +26,7 @@ namespace skyline::gpu {
             CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool);
         };
 
+        const DeviceState &state;
         GPU &gpu;
 
         /**
@@ -40,6 +42,12 @@ namespace skyline::gpu {
         };
         ThreadLocal<CommandPool> pool;
 
+        std::thread waiterThread; //!< A thread that waits on and signals FenceCycle(s) then clears any associated resources
+        static constexpr size_t FenceCycleWaitCount{256}; //!< The amount of fence cycles the cycle queue can hold
+        CircularQueue<std::shared_ptr<FenceCycle>> cycleQueue{FenceCycleWaitCount}; //!< A circular queue containing all the active cycles that can be waited on
+
+        void WaiterThread();
+
       public:
         /**
          * @brief An active command buffer occupies a slot and ensures that its status is updated correctly
@@ -92,7 +100,9 @@ namespace skyline::gpu {
             }
         };
 
-        CommandScheduler(GPU &gpu);
+        CommandScheduler(const DeviceState &state, GPU &gpu);
+
+        ~CommandScheduler();
 
         /**
          * @brief Allocates an existing or new primary command buffer from the pool
@@ -100,9 +110,11 @@ namespace skyline::gpu {
         ActiveCommandBuffer AllocateCommandBuffer();
 
         /**
-         * @brief Submits a single command buffer to the GPU queue with an optional fence
+         * @brief Submits a single command buffer to the GPU queue while queuing it up to be waited on
+         * @note The supplied command buffer and cycle **must** be from AllocateCommandBuffer()
+         * @note Any cycle submitted via this method does not need to destroy dependencies manually, the waiter thread will handle this
          */
-        void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, vk::Fence fence = {});
+        void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle);
 
         /**
          * @brief Submits a command buffer recorded with the supplied function synchronously
@@ -116,8 +128,10 @@ namespace skyline::gpu {
                 });
                 recordFunction(*commandBuffer);
                 commandBuffer->end();
-                SubmitCommandBuffer(*commandBuffer, commandBuffer.GetFence());
-                return commandBuffer.GetFenceCycle();
+
+                auto cycle{commandBuffer.GetFenceCycle()};
+                SubmitCommandBuffer(*commandBuffer, cycle);
+                return cycle;
             } catch (...) {
                 commandBuffer.GetFenceCycle()->Cancel();
                 std::rethrow_exception(std::current_exception());
@@ -130,14 +144,16 @@ namespace skyline::gpu {
         template<typename RecordFunction>
         std::shared_ptr<FenceCycle> SubmitWithCycle(RecordFunction recordFunction) {
             auto commandBuffer{AllocateCommandBuffer()};
+            auto cycle{commandBuffer.GetFenceCycle()};
             try {
                 commandBuffer->begin(vk::CommandBufferBeginInfo{
                     .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
                 });
-                recordFunction(*commandBuffer, commandBuffer.GetFenceCycle());
+                recordFunction(*commandBuffer, cycle);
                 commandBuffer->end();
-                SubmitCommandBuffer(*commandBuffer, commandBuffer.GetFence());
-                return commandBuffer.GetFenceCycle();
+
+                SubmitCommandBuffer(*commandBuffer, cycle);
+                return cycle;
             } catch (...) {
                 commandBuffer.GetFenceCycle()->Cancel();
                 std::rethrow_exception(std::current_exception());
diff --git a/app/src/main/cpp/skyline/gpu/fence_cycle.h b/app/src/main/cpp/skyline/gpu/fence_cycle.h
index a0966561..89c3d140 100644
--- a/app/src/main/cpp/skyline/gpu/fence_cycle.h
+++ b/app/src/main/cpp/skyline/gpu/fence_cycle.h
@@ -8,7 +8,7 @@
 #include <common/atomic_forward_list.h>
 
 namespace skyline::gpu {
-    struct FenceCycle;
+    class CommandScheduler;
 
     /**
      * @brief A wrapper around a Vulkan Fence which only tracks a single reset -> signal cycle with the ability to attach lifetimes of objects to it
@@ -17,19 +17,23 @@ namespace skyline::gpu {
      */
     struct FenceCycle {
       private:
-        std::atomic_flag signalled;
+        std::atomic_flag signalled{}; //!< If the underlying fence has been signalled since the creation of this FenceCycle, this doesn't necessarily mean the dependencies have been destroyed
+        std::atomic_flag alreadyDestroyed{}; //!< If the cycle's dependencies are already destroyed, this prevents multiple destructions
         const vk::raii::Device &device;
         vk::Fence fence;
 
+        friend CommandScheduler;
+
         AtomicForwardList<std::shared_ptr<void>> dependencies; //!< A list of all dependencies on this fence cycle
         AtomicForwardList<std::shared_ptr<FenceCycle>> chainedCycles; //!< A list of all chained FenceCycles, this is used to express multi-fence dependencies
 
         /**
-         * @brief Sequentially iterate through the shared_ptr linked list of dependencies and reset all pointers in a thread-safe atomic manner
-         * @note We cannot simply nullify the base pointer of the list as a false dependency chain is maintained between the objects when retained externally
+         * @brief Destroy all the dependencies of this cycle
+         * @note We cannot delete the chained cycles associated with this fence as they may be iterated over during the deletion, it is only safe to delete them during the destruction of the cycle
          */
         void DestroyDependencies() {
-            dependencies.Clear();
+            if (!alreadyDestroyed.test_and_set(std::memory_order_release))
+                dependencies.Clear();
         }
 
       public:
@@ -45,19 +49,23 @@ namespace skyline::gpu {
          * @brief Signals this fence regardless of if the underlying fence has been signalled or not
          */
         void Cancel() {
-            if (!signalled.test_and_set(std::memory_order_release))
-                DestroyDependencies();
+            signalled.test_and_set(std::memory_order_release);
+            DestroyDependencies();
         }
 
         /**
          * @brief Wait on a fence cycle till it has been signalled
+         * @param shouldDestroy If true, the dependencies of this cycle will be destroyed after the fence is signalled
          */
-        void Wait() {
-            if (signalled.test(std::memory_order_consume))
+        void Wait(bool shouldDestroy = false) {
+            if (signalled.test(std::memory_order_consume)) {
+                if (shouldDestroy)
+                    DestroyDependencies();
                 return;
+            }
 
-            chainedCycles.Iterate([](auto &cycle) {
-                cycle->Wait();
+            chainedCycles.Iterate([shouldDestroy](auto &cycle) {
+                cycle->Wait(shouldDestroy);
             });
 
             vk::Result waitResult;
@@ -73,21 +81,26 @@ namespace skyline::gpu {
                 throw exception("An error occurred while waiting for fence 0x{:X}: {}", static_cast<VkFence>(fence), vk::to_string(waitResult));
             }
 
-            if (!signalled.test_and_set(std::memory_order_release))
+            signalled.test_and_set(std::memory_order_release);
+            if (shouldDestroy)
                 DestroyDependencies();
         }
 
         /**
          * @brief Wait on a fence cycle with a timeout in nanoseconds
+         * @param shouldDestroy If true, the dependencies of this cycle will be destroyed after the fence is signalled
          * @return If the wait was successful or timed out
          */
-        bool Wait(i64 timeoutNs) {
-            if (signalled.test(std::memory_order_consume))
+        bool Wait(i64 timeoutNs, bool shouldDestroy = false) {
+            if (signalled.test(std::memory_order_consume)) {
+                if (shouldDestroy)
+                    DestroyDependencies();
                 return true;
+            }
 
             i64 startTime{util::GetTimeNs()}, initialTimeout{timeoutNs};
             if (!chainedCycles.AllOf([&](auto &cycle) {
-                if (!cycle->Wait(timeoutNs))
+                if (!cycle->Wait(timeoutNs, shouldDestroy))
                     return false;
                 timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
                 return true;
@@ -108,7 +121,8 @@ namespace skyline::gpu {
             }
 
             if (waitResult == vk::Result::eSuccess) {
-                if (!signalled.test_and_set(std::memory_order_release))
+                signalled.test_and_set(std::memory_order_release);
+                if (shouldDestroy)
                     DestroyDependencies();
                 return true;
             } else {
@@ -116,23 +130,31 @@ namespace skyline::gpu {
             }
         }
 
-        bool Wait(std::chrono::duration<i64, std::nano> timeout) {
-            return Wait(timeout.count());
+        bool Wait(std::chrono::duration<i64, std::nano> timeout, bool shouldDestroy = false) {
+            return Wait(timeout.count(), shouldDestroy);
         }
 
         /**
+         * @param quick Skips the call to check the fence's status, just checking the signalled flag
          * @return If the fence is signalled currently or not
          */
-        bool Poll() {
-            if (signalled.test(std::memory_order_consume))
+        bool Poll(bool quick = true, bool shouldDestroy = false) {
+            if (signalled.test(std::memory_order_consume)) {
+                if (shouldDestroy)
+                    DestroyDependencies();
                 return true;
+            }
 
-            if (!chainedCycles.AllOf([](auto &cycle) { return cycle->Poll(); }))
+            if (quick)
+                return false; // We need to return early if we're not waiting on the fence
+
+            if (!chainedCycles.AllOf([=](auto &cycle) { return cycle->Poll(quick, shouldDestroy); }))
                 return false;
 
             auto status{(*device).getFenceStatus(fence, *device.getDispatcher())};
             if (status == vk::Result::eSuccess) {
-                if (!signalled.test_and_set(std::memory_order_release))
+                signalled.test_and_set(std::memory_order_release);
+                if (shouldDestroy)
                     DestroyDependencies();
                 return true;
             } else {
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index 4f2291f4..efb4f5c2 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -289,7 +289,7 @@ namespace skyline::gpu::interconnect {
             for (const auto &attachedBuffer : attachedBuffers)
                 attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer, this is done directly prior to submission to prevent stalls
 
-            gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence());
+            gpu.scheduler.SubmitCommandBuffer(commandBuffer, cycle);
 
             nodes.clear();