From 98c0cc3e7f42a596317dfe318dc19f34f16d4ace Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Sun, 9 Oct 2022 13:23:42 +0100
Subject: [PATCH] Impl preserve attached buffers/textures to avoid GPFIFO lock
 thrashing

When profiling SMO, it became obvious that the constant locking of textures and buffers in SyncDescriptors took up a large amount of CPU time (3-5%), a precious resource in intensive areas like Metro. This commit implements somewhat of a workaround to avoid constant relocking, if a buffer is frequently attached on the GPU and almost never used on the CPU we can keep the lock held between executions. Of course it's not that simple though, if the guest tries to lock a texture for the first time which has already been locked as preserve on the GPFIFO we need to avoid a deadlock. This is acheived through a combination of two things: first we periodically clear the locked attachments every 2*SlotCount submissions, preventing a complete deadlock on the CPU (just a long wait instead) and meaning that the next time the resource is attached on the GPU it will not be marked for preservation due to having been locked on the guest before; second, we always need to unlock everything when the GPU thread runs out of work, as the perioding clearing will not execute in this case which would otherwise leave the textures locked on the GPFIFO thread forever (if guest was waiting on a lock to submit work). It should be noted that we don't clear preserve attached resources in the latter scenario, only unlock them and then relock when more work is available.
---
 app/src/main/cpp/skyline/gpu/buffer.cpp       |  2 +-
 app/src/main/cpp/skyline/gpu/buffer.h         |  7 +-
 .../gpu/interconnect/command_executor.cpp     | 84 +++++++++++++++----
 .../gpu/interconnect/command_executor.h       | 19 ++++-
 .../main/cpp/skyline/gpu/texture/texture.h    |  5 +-
 5 files changed, 94 insertions(+), 23 deletions(-)
diff --git a/app/src/main/cpp/skyline/gpu/buffer.cpp b/app/src/main/cpp/skyline/gpu/buffer.cpp
index 42862bbc..f53fc311 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@@ -345,7 +345,7 @@ namespace skyline::gpu {
 
     void Buffer::unlock() {
         tag = ContextTag{};
-        backingImmutability = BackingImmutability::None;
+        AllowAllBackingWrites();
         mutex.unlock();
     }
 
diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h
index f2d94461..342f6ffe 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@@ -43,7 +43,7 @@ namespace skyline::gpu {
     class Buffer : public std::enable_shared_from_this<Buffer> {
       private:
         GPU &gpu;
-        SpinLock mutex; //!< Synchronizes any mutations to the buffer or its backing
+        RecursiveSpinLock mutex; //!< Synchronizes any mutations to the buffer or its backing
         std::atomic<ContextTag> tag{}; //!< The tag associated with the last lock call
         memory::Buffer backing;
         std::optional<GuestBuffer> guest;
@@ -190,6 +190,11 @@ namespace skyline::gpu {
             backingImmutability = BackingImmutability::AllWrites;
         }
 
+        void AllowAllBackingWrites() {
+            std::scoped_lock lock{stateMutex};
+            backingImmutability = BackingImmutability::None;
+        }
+
         /**
          * @return If sequenced writes to the backing must not occur on the CPU
          * @note The buffer **must** be locked prior to calling this
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index 4ce188bb..ee35fac5 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
 
+#include <range/v3/view.hpp>
 #include <loader/loader.h>
 #include <gpu.h>
 #include "command_executor.h"
@@ -234,8 +235,12 @@ namespace skyline::gpu::interconnect {
             textureManagerLock.emplace(gpu.texture);
 
         bool didLock{view->LockWithTag(tag)};
-        if (didLock)
-            attachedTextures.emplace_back(view->texture);
+        if (didLock) {
+            if (view->texture->FrequentlyLocked())
+                attachedTextures.emplace_back(view->texture);
+            else
+                preserveAttachedTextures.emplace_back(view->texture);
+        }
 
         return didLock;
     }
@@ -259,8 +264,12 @@ namespace skyline::gpu::interconnect {
             bufferManagerLock.emplace(gpu.buffer);
 
         bool didLock{view.LockWithTag(tag)};
-        if (didLock)
-            attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
+        if (didLock) {
+            if (view.GetBuffer()->FrequentlyLocked())
+                attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
+            else
+                preserveAttachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
+        }
         return didLock;
     }
 
@@ -271,14 +280,20 @@ namespace skyline::gpu::interconnect {
 
         if (lock.OwnsLock()) {
             // Transfer ownership to executor so that the resource will stay locked for the period it is used on the GPU
-            attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
+            if (view.GetBuffer()->FrequentlyLocked())
+                attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
+            else
+                preserveAttachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
             lock.Release(); // The executor will handle unlocking the lock so it doesn't need to be handled here
         }
     }
 
     void CommandExecutor::AttachLockedBuffer(std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
         if (lock.OwnsLock()) {
-            attachedBuffers.emplace_back(std::move(buffer));
+            if (buffer->FrequentlyLocked())
+                attachedBuffers.emplace_back(std::move(buffer));
+            else
+                preserveAttachedBuffers.emplace_back(std::move(buffer));
             lock.Release(); // See AttachLockedBufferView(...)
         }
     }
@@ -381,24 +396,25 @@ namespace skyline::gpu::interconnect {
                 }, {}, {}
             );
 
-            for (const auto &texture : attachedTextures)
+            boost::container::small_vector<FenceCycle *, 8> chainedCycles;
+            for (const auto &texture : ranges::views::concat(attachedTextures, preserveAttachedTextures)) {
                 texture->SynchronizeHostInline(slot->commandBuffer, cycle, true);
+                // We don't need to attach the Texture to the cycle as a TextureView will already be attached
+                if (ranges::find(chainedCycles, texture->cycle.get()) == chainedCycles.end()) {
+                    cycle->ChainCycle(texture->cycle);
+                    chainedCycles.emplace_back(texture->cycle.get());
+                }
+
+                texture->cycle = cycle;
+            }
         }
 
-        for (const auto &attachedBuffer : attachedBuffers)
-            if (attachedBuffer->SequencedCpuBackingWritesBlocked())
+        for (const auto &attachedBuffer : ranges::views::concat(attachedBuffers, preserveAttachedBuffers)) {
+            if (attachedBuffer->RequiresCycleAttach()) {
                 attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer
-
-        for (const auto &attachedTexture : attachedTextures) {
-            // We don't need to attach the Texture to the cycle as a TextureView will already be attached
-            cycle->ChainCycle(attachedTexture->cycle);
-            attachedTexture->cycle = cycle;
-        }
-
-        for (const auto &attachedBuffer : attachedBuffers) {
-            if (attachedBuffer->RequiresCycleAttach() ) {
                 cycle->AttachObject(attachedBuffer.buffer);
                 attachedBuffer->UpdateCycle(cycle);
+                attachedBuffer->AllowAllBackingWrites();
             }
         }
 
@@ -412,6 +428,12 @@ namespace skyline::gpu::interconnect {
         bufferManagerLock.reset();
         megaBufferAllocatorLock.reset();
         allocator->Reset();
+
+        // Periodically clear preserve attachments just in case there are new waiters which would otherwise end up waiting forever
+        if ((submissionNumber % CommandRecordThread::ActiveRecordSlots * 2) == 0) {
+            preserveAttachedBuffers.clear();
+            preserveAttachedTextures.clear();
+        }
     }
 
     void CommandExecutor::Submit() {
@@ -423,7 +445,33 @@ namespace skyline::gpu::interconnect {
         if (!slot->nodes.empty()) {
             TRACE_EVENT("gpu", "CommandExecutor::Submit");
             SubmitInternal();
+            submissionNumber++;
         }
+
         ResetInternal();
     }
+
+    void CommandExecutor::LockPreserve() {
+        if (!preserveLocked) {
+            preserveLocked = true;
+
+            for (auto &buffer : preserveAttachedBuffers)
+                buffer->LockWithTag(tag);
+
+            for (auto &texture : preserveAttachedTextures)
+                texture->LockWithTag(tag);
+        }
+    }
+
+    void CommandExecutor::UnlockPreserve() {
+        if (preserveLocked) {
+            for (auto &buffer : preserveAttachedBuffers)
+                buffer->unlock();
+
+            for (auto &texture : preserveAttachedTextures)
+                texture->unlock();
+
+            preserveLocked = false;
+        }
+    }
 }
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index 9f7a9d5e..874426fb 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -15,6 +15,8 @@ namespace skyline::gpu::interconnect {
      */
     class CommandRecordThread {
       public:
+        static constexpr size_t ActiveRecordSlots{6}; //!< Maximum number of simultaneously active slots
+
         /**
          * @brief Single execution slot, buffered back and forth between the GPFIFO thread and the record thread
          */
@@ -40,7 +42,6 @@ namespace skyline::gpu::interconnect {
         const DeviceState &state;
         std::thread thread;
 
-        static constexpr size_t ActiveRecordSlots{6}; //!< Maximum number of simultaneously active slots
         CircularQueue<Slot *> incoming{ActiveRecordSlots}; //!< Slots pending recording
         CircularQueue<Slot *> outgoing{ActiveRecordSlots}; //!< Slots that have been submitted, may still be active on the GPU
 
@@ -76,6 +77,7 @@ namespace skyline::gpu::interconnect {
         std::optional<std::scoped_lock<TextureManager>> textureManagerLock; //!< The lock on the texture manager, this is locked for the duration of the command execution from the first usage inside an execution to the submission
         std::optional<std::scoped_lock<BufferManager>> bufferManagerLock; //!< The lock on the buffer manager, see above for details
         std::optional<std::scoped_lock<MegaBufferAllocator>> megaBufferAllocatorLock; //!< The lock on the megabuffer allocator, see above for details
+        bool preserveLocked{};
 
         /**
          * @brief A wrapper of a Texture object that has been locked beforehand and must be unlocked afterwards
@@ -94,6 +96,7 @@ namespace skyline::gpu::interconnect {
             ~LockedTexture();
         };
 
+        std::vector<LockedTexture> preserveAttachedTextures;
         std::vector<LockedTexture> attachedTextures; //!< All textures that are attached to the current execution
 
         /**
@@ -113,6 +116,7 @@ namespace skyline::gpu::interconnect {
             ~LockedBuffer();
         };
 
+        std::vector<LockedBuffer> preserveAttachedBuffers;
         std::vector<LockedBuffer> attachedBuffers; //!< All textures that are attached to the current execution
 
 
@@ -154,6 +158,7 @@ namespace skyline::gpu::interconnect {
         std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
         LinearAllocatorState<> *allocator;
         ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag
+        size_t submissionNumber{};
         u32 executionNumber{};
 
         CommandExecutor(const DeviceState &state);
@@ -258,5 +263,17 @@ namespace skyline::gpu::interconnect {
          * @brief Execute all the nodes and submit the resulting command buffer to the GPU
          */
         void Submit();
+
+        /**
+         * @brief Locks all preserve attached buffers/textures
+         * @note This **MUST** be called before attaching any buffers/textures to an execution
+         */
+        void LockPreserve();
+
+        /**
+         * @brief Unlocks all preserve attached buffers/textures
+         * @note This **MUST** be called when there is no GPU work left to be done to avoid deadlocks where the guest will try to lock a buffer/texture but the GPFIFO thread has no work so won't periodically unlock it
+         */
+        void UnlockPreserve();
     };
 }
diff --git a/app/src/main/cpp/skyline/gpu/texture/texture.h b/app/src/main/cpp/skyline/gpu/texture/texture.h
index 000a2122..f1e8da30 100644
--- a/app/src/main/cpp/skyline/gpu/texture/texture.h
+++ b/app/src/main/cpp/skyline/gpu/texture/texture.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <common/spin_lock.h>
 #include <common/lockable_shared_ptr.h>
 #include <nce.h>
 #include <gpu/tag_allocator.h>
@@ -356,9 +357,9 @@ namespace skyline::gpu {
     class Texture : public std::enable_shared_from_this<Texture> {
       private:
         GPU &gpu;
-        std::mutex mutex; //!< Synchronizes any mutations to the texture or its backing
+        RecursiveSpinLock mutex; //!< Synchronizes any mutations to the texture or its backing
         std::atomic<ContextTag> tag{}; //!< The tag associated with the last lock call
-        std::condition_variable backingCondition; //!< Signalled when a valid backing has been swapped in
+        std::condition_variable_any backingCondition; //!< Signalled when a valid backing has been swapped in
         using BackingType = std::variant<vk::Image, vk::raii::Image, memory::Image>;
         BackingType backing; //!< The Vulkan image that backs this texture, it is nullable