From 6b9269b88ee97e3ac689b170577e31bb3126c39a Mon Sep 17 00:00:00 2001 From: PixelyIon Date: Sun, 26 Jun 2022 14:32:34 +0530 Subject: [PATCH] Introduce `Context` semantics to GPU resource locking Resources on the GPU can be fairly convoluted and involve overlaps which can lead to the same GPU resources being utilized with different views, we previously utilized fences to lock resources to prevent concurrent access but this was overly harsh as it would block usage of resources till GPU completion of the commands associated with a resource. Fences have now been replaced with locks but locks run into the issue of being per-view and therefore to add a common object for tracking usage the concept of "tags" was introduced to track a single context so locks can be skipped if they're from the same context. This is important to prevent a deadlock when locking a resource which has been already locked from the current context with a different view. --- app/src/main/cpp/skyline/gpu/buffer.cpp | 51 +++++++++++-- app/src/main/cpp/skyline/gpu/buffer.h | 40 +++++++--- .../main/cpp/skyline/gpu/buffer_manager.cpp | 10 +-- app/src/main/cpp/skyline/gpu/buffer_manager.h | 10 +-- .../skyline/gpu/interconnect/blit_context.h | 13 +--- .../gpu/interconnect/command_executor.cpp | 68 ++++++++++++----- .../gpu/interconnect/command_executor.h | 52 +++++++++++-- .../gpu/interconnect/graphics_context.h | 56 ++++++-------- app/src/main/cpp/skyline/gpu/tag_allocator.h | 73 +++++++++++++++++++ .../main/cpp/skyline/gpu/texture/texture.cpp | 39 +++++++++- .../main/cpp/skyline/gpu/texture/texture.h | 30 +++++--- .../main/cpp/skyline/gpu/texture_manager.cpp | 3 +- .../main/cpp/skyline/gpu/texture_manager.h | 2 +- 13 files changed, 336 insertions(+), 111 deletions(-) create mode 100644 app/src/main/cpp/skyline/gpu/tag_allocator.h diff --git a/app/src/main/cpp/skyline/gpu/buffer.cpp b/app/src/main/cpp/skyline/gpu/buffer.cpp index 7bbc8450..33006c81 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.cpp +++ b/app/src/main/cpp/skyline/gpu/buffer.cpp @@ -34,12 +34,11 @@ namespace skyline::gpu { }); } - Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) { + Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu{gpu}, backing{gpu.memory.AllocateBuffer(guest.size())}, guest{guest} { SetupGuestMappings(); } - Buffer::Buffer(GPU &gpu, const std::shared_ptr &pCycle, GuestBuffer guest, span> srcBuffers) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) { - std::scoped_lock bufLock{*this}; + Buffer::Buffer(GPU &gpu, GuestBuffer guest, ContextTag tag, span> srcBuffers) : gpu{gpu}, backing{gpu.memory.AllocateBuffer(guest.size())}, guest{guest} { SetupGuestMappings(); // Source buffers don't necessarily fully overlap with us so we have to perform a sync here to prevent any gaps @@ -60,7 +59,7 @@ namespace skyline::gpu { // Transfer data/state from source buffers for (const auto &srcBuffer : srcBuffers) { - std::scoped_lock lock{*srcBuffer}; + ContextLock lock{tag, *srcBuffer}; if (srcBuffer->guest) { if (srcBuffer->hostImmutableCycle) { // Propagate any host immutability @@ -75,8 +74,8 @@ namespace skyline::gpu { if (srcBuffer->dirtyState == Buffer::DirtyState::GpuDirty) { // If the source buffer is GPU dirty we cannot directly copy over its GPU backing contents - // Only sync back the buffer if it's not attached to the current fence cycle, otherwise propagate the GPU dirtiness - if (!srcBuffer->cycle.owner_before(pCycle)) { + // Only sync back the buffer if 
it's not attached to the current context, otherwise propagate the GPU dirtiness + if (lock.isFirst) { // Perform a GPU -> CPU sync on the source then do a CPU -> GPU sync for the region occupied by the source // This is required since if we were created from a two buffers: one GPU dirty in the current cycle, and one GPU dirty in the previous cycle, if we marked ourselves as CPU dirty here then the GPU dirtiness from the current cycle buffer would be ignored and cause writes to be missed srcBuffer->SynchronizeGuest(true); @@ -100,12 +99,12 @@ namespace skyline::gpu { } Buffer::~Buffer() { - std::scoped_lock lock{*this}; if (trapHandle) gpu.state.nce->DeleteTrap(*trapHandle); SynchronizeGuest(true); if (alignedMirror.valid()) munmap(alignedMirror.data(), alignedMirror.size()); + WaitOnFence(); } void Buffer::MarkGpuDirty() { @@ -289,6 +288,28 @@ namespace skyline::gpu { hostImmutableCycle = pCycle; } + void Buffer::lock() { + mutex.lock(); + } + + bool Buffer::LockWithTag(ContextTag pTag) { + if (pTag && pTag == tag) + return false; + + mutex.lock(); + tag = pTag; + return true; + } + + void Buffer::unlock() { + tag = ContextTag{}; + mutex.unlock(); + } + + bool Buffer::try_lock() { + return mutex.try_lock(); + } + Buffer::BufferViewStorage::BufferViewStorage(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format) : offset(offset), size(size), format(format) {} Buffer::BufferDelegate::BufferDelegate(std::shared_ptr pBuffer, const Buffer::BufferViewStorage *view) : buffer(std::move(pBuffer)), view(view) { @@ -296,7 +317,6 @@ namespace skyline::gpu { } Buffer::BufferDelegate::~BufferDelegate() { - std::scoped_lock lock(*this); buffer->delegates.erase(iterator); } @@ -314,6 +334,21 @@ namespace skyline::gpu { } } + bool Buffer::BufferDelegate::LockWithTag(ContextTag pTag) { + auto lBuffer{std::atomic_load(&buffer)}; + while (true) { + bool didLock{lBuffer->LockWithTag(pTag)}; + + auto latestBacking{std::atomic_load(&buffer)}; + if (lBuffer == latestBacking) + return didLock; + + if (didLock) + lBuffer->unlock(); + lBuffer = latestBacking; + } + } + void Buffer::BufferDelegate::unlock() { buffer->unlock(); } diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h index f662797f..631c3bed 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.h +++ b/app/src/main/cpp/skyline/gpu/buffer.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "memory_manager.h" namespace skyline::gpu { @@ -23,6 +24,7 @@ namespace skyline::gpu { private: GPU &gpu; std::mutex mutex; //!< Synchronizes any mutations to the buffer or its backing + std::atomic tag{}; //!< The tag associated with the last lock call memory::Buffer backing; std::optional guest; @@ -103,6 +105,8 @@ namespace skyline::gpu { void lock(); + bool LockWithTag(ContextTag tag); + void unlock(); bool try_lock(); @@ -120,7 +124,7 @@ namespace skyline::gpu { void SetupGuestMappings(); public: - std::weak_ptr cycle; //!< A fence cycle for when any host operation mutating the buffer has completed, it must be waited on prior to any mutations to the backing + std::weak_ptr cycle{}; //!< A fence cycle for when any host operation mutating the buffer has completed, it must be waited on prior to any mutations to the backing constexpr vk::Buffer GetBacking() { return backing.vkBuffer; @@ -140,10 +144,10 @@ namespace skyline::gpu { /** * @brief Creates a Buffer that is pre-synchronised with the contents of the input buffers - * @param pCycle The FenceCycle associated with the current workload, utilised for 
synchronising GPU dirty buffers + * @param tag The tag to associate locking of srcBuffers with * @param srcBuffers Span of overlapping source buffers */ - Buffer(GPU &gpu, const std::shared_ptr &pCycle, GuestBuffer guest, span> srcBuffers); + Buffer(GPU &gpu, GuestBuffer guest, ContextTag tag, span> srcBuffers); /** * @brief Creates a host-only Buffer which isn't backed by any guest buffer @@ -157,25 +161,27 @@ namespace skyline::gpu { * @brief Acquires an exclusive lock on the buffer for the calling thread * @note Naming is in accordance to the BasicLockable named requirement */ - void lock() { - mutex.lock(); - } + void lock(); + + /** + * @brief Acquires an exclusive lock on the texture for the calling thread + * @param tag A tag to associate with the lock, future invocations with the same tag prior to the unlock will acquire the lock without waiting (0 is not a valid tag value and will disable tag behavior) + * @return If the lock was acquired by this call rather than having the same tag as the holder + * @note All locks using the same tag **must** be from the same thread as it'll only have one corresponding unlock() call + */ + bool LockWithTag(ContextTag tag); /** * @brief Relinquishes an existing lock on the buffer by the calling thread * @note Naming is in accordance to the BasicLockable named requirement */ - void unlock() { - mutex.unlock(); - } + void unlock(); /** * @brief Attempts to acquire an exclusive lock but returns immediately if it's captured by another thread * @note Naming is in accordance to the Lockable named requirement */ - bool try_lock() { - return mutex.try_lock(); - } + bool try_lock(); /** * @brief Marks the buffer as dirty on the GPU, it will be synced on the next call to SynchronizeGuest @@ -312,6 +318,16 @@ namespace skyline::gpu { bufferDelegate->lock(); } + /** + * @brief Acquires an exclusive lock on the texture for the calling thread + * @param tag A tag to associate with the lock, future invocations with the same tag prior to the unlock will acquire the lock without waiting (0 is not a valid tag value and will disable tag behavior) + * @return If the lock was acquired without waiting (i.e. 
the tag was the same as the last lock) + * @note All locks using the same tag **must** be from the same thread as it'll only have one corresponding unlock() call + */ + bool LockWithTag(ContextTag tag) const { + return bufferDelegate->LockWithTag(tag); + } + /** * @brief Relinquishes an existing lock on the buffer by the calling thread * @note Naming is in accordance to the BasicLockable named requirement diff --git a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp index 04b64def..12131ff3 100644 --- a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp +++ b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp @@ -12,7 +12,7 @@ namespace skyline::gpu { return it->guest->begin().base() < pointer; } - BufferView BufferManager::FindOrCreate(GuestBuffer guestMapping, const std::shared_ptr &cycle) { + BufferView BufferManager::FindOrCreate(GuestBuffer guestMapping, ContextTag tag) { /* * We align the buffer to the page boundary to ensure that: * 1) Any buffer view has the same alignment guarantees as on the guest, this is required for UBOs, SSBOs and Texel buffers @@ -34,7 +34,7 @@ namespace skyline::gpu { auto buffer{overlaps.front()}; if (buffer->guest->begin() <= guestMapping.begin() && buffer->guest->end() >= guestMapping.end()) { // If we find a buffer which can entirely fit the guest mapping, we can just return a view into it - std::scoped_lock bufferLock{*buffer}; + ContextLock bufferLock{tag, *buffer}; return buffer->GetView(static_cast(guestMapping.begin() - buffer->guest->begin()) + offset, size); } } @@ -49,9 +49,9 @@ namespace skyline::gpu { highestAddress = mapping.end().base(); } - auto newBuffer{std::make_shared(gpu, cycle, span(lowestAddress, highestAddress), overlaps)}; + auto newBuffer{std::make_shared(gpu, span{lowestAddress, highestAddress}, tag, overlaps)}; for (auto &overlap : overlaps) { - std::scoped_lock overlapLock{*overlap}; + ContextLock overlapLock{tag, *overlap}; buffers.erase(std::find(buffers.begin(), buffers.end(), overlap)); @@ -62,7 +62,7 @@ namespace skyline::gpu { // This is a slight hack as we really shouldn't be changing the underlying non-mutable set elements without a rehash but without writing our own set impl this is the best we can do const_cast(&*it)->offset += overlapOffset; - // Reset the sequence number to the initial one, if the new buffer was created from any GPU dirty overlaps then the new buffer's sequence will be incremented past this thus forcing a reacquire if neccessary + // Reset the sequence number to the initial one, if the new buffer was created from any GPU dirty overlaps then the new buffer's sequence will be incremented past this thus forcing a reacquire if necessary // This is fine to do in the set since the hash and operator== do not use this value it->lastAcquiredSequence = Buffer::InitialSequenceNumber; } diff --git a/app/src/main/cpp/skyline/gpu/buffer_manager.h b/app/src/main/cpp/skyline/gpu/buffer_manager.h index dad387de..f7cf4102 100644 --- a/app/src/main/cpp/skyline/gpu/buffer_manager.h +++ b/app/src/main/cpp/skyline/gpu/buffer_manager.h @@ -42,16 +42,16 @@ namespace skyline::gpu { BufferManager(GPU &gpu); + /** + * @return A pre-existing or newly created Buffer object which covers the supplied mappings + */ + BufferView FindOrCreate(GuestBuffer guestMapping, ContextTag tag = {}); + /** * @return A dynamically allocated megabuffer which can be used to store buffer modifications allowing them to be replayed in-sequence on the GPU * @note This object **must** be destroyed to be 
reclaimed by the manager and prevent a memory leak */ MegaBuffer AcquireMegaBuffer(const std::shared_ptr &cycle); - - /** - * @return A pre-existing or newly created Buffer object which covers the supplied mappings - */ - BufferView FindOrCreate(GuestBuffer guestMapping, const std::shared_ptr &cycle = nullptr); }; /** diff --git a/app/src/main/cpp/skyline/gpu/interconnect/blit_context.h b/app/src/main/cpp/skyline/gpu/interconnect/blit_context.h index a864acee..3195100a 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/blit_context.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/blit_context.h @@ -118,15 +118,11 @@ namespace skyline::gpu::interconnect { auto srcGuestTexture{GetGuestTexture(srcSurface)}; auto dstGuestTexture{GetGuestTexture(dstSurface)}; - auto srcTextureView{gpu.texture.FindOrCreate(srcGuestTexture)}; - auto dstTextureView{gpu.texture.FindOrCreate(dstGuestTexture)}; + auto srcTextureView{gpu.texture.FindOrCreate(srcGuestTexture, executor.tag)}; + executor.AttachTexture(&*srcTextureView); - { - std::scoped_lock lock{*srcTextureView, *dstTextureView}; - - executor.AttachTexture(&*srcTextureView); - executor.AttachTexture(&*dstTextureView); - } + auto dstTextureView{gpu.texture.FindOrCreate(dstGuestTexture, executor.tag)}; + executor.AttachTexture(&*dstTextureView); auto getSubresourceLayers{[](const vk::ImageSubresourceRange &range, vk::ImageAspectFlags aspect) { return vk::ImageSubresourceLayers{ @@ -145,7 +141,6 @@ namespace skyline::gpu::interconnect { }; executor.AddOutsideRpCommand([region, srcTextureView, dstTextureView, linearFilter](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr &cycle, GPU &) { - std::scoped_lock lock{*srcTextureView, *dstTextureView}; auto blitSrcImage{srcTextureView->texture->GetBacking()}; auto blitDstImage{dstTextureView->texture->GetBacking()}; diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp index 94f542a6..9db07e3f 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp @@ -5,7 +5,7 @@ #include "command_executor.h" namespace skyline::gpu::interconnect { - CommandExecutor::CommandExecutor(const DeviceState &state) : gpu{*state.gpu}, activeCommandBuffer{gpu.scheduler.AllocateCommandBuffer()}, cycle{activeCommandBuffer.GetFenceCycle()}, megaBuffer{gpu.buffer.AcquireMegaBuffer(cycle)} {} + CommandExecutor::CommandExecutor(const DeviceState &state) : gpu{*state.gpu}, activeCommandBuffer{gpu.scheduler.AllocateCommandBuffer()}, cycle{activeCommandBuffer.GetFenceCycle()}, megaBuffer{gpu.buffer.AcquireMegaBuffer(cycle)}, tag{AllocateTag()} {} CommandExecutor::~CommandExecutor() { cycle->Cancel(); @@ -71,23 +71,48 @@ namespace skyline::gpu::interconnect { } } - void CommandExecutor::AttachTexture(TextureView *view) { - auto texture{view->texture.get()}; - if (!attachedTextures.contains(texture)) { - texture->WaitOnFence(); - texture->cycle = cycle; - attachedTextures.emplace(texture); - } - cycle->AttachObject(view->shared_from_this()); + CommandExecutor::LockedTexture::LockedTexture(std::shared_ptr texture) : texture{std::move(texture)} {} + + constexpr CommandExecutor::LockedTexture::LockedTexture(CommandExecutor::LockedTexture &&other) : texture{std::exchange(other.texture, nullptr)} {} + + constexpr Texture *CommandExecutor::LockedTexture::operator->() const { + return texture.get(); } - void CommandExecutor::AttachBuffer(BufferView &view) { - 
view->buffer->SynchronizeHost(); + CommandExecutor::LockedTexture::~LockedTexture() { + if (texture) + texture->unlock(); + } - if (!attachedBuffers.contains(view.bufferDelegate)) { - view.AttachCycle(cycle); - attachedBuffers.emplace(view.bufferDelegate); - } + bool CommandExecutor::AttachTexture(TextureView *view) { + bool didLock{view->LockWithTag(tag)}; + if (didLock) + attachedTextures.emplace_back(view->texture); + return didLock; + } + + CommandExecutor::LockedBuffer::LockedBuffer(std::shared_ptr buffer) : buffer{std::move(buffer)} {} + + constexpr CommandExecutor::LockedBuffer::LockedBuffer(CommandExecutor::LockedBuffer &&other) : buffer{std::exchange(other.buffer, nullptr)} {} + + constexpr Buffer *CommandExecutor::LockedBuffer::operator->() const { + return buffer.get(); + } + + CommandExecutor::LockedBuffer::~LockedBuffer() { + if (buffer) + buffer->unlock(); + } + + bool CommandExecutor::AttachBuffer(BufferView &view) { + bool didLock{view->LockWithTag(tag)}; + if (didLock) + attachedBuffers.emplace_back(view->buffer); + + if (!attachedBufferDelegates.contains(view.bufferDelegate)) + attachedBufferDelegates.emplace(view.bufferDelegate); + + return didLock; } void CommandExecutor::AttachDependency(const std::shared_ptr &dependency) { @@ -178,12 +203,12 @@ namespace skyline::gpu::interconnect { .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit, }); - for (auto texture : attachedTextures) { + for (const auto &texture : attachedTextures) { texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true); texture->MarkGpuDirty(); } - for (const auto &delegate : attachedBuffers) + for (const auto &delegate : attachedBufferDelegates) delegate->usageCallback = nullptr; vk::RenderPass lRenderPass; @@ -213,14 +238,19 @@ namespace skyline::gpu::interconnect { } commandBuffer.end(); + + for (const auto &attachedBuffer : attachedBuffers) + attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer, this is done directly prior to submission to prevent stalls + gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence()); - for (const auto &delegate : attachedBuffers) - delegate->view->megabufferOffset = 0; + for (const auto &delegate : attachedBufferDelegates) + delegate->view->megabufferOffset = 0; nodes.clear(); attachedTextures.clear(); attachedBuffers.clear(); + attachedBufferDelegates.clear(); } } diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h index 57dc7602..2346e561 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h @@ -20,9 +20,46 @@ namespace skyline::gpu::interconnect { node::RenderPassNode *renderPass{}; size_t subpassCount{}; //!< The number of subpasses in the current render pass - std::unordered_set attachedTextures; //!< All textures that need to be synced prior to and after execution + /** + * @brief A wrapper of a Texture object that has been locked beforehand and must be unlocked afterwards + */ + struct LockedTexture { + std::shared_ptr texture; + + explicit LockedTexture(std::shared_ptr texture); + + LockedTexture(const LockedTexture &) = delete; + + constexpr LockedTexture(LockedTexture &&other); + + constexpr Texture *operator->() const; + + ~LockedTexture(); + }; + + std::vector attachedTextures; //!< All textures that are attached to the current execution + + /** + * @brief A wrapper of a Buffer object that has been 
locked beforehand and must be unlocked afterwards + */ + struct LockedBuffer { + std::shared_ptr buffer; + + LockedBuffer(std::shared_ptr buffer); + + LockedBuffer(const LockedBuffer &) = delete; + + constexpr LockedBuffer(LockedBuffer &&other); + + constexpr Buffer *operator->() const; + + ~LockedBuffer(); + }; + + std::vector attachedBuffers; //!< All textures that are attached to the current execution + using SharedBufferDelegate = std::shared_ptr; - std::unordered_set attachedBuffers; //!< All buffers that are attached to the current execution + std::unordered_set attachedBufferDelegates; //!< All buffers that are attached to the current execution std::vector lastSubpassAttachments; //!< The storage backing for attachments used in the last subpass span lastSubpassInputAttachments; //!< The set of input attachments used in the last subpass @@ -52,6 +89,7 @@ namespace skyline::gpu::interconnect { public: std::shared_ptr cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands MegaBuffer megaBuffer; //!< The megabuffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU + ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag CommandExecutor(const DeviceState &state); @@ -59,17 +97,19 @@ namespace skyline::gpu::interconnect { /** * @brief Attach the lifetime of the texture to the command buffer - * @note The supplied texture **must** be locked by the calling thread + * @return If this is the first usage of the backing of this resource within this execution + * @note The supplied texture will be locked automatically until the command buffer is submitted and must **not** be locked by the caller * @note This'll automatically handle syncing of the texture in the most optimal way possible */ - void AttachTexture(TextureView *view); + bool AttachTexture(TextureView *view); /** * @brief Attach the lifetime of a buffer to the command buffer - * @note The supplied buffer **must** be locked by the calling thread + * @return If this is the first usage of the backing of this resource within this execution + * @note The supplied buffer will be locked automatically until the command buffer is submitted and must **not** be locked by the caller * @note This'll automatically handle syncing of the buffer in the most optimal way possible */ - void AttachBuffer(BufferView &view); + bool AttachBuffer(BufferView &view); /** * @brief Attach the lifetime of the fence cycle dependency to the command buffer diff --git a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h index a057dd47..508d68d1 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h @@ -400,7 +400,7 @@ namespace skyline::gpu::interconnect { return vk::ImageViewType::e2D; }(); - renderTarget.view = gpu.texture.FindOrCreate(renderTarget.guest); + renderTarget.view = gpu.texture.FindOrCreate(renderTarget.guest, executor.tag); return renderTarget.view.get(); } @@ -522,7 +522,6 @@ namespace skyline::gpu::interconnect { } void ClearColorRt(TextureView *renderTarget, vk::Rect2D scissor, u32 layerIndex) { - std::scoped_lock lock{*renderTarget}; executor.AttachTexture(renderTarget); scissor.extent.width = static_cast(std::min(static_cast(renderTarget->texture->dimensions.width) - scissor.offset.x, @@ -554,7 +553,6 @@ namespace 
skyline::gpu::interconnect { } void ClearDepthStencilRt(TextureView *renderTarget, vk::ImageAspectFlags aspect, u32 layerIndex) { - std::scoped_lock lock{*renderTarget}; executor.AttachTexture(renderTarget); if (renderTarget->range.layerCount == 1 && layerIndex == 0) { @@ -628,7 +626,7 @@ namespace skyline::gpu::interconnect { template T Read(CommandExecutor &pExecutor, size_t dstOffset) const { T object; - std::scoped_lock lock{view}; + ContextLock lock{pExecutor.tag, view}; view.Read(pExecutor.cycle, []() { // TODO: here we should trigger a SubmitWithFlush, however that doesn't currently work due to Read being called mid-draw and attached objects not handling this case Logger::Warn("GPU dirty buffer reads for attached buffers are unimplemented"); @@ -644,7 +642,7 @@ namespace skyline::gpu::interconnect { void Write(CommandExecutor &pExecutor, MegaBuffer &megaBuffer, span buf, size_t dstOffset) { auto srcCpuBuf{buf.template cast()}; - std::scoped_lock lock{view}; + ContextLock lock{pExecutor.tag, view}; view.Write(pExecutor.cycle, []() { // TODO: see Read() Logger::Warn("GPU dirty buffer reads for attached buffers are unimplemented"); @@ -652,7 +650,6 @@ namespace skyline::gpu::interconnect { auto srcGpuOffset{megaBuffer.Push(srcCpuBuf)}; auto srcGpuBuf{megaBuffer.GetBacking()}; pExecutor.AddOutsideRpCommand([=](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr &, GPU &) { - std::scoped_lock lock{view}; vk::BufferCopy copyRegion{ .size = srcCpuBuf.size_bytes(), .srcOffset = srcGpuOffset, @@ -728,7 +725,7 @@ namespace skyline::gpu::interconnect { auto view{constantBufferCache.Lookup(constantBufferSelector.size, constantBufferSelector.iova)}; if (!view) { auto mappings{channelCtx.asCtx->gmmu.TranslateRange(constantBufferSelector.iova, constantBufferSelector.size)}; - view = gpu.buffer.FindOrCreate(mappings.front(), executor.cycle); + view = gpu.buffer.FindOrCreate(mappings.front(), executor.tag); constantBufferCache.Insert(constantBufferSelector.size, constantBufferSelector.iova, *view); } @@ -919,7 +916,7 @@ namespace skyline::gpu::interconnect { if (mappings.size() != 1) Logger::Warn("Multiple buffer mappings ({}) are not supported", mappings.size()); - return gpu.buffer.FindOrCreate(mappings.front(), executor.cycle); + return gpu.buffer.FindOrCreate(mappings.front(), executor.tag); } /** @@ -1108,8 +1105,7 @@ namespace skyline::gpu::interconnect { }); auto view{pipelineStage.constantBuffers[constantBuffer.index].view}; - - std::scoped_lock lock(view); + executor.AttachBuffer(view); if (auto megaBufferOffset{view.AcquireMegaBuffer(executor.megaBuffer)}) { // If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage bufferDescriptors[bufferIndex] = vk::DescriptorBufferInfo{ @@ -1127,7 +1123,6 @@ namespace skyline::gpu::interconnect { }); } - executor.AttachBuffer(view); bufferIndex++; } } @@ -1149,8 +1144,8 @@ namespace skyline::gpu::interconnect { }); auto view{GetSsboViewFromDescriptor(storageBuffer, pipelineStage.constantBuffers)}; + executor.AttachBuffer(view); - std::scoped_lock lock{view}; if (storageBuffer.is_written) view->buffer->MarkGpuDirty(); @@ -1161,7 +1156,6 @@ namespace skyline::gpu::interconnect { .range = view.size, }; }); - executor.AttachBuffer(view); } } @@ -1204,16 +1198,16 @@ namespace skyline::gpu::interconnect { handle.samplerIndex = handle.textureIndex; auto sampler{GetSampler(handle.samplerIndex)}; - auto 
textureView{GetPoolTextureView(handle.textureIndex)}; + executor.AttachDependency(sampler); + + auto textureView{GetPoolTextureView(handle.textureIndex)}; + executor.AttachTexture(textureView.get()); - std::scoped_lock lock(*textureView); imageDescriptors[imageIndex++] = vk::DescriptorImageInfo{ .sampler = **sampler, .imageView = textureView->GetView(), .imageLayout = textureView->texture->layout, }; - executor.AttachTexture(textureView.get()); - executor.AttachDependency(std::move(sampler)); } } @@ -1851,7 +1845,7 @@ namespace skyline::gpu::interconnect { if (mappings.size() != 1) Logger::Warn("Multiple buffer mappings ({}) are not supported", mappings.size()); - vertexBuffer.view = gpu.buffer.FindOrCreate(mappings.front(), executor.cycle); + vertexBuffer.view = gpu.buffer.FindOrCreate(mappings.front(), executor.tag); return &vertexBuffer; } @@ -2335,7 +2329,7 @@ namespace skyline::gpu::interconnect { return textureView; } - auto textureView{gpu.texture.FindOrCreate(poolTexture.guest)}; + auto textureView{gpu.texture.FindOrCreate(poolTexture.guest, executor.tag)}; poolTexture.view = textureView; return textureView; } @@ -2606,7 +2600,7 @@ namespace skyline::gpu::interconnect { Logger::Warn("Multiple buffer mappings ({}) are not supported", mappings.size()); auto mapping{mappings.front()}; - indexBuffer.view = gpu.buffer.FindOrCreate(span(mapping.data(), size), executor.cycle); + indexBuffer.view = gpu.buffer.FindOrCreate(span(mapping.data(), size), executor.tag); return indexBuffer.view; } @@ -2822,8 +2816,7 @@ namespace skyline::gpu::interconnect { throw exception("Indexed quad conversion is not supported"); auto indexBufferView{GetIndexBuffer(count)}; - - std::scoped_lock lock(indexBufferView); + executor.AttachBuffer(indexBufferView); boundIndexBuffer = std::make_shared(); boundIndexBuffer->type = indexBuffer.type; @@ -2837,18 +2830,16 @@ namespace skyline::gpu::interconnect { boundIndexBuffer->offset = view.offset; }); } - - executor.AttachBuffer(indexBufferView); } else if (needsQuadConversion) { // Convert the guest-supplied quad list to an indexed triangle list auto[bufferView, indexType, indexCount] = GetNonIndexedQuadConversionBuffer(count); - std::scoped_lock lock(bufferView); + executor.AttachBuffer(bufferView); + count = indexCount; boundIndexBuffer = std::make_shared(); boundIndexBuffer->type = indexType; boundIndexBuffer->handle = bufferView->buffer->GetBacking(); boundIndexBuffer->offset = bufferView->view->offset; - executor.AttachBuffer(bufferView); } // Vertex Buffer Setup @@ -2864,13 +2855,12 @@ namespace skyline::gpu::interconnect { for (u32 index{}; index < maxwell3d::VertexBufferCount; index++) { auto vertexBuffer{GetVertexBuffer(index)}; if (vertexBuffer) { - auto &vertexBufferView{vertexBuffer->view}; vertexBindingDescriptions.push_back(vertexBuffer->bindingDescription); if (vertexBuffer->bindingDescription.inputRate == vk::VertexInputRate::eInstance) vertexBindingDivisorsDescriptions.push_back(vertexBuffer->bindingDivisorDescription); - std::scoped_lock vertexBufferLock(vertexBufferView); - + auto &vertexBufferView{vertexBuffer->view}; + executor.AttachBuffer(vertexBufferView); if (auto megaBufferOffset{vertexBufferView.AcquireMegaBuffer(executor.megaBuffer)}) { // If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage boundVertexBuffers->handles[index] = executor.megaBuffer.GetBacking(); @@ -2881,7 +2871,6 @@ namespace 
skyline::gpu::interconnect { *offset = view.offset; }); } - executor.AttachBuffer(vertexBufferView); } } @@ -2896,9 +2885,8 @@ namespace skyline::gpu::interconnect { for (u32 index{}; index < maxwell3d::RenderTargetCount; index++) { auto renderTarget{GetColorRenderTarget(index)}; if (renderTarget) { - std::scoped_lock lock(*renderTarget); - activeColorRenderTargets.push_back(renderTarget); executor.AttachTexture(renderTarget); + activeColorRenderTargets.push_back(renderTarget); } } @@ -2906,10 +2894,8 @@ namespace skyline::gpu::interconnect { // Depth/Stencil Render Target Setup auto depthRenderTargetView{GetDepthRenderTarget()}; - if (depthRenderTargetView) { - std::scoped_lock lock(*depthRenderTargetView); + if (depthRenderTargetView) executor.AttachTexture(depthRenderTargetView); - } // Pipeline Creation vk::StructureChain vertexState{ diff --git a/app/src/main/cpp/skyline/gpu/tag_allocator.h b/app/src/main/cpp/skyline/gpu/tag_allocator.h new file mode 100644 index 00000000..159ffe3b --- /dev/null +++ b/app/src/main/cpp/skyline/gpu/tag_allocator.h @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) + +#pragma once + +#include + +namespace skyline::gpu { + struct ContextTag; + + static ContextTag AllocateTag(); + + /** + * @brief A unique tag associated with a single "context" an abstraction to allow concurrent locking of resources by different parts of a single context + */ + struct ContextTag { + private: + size_t key; + + friend ContextTag AllocateTag(); + + constexpr ContextTag(size_t key) : key{key} {} + + public: + constexpr ContextTag() : key{} {} + + constexpr bool operator==(const ContextTag &other) const { + return key == other.key; + } + + constexpr bool operator!=(const ContextTag &other) const { + return key != other.key; + } + + constexpr operator bool() const { + return key != 0; + } + }; + + /** + * @return A globally unique tag to utilize for any operations + */ + inline ContextTag AllocateTag() { + static std::atomic key{1}; + return ContextTag{key++}; + } + + /** + * @brief A scoped lock specially designed for classes with ContextTag-based locking + * @note This will unlock the tag when the scope is exited, **if** it locked the tag in the first place + */ + template + class ContextLock { + private: + T &resource; + + public: + bool isFirst; //!< If this was the first lock for this context + + ContextLock(ContextTag tag, T &resource) : resource{resource}, isFirst{resource.LockWithTag(tag)} {} + + ContextLock(const ContextLock &) = delete; + + ContextLock(ContextLock &&other) noexcept : resource{other.resource}, isFirst{other.isFirst} { + other.isFirst = false; + } + + ~ContextLock() { + if (isFirst) + resource.unlock(); + } + }; +} diff --git a/app/src/main/cpp/skyline/gpu/texture/texture.cpp b/app/src/main/cpp/skyline/gpu/texture/texture.cpp index 3ce0236d..267a5b04 100644 --- a/app/src/main/cpp/skyline/gpu/texture/texture.cpp +++ b/app/src/main/cpp/skyline/gpu/texture/texture.cpp @@ -106,6 +106,21 @@ namespace skyline::gpu { } } + bool TextureView::LockWithTag(ContextTag tag) { + auto backing{std::atomic_load(&texture)}; + while (true) { + bool didLock{backing->LockWithTag(tag)}; + + auto latestBacking{std::atomic_load(&texture)}; + if (backing == latestBacking) + return didLock; + + if (didLock) + backing->unlock(); + backing = latestBacking; + } + } + void TextureView::unlock() { texture->unlock(); } @@ -564,12 +579,34 @@ namespace skyline::gpu { } Texture::~Texture() { - 
std::scoped_lock lock{*this}; if (trapHandle) gpu.state.nce->DeleteTrap(*trapHandle); SynchronizeGuest(true); if (alignedMirror.valid()) munmap(alignedMirror.data(), alignedMirror.size()); + WaitOnFence(); + } + + void Texture::lock() { + mutex.lock(); + } + + bool Texture::LockWithTag(ContextTag pTag) { + if (pTag && pTag == tag) + return false; + + mutex.lock(); + tag = pTag; + return true; + } + + void Texture::unlock() { + tag = ContextTag{}; + mutex.unlock(); + } + + bool Texture::try_lock() { + return mutex.try_lock(); } void Texture::MarkGpuDirty() { diff --git a/app/src/main/cpp/skyline/gpu/texture/texture.h b/app/src/main/cpp/skyline/gpu/texture/texture.h index d2edcaae..907ae184 100644 --- a/app/src/main/cpp/skyline/gpu/texture/texture.h +++ b/app/src/main/cpp/skyline/gpu/texture/texture.h @@ -4,6 +4,7 @@ #pragma once #include +#include #include namespace skyline::gpu { @@ -314,6 +315,14 @@ namespace skyline::gpu { */ void lock(); + /** + * @brief Acquires an exclusive lock on the texture for the calling thread + * @param tag A tag to associate with the lock, future invocations with the same tag prior to the unlock will acquire the lock without waiting (0 is not a valid tag value and will disable tag behavior) + * @return If the lock was acquired by this call rather than having the same tag as the holder + * @note All locks using the same tag **must** be from the same thread as it'll only have one corresponding unlock() call + */ + bool LockWithTag(ContextTag tag); + /** * @brief Relinquishes an existing lock on the backing texture by the calling thread * @note Naming is in accordance to the BasicLockable named requirement @@ -345,6 +354,7 @@ namespace skyline::gpu { private: GPU &gpu; std::mutex mutex; //!< Synchronizes any mutations to the texture or its backing + std::atomic tag{}; //!< The tag associated with the last lock call std::condition_variable backingCondition; //!< Signalled when a valid backing has been swapped in using BackingType = std::variant; BackingType backing; //!< The Vulkan image that backs this texture, it is nullable @@ -467,25 +477,27 @@ namespace skyline::gpu { * @brief Acquires an exclusive lock on the texture for the calling thread * @note Naming is in accordance to the BasicLockable named requirement */ - void lock() { - mutex.lock(); - } + void lock(); + + /** + * @brief Acquires an exclusive lock on the texture for the calling thread + * @param tag A tag to associate with the lock, future invocations with the same tag prior to the unlock will acquire the lock without waiting (0 is not a valid tag value and will disable tag behavior) + * @return If the lock was acquired by this call rather than having the same tag as the holder + * @note All locks using the same tag **must** be from the same thread as it'll only have one corresponding unlock() call + */ + bool LockWithTag(ContextTag tag); /** * @brief Relinquishes an existing lock on the texture by the calling thread * @note Naming is in accordance to the BasicLockable named requirement */ - void unlock() { - mutex.unlock(); - } + void unlock(); /** * @brief Attempts to acquire an exclusive lock but returns immediately if it's captured by another thread * @note Naming is in accordance to the Lockable named requirement */ - bool try_lock() { - return mutex.try_lock(); - } + bool try_lock(); /** * @brief Marks the texture as dirty on the GPU, it will be synced on the next call to SynchronizeGuest diff --git a/app/src/main/cpp/skyline/gpu/texture_manager.cpp 
b/app/src/main/cpp/skyline/gpu/texture_manager.cpp index ca3cc2bd..db476361 100644 --- a/app/src/main/cpp/skyline/gpu/texture_manager.cpp +++ b/app/src/main/cpp/skyline/gpu/texture_manager.cpp @@ -6,7 +6,7 @@ namespace skyline::gpu { TextureManager::TextureManager(GPU &gpu) : gpu(gpu) {} - std::shared_ptr<TextureView> TextureManager::FindOrCreate(const GuestTexture &guestTexture) { + std::shared_ptr<TextureView> TextureManager::FindOrCreate(const GuestTexture &guestTexture, ContextTag tag) { auto guestMapping{guestTexture.mappings.front()}; /* @@ -52,6 +52,7 @@ namespace skyline::gpu { || matchGuestTexture.viewMipBase > 0) && matchGuestTexture.tileConfig == guestTexture.tileConfig) { auto &texture{hostMapping->texture}; + ContextLock textureLock{tag, *texture}; return texture->GetView(guestTexture.viewType, vk::ImageSubresourceRange{ .aspectMask = guestTexture.aspect, .baseMipLevel = guestTexture.viewMipBase, diff --git a/app/src/main/cpp/skyline/gpu/texture_manager.h b/app/src/main/cpp/skyline/gpu/texture_manager.h index dc87f6cd..b7f0fa89 100644 --- a/app/src/main/cpp/skyline/gpu/texture_manager.h +++ b/app/src/main/cpp/skyline/gpu/texture_manager.h @@ -35,6 +35,6 @@ namespace skyline::gpu { /** * @return A pre-existing or newly created Texture object which matches the specified criteria */ - std::shared_ptr<TextureView> FindOrCreate(const GuestTexture &guestTexture); + std::shared_ptr<TextureView> FindOrCreate(const GuestTexture &guestTexture, ContextTag tag = {}); }; }
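
The following is a minimal, compilable sketch of the tag-based locking this patch introduces in gpu/tag_allocator.h and wires into Buffer/Texture via LockWithTag(). It is not part of the patch: the `sketch` namespace and the simplified `Resource` class are illustrative stand-ins for Buffer/Texture, and the real ContextTag/ContextLock carry additional details omitted here.

#include <atomic>
#include <cstddef>
#include <mutex>

namespace sketch {
    // Stand-in for skyline::gpu::ContextTag: a non-zero key identifying one context
    struct ContextTag {
        std::size_t key{};

        constexpr explicit operator bool() const { return key != 0; }

        constexpr bool operator==(const ContextTag &other) const { return key == other.key; }
    };

    // Stand-in for AllocateTag(): globally unique tags, 0 is reserved for "no tag"
    inline ContextTag AllocateTag() {
        static std::atomic<std::size_t> next{1};
        return ContextTag{next++};
    }

    // Stand-in for the tag-aware locking added to Buffer/Texture in this patch
    class Resource {
      private:
        std::mutex mutex;
        std::atomic<ContextTag> tag{}; // Tag of the context that currently holds the lock, if any

      public:
        void lock() { mutex.lock(); }

        // Returns true if this call acquired the lock, false if the same context already holds it
        bool LockWithTag(ContextTag pTag) {
            if (pTag && pTag == tag.load())
                return false; // Same context: skip locking so a second view cannot self-deadlock
            mutex.lock();
            tag.store(pTag);
            return true;
        }

        void unlock() {
            tag.store(ContextTag{});
            mutex.unlock();
        }
    };

    // Stand-in for ContextLock<T>: only unlocks if it performed the lock in the first place
    template<typename T>
    class ContextLock {
      private:
        T &resource;

      public:
        bool isFirst; // Whether this lock was the first one taken for this context

        ContextLock(ContextTag tag, T &resource) : resource{resource}, isFirst{resource.LockWithTag(tag)} {}

        ContextLock(const ContextLock &) = delete;

        ContextLock(ContextLock &&other) noexcept : resource{other.resource}, isFirst{other.isFirst} {
            other.isFirst = false;
        }

        ~ContextLock() {
            if (isFirst)
                resource.unlock();
        }
    };
}

// Two views of the same backing locked from one context no longer deadlock
int main() {
    sketch::Resource buffer;
    auto tag{sketch::AllocateTag()};

    sketch::ContextLock firstView{tag, buffer};  // Acquires the mutex, isFirst == true
    sketch::ContextLock secondView{tag, buffer}; // Same tag: no second acquisition, isFirst == false
    // secondView's destructor is a no-op, firstView's destructor releases the mutex
}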
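
Views add one more wrinkle that the sketch above does not cover: a view's backing object can be swapped out while a lock is being acquired (for example, when overlapping buffers are merged into a new Buffer), which is why Buffer::BufferDelegate::LockWithTag and TextureView::LockWithTag in this patch lock in a retry loop. The helper below is an illustrative condensation of that loop; `LockViewWithTag`, `Backing`, and `Tag` are names invented for the sketch, and `Backing` is assumed to provide LockWithTag()/unlock() as sketched earlier.

#include <atomic>
#include <memory>

// `viewBacking` is the view's shared_ptr to its backing resource
template<typename Backing, typename Tag>
bool LockViewWithTag(std::shared_ptr<Backing> &viewBacking, Tag tag) {
    auto backing{std::atomic_load(&viewBacking)};
    while (true) {
        bool didLock{backing->LockWithTag(tag)};

        // If the backing wasn't replaced while we were acquiring the lock, we're done
        auto latestBacking{std::atomic_load(&viewBacking)};
        if (backing == latestBacking)
            return didLock;

        // The backing was swapped out underneath us, so release the stale lock if we took it
        // and retry against the new backing
        if (didLock)
            backing->unlock();
        backing = latestBacking;
    }
}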
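
Finally, the executor-side change: AttachBuffer/AttachTexture now lock the resource with the executor's tag on first use and keep it locked until submission via the RAII LockedBuffer/LockedTexture wrappers, instead of callers taking short-lived scoped locks. The class below is a rough sketch of that flow under stated assumptions: `Executor`, `Locked`, `Attach`, and `Submit` are illustrative names, `std::size_t` stands in for ContextTag to keep the block self-contained, and `Resource` is assumed to provide LockWithTag()/unlock() as in the earlier sketch.

#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

template<typename Resource>
class Executor {
  private:
    // Mirrors CommandExecutor::LockedBuffer/LockedTexture: unlocks the resource on destruction
    struct Locked {
        std::shared_ptr<Resource> resource;

        explicit Locked(std::shared_ptr<Resource> resource) : resource{std::move(resource)} {}

        Locked(const Locked &) = delete;

        Locked(Locked &&other) noexcept : resource{std::exchange(other.resource, nullptr)} {}

        ~Locked() {
            if (resource)
                resource->unlock();
        }
    };

    std::vector<Locked> attached; // Resources that stay locked until submission
    std::size_t tag;

  public:
    explicit Executor(std::size_t tag) : tag{tag} {}

    // Returns true on the first attachment of this backing within the current execution;
    // repeat attachments with the same tag neither block nor add duplicate wrappers
    bool Attach(const std::shared_ptr<Resource> &resource) {
        bool didLock{resource->LockWithTag(tag)};
        if (didLock)
            attached.emplace_back(resource);
        return didLock;
    }

    void Submit() {
        // ... record and submit the command buffer here ...
        attached.clear(); // Destroying the wrappers releases every lock taken by Attach()
    }
};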