From 5dca5cc10e101103cf70736bf270092de0855464 Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Wed, 31 Aug 2022 14:15:56 +0100
Subject: [PATCH] Redesign buffer view infra to remarkably reduce creation
 overhead

Buffer view creation was a significant pain point, requiring several layers of caching to reduce the number of creations, which in turn introduced a lot of complexity. By reworking delegates to be per-buffer rather than per-view and then linearly allocating delegates (without ever freeing), views can be reduced to just {delegatePtr, offset, size}, avoiding the need for any allocations or set operations in GetView. The one difficulty with this is the need to support buffer recreation, which is achieved by allowing delegates to be chained - during recreation all source buffers have their delegates modified to point to the newly created buffer's delegate. Upon accessing a view with such a chained delegate, the view will be modified to point directly to the end delegate with its offset being updated accordingly, skipping the need to traverse the chain for future accesses.
---
 app/src/main/cpp/skyline/gpu/buffer.cpp       | 120 +++++----
 app/src/main/cpp/skyline/gpu/buffer.h         | 255 +++++++++---------
 .../main/cpp/skyline/gpu/buffer_manager.cpp   |  50 +---
 app/src/main/cpp/skyline/gpu/buffer_manager.h |   3 +
 .../gpu/interconnect/command_executor.cpp     |  26 +-
 .../gpu/interconnect/command_executor.h       |   2 -
 6 files changed, 207 insertions(+), 249 deletions(-)
diff --git a/app/src/main/cpp/skyline/gpu/buffer.cpp b/app/src/main/cpp/skyline/gpu/buffer.cpp
index 2d4a42cd..5db282d6 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@@ -77,9 +77,18 @@ namespace skyline::gpu {
         });
     }
 
-    Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu{gpu}, backing{gpu.memory.AllocateBuffer(guest.size())}, guest{guest} {}
+    Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, GuestBuffer guest, size_t id)
+        : gpu{gpu},
+          backing{gpu.memory.AllocateBuffer(guest.size())},
+          guest{guest},
+          delegate{delegateAllocator.EmplaceUntracked<BufferDelegate>(this)},
+          id{id} {}
 
-    Buffer::Buffer(GPU &gpu, vk::DeviceSize size) : gpu(gpu), backing(gpu.memory.AllocateBuffer(size)) {
+    Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, vk::DeviceSize size, size_t id)
+        : gpu{gpu},
+          backing{gpu.memory.AllocateBuffer(size)},
+          delegate{delegateAllocator.EmplaceUntracked<BufferDelegate>(this)},
+          id{id} {
         dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean
     }
 
@@ -237,10 +246,15 @@ namespace skyline::gpu {
             gpuCopyCallback();
     }
 
-    BufferView Buffer::GetView(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format) {
-        // Will return an iterator to the inserted view or the already-existing view if the same view is already in the set
-        auto it{views.emplace(offset, size, format).first};
-        return BufferView{shared_from_this(), &(*it)};
+    BufferView Buffer::GetView(vk::DeviceSize offset, vk::DeviceSize size) {
+        return BufferView{delegate, offset, size};
+    }
+
+    BufferView Buffer::TryGetView(span<u8> mapping) {
+        if (guest->contains(mapping))
+            return GetView(static_cast<vk::DeviceSize>(std::distance(guest->begin(), mapping.begin())), mapping.size());
+        else
+            return {};
     }
 
     std::pair<u64, span<u8>> Buffer::AcquireCurrentSequence() {
@@ -288,90 +302,80 @@ namespace skyline::gpu {
         return mutex.try_lock();
     }
 
-    Buffer::BufferViewStorage::BufferViewStorage(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format) : offset(offset), size(size), format(format) {}
+    BufferDelegate::BufferDelegate(Buffer *buffer) : buffer{buffer} {}
 
-    Buffer::BufferDelegate::BufferDelegate(std::shared_ptr<Buffer> pBuffer, const Buffer::BufferViewStorage *view) : buffer(std::move(pBuffer)), view(view) {
-        iterator = buffer->delegates.emplace(buffer->delegates.end(), this);
+    Buffer *BufferDelegate::GetBuffer() {
+        if (linked) [[unlikely]]
+            return link->GetBuffer();
+        else
+            return buffer;
     }
 
-    Buffer::BufferDelegate::~BufferDelegate() {
-        buffer->delegates.erase(iterator);
+    void BufferDelegate::Link(BufferDelegate *newTarget, vk::DeviceSize newOffset) {
+        if (linked)
+            throw exception("Cannot link a buffer delegate that is already linked!");
+
+        linked = true;
+        link = newTarget;
+        offset = newOffset;
     }
 
-    void Buffer::BufferDelegate::lock() {
-        buffer.Lock();
+    vk::DeviceSize BufferDelegate::GetOffset() {
+        if (linked) [[unlikely]]
+            return link->GetOffset() + offset;
+        else
+            return offset;
     }
 
-    bool Buffer::BufferDelegate::LockWithTag(ContextTag pTag) {
-        bool result{};
-        buffer.Lock([pTag, &result](Buffer *pBuffer) {
-            result = pBuffer->LockWithTag(pTag);
-        });
-        return result;
+    void BufferView::ResolveDelegate() {
+        offset += delegate->GetOffset();
+        delegate = delegate->GetBuffer()->delegate;
     }
 
-    void Buffer::BufferDelegate::unlock() {
-        buffer->unlock();
+    BufferView::BufferView() {}
+
+    BufferView::BufferView(BufferDelegate *delegate, vk::DeviceSize offset, vk::DeviceSize size) : delegate{delegate}, offset{offset}, size{size} {}
+
+    Buffer *BufferView::GetBuffer() const {
+        return delegate->GetBuffer();
     }
 
-    bool Buffer::BufferDelegate::try_lock() {
-        return buffer.TryLock();
+    vk::DeviceSize BufferView::GetOffset() const {
+        return offset + delegate->GetOffset();
     }
 
-    BufferView::BufferView(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared<Buffer::BufferDelegate>(std::move(buffer), view)) {}
-
-    void BufferView::RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr<FenceCycle> &cycle, Buffer::BufferDelegate::UsageCallback usageCallback) {
-        if (!bufferDelegate->usageCallbacks)
-            bufferDelegate->usageCallbacks = decltype(bufferDelegate->usageCallbacks)::value_type{allocator};
-
-        // Users of RegisterUsage expect the buffer contents to be sequenced as the guest GPU would be, so force any further sequenced writes in the current cycle to occur on the GPU
-        bufferDelegate->buffer->BlockSequencedCpuBackingWrites();
-
-        usageCallback(*bufferDelegate->view, bufferDelegate->buffer);
-        bufferDelegate->usageCallbacks->emplace_back(std::move(usageCallback));
+    void BufferView::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize readOffset) const {
+        GetBuffer()->Read(isFirstUsage, flushHostCallback, data, readOffset + GetOffset());
     }
 
-    void BufferView::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
-        bufferDelegate->buffer->Read(isFirstUsage, flushHostCallback, data, offset + bufferDelegate->view->offset);
-    }
-
-    void BufferView::Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const {
+    bool BufferView::Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback) const {
         // If megabuffering can't be enabled we have to do a GPU-side copy to ensure sequencing
-        bool gpuCopy{bufferDelegate->view->size > MegaBufferingDisableThreshold};
+        bool gpuCopy{size > MegaBufferingDisableThreshold};
         if (gpuCopy)
-            bufferDelegate->buffer->BlockSequencedCpuBackingWrites();
+            GetBuffer()->BlockSequencedCpuBackingWrites();
 
-        bufferDelegate->buffer->Write(isFirstUsage, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
+        return GetBuffer()->Write(isFirstUsage, flushHostCallback, data, writeOffset + GetOffset(), gpuCopyCallback);
     }
 
     MegaBufferAllocator::Allocation BufferView::AcquireMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator) const {
-        if (!bufferDelegate->buffer->EverHadInlineUpdate())
+        if (!GetBuffer()->EverHadInlineUpdate())
             // Don't megabuffer buffers that have never had inline updates since performance is only going to be harmed as a result of the constant copying and there wont be any benefit since there are no GPU inline updates that would be avoided
             return {};
 
-        if (bufferDelegate->view->size > MegaBufferingDisableThreshold)
+        if (size > MegaBufferingDisableThreshold)
             return {};
 
-        auto [newSequence, sequenceSpan]{bufferDelegate->buffer->AcquireCurrentSequence()};
+        auto [newSequence, sequenceSpan]{GetBuffer()->AcquireCurrentSequence()};
         if (!newSequence)
             return {}; // If the sequence can't be acquired then the buffer is GPU dirty and we can't megabuffer
 
-        // If a copy of the view for the current sequence is already in megabuffer then we can just use that
-        if (newSequence == bufferDelegate->view->lastAcquiredSequence && bufferDelegate->view->megaBufferAllocation)
-            return bufferDelegate->view->megaBufferAllocation;
+        auto viewBackingSpan{sequenceSpan.subspan(GetOffset(), size)};
 
-        // If the view is not in the megabuffer then we need to allocate a new copy
-        auto viewBackingSpan{sequenceSpan.subspan(bufferDelegate->view->offset, bufferDelegate->view->size)};
-
-        // TODO: we could optimise the alignment requirements here based on buffer usage
-        bufferDelegate->view->megaBufferAllocation = allocator.Push(pCycle, viewBackingSpan, true);
-        bufferDelegate->view->lastAcquiredSequence = newSequence;
-
-        return bufferDelegate->view->megaBufferAllocation; // Success!
+        return allocator.Push(pCycle, viewBackingSpan, true); // Success!
     }
 
     span<u8> BufferView::GetReadOnlyBackingSpan(bool isFirstUsage, const std::function<void()> &flushHostCallback) {
-        auto backing{bufferDelegate->buffer->GetReadOnlyBackingSpan(isFirstUsage, flushHostCallback)};
-        return backing.subspan(bufferDelegate->view->offset, bufferDelegate->view->size);
+        auto backing{delegate->GetBuffer()->GetReadOnlyBackingSpan(isFirstUsage, flushHostCallback)};
+        return backing.subspan(GetOffset(), size);
     }
 }
diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h
index 02ddf954..f8742882 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@@ -3,9 +3,7 @@
 
 #pragma once
 
-#include <unordered_set>
 #include <boost/functional/hash.hpp>
-#include <common/lockable_shared_ptr.h>
 #include <common/linear_allocator.h>
 #include <nce.h>
 #include <gpu/tag_allocator.h>
@@ -15,9 +13,9 @@
 namespace skyline::gpu {
     using GuestBuffer = span<u8>; //!< The CPU mapping for the guest buffer, multiple mappings for buffers aren't supported since overlaps cannot be reconciled
 
-    struct BufferView;
+    class BufferView;
     class BufferManager;
-    class MegaBuffer;
+    class BufferDelegate;
 
     /**
      * @brief A buffer which is backed by host constructs while being synchronized with the underlying guest buffer
@@ -31,6 +29,7 @@ namespace skyline::gpu {
         memory::Buffer backing;
         std::optional<GuestBuffer> guest;
         std::shared_ptr<FenceCycle> cycle{}; //!< A fence cycle for when any host operation mutating the buffer has completed, it must be waited on prior to any mutations to the backing
+        size_t id;
 
         span<u8> mirror{}; //!< A contiguous mirror of all the guest mappings to allow linear access on the CPU
         span<u8> alignedMirror{}; //!< The mirror mapping aligned to page size to reflect the full mapping
@@ -52,75 +51,15 @@ namespace skyline::gpu {
         bool everHadInlineUpdate{}; //!< Whether the buffer has ever had an inline update since it was created, if this is set then megabuffering will be attempted by views to avoid the cost of inline GPU updates
 
       public:
-        /**
-         * @brief Storage for all metadata about a specific view into the buffer, used to prevent redundant view creation and duplication of VkBufferView(s)
-         */
-        struct BufferViewStorage {
-            vk::DeviceSize offset;
-            vk::DeviceSize size;
-            vk::Format format;
-
-            // These are not accounted for in hash nor operator== since they are not an inherent property of the view, but they are required nonetheless for megabuffering on a per-view basis
-            mutable u64 lastAcquiredSequence{}; //!< The last sequence number for the attached buffer that the megabuffer copy of this view was acquired from, if this is equal to the current sequence of the attached buffer then the copy at `megabufferOffset` is still valid
-            mutable MegaBufferAllocator::Allocation megaBufferAllocation; //!< Allocation for the current copy of the view in the megabuffer (if any), 0 if no copy exists and this is only valid if `lastAcquiredSequence` is equal to the current sequence of the attached buffer
-
-            BufferViewStorage(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format);
-
-            bool operator==(const BufferViewStorage &other) const {
-                return other.offset == offset && other.size == size && other.format == format;
-            }
-        };
 
         static constexpr u64 InitialSequenceNumber{1}; //!< Sequence number that all buffers start off with
 
       private:
-        /**
-         * @brief Hash function for BufferViewStorage to be used in the views set
-         */
-        struct BufferViewStorageHash {
-            size_t operator()(const BufferViewStorage &entry) const noexcept {
-                size_t seed{};
-                boost::hash_combine(seed, entry.offset);
-                boost::hash_combine(seed, entry.size);
-                boost::hash_combine(seed, entry.format);
-
-                // The mutable fields {lastAcquiredSequence, megabufferOffset} are deliberately ignored
-                return seed;
-            }
-        };
-
-        std::unordered_set<BufferViewStorage, BufferViewStorageHash> views; //!< BufferViewStorage(s) that are backed by this Buffer, used for storage and repointing to a new Buffer on deletion
-
         u64 sequenceNumber{InitialSequenceNumber}; //!< Sequence number that is incremented after all modifications to the host side `backing` buffer, used to prevent redundant copies of the buffer being stored in the megabuffer by views
 
-      public:
-        /**
-         * @brief A delegate for a strong reference to a Buffer by a BufferView which can be changed to another Buffer transparently
-         * @note This class conforms to the Lockable and BasicLockable C++ named requirements
-         */
-        struct BufferDelegate {
-            LockableSharedPtr<Buffer> buffer;
-            const Buffer::BufferViewStorage *view;
-            bool attached{};
-            using UsageCallback = std::function<void(const BufferViewStorage &, const std::shared_ptr<Buffer> &)>;
-            std::optional<std::vector<UsageCallback, LinearAllocator<UsageCallback>>> usageCallbacks;
-            std::list<BufferDelegate *>::iterator iterator;
-
-            BufferDelegate(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view);
-
-            ~BufferDelegate();
-
-            void lock();
-
-            bool LockWithTag(ContextTag tag);
-
-            void unlock();
-
-            bool try_lock();
-        };
 
       private:
-        std::list<BufferDelegate *> delegates; //!< The reference delegates for this buffer, used to prevent the buffer from being deleted while it is still in use
+        BufferDelegate *delegate;
 
         friend BufferView;
         friend BufferManager;
@@ -155,13 +94,13 @@ namespace skyline::gpu {
          * @brief Creates a buffer object wrapping the guest buffer with a backing that can represent the guest buffer data
          * @note The guest mappings will not be setup until SetupGuestMappings() is called
          */
-        Buffer(GPU &gpu, GuestBuffer guest);
+        Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, GuestBuffer guest, size_t id);
 
         /**
          * @brief Creates a host-only Buffer which isn't backed by any guest buffer
          * @note The created buffer won't have a mirror so any operations cannot depend on a mirror existing
          */
-        Buffer(GPU &gpu, vk::DeviceSize size);
+        Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, vk::DeviceSize size, size_t id);
 
         ~Buffer();
 
@@ -311,10 +250,16 @@ namespace skyline::gpu {
         void Write(bool isFirstUsage, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset);
 
         /**
-         * @return A cached or newly created view into this buffer with the supplied attributes
+         * @return A view into this buffer with the supplied attributes
          * @note The buffer **must** be locked prior to calling this
          */
-        BufferView GetView(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format = {});
+        BufferView GetView(vk::DeviceSize offset, vk::DeviceSize size);
+
+        /**
+         * @return A view into this buffer containing the given mapping, if the buffer doesn't contain the mapping an empty view will be returned
+         * @note The buffer **must** be locked prior to calling this
+         */
+        BufferView TryGetView(span<u8> mapping);
 
         /**
          * @brief Attempts to return the current sequence number and prepare the buffer for read accesses from the returned span
@@ -342,90 +287,140 @@ namespace skyline::gpu {
         span<u8> GetReadOnlyBackingSpan(bool isFirstUsage, const std::function<void()> &flushHostCallback);
     };
 
+    /**
+     * @brief A delegate for a strong reference to a Buffer by a BufferView which can be changed to another Buffer transparently
+     */
+    class BufferDelegate {
+      private:
+        union {
+            BufferDelegate *link{};
+            Buffer *buffer;
+        };
+        vk::DeviceSize offset{};
+
+        bool linked{};
+
+      public:
+        BufferDelegate(Buffer *buffer);
+
+        /**
+         * @brief Follows links to get the underlying target buffer of the delegate
+         */
+        Buffer *GetBuffer();
+
+        /**
+         * @brief Links the delegate to target a new buffer object
+         * @note Both the current target buffer object and new target buffer object **must** be locked prior to calling this
+         */
+        void Link(BufferDelegate *newTarget, vk::DeviceSize newOffset);
+
+        /**
+         * @return The offset of the delegate in the buffer
+         * @note The target buffer **must** be locked prior to calling this
+         */
+        vk::DeviceSize GetOffset();
+    };
+
     /**
      * @brief A contiguous view into a Vulkan Buffer that represents a single guest buffer (as opposed to Buffer objects which contain multiple)
      * @note The object **must** be locked prior to accessing any members as values will be mutated
      * @note This class conforms to the Lockable and BasicLockable C++ named requirements
      */
-    struct BufferView {
+    class BufferView {
+      private:
         constexpr static vk::DeviceSize MegaBufferingDisableThreshold{1024 * 128}; //!< The threshold at which the view is considered to be too large to be megabuffered (128KiB)
 
-        std::shared_ptr<Buffer::BufferDelegate> bufferDelegate;
-
-        BufferView(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view);
-
-        constexpr BufferView(nullptr_t = nullptr) : bufferDelegate(nullptr) {}
+        BufferDelegate *delegate{};
+        vk::DeviceSize offset{};
 
         /**
-         * @brief Acquires an exclusive lock on the buffer for the calling thread
-         * @note Naming is in accordance to the BasicLockable named requirement
-         */
-        void lock() const {
-            bufferDelegate->lock();
-        }
-
-        /**
-         * @brief Acquires an exclusive lock on the buffer for the calling thread
-         * @param tag A tag to associate with the lock, future invocations with the same tag prior to the unlock will acquire the lock without waiting (A default initialised tag will disable this behaviour)
-         * @return If the lock was acquired by this call as opposed to the buffer already being locked with the same tag
-         * @note All locks using the same tag **must** be from the same thread as it'll only have one corresponding unlock() call
-         */
-        bool LockWithTag(ContextTag tag) const {
-            return bufferDelegate->LockWithTag(tag);
-        }
-
-        /**
-         * @brief Relinquishes an existing lock on the buffer by the calling thread
-         * @note Naming is in accordance to the BasicLockable named requirement
-         */
-        void unlock() const {
-            bufferDelegate->unlock();
-        }
-
-        /**
-         * @brief Attempts to acquire an exclusive lock but returns immediately if it's captured by another thread
-         * @note Naming is in accordance to the Lockable named requirement
-         */
-        bool try_lock() const {
-            return bufferDelegate->try_lock();
-        }
-
-        constexpr operator bool() const {
-            return bufferDelegate != nullptr;
-        }
-
-        /**
-         * @note The buffer **must** be locked prior to calling this
-         */
-        Buffer::BufferDelegate *operator->() const {
-            return bufferDelegate.get();
-        }
-
-        /**
-         * @brief Registers a callback for a usage of this view, it may be called multiple times due to the view being recreated with different backings
-         * @note This will force the buffer to be host immutable for the current cycle, preventing megabuffering and requiring slower GPU inline writes instead
-         * @note The callback will be automatically called the first time after registration
+         * @brief Resolves the delegate's pointer chain so it directly points to the target buffer, updating offset accordingly
          * @note The view **must** be locked prior to calling this
          */
-        void RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr<FenceCycle> &cycle, Buffer::BufferDelegate::UsageCallback usageCallback);
+        void ResolveDelegate();
+
+      public:
+        vk::DeviceSize size{};
+
+        BufferView();
+
+        BufferView(BufferDelegate *delegate, vk::DeviceSize offset, vk::DeviceSize size);
+
+        /**
+         * @return A pointer to the current underlying buffer of the view
+         * @note The view **must** be locked prior to calling this
+         */
+        Buffer *GetBuffer() const;
+
+        /**
+         * @return The offset of the view in the underlying buffer
+         * @note The view **must** be locked prior to calling this
+         */
+        vk::DeviceSize GetOffset() const;
+
+        /**
+         * @brief Templated lock function that ensures correct locking of the delegate's underlying buffer
+         */
+        template<bool TryLock, typename LockFunction, typename UnlockFunction>
+        std::conditional_t<TryLock, bool, void> LockWithFunction(LockFunction lock, UnlockFunction unlock) {
+            while (true) {
+                auto preLockBuffer{delegate->GetBuffer()};
+                if constexpr (TryLock) {
+                    if (!lock(preLockBuffer))
+                        return false;
+                } else {
+                    lock(preLockBuffer);
+                }
+                auto postLockBuffer{delegate->GetBuffer()};
+                if (preLockBuffer == postLockBuffer)
+                    break;
+
+                preLockBuffer->unlock();
+            };
+
+            ResolveDelegate();
+
+            if constexpr (TryLock)
+                return true;
+            else
+                return;
+        }
+
+        void lock() {
+            LockWithFunction<false>([](Buffer *buffer) { buffer->lock(); }, [](Buffer *buffer) { buffer->unlock(); });
+        }
+
+        bool try_lock() {
+            return LockWithFunction<true>([](Buffer *buffer) { return buffer->try_lock(); }, [](Buffer *buffer) { buffer->unlock(); });
+        }
+
+        bool LockWithTag(ContextTag tag) {
+            bool result{};
+            LockWithFunction<false>([&result, tag](Buffer *buffer) { result = buffer->LockWithTag(tag); }, [](Buffer *buffer) { buffer->unlock(); });
+            return result;
+        }
+
+        void unlock() {
+            delegate->GetBuffer()->unlock();
+        }
 
         /**
          * @brief Reads data at the specified offset in the view
          * @note The view **must** be locked prior to calling this
          * @note See Buffer::Read
          */
-        void Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const;
+        void Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize readOffset) const;
 
         /**
          * @brief Writes data at the specified offset in the view
          * @note The view **must** be locked prior to calling this
          * @note See Buffer::Write
          */
-        void Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &cycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const;
+        bool Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &cycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback = {}) const;
 
         /**
-         * @brief If megabuffering is beneficial for the current buffer, pushes its contents into the megabuffer and returns the offset of the pushed data
-         * @return The megabuffer allocation for the buffer, may be invalid if megabuffering is not beneficial
+         * @brief If megabuffering is beneficial for the view, pushes its contents into the megabuffer and returns the offset of the pushed data
+         * @return The megabuffer allocation for the view, may be invalid if megabuffering is not beneficial
          * @note The view **must** be locked prior to calling this
          */
         MegaBufferAllocator::Allocation AcquireMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator) const;
@@ -437,5 +432,9 @@ namespace skyline::gpu {
          * @note See Buffer::GetReadOnlyBackingSpan
          */
         span<u8> GetReadOnlyBackingSpan(bool isFirstUsage, const std::function<void()> &flushHostCallback);
+
+        constexpr operator bool() {
+            return delegate != nullptr;
+        }
     };
 }
diff --git a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
index bb04d8ea..bb2a032a 100644
--- a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
@@ -77,7 +77,7 @@ namespace skyline::gpu {
                 highestAddress = mapping.end().base();
         }
 
-        LockedBuffer newBuffer{std::make_shared<Buffer>(gpu, span<u8>{lowestAddress, highestAddress}), tag}; // If we don't lock the buffer prior to trapping it during synchronization, a race could occur with a guest trap acquiring the lock before we do and mutating the buffer prior to it being ready
+        LockedBuffer newBuffer{std::make_shared<Buffer>(delegateAllocatorState, gpu, span<u8>{lowestAddress, highestAddress}, nextBufferId++), tag}; // If we don't lock the buffer prior to trapping it during synchronization, a race could occur with a guest trap acquiring the lock before we do and mutating the buffer prior to it being ready
 
         newBuffer->SetupGuestMappings();
         newBuffer->SynchronizeHost(false); // Overlaps don't necessarily fully cover the buffer so we have to perform a sync here to prevent any gaps
@@ -132,64 +132,38 @@ namespace skyline::gpu {
 
             // Transfer all views from the overlapping buffer to the new buffer with the new buffer and updated offset, ensuring pointer stability
             vk::DeviceSize overlapOffset{static_cast<vk::DeviceSize>(srcBuffer->guest->begin() - newBuffer->guest->begin())};
-            for (auto it{srcBuffer->views.begin()}; it != srcBuffer->views.end(); it++) {
-                if (overlapOffset)
-                    // This is a slight hack as we really shouldn't be changing the underlying non-mutable set elements without a rehash but without writing our own set impl this is the best we can do
-                    const_cast<Buffer::BufferViewStorage *>(&*it)->offset += overlapOffset;
-
-                // Reset the sequence number to the initial one, if the new buffer was created from any GPU dirty overlaps then the new buffer's sequence will be incremented past this thus forcing a reacquire if necessary
-                // This is fine to do in the set since the hash and operator== do not use this value
-                it->lastAcquiredSequence = Buffer::InitialSequenceNumber;
-            }
-
-            if (overlapOffset)
-                // All current hashes are invalidated by above loop if overlapOffset is nonzero so rehash the container
-                srcBuffer->views.rehash(0);
-
-            // Merge the view sets, this will keep pointer stability hence avoiding any reallocation
-            newBuffer->views.merge(srcBuffer->views);
-
-            // Transfer all delegates references from the overlapping buffer to the new buffer
-            for (auto &delegate : srcBuffer->delegates) {
-                delegate->buffer = *newBuffer;
-                if (delegate->usageCallbacks)
-                    for (auto &callback : *delegate->usageCallbacks)
-                        callback(*delegate->view, *newBuffer);
-            }
-
-            newBuffer->delegates.splice(newBuffer->delegates.end(), srcBuffer->delegates);
+            srcBuffer->delegate->Link(newBuffer->delegate, overlapOffset);
         }
 
         return newBuffer;
     }
 
-    BufferView BufferManager::FindOrCreate(GuestBuffer guestMapping, ContextTag tag, const std::function<void(std::shared_ptr<Buffer>, ContextLock<Buffer> &&)> &attachBuffer) {
+    BufferView BufferManager::FindOrCreateImpl(GuestBuffer guestMapping, ContextTag tag, const std::function<void(std::shared_ptr<Buffer>, ContextLock<Buffer> &&)> &attachBuffer) {
         /*
          * We align the buffer to the page boundary to ensure that:
          * 1) Any buffer view has the same alignment guarantees as on the guest, this is required for UBOs, SSBOs and Texel buffers
          * 2) We can coalesce a lot of tiny buffers into a single large buffer covering an entire page, this is often the case for index buffers and vertex buffers
          */
-        auto alignedStart{util::AlignDown(guestMapping.begin().base(), PAGE_SIZE)}, alignedEnd{util::AlignUp(guestMapping.end().base(), PAGE_SIZE)};
-        vk::DeviceSize offset{static_cast<size_t>(guestMapping.begin().base() - alignedStart)}, size{guestMapping.size()};
-        guestMapping = span<u8>{alignedStart, alignedEnd};
+        auto alignedStart{util::AlignDown(guestMapping.begin().base(), constant::PageSize)}, alignedEnd{util::AlignUp(guestMapping.end().base(), constant::PageSize)};
+        span<u8> alignedGuestMapping{alignedStart, alignedEnd};
 
-        auto overlaps{Lookup(guestMapping, tag)};
+        auto overlaps{Lookup(alignedGuestMapping, tag)};
         if (overlaps.size() == 1) [[likely]] {
             // If we find a buffer which can entirely fit the guest mapping, we can just return a view into it
             auto &firstOverlap{overlaps.front()};
-            if (firstOverlap->guest->begin() <= guestMapping.begin() && firstOverlap->guest->end() >= guestMapping.end())
-                return firstOverlap->GetView(static_cast<vk::DeviceSize>(guestMapping.begin() - firstOverlap->guest->begin()) + offset, size);
+            if (firstOverlap->guest->begin() <= alignedGuestMapping.begin() && firstOverlap->guest->end() >= alignedGuestMapping.end())
+                return firstOverlap->GetView(static_cast<vk::DeviceSize>(guestMapping.begin() - firstOverlap->guest->begin()), guestMapping.size());
         }
 
         if (overlaps.empty()) {
             // If we couldn't find any overlapping buffers, create a new buffer without coalescing
-            LockedBuffer buffer{std::make_shared<Buffer>(gpu, guestMapping), tag};
+            LockedBuffer buffer{std::make_shared<Buffer>(delegateAllocatorState, gpu, alignedGuestMapping, nextBufferId++), tag};
             buffer->SetupGuestMappings();
             InsertBuffer(*buffer);
-            return buffer->GetView(offset, size);
+            return buffer->GetView(static_cast<vk::DeviceSize>(guestMapping.begin() - buffer->guest->begin()), guestMapping.size());
         } else {
             // If the new buffer overlaps other buffers, we need to create a new buffer and coalesce all overlapping buffers into one
-            auto buffer{CoalesceBuffers(guestMapping, overlaps, tag)};
+            auto buffer{CoalesceBuffers(alignedGuestMapping, overlaps, tag)};
 
             // If any overlapping buffer was already attached to the current context, we should also attach the new buffer
             for (auto &srcBuffer : overlaps) {
@@ -206,7 +180,7 @@ namespace skyline::gpu {
             }
             InsertBuffer(*buffer);
 
-            return buffer->GetView(static_cast<vk::DeviceSize>(guestMapping.begin() - buffer->guest->begin()) + offset, size);
+            return buffer->GetView(static_cast<vk::DeviceSize>(guestMapping.begin() - buffer->guest->begin()), guestMapping.size());
         }
     }
 }
diff --git a/app/src/main/cpp/skyline/gpu/buffer_manager.h b/app/src/main/cpp/skyline/gpu/buffer_manager.h
index 6ae7197a..c377b48f 100644
--- a/app/src/main/cpp/skyline/gpu/buffer_manager.h
+++ b/app/src/main/cpp/skyline/gpu/buffer_manager.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <common/linear_allocator.h>
 #include <common/segment_table.h>
 #include "buffer.h"
 
@@ -15,6 +16,8 @@ namespace skyline::gpu {
         GPU &gpu;
         std::mutex mutex; //!< Synchronizes access to the buffer mappings
         std::vector<std::shared_ptr<Buffer>> bufferMappings; //!< A sorted vector of all buffer mappings
+        LinearAllocatorState<> delegateAllocatorState; //!< Linear allocator state used to allocate buffer delegates, passed by reference to the Buffer constructor; delegates are allocated linearly and never freed
+        size_t nextBufferId{}; //!< The next unique buffer ID to be assigned, post-incremented as it is passed to the constructor of a newly created Buffer
 
         static constexpr size_t L2EntryGranularity{19}; //!< The amount of AS (in bytes) a single L2 PTE covers (512 KiB == 1 << 19)
         SegmentTable<Buffer *, constant::AddressSpaceSize, constant::PageSizeBits, L2EntryGranularity> bufferTable; //!< A page table of all buffer mappings for O(1) lookups on full matches
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index e55d5f0b..a752ce81 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -133,15 +133,9 @@ namespace skyline::gpu::interconnect {
             // See AttachTexture(...)
             bufferManagerLock.emplace(gpu.buffer);
 
-        bool didLock{view->LockWithTag(tag)};
+        bool didLock{view.LockWithTag(tag)};
         if (didLock)
-            attachedBuffers.emplace_back(view->buffer);
-
-        if (view.bufferDelegate->attached)
-            return didLock;
-
-        attachedBufferDelegates.emplace_back(view.bufferDelegate);
-        view.bufferDelegate->attached = true;
+            attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
         return didLock;
     }
 
@@ -152,15 +146,9 @@ namespace skyline::gpu::interconnect {
 
         if (lock.OwnsLock()) {
             // Transfer ownership to executor so that the resource will stay locked for the period it is used on the GPU
-            attachedBuffers.emplace_back(view->buffer);
+            attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this());
             lock.Release(); // The executor will handle unlocking the lock so it doesn't need to be handled here
         }
-
-        if (view.bufferDelegate->attached)
-            return;
-
-        attachedBufferDelegates.emplace_back(view.bufferDelegate);
-        view.bufferDelegate->attached = true;
     }
 
     void CommandExecutor::AttachLockedBuffer(std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
@@ -322,14 +310,6 @@ namespace skyline::gpu::interconnect {
     void CommandExecutor::ResetInternal() {
         attachedTextures.clear();
         textureManagerLock.reset();
-
-        for (const auto &delegate : attachedBufferDelegates) {
-            delegate->usageCallbacks.reset();
-            delegate->attached = false;
-            delegate->view->megaBufferAllocation = {};
-        }
-
-        attachedBufferDelegates.clear();
         attachedBuffers.clear();
         bufferManagerLock.reset();
         megaBufferAllocatorLock.reset();
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index 4b0aed20..f903c653 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -64,8 +64,6 @@ namespace skyline::gpu::interconnect {
 
         std::vector<LockedBuffer> attachedBuffers; //!< All textures that are attached to the current execution
 
-        using SharedBufferDelegate = std::shared_ptr<Buffer::BufferDelegate>;
-        std::vector<SharedBufferDelegate> attachedBufferDelegates; //!< All buffers that are attached to the current execution
 
         std::vector<TextureView *> lastSubpassAttachments; //!< The storage backing for attachments used in the last subpass
         span<TextureView *> lastSubpassInputAttachments; //!< The set of input attachments used in the last subpass