Rewrite buffer megabuffering to be per view and more efficient

This commit implements several key optimisations in megabuffering that are all inherently interlinked. - Megabuffering is moved from per-buffer to per-view copies; this makes megabuffering possible for small views into larger underlying buffers, which is often the case with even the simplest of games. - Megabuffering is no longer the default option; it is only enabled for buffer views that have had inline GPU writes applied to them in the past, as that is the only case where it is beneficial. In any other case the cost of copying, even with a 128KiB limit, can be significant. - With both of these changes, there is now the possibility of overlapping views where one uses megabuffering and one does not. In order to allow GPU inline writes to work consistently in such cases, a system of 'host immutability' has been implemented: when a buffer is marked as host immutable for a given cycle, all writes to the buffer from that point until the cycle is signalled will be performed on the GPU, ensuring that the backing contents are correctly sequenced.
2024-11-29 19:44:17 +01:00 · 2022-06-10 21:26:19 +01:00 · 2022-06-10 21:26:19 +01:00 · 7709dc8cf6
commit 7709dc8cf6
parent 2e356b8f0b
5 changed files with 128 additions and 77 deletions
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@ -8,9 +8,11 @@
 #include "buffer.h"

 namespace skyline::gpu {
-    void Buffer::TryEnableMegaBuffering() {
-        megaBufferOffset = 0;
-        megaBufferingEnabled = backing.size() < MegaBufferingDisableThreshold;
+    bool Buffer::CheckHostImmutable() {
+        if (hostImmutableCycle && hostImmutableCycle->Poll())
+            hostImmutableCycle.reset();
+
+        return hostImmutableCycle != nullptr;
    }

    void Buffer::SetupGuestMappings() {
@ -33,14 +35,11 @@ namespace skyline::gpu {
    }

    Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) {
-        TryEnableMegaBuffering();
        SetupGuestMappings();
    }

    Buffer::Buffer(GPU &gpu, const std::shared_ptr<FenceCycle> &pCycle, GuestBuffer guest, span<std::shared_ptr<Buffer>> srcBuffers) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) {
        std::scoped_lock bufLock{*this};
-
-        TryEnableMegaBuffering();
        SetupGuestMappings();

        // Source buffers don't necessarily fully overlap with us so we have to perform a sync here to prevent any gaps
@ -63,8 +62,15 @@ namespace skyline::gpu {
        for (const auto &srcBuffer : srcBuffers) {
            std::scoped_lock lock{*srcBuffer};
            if (srcBuffer->guest) {
-                if (!srcBuffer->megaBufferingEnabled)
-                    megaBufferingEnabled = false;
+                if (srcBuffer->hostImmutableCycle) {
+                    // Propagate any host immutability
+                    if (hostImmutableCycle) {
+                        if (srcBuffer->hostImmutableCycle.owner_before(hostImmutableCycle))
+                            hostImmutableCycle = srcBuffer->hostImmutableCycle;
+                    } else {
+                        hostImmutableCycle = srcBuffer->hostImmutableCycle;
+                    }
+                }

                if (srcBuffer->dirtyState == Buffer::DirtyState::GpuDirty) {
                    // If the source buffer is GPU dirty we cannot directly copy over its GPU backing contents
@ -80,7 +86,7 @@ namespace skyline::gpu {
                    }
                } else if (srcBuffer->dirtyState == Buffer::DirtyState::Clean) {
                    // For clean buffers we can just copy over the GPU backing data directly
-                    // This is necessary since clean buffers may not have matching GPU/CPU data in the case of non-megabuffered inline updates
+                    // This is necessary since clean buffers may not have matching GPU/CPU data in the case of inline updates for host immutable buffers
                    copyBuffer(guest, *srcBuffer->guest, backing.data(), srcBuffer->backing.data());
                }

@ -90,7 +96,6 @@ namespace skyline::gpu {
    }

    Buffer::Buffer(GPU &gpu, vk::DeviceSize size) : gpu(gpu), backing(gpu.memory.AllocateBuffer(size)) {
-        TryEnableMegaBuffering();
        dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean
    }

@ -107,7 +112,7 @@ namespace skyline::gpu {
        if (dirtyState == DirtyState::GpuDirty || !guest)
            return;

-        megaBufferingEnabled = false; // We can no longer megabuffer this buffer after it has been written by the GPU
+        AdvanceSequence(); // The GPU will modify buffer contents so advance to the next sequence
        gpu.state.nce->RetrapRegions(*trapHandle, false);
        dirtyState = DirtyState::GpuDirty;
    }
@ -139,13 +144,10 @@ namespace skyline::gpu {

        TRACE_EVENT("gpu", "Buffer::SynchronizeHost");

-        // If we have performed a CPU->GPU sync and megabuffering is enabled for this buffer the megabuffer copy of the buffer will no longer be up-to-date
-        InvalidateMegaBuffer();
-
+        AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
        std::memcpy(backing.data(), mirror.data(), mirror.size());

        if (rwTrap) {
-            megaBufferingEnabled = false; // We can't megabuffer a buffer written by the GPU
            gpu.state.nce->RetrapRegions(*trapHandle, false);
            dirtyState = DirtyState::GpuDirty;
        } else {
@ -163,13 +165,10 @@ namespace skyline::gpu {

        TRACE_EVENT("gpu", "Buffer::SynchronizeHostWithCycle");

-        // If we have performed a CPU->GPU sync and megabuffering is enabled for this buffer the megabuffer copy of the buffer will no longer be up-to-date so force a recreation
-        InvalidateMegaBuffer();
-
+        AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
        std::memcpy(backing.data(), mirror.data(), mirror.size());

        if (rwTrap) {
-            megaBufferingEnabled = false; // We can't megabuffer a buffer written by the GPU
            gpu.state.nce->RetrapRegions(*trapHandle, false);
            dirtyState = DirtyState::GpuDirty;
        } else {
@ -195,7 +194,6 @@ namespace skyline::gpu {
            gpu.state.nce->RetrapRegions(*trapHandle, true);

        dirtyState = DirtyState::Clean;
-        TryEnableMegaBuffering(); // If megaBuffering was disabled due to potential GPU dirtiness we can safely try to re-enable it now that the buffer is clean
    }

    /**
@ -236,7 +234,8 @@ namespace skyline::gpu {
    }

    void Buffer::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) {
-        InvalidateMegaBuffer(); // Since we're writing to the backing buffer the megabuffer contents will require refresh
+        AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
+        everHadInlineUpdate = true;

        // Perform a syncs in both directions to ensure correct ordering of writes
        if (dirtyState == DirtyState::CpuDirty)
@ -249,13 +248,12 @@ namespace skyline::gpu {

        std::memcpy(mirror.data() + offset, data.data(), data.size()); // Always copy to mirror since any CPU side reads will need the up-to-date contents

-        if (megaBufferingEnabled) {
-            // If megabuffering is enabled then we don't need to do any special sequencing here, we can write directly to the backing and the sequencing for it will be handled at usage time
-            std::memcpy(backing.data() + offset, data.data(), data.size());
-        } else {
-            // Fallback to a GPU-side inline update for the buffer contents to ensure correct sequencing with draws
+        if (CheckHostImmutable())
+            // Perform a GPU-side inline update for the buffer contents if this buffer is host immutable since we can't directly modify the backing
            gpuCopyCallback();
-        }
+        else
+            // If that's not the case we don't need to do any GPU-side sequencing here, we can write directly to the backing and the sequencing for it will be handled at usage time
+            std::memcpy(backing.data() + offset, data.data(), data.size());
    }

    BufferView Buffer::GetView(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format) {
@ -264,23 +262,20 @@ namespace skyline::gpu {
        return BufferView{shared_from_this(), &(*it)};
    }

-    vk::DeviceSize Buffer::AcquireMegaBuffer(MegaBuffer &megaBuffer) {
-        SynchronizeGuest(false, true); // First try and enable megabuffering by doing an immediate sync
+    std::pair<u64, span<u8>> Buffer::AcquireCurrentSequence() {
+        SynchronizeGuest(false, true); // First try to remove GPU dirtiness by doing an immediate sync and taking a quick shower

-        if (!megaBufferingEnabled)
-            return 0; // Bail out if megabuffering is disabled for this buffer
+        if (dirtyState == DirtyState::GpuDirty)
+            // Bail out if buffer is GPU dirty - since we don't know the contents ahead of time the sequence is indeterminate
+            return {};

-        SynchronizeHost(); // Since pushes to the megabuffer use the GPU backing contents ensure they're up-to-date by performing a CPU -> GPU sync
+        SynchronizeHost(); // Ensure that the returned mirror is fully up-to-date by performing a CPU -> GPU sync

-        if (megaBufferOffset)
-            return megaBufferOffset; // If the current buffer contents haven't been changed since the last acquire, we can just return the existing offset
-
-        megaBufferOffset = megaBuffer.Push(backing, true); // Buffers are required to be page aligned in the megabuffer
-        return megaBufferOffset;
+        return {sequenceNumber, mirror};
    }

-    void Buffer::InvalidateMegaBuffer() {
-        megaBufferOffset = 0;
+    void Buffer::AdvanceSequence() {
+        sequenceNumber++;
    }

    span<u8> Buffer::GetReadOnlyBackingSpan(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback) {
@ -290,6 +285,10 @@ namespace skyline::gpu {
        return mirror;
    }

+    void Buffer::MarkHostImmutable(const std::shared_ptr<FenceCycle> &pCycle) {
+        hostImmutableCycle = pCycle;
+    }
+
    Buffer::BufferViewStorage::BufferViewStorage(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format) : offset(offset), size(size), format(format) {}

    Buffer::BufferDelegate::BufferDelegate(std::shared_ptr<Buffer> pBuffer, const Buffer::BufferViewStorage *view) : buffer(std::move(pBuffer)), view(view) {
@ -347,7 +346,10 @@ namespace skyline::gpu {
        }
    }

-    void BufferView::RegisterUsage(const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
+    void BufferView::RegisterUsage(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
+        // Users of RegisterUsage expect the buffer contents to be sequenced as the guest GPU would be, so force any further writes in the current cycle to occur on the GPU
+        bufferDelegate->buffer->MarkHostImmutable(pCycle);
+
        usageCallback(*bufferDelegate->view, bufferDelegate->buffer);
        if (!bufferDelegate->usageCallback) {
            bufferDelegate->usageCallback = usageCallback;
@ -364,17 +366,39 @@ namespace skyline::gpu {
    }

    void BufferView::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const {
+        // If megabuffering can't be enabled we have to do a GPU-side copy to ensure sequencing
+        bool gpuCopy{bufferDelegate->view->size > MegaBufferingDisableThreshold};
+        if (gpuCopy)
+            // This will force the host buffer contents to stay as is for the current cycle, requiring that write operations are instead sequenced on the GPU for the entire buffer
+            bufferDelegate->buffer->MarkHostImmutable(pCycle);
+
        bufferDelegate->buffer->Write(pCycle, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
    }

    vk::DeviceSize BufferView::AcquireMegaBuffer(MegaBuffer &megaBuffer) const {
-        vk::DeviceSize bufferOffset{bufferDelegate->buffer->AcquireMegaBuffer(megaBuffer)};
-
-        // Propagate 0 results since they signify that megabuffering isn't supported for a buffer
-        if (bufferOffset)
-            return bufferOffset + bufferDelegate->view->offset;
-        else
+        if (!bufferDelegate->buffer->EverHadInlineUpdate())
+            // Don't megabuffer buffers that have never had inline updates since performance is only going to be harmed as a result of the constant copying and there won't be any benefit since there are no GPU inline updates that would be avoided
            return 0;
+
+        if (bufferDelegate->view->size > MegaBufferingDisableThreshold)
+            return 0;
+
+        auto[newSequence, sequenceSpan]{bufferDelegate->buffer->AcquireCurrentSequence()};
+        if (!newSequence)
+            return 0; // If the sequence can't be acquired then the buffer is GPU dirty and we can't megabuffer
+
+        // If a copy of the view for the current sequence is already in megabuffer then we can just use that
+        if (newSequence == bufferDelegate->view->lastAcquiredSequence && bufferDelegate->view->megabufferOffset)
+            return bufferDelegate->view->megabufferOffset;
+
+        // If the view is not in the megabuffer then we need to allocate a new copy
+        auto viewBackingSpan{sequenceSpan.subspan(bufferDelegate->view->offset, bufferDelegate->view->size)};
+
+        // TODO: we could optimise the alignment requirements here based on buffer usage
+        bufferDelegate->view->megabufferOffset = megaBuffer.Push(viewBackingSpan, true);
+        bufferDelegate->view->lastAcquiredSequence = newSequence;
+
+        return bufferDelegate->view->megabufferOffset; // Success!
    }

    span<u8> BufferView::GetReadOnlyBackingSpan(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback) {
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@ -35,15 +35,14 @@ namespace skyline::gpu {
            GpuDirty, //!< The GPU buffer has been modified but the CPU mappings have not been updated
        } dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU buffer

-        constexpr static vk::DeviceSize MegaBufferingDisableThreshold{0x10'000}; //!< The threshold at which the buffer is considered to be too large to be megabuffered (64KiB)
+        bool everHadInlineUpdate{}; //!< Whether the buffer has ever had an inline update since it was created, if this is set then megabuffering will be attempted by views to avoid the cost of inline GPU updates

-        bool megaBufferingEnabled{}; //!< If megabuffering can be used for this buffer at the current moment, is set based on MegaBufferingDisableThreshold and dirty state
-        vk::DeviceSize megaBufferOffset{}; //!< The offset into the megabuffer where the current buffer contents are stored, 0 if there is no up-to-date megabuffer entry for the current buffer contents
+        std::shared_ptr<FenceCycle> hostImmutableCycle; //!< The cycle for when the buffer was last immutable, if this is signalled the buffer is no longer immutable

        /**
-         * @brief Resets megabuffering state based off of the buffer size
+         * @return If the buffer should be treated as host immutable
         */
-        void TryEnableMegaBuffering();
+        bool CheckHostImmutable();

      public:
        /**
@ -54,11 +53,19 @@ namespace skyline::gpu {
            vk::DeviceSize size;
            vk::Format format;

+            // These are not accounted for in hash nor operator== since they are not an inherent property of the view, but they are required nonetheless for megabuffering on a per-view basis
+            mutable u64 lastAcquiredSequence{}; //!< The last sequence number for the attached buffer that the megabuffer copy of this view was acquired from, if this is equal to the current sequence of the attached buffer then the copy at `megabufferOffset` is still valid
+            mutable vk::DeviceSize megabufferOffset{}; //!< Offset of the current copy of the view in the megabuffer (if any), 0 if no copy exists and this is only valid if `lastAcquiredSequence` is equal to the current sequence of the attached buffer
+
            BufferViewStorage(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format);

-            auto operator<=>(const BufferViewStorage &) const = default;
+            bool operator==(const BufferViewStorage &other) const {
+                return other.offset == offset && other.size == size && other.format == format;
+            }
        };

+        static constexpr u64 InitialSequenceNumber{1}; //!< Sequence number that all buffers start off with
+
      private:
        /**
         * @brief Hash function for BufferViewStorage to be used in the views set
@ -70,12 +77,15 @@ namespace skyline::gpu {
                boost::hash_combine(seed, entry.size);
                boost::hash_combine(seed, entry.format);

+                // The mutable fields {lastAcquiredSequence, megabufferOffset} are deliberately ignored
                return seed;
            }
        };

        std::unordered_set<BufferViewStorage, BufferViewStorageHash> views; //!< BufferViewStorage(s) that are backed by this Buffer, used for storage and repointing to a new Buffer on deletion

+        u64 sequenceNumber{InitialSequenceNumber}; //!< Sequence number that is incremented after all modifications to the host side `backing` buffer, used to prevent redundant copies of the buffer being stored in the megabuffer by views
+
      public:
        /**
         * @brief A delegate for a strong reference to a Buffer by a BufferView which can be changed to another Buffer transparently
@ -233,7 +243,7 @@ namespace skyline::gpu {
        void Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset);

        /**
-         * @brief Writes data at the specified offset in the buffer
+         * @brief Writes data at the specified offset in the buffer, falling back to GPU side copies if the buffer is host immutable
         * @param pCycle The FenceCycle associated with the current workload, utilised for waiting and flushing semantics
         * @param flushHostCallback Callback to flush and execute all pending GPU work to allow for synchronisation of GPU dirty buffers
         * @param gpuCopyCallback Callback to perform a GPU-side copy for this Write
@ -247,20 +257,20 @@ namespace skyline::gpu {
        BufferView GetView(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format = {});

        /**
-         * @brief Pushes the current buffer contents into the megabuffer (if necessary)
-         * @return The offset of the pushed buffer contents in the megabuffer
+         * @brief Attempts to return the current sequence number and prepare the buffer for read accesses from the returned span
+         * @return The current sequence number and a span of the buffers guest mirror given that the buffer is not GPU dirty, if it is then a zero sequence number is returned
+         * @note The contents of the returned span can be cached safely given the sequence number is unchanged
         * @note The buffer **must** be locked prior to calling this
-         * @note This will only push into the megabuffer when there have been modifications after the previous acquire, otherwise the previous offset will be reused
-         * @note An implicit CPU -> GPU sync will be performed when calling this, an immediate GPU -> CPU sync will also be attempted if the buffer is GPU dirty in the hope that megabuffering can be reenabled
+         * @note An implicit CPU -> GPU sync will be performed when calling this, an immediate GPU -> CPU sync will also be attempted if the buffer is GPU dirty
         */
-        vk::DeviceSize AcquireMegaBuffer(MegaBuffer& megaBuffer);
+        std::pair<u64, span<u8>> AcquireCurrentSequence();

        /**
-         * @brief Forces the buffer contents to be pushed into the megabuffer on the next AcquireMegaBuffer call
+         * @brief Increments the sequence number of the buffer, any further calls to AcquireCurrentSequence will return this new sequence number. See the comment for `sequenceNumber`
         * @note The buffer **must** be locked prior to calling this
-         * @note This **must** be called after any modifications of the backing buffer data
+         * @note This **must** be called after any modifications of the backing buffer data (but not mirror)
         */
-        void InvalidateMegaBuffer();
+        void AdvanceSequence();

        /**
         * @param pCycle The FenceCycle associated with the current workload, utilised for waiting and flushing semantics
@ -270,6 +280,14 @@ namespace skyline::gpu {
         * @note The buffer **must** be kept locked until the span is no longer in use
         */
        span<u8> GetReadOnlyBackingSpan(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback);
+
+        /**
+         * @brief Prevents any further writes to the `backing` host side buffer for the duration of the current cycle, forcing slower inline GPU updates instead
+         * @note The buffer **must** be locked prior to calling this
+         */
+        void MarkHostImmutable(const std::shared_ptr<FenceCycle> &pCycle);
+
+        bool EverHadInlineUpdate() const { return everHadInlineUpdate; }
    };

    /**
@ -278,6 +296,8 @@ namespace skyline::gpu {
     * @note This class conforms to the Lockable and BasicLockable C++ named requirements
     */
    struct BufferView {
+        constexpr static vk::DeviceSize MegaBufferingDisableThreshold{1024 * 128}; //!< The threshold at which the view is considered to be too large to be megabuffered (128KiB)
+
        std::shared_ptr<Buffer::BufferDelegate> bufferDelegate;

        BufferView(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view);
@ -327,10 +347,11 @@ namespace skyline::gpu {

        /**
         * @brief Registers a callback for a usage of this view, it may be called multiple times due to the view being recreated with different backings
+         * @note This will force the buffer to be host immutable for the current cycle, preventing megabuffering and requiring slower GPU inline writes instead
         * @note The callback will be automatically called the first time after registration
         * @note The view **must** be locked prior to calling this
         */
-        void RegisterUsage(const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback);
+        void RegisterUsage(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback);

        /**
         * @brief Reads data at the specified offset in the view
@ -347,10 +368,9 @@ namespace skyline::gpu {
        void Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const;

        /**
-         * @brief Pushes the current buffer contents into the megabuffer (if necessary)
-         * @return The offset of the pushed buffer contents in the megabuffer
+         * @brief If megabuffering is beneficial for the current buffer, pushes its contents into the megabuffer and returns the offset of the pushed data
+         * @return The offset of the pushed buffer contents in the megabuffer, or 0 if megabuffering is not to be used
         * @note The view **must** be locked prior to calling this
-         * @note See Buffer::AcquireMegaBuffer
         */
        vk::DeviceSize AcquireMegaBuffer(MegaBuffer &megaBuffer) const;

--- a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
@ -57,15 +57,20 @@ namespace skyline::gpu {

            // Transfer all views from the overlapping buffer to the new buffer with the new buffer and updated offset, ensuring pointer stability
            vk::DeviceSize overlapOffset{static_cast<vk::DeviceSize>(overlap->guest->begin() - newBuffer->guest->begin())};
-            if (overlapOffset != 0) {
-                // This is a slight hack as we really shouldn't be changing the underlying set elements without a rehash but without writing our own set impl this is the best we can do
-                for (auto it{overlap->views.begin()}; it != overlap->views.end(); it++)
+            for (auto it{overlap->views.begin()}; it != overlap->views.end(); it++) {
+                if (overlapOffset)
+                    // This is a slight hack as we really shouldn't be changing the underlying non-mutable set elements without a rehash but without writing our own set impl this is the best we can do
                    const_cast<Buffer::BufferViewStorage *>(&*it)->offset += overlapOffset;

-                // All current hashes are invalidated by above loop so rehash the container
-                overlap->views.rehash(0);
+                // Reset the sequence number to the initial one, if the new buffer was created from any GPU dirty overlaps then the new buffer's sequence will be incremented past this thus forcing a reacquire if necessary
+                // This is fine to do in the set since the hash and operator== do not use this value
+                it->lastAcquiredSequence = Buffer::InitialSequenceNumber;
            }

+            if (overlapOffset)
+                // All current hashes are invalidated by above loop if overlapOffset is nonzero so rehash the container
+                overlap->views.rehash(0);
+
            // Merge the view sets, this will keep pointer stability hence avoiding any reallocation
            newBuffer->views.merge(overlap->views);

--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@ -219,7 +219,7 @@ namespace skyline::gpu::interconnect {
                gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence());

                for (const auto &delegate : attachedBuffers)
-                    delegate->buffer->InvalidateMegaBuffer();
+                    delegate->view->megabufferOffset = 0;

                nodes.clear();
                attachedTextures.clear();
--- a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
@ -1118,7 +1118,7 @@ namespace skyline::gpu::interconnect {
                                .range = view->view->size
                            };
                        } else {
-                            view.RegisterUsage([descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                            view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                                *descriptor = vk::DescriptorBufferInfo{
                                    .buffer = buffer->GetBacking(),
                                    .offset = view.offset,
@ -1151,8 +1151,10 @@ namespace skyline::gpu::interconnect {
                        auto view{GetSsboViewFromDescriptor(storageBuffer, pipelineStage.constantBuffers)};

                        std::scoped_lock lock{view};
-                        view->buffer->MarkGpuDirty(); // SSBOs may be written to by the GPU so mark as dirty (will also disable megabuffering)
-                        view.RegisterUsage([descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                        if (storageBuffer.is_written)
+                            view->buffer->MarkGpuDirty();
+
+                        view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                            *descriptor = vk::DescriptorBufferInfo{
                                .buffer = buffer->GetBacking(),
                                .offset = view.offset,
@ -2842,7 +2844,7 @@ namespace skyline::gpu::interconnect {
                        boundIndexBuffer->handle = executor.megaBuffer.GetBacking();
                        boundIndexBuffer->offset = megaBufferOffset;
                    } else {
-                        indexBufferView.RegisterUsage([=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                        indexBufferView.RegisterUsage(executor.cycle, [=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                            boundIndexBuffer->handle = buffer->GetBacking();
                            boundIndexBuffer->offset = view.offset;
                        });
@ -2877,7 +2879,7 @@ namespace skyline::gpu::interconnect {
                        boundVertexBuffers->handles[index] = executor.megaBuffer.GetBacking();
                        boundVertexBuffers->offsets[index] = megaBufferOffset;
                    } else {
-                        vertexBufferView.RegisterUsage([handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                        vertexBufferView.RegisterUsage(executor.cycle, [handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                            *handle = buffer->GetBacking();
                            *offset = view.offset;
                        });