Implement overhead-free sequenced buffer updates with megabuffers

Previously constant buffer updates would be handled on the CPU and only the end result would be synced to the GPU before execute. This caused issues as if the constant buffer contents were changed between each draw in a renderpass (e.g. text rendering) the draws themselves would only see the final resulting constant buffer. We had earlier tried to fix this by using vkCmdUpdateBuffer however this caused significant performance loss due to an oversight in Adreno drivers. We could have worked around this simply by using vkCmdCopyBuffer however there would still be a performance loss due to renderpasses being split up with copies in between. To avoid this we introduce 'megabuffers', a brand new technique not done before in any other Switch emulators. Rather than replaying the copies in sequence on the GPU, we take advantage of the fact that buffers are generally small in order to replay buffers on the GPU instead. Each write and subsequent usage of a buffer will cause a copy of the buffer with that write, and all prior writes applied, to be pushed into the megabuffer; this way at the start of execute the megabuffer will hold all used states of the buffer simultaneously. Draws then reference these individual states in sequence to allow everything to work without any copies. In order to support this buffers have been moved to an immediate sync model, with synchronisation being done at usage-time rather than execute (in order to keep contents properly sequenced) and GPU-side writes now need to be explicitly marked (since they prevent megabuffering). It should also be noted that a fallback path using vkCmdCopyBuffer exists for the cases where buffers are too large or GPU dirty.
2024-12-26 13:21:50 +01:00 · 2022-04-23 18:10:39 +01:00 · 2022-04-23 18:10:39 +01:00 · de796cd2cd
commit de796cd2cd
parent 0d9992cb8e
7 changed files with 363 additions and 59 deletions
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@ -8,6 +8,11 @@
 #include "buffer.h"

 namespace skyline::gpu {
+    void Buffer::TryEnableMegaBuffering() {
+        // Only buffers strictly smaller than the threshold are eligible for megabuffering
+        const bool belowThreshold{backing.size() < MegaBufferingDisableThreshold};
+        megaBufferingEnabled = belowThreshold;
+
+        // Drop any previously recorded megabuffer offset as part of the reset
+        megaBufferOffset = 0;
+    }
+
    void Buffer::SetupGuestMappings() {
        u8 *alignedData{util::AlignDown(guest->data(), PAGE_SIZE)};
        size_t alignedSize{static_cast<size_t>(util::AlignUp(guest->data() + guest->size(), PAGE_SIZE) - alignedData)};
@ -28,10 +33,64 @@ namespace skyline::gpu {
    }

    Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) {
+        // The host backing is allocated with the guest's size above; megabuffering eligibility is decided from that size before the guest mappings are set up
+        TryEnableMegaBuffering();
        SetupGuestMappings();
    }

+    /**
+     * @brief Creates a guest-backed Buffer and seeds its backing with the contents of the supplied source buffers
+     * @param pCycle Compared against each source buffer's attached cycle to choose between syncing a GPU dirty source back and propagating its GPU dirtiness
+     * @param srcBuffers Source buffers whose guest mappings overlap with `guest`, each one is locked while it is being read
+     */
+    Buffer::Buffer(GPU &gpu, const std::shared_ptr<FenceCycle> &pCycle, GuestBuffer guest, span<std::shared_ptr<Buffer>> srcBuffers) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) {
+        std::scoped_lock bufLock{*this};
+
+        TryEnableMegaBuffering();
+        SetupGuestMappings();
+
+        // Source buffers don't necessarily fully overlap with us so we have to perform a sync here to prevent any gaps
+        SynchronizeHost(false);
+
+        // Copies between two buffers based off of their mappings in guest memory
+        // The guest base addresses are used to work out the offset of one buffer inside the other, the copy size is clamped to what fits in both
+        auto copyBuffer{[](auto dstGuest, auto srcGuest, auto dstPtr, auto srcPtr) {
+            if (dstGuest.begin().base() <= srcGuest.begin().base()) {
+                // Destination starts at or before the source: write the start of the source at the matching offset in the destination
+                size_t dstOffset{static_cast<size_t>(srcGuest.begin().base() - dstGuest.begin().base())};
+                size_t copySize{std::min(dstGuest.size() - dstOffset, srcGuest.size())};
+                std::memcpy(dstPtr + dstOffset, srcPtr, copySize);
+            } else if (dstGuest.begin().base() > srcGuest.begin().base()) {
+                // Destination starts after the source: skip the leading part of the source that lies before the destination
+                size_t srcOffset{static_cast<size_t>(dstGuest.begin().base() - srcGuest.begin().base())};
+                size_t copySize{std::min(dstGuest.size(), srcGuest.size() - srcOffset)};
+                std::memcpy(dstPtr, srcPtr + srcOffset, copySize);
+            }
+        }};
+
+        // Transfer data/state from source buffers
+        for (const auto &srcBuffer : srcBuffers) {
+            std::scoped_lock lock{*srcBuffer};
+            if (srcBuffer->guest) {
+                // Megabuffering is only kept enabled if every guest-backed source also had it enabled
+                if (!srcBuffer->megaBufferingEnabled)
+                    megaBufferingEnabled = false;
+
+                if (srcBuffer->dirtyState == Buffer::DirtyState::GpuDirty) {
+                    // If the source buffer is GPU dirty we cannot directly copy over its GPU backing contents
+
+                    // Only sync back the buffer if it's not attached to the current fence cycle, otherwise propagate the GPU dirtiness
+                    // NOTE(review): `owner_before` is a strict ordering rather than an equality test, so `!owner_before(pCycle)` is also true when the source's cycle *is* `pCycle` - confirm this branch matches the intent stated above
+                    if (!srcBuffer->cycle.owner_before(pCycle)) {
+                        // Perform a GPU -> CPU sync on the source then do a CPU -> GPU sync for the region occupied by the source
+                        // This is required since if we were created from two buffers: one GPU dirty in the current cycle, and one GPU dirty in the previous cycle, if we marked ourselves as CPU dirty here then the GPU dirtiness from the current cycle buffer would be ignored and cause writes to be missed
+                        srcBuffer->SynchronizeGuest(true);
+                        copyBuffer(guest, *srcBuffer->guest, backing.data(), srcBuffer->mirror.data());
+                    } else {
+                        MarkGpuDirty();
+                    }
+                } else if (srcBuffer->dirtyState == Buffer::DirtyState::Clean) {
+                    // For clean buffers we can just copy over the GPU backing data directly
+                    // This is necessary since clean buffers may not have matching GPU/CPU data in the case of non-megabuffered inline updates
+                    copyBuffer(guest, *srcBuffer->guest, backing.data(), srcBuffer->backing.data());
+                }
+
+                // CPU dirty buffers are already synchronized in the initial SynchronizeHost call so don't need special handling
+            }
+        }
+    }
+
    Buffer::Buffer(GPU &gpu, vk::DeviceSize size) : gpu(gpu), backing(gpu.memory.AllocateBuffer(size)) {
+        // No guest is supplied to this constructor, megabuffering eligibility is still decided from the size of the backing
+        TryEnableMegaBuffering();
        dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean
    }

@ -47,6 +106,8 @@ namespace skyline::gpu {
    void Buffer::MarkGpuDirty() {
+        // Nothing to do if the buffer is already GPU dirty or has no guest backing it
        if (dirtyState == DirtyState::GpuDirty || !guest)
            return;
+
+        megaBufferingEnabled = false; // We can no longer megabuffer this buffer after it has been written by the GPU
+        // Update the guest trap on the buffer's regions before recording the new state
        gpu.state.nce->RetrapRegions(*trapHandle, false);
        dirtyState = DirtyState::GpuDirty;
    }
@ -61,6 +122,15 @@ namespace skyline::gpu {
        }
    }

+    bool Buffer::PollFence() {
+        // Promote the attached cycle (if it still exists) for the duration of the poll
+        auto heldCycle{cycle.lock()};
+        if (!heldCycle || !heldCycle->Poll())
+            return false; // Either nothing is attached or the attached cycle hasn't signalled yet
+
+        // The cycle has signalled so it no longer needs to be tracked by this buffer
+        cycle.reset();
+        return true;
+    }
+
    void Buffer::SynchronizeHost(bool rwTrap) {
        if (dirtyState != DirtyState::CpuDirty || !guest)
            return; // If the buffer has not been modified on the CPU or there's no guest buffer, there is no need to synchronize it
@ -69,9 +139,13 @@ namespace skyline::gpu {

        TRACE_EVENT("gpu", "Buffer::SynchronizeHost");

+        // If we have performed a CPU->GPU sync and megabuffering is enabled for this buffer the megabuffer copy of the buffer will no longer be up-to-date
+        InvalidateMegaBuffer();
+
        std::memcpy(backing.data(), mirror.data(), mirror.size());

        if (rwTrap) {
+            megaBufferingEnabled = false; // We can't megabuffer a buffer written by the GPU
            gpu.state.nce->RetrapRegions(*trapHandle, false);
            dirtyState = DirtyState::GpuDirty;
        } else {
@ -89,9 +163,13 @@ namespace skyline::gpu {

        TRACE_EVENT("gpu", "Buffer::SynchronizeHostWithCycle");

+        // If we have performed a CPU->GPU sync and megabuffering is enabled for this buffer the megabuffer copy of the buffer will no longer be up-to-date so force a recreation
+        InvalidateMegaBuffer();
+
        std::memcpy(backing.data(), mirror.data(), mirror.size());

        if (rwTrap) {
+            megaBufferingEnabled = false; // We can't megabuffer a buffer written by the GPU
            gpu.state.nce->RetrapRegions(*trapHandle, false);
            dirtyState = DirtyState::GpuDirty;
        } else {
@ -100,11 +178,13 @@ namespace skyline::gpu {
        }
    }

-    void Buffer::SynchronizeGuest(bool skipTrap, bool skipFence) {
+    void Buffer::SynchronizeGuest(bool skipTrap, bool nonBlocking) {
        if (dirtyState != DirtyState::GpuDirty || !guest)
            return; // If the buffer has not been used on the GPU or there's no guest buffer, there is no need to synchronize it

-        if (!skipFence)
+        if (nonBlocking && !PollFence())
+            return;
+        else if (!nonBlocking)
            WaitOnFence();

        TRACE_EVENT("gpu", "Buffer::SynchronizeGuest");
@ -113,7 +193,9 @@ namespace skyline::gpu {

        if (!skipTrap)
            gpu.state.nce->RetrapRegions(*trapHandle, true);
+
        dirtyState = DirtyState::Clean;
+        TryEnableMegaBuffering(); // If megaBuffering was disabled due to potential GPU dirtiness we can safely try to re-enable it now that the buffer is clean
    }

    /**
@ -138,18 +220,45 @@ namespace skyline::gpu {
        cycle = pCycle;
    }

-    void Buffer::Read(span<u8> data, vk::DeviceSize offset) {
-        if (dirtyState == DirtyState::CpuDirty || dirtyState == DirtyState::Clean)
+    // Copies `data.size()` bytes starting at `offset` into `data`: from the mirror when the buffer is CPU dirty or clean, otherwise from the backing after a GPU -> CPU sync
+    void Buffer::Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
+        if (dirtyState == DirtyState::CpuDirty || dirtyState == DirtyState::Clean) {
            std::memcpy(data.data(), mirror.data() + offset, data.size());
-        else if (dirtyState == DirtyState::GpuDirty)
+        } else if (dirtyState == DirtyState::GpuDirty) {
+            // If this buffer was attached to the current cycle, flush all pending host GPU work and wait to ensure that we read valid data
+            // NOTE(review): `owner_before` is a strict ordering rather than an equality test, so this condition can also hold for a cycle other than `pCycle` - confirm that is acceptable
+            if (!cycle.owner_before(pCycle))
+                flushHostCallback();
+
+            SynchronizeGuest();
+
            std::memcpy(data.data(), backing.data() + offset, data.size());
+        }
    }

-    void Buffer::Write(span<u8> data, vk::DeviceSize offset) {
-        if (dirtyState == DirtyState::CpuDirty || dirtyState == DirtyState::Clean)
-            std::memcpy(mirror.data() + offset, data.data(), data.size());
-        if (dirtyState == DirtyState::GpuDirty || dirtyState == DirtyState::Clean)
+    // Writes `data` at `offset`: the buffer is first synchronised in whichever direction it is dirty, the mirror is then always updated while the backing is updated either directly (megabuffering enabled) or through `gpuCopyCallback`
+    void Buffer::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) {
+        InvalidateMegaBuffer(); // Since we're writing to the backing buffer the megabuffer contents will require refresh
+
+        if (dirtyState == DirtyState::CpuDirty) {
+            SynchronizeHostWithCycle(pCycle); // Perform a CPU -> GPU sync to ensure correct ordering of writes
+        } else if (dirtyState == DirtyState::GpuDirty) {
+            // If this buffer was attached to the current cycle, flush all pending host GPU work and wait to ensure that writes are correctly ordered
+            // NOTE(review): `owner_before` is a strict ordering rather than an equality test, so this condition can also hold for a cycle other than `pCycle` - confirm that is acceptable
+            if (!cycle.owner_before(pCycle))
+                flushHostCallback();
+
+            SynchronizeGuest();
+        }
+
+        if (dirtyState != DirtyState::Clean)
+            Logger::Error("Attempting to write to a dirty buffer"); // This should never happen since we do syncs in both directions above
+
+        std::memcpy(mirror.data() + offset, data.data(), data.size()); // Always copy to mirror since any CPU side reads will need the up-to-date contents
+
+        if (megaBufferingEnabled) {
+            // If megabuffering is enabled then we don't need to do any special sequencing here, we can write directly to the backing and the sequencing for it will be handled at usage time
            std::memcpy(backing.data() + offset, data.data(), data.size());
+        } else {
+            // Fallback to a GPU-side inline update for the buffer contents to ensure correct sequencing with draws
+            // The backing isn't touched from the CPU on this path, updating it is left entirely to the callback
+            gpuCopyCallback();
+        }
    }

    Buffer::BufferViewStorage::BufferViewStorage(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format) : offset(offset), size(size), format(format) {}
@ -207,6 +316,25 @@ namespace skyline::gpu {
        return BufferView{shared_from_this(), &views.back()};
    }

+    vk::DeviceSize Buffer::AcquireMegaBuffer() {
+        // Opportunistic non-blocking GPU -> CPU sync, if it goes through the buffer becomes clean and megabuffering may be re-enabled
+        SynchronizeGuest(false, true);
+
+        if (megaBufferingEnabled) {
+            // Pushes into the megabuffer are sourced from the GPU backing, so bring it up-to-date with any CPU-side changes first
+            SynchronizeHost();
+
+            // A zero offset means there's no up-to-date megabuffer copy of the current contents, push a fresh page-aligned one in that case
+            if (!megaBufferOffset)
+                megaBufferOffset = gpu.buffer.megaBuffer.Push(backing, true);
+
+            return megaBufferOffset;
+        }
+
+        return 0; // Megabuffering is disabled for this buffer
+    }
+
+    void Buffer::InvalidateMegaBuffer() {
+        megaBufferOffset = vk::DeviceSize{}; // A zero offset marks the absence of an up-to-date megabuffer entry
+    }
+
    BufferView::BufferView(std::shared_ptr<Buffer> buffer, Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared<Buffer::BufferDelegate>(std::move(buffer), view)) {}

    void BufferView::AttachCycle(const std::shared_ptr<FenceCycle> &cycle) {
@ -230,11 +358,21 @@ namespace skyline::gpu {
        }
    }

-    void BufferView::Read(span<u8> data, vk::DeviceSize offset) const {
-        bufferDelegate->buffer->Read(data, offset + bufferDelegate->view->offset);
+    // Forwards to Buffer::Read on the underlying buffer, `offset` is relative to the view so the view's own offset is added to it
+    void BufferView::Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
+        bufferDelegate->buffer->Read(pCycle, flushHostCallback, data, offset + bufferDelegate->view->offset);
    }

-    void BufferView::Write(span<u8> data, vk::DeviceSize offset) const {
-        bufferDelegate->buffer->Write(data, offset + bufferDelegate->view->offset);
+    // Forwards to Buffer::Write on the underlying buffer, `offset` is relative to the view so the view's own offset is added to it
+    void BufferView::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const {
+        bufferDelegate->buffer->Write(pCycle, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
+    }
+
+    vk::DeviceSize BufferView::AcquireMegaBuffer() const {
+        const vk::DeviceSize bufferBase{bufferDelegate->buffer->AcquireMegaBuffer()};
+
+        // A zero base means the buffer can't be megabuffered and has to be passed through as-is, any other base is shifted to where this view begins
+        return bufferBase ? bufferBase + bufferDelegate->view->offset : 0;
    }
 }
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@ -32,6 +32,16 @@ namespace skyline::gpu {
            GpuDirty, //!< The GPU buffer has been modified but the CPU mappings have not been updated
        } dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU buffer

+        constexpr static vk::DeviceSize MegaBufferingDisableThreshold{0x10'000}; //!< The threshold at which the buffer is considered to be too large to be megabuffered (64KiB)
+
+        bool megaBufferingEnabled{}; //!< If megabuffering can be used for this buffer at the current moment, is set based on MegaBufferingDisableThreshold and dirty state
+        vk::DeviceSize megaBufferOffset{}; //!< The offset into the megabuffer where the current buffer contents are stored, 0 if there is no up-to-date megabuffer entry for the current buffer contents
+
+        /**
+         * @brief Resets megabuffering state based off of the buffer size
+         */
+        void TryEnableMegaBuffering();
+
      public:
        /**
         * @brief Storage for all metadata about a specific view into the buffer, used to prevent redundant view creation and duplication of VkBufferView(s)
@ -99,6 +109,13 @@ namespace skyline::gpu {

        Buffer(GPU &gpu, GuestBuffer guest);

+        /**
+         * @brief Creates a Buffer that is pre-synchronised with the contents of the input buffers
+         * @param pCycle The FenceCycle associated with the current workload, utilised for synchronising GPU dirty buffers
+         * @param srcBuffers Span of overlapping source buffers
+         */
+        Buffer(GPU &gpu, const std::shared_ptr<FenceCycle> &pCycle, GuestBuffer guest, span<std::shared_ptr<Buffer>> srcBuffers);
+
        /**
         * @brief Creates a host-only Buffer which isn't backed by any guest buffer
         * @note The created buffer won't have a mirror so any operations cannot depend on a mirror existing
@ -144,6 +161,13 @@ namespace skyline::gpu {
         */
        void WaitOnFence();

+        /**
+         * @brief Polls a fence cycle if it exists and resets it if signalled
+         * @return Whether the fence cycle was signalled
+         * @note The buffer **must** be locked prior to calling this
+         */
+        bool PollFence();
+
        /**
         * @brief Synchronizes the host buffer with the guest
         * @param rwTrap If true, the guest buffer will be read/write trapped rather than only being write trapped which is more efficient than calling MarkGpuDirty directly after
@ -162,10 +186,10 @@ namespace skyline::gpu {
        /**
         * @brief Synchronizes the guest buffer with the host buffer
         * @param skipTrap If true, setting up a CPU trap will be skipped and the dirty state will be Clean/CpuDirty
-         * @param skipFence If true, waiting on the currently attached fence will be skipped
+         * @param nonBlocking If true, the call will return immediately if the fence is not signalled, skipping the sync
         * @note The buffer **must** be locked prior to calling this
         */
-        void SynchronizeGuest(bool skipTrap = false, bool skipFence = false);
+        void SynchronizeGuest(bool skipTrap = false, bool nonBlocking = false);

        /**
         * @brief Synchronizes the guest buffer with the host buffer when the FenceCycle is signalled
@ -176,19 +200,40 @@ namespace skyline::gpu {

        /**
         * @brief Reads data at the specified offset in the buffer
+         * @param pCycle The FenceCycle associated with the current workload, utilised for waiting and flushing semantics
+         * @param flushHostCallback Callback to flush and execute all pending GPU work to allow for synchronisation of GPU dirty buffers
         */
-        void Read(span<u8> data, vk::DeviceSize offset);
+        void Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset);

        /**
         * @brief Writes data at the specified offset in the buffer
+         * @param pCycle The FenceCycle associated with the current workload, utilised for waiting and flushing semantics
+         * @param flushHostCallback Callback to flush and execute all pending GPU work to allow for synchronisation of GPU dirty buffers
+         * @param gpuCopyCallback Callback to perform a GPU-side copy for this Write
         */
-        void Write(span<u8> data, vk::DeviceSize offset);
+        void Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset);

        /**
         * @return A cached or newly created view into this buffer with the supplied attributes
         * @note The buffer **must** be locked prior to calling this
         */
        BufferView GetView(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format = {});
+
+        /**
+         * @brief Pushes the current buffer contents into the megabuffer (if necessary)
+         * @return The offset of the pushed buffer contents in the megabuffer
+         * @note The buffer **must** be locked prior to calling this
+         * @note This will only push into the megabuffer when there have been modifications after the previous acquire, otherwise the previous offset will be reused
+         * @note An implicit CPU -> GPU sync will be performed when calling this, an immediate GPU -> CPU sync will also be attempted if the buffer is GPU dirty in the hope that megabuffering can be reenabled
+         */
+        vk::DeviceSize AcquireMegaBuffer();
+
+        /**
+         * @brief Forces the buffer contents to be pushed into the megabuffer on the next AcquireMegaBuffer call
+         * @note The buffer **must** be locked prior to calling this
+         * @note This **must** be called after any modifications of the backing buffer data
+         */
+        void InvalidateMegaBuffer();
    };

    /**
@ -254,13 +299,23 @@ namespace skyline::gpu {
        /**
         * @brief Reads data at the specified offset in the view
         * @note The view **must** be locked prior to calling this
+         * @note See Buffer::Read
         */
-        void Read(span<u8> data, vk::DeviceSize offset) const;
+        void Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const;

        /**
         * @brief Writes data at the specified offset in the view
         * @note The view **must** be locked prior to calling this
+         * @note See Buffer::Write
         */
-        void Write(span<u8> data, vk::DeviceSize offset) const;
+        void Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const;
+
+        /**
+         * @brief Pushes the current buffer contents into the megabuffer (if necessary)
+         * @return The offset of the pushed buffer contents in the megabuffer
+         * @note The view **must** be locked prior to calling this
+         * @note See Buffer::AcquireMegaBuffer
+         */
+        vk::DeviceSize AcquireMegaBuffer() const;
    };
 }
--- a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
@ -6,7 +6,39 @@
 #include "buffer_manager.h"

 namespace skyline::gpu {
-    BufferManager::BufferManager(GPU &gpu) : gpu(gpu) {}
+    // The free region starts one page into the backing which keeps offset 0 unused (Buffer::AcquireMegaBuffer returns 0 to signal that megabuffering is unavailable)
+    MegaBuffer::MegaBuffer(GPU &gpu) : backing(gpu.memory.AllocateBuffer(Size)), freeRegion(backing.subspan(PAGE_SIZE)) {}
+
+    void MegaBuffer::Reset() {
+        std::scoped_lock guard{mutex};
+
+        // Rewind the allocator to just past the first page, nothing is cleared so earlier pushes simply become overwritable
+        freeRegion = backing.subspan(PAGE_SIZE);
+    }
+
+    // Exposes the Vulkan handle of the backing allocation, the mutex isn't taken as only the handle is read
+    vk::Buffer MegaBuffer::GetBacking() const {
+        return backing.vkBuffer;
+    }
+
+    /**
+     * @brief Copies data into the next free part of the megabuffer and advances the free region past it
+     * @param data The bytes to copy into the megabuffer
+     * @param pageAlign If the start of the allocation should be rounded up to a page boundary
+     * @return The offset of the copied data from the start of the megabuffer backing
+     * @throws exception If the data doesn't fit in the space left after any alignment padding has been applied
+     */
+    vk::DeviceSize MegaBuffer::Push(span<u8> data, bool pageAlign) {
+        std::scoped_lock lock{mutex};
+
+        // Work out where the allocation would start, alignment is applied *before* the capacity check since the padding it introduces consumes free space too
+        auto allocBase{static_cast<size_t>(freeRegion.data() - backing.data())};
+        if (pageAlign)
+            allocBase = util::AlignUp(allocBase, PAGE_SIZE);
+
+        // Checking against the backing rather than the stale free region keeps the subspans below in-bounds, the free region itself is left untouched on failure
+        if (allocBase > backing.size() || data.size() > backing.size() - allocBase)
+            throw exception("Ran out of megabuffer space! Alloc size: 0x{:X}", data.size());
+
+        // Allocate space for data from the (possibly realigned) free region
+        auto resultSpan{backing.subspan(allocBase, data.size())};
+        resultSpan.copy_from(data);
+
+        // Move the free region along
+        freeRegion = backing.subspan(allocBase + data.size());
+        return static_cast<vk::DeviceSize>(allocBase);
+    }
+
+    // The megabuffer is constructed together with the manager from the same GPU instance
+    BufferManager::BufferManager(GPU &gpu) : gpu(gpu), megaBuffer(gpu) {}

    bool BufferManager::BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer) {
        return it->guest->begin().base() < pointer;
@ -49,14 +81,10 @@ namespace skyline::gpu {
                highestAddress = mapping.end().base();
        }

-        auto newBuffer{std::make_shared<Buffer>(gpu, span<u8>(lowestAddress, highestAddress))};
+        auto newBuffer{std::make_shared<Buffer>(gpu, cycle, span<u8>(lowestAddress, highestAddress), overlaps)};
        for (auto &overlap : overlaps) {
            std::scoped_lock overlapLock{*overlap};

-            if (!overlap->cycle.owner_before(cycle))
-                overlap->WaitOnFence(); // We want to only wait on the fence cycle if it's not the current fence cycle
-            overlap->SynchronizeGuest(true, true); // Sync back the buffer before we destroy it
-
            buffers.erase(std::find(buffers.begin(), buffers.end(), overlap));

            // Transfer all views from the overlapping buffer to the new buffer with the new buffer and updated offset
--- a/app/src/main/cpp/skyline/gpu/buffer_manager.h
+++ b/app/src/main/cpp/skyline/gpu/buffer_manager.h
@ -6,6 +6,37 @@
 #include "buffer.h"

 namespace skyline::gpu {
+    /**
+     * @brief A simple linearly allocated GPU-side buffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
+     * @note Allocations are only ever reclaimed all at once through Reset, there is no way to free an individual allocation
+     */
+    class MegaBuffer {
+      private:
+        constexpr static vk::DeviceSize Size{0x6'400'000}; //!< Size in bytes of the megabuffer (100MiB)
+
+        memory::Buffer backing; //!< The backing GPU buffer, this must stay declared before freeRegion as the constructor initialises freeRegion from it
+        std::mutex mutex; //!< Synchronizes access to freeRegion
+        span<u8> freeRegion; //!< Span of unallocated space in the megabuffer
+
+      public:
+        MegaBuffer(GPU &gpu);
+
+        /**
+         * @brief Resets the free region of the megabuffer to its initial state, data is left intact but may be overwritten
+         */
+        void Reset();
+
+        /**
+         * @brief Returns the underlying Vulkan buffer for the megabuffer
+         */
+        vk::Buffer GetBacking() const;
+
+        /**
+         * @brief Pushes data to the megabuffer and returns the offset at which it was written
+         * @param pageAlign Whether the pushed data should be page aligned in the megabuffer
+         * @return The offset of the pushed data relative to the start of the backing buffer
+         * @throws exception If there isn't enough free space left for the data
+         */
+        vk::DeviceSize Push(span<u8> data, bool pageAlign = false);
+    };
+
    /**
     * @brief The Buffer Manager is responsible for maintaining a global view of buffers being mapped from the guest to the host, any lookups and creation of host buffer from equivalent guest buffer alongside reconciliation of any overlaps with existing textures
     */
@ -21,6 +52,8 @@ namespace skyline::gpu {
        static bool BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer);

      public:
+        MegaBuffer megaBuffer; //!< The megabuffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
+
        BufferManager(GPU &gpu);

        /**
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@ -30,18 +30,20 @@ namespace skyline::gpu::interconnect {

    void CommandExecutor::AttachTexture(TextureView *view) {
        auto texture{view->texture.get()};
-        if (!syncTextures.contains(texture)) {
+        // Only the first attachment of a texture waits on its fence and moves it over to the executor's cycle
+        if (!attachedTextures.contains(texture)) {
            texture->WaitOnFence();
            texture->cycle = cycle;
-            syncTextures.emplace(texture);
+            attachedTextures.emplace(texture);
        }
+        // The view itself is attached to the cycle on every call, regardless of whether its texture was already tracked
        cycle->AttachObject(view->shared_from_this());
    }

    void CommandExecutor::AttachBuffer(BufferView &view) {
-        if (!syncBuffers.contains(view.bufferDelegate)) {
+        // The CPU -> GPU sync is requested on every attach rather than just the first, SynchronizeHost returns early unless the buffer is CPU dirty
+        view->buffer->SynchronizeHost();
+
+        // The cycle is only attached the first time a given delegate is seen
+        if (!attachedBuffers.contains(view.bufferDelegate)) {
            view.AttachCycle(cycle);
-            syncBuffers.emplace(view.bufferDelegate);
+            attachedBuffers.emplace(view.bufferDelegate);
        }
    }

@ -142,16 +144,13 @@ namespace skyline::gpu::interconnect {
                    .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
                });

-                for (auto texture : syncTextures) {
+                for (auto texture : attachedTextures) {
                    texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true);
                    texture->MarkGpuDirty();
                }

-                for (const auto& delegate : syncBuffers) {
-                    delegate->buffer->SynchronizeHostWithCycle(cycle, true);
-                    delegate->buffer->MarkGpuDirty();
+                for (const auto& delegate : attachedBuffers)
                    delegate->usageCallback = nullptr;
-                }

                vk::RenderPass lRenderPass;
                u32 subpassIndex;
@ -182,11 +181,16 @@ namespace skyline::gpu::interconnect {
                commandBuffer.end();
                gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence());

+                for (const auto& delegate : attachedBuffers)
+                    delegate->buffer->InvalidateMegaBuffer();
+
                nodes.clear();
-                syncTextures.clear();
-                syncBuffers.clear();
+                attachedTextures.clear();
+                attachedBuffers.clear();

                cycle = activeCommandBuffer.Reset();
+
+                gpu.buffer.megaBuffer.Reset();
            }
        }
    }
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@ -19,10 +19,10 @@ namespace skyline::gpu::interconnect {
        boost::container::stable_vector<node::NodeVariant> nodes;
        node::RenderPassNode *renderPass{};
        size_t subpassCount{}; //!< The number of subpasses in the current render pass
-        std::unordered_set<Texture *> syncTextures; //!< All textures that need to be synced prior to and after execution
+        std::unordered_set<Texture *> attachedTextures; //!< All textures that need to be synced prior to and after execution

        using SharedBufferDelegate = std::shared_ptr<Buffer::BufferDelegate>;
-        std::unordered_set<SharedBufferDelegate> syncBuffers; //!< All buffers that need to be synced prior to and after execution
+        std::unordered_set<SharedBufferDelegate> attachedBuffers; //!< All buffers that are attached to the current execution

        /**
         * @return If a new render pass was created by the function or the current one was reused as it was compatible
--- a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
@ -618,10 +618,13 @@ namespace skyline::gpu::interconnect {
             * @note This must only be called when the GuestBuffer is resolved correctly
             */
            template<typename T>
-            T Read(size_t offset) const {
+            // NOTE(review): despite its name, dstOffset is forwarded as the offset within the view to read *from*
+            T Read(CommandExecutor &pExecutor, size_t dstOffset) const {
                T object;
                std::scoped_lock lock{view};
-                view.Read(span<T>(object).template cast<u8>(), offset);
+                // The flush callback only logs a warning for now, it doesn't flush any pending GPU work
+                view.Read(pExecutor.cycle, []() {
+                    // TODO: here we should trigger an execute, however that doesn't currently work due to Read being called mid-draw and attached objects not handling this case
+                    Logger::Warn("GPU dirty buffer reads for attached buffers are unimplemented");
+                }, span<T>(object).template cast<u8>(), dstOffset);
                return object;
            }

@ -630,9 +633,26 @@ namespace skyline::gpu::interconnect {
             * @note This must only be called when the GuestBuffer is resolved correctly
             */
            template<typename T>
-            void Write(span<T> buf, size_t offset) {
+            void Write(CommandExecutor &pExecutor, MegaBuffer &megaBuffer, span<T> buf, size_t dstOffset) {
+                // Writes buf into the view at dstOffset; the second callback passed to view.Write stages the data in the megabuffer and records a GPU-side copy into the view's backing buffer
+                auto srcCpuBuf{buf.template cast<u8>()};
+
                std::scoped_lock lock{view};
-                view.Write(buf.template cast<u8>(), offset);
+                view.Write(pExecutor.cycle, []() {
+                    // TODO: see Read()
+                    Logger::Warn("GPU dirty buffer reads for attached buffers are unimplemented");
+                }, [&megaBuffer, &pExecutor, srcCpuBuf, dstOffset, view = this->view]() {
+                    auto srcGpuOffset{megaBuffer.Push(srcCpuBuf)};
+                    auto srcGpuBuf{megaBuffer.GetBacking()};
+                    // Capture only the byte count rather than the span itself: the recorded command may run after this call returns, so it must not hold on to the caller's memory
+                    pExecutor.AddOutsideRpCommand([view, srcGpuBuf, srcGpuOffset, dstOffset, copySize = srcCpuBuf.size_bytes()](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
+                        std::scoped_lock lock{view};
+                        vk::BufferCopy copyRegion{ // Designators follow VkBufferCopy's declaration order (srcOffset, dstOffset, size) as C++20 requires
+                            .srcOffset = srcGpuOffset,
+                            .dstOffset = view->view->offset + dstOffset,
+                            .size = copySize
+                        };
+                        commandBuffer.copyBuffer(srcGpuBuf, view->buffer->GetBacking(), copyRegion);
+                    });
+                }, srcCpuBuf, dstOffset);
            }
        };
        ConstantBuffer constantBufferSelector; //!< The constant buffer selector is used to bind a constant buffer to a stage or update data in it
@ -710,7 +730,7 @@ namespace skyline::gpu::interconnect {

        void ConstantBufferUpdate(std::vector<u32> data, u32 offset) {
            auto constantBuffer{GetConstantBufferSelector().value()};
-            constantBuffer.Write<u32>(data, offset);
+            constantBuffer.Write<u32>(executor, gpu.buffer.megaBuffer, data, offset); // Write now also takes the executor and the GPU's megabuffer alongside the data and offset
        }

        /* Shader Program */
@ -869,7 +889,7 @@ namespace skyline::gpu::interconnect {
            };

            auto &cbuf{constantBuffers[descriptor.cbuf_index]};
-            auto ssbo{cbuf.Read<SsboDescriptor>(descriptor.cbuf_offset)};
+            auto ssbo{cbuf.Read<SsboDescriptor>(executor, descriptor.cbuf_offset)};

            auto mappings{channelCtx.asCtx->gmmu.TranslateRange(ssbo.iova, ssbo.size)};
            if (mappings.size() != 1)
@ -1024,15 +1044,27 @@ namespace skyline::gpu::interconnect {
                        });

                        auto view{pipelineStage.constantBuffers[constantBuffer.index].view};
+
                        std::scoped_lock lock(view);
-                        view.RegisterUsage([descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
-                            *descriptor = vk::DescriptorBufferInfo{
-                                .buffer = buffer->GetBacking(),
-                                .offset = view.offset,
-                                .range = view.size,
+                        if (auto megaBufferOffset{view.AcquireMegaBuffer()}) {
+                            // If the buffer is megabuffered, we read our data from the megabuffer (which stays consistent throughout a single execution) rather than from the underlying buffer, so we can skip registering usage
+                            bufferDescriptors[bufferIndex] = vk::DescriptorBufferInfo{
+                                .buffer = gpu.buffer.megaBuffer.GetBacking(),
+                                .offset = megaBufferOffset,
+                                .range = view->view->size
                            };
-                        });
+                        } else {
+                            view.RegisterUsage([descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                                *descriptor = vk::DescriptorBufferInfo{
+                                    .buffer = buffer->GetBacking(),
+                                    .offset = view.offset,
+                                    .range = view.size,
+                                };
+                            });
+                        }
+
                        executor.AttachBuffer(view);
+                        bufferIndex++;
                    }
                }

@ -1053,7 +1085,9 @@ namespace skyline::gpu::interconnect {
                        });

                        auto view{GetSsboViewFromDescriptor(storageBuffer, pipelineStage.constantBuffers)};
+
                        std::scoped_lock lock{view};
+                        view->buffer->MarkGpuDirty(); // SSBOs may be written to by the GPU so mark as dirty (will also disable megabuffering)
                        view.RegisterUsage([descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                            *descriptor = vk::DescriptorBufferInfo{
                                .buffer = buffer->GetBacking(),
@ -1105,7 +1139,7 @@ namespace skyline::gpu::interconnect {
                                u32 textureIndex : 20;
                                u32 samplerIndex : 12;
                            };
-                        } handle{constantBuffer.Read<u32>(texture.cbuf_offset)};
+                        } handle{constantBuffer.Read<u32>(executor, texture.cbuf_offset)};

                        auto sampler{GetSampler(handle.samplerIndex)};
                        auto textureView{GetPoolTextureView(handle.textureIndex)};
@ -2634,10 +2668,16 @@ namespace skyline::gpu::interconnect {
                    std::scoped_lock lock(indexBufferView);

                    boundIndexBuffer->type = indexBuffer.type;
-                    indexBufferView.RegisterUsage([=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
-                        boundIndexBuffer->handle = buffer->GetBacking();
-                        boundIndexBuffer->offset = view.offset;
-                    });
+                    if (auto megaBufferOffset{indexBufferView.AcquireMegaBuffer()}) {
+                        // If the buffer is megabuffered, we read our data from the megabuffer (which stays consistent throughout a single execution) rather than from the underlying buffer, so we can skip registering usage
+                        boundIndexBuffer->handle = gpu.buffer.megaBuffer.GetBacking();
+                        boundIndexBuffer->offset = megaBufferOffset;
+                    } else {
+                        indexBufferView.RegisterUsage([=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                            boundIndexBuffer->handle = buffer->GetBacking();
+                            boundIndexBuffer->offset = view.offset;
+                        });
+                    }

                    executor.AttachBuffer(indexBufferView);
                }
@ -2662,11 +2702,17 @@ namespace skyline::gpu::interconnect {
                        vertexBindingDivisorsDescriptions.push_back(vertexBuffer->bindingDivisorDescription);

                    std::scoped_lock vertexBufferLock(vertexBufferView);
-                    vertexBufferView.RegisterUsage([handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
-                        *handle = buffer->GetBacking();
-                        *offset = view.offset;
-                    });

+                    if (auto megaBufferOffset{vertexBufferView.AcquireMegaBuffer()}) {
+                        // If the buffer is megabuffered, we read our data from the megabuffer (which stays consistent throughout a single execution) rather than from the underlying buffer, so we can skip registering usage
+                        boundVertexBuffers->handles[index] = gpu.buffer.megaBuffer.GetBacking();
+                        boundVertexBuffers->offsets[index] = megaBufferOffset;
+                    } else {
+                        vertexBufferView.RegisterUsage([handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                            *handle = buffer->GetBacking();
+                            *offset = view.offset;
+                        });
+                    }
                    executor.AttachBuffer(vertexBufferView);
                }
            }