Redesign buffer megabuffering

Due to the frequency at which it is called, megabuffering performance is critical to the performance of the entire emulator, especially in high-drawcall-count scenarios. After the view redesign, megabuffering on a per-view level was no longer possible or desirable, and thus megabuffering was modified to just copy for every usage of a view. This worked great at the time since there were other bottlenecks; however, gpu-new has since removed almost all of them and megabuffering is now a major sore point. Fix this by megabuffering small chunks and storing them in a page-table-like structure within the buffer; these chunks can be referenced by multiple views and will be smartly invalidated whenever the sequence number or execution number changes to avoid any sequencing issues. In addition to this, to help the case where almost the whole buffer is read every single frame across a set of multiple views, an optimisation to skip the chunked tracking and use one large single megabuffer allocation and one single memcpy has been introduced. This reduces the overall amount of time spent in memcpy since large memcpys are quicker.
2025-02-17 00:56:23 +01:00 · 2022-10-21 22:10:09 +01:00 · 2022-10-21 22:10:09 +01:00 · e45e7546c8
commit e45e7546c8
parent 7ea9aa52f5
2 changed files with 65 additions and 34 deletions
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@ -8,6 +8,15 @@
 #include "buffer.h"

 namespace skyline::gpu {
+    void Buffer::ResetMegabufferState() {
+        if (megaBufferTableUsed)
+            megaBufferTableValidity.reset();
+
+        megaBufferTableUsed = false;
+        megaBufferViewAccumulatedSize = 0;
+        unifiedMegaBuffer = {};
+    }
+
    void Buffer::SetupGuestMappings() {
        u8 *alignedData{util::AlignDown(guest->data(), constant::PageSize)};
        size_t alignedSize{static_cast<size_t>(util::AlignUp(guest->data() + guest->size(), constant::PageSize) - alignedData)};
@ -285,39 +294,58 @@ namespace skyline::gpu {
            return {};
    }

-    BufferBinding Buffer::TryMegaBufferView(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, size_t executionNumber,
+    BufferBinding Buffer::TryMegaBufferView(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, u32 executionNumber,
                                            vk::DeviceSize offset, vk::DeviceSize size) {
        if (!everHadInlineUpdate && sequenceNumber < FrequentlySyncedThreshold)
            // Don't megabuffer buffers that have never had inline updates and are not frequently synced since performance is only going to be harmed as a result of the constant copying and there wont be any benefit since there are no GPU inline updates that would be avoided
            return {};

-        if (size > MegaBufferingDisableThreshold)
-            return {};
-
        // We are safe to check dirty state here since it will only ever be set GPU dirty with the buffer locked and from the active GPFIFO thread. This helps with perf since the lock ends up being slightly expensive
        if (dirtyState == DirtyState::GpuDirty && !SynchronizeGuest(false, true))
            // Bail out if buffer cannot be synced, we don't know the contents ahead of time so the sequence is indeterminate
            return {};

+        // If the active execution has changed all previous allocations are now invalid
+        if (executionNumber != lastExecutionNumber) [[unlikely]]
+            ResetMegabufferState();
+
+        // If more than half the buffer has been megabuffered in chunks within the same execution assume this will generally be the case for this buffer and just megabuffer the whole thing without chunking
+        if (unifiedMegaBufferEnabled || megaBufferViewAccumulatedSize > (backing.size() / 2)) {
+            if (!unifiedMegaBuffer) {
+                unifiedMegaBuffer = allocator.Push(pCycle, mirror, true);
+                unifiedMegaBufferEnabled = true;
+            }
+
+            return BufferBinding{unifiedMegaBuffer.buffer, unifiedMegaBuffer.offset + offset, size};
+        }
+
+        if (size > MegaBufferingDisableThreshold && sequenceNumber < FrequentlySyncedThresholdHigh)
+            return {};
+
        size_t entryIdx{offset >> megaBufferTableShift};
        size_t bufferEntryOffset{entryIdx << megaBufferTableShift};
        size_t entryViewOffset{offset - bufferEntryOffset};
-        auto &entry{megaBufferTable[entryIdx]};

-        // If the cached allocation is invalid or not up to date, allocate a new one
-        if (!entry.allocation || entry.executionNumber != executionNumber ||
-              entry.sequenceNumber != sequenceNumber || entry.allocation.region.size() + entryViewOffset < size) {
+        if (entryIdx >= megaBufferTable.size())
+            return {};
+
+        auto &allocation{megaBufferTable[entryIdx]};
+
+        // If the cached allocation is invalid or too small, allocate a new one
+        if (!megaBufferTableValidity.test(entryIdx) || allocation.region.size() < (size + entryViewOffset)) {
            // Use max(oldSize, newSize) to avoid redundant reallocations within an execution if a larger allocation comes along later
-            auto mirrorAllocationRegion{mirror.subspan(bufferEntryOffset, std::max(entryViewOffset + size, entry.allocation.region.size()))};
-            entry.allocation = allocator.Push(pCycle, mirrorAllocationRegion, true);
-            entry.executionNumber = executionNumber;
-            entry.sequenceNumber = sequenceNumber;
+            auto mirrorAllocationRegion{mirror.subspan(bufferEntryOffset, std::max(entryViewOffset + size, allocation.region.size()))};
+            allocation = allocator.Push(pCycle, mirrorAllocationRegion, true);
+            megaBufferTableValidity.set(entryIdx);
+            megaBufferViewAccumulatedSize += mirrorAllocationRegion.size();
+            megaBufferTableUsed = true;
        }

-        return {entry.allocation.buffer, entry.allocation.offset + entryViewOffset, size};
+        return {allocation.buffer, allocation.offset + entryViewOffset, size};
    }

    void Buffer::AdvanceSequence() {
+        ResetMegabufferState();
        sequenceNumber++;
    }

@ -408,7 +436,7 @@ namespace skyline::gpu {
        return GetBuffer()->Write(isFirstUsage, flushHostCallback, data, writeOffset + GetOffset(), gpuCopyCallback);
    }

-    BufferBinding BufferView::TryMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, size_t executionNumber, size_t sizeOverride) const {
+    BufferBinding BufferView::TryMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, u32 executionNumber, size_t sizeOverride) const {
        return GetBuffer()->TryMegaBufferView(pCycle, allocator, executionNumber, GetOffset(), sizeOverride ? sizeOverride : size);
    }

--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@ -67,31 +67,35 @@ namespace skyline::gpu {
        } backingImmutability{}; //!< Describes how the buffer backing should be accessed by the current context
        RecursiveSpinLock stateMutex; //!< Synchronizes access to the dirty state and backing immutability

-        bool everHadInlineUpdate{}; //!< Whether the buffer has ever had an inline update since it was created, if this is set then megabuffering will be attempted by views to avoid the cost of inline GPU updates
+        static constexpr u32 InitialSequenceNumber{1}; //!< Sequence number that all buffers start off with
+        static constexpr u32 FrequentlySyncedThreshold{6}; //!< Threshold for the sequence number after which the buffer is considered elegible for megabuffering
+        static constexpr u32 FrequentlySyncedThresholdHigh{16}; //!< Threshold for the sequence number after which the buffer is considered elegible for megabuffering irrespective of view size
+        u32 sequenceNumber{InitialSequenceNumber}; //!< Sequence number that is incremented after all modifications to the host side `backing` buffer, used to prevent redundant copies of the buffer being stored in the megabuffer by views

-        static constexpr u64 InitialSequenceNumber{1}; //!< Sequence number that all buffers start off with
-        static constexpr u64 FrequentlySyncedThreshold{15}; //!< Threshold for the sequence number after which the buffer is considered elegible for megabuffering
-        u64 sequenceNumber{InitialSequenceNumber}; //!< Sequence number that is incremented after all modifications to the host side `backing` buffer, used to prevent redundant copies of the buffer being stored in the megabuffer by views
+        constexpr static vk::DeviceSize MegaBufferingDisableThreshold{1024 * 256}; //!< The threshold at which a view is considered to be too large to be megabuffered (256KiB)

-        constexpr static vk::DeviceSize MegaBufferingDisableThreshold{1024 * 128}; //!< The threshold at which a view is considered to be too large to be megabuffered (128KiB)
-
-        /**
-         * @brief Holds a single megabuffer copy with sequencing information for an offset within the buffer
-         */
-        struct MegaBufferTableEntry {
-            MegaBufferAllocator::Allocation allocation{}; //!< The allocation in the megabuffer for the entry, can be any size
-            size_t executionNumber; //!< Execution number of when the allocation was made
-            size_t sequenceNumber; //!< Sequence number of when the allocation was made
-        };
-
-        static constexpr int MegaBufferTableShiftMin{std::countr_zero(0x80U)}; //!< The minimum shift for megabuffer table entries, giving an alignment of at least 128 bytes
+        static constexpr int MegaBufferTableShiftMin{std::countr_zero(0x100U)}; //!< The minimum shift for megabuffer table entries, giving an alignment of at least 256 bytes
        static constexpr size_t MegaBufferTableMaxEntries{0x500U}; //!< Maximum number of entries in the megabuffer table, `megaBufferTableShift` is set based on this and the total buffer size
        int megaBufferTableShift; //!< Shift to apply to buffer offsets to get their megabuffer table index
-        std::vector<MegaBufferTableEntry> megaBufferTable; //!< Table of megabuffer allocations for regions of the buffer
+        std::vector<MegaBufferAllocator::Allocation> megaBufferTable; //!< Table of megabuffer allocations for regions of the buffer
+        std::bitset<MegaBufferTableMaxEntries> megaBufferTableValidity{}; //!< Bitset keeping track of which entries in the megabuffer table are valid
+        bool megaBufferTableUsed{}; //!< If the megabuffer table has been used at all since the last time it was cleared
+        bool unifiedMegaBufferEnabled{}; //!< If the unified megabuffer is enabled for this buffer and should be used instead of the table
+        bool everHadInlineUpdate{}; //!< Whether the buffer has ever had an inline update since it was created, if this is set then megabuffering will be attempted by views to avoid the cost of inline GPU updates
+
+        u32 lastExecutionNumber{}; //!< The execution number of the last time megabuffer data was updated
+
+        size_t megaBufferViewAccumulatedSize{};
+        MegaBufferAllocator::Allocation unifiedMegaBuffer{}; //!< An optional full-size mirror of the buffer in the megabuffer for use when the buffer is frequently updated and *all* of the buffer is frequently used. Replaces all uses of the table when active

        static constexpr size_t FrequentlyLockedThreshold{2}; //!< Threshold for the number of times a buffer can be locked (not from context locks, only normal) before it should be considered frequently locked
        size_t accumulatedCpuLockCounter{}; //!< Number of times buffer has been locked through non-ContextLocks

+        /**
+         * @brief Resets all megabuffer tracking state
+         */
+        void ResetMegabufferState();
+
      private:
        BufferDelegate *delegate;

@ -307,7 +311,7 @@ namespace skyline::gpu {
         * @return A binding to the megabuffer allocation for the view, may be invalid if megabuffering is not beneficial
         * @note The buffer **must** be locked prior to calling this
         */
-        BufferBinding TryMegaBufferView(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, size_t executionNumber,
+        BufferBinding TryMegaBufferView(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, u32 executionNumber,
                                        vk::DeviceSize offset, vk::DeviceSize size);

        /**
@ -436,14 +440,13 @@ namespace skyline::gpu {
        bool Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &cycle, const std::function<void()> &flushHostCallback,
                   span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback = {}) const;

-
        /*
         * @brief If megabuffering is determined to be beneficial for the underlying buffer, allocates and copies this view into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region
         * @param sizeOverride If non-zero, specifies the size of the megabuffer region to allocate and copy to, *MUST* be smaller than the size of the view
         * @note The view **must** be locked prior to calling this
         * @note See Buffer::TryMegaBufferView
         */
-        BufferBinding TryMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, size_t executionNumber, size_t sizeOverride = 0) const;
+        BufferBinding TryMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, u32 executionNumber, size_t sizeOverride = 0) const;

        /**
         * @return A span of the backing buffer contents