Mirror of https://github.com/skyline-emu/skyline.git (synced 2024-11-04 23:55:08 +01:00)
Implement thread-safe MegaBuffer pool
We currently have a global `MegaBuffer` instance that is shared across all channels. This is very problematic because `MegaBuffer` fundamentally works like a state machine with respect to its allocations (especially resetting/freeing them) and is therefore thread-specific. To fix this, there is now a pool of several `MegaBuffer`s: the `CommandExecutor` allocates one from the pool and keeps it channel-specific, which also limits its usage to a single thread and allows allocations to be individually reset or freed.
Parent: 3e08494146
Commit: a5ca370c36
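
The core of the change is the pool itself: each slot is guarded by an atomic flag, claimed under the manager's lock, only reused once its fence cycle has signalled, and released again by the handle's destructor. Below is a minimal, self-contained C++20 sketch of that pattern; `FenceCycle`, `Pool`, `Slot`, and `Handle` here are simplified stand-ins for illustration, not skyline's actual classes:

    #include <atomic>
    #include <list>
    #include <memory>
    #include <mutex>

    // Stand-in for skyline's FenceCycle: Poll() reports whether the GPU has finished
    struct FenceCycle {
        bool signalled{true};
        bool Poll() const { return signalled; }
    };

    class Pool {
      public:
        struct Slot {
            std::atomic_flag active; // Set while a Handle owns this slot (C++20 default-initializes it clear)
            std::shared_ptr<FenceCycle> cycle{std::make_shared<FenceCycle>()}; // Last work that used this slot
        };

        // RAII handle: exclusive, single-threaded use of one slot; the destructor
        // returns the slot to the pool, as MegaBuffer::~MegaBuffer does
        class Handle {
          public:
            explicit Handle(Slot &slot) : slot{slot} {}
            Handle(const Handle &) = delete;
            Handle &operator=(const Handle &) = delete;
            ~Handle() { slot.active.clear(std::memory_order_release); }

          private:
            Slot &slot;
        };

        Handle Acquire(const std::shared_ptr<FenceCycle> &cycle) {
            std::lock_guard lock{mutex}; // Protects the slot list, not the slot contents
            for (auto &slot : slots) {
                if (!slot.active.test_and_set(std::memory_order_acq_rel)) {
                    if (slot.cycle->Poll()) { // Only reuse a slot the GPU is done with
                        slot.cycle = cycle;
                        return Handle{slot};
                    }
                    slot.active.clear(std::memory_order_release); // Still in flight, put it back
                }
            }
            auto &slot{slots.emplace_back()}; // All slots busy: grow the pool
            slot.active.test_and_set(std::memory_order_acq_rel);
            slot.cycle = cycle;
            return Handle{slot};
        }

      private:
        std::mutex mutex;
        std::list<Slot> slots; // std::list keeps slot references stable as the pool grows
    };
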
@@ -267,7 +267,7 @@ namespace skyline::gpu {
         return BufferView{shared_from_this(), &views.back()};
     }
 
-    vk::DeviceSize Buffer::AcquireMegaBuffer() {
+    vk::DeviceSize Buffer::AcquireMegaBuffer(MegaBuffer& megaBuffer) {
         SynchronizeGuest(false, true); // First try and enable megabuffering by doing an immediate sync
 
         if (!megaBufferingEnabled)
@@ -278,7 +278,7 @@ namespace skyline::gpu {
         if (megaBufferOffset)
             return megaBufferOffset; // If the current buffer contents haven't been changed since the last acquire, we can just return the existing offset
 
-        megaBufferOffset = gpu.buffer.megaBuffer.Push(backing, true); // Buffers are required to be page aligned in the megabuffer
+        megaBufferOffset = megaBuffer.Push(backing, true); // Buffers are required to be page aligned in the megabuffer
        return megaBufferOffset;
     }
 
@@ -370,8 +370,8 @@ namespace skyline::gpu {
         bufferDelegate->buffer->Write(pCycle, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
     }
 
-    vk::DeviceSize BufferView::AcquireMegaBuffer() const {
-        vk::DeviceSize bufferOffset{bufferDelegate->buffer->AcquireMegaBuffer()};
+    vk::DeviceSize BufferView::AcquireMegaBuffer(MegaBuffer& megaBuffer) const {
+        vk::DeviceSize bufferOffset{bufferDelegate->buffer->AcquireMegaBuffer(megaBuffer)};
 
         // Propagate 0 results since they signify that megabuffering isn't supported for a buffer
         if (bufferOffset)
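
Buffer::AcquireMegaBuffer only pushes when the contents have changed since the last acquire; otherwise the cached offset is reused, as the header hunk below documents. A toy, self-contained illustration of that caching contract — `ToyBuffer`, `dirty`, and the `push` callback are invented names for the sketch, not skyline's API:

    #include <cassert>
    #include <cstdint>

    // Toy model of the cached-offset contract
    struct ToyBuffer {
        uint64_t megaBufferOffset{}; // 0 doubles as "no valid copy in the megabuffer"
        bool dirty{true};            // Set whenever the buffer contents are modified

        template<typename PushFn>
        uint64_t Acquire(PushFn &&push) {
            if (!dirty && megaBufferOffset)
                return megaBufferOffset; // Unchanged since the last acquire: reuse the copy
            megaBufferOffset = push();   // Otherwise push a fresh copy into the megabuffer
            dirty = false;
            return megaBufferOffset;
        }
    };

    int main() {
        uint64_t head{0x1000};
        auto push{[&] { return head += 0x1000; }}; // Each push lands at a new offset

        ToyBuffer buffer;
        assert(buffer.Acquire(push) == buffer.Acquire(push)); // Second acquire reuses the offset
        buffer.dirty = true;                                  // Simulate a CPU-side write
        assert(buffer.Acquire(push) == 0x3000);               // Now a fresh push happens
    }
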
@@ -11,6 +11,7 @@ namespace skyline::gpu {
 
     struct BufferView;
     class BufferManager;
+    class MegaBuffer;
 
     /**
      * @brief A buffer which is backed by host constructs while being synchronized with the underlying guest buffer
@@ -234,7 +235,7 @@ namespace skyline::gpu {
          * @note This will only push into the megabuffer when there have been modifications after the previous acquire, otherwise the previous offset will be reused
          * @note An implicit CPU -> GPU sync will be performed when calling this, an immediate GPU -> CPU sync will also be attempted if the buffer is GPU dirty in the hope that megabuffering can be reenabled
          */
-        vk::DeviceSize AcquireMegaBuffer();
+        vk::DeviceSize AcquireMegaBuffer(MegaBuffer& megaBuffer);
 
         /**
          * @brief Forces the buffer contents to be pushed into the megabuffer on the next AcquireMegaBuffer call
@@ -333,7 +334,7 @@ namespace skyline::gpu {
          * @note The view **must** be locked prior to calling this
          * @note See Buffer::AcquireMegaBuffer
          */
-        vk::DeviceSize AcquireMegaBuffer() const;
+        vk::DeviceSize AcquireMegaBuffer(MegaBuffer& megaBuffer) const;
 
         /**
          * @return A span of the backing buffer contents
@@ -6,39 +6,7 @@
 #include "buffer_manager.h"
 
 namespace skyline::gpu {
-    MegaBuffer::MegaBuffer(GPU &gpu) : backing(gpu.memory.AllocateBuffer(Size)), freeRegion(backing.subspan(PAGE_SIZE)) {}
-
-    void MegaBuffer::Reset() {
-        std::scoped_lock lock{mutex};
-        freeRegion = backing.subspan(PAGE_SIZE);
-    }
-
-    vk::Buffer MegaBuffer::GetBacking() const {
-        return backing.vkBuffer;
-    }
-
-    vk::DeviceSize MegaBuffer::Push(span<u8> data, bool pageAlign) {
-        std::scoped_lock lock{mutex};
-
-        if (data.size() > freeRegion.size())
-            throw exception("Ran out of megabuffer space! Alloc size: 0x{:X}", data.size());
-
-        if (pageAlign) {
-            // If page aligned data was requested then align the free region
-            auto alignedFreeBase{util::AlignUp(static_cast<size_t>(freeRegion.data() - backing.data()), PAGE_SIZE)};
-            freeRegion = backing.subspan(alignedFreeBase);
-        }
-
-        // Allocate space for data from the free region
-        auto resultSpan{freeRegion.subspan(0, data.size())};
-        resultSpan.copy_from(data);
-
-        // Move the free region along
-        freeRegion = freeRegion.subspan(data.size());
-        return static_cast<vk::DeviceSize>(resultSpan.data() - backing.data());
-    }
-
-    BufferManager::BufferManager(GPU &gpu) : gpu(gpu), megaBuffer(gpu) {}
+    BufferManager::BufferManager(GPU &gpu) : gpu(gpu) {}
 
     bool BufferManager::BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer) {
         return it->guest->begin().base() < pointer;
@@ -109,4 +77,58 @@ namespace skyline::gpu {
 
         return newBuffer->GetView(static_cast<vk::DeviceSize>(guestMapping.begin() - newBuffer->guest->begin()) + offset, size);
     }
+
+    BufferManager::MegaBufferSlot::MegaBufferSlot(GPU &gpu) : backing(gpu.memory.AllocateBuffer(Size)) {}
+
+    MegaBuffer::MegaBuffer(BufferManager::MegaBufferSlot &slot) : slot{slot}, freeRegion{slot.backing.subspan(PAGE_SIZE)} {}
+
+    MegaBuffer::~MegaBuffer() {
+        slot.active.clear(std::memory_order_release);
+    }
+
+    void MegaBuffer::Reset() {
+        freeRegion = slot.backing.subspan(PAGE_SIZE);
+    }
+
+    vk::Buffer MegaBuffer::GetBacking() const {
+        return slot.backing.vkBuffer;
+    }
+
+    vk::DeviceSize MegaBuffer::Push(span<u8> data, bool pageAlign) {
+        if (data.size() > freeRegion.size())
+            throw exception("Ran out of megabuffer space! Alloc size: 0x{:X}", data.size());
+
+        if (pageAlign) {
+            // If page aligned data was requested then align the free region
+            auto alignedFreeBase{util::AlignUp(static_cast<size_t>(freeRegion.data() - slot.backing.data()), PAGE_SIZE)};
+            freeRegion = slot.backing.subspan(alignedFreeBase);
+        }
+
+        // Allocate space for data from the free region
+        auto resultSpan{freeRegion.subspan(0, data.size())};
+        resultSpan.copy_from(data);
+
+        // Move the free region along
+        freeRegion = freeRegion.subspan(data.size());
+        return static_cast<vk::DeviceSize>(resultSpan.data() - slot.backing.data());
+    }
+
+    MegaBuffer BufferManager::AcquireMegaBuffer(const std::shared_ptr<FenceCycle> &cycle) {
+        std::lock_guard lock{mutex};
+
+        for (auto &slot : megaBuffers) {
+            if (!slot.active.test_and_set(std::memory_order_acq_rel)) {
+                if (slot.cycle->Poll()) {
+                    slot.cycle = cycle;
+                    return {slot};
+                } else {
+                    slot.active.clear(std::memory_order_release);
+                }
+            }
+        }
+
+        auto& megaBuffer{megaBuffers.emplace_back(gpu)};
+        megaBuffer.cycle = cycle;
+        return {megaBuffer};
+    }
 }
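
MegaBuffer::Push above is a plain bump (linear) allocator: the first page of the backing is skipped so that an offset of 0 can act as the "megabuffering unsupported" sentinel, and page alignment simply rounds the free-region base up. A freestanding sketch of the same arithmetic over a plain byte vector; all names here are invented for the example:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <stdexcept>
    #include <vector>

    constexpr std::size_t PageSize{0x1000}; // Assuming 4KiB pages

    // Round `value` up to the next multiple of `align` (align must be a power of two)
    constexpr std::size_t AlignUp(std::size_t value, std::size_t align) {
        return (value + align - 1) & ~(align - 1);
    }

    class BumpAllocator {
      public:
        explicit BumpAllocator(std::size_t size) : backing(size), head{PageSize} {} // Page 0 is reserved so offset 0 can mean "invalid"

        std::size_t Push(const std::uint8_t *data, std::size_t size, bool pageAlign = false) {
            if (pageAlign)
                head = AlignUp(head, PageSize); // Mirrors MegaBuffer::Push aligning the free region
            if (head + size > backing.size())
                throw std::runtime_error("Ran out of megabuffer space");
            std::memcpy(backing.data() + head, data, size);
            std::size_t offset{head};
            head += size; // Bump the free region past the new allocation
            return offset;
        }

        void Reset() { head = PageSize; } // Data is left intact but may be overwritten

      private:
        std::vector<std::uint8_t> backing;
        std::size_t head; // Start of the free region
    };
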
@@ -6,19 +6,67 @@
 #include "buffer.h"
 
 namespace skyline::gpu {
+    class MegaBuffer;
+
+    /**
+     * @brief The Buffer Manager is responsible for maintaining a global view of buffers being mapped from the guest to the host, any lookups and creation of host buffer from equivalent guest buffer alongside reconciliation of any overlaps with existing textures
+     */
+    class BufferManager {
+      private:
+        GPU &gpu;
+        std::mutex mutex; //!< Synchronizes access to the buffer mappings
+        std::vector<std::shared_ptr<Buffer>> buffers; //!< A sorted vector of all buffer mappings
+
+        friend class MegaBuffer;
+
+        /**
+         * @brief A wrapper around a buffer which can be utilized as backing storage for a megabuffer and can track its state to avoid concurrent usage
+         */
+        struct MegaBufferSlot {
+            std::atomic_flag active{true}; //!< If the megabuffer is currently being utilized, we want to construct a buffer as active
+            std::shared_ptr<FenceCycle> cycle; //!< The latest cycle on the fence, all waits must be performed through this
+
+            constexpr static vk::DeviceSize Size{100 * 1024 * 1024}; //!< Size in bytes of the megabuffer (100MiB)
+            memory::Buffer backing; //!< The GPU buffer as the backing storage for the megabuffer
+
+            MegaBufferSlot(GPU &gpu);
+        };
+
+        /**
+         * @return If the end of the supplied buffer is less than the supplied pointer
+         */
+        static bool BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer);
+
+      public:
+        std::list<MegaBufferSlot> megaBuffers; //!< A pool of all allocated megabuffers, these are dynamically utilized
+
+        BufferManager(GPU &gpu);
+
+        /**
+         * @return A dynamically allocated megabuffer which can be used to store buffer modifications allowing them to be replayed in-sequence on the GPU
+         * @note This object **must** be destroyed to be reclaimed by the manager and prevent a memory leak
+         */
+        MegaBuffer AcquireMegaBuffer(const std::shared_ptr<FenceCycle> &cycle);
+
+        /**
+         * @return A pre-existing or newly created Buffer object which covers the supplied mappings
+         */
+        BufferView FindOrCreate(GuestBuffer guestMapping, const std::shared_ptr<FenceCycle> &cycle = nullptr);
+    };
+
     /**
      * @brief A simple linearly allocated GPU-side buffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
+     * @note This class is **not** thread-safe and any calls must be externally synchronized
      */
     class MegaBuffer {
       private:
-        constexpr static vk::DeviceSize Size{0x6'400'000}; //!< Size in bytes of the megabuffer (100MiB)
-
-        memory::Buffer backing; //!< The backing GPU buffer
-        std::mutex mutex; //!< Synchronizes access to freeRegion
-        span<u8> freeRegion; //!< Span of unallocated space in the megabuffer
+        BufferManager::MegaBufferSlot &slot;
+        span<u8> freeRegion; //!< The unallocated space in the megabuffer
 
       public:
-        MegaBuffer(GPU &gpu);
+        MegaBuffer(BufferManager::MegaBufferSlot &slot);
+
+        ~MegaBuffer();
 
         /**
          * @brief Resets the free region of the megabuffer to its initial state, data is left intact but may be overwritten
@@ -36,29 +84,4 @@ namespace skyline::gpu {
          */
         vk::DeviceSize Push(span<u8> data, bool pageAlign = false);
     };
-
-    /**
-     * @brief The Buffer Manager is responsible for maintaining a global view of buffers being mapped from the guest to the host, any lookups and creation of host buffer from equivalent guest buffer alongside reconciliation of any overlaps with existing textures
-     */
-    class BufferManager {
-      private:
-        GPU &gpu;
-        std::mutex mutex; //!< Synchronizes access to the buffer mappings
-        std::vector<std::shared_ptr<Buffer>> buffers; //!< A sorted vector of all buffer mappings
-
-        /**
-         * @return If the end of the supplied buffer is less than the supplied pointer
-         */
-        static bool BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer);
-
-      public:
-        MegaBuffer megaBuffer; //!< The megabuffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
-
-        BufferManager(GPU &gpu);
-
-        /**
-         * @return A pre-existing or newly created Buffer object which covers the supplied mappings
-         */
-        BufferView FindOrCreate(GuestBuffer guestMapping, const std::shared_ptr<FenceCycle> &cycle = nullptr);
-    };
 }
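
The `active` flag in MegaBufferSlot is effectively an ownership bit: the `test_and_set(std::memory_order_acq_rel)` in AcquireMegaBuffer pairs with the `clear(std::memory_order_release)` in MegaBuffer's destructor, so writes made through one handle are visible to the next acquirer, and a claim that turns out to be premature is simply released and retried — much like the `slot.cycle->Poll()` check. A small self-contained demonstration of that acquire/release/retry pattern (illustrative only):

    #include <atomic>
    #include <cstdio>
    #include <thread>

    std::atomic_flag active; // Clear == slot is free (C++20 default-initializes it clear)
    int payload{};           // Non-atomic slot contents, guarded by `active`

    int main() {
        std::thread producer{[] {
            while (active.test_and_set(std::memory_order_acq_rel)) {} // Claim the slot
            payload = 42;                                             // Exclusive access while the flag is set
            active.clear(std::memory_order_release);                  // Release: publishes payload to the next claimant
        }};

        for (;;) {
            while (active.test_and_set(std::memory_order_acq_rel)) {} // Claim the slot
            if (payload == 42)
                break;                                   // Acquired after the producer's release: its write is visible
            active.clear(std::memory_order_release);     // Claimed too early (cf. a failed slot.cycle->Poll()); retry
            std::this_thread::yield();
        }
        std::printf("%d\n", payload);
        active.clear(std::memory_order_release);
        producer.join();
    }
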
@@ -5,7 +5,7 @@
 #include "command_executor.h"
 
 namespace skyline::gpu::interconnect {
-    CommandExecutor::CommandExecutor(const DeviceState &state) : gpu(*state.gpu), activeCommandBuffer(gpu.scheduler.AllocateCommandBuffer()), cycle(activeCommandBuffer.GetFenceCycle()) {}
+    CommandExecutor::CommandExecutor(const DeviceState &state) : gpu(*state.gpu), activeCommandBuffer(gpu.scheduler.AllocateCommandBuffer()), cycle(activeCommandBuffer.GetFenceCycle()), megaBuffer(gpu.buffer.AcquireMegaBuffer(cycle)) {}
 
     CommandExecutor::~CommandExecutor() {
         cycle->Cancel();
@@ -227,7 +227,7 @@ namespace skyline::gpu::interconnect {
 
             cycle = activeCommandBuffer.Reset();
 
-            gpu.buffer.megaBuffer.Reset();
+            megaBuffer.Reset();
         }
     }
 }
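
Each CommandExecutor thus owns a MegaBuffer for its whole lifetime and merely rewinds it once per execution, after the command buffer reset has waited out the prior GPU work. A toy sketch of that lifecycle; `ToyMegaBuffer` and `ToyExecutor` are invented stand-ins:

    #include <cstddef>

    struct ToyMegaBuffer {
        std::size_t head{};
        void Reset() { head = 0; } // Rewind; previous contents may now be overwritten
        std::size_t Push(std::size_t size) {
            std::size_t offset{head};
            head += size;
            return offset;
        }
    };

    struct ToyExecutor {
        ToyMegaBuffer megaBuffer; // Owned per-executor, i.e. per-channel and single-threaded

        void Execute() {
            megaBuffer.Push(0x100); // Staged copies recorded into this execution's command buffer
            // ... submit, then reset the command buffer (waiting on the prior fence cycle) ...
            megaBuffer.Reset();     // Only safe to rewind after that wait, as in the hunk above
        }
    };

    int main() {
        ToyExecutor executor;
        executor.Execute();
        executor.Execute(); // Each execution reuses the same megabuffer from offset zero
    }
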
@@ -45,6 +45,7 @@ namespace skyline::gpu::interconnect {
 
       public:
         std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
+        MegaBuffer megaBuffer; //!< The megabuffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
 
         CommandExecutor(const DeviceState &state);
 
@@ -738,7 +738,7 @@ namespace skyline::gpu::interconnect {
 
         void ConstantBufferUpdate(std::vector<u32> data, u32 offset) {
             auto constantBuffer{GetConstantBufferSelector().value()};
-            constantBuffer.Write<u32>(executor, gpu.buffer.megaBuffer, data, offset);
+            constantBuffer.Write<u32>(executor, executor.megaBuffer, data, offset);
         }
 
         /* Shader Program */
@@ -1110,10 +1110,10 @@ namespace skyline::gpu::interconnect {
                 auto view{pipelineStage.constantBuffers[constantBuffer.index].view};
 
                 std::scoped_lock lock(view);
-                if (auto megaBufferOffset{view.AcquireMegaBuffer()}) {
+                if (auto megaBufferOffset{view.AcquireMegaBuffer(executor.megaBuffer)}) {
                     // If the buffer is megabuffered then since we don't get our data from the underlying buffer, but rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage
                     bufferDescriptors[bufferIndex] = vk::DescriptorBufferInfo{
-                        .buffer = gpu.buffer.megaBuffer.GetBacking(),
+                        .buffer = executor.megaBuffer.GetBacking(),
                         .offset = megaBufferOffset,
                         .range = view->view->size
                     };
@@ -2837,9 +2837,9 @@ namespace skyline::gpu::interconnect {
            std::scoped_lock lock(indexBufferView);
 
            boundIndexBuffer->type = indexBuffer.type;
-           if (auto megaBufferOffset{indexBufferView.AcquireMegaBuffer()}) {
+           if (auto megaBufferOffset{indexBufferView.AcquireMegaBuffer(executor.megaBuffer)}) {
                // If the buffer is megabuffered then since we don't get our data from the underlying buffer, but rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage
-               boundIndexBuffer->handle = gpu.buffer.megaBuffer.GetBacking();
+               boundIndexBuffer->handle = executor.megaBuffer.GetBacking();
                boundIndexBuffer->offset = megaBufferOffset;
            } else {
                indexBufferView.RegisterUsage([=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
@@ -2872,9 +2872,9 @@ namespace skyline::gpu::interconnect {
 
            std::scoped_lock vertexBufferLock(vertexBufferView);
 
-           if (auto megaBufferOffset{vertexBufferView.AcquireMegaBuffer()}) {
+           if (auto megaBufferOffset{vertexBufferView.AcquireMegaBuffer(executor.megaBuffer)}) {
                // If the buffer is megabuffered then since we don't get our data from the underlying buffer, but rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage
-               boundVertexBuffers->handles[index] = gpu.buffer.megaBuffer.GetBacking();
+               boundVertexBuffers->handles[index] = executor.megaBuffer.GetBacking();
                boundVertexBuffers->offsets[index] = megaBufferOffset;
            } else {
                vertexBufferView.RegisterUsage([handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
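
Every bind site in the interconnect follows the same two-way branch: a non-zero offset from AcquireMegaBuffer means the data now lives in the executor's megabuffer, which stays consistent for the rest of the execution, so usage registration can be skipped; a zero offset falls back to binding the real buffer and registering a usage callback so the binding can be patched if the underlying Buffer changes. A schematic, self-contained rendering of that branch — `FakeView`, `Binding`, and the callback signature are invented for the sketch:

    #include <cstdint>
    #include <functional>

    using DeviceSize = std::uint64_t;

    struct Binding {
        int handle{};        // Stand-in for vk::Buffer
        DeviceSize offset{};
    };

    // Invented view type: a zero offset signals that megabuffering is unsupported
    struct FakeView {
        DeviceSize megaBufferOffset{};
        DeviceSize AcquireMegaBuffer() const { return megaBufferOffset; }
        void RegisterUsage(const std::function<void(int, DeviceSize)> &fn) const {
            fn(7, 0x100); // Would be replayed whenever the backing buffer changes
        }
    };

    void Bind(const FakeView &view, int megaBufferHandle, Binding &bound) {
        if (DeviceSize megaBufferOffset{view.AcquireMegaBuffer()}) {
            // Megabuffered: the binding is final for this execution, skip usage tracking
            bound = {megaBufferHandle, megaBufferOffset};
        } else {
            // Not megabuffered: let the usage callback (re)patch the binding
            view.RegisterUsage([&bound](int handle, DeviceSize offset) {
                bound = {handle, offset};
            });
        }
    }

    int main() {
        Binding bound;
        Bind(FakeView{0x2000}, 1, bound); // Takes the megabuffer path
        Bind(FakeView{}, 1, bound);       // Falls back to RegisterUsage
    }
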