From 683cd594adc9d8773dfd403e267454fa14263c80 Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Sun, 31 Jul 2022 13:41:28 +0100
Subject: [PATCH] Use a linear allocator for most per-execution GPU allocations

Currently we heavily thrash the heap on each draw, with malloc/free taking up about 10% of GPFIFO's execution time. Using a linear allocator for the main offenders (buffer usage callbacks and index/vertex bound state) helps to reduce this to about 4%.
---
 app/src/main/cpp/skyline/gpu/buffer.cpp            | 14 +++++---------
 app/src/main/cpp/skyline/gpu/buffer.h              |  6 ++++--
 app/src/main/cpp/skyline/gpu/buffer_manager.cpp    |  5 +++--
 .../skyline/gpu/interconnect/command_executor.cpp  |  3 ++-
 .../skyline/gpu/interconnect/command_executor.h    |  2 ++
 .../skyline/gpu/interconnect/graphics_context.h    | 12 ++++++------
 6 files changed, 22 insertions(+), 20 deletions(-)
diff --git a/app/src/main/cpp/skyline/gpu/buffer.cpp b/app/src/main/cpp/skyline/gpu/buffer.cpp
index c76deb44..51a2f152 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@@ -320,19 +320,15 @@ namespace skyline::gpu {
 
     BufferView::BufferView(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared<Buffer::BufferDelegate>(std::move(buffer), view)) {}
 
-    void BufferView::RegisterUsage(const std::shared_ptr<FenceCycle> &cycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
+    void BufferView::RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr<FenceCycle> &cycle, Buffer::BufferDelegate::UsageCallback usageCallback) {
+        if (!bufferDelegate->usageCallbacks)
+            bufferDelegate->usageCallbacks = decltype(bufferDelegate->usageCallbacks)::value_type{allocator};
+
         // Users of RegisterUsage expect the buffer contents to be sequenced as the guest GPU would be, so force any further sequenced writes in the current cycle to occur on the GPU
         bufferDelegate->buffer->BlockSequencedCpuBackingWrites();
 
         usageCallback(*bufferDelegate->view, bufferDelegate->buffer);
-        if (!bufferDelegate->usageCallback) {
-            bufferDelegate->usageCallback = usageCallback;
-        } else {
-            bufferDelegate->usageCallback = [usageCallback, oldCallback = std::move(bufferDelegate->usageCallback)](const Buffer::BufferViewStorage &pView, const std::shared_ptr<Buffer> &buffer) {
-                oldCallback(pView, buffer);
-                usageCallback(pView, buffer);
-            };
-        }
+        bufferDelegate->usageCallbacks->emplace_back(std::move(usageCallback));
     }
 
     void BufferView::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h
index 1e2b0446..02ddf954 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@@ -6,6 +6,7 @@
 #include <unordered_set>
 #include <boost/functional/hash.hpp>
 #include <common/lockable_shared_ptr.h>
+#include <common/linear_allocator.h>
 #include <nce.h>
 #include <gpu/tag_allocator.h>
 #include "megabuffer.h"
@@ -101,7 +102,8 @@ namespace skyline::gpu {
             LockableSharedPtr<Buffer> buffer;
             const Buffer::BufferViewStorage *view;
             bool attached{};
-            std::function<void(const BufferViewStorage &, const std::shared_ptr<Buffer> &)> usageCallback;
+            using UsageCallback = std::function<void(const BufferViewStorage &, const std::shared_ptr<Buffer> &)>;
+            std::optional<std::vector<UsageCallback, LinearAllocator<UsageCallback>>> usageCallbacks;
             std::list<BufferDelegate *>::iterator iterator;
 
             BufferDelegate(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view);
@@ -405,7 +407,7 @@ namespace skyline::gpu {
          * @note The callback will be automatically called the first time after registration
          * @note The view **must** be locked prior to calling this
          */
-        void RegisterUsage(const std::shared_ptr<FenceCycle> &cycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback);
+        void RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr<FenceCycle> &cycle, Buffer::BufferDelegate::UsageCallback usageCallback);
 
         /**
          * @brief Reads data at the specified offset in the view
diff --git a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
index c63567e6..f9ac0a5a 100644
--- a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
@@ -152,8 +152,9 @@ namespace skyline::gpu {
             // Transfer all delegates references from the overlapping buffer to the new buffer
             for (auto &delegate : srcBuffer->delegates) {
                 delegate->buffer = *newBuffer;
-                if (delegate->usageCallback)
-                    delegate->usageCallback(*delegate->view, *newBuffer);
+                if (delegate->usageCallbacks)
+                    for (auto &callback : *delegate->usageCallbacks)
+                        callback(*delegate->view, *newBuffer);
             }
 
             newBuffer->delegates.splice(newBuffer->delegates.end(), srcBuffer->delegates);
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index 06358fcd..e55d5f0b 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -324,7 +324,7 @@ namespace skyline::gpu::interconnect {
         textureManagerLock.reset();
 
         for (const auto &delegate : attachedBufferDelegates) {
-            delegate->usageCallback = nullptr;
+            delegate->usageCallbacks.reset();
             delegate->attached = false;
             delegate->view->megaBufferAllocation = {};
         }
@@ -333,6 +333,7 @@ namespace skyline::gpu::interconnect {
         attachedBuffers.clear();
         bufferManagerLock.reset();
         megaBufferAllocatorLock.reset();
+        allocator.Reset();
     }
 
     void CommandExecutor::Submit() {
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index b673a5e4..4b0aed20 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -5,6 +5,7 @@
 
 #include <boost/container/stable_vector.hpp>
 #include <unordered_set>
+#include <common/linear_allocator.h>
 #include <gpu/megabuffer.h>
 #include "command_nodes.h"
 
@@ -98,6 +99,7 @@ namespace skyline::gpu::interconnect {
 
       public:
         std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
+        LinearAllocatorState<> allocator;
         ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag
 
         CommandExecutor(const DeviceState &state);
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
index 1c886d98..d35d3eed 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
@@ -1122,7 +1122,7 @@ namespace skyline::gpu::interconnect {
                                 .range = view->view->size
                             };
                         } else {
-                            view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                            view.RegisterUsage(executor.allocator, executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                                 *descriptor = vk::DescriptorBufferInfo{
                                     .buffer = buffer->GetBacking(),
                                     .offset = view.offset,
@@ -1157,7 +1157,7 @@ namespace skyline::gpu::interconnect {
                         if (storageBuffer.is_written)
                             view->buffer->MarkGpuDirty();
 
-                        view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                        view.RegisterUsage(executor.allocator, executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                             *descriptor = vk::DescriptorBufferInfo{
                                 .buffer = buffer->GetBacking(),
                                 .offset = view.offset,
@@ -2829,14 +2829,14 @@ namespace skyline::gpu::interconnect {
                 auto indexBufferView{GetIndexBuffer(count)};
                 executor.AttachBuffer(indexBufferView);
 
-                boundIndexBuffer = std::make_shared<BoundIndexBuffer>();
+                boundIndexBuffer = std::allocate_shared<BoundIndexBuffer, LinearAllocator<BoundIndexBuffer>>(executor.allocator);
                 boundIndexBuffer->type = indexBuffer.type;
                 if (auto megaBufferAllocation{indexBufferView.AcquireMegaBuffer(executor.cycle, executor.AcquireMegaBufferAllocator())}) {
                     // If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage
                     boundIndexBuffer->handle = megaBufferAllocation.buffer;
                     boundIndexBuffer->offset = megaBufferAllocation.offset;
                 } else {
-                    indexBufferView.RegisterUsage(executor.cycle, [=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                    indexBufferView.RegisterUsage(executor.allocator, executor.cycle, [=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                         boundIndexBuffer->handle = buffer->GetBacking();
                         boundIndexBuffer->offset = view.offset;
                     });
@@ -2858,7 +2858,7 @@ namespace skyline::gpu::interconnect {
                 std::array<vk::Buffer, maxwell3d::VertexBufferCount> handles{};
                 std::array<vk::DeviceSize, maxwell3d::VertexBufferCount> offsets{};
             };
-            auto boundVertexBuffers{std::make_shared<BoundVertexBuffers>()};
+            auto boundVertexBuffers{std::allocate_shared<BoundVertexBuffers, LinearAllocator<BoundVertexBuffers>>(executor.allocator)};
 
             boost::container::static_vector<vk::VertexInputBindingDescription, maxwell3d::VertexBufferCount> vertexBindingDescriptions{};
             boost::container::static_vector<vk::VertexInputBindingDivisorDescriptionEXT, maxwell3d::VertexBufferCount> vertexBindingDivisorsDescriptions{};
@@ -2877,7 +2877,7 @@ namespace skyline::gpu::interconnect {
                         boundVertexBuffers->handles[index] = megaBufferAllocation.buffer;
                         boundVertexBuffers->offsets[index] = megaBufferAllocation.offset;
                     } else {
-                        vertexBufferView.RegisterUsage(executor.cycle, [handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                        vertexBufferView.RegisterUsage(executor.allocator, executor.cycle, [handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                             *handle = buffer->GetBacking();
                             *offset = view.offset;
                         });