From 683cd594adc9d8773dfd403e267454fa14263c80 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Sun, 31 Jul 2022 13:41:28 +0100 Subject: [PATCH] Use a linear allocator for most per-execution GPU allocations Currently we heavily thrash the heap each draw, with malloc/free taking up about 10% of GPFIFO's execution time. Using a linear allocator for the main offenders of buffer usage callbacks and index/vertex state helps to reduce this to about 4%. --- app/src/main/cpp/skyline/gpu/buffer.cpp | 14 +++++--------- app/src/main/cpp/skyline/gpu/buffer.h | 6 ++++-- app/src/main/cpp/skyline/gpu/buffer_manager.cpp | 5 +++-- .../skyline/gpu/interconnect/command_executor.cpp | 3 ++- .../skyline/gpu/interconnect/command_executor.h | 2 ++ .../skyline/gpu/interconnect/graphics_context.h | 12 ++++++------ 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/app/src/main/cpp/skyline/gpu/buffer.cpp b/app/src/main/cpp/skyline/gpu/buffer.cpp index c76deb44..51a2f152 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.cpp +++ b/app/src/main/cpp/skyline/gpu/buffer.cpp @@ -320,19 +320,15 @@ namespace skyline::gpu { BufferView::BufferView(std::shared_ptr buffer, const Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared(std::move(buffer), view)) {} - void BufferView::RegisterUsage(const std::shared_ptr &cycle, const std::function &)> &usageCallback) { + void BufferView::RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr &cycle, Buffer::BufferDelegate::UsageCallback usageCallback) { + if (!bufferDelegate->usageCallbacks) + bufferDelegate->usageCallbacks = decltype(bufferDelegate->usageCallbacks)::value_type{allocator}; + // Users of RegisterUsage expect the buffer contents to be sequenced as the guest GPU would be, so force any further sequenced writes in the current cycle to occur on the GPU bufferDelegate->buffer->BlockSequencedCpuBackingWrites(); usageCallback(*bufferDelegate->view, bufferDelegate->buffer); - if (!bufferDelegate->usageCallback) { - 
bufferDelegate->usageCallback = usageCallback; - } else { - bufferDelegate->usageCallback = [usageCallback, oldCallback = std::move(bufferDelegate->usageCallback)](const Buffer::BufferViewStorage &pView, const std::shared_ptr &buffer) { - oldCallback(pView, buffer); - usageCallback(pView, buffer); - }; - } + bufferDelegate->usageCallbacks->emplace_back(std::move(usageCallback)); } void BufferView::Read(bool isFirstUsage, const std::function &flushHostCallback, span data, vk::DeviceSize offset) const { diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h index 1e2b0446..02ddf954 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.h +++ b/app/src/main/cpp/skyline/gpu/buffer.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include "megabuffer.h" @@ -101,7 +102,8 @@ namespace skyline::gpu { LockableSharedPtr buffer; const Buffer::BufferViewStorage *view; bool attached{}; - std::function &)> usageCallback; + using UsageCallback = std::function &)>; + std::optional>> usageCallbacks; std::list::iterator iterator; BufferDelegate(std::shared_ptr buffer, const Buffer::BufferViewStorage *view); @@ -405,7 +407,7 @@ namespace skyline::gpu { * @note The callback will be automatically called the first time after registration * @note The view **must** be locked prior to calling this */ - void RegisterUsage(const std::shared_ptr &cycle, const std::function &)> &usageCallback); + void RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr &cycle, Buffer::BufferDelegate::UsageCallback usageCallback); /** * @brief Reads data at the specified offset in the view diff --git a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp index c63567e6..f9ac0a5a 100644 --- a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp +++ b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp @@ -152,8 +152,9 @@ namespace skyline::gpu { // Transfer all delegates references from the overlapping 
buffer to the new buffer for (auto &delegate : srcBuffer->delegates) { delegate->buffer = *newBuffer; - if (delegate->usageCallback) - delegate->usageCallback(*delegate->view, *newBuffer); + if (delegate->usageCallbacks) + for (auto &callback : *delegate->usageCallbacks) + callback(*delegate->view, *newBuffer); } newBuffer->delegates.splice(newBuffer->delegates.end(), srcBuffer->delegates); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp index 06358fcd..e55d5f0b 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp @@ -324,7 +324,7 @@ namespace skyline::gpu::interconnect { textureManagerLock.reset(); for (const auto &delegate : attachedBufferDelegates) { - delegate->usageCallback = nullptr; + delegate->usageCallbacks.reset(); delegate->attached = false; delegate->view->megaBufferAllocation = {}; } @@ -333,6 +333,7 @@ namespace skyline::gpu::interconnect { attachedBuffers.clear(); bufferManagerLock.reset(); megaBufferAllocatorLock.reset(); + allocator.Reset(); } void CommandExecutor::Submit() { diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h index b673a5e4..4b0aed20 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h @@ -5,6 +5,7 @@ #include #include +#include #include #include "command_nodes.h" @@ -98,6 +99,7 @@ namespace skyline::gpu::interconnect { public: std::shared_ptr cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands + LinearAllocatorState<> allocator; ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag CommandExecutor(const DeviceState &state); diff --git 
a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h index 1c886d98..d35d3eed 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h @@ -1122,7 +1122,7 @@ namespace skyline::gpu::interconnect { .range = view->view->size }; } else { - view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr &buffer) { + view.RegisterUsage(executor.allocator, executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr &buffer) { *descriptor = vk::DescriptorBufferInfo{ .buffer = buffer->GetBacking(), .offset = view.offset, @@ -1157,7 +1157,7 @@ namespace skyline::gpu::interconnect { if (storageBuffer.is_written) view->buffer->MarkGpuDirty(); - view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr &buffer) { + view.RegisterUsage(executor.allocator, executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr &buffer) { *descriptor = vk::DescriptorBufferInfo{ .buffer = buffer->GetBacking(), .offset = view.offset, @@ -2829,14 +2829,14 @@ namespace skyline::gpu::interconnect { auto indexBufferView{GetIndexBuffer(count)}; executor.AttachBuffer(indexBufferView); - boundIndexBuffer = std::make_shared(); + boundIndexBuffer = std::allocate_shared>(executor.allocator); boundIndexBuffer->type = indexBuffer.type; if (auto megaBufferAllocation{indexBufferView.AcquireMegaBuffer(executor.cycle, executor.AcquireMegaBufferAllocator())}) { // If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip 
registering usage boundIndexBuffer->handle = megaBufferAllocation.buffer; boundIndexBuffer->offset = megaBufferAllocation.offset; } else { - indexBufferView.RegisterUsage(executor.cycle, [=](const Buffer::BufferViewStorage &view, const std::shared_ptr &buffer) { + indexBufferView.RegisterUsage(executor.allocator, executor.cycle, [=](const Buffer::BufferViewStorage &view, const std::shared_ptr &buffer) { boundIndexBuffer->handle = buffer->GetBacking(); boundIndexBuffer->offset = view.offset; }); @@ -2858,7 +2858,7 @@ namespace skyline::gpu::interconnect { std::array handles{}; std::array offsets{}; }; - auto boundVertexBuffers{std::make_shared()}; + auto boundVertexBuffers{std::allocate_shared>(executor.allocator)}; boost::container::static_vector vertexBindingDescriptions{}; boost::container::static_vector vertexBindingDivisorsDescriptions{}; @@ -2877,7 +2877,7 @@ namespace skyline::gpu::interconnect { boundVertexBuffers->handles[index] = megaBufferAllocation.buffer; boundVertexBuffers->offsets[index] = megaBufferAllocation.offset; } else { - vertexBufferView.RegisterUsage(executor.cycle, [handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr &buffer) { + vertexBufferView.RegisterUsage(executor.allocator, executor.cycle, [handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr &buffer) { *handle = buffer->GetBacking(); *offset = view.offset; });