Use a linear allocator for most per-execution GPU allocations

Currently we heavily thrash the heap each draw, with malloc/free taking up about 10% of GPFIFO execution time. Using a linear allocator for the main offenders, buffer usage callbacks and index/vertex state, reduces this to about 4%.
Billy Laws 2022-07-31 13:41:28 +01:00
parent 70eec5a414
commit 683cd594ad
6 changed files with 22 additions and 20 deletions
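The LinearAllocatorState<> and LinearAllocator<T> types used throughout this diff come from common/linear_allocator.h, which isn't shown here. As a rough sketch of the technique under hypothetical names (ArenaState/ArenaAllocator are illustrative, not Skyline's actual implementation): a linear allocator hands out memory by bumping an offset through a preallocated block, individual frees become no-ops, and the whole block is reclaimed at once when the execution finishes, eliminating per-draw malloc/free traffic.

#include <cstddef>
#include <memory>
#include <new>

// Minimal linear allocator sketch: allocation just bumps an offset through a
// preallocated block; individual frees are no-ops and all memory is reclaimed
// at once by Reset()
class ArenaState {
  private:
    std::unique_ptr<std::byte[]> block;
    std::size_t capacity;
    std::size_t offset{};

  public:
    explicit ArenaState(std::size_t capacity = 1 << 20)
        : block{std::make_unique<std::byte[]>(capacity)}, capacity{capacity} {}

    void *Allocate(std::size_t size, std::size_t alignment) {
        std::size_t aligned{(offset + alignment - 1) & ~(alignment - 1)}; // Assumes power-of-two alignment
        if (aligned + size > capacity)
            throw std::bad_alloc{}; // A real implementation would chain in another block
        offset = aligned + size;
        return block.get() + aligned;
    }

    void Reset() {
        offset = 0; // Invalidates everything allocated since the last Reset()
    }
};

// std::allocator-compatible adapter so STL containers and std::allocate_shared
// can draw from the arena, mirroring how LinearAllocator<T> is used below
template<typename T>
struct ArenaAllocator {
    using value_type = T;
    ArenaState *state;

    ArenaAllocator(ArenaState &state) : state{&state} {}

    template<typename U>
    ArenaAllocator(const ArenaAllocator<U> &other) : state{other.state} {}

    T *allocate(std::size_t n) {
        return static_cast<T *>(state->Allocate(n * sizeof(T), alignof(T)));
    }

    void deallocate(T *, std::size_t) {} // No-op: Reset() reclaims memory in bulk

    template<typename U>
    bool operator==(const ArenaAllocator<U> &other) const { return state == other.state; }

    template<typename U>
    bool operator!=(const ArenaAllocator<U> &other) const { return state != other.state; }
};

With an adapter of this shape, std::vector<T, ArenaAllocator<T>> and std::allocate_shared<T, ArenaAllocator<T>>(state) both become bump-pointer allocations, which is how the real types are used in the hunks below.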

View File

@@ -320,19 +320,15 @@ namespace skyline::gpu {
     BufferView::BufferView(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared<Buffer::BufferDelegate>(std::move(buffer), view)) {}

-    void BufferView::RegisterUsage(const std::shared_ptr<FenceCycle> &cycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
+    void BufferView::RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr<FenceCycle> &cycle, Buffer::BufferDelegate::UsageCallback usageCallback) {
+        if (!bufferDelegate->usageCallbacks)
+            bufferDelegate->usageCallbacks = decltype(bufferDelegate->usageCallbacks)::value_type{allocator};
+
         // Users of RegisterUsage expect the buffer contents to be sequenced as the guest GPU would be, so force any further sequenced writes in the current cycle to occur on the GPU
         bufferDelegate->buffer->BlockSequencedCpuBackingWrites();

         usageCallback(*bufferDelegate->view, bufferDelegate->buffer);
-        if (!bufferDelegate->usageCallback) {
-            bufferDelegate->usageCallback = usageCallback;
-        } else {
-            bufferDelegate->usageCallback = [usageCallback, oldCallback = std::move(bufferDelegate->usageCallback)](const Buffer::BufferViewStorage &pView, const std::shared_ptr<Buffer> &buffer) {
-                oldCallback(pView, buffer);
-                usageCallback(pView, buffer);
-            };
-        }
+        bufferDelegate->usageCallbacks->emplace_back(std::move(usageCallback));
     }

     void BufferView::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
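For a sense of why the removed branch thrashed the heap: each call after the first wrapped the existing std::function in a new lambda capturing both the old chain and the new callback, a payload that generally exceeds std::function's small-buffer optimization and therefore heap-allocates per registration. A simplified illustration of that pattern (hypothetical int payload in place of the real view/buffer arguments):

#include <cstdio>
#include <functional>
#include <utility>

void ChainingIllustration() {
    std::function<void(int)> chain;
    for (int i{}; i < 4; i++) {
        auto callback{[i](int arg) { std::printf("callback %d: %d\n", i, arg); }};
        if (!chain) {
            chain = callback; // First registration: one allocation at most
        } else {
            // Every later registration builds a new closure holding the old
            // chain, costing another heap allocation plus a growing chain of
            // indirect calls on every invocation
            chain = [callback, old = std::move(chain)](int arg) {
                old(arg);
                callback(arg);
            };
        }
    }
    chain(42); // Invokes all four callbacks in registration order
}

The replacement above is a single emplace_back into an arena-backed vector: no wrapping, no per-registration heap traffic, and replay is plain iteration.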

View File

@@ -6,6 +6,7 @@
 #include <unordered_set>
 #include <boost/functional/hash.hpp>
 #include <common/lockable_shared_ptr.h>
+#include <common/linear_allocator.h>
 #include <nce.h>
 #include <gpu/tag_allocator.h>
 #include "megabuffer.h"
@@ -101,7 +102,8 @@ namespace skyline::gpu {
             LockableSharedPtr<Buffer> buffer;
             const Buffer::BufferViewStorage *view;
             bool attached{};
-            std::function<void(const BufferViewStorage &, const std::shared_ptr<Buffer> &)> usageCallback;
+            using UsageCallback = std::function<void(const BufferViewStorage &, const std::shared_ptr<Buffer> &)>;
+            std::optional<std::vector<UsageCallback, LinearAllocator<UsageCallback>>> usageCallbacks;
             std::list<BufferDelegate *>::iterator iterator;

             BufferDelegate(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view);
@@ -405,7 +407,7 @@ namespace skyline::gpu {
          * @note The callback will be automatically called the first time after registration
          * @note The view **must** be locked prior to calling this
          */
-        void RegisterUsage(const std::shared_ptr<FenceCycle> &cycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback);
+        void RegisterUsage(LinearAllocatorState<> &allocator, const std::shared_ptr<FenceCycle> &cycle, Buffer::BufferDelegate::UsageCallback usageCallback);

        /**
         * @brief Reads data at the specified offset in the view

View File

@@ -152,8 +152,9 @@ namespace skyline::gpu {
             // Transfer all delegate references from the overlapping buffer to the new buffer
             for (auto &delegate : srcBuffer->delegates) {
                 delegate->buffer = *newBuffer;
-                if (delegate->usageCallback)
-                    delegate->usageCallback(*delegate->view, *newBuffer);
+                if (delegate->usageCallbacks)
+                    for (auto &callback : *delegate->usageCallbacks)
+                        callback(*delegate->view, *newBuffer);
             }

             newBuffer->delegates.splice(newBuffer->delegates.end(), srcBuffer->delegates);

View File

@@ -324,7 +324,7 @@ namespace skyline::gpu::interconnect {
         textureManagerLock.reset();

         for (const auto &delegate : attachedBufferDelegates) {
-            delegate->usageCallback = nullptr;
+            delegate->usageCallbacks.reset();
             delegate->attached = false;
             delegate->view->megaBufferAllocation = {};
         }
@@ -333,6 +333,7 @@ namespace skyline::gpu::interconnect {
         attachedBuffers.clear();
         bufferManagerLock.reset();
         megaBufferAllocatorLock.reset();
+        allocator.Reset();
     }

     void CommandExecutor::Submit() {
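Note the teardown order these two hunks establish: every arena-backed container is destroyed first (the usageCallbacks.reset() in the loop above), and only then does allocator.Reset() recycle the backing memory. A minimal sketch of that constraint, reusing the hypothetical ArenaState/ArenaAllocator from earlier:

#include <optional>
#include <vector>

ArenaState allocator; // Hypothetical stand-in for LinearAllocatorState<>
std::optional<std::vector<int, ArenaAllocator<int>>> perExecutionData;

void EndExecution() {
    perExecutionData.reset(); // Destroy the container while its storage is still valid...
    allocator.Reset();        // ...then hand the whole block back for the next execution
}

Resetting the arena first would leave the container's destructor, and any later access, pointing at memory the allocator has already handed back out.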

View File

@@ -5,6 +5,7 @@
 #include <boost/container/stable_vector.hpp>
 #include <unordered_set>
+#include <common/linear_allocator.h>
 #include <gpu/megabuffer.h>
 #include "command_nodes.h"
@@ -98,6 +99,7 @@ namespace skyline::gpu::interconnect {
       public:
        std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
+        LinearAllocatorState<> allocator;
        ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag

        CommandExecutor(const DeviceState &state);

View File

@@ -1122,7 +1122,7 @@ namespace skyline::gpu::interconnect {
                         .range = view->view->size
                     };
                 } else {
-                    view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                    view.RegisterUsage(executor.allocator, executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                         *descriptor = vk::DescriptorBufferInfo{
                             .buffer = buffer->GetBacking(),
                             .offset = view.offset,
@@ -1157,7 +1157,7 @@ namespace skyline::gpu::interconnect {
                 if (storageBuffer.is_written)
                     view->buffer->MarkGpuDirty();

-                view.RegisterUsage(executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                view.RegisterUsage(executor.allocator, executor.cycle, [descriptor = bufferDescriptors.data() + bufferIndex++](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                     *descriptor = vk::DescriptorBufferInfo{
                         .buffer = buffer->GetBacking(),
                         .offset = view.offset,
@@ -2829,14 +2829,14 @@ namespace skyline::gpu::interconnect {
             auto indexBufferView{GetIndexBuffer(count)};
             executor.AttachBuffer(indexBufferView);

-            boundIndexBuffer = std::make_shared<BoundIndexBuffer>();
+            boundIndexBuffer = std::allocate_shared<BoundIndexBuffer, LinearAllocator<BoundIndexBuffer>>(executor.allocator);
             boundIndexBuffer->type = indexBuffer.type;

             if (auto megaBufferAllocation{indexBufferView.AcquireMegaBuffer(executor.cycle, executor.AcquireMegaBufferAllocator())}) {
                 // If the buffer is megabuffered then since we don't get our data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage
                 boundIndexBuffer->handle = megaBufferAllocation.buffer;
                 boundIndexBuffer->offset = megaBufferAllocation.offset;
             } else {
-                indexBufferView.RegisterUsage(executor.cycle, [=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                indexBufferView.RegisterUsage(executor.allocator, executor.cycle, [=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                     boundIndexBuffer->handle = buffer->GetBacking();
                     boundIndexBuffer->offset = view.offset;
                 });
@@ -2858,7 +2858,7 @@ namespace skyline::gpu::interconnect {
                 std::array<vk::Buffer, maxwell3d::VertexBufferCount> handles{};
                 std::array<vk::DeviceSize, maxwell3d::VertexBufferCount> offsets{};
             };
-            auto boundVertexBuffers{std::make_shared<BoundVertexBuffers>()};
+            auto boundVertexBuffers{std::allocate_shared<BoundVertexBuffers, LinearAllocator<BoundVertexBuffers>>(executor.allocator)};

             boost::container::static_vector<vk::VertexInputBindingDescription, maxwell3d::VertexBufferCount> vertexBindingDescriptions{};
             boost::container::static_vector<vk::VertexInputBindingDivisorDescriptionEXT, maxwell3d::VertexBufferCount> vertexBindingDivisorsDescriptions{};
@@ -2877,7 +2877,7 @@ namespace skyline::gpu::interconnect {
                     boundVertexBuffers->handles[index] = megaBufferAllocation.buffer;
                     boundVertexBuffers->offsets[index] = megaBufferAllocation.offset;
                 } else {
-                    vertexBufferView.RegisterUsage(executor.cycle, [handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
+                    vertexBufferView.RegisterUsage(executor.allocator, executor.cycle, [handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
                         *handle = buffer->GetBacking();
                         *offset = view.offset;
                     });
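The std::allocate_shared calls above are the other half of the change: instead of make_shared touching the heap once per draw for index/vertex binding state, the shared_ptr control block and the payload land in the executor's arena together. A sketch with the hypothetical ArenaState/ArenaAllocator from earlier (plain integer fields standing in for the real vk::Buffer/vk::DeviceSize members):

#include <cstdint>
#include <memory>

struct BoundIndexBuffer {
    std::uint64_t handle{}; // Stands in for vk::Buffer
    std::uint64_t offset{}; // Stands in for vk::DeviceSize
};

// allocate_shared carves the control block and the object out of the arena in
// a single bump allocation; since deallocate() is a no-op, dropping the last
// shared_ptr only runs the destructor and the memory waits for Reset()
std::shared_ptr<BoundIndexBuffer> MakeBoundIndexBuffer(ArenaState &allocator) {
    return std::allocate_shared<BoundIndexBuffer, ArenaAllocator<BoundIndexBuffer>>(allocator);
}

This only holds because every such shared_ptr, including the copies captured by the RegisterUsage lambdas, is dropped before the executor's end-of-execution allocator.Reset(); keeping one alive across executions would leave it pointing into recycled memory.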