Implement thread-safe MegaBuffer pool

We currently have a global `MegaBuffer` instance that is shared across all channels. This is very problematic, as `MegaBuffer` fundamentally works like a state machine with its allocations (especially resetting/freeing) and is thread-specific. Therefore, we now have a pool of several `MegaBuffer`s from which the `CommandExecutor` allocates one, keeping it channel-specific; this also limits its usage to a single thread and allows any allocations to be individually reset or freed.
This commit is contained in:
PixelyIon 2022-06-05 13:02:33 +05:30
parent 3e08494146
commit a5ca370c36
7 changed files with 126 additions and 79 deletions

View File

@ -267,7 +267,7 @@ namespace skyline::gpu {
return BufferView{shared_from_this(), &views.back()}; return BufferView{shared_from_this(), &views.back()};
} }
vk::DeviceSize Buffer::AcquireMegaBuffer() { vk::DeviceSize Buffer::AcquireMegaBuffer(MegaBuffer& megaBuffer) {
SynchronizeGuest(false, true); // First try and enable megabuffering by doing an immediate sync SynchronizeGuest(false, true); // First try and enable megabuffering by doing an immediate sync
if (!megaBufferingEnabled) if (!megaBufferingEnabled)
@ -278,7 +278,7 @@ namespace skyline::gpu {
if (megaBufferOffset) if (megaBufferOffset)
return megaBufferOffset; // If the current buffer contents haven't been changed since the last acquire, we can just return the existing offset return megaBufferOffset; // If the current buffer contents haven't been changed since the last acquire, we can just return the existing offset
megaBufferOffset = gpu.buffer.megaBuffer.Push(backing, true); // Buffers are required to be page aligned in the megabuffer megaBufferOffset = megaBuffer.Push(backing, true); // Buffers are required to be page aligned in the megabuffer
return megaBufferOffset; return megaBufferOffset;
} }
@ -370,8 +370,8 @@ namespace skyline::gpu {
bufferDelegate->buffer->Write(pCycle, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset); bufferDelegate->buffer->Write(pCycle, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
} }
vk::DeviceSize BufferView::AcquireMegaBuffer() const { vk::DeviceSize BufferView::AcquireMegaBuffer(MegaBuffer& megaBuffer) const {
vk::DeviceSize bufferOffset{bufferDelegate->buffer->AcquireMegaBuffer()}; vk::DeviceSize bufferOffset{bufferDelegate->buffer->AcquireMegaBuffer(megaBuffer)};
// Propagate 0 results since they signify that megabuffering isn't supported for a buffer // Propagate 0 results since they signify that megabuffering isn't supported for a buffer
if (bufferOffset) if (bufferOffset)

View File

@ -11,6 +11,7 @@ namespace skyline::gpu {
struct BufferView; struct BufferView;
class BufferManager; class BufferManager;
class MegaBuffer;
/** /**
* @brief A buffer which is backed by host constructs while being synchronized with the underlying guest buffer * @brief A buffer which is backed by host constructs while being synchronized with the underlying guest buffer
@ -234,7 +235,7 @@ namespace skyline::gpu {
* @note This will only push into the megabuffer when there have been modifications after the previous acquire, otherwise the previous offset will be reused * @note This will only push into the megabuffer when there have been modifications after the previous acquire, otherwise the previous offset will be reused
* @note An implicit CPU -> GPU sync will be performed when calling this, an immediate GPU -> CPU sync will also be attempted if the buffer is GPU dirty in the hope that megabuffering can be reenabled * @note An implicit CPU -> GPU sync will be performed when calling this, an immediate GPU -> CPU sync will also be attempted if the buffer is GPU dirty in the hope that megabuffering can be reenabled
*/ */
vk::DeviceSize AcquireMegaBuffer(); vk::DeviceSize AcquireMegaBuffer(MegaBuffer& megaBuffer);
/** /**
* @brief Forces the buffer contents to be pushed into the megabuffer on the next AcquireMegaBuffer call * @brief Forces the buffer contents to be pushed into the megabuffer on the next AcquireMegaBuffer call
@ -333,7 +334,7 @@ namespace skyline::gpu {
* @note The view **must** be locked prior to calling this * @note The view **must** be locked prior to calling this
* @note See Buffer::AcquireMegaBuffer * @note See Buffer::AcquireMegaBuffer
*/ */
vk::DeviceSize AcquireMegaBuffer() const; vk::DeviceSize AcquireMegaBuffer(MegaBuffer& megaBuffer) const;
/** /**
* @return A span of the backing buffer contents * @return A span of the backing buffer contents

View File

@ -6,39 +6,7 @@
#include "buffer_manager.h" #include "buffer_manager.h"
namespace skyline::gpu { namespace skyline::gpu {
MegaBuffer::MegaBuffer(GPU &gpu) : backing(gpu.memory.AllocateBuffer(Size)), freeRegion(backing.subspan(PAGE_SIZE)) {} BufferManager::BufferManager(GPU &gpu) : gpu(gpu) {}
void MegaBuffer::Reset() {
std::scoped_lock lock{mutex};
freeRegion = backing.subspan(PAGE_SIZE);
}
vk::Buffer MegaBuffer::GetBacking() const {
return backing.vkBuffer;
}
vk::DeviceSize MegaBuffer::Push(span<u8> data, bool pageAlign) {
std::scoped_lock lock{mutex};
if (data.size() > freeRegion.size())
throw exception("Ran out of megabuffer space! Alloc size: 0x{:X}", data.size());
if (pageAlign) {
// If page aligned data was requested then align the free
auto alignedFreeBase{util::AlignUp(static_cast<size_t>(freeRegion.data() - backing.data()), PAGE_SIZE)};
freeRegion = backing.subspan(alignedFreeBase);
}
// Allocate space for data from the free region
auto resultSpan{freeRegion.subspan(0, data.size())};
resultSpan.copy_from(data);
// Move the free region along
freeRegion = freeRegion.subspan(data.size());
return static_cast<vk::DeviceSize>(resultSpan.data() - backing.data());
}
BufferManager::BufferManager(GPU &gpu) : gpu(gpu), megaBuffer(gpu) {}
bool BufferManager::BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer) { bool BufferManager::BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer) {
return it->guest->begin().base() < pointer; return it->guest->begin().base() < pointer;
@ -109,4 +77,58 @@ namespace skyline::gpu {
return newBuffer->GetView(static_cast<vk::DeviceSize>(guestMapping.begin() - newBuffer->guest->begin()) + offset, size); return newBuffer->GetView(static_cast<vk::DeviceSize>(guestMapping.begin() - newBuffer->guest->begin()) + offset, size);
} }
/**
 * @brief Creates a pool slot by allocating the GPU buffer that backs it
 * @param gpu The GPU instance whose memory manager supplies the allocation of MegaBufferSlot::Size bytes
 */
BufferManager::MegaBufferSlot::MegaBufferSlot(GPU &gpu)
    : backing(gpu.memory.AllocateBuffer(Size)) {}
/**
 * @brief Binds this megabuffer to a pool slot and starts with everything past the first page of its backing marked as free
 * @param slot The slot providing the backing storage, it is held by reference so it must outlive this object
 * @note The slot's `active` flag is not touched here, it is only cleared again by the destructor
 */
MegaBuffer::MegaBuffer(BufferManager::MegaBufferSlot &slot)
    : slot{slot},
      freeRegion{slot.backing.subspan(PAGE_SIZE)} {}
/**
 * @brief Returns the underlying slot to the pool by clearing its `active` flag
 * @note The release ordering pairs with the acquire-release test_and_set performed in BufferManager::AcquireMegaBuffer
 */
MegaBuffer::~MegaBuffer() {
    slot.active.clear(std::memory_order_release);
}
/**
 * @brief Rewinds the free region so that it once again covers the backing from its second page onwards
 * @note Nothing is erased, previously pushed data stays in the backing until later pushes overwrite it
 * @note The first page is never handed out, presumably so that an offset of 0 is never returned by Push as callers treat 0 as "not megabuffered"
 */
void MegaBuffer::Reset() {
    freeRegion = slot.backing.subspan(PAGE_SIZE);
}
/**
 * @return The Vulkan buffer handle of the slot's backing storage, offsets returned by Push are relative to the start of this buffer
 */
vk::Buffer MegaBuffer::GetBacking() const {
    return slot.backing.vkBuffer;
}
/**
 * @brief Copies the supplied data into the next free part of the megabuffer and advances the free region past it
 * @param data The bytes to copy into the megabuffer
 * @param pageAlign If the allocation should begin at a PAGE_SIZE-aligned offset from the start of the backing
 * @return The offset of the copied data from the start of the backing buffer
 * @throws exception If the data doesn't fit into the space that remains once any alignment padding has been skipped
 * @note The free region is only updated after the allocation is known to fit, a failed push leaves the megabuffer state untouched
 */
vk::DeviceSize MegaBuffer::Push(span<u8> data, bool pageAlign) {
    auto candidateRegion{freeRegion};
    if (pageAlign) {
        // Page aligned data was requested so move the start of the candidate region up to the next page boundary
        auto alignedFreeBase{util::AlignUp(static_cast<size_t>(freeRegion.data() - slot.backing.data()), PAGE_SIZE)};
        candidateRegion = slot.backing.subspan(alignedFreeBase);
    }

    // The capacity check has to follow the alignment, the padding skipped above shrinks the usable space and checking beforehand could let the allocation run past the end of the region
    if (data.size() > candidateRegion.size())
        throw exception("Ran out of megabuffer space! Alloc size: 0x{:X}", data.size());

    // Allocate space for the data from the front of the candidate region
    auto resultSpan{candidateRegion.subspan(0, data.size())};
    resultSpan.copy_from(data);

    // Commit the allocation by moving the free region along
    freeRegion = candidateRegion.subspan(data.size());

    return static_cast<vk::DeviceSize>(resultSpan.data() - slot.backing.data());
}
/**
 * @brief Hands out a megabuffer from the pool, reusing an idle slot where possible and growing the pool otherwise
 * @param cycle The fence cycle recorded on the chosen slot, it is polled on later acquisitions before the slot is reused
 * @return A MegaBuffer bound to the chosen slot, destroying it clears the slot's `active` flag again
 * @note A slot is only reused when it isn't flagged active and its recorded cycle reports completion through Poll() (presumably meaning the GPU is done with it), a slot without a recorded cycle is treated as idle rather than being dereferenced
 */
MegaBuffer BufferManager::AcquireMegaBuffer(const std::shared_ptr<FenceCycle> &cycle) {
    std::lock_guard lock{mutex};

    for (auto &slot : megaBuffers) {
        // Tentatively claim the slot, if the flag was already set then another MegaBuffer currently owns it
        if (slot.active.test_and_set(std::memory_order_acq_rel))
            continue;

        if (!slot.cycle || slot.cycle->Poll()) {
            slot.cycle = cycle;
            return {slot};
        }

        // The previous user's cycle hasn't completed yet so undo the claim and keep looking
        slot.active.clear(std::memory_order_release);
    }

    // No reusable slot was found, new slots are constructed with their flag already set so no explicit claim is needed
    auto &freshSlot{megaBuffers.emplace_back(gpu)};
    freshSlot.cycle = cycle;
    return {freshSlot};
}
} }

View File

@ -6,19 +6,67 @@
#include "buffer.h" #include "buffer.h"
namespace skyline::gpu { namespace skyline::gpu {
class MegaBuffer;
/**
* @brief The Buffer Manager is responsible for maintaining a global view of buffers being mapped from the guest to the host, any lookups and creation of host buffers from equivalent guest buffers alongside reconciliation of any overlaps with existing textures
* @note It additionally owns the pool of megabuffer slots which are handed out through AcquireMegaBuffer
* @note NOTE(review): "existing textures" above may be intended to read "existing buffers", confirm against the implementation
*/
class BufferManager {
private:
GPU &gpu;
std::mutex mutex; //!< Synchronizes access to the buffer mappings, it is also held by AcquireMegaBuffer while the megabuffer pool is scanned or grown
std::vector<std::shared_ptr<Buffer>> buffers; //!< A sorted vector of all buffer mappings
friend class MegaBuffer; // MegaBuffer needs access to the private MegaBufferSlot type and its members
/**
* @brief A wrapper around a buffer which can be utilized as backing storage for a megabuffer and can track its state to avoid concurrent usage
*/
struct MegaBufferSlot {
// NOTE(review): initializing std::atomic_flag from `true` is not part of the standard interface and appears to rely on a standard library extension, confirm it is supported by the toolchain in use
std::atomic_flag active{true}; //!< If the megabuffer is currently being utilized, a slot is constructed as active since it is handed to a MegaBuffer immediately after creation
std::shared_ptr<FenceCycle> cycle; //!< The latest cycle on the fence, all waits must be performed through this
constexpr static vk::DeviceSize Size{100 * 1024 * 1024}; //!< Size in bytes of the megabuffer (100MiB)
memory::Buffer backing; //!< The GPU buffer as the backing storage for the megabuffer
/**
* @brief Allocates the backing buffer of Size bytes from the supplied GPU's memory manager
*/
MegaBufferSlot(GPU &gpu);
};
/**
* @return If the beginning of the supplied buffer's guest mapping is less than the supplied pointer
*/
static bool BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer);
public:
std::list<MegaBufferSlot> megaBuffers; //!< A pool of all allocated megabuffers, these are dynamically utilized; MegaBuffer objects hold references to these slots so they need to stay at a stable address as the pool grows
BufferManager(GPU &gpu);
/**
* @param cycle The fence cycle to associate with the slot backing the returned megabuffer, it is polled before that slot is reused by a later call
* @return A dynamically allocated megabuffer which can be used to store buffer modifications allowing them to be replayed in-sequence on the GPU
* @note This object **must** be destroyed to be reclaimed by the manager and prevent a memory leak
*/
MegaBuffer AcquireMegaBuffer(const std::shared_ptr<FenceCycle> &cycle);
/**
* @return A pre-existing or newly created Buffer object which covers the supplied mappings
*/
BufferView FindOrCreate(GuestBuffer guestMapping, const std::shared_ptr<FenceCycle> &cycle = nullptr);
};
/** /**
* @brief A simple linearly allocated GPU-side buffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU * @brief A simple linearly allocated GPU-side buffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
* @note This class is **not** thread-safe and any calls must be externally synchronized
*/ */
class MegaBuffer { class MegaBuffer {
private: private:
constexpr static vk::DeviceSize Size{0x6'400'000}; //!< Size in bytes of the megabuffer (100MiB) BufferManager::MegaBufferSlot &slot;
span<u8> freeRegion; //!< The unallocated space in the megabuffer
memory::Buffer backing; //!< The backing GPU buffer
std::mutex mutex; //!< Synchronizes access to freeRegion
span<u8> freeRegion; //!< Span of unallocated space in the megabuffer
public: public:
MegaBuffer(GPU &gpu); MegaBuffer(BufferManager::MegaBufferSlot &slot);
~MegaBuffer();
/** /**
* @brief Resets the free region of the megabuffer to its initial state, data is left intact but may be overwritten * @brief Resets the free region of the megabuffer to its initial state, data is left intact but may be overwritten
@ -36,29 +84,4 @@ namespace skyline::gpu {
*/ */
vk::DeviceSize Push(span<u8> data, bool pageAlign = false); vk::DeviceSize Push(span<u8> data, bool pageAlign = false);
}; };
/**
* @brief The Buffer Manager is responsible for maintaining a global view of buffers being mapped from the guest to the host, any lookups and creation of host buffer from equivalent guest buffer alongside reconciliation of any overlaps with existing textures
*/
class BufferManager {
private:
GPU &gpu;
std::mutex mutex; //!< Synchronizes access to the buffer mappings
std::vector<std::shared_ptr<Buffer>> buffers; //!< A sorted vector of all buffer mappings
/**
* @return If the end of the supplied buffer is less than the supplied pointer
*/
static bool BufferLessThan(const std::shared_ptr<Buffer> &it, u8 *pointer);
public:
MegaBuffer megaBuffer; //!< The megabuffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
BufferManager(GPU &gpu);
/**
* @return A pre-existing or newly created Buffer object which covers the supplied mappings
*/
BufferView FindOrCreate(GuestBuffer guestMapping, const std::shared_ptr<FenceCycle> &cycle = nullptr);
};
} }

View File

@ -5,7 +5,7 @@
#include "command_executor.h" #include "command_executor.h"
namespace skyline::gpu::interconnect { namespace skyline::gpu::interconnect {
CommandExecutor::CommandExecutor(const DeviceState &state) : gpu(*state.gpu), activeCommandBuffer(gpu.scheduler.AllocateCommandBuffer()), cycle(activeCommandBuffer.GetFenceCycle()) {} CommandExecutor::CommandExecutor(const DeviceState &state) : gpu(*state.gpu), activeCommandBuffer(gpu.scheduler.AllocateCommandBuffer()), cycle(activeCommandBuffer.GetFenceCycle()), megaBuffer(gpu.buffer.AcquireMegaBuffer(cycle)) {}
CommandExecutor::~CommandExecutor() { CommandExecutor::~CommandExecutor() {
cycle->Cancel(); cycle->Cancel();
@ -227,7 +227,7 @@ namespace skyline::gpu::interconnect {
cycle = activeCommandBuffer.Reset(); cycle = activeCommandBuffer.Reset();
gpu.buffer.megaBuffer.Reset(); megaBuffer.Reset();
} }
} }
} }

View File

@ -45,6 +45,7 @@ namespace skyline::gpu::interconnect {
public: public:
std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands std::shared_ptr<FenceCycle> cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands
MegaBuffer megaBuffer; //!< The megabuffer used to temporarily store buffer modifications allowing them to be replayed in-sequence on the GPU
CommandExecutor(const DeviceState &state); CommandExecutor(const DeviceState &state);

View File

@ -738,7 +738,7 @@ namespace skyline::gpu::interconnect {
void ConstantBufferUpdate(std::vector<u32> data, u32 offset) { void ConstantBufferUpdate(std::vector<u32> data, u32 offset) {
auto constantBuffer{GetConstantBufferSelector().value()}; auto constantBuffer{GetConstantBufferSelector().value()};
constantBuffer.Write<u32>(executor, gpu.buffer.megaBuffer, data, offset); constantBuffer.Write<u32>(executor, executor.megaBuffer, data, offset);
} }
/* Shader Program */ /* Shader Program */
@ -1110,10 +1110,10 @@ namespace skyline::gpu::interconnect {
auto view{pipelineStage.constantBuffers[constantBuffer.index].view}; auto view{pipelineStage.constantBuffers[constantBuffer.index].view};
std::scoped_lock lock(view); std::scoped_lock lock(view);
if (auto megaBufferOffset{view.AcquireMegaBuffer()}) { if (auto megaBufferOffset{view.AcquireMegaBuffer(executor.megaBuffer)}) {
// If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage // If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage
bufferDescriptors[bufferIndex] = vk::DescriptorBufferInfo{ bufferDescriptors[bufferIndex] = vk::DescriptorBufferInfo{
.buffer = gpu.buffer.megaBuffer.GetBacking(), .buffer = executor.megaBuffer.GetBacking(),
.offset = megaBufferOffset, .offset = megaBufferOffset,
.range = view->view->size .range = view->view->size
}; };
@ -2837,9 +2837,9 @@ namespace skyline::gpu::interconnect {
std::scoped_lock lock(indexBufferView); std::scoped_lock lock(indexBufferView);
boundIndexBuffer->type = indexBuffer.type; boundIndexBuffer->type = indexBuffer.type;
if (auto megaBufferOffset{indexBufferView.AcquireMegaBuffer()}) { if (auto megaBufferOffset{indexBufferView.AcquireMegaBuffer(executor.megaBuffer)}) {
// If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage // If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage
boundIndexBuffer->handle = gpu.buffer.megaBuffer.GetBacking(); boundIndexBuffer->handle = executor.megaBuffer.GetBacking();
boundIndexBuffer->offset = megaBufferOffset; boundIndexBuffer->offset = megaBufferOffset;
} else { } else {
indexBufferView.RegisterUsage([=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) { indexBufferView.RegisterUsage([=](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {
@ -2872,9 +2872,9 @@ namespace skyline::gpu::interconnect {
std::scoped_lock vertexBufferLock(vertexBufferView); std::scoped_lock vertexBufferLock(vertexBufferView);
if (auto megaBufferOffset{vertexBufferView.AcquireMegaBuffer()}) { if (auto megaBufferOffset{vertexBufferView.AcquireMegaBuffer(executor.megaBuffer)}) {
// If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage // If the buffer is megabuffered then since we don't get out data from the underlying buffer, rather the megabuffer which stays consistent throughout a single execution, we can skip registering usage
boundVertexBuffers->handles[index] = gpu.buffer.megaBuffer.GetBacking(); boundVertexBuffers->handles[index] = executor.megaBuffer.GetBacking();
boundVertexBuffers->offsets[index] = megaBufferOffset; boundVertexBuffers->offsets[index] = megaBufferOffset;
} else { } else {
vertexBufferView.RegisterUsage([handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) { vertexBufferView.RegisterUsage([handle = boundVertexBuffers->handles.data() + index, offset = boundVertexBuffers->offsets.data() + index](const Buffer::BufferViewStorage &view, const std::shared_ptr<Buffer> &buffer) {