Rework Descriptor Set Allocation/Updates

A substantial amount of time was being spent creating and destroying `VkDescriptorSet`s, a cost which scales up on titles that perform a large number of draws with bindings. Those titles suffered from poor performance as frametime was dragged down by repeatedly creating and destroying descriptor sets of the same layouts.

This commit fixes that by pooling descriptor sets per-layout in a dynamically resizable pool and keeping them around rather than destroying them after use, so in the vast majority of cases a draw doesn't require creating a new descriptor set at all. This leads to significantly improved performance in cases where time would otherwise be spent on redundant destruction/recreation or on push descriptor updates, which took a substantial amount of time themselves.
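As a rough sketch of the scheme (illustrative only: `SetSlot`, `SetCache` and their helpers are invented names, not the types this commit introduces), the steady state collapses into a single atomic test-and-set per allocation, with the Vulkan driver only being touched on a cache miss:

#include <atomic>
#include <list>
#include <unordered_map>
#include <vulkan/vulkan.hpp> // Also relies on Vulkan-Hpp's std::hash specializations for handle types

// A cached descriptor set: allocated from the pool once, then recycled indefinitely
struct SetSlot {
    std::atomic_flag active; // Set while a draw owns this descriptor set (value-initialization clears it in C++20)
    vk::DescriptorSet set;
};

struct SetCache {
    vk::Device device;
    vk::DescriptorPool pool;
    // std::list keeps slot addresses stable so callers may hold SetSlot pointers
    std::unordered_map<vk::DescriptorSetLayout, std::list<SetSlot>> slots;

    SetSlot *Acquire(vk::DescriptorSetLayout layout) {
        auto &list{slots[layout]};
        for (auto &slot : list)
            if (!slot.active.test_and_set(std::memory_order_acq_rel))
                return &slot; // Steady state: reuse an idle set of the same layout without any Vulkan calls

        // Miss: allocate a new set of this layout and cache it for all future draws
        vk::DescriptorSetAllocateInfo info{};
        info.descriptorPool = pool;
        info.descriptorSetCount = 1;
        info.pSetLayouts = &layout;
        auto &slot{list.emplace_back()};
        slot.set = device.allocateDescriptorSets(info).front(); // Throws on failure; pool growth is omitted here
        slot.active.test_and_set(std::memory_order_acq_rel);
        return &slot;
    }

    static void Release(SetSlot *slot) {
        slot->active.clear(std::memory_order_release); // The set itself is never freed, only recycled
    }
};

The commit's real allocator additionally grows or reallocates the underlying pool when allocation fails, as shown in the first file below.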

Additionally, `BaseDescriptorSizes` was not kept up to date with all of the descriptor types. This never caused crashes on Adreno/Mali since both drivers only use these values for size calculations, but it has been corrected to avoid any future issues.
PixelyIon 2022-08-03 04:38:27 +05:30
parent e1a4325137
commit 8fc3cc7a16
3 changed files with 134 additions and 64 deletions

View File

@ -6,72 +6,120 @@
#include "descriptor_allocator.h"
namespace skyline::gpu {
DescriptorAllocator::DescriptorSetSlot::DescriptorSetSlot(vk::DescriptorSet descriptorSet) : descriptorSet{descriptorSet} {}
DescriptorAllocator::DescriptorSetSlot::DescriptorSetSlot(DescriptorAllocator::DescriptorSetSlot &&other) : descriptorSet{other.descriptorSet} {
other.descriptorSet = nullptr;
}
DescriptorAllocator::DescriptorPool::DescriptorPool(const vk::raii::Device &device, const vk::DescriptorPoolCreateInfo &createInfo) : vk::raii::DescriptorPool{device, createInfo}, freeSetCount{createInfo.maxSets} {}
void DescriptorAllocator::AllocateDescriptorPool() {
namespace maxwell3d = soc::gm20b::engine::maxwell3d::type; // We use Maxwell3D as reference for base descriptor counts
using DescriptorSizes = std::array<vk::DescriptorPoolSize, 5>;
constexpr DescriptorSizes BaseDescriptorSizes{
vk::DescriptorPoolSize{
.descriptorCount = maxwell3d::PipelineStageConstantBufferCount,
.type = vk::DescriptorType::eUniformBuffer,
},
vk::DescriptorPoolSize{
.descriptorCount = maxwell3d::PipelineStageCount * 5,
.type = vk::DescriptorType::eStorageBuffer,
},
vk::DescriptorPoolSize{
.descriptorCount = maxwell3d::PipelineStageCount * 5,
.type = vk::DescriptorType::eCombinedImageSampler,
},
vk::DescriptorPoolSize{
.descriptorCount = maxwell3d::PipelineStageCount,
.type = vk::DescriptorType::eStorageImage,
},
vk::DescriptorPoolSize{
.descriptorCount = maxwell3d::RenderTargetCount,
.type = vk::DescriptorType::eInputAttachment,
},
}; //!< A best-approximate ratio of the descriptor types that may be utilized; the total amount will grow while maintaining these ratios
DescriptorSizes descriptorSizes{BaseDescriptorSizes};
for (auto &descriptorSize : descriptorSizes)
descriptorSize.descriptorCount *= descriptorMultiplier;
pool = std::make_shared<DescriptorPool>(gpu.vkDevice, vk::DescriptorPoolCreateInfo{
.flags = vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet,
.maxSets = descriptorSetCount,
.pPoolSizes = descriptorSizes.data(),
.poolSizeCount = descriptorSizes.size(),
});
}
vk::ResultValue<vk::DescriptorSet> DescriptorAllocator::AllocateVkDescriptorSet(vk::DescriptorSetLayout layout) {
vk::DescriptorSetAllocateInfo allocateInfo{
.descriptorPool = **pool,
.pSetLayouts = &layout,
.descriptorSetCount = 1,
};
vk::DescriptorSet descriptorSet{};
auto result{(*gpu.vkDevice).allocateDescriptorSets(&allocateInfo, &descriptorSet, *gpu.vkDevice.getDispatcher())};
return vk::createResultValue(result, descriptorSet, __builtin_FUNCTION(), {
vk::Result::eSuccess,
vk::Result::eErrorOutOfPoolMemory,
vk::Result::eErrorFragmentedPool
});
}
DescriptorAllocator::ActiveDescriptorSet::ActiveDescriptorSet(std::shared_ptr<DescriptorPool> pPool, DescriptorSetSlot *slot) : pool{std::move(pPool)}, slot{slot} {
pool->freeSetCount--;
}
DescriptorAllocator::ActiveDescriptorSet::ActiveDescriptorSet(DescriptorAllocator::ActiveDescriptorSet &&other) noexcept {
pool = std::move(other.pool);
slot = std::exchange(other.slot, nullptr);
}
DescriptorAllocator::ActiveDescriptorSet::~ActiveDescriptorSet() {
if (slot) {
slot->active.clear(std::memory_order_release);
pool->freeSetCount++;
}
}
DescriptorAllocator::DescriptorAllocator(GPU &gpu) : gpu{gpu} {
AllocateDescriptorPool();
}
DescriptorAllocator::ActiveDescriptorSet DescriptorAllocator::AllocateSet(vk::DescriptorSetLayout layout) {
std::scoped_lock allocatorLock{mutex};
auto it{pool->layoutSlots.find(layout)};
vk::Result lastResult{};
if (it != pool->layoutSlots.end()) {
auto &slots{it->second};
for (auto &slot : slots)
if (!slot.active.test_and_set(std::memory_order_acq_rel))
return ActiveDescriptorSet{pool, &slot};
// If we couldn't find an available slot, we need to allocate a new one
auto set{AllocateVkDescriptorSet(layout)};
if (set.result == vk::Result::eSuccess) {
auto &slot{slots.emplace_back(set.value)};
return ActiveDescriptorSet{pool, &slot};
} else {
lastResult = set.result;
}
} else {
// If we couldn't find a layout, we need to allocate a new one
auto set{AllocateVkDescriptorSet(layout)};
if (set.result == vk::Result::eSuccess) {
auto &layoutSlots{pool->layoutSlots.try_emplace(layout).first->second};
return ActiveDescriptorSet{pool, &layoutSlots.emplace_back(set.value)};
} else {
lastResult = set.result;
}
}
while (true) {
// We attempt to modify the pool based on the last result
if (lastResult == vk::Result::eErrorOutOfPoolMemory) {
if (pool->freeSetCount == 0)
// The amount of maximum descriptor sets is insufficient
descriptorSetCount += DescriptorSetCountIncrement;
@ -79,12 +127,17 @@ namespace skyline::gpu {
// The amount of maximum descriptors is insufficient
descriptorMultiplier++;
AllocateDescriptorPool();
continue; // Attempt to allocate again with the new pool
} else if (lastResult == vk::Result::eErrorFragmentedPool) {
AllocateDescriptorPool(); // If the pool is fragmented, we reallocate without increasing the size
continue;
}
// Try to allocate a new descriptor set with the layout
auto set{AllocateVkDescriptorSet(layout)};
if (set.result == vk::Result::eSuccess) {
auto &layoutSlots{pool->layoutSlots.try_emplace(layout).first->second};
return ActiveDescriptorSet{pool, &layoutSlots.emplace_back(set.value)};
} else {
lastResult = set.result;
}
}
}
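For clarity, the retry policy above can be restated in isolation: an out-of-pool-memory failure grows whichever axis actually ran out, while a fragmented pool is simply recreated at its current size. A hedged sketch of just that decision (standalone; the function, its parameters, and the use of Skyline's `u32` alias are assumptions for illustration):

// Reshape the pool parameters after a failed allocation, before the pool is
// reallocated and the descriptor set allocation is retried
void GrowOnFailure(vk::Result lastResult, bool anySetsFree, u32 &descriptorSetCount, u32 &descriptorMultiplier, u32 setCountIncrement) {
    if (lastResult == vk::Result::eErrorOutOfPoolMemory) {
        if (!anySetsFree)
            descriptorSetCount += setCountIncrement; // Every set is occupied: the maximum set count is what ran out
        else
            descriptorMultiplier++; // Sets remain free, so the capacity of some descriptor type ran out instead
    }
    // vk::Result::eErrorFragmentedPool: the pool is recreated at the same size,
    // as a fresh pool is defragmented without growing either parameter
}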

View File

@ -3,7 +3,8 @@
#pragma once
#include "fence_cycle.h"
#include <vulkan/vulkan.hpp>
#include <common.h>
namespace skyline::gpu {
/**
@ -18,11 +19,24 @@ namespace skyline::gpu {
u32 descriptorSetCount{DescriptorSetCountIncrement}; //!< The maximum amount of descriptor sets in the pool
u32 descriptorMultiplier{1}; //!< A multiplier for the maximum amount of descriptors in the pool
/**
* @brief A slot representing a single descriptor set dynamically allocated from the pool
*/
struct DescriptorSetSlot {
std::atomic_flag active{true}; //!< If the descriptor is currently being utilized
vk::DescriptorSet descriptorSet; //!< The descriptor set allocated from the pool
DescriptorSetSlot(vk::DescriptorSet descriptorSet);
DescriptorSetSlot(DescriptorSetSlot &&other);
};
/**
* @brief A VkDescriptorPool that tracks the amount of free descriptor sets and caches the sets allocated from it per-layout
*/
struct DescriptorPool : public vk::raii::DescriptorPool {
std::atomic<u64> freeSetCount{}; //!< The amount of sets free to allocate from this pool
std::unordered_map<vk::DescriptorSetLayout, std::list<DescriptorSetSlot>> layoutSlots; //!< A map of pools based on the layout of the descriptor sets
DescriptorPool(vk::raii::Device const &device, vk::DescriptorPoolCreateInfo const &createInfo);
};
@ -35,35 +49,47 @@ namespace skyline::gpu {
*/
void AllocateDescriptorPool();
/**
* @brief Allocates a descriptor set with the specified layout from the pool
* @return A result code that is either `eSuccess`, `eErrorOutOfPoolMemory` or `eErrorFragmentedPool`
*/
vk::ResultValue<vk::DescriptorSet> AllocateVkDescriptorSet(vk::DescriptorSetLayout layout);
public:
/**
* @brief A RAII-bound descriptor set that automatically returns its slot to the pool on destruction, allowing the underlying descriptor set to be recycled
*/
struct ActiveDescriptorSet {
private:
friend DescriptorAllocator;
std::shared_ptr<DescriptorPool> pool;
DescriptorSetSlot *slot;
ActiveDescriptorSet(std::shared_ptr<DescriptorPool> pool, DescriptorSetSlot *slot);
public:
ActiveDescriptorSet(ActiveDescriptorSet &&other) noexcept;
/* Delete the copy constructor/assignment to prevent early freeing of the descriptor set */
ActiveDescriptorSet(const ActiveDescriptorSet &) = delete;
ActiveDescriptorSet &operator=(const ActiveDescriptorSet &) = delete;
~ActiveDescriptorSet();
vk::DescriptorSet &operator*() const {
return slot->descriptorSet;
}
};
DescriptorAllocator(GPU &gpu);
/**
* @brief Allocates a descriptor set from the pool with the supplied layout
* @note The layout object must be reused for equivalent layouts to avoid unnecessary descriptor set creation
* @note It is UB to allocate a set with a descriptor type that isn't in the pool as defined in AllocateDescriptorPool()
* @note The returned ActiveDescriptorSet **must** stay alive until the GPU is done with the descriptor set; it must not be destroyed as soon as it is bound, only after any associated commands have completed execution
*/
ActiveDescriptorSet AllocateSet(vk::DescriptorSetLayout layout);
};
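To make the lifetime contract of `AllocateSet` concrete, here's a hedged usage sketch modeled on the interconnect changes in the third file below; `DrawState` and `RecordDescriptorBind` are invented stand-ins for the interconnect's `DrawStorage` setup, and it's assumed the attached object satisfies whatever interface `FenceCycle::AttachObject` expects:

// Illustrative only: bind a recycled descriptor set and extend its lifetime until GPU completion
struct DrawState {
    std::vector<vk::WriteDescriptorSet> writes;
    DescriptorAllocator::ActiveDescriptorSet descriptorSet;
};

void RecordDescriptorBind(GPU &gpu, vk::raii::CommandBuffer &commandBuffer, vk::PipelineLayout pipelineLayout, vk::DescriptorSetLayout setLayout, std::vector<vk::WriteDescriptorSet> writes, const std::shared_ptr<FenceCycle> &cycle) {
    auto state{std::make_shared<DrawState>(DrawState{std::move(writes), gpu.descriptor.AllocateSet(setLayout)})};
    vk::DescriptorSet set{*state->descriptorSet};
    for (auto &write : state->writes)
        write.dstSet = set; // Point every write at the set we just claimed
    gpu.vkDevice.updateDescriptorSets(state->writes, nullptr);
    commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, pipelineLayout, 0, set, nullptr);
    cycle->AttachObject(state); // The slot is only released for reuse once the GPU has finished executing
}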

View File

@ -1893,7 +1893,7 @@ namespace skyline::gpu::interconnect {
public:
void SetPrimitiveTopology(maxwell3d::PrimitiveTopology topology) {
auto[vkTopology, shaderTopology, isQuad]{[topology]() -> std::tuple<vk::PrimitiveTopology, ShaderCompiler::InputTopology, bool> {
using MaxwellTopology = maxwell3d::PrimitiveTopology;
using VkTopology = vk::PrimitiveTopology;
using ShaderTopology = ShaderCompiler::InputTopology;
@ -1922,7 +1922,7 @@ namespace skyline::gpu::interconnect {
default:
throw exception("Unimplemented Maxwell3D Primitive Topology: {}", maxwell3d::ToString(topology));
}
}()};
inputAssemblyState.topology = vkTopology;
needsQuadConversion = isQuad;
@ -2844,7 +2844,7 @@ namespace skyline::gpu::interconnect {
}
} else if (needsQuadConversion) {
// Convert the guest-supplied quad list to an indexed triangle list
auto[bufferView, indexType, indexCount]{GetNonIndexedQuadConversionBuffer(count)};
executor.AttachBuffer(bufferView);
count = indexCount;
@ -2948,28 +2948,23 @@ namespace skyline::gpu::interconnect {
.depthStencilAttachment = depthRenderTargetView,
}, programState.descriptorSetBindings)};
// Descriptor Set Binding + Update Setup
struct DrawStorage {
ShaderProgramState::DescriptorSetWrites descriptorSetWrites;
DescriptorAllocator::ActiveDescriptorSet descriptorSet;
DrawStorage(ShaderProgramState::DescriptorSetWrites &&descriptorSetWrites, DescriptorAllocator::ActiveDescriptorSet &&descriptorSet) : descriptorSetWrites{std::move(descriptorSetWrites)}, descriptorSet{std::move(descriptorSet)} {}
};
std::shared_ptr<DrawStorage> drawStorage{};
if (!programState.descriptorSetWrites->empty()) {
drawStorage = std::make_shared<DrawStorage>(std::move(programState.descriptorSetWrites), gpu.descriptor.AllocateSet(compiledPipeline.descriptorSetLayout));
// We can't update the descriptor set here as the bindings might be retroactively updated by future draws
executor.AttachDependency(drawStorage);
}
// Submit Draw
executor.AddSubpass([=, drawStorage = std::move(drawStorage), pipelineLayout = compiledPipeline.pipelineLayout, pipeline = compiledPipeline.pipeline](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, GPU &, vk::RenderPass renderPass, u32 subpassIndex) mutable {
auto &vertexBufferHandles{boundVertexBuffers->handles};
for (u32 bindingIndex{}; bindingIndex != vertexBufferHandles.size(); bindingIndex++) {
// We need to bind all non-null vertex buffers while skipping any null ones
@ -2984,16 +2979,12 @@ namespace skyline::gpu::interconnect {
}
if (drawStorage) {
vk::DescriptorSet descriptorSet{*drawStorage->descriptorSet};
for (auto &descriptorSetWrite : *drawStorage->descriptorSetWrites)
descriptorSetWrite.dstSet = descriptorSet;
gpu.vkDevice.updateDescriptorSets(*drawStorage->descriptorSetWrites, nullptr);
cycle->AttachObject(drawStorage);
commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, pipelineLayout, 0, descriptorSet, nullptr);
}
commandBuffer.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline);