Add a fast binding sync path for when only one cbuf has changed

SMO implements instanced draws by repeating the same draw with only a different constant buffer bound. Significantly reduce the cost of this by detecting such cases and, instead of processing every descriptor, copying the previous descriptor set and updating only the descriptors affected by the newly bound constant buffer.

Credits to ripinperiperi for the initial idea and making me aware of how SMO does these draws
This commit is contained in:
Billy Laws 2022-09-18 16:14:24 +01:00
parent 89edd9b303
commit 7b4da52445
7 changed files with 186 additions and 39 deletions

View File

@ -80,9 +80,24 @@ namespace skyline::gpu::interconnect::maxwell3d {
void ConstantBuffers::Bind(InterconnectContext &ctx, engine::ShaderStage stage, size_t index) {
    // Place the view currently produced by the selector state into the requested stage slot
    auto &selectedView{*selectorState.UpdateGet(ctx).view};
    boundConstantBuffers[static_cast<size_t>(stage)][index] = {selectedView};

    // Nothing to track while quick binding is switched off
    if (!quickBindEnabled)
        return;

    if (quickBind.has_value())
        DisableQuickBind(); // We can only quick bind one buffer per draw
    else
        quickBind = QuickBind{index, stage}; // First bind since the last reset, remember it as the quick bind candidate
}
void ConstantBuffers::Unbind(engine::ShaderStage stage, size_t index) {
    // Reset the slot to a default-constructed (empty) binding
    auto &stageBindings{boundConstantBuffers[static_cast<size_t>(stage)]};
    stageBindings[index] = {};
}
void ConstantBuffers::ResetQuickBind() {
    // Forget any recorded bind and allow the next Bind call to record a fresh one
    quickBind = std::nullopt;
    quickBindEnabled = true;
}
void ConstantBuffers::DisableQuickBind() {
    // Forget any recorded bind and stop Bind from recording new ones until ResetQuickBind is called
    quickBind = std::nullopt;
    quickBindEnabled = false;
}
}

View File

@ -52,6 +52,16 @@ namespace skyline::gpu::interconnect::maxwell3d {
public:
ConstantBufferSet boundConstantBuffers;
/**
* @brief Allows for a single constant buffer to be bound between two draws without requiring a full descriptor sync
* @note Recorded by ConstantBuffers::Bind when quick binding is enabled and no other bind has been recorded since the last reset
* @note Member order matters: ConstantBuffers::Bind constructs this as QuickBind{index, stage}
*/
struct QuickBind {
size_t index; //!< The index of the constant buffer to bind
engine::ShaderStage stage; //!< The shader stage to bind the constant buffer to
};
std::optional<QuickBind> quickBind;
bool quickBindEnabled{}; //!< If quick binding can occur, if multiple bindings, constant buffer loads or other engines have been used since the last draw this is disabled
ConstantBuffers(DirtyManager &manager, const ConstantBufferSelectorState::EngineRegisters &constantBufferSelectorRegisters);
void MarkAllDirty();
@ -61,5 +71,15 @@ namespace skyline::gpu::interconnect::maxwell3d {
void Bind(InterconnectContext &ctx, engine::ShaderStage stage, size_t index);
void Unbind(engine::ShaderStage stage, size_t index);
/**
* @brief Resets quick binding state to be ready to store a new bind, this should be called after every draw
*/
void ResetQuickBind();
/**
* @brief Disables quick binding, this should be called before any operation that could impact contents of bound constant buffers
*/
void DisableQuickBind();
};
}

View File

@ -88,6 +88,10 @@ namespace skyline::gpu::interconnect::maxwell3d {
constantBuffers.Unbind(stage, index);
}
void Maxwell3D::DisableQuickConstantBufferBind() {
// Forwards to ConstantBuffers::DisableQuickBind, which clears any recorded quick bind and stops new ones being recorded until ResetQuickBind is called
constantBuffers.DisableQuickBind();
}
void Maxwell3D::Clear(engine::ClearSurface &clearSurface) {
auto scissor{GetClearScissor()};
if (scissor.extent.width == 0 || scissor.extent.height == 0)

View File

@ -62,6 +62,11 @@ namespace skyline::gpu::interconnect::maxwell3d {
*/
void BindConstantBuffer(engine::ShaderStage stage, u32 index, bool enable);
/**
* @note See ConstantBuffers::DisableQuickBind
*/
void DisableQuickConstantBufferBind();
void Clear(engine::ClearSurface &clearSurface);
void Draw(engine::DrawTopology topology, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance);

View File

@ -221,16 +221,22 @@ namespace skyline::gpu::interconnect::maxwell3d {
Pipeline::DescriptorInfo descriptorInfo{};
u32 bindingIndex{};
for (const auto &stage : shaderStages) {
for (size_t i{}; i < engine::ShaderStageCount; i++) {
const auto &stage{shaderStages[i]};
if (!stage.module)
continue;
auto pushBindings{[&](vk::DescriptorType type, const auto &descs, u32 &count, bool individualDescWrites = false) {
auto &stageCbufUsage{descriptorInfo.cbufUsages[i]};
auto pushBindings{[&](vk::DescriptorType type, const auto &descs, u32 &count, auto &&descCb, bool individualDescWrites = false) {
descriptorInfo.writeDescCount += individualDescWrites ? descs.size() : ((descs.size() > 0) ? 1 : 0);
for (const auto &desc : descs) {
for (u32 descIdx{}; descIdx < descs.size(); descIdx++) {
const auto &desc{descs[descIdx]};
count += desc.count;
descCb(desc, descIdx);
descriptorInfo.descriptorSetLayoutBindings.push_back(vk::DescriptorSetLayoutBinding{
.binding = bindingIndex++,
.descriptorType = type,
@ -240,19 +246,32 @@ namespace skyline::gpu::interconnect::maxwell3d {
}
}};
pushBindings(vk::DescriptorType::eUniformBuffer, stage.info.constant_buffer_descriptors, descriptorInfo.uniformBufferDescCount);
pushBindings(vk::DescriptorType::eStorageBuffer, stage.info.storage_buffers_descriptors, descriptorInfo.storageBufferDescCount);
pushBindings(vk::DescriptorType::eUniformBuffer, stage.info.constant_buffer_descriptors, descriptorInfo.uniformBufferDescCount, [&](const Shader::ConstantBufferDescriptor &desc, u32 descIdx) {
for (u32 cbufIdx{desc.index}; cbufIdx < desc.index + desc.count; cbufIdx++) {
auto &usage{stageCbufUsage[cbufIdx]};
usage.uniformBuffers.push_back({bindingIndex, descIdx});
usage.totalBufferDescCount += desc.count;
usage.writeDescCount++;
}
});
pushBindings(vk::DescriptorType::eStorageBuffer, stage.info.storage_buffers_descriptors, descriptorInfo.storageBufferDescCount, [&](const Shader::StorageBufferDescriptor &desc, u32 descIdx) {
auto &usage{stageCbufUsage[desc.cbuf_index]};
usage.storageBuffers.push_back({bindingIndex, descIdx});
usage.totalBufferDescCount += desc.count;
usage.writeDescCount++;
});
descriptorInfo.totalBufferDescCount += descriptorInfo.uniformBufferDescCount + descriptorInfo.storageBufferDescCount;
pushBindings(vk::DescriptorType::eUniformTexelBuffer, stage.info.texture_buffer_descriptors, descriptorInfo.uniformTexelBufferDescCount);
pushBindings(vk::DescriptorType::eStorageTexelBuffer, stage.info.image_buffer_descriptors, descriptorInfo.storageTexelBufferDescCount);
pushBindings(vk::DescriptorType::eUniformTexelBuffer, stage.info.texture_buffer_descriptors, descriptorInfo.uniformTexelBufferDescCount, [](const auto &, u32) {});
pushBindings(vk::DescriptorType::eStorageTexelBuffer, stage.info.image_buffer_descriptors, descriptorInfo.storageTexelBufferDescCount, [](const auto &, u32) {});
descriptorInfo.totalTexelBufferDescCount += descriptorInfo.uniformTexelBufferDescCount + descriptorInfo.storageTexelBufferDescCount;
pushBindings(vk::DescriptorType::eCombinedImageSampler, stage.info.texture_descriptors, descriptorInfo.combinedImageSamplerDescCount, needsIndividualTextureBindingWrites);
pushBindings(vk::DescriptorType::eStorageImage, stage.info.image_descriptors, descriptorInfo.storageImageDescCount);
pushBindings(vk::DescriptorType::eCombinedImageSampler, stage.info.texture_descriptors, descriptorInfo.combinedImageSamplerDescCount, [](const auto &, u32) {}, needsIndividualTextureBindingWrites);
pushBindings(vk::DescriptorType::eStorageImage, stage.info.image_descriptors, descriptorInfo.storageImageDescCount, [](const auto &, u32) {});
descriptorInfo.totalImageDescCount += descriptorInfo.combinedImageSamplerDescCount + descriptorInfo.storageImageDescCount;
}
descriptorInfo.totalElemCount = descriptorInfo.totalBufferDescCount + descriptorInfo.totalTexelBufferDescCount + descriptorInfo.totalImageDescCount;
return descriptorInfo;
}
@ -542,6 +561,38 @@ namespace skyline::gpu::interconnect::maxwell3d {
transitionCacheNextIdx = (transitionCacheNextIdx + 1) % transitionCache.size();
}
// Produces the binding for constant buffer slot `idx`: attaches `view` to the executor, then returns a megabuffer binding
// when TryMegaBuffer yields one, otherwise blocks sequenced CPU backing writes on the buffer and returns the view itself
static DynamicBufferBinding GetConstantBufferBinding(InterconnectContext &ctx, const Shader::Info &info, BufferView view, size_t idx) {
    auto &executor{ctx.executor};
    executor.AttachBuffer(view);

    // Never request more than the view holds, even if the shader info reports a larger used size
    const size_t usedSize{std::min<size_t>(info.constant_buffer_used_sizes[idx], view.size)};

    if (auto megaBinding{view.TryMegaBuffer(executor.cycle, executor.AcquireMegaBufferAllocator(), executor.executionNumber, usedSize)})
        return megaBinding;

    // No megabuffer binding was produced, fall back to binding the view directly
    view.GetBuffer()->BlockSequencedCpuBackingWrites();
    return view;
}
// Produces the binding for storage buffer descriptor `idx`: reads the buffer's address and size out of `cbuf` at the
// descriptor's offset, refreshes `cachedView` with them and returns the resulting view after attaching it to the executor
static DynamicBufferBinding GetStorageBufferBinding(InterconnectContext &ctx, const Shader::Info &info, ConstantBuffer &cbuf, CachedMappedBufferView &cachedView, size_t idx) {
    // Layout of the record read out of the constant buffer
    struct SsboDescriptor {
        u64 address;
        u32 size;
    };

    const auto &ssboDesc{info.storage_buffers_descriptors[idx]};
    const auto ssboRecord{cbuf.Read<SsboDescriptor>(ctx.executor, ssboDesc.cbuf_offset)};
    cachedView.Update(ctx, ssboRecord.address, ssboRecord.size);

    auto view{cachedView.view};
    ctx.executor.AttachBuffer(view);
    view.GetBuffer()->BlockSequencedCpuBackingWrites();

    // Only descriptors flagged as written by the shader mark the buffer as GPU dirty
    if (ssboDesc.is_written)
        view.GetBuffer()->MarkGpuDirty();

    return view;
}
// TODO: EXEC ID FOR STORAGE BUFS PURGE REMAP
void Pipeline::SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers) {
u32 bindingIdx{};
@ -572,41 +623,69 @@ namespace skyline::gpu::interconnect::maxwell3d {
for (size_t i{}; i < shaderStages.size(); i++) {
const auto &stage{shaderStages[i]};
if (!stage.module)
continue;
writeBufferDescs(vk::DescriptorType::eUniformBuffer, stage.info.constant_buffer_descriptors, descriptorInfo.uniformBufferDescCount,
[&](const Shader::ConstantBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) -> DynamicBufferBinding {
[&](const Shader::ConstantBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) {
size_t cbufIdx{desc.index + arrayIdx};
auto view{constantBuffers[i][cbufIdx].view};
ctx.executor.AttachBuffer(view);
size_t sizeOverride{std::min<size_t>(stage.info.constant_buffer_used_sizes[cbufIdx], view.size)};
if (auto megaBufferBinding{view.TryMegaBuffer(ctx.executor.cycle, ctx.executor.AcquireMegaBufferAllocator(), ctx.executor.executionNumber, sizeOverride)}) {
return megaBufferBinding;
} else {
view.GetBuffer()->BlockSequencedCpuBackingWrites();
return view;
}
return GetConstantBufferBinding(ctx, stage.info, constantBuffers[i][cbufIdx].view, cbufIdx);
});
writeBufferDescs(vk::DescriptorType::eStorageBuffer, stage.info.storage_buffers_descriptors, descriptorInfo.storageBufferDescCount, [&](const Shader::StorageBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) {
struct SsboDescriptor {
u64 address;
u32 size;
writeBufferDescs(vk::DescriptorType::eStorageBuffer, stage.info.storage_buffers_descriptors, descriptorInfo.storageBufferDescCount,
[&](const Shader::StorageBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) {
return GetStorageBufferBinding(ctx, stage.info, constantBuffers[i][desc.cbuf_index], storageBufferViews[descIdx], descIdx);
});
}
}
// Builds descriptor updates for only the descriptors that depend on the single constant buffer slot named by `quickBind`,
// using the per-cbuf usage info stored in `descriptorInfo.cbufUsages`, alongside a copy covering the whole descriptor set
// NOTE(review): nothing visible in this function fills `bufferDescs` from `bufferDescViews` or consumes `copy`/`writes` — confirm where that is meant to happen
void Pipeline::SyncDescriptorsQuickBind(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, ConstantBuffers::QuickBind quickBind) {
// Usage info, shader info and bound constant buffers for the stage targeted by the quick bind
const auto &cbufUsageInfo{descriptorInfo.cbufUsages[static_cast<size_t>(quickBind.stage)][quickBind.index]};
const auto &shaderInfo{shaderStages[static_cast<size_t>(quickBind.stage)].info};
auto &stageConstantBuffers{constantBuffers[static_cast<size_t>(quickBind.stage)]};
// Scratch allocations sized from the usage info: one write per dependent descriptor, one buffer info/view per array element
auto copy{ctx.executor.allocator.AllocateUntracked<vk::CopyDescriptorSet>()};
auto writes{ctx.executor.allocator.AllocateUntracked<vk::WriteDescriptorSet>(cbufUsageInfo.writeDescCount)};
size_t writeIdx{};
size_t bufferIdx{};
auto bufferDescs{ctx.executor.allocator.AllocateUntracked<vk::DescriptorBufferInfo>(cbufUsageInfo.totalBufferDescCount)};
auto bufferDescViews{ctx.executor.allocator.AllocateUntracked<DynamicBufferBinding>(cbufUsageInfo.totalBufferDescCount)};
// TODO: opt this to do partial copy
// The copy starts at binding 0 / element 0 and spans every element of the descriptor set
*copy = vk::CopyDescriptorSet{
.srcBinding = 0,
.srcArrayElement = 0,
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorCount = descriptorInfo.totalElemCount,
};
// NOTE(review): `i` and `desc` are not declared in this scope — the next three lines look like leftovers from the storage buffer callback of SyncDescriptors; confirm and drop
auto &cbuf{constantBuffers[i][desc.cbuf_index]};
auto ssbo{cbuf.Read<SsboDescriptor>(ctx.executor, desc.cbuf_offset)};
storageBufferViews[descIdx].Update(ctx, ssbo.address, ssbo.size);
// Emits one descriptor write per usage and resolves one buffer binding per array element through getBufferCb
// NOTE(review): the `count` parameter is not referenced in the lambda body
auto writeBufferDescs{[&](vk::DescriptorType type, const auto &usages, const auto &descs, u32 count, auto getBufferCb) {
for (const auto &usage : usages) {
const auto &shaderDesc{descs[usage.shaderDescIdx]};
// NOTE(review): `descIdx` is not declared in this lambda — the next two lines look like leftovers as well; confirm and drop
auto view{storageBufferViews[descIdx].view};
ctx.executor.AttachBuffer(view);
// The write targets the usage's Vulkan binding and points at the next free slot of bufferDescs
writes[writeIdx++] = {
.dstBinding = usage.binding,
.descriptorCount = shaderDesc.count,
.descriptorType = type,
.pBufferInfo = &bufferDescs[bufferIdx],
};
// NOTE(review): `desc` and `view` are not declared by the surrounding new code — the next four lines look like leftovers; confirm and drop
if (desc.is_written)
view.GetBuffer()->MarkGpuDirty();
return view;
});
// One binding per array element of the shader descriptor, advancing bufferIdx as they are stored
for (size_t i{}; i < shaderDesc.count; i++)
bufferDescViews[bufferIdx++] = getBufferCb(shaderDesc, usage.shaderDescIdx, i);
}
}};
// Uniform buffers: each array element maps to constant buffer slot desc.index + arrayIdx of the quick bind's stage
writeBufferDescs(vk::DescriptorType::eUniformBuffer, cbufUsageInfo.uniformBuffers, shaderInfo.constant_buffer_descriptors, descriptorInfo.uniformBufferDescCount,
[&](const Shader::ConstantBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) -> DynamicBufferBinding {
size_t cbufIdx{desc.index + arrayIdx};
return GetConstantBufferBinding(ctx, shaderInfo, stageConstantBuffers[cbufIdx].view, cbufIdx);
});
// Storage buffers: the buffer address/size is read from the constant buffer referenced by desc.cbuf_index
// NOTE(review): storageBufferViews is indexed by `bufferIdx` here whereas SyncDescriptors indexes it by `descIdx` — confirm which index is intended
writeBufferDescs(vk::DescriptorType::eStorageBuffer, cbufUsageInfo.storageBuffers, shaderInfo.storage_buffers_descriptors, descriptorInfo.storageBufferDescCount,
[&](const Shader::StorageBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) {
return GetStorageBufferBinding(ctx, shaderInfo, stageConstantBuffers[desc.cbuf_index], storageBufferViews[bufferIdx], descIdx);
});
}
}

View File

@ -40,6 +40,24 @@ namespace skyline::gpu::interconnect::maxwell3d {
u32 combinedImageSamplerDescCount{};
u32 storageImageDescCount{};
u32 totalImageDescCount{};
u32 totalElemCount{};
/**
* @brief Keeps track of all bindings that are dependent on a given constant buffer index to allow for quick binding
* @note Filled in while the descriptor set layout bindings are generated and read back by Pipeline::SyncDescriptorsQuickBind
*/
struct ConstantBufferDescriptorUsages {
// Member order matters: entries are pushed as {bindingIndex, descIdx}
struct Usage {
u32 binding; //!< Vulkan binding index
u32 shaderDescIdx; //!< Index of the descriptor in the appropriate shader info member
};
boost::container::small_vector<Usage, 2> uniformBuffers; //!< Uniform buffer descriptors whose index range covers this constant buffer index
boost::container::small_vector<Usage, 2> storageBuffers; //!< Storage buffer descriptors whose cbuf_index is this constant buffer index
u32 totalBufferDescCount{}; //!< Sum of the descriptor counts of every usage above, sizes the buffer descriptor allocations made during a quick bind
u32 writeDescCount{}; //!< Number of usages above, sizes the descriptor write allocation made during a quick bind
};
std::array<std::array<ConstantBufferDescriptorUsages, engine::ShaderStageConstantBufferCount>, engine::ShaderStageCount> cbufUsages{};
};
private:
@ -61,6 +79,8 @@ namespace skyline::gpu::interconnect::maxwell3d {
void AddTransition(Pipeline *next);
void SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers);
void SyncDescriptorsQuickBind(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, ConstantBuffers::QuickBind quickBind);
};
class PipelineManager {

View File

@ -107,6 +107,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
#undef LOAD_CONSTANT_BUFFER_CALLBACKS
default:
// When a method other than constant buffer update is called, submit the previously built-up update as a batch
interconnect.DisableQuickConstantBufferBind();
interconnect.LoadConstantBuffer(batchLoadConstantBuffer.buffer, batchLoadConstantBuffer.Invalidate());
batchLoadConstantBuffer.Reset();
break; // Continue on here to handle the actual method
@ -177,6 +178,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
})
ENGINE_STRUCT_CASE(i2m, launchDma, {
FlushEngineState();
i2m.LaunchDma(*registers.i2m);
})
@ -297,6 +299,8 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
interconnect.LoadConstantBuffer(batchLoadConstantBuffer.buffer, batchLoadConstantBuffer.Invalidate());
batchLoadConstantBuffer.Reset();
}
interconnect.DisableQuickConstantBufferBind();
}
__attribute__((always_inline)) void Maxwell3D::CallMethod(u32 method, u32 argument) {