From 7b4da524451354e51e6896018cb608433b941486 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Sun, 18 Sep 2022 16:14:24 +0100 Subject: [PATCH] Add a fast binding sync path for when only one cbuf has changed SMO implements instanced draws by repeating the same draw just with a different constant buffer bound. Reduce the cost of this significantly by detecting such cases and instead of processing every descriptor, copy the previous descriptor set and update only the ones affected by the bound constant buffer. Credits to ripinperiperi for the initial idea and making me aware of how SMO does these draws --- .../maxwell_3d/constant_buffers.cpp | 15 ++ .../maxwell_3d/constant_buffers.h | 20 +++ .../interconnect/maxwell_3d/maxwell_3d.cpp | 4 + .../gpu/interconnect/maxwell_3d/maxwell_3d.h | 5 + .../maxwell_3d/pipeline_manager.cpp | 155 +++++++++++++----- .../maxwell_3d/pipeline_manager.h | 20 +++ .../skyline/soc/gm20b/engines/maxwell_3d.cpp | 6 +- 7 files changed, 186 insertions(+), 39 deletions(-) diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp index 4c1ae404..f01fbdbb 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp @@ -80,9 +80,24 @@ namespace skyline::gpu::interconnect::maxwell3d { void ConstantBuffers::Bind(InterconnectContext &ctx, engine::ShaderStage stage, size_t index) { auto &view{*selectorState.UpdateGet(ctx).view}; boundConstantBuffers[static_cast(stage)][index] = {view}; + + if (quickBindEnabled && quickBind) + DisableQuickBind(); // We can only quick bind one buffer per draw + else if (quickBindEnabled) + quickBind = QuickBind{index, stage}; } void ConstantBuffers::Unbind(engine::ShaderStage stage, size_t index) { boundConstantBuffers[static_cast(stage)][index] = {}; } + + void ConstantBuffers::ResetQuickBind() { + 
quickBindEnabled = true; + quickBind.reset(); + } + + void ConstantBuffers::DisableQuickBind() { + quickBindEnabled = false; + quickBind.reset(); + } } \ No newline at end of file diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.h index 91623139..3569f7e7 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.h @@ -52,6 +52,16 @@ namespace skyline::gpu::interconnect::maxwell3d { public: ConstantBufferSet boundConstantBuffers; + /** + * @brief Allows for a single constant buffer to be bound between two draws without requiring a full descriptor sync + */ + struct QuickBind { + size_t index; //!< The index of the constant buffer to bind + engine::ShaderStage stage; //!< The shader stage to bind the constant buffer to + }; + std::optional quickBind; + bool quickBindEnabled{}; //!< If quick binding can occur, if multiple bindings, constant buffer loads or other engines have been used since the last draw this is disabled + ConstantBuffers(DirtyManager &manager, const ConstantBufferSelectorState::EngineRegisters &constantBufferSelectorRegisters); void MarkAllDirty(); @@ -61,5 +71,15 @@ namespace skyline::gpu::interconnect::maxwell3d { void Bind(InterconnectContext &ctx, engine::ShaderStage stage, size_t index); void Unbind(engine::ShaderStage stage, size_t index); + + /** + * @brief Resets quick binding state to be ready to store a new bind, this should be called after every draw + */ + void ResetQuickBind(); + + /** + * @brief Disables quick binding, this should be called before any operation that could impact contents of bound constant buffers + */ + void DisableQuickBind(); }; } diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp index 380cff2f..ac06cf4f 100644 --- 
a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp @@ -88,6 +88,10 @@ namespace skyline::gpu::interconnect::maxwell3d { constantBuffers.Unbind(stage, index); } + void Maxwell3D::DisableQuickConstantBufferBind() { + constantBuffers.DisableQuickBind(); + } + void Maxwell3D::Clear(engine::ClearSurface &clearSurface) { auto scissor{GetClearScissor()}; if (scissor.extent.width == 0 || scissor.extent.height == 0) diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h index 0ab3ac3b..5bc713cb 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.h @@ -62,6 +62,11 @@ namespace skyline::gpu::interconnect::maxwell3d { */ void BindConstantBuffer(engine::ShaderStage stage, u32 index, bool enable); + /** + * @note See ConstantBuffers::DisableQuickBind + */ + void DisableQuickConstantBufferBind(); + void Clear(engine::ClearSurface &clearSurface); void Draw(engine::DrawTopology topology, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp index 9e16c220..6350a914 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp @@ -221,16 +221,22 @@ namespace skyline::gpu::interconnect::maxwell3d { Pipeline::DescriptorInfo descriptorInfo{}; u32 bindingIndex{}; - for (const auto &stage : shaderStages) { + for (size_t i{}; i < engine::ShaderStageCount; i++) { + const auto &stage{shaderStages[i]}; if (!stage.module) continue; - auto pushBindings{[&](vk::DescriptorType type, const auto &descs, u32 &count, bool 
individualDescWrites = false) { + auto &stageCbufUsage{descriptorInfo.cbufUsages[i]}; + + auto pushBindings{[&](vk::DescriptorType type, const auto &descs, u32 &count, auto &&descCb, bool individualDescWrites = false) { descriptorInfo.writeDescCount += individualDescWrites ? descs.size() : ((descs.size() > 0) ? 1 : 0); - for (const auto &desc : descs) { + for (u32 descIdx{}; descIdx < descs.size(); descIdx++) { + const auto &desc{descs[descIdx]}; count += desc.count; + descCb(desc, descIdx); + descriptorInfo.descriptorSetLayoutBindings.push_back(vk::DescriptorSetLayoutBinding{ .binding = bindingIndex++, .descriptorType = type, @@ -240,19 +246,32 @@ namespace skyline::gpu::interconnect::maxwell3d { } }}; - pushBindings(vk::DescriptorType::eUniformBuffer, stage.info.constant_buffer_descriptors, descriptorInfo.uniformBufferDescCount); - pushBindings(vk::DescriptorType::eStorageBuffer, stage.info.storage_buffers_descriptors, descriptorInfo.storageBufferDescCount); + pushBindings(vk::DescriptorType::eUniformBuffer, stage.info.constant_buffer_descriptors, descriptorInfo.uniformBufferDescCount, [&](const Shader::ConstantBufferDescriptor &desc, u32 descIdx) { + for (u32 cbufIdx{desc.index}; cbufIdx < desc.index + desc.count; cbufIdx++) { + auto &usage{stageCbufUsage[cbufIdx]}; + usage.uniformBuffers.push_back({bindingIndex, descIdx}); + usage.totalBufferDescCount += desc.count; + usage.writeDescCount++; + } + }); + pushBindings(vk::DescriptorType::eStorageBuffer, stage.info.storage_buffers_descriptors, descriptorInfo.storageBufferDescCount, [&](const Shader::StorageBufferDescriptor &desc, u32 descIdx) { + auto &usage{stageCbufUsage[desc.cbuf_index]}; + usage.storageBuffers.push_back({bindingIndex, descIdx}); + usage.totalBufferDescCount += desc.count; + usage.writeDescCount++; + }); descriptorInfo.totalBufferDescCount += descriptorInfo.uniformBufferDescCount + descriptorInfo.storageBufferDescCount; - pushBindings(vk::DescriptorType::eUniformTexelBuffer, 
stage.info.texture_buffer_descriptors, descriptorInfo.uniformTexelBufferDescCount); - pushBindings(vk::DescriptorType::eStorageTexelBuffer, stage.info.image_buffer_descriptors, descriptorInfo.storageTexelBufferDescCount); + pushBindings(vk::DescriptorType::eUniformTexelBuffer, stage.info.texture_buffer_descriptors, descriptorInfo.uniformTexelBufferDescCount, [](const auto &, u32) {}); + pushBindings(vk::DescriptorType::eStorageTexelBuffer, stage.info.image_buffer_descriptors, descriptorInfo.storageTexelBufferDescCount, [](const auto &, u32) {}); descriptorInfo.totalTexelBufferDescCount += descriptorInfo.uniformTexelBufferDescCount + descriptorInfo.storageTexelBufferDescCount; - pushBindings(vk::DescriptorType::eCombinedImageSampler, stage.info.texture_descriptors, descriptorInfo.combinedImageSamplerDescCount, needsIndividualTextureBindingWrites); - pushBindings(vk::DescriptorType::eStorageImage, stage.info.image_descriptors, descriptorInfo.storageImageDescCount); + pushBindings(vk::DescriptorType::eCombinedImageSampler, stage.info.texture_descriptors, descriptorInfo.combinedImageSamplerDescCount, [](const auto &, u32) {}, needsIndividualTextureBindingWrites); + pushBindings(vk::DescriptorType::eStorageImage, stage.info.image_descriptors, descriptorInfo.storageImageDescCount, [](const auto &, u32) {}); descriptorInfo.totalImageDescCount += descriptorInfo.combinedImageSamplerDescCount + descriptorInfo.storageImageDescCount; } + descriptorInfo.totalElemCount = descriptorInfo.totalBufferDescCount + descriptorInfo.totalTexelBufferDescCount + descriptorInfo.totalImageDescCount; return descriptorInfo; } @@ -542,6 +561,38 @@ namespace skyline::gpu::interconnect::maxwell3d { transitionCacheNextIdx = (transitionCacheNextIdx + 1) % transitionCache.size(); } + static DynamicBufferBinding GetConstantBufferBinding(InterconnectContext &ctx, const Shader::Info &info, BufferView view, size_t idx) { + ctx.executor.AttachBuffer(view); + + size_t 
sizeOverride{std::min(info.constant_buffer_used_sizes[idx], view.size)}; + if (auto megaBufferBinding{view.TryMegaBuffer(ctx.executor.cycle, ctx.executor.AcquireMegaBufferAllocator(), ctx.executor.executionNumber, sizeOverride)}) { + return megaBufferBinding; + } else { + view.GetBuffer()->BlockSequencedCpuBackingWrites(); + return view; + } + } + + static DynamicBufferBinding GetStorageBufferBinding(InterconnectContext &ctx, const Shader::Info &info, ConstantBuffer &cbuf, CachedMappedBufferView &cachedView, size_t idx) { + struct SsboDescriptor { + u64 address; + u32 size; + }; + + const auto &desc{info.storage_buffers_descriptors[idx]}; + auto ssbo{cbuf.Read(ctx.executor, desc.cbuf_offset)}; + cachedView.Update(ctx, ssbo.address, ssbo.size); + + auto view{cachedView.view}; + ctx.executor.AttachBuffer(view); + view.GetBuffer()->BlockSequencedCpuBackingWrites(); + + if (desc.is_written) + view.GetBuffer()->MarkGpuDirty(); + + return view; + } + // TODO: EXEC ID FOR STORAGE BUFS PURGE REMAP void Pipeline::SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers) { u32 bindingIdx{}; @@ -572,41 +623,69 @@ namespace skyline::gpu::interconnect::maxwell3d { for (size_t i{}; i < shaderStages.size(); i++) { const auto &stage{shaderStages[i]}; + if (!stage.module) + continue; + writeBufferDescs(vk::DescriptorType::eUniformBuffer, stage.info.constant_buffer_descriptors, descriptorInfo.uniformBufferDescCount, - [&](const Shader::ConstantBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) -> DynamicBufferBinding { + [&](const Shader::ConstantBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) { size_t cbufIdx{desc.index + arrayIdx}; - auto view{constantBuffers[i][cbufIdx].view}; - - ctx.executor.AttachBuffer(view); - - size_t sizeOverride{std::min(stage.info.constant_buffer_used_sizes[cbufIdx], view.size)}; - if (auto megaBufferBinding{view.TryMegaBuffer(ctx.executor.cycle, ctx.executor.AcquireMegaBufferAllocator(), ctx.executor.executionNumber, 
sizeOverride)}) { - return megaBufferBinding; - } else { - view.GetBuffer()->BlockSequencedCpuBackingWrites(); - return view; - } + return GetConstantBufferBinding(ctx, stage.info, constantBuffers[i][cbufIdx].view, cbufIdx); }); - writeBufferDescs(vk::DescriptorType::eStorageBuffer, stage.info.storage_buffers_descriptors, descriptorInfo.storageBufferDescCount, [&](const Shader::StorageBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) { - struct SsboDescriptor { - u64 address; - u32 size; - }; - - auto &cbuf{constantBuffers[i][desc.cbuf_index]}; - auto ssbo{cbuf.Read(ctx.executor, desc.cbuf_offset)}; - storageBufferViews[descIdx].Update(ctx, ssbo.address, ssbo.size); - - auto view{storageBufferViews[descIdx].view}; - ctx.executor.AttachBuffer(view); - - if (desc.is_written) - view.GetBuffer()->MarkGpuDirty(); - - return view; + writeBufferDescs(vk::DescriptorType::eStorageBuffer, stage.info.storage_buffers_descriptors, descriptorInfo.storageBufferDescCount, + [&](const Shader::StorageBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) { + return GetStorageBufferBinding(ctx, stage.info, constantBuffers[i][desc.cbuf_index], storageBufferViews[descIdx], descIdx); }); } } + + void Pipeline::SyncDescriptorsQuickBind(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, ConstantBuffers::QuickBind quickBind) { + const auto &cbufUsageInfo{descriptorInfo.cbufUsages[static_cast(quickBind.stage)][quickBind.index]}; + const auto &shaderInfo{shaderStages[static_cast(quickBind.stage)].info}; + auto &stageConstantBuffers{constantBuffers[static_cast(quickBind.stage)]}; + auto copy{ctx.executor.allocator.AllocateUntracked()}; + auto writes{ctx.executor.allocator.AllocateUntracked(cbufUsageInfo.writeDescCount)}; + size_t writeIdx{}; + size_t bufferIdx{}; + + auto bufferDescs{ctx.executor.allocator.AllocateUntracked(cbufUsageInfo.totalBufferDescCount)}; + auto bufferDescViews{ctx.executor.allocator.AllocateUntracked(cbufUsageInfo.totalBufferDescCount)}; + + // 
TODO: opt this to do partial copy + *copy = vk::CopyDescriptorSet{ + .srcBinding = 0, + .srcArrayElement = 0, + .dstBinding = 0, + .dstArrayElement = 0, + .descriptorCount = descriptorInfo.totalElemCount, + }; + + auto writeBufferDescs{[&](vk::DescriptorType type, const auto &usages, const auto &descs, u32 count, auto getBufferCb) { + for (const auto &usage : usages) { + const auto &shaderDesc{descs[usage.shaderDescIdx]}; + + writes[writeIdx++] = { + .dstBinding = usage.binding, + .descriptorCount = shaderDesc.count, + .descriptorType = type, + .pBufferInfo = &bufferDescs[bufferIdx], + }; + + for (size_t i{}; i < shaderDesc.count; i++) + bufferDescViews[bufferIdx++] = getBufferCb(shaderDesc, usage.shaderDescIdx, i); + } + }}; + + writeBufferDescs(vk::DescriptorType::eUniformBuffer, cbufUsageInfo.uniformBuffers, shaderInfo.constant_buffer_descriptors, descriptorInfo.uniformBufferDescCount, + [&](const Shader::ConstantBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) -> DynamicBufferBinding { + size_t cbufIdx{desc.index + arrayIdx}; + return GetConstantBufferBinding(ctx, shaderInfo, stageConstantBuffers[cbufIdx].view, cbufIdx); + }); + + writeBufferDescs(vk::DescriptorType::eStorageBuffer, cbufUsageInfo.storageBuffers, shaderInfo.storage_buffers_descriptors, descriptorInfo.storageBufferDescCount, + [&](const Shader::StorageBufferDescriptor &desc, size_t descIdx, size_t arrayIdx) { + return GetStorageBufferBinding(ctx, shaderInfo, stageConstantBuffers[desc.cbuf_index], storageBufferViews[bufferIdx], descIdx); + }); + } } diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h index 3ff02554..3782b7af 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h @@ -40,6 +40,24 @@ namespace skyline::gpu::interconnect::maxwell3d { u32 
combinedImageSamplerDescCount{}; u32 storageImageDescCount{}; u32 totalImageDescCount{}; + u32 totalElemCount{}; + + /** + * @brief Keeps track of all bindings that are dependent on a given constant buffer index to allow for quick binding + */ + struct ConstantBufferDescriptorUsages { + struct Usage { + u32 binding; //!< Vulkan binding index + u32 shaderDescIdx; //!< Index of the descriptor in the appropriate shader info member + }; + + boost::container::small_vector uniformBuffers; + boost::container::small_vector storageBuffers; + u32 totalBufferDescCount{}; + u32 writeDescCount{}; + }; + + std::array, engine::ShaderStageCount> cbufUsages{}; }; private: @@ -61,6 +79,8 @@ namespace skyline::gpu::interconnect::maxwell3d { void AddTransition(Pipeline *next); void SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers); + + void SyncDescriptorsQuickBind(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, ConstantBuffers::QuickBind quickBind); }; class PipelineManager { diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp index 6f8f9587..76319d10 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp @@ -107,6 +107,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d { #undef LOAD_CONSTANT_BUFFER_CALLBACKS default: // When a method other than constant buffer update is called submit our submit the previously built-up update as a batch + interconnect.DisableQuickConstantBufferBind(); interconnect.LoadConstantBuffer(batchLoadConstantBuffer.buffer, batchLoadConstantBuffer.Invalidate()); batchLoadConstantBuffer.Reset(); break; // Continue on here to handle the actual method @@ -177,6 +178,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d { }) ENGINE_STRUCT_CASE(i2m, launchDma, { + FlushEngineState(); i2m.LaunchDma(*registers.i2m); }) @@ -258,7 +260,7 @@ namespace 
skyline::soc::gm20b::engine::maxwell3d { #define PIPELINE_CALLBACKS(z, idx, data) \ ENGINE_ARRAY_STRUCT_CASE(bindGroups, idx, constantBuffer, { \ - interconnect.BindConstantBuffer(static_cast(idx), constantBuffer.shaderSlot, constantBuffer.valid); \ + interconnect.BindConstantBuffer(static_cast(idx), constantBuffer.shaderSlot, constantBuffer.valid); \ }) BOOST_PP_REPEAT(5, PIPELINE_CALLBACKS, 0) @@ -297,6 +299,8 @@ namespace skyline::soc::gm20b::engine::maxwell3d { interconnect.LoadConstantBuffer(batchLoadConstantBuffer.buffer, batchLoadConstantBuffer.Invalidate()); batchLoadConstantBuffer.Reset(); } + + interconnect.DisableQuickConstantBufferBind(); } __attribute__((always_inline)) void Maxwell3D::CallMethod(u32 method, u32 argument) {