From 99bf7dbb36df00b8e34a4d7471e894c53709c2ab Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Sun, 12 Feb 2023 21:14:05 +0000 Subject: [PATCH] Implement usage based implicit renderpass barrier generation Full pipeline barriers between every RP can be extremely expensive on HW; by analysing the inputs and outputs of a draw it's possible to construct a much more optimal barrier that only syncs what is necessary. --- app/src/main/cpp/skyline/gpu/buffer.cpp | 7 +++ app/src/main/cpp/skyline/gpu/buffer.h | 5 ++ .../gpu/interconnect/command_executor.cpp | 8 +-- .../gpu/interconnect/command_executor.h | 4 +- .../gpu/interconnect/command_nodes.cpp | 44 +++++++++++----- .../skyline/gpu/interconnect/command_nodes.h | 3 +- .../skyline/gpu/interconnect/common/common.h | 2 +- .../gpu/interconnect/common/pipeline.inc | 25 +++++++-- .../cpp/skyline/gpu/interconnect/fermi_2d.cpp | 4 +- .../kepler_compute/kepler_compute.cpp | 12 ++++- .../kepler_compute/pipeline_manager.cpp | 17 +++++-- .../kepler_compute/pipeline_manager.h | 2 +- .../interconnect/maxwell_3d/active_state.cpp | 44 ++++++++++++---- .../interconnect/maxwell_3d/active_state.h | 18 ++++--- .../interconnect/maxwell_3d/maxwell_3d.cpp | 11 ++-- .../maxwell_3d/pipeline_manager.cpp | 51 ++++++++++++++++--- .../maxwell_3d/pipeline_manager.h | 6 ++- .../main/cpp/skyline/gpu/texture/texture.cpp | 27 ++++++++++ .../main/cpp/skyline/gpu/texture/texture.h | 11 ++++ 19 files changed, 237 insertions(+), 64 deletions(-) diff --git a/app/src/main/cpp/skyline/gpu/buffer.cpp b/app/src/main/cpp/skyline/gpu/buffer.cpp index a392a55e..f1f20e68 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.cpp +++ b/app/src/main/cpp/skyline/gpu/buffer.cpp @@ -633,6 +633,13 @@ namespace skyline::gpu { return mirror; } + void Buffer::PopulateReadBarrier(vk::PipelineStageFlagBits dstStage, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { + if (currentExecutionGpuDirty) { + srcStageMask |= vk::PipelineStageFlagBits::eAllCommands; 
+ dstStageMask |= dstStage; + } + } + void Buffer::lock() { mutex.lock(); accumulatedCpuLockCounter++; diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h index 5092a47e..b9bbfd37 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.h +++ b/app/src/main/cpp/skyline/gpu/buffer.h @@ -430,6 +430,11 @@ namespace skyline::gpu { * @note The buffer **must** be kept locked until the span is no longer in use */ span GetReadOnlyBackingSpan(bool isFirstUsage, const std::function &flushHostCallback); + + /** + * @brief Populates the input src and dst stage masks with appropriate read barrier parameters for the current buffer state + */ + void PopulateReadBarrier(vk::PipelineStageFlagBits dstStage, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask); }; /** diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp index 7274d20e..90e694ef 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp @@ -279,9 +279,9 @@ namespace skyline::gpu::interconnect { return (!a && !b) || (a && b && b->GetView() == a); } - bool CommandExecutor::CreateRenderPassWithSubpass(vk::Rect2D renderArea, span sampledImages, span inputAttachments, span colorAttachments, TextureView *depthStencilAttachment, bool noSubpassCreation) { + bool CommandExecutor::CreateRenderPassWithSubpass(vk::Rect2D renderArea, span sampledImages, span inputAttachments, span colorAttachments, TextureView *depthStencilAttachment, bool noSubpassCreation, vk::PipelineStageFlags srcStageMask, vk::PipelineStageFlags dstStageMask) { auto addSubpass{[&] { - renderPass->AddSubpass(inputAttachments, colorAttachments, depthStencilAttachment, gpu); + renderPass->AddSubpass(inputAttachments, colorAttachments, depthStencilAttachment, gpu, srcStageMask, dstStageMask); lastSubpassColorAttachments.clear(); 
lastSubpassInputAttachments.clear(); @@ -415,8 +415,8 @@ namespace skyline::gpu::interconnect { cycle->AttachObject(dependency); } - void CommandExecutor::AddSubpass(std::function &, GPU &, vk::RenderPass, u32)> &&function, vk::Rect2D renderArea, span sampledImages, span inputAttachments, span colorAttachments, TextureView *depthStencilAttachment, bool noSubpassCreation) { - bool gotoNext{CreateRenderPassWithSubpass(renderArea, sampledImages, inputAttachments, colorAttachments, depthStencilAttachment ? &*depthStencilAttachment : nullptr, noSubpassCreation)}; + void CommandExecutor::AddSubpass(std::function &, GPU &, vk::RenderPass, u32)> &&function, vk::Rect2D renderArea, span sampledImages, span inputAttachments, span colorAttachments, TextureView *depthStencilAttachment, bool noSubpassCreation, vk::PipelineStageFlags srcStageMask, vk::PipelineStageFlags dstStageMask) { + bool gotoNext{CreateRenderPassWithSubpass(renderArea, sampledImages, inputAttachments, colorAttachments, depthStencilAttachment ? 
&*depthStencilAttachment : nullptr, noSubpassCreation, srcStageMask, dstStageMask)}; if (gotoNext) slot->nodes.emplace_back(std::in_place_type_t(), std::forward(function)); else diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h index bf133f9a..d03679c6 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h @@ -190,7 +190,7 @@ namespace skyline::gpu::interconnect { * @note This also checks for subpass coalescing and will merge the new subpass with the previous one when possible * @return If the next subpass must be started prior to issuing any commands */ - bool CreateRenderPassWithSubpass(vk::Rect2D renderArea, span sampledImages, span inputAttachments, span colorAttachments, TextureView *depthStencilAttachment, bool noSubpassCreation = false); + bool CreateRenderPassWithSubpass(vk::Rect2D renderArea, span sampledImages, span inputAttachments, span colorAttachments, TextureView *depthStencilAttachment, bool noSubpassCreation = false, vk::PipelineStageFlags srcStageMask = {}, vk::PipelineStageFlags dstStageMask = {}); /** * @brief Ends a render pass if one is currently active and resets all corresponding state @@ -264,7 +264,7 @@ namespace skyline::gpu::interconnect { * @param exclusiveSubpass If this subpass should be the only subpass in a render pass * @note Any supplied texture should be attached prior and not undergo any persistent layout transitions till execution */ - void AddSubpass(std::function &, GPU &, vk::RenderPass, u32)> &&function, vk::Rect2D renderArea, span sampledImages, span inputAttachments = {}, span colorAttachments = {}, TextureView *depthStencilAttachment = {}, bool noSubpassCreation = false); + void AddSubpass(std::function &, GPU &, vk::RenderPass, u32)> &&function, vk::Rect2D renderArea, span sampledImages, span inputAttachments = {}, span colorAttachments = {}, 
TextureView *depthStencilAttachment = {}, bool noSubpassCreation = false, vk::PipelineStageFlags srcStageMask = {}, vk::PipelineStageFlags dstStageMask = {}); /** * @brief Adds a subpass that clears the entirety of the specified attachment with a color value, it may utilize VK_ATTACHMENT_LOAD_OP_CLEAR for a more efficient clear when possible diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.cpp index fb6cfe83..b193ae89 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.cpp @@ -2,21 +2,18 @@ // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/) #include "command_nodes.h" +#include "gpu/texture/texture.h" +#include namespace skyline::gpu::interconnect::node { - RenderPassNode::RenderPassNode(vk::Rect2D renderArea) : subpassDependencies( - { - // We assume all past commands have been executed when this RP starts - vk::SubpassDependency{ - .srcSubpass = VK_SUBPASS_EXTERNAL, - .dstSubpass = 0, - .srcStageMask = vk::PipelineStageFlagBits::eAllGraphics, - .dstStageMask = vk::PipelineStageFlagBits::eAllGraphics, - .srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite, - .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite, - } - } - ), renderArea(renderArea) {} + RenderPassNode::RenderPassNode(vk::Rect2D renderArea) + : externalDependency{vk::SubpassDependency{ + .srcSubpass = VK_SUBPASS_EXTERNAL, + .dstSubpass = 0, + .srcAccessMask = vk::AccessFlagBits::eMemoryWrite, + .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite, + }}, + renderArea{renderArea} {} u32 RenderPassNode::AddAttachment(TextureView *view, GPU &gpu) { auto vkView{view->GetView()}; @@ -42,6 +39,19 @@ namespace skyline::gpu::interconnect::node { .finalLayout = view->texture->layout, .flags = 
vk::AttachmentDescriptionFlagBits::eMayAlias }); + + if (auto usage{view->texture->GetLastRenderPassUsage()}; usage != texture::RenderPassUsage::None) { + if (view->format->vkAspect & vk::ImageAspectFlagBits::eColor) + externalDependency.dstStageMask |= vk::PipelineStageFlagBits::eColorAttachmentOutput; + else if (view->format->vkAspect & (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) + externalDependency.dstStageMask |= vk::PipelineStageFlagBits::eEarlyFragmentTests | vk::PipelineStageFlagBits::eLateFragmentTests; + + if (usage == texture::RenderPassUsage::RenderTarget) + externalDependency.srcStageMask |= externalDependency.dstStageMask; + else if (usage == texture::RenderPassUsage::Sampled) + externalDependency.srcStageMask |= vk::PipelineStageFlagBits::eAllGraphics; + } + return static_cast(attachments.size() - 1); } else { // If we've got a match from a previous subpass, we need to preserve the attachment till the current subpass @@ -116,7 +126,10 @@ namespace skyline::gpu::interconnect::node { } } - void RenderPassNode::AddSubpass(span inputAttachments, span colorAttachments, TextureView *depthStencilAttachment, GPU& gpu) { + void RenderPassNode::AddSubpass(span inputAttachments, span colorAttachments, TextureView *depthStencilAttachment, GPU &gpu, vk::PipelineStageFlags srcStageMask, vk::PipelineStageFlags dstStageMask) { + externalDependency.srcStageMask |= srcStageMask; + externalDependency.dstStageMask |= dstStageMask; + attachmentReferences.reserve(attachmentReferences.size() + inputAttachments.size() + colorAttachments.size() + (depthStencilAttachment ? 
1 : 0)); auto inputAttachmentsOffset{attachmentReferences.size() * sizeof(vk::AttachmentReference)}; @@ -225,6 +238,9 @@ namespace skyline::gpu::interconnect::node { preserveAttachmentIt++; } + if (externalDependency.srcStageMask && externalDependency.dstStageMask) + subpassDependencies.push_back(externalDependency); + auto renderPass{gpu.renderPassCache.GetRenderPass(vk::RenderPassCreateInfo{ .attachmentCount = static_cast(attachmentDescriptions.size()), .pAttachments = attachmentDescriptions.data(), diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h index e74b235d..dc2edc1b 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h @@ -48,6 +48,7 @@ namespace skyline::gpu::interconnect::node { public: std::vector subpassDescriptions; std::vector subpassDependencies; + vk::SubpassDependency externalDependency; vk::Rect2D renderArea; std::vector clearValues; @@ -63,7 +64,7 @@ namespace skyline::gpu::interconnect::node { /** * @brief Creates a subpass with the attachments bound in the specified order */ - void AddSubpass(span inputAttachments, span colorAttachments, TextureView *depthStencilAttachment, GPU& gpu); + void AddSubpass(span inputAttachments, span colorAttachments, TextureView *depthStencilAttachment, GPU &gpu, vk::PipelineStageFlags srcStageMask, vk::PipelineStageFlags dstStageMask); /** * @brief Clears a color attachment in the current subpass with VK_ATTACHMENT_LOAD_OP_CLEAR diff --git a/app/src/main/cpp/skyline/gpu/interconnect/common/common.h b/app/src/main/cpp/skyline/gpu/interconnect/common/common.h index 98b30499..059a2f1e 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/common/common.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/common/common.h @@ -44,7 +44,7 @@ namespace skyline::gpu::interconnect { u64 blockMappingEndAddr; //!< The end GPU address of `blockMapping` public: - 
BufferView view; //!< The buffer view created as a result of a call to `Update()` + BufferView view{}; //!< The buffer view created as a result of a call to `Update()` /** * @brief Updates `view` based on the supplied GPU mapping diff --git a/app/src/main/cpp/skyline/gpu/interconnect/common/pipeline.inc b/app/src/main/cpp/skyline/gpu/interconnect/common/pipeline.inc index 1fb39a26..c79db589 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/common/pipeline.inc +++ b/app/src/main/cpp/skyline/gpu/interconnect/common/pipeline.inc @@ -17,11 +17,16 @@ namespace skyline::gpu::interconnect { }; }; - static DynamicBufferBinding GetConstantBufferBinding(InterconnectContext &ctx, span cbufSizes, BufferView view, size_t idx) { + static DynamicBufferBinding GetConstantBufferBinding(InterconnectContext &ctx, + span cbufSizes, + BufferView view, size_t idx, + vk::PipelineStageFlagBits dstStage, + vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { if (!view) // Return a dummy buffer if the constant buffer isn't bound return BufferBinding{ctx.gpu.megaBufferAllocator.Allocate(ctx.executor.cycle, PAGE_SIZE).buffer, 0, PAGE_SIZE}; ctx.executor.AttachBuffer(view); + view.GetBuffer()->PopulateReadBarrier(dstStage, srcStageMask, dstStageMask); size_t sizeOverride{std::min(cbufSizes[idx], view.size)}; if (auto megaBufferBinding{view.TryMegaBuffer(ctx.executor.cycle, ctx.gpu.megaBufferAllocator, ctx.executor.executionTag, sizeOverride)}) { @@ -32,7 +37,10 @@ namespace skyline::gpu::interconnect { } } - static DynamicBufferBinding GetStorageBufferBinding(InterconnectContext &ctx, const auto &desc, ConstantBuffer &cbuf, CachedMappedBufferView &cachedView) { + static DynamicBufferBinding GetStorageBufferBinding(InterconnectContext &ctx, const auto &desc, + ConstantBuffer &cbuf, CachedMappedBufferView &cachedView, + vk::PipelineStageFlagBits dstStage, + vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { struct SsboDescriptor { u64 
address; u32 size; @@ -46,8 +54,14 @@ namespace skyline::gpu::interconnect { auto view{cachedView.view}; ctx.executor.AttachBuffer(view); + view.GetBuffer()->PopulateReadBarrier(dstStage, srcStageMask, dstStageMask); if (desc.is_written) { + if (view.GetBuffer()->SequencedCpuBackingWritesBlocked()) { + srcStageMask |= vk::PipelineStageFlagBits::eAllCommands; + dstStageMask |= dstStage; + } + view.GetBuffer()->MarkGpuDirty(); } else { if (auto megaBufferBinding{view.TryMegaBuffer(ctx.executor.cycle, ctx.gpu.megaBufferAllocator, ctx.executor.executionTag)}) @@ -77,11 +91,16 @@ namespace skyline::gpu::interconnect { return {.raw = primaryVal}; } - static std::pair GetTextureBinding(InterconnectContext &ctx, const auto &desc, Samplers &samplers, Textures &textures, BindlessHandle handle) { + static std::pair GetTextureBinding(InterconnectContext &ctx, const auto &desc, + Samplers &samplers, Textures &textures, + BindlessHandle handle, + vk::PipelineStageFlagBits dstStage, + vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { auto sampler{samplers.GetSampler(ctx, handle.samplerIndex, handle.textureIndex)}; auto texture{textures.GetTexture(ctx, handle.textureIndex, desc.type)}; ctx.executor.AttachTexture(texture); auto view{texture->GetView()}; + texture->texture->PopulateReadBarrier(dstStage, srcStageMask, dstStageMask); return { vk::DescriptorImageInfo{ diff --git a/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp index 5570d6a2..b543758e 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp @@ -149,7 +149,9 @@ namespace skyline::gpu::interconnect { [=](auto &&executionCallback) { auto dst{dstTextureView.get()}; std::array sampledImages{srcTextureView.get()}; - executor.AddSubpass(std::move(executionCallback), {{static_cast(dstRectX), static_cast(dstRectY)}, {dstRectWidth, dstRectHeight} }, sampledImages, 
{}, {dst}); + executor.AddSubpass(std::move(executionCallback), {{static_cast(dstRectX), static_cast(dstRectY)}, {dstRectWidth, dstRectHeight} }, + sampledImages, {}, {dst}, {}, false, + vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands); } ); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp index 1d3b7576..5fb41e14 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/kepler_compute.cpp @@ -37,7 +37,8 @@ namespace skyline::gpu::interconnect::kepler_compute { samplers.Update(ctx, qmd.samplerIndex == soc::gm20b::engine::kepler_compute::QMD::SamplerIndex::ViaHeaderIndex); auto *pipeline{pipelineState.Update(ctx, builder, textures, constantBuffers.boundConstantBuffers, qmd)}; - auto *descUpdateInfo{pipeline->SyncDescriptors(ctx, constantBuffers.boundConstantBuffers, samplers, textures)}; + vk::PipelineStageFlags srcStageMask{}, dstStageMask{}; + auto *descUpdateInfo{pipeline->SyncDescriptors(ctx, constantBuffers.boundConstantBuffers, samplers, textures, srcStageMask, dstStageMask)}; builder.SetPipeline(*pipeline->compiledPipeline.pipeline, vk::PipelineBindPoint::eCompute); if (ctx.gpu.traits.supportsPushDescriptors) { @@ -57,13 +58,20 @@ namespace skyline::gpu::interconnect::kepler_compute { struct DrawParams { StateUpdater stateUpdater; std::array dimensions; + vk::PipelineStageFlags srcStageMask, dstStageMask; }; - auto *drawParams{ctx.executor.allocator->EmplaceUntracked(DrawParams{stateUpdater, {qmd.ctaRasterWidth, qmd.ctaRasterHeight, qmd.ctaRasterDepth}})}; + auto *drawParams{ctx.executor.allocator->EmplaceUntracked(DrawParams{stateUpdater, {qmd.ctaRasterWidth, qmd.ctaRasterHeight, qmd.ctaRasterDepth}, srcStageMask, dstStageMask})}; ctx.executor.AddOutsideRpCommand([drawParams](vk::raii::CommandBuffer &commandBuffer, 
const std::shared_ptr &, GPU &gpu) { drawParams->stateUpdater.RecordAll(gpu, commandBuffer); + if (drawParams->srcStageMask && drawParams->dstStageMask) + commandBuffer.pipelineBarrier(drawParams->srcStageMask, drawParams->dstStageMask, {}, {vk::MemoryBarrier{ + .srcAccessMask = vk::AccessFlagBits::eMemoryWrite, + .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite + }}, {}, {}); + commandBuffer.dispatch(drawParams->dimensions[0], drawParams->dimensions[1], drawParams->dimensions[2]); }); } diff --git a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/pipeline_manager.cpp b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/pipeline_manager.cpp index 5bbcc11b..e8f10e37 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/pipeline_manager.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/pipeline_manager.cpp @@ -120,7 +120,7 @@ namespace skyline::gpu::interconnect::kepler_compute { } } - DescriptorUpdateInfo *Pipeline::SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures) { + DescriptorUpdateInfo *Pipeline::SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { SyncCachedStorageBufferViews(ctx.executor.executionTag); u32 writeIdx{}; @@ -175,12 +175,18 @@ namespace skyline::gpu::interconnect::kepler_compute { writeBufferDescs(vk::DescriptorType::eUniformBuffer, shaderStage.info.constant_buffer_descriptors, [&](const Shader::ConstantBufferDescriptor &desc, size_t arrayIdx) { size_t cbufIdx{desc.index + arrayIdx}; - return GetConstantBufferBinding(ctx, shaderStage.info.constant_buffer_used_sizes, constantBuffers[cbufIdx].view, cbufIdx); + return GetConstantBufferBinding(ctx, shaderStage.info.constant_buffer_used_sizes, + constantBuffers[cbufIdx].view, cbufIdx, + 
vk::PipelineStageFlagBits::eComputeShader, + srcStageMask, dstStageMask); }); writeBufferDescs(vk::DescriptorType::eStorageBuffer, shaderStage.info.storage_buffers_descriptors, [&](const Shader::StorageBufferDescriptor &desc, size_t arrayIdx) { - auto binding{GetStorageBufferBinding(ctx, desc, constantBuffers[desc.cbuf_index], storageBufferViews[storageBufferIdx])}; + auto binding{GetStorageBufferBinding(ctx, desc, constantBuffers[desc.cbuf_index], + storageBufferViews[storageBufferIdx], + vk::PipelineStageFlagBits::eComputeShader, + srcStageMask, dstStageMask)}; storageBufferIdx += arrayIdx ? 0 : 1; return binding; }); @@ -188,7 +194,10 @@ namespace skyline::gpu::interconnect::kepler_compute { writeImageDescs(vk::DescriptorType::eCombinedImageSampler, shaderStage.info.texture_descriptors, [&](const Shader::TextureDescriptor &desc, size_t arrayIdx) { BindlessHandle handle{ReadBindlessHandle(ctx, constantBuffers, desc, arrayIdx)}; - auto binding{GetTextureBinding(ctx, desc, samplers, textures, handle)}; + auto binding{GetTextureBinding(ctx, desc, + samplers, textures, handle, + vk::PipelineStageFlagBits::eComputeShader, + srcStageMask, dstStageMask)}; return binding.first; }); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/pipeline_manager.h b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/pipeline_manager.h index 1ca4f03a..96c0839e 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/pipeline_manager.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/kepler_compute/pipeline_manager.h @@ -55,7 +55,7 @@ namespace skyline::gpu::interconnect::kepler_compute { /** * @brief Creates a descriptor set update from the current GPU state */ - DescriptorUpdateInfo *SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures); + DescriptorUpdateInfo *SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures, 
vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask); }; class PipelineManager { diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.cpp index e1cbc2ce..a9145e08 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.cpp @@ -21,13 +21,14 @@ namespace skyline::gpu::interconnect::maxwell3d { VertexBufferState::VertexBufferState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine, u32 index) : engine{manager, dirtyHandle, engine}, index{index} {} - void VertexBufferState::Flush(InterconnectContext &ctx, StateUpdateBuilder &builder) { + void VertexBufferState::Flush(InterconnectContext &ctx, StateUpdateBuilder &builder, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { size_t size{engine->vertexStreamLimit - engine->vertexStream.location + 1}; if (engine->vertexStream.format.enable && engine->vertexStream.location != 0 && size) { view.Update(ctx, engine->vertexStream.location, size); if (*view) { ctx.executor.AttachBuffer(*view); + view->GetBuffer()->PopulateReadBarrier(vk::PipelineStageFlagBits::eVertexInput, srcStageMask, dstStageMask); if (megaBufferBinding = view->TryMegaBuffer(ctx.executor.cycle, ctx.gpu.megaBufferAllocator, ctx.executor.executionTag); megaBufferBinding) @@ -48,7 +49,10 @@ namespace skyline::gpu::interconnect::maxwell3d { builder.SetVertexBuffer(index, {ctx.gpu.megaBufferAllocator.Allocate(ctx.executor.cycle, 0).buffer}, ctx.gpu.traits.supportsExtendedDynamicState, engine->vertexStream.format.stride); } - bool VertexBufferState::Refresh(InterconnectContext &ctx, StateUpdateBuilder &builder) { + bool VertexBufferState::Refresh(InterconnectContext &ctx, StateUpdateBuilder &builder, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { + if (*view) + 
view->GetBuffer()->PopulateReadBarrier(vk::PipelineStageFlagBits::eVertexInput, srcStageMask, dstStageMask); + if (megaBufferBinding) { if (auto newMegaBufferBinding{view->TryMegaBuffer(ctx.executor.cycle, ctx.gpu.megaBufferAllocator, ctx.executor.executionTag)}; newMegaBufferBinding != megaBufferBinding) { @@ -117,7 +121,7 @@ namespace skyline::gpu::interconnect::maxwell3d { IndexBufferState::IndexBufferState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine) : engine{manager, dirtyHandle, engine} {} - void IndexBufferState::Flush(InterconnectContext &ctx, StateUpdateBuilder &builder, bool quadConversion, u32 firstIndex, u32 elementCount) { + void IndexBufferState::Flush(InterconnectContext &ctx, StateUpdateBuilder &builder, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask, bool quadConversion, u32 firstIndex, u32 elementCount) { usedElementCount = elementCount; usedFirstIndex = firstIndex; usedQuadConversion = quadConversion; @@ -130,6 +134,7 @@ namespace skyline::gpu::interconnect::maxwell3d { } ctx.executor.AttachBuffer(*view); + view->GetBuffer()->PopulateReadBarrier(vk::PipelineStageFlagBits::eVertexInput, srcStageMask, dstStageMask); indexType = ConvertIndexType(engine->indexBuffer.indexSize); @@ -144,7 +149,10 @@ namespace skyline::gpu::interconnect::maxwell3d { builder.SetIndexBuffer(*view, indexType); } - bool IndexBufferState::Refresh(InterconnectContext &ctx, StateUpdateBuilder &builder, bool quadConversion, u32 firstIndex, u32 elementCount) { + bool IndexBufferState::Refresh(InterconnectContext &ctx, StateUpdateBuilder &builder, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask, bool quadConversion, u32 firstIndex, u32 elementCount) { + if (*view) + view->GetBuffer()->PopulateReadBarrier(vk::PipelineStageFlagBits::eVertexInput, srcStageMask, dstStageMask); + if (elementCount > usedElementCount) return true; @@ -185,7 +193,7 @@ namespace 
skyline::gpu::interconnect::maxwell3d { TransformFeedbackBufferState::TransformFeedbackBufferState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine, u32 index) : engine{manager, dirtyHandle, engine}, index{index} {} - void TransformFeedbackBufferState::Flush(InterconnectContext &ctx, StateUpdateBuilder &builder) { + void TransformFeedbackBufferState::Flush(InterconnectContext &ctx, StateUpdateBuilder &builder, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { if (engine->streamOutEnable) { if (engine->streamOutBuffer.size) { view.Update(ctx, engine->streamOutBuffer.address + engine->streamOutBuffer.loadWritePointerStartOffset, engine->streamOutBuffer.size); @@ -193,6 +201,11 @@ namespace skyline::gpu::interconnect::maxwell3d { if (*view) { ctx.executor.AttachBuffer(*view); + if (view->GetBuffer()->SequencedCpuBackingWritesBlocked()) { + srcStageMask |= vk::PipelineStageFlagBits::eAllCommands; + dstStageMask |= vk::PipelineStageFlagBits::eTransformFeedbackEXT; + } + view->GetBuffer()->MarkGpuDirty(); builder.SetTransformFeedbackBuffer(index, *view); return; @@ -206,6 +219,15 @@ namespace skyline::gpu::interconnect::maxwell3d { } } + bool TransformFeedbackBufferState::Refresh(InterconnectContext &ctx, StateUpdateBuilder &builder, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { + if (*view && view->GetBuffer()->SequencedCpuBackingWritesBlocked()) { + srcStageMask |= vk::PipelineStageFlagBits::eAllCommands; + dstStageMask |= vk::PipelineStageFlagBits::eTransformFeedbackEXT; + } + + return false; + } + void TransformFeedbackBufferState::PurgeCaches() { view.PurgeCaches(); } @@ -408,18 +430,22 @@ namespace skyline::gpu::interconnect::maxwell3d { dirtyFunc(stencilValues); } - void ActiveState::Update(InterconnectContext &ctx, Textures &textures, ConstantBufferSet &constantBuffers, StateUpdateBuilder &builder, bool indexed, engine::DrawTopology topology, u32 drawFirstIndex, u32 
drawElementCount) { + void ActiveState::Update(InterconnectContext &ctx, Textures &textures, ConstantBufferSet &constantBuffers, StateUpdateBuilder &builder, + bool indexed, engine::DrawTopology topology, u32 drawFirstIndex, u32 drawElementCount, + vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { if (topology != directState.inputAssembly.GetPrimitiveTopology()) { directState.inputAssembly.SetPrimitiveTopology(topology); pipeline.MarkDirty(false); } auto updateFunc{[&](auto &stateElem, auto &&... args) { stateElem.Update(ctx, builder, args...); }}; + auto updateFuncBuffer{[&](auto &stateElem, auto &&... args) { stateElem.Update(ctx, builder, srcStageMask, dstStageMask, args...); }}; + pipeline.Update(ctx, textures, constantBuffers, builder); - ranges::for_each(vertexBuffers, updateFunc); + ranges::for_each(vertexBuffers, updateFuncBuffer); if (indexed) - updateFunc(indexBuffer, directState.inputAssembly.NeedsQuadConversion(), drawFirstIndex, drawElementCount); - ranges::for_each(transformFeedbackBuffers, updateFunc); + updateFuncBuffer(indexBuffer, directState.inputAssembly.NeedsQuadConversion(), drawFirstIndex, drawElementCount); + ranges::for_each(transformFeedbackBuffers, updateFuncBuffer); ranges::for_each(viewports, updateFunc); ranges::for_each(scissors, updateFunc); updateFunc(lineWidth); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.h index 78acd081..70c5aff0 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.h @@ -27,9 +27,9 @@ namespace skyline::gpu::interconnect::maxwell3d { public: VertexBufferState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine, u32 index); - void Flush(InterconnectContext &ctx, StateUpdateBuilder &builder); + void Flush(InterconnectContext &ctx, StateUpdateBuilder &builder, 
vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask); - bool Refresh(InterconnectContext &ctx, StateUpdateBuilder &builder); + bool Refresh(InterconnectContext &ctx, StateUpdateBuilder &builder, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask); void PurgeCaches(); }; @@ -54,14 +54,14 @@ namespace skyline::gpu::interconnect::maxwell3d { public: IndexBufferState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine); - void Flush(InterconnectContext &ctx, StateUpdateBuilder &builder, bool quadConversion, u32 firstIndex, u32 elementCount); + void Flush(InterconnectContext &ctx, StateUpdateBuilder &builder, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask, bool quadConversion, u32 firstIndex, u32 elementCount); - bool Refresh(InterconnectContext &ctx, StateUpdateBuilder &builder, bool quadConversion, u32 firstIndex, u32 elementCount); + bool Refresh(InterconnectContext &ctx, StateUpdateBuilder &builder, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask, bool quadConversion, u32 firstIndex, u32 elementCount); void PurgeCaches(); }; - class TransformFeedbackBufferState : dirty::CachedManualDirty { + class TransformFeedbackBufferState : dirty::CachedManualDirty, dirty::RefreshableManualDirty { public: struct EngineRegisters { const engine::StreamOutBuffer &streamOutBuffer; @@ -78,7 +78,9 @@ namespace skyline::gpu::interconnect::maxwell3d { public: TransformFeedbackBufferState(dirty::Handle dirtyHandle, DirtyManager &manager, const EngineRegisters &engine, u32 index); - void Flush(InterconnectContext &ctx, StateUpdateBuilder &builder); + void Flush(InterconnectContext &ctx, StateUpdateBuilder &builder, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask); + + bool Refresh(InterconnectContext &ctx, StateUpdateBuilder &builder, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask); void PurgeCaches(); }; 
@@ -258,7 +260,9 @@ namespace skyline::gpu::interconnect::maxwell3d { /** * @brief Updates the active state for a given draw operation, removing the dirtiness of all member states */ - void Update(InterconnectContext &ctx, Textures &textures, ConstantBufferSet &constantBuffers, StateUpdateBuilder &builder, bool indexed, engine::DrawTopology topology, u32 drawFirstIndex, u32 drawElementCount); + void Update(InterconnectContext &ctx, Textures &textures, ConstantBufferSet &constantBuffers, StateUpdateBuilder &builder, + bool indexed, engine::DrawTopology topology, u32 drawFirstIndex, u32 drawElementCount, + vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask); Pipeline *GetPipeline(); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp index 8b099fba..e42bfc12 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/maxwell_3d.cpp @@ -212,10 +212,11 @@ namespace skyline::gpu::interconnect::maxwell3d { void Maxwell3D::Draw(engine::DrawTopology topology, bool transformFeedbackEnable, bool indexed, u32 count, u32 first, u32 instanceCount, u32 vertexOffset, u32 firstInstance) { StateUpdateBuilder builder{*ctx.executor.allocator}; + vk::PipelineStageFlags srcStageMask{}, dstStageMask{}; Pipeline *oldPipeline{activeState.GetPipeline()}; samplers.Update(ctx, samplerBinding.value == engine::SamplerBinding::Value::ViaHeaderBinding); - activeState.Update(ctx, textures, constantBuffers.boundConstantBuffers, builder, indexed, topology, first, count); + activeState.Update(ctx, textures, constantBuffers.boundConstantBuffers, builder, indexed, topology, first, count, srcStageMask, dstStageMask); if (directState.inputAssembly.NeedsQuadConversion()) { count = conversion::quads::GetIndexCount(count); first = 0; @@ -231,17 +232,18 @@ namespace skyline::gpu::interconnect::maxwell3d { 
Pipeline *pipeline{activeState.GetPipeline()}; activeDescriptorSetSampledImages.resize(pipeline->GetTotalSampledImageCount()); + auto *descUpdateInfo{[&]() -> DescriptorUpdateInfo * { if (((oldPipeline == pipeline) || (oldPipeline && oldPipeline->CheckBindingMatch(pipeline))) && constantBuffers.quickBindEnabled) { // If bindings between the old and new pipelines are the same we can reuse the descriptor sets given that quick bind is enabled (meaning that no buffer updates or calls to non-graphics engines have occurred that could invalidate them) if (constantBuffers.quickBind) // If only a single constant buffer has been rebound between draws we can perform a partial descriptor update - return pipeline->SyncDescriptorsQuickBind(ctx, constantBuffers.boundConstantBuffers, samplers, textures, *constantBuffers.quickBind, activeDescriptorSetSampledImages); + return pipeline->SyncDescriptorsQuickBind(ctx, constantBuffers.boundConstantBuffers, samplers, textures, *constantBuffers.quickBind, activeDescriptorSetSampledImages, srcStageMask, dstStageMask); else return nullptr; } else { // If bindings have changed or quick bind is disabled, perform a full descriptor update - return pipeline->SyncDescriptors(ctx, constantBuffers.boundConstantBuffers, samplers, textures, activeDescriptorSetSampledImages); + return pipeline->SyncDescriptors(ctx, constantBuffers.boundConstantBuffers, samplers, textures, activeDescriptorSetSampledImages, srcStageMask, dstStageMask); } }()}; @@ -319,7 +321,6 @@ namespace skyline::gpu::interconnect::maxwell3d { if (drawParams->transformFeedbackEnable) commandBuffer.endTransformFeedbackEXT(0, {}, {}); - }, scissor, activeDescriptorSetSampledImages, {}, activeState.GetColorAttachments(), activeState.GetDepthAttachment(), !ctx.gpu.traits.quirks.relaxedRenderPassCompatibility); - + }, scissor, activeDescriptorSetSampledImages, {}, activeState.GetColorAttachments(), activeState.GetDepthAttachment(), !ctx.gpu.traits.quirks.relaxedRenderPassCompatibility, 
srcStageMask, dstStageMask); } } \ No newline at end of file diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp index 20c9478c..ea6b8df3 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.cpp @@ -258,6 +258,22 @@ namespace skyline::gpu::interconnect::maxwell3d { return shaderStages; } + static vk::PipelineStageFlagBits ConvertShaderToPipelineStage(vk::ShaderStageFlagBits stage) { + switch (stage) { + case vk::ShaderStageFlagBits::eVertex: + return vk::PipelineStageFlagBits::eVertexShader; + case vk::ShaderStageFlagBits::eTessellationControl: + return vk::PipelineStageFlagBits::eTessellationControlShader; + case vk::ShaderStageFlagBits::eTessellationEvaluation: + return vk::PipelineStageFlagBits::eTessellationEvaluationShader; + case vk::ShaderStageFlagBits::eGeometry: + return vk::PipelineStageFlagBits::eGeometryShader; + case vk::ShaderStageFlagBits::eFragment: + return vk::PipelineStageFlagBits::eFragmentShader; + default: + throw exception("Invalid shader stage"); + } + } static Pipeline::DescriptorInfo MakePipelineDescriptorInfo(const std::array &shaderStages, bool needsIndividualTextureBindingWrites) { Pipeline::DescriptorInfo descriptorInfo{}; u16 bindingIndex{}; @@ -268,6 +284,7 @@ namespace skyline::gpu::interconnect::maxwell3d { continue; auto &stageDescInfo{descriptorInfo.stages[i]}; + stageDescInfo.stage = ConvertShaderToPipelineStage(stage.stage); auto pushBindings{[&](vk::DescriptorType type, const auto &descs, u16 &count, auto &outputDescs, auto &&descCb, bool individualDescWrites = false) { descriptorInfo.totalWriteDescCount += individualDescWrites ? descs.size() : ((descs.size() > 0) ? 
1 : 0); @@ -712,7 +729,7 @@ namespace skyline::gpu::interconnect::maxwell3d { return descriptorInfo.totalCombinedImageSamplerCount; } - DescriptorUpdateInfo *Pipeline::SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures, span sampledImages) { + DescriptorUpdateInfo *Pipeline::SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures, span sampledImages, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { SyncCachedStorageBufferViews(ctx.executor.executionTag); u32 writeIdx{}; @@ -788,12 +805,18 @@ namespace skyline::gpu::interconnect::maxwell3d { writeBufferDescs(vk::DescriptorType::eUniformBuffer, stage.uniformBufferDescs, stage.uniformBufferDescTotalCount, [&](const DescriptorInfo::StageDescriptorInfo::UniformBufferDesc &desc, size_t arrayIdx) { size_t cbufIdx{desc.index + arrayIdx}; - return GetConstantBufferBinding(ctx, {stage.constantBufferUsedSizes}, constantBuffers[i][cbufIdx].view, cbufIdx); + return GetConstantBufferBinding(ctx, {stage.constantBufferUsedSizes}, + constantBuffers[i][cbufIdx].view, cbufIdx, + stage.stage, + srcStageMask, dstStageMask); }); writeBufferDescs(vk::DescriptorType::eStorageBuffer, stage.storageBufferDescs, stage.storageBufferDescTotalCount, [&](const DescriptorInfo::StageDescriptorInfo::StorageBufferDesc &desc, size_t arrayIdx) { - return GetStorageBufferBinding(ctx, desc, constantBuffers[i][desc.cbuf_index], storageBufferViews[storageBufferIdx++]); + return GetStorageBufferBinding(ctx, desc, constantBuffers[i][desc.cbuf_index], + storageBufferViews[storageBufferIdx++], + stage.stage, + srcStageMask, dstStageMask); }); bindingIdx += stage.uniformTexelBufferDescs.size(); @@ -802,7 +825,10 @@ namespace skyline::gpu::interconnect::maxwell3d { writeImageDescs(vk::DescriptorType::eCombinedImageSampler, stage.combinedImageSamplerDescs, stage.combinedImageSamplerDescTotalCount, [&](const 
DescriptorInfo::StageDescriptorInfo::CombinedImageSamplerDesc &desc, size_t arrayIdx) { BindlessHandle handle{ReadBindlessHandle(ctx, constantBuffers[i], desc, arrayIdx)}; - auto binding{GetTextureBinding(ctx, desc, samplers, textures, handle)}; + auto binding{GetTextureBinding(ctx, desc, + samplers, textures, handle, + stage.stage, + srcStageMask, dstStageMask)}; sampledImages[combinedImageSamplerIdx++] = binding.second; return binding.first; }, ctx.gpu.traits.quirks.needsIndividualTextureBindingWrites); @@ -825,7 +851,7 @@ namespace skyline::gpu::interconnect::maxwell3d { }); } - DescriptorUpdateInfo *Pipeline::SyncDescriptorsQuickBind(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures, ConstantBuffers::QuickBind quickBind, span sampledImages) { + DescriptorUpdateInfo *Pipeline::SyncDescriptorsQuickBind(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures, ConstantBuffers::QuickBind quickBind, span sampledImages, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { SyncCachedStorageBufferViews(ctx.executor.executionTag); size_t stageIndex{static_cast(quickBind.stage)}; @@ -879,18 +905,27 @@ namespace skyline::gpu::interconnect::maxwell3d { writeDescs.operator()(vk::DescriptorType::eUniformBuffer, cbufUsageInfo.uniformBuffers, stageDescInfo.uniformBufferDescs, [&](auto usage, const DescriptorInfo::StageDescriptorInfo::UniformBufferDesc &desc, size_t arrayIdx) -> DynamicBufferBinding { size_t cbufIdx{desc.index + arrayIdx}; - return GetConstantBufferBinding(ctx, {stageDescInfo.constantBufferUsedSizes}, stageConstantBuffers[cbufIdx].view, cbufIdx); + return GetConstantBufferBinding(ctx, {stageDescInfo.constantBufferUsedSizes}, + stageConstantBuffers[cbufIdx].view, cbufIdx, + stageDescInfo.stage, + srcStageMask, dstStageMask); }); writeDescs.operator()(vk::DescriptorType::eStorageBuffer, cbufUsageInfo.storageBuffers, 
stageDescInfo.storageBufferDescs, [&](auto usage, const DescriptorInfo::StageDescriptorInfo::StorageBufferDesc &desc, size_t arrayIdx) { - return GetStorageBufferBinding(ctx, desc, stageConstantBuffers[desc.cbuf_index], storageBufferViews[usage.entirePipelineIdx + arrayIdx]); + return GetStorageBufferBinding(ctx, desc, stageConstantBuffers[desc.cbuf_index], + storageBufferViews[usage.entirePipelineIdx + arrayIdx], + stageDescInfo.stage, + srcStageMask, dstStageMask); }); writeDescs.operator()(vk::DescriptorType::eCombinedImageSampler, cbufUsageInfo.combinedImageSamplers, stageDescInfo.combinedImageSamplerDescs, [&](auto usage, const DescriptorInfo::StageDescriptorInfo::CombinedImageSamplerDesc &desc, size_t arrayIdx) { BindlessHandle handle{ReadBindlessHandle(ctx, stageConstantBuffers, desc, arrayIdx)}; - auto binding{GetTextureBinding(ctx, desc, samplers, textures, handle)}; + auto binding{GetTextureBinding(ctx, desc, + samplers, textures, handle, + stageDescInfo.stage, + srcStageMask, dstStageMask)}; sampledImages[usage.entirePipelineIdx + arrayIdx] = binding.second; return binding.first; }); diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h index 62351991..fdc95e04 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/pipeline_manager.h @@ -29,6 +29,8 @@ namespace skyline::gpu::interconnect::maxwell3d { std::vector descriptorSetLayoutBindings; struct StageDescriptorInfo { + vk::PipelineStageFlagBits stage; + // Unwrapped counts (counting each array element as a separate descriptor) for the below desc structs u16 uniformBufferDescTotalCount; u16 storageBufferDescTotalCount; @@ -248,13 +250,13 @@ namespace skyline::gpu::interconnect::maxwell3d { * @brief Creates a descriptor set update from the current GPU state * @param sampledImages A span of size 
`GetTotalSampledImageCount()` in which texture view pointers for each sampled image will be written */ - DescriptorUpdateInfo *SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures, span sampledImages); + DescriptorUpdateInfo *SyncDescriptors(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures, span sampledImages, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask); /** * @brief Creates a partial descriptor set update from the current GPU state for only the subset of descriptors changed by the quick bind constant buffer * @param sampledImages A span of size `GetTotalSampledImageCount()` in which texture view pointers for each sampled image will be written */ - DescriptorUpdateInfo *SyncDescriptorsQuickBind(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures, ConstantBuffers::QuickBind quickBind, span sampledImages); + DescriptorUpdateInfo *SyncDescriptorsQuickBind(InterconnectContext &ctx, ConstantBufferSet &constantBuffers, Samplers &samplers, Textures &textures, ConstantBuffers::QuickBind quickBind, span sampledImages, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask); }; /** diff --git a/app/src/main/cpp/skyline/gpu/texture/texture.cpp b/app/src/main/cpp/skyline/gpu/texture/texture.cpp index a5a2de04..6595d1bf 100644 --- a/app/src/main/cpp/skyline/gpu/texture/texture.cpp +++ b/app/src/main/cpp/skyline/gpu/texture/texture.cpp @@ -1006,5 +1006,32 @@ namespace skyline::gpu { void Texture::UpdateRenderPassUsage(u32 renderPassIndex, texture::RenderPassUsage renderPassUsage) { lastRenderPassUsage = renderPassUsage; lastRenderPassIndex = renderPassIndex; + + if (renderPassUsage == texture::RenderPassUsage::RenderTarget) + pendingStageMask = vk::PipelineStageFlagBits::eVertexShader | + vk::PipelineStageFlagBits::eTessellationControlShader | + 
vk::PipelineStageFlagBits::eTessellationEvaluationShader | + vk::PipelineStageFlagBits::eGeometryShader | + vk::PipelineStageFlagBits::eFragmentShader | + vk::PipelineStageFlagBits::eComputeShader; + else if (renderPassUsage == texture::RenderPassUsage::None) + pendingStageMask = {}; + } + + texture::RenderPassUsage Texture::GetLastRenderPassUsage() { + return lastRenderPassUsage; + } + + void Texture::PopulateReadBarrier(vk::PipelineStageFlagBits dstStage, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask) { + if (!(pendingStageMask & dstStage)) + return; + + if (format->vkAspect & (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) + srcStageMask |= vk::PipelineStageFlagBits::eEarlyFragmentTests | vk::PipelineStageFlagBits::eLateFragmentTests; + else if (format->vkAspect & vk::ImageAspectFlagBits::eColor) + srcStageMask |= vk::PipelineStageFlagBits::eColorAttachmentOutput; + + pendingStageMask &= ~dstStage; + dstStageMask |= dstStage; } } diff --git a/app/src/main/cpp/skyline/gpu/texture/texture.h b/app/src/main/cpp/skyline/gpu/texture/texture.h index d9866d07..63c36092 100644 --- a/app/src/main/cpp/skyline/gpu/texture/texture.h +++ b/app/src/main/cpp/skyline/gpu/texture/texture.h @@ -410,6 +410,7 @@ namespace skyline::gpu { u32 lastRenderPassIndex{}; //!< The index of the last render pass that used this texture texture::RenderPassUsage lastRenderPassUsage{texture::RenderPassUsage::None}; //!< The type of usage in the last render pass + vk::PipelineStageFlags pendingStageMask{}; //!< List of pipeline stages that are yet to be flushed for reads since the last time this texture was used as an RT friend TextureManager; friend TextureView; @@ -606,5 +607,15 @@ namespace skyline::gpu { * @brief Updates renderpass usage tracking information */ void UpdateRenderPassUsage(u32 renderPassIndex, texture::RenderPassUsage renderPassUsage); + + /** + * @return The last usage of the texture + */ + texture::RenderPassUsage
GetLastRenderPassUsage(); + + /** + * @brief Populates the input src and dst stage masks with appropriate read barrier parameters for the current texture state + */ + void PopulateReadBarrier(vk::PipelineStageFlagBits dstStage, vk::PipelineStageFlags &srcStageMask, vk::PipelineStageFlags &dstStageMask); }; }