Use semaphores for presentation engine frame synchronisation

Avoids waiting on the CPU, which can be costly and can confuse the scheduler; it also reduces latency significantly.
2024-12-26 18:41:49 +01:00 · 2022-10-16 20:47:17 +01:00 · 2022-10-16 20:47:17 +01:00 · 1a0819fb76
commit 1a0819fb76
parent 0670e0e0dc
4 changed files with 106 additions and 87 deletions
--- a/app/src/main/cpp/skyline/gpu/presentation_engine.cpp
+++ b/app/src/main/cpp/skyline/gpu/presentation_engine.cpp
@ -24,7 +24,8 @@ namespace skyline::gpu {
    PresentationEngine::PresentationEngine(const DeviceState &state, GPU &gpu)
        : state{state},
          gpu{gpu},
-          acquireFence{gpu.vkDevice, vk::FenceCreateInfo{}},
+          presentSemaphores{util::MakeFilledArray<vk::raii::Semaphore, MaxSwapchainImageCount>(gpu.vkDevice, vk::SemaphoreCreateInfo{})},
+          acquireSemaphores{util::MakeFilledArray<vk::raii::Semaphore, MaxSwapchainImageCount>(gpu.vkDevice, vk::SemaphoreCreateInfo{})},
          presentationTrack{static_cast<u64>(trace::TrackIds::Presentation), perfetto::ProcessTrack::Current()},
          vsyncEvent{std::make_shared<kernel::type::KEvent>(state, true)},
          choreographerThread{&PresentationEngine::ChoreographerThread, this},
@ -116,35 +117,31 @@ namespace skyline::gpu {
            windowScalingMode = frame.scalingMode;
        }

-        if (frame.transform != windowTransform) {
-            if ((result = window->perform(window, NATIVE_WINDOW_SET_BUFFERS_TRANSFORM, static_cast<i32>(frame.transform))))
-                throw exception("Setting the buffer transform to '{}' failed with {}", ToString(frame.transform), result);
-            windowTransform = frame.transform;
-        }
+        if ((result = window->perform(window, NATIVE_WINDOW_SET_BUFFERS_TRANSFORM, static_cast<i32>(frame.transform))))
+            throw exception("Setting the buffer transform to '{}' failed with {}", ToString(frame.transform), result);
+        windowTransform = frame.transform;

-        gpu.vkDevice.resetFences(*acquireFence);
+        auto &acquireSemaphore{acquireSemaphores[acquireSemaphoreIndex]};
+        acquireSemaphoreIndex = (acquireSemaphoreIndex + 1) % swapchainImageCount;

        std::pair<vk::Result, u32> nextImage;
-        while (nextImage = vkSwapchain->acquireNextImage(std::numeric_limits<u64>::max(), {}, *acquireFence), nextImage.first != vk::Result::eSuccess) [[unlikely]] {
+        while (nextImage = vkSwapchain->acquireNextImage(std::numeric_limits<u64>::max(), *acquireSemaphore, {}), nextImage.first != vk::Result::eSuccess) [[unlikely]] {
            if (nextImage.first == vk::Result::eSuboptimalKHR)
                surfaceCondition.wait(lock, [this]() { return vkSurface.has_value(); });
            else
                throw exception("vkAcquireNextImageKHR returned an unhandled result '{}'", vk::to_string(nextImage.first));
        }
-        auto &nextImageTexture{images.at(nextImage.second)};

-        std::ignore = gpu.vkDevice.waitForFences(*acquireFence, true, std::numeric_limits<u64>::max());
+        auto &nextImageTexture{images.at(nextImage.second)};
+        auto &presentSemaphore{presentSemaphores[nextImage.second]};

        texture->SynchronizeHost();
-        nextImageTexture->CopyFrom(texture, vk::ImageSubresourceRange{
+        nextImageTexture->CopyFrom(texture, *acquireSemaphore, *presentSemaphore, vk::ImageSubresourceRange{
            .aspectMask = vk::ImageAspectFlagBits::eColor,
            .levelCount = 1,
            .layerCount = 1,
        });

-        // Wait on the copy to the swapchain image to complete before submitting for presentation
-        nextImageTexture->WaitOnFence();
-
        auto getMonotonicNsNow{[]() -> i64 {
            timespec time;
            if (clock_gettime(CLOCK_MONOTONIC, &time))
@ -194,6 +191,8 @@ namespace skyline::gpu {
                .swapchainCount = 1,
                .pSwapchains = &**vkSwapchain,
                .pImageIndices = &nextImage.second,
+                .waitSemaphoreCount = 1,
+                .pWaitSemaphores = &*presentSemaphore,
            }); // We don't care about suboptimal images as they are caused by not respecting the transform hint, we handle transformations externally
        }

@ -328,6 +327,7 @@ namespace skyline::gpu {

        swapchainFormat = format;
        swapchainExtent = extent;
+        swapchainImageCount = vkImages.size();
    }

    void PresentationEngine::UpdateSurface(jobject newSurface) {
--- a/app/src/main/cpp/skyline/gpu/presentation_engine.h
+++ b/app/src/main/cpp/skyline/gpu/presentation_engine.h
@ -35,12 +35,15 @@ namespace skyline::gpu {
        vk::SurfaceCapabilitiesKHR vkSurfaceCapabilities{}; //!< The capabilities of the current Vulkan Surface

        std::optional<vk::raii::SwapchainKHR> vkSwapchain; //!< The Vulkan swapchain and the properties associated with it
-        vk::raii::Fence acquireFence; //!< A fence for acquiring an image from the swapchain
        texture::Format swapchainFormat{}; //!< The image format of the textures in the current swapchain
        texture::Dimensions swapchainExtent{}; //!< The extent of images in the current swapchain

        static constexpr size_t MaxSwapchainImageCount{6}; //!< The maximum amount of swapchain textures, this affects the amount of images that can be in the swapchain
        std::array<std::shared_ptr<Texture>, MaxSwapchainImageCount> images; //!< All the swapchain textures in the same order as supplied by the host swapchain
+        std::array<vk::raii::Semaphore, MaxSwapchainImageCount> presentSemaphores; //!< Array of semaphores used to signal that swapchain images are ready to be completed, indexed by Vulkan swapchain index
+        std::array<vk::raii::Semaphore, MaxSwapchainImageCount> acquireSemaphores; //!< Array of semaphores used to wait on the GPU for swapchain images to be acquired, indexed by `acquireSemaphoreIndex`
+        size_t acquireSemaphoreIndex{}; //!< The index of the next semaphore to be used for acquiring swapchain images
+        size_t swapchainImageCount{}; //!< The number of images in the current swapchain

        i64 frameTimestamp{}; //!< The timestamp of the last frame being shown in nanoseconds
        i64 averageFrametimeNs{}; //!< The average time between frames in nanoseconds
--- a/app/src/main/cpp/skyline/gpu/texture/texture.cpp
+++ b/app/src/main/cpp/skyline/gpu/texture/texture.cpp
@ -839,14 +839,16 @@ namespace skyline::gpu {
        return std::make_shared<TextureView>(shared_from_this(), type, range, pFormat, mapping);
    }

-    void Texture::CopyFrom(std::shared_ptr<Texture> source, const vk::ImageSubresourceRange &subresource) {
-        WaitOnBacking();
-        source->WaitOnBacking();
+    void Texture::CopyFrom(std::shared_ptr<Texture> source, vk::Semaphore waitSemaphore, vk::Semaphore signalSemaphore, const vk::ImageSubresourceRange &subresource) {
        if (cycle)
            cycle->WaitSubmit();
        if (source->cycle)
            source->cycle->WaitSubmit();

+        WaitOnBacking();
+        source->WaitOnBacking();
+        WaitOnFence();
+
        if (source->layout == vk::ImageLayout::eUndefined)
            throw exception("Cannot copy from image with undefined layout");
        else if (source->dimensions != dimensions)
@ -854,78 +856,92 @@ namespace skyline::gpu {

        TRACE_EVENT("gpu", "Texture::CopyFrom");

-        auto lCycle{gpu.scheduler.Submit([&](vk::raii::CommandBuffer &commandBuffer) {
-            auto sourceBacking{source->GetBacking()};
-            if (source->layout != vk::ImageLayout::eTransferSrcOptimal) {
-                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTopOfPipe, vk::PipelineStageFlagBits::eTransfer, {}, {}, {}, vk::ImageMemoryBarrier{
-                    .image = sourceBacking,
-                    .srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
-                    .dstAccessMask = vk::AccessFlagBits::eTransferRead,
-                    .oldLayout = source->layout,
-                    .newLayout = vk::ImageLayout::eTransferSrcOptimal,
-                    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                    .subresourceRange = subresource,
-                });
-            }
+        auto submitFunc{[&](vk::Semaphore extraWaitSemaphore){
+            boost::container::small_vector<vk::Semaphore, 2> waitSemaphores;
+            if (waitSemaphore)
+                waitSemaphores.push_back(waitSemaphore);

-            auto destinationBacking{GetBacking()};
-            if (layout != vk::ImageLayout::eTransferDstOptimal) {
-                commandBuffer.pipelineBarrier(layout != vk::ImageLayout::eUndefined ? vk::PipelineStageFlagBits::eTopOfPipe : vk::PipelineStageFlagBits::eBottomOfPipe, vk::PipelineStageFlagBits::eTransfer, {}, {}, {}, vk::ImageMemoryBarrier{
-                    .image = destinationBacking,
-                    .srcAccessMask = vk::AccessFlagBits::eMemoryRead,
-                    .dstAccessMask = vk::AccessFlagBits::eTransferWrite,
-                    .oldLayout = layout,
-                    .newLayout = vk::ImageLayout::eTransferDstOptimal,
-                    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                    .subresourceRange = subresource,
-                });
+            if (extraWaitSemaphore)
+                waitSemaphores.push_back(extraWaitSemaphore);

-                if (layout == vk::ImageLayout::eUndefined)
-                    layout = vk::ImageLayout::eTransferDstOptimal;
-            }
+            return gpu.scheduler.Submit([&](vk::raii::CommandBuffer &commandBuffer) {
+                auto sourceBacking{source->GetBacking()};
+                if (source->layout != vk::ImageLayout::eTransferSrcOptimal) {
+                    commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTopOfPipe, vk::PipelineStageFlagBits::eTransfer, {}, {}, {}, vk::ImageMemoryBarrier{
+                        .image = sourceBacking,
+                        .srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
+                        .dstAccessMask = vk::AccessFlagBits::eTransferRead,
+                        .oldLayout = source->layout,
+                        .newLayout = vk::ImageLayout::eTransferSrcOptimal,
+                        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                        .subresourceRange = subresource,
+                        });
+                }

-            vk::ImageSubresourceLayers subresourceLayers{
-                .aspectMask = subresource.aspectMask,
-                .mipLevel = subresource.baseMipLevel,
-                .baseArrayLayer = subresource.baseArrayLayer,
-                .layerCount = subresource.layerCount == VK_REMAINING_ARRAY_LAYERS ? layerCount - subresource.baseArrayLayer : subresource.layerCount,
-            };
-            for (; subresourceLayers.mipLevel < (subresource.levelCount == VK_REMAINING_MIP_LEVELS ? levelCount - subresource.baseMipLevel : subresource.levelCount); subresourceLayers.mipLevel++)
-                commandBuffer.copyImage(sourceBacking, vk::ImageLayout::eTransferSrcOptimal, destinationBacking, vk::ImageLayout::eTransferDstOptimal, vk::ImageCopy{
-                    .srcSubresource = subresourceLayers,
-                    .dstSubresource = subresourceLayers,
-                    .extent = dimensions,
-                });
+                auto destinationBacking{GetBacking()};
+                if (layout != vk::ImageLayout::eTransferDstOptimal) {
+                    commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, {}, {}, {}, vk::ImageMemoryBarrier{
+                        .image = destinationBacking,
+                        .srcAccessMask = vk::AccessFlagBits::eMemoryRead,
+                        .dstAccessMask = vk::AccessFlagBits::eTransferWrite,
+                        .oldLayout = layout,
+                        .newLayout = vk::ImageLayout::eTransferDstOptimal,
+                        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                        .subresourceRange = subresource,
+                        });

-            if (layout != vk::ImageLayout::eTransferDstOptimal)
-                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eTransfer, {}, {}, {}, vk::ImageMemoryBarrier{
-                    .image = destinationBacking,
-                    .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
-                    .dstAccessMask = vk::AccessFlagBits::eMemoryRead,
-                    .oldLayout = vk::ImageLayout::eTransferDstOptimal,
-                    .newLayout = layout,
-                    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                    .subresourceRange = subresource,
-                });
+                    if (layout == vk::ImageLayout::eUndefined)
+                        layout = vk::ImageLayout::eTransferDstOptimal;
+                }

-            if (source->layout != vk::ImageLayout::eTransferSrcOptimal)
-                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eTransfer, {}, {}, {}, vk::ImageMemoryBarrier{
-                    .image = sourceBacking,
-                    .srcAccessMask = vk::AccessFlagBits::eTransferRead,
-                    .dstAccessMask = vk::AccessFlagBits::eMemoryWrite,
-                    .oldLayout = vk::ImageLayout::eTransferSrcOptimal,
-                    .newLayout = source->layout,
-                    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                    .subresourceRange = subresource,
-                });
-        })};
-        lCycle->AttachObjects(std::move(source), shared_from_this());
-        lCycle->ChainCycle(cycle);
-        lCycle->ChainCycle(source->cycle);
-        cycle = lCycle;
+                vk::ImageSubresourceLayers subresourceLayers{
+                    .aspectMask = subresource.aspectMask,
+                    .mipLevel = subresource.baseMipLevel,
+                    .baseArrayLayer = subresource.baseArrayLayer,
+                    .layerCount = subresource.layerCount == VK_REMAINING_ARRAY_LAYERS ? layerCount - subresource.baseArrayLayer : subresource.layerCount,
+                    };
+                for (; subresourceLayers.mipLevel < (subresource.levelCount == VK_REMAINING_MIP_LEVELS ? levelCount - subresource.baseMipLevel : subresource.levelCount); subresourceLayers.mipLevel++)
+                    commandBuffer.copyImage(sourceBacking, vk::ImageLayout::eTransferSrcOptimal, destinationBacking, vk::ImageLayout::eTransferDstOptimal, vk::ImageCopy{
+                        .srcSubresource = subresourceLayers,
+                        .dstSubresource = subresourceLayers,
+                        .extent = dimensions,
+                        });
+
+                if (layout != vk::ImageLayout::eTransferDstOptimal)
+                    commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, {}, {}, {}, vk::ImageMemoryBarrier{
+                        .image = destinationBacking,
+                        .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
+                        .dstAccessMask = vk::AccessFlagBits::eMemoryRead,
+                        .oldLayout = vk::ImageLayout::eTransferDstOptimal,
+                        .newLayout = layout,
+                        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                        .subresourceRange = subresource,
+                        });
+
+                if (source->layout != vk::ImageLayout::eTransferSrcOptimal)
+                    commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, {}, {}, {}, vk::ImageMemoryBarrier{
+                        .image = sourceBacking,
+                        .srcAccessMask = vk::AccessFlagBits::eTransferRead,
+                        .dstAccessMask = vk::AccessFlagBits::eMemoryWrite,
+                        .oldLayout = vk::ImageLayout::eTransferSrcOptimal,
+                        .newLayout = source->layout,
+                        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                        .subresourceRange = subresource,
+                        });
+            }, waitSemaphores, span<vk::Semaphore>{signalSemaphore});
+        }};
+
+        auto newCycle{[&]{
+            if (source->cycle)
+                return source->cycle->RecordSemaphoreWaitUsage(std::move(submitFunc));
+            else
+                return submitFunc({});
+        }()};
+        newCycle->AttachObjects(std::move(source), shared_from_this());
+        cycle = newCycle;
    }
 }
--- a/app/src/main/cpp/skyline/gpu/texture/texture.h
+++ b/app/src/main/cpp/skyline/gpu/texture/texture.h
@ -559,7 +559,7 @@ namespace skyline::gpu {
        /**
         * @brief Copies the contents of the supplied source texture into the current texture
         */
-        void CopyFrom(std::shared_ptr<Texture> source, const vk::ImageSubresourceRange &subresource = vk::ImageSubresourceRange{
+        void CopyFrom(std::shared_ptr<Texture> source, vk::Semaphore waitSemaphore, vk::Semaphore signalSemaphore, const vk::ImageSubresourceRange &subresource = vk::ImageSubresourceRange{
            .aspectMask = vk::ImageAspectFlagBits::eColor,
            .levelCount = VK_REMAINING_MIP_LEVELS,
            .layerCount = VK_REMAINING_ARRAY_LAYERS,