Implement access-driven Buffer synchronization

Similar to the constant redundant synchronization of textures, there is a lot of redundant synchronization of buffers. Although buffer synchronization is far cheaper than texture synchronization, it still has associated costs, which have now been reduced by only synchronizing on access.
2025-02-22 07:17:11 +01:00 · 2022-03-06 21:07:37 +05:30 · 2022-03-06 21:07:37 +05:30 · 881bb969c4
commit 881bb969c4
parent 7532eaf050
3 changed files with 79 additions and 28 deletions
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@ -45,20 +45,39 @@ namespace skyline::gpu {
            alignedMirror = gpu.state.process->memory.CreateMirrors(alignedMappings);
            mirror = alignedMirror.subspan(static_cast<size_t>(frontMapping.data() - alignedData), totalSize);
        }
+
+        trapHandle = gpu.state.nce->TrapRegions(mappings, true, [this] {
+            std::lock_guard lock(*this);
+            SynchronizeGuest(true); // We can skip trapping since the caller will do it
+            WaitOnFence();
+        }, [this] {
+            std::lock_guard lock(*this);
+            SynchronizeGuest(true);
+            dirtyState = DirtyState::CpuDirty; // We need to assume the buffer is dirty since we don't know what the guest is writing
+            WaitOnFence();
+        });
    }

    Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu(gpu), size(guest.BufferSize()), backing(gpu.memory.AllocateBuffer(size)), guest(std::move(guest)) {
        SetupGuestMappings(); // No eager host sync is needed after this: dirtyState starts as CpuDirty, so the first SynchronizeHost call performs the copy
-        SynchronizeHost();
    }

    Buffer::~Buffer() {
        std::lock_guard lock(*this);
+        if (trapHandle)
+            gpu.state.nce->DeleteTrap(*trapHandle); // Drop the trap first; the final sync below passes skipTrap so it never retraps the deleted handle
        SynchronizeGuest(true);
        if (alignedMirror.valid())
            munmap(alignedMirror.data(), alignedMirror.size());
    }

+    void Buffer::MarkGpuDirty() {
+        if (dirtyState == DirtyState::GpuDirty)
+            return; // Already GPU-dirty, so there is nothing to retrap
+        gpu.state.nce->RetrapRegions(*trapHandle, false); // Same retrap mode SynchronizeHost uses for rwTrap; NOTE(review): trapHandle is dereferenced unchecked here, unlike in ~Buffer — confirm it is always set by now
+        dirtyState = DirtyState::GpuDirty;
+    }
+
    void Buffer::WaitOnFence() {
        TRACE_EVENT("gpu", "Buffer::WaitOnFence");

@ -69,44 +88,58 @@ namespace skyline::gpu {
        }
    }

-    void Buffer::SynchronizeHost() {
+    void Buffer::SynchronizeHost(bool rwTrap) {
+        if (dirtyState != DirtyState::CpuDirty)
+            return; // If the buffer has not been modified on the CPU, there is no need to synchronize it
+
        WaitOnFence();

        TRACE_EVENT("gpu", "Buffer::SynchronizeHost");

-        auto host{backing.data()};
-        for (auto &mapping : guest.mappings) {
-            auto mappingSize{mapping.size_bytes()};
-            std::memcpy(host, mapping.data(), mappingSize);
-            host += mappingSize;
+        std::memcpy(backing.data(), mirror.data(), mirror.size()); // One copy from the contiguous mirror replaces the per-mapping loop; assumes backing holds at least mirror.size() bytes — TODO confirm
+
+        if (rwTrap) { // Same end state as calling MarkGpuDirty() right after the copy, but with a single retrap
+            gpu.state.nce->RetrapRegions(*trapHandle, false);
+            dirtyState = DirtyState::GpuDirty;
+        } else { // Otherwise retrap with true and record that both copies now match
+            gpu.state.nce->RetrapRegions(*trapHandle, true);
+            dirtyState = DirtyState::Clean;
        }
    }

-    void Buffer::SynchronizeHostWithCycle(const std::shared_ptr<FenceCycle> &pCycle) {
+    void Buffer::SynchronizeHostWithCycle(const std::shared_ptr<FenceCycle> &pCycle, bool rwTrap) {
+        if (dirtyState != DirtyState::CpuDirty)
+            return; // Nothing was modified on the CPU, so the host copy needs no refresh
+
        if (pCycle != cycle.lock())
            WaitOnFence();

        TRACE_EVENT("gpu", "Buffer::SynchronizeHostWithCycle");

-        auto host{backing.data()};
-        for (auto &mapping : guest.mappings) {
-            auto mappingSize{mapping.size_bytes()};
-            std::memcpy(host, mapping.data(), mappingSize);
-            host += mappingSize;
+        std::memcpy(backing.data(), mirror.data(), mirror.size()); // One copy from the contiguous mirror replaces the per-mapping loop; assumes backing holds at least mirror.size() bytes — TODO confirm
+
+        if (rwTrap) { // Same end state as calling MarkGpuDirty() right after the copy, but with a single retrap
+            gpu.state.nce->RetrapRegions(*trapHandle, false);
+            dirtyState = DirtyState::GpuDirty;
+        } else { // Otherwise retrap with true and record that both copies now match
+            gpu.state.nce->RetrapRegions(*trapHandle, true);
+            dirtyState = DirtyState::Clean;
        }
    }

-    void Buffer::SynchronizeGuest() {
+    void Buffer::SynchronizeGuest(bool skipTrap) {
+        if (dirtyState != DirtyState::GpuDirty)
+            return; // If the buffer has not been used on the GPU, there is no need to synchronize it
+
        WaitOnFence();

        TRACE_EVENT("gpu", "Buffer::SynchronizeGuest");

-        auto host{backing.data()};
-        for (auto &mapping : guest.mappings) {
-            auto mappingSize{mapping.size_bytes()};
-            std::memcpy(mapping.data(), host, mappingSize);
-            host += mappingSize;
-        }
+        std::memcpy(mirror.data(), backing.data(), mirror.size()); // Write the host copy back through the contiguous mirror in a single copy
+
+        if (!skipTrap) // skipTrap is passed by the trap callbacks and the destructor, which do not want a retrap here
+            gpu.state.nce->RetrapRegions(*trapHandle, true);
+        dirtyState = DirtyState::Clean; // Set regardless of skipTrap; the write-trap callback overrides this with CpuDirty afterwards
    }

    /**
@ -132,7 +165,10 @@ namespace skyline::gpu {
    }

    void Buffer::Write(span<u8> data, vk::DeviceSize offset) {
-        std::memcpy(mirror.data() + offset, data.data(), data.size());
+        if (dirtyState == DirtyState::CpuDirty || dirtyState == DirtyState::Clean) // The mirror is skipped only when GpuDirty: SynchronizeGuest will overwrite it from the backing anyway
+            std::memcpy(mirror.data() + offset, data.data(), data.size()); // NOTE(review): offset + data.size() is not bounds-checked against the buffer size here
+        if (dirtyState == DirtyState::GpuDirty || dirtyState == DirtyState::Clean) // The backing is skipped only when CpuDirty: SynchronizeHost will overwrite it from the mirror anyway
+            std::memcpy(backing.data() + offset, data.data(), data.size());
    }

    std::shared_ptr<BufferView> Buffer::GetView(vk::DeviceSize offset, vk::DeviceSize range, vk::Format format) {
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@ -3,6 +3,7 @@

 #pragma once

+#include <nce.h>
 #include "memory_manager.h"

 namespace skyline::gpu {
@ -36,6 +37,13 @@ namespace skyline::gpu {

        span<u8> mirror{}; //!< A contiguous mirror of all the guest mappings to allow linear access on the CPU
        span<u8> alignedMirror{}; //!< The mirror mapping aligned to page size to reflect the full mapping
+        std::optional<nce::NCE::TrapHandle> trapHandle{}; //!< The handle of the traps for the guest mappings, empty until it is assigned from TrapRegions
+        enum class DirtyState {
+            Clean, //!< The CPU mappings are in sync with the GPU buffer
+            CpuDirty, //!< The CPU mappings have been modified but the GPU buffer is not up to date
+            GpuDirty, //!< The GPU buffer has been modified but the CPU mappings have not been updated
+        } dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU buffer, it starts as CpuDirty so that the first host synchronization copies the guest data
+
        std::vector<std::weak_ptr<BufferView>> views; //!< BufferView(s) that are backed by this Buffer, used for repointing to a new Buffer on deletion

        friend BufferView;
@ -81,6 +89,13 @@ namespace skyline::gpu {
            return mutex.try_lock();
        }

+        /**
+         * @brief Marks the buffer as dirty on the GPU so that it is synced back on the next call to SynchronizeGuest, this is a no-op if it is already GPU-dirty
+         * @note This **must** be called after syncing the buffer to the GPU, not before, as SynchronizeHost skips the copy once the state is no longer CpuDirty
+         * @note The buffer **must** be locked prior to calling this
+         */
+        void MarkGpuDirty();
+
        /**
         * @brief Waits on a fence cycle if it exists till it's signalled and resets it after
         * @note The buffer **must** be locked prior to calling this
@ -89,22 +104,25 @@ namespace skyline::gpu {

        /**
         * @brief Synchronizes the host buffer with the guest
+         * @param rwTrap If true, the guest buffer is read/write trapped and left GPU-dirty rather than only being write trapped and left clean, this is more efficient than calling MarkGpuDirty directly after
         * @note The buffer **must** be locked prior to calling this
         */
-        void SynchronizeHost();
+        void SynchronizeHost(bool rwTrap = false);

        /**
         * @brief Synchronizes the host buffer with the guest
         * @param cycle A FenceCycle that is checked against the held one to skip waiting on it when equal
+         * @param rwTrap If true, the guest buffer is read/write trapped and left GPU-dirty rather than only being write trapped and left clean, this is more efficient than calling MarkGpuDirty directly after
         * @note The buffer **must** be locked prior to calling this
         */
-        void SynchronizeHostWithCycle(const std::shared_ptr<FenceCycle> &cycle);
+        void SynchronizeHostWithCycle(const std::shared_ptr<FenceCycle> &cycle, bool rwTrap = false);

        /**
         * @brief Synchronizes the guest buffer with the host buffer
+         * @param skipTrap If true, re-trapping the guest mappings is skipped and left to the caller; the dirty state is set to Clean in either case and the caller may then change it (e.g. to CpuDirty)
         * @note The buffer **must** be locked prior to calling this
         */
-        void SynchronizeGuest();
+        void SynchronizeGuest(bool skipTrap = false);

        /**
         * @brief Synchronizes the guest buffer with the host buffer when the FenceCycle is signalled
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@ -134,7 +134,7 @@ namespace skyline::gpu::interconnect {
                    texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true);

                for (auto buffer : syncBuffers)
-                    buffer->SynchronizeHostWithCycle(cycle);
+                    buffer->SynchronizeHostWithCycle(cycle, true);

                vk::RenderPass lRenderPass;
                u32 subpassIndex;
@ -162,9 +162,6 @@ namespace skyline::gpu::interconnect {
                    #undef NODE
                }

-                for (auto buffer : syncBuffers)
-                    buffer->SynchronizeGuestWithCycle(cycle);
-
                commandBuffer.end();
                gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence());