Rework Texture Synchronization API + Locking

Similar to `Buffer`s, `Texture`s suffered from unoptimal behavior due to using atomics for `DirtyState` and had certain bugs with replacement of the variable at times where it shouldn't be possible which have now been fixed by moving to using a mutex instead of atomics. This commit also updates the API to more closely match what is expected of it now and removes any functions that weren't utilized such as `SynchronizeGuestWithBuffer`.
2024-12-24 18:31:49 +01:00 · 2022-07-29 02:34:58 +05:30 · 2022-07-29 02:34:58 +05:30 · 8a62f8d37b
commit 8a62f8d37b
parent 04bcd7e580
3 changed files with 88 additions and 122 deletions
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@ -255,10 +255,8 @@ namespace skyline::gpu::interconnect {
                }, {}, {}
            );
-            for (const auto &texture : attachedTextures) {
+            for (const auto &texture : attachedTextures)
-                texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true);
+                texture->SynchronizeHostInline(commandBuffer, cycle, true);
                texture->MarkGpuDirty();
            }
            vk::RenderPass lRenderPass;
            u32 subpassIndex;
--- a/app/src/main/cpp/skyline/gpu/texture/texture.cpp
+++ b/app/src/main/cpp/skyline/gpu/texture/texture.cpp
@ -150,7 +150,13 @@ namespace skyline::gpu {
            if (!texture)
                return;
            std::unique_lock stateLock{texture->stateMutex};
            if (texture->dirtyState == DirtyState::GpuDirty) {
                stateLock.unlock(); // If the lock isn't unlocked, a deadlock from threads waiting on the other lock can occur
                // If this mutex would cause other callbacks to be blocked then we should block on this mutex in advance
                std::scoped_lock lock{*texture};
            }
        }, [weakThis] {
            TRACE_EVENT("gpu", "Texture::ReadTrap");
@ -158,11 +164,18 @@ namespace skyline::gpu {
            if (!texture)
                return true;
            std::unique_lock stateLock{texture->stateMutex, std::try_to_lock};
            if (!stateLock)
                return false;
            if (texture->dirtyState != DirtyState::GpuDirty)
                return true; // If state is already CPU dirty/Clean we don't need to do anything
            std::unique_lock lock{*texture, std::try_to_lock};
            if (!lock)
                return false;
-            texture->SynchronizeGuest(true); // We can skip trapping since the caller will do it
+            texture->SynchronizeGuest(false, true); // We can skip trapping since the caller will do it
            texture->WaitOnFence();
            return true;
        }, [weakThis] {
@ -172,9 +185,14 @@ namespace skyline::gpu {
            if (!texture)
                return true;
-            DirtyState expectedState{DirtyState::Clean};
+            std::unique_lock stateLock{texture->stateMutex, std::try_to_lock};
-            if (texture->dirtyState.compare_exchange_strong(expectedState, DirtyState::CpuDirty, std::memory_order_relaxed) || expectedState == DirtyState::CpuDirty)
+            if (!stateLock)
-                return true; // If we can transition the texture to CPU dirty (from Clean) or if it already is CPU dirty then we can just return, we only need to do the lock and corresponding sync if the texture is GPU dirty
+                return false;
            if (texture->dirtyState != DirtyState::GpuDirty) {
                texture->dirtyState = DirtyState::CpuDirty;
                return true; // If the texture is already CPU dirty or we can transition it to being CPU dirty then we don't need to do anything
            }
            std::unique_lock lock{*texture, std::try_to_lock};
            if (!lock)
@ -603,19 +621,6 @@ namespace skyline::gpu {
        return mutex.try_lock();
    }
    void Texture::MarkGpuDirty() {
        if (!guest || format != guest->format)
            return; // We need to skip GPU dirty if the host format and guest format differ as we don't support re-encoding compressed textures which is when this generally occurs
        auto currentState{dirtyState.load(std::memory_order_relaxed)};
        do {
            if (currentState == DirtyState::GpuDirty)
                return;
        } while (!dirtyState.compare_exchange_strong(currentState, DirtyState::GpuDirty, std::memory_order_relaxed));
        gpu.state.nce->RetrapRegions(*trapHandle, false);
    }
    bool Texture::WaitOnBacking() {
        TRACE_EVENT("gpu", "Texture::WaitOnBacking");
@ -675,17 +680,24 @@ namespace skyline::gpu {
        }
    }
-    void Texture::SynchronizeHost(bool rwTrap) {
+    void Texture::SynchronizeHost(bool gpuDirty) {
-        if (!guest)
+        if (!guest || format != guest->format)
-            return;
+            return; // We need to skip GPU dirty if the host format and guest format differ as we don't support re-encoding compressed textures which is when this generally occurs
        auto currentState{dirtyState.load(std::memory_order_relaxed)};
        do {
            if (currentState != DirtyState::CpuDirty)
                return; // If the texture has not been modified on the CPU or has no mappings, there is no need to synchronize it
        } while (!dirtyState.compare_exchange_strong(currentState, rwTrap ? DirtyState::GpuDirty : DirtyState::Clean, std::memory_order_relaxed));
        TRACE_EVENT("gpu", "Texture::SynchronizeHost");
        {
            std::scoped_lock lock{stateMutex};
            if (gpuDirty && dirtyState == DirtyState::Clean) {
                // If a texture is Clean then we can just transition it to being GPU dirty and retrap it
                dirtyState = DirtyState::GpuDirty;
                gpu.state.nce->RetrapRegions(*trapHandle, false);
                return;
            } else if (dirtyState != DirtyState::CpuDirty) {
                return; // If the texture has not been modified on the CPU, there is no need to synchronize it
            }
            dirtyState = gpuDirty ? DirtyState::GpuDirty : DirtyState::Clean;
        }
        auto stagingBuffer{SynchronizeHostImpl()};
        if (stagingBuffer) {
@ -697,20 +709,27 @@ namespace skyline::gpu {
            cycle = lCycle;
        }
-        gpu.state.nce->RetrapRegions(*trapHandle, !rwTrap); // Trap any future CPU reads (optionally) + writes to this texture
+        gpu.state.nce->RetrapRegions(*trapHandle, !gpuDirty); // Trap any future CPU reads (optionally) + writes to this texture
    }
-    void Texture::SynchronizeHostWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &pCycle, bool rwTrap) {
+    void Texture::SynchronizeHostInline(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &pCycle, bool gpuDirty) {
-        if (!guest)
+        if (!guest || format != guest->format)
-            return;
+            return; // See SynchronizeHost(...)
-        auto currentState{dirtyState.load(std::memory_order_relaxed)};
+        TRACE_EVENT("gpu", "Texture::SynchronizeHostInline");
        do {
            if (currentState != DirtyState::CpuDirty)
                return;
        } while (!dirtyState.compare_exchange_strong(currentState, rwTrap ? DirtyState::GpuDirty : DirtyState::Clean, std::memory_order_relaxed));
-        TRACE_EVENT("gpu", "Texture::SynchronizeHostWithBuffer");
+        {
            std::scoped_lock lock{stateMutex};
            if (gpuDirty && dirtyState == DirtyState::Clean) {
                dirtyState = DirtyState::GpuDirty;
                gpu.state.nce->RetrapRegions(*trapHandle, false);
                return;
            } else if (dirtyState != DirtyState::CpuDirty) {
                return;
            }
            dirtyState = gpuDirty ? DirtyState::GpuDirty : DirtyState::Clean;
        }
        auto stagingBuffer{SynchronizeHostImpl()};
        if (stagingBuffer) {
@ -720,26 +739,28 @@ namespace skyline::gpu {
            cycle = pCycle;
        }
-        gpu.state.nce->RetrapRegions(*trapHandle, !rwTrap); // Trap any future CPU reads (optionally) + writes to this texture
+        gpu.state.nce->RetrapRegions(*trapHandle, !gpuDirty); // Trap any future CPU reads (optionally) + writes to this texture
    }
-    void Texture::SynchronizeGuest(bool skipTrap, bool setDirty) {
+    void Texture::SynchronizeGuest(bool cpuDirty, bool skipTrap) {
        if (!guest)
            return;
        auto currentState{dirtyState.load(std::memory_order_relaxed)};
        do {
            if (currentState == DirtyState::CpuDirty || (currentState == DirtyState::Clean && setDirty))
                return; // If the texture is synchronized (Clean/CpuDirty), there is no need to synchronize it
        } while (!dirtyState.compare_exchange_strong(currentState, setDirty ? DirtyState::CpuDirty : DirtyState::Clean, std::memory_order_relaxed));
        TRACE_EVENT("gpu", "Texture::SynchronizeGuest");
        {
            std::scoped_lock lock{stateMutex};
            if (cpuDirty && dirtyState == DirtyState::Clean) {
                dirtyState = DirtyState::CpuDirty;
                if (!skipTrap)
-            gpu.state.nce->RetrapRegions(*trapHandle, !setDirty);
+                    gpu.state.nce->DeleteTrap(*trapHandle);
                return;
            } else if (dirtyState != DirtyState::GpuDirty) {
                return;
            }
-        if (setDirty && currentState == DirtyState::Clean)
+            dirtyState = cpuDirty ? DirtyState::CpuDirty : DirtyState::Clean;
-            return; // If the texture was simply transitioned from Clean to CpuDirty, there is no need to synchronize it
+        }
        if (layout == vk::ImageLayout::eUndefined || format != guest->format)
            // If the state of the host texture is undefined then so can the guest
@ -764,43 +785,12 @@ namespace skyline::gpu {
        } else {
            throw exception("Host -> Guest synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
        }
    }
-    void Texture::SynchronizeGuestWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &pCycle) {
+        if (!skipTrap)
-        if (!guest)
+            if (cpuDirty)
-            return;
+                gpu.state.nce->DeleteTrap(*trapHandle);
-
+            else
-        auto currentState{dirtyState.load(std::memory_order_relaxed)};
+                gpu.state.nce->RetrapRegions(*trapHandle, true); // Trap any future CPU writes to this texture
        do {
            if (currentState != DirtyState::GpuDirty)
                return; // If the texture has not been used on the GPU, there is no need to synchronize it
        } while (!dirtyState.compare_exchange_strong(currentState, DirtyState::Clean, std::memory_order_relaxed));
        TRACE_EVENT("gpu", "Texture::SynchronizeGuestWithBuffer");
        gpu.state.nce->RetrapRegions(*trapHandle, true);
        if (layout == vk::ImageLayout::eUndefined || format != guest->format)
            // If the state of the host texture is undefined then so can the guest
            // If the texture has differing formats on the guest and host, we don't support converting back in that case as it may involve recompression of a decompressed texture
            return;
        WaitOnBacking();
        pCycle->ChainCycle(cycle);
        if (tiling == vk::ImageTiling::eOptimal || !std::holds_alternative<memory::Image>(backing)) {
            auto stagingBuffer{gpu.memory.AllocateStagingBuffer(surfaceSize)};
            CopyIntoStagingBuffer(commandBuffer, stagingBuffer);
            pCycle->AttachObject(std::make_shared<TextureBufferCopy>(shared_from_this(), stagingBuffer));
            cycle = pCycle;
        } else if (tiling == vk::ImageTiling::eLinear) {
            CopyToGuest(std::get<memory::Image>(backing).data());
            pCycle->AttachObject(std::make_shared<TextureBufferCopy>(shared_from_this()));
            cycle = pCycle;
        } else {
            throw exception("Host -> Guest synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
        }
    }
    std::shared_ptr<TextureView> Texture::GetView(vk::ImageViewType type, vk::ImageSubresourceRange range, texture::Format pFormat, vk::ComponentMapping mapping) {
--- a/app/src/main/cpp/skyline/gpu/texture/texture.h
+++ b/app/src/main/cpp/skyline/gpu/texture/texture.h
@ -367,9 +367,8 @@ namespace skyline::gpu {
            Clean, //!< The CPU mappings are in sync with the GPU texture
            CpuDirty, //!< The CPU mappings have been modified but the GPU texture is not up to date
            GpuDirty, //!< The GPU texture has been modified but the CPU mappings have not been updated
-        };
+        } dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU texture
-        std::atomic<DirtyState> dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU texture
+        std::recursive_mutex stateMutex; //!< Synchronizes access to the dirty state
        static_assert(std::atomic<DirtyState>::is_always_lock_free);
        /**
         * @brief Storage for all metadata about a specific view into the buffer, used to prevent redundant view creation and duplication of VkBufferView(s)
@ -381,7 +380,7 @@ namespace skyline::gpu {
            vk::ImageSubresourceRange range;
            vk::raii::ImageView vkView;
-            TextureViewStorage(vk::ImageViewType type, texture::Format format, vk::ComponentMapping mapping, vk::ImageSubresourceRange range, vk::raii::ImageView&& vkView);
+            TextureViewStorage(vk::ImageViewType type, texture::Format format, vk::ComponentMapping mapping, vk::ImageSubresourceRange range, vk::raii::ImageView &&vkView);
        };
        std::vector<TextureViewStorage> views;
@ -503,13 +502,6 @@ namespace skyline::gpu {
         */
        bool try_lock();
        /**
         * @brief Marks the texture as dirty on the GPU, it will be synced on the next call to SynchronizeGuest
         * @note This **must** be called after syncing the texture to the GPU not before
         * @note The texture **must** be locked prior to calling this
         */
        void MarkGpuDirty();
        /**
         * @brief Waits on the texture backing to be a valid non-null Vulkan image
         * @return If the mutex could be unlocked during the function
@ -535,44 +527,30 @@ namespace skyline::gpu {
         */
        void TransitionLayout(vk::ImageLayout layout);
        /**
         * @brief Converts the texture to have the specified format
         */
        void SetFormat(texture::Format format);
        /**
         * @brief Synchronizes the host texture with the guest after it has been modified
-         * @param rwTrap If true, the guest buffer will be read/write trapped rather than only being write trapped which is more efficient than calling MarkGpuDirty directly after
+         * @param gpuDirty If true, the texture will be transitioned to being GpuDirty by this call
         * @note This function is not blocking and the synchronization will not be complete until the associated fence is signalled, it can be waited on with WaitOnFence()
         * @note The texture **must** be locked prior to calling this
         * @note The guest texture backing should exist prior to calling this
         */
-        void SynchronizeHost(bool rwTrap = false);
+        void SynchronizeHost(bool gpuDirty = false);
        /**
         * @brief Same as SynchronizeHost but this records any commands into the supplied command buffer rather than creating one as necessary
-         * @param rwTrap If true, the guest buffer will be read/write trapped rather than only being write trapped which is more efficient than calling MarkGpuDirty directly after
+         * @param gpuDirty If true, the texture will be transitioned to being GpuDirty by this call
         * @note It is more efficient to call SynchronizeHost than allocating a command buffer purely for this function as it may conditionally not record any commands
         * @note The texture **must** be locked prior to calling this
         * @note The guest texture backing should exist prior to calling this
         */
-        void SynchronizeHostWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, bool rwTrap = false);
+        void SynchronizeHostInline(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, bool gpuDirty = false);
        /**
         * @brief Synchronizes the guest texture with the host texture after it has been modified
-         * @param skipTrap If true, setting up a CPU trap will be skipped and the dirty state will be Clean/CpuDirty
+         * @param cpuDirty If true, the texture will be transitioned to being CpuDirty by this call
-         * @param setDirty If true, the texture will be marked as CpuDirty rather than Clean
+         * @param skipTrap If true, trapping/untrapping the guest mappings will be skipped and has to be handled by the caller
         * @note This function is not blocking and the synchronization will not be complete until the associated fence is signalled, it can be waited on with WaitOnFence()
         * @note The texture **must** be locked prior to calling this
         * @note The guest texture should not be null prior to calling this
         */
-        void SynchronizeGuest(bool skipTrap = false, bool setDirty = false);
+        void SynchronizeGuest(bool cpuDirty = false, bool skipTrap = false);
        /**
         * @brief Synchronizes the guest texture with the host texture after it has been modified
         * @note It is more efficient to call SynchronizeHost than allocating a command buffer purely for this function as it may conditionally not record any commands
         * @note The texture **must** be locked prior to calling this
         * @note The guest texture should not be null prior to calling this
         */
        void SynchronizeGuestWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle);
        /**
         * @return A cached or newly created view into this texture with the supplied attributes