Rework Texture & Buffer for Context and FenceCycle Chaining

GPU resources were designed with fence-based locking in mind: fences were treated as implicit locks on the GPU, and patterns such as `GraphicsContext` simply unlocking the texture mutex after attaching it (which would set the fence cycle) were previously considered acceptable. This is suboptimal, as it forces a `FenceCycle` to effectively guarantee exclusivity, conflating the role of a mutex (mutual exclusion) with that of a fence (tracking GPU-side completion). As a result, whether it was acceptable to use a GPU resource was tied to GPU completion rather than simply to whether the CPU was currently using it, which is the question the mutex exists to answer.
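
A loose illustration of that conflation, simplified from the `cycle.lock() != pCycle` checks removed in the diff below (the helper function and its signature are hypothetical):

```cpp
#include <memory>

// Illustrative stand-in only; the real FenceCycle wraps a VkFence and attached objects
struct FenceCycle {
    void Wait() { /* block until the underlying fence signals */ }
};

// Prior pattern: the cycle attached to a resource is compared against the current
// workload's cycle, and the CPU waits on GPU completion purely to gain "exclusivity",
// even when no other CPU user actually holds the resource
void OldStyleAcquire(std::weak_ptr<FenceCycle> &attachedCycle, const std::shared_ptr<FenceCycle> &currentCycle) {
    if (attachedCycle.lock() != currentCycle) {
        if (auto cycle{attachedCycle.lock()})
            cycle->Wait(); // A GPU-side wait standing in for what should be CPU-side mutual exclusion
        attachedCycle.reset();
    }
}
```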

This rework builds on the groundwork laid by previous commits: `Context` semantics are used to move back to mutexes for locking resources, and usage on the GPU is tracked in a cleaner way rather than through arbitrary fence comparisons. A number of fence-dependent methods no longer need fences and can be removed outright, further cleaning up the codebase. It also opens the door to future improvements such as removing `hostImmutableCycle` in favour of a better solution; its current implementation is broken in any case.
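
A minimal sketch of what tag-based context locking could look like, reusing the `ContextLock`, `LockWithTag(...)` and `isFirst` names that appear in the diff below; the implementation details here are assumptions, not the actual one:

```cpp
#include <cstddef>
#include <mutex>
#include <optional>

using ContextTag = size_t; // Identifies a context (e.g. a command executor); representation assumed

// Assumed resource-side of the scheme: a plain mutex provides CPU-side exclusivity while a
// tag records which context last held the resource, replacing fence comparisons
struct Resource {
    std::mutex mutex;
    std::optional<ContextTag> lastTag;

    // Returns true if the given context is locking this resource for the first time
    bool LockWithTag(ContextTag tag) {
        mutex.lock();
        bool isFirst{lastTag != tag};
        lastTag = tag;
        return isFirst;
    }

    void Unlock() {
        mutex.unlock();
    }
};

// RAII wrapper mirroring the `ContextLock lock{pExecutor.tag, view}` usage in the diff;
// `isFirst` is what gets forwarded to Read()/Write() as `isFirstUsage`
struct ContextLock {
    Resource &resource;
    bool isFirst;

    ContextLock(ContextTag tag, Resource &resource) : resource{resource}, isFirst{resource.LockWithTag(tag)} {}
    ~ContextLock() { resource.Unlock(); }
};
```

With `isFirstUsage` derived from the lock rather than from fence comparisons, `SynchronizeGuestImmediate` only needs to know whether the resource was already used in the current context to decide whether a host GPU flush is required.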

While moving to `Context`-based locking, the question of multiple GPU workloads being in-flight while using overlapping resources arose, which exposed a fundamental limitation of `FenceCycle`: a resource could only be attached to a single cycle at a time, so multi-cycle dependencies could not be adequately represented. `FenceCycle` chaining was designed to fix this inadequacy, allowing several GPU workloads to be in-flight concurrently while using the same resources, as long as GPU-GPU synchronization between them is ensured.
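
A rough sketch of what cycle chaining amounts to, based on the `ChainCycle` calls visible in the diff below; the dependency bookkeeping shown here is an assumption of how it might work, not the actual implementation:

```cpp
#include <memory>
#include <mutex>
#include <vector>

// Simplified FenceCycle; the real class wraps a VkFence and attached objects
struct FenceCycle {
    std::mutex mutex;
    std::vector<std::shared_ptr<FenceCycle>> chainedCycles; // Cycles this one depends on

    // Record that this cycle's workload also depends on `dependency` completing on the GPU
    void ChainCycle(const std::shared_ptr<FenceCycle> &dependency) {
        if (!dependency)
            return;
        std::scoped_lock lock{mutex};
        chainedCycles.push_back(dependency);
    }

    void Wait() {
        std::vector<std::shared_ptr<FenceCycle>> chain;
        {
            std::scoped_lock lock{mutex};
            chain.swap(chainedCycles);
        }
        // Waiting on a cycle transitively waits on everything it was chained onto,
        // so a resource only ever needs to track its most recent cycle
        for (const auto &dependency : chain)
            dependency->Wait();
        // ... then wait on this cycle's own fence ...
    }
};
```

This is the pattern the diff applies on submission: the new cycle is chained onto each attached resource's previous cycle (e.g. `cycle->ChainCycle(attachedTexture->cycle)`) before replacing it, so overlapping workloads remain ordered on the GPU without the CPU having to wait in between.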
PixelyIon 2022-06-26 14:40:46 +05:30
parent 07d45ee504
commit 1239907ce8
6 changed files with 83 additions and 142 deletions

View File

@ -64,8 +64,7 @@ namespace skyline::gpu {
if (srcBuffer->hostImmutableCycle) {
// Propagate any host immutability
if (hostImmutableCycle) {
if (srcBuffer->hostImmutableCycle.owner_before(hostImmutableCycle))
hostImmutableCycle = srcBuffer->hostImmutableCycle;
srcBuffer->hostImmutableCycle->Wait();
} else {
hostImmutableCycle = srcBuffer->hostImmutableCycle;
}
@ -119,17 +118,15 @@ namespace skyline::gpu {
void Buffer::WaitOnFence() {
TRACE_EVENT("gpu", "Buffer::WaitOnFence");
auto lCycle{cycle.lock()};
if (lCycle) {
lCycle->Wait();
cycle.reset();
if (cycle) {
cycle->Wait();
cycle = nullptr;
}
}
bool Buffer::PollFence() {
auto lCycle{cycle.lock()};
if (lCycle && lCycle->Poll()) {
cycle.reset();
if (cycle && cycle->Poll()) {
cycle = nullptr;
return true;
}
return false;
@ -155,27 +152,6 @@ namespace skyline::gpu {
}
}
void Buffer::SynchronizeHostWithCycle(const std::shared_ptr<FenceCycle> &pCycle, bool rwTrap) {
if (dirtyState != DirtyState::CpuDirty || !guest)
return;
if (!cycle.owner_before(pCycle))
WaitOnFence();
TRACE_EVENT("gpu", "Buffer::SynchronizeHostWithCycle");
AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
std::memcpy(backing.data(), mirror.data(), mirror.size());
if (rwTrap) {
gpu.state.nce->RetrapRegions(*trapHandle, false);
dirtyState = DirtyState::GpuDirty;
} else {
gpu.state.nce->RetrapRegions(*trapHandle, true);
dirtyState = DirtyState::Clean;
}
}
void Buffer::SynchronizeGuest(bool skipTrap, bool nonBlocking) {
if (dirtyState != DirtyState::GpuDirty || !guest)
return; // If the buffer has not been used on the GPU or there's no guest buffer, there is no need to synchronize it
@ -195,52 +171,30 @@ namespace skyline::gpu {
dirtyState = DirtyState::Clean;
}
/**
* @brief A FenceCycleDependency that synchronizes the contents of a host buffer with the guest buffer
*/
struct BufferGuestSync {
std::shared_ptr<Buffer> buffer;
explicit BufferGuestSync(std::shared_ptr<Buffer> buffer) : buffer(std::move(buffer)) {}
~BufferGuestSync() {
TRACE_EVENT("gpu", "Buffer::BufferGuestSync");
buffer->SynchronizeGuest();
}
};
void Buffer::SynchronizeGuestWithCycle(const std::shared_ptr<FenceCycle> &pCycle) {
if (!cycle.owner_before(pCycle))
WaitOnFence();
pCycle->AttachObject(std::make_shared<BufferGuestSync>(shared_from_this()));
cycle = pCycle;
}
void Buffer::SynchronizeGuestImmediate(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback) {
void Buffer::SynchronizeGuestImmediate(bool isFirstUsage, const std::function<void()> &flushHostCallback) {
// If this buffer was attached to the current cycle, flush all pending host GPU work and wait to ensure that we read valid data
if (cycle.owner_before(pCycle))
if (!isFirstUsage)
flushHostCallback();
SynchronizeGuest();
}
void Buffer::Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
void Buffer::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
if (dirtyState == DirtyState::GpuDirty)
SynchronizeGuestImmediate(pCycle, flushHostCallback);
SynchronizeGuestImmediate(isFirstUsage, flushHostCallback);
std::memcpy(data.data(), mirror.data() + offset, data.size());
}
void Buffer::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) {
void Buffer::Write(bool isFirstUsage, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) {
AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
everHadInlineUpdate = true;
// Perform syncs in both directions to ensure correct ordering of writes
if (dirtyState == DirtyState::CpuDirty)
SynchronizeHostWithCycle(pCycle);
SynchronizeHost();
else if (dirtyState == DirtyState::GpuDirty)
SynchronizeGuestImmediate(pCycle, flushHostCallback);
SynchronizeGuestImmediate(isFirstUsage, flushHostCallback);
if (dirtyState != DirtyState::Clean)
Logger::Error("Attempting to write to a dirty buffer"); // This should never happen since we do syncs in both directions above
@ -277,9 +231,9 @@ namespace skyline::gpu {
sequenceNumber++;
}
span<u8> Buffer::GetReadOnlyBackingSpan(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback) {
span<u8> Buffer::GetReadOnlyBackingSpan(bool isFirstUsage, const std::function<void()> &flushHostCallback) {
if (dirtyState == DirtyState::GpuDirty)
SynchronizeGuestImmediate(pCycle, flushHostCallback);
SynchronizeGuestImmediate(isFirstUsage, flushHostCallback);
return mirror;
}
@ -372,18 +326,9 @@ namespace skyline::gpu {
BufferView::BufferView(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared<Buffer::BufferDelegate>(std::move(buffer), view)) {}
void BufferView::AttachCycle(const std::shared_ptr<FenceCycle> &cycle) {
auto buffer{bufferDelegate->buffer.get()};
if (!buffer->cycle.owner_before(cycle)) {
buffer->WaitOnFence();
buffer->cycle = cycle;
cycle->AttachObject(bufferDelegate);
}
}
void BufferView::RegisterUsage(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
void BufferView::RegisterUsage(const std::shared_ptr<FenceCycle> &cycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
// Users of RegisterUsage expect the buffer contents to be sequenced as the guest GPU would be, so force any further writes in the current cycle to occur on the GPU
bufferDelegate->buffer->MarkHostImmutable(pCycle);
bufferDelegate->buffer->MarkHostImmutable(cycle);
usageCallback(*bufferDelegate->view, bufferDelegate->buffer);
if (!bufferDelegate->usageCallback) {
@ -396,18 +341,18 @@ namespace skyline::gpu {
}
}
void BufferView::Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
bufferDelegate->buffer->Read(pCycle, flushHostCallback, data, offset + bufferDelegate->view->offset);
void BufferView::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
bufferDelegate->buffer->Read(isFirstUsage, flushHostCallback, data, offset + bufferDelegate->view->offset);
}
void BufferView::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const {
void BufferView::Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const {
// If megabuffering can't be enabled we have to do a GPU-side copy to ensure sequencing
bool gpuCopy{bufferDelegate->view->size > MegaBufferingDisableThreshold};
if (gpuCopy)
// This will force the host buffer contents to stay as is for the current cycle, requiring that write operations are instead sequenced on the GPU for the entire buffer
bufferDelegate->buffer->MarkHostImmutable(pCycle);
bufferDelegate->buffer->Write(pCycle, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
bufferDelegate->buffer->Write(isFirstUsage, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
}
vk::DeviceSize BufferView::AcquireMegaBuffer(MegaBuffer &megaBuffer) const {
@ -436,8 +381,8 @@ namespace skyline::gpu {
return bufferDelegate->view->megabufferOffset; // Success!
}
span<u8> BufferView::GetReadOnlyBackingSpan(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback) {
auto backing{bufferDelegate->buffer->GetReadOnlyBackingSpan(pCycle, flushHostCallback)};
span<u8> BufferView::GetReadOnlyBackingSpan(bool isFirstUsage, const std::function<void()> &flushHostCallback) {
auto backing{bufferDelegate->buffer->GetReadOnlyBackingSpan(isFirstUsage, flushHostCallback)};
return backing.subspan(bufferDelegate->view->offset, bufferDelegate->view->size);
}
}

View File

@ -124,7 +124,7 @@ namespace skyline::gpu {
void SetupGuestMappings();
public:
std::weak_ptr<FenceCycle> cycle{}; //!< A fence cycle for when any host operation mutating the buffer has completed, it must be waited on prior to any mutations to the backing
std::shared_ptr<FenceCycle> cycle{}; //!< A fence cycle for when any host operation mutating the buffer has completed, it must be waited on prior to any mutations to the backing
constexpr vk::Buffer GetBacking() {
return backing.vkBuffer;
@ -210,14 +210,6 @@ namespace skyline::gpu {
*/
void SynchronizeHost(bool rwTrap = false);
/**
* @brief Synchronizes the host buffer with the guest
* @param cycle A FenceCycle that is checked against the held one to skip waiting on it when equal
* @param rwTrap If true, the guest buffer will be read/write trapped rather than only being write trapped which is more efficient than calling MarkGpuDirty directly after
* @note The buffer **must** be locked prior to calling this
*/
void SynchronizeHostWithCycle(const std::shared_ptr<FenceCycle> &cycle, bool rwTrap = false);
/**
* @brief Synchronizes the guest buffer with the host buffer
* @param skipTrap If true, setting up a CPU trap will be skipped and the dirty state will be Clean/CpuDirty
@ -226,35 +218,28 @@ namespace skyline::gpu {
*/
void SynchronizeGuest(bool skipTrap = false, bool nonBlocking = false);
/**
* @brief Synchronizes the guest buffer with the host buffer when the FenceCycle is signalled
* @note The buffer **must** be locked prior to calling this
* @note The guest buffer should not be null prior to calling this
*/
void SynchronizeGuestWithCycle(const std::shared_ptr<FenceCycle> &cycle);
/**
* @brief Synchronizes the guest buffer with the host buffer immediately, flushing GPU work if necessary
* @note The buffer **must** be locked prior to calling this
* @param pCycle The FenceCycle associated with the current workload, utilised for waiting and flushing semantics
* @param isFirstUsage If this is the first usage of this resource in the context as returned from LockWithTag(...)
* @param flushHostCallback Callback to flush and execute all pending GPU work to allow for synchronisation of GPU dirty buffers
* @note The buffer **must** be locked prior to calling this
*/
void SynchronizeGuestImmediate(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback);
void SynchronizeGuestImmediate(bool isFirstUsage, const std::function<void()> &flushHostCallback);
/**
* @brief Reads data at the specified offset in the buffer
* @param pCycle The FenceCycle associated with the current workload, utilised for waiting and flushing semantics
* @param isFirstUsage If this is the first usage of this resource in the context as returned from LockWithTag(...)
* @param flushHostCallback Callback to flush and execute all pending GPU work to allow for synchronisation of GPU dirty buffers
*/
void Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset);
void Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset);
/**
* @brief Writes data at the specified offset in the buffer, falling back to GPU side copies if the buffer is host immutable
* @param pCycle The FenceCycle associated with the current workload, utilised for waiting and flushing semantics
* @param isFirstUsage If this is the first usage of this resource in the context as returned from LockWithTag(...)
* @param flushHostCallback Callback to flush and execute all pending GPU work to allow for synchronisation of GPU dirty buffers
* @param gpuCopyCallback Callback to perform a GPU-side copy for this Write
*/
void Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset);
void Write(bool isFirstUsage, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset);
/**
* @return A cached or newly created view into this buffer with the supplied attributes
@ -279,19 +264,19 @@ namespace skyline::gpu {
void AdvanceSequence();
/**
* @param pCycle The FenceCycle associated with the current workload, utilised for waiting and flushing semantics
* @param isFirstUsage If this is the first usage of this resource in the context as returned from LockWithTag(...)
* @param flushHostCallback Callback to flush and execute all pending GPU work to allow for synchronisation of GPU dirty buffers
* @return A span of the backing buffer contents
* @note The returned span **must** not be written to
* @note The buffer **must** be kept locked until the span is no longer in use
*/
span<u8> GetReadOnlyBackingSpan(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback);
span<u8> GetReadOnlyBackingSpan(bool isFirstUsage, const std::function<void()> &flushHostCallback);
/**
* @brief Prevents any further writes to the `backing` host side buffer for the duration of the current cycle, forcing slower inline GPU updates instead
* @note The buffer **must** be locked prior to calling this
*/
void MarkHostImmutable(const std::shared_ptr<FenceCycle> &pCycle);
void MarkHostImmutable(const std::shared_ptr<FenceCycle> &cycle);
bool EverHadInlineUpdate() const { return everHadInlineUpdate; }
};
@ -355,33 +340,27 @@ namespace skyline::gpu {
return bufferDelegate.get();
}
/**
* @brief Attaches a fence cycle to the underlying buffer in a way that it will be synchronized with the latest backing buffer
* @note The view **must** be locked prior to calling this
*/
void AttachCycle(const std::shared_ptr<FenceCycle> &cycle);
/**
* @brief Registers a callback for a usage of this view, it may be called multiple times due to the view being recreated with different backings
* @note This will force the buffer to be host immutable for the current cycle, preventing megabuffering and requiring slower GPU inline writes instead
* @note The callback will be automatically called the first time after registration
* @note The view **must** be locked prior to calling this
*/
void RegisterUsage(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback);
void RegisterUsage(const std::shared_ptr<FenceCycle> &cycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback);
/**
* @brief Reads data at the specified offset in the view
* @note The view **must** be locked prior to calling this
* @note See Buffer::Read
*/
void Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const;
void Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const;
/**
* @brief Writes data at the specified offset in the view
* @note The view **must** be locked prior to calling this
* @note See Buffer::Write
*/
void Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const;
void Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &cycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const;
/**
* @brief If megabuffering is beneficial for the current buffer, pushes its contents into the megabuffer and returns the offset of the pushed data
@ -396,6 +375,6 @@ namespace skyline::gpu {
* @note The view **must** be kept locked until the span is no longer in use
* @note See Buffer::GetReadOnlyBackingSpan
*/
span<u8> GetReadOnlyBackingSpan(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback);
span<u8> GetReadOnlyBackingSpan(bool isFirstUsage, const std::function<void()> &flushHostCallback);
};
}

View File

@ -203,14 +203,19 @@ namespace skyline::gpu::interconnect {
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
});
// We need this barrier here to ensure that resources are in the state we expect them to be in; we shouldn't overwrite resources while prior commands might still be using them, or read from them while they might be modified by prior commands
commandBuffer.pipelineBarrier(
vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
.srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
.dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
}, {}, {}
);
for (const auto &texture : attachedTextures) {
texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true);
texture->MarkGpuDirty();
}
for (const auto &delegate : attachedBufferDelegates)
delegate->usageCallback = nullptr;
vk::RenderPass lRenderPass;
u32 subpassIndex;
@ -244,13 +249,30 @@ namespace skyline::gpu::interconnect {
gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence());
for (const auto &delegate : attachedBufferDelegates)
delegate->view->megabufferOffset = 0;
nodes.clear();
for (const auto &attachedTexture : attachedTextures) {
cycle->AttachObject(attachedTexture.texture);
cycle->ChainCycle(attachedTexture->cycle);
attachedTexture->cycle = cycle;
}
attachedTextures.clear();
textureManagerLock.reset();
for (const auto &attachedBuffer : attachedBuffers) {
cycle->AttachObject(attachedBuffer.buffer);
cycle->ChainCycle(attachedBuffer->cycle);
attachedBuffer->cycle = cycle;
}
for (const auto &delegate : attachedBufferDelegates) {
delegate->usageCallback = nullptr;
delegate->view->megabufferOffset = 0;
}
attachedBuffers.clear();
attachedBufferDelegates.clear();
bufferManagerLock.reset();
}
}

View File

@ -627,7 +627,7 @@ namespace skyline::gpu::interconnect {
T Read(CommandExecutor &pExecutor, size_t dstOffset) const {
T object;
ContextLock lock{pExecutor.tag, view};
view.Read(pExecutor.cycle, []() {
view.Read(lock.isFirst, []() {
// TODO: here we should trigger a SubmitWithFlush, however that doesn't currently work due to Read being called mid-draw and attached objects not handling this case
Logger::Warn("GPU dirty buffer reads for attached buffers are unimplemented");
}, span<T>(object).template cast<u8>(), dstOffset);
@ -643,7 +643,7 @@ namespace skyline::gpu::interconnect {
auto srcCpuBuf{buf.template cast<u8>()};
ContextLock lock{pExecutor.tag, view};
view.Write(pExecutor.cycle, []() {
view.Write(lock.isFirst, pExecutor.cycle, []() {
// TODO: see Read()
Logger::Warn("GPU dirty buffer reads for attached buffers are unimplemented");
}, [&megaBuffer, &pExecutor, srcCpuBuf, dstOffset, view = this->view]() {

View File

@ -185,7 +185,7 @@ namespace skyline::gpu {
});
}
std::shared_ptr<memory::StagingBuffer> Texture::SynchronizeHostImpl(const std::shared_ptr<FenceCycle> &pCycle) {
std::shared_ptr<memory::StagingBuffer> Texture::SynchronizeHostImpl() {
if (!guest)
throw exception("Synchronization of host textures requires a valid guest texture to synchronize from");
else if (guest->dimensions != dimensions)
@ -207,8 +207,7 @@ namespace skyline::gpu {
if (layout == vk::ImageLayout::eUndefined)
TransitionLayout(vk::ImageLayout::eGeneral);
bufferData = std::get<memory::Image>(backing).data();
if (cycle.lock() != pCycle)
WaitOnFence();
WaitOnFence();
return nullptr;
} else {
throw exception("Guest -> Host synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
@ -313,9 +312,6 @@ namespace skyline::gpu {
}
}
if (stagingBuffer && cycle.lock() != pCycle)
WaitOnFence();
return stagingBuffer;
}
@ -632,10 +628,9 @@ namespace skyline::gpu {
void Texture::WaitOnFence() {
TRACE_EVENT("gpu", "Texture::WaitOnFence");
auto lCycle{cycle.lock()};
if (lCycle) {
lCycle->Wait();
cycle.reset();
if (cycle) {
cycle->Wait();
cycle = nullptr;
}
}
@ -682,12 +677,13 @@ namespace skyline::gpu {
TRACE_EVENT("gpu", "Texture::SynchronizeHost");
auto stagingBuffer{SynchronizeHostImpl(nullptr)};
auto stagingBuffer{SynchronizeHostImpl()};
if (stagingBuffer) {
auto lCycle{gpu.scheduler.Submit([&](vk::raii::CommandBuffer &commandBuffer) {
CopyFromStagingBuffer(commandBuffer, stagingBuffer);
})};
lCycle->AttachObjects(stagingBuffer, shared_from_this());
lCycle->ChainCycle(cycle);
cycle = lCycle;
}
@ -706,10 +702,11 @@ namespace skyline::gpu {
TRACE_EVENT("gpu", "Texture::SynchronizeHostWithBuffer");
auto stagingBuffer{SynchronizeHostImpl(pCycle)};
auto stagingBuffer{SynchronizeHostImpl()};
if (stagingBuffer) {
CopyFromStagingBuffer(commandBuffer, stagingBuffer);
pCycle->AttachObjects(stagingBuffer, shared_from_this());
pCycle->ChainCycle(cycle);
cycle = pCycle;
}
@ -743,7 +740,6 @@ namespace skyline::gpu {
TRACE_EVENT("gpu", "Texture::SynchronizeGuest");
WaitOnBacking();
WaitOnFence();
if (tiling == vk::ImageTiling::eOptimal || !std::holds_alternative<memory::Image>(backing)) {
auto stagingBuffer{gpu.memory.AllocateStagingBuffer(surfaceSize)};
@ -752,9 +748,11 @@ namespace skyline::gpu {
CopyIntoStagingBuffer(commandBuffer, stagingBuffer);
})};
lCycle->AttachObject(std::make_shared<TextureBufferCopy>(shared_from_this(), stagingBuffer));
lCycle->ChainCycle(cycle);
cycle = lCycle;
} else if (tiling == vk::ImageTiling::eLinear) {
// We can optimize linear texture sync on a UMA by mapping the texture onto the CPU and copying directly from it rather than using a staging buffer
WaitOnFence();
CopyToGuest(std::get<memory::Image>(backing).data());
} else {
throw exception("Host -> Guest synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
@ -779,8 +777,7 @@ namespace skyline::gpu {
TRACE_EVENT("gpu", "Texture::SynchronizeGuestWithBuffer");
WaitOnBacking();
if (cycle.lock() != pCycle)
WaitOnFence();
pCycle->ChainCycle(cycle);
if (tiling == vk::ImageTiling::eOptimal || !std::holds_alternative<memory::Image>(backing)) {
auto stagingBuffer{gpu.memory.AllocateStagingBuffer(surfaceSize)};
@ -812,10 +809,7 @@ namespace skyline::gpu {
void Texture::CopyFrom(std::shared_ptr<Texture> source, const vk::ImageSubresourceRange &subresource) {
WaitOnBacking();
WaitOnFence();
source->WaitOnBacking();
source->WaitOnFence();
if (source->layout == vk::ImageLayout::eUndefined)
throw exception("Cannot copy from image with undefined layout");
@ -896,6 +890,7 @@ namespace skyline::gpu {
});
})};
lCycle->AttachObjects(std::move(source), shared_from_this());
lCycle->ChainCycle(cycle);
cycle = lCycle;
}
}

View File

@ -395,7 +395,7 @@ namespace skyline::gpu {
* @brief An implementation function for guest -> host texture synchronization, it allocates and copies data into a staging buffer or directly into a linear host texture
* @return If a staging buffer was required for the texture sync, it's returned filled with guest texture data and must be copied to the host texture by the callee
*/
std::shared_ptr<memory::StagingBuffer> SynchronizeHostImpl(const std::shared_ptr<FenceCycle> &pCycle);
std::shared_ptr<memory::StagingBuffer> SynchronizeHostImpl();
/**
* @brief Records commands for copying data from a staging buffer to the texture's backing into the supplied command buffer
@ -432,7 +432,7 @@ namespace skyline::gpu {
boost::container::small_vector<vk::BufferImageCopy, 10> GetBufferImageCopies();
public:
std::weak_ptr<FenceCycle> cycle; //!< A fence cycle for when any host operation mutating the texture has completed, it must be waited on prior to any mutations to the backing
std::shared_ptr<FenceCycle> cycle; //!< A fence cycle for when any host operation mutating the texture has completed, it must be waited on prior to any mutations to the backing
std::optional<GuestTexture> guest;
texture::Dimensions dimensions;
texture::Format format;