From 090151f0c38986895734f59871869c26f3d884ac Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Sat, 4 Mar 2023 20:11:34 +0000
Subject: [PATCH] Introduce usage tracker for dirty tracking within an
 execution

This is necessary because, for example, shaders can be updated through a mirror and never hit modification traps. By tracking which addresses have had sequenced writes applied, the shader manager can correctly detect whether a given shader has been modified by the GPU.
---
 app/src/main/cpp/skyline/gpu/buffer.cpp       | 60 ++++++++++++-------
 app/src/main/cpp/skyline/gpu/buffer.h         | 26 +++++---
 .../main/cpp/skyline/gpu/buffer_manager.cpp   |  4 +-
 .../gpu/interconnect/command_executor.cpp     |  4 +-
 .../gpu/interconnect/command_executor.h       |  2 +
 .../gpu/interconnect/common/pipeline.inc      |  2 +-
 .../gpu/interconnect/common/shader_cache.cpp  |  8 +--
 .../gpu/interconnect/common/shader_cache.h    |  2 +-
 .../cpp/skyline/gpu/interconnect/fermi_2d.cpp |  1 +
 .../gpu/interconnect/inline2memory.cpp        |  2 +-
 .../interconnect/maxwell_3d/active_state.cpp  |  2 +-
 .../maxwell_3d/constant_buffers.cpp           |  4 +-
 .../skyline/gpu/interconnect/maxwell_dma.cpp  |  2 +-
 .../main/cpp/skyline/gpu/texture/texture.cpp  |  6 ++
 .../main/cpp/skyline/gpu/texture/texture.h    |  6 ++
 app/src/main/cpp/skyline/gpu/usage_tracker.h  | 16 +++++
 16 files changed, 103 insertions(+), 44 deletions(-)
 create mode 100644 app/src/main/cpp/skyline/gpu/usage_tracker.h
diff --git a/app/src/main/cpp/skyline/gpu/buffer.cpp b/app/src/main/cpp/skyline/gpu/buffer.cpp
index d90fd638..5fb28b7d 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@@ -194,13 +194,15 @@ namespace skyline::gpu {
         return isDirect ? ValidateMegaBufferViewImplDirect(size) : ValidateMegaBufferViewImplStaged(size);
     }
 
-    void Buffer::CopyFromImplDirect(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
+    void Buffer::CopyFromImplDirect(vk::DeviceSize dstOffset,
+                                    Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
+                                    UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
         everHadInlineUpdate = true;
         bool needsGpuTracking{src->RefreshGpuWritesActiveDirect() || RefreshGpuWritesActiveDirect()};
         bool needsCpuTracking{RefreshGpuReadsActiveDirect() && !needsGpuTracking};
         if (needsGpuTracking || needsCpuTracking) {
             if (needsGpuTracking) // Force buffer to be dirty for this cycle if either of the sources are dirty, this is needed as otherwise it could have just been dirty from the previous cycle
-                MarkGpuDirty();
+                MarkGpuDirty(usageTracker);
             gpuCopyCallback();
 
             if (needsCpuTracking)
@@ -210,7 +212,9 @@ namespace skyline::gpu {
         }
     }
 
-    void Buffer::CopyFromImplStaged(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
+    void Buffer::CopyFromImplStaged(vk::DeviceSize dstOffset,
+                                    Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
+                                    UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
         std::scoped_lock lock{stateMutex, src->stateMutex}; // Fine even if src and dst are same since recursive mutex
 
         if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked())
@@ -230,18 +234,19 @@ namespace skyline::gpu {
             else
                 gpuCopyCallback();
         } else {
-            MarkGpuDirty();
+            MarkGpuDirty(usageTracker);
             gpuCopyCallback();
         }
     }
 
-    bool Buffer::WriteImplDirect(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) {
+    bool Buffer::WriteImplDirect(span<u8> data, vk::DeviceSize offset,
+                                 UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
         // If the buffer is GPU dirty do the write on the GPU and we're done
         if (RefreshGpuWritesActiveDirect()) {
             if (gpuCopyCallback) {
                 // Propagate dirtiness to the current cycle, since if this is only dirty in a previous cycle that could change at any time and we would need to have the write saved somewhere for CPU reads
                 // By propagating the dirtiness to the current cycle we can avoid this and force a wait on any reads
-                MarkGpuDirty();
+                MarkGpuDirty(usageTracker);
                 gpuCopyCallback();
                 return false;
             } else {
@@ -349,6 +354,15 @@ namespace skyline::gpu {
         AdvanceSequence(); // The GPU will modify buffer contents so advance to the next sequence
     }
 
+    void Buffer::MarkGpuDirtyImpl() { // Marks the buffer GPU dirty without checking for a guest and without recording anything in a UsageTracker
+        currentExecutionGpuDirty = true; // Set the current-execution GPU dirty flag before dispatching
+
+        if (isDirect) // Dispatch to the implementation matching the buffer's mode
+            MarkGpuDirtyImplDirect();
+        else
+            MarkGpuDirtyImplStaged();
+    }
+
     Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, GuestBuffer guest, size_t id, bool direct)
         : gpu{gpu},
           guest{guest},
@@ -382,16 +396,12 @@ namespace skyline::gpu {
         WaitOnFence();
     }
 
-    void Buffer::MarkGpuDirty() {
+    void Buffer::MarkGpuDirty(UsageTracker &usageTracker) {
         if (!guest)
             return;
 
-        currentExecutionGpuDirty = true;
-
-        if (isDirect)
-            MarkGpuDirtyImplDirect();
-        else
-            MarkGpuDirtyImplStaged();
+        usageTracker.dirtyIntervals.Insert(*guest);
+        MarkGpuDirtyImpl();
     }
 
     void Buffer::WaitOnFence() {
@@ -493,24 +503,30 @@ namespace skyline::gpu {
             ReadImplStaged(isFirstUsage, flushHostCallback, data, offset);
     }
 
-    bool Buffer::Write(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) {
+    bool Buffer::Write(span<u8> data, vk::DeviceSize offset, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
         AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
         everHadInlineUpdate = true;
 
+        usageTracker.sequencedIntervals.Insert(*guest); // Records the entire `*guest` as sequenced rather than only the written region; NOTE(review): guest is dereferenced unchecked whereas Buffer::MarkGpuDirty returns early on `!guest` - confirm Write is never reached for guest-less buffers
+
         if (isDirect)
-            return WriteImplDirect(data, offset, gpuCopyCallback);
+            return WriteImplDirect(data, offset, usageTracker, gpuCopyCallback);
         else
             return WriteImplStaged(data, offset, gpuCopyCallback);
     }
 
-    void Buffer::CopyFrom(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
+    void Buffer::CopyFrom(vk::DeviceSize dstOffset,
+                          Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
+                          UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
         AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
         everHadInlineUpdate = true;
 
+        usageTracker.sequencedIntervals.Insert(*guest); // Records the entire destination `*guest` as sequenced rather than only the copied region; NOTE(review): guest is dereferenced unchecked whereas Buffer::MarkGpuDirty returns early on `!guest` - confirm CopyFrom is never reached for guest-less buffers
+
         if (isDirect)
-            CopyFromImplDirect(dstOffset, src, srcOffset, size, gpuCopyCallback);
+            CopyFromImplDirect(dstOffset, src, srcOffset, size, usageTracker, gpuCopyCallback);
         else
-            CopyFromImplStaged(dstOffset, src, srcOffset, size, gpuCopyCallback);
+            CopyFromImplStaged(dstOffset, src, srcOffset, size, usageTracker, gpuCopyCallback);
     }
 
     BufferView Buffer::GetView(vk::DeviceSize offset, vk::DeviceSize size) {
@@ -676,8 +692,8 @@ namespace skyline::gpu {
         GetBuffer()->Read(isFirstUsage, flushHostCallback, data, readOffset + GetOffset());
     }
 
-    bool BufferView::Write(span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback) const {
-        return GetBuffer()->Write(data, writeOffset + GetOffset(), gpuCopyCallback);
+    bool BufferView::Write(span<u8> data, vk::DeviceSize writeOffset, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) const {
+        return GetBuffer()->Write(data, writeOffset + GetOffset(), usageTracker, gpuCopyCallback);
     }
 
     BufferBinding BufferView::TryMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, ContextTag executionTag, size_t sizeOverride) const {
@@ -689,9 +705,9 @@ namespace skyline::gpu {
         return backing.subspan(GetOffset(), size);
     }
 
-    void BufferView::CopyFrom(BufferView src, const std::function<void()> &gpuCopyCallback) {
+    void BufferView::CopyFrom(BufferView src, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
         if (src.size != size)
             throw exception("Copy size mismatch!");
-        return GetBuffer()->CopyFrom(GetOffset(), src.GetBuffer(), src.GetOffset(), size, gpuCopyCallback);
+        return GetBuffer()->CopyFrom(GetOffset(), src.GetBuffer(), src.GetOffset(), size, usageTracker, gpuCopyCallback);
     }
 }
diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h
index b538ad99..2b2db8f5 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@@ -8,6 +8,7 @@
 #include <common/spin_lock.h>
 #include <nce.h>
 #include <gpu/tag_allocator.h>
+#include "usage_tracker.h"
 #include "megabuffer.h"
 #include "memory_manager.h"
 
@@ -146,11 +147,16 @@ namespace skyline::gpu {
          */
         bool ValidateMegaBufferView(vk::DeviceSize size);
 
-        void CopyFromImplDirect(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback);
+        void CopyFromImplDirect(vk::DeviceSize dstOffset,
+                                Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
+                                UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback);
 
-        void CopyFromImplStaged(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback);
+        void CopyFromImplStaged(vk::DeviceSize dstOffset,
+                                Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
+                                UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback);
 
-        bool WriteImplDirect(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback = {});
+        bool WriteImplDirect(span<u8> data, vk::DeviceSize offset,
+                             UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback = {});
 
         bool WriteImplStaged(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback = {});
 
@@ -162,6 +168,8 @@ namespace skyline::gpu {
 
         void MarkGpuDirtyImplStaged();
 
+        void MarkGpuDirtyImpl();
+
       public:
         void UpdateCycle(const std::shared_ptr<FenceCycle> &newCycle) {
             newCycle->ChainCycle(cycle);
@@ -227,7 +235,7 @@ namespace skyline::gpu {
          * @note This **must** be called after syncing the buffer to the GPU not before
          * @note The buffer **must** be locked prior to calling this
          */
-        void MarkGpuDirty();
+        void MarkGpuDirty(UsageTracker &usageTracker);
 
         /**
          * @brief Prevents sequenced writes to this buffer's backing from occuring on the CPU, forcing sequencing on the GPU instead for the duration of the context. Unsequenced writes such as those from the guest can still occur however.
@@ -365,13 +373,15 @@ namespace skyline::gpu {
          * @param gpuCopyCallback Optional callback to perform a GPU-side copy for this Write if necessary, if such a copy is needed and this is not supplied `true` will be returned to indicate that the write needs to be repeated with the callback present
          * @return Whether the write needs to be repeated with `gpuCopyCallback` provided, always false if `gpuCopyCallback` is provided
          */
-        bool Write(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback = {});
+        bool Write(span<u8> data, vk::DeviceSize offset, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback = {});
 
         /**
          * @brief Copies a region of the src buffer into a region of this buffer
          * @note The src/dst buffers **must** be locked prior to calling this
          */
-        void CopyFrom(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback);
+        void CopyFrom(vk::DeviceSize dstOffset,
+                      Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
+                      UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback);
 
         /**
          * @return A view into this buffer with the supplied attributes
@@ -528,7 +538,7 @@ namespace skyline::gpu {
          * @note The view **must** be locked prior to calling this
          * @note See Buffer::Write
          */
-        bool Write(span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback = {}) const;
+        bool Write(span<u8> data, vk::DeviceSize writeOffset, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback = {}) const;
 
         /*
          * @brief If megabuffering is determined to be beneficial for the underlying buffer, allocates and copies this view into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region
@@ -550,7 +560,7 @@ namespace skyline::gpu {
          * @brief Copies the contents of one view into this one
          * @note The src/dst views **must** be locked prior to calling this
          */
-        void CopyFrom(BufferView src, const std::function<void()> &gpuCopyCallback);
+        void CopyFrom(BufferView src, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback);
 
         constexpr operator bool() {
             return delegate != nullptr;
diff --git a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
index 9d894a92..edf6cc60 100644
--- a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
@@ -113,7 +113,7 @@ namespace skyline::gpu {
                     if (srcBuffer.lock.IsFirstUsage() && newBuffer->dirtyState != Buffer::DirtyState::GpuDirty)
                         copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->mirror.data(), srcBuffer->backing->data());
                     else
-                        newBuffer->MarkGpuDirty();
+                        newBuffer->MarkGpuDirtyImpl();
 
                     // Since we don't synchost source buffers and the source buffers here are GPU dirty their mirrors will be out of date, meaning the backing contents of this source buffer's region in the new buffer from the initial synchost call will be incorrect. By copying backings directly here we can ensure that no writes are lost and that if the newly created buffer needs to turn GPU dirty during recreation no copies need to be done since the backing is as up to date as the mirror at a minimum.
                     copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->backing->data(), srcBuffer->backing->data());
@@ -126,7 +126,7 @@ namespace skyline::gpu {
                 }
             } else {
                 if (srcBuffer->directGpuWritesActive) {
-                    newBuffer->MarkGpuDirty();
+                    newBuffer->MarkGpuDirtyImpl();
                 } else if (srcBuffer->directTrackedShadowActive) {
                     newBuffer->EnableTrackedShadowDirect();
                     copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->directTrackedShadow.data(), srcBuffer->directTrackedShadow.data());
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index c1f403e1..b5dd7f09 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -562,6 +562,7 @@ namespace skyline::gpu::interconnect {
         attachedBuffers.clear();
         allocator->Reset();
         renderPassIndex = 0;
+        usageTracker.sequencedIntervals.Clear();
 
         // Periodically clear preserve attachments just in case there are new waiters which would otherwise end up waiting forever
         if ((submissionNumber % (2U << *state.settings->executorSlotCountScale)) == 0) {
@@ -586,7 +587,6 @@ namespace skyline::gpu::interconnect {
 
             SubmitInternal();
             submissionNumber++;
-
         } else {
             if (callback && *state.settings->useDirectMemoryImport)
                 waiterThread.Queue(nullptr, std::move(callback));
@@ -598,6 +598,8 @@ namespace skyline::gpu::interconnect {
         ResetInternal();
 
         if (wait) {
+            usageTracker.dirtyIntervals.Clear();
+
             std::condition_variable cv;
             std::mutex mutex;
             bool gpuDone{};
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index d03679c6..9eaf5997 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -6,6 +6,7 @@
 #include <boost/container/stable_vector.hpp>
 #include <renderdoc_app.h>
 #include <common/linear_allocator.h>
+#include <gpu/usage_tracker.h>
 #include <gpu/megabuffer.h>
 #include "command_nodes.h"
 #include "common/spin_lock.h"
@@ -217,6 +218,7 @@ namespace skyline::gpu::interconnect {
         size_t submissionNumber{};
         ContextTag executionTag{};
         bool captureNextExecution{};
+        UsageTracker usageTracker;
 
         CommandExecutor(const DeviceState &state);
 
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/common/pipeline.inc b/app/src/main/cpp/skyline/gpu/interconnect/common/pipeline.inc
index c79db589..1b293b73 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/common/pipeline.inc
+++ b/app/src/main/cpp/skyline/gpu/interconnect/common/pipeline.inc
@@ -62,7 +62,7 @@ namespace skyline::gpu::interconnect {
                 dstStageMask |= dstStage;
             }
 
-            view.GetBuffer()->MarkGpuDirty();
+            view.GetBuffer()->MarkGpuDirty(ctx.executor.usageTracker);
         } else {
             if (auto megaBufferBinding{view.TryMegaBuffer(ctx.executor.cycle, ctx.gpu.megaBufferAllocator, ctx.executor.executionTag)})
                 return megaBufferBinding;
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.cpp b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.cpp
index b1a0607d..8242a9c4 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.cpp
@@ -53,13 +53,13 @@ namespace skyline::gpu::interconnect {
             mirrorBlock = blockMapping;
         }
 
-        if (entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber) {
-            entry->channelSequenceNumber = ctx.channelCtx.channelSequenceNumber;
+        if (entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->executionTag != ctx.executor.executionTag) {
+            entry->executionTag = ctx.executor.executionTag;
             entry->dirty = true;
         }
 
         // If the mirror entry has been written to, clear its shader binary cache and retrap to catch any future writes
-        if (entry->dirty) {
+        if (entry->dirty || ctx.executor.usageTracker.sequencedIntervals.Intersect(blockMapping.subspan(blockOffset))) {
             entry->cache.clear();
             entry->dirty = false;
 
@@ -129,7 +129,7 @@ namespace skyline::gpu::interconnect {
         if (programBase != lastProgramBase || programOffset != lastProgramOffset)
             return true;
 
-        if (entry && entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber)
+        if (entry && entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->executionTag != ctx.executor.executionTag)
             return true;
         else if (entry && entry->dirty)
             return true;
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.h b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.h
index a832af16..a2a3e142 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/common/shader_cache.h
@@ -22,7 +22,7 @@ namespace skyline::gpu::interconnect {
 
             static constexpr u32 SkipTrapThreshold{20}; //!< Threshold for the number of times a mirror trap needs to be hit before we fallback to always hashing
             u32 trapCount{}; //!< The number of times the trap has been hit, used to avoid trapping in cases where the constant retraps would harm performance
-            size_t channelSequenceNumber{}; //!< For the case where `trapCount > SkipTrapThreshold`, the memory sequence number number used to clear the cache after every access
+            ContextTag executionTag{}; //!< For the case where `trapCount > SkipTrapThreshold`, the tag of the execution in which the entry was last force-marked dirty, used to clear the cache on the first access in each new execution
             bool dirty{}; //!< If the trap has been hit and the cache needs to be cleared
 
             MirrorEntry(span<u8> alignedMirror) : mirror{alignedMirror} {}
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp b/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp
index 4a4e7bee..ff846bd4 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/fermi_2d.cpp
@@ -123,6 +123,7 @@ namespace skyline::gpu::interconnect {
         auto dstTextureView{gpu.texture.FindOrCreate(dstGuestTexture, executor.tag)};
         executor.AttachDependency(dstTextureView);
         executor.AttachTexture(dstTextureView.get());
+        dstTextureView->texture->MarkGpuDirty(executor.usageTracker);
 
         // Blit shader always samples from centre so adjust if necessary
         float centredSrcRectX{sampleOrigin == SampleModeOrigin::Corner ? srcRectX - 0.5f : srcRectX};
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp b/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp
index b4e4f442..42d22baf 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp
@@ -22,7 +22,7 @@ namespace skyline::gpu::interconnect {
         ContextLock dstBufLock{executor.tag, dstBuf};
 
 
-        dstBuf.Write(src, 0, [&]() {
+        dstBuf.Write(src, 0, executor.usageTracker, [&]() {
             executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
             // This will prevent any CPU accesses to backing for the duration of the usage
             dstBuf.GetBuffer()->BlockAllCpuBackingWrites();
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.cpp
index a9145e08..63e496c9 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/active_state.cpp
@@ -206,7 +206,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
                         dstStageMask |=  vk::PipelineStageFlagBits::eTransformFeedbackEXT;
                     }
 
-                    view->GetBuffer()->MarkGpuDirty();
+                    view->GetBuffer()->MarkGpuDirty(ctx.executor.usageTracker);
                     builder.SetTransformFeedbackBuffer(index, *view);
                     return;
                 } else {
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
index c69d5d0d..0e0d6c39 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp
@@ -46,7 +46,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
         ContextLock lock{ctx.executor.tag, view};
 
         // First attempt the write without setting up the gpu copy callback as a fast path
-        if (view.Write(srcCpuBuf, offset)) [[unlikely]] {
+        if (view.Write(srcCpuBuf, offset, ctx.executor.usageTracker)) [[unlikely]] {
             // Store callback data in a stack allocated struct to avoid heap allocation for the gpu copy callback lambda
             struct GpuCopyCallbackData {
                 InterconnectContext &ctx;
@@ -56,7 +56,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
                 BufferView &view;
             } callbackData{ctx, srcCpuBuf, offset, lock, view};
 
-            view.Write(srcCpuBuf, offset, [&callbackData]() {
+            view.Write(srcCpuBuf, offset, ctx.executor.usageTracker, [&callbackData]() {
                 callbackData.ctx.executor.AttachLockedBufferView(callbackData.view, std::move(callbackData.lock));
                 // This will prevent any CPU accesses to backing for the duration of the usage
                 callbackData.view.GetBuffer()->BlockAllCpuBackingWrites();
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp
index 21dae9d0..07051bf6 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp
@@ -24,7 +24,7 @@ namespace skyline::gpu::interconnect {
         })};
         ContextLock dstBufLock{executor.tag, dstBuf};
 
-        dstBuf.CopyFrom(srcBuf, [&]() {
+        dstBuf.CopyFrom(srcBuf, executor.usageTracker, [&]() {
             executor.AttachLockedBufferView(srcBuf, std::move(srcBufLock));
             executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
             // This will prevent any CPU accesses to backing for the duration of the usage
diff --git a/app/src/main/cpp/skyline/gpu/texture/texture.cpp b/app/src/main/cpp/skyline/gpu/texture/texture.cpp
index 37b128c6..6e7b8d37 100644
--- a/app/src/main/cpp/skyline/gpu/texture/texture.cpp
+++ b/app/src/main/cpp/skyline/gpu/texture/texture.cpp
@@ -725,6 +725,12 @@ namespace skyline::gpu {
         }
     }
 
+    void Texture::MarkGpuDirty(UsageTracker &usageTracker) {
+        for (auto mapping : guest->mappings) // NOTE(review): guest is dereferenced unchecked here while SynchronizeHost returns early on `!guest` - confirm every caller guarantees a guest
+            if (mapping.valid()) // Only valid mappings are recorded in the tracker
+                usageTracker.dirtyIntervals.Insert(mapping);
+    }
+
     void Texture::SynchronizeHost(bool gpuDirty) {
         if (!guest)
             return;
diff --git a/app/src/main/cpp/skyline/gpu/texture/texture.h b/app/src/main/cpp/skyline/gpu/texture/texture.h
index 98933e4e..ac9cbe4b 100644
--- a/app/src/main/cpp/skyline/gpu/texture/texture.h
+++ b/app/src/main/cpp/skyline/gpu/texture/texture.h
@@ -10,6 +10,7 @@
 #include <nce.h>
 #include <gpu/tag_allocator.h>
 #include <gpu/memory_manager.h>
+#include <gpu/usage_tracker.h>
 
 namespace skyline::gpu {
     namespace texture {
@@ -560,6 +561,11 @@ namespace skyline::gpu {
          */
         void TransitionLayout(vk::ImageLayout layout);
 
+        /**
+         * @brief Marks the texture as being GPU dirty by recording each of its valid guest mappings in the supplied usage tracker's dirty intervals
+         */
+        void MarkGpuDirty(UsageTracker &usageTracker);
+
         /**
          * @brief Synchronizes the host texture with the guest after it has been modified
          * @param gpuDirty If true, the texture will be transitioned to being GpuDirty by this call
diff --git a/app/src/main/cpp/skyline/gpu/usage_tracker.h b/app/src/main/cpp/skyline/gpu/usage_tracker.h
new file mode 100644
index 00000000..9bc1f16a
--- /dev/null
+++ b/app/src/main/cpp/skyline/gpu/usage_tracker.h
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#pragma once
+
+#include <common/interval_list.h>
+
+namespace skyline::gpu {
+    /**
+     * @brief Tracks the usage of GPU memory and buffers to allow for fine-grained flushing
+     */
+    struct UsageTracker {
+        IntervalList<u8 *> dirtyIntervals; //!< Intervals of GPU-dirty contents that require a flush before being accessed
+        IntervalList<u8 *> sequencedIntervals; //!< Intervals of GPFIFO-sequenced writes that occur within an execution
+    };
+}