From 38d3ff4300d24c5721e6ea602255f1a59c3d300a Mon Sep 17 00:00:00 2001
From: PixelyIon <pixelyion@protonmail.com>
Date: Sun, 17 Jul 2022 18:18:33 +0530
Subject: [PATCH] Fix `BufferManager::FindOrCreate` Recreation Bugs

It was determined that `FindOrCreate` has several issues which this commit fixes:
* It wouldn't correctly handle locking of the newly created `Buffer` as the constructor would setup traps prior to being able to lock it which could lead to UB
* It wouldn't propagate the `usedByContext`/`everHadInlineUpdate` flags correctly
* It wouldn't correctly set the `dirtyState` of the buffer according to that of its source buffers
---
 app/src/main/cpp/skyline/gpu/buffer.cpp       |  62 ++---------
 app/src/main/cpp/skyline/gpu/buffer.h         |  14 +--
 .../main/cpp/skyline/gpu/buffer_manager.cpp   | 100 +++++++++++++-----
 3 files changed, 93 insertions(+), 83 deletions(-)
diff --git a/app/src/main/cpp/skyline/gpu/buffer.cpp b/app/src/main/cpp/skyline/gpu/buffer.cpp
index 6ebc0d95..6dbab898 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@@ -40,58 +40,6 @@ namespace skyline::gpu {
         SetupGuestMappings();
     }
 
-    Buffer::Buffer(GPU &gpu, GuestBuffer guest, ContextTag tag, span<std::shared_ptr<Buffer>> srcBuffers) : gpu{gpu}, backing{gpu.memory.AllocateBuffer(guest.size())}, guest{guest} {
-        SetupGuestMappings();
-
-        // Source buffers don't necessarily fully overlap with us so we have to perform a sync here to prevent any gaps
-        SynchronizeHost(false);
-
-        // Copies between two buffers based off of their mappings in guest memory
-        auto copyBuffer{[](auto dstGuest, auto srcGuest, auto dstPtr, auto srcPtr) {
-            if (dstGuest.begin().base() <= srcGuest.begin().base()) {
-                size_t dstOffset{static_cast<size_t>(srcGuest.begin().base() - dstGuest.begin().base())};
-                size_t copySize{std::min(dstGuest.size() - dstOffset, srcGuest.size())};
-                std::memcpy(dstPtr + dstOffset, srcPtr, copySize);
-            } else if (dstGuest.begin().base() > srcGuest.begin().base()) {
-                size_t srcOffset{static_cast<size_t>(dstGuest.begin().base() - srcGuest.begin().base())};
-                size_t copySize{std::min(dstGuest.size(), srcGuest.size() - srcOffset)};
-                std::memcpy(dstPtr, srcPtr + srcOffset, copySize);
-            }
-        }};
-
-        // Transfer data/state from source buffers
-        for (const auto &srcBuffer : srcBuffers) {
-            ContextLock lock{tag, *srcBuffer};
-            if (srcBuffer->guest) {
-                if (srcBuffer->cycle && cycle != srcBuffer->cycle)
-                    if (cycle)
-                        cycle->ChainCycle(srcBuffer->cycle);
-                    else
-                        cycle = srcBuffer->cycle;
-
-                if (srcBuffer->dirtyState == Buffer::DirtyState::GpuDirty) {
-                    // If the source buffer is GPU dirty we cannot directly copy over its GPU backing contents
-
-                    // Only sync back the buffer if it's not attached to the current context, otherwise propagate the GPU dirtiness
-                    if (lock.IsFirstUsage()) {
-                        // Perform a GPU -> CPU sync on the source then do a CPU -> GPU sync for the region occupied by the source
-                        // This is required since if we were created from a two buffers: one GPU dirty in the current cycle, and one GPU dirty in the previous cycle, if we marked ourselves as CPU dirty here then the GPU dirtiness from the current cycle buffer would be ignored and cause writes to be missed
-                        srcBuffer->SynchronizeGuest(true);
-                        copyBuffer(guest, *srcBuffer->guest, backing.data(), srcBuffer->mirror.data());
-                    } else {
-                        MarkGpuDirty();
-                    }
-                } else if (srcBuffer->dirtyState == Buffer::DirtyState::Clean) {
-                    // For clean buffers we can just copy over the GPU backing data directly
-                    // This is necessary since clean buffers may not have matching GPU/CPU data in the case of inline updates for host immutable buffers
-                    copyBuffer(guest, *srcBuffer->guest, backing.data(), srcBuffer->backing.data());
-                }
-
-                // CPU dirty buffers are already synchronized in the initial SynchronizeHost call so don't need special handling
-            }
-        }
-    }
-
     Buffer::Buffer(GPU &gpu, vk::DeviceSize size) : gpu(gpu), backing(gpu.memory.AllocateBuffer(size)) {
         dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean
     }
@@ -140,6 +88,16 @@ namespace skyline::gpu {
         return false;
     }
 
+    void Buffer::Invalidate() {
+        if (trapHandle) {
+            gpu.state.nce->DeleteTrap(*trapHandle);
+            trapHandle = {};
+        }
+
+        // Will prevent any sync operations so even if the trap handler is partway through running and hasn't yet acquired the lock it won't do anything
+        guest = {};
+    }
+
     void Buffer::SynchronizeHost(bool rwTrap) {
         if (!guest)
             return;
diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h
index 7c533ed3..6cc28993 100644
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@@ -140,13 +140,6 @@ namespace skyline::gpu {
 
         Buffer(GPU &gpu, GuestBuffer guest);
 
-        /**
-         * @brief Creates a Buffer that is pre-synchronised with the contents of the input buffers
-         * @param tag The tag to associate locking of srcBuffers with
-         * @param srcBuffers Span of overlapping source buffers
-         */
-        Buffer(GPU &gpu, GuestBuffer guest, ContextTag tag, span<std::shared_ptr<Buffer>> srcBuffers);
-
         /**
          * @brief Creates a host-only Buffer which isn't backed by any guest buffer
          * @note The created buffer won't have a mirror so any operations cannot depend on a mirror existing
@@ -222,6 +215,13 @@ namespace skyline::gpu {
          */
         bool PollFence();
 
+        /**
+         * @brief Invalidates the Buffer on the guest and deletes the trap that backs this buffer as it is no longer necessary
+         * @note This will not clear any views or delegates on the buffer, it will only remove guest mappings and delete the trap
+         * @note The buffer **must** be locked prior to calling this
+         */
+        void Invalidate();
+
         /**
          * @brief Synchronizes the host buffer with the guest
          * @param rwTrap If true, the guest buffer will be read/write trapped rather than only being write trapped which is more efficient than calling MarkGpuDirty directly after
diff --git a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
index c5e2613a..5d9fa185 100644
--- a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
@@ -34,17 +34,31 @@ namespace skyline::gpu {
         vk::DeviceSize offset{static_cast<size_t>(guestMapping.begin().base() - alignedStart)}, size{guestMapping.size()};
         guestMapping = span<u8>{alignedStart, alignedEnd};
 
+        struct LockedBuffer {
+            std::shared_ptr<Buffer> buffer;
+            ContextLock<Buffer> lock;
+
+            LockedBuffer(std::shared_ptr<Buffer> pBuffer, ContextTag tag) : buffer{std::move(pBuffer)}, lock{tag, *buffer} {}
+
+            Buffer *operator->() const {
+                return buffer.get();
+            }
+
+            std::shared_ptr<Buffer> &operator*() {
+                return buffer;
+            }
+        };
+
         // Lookup for any buffers overlapping with the supplied guest mapping
-        boost::container::small_vector<std::shared_ptr<Buffer>, 4> overlaps;
+        boost::container::small_vector<LockedBuffer, 4> overlaps;
         for (auto entryIt{std::lower_bound(buffers.begin(), buffers.end(), guestMapping.end().base(), BufferLessThan)}; entryIt != buffers.begin() && (*--entryIt)->guest->begin() <= guestMapping.end();)
             if ((*entryIt)->guest->end() > guestMapping.begin())
-                overlaps.push_back(*entryIt);
+                overlaps.emplace_back(*entryIt, tag);
 
         if (overlaps.size() == 1) [[likely]] {
-            auto buffer{overlaps.front()};
+            auto &buffer{overlaps.front()};
             if (buffer->guest->begin() <= guestMapping.begin() && buffer->guest->end() >= guestMapping.end()) {
                 // If we find a buffer which can entirely fit the guest mapping, we can just return a view into it
-                ContextLock bufferLock{tag, *buffer};
                 return buffer->GetView(static_cast<vk::DeviceSize>(guestMapping.begin() - buffer->guest->begin()) + offset, size);
             }
         }
@@ -59,20 +73,56 @@ namespace skyline::gpu {
                 highestAddress = mapping.end().base();
         }
 
-        auto newBuffer{std::make_shared<Buffer>(gpu, span<u8>{lowestAddress, highestAddress}, tag, overlaps)};
-        ContextLock newLock{tag, *newBuffer};
-        bool hasAlreadyAttached{}; //!< If any overlapping view was already attached to the current context
-        for (auto &overlap : overlaps) {
-            ContextLock overlapLock{tag, *overlap};
+        LockedBuffer newBuffer{std::make_shared<Buffer>(gpu, span<u8>{lowestAddress, highestAddress}), tag}; // If we don't lock the buffer prior to trapping it during synchronization, a race could occur with a guest trap acquiring the lock before we do and mutating the buffer prior to it being ready
 
-            if (!overlapLock.IsFirstUsage())
-                hasAlreadyAttached = true;
+        newBuffer->SynchronizeHost(false); // Overlaps don't necessarily fully cover the buffer so we have to perform a sync here to prevent any gaps
 
-            buffers.erase(std::find(buffers.begin(), buffers.end(), overlap));
+        auto copyBuffer{[](auto dstGuest, auto srcGuest, auto dstPtr, auto srcPtr) {
+            if (dstGuest.begin().base() <= srcGuest.begin().base()) {
+                size_t dstOffset{static_cast<size_t>(srcGuest.begin().base() - dstGuest.begin().base())};
+                size_t copySize{std::min(dstGuest.size() - dstOffset, srcGuest.size())};
+                std::memcpy(dstPtr + dstOffset, srcPtr, copySize);
+            } else if (dstGuest.begin().base() > srcGuest.begin().base()) {
+                size_t srcOffset{static_cast<size_t>(dstGuest.begin().base() - srcGuest.begin().base())};
+                size_t copySize{std::min(dstGuest.size(), srcGuest.size() - srcOffset)};
+                std::memcpy(dstPtr, srcPtr + srcOffset, copySize);
+            }
+        }}; //!< Copies between two buffers based off of their mappings in guest memory
+
+        bool shouldAttach{}; //!< If the new buffer should be attached to the current context
+        for (auto &srcBuffer : overlaps) {
+            if (!srcBuffer.lock.IsFirstUsage())
+                // If any overlapping buffer was already attached to the current context, we should also attach the current context
+                shouldAttach = true;
+
+            // All newly created buffers that have this set are guaranteed to be attached in buffer FindOrCreate, attach will then lock the buffer without resetting this flag, which will only finally be reset when the lock is released
+            newBuffer->usedByContext |= srcBuffer->usedByContext;
+            newBuffer->everHadInlineUpdate |= srcBuffer->everHadInlineUpdate;
+
+            if (srcBuffer->cycle && newBuffer->cycle != srcBuffer->cycle)
+                if (newBuffer->cycle)
+                    newBuffer->cycle->ChainCycle(srcBuffer->cycle);
+                else
+                    newBuffer->cycle = srcBuffer->cycle;
+
+            if (srcBuffer->dirtyState == Buffer::DirtyState::GpuDirty) {
+                srcBuffer->WaitOnFence();
+
+                if (srcBuffer.lock.IsFirstUsage() && newBuffer->dirtyState != Buffer::DirtyState::GpuDirty)
+                    copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->mirror.data(), srcBuffer->backing.data());
+                else
+                    newBuffer->MarkGpuDirty();
+            } else if (srcBuffer->usedByContext) {
+                if (srcBuffer->dirtyState == Buffer::DirtyState::CpuDirty)
+                    Logger::Error("Buffer (0x{}-0x{}) is marked as CPU dirty while being utilized by the context, this is not valid", srcBuffer->guest->begin().base(), srcBuffer->guest->end().base());
+                // We need the backing to be stable so that any accesses within this context are sequenced correctly, we can't use the source mirror here either since buffer writes within this context will update the mirror on CPU and backing on GPU
+                srcBuffer->WaitOnFence();
+                copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->backing.data(), srcBuffer->backing.data());
+            }
 
             // Transfer all views from the overlapping buffer to the new buffer with the new buffer and updated offset, ensuring pointer stability
-            vk::DeviceSize overlapOffset{static_cast<vk::DeviceSize>(overlap->guest->begin() - newBuffer->guest->begin())};
-            for (auto it{overlap->views.begin()}; it != overlap->views.end(); it++) {
+            vk::DeviceSize overlapOffset{static_cast<vk::DeviceSize>(srcBuffer->guest->begin() - newBuffer->guest->begin())};
+            for (auto it{srcBuffer->views.begin()}; it != srcBuffer->views.end(); it++) {
                 if (overlapOffset)
                     // This is a slight hack as we really shouldn't be changing the underlying non-mutable set elements without a rehash but without writing our own set impl this is the best we can do
                     const_cast<Buffer::BufferViewStorage *>(&*it)->offset += overlapOffset;
@@ -84,26 +134,28 @@ namespace skyline::gpu {
 
             if (overlapOffset)
                 // All current hashes are invalidated by above loop if overlapOffset is nonzero so rehash the container
-                overlap->views.rehash(0);
+                srcBuffer->views.rehash(0);
 
             // Merge the view sets, this will keep pointer stability hence avoiding any reallocation
-            newBuffer->views.merge(overlap->views);
+            newBuffer->views.merge(srcBuffer->views);
 
             // Transfer all delegates references from the overlapping buffer to the new buffer
-            for (auto &delegate : overlap->delegates) {
-                delegate->buffer = newBuffer;
+            for (auto &delegate : srcBuffer->delegates) {
+                delegate->buffer = *newBuffer;
                 if (delegate->usageCallback)
-                    delegate->usageCallback(*delegate->view, newBuffer);
+                    delegate->usageCallback(*delegate->view, *newBuffer);
             }
 
-            newBuffer->delegates.splice(newBuffer->delegates.end(), overlap->delegates);
+            newBuffer->delegates.splice(newBuffer->delegates.end(), srcBuffer->delegates);
+
+            srcBuffer->Invalidate(); // Invalidate the overlapping buffer so it can't be synced in the future
+            buffers.erase(std::find(buffers.begin(), buffers.end(), srcBuffer.buffer));
         }
 
-        if (hasAlreadyAttached)
-            // We need to reattach the buffer if any overlapping views were already attached to the current context
-            attachBuffer(newBuffer, std::move(newLock));
+        if (shouldAttach)
+            attachBuffer(*newBuffer, std::move(newBuffer.lock));
 
-        buffers.insert(std::lower_bound(buffers.begin(), buffers.end(), newBuffer->guest->end().base(), BufferLessThan), newBuffer);
+        buffers.insert(std::lower_bound(buffers.begin(), buffers.end(), newBuffer->guest->end().base(), BufferLessThan), *newBuffer);
 
         return newBuffer->GetView(static_cast<vk::DeviceSize>(guestMapping.begin() - newBuffer->guest->begin()) + offset, size);
     }