Implement an alternative buffer path using direct memory importing

By importing guest memory directly onto the host GPU we can avoid many of the complexities that occur with memory tracking, as well as its heavy performance overhead in some situations. Since it's still desirable to support the traditional buffer method, as it's faster in some cases and more widely supported, most of the exposed buffer methods have been split into two variants with just a small amount of shared code. While in most cases the code is simpler, one area with more complexity is handling CPU accesses that need to be sequenced, since we don't have any place on the GPFIFO thread where we can easily apply writes that won't also impact the buffer on the GPU. To solve this, when the GPU is actively using a buffer's contents, an interval list is used to keep track of any GPFIFO-written regions on the CPU, and any CPU reads to them will instead be directed to a shadow of the buffer with just those writes applied. Once the GPU has finished using the buffer contents the shadow can then be removed, as all writes will have been done by the GPU. The main caveat of this is that it requires tying host sync to guest sync; this can reduce performance in games which double buffer command buffers, as it prevents us from fully saturating the CPU with the GPFIFO thread.
2024-11-23 01:59:19 +01:00 · 2022-12-27 18:21:58 +00:00 · 2022-12-27 18:21:58 +00:00 · 3d31ade35f
commit 3d31ade35f
parent b3f7e990cc
9 changed files with 481 additions and 152 deletions
--- a/app/src/main/cpp/skyline/common/android_settings.h
+++ b/app/src/main/cpp/skyline/common/android_settings.h
@ -42,6 +42,7 @@ namespace skyline {
            gpuDriverLibraryName = ktSettings.GetString("gpuDriverLibraryName");
            executorSlotCountScale = ktSettings.GetInt<u32>("executorSlotCountScale");
            executorFlushThreshold = ktSettings.GetInt<u32>("executorFlushThreshold");
            useDirectMemoryImport = ktSettings.GetBool("useDirectMemoryImport");
            enableFastGpuReadbackHack = ktSettings.GetBool("enableFastGpuReadbackHack");
            isAudioOutputDisabled = ktSettings.GetBool("isAudioOutputDisabled");
            validationLayer = ktSettings.GetBool("validationLayer");
--- a/app/src/main/cpp/skyline/common/settings.h
+++ b/app/src/main/cpp/skyline/common/settings.h
@ -74,6 +74,7 @@ namespace skyline {
        Setting<std::string> gpuDriverLibraryName; //!< The name of the GPU driver library to use
        Setting<u32> executorSlotCountScale; //!< Number of GPU executor slots that can be used concurrently
        Setting<u32> executorFlushThreshold; //!< Number of commands that need to accumulate before they're flushed to the GPU
        Setting<bool> useDirectMemoryImport; //!< If buffer emulation should be done by importing guest buffer mappings
        // Hacks
        Setting<bool> enableFastGpuReadbackHack; //!< If the CPU texture readback skipping hack should be used
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@ -1,6 +1,7 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
 #include <adrenotools/driver.h>
 #include <gpu.h>
 #include <kernel/memory.h>
 #include <kernel/types/KProcess.h>
@ -18,12 +19,9 @@ namespace skyline::gpu {
        unifiedMegaBuffer = {};
    }
-    void Buffer::SetupGuestMappings() {
+    void Buffer::SetupStagedTraps() {
-        u8 *alignedData{util::AlignDown(guest->data(), constant::PageSize)};
+        if (isDirect)
-        size_t alignedSize{static_cast<size_t>(util::AlignUp(guest->data() + guest->size(), constant::PageSize) - alignedData)};
+            return;
        alignedMirror = gpu.state.process->memory.CreateMirror(span<u8>{alignedData, alignedSize});
        mirror = alignedMirror.subspan(static_cast<size_t>(guest->data() - alignedData), guest->size());
        // We can't just capture this in the lambda since the lambda could exceed the lifetime of the buffer
        std::weak_ptr<Buffer> weakThis{shared_from_this()};
@ -99,7 +97,7 @@ namespace skyline::gpu {
            if (buffer->accumulatedGuestWaitTime > FastReadbackHackWaitTimeThreshold && *buffer->gpu.state.settings->enableFastGpuReadbackHack) {
                // As opposed to skipping readback as we do for textures, with buffers we can still perform the readback but just without syncinc the GPU
                // While the read data may be invalid it's still better than nothing and works in most cases
-                memcpy(buffer->mirror.data(), buffer->backing.data(), buffer->mirror.size());
+                memcpy(buffer->mirror.data(), buffer->backing->data(), buffer->mirror.size());
                buffer->dirtyState = DirtyState::Clean;
                return true;
            }
@ -118,37 +116,266 @@ namespace skyline::gpu {
        });
    }
-    Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, GuestBuffer guest, size_t id)
+    void Buffer::InsertWriteIntervalDirect(WriteTrackingInterval entry) {
-        : gpu{gpu},
+        auto firstIt{std::lower_bound(directTrackedWrites.begin(), directTrackedWrites.end(), entry, [](const auto &lhs, const auto &rhs) {
-          backing{gpu.memory.AllocateBuffer(guest.size())},
+            return lhs.end < rhs.offset;
-          guest{guest},
+        })}; // Lowest offset entry that (maybe) overlaps with the new entry
-          delegate{delegateAllocator.EmplaceUntracked<BufferDelegate>(this)},
+
-          id{id},
+        if (firstIt == directTrackedWrites.end() || firstIt->offset >= entry.end) {
-          megaBufferTableShift{std::max(std::bit_width(guest.size() / MegaBufferTableMaxEntries - 1), MegaBufferTableShiftMin)} {
+            directTrackedWrites.insert(firstIt, entry);
-        megaBufferTable.resize(guest.size() / (1 << megaBufferTableShift));
+            return;
        }
        // Now firstIt will always overlap
        auto lastIt{firstIt}; // Highest offset entry that overlaps with the new entry
        while (std::next(lastIt) != directTrackedWrites.end() && std::next(lastIt)->offset < entry.end)
            lastIt++;
        // Since firstIt and lastIt both are guaranteed to overlap, max them to get the new entry's end
        size_t end{std::max(std::max(firstIt->end, entry.end), lastIt->end)};
        // Erase all overlapping entries but the first
        auto eraseStartIt{std::next(firstIt)};
        auto eraseEndIt{std::next(lastIt)};
        if (eraseStartIt != eraseEndIt) {
            lastIt = directTrackedWrites.erase(eraseStartIt, eraseEndIt);
            firstIt = std::prev(lastIt);
        }
-    Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, vk::DeviceSize size, size_t id)
+        firstIt->offset = std::min(entry.offset, firstIt->offset);
-        : gpu{gpu},
+        firstIt->end = end;
          backing{gpu.memory.AllocateBuffer(size)},
          delegate{delegateAllocator.EmplaceUntracked<BufferDelegate>(this)},
          id{id} {
        dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean
    }
-    Buffer::~Buffer() {
+    Buffer::QueryIntervalResult Buffer::QueryWriteIntervalDirect(u64 offset) {
-        if (trapHandle)
+        auto it{std::lower_bound(directTrackedWrites.begin(), directTrackedWrites.end(), offset, [](const auto &lhs, const auto &rhs) {
-            gpu.state.nce->DeleteTrap(*trapHandle);
+            return lhs.end < rhs;
-        SynchronizeGuest(true);
+        })}; // Lowest offset entry that (maybe) overlaps with the new entry
-        if (alignedMirror.valid())
+
-            munmap(alignedMirror.data(), alignedMirror.size());
+        if (it == directTrackedWrites.end()) // No overlaps for the entire rest of buffer
            return {false, mirror.size() - offset};
        else if (it->offset > offset) // No overlap, return the distance to the next possible overlap
            return {false, it->offset - offset};
        else // Overlap, return the distance to the end of the overlap
            return {true, it->end - offset};
    }
    /**
     * @brief (Direct) Activates the tracked shadow if it isn't active yet, sizing it to the guest buffer's size
     */
    void Buffer::EnableTrackedShadowDirect() {
        if (directTrackedShadowActive)
            return; // The shadow is already set up, nothing more to do

        directTrackedShadow.resize(guest->size());
        directTrackedShadowActive = true;
    }
    /**
     * @brief (Direct) Records a write of `size` bytes at `offset` in the write-tracking list and hands back the matching region of the shadow
     * @return A span over the shadow covering [offset, offset + size) that the caller fills with the written data
     */
    span<u8> Buffer::BeginWriteCpuSequencedDirect(size_t offset, size_t size) {
        // The shadow has to exist before a region of it can be handed out
        EnableTrackedShadowDirect();

        const size_t writeEnd{offset + size};
        InsertWriteIntervalDirect(WriteTrackingInterval{offset, writeEnd});

        u8 *shadowRegion{directTrackedShadow.data() + offset};
        return {shadowRegion, size};
    }
    /**
     * @brief (Direct) Re-evaluates whether GPU reads of the buffer are still considered active, dropping the shadow and tracked write intervals once they aren't
     * @return If GPU reads are still active (sequenced CPU backing writes are blocked or the fence hasn't signalled)
     */
    bool Buffer::RefreshGpuReadsActiveDirect() {
        if (SequencedCpuBackingWritesBlocked() || !PollFence())
            return true;

        // Reads are no longer active: release the shadow storage and forget every tracked write
        if (directTrackedShadowActive) {
            directTrackedShadowActive = false;
            directTrackedShadow.clear();
            directTrackedShadow.shrink_to_fit();
        }
        directTrackedWrites.clear();

        return false;
    }
    bool Buffer::RefreshGpuWritesActiveDirect(bool wait, const std::function<void()> &flushHostCallback) {
        if (directGpuWritesActive && (!PollFence() || AllCpuBackingWritesBlocked())) {
            if (wait) {
                if (AllCpuBackingWritesBlocked()) // If we are dirty in the current cycle we'll need to flush
                    flushHostCallback();
                WaitOnFence();
                // No longer dirty
            } else {
                return true;
            }
        }
-    void Buffer::MarkGpuDirty() {
+        directGpuWritesActive = false;
-        if (!guest)
+        return false;
    }
    /**
     * @brief (Direct) Decides if a view of `size` bytes into this buffer is eligible for megabuffering
     * @return If the view can be megabuffered
     */
    bool Buffer::ValidateMegaBufferViewImplDirect(vk::DeviceSize size) {
        // Buffers that never had an inline update, or views of at least a megabuffer chunk, are never megabuffered
        const bool ineligible{!everHadInlineUpdate || size >= MegaBufferChunkSize};
        if (ineligible)
            return false;

        // The view is also rejected while the GPU is writing to the buffer, or while shadow-tracked writes mean the mirror isn't fully up to date
        // Note: The GPU write state is refreshed first and the shadow flag is only consulted when no GPU writes are active
        return !RefreshGpuWritesActiveDirect() && !directTrackedShadowActive;
    }
    /**
     * @brief (Staged) Decides if a view of `size` bytes into this buffer is eligible for megabuffering
     * @return If the view can be megabuffered
     */
    bool Buffer::ValidateMegaBufferViewImplStaged(vk::DeviceSize size) {
        // Megabuffering a buffer with no inline updates that also isn't frequently synced only adds copying without avoiding any GPU inline updates
        const bool rarelyUpdated{!everHadInlineUpdate && sequenceNumber < FrequentlySyncedThreshold};
        if (rarelyUpdated || size >= MegaBufferChunkSize)
            return false;

        // The dirty state is read without taking the lock: it is only ever set GPU dirty with the buffer locked and from the active GPFIFO thread, and skipping the lock helps performance
        // A GPU dirty buffer can't be synced, its contents aren't known ahead of time so the sequence is indeterminate
        return dirtyState != DirtyState::GpuDirty;
    }
    /**
     * @brief Dispatches the megabuffer eligibility check to the direct or staged implementation depending on the buffer type
     */
    bool Buffer::ValidateMegaBufferView(vk::DeviceSize size) {
        if (isDirect)
            return ValidateMegaBufferViewImplDirect(size);

        return ValidateMegaBufferViewImplStaged(size);
    }
    /**
     * @brief (Direct) Copies `size` bytes from `src` at `srcOffset` into this buffer at `dstOffset`
     * @param gpuCopyCallback Invoked to perform the copy on the GPU whenever either GPU writes or GPU reads are active
     */
    void Buffer::CopyFromImplDirect(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
        everHadInlineUpdate = true;

        // The source is refreshed first and this buffer is only refreshed for writes if the source has no active GPU writes
        const bool gpuWritesPending{src->RefreshGpuWritesActiveDirect() || RefreshGpuWritesActiveDirect()};
        // The read state is always refreshed, but shadow tracking is only used when the copy isn't already GPU-tracked
        const bool gpuReadsPending{RefreshGpuReadsActiveDirect()};
        const bool shadowWriteNeeded{gpuReadsPending && !gpuWritesPending};

        if (!gpuWritesPending && !shadowWriteNeeded) {
            // The GPU isn't touching the destination, so the source contents can be read straight into the mirror
            src->Read(false, {}, {mirror.data() + dstOffset, size}, srcOffset);
            return;
        }

        if (gpuWritesPending)
            // Force the buffer to be dirty for this cycle if either side is dirty, as otherwise it could have just been dirty from the previous cycle
            MarkGpuDirty();

        gpuCopyCallback();

        if (shadowWriteNeeded)
            // Mirror the copy into the shadow so it is tracked as a sequenced write
            src->Read(false, {}, BeginWriteCpuSequencedDirect(dstOffset, size), srcOffset);
    }
    /**
     * @brief (Staged) Copies `size` bytes from `src` at `srcOffset` into this buffer at `dstOffset`, choosing between a CPU copy and `gpuCopyCallback` based on the dirty state of both buffers
     */
    void Buffer::CopyFromImplStaged(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
        std::scoped_lock lock{stateMutex, src->stateMutex}; // Fine even if src and dst are same since recursive mutex

        if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked())
            // If the buffer is used in sequence directly on the GPU, SynchronizeHost before modifying the mirror contents to ensure proper sequencing. This write will then be sequenced on the GPU instead (the buffer will be kept clean for the rest of the execution due to gpuCopyCallback blocking all writes)
            SynchronizeHost();

        if (dirtyState == DirtyState::GpuDirty || src->dirtyState == DirtyState::GpuDirty) {
            // Either side only has valid contents on the GPU, so the copy has to happen there
            MarkGpuDirty();
            gpuCopyCallback();
            return;
        }

        const u8 *srcData{src->mirror.data() + srcOffset};
        std::memcpy(mirror.data() + dstOffset, srcData, size);

        if (dirtyState == DirtyState::CpuDirty && !SequencedCpuBackingWritesBlocked())
            // Skip updating backing if the changes are gonna be updated later by SynchroniseHost in executor anyway
            return;

        if (SequencedCpuBackingWritesBlocked() || !PollFence())
            // The backing may still be in use by a past workload (in the current context or another), so the copy is done on the GPU
            gpuCopyCallback();
        else
            // We can write directly to the backing as this resource isn't being actively used
            std::memcpy(backing->data() + dstOffset, srcData, size);
    }
    /**
     * @brief (Direct) Writes `data` into the buffer at `offset`, deferring to `gpuCopyCallback` while the GPU is using the buffer
     * @return If the write couldn't be performed because the GPU is using the buffer and no `gpuCopyCallback` was supplied
     */
    bool Buffer::WriteImplDirect(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) {
        const bool hasGpuCopy{static_cast<bool>(gpuCopyCallback)};

        // If the buffer is GPU dirty do the write on the GPU and we're done
        if (RefreshGpuWritesActiveDirect()) {
            if (!hasGpuCopy)
                return true;

            // Propagate dirtiness to the current cycle, since if this is only dirty in a previous cycle that could change at any time and we would need to have the write saved somewhere for CPU reads
            // By propagating the dirtiness to the current cycle we can avoid this and force a wait on any reads
            MarkGpuDirty();
            gpuCopyCallback();
            return false;
        }

        // If the GPU could read the buffer we need to track the write in the shadow and do the actual write on the GPU
        if (RefreshGpuReadsActiveDirect()) {
            if (!hasGpuCopy)
                return true;

            gpuCopyCallback();
            BeginWriteCpuSequencedDirect(offset, data.size()).copy_from(data);
            return false;
        }

        // If the GPU isn't accessing the mirror we can just write directly to it
        std::memcpy(mirror.data() + offset, data.data(), data.size());
        return false;
    }
    /**
     * @brief (Staged) Writes `data` into the buffer at `offset`, updating the mirror and then either the backing directly or via `gpuCopyCallback`
     * @param gpuCopyCallback Optional callback that performs the write on the GPU, used when the backing can't be written from the CPU
     * @return If the write needs to be repeated by the caller with an appropriate `gpuCopyCallback`
     */
    bool Buffer::WriteImplStaged(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) {
        // We cannot have *ANY* state changes for the duration of this function, if the buffer became CPU dirty partway through the GPU writes would mismatch the CPU writes
        std::scoped_lock lock{stateMutex};

        // If the buffer is GPU dirty do the write on the GPU and we're done
        if (dirtyState == DirtyState::GpuDirty) {
            if (!gpuCopyCallback)
                return true;

            gpuCopyCallback();
            // Return here rather than falling through: the code below would otherwise either run the GPU copy a second time or write to the backing directly even though the write has already been handed to the GPU
            return false;
        }

        if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked())
            // If the buffer is used in sequence directly on the GPU, SynchronizeHost before modifying the mirror contents to ensure proper sequencing. This write will then be sequenced on the GPU instead (the buffer will be kept clean for the rest of the execution due to gpuCopyCallback blocking all writes)
            SynchronizeHost();

        std::memcpy(mirror.data() + offset, data.data(), data.size()); // Always copy to mirror since any CPU side reads will need the up-to-date contents

        if (dirtyState == DirtyState::CpuDirty && !SequencedCpuBackingWritesBlocked())
            // Skip updating backing if the changes are gonna be updated later by SynchroniseHost in executor anyway
            return false;

        if (!SequencedCpuBackingWritesBlocked() && PollFence()) {
            // We can write directly to the backing as long as this resource isn't being actively used by a past workload (in the current context or another)
            std::memcpy(backing->data() + offset, data.data(), data.size());
        } else {
            // If this buffer is host immutable, perform a GPU-side inline update for the buffer contents since we can't directly modify the backing
            // If no copy callback is supplied, return true to indicate that the caller should repeat the write with an appropriate callback
            if (gpuCopyCallback)
                gpuCopyCallback();
            else
                return true;
        }

        return false;
    }
    /**
     * @brief (Direct) Reads `data.size()` bytes starting at `offset` into `data`, sourcing tracked-write regions from the shadow while GPU reads are active
     * @param flushHostCallback Passed on to RefreshGpuWritesActiveDirect for use while waiting on active GPU writes
     */
    void Buffer::ReadImplDirect(const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
        // If GPU writes are active then wait until that's no longer the case
        RefreshGpuWritesActiveDirect(true, flushHostCallback);

        if (directTrackedShadowActive && RefreshGpuReadsActiveDirect()) {
            const size_t readEnd{offset + data.size()};
            size_t curOffset{offset};

            while (curOffset < readEnd) {
                auto result{QueryWriteIntervalDirect(curOffset)};
                auto srcData{result.useShadow ? directTrackedShadow.data() : mirror.data()};

                // The query reports the distance to the end of the current interval/gap (or of the whole buffer), which can extend past the requested range
                // Clamp it so that we never copy beyond the end of `data` or step over `readEnd`
                const size_t chunkSize{std::min<size_t>(result.size, readEnd - curOffset)};
                // NOTE(review): if the query ever reports a size of zero this loop will not advance - confirm QueryWriteIntervalDirect cannot do so for an offset sitting exactly at an interval's end

                std::memcpy(data.data() + (curOffset - offset), srcData + curOffset, chunkSize);
                curOffset += chunkSize;
            }
        } else [[likely]] {
            std::memcpy(data.data(), mirror.data() + offset, data.size());
        }
    }
    void Buffer::ReadImplStaged(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
        if (dirtyState == DirtyState::GpuDirty)
            SynchronizeGuestImmediate(isFirstUsage, flushHostCallback);
        std::memcpy(data.data(), mirror.data() + offset, data.size());
    }
    /**
     * @brief (Direct) Marks the buffer as being written to by the GPU
     */
    void Buffer::MarkGpuDirtyImplDirect() {
        directGpuWritesActive = true; // Reset to false again by RefreshGpuWritesActiveDirect once GPU writes are no longer active
        BlockAllCpuBackingWrites();
        AdvanceSequence();
    }
    void Buffer::MarkGpuDirtyImplStaged() {
        std::scoped_lock lock{stateMutex}; // stateMutex is locked to prevent state changes at any point during this function
        if (dirtyState == DirtyState::GpuDirty)
@ -166,6 +393,49 @@ namespace skyline::gpu {
        AdvanceSequence(); // The GPU will modify buffer contents so advance to the next sequence
    }
    /**
     * @brief Creates a guest-backed buffer, with its GPU backing either imported directly from the guest mirror or allocated separately
     * @param direct If the buffer's backing should be imported from the mirror (direct) rather than allocated as a separate host buffer (staged)
     */
    Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, GuestBuffer guest, size_t id, bool direct)
        : gpu{gpu},
          guest{guest},
          mirror{gpu.state.process->memory.CreateMirror(guest)},
          delegate{delegateAllocator.EmplaceUntracked<BufferDelegate>(this)},
          isDirect{direct},
          id{id},
          megaBufferTableShift{std::max(std::bit_width(guest.size() / MegaBufferTableMaxEntries - 1), MegaBufferTableShiftMin)} {
        // Only one of the two backings is populated, depending on the buffer type
        if (isDirect)
            directBacking = gpu.memory.ImportBuffer(mirror); // Direct: the GPU buffer is created from the mirror mapping itself
        else
            backing = gpu.memory.AllocateBuffer(mirror.size()); // Staged: a separate buffer of the same size as the mirror

        megaBufferTable.resize(guest.size() / (1 << megaBufferTableShift));
    }
    /**
     * @brief Creates a host-only buffer of `size` bytes which has no guest mappings associated with it
     */
    Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, vk::DeviceSize size, size_t id)
        : gpu{gpu},
          backing{gpu.memory.AllocateBuffer(size)},
          delegate{delegateAllocator.EmplaceUntracked<BufferDelegate>(this)},
          id{id} {
        dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean
    }
    /**
     * @brief Tears the buffer down: removes any guest trap, synchronizes the guest, unmaps the mirror and waits on the fence
     */
    Buffer::~Buffer() {
        if (trapHandle)
            gpu.state.nce->DeleteTrap(*trapHandle);
        SynchronizeGuest(true); // Returns early without doing anything for host-only and direct buffers
        if (mirror.valid())
            munmap(mirror.data(), mirror.size());
        // NOTE(review): the fence is waited on after the mirror has been unmapped; for direct buffers the backing is imported from the mirror - confirm this ordering is intended
        WaitOnFence();
    }
    /**
     * @brief Marks the buffer as dirty on the GPU using the direct or staged implementation, this is a no-op for buffers without a guest
     */
    void Buffer::MarkGpuDirty() {
        if (guest) {
            if (isDirect) {
                MarkGpuDirtyImplDirect();
            } else {
                MarkGpuDirtyImplStaged();
            }
        }
    }
    void Buffer::WaitOnFence() {
        TRACE_EVENT("gpu", "Buffer::WaitOnFence");
@ -198,7 +468,7 @@ namespace skyline::gpu {
    }
    void Buffer::SynchronizeHost(bool skipTrap) {
-        if (!guest)
+        if (!guest || isDirect)
            return;
        TRACE_EVENT("gpu", "Buffer::SynchronizeHost");
@ -217,11 +487,11 @@ namespace skyline::gpu {
                gpu.state.nce->TrapRegions(*trapHandle, true); // Trap any future CPU writes to this buffer, must be done before the memcpy so that any modifications during the copy are tracked
        }
-        std::memcpy(backing.data(), mirror.data(), mirror.size());
+        std::memcpy(backing->data(), mirror.data(), mirror.size());
    }
    bool Buffer::SynchronizeGuest(bool skipTrap, bool nonBlocking) {
-        if (!guest)
+        if (!guest || isDirect)
            return false;
        TRACE_EVENT("gpu", "Buffer::SynchronizeGuest");
@ -236,7 +506,7 @@ namespace skyline::gpu {
                return false; // If the fence is not signalled and non-blocking behaviour is requested then bail out
            WaitOnFence();
-            std::memcpy(mirror.data(), backing.data(), mirror.size());
+            std::memcpy(mirror.data(), backing->data(), mirror.size());
            dirtyState = DirtyState::Clean;
        }
@ -248,6 +518,9 @@ namespace skyline::gpu {
    }
    void Buffer::SynchronizeGuestImmediate(bool isFirstUsage, const std::function<void()> &flushHostCallback) {
        if (isDirect)
            return;
        // If this buffer was attached to the current cycle, flush all pending host GPU work and wait to ensure that we read valid data
        if (!isFirstUsage)
            flushHostCallback();
@ -256,81 +529,31 @@ namespace skyline::gpu {
    }
    void Buffer::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
-        if (dirtyState == DirtyState::GpuDirty)
+        if (isDirect)
-            SynchronizeGuestImmediate(isFirstUsage, flushHostCallback);
+            ReadImplDirect(flushHostCallback, data, offset);
-
+        else
-        std::memcpy(data.data(), mirror.data() + offset, data.size());
+            ReadImplStaged(isFirstUsage, flushHostCallback, data, offset);
    }
    bool Buffer::Write(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) {
        AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
        everHadInlineUpdate = true;
-        // We cannot have *ANY* state changes for the duration of this function, if the buffer became CPU dirty partway through the GPU writes would mismatch the CPU writes
+        if (isDirect)
-        std::scoped_lock lock{stateMutex};
+            return WriteImplDirect(data, offset, gpuCopyCallback);
        // If the buffer is GPU dirty do the write on the GPU and we're done
        if (dirtyState == DirtyState::GpuDirty) {
            if (gpuCopyCallback)
                gpuCopyCallback();
        else
-                return true;
+            return WriteImplStaged(data, offset, gpuCopyCallback);
        }
        if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked())
            // If the buffer is used in sequence directly on the GPU, SynchronizeHost before modifying the mirror contents to ensure proper sequencing. This write will then be sequenced on the GPU instead (the buffer will be kept clean for the rest of the execution due to gpuCopyCallback blocking all writes)
            SynchronizeHost();
        std::memcpy(mirror.data() + offset, data.data(), data.size()); // Always copy to mirror since any CPU side reads will need the up-to-date contents
        if (dirtyState == DirtyState::CpuDirty && !SequencedCpuBackingWritesBlocked())
            // Skip updating backing if the changes are gonna be updated later by SynchroniseHost in executor anyway
            return false;
        if (!SequencedCpuBackingWritesBlocked() && PollFence()) {
            // We can write directly to the backing as long as this resource isn't being actively used by a past workload (in the current context or another)
            std::memcpy(backing.data() + offset, data.data(), data.size());
        } else {
            // If this buffer is host immutable, perform a GPU-side inline update for the buffer contents since we can't directly modify the backing
            // If no copy callback is supplied, return true to indicate that the caller should repeat the write with an appropriate callback
            if (gpuCopyCallback)
                gpuCopyCallback();
            else
                return true;
        }
        return false;
    }
    void Buffer::CopyFrom(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
        AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
        everHadInlineUpdate = true;
-        std::scoped_lock lock{stateMutex, src->stateMutex}; // Fine even if src and dst are same since recursive mutex
+        if (isDirect)
-
+            CopyFromImplDirect(dstOffset, src, srcOffset, size, gpuCopyCallback);
-        if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked())
+        else
-            // If the buffer is used in sequence directly on the GPU, SynchronizeHost before modifying the mirror contents to ensure proper sequencing. This write will then be sequenced on the GPU instead (the buffer will be kept clean for the rest of the execution due to gpuCopyCallback blocking all writes)
+            CopyFromImplStaged(dstOffset, src, srcOffset, size, gpuCopyCallback);
            SynchronizeHost();
        if (dirtyState != DirtyState::GpuDirty && src->dirtyState != DirtyState::GpuDirty) {
            std::memcpy(mirror.data() + dstOffset, src->mirror.data() + srcOffset, size);
            if (dirtyState == DirtyState::CpuDirty && !SequencedCpuBackingWritesBlocked())
                // Skip updating backing if the changes are gonna be updated later by SynchroniseHost in executor anyway
                return;
            if (!SequencedCpuBackingWritesBlocked() && PollFence()) {
                // We can write directly to the backing as long as this resource isn't being actively used by a past workload (in the current context or another)
                std::memcpy(backing.data() + dstOffset, src->mirror.data() + srcOffset, size);
            } else {
                gpuCopyCallback();
    }
        } else {
            MarkGpuDirty();
            gpuCopyCallback();
        }
    }
    BufferView Buffer::GetView(vk::DeviceSize offset, vk::DeviceSize size) {
        return BufferView{delegate, offset, size};
@ -345,13 +568,7 @@ namespace skyline::gpu {
    BufferBinding Buffer::TryMegaBufferView(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, ContextTag executionTag,
                                            vk::DeviceSize offset, vk::DeviceSize size) {
-        if ((!everHadInlineUpdate && sequenceNumber < FrequentlySyncedThreshold) || size >= MegaBufferChunkSize)
+        if (!ValidateMegaBufferView(size))
            // Don't megabuffer buffers that have never had inline updates and are not frequently synced since performance is only going to be harmed as a result of the constant copying and there wont be any benefit since there are no GPU inline updates that would be avoided
            return {};
        // We are safe to check dirty state here since it will only ever be set GPU dirty with the buffer locked and from the active GPFIFO thread. This helps with perf since the lock ends up being slightly expensive
        if (dirtyState == DirtyState::GpuDirty)
            // Bail out if buffer cannot be synced, we don't know the contents ahead of time so the sequence is indeterminate
            return {};
        // If the active execution has changed all previous allocations are now invalid
@ -361,7 +578,7 @@ namespace skyline::gpu {
        }
        // If more than half the buffer has been megabuffered in chunks within the same execution assume this will generally be the case for this buffer and just megabuffer the whole thing without chunking
-        if (unifiedMegaBufferEnabled || (megaBufferViewAccumulatedSize > (backing.size() / 2) && backing.size() < MegaBufferChunkSize)) {
+        if (unifiedMegaBufferEnabled || (megaBufferViewAccumulatedSize > (mirror.size() / 2) && mirror.size() < MegaBufferChunkSize)) {
            if (!unifiedMegaBuffer) {
                unifiedMegaBuffer = allocator.Push(pCycle, mirror, true);
                unifiedMegaBufferEnabled = true;
@ -502,5 +719,4 @@ namespace skyline::gpu {
            throw exception("Copy size mismatch!");
        return GetBuffer()->CopyFrom(GetOffset(), src.GetBuffer(), src.GetOffset(), size, gpuCopyCallback);
    }
 }
--- a/app/src/main/cpp/skyline/gpu/buffer.h
+++ b/app/src/main/cpp/skyline/gpu/buffer.h
@ -18,7 +18,6 @@ namespace skyline::gpu {
    class BufferManager;
    class BufferDelegate;
    /**
     * @brief Represents a bound Vulkan buffer that can be used for state updates
     */
@ -45,20 +44,34 @@ namespace skyline::gpu {
        GPU &gpu;
        RecursiveSpinLock mutex; //!< Synchronizes any mutations to the buffer or its backing
        std::atomic<ContextTag> tag{}; //!< The tag associated with the last lock call
        memory::Buffer backing;
        std::optional<GuestBuffer> guest;
        std::shared_ptr<FenceCycle> cycle{}; //!< A fence cycle for when any host operation mutating the buffer has completed, it must be waited on prior to any mutations to the backing
        size_t id;
        bool isDirect{}; //!< Indicates if a buffer is directly mapped from the guest
        /**
         * @brief Interval struct used to track which part of the buffer should be accessed through the shadow
         * @note Intervals are created as {offset, offset + size}, so `end` is one past the last tracked byte
         */
        struct WriteTrackingInterval {
            size_t offset; //!< Offset in bytes of the first tracked byte
            size_t end; //!< Offset in bytes one past the last tracked byte
        };
        std::vector<WriteTrackingInterval> directTrackedWrites; //!< (Direct) A vector of write tracking intervals for the buffer, this is used to determine when to read from `directTrackedShadow`
        std::vector<u8> directTrackedShadow; //!< (Direct) Temporary mirror used to track any CPU-side writes to the buffer while it's being read by the GPU
        bool directTrackedShadowActive{}; //!< (Direct) If `directTrackedShadow` is currently being used to track writes
        span<u8> mirror{}; //!< A contiguous mirror of all the guest mappings to allow linear access on the CPU
-        span<u8> alignedMirror{}; //!< The mirror mapping aligned to page size to reflect the full mapping
+        std::optional<memory::Buffer> backing;
-        std::optional<nce::NCE::TrapHandle> trapHandle{}; //!< The handle of the traps for the guest mappings
+        std::optional<memory::ImportedBuffer> directBacking;
        std::optional<nce::NCE::TrapHandle> trapHandle{}; //!< (Staged) The handle of the traps for the guest mappings
        enum class DirtyState {
            Clean, //!< The CPU mappings are in sync with the GPU buffer
            CpuDirty, //!< The CPU mappings have been modified but the GPU buffer is not up to date
            GpuDirty, //!< The GPU buffer has been modified but the CPU mappings have not been updated
-        } dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU buffer
+        } dirtyState{DirtyState::CpuDirty}; //!< (Staged) The state of the CPU mappings with respect to the GPU buffer
        bool directGpuWritesActive{}; //!< (Direct) If the current/next GPU exection is writing to the buffer (basically GPU dirty)
        enum class BackingImmutability {
            None, //!< Backing can be freely written to and read from
@ -87,13 +100,13 @@ namespace skyline::gpu {
        size_t megaBufferViewAccumulatedSize{};
        MegaBufferAllocator::Allocation unifiedMegaBuffer{}; //!< An optional full-size mirror of the buffer in the megabuffer for use when the buffer is frequently updated and *all* of the buffer is frequently used. Replaces all uses of the table when active
-        static constexpr size_t FrequentlyLockedThreshold{2}; //!< Threshold for the number of times a buffer can be locked (not from context locks, only normal) before it should be considered frequently locked
+        static constexpr size_t FrequentlyLockedThreshold{2}; //!< (Staged) Threshold for the number of times a buffer can be locked (not from context locks, only normal) before it should be considered frequently locked
-        size_t accumulatedCpuLockCounter{}; //!< Number of times buffer has been locked through non-ContextLocks
+        size_t accumulatedCpuLockCounter{}; //!< (Staged) Number of times buffer has been locked through non-ContextLocks
-        static constexpr size_t FastReadbackHackWaitCountThreshold{6}; //!< Threshold for the number of times a buffer can be waited on before it should be considered for the readback hack
+        static constexpr size_t FastReadbackHackWaitCountThreshold{6}; //!< (Staged) Threshold for the number of times a buffer can be waited on before it should be considered for the readback hack
-        static constexpr std::chrono::nanoseconds FastReadbackHackWaitTimeThreshold{constant::NsInSecond /4}; //!< Threshold for the amount of time buffer texture can be waited on before it should be considered for the readback hack, `SkipReadbackHackWaitCountThreshold` needs to be hit before this
+        static constexpr std::chrono::nanoseconds FastReadbackHackWaitTimeThreshold{constant::NsInSecond / 4}; //!< (Staged) Threshold for the amount of time the buffer can be waited on before it should be considered for the readback hack, `FastReadbackHackWaitCountThreshold` needs to be hit before this
-        size_t accumulatedGuestWaitCounter{}; //!< Total number of times the buffer has been waited on
+        size_t accumulatedGuestWaitCounter{}; //!< (Staged) Total number of times the buffer has been waited on
-        std::chrono::nanoseconds accumulatedGuestWaitTime{}; //!< Amount of time the buffer has been waited on for since the `FastReadbackHackWaitTimeThreshold`th wait on it by the guest
+        std::chrono::nanoseconds accumulatedGuestWaitTime{}; //!< (Staged) Amount of time the buffer has been waited on for since the `FastReadbackHackWaitCountThreshold`th wait on it by the guest
        /**
         * @brief Resets all megabuffer tracking state
@ -101,15 +114,73 @@ namespace skyline::gpu {
        void ResetMegabufferState();
      private:
        /**
         * @brief Describes the interval found by a write interval query, this is the return type of QueryWriteIntervalDirect
         */
        struct QueryIntervalResult {
            bool useShadow; //!< If the shadow should be used for buffer accesses within the interval
            u64 size; //!< Size of the interval starting from the query offset
        };
        BufferDelegate *delegate;
        friend BufferView;
        friend BufferManager;
        /**
         * @brief (Staged) NOTE(review): presumably installs the traps held by `trapHandle` for the guest mappings — confirm against the definition
         * @note BufferManager calls this directly after constructing a guest-backed Buffer
         */
        void SetupStagedTraps();
        /**
-         * @brief Sets up mirror mappings for the guest mappings, this must be called after construction for the mirror to be valid
+         * @brief Forces future accesses to the given interval to use the shadow copy
         */
-        void SetupGuestMappings();
+        void InsertWriteIntervalDirect(WriteTrackingInterval interval);
        /**
         * @return A struct describing the interval containing `offset`
         */
        QueryIntervalResult QueryWriteIntervalDirect(u64 offset);
        /**
         * @brief Enables the shadow buffer used for sequencing buffer contents independently of the GPU on the CPU
         */
        void EnableTrackedShadowDirect();
        /**
         * @param offset Offset of the region to be written (NOTE(review): presumably in bytes from the start of the buffer — confirm)
         * @param size Size of the region to be written
         * @return A span for the requested region that can be used as a destination for a CPU-side buffer write
         */
        span<u8> BeginWriteCpuSequencedDirect(size_t offset, size_t size);
        /**
         * @return If GPU reads could occur using the buffer at a given moment, when true is returned the backing must not be modified by CPU writes
         */
        bool RefreshGpuReadsActiveDirect();
        /**
         * @param wait Whether to wait until GPU writes are no longer active before returning
         * @param flushHostCallback Optional callback which defaults to empty (NOTE(review): presumably invoked to flush pending host GPU work before waiting — confirm against the definition)
         * @return If GPU writes of indeterminate contents could occur using the buffer at a given moment, when true is returned the backing must not be read/written on the CPU
         */
        bool RefreshGpuWritesActiveDirect(bool wait = false, const std::function<void()> &flushHostCallback = {});
        // (Direct)/(Staged) variants of ValidateMegaBufferView (NOTE(review): presumably selected by `isDirect` — confirm in the definition)
        bool ValidateMegaBufferViewImplDirect(vk::DeviceSize size);
        bool ValidateMegaBufferViewImplStaged(vk::DeviceSize size);
        /**
         * @return True if megabuffering should occur for the given view, false otherwise
         */
        bool ValidateMegaBufferView(vk::DeviceSize size);
        // Each operation below is declared as a `*ImplDirect`/`*ImplStaged` pair, one per buffer path
        // NOTE(review): presumably the corresponding public methods dispatch to one of the pair based on `isDirect` — confirm in the definitions
        void CopyFromImplDirect(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback);
        void CopyFromImplStaged(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback);
        bool WriteImplDirect(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback = {});
        bool WriteImplStaged(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback = {});
        void ReadImplDirect(const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset);
        void ReadImplStaged(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset);
        void MarkGpuDirtyImplDirect();
        void MarkGpuDirtyImplStaged();
      public:
        void UpdateCycle(const std::shared_ptr<FenceCycle> &newCycle) {
@ -118,7 +189,7 @@ namespace skyline::gpu {
        }
        /**
         * @return The Vulkan buffer of `backing` when it is engaged, otherwise the Vulkan buffer of `directBacking`
         * @note `directBacking` is dereferenced without a check, so one of the two backings must be engaged when this is called
         */
        constexpr vk::Buffer GetBacking() {
-            return backing.vkBuffer;
+            return backing ? backing->vkBuffer : *directBacking->vkBuffer;
        }
        /**
@ -128,14 +199,14 @@ namespace skyline::gpu {
        span<u8> GetBackingSpan() {
            if (guest)
                throw exception("Attempted to get a span of a guest-backed buffer");
            // Only buffers without a guest mapping reach this point; `backing` is checked first and `directBacking` is the fallback
-            return span<u8>(backing);
+            return backing ? span<u8>(*backing) : span<u8>(*directBacking);
        }
        /**
         * @brief Creates a buffer object wrapping the guest buffer with a backing that can represent the guest buffer data
         * @param direct If the buffer should use the direct memory import path, BufferManager passes the `useDirectMemoryImport` setting here
         * @note BufferManager calls SetupStagedTraps() after construction in place of the former SetupGuestMappings() (NOTE(review): presumably the staged guest traps are not active until then — confirm against its definition)
         */
-        Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, GuestBuffer guest, size_t id);
+        Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, GuestBuffer guest, size_t id, bool direct);
        /**
         * @brief Creates a host-only Buffer which isn't backed by any guest buffer
@ -183,7 +254,10 @@ namespace skyline::gpu {
         * @note The buffer **must** be locked prior to calling this
         */
        void BlockSequencedCpuBackingWrites() {
-            std::scoped_lock lock{stateMutex};
+            std::unique_lock lock{stateMutex, std::defer_lock};
            // The lock is constructed deferred so that only non-direct buffers acquire the state mutex
            if (!isDirect)
                lock.lock();
            // Only upgrade from None so that an already stricter immutability level is never downgraded
            if (backingImmutability == BackingImmutability::None)
                backingImmutability = BackingImmutability::SequencedWrites;
        }
@ -193,12 +267,18 @@ namespace skyline::gpu {
         * @note The buffer **must** be locked prior to calling this
         */
        void BlockAllCpuBackingWrites() {
-            std::scoped_lock lock{stateMutex};
+            std::unique_lock lock{stateMutex, std::defer_lock};
            // The lock is constructed deferred so that only non-direct buffers acquire the state mutex
            if (!isDirect)
                lock.lock();
            backingImmutability = BackingImmutability::AllWrites;
        }
        /**
         * @brief Resets the backing immutability to None, undoing any prior BlockSequencedCpuBackingWrites/BlockAllCpuBackingWrites call
         */
        void AllowAllBackingWrites() {
-            std::scoped_lock lock{stateMutex};
+            std::unique_lock lock{stateMutex, std::defer_lock};
            // The lock is constructed deferred so that only non-direct buffers acquire the state mutex
            if (!isDirect)
                lock.lock();
            backingImmutability = BackingImmutability::None;
        }
@ -207,7 +287,10 @@ namespace skyline::gpu {
         * @note The buffer **must** be locked prior to calling this
         */
        bool SequencedCpuBackingWritesBlocked() {
-            std::scoped_lock lock{stateMutex};
+            std::unique_lock lock{stateMutex, std::defer_lock};
            // The lock is constructed deferred so that only non-direct buffers acquire the state mutex
            if (!isDirect)
                lock.lock();
            // AllWrites is a superset of SequencedWrites, so either level reports sequenced writes as blocked
            return backingImmutability == BackingImmutability::SequencedWrites || backingImmutability == BackingImmutability::AllWrites;
        }
@ -216,7 +299,10 @@ namespace skyline::gpu {
         * @note The buffer **must** be locked prior to calling this
         */
        bool AllCpuBackingWritesBlocked() {
-            std::scoped_lock lock{stateMutex};
+            std::unique_lock lock{stateMutex, std::defer_lock};
            // The lock is constructed deferred so that only non-direct buffers acquire the state mutex
            if (!isDirect)
                lock.lock();
            return backingImmutability == BackingImmutability::AllWrites;
        }
--- a/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer_manager.cpp
@ -1,6 +1,7 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
 #include <common/settings.h>
 #include <gpu.h>
 #include "buffer_manager.h"
@ -54,11 +55,15 @@ namespace skyline::gpu {
    BufferManager::LockedBuffer BufferManager::CoalesceBuffers(span<u8> range, const LockedBuffers &srcBuffers, ContextTag tag) {
        std::shared_ptr<FenceCycle> newBufferCycle{};
        for (auto &srcBuffer : srcBuffers) {
-            // Wait on all source buffers before we lock the recreation mutex as locking it may prevent submissions of the cycles and introduce a deadlock
+            // Since new direct buffers will share the underlying backing of source buffers we don't need to wait for the GPU if they're dirty, for non direct buffers we do though as otherwise we won't be able to migrate their contents to the new backing
-            // We can't chain cycles here as that may also introduce a deadlock since we have no way to determine what order to chain them in right now
+            if (!*gpu.state.settings->useDirectMemoryImport && (srcBuffer->dirtyState == Buffer::DirtyState::GpuDirty || srcBuffer->AllCpuBackingWritesBlocked()))
            if (srcBuffer->dirtyState == Buffer::DirtyState::GpuDirty || srcBuffer->AllCpuBackingWritesBlocked() || (newBufferCycle && srcBuffer->cycle != newBufferCycle))
                srcBuffer->WaitOnFence();
-            else if (srcBuffer->cycle)
+
            // We can't chain cycles here as that may also introduce a deadlock since we have no way to determine what order to chain them in right now
            // Wait on all source buffers before we lock the recreation mutex as locking it may prevent submissions of the cycles and introduce a deadlock
            if (newBufferCycle && srcBuffer->cycle != newBufferCycle)
                srcBuffer->WaitOnFence();
            else
                newBufferCycle = srcBuffer->cycle;
        }
@ -76,9 +81,9 @@ namespace skyline::gpu {
                highestAddress = mapping.end().base();
        }
-        LockedBuffer newBuffer{std::make_shared<Buffer>(delegateAllocatorState, gpu, span<u8>{lowestAddress, highestAddress}, nextBufferId++), tag}; // If we don't lock the buffer prior to trapping it during synchronization, a race could occur with a guest trap acquiring the lock before we do and mutating the buffer prior to it being ready
+        LockedBuffer newBuffer{std::make_shared<Buffer>(delegateAllocatorState, gpu, span<u8>{lowestAddress, highestAddress}, nextBufferId++, *gpu.state.settings->useDirectMemoryImport), tag}; // If we don't lock the buffer prior to trapping it during synchronization, a race could occur with a guest trap acquiring the lock before we do and mutating the buffer prior to it being ready
-        newBuffer->SetupGuestMappings();
+        newBuffer->SetupStagedTraps();
        newBuffer->SynchronizeHost(false); // Overlaps don't necessarily fully cover the buffer so we have to perform a sync here to prevent any gaps
        newBuffer->cycle = newBufferCycle;
@ -103,20 +108,31 @@ namespace skyline::gpu {
            newBuffer->everHadInlineUpdate |= srcBuffer->everHadInlineUpdate;
            if (!*gpu.state.settings->useDirectMemoryImport) {
                if (srcBuffer->dirtyState == Buffer::DirtyState::GpuDirty) {
                    if (srcBuffer.lock.IsFirstUsage() && newBuffer->dirtyState != Buffer::DirtyState::GpuDirty)
-                    copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->mirror.data(), srcBuffer->backing.data());
+                        copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->mirror.data(), srcBuffer->backing->data());
                    else
                        newBuffer->MarkGpuDirty();
                    // Since we don't synchost source buffers and the source buffers here are GPU dirty their mirrors will be out of date, meaning the backing contents of this source buffer's region in the new buffer from the initial synchost call will be incorrect. By copying backings directly here we can ensure that no writes are lost and that if the newly created buffer needs to turn GPU dirty during recreation no copies need to be done since the backing is as up to date as the mirror at a minimum.
-                copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->backing.data(), srcBuffer->backing.data());
+                    copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->backing->data(), srcBuffer->backing->data());
                } else if (srcBuffer->AllCpuBackingWritesBlocked()) {
                    if (srcBuffer->dirtyState == Buffer::DirtyState::CpuDirty)
                        Logger::Error("Buffer (0x{}-0x{}) is marked as CPU dirty while CPU backing writes are blocked, this is not valid", srcBuffer->guest->begin().base(), srcBuffer->guest->end().base());
                    // We need the backing to be stable so that any writes within this context are sequenced correctly, we can't use the source mirror here either since buffer writes within this context will update the mirror on CPU and backing on GPU
-                copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->backing.data(), srcBuffer->backing.data());
+                    copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->backing->data(), srcBuffer->backing->data());
                }
            } else {
                if (srcBuffer->directGpuWritesActive) {
                    newBuffer->MarkGpuDirty();
                } else if (srcBuffer->directTrackedShadowActive) {
                    newBuffer->EnableTrackedShadowDirect();
                    copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->directTrackedShadow.data(), srcBuffer->directTrackedShadow.data());
                    for (const auto &interval : srcBuffer->directTrackedWrites)
                        newBuffer->InsertWriteIntervalDirect(interval);
                }
            }
            // Transfer all views from the overlapping buffer to the new buffer with the new buffer and updated offset, ensuring pointer stability
@ -146,8 +162,8 @@ namespace skyline::gpu {
        if (overlaps.empty()) {
            // If we couldn't find any overlapping buffers, create a new buffer without coalescing
-            LockedBuffer buffer{std::make_shared<Buffer>(delegateAllocatorState, gpu, alignedGuestMapping, nextBufferId++), tag};
+            LockedBuffer buffer{std::make_shared<Buffer>(delegateAllocatorState, gpu, alignedGuestMapping, nextBufferId++, *gpu.state.settings->useDirectMemoryImport), tag};
-            buffer->SetupGuestMappings();
+            buffer->SetupStagedTraps();
            InsertBuffer(*buffer);
            return buffer->GetView(static_cast<vk::DeviceSize>(guestMapping.begin() - buffer->guest->begin()), guestMapping.size());
        } else {
--- a/app/src/main/java/emu/skyline/utils/NativeSettings.kt
+++ b/app/src/main/java/emu/skyline/utils/NativeSettings.kt
@ -28,6 +28,7 @@ class NativeSettings(context : Context, pref : PreferenceSettings) {
    var gpuDriverLibraryName : String = if (pref.gpuDriver == PreferenceSettings.SYSTEM_GPU_DRIVER) "" else GpuDriverHelper.getLibraryName(context, pref.gpuDriver)
    var executorSlotCountScale : Int = pref.executorSlotCountScale
    var executorFlushThreshold : Int = pref.executorFlushThreshold
    var useDirectMemoryImport : Boolean = pref.useDirectMemoryImport
    // Hacks
    var enableFastGpuReadbackHack : Boolean = pref.enableFastGpuReadbackHack
--- a/app/src/main/java/emu/skyline/utils/PreferenceSettings.kt
+++ b/app/src/main/java/emu/skyline/utils/PreferenceSettings.kt
@ -41,6 +41,7 @@ class PreferenceSettings @Inject constructor(@ApplicationContext private val con
    var gpuDriver by sharedPreferences(context, SYSTEM_GPU_DRIVER)
    var executorSlotCountScale by sharedPreferences(context, 6)
    var executorFlushThreshold by sharedPreferences(context, 256)
    var useDirectMemoryImport by sharedPreferences(context, false)
    var forceMaxGpuClocks by sharedPreferences(context, false)
    // Hacks
--- a/app/src/main/res/values/strings.xml
+++ b/app/src/main/res/values/strings.xml
@ -79,6 +79,8 @@
    <string name="executor_slot_count_scale_desc">Scale controlling the maximum number of simultaneous GPU executions (Higher may sometimes perform better but will use more RAM)</string>
    <string name="executor_flush_threshold">Executor Flush Threshold</string>
    <string name="executor_flush_threshold_desc">Controls how frequently work is flushed to the GPU</string>
    <string name="use_direct_memory_import">Use Direct Memory Import</string>
    <string name="use_direct_memory_import_desc">May alter performance and stability in some games\n<b>NOTE:</b> This option only works on proprietary Adreno drivers</string>
    <string name="force_max_gpu_clocks">Force Maximum GPU Clocks</string>
    <string name="force_max_gpu_clocks_desc">Forces the GPU to run at its maximum possible clock speed (May cause excessive heating and power usage)</string>
    <string name="force_max_gpu_clocks_desc_unsupported">Your device does not support forcing maximum GPU clocks</string>
--- a/app/src/main/res/xml/preferences.xml
+++ b/app/src/main/res/xml/preferences.xml
@ -146,6 +146,11 @@
            app:key="executor_flush_threshold"
            app:title="@string/executor_flush_threshold"
            app:showSeekBarValue="true" />
        <CheckBoxPreference
            android:defaultValue="false"
            android:summary="@string/use_direct_memory_import_desc"
            app:key="use_direct_memory_import"
            app:title="@string/use_direct_memory_import" />
        <CheckBoxPreference
            android:defaultValue="false"
            android:summary="@string/force_max_gpu_clocks_desc"