From b762d1df23d1a16f971e224c7b482e2e724dcf7e Mon Sep 17 00:00:00 2001
From: PixelyIon <pixelyion@protonmail.com>
Date: Wed, 6 Oct 2021 14:20:54 +0530
Subject: [PATCH] Introduce Texture Always Sync + Wait on GPU Execution + More
 RT Formats

Infrastructure for always syncing textures has now been introduced; textures are synced prior to and after every execution. Together with waiting on GPU execution to finish, this considerably reduces performance, but the loss will be partially recouped once conditional syncing is implemented.
---
 .../main/cpp/skyline/gpu/command_scheduler.h  |  39 +-
 app/src/main/cpp/skyline/gpu/fence_cycle.h    |   8 +
 .../gpu/interconnect/command_executor.cpp     |  15 +-
 .../gpu/interconnect/command_executor.h       |   2 +
 .../gpu/interconnect/graphics_context.h       |   6 +
 app/src/main/cpp/skyline/gpu/texture/copy.h   | 133 +++++++
 app/src/main/cpp/skyline/gpu/texture/format.h |   9 +
 .../main/cpp/skyline/gpu/texture/texture.cpp  | 372 +++++++++++-------
 .../main/cpp/skyline/gpu/texture/texture.h    |  58 ++-
 .../hosbinder/GraphicBufferProducer.cpp       |   2 +-
 .../skyline/soc/gm20b/engines/maxwell/types.h |   3 +
 .../skyline/soc/gm20b/engines/maxwell_3d.cpp  |   2 +-
 12 files changed, 493 insertions(+), 156 deletions(-)
 create mode 100644 app/src/main/cpp/skyline/gpu/texture/copy.h
diff --git a/app/src/main/cpp/skyline/gpu/command_scheduler.h b/app/src/main/cpp/skyline/gpu/command_scheduler.h
index de24f675..259d0be3 100644
--- a/app/src/main/cpp/skyline/gpu/command_scheduler.h
+++ b/app/src/main/cpp/skyline/gpu/command_scheduler.h
@@ -85,27 +85,38 @@ namespace skyline::gpu {
         template<typename RecordFunction>
         std::shared_ptr<FenceCycle> Submit(RecordFunction recordFunction) {
             auto commandBuffer{AllocateCommandBuffer()};
-            commandBuffer->begin(vk::CommandBufferBeginInfo{
-                .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
-            });
-            recordFunction(*commandBuffer);
-            commandBuffer->end();
-            SubmitCommandBuffer(*commandBuffer, commandBuffer.GetFence());
-            return commandBuffer.GetFenceCycle();
+            try {
+                commandBuffer->begin(vk::CommandBufferBeginInfo{
+                    .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
+                });
+                recordFunction(*commandBuffer);
+                commandBuffer->end();
+                SubmitCommandBuffer(*commandBuffer, commandBuffer.GetFence());
+                return commandBuffer.GetFenceCycle();
+            } catch (...) { // Recording or submission threw
+                commandBuffer.GetFenceCycle()->Cancel(); // Force-signal the cycle before propagating the error
+                throw; // Rethrow the in-flight exception as-is
+            }
         }
 
         /**
          * @note Same as Submit but with FenceCycle as an argument rather than return value
          */
         template<typename RecordFunction>
-        void SubmitWithCycle(RecordFunction recordFunction) {
+        std::shared_ptr<FenceCycle> SubmitWithCycle(RecordFunction recordFunction) {
             auto commandBuffer{AllocateCommandBuffer()};
-            commandBuffer->begin(vk::CommandBufferBeginInfo{
-                .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
-            });
-            recordFunction(*commandBuffer, commandBuffer.GetFenceCycle());
-            commandBuffer->end();
-            SubmitCommandBuffer(*commandBuffer, commandBuffer.GetFence());
+            try {
+                commandBuffer->begin(vk::CommandBufferBeginInfo{
+                    .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
+                });
+                recordFunction(*commandBuffer, commandBuffer.GetFenceCycle()); // The record function receives the cycle in addition to the command buffer
+                commandBuffer->end();
+                SubmitCommandBuffer(*commandBuffer, commandBuffer.GetFence());
+                return commandBuffer.GetFenceCycle(); // The same cycle is also returned so that the caller can wait on it
+            } catch (...) { // Recording or submission threw
+                commandBuffer.GetFenceCycle()->Cancel(); // Force-signal the cycle before propagating the error
+                throw; // Rethrow the in-flight exception as-is
+            }
         }
     };
 }
diff --git a/app/src/main/cpp/skyline/gpu/fence_cycle.h b/app/src/main/cpp/skyline/gpu/fence_cycle.h
index adc40fed..8d2372ba 100644
--- a/app/src/main/cpp/skyline/gpu/fence_cycle.h
+++ b/app/src/main/cpp/skyline/gpu/fence_cycle.h
@@ -53,6 +53,14 @@ namespace skyline::gpu {
             Wait();
         }
 
+        /**
+         * @brief Marks this cycle as signalled regardless of whether the underlying fence has been signalled or not
+         */
+        void Cancel() {
+            if (!signalled.test_and_set(std::memory_order_release)) // Only the caller that flips the flag from unset destroys the dependencies
+                DestroyDependencies();
+        }
+
         /**
          * @brief Wait on a fence cycle till it has been signalled
          */
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index 9b2d2a19..c46c8c09 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -20,6 +20,10 @@ namespace skyline::gpu::interconnect {
     }
 
     void CommandExecutor::AddSubpass(const std::function<void(vk::raii::CommandBuffer &, const std::shared_ptr<FenceCycle> &, GPU &)> &function, vk::Rect2D renderArea, std::vector<TextureView> inputAttachments, std::vector<TextureView> colorAttachments, std::optional<TextureView> depthStencilAttachment) {
+        for (const auto *attachments : {&inputAttachments, &colorAttachments}) // Iterate by pointer, a braced list of the vectors themselves would copy both of them
+            for (const auto &attachment : *attachments) // NOTE(review): depthStencilAttachment is not registered here — confirm that is intended
+                syncTextures.emplace(attachment.backing.get());
+
         bool newRenderpass{CreateRenderpass(renderArea)};
         renderpass->AddSubpass(inputAttachments, colorAttachments, depthStencilAttachment ? &*depthStencilAttachment : nullptr);
         if (newRenderpass)
@@ -56,12 +60,17 @@ namespace skyline::gpu::interconnect {
 
     void CommandExecutor::Execute() {
         if (!nodes.empty()) {
+            TRACE_EVENT("gpu", "CommandExecutor::Execute");
+
             if (renderpass) {
                 nodes.emplace_back(std::in_place_type_t<node::RenderpassEndNode>());
                 renderpass = nullptr;
             }
 
             gpu.scheduler.SubmitWithCycle([this](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle) {
+                for (auto texture : syncTextures)
+                    texture->SynchronizeHostWithBuffer(commandBuffer, cycle);
+
                 using namespace node;
                 for (NodeVariant &node : nodes) {
                     std::visit(VariantVisitor{
@@ -71,9 +80,13 @@ namespace skyline::gpu::interconnect {
                         [&](RenderpassEndNode &node) { node(commandBuffer, cycle, gpu); },
                     }, node);
                 }
-            });
+
+                for (auto texture : syncTextures)
+                    texture->SynchronizeGuestWithBuffer(commandBuffer, cycle);
+            })->Wait();
 
             nodes.clear();
+            syncTextures.clear();
         }
     }
 }
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index ef2df3ce..7884e5f6 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <boost/container/stable_vector.hpp>
+#include <unordered_set>
 #include "command_nodes.h"
 
 namespace skyline::gpu::interconnect {
@@ -16,6 +17,7 @@ namespace skyline::gpu::interconnect {
         GPU &gpu;
         boost::container::stable_vector<node::NodeVariant> nodes;
         node::RenderpassNode *renderpass{};
+        std::unordered_set<Texture*> syncTextures; //!< All textures that need to be synced prior to and after execution
 
         /**
          * @return If a new renderpass was created by the function or the current one was reused as it was compatible
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
index 8dc53803..9795a5fd 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
@@ -122,6 +122,12 @@ namespace skyline::gpu::interconnect {
                         return format::R16Float;
                     case maxwell3d::RenderTarget::ColorFormat::R8Unorm:
                         return format::R8Unorm;
+                    case maxwell3d::RenderTarget::ColorFormat::R8Snorm:
+                        return format::R8Snorm;
+                    case maxwell3d::RenderTarget::ColorFormat::R8Sint:
+                        return format::R8Sint;
+                    case maxwell3d::RenderTarget::ColorFormat::R8Uint:
+                        return format::R8Uint;
                     default:
                         throw exception("Cannot translate the supplied RT format: 0x{:X}", static_cast<u32>(format));
                 }
diff --git a/app/src/main/cpp/skyline/gpu/texture/copy.h b/app/src/main/cpp/skyline/gpu/texture/copy.h
new file mode 100644
index 00000000..0fd4cebe
--- /dev/null
+++ b/app/src/main/cpp/skyline/gpu/texture/copy.h
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#pragma once
+
+#include "texture.h"
+
+namespace skyline::gpu {
+    /**
+     * @brief Copies the contents of a blocklinear guest texture to a linear output buffer
+     * @note Rows in the linear buffer are written with a stride of `robWidthBytes` (the row size aligned up to the GOB width)
+     */
+    inline void CopyBlockLinearToLinear(GuestTexture& guest, u8* guestInput, u8* linearOutput) {
+        // Reference on Block-linear tiling: https://gist.github.com/PixelyIon/d9c35050af0ef5690566ca9f0965bc32
+        constexpr u8 SectorWidth{16}; // The width of a sector in bytes
+        constexpr u8 SectorHeight{2}; // The height of a sector in lines
+        constexpr u8 GobWidth{64}; // The width of a GOB in bytes
+        constexpr u8 GobHeight{8}; // The height of a GOB in lines
+
+        auto blockHeight{guest.tileConfig.blockHeight}; //!< The height of the blocks in GOBs
+        auto robHeight{GobHeight * blockHeight}; //!< The height of a single ROB (Row of Blocks) in lines
+        auto surfaceHeight{guest.dimensions.height / guest.format->blockHeight}; //!< The height of the surface in lines
+        auto surfaceHeightRobs{util::AlignUp(surfaceHeight, robHeight) / robHeight}; //!< The height of the surface in ROBs (Row Of Blocks)
+        auto robWidthBytes{util::AlignUp((guest.dimensions.width / guest.format->blockWidth) * guest.format->bpb, GobWidth)}; //!< The width of a ROB in bytes
+        auto robWidthBlocks{robWidthBytes / GobWidth}; //!< The width of a ROB in blocks (and GOBs because block width == 1 on the Tegra X1)
+        auto robBytes{robWidthBytes * robHeight}; //!< The size of a ROB in bytes
+        auto gobYOffset{robWidthBytes * GobHeight}; //!< The offset of the next Y-axis GOB from the current one in linear space
+
+        auto inputSector{guestInput};
+        auto outputRob{linearOutput};
+
+        for (u32 rob{}, y{}, paddingY{}; rob < surfaceHeightRobs; rob++) { // Every Surface contains `surfaceHeightRobs` ROBs
+            auto outputBlock{outputRob}; // We iterate through a block independently of the ROB
+            for (u32 block{}; block < robWidthBlocks; block++) { // Every ROB contains `robWidthBlocks` Blocks
+                auto outputGob{outputBlock}; // We iterate through a GOB independently of the block
+                for (u32 gobY{}; gobY < blockHeight; gobY++) { // Every Block contains `blockHeight` Y-axis GOBs
+                    for (u32 index{}; index < SectorWidth * SectorHeight; index++) { // Every Y-axis GOB contains `SectorWidth * SectorHeight` sectors
+                        u32 xT{((index << 3) & 0b10000) | ((index << 1) & 0b100000)}; // Morton-Swizzle on the X-axis
+                        u32 yT{((index >> 1) & 0b110) | (index & 0b1)}; // Morton-Swizzle on the Y-axis
+                        std::memcpy(outputGob + (yT * robWidthBytes) + xT, inputSector, SectorWidth);
+                        inputSector += SectorWidth; // `SectorWidth` bytes are of sequential image data
+                    }
+                    outputGob += gobYOffset; // Increment the output GOB to the next Y-axis GOB
+                }
+                inputSector += paddingY; // Skip the padding GOBs at the end of the block in the guest data
+                outputBlock += GobWidth; // Increment the output block to the next block (As Block Width = 1 GOB Width)
+            }
+            outputRob += robBytes; // Increment the output ROB to the next ROB
+            y += robHeight; // Increment the Y position to the next ROB
+            blockHeight = static_cast<u8>(std::min(static_cast<u32>(blockHeight), (surfaceHeight - y) / GobHeight)); // Calculate the amount of Y GOBs which aren't padding
+            paddingY = (guest.tileConfig.blockHeight - blockHeight) * (SectorWidth * SectorWidth * SectorHeight); // Calculate the amount of padding between contiguous sectors
+        }
+    }
+    /**
+     * @brief Copies the contents of a linear input buffer to a blocklinear guest texture
+     * @note Rows in the linear buffer are read with a stride of `robWidthBytes` (the row size aligned up to the GOB width)
+     */
+    inline void CopyLinearToBlockLinear(GuestTexture& guest, u8* linearInput, u8* guestOutput) {
+        // Reference on Block-linear tiling: https://gist.github.com/PixelyIon/d9c35050af0ef5690566ca9f0965bc32
+        constexpr u8 SectorWidth{16}; // The width of a sector in bytes
+        constexpr u8 SectorHeight{2}; // The height of a sector in lines
+        constexpr u8 GobWidth{64}; // The width of a GOB in bytes
+        constexpr u8 GobHeight{8}; // The height of a GOB in lines
+
+        auto blockHeight{guest.tileConfig.blockHeight}; //!< The height of the blocks in GOBs
+        auto robHeight{GobHeight * blockHeight}; //!< The height of a single ROB (Row of Blocks) in lines
+        auto surfaceHeight{guest.dimensions.height / guest.format->blockHeight}; //!< The height of the surface in lines
+        auto surfaceHeightRobs{util::AlignUp(surfaceHeight, robHeight) / robHeight}; //!< The height of the surface in ROBs (Row Of Blocks)
+        auto robWidthBytes{util::AlignUp((guest.dimensions.width / guest.format->blockWidth) * guest.format->bpb, GobWidth)}; //!< The width of a ROB in bytes
+        auto robWidthBlocks{robWidthBytes / GobWidth}; //!< The width of a ROB in blocks (and GOBs because block width == 1 on the Tegra X1)
+        auto robBytes{robWidthBytes * robHeight}; //!< The size of a ROB in bytes
+        auto gobYOffset{robWidthBytes * GobHeight}; //!< The offset of the next Y-axis GOB from the current one in linear space
+
+        auto outputSector{guestOutput};
+        auto inputRob{linearInput};
+
+        for (u32 rob{}, y{}, paddingY{}; rob < surfaceHeightRobs; rob++) { // Every Surface contains `surfaceHeightRobs` ROBs
+            auto inputBlock{inputRob}; // We iterate through a block independently of the ROB
+            for (u32 block{}; block < robWidthBlocks; block++) { // Every ROB contains `robWidthBlocks` Blocks
+                auto inputGob{inputBlock}; // We iterate through a GOB independently of the block
+                for (u32 gobY{}; gobY < blockHeight; gobY++) { // Every Block contains `blockHeight` Y-axis GOBs
+                    for (u32 index{}; index < SectorWidth * SectorHeight; index++) { // Every Y-axis GOB contains `SectorWidth * SectorHeight` sectors
+                        u32 xT{((index << 3) & 0b10000) | ((index << 1) & 0b100000)}; // Morton-Swizzle on the X-axis
+                        u32 yT{((index >> 1) & 0b110) | (index & 0b1)}; // Morton-Swizzle on the Y-axis
+                        std::memcpy(outputSector, inputGob + (yT * robWidthBytes) + xT, SectorWidth);
+                        outputSector += SectorWidth; // `SectorWidth` bytes are of sequential image data
+                    }
+                    inputGob += gobYOffset; // Increment the input GOB to the next Y-axis GOB
+                }
+                outputSector += paddingY; // Skip the padding GOBs at the end of the block in the guest data
+                inputBlock += GobWidth; // Increment the input block to the next block (As Block Width = 1 GOB Width)
+            }
+            inputRob += robBytes; // Increment the input ROB to the next ROB
+            y += robHeight; // Increment the Y position to the next ROB
+            blockHeight = static_cast<u8>(std::min(static_cast<u32>(blockHeight), (surfaceHeight - y) / GobHeight)); // Calculate the amount of Y GOBs which aren't padding
+            paddingY = (guest.tileConfig.blockHeight - blockHeight) * (SectorWidth * SectorWidth * SectorHeight); // Calculate the amount of padding between contiguous sectors
+        }
+    }
+
+    /**
+     * @brief Copies the contents of a pitch-linear guest texture to a linear output buffer
+     */
+    inline void CopyPitchLinearToLinear(GuestTexture& guest, u8* guestInput, u8* linearOutput) {
+        auto sizeLine{guest.format->GetSize(guest.dimensions.width, 1)}; //!< The size of a single line of pixel data
+        auto sizeStride{guest.format->GetSize(guest.tileConfig.pitch, 1)}; //!< The size of a single stride of pixel data (NOTE(review): the pitch is passed through GetSize like a width — confirm it isn't already a byte count)
+
+        auto inputLine{guestInput};
+        auto outputLine{linearOutput};
+
+        for (u32 line{}; line < guest.dimensions.height; line++) {
+            std::memcpy(outputLine, inputLine, sizeLine); // Only `sizeLine` bytes are copied per line, the rest of the stride is skipped
+            inputLine += sizeStride; // The guest advances by a full stride
+            outputLine += sizeLine; // The linear buffer is tightly packed
+        }
+    }
+
+    /**
+     * @brief Copies the contents of a linear buffer to a pitch-linear guest texture
+     */
+    inline void CopyLinearToPitchLinear(GuestTexture& guest, u8* linearInput, u8* guestOutput) {
+        auto sizeLine{guest.format->GetSize(guest.dimensions.width, 1)}; //!< The size of a single line of pixel data
+        auto sizeStride{guest.format->GetSize(guest.tileConfig.pitch, 1)}; //!< The size of a single stride of pixel data (NOTE(review): the pitch is passed through GetSize like a width — confirm it isn't already a byte count)
+
+        auto inputLine{linearInput};
+        auto outputLine{guestOutput};
+
+        for (u32 line{}; line < guest.dimensions.height; line++) {
+            std::memcpy(outputLine, inputLine, sizeLine); // Only `sizeLine` bytes are written per line, the rest of the stride is left untouched
+            inputLine += sizeLine; // The linear buffer is tightly packed
+            outputLine += sizeStride; // The guest advances by a full stride
+        }
+    }
+}
diff --git a/app/src/main/cpp/skyline/gpu/texture/format.h b/app/src/main/cpp/skyline/gpu/texture/format.h
index 0049a2d2..14c21ac6 100644
--- a/app/src/main/cpp/skyline/gpu/texture/format.h
+++ b/app/src/main/cpp/skyline/gpu/texture/format.h
@@ -25,6 +25,9 @@ namespace skyline::gpu::format {
     constexpr Format R16Unorm{sizeof(u16), vkf::eR16Unorm};
     constexpr Format R16Float{sizeof(u16), vkf::eR16Sfloat};
     constexpr Format R8Unorm{sizeof(u8), vkf::eR8Unorm};
+    constexpr Format R8Snorm{sizeof(u8), vkf::eR8Snorm};
+    constexpr Format R8Sint{sizeof(u8), vkf::eR8Sint};
+    constexpr Format R8Uint{sizeof(u8), vkf::eR8Uint};
     constexpr Format R32B32G32A32Float{sizeof(u32) * 4, vkf::eR32G32B32A32Sfloat, .swizzle = {
         .blue = swc::Green,
         .green = swc::Blue,
@@ -66,6 +69,12 @@ namespace skyline::gpu::format {
                 return R8G8Snorm;
             case vk::Format::eR8Unorm:
                 return R8Unorm;
+            case vk::Format::eR8Snorm:
+                return R8Snorm;
+            case vk::Format::eR8Sint:
+                return R8Sint;
+            case vk::Format::eR8Uint:
+                return R8Uint;
             case vk::Format::eR16G16B16A16Unorm:
                 return R16G16B16A16Unorm;
             case vk::Format::eR16G16B16A16Uint:
diff --git a/app/src/main/cpp/skyline/gpu/texture/texture.cpp b/app/src/main/cpp/skyline/gpu/texture/texture.cpp
index fcfae680..a2374853 100644
--- a/app/src/main/cpp/skyline/gpu/texture/texture.cpp
+++ b/app/src/main/cpp/skyline/gpu/texture/texture.cpp
@@ -5,8 +5,167 @@
 #include <common/trace.h>
 #include <kernel/types/KProcess.h>
 #include "texture.h"
+#include "copy.h"
 
 namespace skyline::gpu {
+    // Converts the guest texture data into linear data which is written into either a newly allocated staging buffer or directly into the CPU-mapped image
+    // Returns the staging buffer when one was used (it still has to be copied into the image) or nullptr when the image was written to directly
+    std::shared_ptr<memory::StagingBuffer> Texture::SynchronizeHostImpl(const std::shared_ptr<FenceCycle> &pCycle) {
+        if (!guest)
+            throw exception("Synchronization of host textures requires a valid guest texture to synchronize from");
+        else if (guest->mappings.size() != 1) // This rejects zero as well as multiple mappings, no separate check for more than one mapping is needed
+            throw exception("Synchronization of non-contigious textures is not supported");
+        else if (guest->dimensions != dimensions)
+            throw exception("Guest and host dimensions being different is not supported currently");
+
+        auto pointer{guest->mappings[0].data()};
+        auto size{format->GetSize(dimensions)};
+
+        WaitOnBacking();
+
+        u8 *bufferData{}; // Set by the lambda below on every path that doesn't throw
+        auto stagingBuffer{[&]() -> std::shared_ptr<memory::StagingBuffer> {
+            if (tiling == vk::ImageTiling::eOptimal || !std::holds_alternative<memory::Image>(backing)) {
+                // We need a staging buffer for all optimal copies (since we aren't aware of the host optimal layout) and linear textures which we cannot map on the CPU since we do not have access to their backing VkDeviceMemory
+                auto stagingBuffer{gpu.memory.AllocateStagingBuffer(size)};
+                bufferData = stagingBuffer->data();
+                return stagingBuffer;
+            } else if (tiling == vk::ImageTiling::eLinear) {
+                // We can optimize linear texture sync on a UMA by mapping the texture onto the CPU and copying directly into it rather than a staging buffer
+                bufferData = std::get<memory::Image>(backing).data();
+                if (cycle.lock() != pCycle) // The image is written to directly below, so a different cycle attached to the texture is waited on first
+                    WaitOnFence();
+                return nullptr;
+            } else {
+                throw exception("Guest -> Host synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
+            }
+        }()};
+
+        if (guest->tileConfig.mode == texture::TileMode::Block)
+            CopyBlockLinearToLinear(*guest, pointer, bufferData);
+        else if (guest->tileConfig.mode == texture::TileMode::Pitch)
+            CopyPitchLinearToLinear(*guest, pointer, bufferData);
+        else if (guest->tileConfig.mode == texture::TileMode::Linear)
+            std::memcpy(bufferData, pointer, size);
+
+        if (stagingBuffer && cycle.lock() != pCycle) // With a staging buffer the wait is deferred until after the data has been converted
+            WaitOnFence();
+
+        return stagingBuffer;
+    }
+
+    void Texture::CopyFromStagingBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<memory::StagingBuffer> &stagingBuffer) { // Issues a staging buffer -> image copy on the supplied command buffer
+        auto image{GetBacking()};
+        if (layout != vk::ImageLayout::eTransferDstOptimal) { // The copy below is issued with the image in TransferDstOptimal, so transition into it first
+            commandBuffer.pipelineBarrier(layout != vk::ImageLayout::eUndefined ? vk::PipelineStageFlagBits::eTopOfPipe : vk::PipelineStageFlagBits::eBottomOfPipe, vk::PipelineStageFlagBits::eTransfer, {}, {}, {}, vk::ImageMemoryBarrier{
+                .image = image,
+                .srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
+                .dstAccessMask = vk::AccessFlagBits::eTransferWrite,
+                .oldLayout = layout,
+                .newLayout = vk::ImageLayout::eTransferDstOptimal,
+                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .subresourceRange = {
+                    .aspectMask = format->vkAspect,
+                    .levelCount = mipLevels,
+                    .layerCount = layerCount,
+                },
+            });
+
+            if (layout == vk::ImageLayout::eUndefined) // An undefined layout isn't restored, the texture adopts the transfer layout instead (which also skips the barrier after the copy)
+                layout = vk::ImageLayout::eTransferDstOptimal;
+        }
+
+        commandBuffer.copyBufferToImage(stagingBuffer->vkBuffer, image, vk::ImageLayout::eTransferDstOptimal, vk::BufferImageCopy{
+            .imageExtent = dimensions,
+            .imageSubresource = {
+                .aspectMask = format->vkAspect,
+                .layerCount = layerCount,
+            },
+        });
+
+        if (layout != vk::ImageLayout::eTransferDstOptimal) // Transition back to the layout that the texture is tracked as being in
+            commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eTransfer, {}, {}, {}, vk::ImageMemoryBarrier{
+                .image = image,
+                .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
+                .dstAccessMask = vk::AccessFlagBits::eMemoryRead,
+                .oldLayout = vk::ImageLayout::eTransferDstOptimal,
+                .newLayout = layout,
+                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .subresourceRange = {
+                    .aspectMask = format->vkAspect,
+                    .levelCount = mipLevels,
+                    .layerCount = layerCount,
+                },
+            });
+    }
+
+    void Texture::CopyIntoStagingBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<memory::StagingBuffer> &stagingBuffer) { // Issues an image -> staging buffer copy on the supplied command buffer
+        auto image{GetBacking()};
+        if (layout != vk::ImageLayout::eTransferSrcOptimal) { // The copy below is issued with the image in TransferSrcOptimal, so transition into it first
+            commandBuffer.pipelineBarrier(layout != vk::ImageLayout::eUndefined ? vk::PipelineStageFlagBits::eTopOfPipe : vk::PipelineStageFlagBits::eBottomOfPipe, vk::PipelineStageFlagBits::eTransfer, {}, {}, {}, vk::ImageMemoryBarrier{
+                .image = image,
+                .srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
+                .dstAccessMask = vk::AccessFlagBits::eTransferRead,
+                .oldLayout = layout,
+                .newLayout = vk::ImageLayout::eTransferSrcOptimal,
+                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .subresourceRange = {
+                    .aspectMask = format->vkAspect,
+                    .levelCount = mipLevels,
+                    .layerCount = layerCount,
+                },
+            });
+
+            if (layout == vk::ImageLayout::eUndefined) // An undefined layout isn't restored, the texture adopts the transfer layout instead (which also skips the barrier after the copy)
+                layout = vk::ImageLayout::eTransferSrcOptimal;
+        }
+
+        commandBuffer.copyImageToBuffer(image, vk::ImageLayout::eTransferSrcOptimal, stagingBuffer->vkBuffer, vk::BufferImageCopy{
+            .imageExtent = dimensions,
+            .imageSubresource = {
+                .aspectMask = format->vkAspect,
+                .layerCount = layerCount,
+            },
+        });
+
+        if (layout != vk::ImageLayout::eTransferSrcOptimal) // Transition back to the layout that the texture is tracked as being in
+            commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eTransfer, {}, {}, {}, vk::ImageMemoryBarrier{
+                .image = image,
+                .srcAccessMask = vk::AccessFlagBits::eTransferRead,
+                .dstAccessMask = vk::AccessFlagBits::eMemoryWrite,
+                .oldLayout = vk::ImageLayout::eTransferSrcOptimal,
+                .newLayout = layout,
+                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .subresourceRange = {
+                    .aspectMask = format->vkAspect,
+                    .levelCount = mipLevels,
+                    .layerCount = layerCount,
+                },
+            });
+    }
+
+    // Writes the linear texture data in `hostBuffer` back into the first guest mapping, converting it to the guest's tiling mode
+    void Texture::CopyToGuest(u8 *hostBuffer) {
+        auto guestOutput{guest->mappings[0].data()};
+        auto size{format->GetSize(dimensions)};
+        if (guest->tileConfig.mode == texture::TileMode::Block)
+            CopyLinearToBlockLinear(*guest, hostBuffer, guestOutput);
+        else if (guest->tileConfig.mode == texture::TileMode::Pitch)
+            CopyLinearToPitchLinear(*guest, hostBuffer, guestOutput);
+        else if (guest->tileConfig.mode == texture::TileMode::Linear)
+            std::memcpy(guestOutput, hostBuffer, size); // Host -> Guest like the branches above: the guest mapping is the destination
+    }
+
+    Texture::TextureBufferCopy::TextureBufferCopy(std::shared_ptr<Texture> texture, std::shared_ptr<memory::StagingBuffer> stagingBuffer) : texture(std::move(texture)), stagingBuffer(std::move(stagingBuffer)) {} // Holds both the texture and the (optional) staging buffer until destruction
+
+    Texture::TextureBufferCopy::~TextureBufferCopy() { // The copy back into the guest is performed when this object is destroyed
+        texture->CopyToGuest(stagingBuffer ? stagingBuffer->data() : std::get<memory::Image>(texture->backing).data()); // Without a staging buffer the data is read directly from the image backing
+    }
+
     Texture::Texture(GPU &gpu, BackingType &&backing, GuestTexture guest, texture::Dimensions dimensions, texture::Format format, vk::ImageLayout layout, vk::ImageTiling tiling, u32 mipLevels, u32 layerCount, vk::SampleCountFlagBits sampleCount)
         : gpu(gpu),
           backing(std::move(backing)),
@@ -90,6 +249,8 @@ namespace skyline::gpu {
     }
 
     bool Texture::WaitOnBacking() {
+        TRACE_EVENT("gpu", "Texture::WaitOnBacking");
+
         if (GetBacking()) [[likely]] {
             return false;
         } else {
@@ -101,6 +262,8 @@ namespace skyline::gpu {
     }
 
     void Texture::WaitOnFence() {
+        TRACE_EVENT("gpu", "Texture::WaitOnFence");
+
         auto lCycle{cycle.lock()};
         if (lCycle) {
             lCycle->Wait();
@@ -121,6 +284,8 @@ namespace skyline::gpu {
         WaitOnBacking();
         WaitOnFence();
 
+        TRACE_EVENT("gpu", "Texture::TransitionLayout");
+
         if (layout != pLayout) {
             cycle = gpu.scheduler.Submit([&](vk::raii::CommandBuffer &commandBuffer) {
                 commandBuffer.pipelineBarrier(layout != vk::ImageLayout::eUndefined ? vk::PipelineStageFlagBits::eTopOfPipe : vk::PipelineStageFlagBits::eBottomOfPipe, vk::PipelineStageFlagBits::eBottomOfPipe, {}, {}, {}, vk::ImageMemoryBarrier{
@@ -132,9 +297,9 @@ namespace skyline::gpu {
                     .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                     .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                     .subresourceRange = {
-                        .aspectMask = vk::ImageAspectFlagBits::eColor,
-                        .levelCount = 1,
-                        .layerCount = 1,
+                        .aspectMask = format->vkAspect,
+                        .levelCount = mipLevels,
+                        .layerCount = layerCount,
                     },
                 });
             });
@@ -143,158 +308,91 @@ namespace skyline::gpu {
     }
 
     void Texture::SynchronizeHost() {
-        if (!guest)
-            throw exception("Synchronization of host textures requires a valid guest texture to synchronize from");
-        else if (guest->mappings.size() != 1)
-            throw exception("Synchronization of non-contigious textures is not supported");
-        else if (guest->dimensions != dimensions)
-            throw exception("Guest and host dimensions being different is not supported currently");
-
         TRACE_EVENT("gpu", "Texture::SynchronizeHost");
-        auto pointer{guest->mappings[0].data()};
-        auto size{format->GetSize(dimensions)};
-
-        u8 *bufferData;
-        auto stagingBuffer{[&]() -> std::shared_ptr<memory::StagingBuffer> {
-            if (tiling == vk::ImageTiling::eOptimal || !std::holds_alternative<memory::Image>(backing)) {
-                // We need a staging buffer for all optimal copies (since we aren't aware of the host optimal layout) and linear textures which we cannot map on the CPU since we do not have access to their backing VkDeviceMemory
-                auto stagingBuffer{gpu.memory.AllocateStagingBuffer(size)};
-                bufferData = stagingBuffer->data();
-                return stagingBuffer;
-            } else if (tiling == vk::ImageTiling::eLinear) {
-                // We can optimize linear texture sync on a UMA by mapping the texture onto the CPU and copying directly into it rather than a staging buffer
-                bufferData = std::get<memory::Image>(backing).data();
-                WaitOnFence(); // We need to wait on fence here since we are mutating the texture directly after, the wait can be deferred till the copy when a staging buffer is used
-                return nullptr;
-            } else {
-                throw exception("Guest -> Host synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
-            }
-        }()};
-
-        if (guest->tileConfig.mode == texture::TileMode::Block) {
-            // Reference on Block-linear tiling: https://gist.github.com/PixelyIon/d9c35050af0ef5690566ca9f0965bc32
-            constexpr u8 SectorWidth{16}; // The width of a sector in bytes
-            constexpr u8 SectorHeight{2}; // The height of a sector in lines
-            constexpr u8 GobWidth{64}; // The width of a GOB in bytes
-            constexpr u8 GobHeight{8}; // The height of a GOB in lines
-
-            auto blockHeight{guest->tileConfig.blockHeight}; //!< The height of the blocks in GOBs
-            auto robHeight{GobHeight * blockHeight}; //!< The height of a single ROB (Row of Blocks) in lines
-            auto surfaceHeight{guest->dimensions.height / guest->format->blockHeight}; //!< The height of the surface in lines
-            auto surfaceHeightRobs{util::AlignUp(surfaceHeight, robHeight) / robHeight}; //!< The height of the surface in ROBs (Row Of Blocks)
-            auto robWidthBytes{util::AlignUp((guest->dimensions.width / guest->format->blockWidth) * guest->format->bpb, GobWidth)}; //!< The width of a ROB in bytes
-            auto robWidthBlocks{robWidthBytes / GobWidth}; //!< The width of a ROB in blocks (and GOBs because block width == 1 on the Tegra X1)
-            auto robBytes{robWidthBytes * robHeight}; //!< The size of a ROB in bytes
-            auto gobYOffset{robWidthBytes * GobHeight}; //!< The offset of the next Y-axis GOB from the current one in linear space
-
-            auto inputSector{pointer}; //!< The address of the input sector
-            auto outputRob{bufferData}; //!< The address of the output block
-
-            for (u32 rob{}, y{}, paddingY{}; rob < surfaceHeightRobs; rob++) { // Every Surface contains `surfaceHeightRobs` ROBs
-                auto outputBlock{outputRob}; // We iterate through a block independently of the ROB
-                for (u32 block{}; block < robWidthBlocks; block++) { // Every ROB contains `surfaceWidthBlocks` Blocks
-                    auto outputGob{outputBlock}; // We iterate through a GOB independently of the block
-                    for (u32 gobY{}; gobY < blockHeight; gobY++) { // Every Block contains `blockHeight` Y-axis GOBs
-                        for (u32 index{}; index < SectorWidth * SectorHeight; index++) { // Every Y-axis GOB contains `sectorWidth * sectorHeight` sectors
-                            u32 xT{((index << 3) & 0b10000) | ((index << 1) & 0b100000)}; // Morton-Swizzle on the X-axis
-                            u32 yT{((index >> 1) & 0b110) | (index & 0b1)}; // Morton-Swizzle on the Y-axis
-                            std::memcpy(outputGob + (yT * robWidthBytes) + xT, inputSector, SectorWidth);
-                            inputSector += SectorWidth; // `sectorWidth` bytes are of sequential image data
-                        }
-                        outputGob += gobYOffset; // Increment the output GOB to the next Y-axis GOB
-                    }
-                    inputSector += paddingY; // Increment the input sector to the next sector
-                    outputBlock += GobWidth; // Increment the output block to the next block (As Block Width = 1 GOB Width)
-                }
-                outputRob += robBytes; // Increment the output block to the next ROB
-
-                y += robHeight; // Increment the Y position to the next ROB
-                blockHeight = static_cast<u8>(std::min(static_cast<u32>(blockHeight), (surfaceHeight - y) / GobHeight)); // Calculate the amount of Y GOBs which aren't padding
-                paddingY = (guest->tileConfig.blockHeight - blockHeight) * (SectorWidth * SectorWidth * SectorHeight); // Calculate the amount of padding between contiguous sectors
-            }
-        } else if (guest->tileConfig.mode == texture::TileMode::Pitch) {
-            auto sizeLine{guest->format->GetSize(guest->dimensions.width, 1)}; //!< The size of a single line of pixel data
-            auto sizeStride{guest->format->GetSize(guest->tileConfig.pitch, 1)}; //!< The size of a single stride of pixel data
-
-            auto inputLine{pointer}; //!< The address of the input line
-            auto outputLine{bufferData}; //!< The address of the output line
-
-            for (u32 line{}; line < guest->dimensions.height; line++) {
-                std::memcpy(outputLine, inputLine, sizeLine);
-                inputLine += sizeStride;
-                outputLine += sizeLine;
-            }
-        } else if (guest->tileConfig.mode == texture::TileMode::Linear) {
-            std::memcpy(bufferData, pointer, size);
-        }
 
+        auto stagingBuffer{SynchronizeHostImpl(nullptr)};
         if (stagingBuffer) {
-            if (WaitOnBacking() && size != format->GetSize(dimensions))
-                throw exception("Backing properties changing during sync is not supported");
-            WaitOnFence();
-
             auto lCycle{gpu.scheduler.Submit([&](vk::raii::CommandBuffer &commandBuffer) {
-                auto image{GetBacking()};
-                if (layout != vk::ImageLayout::eTransferDstOptimal) {
-                    commandBuffer.pipelineBarrier(layout != vk::ImageLayout::eUndefined ? vk::PipelineStageFlagBits::eTopOfPipe : vk::PipelineStageFlagBits::eBottomOfPipe, vk::PipelineStageFlagBits::eTransfer, {}, {}, {}, vk::ImageMemoryBarrier{
-                        .image = image,
-                        .srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
-                        .dstAccessMask = vk::AccessFlagBits::eTransferWrite,
-                        .oldLayout = layout,
-                        .newLayout = vk::ImageLayout::eTransferDstOptimal,
-                        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                        .subresourceRange = {
-                            .aspectMask = vk::ImageAspectFlagBits::eColor,
-                            .levelCount = 1,
-                            .layerCount = 1,
-                        },
-                    });
-
-                    if (layout == vk::ImageLayout::eUndefined)
-                        layout = vk::ImageLayout::eTransferDstOptimal;
-                }
-
-                commandBuffer.copyBufferToImage(stagingBuffer->vkBuffer, image, vk::ImageLayout::eTransferDstOptimal, vk::BufferImageCopy{
-                    .imageExtent = dimensions,
-                    .imageSubresource = {
-                        .aspectMask = vk::ImageAspectFlagBits::eColor,
-                        .layerCount = 1,
-                    },
-                });
-
-                if (layout != vk::ImageLayout::eTransferDstOptimal)
-                    commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eTransfer, {}, {}, {}, vk::ImageMemoryBarrier{
-                        .image = image,
-                        .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
-                        .dstAccessMask = vk::AccessFlagBits::eMemoryRead,
-                        .oldLayout = vk::ImageLayout::eTransferDstOptimal,
-                        .newLayout = layout,
-                        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-                        .subresourceRange = {
-                            .aspectMask = vk::ImageAspectFlagBits::eColor,
-                            .levelCount = 1,
-                            .layerCount = 1,
-                        },
-                    });
+                CopyFromStagingBuffer(commandBuffer, stagingBuffer);
             })};
             lCycle->AttachObjects(stagingBuffer, shared_from_this());
             cycle = lCycle;
         }
     }
 
+    void Texture::SynchronizeHostWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &pCycle) { // Guest -> host sync that records into the supplied command buffer rather than submitting one itself
+        TRACE_EVENT("gpu", "Texture::SynchronizeHostWithBuffer");
+
+        auto stagingBuffer{SynchronizeHostImpl(pCycle)}; // Non-null only when the sync has to go through a staging buffer
+        if (stagingBuffer) {
+            CopyFromStagingBuffer(commandBuffer, stagingBuffer); // Only records the staging buffer -> backing copy into commandBuffer
+            pCycle->AttachObjects(stagingBuffer, shared_from_this()); // NOTE(review): pCycle is dereferenced unchecked; presumably attaching keeps both objects alive until the cycle completes — confirm in FenceCycle
+            cycle = pCycle; // The supplied cycle is now the one tracking the last host mutation of this texture
+        }
+    }
+
     void Texture::SynchronizeGuest() {
         if (!guest)
             throw exception("Synchronization of guest textures requires a valid guest texture to synchronize to");
         else if (guest->mappings.size() != 1)
             throw exception("Synchronization of non-contigious textures is not supported");
+        else if (layout == vk::ImageLayout::eUndefined)
+            return; // If the state of the host texture is undefined then so can the guest
+        else if (guest->mappings.size() > 1) // NOTE(review): never true here, the `!= 1` branch above already throws for every mapping count other than 1
+            throw exception("Synchronizing textures across {} mappings is not supported", guest->mappings.size());
+
+        TRACE_EVENT("gpu", "Texture::SynchronizeGuest");
 
         WaitOnBacking();
         WaitOnFence();
 
-        TRACE_EVENT("gpu", "Texture::SynchronizeGuest");
-        // TODO: Write Host -> Guest Synchronization
+        if (tiling == vk::ImageTiling::eOptimal || !std::holds_alternative<memory::Image>(backing)) { // Optimal tiling or a backing that isn't a memory::Image: read back through a staging buffer
+            auto size{format->GetSize(dimensions)};
+            auto stagingBuffer{gpu.memory.AllocateStagingBuffer(size)};
+
+            auto lCycle{gpu.scheduler.Submit([&](vk::raii::CommandBuffer &commandBuffer) {
+                CopyIntoStagingBuffer(commandBuffer, stagingBuffer);
+            })};
+            lCycle->AttachObject(std::make_shared<TextureBufferCopy>(shared_from_this(), stagingBuffer)); // TextureBufferCopy is the dependency that copies the staging buffer into the guest texture; presumably this happens once the cycle completes — confirm
+            cycle = lCycle; // Later waits on this texture now target the readback submission
+        } else if (tiling == vk::ImageTiling::eLinear) {
+            // We can optimize linear texture sync on a UMA by mapping the texture onto the CPU and copying directly from it rather than using a staging buffer
+            CopyToGuest(std::get<memory::Image>(backing).data());
+        } else {
+            throw exception("Host -> Guest synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
+        }
+    }
+
+    void Texture::SynchronizeGuestWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &pCycle) { // Host -> guest sync that records into the supplied command buffer rather than submitting one itself
+        if (!guest)
+            throw exception("Synchronization of guest textures requires a valid guest texture to synchronize to");
+        else if (guest->mappings.size() != 1)
+            throw exception("Synchronization of non-contigious textures is not supported");
+        else if (layout == vk::ImageLayout::eUndefined)
+            return; // If the state of the host texture is undefined then so can the guest
+        else if (guest->mappings.size() > 1) // NOTE(review): never true here, the `!= 1` branch above already throws for every mapping count other than 1
+            throw exception("Synchronizing textures across {} mappings is not supported", guest->mappings.size());
+
+        TRACE_EVENT("gpu", "Texture::SynchronizeGuestWithBuffer");
+
+        WaitOnBacking();
+        if (cycle.lock() != pCycle) // The fence wait is skipped when the texture's current cycle is the very cycle being recorded into
+            WaitOnFence();
+
+        if (tiling == vk::ImageTiling::eOptimal || !std::holds_alternative<memory::Image>(backing)) { // Optimal tiling or a backing that isn't a memory::Image: read back through a staging buffer
+            auto size{format->GetSize(dimensions)};
+            auto stagingBuffer{gpu.memory.AllocateStagingBuffer(size)};
+
+            CopyIntoStagingBuffer(commandBuffer, stagingBuffer); // Only records the backing -> staging buffer copy into commandBuffer
+            pCycle->AttachObject(std::make_shared<TextureBufferCopy>(shared_from_this(), stagingBuffer)); // TextureBufferCopy is the dependency that copies the staging buffer into the guest texture; presumably this happens once the cycle completes — confirm
+            cycle = pCycle;
+        } else if (tiling == vk::ImageTiling::eLinear) {
+            CopyToGuest(std::get<memory::Image>(backing).data()); // NOTE(review): this copy runs right away, before anything recorded into commandBuffer has executed — confirm the immediate copy is intended
+            pCycle->AttachObject(std::make_shared<TextureBufferCopy>(shared_from_this())); // No staging buffer is passed, so the dependency is left to use the mapped image backing
+            cycle = pCycle;
+        } else {
+            throw exception("Host -> Guest synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
+        }
     }
 
     void Texture::CopyFrom(std::shared_ptr<Texture> source, const vk::ImageSubresourceRange &subresource) {
@@ -311,6 +409,8 @@ namespace skyline::gpu {
         else if (source->format != format)
             throw exception("Cannot copy from image with different format");
 
+        TRACE_EVENT("gpu", "Texture::CopyFrom");
+
         auto lCycle{gpu.scheduler.Submit([&](vk::raii::CommandBuffer &commandBuffer) {
             auto sourceBacking{source->GetBacking()};
             if (source->layout != vk::ImageLayout::eTransferSrcOptimal) {
diff --git a/app/src/main/cpp/skyline/gpu/texture/texture.h b/app/src/main/cpp/skyline/gpu/texture/texture.h
index 0f8dbc23..97b59ccd 100644
--- a/app/src/main/cpp/skyline/gpu/texture/texture.h
+++ b/app/src/main/cpp/skyline/gpu/texture/texture.h
@@ -248,7 +248,7 @@ namespace skyline::gpu {
      * @brief A descriptor for a texture present in guest memory, it can be used to create a corresponding Texture object for usage on the host
      */
     struct GuestTexture {
-        using Mappings = boost::container::small_vector<span < u8>, 3>;
+        using Mappings = boost::container::small_vector<span<u8>, 3>;
 
         Mappings mappings; //!< Spans to CPU memory for the underlying data backing this texture
         texture::Dimensions dimensions;
@@ -263,7 +263,7 @@ namespace skyline::gpu {
 
         GuestTexture(Mappings mappings, texture::Dimensions dimensions, texture::Format format, texture::TileConfig tileConfig, texture::TextureType type, u16 baseArrayLayer = 0, u16 layerCount = 1, u32 layerStride = 0) : mappings(mappings), dimensions(dimensions), format(format), tileConfig(tileConfig), type(type), baseArrayLayer(baseArrayLayer), layerCount(layerCount), layerStride(layerStride) {}
 
-        GuestTexture(span <u8> mapping, texture::Dimensions dimensions, texture::Format format, texture::TileConfig tileConfig, texture::TextureType type, u16 baseArrayLayer = 0, u16 layerCount = 1, u32 layerStride = 0) : mappings(1, mapping), dimensions(dimensions), format(format), tileConfig(tileConfig), type(type), baseArrayLayer(baseArrayLayer), layerCount(layerCount), layerStride(layerStride) {}
+        GuestTexture(span<u8> mapping, texture::Dimensions dimensions, texture::Format format, texture::TileConfig tileConfig, texture::TextureType type, u16 baseArrayLayer = 0, u16 layerCount = 1, u32 layerStride = 0) : mappings(1, mapping), dimensions(dimensions), format(format), tileConfig(tileConfig), type(type), baseArrayLayer(baseArrayLayer), layerCount(layerCount), layerStride(layerStride) {}
     };
 
     class TextureManager;
@@ -314,6 +314,40 @@ namespace skyline::gpu {
         friend TextureManager;
         friend TextureView;
 
+        /**
+         * @brief An implementation function for guest -> host texture synchronization, it allocates and copies data into a staging buffer or directly into a linear host texture
+         * @return If a staging buffer was required for the texture sync, it's returned filled with guest texture data and must be copied to the host texture by the caller
+         */
+        std::shared_ptr<memory::StagingBuffer> SynchronizeHostImpl(const std::shared_ptr<FenceCycle> &pCycle);
+
+        /**
+         * @brief Records commands for copying data from a staging buffer to the texture's backing into the supplied command buffer
+         */
+        void CopyFromStagingBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<memory::StagingBuffer>& stagingBuffer);
+
+        /**
+         * @brief Records commands for copying data from the texture's backing to a staging buffer into the supplied command buffer
+         */
+        void CopyIntoStagingBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<memory::StagingBuffer>& stagingBuffer);
+
+        /**
+         * @brief Copies data from the supplied host buffer into the guest texture
+         * @note The host buffer must contain the entire image
+         */
+        void CopyToGuest(u8* hostBuffer);
+
+        /**
+         * @brief A FenceCycleDependency that copies the contents of a staging buffer or mapped image backing the texture to the guest texture
+         */
+        struct TextureBufferCopy : public FenceCycleDependency {
+            std::shared_ptr<Texture> texture;
+            std::shared_ptr<memory::StagingBuffer> stagingBuffer;
+
+            TextureBufferCopy(std::shared_ptr<Texture> texture, std::shared_ptr<memory::StagingBuffer> stagingBuffer = {});
+
+            ~TextureBufferCopy();
+        };
+
       public:
         std::weak_ptr<FenceCycle> cycle; //!< A fence cycle for when any host operation mutating the texture has completed, it must be waited on prior to any mutations to the backing
         std::optional<GuestTexture> guest;
@@ -404,11 +438,21 @@ namespace skyline::gpu {
 
         /**
          * @brief Synchronizes the host texture with the guest after it has been modified
+         * @note If a staging buffer is required, the copy is recorded into a command buffer that is allocated and submitted by this function
+         * @note Use SynchronizeHostWithBuffer to record the copy into an existing command buffer instead
          * @note The texture **must** be locked prior to calling this
-         * @note The guest texture should not be null prior to calling this
+         * @note The guest texture backing should exist prior to calling this
          */
         void SynchronizeHost();
 
+        /**
+         * @brief Same as SynchronizeHost but this records any commands into the supplied command buffer rather than creating one as necessary
+         * @note It is more efficient to call SynchronizeHost than allocating a command buffer purely for this function as it may conditionally not record any commands
+         * @note The texture **must** be locked prior to calling this
+         * @note The guest texture backing should exist prior to calling this
+         */
+        void SynchronizeHostWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle);
+
         /**
          * @brief Synchronizes the guest texture with the host texture after it has been modified
          * @note The texture **must** be locked prior to calling this
@@ -416,6 +460,14 @@ namespace skyline::gpu {
          */
         void SynchronizeGuest();
 
+        /**
+         * @brief Same as SynchronizeGuest but this records any commands into the supplied command buffer rather than creating one as necessary
+         * @note It is more efficient to call SynchronizeGuest than allocating a command buffer purely for this function as it may conditionally not record any commands
+         * @note The texture **must** be locked prior to calling this
+         * @note The guest texture should not be null prior to calling this
+         */
+        void SynchronizeGuestWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle);
+
         /**
          * @brief Copies the contents of the supplied source texture into the current texture
          */
diff --git a/app/src/main/cpp/skyline/services/hosbinder/GraphicBufferProducer.cpp b/app/src/main/cpp/skyline/services/hosbinder/GraphicBufferProducer.cpp
index eb2052b6..4b0ff4b4 100644
--- a/app/src/main/cpp/skyline/services/hosbinder/GraphicBufferProducer.cpp
+++ b/app/src/main/cpp/skyline/services/hosbinder/GraphicBufferProducer.cpp
@@ -386,7 +386,7 @@ namespace skyline::service::hosbinder {
         {
             auto &texture{buffer.texture};
             std::scoped_lock textureLock(*texture);
-            // texture->SynchronizeHost();
+            texture->SynchronizeHost();
             u64 frameId;
             state.gpu->presentation.Present(texture, isAutoTimestamp ? 0 : timestamp, swapInterval, crop, scalingMode, transform, frameId);
         }
diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/types.h b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/types.h
index 87937abd..bb3f37f7 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/types.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/types.h
@@ -59,6 +59,9 @@ namespace skyline::soc::gm20b::engine::maxwell3d::type {
             R16Unorm = 0xEE,
             R16Float = 0xF2,
             R8Unorm = 0xF3,
+            R8Snorm = 0xF4,
+            R8Sint = 0xF5,
+            R8Uint = 0xF6,
         } format;
 
         struct TileMode {
diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
index dfbd03d1..c869e86e 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
@@ -244,8 +244,8 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
 
             MAXWELL3D_CASE(syncpointAction, {
                 state.logger->Debug("Increment syncpoint: {}", static_cast<u16>(syncpointAction.id));
-                state.soc->host1x.syncpoints.at(syncpointAction.id).Increment();
                 state.soc->gm20b.executor.Execute();
+                state.soc->host1x.syncpoints.at(syncpointAction.id).Increment();
             })
 
             MAXWELL3D_CASE(clearBuffers, {