NEEDS CLEANUP: Reimplement GPU VMM and rewrite nvdrv VM impl

This commit is contained in:
Billy Laws 2021-08-14 20:42:11 +01:00 committed by Mark
parent 020aa0e43a
commit d03b288db6
23 changed files with 794 additions and 444 deletions

View File

@ -100,7 +100,7 @@ add_library(skyline SHARED
${source_DIR}/skyline/gpu/command_scheduler.cpp
${source_DIR}/skyline/gpu/texture/texture.cpp
${source_DIR}/skyline/gpu/presentation_engine.cpp
${source_DIR}/skyline/soc/gmmu.cpp
${source_DIR}/skyline/soc/gm20b.cpp
${source_DIR}/skyline/soc/host1x/syncpoint.cpp
${source_DIR}/skyline/soc/gm20b/gpfifo.cpp
${source_DIR}/skyline/soc/gm20b/engines/maxwell_3d.cpp

View File

@ -0,0 +1,155 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <concepts>
#include <common.h>
namespace skyline {
template<typename VaType, size_t AddressSpaceBits>
concept AddressSpaceValid = std::is_unsigned_v<VaType> && sizeof(VaType) * 8 >= AddressSpaceBits;
/**
* @brief FlatAddressSpaceMap provides a generic VA->PA mapping implementation using a sorted vector
*/
template<typename VaType, VaType UnmappedVa, typename PaType, PaType UnmappedPa, bool PaContigSplit, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits>
class FlatAddressSpaceMap {
private:
/**
* @brief Represents a block of memory in the AS
*/
struct Block {
VaType virt{UnmappedVa}; //!< VA of the block
PaType phys{UnmappedPa}; //!< PA of the block; mapped addresses increase 1:1 with VA until the next block is encountered
bool flag{}; //!< General purpose flag for use by derived classes
Block() = default;
Block(VaType virt, PaType phys, bool flag) : virt(virt), phys(phys), flag(flag) {}
constexpr bool Valid() {
return virt != UnmappedVa;
}
constexpr bool Mapped() {
return phys != UnmappedPa;
}
constexpr bool Unmapped() {
return phys == UnmappedPa;
}
bool operator<(const VaType &pVirt) const {
return virt < pVirt;
}
};
protected:
std::mutex blockMutex;
std::vector<Block> blocks{Block{}};
/**
* @brief Maps a PA range into the given AS region, optionally setting the flag
* @note blockMutex MUST be locked when calling this
*/
void MapLocked(VaType virt, PaType phys, VaType size, bool flag = {});
/**
* @brief Unmaps the given range and merges it with other unmapped regions
* @note blockMutex MUST be locked when calling this
*/
void UnmapLocked(VaType virt, VaType size);
public:
static constexpr VaType VaMaximum{(1ULL << (AddressSpaceBits - 1)) + ((1ULL << (AddressSpaceBits - 1)) - 1)}; //!< The maximum VA that this AS can technically reach
VaType vaLimit{VaMaximum}; //!< A soft limit on the maximum VA of the AS
FlatAddressSpaceMap(VaType pVaLimit);
FlatAddressSpaceMap() = default;
/**
* @brief Version of MapLocked that acquires blockMutex itself
*/
void Map(VaType virt, PaType phys, VaType size, bool flag = {});
/**
* @brief Version of UnmapLocked that acquires blockMutex itself
*/
void Unmap(VaType virt, VaType size);
};
/**
* @brief FlatMemoryManager specialises FlatAddressSpaceMap to focus on pointers as PAs, adding read/write functions
*/
template<typename VaType, VaType UnmappedVa, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits>
class FlatMemoryManager : public FlatAddressSpaceMap<VaType, UnmappedVa, u8 *, nullptr, true, AddressSpaceBits> {
public:
/**
* @return A placeholder address for sparse-mapped regions; the address itself is meaningless and must never be dereferenced
*/
static u8 *SparsePlaceholderAddress() {
return reinterpret_cast<u8 *>(0xCAFEBABE);
}
void Read(u8 *destination, VaType virt, VaType size);
template<typename T>
void Read(span<T> destination, VaType virt) {
Read(reinterpret_cast<u8 *>(destination.data()), virt, destination.size_bytes());
}
template<typename T>
T Read(VaType virt) {
T obj;
Read(reinterpret_cast<u8 *>(&obj), virt, sizeof(T));
return obj;
}
void Write(VaType virt, u8 *source, VaType size);
template<typename T>
void Write(VaType virt, span<T> source) {
Write(virt, reinterpret_cast<u8 *>(source.data()), source.size_bytes());
}
template<typename T>
void Write(VaType virt, T source) {
Write(virt, reinterpret_cast<u8 *>(&source), sizeof(T));
}
};
/**
* @brief FlatAllocator specialises FlatAddressSpaceMap to work as an allocator, with an initial, fast linear pass and a subsequent slower pass that iterates until it finds a free block
*/
template<typename VaType, VaType UnmappedVa, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits>
class FlatAllocator : public FlatAddressSpaceMap<VaType, UnmappedVa, bool, false, false, AddressSpaceBits> {
private:
using Base = FlatAddressSpaceMap<VaType, UnmappedVa, bool, false, false, AddressSpaceBits>;
VaType currentLinearAllocEnd; //!< The end address for the initial linear allocation pass; once this reaches the AS limit the slower allocation path will be used
public:
VaType vaStart; //!< The base VA of the allocator; no allocations will be made below this
FlatAllocator(VaType vaStart, VaType vaLimit);
/**
* @brief Allocates a region in the AS of the given size and returns its address
*/
VaType Allocate(VaType size);
/**
* @brief Marks the given region in the AS as allocated
*/
void AllocateFixed(VaType virt, VaType size);
/**
* @brief Frees an AS region so it can be used again
*/
void Free(VaType virt, VaType size);
};
}
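The containers above are only instantiated explicitly (in as_gpu.cpp and gm20b.cpp); as a minimal usage sketch of their public API, assuming the u32/32-bit allocator and u64/40-bit memory-manager instantiations from this commit and purely illustrative addresses:
// Illustrative only: exercises FlatAllocator and FlatMemoryManager as instantiated in this commit
using ExampleAllocator = skyline::FlatAllocator<u32, 0, 32>; // nvdrv-style page allocator
using ExampleGmmu = skyline::FlatMemoryManager<u64, 0, 40>; // 40-bit GPU address space
void AddressSpaceExample(ExampleGmmu &gmmu, u8 *cpuPtr) {
ExampleAllocator allocator{0x400, 0x10000}; // vaStart/vaLimit in pages (hypothetical values)
u32 pageIndex{allocator.Allocate(0x10)}; // Reserve 0x10 pages in the allocator
u64 gpuVa{static_cast<u64>(pageIndex) << 12}; // Convert the page index to a byte address (4 KiB pages)
gmmu.Map(gpuVa, cpuPtr, 0x10000); // Back the range with host memory
u32 word{gmmu.Read<u32>(gpuVa)}; // Reads/writes walk the sorted block vector
gmmu.Write<u32>(gpuVa, word + 1);
gmmu.Unmap(gpuVa, 0x10000);
allocator.Free(pageIndex, 0x10); // Release the pages for reuse
}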

View File

@ -0,0 +1,354 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <common/trace.h>
#include <kernel/types/KProcess.h>
#include "address_space.h"
#define MAP_MEMBER(returnType) template<typename VaType, VaType UnmappedVa, typename PaType, PaType UnmappedPa, bool PaContigSplit, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits> returnType FlatAddressSpaceMap<VaType, UnmappedVa, PaType, UnmappedPa, PaContigSplit, AddressSpaceBits>
#define MM_MEMBER(returnType) template<typename VaType, VaType UnmappedVa, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits> returnType FlatMemoryManager<VaType, UnmappedVa, AddressSpaceBits>
#define ALLOC_MEMBER(returnType) template<typename VaType, VaType UnmappedVa, size_t AddressSpaceBits> requires AddressSpaceValid<VaType, AddressSpaceBits> returnType FlatAllocator<VaType, UnmappedVa, AddressSpaceBits>
namespace skyline {
MAP_MEMBER()::FlatAddressSpaceMap(VaType pVaLimit) : vaLimit(pVaLimit) {
if (pVaLimit > VaMaximum)
throw exception("Invalid VA limit!");
}
MAP_MEMBER(void)::MapLocked(VaType virt, PaType phys, VaType size, bool flag) {
TRACE_EVENT("containers", "FlatAddressSpaceMap::Map");
VaType virtEnd{virt + size};
if (virtEnd > vaLimit)
throw exception("Trying to map a block past the VA limit!");
auto blockEndSuccessor{std::lower_bound(blocks.begin(), blocks.end(), virtEnd)};
if (blockEndSuccessor == blocks.begin())
throw exception("Unexpected Memory Manager state!");
auto blockEndPredecessor{std::prev(blockEndSuccessor)};
if (blockEndSuccessor != blocks.end()) {
// We have blocks in front of us; if one is directly in front then we don't have to add a tail
if (blockEndSuccessor->virt != virtEnd) {
PaType tailPhys{[&]() -> PaType {
if (!PaContigSplit || blockEndPredecessor->Unmapped())
return blockEndPredecessor->phys; // Always propagate unmapped regions
else
return blockEndPredecessor->phys + virtEnd - blockEndPredecessor->virt;
}()};
if (blockEndPredecessor->virt >= virt) {
// If this block's start would be overlapped by the map then reuse it as a tail block
blockEndPredecessor->virt = virtEnd;
blockEndPredecessor->phys = tailPhys;
blockEndPredecessor->flag = blockEndPredecessor->flag;
} else {
// Else insert a new one and we're done
blocks.insert(blockEndSuccessor, {Block(virt, phys, flag), Block(virtEnd, tailPhys, blockEndPredecessor->flag)});
return;
}
}
} else {
// blockEndPredecessor will always be unmapped as blocks has to be terminated by an unmapped chunk
if (blockEndPredecessor != blocks.begin() && blockEndPredecessor->virt >= virt) {
// Move the unmapped block start backwards
blockEndPredecessor->virt = virtEnd;
} else {
// Else insert a new one and we're done
blocks.insert(blockEndSuccessor, {Block(virt, phys, flag), Block(virtEnd, UnmappedPa, false)});
return;
}
}
auto blockStartSuccessor{blockEndPredecessor};
// Walk the block vector to find the start successor as this is more efficient than another binary search in most scenarios
while (std::prev(blockStartSuccessor)->virt >= virt)
std::advance(blockStartSuccessor, -1);
if (blockStartSuccessor->virt > virtEnd)
throw exception("Unexpected Memory Manager state!");
if (blockStartSuccessor->virt == virtEnd) {
// We need to create a new block as there are none spare that we would overwrite
blocks.insert(blockStartSuccessor, Block(virt, phys, flag));
return;
} else {
blockStartSuccessor->virt = virt;
blockStartSuccessor->phys = phys;
blockStartSuccessor->flag = flag;
// Erase overwritten blocks
if (auto eraseStart{std::next(blockStartSuccessor)}; blockStartSuccessor != blockEndPredecessor) {
if (eraseStart == blockEndPredecessor)
__builtin_trap();
blocks.erase(eraseStart, blockEndPredecessor);
}
}
}
MAP_MEMBER(void)::UnmapLocked(VaType virt, VaType size) {
TRACE_EVENT("containers", "FlatAddressSpaceMap::Unmap");
VaType virtEnd{virt + size};
if (virtEnd > vaLimit)
throw exception("Trying to map a block past the VA limit!");
auto blockEndSuccessor{std::lower_bound(blocks.begin(), blocks.end(), virtEnd)};
if (blockEndSuccessor == blocks.begin())
throw exception("Unexpected Memory Manager state!");
auto blockEndPredecessor{std::prev(blockEndSuccessor)};
auto walkBackToPredecessor{[&](auto iter) {
while (iter->virt >= virt)
std::advance(iter, -1);
return iter;
}};
auto eraseBlocksWithEndUnmapped{[&] (auto unmappedEnd) {
auto blockStartPredecessor{walkBackToPredecessor(unmappedEnd)};
auto blockStartSuccessor{std::next(blockStartPredecessor)};
auto eraseEnd{[&]() {
if (blockStartPredecessor->Unmapped()) {
// If the start predecessor is unmapped then we can erase everything in our region and be done
return std::next(unmappedEnd);
} else {
// Else reuse the end predecessor as the start of our unmapped region then erase all up to it
unmappedEnd->virt = virt;
return unmappedEnd;
}
}()};
// We can't have two adjacent unmapped regions
if (eraseEnd == blockStartSuccessor || (blockStartPredecessor->Unmapped() && eraseEnd->Unmapped()))
throw exception("Unexpected Memory Manager state!");
blocks.erase(blockStartSuccessor, eraseEnd);
}};
// We can avoid any splitting logic in these cases
if (blockEndPredecessor->Unmapped()) {
if (blockEndPredecessor->virt > virt)
eraseBlocksWithEndUnmapped(blockEndPredecessor);
return; // The region is unmapped, bail out early
} else if (blockEndSuccessor->virt == virtEnd && blockEndSuccessor->Unmapped()) {
eraseBlocksWithEndUnmapped(blockEndSuccessor);
return; // The region is unmapped here and doesn't need splitting, bail out early
} else if (blockEndSuccessor == blocks.end()) {
// This should never happen as the end should always follow an unmapped block
throw exception("Unexpected Memory Manager state!");
} else if (blockEndSuccessor->virt != virtEnd) {
// If one block is directly in front then we don't have to add a tail
// The previous block is mapped so we will need to add a tail with an offset
PaType tailPhys{[&]() {
if constexpr (PaContigSplit)
return blockEndPredecessor->phys + virtEnd - blockEndPredecessor->virt;
else
return blockEndPredecessor->phys;
}()};
if (blockEndPredecessor->virt >= virt) {
// If this block's start would be overlapped by the unmap then reuse it as a tail block
blockEndPredecessor->virt = virtEnd;
blockEndPredecessor->phys = tailPhys;
} else {
blocks.insert(blockEndSuccessor, {Block(virt, UnmappedPa, false), Block(virtEnd, tailPhys, blockEndPredecessor->flag)});
return; // The previous block is mapped and ends before our unmap region, so nothing else needs to be split
}
}
// Walk the block vector to find the start predecessor as this is more efficient than another binary search in most scenarios
auto blockStartPredecessor{walkBackToPredecessor(blockEndPredecessor)};
auto blockStartSuccessor{std::next(blockStartPredecessor)};
if (blockStartSuccessor->virt > virtEnd)
throw exception("Unexpected Memory Manager state!");
if (blockStartSuccessor->virt == virtEnd) {
// There are no blocks between the start and the end that would let us skip inserting a new one for head
// The previous block may be unmapped; if so we don't need to insert any unmapped blocks after it
if (blockStartPredecessor->Mapped())
blocks.insert(blockStartSuccessor, Block(virt, UnmappedPa, false));
} else if (blockStartPredecessor->Unmapped()) {
// If the previous block is unmapped
blocks.erase(blockStartSuccessor, blockEndPredecessor);
} else {
// Add in the unmapped block header
blockStartSuccessor->virt = virt;
blockStartSuccessor->phys = UnmappedPa;
// Erase overwritten blocks, skipping the first one as we have written the unmapped start block there
if (auto eraseStart{std::next(blockStartSuccessor)}; blockStartSuccessor != blockEndPredecessor) {
if (eraseStart == blockEndPredecessor)
__builtin_trap();
blocks.erase(eraseStart, blockEndPredecessor);
}
}
}
MAP_MEMBER(void)::Map(VaType virt, PaType phys, VaType size, bool flag) {
std::scoped_lock lock(blockMutex);
MapLocked(virt, phys, size, flag);
}
MAP_MEMBER(void)::Unmap(VaType virt, VaType size) {
std::scoped_lock lock(blockMutex);
UnmapLocked(virt, size);
}
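To illustrate the invariants Map/Unmap maintain on the sorted block vector (hypothetical values; for the instantiations in this commit UnmappedVa is 0 and UnmappedPa is nullptr/false):
// Hypothetical trace of the block vector:
// initial state: { virt = 0x0, phys = unmapped } // a single block covering the whole AS
// after Map(0x2000, ptr, 0x1000):
// { virt = 0x0, phys = unmapped }
// { virt = 0x2000, phys = ptr } // the new mapping
// { virt = 0x3000, phys = unmapped } // tail block re-terminating the AS
// after Unmap(0x2000, 0x1000) the three blocks collapse back into the single
// unmapped block, since two adjacent unmapped blocks are never kept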
MM_MEMBER(void)::Read(u8 *destination, VaType virt, VaType size) {
std::scoped_lock lock(this->blockMutex);
TRACE_EVENT("containers", "FlatMemoryManager::Read");
VaType virtEnd{virt + size};
auto successor{std::upper_bound(this->blocks.begin(), this->blocks.end(), virt, [] (auto virt, const auto &block) {
return virt < block.virt;
})};
auto predecessor{std::prev(successor)};
u8 *blockPhys{predecessor->phys + (virt - predecessor->virt)};
VaType blockReadSize{std::min(successor->virt - virt, size)};
while (size) {
if (predecessor->phys == nullptr) {
if (predecessor->flag) // Sparse mapping
std::memset(destination, 0, blockReadSize);
else
throw exception("Page fault at: 0x{:X}", predecessor->virt);
} else {
std::memcpy(destination, blockPhys, blockReadSize);
}
destination += blockReadSize;
size -= blockReadSize;
if (size) {
predecessor = successor++;
blockPhys = predecessor->phys;
blockReadSize = std::min(successor->virt - predecessor->virt, size);
}
}
}
MM_MEMBER(void)::Write(VaType virt, u8 *source, VaType size) {
std::scoped_lock lock(this->blockMutex);
TRACE_EVENT("containers", "FlatMemoryManager::Write");
VaType virtEnd{virt + size};
auto successor{std::upper_bound(this->blocks.begin(), this->blocks.end(), virt, [] (auto virt, const auto &block) {
return virt < block.virt;
})};
auto predecessor{std::prev(successor)};
u8 *blockPhys{predecessor->phys + (virt - predecessor->virt)};
VaType blockWriteSize{std::min(successor->virt - virt, size)};
while (size) {
if (predecessor->phys == nullptr) {
if (!predecessor->flag) // Sparse mappings allow unmapped writes
throw exception("Page fault at: 0x{:X}", predecessor->virt);
} else {
std::memcpy(blockPhys, source, blockWriteSize);
}
source += blockWriteSize;
size -= blockWriteSize;
if (size) {
predecessor = successor++;
blockPhys = predecessor->phys;
blockWriteSize = std::min(successor->virt - predecessor->virt, size);
}
}
}
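Summarising the behaviour of the Read/Write paths above for each kind of block:
// mapped (phys != nullptr) -> memcpy to/from the backing host memory
// sparse (phys == nullptr, flag set) -> reads return zeroes, writes are silently discarded
// unmapped (phys == nullptr, flag clear) -> a "Page fault" exception is thrown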
ALLOC_MEMBER()::FlatAllocator(VaType vaStart, VaType vaLimit) : Base(vaLimit), vaStart(vaStart), currentLinearAllocEnd(vaStart) {}
ALLOC_MEMBER(VaType)::Allocate(VaType size) {
std::scoped_lock lock(this->blockMutex);
TRACE_EVENT("containers", "FlatAllocator::Allocate");
VaType allocStart{UnmappedVa};
VaType allocEnd{currentLinearAllocEnd + size};
if (allocEnd >= currentLinearAllocEnd && allocEnd <= this->vaLimit) {
auto allocEndSuccessor{std::lower_bound(this->blocks.begin(), this->blocks.end(), allocEnd)};
if (allocEndSuccessor == this->blocks.begin())
throw exception("Unexpected allocator state!");
auto allocEndPredecessor{std::prev(allocEndSuccessor)};
if (allocEndPredecessor->virt <= currentLinearAllocEnd) {
allocStart = currentLinearAllocEnd;
} else {
while (allocEndSuccessor != this->blocks.end()) {
if (allocEndSuccessor->virt - allocEndPredecessor->virt < size || allocEndPredecessor->Mapped() ) {
allocStart = allocEndPredecessor->virt;
break;
}
allocEndPredecessor = allocEndSuccessor++;
if (allocEndSuccessor == this->blocks.end()) {
allocEnd = allocEndPredecessor->virt + size;
if (allocEnd >= allocEndPredecessor->virt && allocEnd <= this->vaLimit)
allocStart = allocEndPredecessor->virt;
}
}
}
}
if (allocStart != UnmappedVa) {
currentLinearAllocEnd = allocStart + size;
} else { // If linear allocation overflows the AS then find a gap
if (this->blocks.size() <= 2)
throw exception("Unexpected allocator state!");
auto searchPredecessor{this->blocks.begin()};
auto searchSuccessor{std::next(searchPredecessor)};
while (searchSuccessor != this->blocks.end() &&
(searchSuccessor->virt - searchPredecessor->virt < size || searchPredecessor->Mapped())) {
searchPredecessor = searchSuccessor++;
}
if (searchSuccessor != this->blocks.end())
allocStart = searchPredecessor->virt;
else
throw exception("Unexpected allocator state!");
}
this->MapLocked(allocStart, true, size);
return allocStart;
}
ALLOC_MEMBER(void)::AllocateFixed(VaType virt, VaType size) {
std::scoped_lock lock(this->blockMutex);
this->MapLocked(virt, true, size);
}
ALLOC_MEMBER(void)::Free(VaType virt, VaType size) {
std::scoped_lock lock(this->blockMutex);
this->UnmapLocked(virt, size);
}
}

View File

@ -3,6 +3,7 @@
#pragma once
#include <common/trace.h>
#include <common.h>
namespace skyline {
@ -51,10 +52,15 @@ namespace skyline {
*/
template<typename F>
[[noreturn]] void Process(F function) {
TRACE_EVENT_BEGIN("containers", "CircularQueue::Process");
while (true) {
if (start == end) {
std::unique_lock lock(productionMutex);
TRACE_EVENT_END("containers");
produceCondition.wait(lock, [this]() { return start != end; });
TRACE_EVENT_BEGIN("containers", "CircularQueue::Process");
}
while (start != end) {

View File

@ -183,8 +183,6 @@ namespace skyline::signal {
std::call_once(signalHandlerOnce[signal], [signal, &action]() {
struct sigaction oldAction;
Sigaction(signal, &action, &oldAction);
if (oldAction.sa_flags && oldAction.sa_flags != action.sa_flags)
throw exception("Old sigaction flags aren't equivalent to the replaced signal: {:#b} | {:#b}", oldAction.sa_flags, action.sa_flags);
DefaultSignalHandlers.at(signal).function = (oldAction.sa_flags & SA_SIGINFO) ? oldAction.sa_sigaction : reinterpret_cast<void (*)(int, struct siginfo *, void *)>(oldAction.sa_handler);
});

View File

@ -13,7 +13,8 @@ PERFETTO_DEFINE_CATEGORIES(
perfetto::Category("kernel").SetDescription("Events from parts of the HLE kernel"),
perfetto::Category("guest").SetDescription("Events relating to guest code"),
perfetto::Category("gpu").SetDescription("Events from the emulated GPU"),
perfetto::Category("service").SetDescription("Events from the HLE sysmodule implementations")
perfetto::Category("service").SetDescription("Events from the HLE sysmodule implementations"),
perfetto::Category("containers").SetDescription("Events from custom container implementations")
);
namespace skyline::trace {

View File

@ -13,6 +13,7 @@ namespace skyline::service {
Busy = 16, // EBUSY
InvalidArgument = 22, // EINVAL
InappropriateIoctlForDevice = 25, // ENOTTY
FunctionNotImplemented = 38, // ENOSYS
NotSupported = 95, // EOPNOTSUPP, ENOTSUP
TimedOut = 110, // ETIMEDOUT

View File

@ -1,10 +1,16 @@
// SPDX-License-Identifier: MIT OR MPL-2.0
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <common/address_space.inc>
#include <soc.h>
#include <services/nvdrv/devices/deserialisation/deserialisation.h>
#include "as_gpu.h"
namespace skyline {
template class FlatAddressSpaceMap<u32, 0, bool, false, false, 32>;
template class FlatAllocator<u32, 0, 32>;
}
namespace skyline::service::nvdrv::device::nvhost {
AsGpu::AsGpu(const DeviceState &state, Core &core, const SessionContext &ctx) : NvDevice(state, core, ctx) {}
@ -14,38 +20,66 @@ namespace skyline::service::nvdrv::device::nvhost {
}
PosixResult AsGpu::AllocSpace(In<u32> pages, In<u32> pageSize, In<MappingFlags> flags, InOut<u64> offset) {
// TODO: track this on the nvdrv side and have the gmmu only do virt -> phys
// Also fix error codes
u64 size{static_cast<u64>(pages) * static_cast<u64>(pageSize)};
state.logger->Debug("pages: 0x{:X}, pageSize: 0x{:X}, flags: ( fixed: {}, sparse: {} ), offset: 0x{:X}", pages, pageSize, flags.fixed, flags.sparse, offset);
if (pageSize != VM::PageSize && pageSize != vm.bigPageSize)
return PosixResult::InvalidArgument;
if (pageSize != vm.bigPageSize && flags.sparse)
return PosixResult::FunctionNotImplemented;
u32 pageSizeBits{pageSize == VM::PageSize ? VM::PageSizeBits : vm.bigPageSizeBits};
auto &allocator{[&] () -> auto & {
if (pageSize == VM::PageSize)
return vm.smallPageAllocator;
else
return vm.bigPageAllocator;
}()};
if (flags.fixed)
offset = state.soc->gmmu.ReserveFixed(offset, size);
allocator->AllocateFixed(offset >> pageSizeBits, pages);
else
offset = state.soc->gmmu.ReserveSpace(size, offset); // offset contains the input alignment
offset = static_cast<u64>(allocator->Allocate(pages)) << pageSizeBits;
if (offset == 0) {
state.logger->Warn("Failed to allocate GPU address space region!");
return PosixResult::InvalidArgument;
}
u64 size{static_cast<u64>(pages) * static_cast<u64>(pageSize)};
if (flags.sparse)
state.soc->gm20b.gmmu.Map(offset, soc::gm20b::GM20B::GMMU::SparsePlaceholderAddress(), size, true);
allocationMap[offset] = {
.size = size,
.pageSize = pageSize,
.sparse = flags.sparse
};
return PosixResult::Success;
}
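A worked example of the page-size arithmetic above, using the default 128 KiB big page size (request values are hypothetical):
// AllocSpace(pages = 0x10, pageSize = 0x20000, flags.sparse = 1):
// pageSizeBits = 17, size = 0x10 * 0x20000 = 0x200000 bytes
// offset = static_cast<u64>(bigPageAllocator->Allocate(0x10)) << 17
// e.g. a returned big-page index of 0x400 corresponds to GPU VA 0x8000000
// with the sparse flag set, the whole range is mapped to GMMU::SparsePlaceholderAddress()
// until MapBufferEx/Remap backs it with real host memory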
PosixResult AsGpu::FreeSpace(In<u64> offset, In<u32> pages, In<u32> pageSize) {
// TODO: implement this when we add nvdrv side address space allocation
// TODO: implement after UNMAP
return PosixResult::Success;
}
PosixResult AsGpu::UnmapBuffer(In<u64> offset) {
state.logger->Debug("offset: 0x{:X}", offset);
try {
auto region{regionMap.at(offset)};
auto mapping{mappingMap.at(offset)};
// Non-fixed regions are unmapped so that they can be used by future non-fixed mappings
if (!region.fixed)
if (!state.soc->gmmu.Unmap(offset, region.size))
state.logger->Warn("Failed to unmap region at 0x{:X}", offset);
if (!mapping->fixed) {
auto &allocator{mapping->bigPage ? vm.bigPageAllocator : vm.smallPageAllocator};
u32 pageSizeBits{mapping->bigPage ? vm.bigPageSizeBits : VM::PageSizeBits};
regionMap.erase(offset);
allocator->Free(mapping->offset >> pageSizeBits, mapping->size >> pageSizeBits);
}
if (mapping->sparseAlloc)
state.soc->gm20b.gmmu.Map(offset, soc::gm20b::GM20B::GMMU::SparsePlaceholderAddress(), mapping->size, true);
else
state.soc->gm20b.gmmu.Unmap(offset, mapping->size);
mappingMap.erase(offset);
} catch (const std::out_of_range &e) {
state.logger->Warn("Couldn't find region to unmap at 0x{:X}", offset);
}
@ -53,62 +87,94 @@ namespace skyline::service::nvdrv::device::nvhost {
return PosixResult::Success;
}
PosixResult AsGpu::MapBufferEx(In<MappingFlags> flags, In<u32> kind, In<core::NvMap::Handle::Id> handle, InOut<u32> pageSize, In<u64> bufferOffset, In<u64> mappingSize, InOut<u64> offset) {
state.logger->Debug("flags: ( fixed: {}, remap: {} ), kind: {}, handle: {}, pageSize: 0x{:X}, bufferOffset: 0x{:X}, mappingSize: 0x{:X}, offset: 0x{:X}", flags.fixed, flags.remap, kind, handle, pageSize, bufferOffset, mappingSize, offset);
PosixResult AsGpu::MapBufferEx(In<MappingFlags> flags, In<u32> kind, In<core::NvMap::Handle::Id> handle, In<u64> bufferOffset, In<u64> mappingSize, InOut<u64> offset) {
if (!vm.initialised)
return PosixResult::InvalidArgument;
state.logger->Debug("flags: ( fixed: {}, remap: {} ), kind: {}, handle: {}, bufferOffset: 0x{:X}, mappingSize: 0x{:X}, offset: 0x{:X}", flags.fixed, flags.remap, kind, handle, bufferOffset, mappingSize, offset);
if (flags.remap) {
auto region{regionMap.lower_bound(offset)};
if (region == regionMap.end()) {
try {
auto mapping{mappingMap.at(offset)};
if (mapping->size < mappingSize) {
state.logger->Warn("Cannot remap a partially mapped GPU address space region: 0x{:X}", offset);
return PosixResult::InvalidArgument;
}
u64 gpuAddress{offset + bufferOffset};
u8 *cpuPtr{mapping->ptr + bufferOffset};
state.soc->gm20b.gmmu.Map(gpuAddress, cpuPtr, mappingSize);
return PosixResult::Success;
} catch (const std::out_of_range &e) {
state.logger->Warn("Cannot remap an unmapped GPU address space region: 0x{:X}", offset);
return PosixResult::InvalidArgument;
}
if (region->second.size < mappingSize) {
state.logger->Warn("Cannot remap an partially mapped GPU address space region: 0x{:X}", offset);
return PosixResult::InvalidArgument;
}
u64 gpuAddress{offset + bufferOffset};
u8 *cpuPtr{region->second.ptr + bufferOffset};
if (!state.soc->gmmu.MapFixed(gpuAddress, cpuPtr, mappingSize)) {
state.logger->Warn("Failed to remap GPU address space region: 0x{:X}", gpuAddress);
return PosixResult::InvalidArgument;
}
return PosixResult::Success;
}
auto h{core.nvMap.GetHandle(handle)};
if (!h)
return PosixResult::InvalidArgument;
if (auto err{h->Duplicate(ctx.internalSession)}; err != PosixResult::Success)
return err;
u8 *cpuPtr{reinterpret_cast<u8 *>(h->address + bufferOffset)};
u64 size{mappingSize ? mappingSize : h->origSize};
if (flags.fixed)
offset = state.soc->gmmu.MapFixed(offset, cpuPtr, size);
else
offset = state.soc->gmmu.MapAllocate(cpuPtr, size);
if (flags.fixed) {
auto alloc{allocationMap.upper_bound(offset)};
if (offset == 0) {
state.logger->Warn("Failed to map GPU address space region!");
return PosixResult::InvalidArgument;
if (alloc-- == allocationMap.begin() || (offset - alloc->first) + size > alloc->second.size)
throw exception("Cannot perform a fixed mapping into an unallocated region!");
state.soc->gm20b.gmmu.Map(offset, cpuPtr, size);
auto mapping{std::make_shared<Mapping>(cpuPtr, offset, size, true, false, alloc->second.sparse)};
alloc->second.mappings.push_back(mapping);
mappingMap[offset] = mapping;
} else {
bool bigPage{[&] () {
if (util::IsAligned(h->align, vm.bigPageSize))
return true;
else if (util::IsAligned(h->align, VM::PageSize))
return false;
else
throw exception("Invalid handle alignment: 0x{:X}", h->align);
}()};
auto &allocator{bigPage ? vm.bigPageAllocator : vm.smallPageAllocator};
u32 pageSize{bigPage ? vm.bigPageSize : VM::PageSize};
u32 pageSizeBits{bigPage ? vm.bigPageSizeBits : VM::PageSizeBits};
offset = static_cast<u64>(allocator->Allocate(util::AlignUp(size, pageSize) >> pageSizeBits)) << pageSizeBits;
state.soc->gm20b.gmmu.Map(offset, cpuPtr, size);
auto mapping{std::make_shared<Mapping>(cpuPtr, offset, size, false, bigPage, false)};
mappingMap[offset] = mapping;
}
state.logger->Debug("Mapped to 0x{:X}", offset);
regionMap[offset] = {cpuPtr, size, flags.fixed};
return PosixResult::Success;
}
PosixResult AsGpu::GetVaRegions(In<u64> bufAddr, InOut<u32> bufSize, Out<std::array<VaRegion, 2>> vaRegions) {
// TODO: impl when we move allocator to nvdrv
if (!vm.initialised)
return PosixResult::InvalidArgument;
vaRegions = std::array<VaRegion, 2> {
VaRegion{
.pageSize = VM::PageSize,
.pages = vm.smallPageAllocator->vaLimit - vm.smallPageAllocator->vaStart,
.offset = vm.smallPageAllocator->vaStart << VM::PageSizeBits,
},
VaRegion{
.pageSize = vm.bigPageSize,
.pages = vm.bigPageAllocator->vaLimit - vm.bigPageAllocator->vaStart,
.offset = vm.bigPageAllocator->vaStart << vm.bigPageSizeBits,
}
};
return PosixResult::Success;
}
@ -116,30 +182,83 @@ namespace skyline::service::nvdrv::device::nvhost {
return GetVaRegions(bufAddr, bufSize, vaRegions);
}
PosixResult AsGpu::AllocAsEx(In<u32> bigPageSize, In<FileDescriptor> asFd, In<u32> flags, In<u64> vaRangeStart, In<u64> vaRangeEnd, In<u64> vaRangeSplit) {
// TODO: create the allocator here
PosixResult AsGpu::AllocAsEx(In<u32> flags, In<FileDescriptor> asFd, In<u32> bigPageSize, In<u64> vaRangeStart, In<u64> vaRangeEnd, In<u64> vaRangeSplit) {
if (vm.initialised)
throw exception("Cannot initialise an address space twice!");
state.logger->Debug("bigPageSize: 0x{:X}, asFd: {}, flags: 0x{:X}, vaRangeStart: 0x{:X}, vaRangeEnd: 0x{:X}, vaRangeSplit: 0x{:X}",
bigPageSize, asFd, flags, vaRangeStart, vaRangeEnd, vaRangeSplit);
if (bigPageSize) {
if (!std::ispow2(bigPageSize)) {
state.logger->Error("Non power-of-2 big page size: 0x{:X}!", bigPageSize);
return PosixResult::InvalidArgument;
}
if (!(bigPageSize & VM::SupportedBigPageSizes)) {
state.logger->Error("Unsupported big page size: 0x{:X}!", bigPageSize);
return PosixResult::InvalidArgument;
}
vm.bigPageSize = bigPageSize;
vm.bigPageSizeBits = std::countr_zero(bigPageSize);
vm.vaRangeStart = bigPageSize << VM::VaStartShift;
}
if (vaRangeStart) {
vm.vaRangeStart = vaRangeStart;
vm.vaRangeSplit = vaRangeSplit;
vm.vaRangeEnd = vaRangeEnd;
}
u64 startPages{vm.vaRangeStart >> VM::PageSizeBits};
u64 endPages{vm.vaRangeSplit >> VM::PageSizeBits};
vm.smallPageAllocator = std::make_unique<VM::Allocator>(startPages, endPages);
u64 startBigPages{vm.vaRangeSplit >> vm.bigPageSizeBits};
u64 endBigPages{(vm.vaRangeEnd - vm.vaRangeSplit) >> vm.bigPageSizeBits};
vm.bigPageAllocator = std::make_unique<VM::Allocator>(startBigPages, endBigPages);
vm.initialised = true;
return PosixResult::Success;
}
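For reference, the address-space layout produced by the defaults in the VM struct (no explicit big page size or ranges passed in) works out as follows:
// vaRangeStart = DefaultBigPageSize << VaStartShift = 0x20000 << 10 = 0x8000000 (128 MiB)
// vaRangeSplit = 1 << 34 (16 GiB), vaRangeEnd = 1 << 37 (128 GiB)
// smallPageAllocator covers 4 KiB page indices [0x8000000 >> 12, (1 << 34) >> 12)
// bigPageAllocator starts at 128 KiB page index (1 << 34) >> 17
// i.e. small-page mappings sit below the 16 GiB split and big-page mappings above it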
PosixResult AsGpu::Remap(span<RemapEntry> entries) {
constexpr u32 BigPageSize{0x10}; //!< The big page size of the GPU
for (const auto &entry : entries) {
auto h{core.nvMap.GetHandle(entry.handle)};
if (!h)
u64 virtAddr{static_cast<u64>(entry.asOffsetBigPages) << vm.bigPageSizeBits};
u64 size{static_cast<u64>(entry.bigPages) << vm.bigPageSizeBits};
auto alloc{allocationMap.upper_bound(virtAddr)};
if (alloc-- == allocationMap.begin() || (virtAddr - alloc->first) + size > alloc->second.size) {
state.logger->Warn("Cannot remap into an unallocated region!");
return PosixResult::InvalidArgument;
}
u64 virtAddr{static_cast<u64>(entry.asOffsetBigPages) << BigPageSize};
u8 *cpuPtr{reinterpret_cast<u8 *>(h->address + (static_cast<u64>(entry.handleOffsetBigPages) << BigPageSize))};
u64 size{static_cast<u64>(entry.bigPages) << BigPageSize};
if (!alloc->second.sparse) {
state.logger->Warn("Cannot remap a non-sparse mapping!");
return PosixResult::InvalidArgument;
}
state.soc->gmmu.MapFixed(virtAddr, cpuPtr, size);
if (!entry.handle) {
state.soc->gm20b.gmmu.Map(virtAddr, soc::gm20b::GM20B::GMMU::SparsePlaceholderAddress(), size, true);
} else {
auto h{core.nvMap.GetHandle(entry.handle)};
if (!h)
return PosixResult::InvalidArgument;
u8 *cpuPtr{reinterpret_cast<u8 *>(h->address + (static_cast<u64>(entry.handleOffsetBigPages) << vm.bigPageSizeBits))};
state.soc->gm20b.gmmu.Map(virtAddr, cpuPtr, size);
}
}
return PosixResult::Success;
}
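A short worked example of the big-page arithmetic in Remap (hypothetical entry values, assuming the default 17-bit big page shift):
// RemapEntry{ handle = 0, asOffsetBigPages = 0x400, bigPages = 0x10 }:
// virtAddr = 0x400 << 17 = 0x8000000, size = 0x10 << 17 = 0x200000
// handle == 0 -> the range is re-pointed at the sparse placeholder address
// handle != 0 -> the range is mapped to h->address + (handleOffsetBigPages << 17)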
#include <services/nvdrv/devices/deserialisation/macro_def.h>
#include <services/nvdrv/devices/deserialisation/macro_def.inc>
static constexpr u32 AsGpuMagic{0x41};
VARIABLE_IOCTL_HANDLER_FUNC(AsGpu, ({
@ -152,7 +271,7 @@ namespace skyline::service::nvdrv::device::nvhost {
IOCTL_CASE_ARGS(INOUT, SIZE(0x8), MAGIC(AsGpuMagic), FUNC(0x5),
UnmapBuffer, ARGS(In<u64>))
IOCTL_CASE_ARGS(INOUT, SIZE(0x28), MAGIC(AsGpuMagic), FUNC(0x6),
MapBufferEx, ARGS(In<MappingFlags>, In<u32>, In<core::NvMap::Handle::Id>, InOut<u32>, In<u64>, In<u64>, InOut<u64>))
MapBufferEx, ARGS(In<MappingFlags>, In<u32>, In<core::NvMap::Handle::Id>, Pad<u32>, In<u64>, In<u64>, InOut<u64>))
IOCTL_CASE_ARGS(INOUT, SIZE(0x40), MAGIC(AsGpuMagic), FUNC(0x8),
GetVaRegions, ARGS(In<u64>, InOut<u32>, Pad<u32>, Out<std::array<VaRegion, 2>>))
IOCTL_CASE_ARGS(IN, SIZE(0x28), MAGIC(AsGpuMagic), FUNC(0x9),
@ -166,5 +285,5 @@ namespace skyline::service::nvdrv::device::nvhost {
INLINE_IOCTL_CASE_ARGS(INOUT, SIZE(0x40), MAGIC(AsGpuMagic), FUNC(0x8),
GetVaRegions3, ARGS(In<u64>, InOut<u32>, Pad<u32>, Out<std::array<VaRegion, 2>>))
}))
#include <services/nvdrv/devices/deserialisation/macro_undef.h>
#include <services/nvdrv/devices/deserialisation/macro_undef.inc>
}

View File

@ -3,6 +3,8 @@
#pragma once
#include <common/address_space.h>
#include <services/nvdrv/devices/nvdevice.h>
namespace skyline::service::nvdrv::device::nvhost {
@ -12,18 +14,64 @@ namespace skyline::service::nvdrv::device::nvhost {
*/
class AsGpu : public NvDevice {
private:
struct AddressSpaceRegion {
struct Mapping {
u8 *ptr;
u64 offset;
u64 size;
bool fixed;
bool bigPage; // Only valid if fixed == false
bool sparseAlloc;
Mapping(u8 *ptr, u64 offset, u64 size, bool fixed, bool bigPage, bool sparseAlloc) : ptr(ptr),
offset(offset),
size(size),
fixed(fixed),
bigPage(bigPage),
sparseAlloc(sparseAlloc) {}
};
std::map<u64, AddressSpaceRegion> regionMap; //!< Maps the base address of a mapped buffer to its total size and mapping type; this is needed because what was originally a single buffer may have been split into multiple GPU-side buffers with the remap flag
struct Allocation {
u64 size;
std::list<std::shared_ptr<Mapping>> mappings;
u32 pageSize;
bool sparse;
};
std::map<u64, std::shared_ptr<Mapping>> mappingMap; //!< Maps the base address of a mapped buffer to its total size and mapping type; this is needed because what was originally a single buffer may have been split into multiple GPU-side buffers with the remap flag
std::map<u64, Allocation> allocationMap;
struct VM {
static constexpr u32 PageSize{0x1000};
static constexpr u32 PageSizeBits{std::countr_zero(PageSize)};
static constexpr u32 SupportedBigPageSizes{0x30000};
static constexpr u32 DefaultBigPageSize{0x20000};
u32 bigPageSize{DefaultBigPageSize};
u32 bigPageSizeBits{std::countr_zero(DefaultBigPageSize)};
static constexpr u32 VaStartShift{10};
static constexpr u64 DefaultVaSplit{1ULL << 34};
static constexpr u64 DefaultVaRange{1ULL << 37};
u64 vaRangeStart{DefaultBigPageSize << VaStartShift};
u64 vaRangeSplit{DefaultVaSplit};
u64 vaRangeEnd{DefaultVaRange};
using Allocator = FlatAllocator<u32, 0, 32>;
std::unique_ptr<Allocator> bigPageAllocator{};
std::unique_ptr<Allocator> smallPageAllocator{};
bool initialised{};
} vm;
public:
struct MappingFlags {
bool fixed : 1;
u8 _pad0_ : 7;
bool sparse : 1;
u8 _pad0_ : 6;
bool remap : 1;
u32 _pad1_ : 23;
};
@ -77,7 +125,7 @@ namespace skyline::service::nvdrv::device::nvhost {
* @brief Maps a region into this address space with extra parameters
* @url https://switchbrew.org/wiki/NV_services#NVGPU_AS_IOCTL_MAP_BUFFER_EX
*/
PosixResult MapBufferEx(In<MappingFlags> flags, In<u32> kind, In<core::NvMap::Handle::Id> handle, InOut<u32> pageSize, In<u64> bufferOffset, In<u64> mappingSize, InOut<u64> offset);
PosixResult MapBufferEx(In<MappingFlags> flags, In<u32> kind, In<core::NvMap::Handle::Id> handle, In<u64> bufferOffset, In<u64> mappingSize, InOut<u64> offset);
/**
* @brief Returns info about the address space and its page sizes
@ -94,7 +142,7 @@ namespace skyline::service::nvdrv::device::nvhost {
* @brief Allocates this address space with the given parameters
* @url https://switchbrew.org/wiki/NV_services#NVGPU_AS_IOCTL_ALLOC_AS_EX
*/
PosixResult AllocAsEx(In<u32> bigPageSize, In<FileDescriptor> asFd, In<u32> flags, In<u64> vaRangeStart, In<u64> vaRangeEnd, In<u64> vaRangeSplit);
PosixResult AllocAsEx(In<u32> flags, In<FileDescriptor> asFd, In<u32> bigPageSize, In<u64> vaRangeStart, In<u64> vaRangeEnd, In<u64> vaRangeSplit);
/**
* @brief Remaps a region of the GPU address space

View File

@ -234,7 +234,7 @@ namespace skyline::service::nvdrv::device::nvhost {
return nullptr;
}
#include <services/nvdrv/devices/deserialisation/macro_def.h>
#include <services/nvdrv/devices/deserialisation/macro_def.inc>
static constexpr u32 CtrlMagic{0};
IOCTL_HANDLER_FUNC(Ctrl, ({
@ -254,5 +254,5 @@ namespace skyline::service::nvdrv::device::nvhost {
IOCTL_CASE_RESULT(INOUT, SIZE(0x183), MAGIC(CtrlMagic), FUNC(0x1B),
PosixResult::InvalidArgument) // GetConfig isn't available in production
}))
#include <services/nvdrv/devices/deserialisation/macro_undef.h>
#include <services/nvdrv/devices/deserialisation/macro_undef.inc>
}

View File

@ -62,7 +62,7 @@ namespace skyline::service::nvdrv::device::nvhost {
}
}
#include <services/nvdrv/devices/deserialisation/macro_def.h>
#include <services/nvdrv/devices/deserialisation/macro_def.inc>
static constexpr u32 CtrlGpuMagic{0x47};
IOCTL_HANDLER_FUNC(CtrlGpu, ({
@ -77,5 +77,5 @@ namespace skyline::service::nvdrv::device::nvhost {
IOCTL_CASE_ARGS(OUT, SIZE(0x8), MAGIC(CtrlGpuMagic), FUNC(0x14),
GetActiveSlotMask, ARGS(Out<u32>, Out<u32>))
}))
#include <services/nvdrv/devices/deserialisation/macro_undef.h>
#include <services/nvdrv/devices/deserialisation/macro_undef.inc>
}

View File

@ -104,7 +104,7 @@ namespace skyline::service::nvdrv::device::nvhost {
}
}
#include <services/nvdrv/devices/deserialisation/macro_def.h>
#include <services/nvdrv/devices/deserialisation/macro_def.inc>
static constexpr u32 GpuChannelUserMagic{0x47};
static constexpr u32 GpuChannelMagic{0x48};
@ -138,5 +138,5 @@ namespace skyline::service::nvdrv::device::nvhost {
INLINE_IOCTL_CASE_ARGS(INOUT, SIZE(0x18), MAGIC(GpuChannelMagic), FUNC(0x1B),
SubmitGpfifo2, ARGS(In<u64>, In<u32>, InOut<SubmitGpfifoFlags>, InOut<Fence>))
}))
#include <services/nvdrv/devices/deserialisation/macro_undef.h>
#include <services/nvdrv/devices/deserialisation/macro_undef.inc>
}

View File

@ -115,7 +115,7 @@ namespace skyline::service::nvdrv::device {
return PosixResult::Success;
}
#include "deserialisation/macro_def.h"
#include "deserialisation/macro_def.inc"
static constexpr u32 NvMapMagic{1};
IOCTL_HANDLER_FUNC(NvMap, ({
@ -132,6 +132,6 @@ namespace skyline::service::nvdrv::device {
IOCTL_CASE_ARGS(INOUT, SIZE(0x8), MAGIC(NvMapMagic), FUNC(0xE),
GetId, ARGS(Out<NvMapCore::Handle::Id>, In<NvMapCore::Handle::Id>))
}))
#include "deserialisation/macro_undef.h"
#include "deserialisation/macro_undef.inc"
}

View File

@ -3,7 +3,6 @@
#pragma once
#include "soc/gmmu.h"
#include "soc/host1x.h"
#include "soc/gm20b.h"
@ -14,10 +13,9 @@ namespace skyline::soc {
*/
class SOC {
public:
gmmu::GraphicsMemoryManager gmmu;
host1x::Host1X host1x;
gm20b::GM20B gm20b;
SOC(const DeviceState &state) : gmmu(state), gm20b(state) {}
SOC(const DeviceState &state) : gm20b(state) {}
};
}

View File

@ -0,0 +1,20 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <common/address_space.inc>
#include "gm20b.h"
namespace skyline {
template class FlatAddressSpaceMap<u64, 0, u8 *, nullptr, true, soc::gm20b::GM20B::AddressSpaceBits>;
template class FlatMemoryManager<u64, 0, soc::gm20b::GM20B::AddressSpaceBits>;
}
namespace skyline::soc::gm20b {
GM20B::GM20B(const DeviceState &state) :
fermi2D(state),
keplerMemory(state),
maxwell3D(state),
maxwellCompute(state),
maxwellDma(state),
gpfifo(state) {}
}

View File

@ -3,23 +3,28 @@
#pragma once
#include <common/address_space.h>
#include "gm20b/engines/maxwell_3d.h"
#include "gm20b/gpfifo.h"
namespace skyline::soc::gm20b {
/**
* @brief The GPU block in the X1, it contains all GPU engines required for accelerating graphics operations
* @note We omit parts of components related to external access such as the GM20B Host, all accesses to the external components are done directly
* @note We omit parts of components related to external access such as the grhost, all accesses to the external components are done directly
*/
class GM20B {
public:
static constexpr u8 AddressSpaceBits{40}; //!< The width of the GMMU AS
using GMMU = FlatMemoryManager<u64, 0, AddressSpaceBits>;
engine::Engine fermi2D;
engine::maxwell3d::Maxwell3D maxwell3D;
engine::Engine maxwellCompute;
engine::Engine maxwellDma;
engine::Engine keplerMemory;
GPFIFO gpfifo;
GMMU gmmu;
GM20B(const DeviceState &state) : fermi2D(state), keplerMemory(state), maxwell3D(state), maxwellCompute(state), maxwellDma(state), gpfifo(state) {}
GM20B(const DeviceState &state);
};
}

View File

@ -1,7 +1,7 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2020 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <soc/gmmu.h>
#include <common/address_space.h>
#include <soc/gm20b/engines/maxwell_3d.h>
namespace skyline::soc::gm20b::engine::maxwell3d {

View File

@ -157,7 +157,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
switch (registers.semaphore.info.structureSize) {
case Registers::SemaphoreInfo::StructureSize::OneWord:
state.soc->gmmu.Write<u32>(static_cast<u32>(result), registers.semaphore.address.Pack());
state.soc->gm20b.gmmu.Write<u32>(registers.semaphore.address.Pack(), static_cast<u32>(result));
break;
case Registers::SemaphoreInfo::StructureSize::FourWords: {
// Convert the current nanosecond time to GPU ticks
@ -167,7 +167,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
u64 nsTime{util::GetTimeNs()};
u64 timestamp{(nsTime / NsToTickDenominator) * NsToTickNumerator + ((nsTime % NsToTickDenominator) * NsToTickNumerator) / NsToTickDenominator};
state.soc->gmmu.Write<FourWordResult>(FourWordResult{result, timestamp}, registers.semaphore.address.Pack());
state.soc->gm20b.gmmu.Write<FourWordResult>(registers.semaphore.address.Pack(), FourWordResult{result, timestamp});
break;
}
}

View File

@ -56,7 +56,7 @@ namespace skyline::soc::gm20b {
}
pushBufferData.resize(gpEntry.size);
state.soc->gmmu.Read<u32>(pushBufferData, gpEntry.Address());
state.soc->gm20b.gmmu.Read<u32>(pushBufferData, gpEntry.Address());
for (auto entry{pushBufferData.begin()}; entry != pushBufferData.end(); entry++) {
// An entry containing all zeroes is a NOP, skip over it
@ -88,8 +88,7 @@ namespace skyline::soc::gm20b {
return;
default:
state.logger->Warn("Unsupported pushbuffer method SecOp: {}", static_cast<u8>(methodHeader.secOp));
break;
throw exception("Unsupported pushbuffer method SecOp: {}", static_cast<u8>(methodHeader.secOp));
}
}
}
@ -106,7 +105,7 @@ namespace skyline::soc::gm20b {
try {
signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);
pushBuffers->Process([this](GpEntry gpEntry) {
state.logger->Debug("Processing pushbuffer: 0x{:X}", gpEntry.Address());
state.logger->Warn("Processing pushbuffer: 0x{:X}", gpEntry.Address());
Process(gpEntry);
});
} catch (const signal::SignalException &e) {

View File

@ -1,214 +0,0 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2020 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <kernel/types/KProcess.h>
#include "gmmu.h"
namespace skyline::soc::gmmu {
constexpr u64 GpuPageSize{1 << 16}; //!< The page size of the GPU address space
GraphicsMemoryManager::GraphicsMemoryManager(const DeviceState &state) : state(state) {
constexpr u64 gpuAddressSpaceSize{1UL << 40}; //!< The size of the GPU address space
constexpr u64 gpuAddressSpaceBase{0x100000}; //!< The base of the GPU address space - must be non-zero
// Create the initial chunk that will be split to create new chunks
ChunkDescriptor baseChunk(gpuAddressSpaceBase, gpuAddressSpaceSize, nullptr, ChunkState::Unmapped);
chunks.push_back(baseChunk);
}
std::optional<ChunkDescriptor> GraphicsMemoryManager::FindChunk(ChunkState desiredState, u64 size, u64 alignment) {
auto chunk{std::find_if(chunks.begin(), chunks.end(), [desiredState, size, alignment](const ChunkDescriptor &chunk) -> bool {
return (alignment ? util::IsAligned(chunk.virtualAddress, alignment) : true) && chunk.size > size && chunk.state == desiredState;
})};
if (chunk != chunks.end())
return *chunk;
return std::nullopt;
}
u64 GraphicsMemoryManager::InsertChunk(const ChunkDescriptor &newChunk) {
auto chunkEnd{chunks.end()};
for (auto chunk{chunks.begin()}; chunk != chunkEnd; chunk++) {
if (chunk->CanContain(newChunk)) {
auto oldChunk{*chunk};
u64 newSize{newChunk.virtualAddress - chunk->virtualAddress};
u64 extension{chunk->size - newSize - newChunk.size};
if (newSize == 0) {
*chunk = newChunk;
} else {
chunk->size = newSize;
chunk = chunks.insert(std::next(chunk), newChunk);
}
if (extension)
chunks.insert(std::next(chunk), ChunkDescriptor(newChunk.virtualAddress + newChunk.size, extension, (oldChunk.state == ChunkState::Mapped) ? (oldChunk.cpuPtr + newSize + newChunk.size) : nullptr, oldChunk.state));
return newChunk.virtualAddress;
} else if (chunk->virtualAddress + chunk->size > newChunk.virtualAddress) {
chunk->size = newChunk.virtualAddress - chunk->virtualAddress;
// Deletes all chunks that are within the chunk being inserted and split the final one
auto tailChunk{std::next(chunk)};
while (tailChunk != chunkEnd) {
if (tailChunk->virtualAddress + tailChunk->size >= newChunk.virtualAddress + newChunk.size)
break;
tailChunk = chunks.erase(tailChunk);
chunkEnd = chunks.end();
}
// The given chunk is too large to fit into existing chunks
if (tailChunk == chunkEnd)
break;
u64 chunkSliceOffset{newChunk.virtualAddress + newChunk.size - tailChunk->virtualAddress};
tailChunk->virtualAddress += chunkSliceOffset;
tailChunk->size -= chunkSliceOffset;
if (tailChunk->state == ChunkState::Mapped)
tailChunk->cpuPtr += chunkSliceOffset;
// If the size of the head chunk is zero then we can directly replace it with our new one rather than inserting it
auto headChunk{std::prev(tailChunk)};
if (headChunk->size == 0)
*headChunk = newChunk;
else
chunks.insert(std::next(headChunk), newChunk);
return newChunk.virtualAddress;
}
}
throw exception("Failed to insert chunk into GPU address space!");
}
u64 GraphicsMemoryManager::ReserveSpace(u64 size, u64 alignment) {
size = util::AlignUp(size, GpuPageSize);
std::unique_lock lock(mutex);
auto newChunk{FindChunk(ChunkState::Unmapped, size, alignment)};
if (!newChunk) [[unlikely]]
return 0;
auto chunk{*newChunk};
chunk.size = size;
chunk.state = ChunkState::Reserved;
return InsertChunk(chunk);
}
u64 GraphicsMemoryManager::ReserveFixed(u64 virtualAddress, u64 size) {
if (!util::IsAligned(virtualAddress, GpuPageSize)) [[unlikely]]
return 0;
size = util::AlignUp(size, GpuPageSize);
std::unique_lock lock(mutex);
return InsertChunk(ChunkDescriptor(virtualAddress, size, nullptr, ChunkState::Reserved));
}
u64 GraphicsMemoryManager::MapAllocate(u8 *cpuPtr, u64 size) {
size = util::AlignUp(size, GpuPageSize);
std::unique_lock lock(mutex);
auto mappedChunk{FindChunk(ChunkState::Unmapped, size)};
if (!mappedChunk) [[unlikely]]
return 0;
auto chunk{*mappedChunk};
chunk.cpuPtr = cpuPtr;
chunk.size = size;
chunk.state = ChunkState::Mapped;
return InsertChunk(chunk);
}
u64 GraphicsMemoryManager::MapFixed(u64 virtualAddress, u8 *cpuPtr, u64 size) {
if (!util::IsAligned(virtualAddress, GpuPageSize)) [[unlikely]]
return 0;
size = util::AlignUp(size, GpuPageSize);
std::unique_lock lock(mutex);
return InsertChunk(ChunkDescriptor(virtualAddress, size, cpuPtr, ChunkState::Mapped));
}
bool GraphicsMemoryManager::Unmap(u64 virtualAddress, u64 size) {
if (!util::IsAligned(virtualAddress, GpuPageSize)) [[unlikely]]
return false;
try {
std::unique_lock lock(mutex);
InsertChunk(ChunkDescriptor(virtualAddress, size, nullptr, ChunkState::Unmapped));
} catch (const std::exception &e) {
return false;
}
return true;
}
void GraphicsMemoryManager::Read(u8 *destination, u64 virtualAddress, u64 size) {
std::shared_lock lock(mutex);
auto chunk{std::upper_bound(chunks.begin(), chunks.end(), virtualAddress, [](const u64 address, const ChunkDescriptor &chunk) -> bool {
return address < chunk.virtualAddress;
})};
if (chunk == chunks.end() || chunk->state != ChunkState::Mapped)
throw exception("Failed to read region in GPU address space: Address: 0x{:X}, Size: 0x{:X}", virtualAddress, size);
chunk--;
u64 initialSize{size};
u64 chunkOffset{virtualAddress - chunk->virtualAddress};
u8 *source{chunk->cpuPtr + chunkOffset};
u64 sourceSize{std::min(chunk->size - chunkOffset, size)};
// A continuous region in the GPU address space may be made up of several discontinuous regions in physical memory so we have to iterate over all chunks
while (size) {
std::memcpy(destination + (initialSize - size), source, sourceSize);
size -= sourceSize;
if (size) {
if (++chunk == chunks.end() || chunk->state != ChunkState::Mapped)
throw exception("Failed to read region in GPU address space: Address: 0x{:X}, Size: 0x{:X}", virtualAddress, size);
source = chunk->cpuPtr;
sourceSize = std::min(chunk->size, size);
}
}
}
void GraphicsMemoryManager::Write(u8 *source, u64 virtualAddress, u64 size) {
std::shared_lock lock(mutex);
auto chunk{std::upper_bound(chunks.begin(), chunks.end(), virtualAddress, [](const u64 address, const ChunkDescriptor &chunk) -> bool {
return address < chunk.virtualAddress;
})};
if (chunk == chunks.end() || chunk->state != ChunkState::Mapped)
throw exception("Failed to write region in GPU address space: Address: 0x{:X}, Size: 0x{:X}", virtualAddress, size);
chunk--;
u64 initialSize{size};
u64 chunkOffset{virtualAddress - chunk->virtualAddress};
u8 *destination{chunk->cpuPtr + chunkOffset};
u64 destinationSize{std::min(chunk->size - chunkOffset, size)};
// A continuous region in the GPU address space may be made up of several discontinuous regions in physical memory so we have to iterate over all chunks
while (size) {
std::memcpy(destination, source + (initialSize - size), destinationSize);
size -= destinationSize;
if (size) {
if (++chunk == chunks.end() || chunk->state != ChunkState::Mapped)
throw exception("Failed to write region in GPU address space: Address: 0x{:X}, Size: 0x{:X}", virtualAddress, size);
destination = chunk->cpuPtr;
destinationSize = std::min(chunk->size, size);
}
}
}
}

View File

@ -1,140 +0,0 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2020 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <common.h>
namespace skyline::soc::gmmu {
enum class ChunkState {
Unmapped, //!< The chunk is unmapped
Reserved, //!< The chunk is reserved
Mapped //!< The chunk is mapped and a CPU side address is present
};
struct ChunkDescriptor {
u64 virtualAddress; //!< The address of the chunk in the virtual address space
u64 size; //!< The size of the chunk in bytes
u8 *cpuPtr; //!< A pointer to the chunk in the application's address space (if mapped)
ChunkState state;
ChunkDescriptor(u64 virtualAddress, u64 size, u8 *cpuPtr, ChunkState state) : virtualAddress(virtualAddress), size(size), cpuPtr(cpuPtr), state(state) {}
/**
* @return If the given chunk can be contained wholly within this chunk
*/
inline bool CanContain(const ChunkDescriptor &chunk) {
return (chunk.virtualAddress >= virtualAddress) && ((size + virtualAddress) >= (chunk.size + chunk.virtualAddress));
}
};
/**
* @brief The GraphicsMemoryManager class handles mapping between a Maxwell GPU virtual address space and an application's address space and is meant to roughly emulate the GMMU on the X1
* @note This is not accurate to the X1 as it would have an SMMU between the GMMU and physical memory but we don't emulate this abstraction at the moment
*/
class GraphicsMemoryManager {
private:
const DeviceState &state;
std::vector<ChunkDescriptor> chunks;
std::shared_mutex mutex;
/**
* @brief Finds a chunk in the virtual address space that meets the given size and alignment requirements
* @note vmmMutex MUST be locked when calling this
* @param desiredState The state of the chunk to find
* @param size The minimum size of the chunk to find
* @param alignment The minimum alignment of the chunk to find
* @return The first applicable chunk
*/
std::optional<ChunkDescriptor> FindChunk(ChunkState desiredState, u64 size, u64 alignment = 0);
/**
* @brief Inserts a chunk into the chunk list, resizing and splitting as necessary
* @note vmmMutex MUST be locked when calling this
* @param newChunk The chunk to insert
* @return The base virtual address of the inserted chunk
*/
u64 InsertChunk(const ChunkDescriptor &newChunk);
public:
GraphicsMemoryManager(const DeviceState &state);
/**
* @brief Reserves a region of the virtual address space so it will not be chosen automatically when mapping
* @param size The size of the region to reserve
* @param alignment The alignment of the region to reserve
* @return The base virtual address of the reserved region
*/
u64 ReserveSpace(u64 size, u64 alignment);
/**
* @brief Reserves a fixed region of the virtual address space so it will not be chosen automatically when mapping
* @param virtualAddress The virtual base address of the region to allocate
* @param size The size of the region to allocate
* @return The base virtual address of the reserved region
*/
u64 ReserveFixed(u64 virtualAddress, u64 size);
/**
* @brief Maps a CPU memory region into an automatically chosen region of the virtual address space
* @param cpuPtr A pointer to the region to be mapped into the virtual address space
* @param size The size of the region to map
* @return The base virtual address of the mapped region
*/
u64 MapAllocate(u8 *cpuPtr, u64 size);
/**
* @brief Maps a CPU memory region to a fixed region in the virtual address space
* @param virtualAddress The target virtual address of the region
* @param cpuPtr A pointer to the region to be mapped into the virtual address space
* @param size The size of the region to map
* @return The base virtual address of the mapped region
*/
u64 MapFixed(u64 virtualAddress, u8 *cpuPtr, u64 size);
/**
* @brief Unmaps all chunks in the given region from the virtual address space
* @return Whether the operation succeeded
*/
bool Unmap(u64 virtualAddress, u64 size);
void Read(u8 *destination, u64 virtualAddress, u64 size);
/**
* @brief Reads in a span from a region of the virtual address space
*/
template<typename T>
void Read(span <T> destination, u64 virtualAddress) {
Read(reinterpret_cast<u8 *>(destination.data()), virtualAddress, destination.size_bytes());
}
/**
* @brief Reads in an object from a region of the virtual address space
* @tparam T The type of object to return
*/
template<typename T>
T Read(u64 virtualAddress) {
T obj;
Read(reinterpret_cast<u8 *>(&obj), virtualAddress, sizeof(T));
return obj;
}
void Write(u8 *source, u64 virtualAddress, u64 size);
/**
* @brief Writes out a span to a region of the virtual address space
*/
template<typename T>
void Write(span <T> source, u64 virtualAddress) {
Write(reinterpret_cast<u8 *>(source.data()), virtualAddress, source.size_bytes());
}
/**
* @brief Writes out an object to a region of the virtual address space
*/
template<typename T>
void Write(T source, u64 virtualAddress) {
Write(reinterpret_cast<u8 *>(&source), virtualAddress, sizeof(T));
}
};
}