Implement multichannel support for GPU

Allows multiple channels to be executed at the same time, with locking
performed at the host GPU scheduler layer. Address spaces can be
bound to one or more channels.
This commit is contained in:
Billy Laws 2021-10-08 20:25:21 +01:00
parent b762d1df23
commit eb25f60033
26 changed files with 209 additions and 109 deletions
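
At a high level, the ownership model these changes introduce is: an AddressSpaceContext (wrapping the GMMU) may be shared by one or more ChannelContexts, each channel owning its own engine state and GPFIFO processing, while execution is serialised by a lock at the host GPU scheduler layer. The standalone C++ sketch below illustrates only that relationship; the type names, the Submit method, and the single schedulerLock mutex are simplified stand-ins for illustration, not the actual Skyline classes.

#include <cstdint>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

// Stand-in for the per-address-space state (the real AddressSpaceContext wraps the GMMU)
struct AddressSpaceContext {
    std::unordered_map<std::uint64_t, std::uint64_t> mappings; // GPU VA -> CPU address (simplified)
};

// A single lock shared by all channels, standing in for the host GPU scheduler lock
std::mutex schedulerLock;

// Stand-in for ChannelContext: owns per-channel submission state, shares an address space
struct ChannelContext {
    std::shared_ptr<AddressSpaceContext> asCtx; // may be shared with other channels
    std::string name;

    void Submit(const std::vector<std::uint32_t> &pushbuffer) {
        // Only one channel may execute against the GPU at a time
        std::scoped_lock lock(schedulerLock);
        std::cout << name << " executing " << pushbuffer.size()
                  << " words against an AS with " << asCtx->mappings.size()
                  << " mappings\n";
    }
};

int main() {
    // One address space bound to two channels, mirroring what AsGpu::BindChannel allows
    auto as{std::make_shared<AddressSpaceContext>()};
    as->mappings[0x100000] = 0xdeadbeef;

    ChannelContext a{as, "channel A"};
    ChannelContext b{as, "channel B"};

    a.Submit({0x1, 0x2, 0x3});
    b.Submit({0x4});
}

The per-channel state (engines, GPFIFO thread) lives in the channel, the GMMU lives in the shared address-space context, and cross-channel serialisation is the scheduler's responsibility rather than the channel's.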

View File

@ -103,12 +103,12 @@ add_library(skyline SHARED
${source_DIR}/skyline/gpu/texture/texture.cpp
${source_DIR}/skyline/gpu/presentation_engine.cpp
${source_DIR}/skyline/gpu/interconnect/command_executor.cpp
${source_DIR}/skyline/soc/gm20b.cpp
${source_DIR}/skyline/soc/host1x/syncpoint.cpp
${source_DIR}/skyline/soc/gm20b/channel.cpp
${source_DIR}/skyline/soc/gm20b/gpfifo.cpp
${source_DIR}/skyline/soc/gm20b/gmmu.cpp
${source_DIR}/skyline/soc/gm20b/engines/maxwell_3d.cpp
${source_DIR}/skyline/soc/gm20b/engines/maxwell/macro_interpreter.cpp
${source_DIR}/skyline/soc/gm20b/gmmu.cpp
${source_DIR}/skyline/input/npad.cpp
${source_DIR}/skyline/input/npad_device.cpp
${source_DIR}/skyline/input/touch.cpp

View File

@ -4,8 +4,10 @@
#pragma once
#include <gpu/texture/format.h>
#include <soc/gm20b/channel.h>
#include <soc/gm20b/gmmu.h>
#include <soc/gm20b/engines/maxwell/types.h>
#include "command_executor.h"
namespace skyline::gpu::interconnect {
@ -18,7 +20,7 @@ namespace skyline::gpu::interconnect {
class GraphicsContext {
private:
GPU &gpu;
soc::gm20b::GMMU &gmmu;
soc::gm20b::ChannelContext &channelCtx;
gpu::interconnect::CommandExecutor &executor;
struct RenderTarget {
@ -50,7 +52,7 @@ namespace skyline::gpu::interconnect {
public:
GraphicsContext(GPU &gpu, soc::gm20b::GMMU &gmmu, gpu::interconnect::CommandExecutor &executor) : gpu(gpu), gmmu(gmmu), executor(executor) {
GraphicsContext(GPU &gpu, soc::gm20b::ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor) : gpu(gpu), channelCtx(channelCtx), executor(executor) {
scissors.fill(DefaultScissor);
}
@ -182,7 +184,7 @@ namespace skyline::gpu::interconnect {
if (renderTarget.guest.mappings.empty()) {
auto size{std::max<u64>(renderTarget.guest.layerStride * (renderTarget.guest.layerCount - renderTarget.guest.baseArrayLayer), renderTarget.guest.format->GetSize(renderTarget.guest.dimensions))};
auto mappings{gmmu.TranslateRange(renderTarget.gpuAddress, size)};
auto mappings{channelCtx.asCtx->gmmu.TranslateRange(renderTarget.gpuAddress, size)};
renderTarget.guest.mappings.assign(mappings.begin(), mappings.end());
}

View File

@ -11,6 +11,7 @@ namespace skyline::service {
NotPermitted = 1, // EPERM
TryAgain = 11, // EAGAIN
Busy = 16, // EBUSY
FileExists = 17, // EEXIST
InvalidArgument = 22, // EINVAL
InappropriateIoctlForDevice = 25, // ENOTTY
FunctionNotImplemented = 38, // ENOSYS

View File

@ -5,7 +5,11 @@
#include "nvdevice.h"
namespace skyline::service::nvdrv::device {
NvDevice::NvDevice(const DeviceState &state, Core &core, const SessionContext &ctx) : state(state), core(core), ctx(ctx) {}
NvDevice::NvDevice(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx) :
state(state),
driver(driver),
core(core),
ctx(ctx) {}
const std::string &NvDevice::GetName() {
if (name.empty()) {

View File

@ -11,6 +11,10 @@
#include "deserialisation/types.h"
namespace skyline::service::nvdrv {
class Driver;
}
namespace skyline::service::nvdrv::device {
using namespace kernel;
using namespace deserialisation;
@ -24,11 +28,12 @@ namespace skyline::service::nvdrv::device {
protected:
const DeviceState &state;
Driver &driver;
Core &core;
SessionContext ctx;
public:
NvDevice(const DeviceState &state, Core &core, const SessionContext &ctx);
NvDevice(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx);
virtual ~NvDevice() = default;

View File

@ -3,7 +3,10 @@
#include <common/address_space.inc>
#include <soc.h>
#include <soc/gm20b/gmmu.h>
#include <services/nvdrv/driver.h>
#include <services/nvdrv/devices/deserialisation/deserialisation.h>
#include "gpu_channel.h"
#include "as_gpu.h"
namespace skyline {
@ -14,10 +17,31 @@ namespace skyline {
namespace skyline::service::nvdrv::device::nvhost {
using GMMU = soc::gm20b::GMMU;
AsGpu::AsGpu(const DeviceState &state, Core &core, const SessionContext &ctx) : NvDevice(state, core, ctx) {}
AsGpu::AsGpu(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx) : NvDevice(state, driver, core, ctx) {}
PosixResult AsGpu::BindChannel(In<FileDescriptor> channelFd) {
// TODO: support once multiple address spaces are supported
std::scoped_lock lock(mutex);
if (!vm.initialised)
return PosixResult::InvalidArgument;
try {
std::shared_lock gpuLock(driver.deviceMutex);
auto &gpuCh{dynamic_cast<GpuChannel &>(*driver.devices.at(channelFd))};
std::scoped_lock channelLock(gpuCh.channelMutex);
if (gpuCh.asCtx) {
state.logger->Warn("Attempting to bind multiple ASes to a single GPU channel");
return PosixResult::InvalidArgument;
}
gpuCh.asCtx = asCtx;
} catch (const std::out_of_range &e) {
state.logger->Warn("Attempting to bind AS to an invalid channel: {}", channelFd);
return PosixResult::InvalidArgument;
}
return PosixResult::Success;
}
@ -53,7 +77,7 @@ namespace skyline::service::nvdrv::device::nvhost {
u64 size{static_cast<u64>(pages) * pageSize};
if (flags.sparse)
state.soc->gm20b.gmmu.Map(offset, GMMU::SparsePlaceholderAddress(), size, {true});
asCtx->gmmu.Map(offset, GMMU::SparsePlaceholderAddress(), size, {true});
allocationMap[offset] = {
.size = size,
@ -77,9 +101,9 @@ namespace skyline::service::nvdrv::device::nvhost {
// Sparse mappings shouldn't be fully unmapped, just returned to their sparse state
// Only FreeSpace can unmap them fully
if (mapping->sparseAlloc)
state.soc->gm20b.gmmu.Map(offset, GMMU::SparsePlaceholderAddress(), mapping->size, {true});
asCtx->gmmu.Map(offset, GMMU::SparsePlaceholderAddress(), mapping->size, {true});
else
state.soc->gm20b.gmmu.Unmap(offset, mapping->size);
asCtx->gmmu.Unmap(offset, mapping->size);
mappingMap.erase(offset);
}
@ -103,7 +127,7 @@ namespace skyline::service::nvdrv::device::nvhost {
// Unset sparse flag if required
if (allocation.sparse)
state.soc->gm20b.gmmu.Unmap(offset, allocation.size);
asCtx->gmmu.Unmap(offset, allocation.size);
auto &allocator{pageSize == VM::PageSize ? vm.smallPageAllocator : vm.bigPageAllocator};
u32 pageSizeBits{pageSize == VM::PageSize ? VM::PageSizeBits : vm.bigPageSizeBits};
@ -138,9 +162,9 @@ namespace skyline::service::nvdrv::device::nvhost {
// Sparse mappings shouldn't be fully unmapped, just returned to their sparse state
// Only FreeSpace can unmap them fully
if (mapping->sparseAlloc)
state.soc->gm20b.gmmu.Map(offset, GMMU::SparsePlaceholderAddress(), mapping->size, {true});
asCtx->gmmu.Map(offset, GMMU::SparsePlaceholderAddress(), mapping->size, {true});
else
state.soc->gm20b.gmmu.Unmap(offset, mapping->size);
asCtx->gmmu.Unmap(offset, mapping->size);
mappingMap.erase(offset);
} catch (const std::out_of_range &e) {
@ -172,7 +196,7 @@ namespace skyline::service::nvdrv::device::nvhost {
u64 gpuAddress{offset + bufferOffset};
u8 *cpuPtr{mapping->ptr + bufferOffset};
state.soc->gm20b.gmmu.Map(gpuAddress, cpuPtr, mappingSize);
asCtx->gmmu.Map(gpuAddress, cpuPtr, mappingSize);
return PosixResult::Success;
} catch (const std::out_of_range &e) {
@ -194,7 +218,7 @@ namespace skyline::service::nvdrv::device::nvhost {
if (alloc-- == allocationMap.begin() || (offset - alloc->first) + size > alloc->second.size)
throw exception("Cannot perform a fixed mapping into an unallocated region!");
state.soc->gm20b.gmmu.Map(offset, cpuPtr, size);
asCtx->gmmu.Map(offset, cpuPtr, size);
auto mapping{std::make_shared<Mapping>(cpuPtr, offset, size, true, false, alloc->second.sparse)};
alloc->second.mappings.push_back(mapping);
@ -214,7 +238,7 @@ namespace skyline::service::nvdrv::device::nvhost {
u32 pageSizeBits{bigPage ? vm.bigPageSizeBits : VM::PageSizeBits};
offset = static_cast<u64>(allocator->Allocate(util::AlignUp(size, pageSize) >> pageSizeBits)) << pageSizeBits;
state.soc->gm20b.gmmu.Map(offset, cpuPtr, size);
asCtx->gmmu.Map(offset, cpuPtr, size);
auto mapping{std::make_shared<Mapping>(cpuPtr, offset, size, false, bigPage, false)};
mappingMap[offset] = mapping;
@ -292,6 +316,7 @@ namespace skyline::service::nvdrv::device::nvhost {
u64 endBigPages{(vm.vaRangeEnd - vm.vaRangeSplit) >> vm.bigPageSizeBits};
vm.bigPageAllocator = std::make_unique<VM::Allocator>(startBigPages, endBigPages);
asCtx = std::make_shared<soc::gm20b::AddressSpaceContext>();
vm.initialised = true;
return PosixResult::Success;
@ -320,7 +345,7 @@ namespace skyline::service::nvdrv::device::nvhost {
}
if (!entry.handle) {
state.soc->gm20b.gmmu.Map(virtAddr, soc::gm20b::GMMU::SparsePlaceholderAddress(), size, {true});
asCtx->gmmu.Map(virtAddr, GMMU::SparsePlaceholderAddress(), size, {true});
} else {
auto h{core.nvMap.GetHandle(entry.handle)};
if (!h)
@ -328,7 +353,7 @@ namespace skyline::service::nvdrv::device::nvhost {
u8 *cpuPtr{reinterpret_cast<u8 *>(h->address + (static_cast<u64>(entry.handleOffsetBigPages) << vm.bigPageSizeBits))};
state.soc->gm20b.gmmu.Map(virtAddr, cpuPtr, size);
asCtx->gmmu.Map(virtAddr, cpuPtr, size);
}
}

View File

@ -4,7 +4,7 @@
#pragma once
#include <common/address_space.h>
#include <soc/gm20b/gmmu.h>
#include <services/nvdrv/devices/nvdevice.h>
namespace skyline::service::nvdrv::device::nvhost {
@ -65,6 +65,8 @@ namespace skyline::service::nvdrv::device::nvhost {
bool initialised{};
} vm;
std::shared_ptr<soc::gm20b::AddressSpaceContext> asCtx;
void FreeMappingLocked(u64 offset);
public:
@ -95,7 +97,7 @@ namespace skyline::service::nvdrv::device::nvhost {
};
static_assert(sizeof(RemapEntry) == 0x14);
AsGpu(const DeviceState &state, Core &core, const SessionContext &ctx);
AsGpu(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx);
/**
* @brief Binds this address space to a channel

View File

@ -34,7 +34,7 @@ namespace skyline::service::nvdrv::device::nvhost {
state == SyncpointEvent::State::Signalling;
}
Ctrl::Ctrl(const DeviceState &state, Core &core, const SessionContext &ctx) : NvDevice(state, core, ctx) {}
Ctrl::Ctrl(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx) : NvDevice(state, driver, core, ctx) {}
u32 Ctrl::FindFreeSyncpointEvent(u32 syncpointId) {
u32 eventSlot{SyncpointEventCount}; //!< Holds the slot of the last populated event in the event array

View File

@ -96,7 +96,7 @@ namespace skyline::service::nvdrv::device::nvhost {
PosixResult SyncpointFreeEventLocked(In<u32> slot);
public:
Ctrl(const DeviceState &state, Core &core, const SessionContext &ctx);
Ctrl(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx);
/**
* @brief Clears a syncpoint event

View File

@ -5,8 +5,8 @@
#include "ctrl_gpu.h"
namespace skyline::service::nvdrv::device::nvhost {
CtrlGpu::CtrlGpu(const DeviceState &state, Core &core, const SessionContext &ctx) :
NvDevice(state, core, ctx),
CtrlGpu::CtrlGpu(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx) :
NvDevice(state, driver, core, ctx),
errorNotifierEvent(std::make_shared<type::KEvent>(state, false)),
unknownEvent(std::make_shared<type::KEvent>(state, false)) {}

View File

@ -73,7 +73,7 @@ namespace skyline::service::nvdrv::device::nvhost {
u32 subregionCount{0x10};
};
CtrlGpu(const DeviceState &state, Core &core, const SessionContext &ctx);
CtrlGpu(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx);
/**
* @brief Returns the zcull context size

View File

@ -6,8 +6,8 @@
#include "gpu_channel.h"
namespace skyline::service::nvdrv::device::nvhost {
GpuChannel::GpuChannel(const DeviceState &state, Core &core, const SessionContext &ctx) :
NvDevice(state, core, ctx),
GpuChannel::GpuChannel(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx) :
NvDevice(state, driver, core, ctx),
smExceptionBreakpointIntReportEvent(std::make_shared<type::KEvent>(state, false)),
smExceptionBreakpointPauseReportEvent(std::make_shared<type::KEvent>(state, false)),
errorNotifierEvent(std::make_shared<type::KEvent>(state, false)) {
@ -39,16 +39,20 @@ namespace skyline::service::nvdrv::device::nvhost {
if (flags.incrementWithValue)
return PosixResult::InvalidArgument;
if (core.syncpointManager.IsFenceSignalled(fence))
if (!core.syncpointManager.IsFenceSignalled(fence))
throw exception("Waiting on a fence through SubmitGpfifo is unimplemented");
}
state.soc->gm20b.gpfifo.Push(gpEntries.subspan(0, numEntries));
{
std::scoped_lock lock(channelMutex);
fence.id = channelSyncpoint;
channelCtx->gpfifo.Push(gpEntries.subspan(0, numEntries));
u32 increment{(flags.fenceIncrement ? 2 : 0) + (flags.incrementWithValue ? fence.threshold : 0)};
fence.threshold = core.syncpointManager.IncrementSyncpointMaxExt(channelSyncpoint, increment);
fence.id = channelSyncpoint;
u32 increment{(flags.fenceIncrement ? 2 : 0) + (flags.incrementWithValue ? fence.threshold : 0)};
fence.threshold = core.syncpointManager.IncrementSyncpointMaxExt(channelSyncpoint, increment);
}
if (flags.fenceIncrement)
throw exception("Incrementing a fence through SubmitGpfifo is unimplemented");
@ -84,7 +88,19 @@ namespace skyline::service::nvdrv::device::nvhost {
PosixResult GpuChannel::AllocGpfifoEx2(In<u32> numEntries, In<u32> numJobs, In<u32> flags, Out<Fence> fence) {
state.logger->Debug("numEntries: {}, numJobs: {}, flags: 0x{:X}", numEntries, numJobs, flags);
state.soc->gm20b.gpfifo.Initialize(numEntries);
std::scoped_lock lock(channelMutex);
if (!asCtx) {
state.logger->Warn("Trying to allocate a channel without a bound address space");
return PosixResult::InvalidArgument;
}
if (channelCtx) {
state.logger->Warn("Trying to allocate a channel twice!");
return PosixResult::FileExists;
}
channelCtx = std::make_unique<soc::gm20b::ChannelContext>(state, asCtx, numEntries);
fence = core.syncpointManager.GetSyncpointFence(channelSyncpoint);

View File

@ -3,23 +3,32 @@
#pragma once
#include <soc/gm20b/gpfifo.h>
#include <services/common/fence.h>
#include <soc/gm20b/engines/maxwell_3d.h> // TODO: remove
#include <soc/gm20b/channel.h>
#include "services/nvdrv/devices/nvdevice.h"
namespace skyline::service::nvdrv::device::nvhost {
class AsGpu;
/**
* @brief nvhost::GpuChannel is used to create and submit commands to channels which are effectively GPU processes
* @url https://switchbrew.org/wiki/NV_services#Channels
*/
class GpuChannel : public NvDevice {
private:
u32 channelSyncpoint{};
u32 channelSyncpoint{}; //!< The syncpoint for submissions allocated to this channel in `AllocGpfifo`
u32 channelUserData{};
std::mutex channelMutex;
std::shared_ptr<type::KEvent> smExceptionBreakpointIntReportEvent;
std::shared_ptr<type::KEvent> smExceptionBreakpointPauseReportEvent;
std::shared_ptr<type::KEvent> errorNotifierEvent;
std::shared_ptr<soc::gm20b::AddressSpaceContext> asCtx;
std::unique_ptr<soc::gm20b::ChannelContext> channelCtx;
friend AsGpu;
public:
/**
* @brief A bitfield of the flags that can be supplied for a specific GPFIFO submission
@ -37,7 +46,7 @@ namespace skyline::service::nvdrv::device::nvhost {
u32 raw;
};
GpuChannel(const DeviceState &state, Core &core, const SessionContext &ctx);
GpuChannel(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx);
/**
* @brief Sets the nvmap handle id to be used for channel submits (does nothing for GPU channels)

View File

@ -6,7 +6,7 @@
#include "nvmap.h"
namespace skyline::service::nvdrv::device {
NvMap::NvMap(const DeviceState &state, Core &core, const SessionContext &ctx) : NvDevice(state, core, ctx) {}
NvMap::NvMap(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx) : NvDevice(state, driver, core, ctx) {}
PosixResult NvMap::Create(In<u32> size, Out<NvMapCore::Handle::Id> handle) {
auto handleDesc{core.nvMap.CreateHandle(util::AlignUp(size, PAGE_SIZE))};

View File

@ -23,7 +23,7 @@ namespace skyline::service::nvdrv::device {
IsSharedMemMapped = 6
};
NvMap(const DeviceState &state, Core &core, const SessionContext &ctx);
NvMap(const DeviceState &state, Driver &driver, Core &core, const SessionContext &ctx);
/**
* @brief Creates an nvmap handle for the given size

View File

@ -23,10 +23,13 @@ namespace skyline::service::nvdrv {
break; \
}
#define DEVICE_CASE(path, object) \
case util::Hash(path): \
devices.emplace(fd, std::make_unique<device::object>(state, core, ctx)); \
return NvResult::Success;
#define DEVICE_CASE(path, object) \
case util::Hash(path): \
{ \
std::unique_lock lock(deviceMutex); \
devices.emplace(fd, std::make_unique<device::object>(state, *this, core, ctx)); \
return NvResult::Success; \
}
DEVICE_SWITCH(
DEVICE_CASE("/dev/nvmap", NvMap)
@ -69,13 +72,13 @@ namespace skyline::service::nvdrv {
default:
throw exception("Unhandled POSIX result: {}!", static_cast<i32>(result));
}
}
NvResult Driver::Ioctl(u32 fd, IoctlDescriptor cmd, span<u8> buffer) {
state.logger->Debug("fd: {}, cmd: 0x{:X}, device: {}", fd, cmd.raw, devices.at(fd)->GetName());
try {
std::shared_lock lock(deviceMutex);
return ConvertResult(devices.at(fd)->Ioctl(cmd, buffer));
} catch (const std::out_of_range &) {
throw exception("Ioctl was called with invalid file descriptor: {}", fd);
@ -86,6 +89,7 @@ namespace skyline::service::nvdrv {
state.logger->Debug("fd: {}, cmd: 0x{:X}, device: {}", fd, cmd.raw, devices.at(fd)->GetName());
try {
std::shared_lock lock(deviceMutex);
return ConvertResult(devices.at(fd)->Ioctl2(cmd, buffer, inlineBuffer));
} catch (const std::out_of_range &) {
throw exception("Ioctl2 was called with invalid file descriptor: 0x{:X}", fd);
@ -96,6 +100,7 @@ namespace skyline::service::nvdrv {
state.logger->Debug("fd: {}, cmd: 0x{:X}, device: {}", fd, cmd.raw, devices.at(fd)->GetName());
try {
std::shared_lock lock(deviceMutex);
return ConvertResult(devices.at(fd)->Ioctl3(cmd, buffer, inlineBuffer));
} catch (const std::out_of_range &) {
throw exception("Ioctl3 was called with invalid file descriptor: {}", fd);
@ -104,6 +109,7 @@ namespace skyline::service::nvdrv {
void Driver::CloseDevice(u32 fd) {
try {
std::unique_lock lock(deviceMutex);
devices.erase(fd);
} catch (const std::out_of_range &) {
state.logger->Warn("Trying to close non-existent file descriptor: {}");
@ -114,6 +120,7 @@ namespace skyline::service::nvdrv {
state.logger->Debug("fd: {}, eventId: 0x{:X}, device: {}", fd, eventId, devices.at(fd)->GetName());
try {
std::shared_lock lock(deviceMutex);
return devices.at(fd)->QueryEvent(eventId);
} catch (const std::exception &) {
throw exception("QueryEvent was called with invalid file descriptor: {}", fd);

View File

@ -4,16 +4,27 @@
#pragma once
#include <common.h>
#include <kernel/types/KEvent.h>
#include "types.h"
#include "devices/nvdevice.h"
#include "core/core.h"
#include "devices/nvdevice.h"
namespace skyline::service::nvdrv {
namespace device {
namespace nvhost {
class AsGpu;
}
}
class Driver {
private:
const DeviceState &state;
std::shared_mutex deviceMutex; //!< Protects access to `devices`
std::unordered_map<FileDescriptor, std::unique_ptr<device::NvDevice>> devices;
friend device::nvhost::AsGpu; // For channel address space binding
public:
Core core; //!< The core global state object of nvdrv that is accessed by devices

View File

@ -4,7 +4,7 @@
#pragma once
#include "soc/host1x.h"
#include "soc/gm20b.h"
#include "soc/gm20b/gpfifo.h"
namespace skyline::soc {
/**
@ -14,8 +14,7 @@ namespace skyline::soc {
class SOC {
public:
host1x::Host1X host1x;
gm20b::GM20B gm20b;
SOC(const DeviceState &state) : gm20b(state) {}
SOC(const DeviceState &state) {}
};
}

View File

@ -1,15 +0,0 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include "gm20b.h"
namespace skyline::soc::gm20b {
GM20B::GM20B(const DeviceState &state) :
fermi2D(state),
keplerMemory(state),
maxwell3D(state, gmmu, executor),
maxwellCompute(state),
maxwellDma(state),
gpfifo(state),
executor(state) {}
}

View File

@ -0,0 +1,17 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include "engines/maxwell_3d.h" //TODO: remove
#include "channel.h"
namespace skyline::soc::gm20b {
ChannelContext::ChannelContext(const DeviceState &state, std::shared_ptr<AddressSpaceContext> asCtx, size_t numEntries) :
fermi2D(state),
keplerMemory(state),
maxwell3D(std::make_unique<engine::maxwell3d::Maxwell3D>(state, *this, executor)),
maxwellCompute(state),
maxwellDma(state),
gpfifo(state, *this, numEntries),
executor(state),
asCtx(std::move(asCtx)) {}
}

View File

@ -4,26 +4,30 @@
#pragma once
#include <gpu/interconnect/command_executor.h>
#include "gm20b/engines/maxwell_3d.h"
#include "gm20b/gpfifo.h"
#include "gm20b/gmmu.h"
#include "engines/engine.h"
#include "gpfifo.h"
namespace skyline::soc::gm20b {
namespace engine::maxwell3d {
class Maxwell3D;
}
struct AddressSpaceContext;
/**
* @brief The GPU block in the X1, it contains all GPU engines required for accelerating graphics operations
* @note We omit parts of components related to external access such as the grhost, all accesses to the external components are done directly
*/
class GM20B {
public:
GMMU gmmu;
struct ChannelContext {
std::shared_ptr<AddressSpaceContext> asCtx;
gpu::interconnect::CommandExecutor executor;
engine::Engine fermi2D;
engine::maxwell3d::Maxwell3D maxwell3D;
std::unique_ptr<engine::maxwell3d::Maxwell3D> maxwell3D; //!< TODO: fix this once graphics context is moved into a cpp file
engine::Engine maxwellCompute;
engine::Engine maxwellDma;
engine::Engine keplerMemory;
GPFIFO gpfifo;
ChannelGpfifo gpfifo;
GM20B(const DeviceState &state);
ChannelContext(const DeviceState &state, std::shared_ptr<AddressSpaceContext> asCtx, size_t numEntries);
};
}

View File

@ -3,10 +3,11 @@
// Copyright © 2018-2020 fincs (https://github.com/devkitPro/deko3d)
#include <boost/preprocessor/repeat.hpp>
#include "maxwell_3d.h"
#include <soc.h>
namespace skyline::soc::gm20b::engine::maxwell3d {
Maxwell3D::Maxwell3D(const DeviceState &state, GMMU &gmmu, gpu::interconnect::CommandExecutor &executor) : Engine(state), macroInterpreter(*this), context(*state.gpu, gmmu, executor) {
Maxwell3D::Maxwell3D(const DeviceState &state, ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor) : Engine(state), macroInterpreter(*this), context(*state.gpu, channelCtx, executor), channelCtx(channelCtx) {
ResetRegs();
}
@ -244,7 +245,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
MAXWELL3D_CASE(syncpointAction, {
state.logger->Debug("Increment syncpoint: {}", static_cast<u16>(syncpointAction.id));
state.soc->gm20b.executor.Execute();
channelCtx.executor.Execute();
state.soc->host1x.syncpoints.at(syncpointAction.id).Increment();
})
@ -307,7 +308,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
switch (registers.semaphore.info.structureSize) {
case type::SemaphoreInfo::StructureSize::OneWord:
state.soc->gm20b.gmmu.Write<u32>(registers.semaphore.address.Pack(), static_cast<u32>(result));
channelCtx.asCtx->gmmu.Write<u32>(registers.semaphore.address.Pack(), static_cast<u32>(result));
break;
case type::SemaphoreInfo::StructureSize::FourWords: {
@ -318,7 +319,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
u64 nsTime{util::GetTimeNs()};
u64 timestamp{(nsTime / NsToTickDenominator) * NsToTickNumerator + ((nsTime % NsToTickDenominator) * NsToTickNumerator) / NsToTickDenominator};
state.soc->gm20b.gmmu.Write<FourWordResult>(registers.semaphore.address.Pack(), FourWordResult{result, timestamp});
channelCtx.asCtx->gmmu.Write<FourWordResult>(registers.semaphore.address.Pack(), FourWordResult{result, timestamp});
break;
}
}

View File

@ -8,6 +8,10 @@
#include "engine.h"
#include "maxwell/macro_interpreter.h"
namespace skyline::soc::gm20b {
struct ChannelContext;
}
namespace skyline::soc::gm20b::engine::maxwell3d {
/**
* @brief The Maxwell 3D engine handles processing 3D graphics
@ -245,9 +249,11 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
Registers registers{};
Registers shadowRegisters{}; //!< A shadow-copy of the registers, their function is controlled by the 'shadowRamControl' register
ChannelContext &channelCtx;
std::array<u32, 0x2000> macroCode{}; //!< Stores GPU macros, writes to it will wraparound on overflow
Maxwell3D(const DeviceState &state, GMMU &gmmu, gpu::interconnect::CommandExecutor &executor);
Maxwell3D(const DeviceState &state, ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor);
/**
* @brief Resets the Maxwell 3D registers to their default values

View File

@ -10,8 +10,12 @@ namespace skyline::soc::gm20b {
/**
* @brief The GMMU (Graphics Memory Management Unit) class handles mapping between a Maxwell GPU virtual address space and an application's address space and is meant to roughly emulate the GMMU on the X1
* @note This is not accurate to the X1 as it would have an SMMU between the GMMU and physical memory but we don't emulate this abstraction at the moment
* @note This is not accurate to the X1 as it would have an SMMU between the GMMU and physical memory but we don't need to emulate this abstraction
* @note The GMMU is implemented entirely as a template specialization over FlatMemoryManager
*/
using GMMU = FlatMemoryManager<u64, 0, GmmuAddressSpaceBits>;
struct AddressSpaceContext {
GMMU gmmu;
};
}

View File

@ -6,6 +6,7 @@
#include <kernel/types/KProcess.h>
#include <soc.h>
#include <os.h>
#include "engines/maxwell_3d.h"
namespace skyline::soc::gm20b {
/**
@ -58,7 +59,14 @@ namespace skyline::soc::gm20b {
};
static_assert(sizeof(PushBufferMethodHeader) == sizeof(u32));
void GPFIFO::Send(u32 method, u32 argument, u32 subChannel, bool lastCall) {
ChannelGpfifo::ChannelGpfifo(const DeviceState &state, ChannelContext &channelCtx, size_t numEntries) :
state(state),
gpfifoEngine(state),
channelCtx(channelCtx),
gpEntries(numEntries),
thread(std::thread(&ChannelGpfifo::Run, this)) {}
void ChannelGpfifo::Send(u32 method, u32 argument, u32 subChannel, bool lastCall) {
constexpr u32 ThreeDSubChannel{0};
constexpr u32 ComputeSubChannel{1};
constexpr u32 Inline2MemorySubChannel{2};
@ -72,19 +80,19 @@ namespace skyline::soc::gm20b {
} else {
switch (subChannel) {
case ThreeDSubChannel:
state.soc->gm20b.maxwell3D.CallMethod(method, argument, lastCall);
channelCtx.maxwell3D->CallMethod(method, argument, lastCall);
break;
case ComputeSubChannel:
state.soc->gm20b.maxwellCompute.CallMethod(method, argument, lastCall);
channelCtx.maxwellCompute.CallMethod(method, argument, lastCall);
break;
case Inline2MemorySubChannel:
state.soc->gm20b.keplerMemory.CallMethod(method, argument, lastCall);
channelCtx.keplerMemory.CallMethod(method, argument, lastCall);
break;
case TwoDSubChannel:
state.soc->gm20b.fermi2D.CallMethod(method, argument, lastCall);
channelCtx.fermi2D.CallMethod(method, argument, lastCall);
break;
case CopySubChannel:
state.soc->gm20b.maxwellDma.CallMethod(method, argument, lastCall);
channelCtx.maxwellDma.CallMethod(method, argument, lastCall);
break;
default:
throw exception("Tried to call into a software subchannel: {}!", subChannel);
@ -92,7 +100,7 @@ namespace skyline::soc::gm20b {
}
}
void GPFIFO::Process(GpEntry gpEntry) {
void ChannelGpfifo::Process(GpEntry gpEntry) {
if (!gpEntry.size) {
// This is a GPFIFO control entry, all control entries have a zero length and contain no pushbuffers
switch (gpEntry.opcode) {
@ -105,7 +113,7 @@ namespace skyline::soc::gm20b {
}
pushBufferData.resize(gpEntry.size);
state.soc->gm20b.gmmu.Read<u32>(pushBufferData, gpEntry.Address());
channelCtx.asCtx->gmmu.Read<u32>(pushBufferData, gpEntry.Address());
for (auto entry{pushBufferData.begin()}; entry != pushBufferData.end(); entry++) {
// An entry containing all zeroes is a NOP, skip over it
@ -142,18 +150,11 @@ namespace skyline::soc::gm20b {
}
}
void GPFIFO::Initialize(size_t numBuffers) {
if (pushBuffers)
throw exception("GPFIFO Initialization cannot be done multiple times");
pushBuffers.emplace(numBuffers);
thread = std::thread(&GPFIFO::Run, this);
}
void GPFIFO::Run() {
void ChannelGpfifo::Run() {
pthread_setname_np(pthread_self(), "GPFIFO");
try {
signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);
pushBuffers->Process([this](GpEntry gpEntry) {
gpEntries.Process([this](GpEntry gpEntry) {
state.logger->Debug("Processing pushbuffer: 0x{:X}", gpEntry.Address());
Process(gpEntry);
});
@ -170,11 +171,11 @@ namespace skyline::soc::gm20b {
}
}
void GPFIFO::Push(span<GpEntry> entries) {
pushBuffers->Append(entries);
void ChannelGpfifo::Push(span<GpEntry> entries) {
gpEntries.Append(entries);
}
GPFIFO::~GPFIFO() {
ChannelGpfifo::~ChannelGpfifo() {
if (thread.joinable()) {
pthread_kill(thread.native_handle(), SIGINT);
thread.join();

View File

@ -7,6 +7,8 @@
#include "engines/gpfifo.h"
namespace skyline::soc::gm20b {
struct ChannelContext;
/**
* @brief A GPFIFO entry as submitted through 'SubmitGpfifo'
* @url https://nvidia.github.io/open-gpu-doc/manuals/volta/gv100/dev_pbdma.ref.txt
@ -73,15 +75,16 @@ namespace skyline::soc::gm20b {
static_assert(sizeof(GpEntry) == sizeof(u64));
/**
* @brief The GPFIFO class handles creating pushbuffers from GP entries and then processing them
* @brief The ChannelGpfifo class handles creating pushbuffers from GP entries and then processing them for a single channel
* @note A single ChannelGpfifo thread exists per channel with a single shared mutex in `GPFIFO` to enforce that only one channel can run at a time
* @note This class doesn't perfectly map to any particular hardware component on the X1; it does a mix of the GPU Host and PBDMA work in handling the GPFIFO entries
* @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_pbdma.ref.txt#L62
*/
class GPFIFO {
class ChannelGpfifo {
const DeviceState &state;
ChannelContext &channelCtx;
engine::GPFIFO gpfifoEngine; //!< The engine for processing GPFIFO method calls
std::array<engine::Engine*, 8> subchannels;
std::optional<CircularQueue<GpEntry>> pushBuffers;
CircularQueue<GpEntry> gpEntries;
std::thread thread; //!< The thread that manages processing of pushbuffers
std::vector<u32> pushBufferData; //!< Persistent vector storing pushbuffer data to avoid constant reallocations
@ -96,14 +99,12 @@ namespace skyline::soc::gm20b {
void Process(GpEntry gpEntry);
public:
GPFIFO(const DeviceState &state) : state(state), gpfifoEngine(state) {}
~GPFIFO();
/**
* @param numBuffers The amount of push-buffers to allocate in the circular buffer
* @param numEntries The number of gpEntries to allocate space for in the FIFO
*/
void Initialize(size_t numBuffers);
ChannelGpfifo(const DeviceState &state, ChannelContext &channelCtx, size_t numEntries);
~ChannelGpfifo();
/**
* @brief Executes all pending entries in the FIFO