diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
index 591188bc..deabc37c 100644
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -185,6 +185,7 @@ add_library(skyline SHARED
         ${source_DIR}/skyline/soc/gm20b/engines/maxwell_3d.cpp
         ${source_DIR}/skyline/soc/gm20b/engines/inline2memory.cpp
         ${source_DIR}/skyline/soc/gm20b/engines/kepler_compute.cpp
+        ${source_DIR}/skyline/soc/gm20b/engines/maxwell_dma.cpp
         ${source_DIR}/skyline/soc/gm20b/engines/maxwell/initialization.cpp
         ${source_DIR}/skyline/input/npad.cpp
         ${source_DIR}/skyline/input/npad_device.cpp
diff --git a/app/src/main/cpp/skyline/soc/gm20b/channel.cpp b/app/src/main/cpp/skyline/soc/gm20b/channel.cpp
index 29ab19c5..ca9b8301 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/channel.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/channel.cpp
@@ -9,6 +9,7 @@ namespace skyline::soc::gm20b {
         : asCtx(std::move(pAsCtx)),
           executor(state),
           maxwell3D(std::make_unique<engine::maxwell3d::Maxwell3D>(state, *this, macroState, executor)),
+          maxwellDma(state, *this),
           keplerCompute(state, *this),
           inline2Memory(asCtx),
           gpfifo(state, *this, numEntries) {}
diff --git a/app/src/main/cpp/skyline/soc/gm20b/channel.h b/app/src/main/cpp/skyline/soc/gm20b/channel.h
index f0819d06..70029a16 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/channel.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/channel.h
@@ -6,6 +6,7 @@
 #include <gpu/interconnect/command_executor.h>
 #include "macro/macro_state.h"
 #include "engines/engine.h"
+#include "engines/maxwell_dma.h"
 #include "engines/kepler_compute.h"
 #include "engines/inline2memory.h"
 #include "gpfifo.h"
@@ -26,6 +27,7 @@ namespace skyline::soc::gm20b {
         gpu::interconnect::CommandExecutor executor;
         MacroState macroState;
         std::unique_ptr<engine::maxwell3d::Maxwell3D> maxwell3D; //!< TODO: fix this once graphics context is moved into a cpp file
+        engine::MaxwellDma maxwellDma;
         engine::KeplerCompute keplerCompute;
         engine::Inline2Memory inline2Memory;
         ChannelGpfifo gpfifo;
diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp
new file mode 100644
index 00000000..1693c089
--- /dev/null
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#include <soc.h>
+#include <soc/gm20b/channel.h>
+#include <soc/gm20b/gmmu.h>
+#include "maxwell_dma.h"
+
+namespace skyline::soc::gm20b::engine {
+    MaxwellDma::MaxwellDma(const DeviceState &state, ChannelContext &channelCtx) // Binds the engine to its owning channel and to the host1x syncpoint set reachable through state.soc
+        : syncpoints(state.soc->host1x.syncpoints), channelCtx(channelCtx) {} // Initializers follow the member declaration order (syncpoints is declared before channelCtx), which is the order they actually run in
+
+    [[gnu::always_inline]] void MaxwellDma::CallMethod(u32 method, u32 argument) {
+        // Trace the raw method write first, then hand it to the common handler
+        Logger::Verbose("Called method in Maxwell DMA: 0x{:X} args: 0x{:X}", method, argument);
+        HandleMethod(method, argument);
+    }
+
+    void MaxwellDma::HandleMethod(u32 method, u32 argument) {
+        registers.raw[method] = argument; // The write is stored in the register file before anything acts on it
+        // launchDma is the only method with a side effect here: writing it starts the transfer
+        if (ENGINE_OFFSET(launchDma) == method)
+            LaunchDma();
+    }
+
+    void MaxwellDma::LaunchDma() {
+        // A line length of zero leaves nothing to transfer
+        if (*registers.lineLengthIn == 0)
+            return;
+        // Multi-line (2D/3D) transfers are not handled yet, they are only reported
+        if (registers.launchDma->multiLineEnable) {
+            Logger::Warn("2D/3D DMA engine copies are unimplemented");
+            return;
+        }
+        // Everything else is treated as a flat 1D buffer copy of lineLengthIn bytes via the channel's gmmu
+        // TODO: implement swizzled 1D copies based on VMM 'kind'
+        Logger::Debug("src: 0x{:X} dst: 0x{:X} size: 0x{:X}", u64{*registers.offsetIn}, u64{*registers.offsetOut}, *registers.lineLengthIn);
+        channelCtx.asCtx->gmmu.Copy(*registers.offsetOut, *registers.offsetIn, *registers.lineLengthIn);
+    }
+
+    void MaxwellDma::CallMethodBatchNonInc(u32 method, span<u32> arguments) {
+        for (size_t index{}; index < arguments.size(); index++) // Non-incrementing batch: every argument is applied to the same method, in order
+            HandleMethod(method, arguments[index]);
+    }
+}
diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h
new file mode 100644
index 00000000..0191df51
--- /dev/null
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#pragma once
+
+#include "engine.h"
+
+namespace skyline::soc::gm20b {
+    struct ChannelContext;
+}
+
+namespace skyline::soc::gm20b::engine {
+    /**
+    * @brief The Maxwell DMA Engine is used to perform DMA buffer/texture copies directly on the GPU
+    */
+    class MaxwellDma {
+      private:
+        host1x::SyncpointSet &syncpoints;
+        ChannelContext &channelCtx;
+
+        void HandleMethod(u32 method, u32 argument);
+
+        void LaunchDma();
+
+      public:
+        /**
+         * @url https://github.com/NVIDIA/open-gpu-doc/blob/master/classes/dma-copy/clb0b5.h
+         */
+        #pragma pack(push, 1)
+        union Registers {
+            std::array<u32, EngineMethodsEnd> raw;
+
+            template<size_t Offset, typename Type>
+            using Register = util::OffsetMember<Offset, Type, u32>;
+
+            Register<0x40, u32> nop;
+            Register<0x50, u32> pmTrigger;
+
+            struct Semaphore { // Register group made of an Address followed by a 32-bit payload word
+                Address address;
+                u32 payload;
+            };
+            static_assert(sizeof(Semaphore) == 0xC); // Three 32-bit method words, so Address is expected to take up two of them
+
+            Register<0x90, Semaphore> semaphore;
+
+            struct RenderEnable { // Register group made of an Address followed by one word holding a 3-bit mode
+                enum class Mode : u8 {
+                    False = 0,
+                    True = 1,
+                    Conditional = 2,
+                    RenderIfEqual = 3,
+                    RenderIfNotEqual = 4
+                };
+
+                Address address;
+                Mode mode : 3;
+                u32 _pad_ : 29; // Unused bits that fill the mode word up to 32 bits
+            };
+            static_assert(sizeof(RenderEnable) == 0xC); // Three 32-bit method words in total
+
+            Register<0x95, RenderEnable> renderEnable;
+
+            struct PhysMode { // Layout shared by the srcPhysMode and dstPhysMode registers
+                enum class Target : u8 {
+                    LocalFb = 0,
+                    CoherentSysmem = 1,
+                    NoncoherentSysmem = 2
+                };
+
+                Target target : 2;
+                u32 _pad_ : 30; // Unused bits that fill the register up to 32 bits
+            };
+
+            Register<0x98, PhysMode> srcPhysMode;
+            Register<0x99, PhysMode> dstPhysMode;
+
+            struct LaunchDma { // Bitfield view of the 32-bit launchDma method argument; bit positions noted below assume fields are allocated from the least significant bit upwards
+                enum class DataTransferType : u8 {
+                    None = 0,
+                    Pipelined = 1,
+                    NonPipelined = 2
+                };
+
+                enum class SemaphoreType : u8 {
+                    None = 0,
+                    ReleaseOneWordSemaphore = 1,
+                    ReleaseFourWordSemaphore = 2
+                };
+
+                enum class InterruptType : u8 {
+                    None = 0,
+                    Blocking = 1,
+                    NonBlocking = 2
+                };
+
+                enum class MemoryLayout : u8 {
+                    BlockLinear = 0,
+                    Pitch = 1
+                };
+
+                enum class Type : u8 {
+                    Virtual = 0,
+                    Physical = 1
+                };
+
+                enum class SemaphoreReduction : u8 {
+                    IMin = 0,
+                    IMax = 1,
+                    IXor = 2,
+                    IAnd = 3,
+                    IOr = 4,
+                    IAdd = 5,
+                    Inc = 6,
+                    Dec = 7,
+                    FAdd = 10,
+                };
+
+                enum class SemaphoreReductionSign : u8 {
+                    Signed = 0,
+                    Unsigned = 1,
+                };
+
+                enum class BypassL2 : u8 {
+                    UsePteSetting = 0,
+                    ForceVolatile = 1,
+                };
+
+                DataTransferType dataTransferType : 2; // bits 0-1
+                bool flushEnable : 1; // bit 2
+                SemaphoreType semaphoreType : 2; // bits 3-4
+                InterruptType interruptType : 2; // bits 5-6
+                MemoryLayout srcMemoryLayout : 1; // bit 7
+                MemoryLayout dstMemoryLayout : 1; // bit 8
+                bool multiLineEnable : 1; // bit 9, MaxwellDma::LaunchDma() uses this to tell 2D/3D copies apart from 1D buffer copies
+                bool remapEnable : 1; // bit 10
+                bool rmwDisable : 1; // bit 11
+                Type srcType : 1; // bit 12
+                Type dstType : 1; // bit 13
+                SemaphoreReduction semaphoreReduction : 4; // bits 14-17
+                SemaphoreReductionSign semaphoreReductionSign : 1; // bit 18
+                bool reductionEnable : 1; // bit 19
+                BypassL2 bypassL2 : 1; // bit 20
+                u16 _pad_ : 11; // bits 21-31, unused
+            };
+            static_assert(sizeof(LaunchDma) == 4); // All of the fields above have to fit in a single method word
+
+            Register<0xC0, LaunchDma> launchDma;
+
+            Register<0x100, Address> offsetIn;
+            Register<0x102, Address> offsetOut;
+
+            Register<0x104, u32> pitchIn;
+            Register<0x105, u32> pitchOut;
+
+            Register<0x106, u32> lineLengthIn;
+            Register<0x107, u32> lineCount;
+
+            Register<0x1C0, u32> remapConstA;
+            Register<0x1C1, u32> remapConstB;
+
+            struct RemapComponents { // Bitfield view of the single 32-bit remapComponents method argument
+                enum class Swizzle : u8 {
+                    SrcX = 0,
+                    SrcY = 1,
+                    SrcZ = 2,
+                    SrcW = 3,
+                    ConstA = 4,
+                    ConstB = 5,
+                    NoWrite = 6
+                };
+
+                // This method is one word wide with the swizzles starting at bit 0, a leading Address member would
+                // move every field two words past 0x1C2 and make them alias the dstSurface registers at 0x1C3 onwards
+                Swizzle dstX : 3;
+                u8 _pad0_ : 1;
+                Swizzle dstY : 3;
+                u8 _pad1_ : 1;
+                Swizzle dstZ : 3;
+                u8 _pad2_ : 1;
+                Swizzle dstW : 3;
+                u8 _pad3_ : 1;
+
+                u8 componentSizeMinusOne : 2;
+                u8 _pad4_ : 2;
+                u8 numSrcComponentsMinusOne : 2;
+                u8 _pad5_ : 2;
+                u8 numDstComponentsMinusOne : 2;
+                u8 _pad6_ : 6;
+            };
+            static_assert(sizeof(RemapComponents) == 0x4);
+
+            Register<0x1C2, RemapComponents> remapComponents;
+
+            struct Surface { // Layout shared by the dstSurface and srcSurface register groups
+                // Nvidia's docs here differ from other emulators and deko3d, so go with what they say - NOTE(review): "they" presumably means Nvidia's docs, confirm
+                struct {
+                    u8 width : 4;
+                    u8 height : 4;
+                    u8 depth : 4;
+                    u8 gobHeight : 4;
+                    u16 _pad_; // Fills the rest of the 32-bit word after the four 4-bit fields
+                } blockSize;
+                u32 width;
+                u32 height;
+                u32 depth;
+                u32 layer;
+
+                struct {
+                    u16 x;
+                    u16 y;
+                } origin; // x and y share one 32-bit word
+            };
+            static_assert(sizeof(Surface) == 0x18); // Six 32-bit method words: blockSize, width, height, depth, layer and origin
+
+            Register<0x1C3, Surface> dstSurface;
+            Register<0x1CA, Surface> srcSurface;
+        } registers{};
+        static_assert(sizeof(Registers) == (EngineMethodsEnd * 0x4));
+        #pragma pack(pop)
+
+        MaxwellDma(const DeviceState &state, ChannelContext &channelCtx);
+
+        void CallMethod(u32 method, u32 argument);
+
+        void CallMethodBatchNonInc(u32 method, span<u32> arguments);
+    };
+}
diff --git a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
index ecd641d7..77695743 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
@@ -112,11 +112,14 @@ namespace skyline::soc::gm20b {
             case SubchannelId::ThreeD:
                 channelCtx.maxwell3D->CallMethod(method, argument);
                 break;
+            case SubchannelId::Compute:
+                channelCtx.keplerCompute.CallMethod(method, argument);
+                break;
             case SubchannelId::Inline2Mem:
                 channelCtx.inline2Memory.CallMethod(method, argument);
                 break;
-            case SubchannelId::Compute:
-                channelCtx.keplerCompute.CallMethod(method, argument);
+            case SubchannelId::Copy:
+                channelCtx.maxwellDma.CallMethod(method, argument);
                 break;
             default:
                 Logger::Warn("Called method 0x{:X} in unimplemented engine 0x{:X}, args: 0x{:X}", method, subChannel, argument);
@@ -129,11 +132,14 @@ namespace skyline::soc::gm20b {
             case SubchannelId::ThreeD:
                 channelCtx.maxwell3D->CallMethodBatchNonInc(method, arguments);
                 break;
+            case SubchannelId::Compute:
+                channelCtx.keplerCompute.CallMethodBatchNonInc(method, arguments);
+                break;
             case SubchannelId::Inline2Mem:
                 channelCtx.inline2Memory.CallMethodBatchNonInc(method, arguments);
                 break;
-            case SubchannelId::Compute:
-                channelCtx.keplerCompute.CallMethodBatchNonInc(method, arguments);
+            case SubchannelId::Copy:
+                channelCtx.maxwellDma.CallMethodBatchNonInc(method, arguments);
                 break;
             default:
                 Logger::Warn("Called method 0x{:X} in unimplemented engine 0x{:X} with batch args", method, subChannel);