Implement the Maxwell DMA engine

The DMA engine is used to perform DMA buffer/texture copies directly on the GPU. It can deswizzle arbitrary regions of input textures, perform component remapping, and swizzle into output textures. This implementation only supports 1D buffer copies; 2D copies will come later.
2025-02-17 03:36:24 +01:00 · 2022-04-09 13:14:06 +01:00 · 2022-04-09 13:14:06 +01:00 · 3c26921d54
commit 3c26921d54
parent 3df76e84c3
6 changed files with 287 additions and 4 deletions
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@ -185,6 +185,7 @@ add_library(skyline SHARED
        ${source_DIR}/skyline/soc/gm20b/engines/maxwell_3d.cpp
        ${source_DIR}/skyline/soc/gm20b/engines/inline2memory.cpp
        ${source_DIR}/skyline/soc/gm20b/engines/kepler_compute.cpp
+        ${source_DIR}/skyline/soc/gm20b/engines/maxwell_dma.cpp
        ${source_DIR}/skyline/soc/gm20b/engines/maxwell/initialization.cpp
        ${source_DIR}/skyline/input/npad.cpp
        ${source_DIR}/skyline/input/npad_device.cpp
--- a/app/src/main/cpp/skyline/soc/gm20b/channel.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/channel.cpp
@ -9,6 +9,7 @@ namespace skyline::soc::gm20b {
        : asCtx(std::move(pAsCtx)),
          executor(state),
          maxwell3D(std::make_unique<engine::maxwell3d::Maxwell3D>(state, *this, macroState, executor)),
+          maxwellDma(state, *this),
          keplerCompute(state, *this),
          inline2Memory(asCtx),
          gpfifo(state, *this, numEntries) {}
--- a/app/src/main/cpp/skyline/soc/gm20b/channel.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/channel.h
@ -6,6 +6,7 @@
 #include <gpu/interconnect/command_executor.h>
 #include "macro/macro_state.h"
 #include "engines/engine.h"
+#include "engines/maxwell_dma.h"
 #include "engines/kepler_compute.h"
 #include "engines/inline2memory.h"
 #include "gpfifo.h"
@ -26,6 +27,7 @@ namespace skyline::soc::gm20b {
        gpu::interconnect::CommandExecutor executor;
        MacroState macroState;
        std::unique_ptr<engine::maxwell3d::Maxwell3D> maxwell3D; //!< TODO: fix this once graphics context is moved into a cpp file
+        engine::MaxwellDma maxwellDma;
        engine::KeplerCompute keplerCompute;
        engine::Inline2Memory inline2Memory;
        ChannelGpfifo gpfifo;
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp
@ -0,0 +1,45 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#include <soc.h>
+#include <soc/gm20b/channel.h>
+#include <soc/gm20b/gmmu.h>
+#include "maxwell_dma.h"
+
+namespace skyline::soc::gm20b::engine {
+    /**
+     * @brief Binds the engine to the host1x syncpoint set and to the channel it belongs to
+     * @param state Device state, only used here to reach `state.soc->host1x.syncpoints`
+     * @param channelCtx Context of the owning channel, stored by reference
+     */
+    MaxwellDma::MaxwellDma(const DeviceState &state, ChannelContext &channelCtx)
+        : syncpoints(state.soc->host1x.syncpoints), channelCtx(channelCtx) {} // Written in member declaration order, which is the order the members are initialised in regardless of how this list is written
+
+    /**
+     * @brief Entry point for a single method call: logs it at verbose level and then hands it to HandleMethod
+     * @param method Index of the method being called
+     * @param argument Value supplied with the method
+     */
+    [[gnu::always_inline]] void MaxwellDma::CallMethod(u32 method, u32 argument) {
+        Logger::Verbose("Called method in Maxwell DMA: 0x{:X} args: 0x{:X}", method, argument);
+        HandleMethod(method, argument);
+    }
+
+    /**
+     * @brief Stores the argument in the register file at the method's index and launches a DMA if that register is launchDma
+     * @note The index is not range-checked here; presumably the caller only forwards methods that fit inside `registers.raw` - TODO confirm against the GPFIFO dispatcher
+     */
+    void MaxwellDma::HandleMethod(u32 method, u32 argument) {
+        // The write always happens first so that LaunchDma observes the freshly written launch parameters
+        registers.raw[method] = argument;
+
+        const bool triggersLaunch{method == ENGINE_OFFSET(launchDma)};
+        if (triggersLaunch)
+            LaunchDma();
+    }
+
+    /**
+     * @brief Carries out the copy described by the current register state
+     * @note Only the non-multi-line (1D) path copies anything, multi-line requests only emit a warning
+     */
+    void MaxwellDma::LaunchDma() {
+        // Nothing to copy
+        if (*registers.lineLengthIn == 0)
+            return;
+
+        // 2D/3D copy
+        if (registers.launchDma->multiLineEnable) {
+            Logger::Warn("2D/3D DMA engine copies are unimplemented");
+            return;
+        }
+
+        // 1D buffer copy
+        // TODO: implement swizzled 1D copies based on VMM 'kind'
+        Logger::Debug("src: 0x{:X} dst: 0x{:X} size: 0x{:X}", u64{*registers.offsetIn}, u64{*registers.offsetOut}, *registers.lineLengthIn);
+        channelCtx.asCtx->gmmu.Copy(*registers.offsetOut, *registers.offsetIn, *registers.lineLengthIn);
+    }
+
+    /**
+     * @brief Applies every argument of a batch to the same method, in order
+     * @param method Index of the method that all arguments are written to
+     * @param arguments Values to pass to the method one after another
+     */
+    void MaxwellDma::CallMethodBatchNonInc(u32 method, span<u32> arguments) {
+        for (auto it{arguments.begin()}; it != arguments.end(); ++it)
+            HandleMethod(method, *it);
+    }
+}
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h
@ -0,0 +1,228 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#pragma once
+
+#include "engine.h"
+
+namespace skyline::soc::gm20b {
+    struct ChannelContext;
+}
+
+namespace skyline::soc::gm20b::engine {
+    /**
+     * @brief The Maxwell DMA Engine is used to perform DMA buffer/texture copies directly on the GPU
+     */
+    class MaxwellDma {
+      private:
+        host1x::SyncpointSet &syncpoints;
+        ChannelContext &channelCtx;
+
+        /**
+         * @brief Writes the argument into the register at the method's index and launches a DMA when launchDma is written
+         */
+        void HandleMethod(u32 method, u32 argument);
+
+        /**
+         * @brief Performs the copy described by the current register state
+         */
+        void LaunchDma();
+
+      public:
+        /**
+         * @url https://github.com/NVIDIA/open-gpu-doc/blob/master/classes/dma-copy/clb0b5.h
+         */
+        #pragma pack(push, 1)
+        union Registers {
+            std::array<u32, EngineMethodsEnd> raw; //!< Flat view of the register file, indexed by method
+
+            /**
+             * @brief A typed view of a register inside the union
+             * @note Offset is presumably counted in u32 words so that it lines up with the index into `raw` - TODO confirm against util::OffsetMember
+             */
+            template<size_t Offset, typename Type>
+            using Register = util::OffsetMember<Offset, Type, u32>;
+
+            Register<0x40, u32> nop;
+            Register<0x50, u32> pmTrigger;
+
+            struct Semaphore {
+                Address address;
+                u32 payload;
+            };
+            static_assert(sizeof(Semaphore) == 0xC);
+
+            Register<0x90, Semaphore> semaphore;
+
+            struct RenderEnable {
+                enum class Mode : u8 {
+                    False = 0,
+                    True = 1,
+                    Conditional = 2,
+                    RenderIfEqual = 3,
+                    RenderIfNotEqual = 4
+                };
+
+                Address address;
+                Mode mode : 3;
+                u32 _pad_ : 29;
+            };
+            static_assert(sizeof(RenderEnable) == 0xC);
+
+            Register<0x95, RenderEnable> renderEnable;
+
+            struct PhysMode {
+                enum class Target : u8 {
+                    LocalFb = 0,
+                    CoherentSysmem = 1,
+                    NoncoherentSysmem = 2
+                };
+
+                Target target : 2;
+                u32 _pad_ : 30;
+            };
+
+            Register<0x98, PhysMode> srcPhysMode;
+            Register<0x99, PhysMode> dstPhysMode;
+
+            /**
+             * @brief Parameters of a copy, writing this register is what starts the copy
+             */
+            struct LaunchDma {
+                enum class DataTransferType : u8 {
+                    None = 0,
+                    Pipelined = 1,
+                    NonPipelined = 2
+                };
+
+                enum class SemaphoreType : u8 {
+                    None = 0,
+                    ReleaseOneWordSemaphore = 1,
+                    ReleaseFourWordSemaphore = 2
+                };
+
+                enum class InterruptType : u8 {
+                    None = 0,
+                    Blocking = 1,
+                    NonBlocking = 2
+                };
+
+                enum class MemoryLayout : u8 {
+                    BlockLinear = 0,
+                    Pitch = 1
+                };
+
+                enum class Type : u8 {
+                    Virtual = 0,
+                    Physical = 1
+                };
+
+                enum class SemaphoreReduction : u8 {
+                    IMin = 0,
+                    IMax = 1,
+                    IXor = 2,
+                    IAnd = 3,
+                    IOr = 4,
+                    IAdd = 5,
+                    Inc = 6,
+                    Dec = 7,
+                    FAdd = 10,
+                };
+
+                enum class SemaphoreReductionSign : u8 {
+                    Signed = 0,
+                    Unsigned = 1,
+                };
+
+                enum class BypassL2 : u8 {
+                    UsePteSetting = 0,
+                    ForceVolatile = 1,
+                };
+
+                DataTransferType dataTransferType : 2;
+                bool flushEnable : 1;
+                SemaphoreType semaphoreType : 2;
+                InterruptType interruptType : 2;
+                MemoryLayout srcMemoryLayout : 1;
+                MemoryLayout dstMemoryLayout : 1;
+                bool multiLineEnable : 1; //!< When set the copy is treated as a 2D/3D copy rather than a 1D buffer copy
+                bool remapEnable : 1;
+                bool rmwDisable : 1;
+                Type srcType : 1;
+                Type dstType : 1;
+                SemaphoreReduction semaphoreReduction : 4;
+                SemaphoreReductionSign semaphoreReductionSign : 1;
+                bool reductionEnable : 1;
+                BypassL2 bypassL2 : 1;
+                u16 _pad_ : 11;
+            };
+            static_assert(sizeof(LaunchDma) == 4);
+
+            Register<0xC0, LaunchDma> launchDma;
+
+            Register<0x100, Address> offsetIn; //!< Source address of the copy
+            Register<0x102, Address> offsetOut; //!< Destination address of the copy
+
+            Register<0x104, u32> pitchIn;
+            Register<0x105, u32> pitchOut;
+
+            Register<0x106, u32> lineLengthIn; //!< Used as the size of a 1D copy, a value of zero makes a launch a no-op
+            Register<0x107, u32> lineCount;
+
+            Register<0x1C0, u32> remapConstA;
+            Register<0x1C1, u32> remapConstB;
+
+            /**
+             * @brief Component remapping configuration, occupying exactly one register word
+             * @note The register sits at 0x1C2 and dstSurface starts at 0x1C3, so no member may precede the bitfields or every field would be read from the wrong word
+             */
+            struct RemapComponents {
+                enum class Swizzle : u8 {
+                    SrcX = 0,
+                    SrcY = 1,
+                    SrcZ = 2,
+                    SrcW = 3,
+                    ConstA = 4,
+                    ConstB = 5,
+                    NoWrite = 6
+                };
+
+                Swizzle dstX : 3;
+                u8 _pad0_ : 1;
+                Swizzle dstY : 3;
+                u8 _pad1_ : 1;
+                Swizzle dstZ : 3;
+                u8 _pad2_ : 1;
+                Swizzle dstW : 3;
+                u8 _pad3_ : 1;
+
+                u8 componentSizeMinusOne : 2;
+                u8 _pad4_ : 2;
+                u8 numSrcComponentsMinusOne : 2;
+                u8 _pad5_ : 2;
+                u8 numDstComponentsMinusOne : 2;
+                u8 _pad6_ : 6;
+            };
+            static_assert(sizeof(RemapComponents) == 0x4);
+
+            Register<0x1C2, RemapComponents> remapComponents;
+
+            struct Surface {
+                // Nvidias docs here differ from other emus and deko3d so go with what they say
+                struct {
+                    u8 width : 4;
+                    u8 height : 4;
+                    u8 depth : 4;
+                    u8 gobHeight : 4;
+                    u16 _pad_;
+                } blockSize;
+                u32 width;
+                u32 height;
+                u32 depth;
+                u32 layer;
+
+                struct {
+                    u16 x;
+                    u16 y;
+                } origin;
+            };
+            static_assert(sizeof(Surface) == 0x18);
+
+            Register<0x1C3, Surface> dstSurface;
+            Register<0x1CA, Surface> srcSurface;
+        } registers{};
+        static_assert(sizeof(Registers) == (EngineMethodsEnd * 0x4));
+        #pragma pack(pop)
+
+        MaxwellDma(const DeviceState &state, ChannelContext &channelCtx);
+
+        /**
+         * @brief Calls a single method of the engine with the supplied argument
+         */
+        void CallMethod(u32 method, u32 argument);
+
+        /**
+         * @brief Calls the same method once for every argument in the batch, in order
+         */
+        void CallMethodBatchNonInc(u32 method, span<u32> arguments);
+    };
+}
--- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
@ -112,11 +112,14 @@ namespace skyline::soc::gm20b {
            case SubchannelId::ThreeD:
                channelCtx.maxwell3D->CallMethod(method, argument);
                break;
+            case SubchannelId::Compute:
+                channelCtx.keplerCompute.CallMethod(method, argument);
+                break;
            case SubchannelId::Inline2Mem:
                channelCtx.inline2Memory.CallMethod(method, argument);
                break;
-            case SubchannelId::Compute:
-                channelCtx.keplerCompute.CallMethod(method, argument);
+            case SubchannelId::Copy:
+                channelCtx.maxwellDma.CallMethod(method, argument);
                break;
            default:
                Logger::Warn("Called method 0x{:X} in unimplemented engine 0x{:X}, args: 0x{:X}", method, subChannel, argument);
@ -129,11 +132,14 @@ namespace skyline::soc::gm20b {
            case SubchannelId::ThreeD:
                channelCtx.maxwell3D->CallMethodBatchNonInc(method, arguments);
                break;
+            case SubchannelId::Compute:
+                channelCtx.keplerCompute.CallMethodBatchNonInc(method, arguments);
+                break;
            case SubchannelId::Inline2Mem:
                channelCtx.inline2Memory.CallMethodBatchNonInc(method, arguments);
                break;
-            case SubchannelId::Compute:
-                channelCtx.keplerCompute.CallMethodBatchNonInc(method, arguments);
+            case SubchannelId::Copy:
+                channelCtx.maxwellDma.CallMethodBatchNonInc(method, arguments);
                break;
            default:
                Logger::Warn("Called method 0x{:X} in unimplemented engine 0x{:X} with batch args", method, subChannel);