Implement a skeleton compute engine

The Kepler compute engine is used to run compute jobs, encapsulated into QMDs, on the GPU. This commit doesn't implement compute itself but adds the register and QMD structs that will be needed for it in the future.
2024-06-15 00:58:43 +02:00 · 2022-03-20 18:08:11 +00:00 · 2022-03-20 18:08:11 +00:00 · ae41ddf4f0
commit ae41ddf4f0
parent 0298a7b1f6
7 changed files with 503 additions and 0 deletions
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@ -184,6 +184,7 @@ add_library(skyline SHARED
        ${source_DIR}/skyline/soc/gm20b/engines/gpfifo.cpp
        ${source_DIR}/skyline/soc/gm20b/engines/maxwell_3d.cpp
        ${source_DIR}/skyline/soc/gm20b/engines/inline2memory.cpp
+        ${source_DIR}/skyline/soc/gm20b/engines/kepler_compute.cpp
        ${source_DIR}/skyline/soc/gm20b/engines/maxwell/initialization.cpp
        ${source_DIR}/skyline/input/npad.cpp
        ${source_DIR}/skyline/input/npad_device.cpp
--- a/app/src/main/cpp/skyline/soc/gm20b/channel.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/channel.cpp
@ -9,6 +9,7 @@ namespace skyline::soc::gm20b {
        : asCtx(std::move(pAsCtx)),
          executor(state),
          maxwell3D(std::make_unique<engine::maxwell3d::Maxwell3D>(state, *this, macroState, executor)),
+          keplerCompute(state, *this),
          inline2Memory(asCtx),
          gpfifo(state, *this, numEntries) {}
 }
--- a/app/src/main/cpp/skyline/soc/gm20b/channel.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/channel.h
@ -6,6 +6,7 @@
 #include <gpu/interconnect/command_executor.h>
 #include "macro/macro_state.h"
 #include "engines/engine.h"
+#include "engines/kepler_compute.h"
 #include "engines/inline2memory.h"
 #include "gpfifo.h"

@ -25,6 +26,7 @@ namespace skyline::soc::gm20b {
        gpu::interconnect::CommandExecutor executor;
        MacroState macroState;
        std::unique_ptr<engine::maxwell3d::Maxwell3D> maxwell3D; //!< TODO: fix this once graphics context is moved into a cpp file
+        engine::KeplerCompute keplerCompute;
        engine::Inline2Memory inline2Memory;
        ChannelGpfifo gpfifo;

--- a/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute.cpp
@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#include <soc.h>
+#include <soc/gm20b/channel.h>
+#include "kepler_compute/qmd.h"
+#include "kepler_compute.h"
+
+namespace skyline::soc::gm20b::engine {
+    // Binds the engine to the SoC's host1x syncpoints and builds the I2M backend from the channel's asCtx
+    KeplerCompute::KeplerCompute(const DeviceState &state, ChannelContext &channelCtx)
+        : syncpoints(state.soc->host1x.syncpoints),
+          i2m(channelCtx.asCtx) {}
+
+    /**
+     * @brief Entry point for a single method call, it logs the call and then hands it over to HandleMethod
+     */
+    __attribute__((always_inline)) void KeplerCompute::CallMethod(u32 method, u32 argument) {
+        Logger::Verbose("Called method in Kepler compute: 0x{:X} args: 0x{:X}", method, argument);
+        HandleMethod(method, argument);
+    }
+
+// Word index of a register: the size of the register's wrapper member minus the size of the type it dereferences to, in u32 units
+// (presumably the leading padding inserted by util::OffsetMember - confirm against its definition)
+// Both expansions are fully parenthesised so that they remain a single operand wherever they are used
+#define KEPLER_COMPUTE_OFFSET(field) ((sizeof(typeof(Registers::field)) - sizeof(std::remove_reference_t<decltype(*Registers::field)>)) / sizeof(u32))
+// Word index of `member` within the struct held by register `field`
+#define KEPLER_COMPUTE_STRUCT_OFFSET(field, member) (KEPLER_COMPUTE_OFFSET(field) + U32_OFFSET(std::remove_reference_t<decltype(*Registers::field)>, member))
+
+    /**
+     * @brief Stores the argument into the register file and performs the side effect (if any) that is tied to the method
+     * @param method Index into the register file which also selects the side effect
+     * @param argument Value that is written to the register
+     * @throws exception When the report semaphore action word is written, as compute semaphores are unimplemented
+     */
+    void KeplerCompute::HandleMethod(u32 method, u32 argument) {
+        // The raw write comes first so that the typed register views in the union observe the new value below
+        registers.raw[method] = argument;
+
+        switch (method) {
+            case KEPLER_COMPUTE_STRUCT_OFFSET(i2m, launchDma):
+                i2m.LaunchDma(*registers.i2m);
+                return;
+            case KEPLER_COMPUTE_STRUCT_OFFSET(i2m, loadInlineData):
+                i2m.LoadInlineData(*registers.i2m, argument);
+                return;
+            case KEPLER_COMPUTE_OFFSET(sendSignalingPcasB):
+                // Kernel execution isn't implemented yet, the attempt is only reported
+                Logger::Warn("Attempted to execute compute kernel!");
+                return;
+            case KEPLER_COMPUTE_STRUCT_OFFSET(reportSemaphore, action):
+                throw exception("Compute semaphores are unimplemented!");
+            default:
+                return;
+        }
+    }
+
+    /**
+     * @brief Applies a batch of arguments that all target the same method
+     * @note Inline data is handed to the I2M backend as a single span, any other method is replayed through HandleMethod one argument at a time
+     */
+    void KeplerCompute::CallMethodBatchNonInc(u32 method, span<u32> arguments) {
+        if (method == (KEPLER_COMPUTE_STRUCT_OFFSET(i2m, loadInlineData))) {
+            i2m.LoadInlineData(*registers.i2m, arguments);
+            return;
+        }
+
+        for (u32 value : arguments)
+            HandleMethod(method, value);
+    }
+
+#undef KEPLER_COMPUTE_STRUCT_OFFSET
+#undef KEPLER_COMPUTE_OFFSET
+}
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute.h
@ -0,0 +1,170 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+// Copyright © 2018-2020 fincs (https://github.com/devkitPro/deko3d)
+
+#pragma once
+
+#include "engine.h"
+#include "inline2memory.h"
+
+namespace skyline::soc::gm20b {
+    struct ChannelContext;
+}
+
+namespace skyline::soc::gm20b::engine {
+    /**
+    * @brief The Kepler Compute Engine is used to execute compute jobs on the GPU
+    * @note Only the register layout is provided for now: writing sendSignalingPcasB merely logs a warning and writing the report semaphore action throws
+    */
+    class KeplerCompute {
+      private:
+        host1x::SyncpointSet &syncpoints; //!< Bound to the SoC's host1x syncpoints by the constructor
+        Inline2MemoryBackend i2m; //!< Services the launchDma and loadInlineData methods of the i2m register block
+
+        void HandleMethod(u32 method, u32 argument); //!< Writes the argument into the register file and then runs any side effect tied to the method
+
+      public:
+        /**
+         * @brief The register file of the engine, viewed either as a flat array indexed by method or as typed registers
+         * @url https://github.com/devkitPro/deko3d/blob/master/source/maxwell/engine_compute.def
+         */
+        #pragma pack(push, 1)
+        union Registers {
+            std::array<u32, EngineMethodsEnd> raw; //!< Flat view of all registers, indexed directly by the method number
+
+            template<size_t Offset, typename Type>
+            using Register = util::OffsetMember<Offset, Type, u32>; //!< Offset is presumably counted in u32 words - confirm against util::OffsetMember
+
+            Register<0x44, u32> waitForIdle;
+            Register<0x60, Inline2MemoryBackend::RegisterState> i2m; //!< Passed to the Inline2Memory backend when its launchDma/loadInlineData members are written
+            Register<0x85, u32> setShaderSharedMemoryWindow;
+
+            struct InvalidateShaderCaches {
+                bool instruction : 1;
+                bool locks : 1;
+                bool flushData : 1;
+                u8 _pad0_ : 1;
+                bool data : 1;
+                u8 _pad1_ : 7;
+                bool constant : 1;
+                u32 _pad2_ : 19;
+            };
+            static_assert(sizeof(InvalidateShaderCaches) == 0x4);
+
+            Register<0x87, InvalidateShaderCaches> invalidateShaderCaches;
+
+            struct SendPcas {
+                u32 qmdAddressShifted8; //!< Presumably the QMD address shifted right by 8 bits, going by the name - TODO confirm
+                u32 from : 24;
+                u8 delta;
+            };
+            static_assert(sizeof(SendPcas) == 0x8);
+
+            Register<0xAD, SendPcas> sendPcas;
+
+            struct SendSignalingPcasB {
+                bool invalidate : 1;
+                bool schedule : 1;
+                u32 _pad_ : 30;
+            };
+            static_assert(sizeof(SendSignalingPcasB) == 0x4);
+
+            Register<0xAF, SendSignalingPcasB> sendSignalingPcasB; //!< Writing this register currently only logs a warning about an attempted kernel execution
+
+            struct ShaderLocalMemory {
+                u8 sizeUpper;
+                u32 _pad0_ : 24;
+                u32 sizeLower;
+                u16 maxSmCount : 9;
+                u32 _pad1_ : 23;
+            };
+            static_assert(sizeof(ShaderLocalMemory) == 0xC);
+
+            Register<0xB9, ShaderLocalMemory> shaderLocalMemoryNonThrottled;
+            Register<0xBC, ShaderLocalMemory> shaderLocalMemoryThrottled;
+
+            struct SpaVersion {
+                u8 minor;
+                u8 major;
+                u16 _pad_;
+            };
+            static_assert(sizeof(SpaVersion) == 0x4);
+
+            Register<0xC4, SpaVersion> spaVersion;
+
+            Register<0x1DF, u32> shaderLocalMemoryWindow;
+            Register<0x1E4, Address> shaderLocalMemory;
+
+            Register<0x54A, u32> shaderExceptions;
+
+            Register<0x557, Address> texSamplerPool;
+            Register<0x559, u32> texSamplerPoolMaximumIndex;
+            Register<0x55D, Address> texHeaderPool;
+            Register<0x55F, u32> texHeaderPoolMaximumIndex;
+
+            Register<0x582, Address> programRegion;
+
+            struct ReportSemaphore {
+                enum class Op : u8 {
+                    Release = 0,
+                    Trap = 3
+                };
+
+                enum class ReductionOp : u8 {
+                    Add = 0,
+                    Min = 1,
+                    Max = 2,
+                    Inc = 3,
+                    Dec = 4,
+                    And = 5,
+                    Or = 6,
+                    Xor = 7
+                };
+
+                enum class Format : u8 {
+                    Unsigned32 = 0,
+                    Signed32 = 1
+                };
+
+                enum class StructureSize : u8 {
+                    FourWords = 0,
+                    OneWord = 1
+                };
+
+                Address offset;
+                u32 payload;
+                struct {
+                    Op op : 2;
+                    bool flushDisable : 1;
+                    bool reductionEnable : 1;
+                    u8 _pad0_ : 5;
+                    ReductionOp reductionOp : 3;
+                    u8 _pad1_ : 5;
+                    Format format : 2;
+                    u8 _pad2_ : 1;
+                    bool awakenEnable : 1;
+                    u8 _pad3_ : 7;
+                    StructureSize structureSize : 1;
+                    u8 _pad4_ : 3;
+                } action; //!< Writing this word currently throws as compute semaphores are unimplemented
+            };
+            static_assert(sizeof(ReportSemaphore) == 0x10);
+
+            Register<0x6C0, ReportSemaphore> reportSemaphore;
+
+            struct BindlessTexture {
+                u8 constantBufferSlotSelect : 3;
+                u32 _pad_ : 29;
+            };
+            static_assert(sizeof(BindlessTexture) == 0x4);
+
+            Register<0x982, BindlessTexture> bindlessTexture;
+        } registers{};
+        static_assert(sizeof(Registers) == (EngineMethodsEnd * 0x4)); // The typed registers must not grow the union beyond the flat array
+        #pragma pack(pop)
+
+        KeplerCompute(const DeviceState &state, ChannelContext &channelCtx);
+
+        void CallMethod(u32 method, u32 argument); //!< Logs the call and forwards it to HandleMethod
+
+        void CallMethodBatchNonInc(u32 method, span<u32> arguments); //!< Applies several arguments to one method, inline data is loaded as a single span while other methods are handled per argument
+    };
+}
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute/qmd.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute/qmd.h
@ -0,0 +1,264 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
+// Copyright © 2018-2020 fincs (https://github.com/devkitPro/deko3d)
+
+#pragma once
+
+#include <common.h>
+
+namespace skyline::soc::gm20b::engine::kepler_compute {
+    #pragma pack(push, 1)
+
+    /**
+     * @brief Holds the 'Compute Queue Metadata' structure which encapsulates the state needed to execute a compute task
+     * @note The layout is built from u32 words and u32-based bitfields under 1-byte packing, the static_assert following the struct pins its size to 0x100 bytes
+     * @note The enums use u32 as their underlying type as they are used directly as bitfield members alongside the u32 fields
+     * @url https://github.com/devkitPro/deko3d/blob/master/source/maxwell/compute_qmd.h
+     */
+    struct QMD {
+        enum class DependentQmdType : u32 {
+            Queue = 0,
+            Grid = 1
+        };
+
+        enum class ReleaseMemBarType : u32 {
+            FeNone = 0,
+            FeSysmem = 1
+        };
+
+        enum class CwdMemBarType : u32 {
+            L1None = 0,
+            L1SysmemBar = 1,
+            L1MemBar = 2
+        };
+
+        enum class Fp32NanBehaviour : u32 { // NOTE(review): spelled 'Behaviour' here while the fp32NanBehavior field and Fp32F2iNanBehavior use 'Behavior'
+            Legacy = 0,
+            Fp64Compatible = 1
+        };
+
+        enum class Fp32F2iNanBehavior : u32 {
+            PassZero = 0,
+            PassIndefinite = 1
+        };
+
+        enum class ApiVisibleCallLimit : u32 {
+            ThirtyTwo = 0,
+            NoCheck = 1
+        };
+
+        enum class SharedMemoryBankMapping : u32 {
+            FourBytesPerBank = 0,
+            EightBytesPerBank = 1
+        };
+
+        enum class SamplerIndex : u32 {
+            Independently = 0,
+            ViaHeaderIndex = 1
+        };
+
+        enum class Fp32NarrowInstruction : u32 {
+            KeepDenorms = 0,
+            FlushDenorms = 1
+        };
+
+        enum class L1Configuration : u32 {
+            DirectlyAddressableMemorySize16Kb = 0,
+            DirectlyAddressableMemorySize32Kb = 1,
+            DirectlyAddressableMemorySize48Kb = 2
+        };
+
+        enum class ReductionOp : u32 {
+            RedAdd = 0,
+            RedMin = 1,
+            RedMax = 2,
+            RedInc = 3,
+            RedDec = 4,
+            RedAnd = 5,
+            RedOr = 6,
+            RedXor = 7
+        };
+
+        enum class ReductionFormat : u32 {
+            Unsigned32 = 0,
+            Signed32 = 1
+        };
+
+        enum class StructureSize : u32 {
+            FourWords = 0,
+            OneWord = 1
+        };
+
+        u32 outerPut : 31;
+        u32 outerOverflow : 1;
+        u32 outerGet : 31;
+        u32 outerStickyOverflow : 1;
+
+        u32 innerGet : 31;
+        u32 innerOverflow : 1;
+        u32 innerPut : 31;
+        u32 innerStickyOverflow : 1;
+
+        u32 qmdReservedAA;
+
+        u32 dependentQmdPointer;
+
+        u32 qmdGroupId : 6;
+
+        u32 smGlobalCachingEnable : 1;
+
+        u32 runCtaInOneSmPartition : 1;
+
+        u32 isQueue : 1;
+
+        u32 addToHeadOfQmdGroupLinkedList : 1;
+
+        u32 semaphoreReleaseEnable0 : 1;
+        u32 semaphoreReleaseEnable1 : 1;
+
+        u32 requireSchedulingPcas : 1;
+        u32 dependentQmdScheduleEnable : 1;
+        DependentQmdType dependentQmdType : 1;
+        u32 dependentQmdFieldCopy : 1;
+
+        u32 qmdReservedB : 16;
+
+        u32 circularQueueSize : 25;
+
+        u32 qmdReservedC : 1;
+
+        u32 invalidateTextureHeaderCache : 1;
+        u32 invalidateTextureSamplerCache : 1;
+        u32 invalidateTextureDataCache : 1;
+        u32 invalidateShaderDataCache : 1;
+        u32 invalidateInstructionCache : 1;
+        u32 invalidateShaderConstantCache : 1;
+
+        u32 programOffset;
+
+        u32 circularQueueAddrLower;
+        u32 circularQueueAddrUpper : 8; //!< Presumably forms a 40-bit address together with circularQueueAddrLower - TODO confirm
+
+        u32 qmdReservedD : 8;
+
+        u32 circularQueueEntrySize : 16;
+
+        u32 cwdReferenceCountId : 6;
+        u32 cwdReferenceCountDeltaMinusOne : 8;
+
+        ReleaseMemBarType releaseMembarType : 1;
+
+        u32 cwdReferenceCountIncrEnable : 1;
+        CwdMemBarType cwdMembarType : 2;
+
+        u32 sequentiallyRunCtas : 1;
+
+        u32 cwdReferenceCountDecrEnable : 1;
+
+        u32 throttled : 1;
+
+        u32 _pad0_ : 3;
+
+        Fp32NanBehaviour fp32NanBehavior : 1;
+
+        Fp32F2iNanBehavior fp32F2iNanBehavior : 1;
+
+        ApiVisibleCallLimit apiVisibleCallLimit : 1;
+
+        SharedMemoryBankMapping sharedMemoryBankMapping : 1;
+
+        u32 _pad1_ : 2;
+
+        SamplerIndex samplerIndex : 1;
+
+        Fp32NarrowInstruction fp32NarrowInstruction : 1;
+
+        u32 ctaRasterWidth;
+        u32 ctaRasterHeight : 16;
+        u32 ctaRasterDepth : 16;
+
+        u32 ctaRasterWidthResume;
+        u32 ctaRasterHeightResume : 16;
+        u32 ctaRasterDepthResume : 16;
+
+        u32 queueEntriesPerCtaMinusOne : 7;
+
+        u32 _pad2_ : 3;
+
+        u32 coalesceWaitingPeriod : 8;
+
+        u32 _pad3_ : 14;
+
+        u32 sharedMemorySize : 18;
+
+        u32 qmdReservedG : 14;
+
+        u32 qmdVersion : 4;
+        u32 qmdMajorVersion : 4;
+
+        u32 qmdReservedH : 8;
+
+        u32 ctaThreadDimension0 : 16;
+        u32 ctaThreadDimension1 : 16;
+        u32 ctaThreadDimension2 : 16;
+
+        u32 constantBufferValid : 8;
+
+        u32 qmdReservedI : 21;
+
+        L1Configuration l1Configuration : 3;
+
+        u32 smDisableMaskLower;
+        u32 smDisableMaskUpper;
+
+        struct {
+            u32 addressLower;
+            u32 addressUpper : 8;
+            u32 qmdReservedJL : 8;
+            u32 _pad4_ : 4;
+            ReductionOp reductionOp : 3;
+            u32 qmdReservedKM : 1;
+            ReductionFormat reductionFormat : 2;
+            u32 reductionEnable : 1;
+            u32 _pad5_ : 4;
+            StructureSize structureSize : 1;
+            u32 payload;
+        } release[2]; //!< Two release descriptors, presumably paired with semaphoreReleaseEnable0/1 - TODO confirm
+
+        struct {
+            u32 addrLower;
+            u32 addrUpper : 8;
+            u32 reservedAddr : 6;
+            u32 invalidate : 1;
+            u32 size : 17;
+        } constantBuffer[8]; //!< Eight constant buffer descriptors, presumably one per bit of constantBufferValid - TODO confirm
+
+        u32 shaderLocalMemoryLowSize : 24;
+
+        u32 qmdReservedN : 3;
+
+        u32 barrierCount : 5;
+        u32 shaderLocalMemoryHighSize : 24;
+        u32 registerCount : 8;
+        u32 shaderLocalMemoryCrsSize : 24;
+
+        u32 sassVersion : 8;
+
+        u32 hwOnlyInnerGet : 31;
+        u32 hwOnlyRequireSchedulingPcas : 1;
+        u32 hwOnlyInnerPut : 31;
+        u32 hwOnlyScgType : 1;
+        u32 hwOnlySpanListHeadIndex : 30;
+
+        u32 qmdReservedQ : 1;
+
+        u32 hwOnlySpanListHeadIndexValid : 1;
+        u32 hwOnlySkedNextQmdPointer;
+
+        u32 qmdSpareEFGHIJKLMN[10];
+
+        u32 debugIdLower;
+        u32 debugIdUpper;
+    };
+    static_assert(sizeof(QMD) == 0x100); // Guards against any field width change silently shifting the layout
+    #pragma pack(pop)
+
+}
--- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
@ -115,6 +115,9 @@ namespace skyline::soc::gm20b {
            case SubchannelId::Inline2Mem:
                channelCtx.inline2Memory.CallMethod(method, argument);
                break;
+            case SubchannelId::Compute:
+                channelCtx.keplerCompute.CallMethod(method, argument);
+                break;
            default:
                Logger::Warn("Called method 0x{:X} in unimplemented engine 0x{:X}, args: 0x{:X}", method, subChannel, argument);
                break;
@ -129,6 +132,9 @@ namespace skyline::soc::gm20b {
            case SubchannelId::Inline2Mem:
                channelCtx.inline2Memory.CallMethodBatchNonInc(method, arguments);
                break;
+            case SubchannelId::Compute:
+                channelCtx.keplerCompute.CallMethodBatchNonInc(method, arguments);
+                break;
            default:
                Logger::Warn("Called method 0x{:X} in unimplemented engine 0x{:X} with batch args", method, subChannel);
                break;