Optimise GPFIFO command processing for higher throughput

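Using a u32 instead of a u16 for the loop index means the compiler no longer has to mask the index back down to 16 bits on every increment, which gives a moderate performance increase. Passing the method and argument as plain u32 parameters instead of a MethodParams struct, and no longer passing subChannel on to the engines, gives a significant further increase when combined with the inlining made possible by selecting the engine directly from the subchannel number rather than through a virtual call.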
2025-02-02 11:02:33 +01:00 · 2021-07-01 20:21:17 +01:00 · 2021-07-01 20:21:17 +01:00 · 39faa739b9
commit 39faa739b9
parent 3d538a29da
7 changed files with 127 additions and 124 deletions
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/engine.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/engine.h
@ -16,16 +16,6 @@ namespace skyline::soc::gm20b {
        MaxwellDma = 0xB0B5,
    };

-    /**
-     * @brief The parameters of a GPU engine method call
-     */
-    struct MethodParams {
-        u16 method;
-        u32 argument;
-        u32 subChannel;
-        bool lastCall; //!< If this is the last call in the pushbuffer entry to this specific macro
-    };
-
    namespace engine {
        /**
         * @brief The Engine class provides an interface that can be used to communicate with the GPU's internal engines
@ -37,13 +27,11 @@ namespace skyline::soc::gm20b {
          public:
            Engine(const DeviceState &state) : state(state) {}

-            virtual ~Engine() = default;
-
            /**
             * @brief Calls an engine method with the given parameters
             */
-            virtual void CallMethod(MethodParams params) {
-                state.logger->Warn("Called method in unimplemented engine: 0x{:X} args: 0x{:X}", params.method, params.argument);
+            void CallMethod(u32 method, u32 argument, bool lastCall) {
+                state.logger->Warn("Called method in unimplemented engine: 0x{:X} args: 0x{:X}", method, argument);
            };
        };
    }
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.h
@ -167,10 +167,10 @@ namespace skyline::soc::gm20b::engine {
      public:
        GPFIFO(const DeviceState &state) : Engine(state) {}

-        void CallMethod(MethodParams params) override {
-            state.logger->Debug("Called method in GPFIFO: 0x{:X} args: 0x{:X}", params.method, params.argument);
+        void CallMethod(u32 method, u32 argument, bool lastCall) {
+            state.logger->Debug("Called method in GPFIFO: 0x{:X} args: 0x{:X}", method, argument);

-            registers.raw[params.method] = params.argument;
+            registers.raw[method] = argument;
        };
    };
 }
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/macro_interpreter.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/macro_interpreter.cpp
@ -193,7 +193,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
    }

    FORCE_INLINE void MacroInterpreter::Send(u32 pArgument) {
-        maxwell3D.CallMethod(MethodParams{methodAddress.address, pArgument, 0, true});
+        maxwell3D.CallMethod(methodAddress.address, pArgument, true);
        methodAddress.address += methodAddress.increment;
    }

--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
@ -72,48 +72,62 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
        registers.viewportTransformEnable = true;
    }

-    void Maxwell3D::CallMethod(MethodParams params) {
-        state.logger->Debug("Called method in Maxwell 3D: 0x{:X} args: 0x{:X}", params.method, params.argument);
+    void Maxwell3D::CallMethod(u32 method, u32 argument, bool lastCall) {
+        state.logger->Debug("Called method in Maxwell 3D: 0x{:X} args: 0x{:X}", method, argument);

        // Methods that are greater than the register size are for macro control
-        if (params.method > RegisterCount) {
-            if (!(params.method & 1))
-                macroInvocation.index = ((params.method - RegisterCount) >> 1) % macroPositions.size();
+        if (method > RegisterCount) [[unlikely]] {
+            // Starting a new macro at index 'method - RegisterCount'
+            if (!(method & 1)) {
+                if (macroInvocation.index != -1) {
+                    // Flush the current macro as we are switching to another one
+                    macroInterpreter.Execute(macroPositions[macroInvocation.index], macroInvocation.arguments);
+                    macroInvocation.arguments.clear();
+                }

-            macroInvocation.arguments.push_back(params.argument);
-
-            // Macros are always executed on the last method call in a pushbuffer entry
-            if (params.lastCall) {
-                macroInterpreter.Execute(macroPositions[macroInvocation.index], macroInvocation.arguments);
-
-                macroInvocation.arguments.clear();
-                macroInvocation.index = 0;
+                // Setup for the new macro index
+                macroInvocation.index = ((method - RegisterCount) >> 1) % macroPositions.size();
            }
+
+            macroInvocation.arguments.emplace_back(argument);
+
+            // Flush macro after all of the data in the method call has been sent
+            if (lastCall && macroInvocation.index != -1) {
+                macroInterpreter.Execute(macroPositions[macroInvocation.index], macroInvocation.arguments);
+                macroInvocation.arguments.clear();
+                macroInvocation.index = -1;
+            }
+
+            // Bail out early
            return;
        }

-        registers.raw[params.method] = params.argument;
+        registers.raw[method] = argument;

        if (shadowRegisters.mme.shadowRamControl == Registers::MmeShadowRamControl::MethodTrack || shadowRegisters.mme.shadowRamControl == Registers::MmeShadowRamControl::MethodTrackWithFilter)
-            shadowRegisters.raw[params.method] = params.argument;
+            shadowRegisters.raw[method] = argument;
        else if (shadowRegisters.mme.shadowRamControl == Registers::MmeShadowRamControl::MethodReplay)
-            params.argument = shadowRegisters.raw[params.method];
+            argument = shadowRegisters.raw[method];

-        switch (params.method) {
+        switch (method) {
            case MAXWELL3D_OFFSET(mme.instructionRamLoad):
                if (registers.mme.instructionRamPointer >= macroCode.size())
                    throw exception("Macro memory is full!");

-                macroCode[registers.mme.instructionRamPointer++] = params.argument;
+                macroCode[registers.mme.instructionRamPointer++] = argument;
+
+                // Wraparound writes
+                registers.mme.instructionRamPointer %= macroCode.size();
+
                break;
            case MAXWELL3D_OFFSET(mme.startAddressRamLoad):
                if (registers.mme.startAddressRamPointer >= macroPositions.size())
                    throw exception("Maximum amount of macros reached!");

-                macroPositions[registers.mme.startAddressRamPointer++] = params.argument;
+                macroPositions[registers.mme.startAddressRamPointer++] = argument;
                break;
            case MAXWELL3D_OFFSET(mme.shadowRamControl):
-                shadowRegisters.mme.shadowRamControl = static_cast<Registers::MmeShadowRamControl>(params.argument);
+                shadowRegisters.mme.shadowRamControl = static_cast<Registers::MmeShadowRamControl>(argument);
                break;
            case MAXWELL3D_OFFSET(syncpointAction):
                state.logger->Debug("Increment syncpoint: {}", static_cast<u16>(registers.syncpointAction.id));
@ -135,6 +149,8 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
            case MAXWELL3D_OFFSET(firmwareCall[4]):
                registers.raw[0xD00] = 1;
                break;
+            default:
+                break;
        }
    }

--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h
@ -17,7 +17,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
        std::array<size_t, 0x80> macroPositions{}; //!< The positions of each individual macro in macro memory, there can be a maximum of 0x80 macros at any one time

        struct {
-            u32 index;
+            i32 index;
            std::vector<u32> arguments;
        } macroInvocation{}; //!< Data for a macro that is pending execution

@ -557,7 +557,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
        Registers registers{};
        Registers shadowRegisters{}; //!< The shadow registers, their function is controlled by the 'shadowRamControl' register

-        std::array<u32, 0x10000> macroCode{}; //!< This stores GPU macros, the 256KiB size is from Ryujinx
+        std::array<u32, 0x2000> macroCode{}; //!< This stores GPU macros, writes to it will wraparound on overflow

        Maxwell3D(const DeviceState &state);

@ -566,6 +566,6 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
         */
        void ResetRegs();

-        void CallMethod(MethodParams params) override;
+        void CallMethod(u32 method, u32 argument, bool lastCall);
    };
 }
--- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp
@ -5,41 +5,90 @@
 #include <loader/loader.h>
 #include <kernel/types/KProcess.h>
 #include <soc.h>
+#include <os.h>

 namespace skyline::soc::gm20b {
-    void GPFIFO::Send(MethodParams params) {
-        state.logger->Debug("Called GPU method - method: 0x{:X} argument: 0x{:X} subchannel: 0x{:X} last: {}", params.method, params.argument, params.subChannel, params.lastCall);
+    /**
+     * @brief A single pushbuffer method header that describes a compressed method sequence
+     * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_ram.ref.txt#L850
+     * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/classes/host/clb06f.h#L179
+     */
+    union PushBufferMethodHeader {
+        u32 raw;

-        if (params.method == 0) {
-            switch (static_cast<EngineID>(params.argument)) {
-                case EngineID::Fermi2D:
-                    subchannels.at(params.subChannel) = &state.soc->gm20b.fermi2D;
+        enum class TertOp : u8 {
+            Grp0IncMethod = 0,
+            Grp0SetSubDevMask = 1,
+            Grp0StoreSubDevMask = 2,
+            Grp0UseSubDevMask = 3,
+            Grp2NonIncMethod = 0,
+        };
+
+        enum class SecOp : u8 {
+            Grp0UseTert = 0,
+            IncMethod = 1,
+            Grp2UseTert = 2,
+            NonIncMethod = 3,
+            ImmdDataMethod = 4,
+            OneInc = 5,
+            Reserved6 = 6,
+            EndPbSegment = 7,
+        };
+
+        u16 methodAddress : 12;
+        struct {
+            u8 _pad0_ : 4;
+            u16 subDeviceMask : 12;
+        };
+
+        struct {
+            u16 _pad1_ : 13;
+            u8 methodSubChannel : 3;
+            union {
+                TertOp tertOp : 3;
+                u16 methodCount : 13;
+                u16 immdData : 13;
+            };
+        };
+
+        struct {
+            u32 _pad2_ : 29;
+            SecOp secOp : 3;
+        };
+    };
+    static_assert(sizeof(PushBufferMethodHeader) == sizeof(u32));
+
+    void GPFIFO::Send(u32 method, u32 argument, u32 subChannel, bool lastCall) {
+        constexpr u32 ThreeDSubChannel{0};
+        constexpr u32 ComputeSubChannel{1};
+        constexpr u32 Inline2MemorySubChannel{2};
+        constexpr u32 TwoDSubChannel{3};
+        constexpr u32 CopySubChannel{4}; // HW forces a memory flush on a switch from this subchannel to others
+
+        state.logger->Debug("Called GPU method - method: 0x{:X} argument: 0x{:X} subchannel: 0x{:X} last: {}", method, argument, subChannel, lastCall);
+
+        if (method < engine::GPFIFO::RegisterCount) {
+            gpfifoEngine.CallMethod(method, argument, lastCall);
+        } else {
+            switch (subChannel) {
+                case ThreeDSubChannel:
+                    state.soc->gm20b.maxwell3D.CallMethod(method, argument, lastCall);
                    break;
-                case EngineID::KeplerMemory:
-                    subchannels.at(params.subChannel) = &state.soc->gm20b.keplerMemory;
+                case ComputeSubChannel:
+                    state.soc->gm20b.maxwellCompute.CallMethod(method, argument, lastCall);
                    break;
-                case EngineID::Maxwell3D:
-                    subchannels.at(params.subChannel) = &state.soc->gm20b.maxwell3D;
+                case Inline2MemorySubChannel:
+                    state.soc->gm20b.keplerMemory.CallMethod(method, argument, lastCall);
                    break;
-                case EngineID::MaxwellCompute:
-                    subchannels.at(params.subChannel) = &state.soc->gm20b.maxwellCompute;
+                case TwoDSubChannel:
+                    state.soc->gm20b.fermi2D.CallMethod(method, argument, lastCall);
                    break;
-                case EngineID::MaxwellDma:
-                    subchannels.at(params.subChannel) = &state.soc->gm20b.maxwellDma;
+                case CopySubChannel:
+                    state.soc->gm20b.maxwellDma.CallMethod(method, argument, lastCall);
                    break;
                default:
-                    throw exception("Unknown engine 0x{:X} cannot be bound to subchannel {}", params.argument, params.subChannel);
+                    throw exception("Tried to call into a software subchannel: {}!", subChannel);
            }
-
-            state.logger->Info("Bound GPU engine 0x{:X} to subchannel {}", params.argument, params.subChannel);
-            return;
-        } else if (params.method < engine::GPFIFO::RegisterCount) {
-            gpfifoEngine.CallMethod(params);
-        } else {
-            if (subchannels.at(params.subChannel) == nullptr)
-                throw exception("Calling method on unbound channel");
-
-            subchannels.at(params.subChannel)->CallMethod(params);
        }
    }

@ -66,22 +115,22 @@ namespace skyline::soc::gm20b {
            PushBufferMethodHeader methodHeader{.raw = *entry};
            switch (methodHeader.secOp) {
                case PushBufferMethodHeader::SecOp::IncMethod:
-                    for (u16 i{}; i < methodHeader.methodCount; i++)
-                        Send(MethodParams{static_cast<u16>(methodHeader.methodAddress + i), *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1});
+                    for (u32 i{}; i < methodHeader.methodCount; i++)
+                        Send(methodHeader.methodAddress + i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
                    break;

                case PushBufferMethodHeader::SecOp::NonIncMethod:
-                    for (u16 i{}; i < methodHeader.methodCount; i++)
-                        Send(MethodParams{methodHeader.methodAddress, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1});
+                    for (u32 i{}; i < methodHeader.methodCount; i++)
+                        Send(methodHeader.methodAddress, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
                    break;

                case PushBufferMethodHeader::SecOp::OneInc:
-                    for (u16 i{}; i < methodHeader.methodCount; i++)
-                        Send(MethodParams{static_cast<u16>(methodHeader.methodAddress + static_cast<bool>(i)), *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1});
+                    for (u32 i{}; i < methodHeader.methodCount; i++)
+                        Send(methodHeader.methodAddress + !!i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
                    break;

                case PushBufferMethodHeader::SecOp::ImmdDataMethod:
-                    Send(MethodParams{methodHeader.methodAddress, methodHeader.immdData, methodHeader.methodSubChannel, true});
+                    Send(methodHeader.methodAddress, methodHeader.immdData, methodHeader.methodSubChannel, true);
                    break;

                case PushBufferMethodHeader::SecOp::EndPbSegment:
--- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h
@ -72,56 +72,6 @@ namespace skyline::soc::gm20b {
    };
    static_assert(sizeof(GpEntry) == sizeof(u64));

-    /**
-     * @brief A single pushbuffer method header that describes a compressed method sequence
-     * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_ram.ref.txt#L850
-     * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/classes/host/clb06f.h#L179
-     */
-    union PushBufferMethodHeader {
-        u32 raw;
-
-        enum class TertOp : u8 {
-            Grp0IncMethod = 0,
-            Grp0SetSubDevMask = 1,
-            Grp0StoreSubDevMask = 2,
-            Grp0UseSubDevMask = 3,
-            Grp2NonIncMethod = 0,
-        };
-
-        enum class SecOp : u8 {
-            Grp0UseTert = 0,
-            IncMethod = 1,
-            Grp2UseTert = 2,
-            NonIncMethod = 3,
-            ImmdDataMethod = 4,
-            OneInc = 5,
-            Reserved6 = 6,
-            EndPbSegment = 7,
-        };
-
-        u16 methodAddress : 12;
-        struct {
-            u8 _pad0_ : 4;
-            u16 subDeviceMask : 12;
-        };
-
-        struct {
-            u16 _pad1_ : 13;
-            u8 methodSubChannel : 3;
-            union {
-                TertOp tertOp : 3;
-                u16 methodCount : 13;
-                u16 immdData : 13;
-            };
-        };
-
-        struct {
-            u32 _pad2_ : 29;
-            SecOp secOp : 3;
-        };
-    };
-    static_assert(sizeof(PushBufferMethodHeader) == sizeof(u32));
-
    /**
     * @brief The GPFIFO class handles creating pushbuffers from GP entries and then processing them
     * @note This class doesn't perfectly map to any particular hardware component on the X1, it does a mix of the work of the GPU Host PBDMA and handling the GPFIFO entries
@ -138,7 +88,7 @@ namespace skyline::soc::gm20b {
        /**
         * @brief Sends a method call to the GPU hardware
         */
-        void Send(MethodParams params);
+        void Send(u32 method, u32 argument, u32 subchannel, bool lastCall);

        /**
         * @brief Processes the pushbuffer contained within the given GpEntry, calling methods as needed