From 39faa739b95f9c0a6dd22316b469dd6e55e77b42 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Thu, 1 Jul 2021 20:21:17 +0100 Subject: [PATCH] Optimise GPFIFO command processing for higher throughput Using a u32 for the loop index prevents masking on all increments, giving a moderate performance increase. Passing methods as u32 parameters and stopping subChannel being passed gives quite a significant increase when combined with the inlining allowed by subchannel based engine selection. --- .../cpp/skyline/soc/gm20b/engines/engine.h | 16 +-- .../cpp/skyline/soc/gm20b/engines/gpfifo.h | 6 +- .../engines/maxwell/macro_interpreter.cpp | 2 +- .../skyline/soc/gm20b/engines/maxwell_3d.cpp | 56 +++++---- .../skyline/soc/gm20b/engines/maxwell_3d.h | 6 +- app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp | 113 +++++++++++++----- app/src/main/cpp/skyline/soc/gm20b/gpfifo.h | 52 +------- 7 files changed, 127 insertions(+), 124 deletions(-) diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/engine.h b/app/src/main/cpp/skyline/soc/gm20b/engines/engine.h index fcf30e18..d5381c2d 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/engine.h +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/engine.h @@ -16,16 +16,6 @@ namespace skyline::soc::gm20b { MaxwellDma = 0xB0B5, }; - /** - * @brief The parameters of a GPU engine method call - */ - struct MethodParams { - u16 method; - u32 argument; - u32 subChannel; - bool lastCall; //!< If this is the last call in the pushbuffer entry to this specific macro - }; - namespace engine { /** * @brief The Engine class provides an interface that can be used to communicate with the GPU's internal engines @@ -37,13 +27,11 @@ namespace skyline::soc::gm20b { public: Engine(const DeviceState &state) : state(state) {} - virtual ~Engine() = default; - /** * @brief Calls an engine method with the given parameters */ - virtual void CallMethod(MethodParams params) { - state.logger->Warn("Called method in unimplemented engine: 0x{:X} args: 0x{:X}", 
params.method, params.argument); + void CallMethod(u32 method, u32 argument, bool lastCall) { + state.logger->Warn("Called method in unimplemented engine: 0x{:X} args: 0x{:X}", method, argument); }; }; } diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.h b/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.h index a4fbde79..1b36163e 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.h +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.h @@ -167,10 +167,10 @@ namespace skyline::soc::gm20b::engine { public: GPFIFO(const DeviceState &state) : Engine(state) {} - void CallMethod(MethodParams params) override { - state.logger->Debug("Called method in GPFIFO: 0x{:X} args: 0x{:X}", params.method, params.argument); + void CallMethod(u32 method, u32 argument, bool lastCall) { + state.logger->Debug("Called method in GPFIFO: 0x{:X} args: 0x{:X}", method, argument); - registers.raw[params.method] = params.argument; + registers.raw[method] = argument; }; }; } diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/macro_interpreter.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/macro_interpreter.cpp index 552ae145..e889719d 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/macro_interpreter.cpp +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell/macro_interpreter.cpp @@ -193,7 +193,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d { } FORCE_INLINE void MacroInterpreter::Send(u32 pArgument) { - maxwell3D.CallMethod(MethodParams{methodAddress.address, pArgument, 0, true}); + maxwell3D.CallMethod(methodAddress.address, pArgument, true); methodAddress.address += methodAddress.increment; } diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp index 4bc7004b..e17b062c 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp @@ -72,48 +72,62 @@ namespace 
skyline::soc::gm20b::engine::maxwell3d { registers.viewportTransformEnable = true; } - void Maxwell3D::CallMethod(MethodParams params) { - state.logger->Debug("Called method in Maxwell 3D: 0x{:X} args: 0x{:X}", params.method, params.argument); + void Maxwell3D::CallMethod(u32 method, u32 argument, bool lastCall) { + state.logger->Debug("Called method in Maxwell 3D: 0x{:X} args: 0x{:X}", method, argument); // Methods that are greater than the register size are for macro control - if (params.method > RegisterCount) { - if (!(params.method & 1)) - macroInvocation.index = ((params.method - RegisterCount) >> 1) % macroPositions.size(); + if (method >= RegisterCount) [[unlikely]] { + // Starting a new macro at index 'method - RegisterCount' + if (!(method & 1)) { + if (macroInvocation.index != -1) { + // Flush the current macro as we are switching to another one + macroInterpreter.Execute(macroPositions[macroInvocation.index], macroInvocation.arguments); + macroInvocation.arguments.clear(); + } - macroInvocation.arguments.push_back(params.argument); - - // Macros are always executed on the last method call in a pushbuffer entry - if (params.lastCall) { - macroInterpreter.Execute(macroPositions[macroInvocation.index], macroInvocation.arguments); - - macroInvocation.arguments.clear(); - macroInvocation.index = 0; + // Setup for the new macro index + macroInvocation.index = ((method - RegisterCount) >> 1) % macroPositions.size(); } + + macroInvocation.arguments.emplace_back(argument); + + // Flush macro after all of the data in the method call has been sent + if (lastCall && macroInvocation.index != -1) { + macroInterpreter.Execute(macroPositions[macroInvocation.index], macroInvocation.arguments); + macroInvocation.arguments.clear(); + macroInvocation.index = -1; + } + + // Bail out early return; } - registers.raw[params.method] = params.argument; + registers.raw[method] = argument; if (shadowRegisters.mme.shadowRamControl == Registers::MmeShadowRamControl::MethodTrack || 
shadowRegisters.mme.shadowRamControl == Registers::MmeShadowRamControl::MethodTrackWithFilter) - shadowRegisters.raw[params.method] = params.argument; + shadowRegisters.raw[method] = argument; else if (shadowRegisters.mme.shadowRamControl == Registers::MmeShadowRamControl::MethodReplay) - params.argument = shadowRegisters.raw[params.method]; + argument = shadowRegisters.raw[method]; - switch (params.method) { + switch (method) { case MAXWELL3D_OFFSET(mme.instructionRamLoad): if (registers.mme.instructionRamPointer >= macroCode.size()) throw exception("Macro memory is full!"); - macroCode[registers.mme.instructionRamPointer++] = params.argument; + macroCode[registers.mme.instructionRamPointer++] = argument; + + // Wraparound writes + registers.mme.instructionRamPointer %= macroCode.size(); + break; case MAXWELL3D_OFFSET(mme.startAddressRamLoad): if (registers.mme.startAddressRamPointer >= macroPositions.size()) throw exception("Maximum amount of macros reached!"); - macroPositions[registers.mme.startAddressRamPointer++] = params.argument; + macroPositions[registers.mme.startAddressRamPointer++] = argument; break; case MAXWELL3D_OFFSET(mme.shadowRamControl): - shadowRegisters.mme.shadowRamControl = static_cast(params.argument); + shadowRegisters.mme.shadowRamControl = static_cast(argument); break; case MAXWELL3D_OFFSET(syncpointAction): state.logger->Debug("Increment syncpoint: {}", static_cast(registers.syncpointAction.id)); @@ -135,6 +149,8 @@ namespace skyline::soc::gm20b::engine::maxwell3d { case MAXWELL3D_OFFSET(firmwareCall[4]): registers.raw[0xD00] = 1; break; + default: + break; } } diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h index 32eb8a75..c18aaa93 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h @@ -17,7 +17,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d { std::array macroPositions{}; 
//!< The positions of each individual macro in macro memory, there can be a maximum of 0x80 macros at any one time struct { - u32 index; + i32 index{-1}; //!< The index of the pending macro in macroPositions, -1 when no macro is pending std::vector arguments; } macroInvocation{}; //!< Data for a macro that is pending execution @@ -557,7 +557,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d { Registers registers{}; Registers shadowRegisters{}; //!< The shadow registers, their function is controlled by the 'shadowRamControl' register - std::array macroCode{}; //!< This stores GPU macros, the 256KiB size is from Ryujinx + std::array macroCode{}; //!< This stores GPU macros, writes to it will wraparound on overflow Maxwell3D(const DeviceState &state); @@ -566,6 +566,6 @@ namespace skyline::soc::gm20b::engine::maxwell3d { */ void ResetRegs(); - void CallMethod(MethodParams params) override; + void CallMethod(u32 method, u32 argument, bool lastCall); }; } diff --git a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp index eea9967c..e1d5883d 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp +++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.cpp @@ -5,41 +5,90 @@ #include #include #include +#include namespace skyline::soc::gm20b { - void GPFIFO::Send(MethodParams params) { - state.logger->Debug("Called GPU method - method: 0x{:X} argument: 0x{:X} subchannel: 0x{:X} last: {}", params.method, params.argument, params.subChannel, params.lastCall); + /** + * @brief A single pushbuffer method header that describes a compressed method sequence + * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_ram.ref.txt#L850 + * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/classes/host/clb06f.h#L179 + */ + union PushBufferMethodHeader { + u32 raw; - if (params.method == 0) { - switch (static_cast(params.argument)) { - case EngineID::Fermi2D: - subchannels.at(params.subChannel) = &state.soc->gm20b.fermi2D; + enum 
class TertOp : u8 { + Grp0IncMethod = 0, + Grp0SetSubDevMask = 1, + Grp0StoreSubDevMask = 2, + Grp0UseSubDevMask = 3, + Grp2NonIncMethod = 0, + }; + + enum class SecOp : u8 { + Grp0UseTert = 0, + IncMethod = 1, + Grp2UseTert = 2, + NonIncMethod = 3, + ImmdDataMethod = 4, + OneInc = 5, + Reserved6 = 6, + EndPbSegment = 7, + }; + + u16 methodAddress : 12; + struct { + u8 _pad0_ : 4; + u16 subDeviceMask : 12; + }; + + struct { + u16 _pad1_ : 13; + u8 methodSubChannel : 3; + union { + TertOp tertOp : 3; + u16 methodCount : 13; + u16 immdData : 13; + }; + }; + + struct { + u32 _pad2_ : 29; + SecOp secOp : 3; + }; + }; + static_assert(sizeof(PushBufferMethodHeader) == sizeof(u32)); + + void GPFIFO::Send(u32 method, u32 argument, u32 subChannel, bool lastCall) { + constexpr u32 ThreeDSubChannel{0}; + constexpr u32 ComputeSubChannel{1}; + constexpr u32 Inline2MemorySubChannel{2}; + constexpr u32 TwoDSubChannel{3}; + constexpr u32 CopySubChannel{4}; // HW forces a memory flush on a switch from this subchannel to others + + state.logger->Debug("Called GPU method - method: 0x{:X} argument: 0x{:X} subchannel: 0x{:X} last: {}", method, argument, subChannel, lastCall); + + if (method < engine::GPFIFO::RegisterCount) { + gpfifoEngine.CallMethod(method, argument, lastCall); + } else { + switch (subChannel) { + case ThreeDSubChannel: + state.soc->gm20b.maxwell3D.CallMethod(method, argument, lastCall); break; - case EngineID::KeplerMemory: - subchannels.at(params.subChannel) = &state.soc->gm20b.keplerMemory; + case ComputeSubChannel: + state.soc->gm20b.maxwellCompute.CallMethod(method, argument, lastCall); break; - case EngineID::Maxwell3D: - subchannels.at(params.subChannel) = &state.soc->gm20b.maxwell3D; + case Inline2MemorySubChannel: + state.soc->gm20b.keplerMemory.CallMethod(method, argument, lastCall); break; - case EngineID::MaxwellCompute: - subchannels.at(params.subChannel) = &state.soc->gm20b.maxwellCompute; + case TwoDSubChannel: + 
state.soc->gm20b.fermi2D.CallMethod(method, argument, lastCall); break; - case EngineID::MaxwellDma: - subchannels.at(params.subChannel) = &state.soc->gm20b.maxwellDma; + case CopySubChannel: + state.soc->gm20b.maxwellDma.CallMethod(method, argument, lastCall); break; default: - throw exception("Unknown engine 0x{:X} cannot be bound to subchannel {}", params.argument, params.subChannel); + throw exception("Tried to call into a software subchannel: {}!", subChannel); } - - state.logger->Info("Bound GPU engine 0x{:X} to subchannel {}", params.argument, params.subChannel); - return; - } else if (params.method < engine::GPFIFO::RegisterCount) { - gpfifoEngine.CallMethod(params); - } else { - if (subchannels.at(params.subChannel) == nullptr) - throw exception("Calling method on unbound channel"); - - subchannels.at(params.subChannel)->CallMethod(params); } } @@ -66,22 +115,22 @@ namespace skyline::soc::gm20b { PushBufferMethodHeader methodHeader{.raw = *entry}; switch (methodHeader.secOp) { case PushBufferMethodHeader::SecOp::IncMethod: - for (u16 i{}; i < methodHeader.methodCount; i++) - Send(MethodParams{static_cast(methodHeader.methodAddress + i), *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1}); + for (u32 i{}; i < methodHeader.methodCount; i++) + Send(methodHeader.methodAddress + i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1); break; case PushBufferMethodHeader::SecOp::NonIncMethod: - for (u16 i{}; i < methodHeader.methodCount; i++) - Send(MethodParams{methodHeader.methodAddress, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1}); + for (u32 i{}; i < methodHeader.methodCount; i++) + Send(methodHeader.methodAddress, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1); break; case PushBufferMethodHeader::SecOp::OneInc: - for (u16 i{}; i < methodHeader.methodCount; i++) - Send(MethodParams{static_cast(methodHeader.methodAddress + static_cast(i)), *++entry, 
methodHeader.methodSubChannel, i == methodHeader.methodCount - 1}); + for (u32 i{}; i < methodHeader.methodCount; i++) + Send(methodHeader.methodAddress + !!i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1); break; case PushBufferMethodHeader::SecOp::ImmdDataMethod: - Send(MethodParams{methodHeader.methodAddress, methodHeader.immdData, methodHeader.methodSubChannel, true}); + Send(methodHeader.methodAddress, methodHeader.immdData, methodHeader.methodSubChannel, true); break; case PushBufferMethodHeader::SecOp::EndPbSegment: diff --git a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h index 2e601753..318fed61 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h +++ b/app/src/main/cpp/skyline/soc/gm20b/gpfifo.h @@ -72,56 +72,6 @@ namespace skyline::soc::gm20b { }; static_assert(sizeof(GpEntry) == sizeof(u64)); - /** - * @brief A single pushbuffer method header that describes a compressed method sequence - * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_ram.ref.txt#L850 - * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/classes/host/clb06f.h#L179 - */ - union PushBufferMethodHeader { - u32 raw; - - enum class TertOp : u8 { - Grp0IncMethod = 0, - Grp0SetSubDevMask = 1, - Grp0StoreSubDevMask = 2, - Grp0UseSubDevMask = 3, - Grp2NonIncMethod = 0, - }; - - enum class SecOp : u8 { - Grp0UseTert = 0, - IncMethod = 1, - Grp2UseTert = 2, - NonIncMethod = 3, - ImmdDataMethod = 4, - OneInc = 5, - Reserved6 = 6, - EndPbSegment = 7, - }; - - u16 methodAddress : 12; - struct { - u8 _pad0_ : 4; - u16 subDeviceMask : 12; - }; - - struct { - u16 _pad1_ : 13; - u8 methodSubChannel : 3; - union { - TertOp tertOp : 3; - u16 methodCount : 13; - u16 immdData : 13; - }; - }; - - struct { - u32 _pad2_ : 29; - SecOp secOp : 3; - }; - }; - static_assert(sizeof(PushBufferMethodHeader) == sizeof(u32)); - /** * 
@brief The GPFIFO class handles creating pushbuffers from GP entries and then processing them * @note This class doesn't perfectly map to any particular hardware component on the X1, it does a mix of the GPU Host PBDMA (With and handling the GPFIFO entries @@ -138,7 +88,7 @@ namespace skyline::soc::gm20b { /** * @brief Sends a method call to the GPU hardware */ - void Send(MethodParams params); + void Send(u32 method, u32 argument, u32 subchannel, bool lastCall); /** * @brief Processes the pushbuffer contained within the given GpEntry, calling methods as needed