Optimise GPFIFO command processing for higher throughput

Using a u32 for the loop index prevents masking on all increments,
giving a moderate performance increase.

Passing methods as u32 parameters and no longer passing subChannel
gives a significant increase when combined with the inlining allowed
by subchannel-based engine selection.
Billy Laws 2021-07-01 20:21:17 +01:00
parent 67149ef7fb
commit 83eb88d78b
7 changed files with 127 additions and 124 deletions
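
In short, the pushbuffer decode loops now use a native-width u32 index, and Send() selects the target engine from the subchannel with plain non-virtual u32-parameter calls, so the per-method dispatch can be inlined. A rough standalone sketch of both ideas (the type and function names below are illustrative, not taken from the codebase):

// Illustrative sketch only, not skyline code
#include <cstdint>

using u32 = std::uint32_t;

// A non-virtual engine whose small CallMethod takes plain u32s: the compiler
// can inline it at the call site, unlike the previous virtual
// Engine::CallMethod(MethodParams) dispatch through a base-class pointer
struct Maxwell3DLike {
    u32 raw[0x1000]{};

    void CallMethod(u32 method, u32 argument, bool lastCall) {
        raw[method & 0xFFF] = argument; // mask only to keep the sketch in bounds
        (void)lastCall;
    }
};

Maxwell3DLike maxwell3D;

void SendIncMethodSequence(u32 methodAddress, u32 methodCount, const u32 *arguments) {
    // A u32 index stays at register width; a u16 index would be re-truncated
    // (masked) after every increment because of integer promotion
    for (u32 i{}; i < methodCount; i++)
        maxwell3D.CallMethod(methodAddress + i, arguments[i], i == methodCount - 1);
}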

@@ -16,16 +16,6 @@ namespace skyline::soc::gm20b {
         MaxwellDma = 0xB0B5,
     };

-    /**
-     * @brief The parameters of a GPU engine method call
-     */
-    struct MethodParams {
-        u16 method;
-        u32 argument;
-        u32 subChannel;
-        bool lastCall; //!< If this is the last call in the pushbuffer entry to this specific macro
-    };
-
     namespace engine {
         /**
          * @brief The Engine class provides an interface that can be used to communicate with the GPU's internal engines
@@ -37,13 +27,11 @@ namespace skyline::soc::gm20b {
           public:
             Engine(const DeviceState &state) : state(state) {}

-            virtual ~Engine() = default;
-
             /**
              * @brief Calls an engine method with the given parameters
              */
-            virtual void CallMethod(MethodParams params) {
-                state.logger->Warn("Called method in unimplemented engine: 0x{:X} args: 0x{:X}", params.method, params.argument);
+            void CallMethod(u32 method, u32 argument, bool lastCall) {
+                state.logger->Warn("Called method in unimplemented engine: 0x{:X} args: 0x{:X}", method, argument);
             };
         };
     }

@@ -167,10 +167,10 @@ namespace skyline::soc::gm20b::engine {
       public:
         GPFIFO(const DeviceState &state) : Engine(state) {}

-        void CallMethod(MethodParams params) override {
-            state.logger->Debug("Called method in GPFIFO: 0x{:X} args: 0x{:X}", params.method, params.argument);
-            registers.raw[params.method] = params.argument;
+        void CallMethod(u32 method, u32 argument, bool lastCall) {
+            state.logger->Debug("Called method in GPFIFO: 0x{:X} args: 0x{:X}", method, argument);
+            registers.raw[method] = argument;
         };
     };
 }

@@ -193,7 +193,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
     }

     FORCE_INLINE void MacroInterpreter::Send(u32 pArgument) {
-        maxwell3D.CallMethod(MethodParams{methodAddress.address, pArgument, 0, true});
+        maxwell3D.CallMethod(methodAddress.address, pArgument, true);
         methodAddress.address += methodAddress.increment;
     }

@@ -72,48 +72,62 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
         registers.viewportTransformEnable = true;
     }

-    void Maxwell3D::CallMethod(MethodParams params) {
-        state.logger->Debug("Called method in Maxwell 3D: 0x{:X} args: 0x{:X}", params.method, params.argument);
+    void Maxwell3D::CallMethod(u32 method, u32 argument, bool lastCall) {
+        state.logger->Debug("Called method in Maxwell 3D: 0x{:X} args: 0x{:X}", method, argument);

         // Methods that are greater than the register size are for macro control
-        if (params.method > RegisterCount) {
-            if (!(params.method & 1))
-                macroInvocation.index = ((params.method - RegisterCount) >> 1) % macroPositions.size();
+        if (method > RegisterCount) [[unlikely]] {
+            // Starting a new macro at index 'method - RegisterCount'
+            if (!(method & 1)) {
+                if (macroInvocation.index != -1) {
+                    // Flush the current macro as we are switching to another one
+                    macroInterpreter.Execute(macroPositions[macroInvocation.index], macroInvocation.arguments);
+                    macroInvocation.arguments.clear();
+                }

-            macroInvocation.arguments.push_back(params.argument);
+                // Setup for the new macro index
+                macroInvocation.index = ((method - RegisterCount) >> 1) % macroPositions.size();
+            }

-            // Macros are always executed on the last method call in a pushbuffer entry
-            if (params.lastCall) {
-                macroInterpreter.Execute(macroPositions[macroInvocation.index], macroInvocation.arguments);
-                macroInvocation.arguments.clear();
-                macroInvocation.index = 0;
-            }
+            macroInvocation.arguments.emplace_back(argument);

+            // Flush macro after all of the data in the method call has been sent
+            if (lastCall && macroInvocation.index != -1) {
+                macroInterpreter.Execute(macroPositions[macroInvocation.index], macroInvocation.arguments);
+                macroInvocation.arguments.clear();
+                macroInvocation.index = -1;
+            }
+
+            // Bail out early
             return;
         }

-        registers.raw[params.method] = params.argument;
+        registers.raw[method] = argument;

         if (shadowRegisters.mme.shadowRamControl == Registers::MmeShadowRamControl::MethodTrack || shadowRegisters.mme.shadowRamControl == Registers::MmeShadowRamControl::MethodTrackWithFilter)
-            shadowRegisters.raw[params.method] = params.argument;
+            shadowRegisters.raw[method] = argument;
         else if (shadowRegisters.mme.shadowRamControl == Registers::MmeShadowRamControl::MethodReplay)
-            params.argument = shadowRegisters.raw[params.method];
+            argument = shadowRegisters.raw[method];

-        switch (params.method) {
+        switch (method) {
             case MAXWELL3D_OFFSET(mme.instructionRamLoad):
                 if (registers.mme.instructionRamPointer >= macroCode.size())
                     throw exception("Macro memory is full!");

-                macroCode[registers.mme.instructionRamPointer++] = params.argument;
+                macroCode[registers.mme.instructionRamPointer++] = argument;
+
+                // Wraparound writes
+                registers.mme.instructionRamPointer %= macroCode.size();
                 break;

             case MAXWELL3D_OFFSET(mme.startAddressRamLoad):
                 if (registers.mme.startAddressRamPointer >= macroPositions.size())
                     throw exception("Maximum amount of macros reached!");

-                macroPositions[registers.mme.startAddressRamPointer++] = params.argument;
+                macroPositions[registers.mme.startAddressRamPointer++] = argument;
                 break;

             case MAXWELL3D_OFFSET(mme.shadowRamControl):
-                shadowRegisters.mme.shadowRamControl = static_cast<Registers::MmeShadowRamControl>(params.argument);
+                shadowRegisters.mme.shadowRamControl = static_cast<Registers::MmeShadowRamControl>(argument);
                 break;

             case MAXWELL3D_OFFSET(syncpointAction):
                 state.logger->Debug("Increment syncpoint: {}", static_cast<u16>(registers.syncpointAction.id));
@@ -135,6 +149,8 @@ namespace skyline::soc::gm20b::engine::maxwell3d {

             case MAXWELL3D_OFFSET(firmwareCall[4]):
                 registers.raw[0xD00] = 1;
                 break;
+            default:
+                break;
         }
     }

@@ -17,7 +17,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
         std::array<size_t, 0x80> macroPositions{}; //!< The positions of each individual macro in macro memory, there can be a maximum of 0x80 macros at any one time

         struct {
-            u32 index;
+            i32 index;
             std::vector<u32> arguments;
         } macroInvocation{}; //!< Data for a macro that is pending execution
@@ -557,7 +557,7 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
         Registers registers{};
         Registers shadowRegisters{}; //!< The shadow registers, their function is controlled by the 'shadowRamControl' register

-        std::array<u32, 0x10000> macroCode{}; //!< This stores GPU macros, the 256KiB size is from Ryujinx
+        std::array<u32, 0x2000> macroCode{}; //!< This stores GPU macros, writes to it will wraparound on overflow

         Maxwell3D(const DeviceState &state);
@@ -566,6 +566,6 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
          */
         void ResetRegs();

-        void CallMethod(MethodParams params) override;
+        void CallMethod(u32 method, u32 argument, bool lastCall);
     };
 }

@@ -5,41 +5,90 @@
 #include <loader/loader.h>
 #include <kernel/types/KProcess.h>
 #include <soc.h>
+#include <os.h>

 namespace skyline::soc::gm20b {
-    void GPFIFO::Send(MethodParams params) {
-        state.logger->Debug("Called GPU method - method: 0x{:X} argument: 0x{:X} subchannel: 0x{:X} last: {}", params.method, params.argument, params.subChannel, params.lastCall);
-
-        if (params.method == 0) {
-            switch (static_cast<EngineID>(params.argument)) {
-                case EngineID::Fermi2D:
-                    subchannels.at(params.subChannel) = &state.soc->gm20b.fermi2D;
+    /**
+     * @brief A single pushbuffer method header that describes a compressed method sequence
+     * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_ram.ref.txt#L850
+     * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/classes/host/clb06f.h#L179
+     */
+    union PushBufferMethodHeader {
+        u32 raw;
+
+        enum class TertOp : u8 {
+            Grp0IncMethod = 0,
+            Grp0SetSubDevMask = 1,
+            Grp0StoreSubDevMask = 2,
+            Grp0UseSubDevMask = 3,
+            Grp2NonIncMethod = 0,
+        };
+
+        enum class SecOp : u8 {
+            Grp0UseTert = 0,
+            IncMethod = 1,
+            Grp2UseTert = 2,
+            NonIncMethod = 3,
+            ImmdDataMethod = 4,
+            OneInc = 5,
+            Reserved6 = 6,
+            EndPbSegment = 7,
+        };
+
+        u16 methodAddress : 12;
+
+        struct {
+            u8 _pad0_ : 4;
+            u16 subDeviceMask : 12;
+        };
+
+        struct {
+            u16 _pad1_ : 13;
+            u8 methodSubChannel : 3;
+
+            union {
+                TertOp tertOp : 3;
+                u16 methodCount : 13;
+                u16 immdData : 13;
+            };
+        };
+
+        struct {
+            u32 _pad2_ : 29;
+            SecOp secOp : 3;
+        };
+    };
+    static_assert(sizeof(PushBufferMethodHeader) == sizeof(u32));
+
+    void GPFIFO::Send(u32 method, u32 argument, u32 subChannel, bool lastCall) {
+        constexpr u32 ThreeDSubChannel{0};
+        constexpr u32 ComputeSubChannel{1};
+        constexpr u32 Inline2MemorySubChannel{2};
+        constexpr u32 TwoDSubChannel{3};
+        constexpr u32 CopySubChannel{4}; // HW forces a memory flush on a switch from this subchannel to others
+
+        state.logger->Debug("Called GPU method - method: 0x{:X} argument: 0x{:X} subchannel: 0x{:X} last: {}", method, argument, subChannel, lastCall);
+
+        if (method < engine::GPFIFO::RegisterCount) {
+            gpfifoEngine.CallMethod(method, argument, lastCall);
+        } else {
+            switch (subChannel) {
+                case ThreeDSubChannel:
+                    state.soc->gm20b.maxwell3D.CallMethod(method, argument, lastCall);
                     break;
-                case EngineID::KeplerMemory:
-                    subchannels.at(params.subChannel) = &state.soc->gm20b.keplerMemory;
+                case ComputeSubChannel:
+                    state.soc->gm20b.maxwellCompute.CallMethod(method, argument, lastCall);
                     break;
-                case EngineID::Maxwell3D:
-                    subchannels.at(params.subChannel) = &state.soc->gm20b.maxwell3D;
+                case Inline2MemorySubChannel:
+                    state.soc->gm20b.keplerMemory.CallMethod(method, argument, lastCall);
                     break;
-                case EngineID::MaxwellCompute:
-                    subchannels.at(params.subChannel) = &state.soc->gm20b.maxwellCompute;
+                case TwoDSubChannel:
+                    state.soc->gm20b.fermi2D.CallMethod(method, argument, lastCall);
                     break;
-                case EngineID::MaxwellDma:
-                    subchannels.at(params.subChannel) = &state.soc->gm20b.maxwellDma;
+                case CopySubChannel:
+                    state.soc->gm20b.maxwellDma.CallMethod(method, argument, lastCall);
                     break;
                 default:
-                    throw exception("Unknown engine 0x{:X} cannot be bound to subchannel {}", params.argument, params.subChannel);
+                    throw exception("Tried to call into a software subchannel: {}!", subChannel);
             }
-
-            state.logger->Info("Bound GPU engine 0x{:X} to subchannel {}", params.argument, params.subChannel);
-            return;
-        } else if (params.method < engine::GPFIFO::RegisterCount) {
-            gpfifoEngine.CallMethod(params);
-        } else {
-            if (subchannels.at(params.subChannel) == nullptr)
-                throw exception("Calling method on unbound channel");
-
-            subchannels.at(params.subChannel)->CallMethod(params);
         }
     }
@@ -66,22 +115,22 @@ namespace skyline::soc::gm20b {
             PushBufferMethodHeader methodHeader{.raw = *entry};

             switch (methodHeader.secOp) {
                 case PushBufferMethodHeader::SecOp::IncMethod:
-                    for (u16 i{}; i < methodHeader.methodCount; i++)
-                        Send(MethodParams{static_cast<u16>(methodHeader.methodAddress + i), *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1});
+                    for (u32 i{}; i < methodHeader.methodCount; i++)
+                        Send(methodHeader.methodAddress + i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
                     break;

                 case PushBufferMethodHeader::SecOp::NonIncMethod:
-                    for (u16 i{}; i < methodHeader.methodCount; i++)
-                        Send(MethodParams{methodHeader.methodAddress, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1});
+                    for (u32 i{}; i < methodHeader.methodCount; i++)
+                        Send(methodHeader.methodAddress, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
                     break;

                 case PushBufferMethodHeader::SecOp::OneInc:
-                    for (u16 i{}; i < methodHeader.methodCount; i++)
-                        Send(MethodParams{static_cast<u16>(methodHeader.methodAddress + static_cast<bool>(i)), *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1});
+                    for (u32 i{}; i < methodHeader.methodCount; i++)
+                        Send(methodHeader.methodAddress + !!i, *++entry, methodHeader.methodSubChannel, i == methodHeader.methodCount - 1);
                     break;

                 case PushBufferMethodHeader::SecOp::ImmdDataMethod:
-                    Send(MethodParams{methodHeader.methodAddress, methodHeader.immdData, methodHeader.methodSubChannel, true});
+                    Send(methodHeader.methodAddress, methodHeader.immdData, methodHeader.methodSubChannel, true);
                     break;

                 case PushBufferMethodHeader::SecOp::EndPbSegment:
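
A quick illustration of the compressed method sequences encoded by the header above; this is not code from the repository, just a standalone decode using explicit shifts that match the bitfield layout (address in bits 0-11, subchannel in bits 13-15, count in bits 16-28, secOp in bits 29-31), with a made-up raw value:

// Illustrative only: manually decodes one IncMethod pushbuffer header word
#include <cstdint>
#include <cstdio>

int main() {
    // secOp = 1 (IncMethod), methodCount = 3, subchannel = 0, methodAddress = 0x40
    std::uint32_t raw{(1u << 29) | (3u << 16) | (0u << 13) | 0x40u};

    std::uint32_t methodAddress{raw & 0xFFF};
    std::uint32_t methodSubChannel{(raw >> 13) & 0x7};
    std::uint32_t methodCount{(raw >> 16) & 0x1FFF};
    std::uint32_t secOp{raw >> 29};

    if (secOp == 1) // IncMethod: the n-th argument goes to methodAddress + n
        for (std::uint32_t i{}; i < methodCount; i++)
            std::printf("Send(method=0x%X, subchannel=%u, last=%d)\n",
                        methodAddress + i, methodSubChannel,
                        (i == methodCount - 1) ? 1 : 0);
}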

@@ -72,56 +72,6 @@ namespace skyline::soc::gm20b {
     };
     static_assert(sizeof(GpEntry) == sizeof(u64));

-    /**
-     * @brief A single pushbuffer method header that describes a compressed method sequence
-     * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/manuals/volta/gv100/dev_ram.ref.txt#L850
-     * @url https://github.com/NVIDIA/open-gpu-doc/blob/ab27fc22db5de0d02a4cabe08e555663b62db4d4/classes/host/clb06f.h#L179
-     */
-    union PushBufferMethodHeader {
-        u32 raw;
-
-        enum class TertOp : u8 {
-            Grp0IncMethod = 0,
-            Grp0SetSubDevMask = 1,
-            Grp0StoreSubDevMask = 2,
-            Grp0UseSubDevMask = 3,
-            Grp2NonIncMethod = 0,
-        };
-
-        enum class SecOp : u8 {
-            Grp0UseTert = 0,
-            IncMethod = 1,
-            Grp2UseTert = 2,
-            NonIncMethod = 3,
-            ImmdDataMethod = 4,
-            OneInc = 5,
-            Reserved6 = 6,
-            EndPbSegment = 7,
-        };
-
-        u16 methodAddress : 12;
-
-        struct {
-            u8 _pad0_ : 4;
-            u16 subDeviceMask : 12;
-        };
-
-        struct {
-            u16 _pad1_ : 13;
-            u8 methodSubChannel : 3;
-
-            union {
-                TertOp tertOp : 3;
-                u16 methodCount : 13;
-                u16 immdData : 13;
-            };
-        };
-
-        struct {
-            u32 _pad2_ : 29;
-            SecOp secOp : 3;
-        };
-    };
-    static_assert(sizeof(PushBufferMethodHeader) == sizeof(u32));
-
     /**
      * @brief The GPFIFO class handles creating pushbuffers from GP entries and then processing them
      * @note This class doesn't perfectly map to any particular hardware component on the X1, it does a mix of the GPU Host PBDMA (With and handling the GPFIFO entries
@@ -138,7 +88,7 @@ namespace skyline::soc::gm20b {
         /**
          * @brief Sends a method call to the GPU hardware
          */
-        void Send(MethodParams params);
+        void Send(u32 method, u32 argument, u32 subchannel, bool lastCall);

         /**
          * @brief Processes the pushbuffer contained within the given GpEntry, calling methods as needed