Implement macro HLE for instanced draw macros

gm20b performs instanced draws by repeating draw methods for each instance; the code to detect this, together with the cost of interpreting macros, took up around 6% of GPFIFO time in Metro Kingdom. By detecting these specific macros and performing an instanced draw directly, much of that cost can be avoided.
This commit is contained in:
Billy Laws 2022-09-14 22:01:50 +01:00
parent cf0752f937
commit 3404a3abdb
7 changed files with 141 additions and 5 deletions

View File

@ -203,6 +203,7 @@ add_library(skyline SHARED
${source_DIR}/skyline/soc/gm20b/channel.cpp
${source_DIR}/skyline/soc/gm20b/gpfifo.cpp
${source_DIR}/skyline/soc/gm20b/gmmu.cpp
${source_DIR}/skyline/soc/gm20b/macro/macro_state.cpp
${source_DIR}/skyline/soc/gm20b/macro/macro_interpreter.cpp
${source_DIR}/skyline/soc/gm20b/engines/engine.cpp
${source_DIR}/skyline/soc/gm20b/engines/gpfifo.cpp

View File

@ -20,7 +20,7 @@ namespace skyline::soc::gm20b::engine {
if (!(macroMethodOffset & 1)) {
// Flush the current macro as we are switching to another one
if (macroInvocation.Valid()) {
macroState.macroInterpreter.Execute(macroState.macroPositions[macroInvocation.index], macroInvocation.arguments, this);
macroState.Execute(macroInvocation.index, macroInvocation.arguments, this);
macroInvocation.Reset();
}
@ -32,7 +32,7 @@ namespace skyline::soc::gm20b::engine {
// Flush macro after all of the data in the method call has been sent
if (lastCall && macroInvocation.Valid()) {
macroState.macroInterpreter.Execute(macroState.macroPositions[macroInvocation.index], macroInvocation.arguments, this);
macroState.Execute(macroInvocation.index, macroInvocation.arguments, this);
macroInvocation.Reset();
}
};

View File

@ -61,15 +61,15 @@ namespace skyline::soc::gm20b::engine {
MacroState &macroState;
// State of the macro call currently being assembled: which macro it targets and the arguments gathered so far
// NOTE(review): this hunk lists the `size_t` and the `u32` variant of every `index` line back to back, presumably the before/after of the type change — confirm against the real header
struct {
size_t index{std::numeric_limits<size_t>::max()};
u32 index{std::numeric_limits<u32>::max()};
std::vector<u32> arguments;
// A macro is pending whenever `index` differs from the maximum-value sentinel
bool Valid() {
return index != std::numeric_limits<size_t>::max();
return index != std::numeric_limits<u32>::max();
}
// Returns to the "nothing pending" state: sentinel index and no arguments
void Reset() {
index = std::numeric_limits<size_t>::max();
index = std::numeric_limits<u32>::max();
arguments.clear();
}
} macroInvocation{}; //!< Data for a macro that is pending execution
@ -88,6 +88,14 @@ namespace skyline::soc::gm20b::engine {
*/
virtual u32 ReadMethodFromMacro(u32 method) = 0;
/**
 * @brief Performs a non-indexed instanced draw directly on the engine, this is called by HLE macros
 * @note This default implementation always throws, engines that are able to draw override it
 */
virtual void DrawInstanced(bool setRegs, u32 drawTopology, u32 vertexArrayCount, u32 instanceCount, u32 vertexArrayStart, u32 globalBaseInstanceIndex) {
throw exception("DrawInstanced is not implemented for this engine");
}
/**
 * @brief Performs an indexed instanced draw directly on the engine, this is called by HLE macros
 * @note This default implementation always throws, engines that are able to draw override it
 */
virtual void DrawIndexedInstanced(bool setRegs, u32 drawTopology, u32 indexBufferCount, u32 instanceCount, u32 globalBaseVertexIndex, u32 indexBufferFirst, u32 globalBaseInstanceIndex) {
throw exception("DrawIndexedInstanced is not implemented for this engine");
}
/**
* @brief Handles a call to a method in the MME space
* @param macroMethodOffset The target offset from EngineMethodsEnd

View File

@ -323,4 +323,29 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
// Returns the current value of the raw register at index `method`, the index is used as-is without any range check here
u32 Maxwell3D::ReadMethodFromMacro(u32 method) {
return registers.raw[method];
}
/**
 * @brief Issues a single non-indexed draw covering `instanceCount` instances
 * @param setRegs If the topology, vertex count, first vertex and base instance should also be stored in the engine registers
 * @param drawTopology Raw topology value, it is converted to type::DrawTopology before use
 */
void Maxwell3D::DrawInstanced(bool setRegs, u32 drawTopology, u32 vertexArrayCount, u32 instanceCount, u32 vertexArrayStart, u32 globalBaseInstanceIndex) {
    const auto primitiveTopology{static_cast<type::DrawTopology>(drawTopology)};

    // Only mirror the draw parameters into the register file when the caller requests it
    if (setRegs) {
        registers.begin->op = primitiveTopology;
        registers.drawVertexArray->count = vertexArrayCount;
        registers.vertexArrayStart = vertexArrayStart;
        registers.globalBaseInstanceIndex = globalBaseInstanceIndex;
    }

    interconnect.Draw(primitiveTopology,
                      false, // Not an indexed draw
                      vertexArrayCount,
                      vertexArrayStart,
                      instanceCount,
                      0, // No base vertex is supplied for non-indexed draws
                      globalBaseInstanceIndex);
}
/**
 * @brief Issues a single indexed draw covering `instanceCount` instances
 * @param setRegs If the topology, index count, first index, base vertex and base instance should also be stored in the engine registers
 * @param drawTopology Raw topology value, it is converted to type::DrawTopology before use
 */
void Maxwell3D::DrawIndexedInstanced(bool setRegs, u32 drawTopology, u32 indexBufferCount, u32 instanceCount, u32 globalBaseVertexIndex, u32 indexBufferFirst, u32 globalBaseInstanceIndex) {
    const auto primitiveTopology{static_cast<type::DrawTopology>(drawTopology)};

    // Only mirror the draw parameters into the register file when the caller requests it
    if (setRegs) {
        registers.begin->op = primitiveTopology;
        registers.drawIndexBuffer->count = indexBufferCount;
        registers.indexBuffer->first = indexBufferFirst;
        registers.globalBaseVertexIndex = globalBaseVertexIndex;
        registers.globalBaseInstanceIndex = globalBaseInstanceIndex;
    }

    interconnect.Draw(primitiveTopology,
                      true, // Indexed draw
                      indexBufferCount,
                      indexBufferFirst,
                      instanceCount,
                      globalBaseVertexIndex,
                      globalBaseInstanceIndex);
}
}

View File

@ -392,5 +392,9 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
void CallMethodFromMacro(u32 method, u32 argument) override;
u32 ReadMethodFromMacro(u32 method) override;
void DrawInstanced(bool setRegs, u32 drawTopology, u32 vertexArrayCount, u32 instanceCount, u32 vertexArrayStart, u32 globalBaseInstanceIndex) override;
void DrawIndexedInstanced(bool setRegs, u32 drawTopology, u32 indexBufferCount, u32 instanceCount, u32 globalBaseVertexIndex, u32 indexBufferFirst, u32 globalBaseInstanceIndex) override;
};
}

View File

@ -0,0 +1,83 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 yuzu Emulator Project (https://yuzu-emu.org/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <soc/gm20b/engines/engine.h>
#include "macro_state.h"
namespace skyline::soc::gm20b {
namespace macro_hle {
/**
 * @brief HLE replacement for the non-indexed instanced draw macro
 * @param offset Position of the macro in macro code memory (unused here)
 * @param args Macro arguments: [0] topology, [1] vertex count, [2] instance count, [3] first vertex, [4] base instance
 * @param targetEngine Engine that the draw is issued on
 */
void DrawInstanced(size_t offset, span<u32> args, engine::MacroEngineBase *targetEngine) {
    // The requested instance count is masked with the value the engine reports for method 0xD1B
    // NOTE(review): 0xD1B is presumably a scratch register holding an instance mask — confirm
    const u32 instanceMask{targetEngine->ReadMethodFromMacro(0xD1B)};
    const u32 instanceCount{instanceMask & args[2]};

    const u32 topology{args[0]};
    const u32 vertexCount{args[1]};
    const u32 firstVertex{args[3]};
    const u32 baseInstance{args[4]};
    targetEngine->DrawInstanced(true, topology, vertexCount, instanceCount, firstVertex, baseInstance);
}
/**
 * @brief HLE replacement for the indexed instanced draw macro
 * @param offset Position of the macro in macro code memory (unused here)
 * @param args Macro arguments: [0] topology, [1] index count, [2] instance count, [3] base vertex, [4] first index, [5] base instance
 * @param targetEngine Engine that the draw is issued on
 */
void DrawIndexedInstanced(size_t offset, span<u32> args, engine::MacroEngineBase *targetEngine) {
    // The requested instance count is masked with the value the engine reports for method 0xD1B
    // NOTE(review): 0xD1B is presumably a scratch register holding an instance mask — confirm
    const u32 instanceMask{targetEngine->ReadMethodFromMacro(0xD1B)};
    const u32 instanceCount{instanceMask & args[2]};

    const u32 topology{args[0]};
    const u32 indexCount{args[1]};
    const u32 baseVertex{args[3]};
    const u32 firstIndex{args[4]};
    const u32 baseInstance{args[5]};
    targetEngine->DrawIndexedInstanced(true, topology, indexCount, instanceCount, baseVertex, firstIndex, baseInstance);
}
/**
 * @brief HLE replacement for the indexed instanced draw macro that also publishes the base vertex/instance through the bound constant buffer
 * @param offset Position of the macro in macro code memory (unused here)
 * @param args Macro arguments: [0] topology, [1] index count, [2] instance count, [3] first index, [4] base vertex, [5] base instance
 * @param targetEngine Engine that the methods and the draw are issued on
 */
void DrawInstancedIndexedWithConstantBuffer(size_t offset, span<u32> args, engine::MacroEngineBase *targetEngine) {
    // Issues the three-method sequence that stores a base vertex/instance pair in the bound constant buffer
    // NOTE(review): 0x8e3 looks like the constant buffer write offset (0x640) and 0x8e4/0x8e5 the data words — confirm against the register map
    const auto writeBaseIndices{[targetEngine](u32 baseVertex, u32 baseInstance) {
        targetEngine->CallMethodFromMacro(0x8e3, 0x640);
        targetEngine->CallMethodFromMacro(0x8e4, baseVertex);
        targetEngine->CallMethodFromMacro(0x8e5, baseInstance);
    }};

    // The requested instance count is masked with the value the engine reports for method 0xD1B
    const u32 instanceMask{targetEngine->ReadMethodFromMacro(0xD1B)};
    const u32 instanceCount{instanceMask & args[2]};

    // Expose the real base indices for the duration of the draw
    writeBaseIndices(args[4], args[5]);

    // Registers are left untouched (setRegs = false), only the draw itself is performed
    targetEngine->DrawIndexedInstanced(false, args[0], args[1], instanceCount, args[4], args[3], args[5]);

    // Zero the values in the constant buffer again once the draw has been issued
    writeBaseIndices(0x0, 0x0);
}
// Describes one macro that has an HLE replacement and how to recognise it in macro code memory
struct HleFunctionInfo {
Function function; // HLE implementation to run in place of the interpreter
u64 size; // Length of the macro in u32 words, macro code shorter than this can never match
u32 hash; // XXH32 value that LookupFunction compares against to identify the macro
};
// All macros with an HLE replacement, each entry is {function, size in u32 words, expected hash}
constexpr std::array<HleFunctionInfo, 0x3> functions{{
{DrawInstanced, 0x12, 0x6F0DD310},
{DrawIndexedInstanced, 0x17, 0x2764C4F},
{DrawInstancedIndexedWithConstantBuffer, 0x1F, 0xF2F16988},
}};
/**
 * @brief Searches the table of known macros for one whose code matches the start of the supplied macro code
 * @param code Macro code starting at the first word of the macro, it may extend past the end of the macro
 * @return The HLE function for the macro, or a null function pointer if none of the known macros match
 */
static Function LookupFunction(span<u32> code) {
    for (const auto &candidate : functions) {
        // There isn't enough code left for this candidate to possibly match
        if (candidate.size > code.size())
            continue;

        // Hash exactly as many words as the candidate macro is long, `code` generally runs on past the macro's end
        // so hashing all of it would never reproduce the expected hash
        auto macro{code.subspan(0, candidate.size)};
        if (XXH32(macro.data(), macro.size_bytes(), 0) == candidate.hash)
            return candidate.function;
    }

    return {};
}
}
// Flags the cached HLE lookups as stale, the flag is checked the next time Execute() runs
void MacroState::Invalidate() {
invalidatePending = true;
}
/**
 * @brief Runs the macro at the given position, using an HLE implementation when one matches and the interpreter otherwise
 * @param position Index of the macro in macroPositions
 * @param args Arguments that the macro was invoked with
 * @param targetEngine Engine that the macro operates on
 */
void MacroState::Execute(u32 position, span<u32> args, engine::MacroEngineBase *targetEngine) {
    size_t offset{macroPositions[position]};

    if (invalidatePending) {
        macroHleFunctions.fill({});
        // The flag has to be cleared here, otherwise every subsequent execution would wipe the cache and redo the hash lookup
        invalidatePending = false;
    }

    // Resolve the HLE function once per macro position, a failed lookup is cached as well (valid entry with a null function)
    auto &hleEntry{macroHleFunctions[position]};
    if (!hleEntry.valid) {
        hleEntry.function = macro_hle::LookupFunction(span(macroCode).subspan(offset));
        hleEntry.valid = true;
    }

    if (hleEntry.function)
        hleEntry.function(offset, args, targetEngine);
    else
        macroInterpreter.Execute(offset, args, targetEngine);
}
}

View File

@ -7,14 +7,29 @@
#include "macro_interpreter.h"
namespace skyline::soc::gm20b {
namespace macro_hle {
// Signature shared by all HLE macro implementations: the macro's offset in macro code memory, the arguments it was invoked with and the engine it acts on
using Function = void (*)(size_t offset, span<u32> args, engine::MacroEngineBase *targetEngine);
}
/**
* @brief Holds per-channel macro state
*/
struct MacroState {
// Cached result of looking up an HLE function for one macro position
struct MacroHleEntry {
macro_hle::Function function; // HLE function to run for the macro, null if the macro has to be interpreted
bool valid; // If the lookup has been performed, this is also set when no HLE function was found
};
engine::MacroInterpreter macroInterpreter; //!< The macro interpreter for handling 3D/2D macros
std::array<u32, 0x2000> macroCode{}; //!< Stores GPU macros, writes to it will wraparound on overflow
std::array<size_t, 0x80> macroPositions{}; //!< The positions of each individual macro in macro code memory, there can be a maximum of 0x80 macros at any one time
std::array<MacroHleEntry, 0x80> macroHleFunctions{}; //!< The HLE functions for each macro position, used to optionally override the interpreter
bool invalidatePending{}; //!< Set by Invalidate(), when set Execute() discards every cached entry in macroHleFunctions
MacroState() : macroInterpreter(macroCode) {}
// Requests that the cached HLE lookups are discarded
void Invalidate();
// Executes the macro at index `position` of macroPositions, through its HLE function if one was found and through the interpreter otherwise
void Execute(u32 position, span<u32> args, engine::MacroEngineBase *targetEngine);
};
}