From 925724c9900e357e8621a7a592be189cb4141ed8 Mon Sep 17 00:00:00 2001 From: Jannik Vogel Date: Mon, 8 May 2017 23:37:38 +0200 Subject: [PATCH 1/3] Pica: Set program code / swizzle data limit to 4096 One of the later commits will enable writing to GS regs. It turns out that on startup, most games will write 4096 GS program words. The current limit of 1024 would hence result in 3072 (4096 - 1024) error messages: ``` HW.GPU video_core/shader/shader.cpp:WriteProgramCode:229: Invalid GS program offset 1024 ``` New constants have been introduced to represent these limits. The swizzle data size has also been raised. This matches the given field sizes of [GPUREG_SH_OPDESCS_INDEX](https://3dbrew.org/wiki/GPU/Internal_Registers#GPUREG_SH_OPDESCS_INDEX) and [GPUREG_SH_CODETRANSFER_INDEX](https://www.3dbrew.org/wiki/GPU/Internal_Registers#GPUREG_SH_CODETRANSFER_INDEX) (12 bit = [0; 4095]). --- src/video_core/shader/shader.h | 7 +++++-- src/video_core/shader/shader_interpreter.cpp | 2 +- src/video_core/shader/shader_jit_x64.cpp | 2 +- src/video_core/shader/shader_jit_x64_compiler.cpp | 4 ++-- src/video_core/shader/shader_jit_x64_compiler.h | 14 +++++++------- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 38ea717ab..e156f6aef 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -24,6 +24,9 @@ namespace Pica { namespace Shader { +constexpr unsigned MAX_PROGRAM_CODE_LENGTH = 4096; +constexpr unsigned MAX_SWIZZLE_DATA_LENGTH = 4096; + struct AttributeBuffer { alignas(16) Math::Vec4 attr[16]; }; @@ -144,8 +147,8 @@ struct ShaderSetup { return offsetof(ShaderSetup, uniforms.i) + index * sizeof(Math::Vec4); } - std::array program_code; - std::array swizzle_data; + std::array program_code; + std::array swizzle_data; /// Data private to ShaderEngines struct EngineData { diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index f4d1c46c5..aa1cec81f 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -653,7 +653,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData } void InterpreterEngine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { - ASSERT(entry_point < 1024); + ASSERT(entry_point < MAX_PROGRAM_CODE_LENGTH); setup.engine_data.entry_point = entry_point; } diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 0ee0dd9ef..73c21871c 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -15,7 +15,7 @@ JitX64Engine::JitX64Engine() = default; JitX64Engine::~JitX64Engine() = default; void JitX64Engine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { - ASSERT(entry_point < 1024); + ASSERT(entry_point < MAX_PROGRAM_CODE_LENGTH); setup.engine_data.entry_point = entry_point; u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code)); diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp index 2dbc8b147..5d9b6448c 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.cpp +++ b/src/video_core/shader/shader_jit_x64_compiler.cpp @@ -834,8 +834,8 @@ void JitShader::FindReturnOffsets() { std::sort(return_offsets.begin(), return_offsets.end()); } -void JitShader::Compile(const std::array* program_code_, - const std::array* swizzle_data_) { +void JitShader::Compile(const std::array* program_code_, + const std::array* swizzle_data_) { program_code = program_code_; swizzle_data = swizzle_data_; diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h index f27675560..31af0ca48 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.h +++ b/src/video_core/shader/shader_jit_x64_compiler.h @@ -22,8 +22,8 @@ namespace Pica { namespace Shader { -/// Memory allocated for each compiled shader (64Kb) -constexpr size_t MAX_SHADER_SIZE = 1024 * 64; +/// Memory allocated for each compiled shader +constexpr size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 64; /** * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 @@ -37,8 +37,8 @@ public: program(&setup, &state, instruction_labels[offset].getAddress()); } - void Compile(const std::array* program_code, - const std::array* swizzle_data); + void Compile(const std::array* program_code, + const std::array* swizzle_data); void Compile_ADD(Instruction instr); void Compile_DP3(Instruction instr); @@ -104,11 +104,11 @@ private: */ void FindReturnOffsets(); - const std::array* program_code = nullptr; - const std::array* swizzle_data = nullptr; + const std::array* program_code = nullptr; + const std::array* swizzle_data = nullptr; /// Mapping of Pica VS instructions to pointers in the emitted code - std::array instruction_labels; + std::array instruction_labels; /// Offsets in code where a return needs to be inserted std::vector return_offsets; From 3fd3775d35c3634f7fdf7a1e9933e2d8d6e8198e Mon Sep 17 00:00:00 2001 From: Jannik Vogel Date: Sat, 17 Dec 2016 13:28:59 +0100 Subject: [PATCH 2/3] Pica: Write shader registers in functions The commit after this one adds GS register writes, so this moves the VS handlers into functions so they can be re-used and extended more easily. --- src/video_core/command_processor.cpp | 160 +++++++++++++++++---------- 1 file changed, 103 insertions(+), 57 deletions(-) diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 2e32ff905..c29ad6775 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -32,12 +32,13 @@ namespace Pica { namespace CommandProcessor { -static int float_regs_counter = 0; +static int vs_float_regs_counter = 0; +static u32 vs_uniform_write_buffer[4]; -static u32 uniform_write_buffer[4]; +static int gs_float_regs_counter = 0; +static u32 gs_uniform_write_buffer[4]; static int default_attr_counter = 0; - static u32 default_attr_write_buffer[3]; // Expand a 4-bit mask to 4-byte mask, e.g. 0b0101 -> 0x00FF00FF @@ -48,6 +49,97 @@ static const u32 expand_bits_to_bytes[] = { MICROPROFILE_DEFINE(GPU_Drawing, "GPU", "Drawing", MP_RGB(50, 50, 240)); +static const char* GetShaderSetupTypeName(Shader::ShaderSetup& setup) { + if (&setup == &g_state.vs) { + return "vertex shader"; + } + if (&setup == &g_state.gs) { + return "geometry shader"; + } + return "unknown shader"; +} + +static void WriteUniformBoolReg(Shader::ShaderSetup& setup, u32 value) { + for (unsigned i = 0; i < setup.uniforms.b.size(); ++i) + setup.uniforms.b[i] = (value & (1 << i)) != 0; +} + +static void WriteUniformIntReg(Shader::ShaderSetup& setup, unsigned index, + const Math::Vec4& values) { + ASSERT(index < setup.uniforms.i.size()); + setup.uniforms.i[index] = values; + LOG_TRACE(HW_GPU, "Set %s integer uniform %d to %02x %02x %02x %02x", + GetShaderSetupTypeName(setup), index, values.x, values.y, values.z, values.w); +} + +static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup, + int& float_regs_counter, u32 uniform_write_buffer[4], u32 value) { + auto& uniform_setup = config.uniform_setup; + + // TODO: Does actual hardware indeed keep an intermediate buffer or does + // it directly write the values? + uniform_write_buffer[float_regs_counter++] = value; + + // Uniforms are written in a packed format such that four float24 values are encoded in + // three 32-bit numbers. We write to internal memory once a full such vector is + // written. + if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || + (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { + float_regs_counter = 0; + + auto& uniform = setup.uniforms.f[uniform_setup.index]; + + if (uniform_setup.index >= 96) { + LOG_ERROR(HW_GPU, "Invalid %s float uniform index %d", GetShaderSetupTypeName(setup), + (int)uniform_setup.index); + } else { + + // NOTE: The destination component order indeed is "backwards" + if (uniform_setup.IsFloat32()) { + for (auto i : {0, 1, 2, 3}) + uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); + } else { + // TODO: Untested + uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); + uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | + ((uniform_write_buffer[1] >> 16) & 0xFFFF)); + uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | + ((uniform_write_buffer[2] >> 24) & 0xFF)); + uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); + } + + LOG_TRACE(HW_GPU, "Set %s float uniform %x to (%f %f %f %f)", + GetShaderSetupTypeName(setup), (int)uniform_setup.index, + uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), + uniform.w.ToFloat32()); + + // TODO: Verify that this actually modifies the register! + uniform_setup.index.Assign(uniform_setup.index + 1); + } + } +} + +static void WriteProgramCode(ShaderRegs& config, Shader::ShaderSetup& setup, + unsigned max_program_code_length, u32 value) { + if (config.program.offset >= max_program_code_length) { + LOG_ERROR(HW_GPU, "Invalid %s program offset %d", GetShaderSetupTypeName(setup), + (int)config.program.offset); + } else { + setup.program_code[config.program.offset] = value; + config.program.offset++; + } +} + +static void WriteSwizzlePatterns(ShaderRegs& config, Shader::ShaderSetup& setup, u32 value) { + if (config.swizzle_patterns.offset >= setup.swizzle_data.size()) { + LOG_ERROR(HW_GPU, "Invalid %s swizzle pattern offset %d", GetShaderSetupTypeName(setup), + (int)config.swizzle_patterns.offset); + } else { + setup.swizzle_data[config.swizzle_patterns.offset] = value; + config.swizzle_patterns.offset++; + } +} + static void WritePicaReg(u32 id, u32 value, u32 mask) { auto& regs = g_state.regs; @@ -331,20 +423,17 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { } case PICA_REG_INDEX(vs.bool_uniforms): - for (unsigned i = 0; i < 16; ++i) - g_state.vs.uniforms.b[i] = (regs.vs.bool_uniforms.Value() & (1 << i)) != 0; - + WriteUniformBoolReg(g_state.vs, value); break; case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1): case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[1], 0x2b2): case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3): case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4): { - int index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1)); + unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1)); auto values = regs.vs.int_uniforms[index]; - g_state.vs.uniforms.i[index] = Math::Vec4(values.x, values.y, values.z, values.w); - LOG_TRACE(HW_GPU, "Set integer uniform %d to %02x %02x %02x %02x", index, values.x.Value(), - values.y.Value(), values.z.Value(), values.w.Value()); + WriteUniformIntReg(g_state.vs, index, + Math::Vec4(values.x, values.y, values.z, values.w)); break; } @@ -356,51 +445,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[5], 0x2c6): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7): case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8): { - auto& uniform_setup = regs.vs.uniform_setup; - - // TODO: Does actual hardware indeed keep an intermediate buffer or does - // it directly write the values? - uniform_write_buffer[float_regs_counter++] = value; - - // Uniforms are written in a packed format such that four float24 values are encoded in - // three 32-bit numbers. We write to internal memory once a full such vector is - // written. - if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || - (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { - float_regs_counter = 0; - - auto& uniform = g_state.vs.uniforms.f[uniform_setup.index]; - - if (uniform_setup.index > 95) { - LOG_ERROR(HW_GPU, "Invalid VS uniform index %d", (int)uniform_setup.index); - break; - } - - // NOTE: The destination component order indeed is "backwards" - if (uniform_setup.IsFloat32()) { - for (auto i : {0, 1, 2, 3}) - uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); - } else { - // TODO: Untested - uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); - uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | - ((uniform_write_buffer[1] >> 16) & 0xFFFF)); - uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | - ((uniform_write_buffer[2] >> 24) & 0xFF)); - uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); - } - - LOG_TRACE(HW_GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index, - uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), - uniform.w.ToFloat32()); - - // TODO: Verify that this actually modifies the register! - uniform_setup.index.Assign(uniform_setup.index + 1); - } + WriteUniformFloatReg(g_state.regs.vs, g_state.vs, vs_float_regs_counter, + vs_uniform_write_buffer, value); break; } - // Load shader program code case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[0], 0x2cc): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[1], 0x2cd): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[2], 0x2ce): @@ -409,12 +458,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[5], 0x2d1): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2): case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3): { - g_state.vs.program_code[regs.vs.program.offset] = value; - regs.vs.program.offset++; + WriteProgramCode(g_state.regs.vs, g_state.vs, 512, value); break; } - // Load swizzle pattern data case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[0], 0x2d6): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[1], 0x2d7): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[2], 0x2d8): @@ -423,8 +470,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[5], 0x2db): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc): case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd): { - g_state.vs.swizzle_data[regs.vs.swizzle_patterns.offset] = value; - regs.vs.swizzle_patterns.offset++; + WriteSwizzlePatterns(g_state.regs.vs, g_state.vs, value); break; } From ba722be2ac86a7ef703400fa23b376df2f022190 Mon Sep 17 00:00:00 2001 From: Jannik Vogel Date: Thu, 22 Sep 2016 22:42:36 +0200 Subject: [PATCH 3/3] Pica: Write GS registers This adds the handlers for the geometry shader register writes which will call the functions from the previous commit to update registers for the GS. --- src/video_core/command_processor.cpp | 52 ++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index c29ad6775..9a09f81dc 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -422,6 +422,58 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { break; } + case PICA_REG_INDEX(gs.bool_uniforms): + WriteUniformBoolReg(g_state.gs, value); + break; + + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281): + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[1], 0x282): + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[2], 0x283): + case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[3], 0x284): { + unsigned index = (id - PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281)); + auto values = regs.gs.int_uniforms[index]; + WriteUniformIntReg(g_state.gs, index, + Math::Vec4(values.x, values.y, values.z, values.w)); + break; + } + + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[0], 0x291): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[1], 0x292): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[2], 0x293): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[3], 0x294): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[4], 0x295): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[5], 0x296): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[6], 0x297): + case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[7], 0x298): { + WriteUniformFloatReg(g_state.regs.gs, g_state.gs, gs_float_regs_counter, + gs_uniform_write_buffer, value); + break; + } + + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[0], 0x29c): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[1], 0x29d): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[2], 0x29e): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[3], 0x29f): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[4], 0x2a0): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2): + case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3): { + WriteProgramCode(g_state.regs.gs, g_state.gs, 4096, value); + break; + } + + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[0], 0x2a6): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[1], 0x2a7): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[2], 0x2a8): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[3], 0x2a9): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[4], 0x2aa): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac): + case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad): { + WriteSwizzlePatterns(g_state.regs.gs, g_state.gs, value); + break; + } + case PICA_REG_INDEX(vs.bool_uniforms): WriteUniformBoolReg(g_state.vs, value); break;