From 3e925b77074ec50fe5f6c825d12c56cb5b9d44f5 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Sat, 23 Sep 2023 22:53:57 +0200 Subject: [PATCH] Latte: Bound uniform buffers based on access patterns within the shader --- src/Cafe/HW/Latte/Core/LatteBufferData.cpp | 12 +-- .../LegacyShaderDecompiler/LatteDecompiler.h | 13 ++-- .../LatteDecompilerAnalyzer.cpp | 74 +++++++++---------- .../LatteDecompilerEmitGLSLHeader.hpp | 30 +------- .../LatteDecompilerInternal.h | 70 ++++++++++++++---- .../Renderer/Vulkan/VulkanRendererCore.cpp | 8 +- 6 files changed, 114 insertions(+), 93 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteBufferData.cpp b/src/Cafe/HW/Latte/Core/LatteBufferData.cpp index d31a8651..85d4cdf7 100644 --- a/src/Cafe/HW/Latte/Core/LatteBufferData.cpp +++ b/src/Cafe/HW/Latte/Core/LatteBufferData.cpp @@ -132,22 +132,18 @@ void LatteBufferCache_syncGPUUniformBuffers(LatteDecompilerShader* shader, const { if (shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) { - // use full uniform buffers - for (sint32 t = 0; t < shader->uniformBufferListCount; t++) + for(const auto& buf : shader->list_quickBufferList) { - sint32 i = shader->uniformBufferList[t]; + sint32 i = buf.index; MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0]; uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1; - - if (physicalAddr == MPTR_NULL) + if (physicalAddr == MPTR_NULL) [[unlikely]] { - // no data g_renderer->buffer_bindUniformBuffer(shaderType, i, 0, 0); continue; } - + uniformSize = std::min(uniformSize, buf.size); uint32 bindOffset = LatteBufferCache_retrieveDataInCache(physicalAddr, uniformSize); - g_renderer->buffer_bindUniformBuffer(shaderType, i, bindOffset, uniformSize); } } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h index f7a0ea5f..92777844 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompiler.h @@ -1,6 +1,7 @@ #pragma once #include "Cafe/HW/Latte/Core/LatteConst.h" #include "Cafe/HW/Latte/Renderer/RendererShader.h" +#include namespace LatteDecompiler { @@ -158,11 +159,13 @@ struct LatteDecompilerShader struct LatteFetchShader* compatibleFetchShader{}; // error tracking bool hasError{false}; // if set, the shader cannot be used - // optimized access / iteration - // list of uniform buffers used - uint8 uniformBufferList[LATTE_NUM_MAX_UNIFORM_BUFFERS]; - uint8 uniformBufferListCount{ 0 }; - // list of used texture units (faster access than iterating textureUnitMask) + // compact resource lists for optimized access + struct QuickBufferEntry + { + uint8 index; + uint16 size; + }; + boost::container::static_vector list_quickBufferList; uint8 textureUnitList[LATTE_NUM_MAX_TEX_UNITS]; uint8 textureUnitListCount{ 0 }; // input diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp index e482be2c..7285d312 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerAnalyzer.cpp @@ -230,47 +230,39 @@ void LatteDecompiler_analyzeALUClause(LatteDecompilerShaderContext* shaderContex // check input for uniform access if( aluInstruction.sourceOperand[f].sel == 0xFFFFFFFF ) continue; // source operand not set/used + // about uniform register and buffer access tracking: + // for absolute indices we can determine a maximum size that is accessed + // relative accesses are tricky because the upper bound of accessed indices is unknown + // worst case we have to load the full file (256 * 16 byte entries) or for buffers an arbitrary upper bound (64KB in our case) if( GPU7_ALU_SRC_IS_CFILE(aluInstruction.sourceOperand[f].sel) ) { - // uniform register access - - // relative register file accesses are tricky because the range of possible indices is unknown - // worst case we have to load the full file (256 * 16 byte entries) - // by tracking the accessed base indices the shader analyzer can determine bounds for the potentially accessed ranges - - shaderContext->analyzer.uniformRegisterAccess = true; if (aluInstruction.sourceOperand[f].rel) { - shaderContext->analyzer.uniformRegisterDynamicAccess = true; - shaderContext->analyzer.uniformRegisterAccessIndices.emplace_back(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), true); + shaderContext->analyzer.uniformRegisterAccessTracker.TrackAccess(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), true); } else { _remapUniformAccess(shaderContext, true, 0, GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel)); - shaderContext->analyzer.uniformRegisterAccessIndices.emplace_back(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), false); + shaderContext->analyzer.uniformRegisterAccessTracker.TrackAccess(GPU7_ALU_SRC_GET_CFILE_INDEX(aluInstruction.sourceOperand[f].sel), false); } } else if( GPU7_ALU_SRC_IS_CBANK0(aluInstruction.sourceOperand[f].sel) ) { // uniform bank 0 (uniform buffer with index cfInstruction->cBank0Index) uint32 uniformBufferIndex = cfInstruction->cBank0Index; - if( uniformBufferIndex >= LATTE_NUM_MAX_UNIFORM_BUFFERS) - debugBreakpoint(); - shaderContext->analyzer.uniformBufferAccessMask |= (1<analyzer.uniformBufferDynamicAccessMask |= (1<cBank0AddrBase); + cemu_assert(uniformBufferIndex < LATTE_NUM_MAX_UNIFORM_BUFFERS); + uint32 offset = GPU7_ALU_SRC_GET_CBANK0_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank0AddrBase; + _remapUniformAccess(shaderContext, false, uniformBufferIndex, offset); + shaderContext->analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(offset, aluInstruction.sourceOperand[f].rel); } else if( GPU7_ALU_SRC_IS_CBANK1(aluInstruction.sourceOperand[f].sel) ) { // uniform bank 1 (uniform buffer with index cfInstruction->cBank1Index) uint32 uniformBufferIndex = cfInstruction->cBank1Index; - if( uniformBufferIndex >= LATTE_NUM_MAX_UNIFORM_BUFFERS) - debugBreakpoint(); - shaderContext->analyzer.uniformBufferAccessMask |= (1<analyzer.uniformBufferDynamicAccessMask |= (1<cBank1AddrBase); + cemu_assert(uniformBufferIndex < LATTE_NUM_MAX_UNIFORM_BUFFERS); + uint32 offset = GPU7_ALU_SRC_GET_CBANK1_INDEX(aluInstruction.sourceOperand[f].sel)+cfInstruction->cBank1AddrBase; + _remapUniformAccess(shaderContext, false, uniformBufferIndex, offset); + shaderContext->analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(offset, aluInstruction.sourceOperand[f].rel); } else if( GPU7_ALU_SRC_IS_GPR(aluInstruction.sourceOperand[f].sel) ) { @@ -360,8 +352,7 @@ void LatteDecompiler_analyzeTEXClause(LatteDecompilerShaderContext* shaderContex if( texInstruction.textureFetch.textureIndex >= 0x80 && texInstruction.textureFetch.textureIndex <= 0x8F ) { uint32 uniformBufferIndex = texInstruction.textureFetch.textureIndex - 0x80; - shaderContext->analyzer.uniformBufferAccessMask |= (1<analyzer.uniformBufferDynamicAccessMask |= (1<analyzer.uniformBufferAccessTracker[uniformBufferIndex].TrackAccess(0, true); } else if( texInstruction.textureFetch.textureIndex == 0x9F && shader->shaderType == LatteConst::ShaderType::Geometry ) { @@ -576,7 +567,7 @@ namespace LatteDecompiler // for Vulkan we use consecutive indices for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) { - if ((decompilerContext->analyzer.uniformBufferAccessMask&(1 << i)) == 0) + if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) continue; sint32 uniformBindingPoint = i; if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) @@ -592,7 +583,7 @@ namespace LatteDecompiler // for OpenGL we use the relative buffer index for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) { - if ((decompilerContext->analyzer.uniformBufferAccessMask&(1 << i)) == 0) + if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) continue; sint32 uniformBindingPoint = i; if (decompilerContext->shaderType == LatteConst::ShaderType::Geometry) @@ -765,17 +756,24 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD LatteDecompiler_analyzeSubroutine(shaderContext, subroutineAddr); } // decide which uniform mode to use - if(shaderContext->analyzer.uniformBufferAccessMask != 0 && shaderContext->analyzer.uniformRegisterAccess ) - debugBreakpoint(); // not allowed - if(shaderContext->analyzer.uniformBufferDynamicAccessMask != 0 ) + bool hasAnyDynamicBufferAccess = false; + bool hasAnyBufferAccess = false; + for(auto& it : shaderContext->analyzer.uniformBufferAccessTracker) + { + if( it.HasRelativeAccess() ) + hasAnyDynamicBufferAccess = true; + if( it.HasAccess() ) + hasAnyBufferAccess = true; + } + if (hasAnyDynamicBufferAccess) { shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK; } - else if(shaderContext->analyzer.uniformRegisterDynamicAccess ) + else if(shaderContext->analyzer.uniformRegisterAccessTracker.HasRelativeAccess() ) { shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE; } - else if(shaderContext->analyzer.uniformBufferAccessMask != 0 || shaderContext->analyzer.uniformRegisterAccess != 0 ) + else if(hasAnyBufferAccess || shaderContext->analyzer.uniformRegisterAccessTracker.HasAccess() ) { shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_REMAPPED; } @@ -783,16 +781,18 @@ void LatteDecompiler_analyze(LatteDecompilerShaderContext* shaderContext, LatteD { shader->uniformMode = LATTE_DECOMPILER_UNIFORM_MODE_NONE; } - // generate list of uniform buffers based on uniformBufferAccessMask (for faster access) - shader->uniformBufferListCount = 0; + // generate compact list of uniform buffers (for faster access) + cemu_assert_debug(shader->list_quickBufferList.empty()); for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) { - if( !HAS_FLAG(shaderContext->analyzer.uniformBufferAccessMask, (1<analyzer.uniformBufferAccessTracker[i].HasAccess() ) continue; - shader->uniformBufferList[shader->uniformBufferListCount] = i; - shader->uniformBufferListCount++; + LatteDecompilerShader::QuickBufferEntry entry; + entry.index = i; + entry.size = shaderContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE) * 16; + shader->list_quickBufferList.push_back(entry); } - // get dimension of each used textures + // get dimension of each used texture _LatteRegisterSetTextureUnit* texRegs = nullptr; if( shader->shaderType == LatteConst::ShaderType::Vertex ) texRegs = shaderContext->contextRegistersNew->SQ_TEX_START_VS; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSLHeader.hpp index 0bd4eb6f..21cae093 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitGLSLHeader.hpp @@ -37,36 +37,14 @@ namespace LatteDecompiler } else if (decompilerContext->shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CFILE) { - // here we try to predict the accessed range so we dont have to upload the whole register file - // we assume that if there is a fixed-index access on an index higher than a relative access, it bounds the prior relative access - sint16 highestAccessIndex = -1; - bool highestAccessIndexIsRel = false; - for(auto& accessItr : decompilerContext->analyzer.uniformRegisterAccessIndices) - { - if (accessItr.index > highestAccessIndex || (accessItr.index == highestAccessIndex && accessItr.isRelative && !highestAccessIndexIsRel)) - { - highestAccessIndex = accessItr.index; - highestAccessIndexIsRel = accessItr.isRelative; - } - } - if (highestAccessIndex < 0) - highestAccessIndex = 0; - - uint32 cfileSize; - if (highestAccessIndexIsRel) - cfileSize = 256; - else - cfileSize = highestAccessIndex + 1; - - // full uniform register file has to be present + uint32 cfileSize = decompilerContext->analyzer.uniformRegisterAccessTracker.DetermineSize(256); + // full or partial uniform register file has to be present if (shaderType == LatteConst::ShaderType::Vertex) shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterVS[{}];" _CRLF, cfileSize); else if (shaderType == LatteConst::ShaderType::Pixel) shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterPS[{}];" _CRLF, cfileSize); else if (shaderType == LatteConst::ShaderType::Geometry) shaderSrc->addFmt("uniform ivec4 uf_uniformRegisterGS[{}];" _CRLF, cfileSize); - else - debugBreakpoint(); uniformOffsets.offset_uniformRegister = uniformCurrentOffset; uniformOffsets.count_uniformRegister = cfileSize; uniformCurrentOffset += 16 * cfileSize; @@ -168,7 +146,7 @@ namespace LatteDecompiler { for (uint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) { - if ((decompilerContext->analyzer.uniformBufferAccessMask&(1 << i)) == 0) + if (!decompilerContext->analyzer.uniformBufferAccessTracker[i].HasAccess()) continue; cemu_assert_debug(decompilerContext->output->resourceMappingGL.uniformBuffersBindingPoint[i] >= 0); @@ -178,7 +156,7 @@ namespace LatteDecompiler shaderSrc->addFmt("uniform {}{}" _CRLF, _getShaderUniformBlockInterfaceName(decompilerContext->shaderType), i); shaderSrc->add("{" _CRLF); - shaderSrc->addFmt("vec4 {}{}[{}];" _CRLF, _getShaderUniformBlockVariableName(decompilerContext->shaderType), i, LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE); + shaderSrc->addFmt("vec4 {}{}[{}];" _CRLF, _getShaderUniformBlockVariableName(decompilerContext->shaderType), i, decompilerContext->analyzer.uniformBufferAccessTracker[i].DetermineSize(LATTE_GLSL_DYNAMIC_UNIFORM_BLOCK_SIZE)); shaderSrc->add("};" _CRLF _CRLF); shaderSrc->add(_CRLF); } diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h index 53fb61ef..54112ddf 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerInternal.h @@ -125,19 +125,66 @@ struct LatteDecompilerCFInstruction LatteDecompilerCFInstruction& operator=(LatteDecompilerCFInstruction&& mE) = default; }; -struct LatteDecompilerCFileAccess -{ - LatteDecompilerCFileAccess(uint8 index, bool isRelative) : index(index), isRelative(isRelative) {}; - uint8 index; - bool isRelative; -}; - struct LatteDecompilerSubroutineInfo { uint32 cfAddr; std::vector instructions; }; +// helper struct to track the highest accessed offset within a buffer +struct LatteDecompilerBufferAccessTracker +{ + bool hasStaticIndexAccess{false}; + bool hasDynamicIndexAccess{false}; + sint32 highestAccessDynamicIndex{0}; + sint32 highestAccessStaticIndex{0}; + + // track access, index is the array index and not a byte offset + void TrackAccess(sint32 index, bool isDynamicIndex) + { + if (isDynamicIndex) + { + hasDynamicIndexAccess = true; + if (index > highestAccessDynamicIndex) + highestAccessDynamicIndex = index; + } + else + { + hasStaticIndexAccess = true; + if (index > highestAccessStaticIndex) + highestAccessStaticIndex = index; + } + } + + sint32 DetermineSize(sint32 maximumSize) const + { + // here we try to predict the accessed range so we dont have to upload the whole buffer + // potential risky optimization: assume that if there is a fixed-index access on an index higher than any other non-zero relative accesses, it bounds the prior relative access + sint32 highestAccessIndex = -1; + if(hasStaticIndexAccess) + { + highestAccessIndex = highestAccessStaticIndex; + } + if(hasDynamicIndexAccess) + { + return maximumSize; // dynamic index exists and no bound can be determined + } + if (highestAccessIndex < 0) + return 1; // no access at all? But avoid zero as a size + return highestAccessIndex + 1; + } + + bool HasAccess() const + { + return hasStaticIndexAccess || hasDynamicIndexAccess; + } + + bool HasRelativeAccess() const + { + return hasDynamicIndexAccess; + } +}; + struct LatteDecompilerShaderContext { LatteDecompilerOutput_t* output; @@ -174,12 +221,9 @@ struct LatteDecompilerShaderContext bool isPointsPrimitive{}; // set if current render primitive is points bool outputPointSize{}; // set if the current shader should output the point size std::bitset<256> inputAttributSemanticMask; // one set bit for every used semanticId - todo: there are only 128 bit available semantic locations? The MSB has special meaning? - // uniform - bool uniformRegisterAccess; // set to true if cfile (uniform register) is accessed - bool uniformRegisterDynamicAccess; // set to true if cfile (uniform register) is accessed with a dynamic index - uint32 uniformBufferAccessMask; // 1 bit per buffer, set if the uniform buffer is accessed - uint32 uniformBufferDynamicAccessMask; // 1 bit per buffer, set if the uniform buffer is accessed by dynamic index - std::vector uniformRegisterAccessIndices; + // uniforms + LatteDecompilerBufferAccessTracker uniformRegisterAccessTracker; + LatteDecompilerBufferAccessTracker uniformBufferAccessTracker[LATTE_NUM_MAX_UNIFORM_BUFFERS]; // ssbo bool hasSSBORead; // shader has instructions that read from SSBO bool hasSSBOWrite; // shader has instructions that write to SSBO diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp index 9b47a14b..5bffcc68 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp @@ -1591,10 +1591,9 @@ void VulkanRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader { if (shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) { - // use full uniform buffers - for (sint32 t = 0; t < shader->uniformBufferListCount; t++) + for(const auto& buf : shader->list_quickBufferList) { - sint32 i = shader->uniformBufferList[t]; + sint32 i = buf.index; MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0]; uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1; @@ -1603,6 +1602,7 @@ void VulkanRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader cemu_assert_unimplemented(); continue; } + uniformSize = std::min(uniformSize, buf.size); cemu_assert_debug(physicalAddr < 0x50000000); @@ -1621,7 +1621,7 @@ void VulkanRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader dynamicOffsetInfo.shaderUB[VulkanRendererConst::SHADER_STAGE_INDEX_FRAGMENT].unformBufferOffset[bufferIndex] = physicalAddr - m_importedMemBaseAddress; break; default: - cemu_assert_debug(false); + UNREACHABLE; } } }