From ab41de4f9feffa277ddcf3710d300d78f81dfd2b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 3 Nov 2024 11:53:16 +0100 Subject: [PATCH] use host memory instead of buffer cache when possible --- .../HW/Latte/Renderer/Metal/MetalCommon.h | 4 +- .../Renderer/Metal/MetalMemoryManager.cpp | 21 +++- .../Latte/Renderer/Metal/MetalMemoryManager.h | 13 ++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 116 ++++++++++++++---- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 8 +- 5 files changed, 138 insertions(+), 24 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index d7de0a28..20fd6b9d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -31,7 +31,9 @@ struct MetalQueryRange #define MAX_MTL_BUFFERS 31 // Buffer indices 28-30 are reserved for the helper shaders -#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 4) +#define MTL_RESERVED_BUFFERS 3 +#define MAX_MTL_VERTEX_BUFFERS (MAX_MTL_BUFFERS - MTL_RESERVED_BUFFERS) +#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_VERTEX_BUFFERS - index - 1) #define MAX_MTL_TEXTURES 31 #define MAX_MTL_SAMPLERS 16 diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 1c788e21..4eb4d105 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" +#include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" /* @@ -115,7 +116,23 @@ void MetalMemoryManager::InitBufferCache(size_t size) { cemu_assert_debug(!m_bufferCache); - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); + // First, try to import the host memory as a buffer + // TODO: only import if the option is ticked in game profile + if (m_mtlr->IsAppleGPU()) + { + m_importedMemBaseAddress = 0x10000000; + size_t hostAllocationSize = 0x40000000ull; + // TODO: get size of allocation + m_bufferCache = m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); + if (m_bufferCache) + m_useHostMemoryForCache = true; + else + cemuLog_logDebug(LogType::Force, "Failed to import host memory as a buffer"); + } + + if (!m_useHostMemoryForCache) + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); + #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); #endif @@ -123,6 +140,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t size) { + cemu_assert_debug(!m_useHostMemoryForCache); cemu_assert_debug(m_bufferCache); cemu_assert_debug((offset + size) <= m_bufferCache->length()); @@ -147,6 +165,7 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) { + cemu_assert_debug(!m_useHostMemoryForCache); cemu_assert_debug(m_bufferCache); m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 4ea5769e..4e8b2594 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -115,6 +115,17 @@ public: } */ + // Getters + bool UseHostMemoryForCache() const + { + return m_useHostMemoryForCache; + } + + MPTR GetImportedMemBaseAddress() const + { + return m_importedMemBaseAddress; + } + private: class MetalRenderer* m_mtlr; @@ -126,4 +137,6 @@ private: //MetalVertexBufferCache m_vertexBufferCache; MTL::Buffer* m_bufferCache = nullptr; + bool m_useHostMemoryForCache = false; + MPTR m_importedMemBaseAddress; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 8b3377ac..7c80a0bc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -30,8 +30,6 @@ #include "imgui/imgui_extension.h" #include "imgui/imgui_impl_metal.h" -#define DEFAULT_COMMIT_TRESHOLD 196 - extern bool hasValidFramebufferAttached; float supportBufferData[512 * 4]; @@ -90,6 +88,12 @@ MetalRenderer::MetalRenderer() m_depthStencilCache = new MetalDepthStencilCache(this); m_samplerCache = new MetalSamplerCache(this); + // Lower the commit treshold when host memory is used for cache to reduce latency + if (m_memoryManager->UseHostMemoryForCache()) + m_defaultCommitTreshlod = 64; + else + m_defaultCommitTreshlod = 196; + // Occlusion queries m_occlusionQuery.m_resultBuffer = m_device->newBuffer(OCCLUSION_QUERY_POOL_SIZE * sizeof(uint64), MTL::ResourceStorageModeShared); #ifdef CEMU_DEBUG_ASSERT @@ -97,8 +101,11 @@ MetalRenderer::MetalRenderer() #endif m_occlusionQuery.m_resultsPtr = (uint64*)m_occlusionQuery.m_resultBuffer->contents(); - // Initialize state - for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) + // Reset vertex and uniform buffers + for (uint32 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) + m_state.m_vertexBufferOffsets[i] = INVALID_OFFSET; + + for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) { for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) m_state.m_uniformBufferOffsets[i][j] = INVALID_OFFSET; @@ -821,23 +828,28 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { + if (m_memoryManager->UseHostMemoryForCache()) + dstOffset -= m_memoryManager->GetImportedMemBaseAddress(); + CopyBufferToBuffer(GetXfbRingBuffer(), srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex | MTL::RenderStageMesh, ALL_MTL_RENDER_STAGES); } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) { + cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache()); cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS); - auto& buffer = m_state.m_vertexBuffers[bufferIndex]; - if (buffer.offset == offset && buffer.size == size) - return; + + m_state.m_vertexBufferOffsets[bufferIndex] = offset; + //if (buffer.offset == offset && buffer.size == size) + // return; //if (buffer.offset != INVALID_OFFSET) //{ // m_memoryManager->UntrackVertexBuffer(bufferIndex); //} - buffer.offset = offset; - buffer.size = size; + //buffer.offset = offset; + //buffer.size = size; //buffer.restrideInfo = {}; //m_memoryManager->TrackVertexBuffer(bufferIndex, offset, size, &buffer.restrideInfo); @@ -845,6 +857,8 @@ void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, u void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) { + cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache()); + m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shaderType)][bufferIndex] = offset; } @@ -988,9 +1002,24 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 indexBufferIndex = 0; LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); - // synchronize vertex and uniform cache and update buffer bindings - // We need to call this before getting the render command encoder, since it can cause buffer copies - LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount); + // Buffer cache + if (m_memoryManager->UseHostMemoryForCache()) + { + // direct memory access (Wii U memory space imported as a buffer), update buffer bindings + draw_updateVertexBuffersDirectAccess(); + if (vertexShader) + draw_updateUniformBuffersDirectAccess(vertexShader, mmSQ_VTX_UNIFORM_BLOCK_START); + if (geometryShader) + draw_updateUniformBuffersDirectAccess(geometryShader, mmSQ_GS_UNIFORM_BLOCK_START); + if (pixelShader) + draw_updateUniformBuffersDirectAccess(pixelShader, mmSQ_PS_UNIFORM_BLOCK_START); + } + else + { + // synchronize vertex and uniform cache and update buffer bindings + // We need to call this before getting the render command encoder, since it can cause buffer copies + LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount); + } // Render pass auto renderCommandEncoder = GetRenderCommandEncoder(); @@ -1190,10 +1219,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Vertex buffers //std::vector barrierBuffers; - for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) + for (uint8 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) { - auto& vertexBufferRange = m_state.m_vertexBuffers[i]; - if (vertexBufferRange.offset != INVALID_OFFSET) + size_t offset = m_state.m_vertexBufferOffsets[i]; + if (offset != INVALID_OFFSET) { /* MTL::Buffer* buffer; @@ -1218,11 +1247,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } */ - MTL::Buffer* buffer = m_memoryManager->GetBufferCache(); - size_t offset = m_state.m_vertexBuffers[i].offset; - // Bind - SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), buffer, offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); + SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), m_memoryManager->GetBufferCache(), offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); } } @@ -1301,7 +1327,7 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 m_occlusionQuery.m_currentIndex = (m_occlusionQuery.m_currentIndex + 1) % OCCLUSION_QUERY_POOL_SIZE; // Streamout - LatteStreamout_FinishDrawcall(false); + LatteStreamout_FinishDrawcall(m_memoryManager->UseHostMemoryForCache()); // Debug if (fetchVertexManually) @@ -1333,6 +1359,54 @@ void MetalRenderer::draw_endSequence() } } +void MetalRenderer::draw_updateVertexBuffersDirectAccess() +{ + LatteFetchShader* parsedFetchShader = LatteSHRC_GetActiveFetchShader(); + if (!parsedFetchShader) + return; + + for (auto& bufferGroup : parsedFetchShader->bufferGroups) + { + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + MPTR bufferAddress = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 0]; + //uint32 bufferSize = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 1] + 1; + //uint32 bufferStride = (LatteGPUState.contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + if (bufferAddress == MPTR_NULL) [[unlikely]] + bufferAddress = 0x10000000; // TODO: really? + + m_state.m_vertexBufferOffsets[bufferIndex] = bufferAddress - m_memoryManager->GetImportedMemBaseAddress(); + } +} + +void MetalRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset) +{ + if (shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) + { + for (const auto& buf : shader->list_quickBufferList) + { + sint32 i = buf.index; + MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0]; + uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1; + + if (physicalAddr == MPTR_NULL) [[unlikely]] + { + cemu_assert_unimplemented(); + continue; + } + uniformSize = std::min(uniformSize, buf.size); + + cemu_assert_debug(physicalAddr < 0x50000000); + + uint32 bufferIndex = i; + cemu_assert_debug(bufferIndex < 16); + + m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][bufferIndex] = physicalAddr - m_memoryManager->GetImportedMemBaseAddress(); + } + } +} + void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); @@ -1486,7 +1560,7 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() m_commandBuffers.push_back({mtlCommandBuffer}); m_recordedDrawcalls = 0; - m_commitTreshold = DEFAULT_COMMIT_TRESHOLD; + m_commitTreshold = m_defaultCommitTreshlod; // Notify memory manager about the new command buffer m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(mtlCommandBuffer); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 9c1bb2dc..9ddc5e93 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -29,6 +29,7 @@ struct MetalRestrideInfo }; */ +/* struct MetalBoundBuffer { size_t offset = INVALID_OFFSET; @@ -36,6 +37,7 @@ struct MetalBoundBuffer // Memory manager will write restride info to this variable //MetalRestrideInfo restrideInfo; }; +*/ enum MetalGeneralShaderType { @@ -141,7 +143,7 @@ struct MetalState // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change MetalActiveFBOState m_lastUsedFBO; - MetalBoundBuffer m_vertexBuffers[MAX_MTL_BUFFERS] = {{}}; + size_t m_vertexBufferOffsets[MAX_MTL_VERTEX_BUFFERS] = {INVALID_OFFSET}; // TODO: find out what is the max number of bound textures on the Wii U class LatteTextureViewMtl* m_textures[64] = {nullptr}; size_t m_uniformBufferOffsets[METAL_GENERAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; @@ -277,6 +279,9 @@ public: void draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) override; void draw_endSequence() override; + void draw_updateVertexBuffersDirectAccess(); + void draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset); + // index void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override; void indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) override; @@ -506,6 +511,7 @@ private: MTL::CommandEncoder* m_commandEncoder = nullptr; uint32 m_recordedDrawcalls; + uint32 m_defaultCommitTreshlod; uint32 m_commitTreshold; // State