From ab41de4f9feffa277ddcf3710d300d78f81dfd2b Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 3 Nov 2024 11:53:16 +0100 Subject: [PATCH 1/3] use host memory instead of buffer cache when possible --- .../HW/Latte/Renderer/Metal/MetalCommon.h | 4 +- .../Renderer/Metal/MetalMemoryManager.cpp | 21 +++- .../Latte/Renderer/Metal/MetalMemoryManager.h | 13 ++ .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 116 ++++++++++++++---- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 8 +- 5 files changed, 138 insertions(+), 24 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h index d7de0a28..20fd6b9d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalCommon.h @@ -31,7 +31,9 @@ struct MetalQueryRange #define MAX_MTL_BUFFERS 31 // Buffer indices 28-30 are reserved for the helper shaders -#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_BUFFERS - index - 4) +#define MTL_RESERVED_BUFFERS 3 +#define MAX_MTL_VERTEX_BUFFERS (MAX_MTL_BUFFERS - MTL_RESERVED_BUFFERS) +#define GET_MTL_VERTEX_BUFFER_INDEX(index) (MAX_MTL_VERTEX_BUFFERS - index - 1) #define MAX_MTL_TEXTURES 31 #define MAX_MTL_SAMPLERS 16 diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 1c788e21..4eb4d105 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,6 +1,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" +#include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" /* @@ -115,7 +116,23 @@ void MetalMemoryManager::InitBufferCache(size_t size) { cemu_assert_debug(!m_bufferCache); - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); + // First, try to 
import the host memory as a buffer + // TODO: only import if the option is ticked in game profile + if (m_mtlr->IsAppleGPU()) + { + m_importedMemBaseAddress = 0x10000000; + size_t hostAllocationSize = 0x40000000ull; + // TODO: get size of allocation + m_bufferCache = m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); + if (m_bufferCache) + m_useHostMemoryForCache = true; + else + cemuLog_logDebug(LogType::Force, "Failed to import host memory as a buffer"); + } + + if (!m_useHostMemoryForCache) + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); + #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); #endif @@ -123,6 +140,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t size) { + cemu_assert_debug(!m_useHostMemoryForCache); cemu_assert_debug(m_bufferCache); cemu_assert_debug((offset + size) <= m_bufferCache->length()); @@ -147,6 +165,7 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) { + cemu_assert_debug(!m_useHostMemoryForCache); cemu_assert_debug(m_bufferCache); m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 4ea5769e..4e8b2594 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -115,6 +115,17 @@ public: } */ + // Getters + bool UseHostMemoryForCache() const + { + return m_useHostMemoryForCache; + } + + MPTR GetImportedMemBaseAddress() const + { + return m_importedMemBaseAddress; + } + 
private: class MetalRenderer* m_mtlr; @@ -126,4 +137,6 @@ private: //MetalVertexBufferCache m_vertexBufferCache; MTL::Buffer* m_bufferCache = nullptr; + bool m_useHostMemoryForCache = false; + MPTR m_importedMemBaseAddress; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 8b3377ac..7c80a0bc 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -30,8 +30,6 @@ #include "imgui/imgui_extension.h" #include "imgui/imgui_impl_metal.h" -#define DEFAULT_COMMIT_TRESHOLD 196 - extern bool hasValidFramebufferAttached; float supportBufferData[512 * 4]; @@ -90,6 +88,12 @@ MetalRenderer::MetalRenderer() m_depthStencilCache = new MetalDepthStencilCache(this); m_samplerCache = new MetalSamplerCache(this); + // Lower the commit treshold when host memory is used for cache to reduce latency + if (m_memoryManager->UseHostMemoryForCache()) + m_defaultCommitTreshlod = 64; + else + m_defaultCommitTreshlod = 196; + // Occlusion queries m_occlusionQuery.m_resultBuffer = m_device->newBuffer(OCCLUSION_QUERY_POOL_SIZE * sizeof(uint64), MTL::ResourceStorageModeShared); #ifdef CEMU_DEBUG_ASSERT @@ -97,8 +101,11 @@ MetalRenderer::MetalRenderer() #endif m_occlusionQuery.m_resultsPtr = (uint64*)m_occlusionQuery.m_resultBuffer->contents(); - // Initialize state - for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) + // Reset vertex and uniform buffers + for (uint32 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) + m_state.m_vertexBufferOffsets[i] = INVALID_OFFSET; + + for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++) { for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++) m_state.m_uniformBufferOffsets[i][j] = INVALID_OFFSET; @@ -821,23 +828,28 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size) { + if 
(m_memoryManager->UseHostMemoryForCache()) + dstOffset -= m_memoryManager->GetImportedMemBaseAddress(); + CopyBufferToBuffer(GetXfbRingBuffer(), srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex | MTL::RenderStageMesh, ALL_MTL_RENDER_STAGES); } void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size) { + cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache()); cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS); - auto& buffer = m_state.m_vertexBuffers[bufferIndex]; - if (buffer.offset == offset && buffer.size == size) - return; + + m_state.m_vertexBufferOffsets[bufferIndex] = offset; + //if (buffer.offset == offset && buffer.size == size) + // return; //if (buffer.offset != INVALID_OFFSET) //{ // m_memoryManager->UntrackVertexBuffer(bufferIndex); //} - buffer.offset = offset; - buffer.size = size; + //buffer.offset = offset; + //buffer.size = size; //buffer.restrideInfo = {}; //m_memoryManager->TrackVertexBuffer(bufferIndex, offset, size, &buffer.restrideInfo); @@ -845,6 +857,8 @@ void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, u void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size) { + cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache()); + m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shaderType)][bufferIndex] = offset; } @@ -988,9 +1002,24 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 uint32 indexBufferIndex = 0; LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex); - // synchronize vertex and uniform cache and update buffer bindings - // We need to call this before getting the render command encoder, since it can cause buffer copies - LatteBufferCache_Sync(indexMin + baseVertex, indexMax 
+ baseVertex, baseInstance, instanceCount); + // Buffer cache + if (m_memoryManager->UseHostMemoryForCache()) + { + // direct memory access (Wii U memory space imported as a buffer), update buffer bindings + draw_updateVertexBuffersDirectAccess(); + if (vertexShader) + draw_updateUniformBuffersDirectAccess(vertexShader, mmSQ_VTX_UNIFORM_BLOCK_START); + if (geometryShader) + draw_updateUniformBuffersDirectAccess(geometryShader, mmSQ_GS_UNIFORM_BLOCK_START); + if (pixelShader) + draw_updateUniformBuffersDirectAccess(pixelShader, mmSQ_PS_UNIFORM_BLOCK_START); + } + else + { + // synchronize vertex and uniform cache and update buffer bindings + // We need to call this before getting the render command encoder, since it can cause buffer copies + LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount); + } // Render pass auto renderCommandEncoder = GetRenderCommandEncoder(); @@ -1190,10 +1219,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 // Vertex buffers //std::vector barrierBuffers; - for (uint8 i = 0; i < MAX_MTL_BUFFERS; i++) + for (uint8 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) { - auto& vertexBufferRange = m_state.m_vertexBuffers[i]; - if (vertexBufferRange.offset != INVALID_OFFSET) + size_t offset = m_state.m_vertexBufferOffsets[i]; + if (offset != INVALID_OFFSET) { /* MTL::Buffer* buffer; @@ -1218,11 +1247,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } */ - MTL::Buffer* buffer = m_memoryManager->GetBufferCache(); - size_t offset = m_state.m_vertexBuffers[i].offset; - // Bind - SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), buffer, offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); + SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), m_memoryManager->GetBufferCache(), offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); } } @@ -1301,7 +1327,7 @@ void 
MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 m_occlusionQuery.m_currentIndex = (m_occlusionQuery.m_currentIndex + 1) % OCCLUSION_QUERY_POOL_SIZE; // Streamout - LatteStreamout_FinishDrawcall(false); + LatteStreamout_FinishDrawcall(m_memoryManager->UseHostMemoryForCache()); // Debug if (fetchVertexManually) @@ -1333,6 +1359,54 @@ void MetalRenderer::draw_endSequence() } } +void MetalRenderer::draw_updateVertexBuffersDirectAccess() +{ + LatteFetchShader* parsedFetchShader = LatteSHRC_GetActiveFetchShader(); + if (!parsedFetchShader) + return; + + for (auto& bufferGroup : parsedFetchShader->bufferGroups) + { + uint32 bufferIndex = bufferGroup.attributeBufferIndex; + uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; + MPTR bufferAddress = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 0]; + //uint32 bufferSize = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 1] + 1; + //uint32 bufferStride = (LatteGPUState.contextRegister[bufferBaseRegisterIndex + 2] >> 11) & 0xFFFF; + + if (bufferAddress == MPTR_NULL) [[unlikely]] + bufferAddress = 0x10000000; // TODO: really? 
+ + m_state.m_vertexBufferOffsets[bufferIndex] = bufferAddress - m_memoryManager->GetImportedMemBaseAddress(); + } +} + +void MetalRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset) +{ + if (shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) + { + for (const auto& buf : shader->list_quickBufferList) + { + sint32 i = buf.index; + MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0]; + uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1; + + if (physicalAddr == MPTR_NULL) [[unlikely]] + { + cemu_assert_unimplemented(); + continue; + } + uniformSize = std::min(uniformSize, buf.size); + + cemu_assert_debug(physicalAddr < 0x50000000); + + uint32 bufferIndex = i; + cemu_assert_debug(bufferIndex < 16); + + m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][bufferIndex] = physicalAddr - m_memoryManager->GetImportedMemBaseAddress(); + } + } +} + void* MetalRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) { auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator(); @@ -1486,7 +1560,7 @@ MTL::CommandBuffer* MetalRenderer::GetCommandBuffer() m_commandBuffers.push_back({mtlCommandBuffer}); m_recordedDrawcalls = 0; - m_commitTreshold = DEFAULT_COMMIT_TRESHOLD; + m_commitTreshold = m_defaultCommitTreshlod; // Notify memory manager about the new command buffer m_memoryManager->GetTemporaryBufferAllocator().SetActiveCommandBuffer(mtlCommandBuffer); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 9c1bb2dc..9ddc5e93 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -29,6 +29,7 @@ struct MetalRestrideInfo }; */ +/* struct MetalBoundBuffer { size_t offset = INVALID_OFFSET; @@ -36,6 +37,7 @@ struct MetalBoundBuffer // 
Memory manager will write restride info to this variable //MetalRestrideInfo restrideInfo; }; +*/ enum MetalGeneralShaderType { @@ -141,7 +143,7 @@ struct MetalState // If the FBO changes, but it's the same FBO as the last one with some omitted attachments, this FBO doesn't change MetalActiveFBOState m_lastUsedFBO; - MetalBoundBuffer m_vertexBuffers[MAX_MTL_BUFFERS] = {{}}; + size_t m_vertexBufferOffsets[MAX_MTL_VERTEX_BUFFERS] = {INVALID_OFFSET}; // TODO: find out what is the max number of bound textures on the Wii U class LatteTextureViewMtl* m_textures[64] = {nullptr}; size_t m_uniformBufferOffsets[METAL_GENERAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS]; @@ -277,6 +279,9 @@ public: void draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) override; void draw_endSequence() override; + void draw_updateVertexBuffersDirectAccess(); + void draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset); + // index void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override; void indexData_uploadIndexMemory(uint32 bufferIndex, uint32 offset, uint32 size) override; @@ -506,6 +511,7 @@ private: MTL::CommandEncoder* m_commandEncoder = nullptr; uint32 m_recordedDrawcalls; + uint32 m_defaultCommitTreshlod; uint32 m_commitTreshold; // State From 03d4e86b617835e12664a6db24a561d52d8238fc Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 3 Nov 2024 12:09:47 +0100 Subject: [PATCH 2/3] add an option to use the host memory instead of buffer cache --- src/Cafe/CafeSystem.cpp | 1 + src/Cafe/GameProfile/GameProfile.cpp | 14 +++++++++----- src/Cafe/GameProfile/GameProfile.h | 2 ++ .../HW/Latte/Renderer/Metal/MetalMemoryManager.cpp | 6 +++--- src/gui/GameProfileWindow.cpp | 9 +++++++++ src/gui/GameProfileWindow.h | 3 ++- 6 files changed, 26 insertions(+), 9 deletions(-) diff --git 
a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index 40d26a67..08228b62 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -258,6 +258,7 @@ void InfoLog_PrintActiveSettings() { cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); cemuLog_log(LogType::Force, "Fast math: {}", GetConfig().fast_math.GetValue() ? "true" : "false"); + cemuLog_log(LogType::Force, "Use host memory for cache: {}", g_current_game_profile->UseHostMemForCache() ? "true" : "false"); if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); } diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index ee92107a..337786ed 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -127,7 +127,7 @@ bool gameProfile_loadIntegerOption(IniParser& iniParser, const char* optionName, { cemuLog_log(LogType::Force, "Value '{}' is out of range for option '{}' in game profile", *option_value, optionName); return false; - } + } } template @@ -224,8 +224,9 @@ bool GameProfile::Load(uint64_t title_id) gameProfile_loadIntegerOption(&iniParser, "graphics_api", &graphicsApi, -1, 0, 1); if (graphicsApi.value != -1) m_graphics_api = (GraphicAPI)graphicsApi.value; - + gameProfile_loadEnumOption(iniParser, "accurateShaderMul", m_accurateShaderMul); + gameProfile_loadBooleanOption2(iniParser, "useHostMemForCache", m_useHostMemForCache); // legacy support auto option_precompiledShaders = iniParser.FindOption("precompiledShaders"); @@ -277,7 +278,7 @@ bool GameProfile::Load(uint64_t title_id) void GameProfile::Save(uint64_t title_id) { auto gameProfileDir = ActiveSettings::GetConfigPath("gameProfiles"); - if (std::error_code ex_ec; !fs::exists(gameProfileDir, ex_ec)) + if (std::error_code ex_ec; !fs::exists(gameProfileDir, ex_ec)) fs::create_directories(gameProfileDir, ex_ec); auto gameProfilePath = gameProfileDir / 
fmt::format("{:016x}.ini", title_id); FileStream* fs = FileStream::createFile2(gameProfilePath); @@ -308,6 +309,7 @@ void GameProfile::Save(uint64_t title_id) fs->writeLine("[Graphics]"); WRITE_ENTRY(accurateShaderMul); + WRITE_ENTRY(useHostMemForCache); WRITE_OPTIONAL_ENTRY(precompiledShaders); WRITE_OPTIONAL_ENTRY(graphics_api); fs->writeLine(""); @@ -337,6 +339,7 @@ void GameProfile::ResetOptional() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; + m_useHostMemForCache = false; // cpu settings m_threadQuantum = kThreadQuantumDefault; m_cpuMode.reset(); // CPUModeOption::kSingleCoreRecompiler; @@ -354,9 +357,10 @@ void GameProfile::Reset() // general settings m_loadSharedLibraries = true; m_startWithPadView = false; - + // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; + m_useHostMemForCache = false; m_precompiledShaders = PrecompiledShaderOption::Auto; // cpu settings m_threadQuantum = kThreadQuantumDefault; @@ -366,4 +370,4 @@ void GameProfile::Reset() // controller settings for (auto& profile : m_controllerProfile) profile.reset(); -} \ No newline at end of file +} diff --git a/src/Cafe/GameProfile/GameProfile.h b/src/Cafe/GameProfile/GameProfile.h index 6a1f2ebd..e2ab29f7 100644 --- a/src/Cafe/GameProfile/GameProfile.h +++ b/src/Cafe/GameProfile/GameProfile.h @@ -31,6 +31,7 @@ public: [[nodiscard]] const std::optional& GetGraphicsAPI() const { return m_graphics_api; } [[nodiscard]] const AccurateShaderMulOption& GetAccurateShaderMul() const { return m_accurateShaderMul; } + [[nodiscard]] bool UseHostMemForCache() const { return m_useHostMemForCache; } [[nodiscard]] const std::optional& GetPrecompiledShadersState() const { return m_precompiledShaders; } [[nodiscard]] uint32 GetThreadQuantum() const { return m_threadQuantum; } @@ -54,6 +55,7 @@ private: // graphic settings std::optional m_graphics_api{}; AccurateShaderMulOption m_accurateShaderMul = AccurateShaderMulOption::True; + bool m_useHostMemForCache = 
false; std::optional m_precompiledShaders{}; // cpu settings uint32 m_threadQuantum = kThreadQuantumDefault; // values: 20000 45000 60000 80000 100000 diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 4eb4d105..5f02847a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,8 +1,10 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h" + #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" +#include "GameProfile/GameProfile.h" /* MetalVertexBufferCache::~MetalVertexBufferCache() @@ -117,8 +119,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) cemu_assert_debug(!m_bufferCache); // First, try to import the host memory as a buffer - // TODO: only import if the option is ticked in game profile - if (m_mtlr->IsAppleGPU()) + if (g_current_game_profile->UseHostMemForCache() && m_mtlr->IsAppleGPU()) { m_importedMemBaseAddress = 0x10000000; size_t hostAllocationSize = 0x40000000ull; @@ -165,7 +166,6 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) { - cemu_assert_debug(!m_useHostMemoryForCache); cemu_assert_debug(m_bufferCache); m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index 76b8801c..c1aa63e4 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -127,6 +127,13 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) m_shader_mul_accuracy->SetToolTip(_("EXPERT OPTION\nControls the accuracy of floating point multiplication in 
shaders.\n\nRecommended: true")); first_row->Add(m_shader_mul_accuracy, 0, wxALL, 5); + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Use host memory for cache")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + + wxString mem_values[] = { _("false"), _("true")}; + m_use_host_mem_for_cache = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(mem_values), mem_values); + m_use_host_mem_for_cache->SetToolTip(_("EXPERT OPTION\nAllows the GPU to access data directly without the need for an intermediate cache. May increase performance and reduce memory usage, but can also cause flickering.\n\nMetal only\n\nRecommended: false")); + first_row->Add(m_use_host_mem_for_cache, 0, wxALL, 5); + /*first_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU buffer cache accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString accuarcy_values[] = { _("high"), _("medium"), _("low") }; m_cache_accuracy = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(accuarcy_values), accuarcy_values); @@ -273,6 +280,7 @@ void GameProfileWindow::ApplyProfile() else m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan, Metal m_shader_mul_accuracy->SetSelection((int)m_game_profile.m_accurateShaderMul); + m_use_host_mem_for_cache->SetSelection((int)m_game_profile.m_useHostMemForCache); //// audio //m_disable_audio->Set3StateValue(GetCheckboxState(m_game_profile.disableAudio)); @@ -332,6 +340,7 @@ void GameProfileWindow::SaveProfile() // gpu m_game_profile.m_accurateShaderMul = (AccurateShaderMulOption)m_shader_mul_accuracy->GetSelection(); + m_game_profile.m_useHostMemForCache = (bool)m_use_host_mem_for_cache->GetSelection(); if (m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::False && m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::True) m_game_profile.m_accurateShaderMul = AccurateShaderMulOption::True; // force a legal value diff --git a/src/gui/GameProfileWindow.h 
b/src/gui/GameProfileWindow.h index 6ca36de6..a1fe8132 100644 --- a/src/gui/GameProfileWindow.h +++ b/src/gui/GameProfileWindow.h @@ -40,6 +40,7 @@ private: wxChoice* m_graphic_api; wxChoice* m_shader_mul_accuracy; + wxChoice* m_use_host_mem_for_cache; //wxChoice* m_cache_accuracy; // audio @@ -47,4 +48,4 @@ private: // controller wxComboBox* m_controller_profile[8]; -}; \ No newline at end of file +}; From b38ca6a58ad757368088d58ea2400eb1825719d7 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sun, 3 Nov 2024 12:43:35 +0100 Subject: [PATCH 3/3] add an option to choose buffer cache type --- src/Cafe/CafeSystem.cpp | 2 +- src/Cafe/GameProfile/GameProfile.cpp | 8 +-- src/Cafe/GameProfile/GameProfile.h | 4 +- .../Renderer/Metal/MetalMemoryManager.cpp | 51 ++++++++++++------- .../Latte/Renderer/Metal/MetalMemoryManager.h | 6 ++- src/config/CemuConfig.h | 23 +++++++++ src/gui/GameProfileWindow.cpp | 15 +++--- src/gui/GameProfileWindow.h | 2 +- 8 files changed, 75 insertions(+), 36 deletions(-) diff --git a/src/Cafe/CafeSystem.cpp b/src/Cafe/CafeSystem.cpp index 08228b62..7ba93fc8 100644 --- a/src/Cafe/CafeSystem.cpp +++ b/src/Cafe/CafeSystem.cpp @@ -258,7 +258,7 @@ void InfoLog_PrintActiveSettings() { cemuLog_log(LogType::Force, "Async compile: {}", GetConfig().async_compile.GetValue() ? "true" : "false"); cemuLog_log(LogType::Force, "Fast math: {}", GetConfig().fast_math.GetValue() ? "true" : "false"); - cemuLog_log(LogType::Force, "Use host memory for cache: {}", g_current_game_profile->UseHostMemForCache() ? 
"true" : "false"); + cemuLog_log(LogType::Force, "Buffer cache type: {}", g_current_game_profile->GetBufferCacheType()); if (!GetConfig().vk_accurate_barriers.GetValue()) cemuLog_log(LogType::Force, "Accurate barriers are disabled!"); } diff --git a/src/Cafe/GameProfile/GameProfile.cpp b/src/Cafe/GameProfile/GameProfile.cpp index 337786ed..a4ce8fe8 100644 --- a/src/Cafe/GameProfile/GameProfile.cpp +++ b/src/Cafe/GameProfile/GameProfile.cpp @@ -226,7 +226,7 @@ bool GameProfile::Load(uint64_t title_id) m_graphics_api = (GraphicAPI)graphicsApi.value; gameProfile_loadEnumOption(iniParser, "accurateShaderMul", m_accurateShaderMul); - gameProfile_loadBooleanOption2(iniParser, "useHostMemForCache", m_useHostMemForCache); + gameProfile_loadEnumOption(iniParser, "bufferCacheType", m_bufferCacheType); // legacy support auto option_precompiledShaders = iniParser.FindOption("precompiledShaders"); @@ -309,7 +309,7 @@ void GameProfile::Save(uint64_t title_id) fs->writeLine("[Graphics]"); WRITE_ENTRY(accurateShaderMul); - WRITE_ENTRY(useHostMemForCache); + WRITE_ENTRY(bufferCacheType); WRITE_OPTIONAL_ENTRY(precompiledShaders); WRITE_OPTIONAL_ENTRY(graphics_api); fs->writeLine(""); @@ -339,7 +339,7 @@ void GameProfile::ResetOptional() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; - m_useHostMemForCache = false; + m_bufferCacheType = BufferCacheType::DevicePrivate; // cpu settings m_threadQuantum = kThreadQuantumDefault; m_cpuMode.reset(); // CPUModeOption::kSingleCoreRecompiler; @@ -360,7 +360,7 @@ void GameProfile::Reset() // graphic settings m_accurateShaderMul = AccurateShaderMulOption::True; - m_useHostMemForCache = false; + m_bufferCacheType = BufferCacheType::DevicePrivate; m_precompiledShaders = PrecompiledShaderOption::Auto; // cpu settings m_threadQuantum = kThreadQuantumDefault; diff --git a/src/Cafe/GameProfile/GameProfile.h b/src/Cafe/GameProfile/GameProfile.h index e2ab29f7..5c2d28d7 100644 --- a/src/Cafe/GameProfile/GameProfile.h +++ 
b/src/Cafe/GameProfile/GameProfile.h @@ -31,7 +31,7 @@ public: [[nodiscard]] const std::optional& GetGraphicsAPI() const { return m_graphics_api; } [[nodiscard]] const AccurateShaderMulOption& GetAccurateShaderMul() const { return m_accurateShaderMul; } - [[nodiscard]] bool UseHostMemForCache() const { return m_useHostMemForCache; } + [[nodiscard]] BufferCacheType GetBufferCacheType() const { return m_bufferCacheType; } [[nodiscard]] const std::optional& GetPrecompiledShadersState() const { return m_precompiledShaders; } [[nodiscard]] uint32 GetThreadQuantum() const { return m_threadQuantum; } @@ -55,7 +55,7 @@ private: // graphic settings std::optional m_graphics_api{}; AccurateShaderMulOption m_accurateShaderMul = AccurateShaderMulOption::True; - bool m_useHostMemForCache = false; + BufferCacheType m_bufferCacheType = BufferCacheType::DevicePrivate; std::optional m_precompiledShaders{}; // cpu settings uint32 m_threadQuantum = kThreadQuantumDefault; // values: 20000 45000 60000 80000 100000 diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 5f02847a..cd041c5a 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -4,7 +4,6 @@ #include "Cemu/Logging/CemuLogging.h" #include "Common/precompiled.h" -#include "GameProfile/GameProfile.h" /* MetalVertexBufferCache::~MetalVertexBufferCache() @@ -118,21 +117,24 @@ void MetalMemoryManager::InitBufferCache(size_t size) { cemu_assert_debug(!m_bufferCache); + m_bufferCacheType = g_current_game_profile->GetBufferCacheType(); + // First, try to import the host memory as a buffer - if (g_current_game_profile->UseHostMemForCache() && m_mtlr->IsAppleGPU()) + if (m_bufferCacheType == BufferCacheType::Host && m_mtlr->IsAppleGPU()) { m_importedMemBaseAddress = 0x10000000; size_t hostAllocationSize = 0x40000000ull; // TODO: get size of allocation m_bufferCache = 
m_mtlr->GetDevice()->newBuffer(memory_getPointerFromVirtualOffset(m_importedMemBaseAddress), hostAllocationSize, MTL::ResourceStorageModeShared, nullptr); - if (m_bufferCache) - m_useHostMemoryForCache = true; - else + if (!m_bufferCache) + { cemuLog_logDebug(LogType::Force, "Failed to import host memory as a buffer"); + m_bufferCacheType = BufferCacheType::DevicePrivate; + } } - if (!m_useHostMemoryForCache) - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); + if (!m_bufferCache) + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, (m_bufferCacheType == BufferCacheType::DevicePrivate ? MTL::ResourceStorageModePrivate : MTL::ResourceStorageModeShared)); #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); @@ -141,24 +143,31 @@ void MetalMemoryManager::InitBufferCache(size_t size) void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, size_t size) { - cemu_assert_debug(!m_useHostMemoryForCache); + cemu_assert_debug(m_bufferCacheType != BufferCacheType::Host); cemu_assert_debug(m_bufferCache); cemu_assert_debug((offset + size) <= m_bufferCache->length()); - auto allocation = m_tempBufferAllocator.GetBufferAllocation(size); - auto buffer = m_tempBufferAllocator.GetBufferOutsideOfCommandBuffer(allocation.bufferIndex); - memcpy((uint8*)buffer->contents() + allocation.offset, data, size); + if (m_bufferCacheType == BufferCacheType::DevicePrivate) + { + auto allocation = m_tempBufferAllocator.GetBufferAllocation(size); + auto buffer = m_tempBufferAllocator.GetBufferOutsideOfCommandBuffer(allocation.bufferIndex); + memcpy((uint8*)buffer->contents() + allocation.offset, data, size); - // Lock the buffer to make sure it's not deallocated before the copy is done - m_tempBufferAllocator.LockBuffer(allocation.bufferIndex); + // Lock the buffer to make sure it's not deallocated before the copy is done + m_tempBufferAllocator.LockBuffer(allocation.bufferIndex); - 
m_mtlr->CopyBufferToBuffer(buffer, allocation.offset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); + m_mtlr->CopyBufferToBuffer(buffer, allocation.offset, m_bufferCache, offset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); - // Make sure the buffer has the right command buffer - m_tempBufferAllocator.GetBuffer(allocation.bufferIndex); // TODO: make a helper function for this + // Make sure the buffer has the right command buffer + m_tempBufferAllocator.GetBuffer(allocation.bufferIndex); // TODO: make a helper function for this - // We can now safely unlock the buffer - m_tempBufferAllocator.UnlockBuffer(allocation.bufferIndex); + // We can now safely unlock the buffer + m_tempBufferAllocator.UnlockBuffer(allocation.bufferIndex); + } + else + { + memcpy((uint8*)m_bufferCache->contents() + offset, data, size); + } // Notify vertex buffer cache about the change //m_vertexBufferCache.MemoryRangeChanged(offset, size); @@ -166,7 +175,11 @@ void MetalMemoryManager::UploadToBufferCache(const void* data, size_t offset, si void MetalMemoryManager::CopyBufferCache(size_t srcOffset, size_t dstOffset, size_t size) { + cemu_assert_debug(m_bufferCacheType != BufferCacheType::Host); cemu_assert_debug(m_bufferCache); - m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); + if (m_bufferCacheType == BufferCacheType::DevicePrivate) + m_mtlr->CopyBufferToBuffer(m_bufferCache, srcOffset, m_bufferCache, dstOffset, size, ALL_MTL_RENDER_STAGES, ALL_MTL_RENDER_STAGES); + else + memcpy((uint8*)m_bufferCache->contents() + dstOffset, (uint8*)m_bufferCache->contents() + srcOffset, size); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 4e8b2594..6cc4ab1e 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -2,6 +2,8 @@ 
#include "Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h" +#include "GameProfile/GameProfile.h" + /* struct MetalRestridedBufferRange { @@ -118,7 +120,7 @@ public: // Getters bool UseHostMemoryForCache() const { - return m_useHostMemoryForCache; + return (m_bufferCacheType == BufferCacheType::Host); } MPTR GetImportedMemBaseAddress() const @@ -137,6 +139,6 @@ private: //MetalVertexBufferCache m_vertexBufferCache; MTL::Buffer* m_bufferCache = nullptr; - bool m_useHostMemoryForCache = false; + BufferCacheType m_bufferCacheType = BufferCacheType::DevicePrivate; /* defined default so UseHostMemoryForCache() never reads an indeterminate value */ MPTR m_importedMemBaseAddress; }; diff --git a/src/config/CemuConfig.h b/src/config/CemuConfig.h index 988916eb..02dc873a 100644 --- a/src/config/CemuConfig.h +++ b/src/config/CemuConfig.h @@ -124,6 +124,14 @@ enum class AccurateShaderMulOption }; ENABLE_ENUM_ITERATORS(AccurateShaderMulOption, AccurateShaderMulOption::False, AccurateShaderMulOption::True); +enum class BufferCacheType +{ + DevicePrivate, + DeviceShared, + Host, +}; +ENABLE_ENUM_ITERATORS(BufferCacheType, BufferCacheType::DevicePrivate, BufferCacheType::Host); + enum class CPUMode { SinglecoreInterpreter = 0, @@ -222,6 +230,21 @@ struct fmt::formatter : formatter { } }; template <> +struct fmt::formatter<BufferCacheType> : formatter<string_view> { + template <typename FormatContext> + auto format(const BufferCacheType c, FormatContext &ctx) const { + string_view name; + switch (c) + { + case BufferCacheType::DevicePrivate: name = "device private"; break; + case BufferCacheType::DeviceShared: name = "device shared"; break; + case BufferCacheType::Host: name = "host"; break; + default: name = "unknown"; break; + } + return formatter<string_view>::format(name, ctx); + } +}; +template <> struct fmt::formatter : formatter { template auto format(const CPUMode c, FormatContext &ctx) const { diff --git a/src/gui/GameProfileWindow.cpp b/src/gui/GameProfileWindow.cpp index c1aa63e4..f54a8fb4 100644 --- a/src/gui/GameProfileWindow.cpp +++ b/src/gui/GameProfileWindow.cpp @@ -8,6 +8,7 @@ #include #include +#include "config/CemuConfig.h" #include 
"gui/helpers/wxHelpers.h" #include "input/InputManager.h" @@ -127,12 +128,12 @@ GameProfileWindow::GameProfileWindow(wxWindow* parent, uint64_t title_id) m_shader_mul_accuracy->SetToolTip(_("EXPERT OPTION\nControls the accuracy of floating point multiplication in shaders.\n\nRecommended: true")); first_row->Add(m_shader_mul_accuracy, 0, wxALL, 5); - first_row->Add(new wxStaticText(panel, wxID_ANY, _("Use host memory for cache")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); + first_row->Add(new wxStaticText(panel, wxID_ANY, _("Buffer cache type")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); - wxString mem_values[] = { _("false"), _("true")}; - m_use_host_mem_for_cache = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(mem_values), mem_values); - m_use_host_mem_for_cache->SetToolTip(_("EXPERT OPTION\nAllows the GPU to access data directly without the need for an intermediate cache. May increase performance and reduce memory usage, but can also cause flickering.\n\nMetal only\n\nRecommended: false")); - first_row->Add(m_use_host_mem_for_cache, 0, wxALL, 5); + wxString cache_values[] = { _("device private"), _("device shared"), _("host")}; + m_buffer_cache_type = new wxChoice(panel, wxID_ANY, wxDefaultPosition, wxDefaultSize, (int)std::size(cache_values), cache_values); + m_buffer_cache_type->SetToolTip(_("EXPERT OPTION\nDecides how the buffer cache memory will be managed.\n\nMetal only\n\nRecommended: device private")); + first_row->Add(m_buffer_cache_type, 0, wxALL, 5); /*first_row->Add(new wxStaticText(panel, wxID_ANY, _("GPU buffer cache accuracy")), 0, wxALIGN_CENTER_VERTICAL | wxALL, 5); wxString accuarcy_values[] = { _("high"), _("medium"), _("low") }; @@ -280,7 +281,7 @@ void GameProfileWindow::ApplyProfile() else m_graphic_api->SetSelection(1 + m_game_profile.m_graphics_api.value()); // "", OpenGL, Vulkan, Metal m_shader_mul_accuracy->SetSelection((int)m_game_profile.m_accurateShaderMul); - 
m_use_host_mem_for_cache->SetSelection((int)m_game_profile.m_useHostMemForCache); + m_buffer_cache_type->SetSelection((int)m_game_profile.m_bufferCacheType); //// audio //m_disable_audio->Set3StateValue(GetCheckboxState(m_game_profile.disableAudio)); @@ -340,7 +341,7 @@ void GameProfileWindow::SaveProfile() // gpu m_game_profile.m_accurateShaderMul = (AccurateShaderMulOption)m_shader_mul_accuracy->GetSelection(); - m_game_profile.m_useHostMemForCache = (bool)m_use_host_mem_for_cache->GetSelection(); + m_game_profile.m_bufferCacheType = (BufferCacheType)m_buffer_cache_type->GetSelection(); if (m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::False && m_game_profile.m_accurateShaderMul != AccurateShaderMulOption::True) m_game_profile.m_accurateShaderMul = AccurateShaderMulOption::True; // force a legal value diff --git a/src/gui/GameProfileWindow.h b/src/gui/GameProfileWindow.h index a1fe8132..22eda48d 100644 --- a/src/gui/GameProfileWindow.h +++ b/src/gui/GameProfileWindow.h @@ -40,7 +40,7 @@ private: wxChoice* m_graphic_api; wxChoice* m_shader_mul_accuracy; - wxChoice* m_use_host_mem_for_cache; + wxChoice* m_buffer_cache_type; //wxChoice* m_cache_accuracy; // audio