From a460a5d28abeecfe2c1364b5f63720f0e51f5319 Mon Sep 17 00:00:00 2001 From: Samuliak Date: Sat, 21 Sep 2024 09:28:46 +0200 Subject: [PATCH] do vertex restride on the CPU --- .../Metal/LatteTextureReadbackMtl.cpp | 13 ++++------ .../Renderer/Metal/MetalMemoryManager.cpp | 24 ++++++++--------- .../Latte/Renderer/Metal/MetalMemoryManager.h | 2 +- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 26 ++++++++++++------- 4 files changed, 34 insertions(+), 31 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp index 0bcab09f..ca4e31a7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.cpp @@ -27,19 +27,16 @@ void LatteTextureReadbackInfoMtl::StartTransfer() blitCommandEncoder->copyFromTexture(baseTexture->GetTexture(), 0, 0, MTL::Origin{0, 0, 0}, MTL::Size{(uint32)baseTexture->width, (uint32)baseTexture->height, 1}, m_mtlr->GetTextureReadbackBuffer(), m_bufferOffset, bytesPerRow, bytesPerImage); m_commandBuffer = m_mtlr->GetCurrentCommandBuffer()->retain(); - m_mtlr->RequestSoonCommit(); + // TODO: uncomment? + //m_mtlr->RequestSoonCommit(); + m_mtlr->CommitCommandBuffer(); } bool LatteTextureReadbackInfoMtl::IsFinished() { - // TODO: is this needed? - if (!m_commandBuffer) - return false; - - // TODO: remove this? // Command buffer wasn't even comitted, let's commit immediately - if (m_mtlr->GetCurrentCommandBuffer() == m_commandBuffer) - m_mtlr->CommitCommandBuffer(); + //if (m_mtlr->GetCurrentCommandBuffer() == m_commandBuffer) + // m_mtlr->CommitCommandBuffer(); return CommandBufferCompleted(m_commandBuffer); } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index 6173532c..e406abf6 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -28,15 +28,14 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu restrideInfo.allocation = m_bufferAllocator.GetBufferAllocation(newSize); buffer = m_bufferAllocator.GetBuffer(restrideInfo.allocation.bufferIndex); - //uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; - //uint8* newPtr = (uint8*)buffer->contents() + restrideInfo.allocation.bufferOffset; + // HACK: the restriding is done on the CPU, since doing it on the GPU was causing over-synchronization + uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; + uint8* newPtr = (uint8*)buffer->contents() + restrideInfo.allocation.offset; - //for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) - //{ - // memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); - //} - //debug_printf("Restrided vertex buffer (old stride: %zu, new stride: %zu, old size: %zu, new size: %zu)\n", stride, newStride, vertexBufferRange.size, newSize); + for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) + memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); + /* if (m_mtlr->GetEncoderType() == MetalEncoderType::Render) { auto renderCommandEncoder = static_cast(m_mtlr->GetCommandEncoder()); @@ -60,18 +59,19 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu // TODO: do the barriers in one call? MTL::Resource* barrierBuffers[] = {buffer}; renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, MTL::RenderStageVertex); - - // Debug - m_mtlr->GetPerformanceMonitor().m_vertexBufferRestrides++; } else { - debug_printf("vertex buffer restride needs an active render encoder\n"); + debug_printf("vertex buffer restride needs an active render command encoder\n"); cemu_assert_suspicious(); } + */ restrideInfo.memoryInvalidated = false; restrideInfo.lastStride = newStride; + + // Debug + m_mtlr->GetPerformanceMonitor().m_vertexBufferRestrides++; } else { @@ -121,7 +121,7 @@ void MetalMemoryManager::InitBufferCache(size_t size) { cemu_assert_debug(!m_bufferCache); - m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModePrivate); + m_bufferCache = m_mtlr->GetDevice()->newBuffer(size, MTL::ResourceStorageModeShared); #ifdef CEMU_DEBUG_ASSERT m_bufferCache->setLabel(GetLabel("Buffer cache", m_bufferCache)); #endif diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 62254b21..87327c4c 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -58,7 +58,7 @@ private: class MetalMemoryManager { public: - MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModePrivate), m_tempBufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator) {} + MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer, m_mtlr->GetOptimalBufferStorageMode()), m_framePersistentBufferAllocator(metalRenderer, MTL::ResourceStorageModeShared), m_tempBufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer, m_framePersistentBufferAllocator) {} ~MetalMemoryManager(); // Pipelines diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 78404949..363d5e6d 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -26,7 +26,7 @@ #include "imgui/imgui_extension.h" #include "imgui/imgui_impl_metal.h" -#define DEFAULT_COMMIT_TRESHOLD 256 +#define DEFAULT_COMMIT_TRESHOLD 196 #define OCCLUSION_QUERY_POOL_SIZE 1024 extern bool hasValidFramebufferAttached; @@ -917,15 +917,18 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 const auto fetchShader = LatteSHRC_GetActiveFetchShader(); // Check if we need to end the render pass - // Fragment shader is most likely to require a render pass flush, so check for it first - bool endRenderPass = CheckIfRenderPassNeedsFlush(pixelShader); - if (!endRenderPass) - endRenderPass = CheckIfRenderPassNeedsFlush(vertexShader); - if (!endRenderPass && geometryShader) - endRenderPass = CheckIfRenderPassNeedsFlush(geometryShader); + if (!m_state.m_isFirstDrawInRenderPass) + { + // Fragment shader is most likely to require a render pass flush, so check for it first + bool endRenderPass = CheckIfRenderPassNeedsFlush(pixelShader); + if (!endRenderPass) + endRenderPass = CheckIfRenderPassNeedsFlush(vertexShader); + if (!endRenderPass && geometryShader) + endRenderPass = CheckIfRenderPassNeedsFlush(geometryShader); - if (endRenderPass) - EndEncoding(); + if (endRenderPass) + EndEncoding(); + } // Primitive type const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); @@ -1889,7 +1892,9 @@ void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, sint32 s void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, MTL::RenderStages after, MTL::RenderStages before) { + // TODO: uncomment and fix performance issues // Do the copy in a vertex shader on Apple GPUs + /* if (m_isAppleGPU && m_encoderType == MetalEncoderType::Render) { auto renderCommandEncoder = static_cast(m_commandEncoder); @@ -1910,10 +1915,11 @@ void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL:: } else { + */ auto blitCommandEncoder = GetBlitCommandEncoder(); blitCommandEncoder->copyFromBuffer(src, srcOffset, dst, dstOffset, size); - } + //} } void MetalRenderer::SwapBuffer(bool mainWindow)