From 27925a4fd9f13394bc3a92eb51d54354f621e1ea Mon Sep 17 00:00:00 2001 From: Samuliak Date: Tue, 13 Aug 2024 18:28:10 +0200 Subject: [PATCH] do vertex buffer restride in a void vertex function --- .../Renderer/Metal/MetalMemoryManager.cpp | 57 +++++++++++++++---- .../Latte/Renderer/Metal/MetalMemoryManager.h | 14 ++++- .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 18 +++--- .../HW/Latte/Renderer/Metal/MetalRenderer.h | 12 ++++ .../Renderer/Metal/UtilityShaderSource.h | 14 ++++- 5 files changed, 94 insertions(+), 21 deletions(-) diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp index aef458a7..ef6871a7 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.cpp @@ -1,7 +1,10 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalCommon.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h" -#include "Metal/MTLResource.hpp" +#include "Cafe/HW/Latte/Renderer/Metal/MetalHybridComputePipeline.h" +#include "Common/precompiled.h" +#include "Foundation/NSRange.hpp" +#include "Metal/MTLRenderCommandEncoder.hpp" const size_t BUFFER_ALLOCATION_SIZE = 8 * 1024 * 1024; @@ -93,21 +96,51 @@ MetalRestridedBufferRange MetalVertexBufferCache::RestrideBufferIfNeeded(MTL::Bu if (restrideInfo.memoryInvalidated || stride != restrideInfo.lastStride) { - // TODO: use compute/void vertex function instead size_t newStride = Align(stride, 4); size_t newSize = vertexBufferRange.size / stride * newStride; - // TODO: use one big buffer for all restrided buffers - restrideInfo.buffer = m_mtlr->GetDevice()->newBuffer(newSize, MTL::StorageModeShared); + if (!restrideInfo.buffer || newSize != restrideInfo.buffer->length()) + { + if (restrideInfo.buffer) + restrideInfo.buffer->release(); + // TODO: use one big buffer for all restrided buffers + restrideInfo.buffer = m_mtlr->GetDevice()->newBuffer(newSize, MTL::StorageModeShared); + } - uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; - uint8* newPtr = (uint8*)restrideInfo.buffer->contents(); + //uint8* oldPtr = (uint8*)bufferCache->contents() + vertexBufferRange.offset; + //uint8* newPtr = (uint8*)restrideInfo.buffer->contents(); - for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) - { - memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); - } - // TODO: remove - debug_printf("Restrided vertex buffer (old stride: %zu, new stride: %zu, old size: %zu, new size: %zu)\n", stride, newStride, vertexBufferRange.size, newSize); + //for (size_t elem = 0; elem < vertexBufferRange.size / stride; elem++) + //{ + // memcpy(newPtr + elem * newStride, oldPtr + elem * stride, stride); + //} + //debug_printf("Restrided vertex buffer (old stride: %zu, new stride: %zu, old size: %zu, new size: %zu)\n", stride, newStride, vertexBufferRange.size, newSize); + + if (m_mtlr->GetEncoderType() == MetalEncoderType::Render) + { + auto renderCommandEncoder = static_cast(m_mtlr->GetCommandEncoder()); + + renderCommandEncoder->setRenderPipelineState(m_restrideBufferPipeline->GetRenderPipelineState()); + MTL::Buffer* buffers[] = {bufferCache, restrideInfo.buffer}; + size_t offsets[] = {vertexBufferRange.offset, 0}; + renderCommandEncoder->setVertexBuffers(buffers, offsets, NS::Range(0, 2)); + + struct + { + uint32 oldStride; + uint32 newStride; + } strideData = {static_cast(stride), static_cast(newStride)}; + renderCommandEncoder->setVertexBytes(&strideData, sizeof(strideData), 2); + + renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), vertexBufferRange.size / stride); + + MTL::Resource* barrierBuffers[] = {restrideInfo.buffer}; + renderCommandEncoder->memoryBarrier(barrierBuffers, 1, MTL::RenderStageVertex, MTL::RenderStageVertex); + } + else + { + debug_printf("vertex buffer restride needs an active render encoder\n"); + cemu_assert_suspicious(); + } restrideInfo.memoryInvalidated = false; restrideInfo.lastStride = newStride; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h index 4f875687..0fc55936 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h @@ -80,7 +80,11 @@ public: MetalVertexBufferCache(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer} {} ~MetalVertexBufferCache(); - // Vertex buffer cache + void SetRestrideBufferPipeline(class MetalHybridComputePipeline* restrideBufferPipeline) + { + m_restrideBufferPipeline = restrideBufferPipeline; + } + void TrackVertexBuffer(uint32 bufferIndex, size_t offset, size_t size, MetalRestrideInfo* restrideInfo) { m_bufferRanges[bufferIndex] = MetalVertexBufferRange{offset, size, restrideInfo}; @@ -101,6 +105,8 @@ public: private: class MetalRenderer* m_mtlr; + class MetalHybridComputePipeline* m_restrideBufferPipeline = nullptr; + MetalVertexBufferRange m_bufferRanges[LATTE_MAX_VERTEX_BUFFERS] = {}; void MemoryRangeChanged(size_t offset, size_t size); @@ -112,6 +118,12 @@ public: MetalMemoryManager(class MetalRenderer* metalRenderer) : m_mtlr{metalRenderer}, m_bufferAllocator(metalRenderer), m_vertexBufferCache(metalRenderer) {} ~MetalMemoryManager(); + // Pipelines + void SetRestrideBufferPipeline(class MetalHybridComputePipeline* restrideBufferPipeline) + { + m_vertexBufferCache.SetRestrideBufferPipeline(restrideBufferPipeline); + } + void ResetTemporaryBuffers() { m_bufferAllocator/*s[m_bufferAllocatorIndex]*/.ResetTemporaryBuffers(); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 80246aab..3c1b59e5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -62,8 +62,9 @@ MetalRenderer::MetalRenderer() MTL::Library* utilityLibrary = m_device->newLibrary(NS::String::string(utilityShaderSource, NS::ASCIIStringEncoding), nullptr, &error); if (error) { - debug_printf("failed to create present library (error: %s)\n", error->localizedDescription()->utf8String()); + debug_printf("failed to create utility library (error: %s)\n", error->localizedDescription()->utf8String()); error->release(); + throw; return; } @@ -98,12 +99,16 @@ MetalRenderer::MetalRenderer() // Hybrid pipelines m_copyTextureToTexturePipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexCopyTextureToTexture", "kernelCopyTextureToTexture"); + m_restrideBufferPipeline = new MetalHybridComputePipeline(this, utilityLibrary, "vertexRestrideBuffer", "kernelRestrideBuffer"); utilityLibrary->release(); + + m_memoryManager->SetRestrideBufferPipeline(m_restrideBufferPipeline); } MetalRenderer::~MetalRenderer() { delete m_copyTextureToTexturePipeline; + delete m_restrideBufferPipeline; m_presentPipelineLinear->release(); m_presentPipelineSRGB->release(); @@ -688,11 +693,6 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } const auto fetchShader = LatteSHRC_GetActiveFetchShader(); - // Render pipeline state - // TODO: use `m_lastUsedFBO` instead of `m_activeFBO` - MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_activeFBO, LatteGPUState.contextNew); - renderCommandEncoder->setRenderPipelineState(renderPipelineState); - // Depth stencil state MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); renderCommandEncoder->setDepthStencilState(depthStencilState); @@ -794,6 +794,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 } } + // Render pipeline state + MTL::RenderPipelineState* renderPipelineState = m_pipelineCache->GetPipelineState(fetchShader, vertexShader, pixelShader, m_state.m_activeFBO, LatteGPUState.contextNew); + renderCommandEncoder->setRenderPipelineState(renderPipelineState); + // Uniform buffers, textures and samplers BindStageResources(renderCommandEncoder, vertexShader); BindStageResources(renderCommandEncoder, pixelShader); @@ -1308,7 +1312,7 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE { LatteMRT::GetCurrentFragCoordScale(GET_UNIFORM_DATA_PTR(shader->uniform.loc_fragCoordScale)); } - // TODO: uncomment? + // TODO: uncomment /* if (shader->uniform.loc_verticesPerInstance >= 0) { diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index 067788ff..5e86d7d9 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -9,6 +9,7 @@ #include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h" #include "Common/precompiled.h" #include "Metal/MTLCommandBuffer.hpp" +#include "Metal/MTLCommandEncoder.hpp" #include "Metal/MTLRenderPass.hpp" #define MAX_MTL_BUFFERS 31 @@ -244,6 +245,16 @@ public: return m_commandBuffers[m_commandBuffers.size() - 1].m_commandBuffer; } + MTL::CommandEncoder* GetCommandEncoder() + { + return m_commandEncoder; + } + + MetalEncoderType GetEncoderType() + { + return m_encoderType; + } + MTL::CommandBuffer* GetCommandBuffer(); bool CommandBufferCompleted(MTL::CommandBuffer* commandBuffer); void WaitForCommandBufferCompletion(MTL::CommandBuffer* commandBuffer); @@ -284,6 +295,7 @@ private: // Hybrid pipelines class MetalHybridComputePipeline* m_copyTextureToTexturePipeline; + class MetalHybridComputePipeline* m_restrideBufferPipeline; // Basic MTL::SamplerState* m_nearestSampler; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h index 2e49fa95..3bc2ff75 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h @@ -30,8 +30,20 @@ inline const char* utilityShaderSource = \ " uint dstSlice;\n" \ "};\n" \ "\n" \ -"vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d_array src [[texture(0)]], texture2d_array dst [[texture(1)]], constant CopyParams& params) {\n" \ +"vertex void vertexCopyTextureToTexture(uint vid [[vertex_id]], texture2d_array src [[texture(0)]], texture2d_array dst [[texture(1)]], constant CopyParams& params [[buffer(0)]]) {\n" \ " uint2 coord = uint2(vid % params.width, vid / params.width);\n" \ " return dst.write(float4(src.read(coord, params.srcSlice, params.srcMip).r, 0.0, 0.0, 0.0), coord, params.dstSlice, params.dstMip);\n" \ "}\n" \ +"\n" \ +"struct RestrideParams {\n" \ +" uint oldStride;\n" \ +" uint newStride;\n" \ +"};\n" \ +"\n" \ +/* TODO: use uint32? Since that would require less iterations */ \ +"vertex void vertexRestrideBuffer(uint vid [[vertex_id]], device uint8_t* src [[buffer(0)]], device uint8_t* dst [[buffer(1)]], constant RestrideParams& params [[buffer(2)]]) {\n" \ +" for (uint32_t i = 0; i < params.oldStride; i++) {\n" \ +" dst[vid * params.newStride + i] = src[vid * params.oldStride + i];\n" \ +" }\n" \ +"}\n" \ "\n";