From e2f66b8aa3bec12df4571df088a636cbd1996102 Mon Sep 17 00:00:00 2001
From: Samuliak
Date: Mon, 19 Aug 2024 13:07:30 +0200
Subject: [PATCH] fix: streamout

Re-enable streamout writes in the MSL decompiler by dropping the temporary
"continue" hack in _emitCFRingWriteCode and _emitStreamWriteCode, and
implement the Metal renderer side:

- bufferCache_copyStreamoutToMainBuffer() blits from m_xfbRingBuffer into
  the buffer cache.
- streamout_setupXfbBuffer() records the per-buffer ring buffer offset in
  the new MetalStreamoutState.
- draw_execute() stores the vertex count and brackets the draw call with
  LatteStreamout_PrepareDrawcall() / LatteStreamout_FinishDrawcall().
- BindStageResources() writes verticesPerInstance and the streamout buffer
  base offsets into the shader's uniform data.

MetalState::m_streamoutState is value-initialized, like m_encoderState, so
that none of its fields starts out with an indeterminate value.

---
Reviewer note: the leading whitespace of the hunks below was re-derived from
a whitespace-collapsed copy of this patch. If a context line fails to match,
apply with `git apply --ignore-whitespace`.

 .../LatteDecompilerEmitMSL.cpp                |  4 ----
 .../HW/Latte/Renderer/Metal/MetalRenderer.cpp | 24 ++++++++++++-------
 .../HW/Latte/Renderer/Metal/MetalRenderer.h   | 12 ++++++++++
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp
index a46da96c..95e91d40 100644
--- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp
+++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp
@@ -3284,8 +3284,6 @@ static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, La
 				continue;
 
 			uint32 u32Offset = streamWrite->exportArrayBase + i;
-			// HACK: disable streamout temporarily, since it causes GPU hangs
-			continue;
 			src->addFmt("sb[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset);
 			src->add(" = ");
 
@@ -3399,8 +3397,6 @@ static void _emitStreamWriteCode(LatteDecompilerShaderContext* shaderContext, La
 			continue;
 
 		uint32 u32Offset = cfInstruction->exportArrayBase + i;
-		// HACK: disable streamout temporarily, since it causes GPU hangs
-		continue;
 		src->addFmt("sb[sbBase{} + {}]", streamoutBufferIndex, u32Offset);
 		src->add(" = ");
 
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp
index 76139e63..af092c15 100644
--- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp
@@ -634,7 +634,9 @@ void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32
 
 void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size)
 {
-	debug_printf("MetalRenderer::bufferCache_copyStreamoutToMainBuffer not implemented\n");
+	auto blitCommandEncoder = GetBlitCommandEncoder();
+
+	blitCommandEncoder->copyFromBuffer(m_xfbRingBuffer, srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size);
 }
 
 void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size)
@@ -669,17 +671,18 @@ RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, ui
 
 void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize)
 {
-	debug_printf("MetalRenderer::streamout_setupXfbBuffer not implemented\n");
+	m_state.m_streamoutState.buffers[bufferIndex].enabled = true;
+	m_state.m_streamoutState.buffers[bufferIndex].ringBufferOffset = ringBufferOffset;
 }
 
 void MetalRenderer::streamout_begin()
 {
-	debug_printf("MetalRenderer::streamout_begin not implemented\n");
+	// Do nothing
 }
 
 void MetalRenderer::streamout_rendererFinishDrawcall()
 {
-	debug_printf("MetalRenderer::streamout_rendererFinishDrawcall not implemented\n");
+	// Do nothing
 }
 
 void MetalRenderer::draw_beginSequence()
@@ -966,6 +969,10 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
 		encoderState.m_renderPipelineState = renderPipelineState;
 	}
 
+	// Prepare streamout
+	m_state.m_streamoutState.verticesPerInstance = count;
+	LatteStreamout_PrepareDrawcall(count, instanceCount);
+
 	// Uniform buffers, textures and samplers
 	BindStageResources(renderCommandEncoder, vertexShader);
 	BindStageResources(renderCommandEncoder, pixelShader);
@@ -981,6 +988,8 @@ void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
 		renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance);
 	}
 
+	LatteStreamout_FinishDrawcall(false);
+
 	LatteGPUState.drawCallCounter++;
 }
 
@@ -1498,20 +1507,17 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE
 		{
 			LatteMRT::GetCurrentFragCoordScale(GET_UNIFORM_DATA_PTR(shader->uniform.loc_fragCoordScale));
 		}
-		// TODO: uncomment
-		/*
 		if (shader->uniform.loc_verticesPerInstance >= 0)
 		{
-			*(int*)(supportBufferData + ((size_t)shader->uniform.loc_verticesPerInstance / 4)) = m_streamoutState.verticesPerInstance;
+			*(int*)(supportBufferData + ((size_t)shader->uniform.loc_verticesPerInstance / 4)) = m_state.m_streamoutState.verticesPerInstance;
 			for (sint32 b = 0; b < LATTE_NUM_STREAMOUT_BUFFER; b++)
 			{
 				if (shader->uniform.loc_streamoutBufferBase[b] >= 0)
 				{
-					*(uint32*)GET_UNIFORM_DATA_PTR(shader->uniform.loc_streamoutBufferBase[b]) = m_streamoutState.buffer[b].ringBufferOffset;
+					*(uint32*)GET_UNIFORM_DATA_PTR(shader->uniform.loc_streamoutBufferBase[b]) = m_state.m_streamoutState.buffers[b].ringBufferOffset;
 				}
 			}
 		}
-		*/
 
 		auto& bufferAllocator = m_memoryManager->GetTemporaryBufferAllocator();
 		size_t size = shader->uniform.uniformRangeSize;
diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h
index c3eb9ab7..e18e619a 100644
--- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h
+++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h
@@ -80,6 +80,16 @@ struct MetalEncoderState
 	size_t m_uniformBufferOffsets[METAL_SHADER_TYPE_TOTAL][MAX_MTL_BUFFERS];
 };
 
+struct MetalStreamoutState
+{
+	struct
+	{
+		bool enabled; // set to true by streamout_setupXfbBuffer()
+		uint32 ringBufferOffset; // recorded by streamout_setupXfbBuffer(), written to loc_streamoutBufferBase[] in BindStageResources()
+	} buffers[LATTE_NUM_STREAMOUT_BUFFER];
+	sint32 verticesPerInstance; // draw_execute() stores the draw's `count` here; written to loc_verticesPerInstance in BindStageResources()
+};
+
 struct MetalState
 {
 	MetalEncoderState m_encoderState{};
@@ -99,6 +109,8 @@ struct MetalState
 
 	MTL::Viewport m_viewport;
 	MTL::ScissorRect m_scissor;
+
+	MetalStreamoutState m_streamoutState{}; // value-initialized, matching m_encoderState{}
 };
 
 struct MetalCommandBuffer