diff --git a/src/Cafe/HW/Latte/Core/LatteShader.cpp b/src/Cafe/HW/Latte/Core/LatteShader.cpp index 486516ef..66539a76 100644 --- a/src/Cafe/HW/Latte/Core/LatteShader.cpp +++ b/src/Cafe/HW/Latte/Core/LatteShader.cpp @@ -9,6 +9,7 @@ #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h" #include "Cafe/OS/libs/gx2/GX2.h" // todo - remove dependency #include "Cafe/GraphicPack/GraphicPack2.h" +#include "HW/Latte/Renderer/Renderer.h" #include "util/helpers/StringParser.h" #include "config/ActiveSettings.h" #include "Cafe/GameProfile/GameProfile.h" @@ -688,9 +689,9 @@ void LatteShader_GetDecompilerOptions(LatteDecompilerOptions& options, LatteCons { options.usesGeometryShader = geometryShaderEnabled; options.spirvInstrinsics.hasRoundingModeRTEFloat32 = false; + options.useTFViaSSBO = g_renderer->UseTFViaSSBO(); if (g_renderer->GetType() == RendererAPI::Vulkan) { - options.useTFViaSSBO = VulkanRenderer::GetInstance()->UseTFViaSSBO(); options.spirvInstrinsics.hasRoundingModeRTEFloat32 = VulkanRenderer::GetInstance()->HasSPRIVRoundingModeRTE32(); } options.strictMul = g_current_game_profile->GetAccurateShaderMul() != AccurateShaderMulOption::False; diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp index 39ff895c..8cbbbe37 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSL.cpp @@ -2752,9 +2752,9 @@ static void _emitTEXGetGradientsHV(LatteDecompilerShaderContext* shaderContext, const char* funcName; if (texInstruction->opcode == GPU7_TEX_INST_GET_GRADIENTS_H) - funcName = "dFdx"; + funcName = "dfdx"; else - funcName = "dFdy"; + funcName = "dfdy"; src->add(" = "); @@ -3273,15 +3273,8 @@ static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, La if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) continue; - if (shaderContext->options->useTFViaSSBO) - { - 
uint32 u32Offset = streamWrite->exportArrayBase + i; - src->addFmt("sb_buffer[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset); - } - else - { - src->addFmt("sb{}[{}]", streamWrite->bufferIndex, streamWrite->exportArrayBase + i); - } + uint32 u32Offset = streamWrite->exportArrayBase + i; + src->addFmt("sb[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset); src->add(" = "); @@ -3393,15 +3386,8 @@ static void _emitStreamWriteCode(LatteDecompilerShaderContext* shaderContext, La if ((cfInstruction->memWriteCompMask&(1 << i)) == 0) continue; - if (shaderContext->options->useTFViaSSBO) - { - uint32 u32Offset = cfInstruction->exportArrayBase + i; - src->addFmt("sb_buffer[sbBase{} + {}]", streamoutBufferIndex, u32Offset); - } - else - { - src->addFmt("sb{}[{}]", streamoutBufferIndex, cfInstruction->exportArrayBase + i); - } + uint32 u32Offset = cfInstruction->exportArrayBase + i; + src->addFmt("sb[sbBase{} + {}]", streamoutBufferIndex, u32Offset); src->add(" = "); @@ -3595,15 +3581,12 @@ void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderConte // emit vertex src->add("EmitVertex();" _CRLF); // increment transform feedback pointer - if (shaderContext->analyzer.useSSBOForStreamout) + for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) { - for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) - { - if (!shaderContext->output->streamoutBufferWriteMask[i]) - continue; - cemu_assert_debug((shaderContext->output->streamoutBufferStride[i] & 3) == 0); - src->addFmt("sbBase{} += {};" _CRLF, i, shaderContext->output->streamoutBufferStride[i] / 4); - } + if (!shaderContext->output->streamoutBufferWriteMask[i]) + continue; + cemu_assert_debug((shaderContext->output->streamoutBufferStride[i] & 3) == 0); + src->addFmt("sbBase{} += {};" _CRLF, i, shaderContext->output->streamoutBufferStride[i] / 4); } if( shaderContext->analyzer.modifiesPixelActiveState ) @@ -3970,7 +3953,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* 
shaderContext, src->addFmt("float cubeMapArrayIndex{} = 0.0;" _CRLF, i); } // init base offset for streamout buffer writes - if (shaderContext->analyzer.useSSBOForStreamout && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)) + if (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry) { for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++) { diff --git a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp index 9d52196a..114cd6fa 100644 --- a/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp +++ b/src/Cafe/HW/Latte/LegacyShaderDecompiler/LatteDecompilerEmitMSLHeader.hpp @@ -94,9 +94,8 @@ namespace LatteDecompiler uniformCurrentOffset += 8; } // define verticesPerInstance + streamoutBufferBaseX - if (decompilerContext->analyzer.useSSBOForStreamout && - (shader->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader == false) || - (shader->shaderType == LatteConst::ShaderType::Geometry) ) + if ((shader->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader == false) || + (shader->shaderType == LatteConst::ShaderType::Geometry)) { src->add("int verticesPerInstance;" _CRLF); uniformOffsets.offset_verticesPerInstance = uniformCurrentOffset; @@ -251,8 +250,6 @@ namespace LatteDecompiler { _emitAttributes(decompilerContext); _emitVSOutputs(decompilerContext); - - // TODO: transform feedback } else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel) { @@ -379,6 +376,13 @@ namespace LatteDecompiler case LatteConst::ShaderType::Vertex: src->add(", uint vid [[vertex_id]]"); src->add(", uint iid [[instance_id]]"); + + // streamout buffer (transform feedback) + if (decompilerContext->analyzer.hasStreamoutEnable && 
decompilerContext->analyzer.hasStreamoutWrite) + { + src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingVK.getTFStorageBufferBindingPoint()); + } + break; case LatteConst::ShaderType::Pixel: src->add(", bool frontFacing [[front_facing]]"); diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index b65160eb..51323eec 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -16,6 +16,7 @@ #include "Cemu/Logging/CemuDebugLogging.h" #include "HW/Latte/Core/Latte.h" #include "HW/Latte/ISA/LatteReg.h" +#include "Metal/MTLResource.hpp" #include "Metal/MTLTypes.hpp" #include "gui/guiWrapper.h" @@ -39,6 +40,9 @@ MetalRenderer::MetalRenderer() // Texture readback m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::StorageModeShared); + // Transform feedback + m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize(), MTL::StorageModeShared); + // Initialize state for (uint32 i = 0; i < (uint32)LatteConst::ShaderType::TotalCount; i++) { @@ -1185,7 +1189,21 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE // Storage buffer if (shader->resourceMapping.tfStorageBindingPoint >= 0) { - debug_printf("storage buffer not implemented, index: %i\n", shader->resourceMapping.tfStorageBindingPoint); + switch (shader->shaderType) + { + case LatteConst::ShaderType::Vertex: + { + renderCommandEncoder->setVertexBuffer(m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); + break; + } + case LatteConst::ShaderType::Pixel: + { + renderCommandEncoder->setFragmentBuffer(m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); + break; + } + default: + UNREACHABLE; + } } } diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h index c770017b..ef33c95d 100644 --- 
a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h @@ -147,6 +147,7 @@ public: cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal"); }; + bool UseTFViaSSBO() const override { return true; } void AppendOverlayDebugInfo() override; // rendertarget @@ -265,6 +266,9 @@ private: MTL::Buffer* m_readbackBuffer; uint32 m_readbackBufferWriteOffset = 0; + // Transform feedback + MTL::Buffer* m_xfbRingBuffer; + // Active objects MTL::CommandBuffer* m_commandBuffer = nullptr; MetalEncoderType m_encoderType = MetalEncoderType::None; diff --git a/src/Cafe/HW/Latte/Renderer/Renderer.h b/src/Cafe/HW/Latte/Renderer/Renderer.h index a94ad155..7bd143d0 100644 --- a/src/Cafe/HW/Latte/Renderer/Renderer.h +++ b/src/Cafe/HW/Latte/Renderer/Renderer.h @@ -85,6 +85,7 @@ public: virtual void DeleteFontTextures() = 0; GfxVendor GetVendor() const { return m_vendor; } + virtual bool UseTFViaSSBO() const { return false; } virtual void AppendOverlayDebugInfo() = 0; // rendertarget diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h index 6df53da4..e4b4cbf9 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h @@ -73,11 +73,11 @@ public: return true; } - + template<typename T> struct direct_hash { - size_t operator()(const uint64& k) const noexcept + size_t operator()(const uint64& k) const noexcept { return k; } @@ -277,7 +277,6 @@ public: // texture functions void* texture_acquireTextureUploadBuffer(uint32 size) override; void texture_releaseTextureUploadBuffer(uint8* mem) override; - TextureDecoder* texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) override; @@ -370,7 +369,7 @@ private: VkRect2D currentScissorRect{}; // vertex bindings - struct + struct { uint32 offset; }currentVertexBinding[LATTE_MAX_VERTEX_BUFFERS]{}; @@ -457,17 
+456,17 @@ private: bool shaderRoundingModeRTEFloat32{ false }; }shaderFloatControls; // from VK_KHR_shader_float_controls - struct + struct { bool debug_utils = false; // VK_EXT_DEBUG_UTILS }instanceExtensions; - struct + struct { bool useTFEmulationViaSSBO = true; // emulate transform feedback via shader writes to a storage buffer }mode; - struct + struct { uint32 minUniformBufferOffsetAlignment = 256; uint32 nonCoherentAtomSize = 256; @@ -497,7 +496,7 @@ private: void CreateCommandBuffers(); void swapchain_createDescriptorSetLayout(); - + // shader bool IsAsyncPipelineAllowed(uint32 numIndices); @@ -512,6 +511,8 @@ private: void DeleteFontTextures() override; bool BeginFrame(bool mainWindow) override; + bool UseTFViaSSBO() const override { return m_featureControl.mode.useTFEmulationViaSSBO; } + // drawcall emulation PipelineInfo* draw_createGraphicsPipeline(uint32 indexCount); PipelineInfo* draw_getOrCreateGraphicsPipeline(uint32 indexCount); @@ -574,7 +575,7 @@ private: VkDevice m_logicalDevice = VK_NULL_HANDLE; VkDebugUtilsMessengerEXT m_debugCallback = nullptr; volatile bool m_destructionRequested = false; - + QueueFamilyIndices m_indices{}; Semaphore m_pipeline_cache_semaphore; @@ -583,7 +584,7 @@ private: VkPipelineCache m_pipeline_cache{ nullptr }; VkPipelineLayout m_pipelineLayout{nullptr}; VkCommandPool m_commandPool{ nullptr }; - + // buffer to cache uniform vars VkBuffer m_uniformVarBuffer = VK_NULL_HANDLE; VkDeviceMemory m_uniformVarBufferMemory = VK_NULL_HANDLE; @@ -652,19 +653,19 @@ private: bool m_submitOnIdle{}; // submit current buffer if Latte command processor goes into idle state (no more commands or waiting for externally signaled condition) // tracking for dynamic offsets - struct + struct { uint32 uniformVarBufferOffset[VulkanRendererConst::SHADER_STAGE_INDEX_COUNT]; - struct + struct { uint32 unformBufferOffset[LATTE_NUM_MAX_UNIFORM_BUFFERS]; }shaderUB[VulkanRendererConst::SHADER_STAGE_INDEX_COUNT]; }dynamicOffsetInfo{}; // streamout - 
struct + struct { - struct + struct { bool enabled; uint32 ringBufferOffset; @@ -714,11 +715,11 @@ private: accessFlags = 0; if constexpr ((TSyncOp & BUFFER_SHADER_READ) != 0) { - // in theory: VK_ACCESS_INDEX_READ_BIT should be set here too but indices are currently separated + // in theory: VK_ACCESS_INDEX_READ_BIT should be set here too but indices are currently separated stages |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; accessFlags |= VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT | VK_ACCESS_SHADER_READ_BIT; } - + if constexpr ((TSyncOp & BUFFER_SHADER_WRITE) != 0) { stages |= VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; @@ -921,7 +922,6 @@ private: public: bool GetDisableMultithreadedCompilation() const { return m_featureControl.disableMultithreadedCompilation; } - bool UseTFViaSSBO() const { return m_featureControl.mode.useTFEmulationViaSSBO; } bool HasSPRIVRoundingModeRTE32() const { return m_featureControl.shaderFloatControls.shaderRoundingModeRTEFloat32; } bool IsDebugUtilsEnabled() const { return m_featureControl.debugMarkersSupported && m_featureControl.instanceExtensions.debug_utils; } @@ -931,7 +931,7 @@ private: void debug_genericBarrier(); // shaders - struct + struct { RendererShaderVk* copySurface_vs{}; RendererShaderVk* copySurface_psDepth2Color{};