implement transform feedback

This commit is contained in:
Samuliak 2024-08-08 13:52:48 +02:00
parent e0791c3bf4
commit 5c246d55bd
7 changed files with 65 additions and 54 deletions

View File

@ -9,6 +9,7 @@
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h"
#include "Cafe/OS/libs/gx2/GX2.h" // todo - remove dependency
#include "Cafe/GraphicPack/GraphicPack2.h"
#include "HW/Latte/Renderer/Renderer.h"
#include "util/helpers/StringParser.h"
#include "config/ActiveSettings.h"
#include "Cafe/GameProfile/GameProfile.h"
@ -688,9 +689,9 @@ void LatteShader_GetDecompilerOptions(LatteDecompilerOptions& options, LatteCons
{
options.usesGeometryShader = geometryShaderEnabled;
options.spirvInstrinsics.hasRoundingModeRTEFloat32 = false;
options.useTFViaSSBO = g_renderer->UseTFViaSSBO();
if (g_renderer->GetType() == RendererAPI::Vulkan)
{
options.useTFViaSSBO = VulkanRenderer::GetInstance()->UseTFViaSSBO();
options.spirvInstrinsics.hasRoundingModeRTEFloat32 = VulkanRenderer::GetInstance()->HasSPRIVRoundingModeRTE32();
}
options.strictMul = g_current_game_profile->GetAccurateShaderMul() != AccurateShaderMulOption::False;

View File

@ -2752,9 +2752,9 @@ static void _emitTEXGetGradientsHV(LatteDecompilerShaderContext* shaderContext,
const char* funcName;
if (texInstruction->opcode == GPU7_TEX_INST_GET_GRADIENTS_H)
funcName = "dFdx";
funcName = "dfdx";
else
funcName = "dFdy";
funcName = "dfdy";
src->add(" = ");
@ -3273,15 +3273,8 @@ static void _emitCFRingWriteCode(LatteDecompilerShaderContext* shaderContext, La
if ((cfInstruction->memWriteCompMask&(1 << i)) == 0)
continue;
if (shaderContext->options->useTFViaSSBO)
{
uint32 u32Offset = streamWrite->exportArrayBase + i;
src->addFmt("sb_buffer[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset);
}
else
{
src->addFmt("sb{}[{}]", streamWrite->bufferIndex, streamWrite->exportArrayBase + i);
}
uint32 u32Offset = streamWrite->exportArrayBase + i;
src->addFmt("sb[sbBase{} + {}]", streamWrite->bufferIndex, u32Offset);
src->add(" = ");
@ -3393,15 +3386,8 @@ static void _emitStreamWriteCode(LatteDecompilerShaderContext* shaderContext, La
if ((cfInstruction->memWriteCompMask&(1 << i)) == 0)
continue;
if (shaderContext->options->useTFViaSSBO)
{
uint32 u32Offset = cfInstruction->exportArrayBase + i;
src->addFmt("sb_buffer[sbBase{} + {}]", streamoutBufferIndex, u32Offset);
}
else
{
src->addFmt("sb{}[{}]", streamoutBufferIndex, cfInstruction->exportArrayBase + i);
}
uint32 u32Offset = cfInstruction->exportArrayBase + i;
src->addFmt("sb[sbBase{} + {}]", streamoutBufferIndex, u32Offset);
src->add(" = ");
@ -3595,15 +3581,12 @@ void LatteDecompiler_emitClauseCodeMSL(LatteDecompilerShaderContext* shaderConte
// emit vertex
src->add("EmitVertex();" _CRLF);
// increment transform feedback pointer
if (shaderContext->analyzer.useSSBOForStreamout)
for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++)
{
for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++)
{
if (!shaderContext->output->streamoutBufferWriteMask[i])
continue;
cemu_assert_debug((shaderContext->output->streamoutBufferStride[i] & 3) == 0);
src->addFmt("sbBase{} += {};" _CRLF, i, shaderContext->output->streamoutBufferStride[i] / 4);
}
if (!shaderContext->output->streamoutBufferWriteMask[i])
continue;
cemu_assert_debug((shaderContext->output->streamoutBufferStride[i] & 3) == 0);
src->addFmt("sbBase{} += {};" _CRLF, i, shaderContext->output->streamoutBufferStride[i] / 4);
}
if( shaderContext->analyzer.modifiesPixelActiveState )
@ -3970,7 +3953,7 @@ void LatteDecompiler_emitMSLShader(LatteDecompilerShaderContext* shaderContext,
src->addFmt("float cubeMapArrayIndex{} = 0.0;" _CRLF, i);
}
// init base offset for streamout buffer writes
if (shaderContext->analyzer.useSSBOForStreamout && (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry))
if (shader->shaderType == LatteConst::ShaderType::Vertex || shader->shaderType == LatteConst::ShaderType::Geometry)
{
for (sint32 i = 0; i < LATTE_NUM_STREAMOUT_BUFFER; i++)
{

View File

@ -94,9 +94,8 @@ namespace LatteDecompiler
uniformCurrentOffset += 8;
}
// define verticesPerInstance + streamoutBufferBaseX
if (decompilerContext->analyzer.useSSBOForStreamout &&
(shader->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader == false) ||
(shader->shaderType == LatteConst::ShaderType::Geometry) )
if ((shader->shaderType == LatteConst::ShaderType::Vertex && decompilerContext->options->usesGeometryShader == false) ||
(shader->shaderType == LatteConst::ShaderType::Geometry))
{
src->add("int verticesPerInstance;" _CRLF);
uniformOffsets.offset_verticesPerInstance = uniformCurrentOffset;
@ -251,8 +250,6 @@ namespace LatteDecompiler
{
_emitAttributes(decompilerContext);
_emitVSOutputs(decompilerContext);
// TODO: transform feedback
}
else if (decompilerContext->shaderType == LatteConst::ShaderType::Pixel)
{
@ -379,6 +376,13 @@ namespace LatteDecompiler
case LatteConst::ShaderType::Vertex:
src->add(", uint vid [[vertex_id]]");
src->add(", uint iid [[instance_id]]");
// streamout buffer (transform feedback)
if (decompilerContext->analyzer.hasStreamoutEnable && decompilerContext->analyzer.hasStreamoutWrite)
{
src->addFmt(", device int* sb [[buffer({})]]" _CRLF, decompilerContext->output->resourceMappingVK.getTFStorageBufferBindingPoint());
}
break;
case LatteConst::ShaderType::Pixel:
src->add(", bool frontFacing [[front_facing]]");

View File

@ -16,6 +16,7 @@
#include "Cemu/Logging/CemuDebugLogging.h"
#include "HW/Latte/Core/Latte.h"
#include "HW/Latte/ISA/LatteReg.h"
#include "Metal/MTLResource.hpp"
#include "Metal/MTLTypes.hpp"
#include "gui/guiWrapper.h"
@ -39,6 +40,9 @@ MetalRenderer::MetalRenderer()
// Texture readback
m_readbackBuffer = m_device->newBuffer(TEXTURE_READBACK_SIZE, MTL::StorageModeShared);
// Transform feedback
m_xfbRingBuffer = m_device->newBuffer(LatteStreamout_GetRingBufferSize(), MTL::StorageModeShared);
// Initialize state
for (uint32 i = 0; i < (uint32)LatteConst::ShaderType::TotalCount; i++)
{
@ -1185,7 +1189,21 @@ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandE
// Storage buffer
if (shader->resourceMapping.tfStorageBindingPoint >= 0)
{
debug_printf("storage buffer not implemented, index: %i\n", shader->resourceMapping.tfStorageBindingPoint);
switch (shader->shaderType)
{
case LatteConst::ShaderType::Vertex:
{
renderCommandEncoder->setVertexBuffer(m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint);
break;
}
case LatteConst::ShaderType::Pixel:
{
renderCommandEncoder->setFragmentBuffer(m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint);
break;
}
default:
UNREACHABLE;
}
}
}

View File

@ -147,6 +147,7 @@ public:
cemuLog_log(LogType::MetalLogging, "Imgui is not yet supported on Metal");
};
// Always reports true for this renderer. The value is read by the shader decompiler
// setup (options.useTFViaSSBO) to select storage-buffer based transform feedback.
bool UseTFViaSSBO() const override { return true; }
void AppendOverlayDebugInfo() override;
// rendertarget
@ -265,6 +266,9 @@ private:
MTL::Buffer* m_readbackBuffer;
uint32 m_readbackBufferWriteOffset = 0;
// Transform feedback
MTL::Buffer* m_xfbRingBuffer;
// Active objects
MTL::CommandBuffer* m_commandBuffer = nullptr;
MetalEncoderType m_encoderType = MetalEncoderType::None;

View File

@ -85,6 +85,7 @@ public:
virtual void DeleteFontTextures() = 0;
GfxVendor GetVendor() const { return m_vendor; }
// Whether transform feedback is emulated via shader writes to a storage buffer.
// The base implementation returns false; a renderer overrides this to opt in.
virtual bool UseTFViaSSBO() const { return false; }
virtual void AppendOverlayDebugInfo() = 0;
// rendertarget

View File

@ -73,11 +73,11 @@ public:
return true;
}
template<typename T>
struct direct_hash
{
size_t operator()(const uint64& k) const noexcept
size_t operator()(const uint64& k) const noexcept
{
return k;
}
@ -277,7 +277,6 @@ public:
// texture functions
void* texture_acquireTextureUploadBuffer(uint32 size) override;
void texture_releaseTextureUploadBuffer(uint8* mem) override;
TextureDecoder* texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height) override;
@ -370,7 +369,7 @@ private:
VkRect2D currentScissorRect{};
// vertex bindings
struct
struct
{
uint32 offset;
}currentVertexBinding[LATTE_MAX_VERTEX_BUFFERS]{};
@ -457,17 +456,17 @@ private:
bool shaderRoundingModeRTEFloat32{ false };
}shaderFloatControls; // from VK_KHR_shader_float_controls
struct
struct
{
bool debug_utils = false; // VK_EXT_DEBUG_UTILS
}instanceExtensions;
struct
struct
{
bool useTFEmulationViaSSBO = true; // emulate transform feedback via shader writes to a storage buffer
}mode;
struct
struct
{
uint32 minUniformBufferOffsetAlignment = 256;
uint32 nonCoherentAtomSize = 256;
@ -497,7 +496,7 @@ private:
void CreateCommandBuffers();
void swapchain_createDescriptorSetLayout();
// shader
bool IsAsyncPipelineAllowed(uint32 numIndices);
@ -512,6 +511,8 @@ private:
void DeleteFontTextures() override;
bool BeginFrame(bool mainWindow) override;
// Returns the feature-control mode flag useTFEmulationViaSSBO
// (transform feedback emulated via shader writes to a storage buffer).
bool UseTFViaSSBO() const override { return m_featureControl.mode.useTFEmulationViaSSBO; }
// drawcall emulation
PipelineInfo* draw_createGraphicsPipeline(uint32 indexCount);
PipelineInfo* draw_getOrCreateGraphicsPipeline(uint32 indexCount);
@ -574,7 +575,7 @@ private:
VkDevice m_logicalDevice = VK_NULL_HANDLE;
VkDebugUtilsMessengerEXT m_debugCallback = nullptr;
volatile bool m_destructionRequested = false;
QueueFamilyIndices m_indices{};
Semaphore m_pipeline_cache_semaphore;
@ -583,7 +584,7 @@ private:
VkPipelineCache m_pipeline_cache{ nullptr };
VkPipelineLayout m_pipelineLayout{nullptr};
VkCommandPool m_commandPool{ nullptr };
// buffer to cache uniform vars
VkBuffer m_uniformVarBuffer = VK_NULL_HANDLE;
VkDeviceMemory m_uniformVarBufferMemory = VK_NULL_HANDLE;
@ -652,19 +653,19 @@ private:
bool m_submitOnIdle{}; // submit current buffer if Latte command processor goes into idle state (no more commands or waiting for externally signaled condition)
// tracking for dynamic offsets
struct
struct
{
uint32 uniformVarBufferOffset[VulkanRendererConst::SHADER_STAGE_INDEX_COUNT];
struct
struct
{
uint32 unformBufferOffset[LATTE_NUM_MAX_UNIFORM_BUFFERS];
}shaderUB[VulkanRendererConst::SHADER_STAGE_INDEX_COUNT];
}dynamicOffsetInfo{};
// streamout
struct
struct
{
struct
struct
{
bool enabled;
uint32 ringBufferOffset;
@ -714,11 +715,11 @@ private:
accessFlags = 0;
if constexpr ((TSyncOp & BUFFER_SHADER_READ) != 0)
{
// in theory: VK_ACCESS_INDEX_READ_BIT should be set here too but indices are currently separated
// in theory: VK_ACCESS_INDEX_READ_BIT should be set here too but indices are currently separated
stages |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
accessFlags |= VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT | VK_ACCESS_SHADER_READ_BIT;
}
if constexpr ((TSyncOp & BUFFER_SHADER_WRITE) != 0)
{
stages |= VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
@ -921,7 +922,6 @@ private:
public:
// Returns the feature-control flag disableMultithreadedCompilation.
bool GetDisableMultithreadedCompilation() const { return m_featureControl.disableMultithreadedCompilation; }
bool UseTFViaSSBO() const { return m_featureControl.mode.useTFEmulationViaSSBO; }
// Returns the stored shaderRoundingModeRTEFloat32 flag from the shaderFloatControls
// feature group (VK_KHR_shader_float_controls).
bool HasSPRIVRoundingModeRTE32() const { return m_featureControl.shaderFloatControls.shaderRoundingModeRTEFloat32; }
// True only when debug markers are supported AND the debug_utils instance
// extension flag (VK_EXT_DEBUG_UTILS) is set.
bool IsDebugUtilsEnabled() const { return m_featureControl.debugMarkersSupported && m_featureControl.instanceExtensions.debug_utils; }
@ -931,7 +931,7 @@ private:
void debug_genericBarrier();
// shaders
struct
struct
{
RendererShaderVk* copySurface_vs{};
RendererShaderVk* copySurface_psDepth2Color{};