diff --git a/Source/Core/Core/Config/GraphicsSettings.cpp b/Source/Core/Core/Config/GraphicsSettings.cpp index c52403d476..da1f82e1ea 100644 --- a/Source/Core/Core/Config/GraphicsSettings.cpp +++ b/Source/Core/Core/Config/GraphicsSettings.cpp @@ -93,6 +93,7 @@ const Info GFX_SAVE_TEXTURE_CACHE_TO_STATE{ {System::GFX, "Settings", "SaveTextureCacheToState"}, true}; const Info GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION{ {System::GFX, "Settings", "PreferVSForLinePointExpansion"}, false}; +const Info GFX_CPU_CULL{{System::GFX, "Settings", "CPUCull"}, false}; const Info GFX_MTL_MANUALLY_UPLOAD_BUFFERS{ {System::GFX, "Settings", "ManuallyUploadBuffers"}, TriState::Auto}; diff --git a/Source/Core/Core/Config/GraphicsSettings.h b/Source/Core/Core/Config/GraphicsSettings.h index df97bd12c2..13c8bb8148 100644 --- a/Source/Core/Core/Config/GraphicsSettings.h +++ b/Source/Core/Core/Config/GraphicsSettings.h @@ -82,6 +82,7 @@ extern const Info GFX_SHADER_COMPILER_THREADS; extern const Info GFX_SHADER_PRECOMPILER_THREADS; extern const Info GFX_SAVE_TEXTURE_CACHE_TO_STATE; extern const Info GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION; +extern const Info GFX_CPU_CULL; extern const Info GFX_MTL_MANUALLY_UPLOAD_BUFFERS; extern const Info GFX_MTL_USE_PRESENT_DRAWABLE; diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index 4d0597ba88..9bee673db4 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -30,6 +30,7 @@ #include "VideoCommon/VertexLoaderBase.h" #include "VideoCommon/VertexManagerBase.h" #include "VideoCommon/VertexShaderManager.h" +#include "VideoCommon/VideoConfig.h" #include "VideoCommon/XFMemory.h" namespace VertexLoaderManager @@ -366,17 +367,33 @@ int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int coun vertex_shader_manager.SetVertexFormat(loader->m_native_components, loader->m_native_vertex_format->GetVertexDeclaration()); - // if 
cull mode is CULL_ALL, tell VertexManager to skip triangles and quads. - // They still need to go through vertex loading, because we need to calculate a zfreeze refrence - // slope. - bool cullall = (bpmem.genMode.cullmode == CullMode::All && - primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES); + // CPUCull's performance increase comes from encoding fewer GPU commands, not sending less data + // Therefore it's only useful to check if culling could remove a flush + const bool can_cpu_cull = g_ActiveConfig.bCPUCull && + primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES && + !g_vertex_manager->HasSendableVertices(); - DataReader dst = g_vertex_manager->PrepareForAdditionalData( - primitive, count, loader->m_native_vtx_decl.stride, cullall); + // if cull mode is CULL_ALL, tell VertexManager to skip triangles and quads. + // They still need to go through vertex loading, because we need to calculate a zfreeze + // reference slope. + const bool cullall = (bpmem.genMode.cullmode == CullMode::All && + primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES); + + const int stride = loader->m_native_vtx_decl.stride; + DataReader dst = g_vertex_manager->PrepareForAdditionalData(primitive, count, stride, + cullall || can_cpu_cull); count = loader->RunVertices(src, dst.GetPointer(), count); + if (can_cpu_cull && !cullall) + { + if (!g_vertex_manager->AreAllVerticesCulled(loader, primitive, dst.GetPointer(), count)) + { + DataReader new_dst = g_vertex_manager->DisableCullAll(stride); + memmove(new_dst.GetPointer(), dst.GetPointer(), count * stride); + } + } + g_vertex_manager->AddIndices(primitive, count); g_vertex_manager->FlushData(count, loader->m_native_vtx_decl.stride); diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index 88d7c47fba..88bdfccb24 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -104,6 +104,7 @@ VertexManagerBase::~VertexManagerBase() = 
default; bool VertexManagerBase::Initialize() { m_index_generator.Init(); + m_cpu_cull.Init(); return true; } @@ -117,6 +118,13 @@ void VertexManagerBase::AddIndices(OpcodeDecoder::Primitive primitive, u32 num_v m_index_generator.AddIndices(primitive, num_vertices); } +bool VertexManagerBase::AreAllVerticesCulled(VertexLoaderBase* loader, + OpcodeDecoder::Primitive primitive, const u8* src, + u32 count) +{ + return m_cpu_cull.AreAllVerticesCulled(loader, primitive, src, count); +} + DataReader VertexManagerBase::PrepareForAdditionalData(OpcodeDecoder::Primitive primitive, u32 count, u32 stride, bool cullall) { @@ -185,6 +193,16 @@ DataReader VertexManagerBase::PrepareForAdditionalData(OpcodeDecoder::Primitive return DataReader(m_cur_buffer_pointer, m_end_buffer_pointer); } +DataReader VertexManagerBase::DisableCullAll(u32 stride) +{ + if (m_cull_all) + { + m_cull_all = false; + ResetBuffer(stride); + } + return DataReader(m_cur_buffer_pointer, m_end_buffer_pointer); +} + void VertexManagerBase::FlushData(u32 count, u32 stride) { m_cur_buffer_pointer += count * stride; @@ -546,6 +564,8 @@ void VertexManagerBase::Flush() // Now the vertices can be flushed to the GPU. Everything following the CommitBuffer() call // must be careful to not upload any utility vertices, as the binding will be lost otherwise. 
const u32 num_indices = m_index_generator.GetIndexLen(); + if (num_indices == 0) + return; u32 base_vertex, base_index; CommitBuffer(m_index_generator.GetNumVerts(), VertexLoaderManager::GetCurrentVertexFormat()->GetVertexStride(), num_indices, diff --git a/Source/Core/VideoCommon/VertexManagerBase.h b/Source/Core/VideoCommon/VertexManagerBase.h index ba3777a7fe..3b8180c5d2 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.h +++ b/Source/Core/VideoCommon/VertexManagerBase.h @@ -9,6 +9,7 @@ #include "Common/BitSet.h" #include "Common/CommonTypes.h" #include "Common/MathUtil.h" +#include "VideoCommon/CPUCull.h" #include "VideoCommon/IndexGenerator.h" #include "VideoCommon/RenderState.h" #include "VideoCommon/ShaderCache.h" @@ -100,11 +101,18 @@ public: PrimitiveType GetCurrentPrimitiveType() const { return m_current_primitive_type; } void AddIndices(OpcodeDecoder::Primitive primitive, u32 num_vertices); + bool AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive, + const u8* src, u32 count); virtual DataReader PrepareForAdditionalData(OpcodeDecoder::Primitive primitive, u32 count, u32 stride, bool cullall); + /// Switch cullall off after a call to PrepareForAdditionalData with cullall true + /// Expects that you will add a nonzero number of primitives before the next flush + /// Returns a DataReader for the vertex buffer, which is reset first if cullall was on + DataReader DisableCullAll(u32 stride); void FlushData(u32 count, u32 stride); void Flush(); + bool HasSendableVertices() const { return !m_is_flushed && !m_cull_all; } void DoState(PointerWrap& p); @@ -201,6 +209,7 @@ protected: bool m_cull_all = false; IndexGenerator m_index_generator; + CPUCull m_cpu_cull; private: // Minimum number of draws per command buffer when attempting to preempt a readback operation. 
diff --git a/Source/Core/VideoCommon/VideoConfig.cpp b/Source/Core/VideoCommon/VideoConfig.cpp index 66ba09c581..6283235cf9 100644 --- a/Source/Core/VideoCommon/VideoConfig.cpp +++ b/Source/Core/VideoCommon/VideoConfig.cpp @@ -113,6 +113,7 @@ void VideoConfig::Refresh() iShaderCompilationMode = Config::Get(Config::GFX_SHADER_COMPILATION_MODE); iShaderCompilerThreads = Config::Get(Config::GFX_SHADER_COMPILER_THREADS); iShaderPrecompilerThreads = Config::Get(Config::GFX_SHADER_PRECOMPILER_THREADS); + bCPUCull = Config::Get(Config::GFX_CPU_CULL); texture_filtering_mode = Config::Get(Config::GFX_ENHANCE_FORCE_TEXTURE_FILTERING); iMaxAnisotropy = Config::Get(Config::GFX_ENHANCE_MAX_ANISOTROPY); diff --git a/Source/Core/VideoCommon/VideoConfig.h b/Source/Core/VideoCommon/VideoConfig.h index d60f969b8d..628771860a 100644 --- a/Source/Core/VideoCommon/VideoConfig.h +++ b/Source/Core/VideoCommon/VideoConfig.h @@ -138,6 +138,7 @@ struct VideoConfig final bool bPerfQueriesEnable = false; bool bBBoxEnable = false; bool bForceProgressive = false; + bool bCPUCull = false; bool bEFBEmulateFormatChanges = false; bool bSkipEFBCopyToRam = false;