From add59b3bea032e331363dcf28361f8e0de65e43d Mon Sep 17 00:00:00 2001 From: NanoByte011 Date: Tue, 13 Jan 2015 02:55:25 -0700 Subject: [PATCH] Fixes Mario Tennis Gimmick Courts and adds support for FastDepthCalc - Calculate ZSlope every flush but only set PixelShader Constant on Reset Buffer when zfreeze - Fixed another Pixel Shader bug in D3D that was giving me grief --- .../Core/VideoBackends/D3D/VertexManager.cpp | 9 ++++---- .../Core/VideoBackends/OGL/VertexManager.cpp | 9 ++++---- Source/Core/VideoCommon/PixelShaderGen.cpp | 18 ++++++++++----- .../Core/VideoCommon/PixelShaderManager.cpp | 5 +++-- Source/Core/VideoCommon/VertexManagerBase.cpp | 22 +++++++++++++------ Source/Core/VideoCommon/VertexManagerBase.h | 12 ++++++++-- .../Core/VideoCommon/VertexShaderManager.cpp | 3 +-- 7 files changed, 52 insertions(+), 26 deletions(-) diff --git a/Source/Core/VideoBackends/D3D/VertexManager.cpp b/Source/Core/VideoBackends/D3D/VertexManager.cpp index 2c38ac9d22..8546ed8ca7 100644 --- a/Source/Core/VideoBackends/D3D/VertexManager.cpp +++ b/Source/Core/VideoBackends/D3D/VertexManager.cpp @@ -181,12 +181,10 @@ void VertexManager::vFlush(bool useDstAlpha) PrepareDrawBuffers(stride); - if (!bpmem.genMode.zfreeze && IndexGenerator::GetIndexLen() >= 3) - { + if (!bpmem.genMode.zfreeze) CalculateZSlope(stride); - } - // if cull mode is CULL_ALL, ignore triangles and quads + // If cull mode is CULL_ALL, do not render these triangles if (bpmem.genMode.cullmode == GenMode::CULL_ALL && current_primitive_type == PRIMITIVE_TRIANGLES) return; @@ -202,6 +200,9 @@ void VertexManager::ResetBuffer(u32 stride) { s_pCurBufferPointer = s_pBaseBufferPointer; IndexGenerator::Start(GetIndexBuffer()); + + if (bpmem.genMode.zfreeze) + PixelShaderManager::SetZSlope(ZSlope.dfdx, ZSlope.dfdy, ZSlope.f0); } } // namespace diff --git a/Source/Core/VideoBackends/OGL/VertexManager.cpp b/Source/Core/VideoBackends/OGL/VertexManager.cpp index 859a3b8db4..81c377fd02 100644 --- 
a/Source/Core/VideoBackends/OGL/VertexManager.cpp +++ b/Source/Core/VideoBackends/OGL/VertexManager.cpp @@ -89,6 +89,9 @@ void VertexManager::ResetBuffer(u32 stride) buffer = s_indexBuffer->Map(MAXIBUFFERSIZE * sizeof(u16)); IndexGenerator::Start((u16*)buffer.first); s_index_offset = buffer.second; + + if (bpmem.genMode.zfreeze) + PixelShaderManager::SetZSlope(ZSlope.dfdx, ZSlope.dfdy, ZSlope.f0); } void VertexManager::Draw(u32 stride) @@ -140,12 +143,10 @@ void VertexManager::vFlush(bool useDstAlpha) PrepareDrawBuffers(stride); - if (!bpmem.genMode.zfreeze && IndexGenerator::GetIndexLen() >= 3) - { + if (!bpmem.genMode.zfreeze) CalculateZSlope(stride); - } - // if cull mode is CULL_ALL, ignore triangles and quads + // If cull mode is CULL_ALL, do not render these triangles if (bpmem.genMode.cullmode == GenMode::CULL_ALL && current_primitive_type == PRIMITIVE_TRIANGLES) return; diff --git a/Source/Core/VideoCommon/PixelShaderGen.cpp b/Source/Core/VideoCommon/PixelShaderGen.cpp index 5899f0100b..bd57f0c888 100644 --- a/Source/Core/VideoCommon/PixelShaderGen.cpp +++ b/Source/Core/VideoCommon/PixelShaderGen.cpp @@ -271,7 +271,11 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T GenerateVSOutputMembers(out, ApiType); out.Write("};\n"); - const bool forced_early_z = g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest() && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED); + const bool forced_early_z = g_ActiveConfig.backend_info.bSupportsEarlyZ && bpmem.UseEarlyDepthTest() + && (g_ActiveConfig.bFastDepthCalc || bpmem.alpha_test.TestResult() == AlphaTest::UNDETERMINED) + // We can't allow early_ztest for zfreeze because a reference poly is used + // to control the depth and we need a depth test after the alpha test. 
+ && !bpmem.genMode.zfreeze; const bool per_pixel_depth = (bpmem.ztex2.op != ZTEXTURE_DISABLE && bpmem.UseLateDepthTest()) || (!g_ActiveConfig.bFastDepthCalc && bpmem.zmode.testenable && !forced_early_z) || bpmem.genMode.zfreeze; if (forced_early_z) @@ -365,7 +369,7 @@ static inline void GeneratePixelShader(T& out, DSTALPHA_MODE dstAlphaMode, API_T out.Write("void main(\n"); out.Write(" out float4 ocol0 : SV_Target0,%s%s\n in float4 rawpos : SV_Position,\n", dstAlphaMode == DSTALPHA_DUAL_SOURCE_BLEND ? "\n out float4 ocol1 : SV_Target1," : "", - per_pixel_depth ? "\n out float depth : SV_Depth," : ""); + (per_pixel_depth && bpmem.zmode.testenable) ? "\n out float depth : SV_Depth," : ""); out.Write(" in centroid float4 colors_0 : COLOR0,\n"); out.Write(" in centroid float4 colors_1 : COLOR1\n"); @@ -1023,7 +1027,11 @@ static inline void WriteAlphaTest(T& out, pixel_shader_uid_data* uid_data, API_T // Tests seem to have proven that writing depth even when the alpha test fails is more // important that a reliable alpha test, so we just force the alpha test to always succeed. // At least this seems to be less buggy. 
- uid_data->alpha_test_use_zcomploc_hack = bpmem.UseEarlyDepthTest() && bpmem.zmode.updateenable && !g_ActiveConfig.backend_info.bSupportsEarlyZ; + uid_data->alpha_test_use_zcomploc_hack = bpmem.UseEarlyDepthTest() + && bpmem.zmode.updateenable + && !g_ActiveConfig.backend_info.bSupportsEarlyZ + && !bpmem.genMode.zfreeze; // Might not be necessary + if (!uid_data->alpha_test_use_zcomploc_hack) { out.Write("\t\tdiscard;\n"); @@ -1114,10 +1122,10 @@ static inline void WritePerPixelDepth(T& out, pixel_shader_uid_data* uid_data, A out.Write("\tfloat2 screenpos = rawpos.xy * " I_EFBSCALE".xy;\n"); // Opengl has reversed vertical screenspace coordiantes - if(ApiType == API_OPENGL) + if (ApiType == API_OPENGL) out.Write("\tscreenpos.y = %i - screenpos.y - 1;\n", EFB_HEIGHT); - out.Write("\tdepth = float(" I_ZSLOPE".z + " I_ZSLOPE".x * screenpos.x + " I_ZSLOPE".y * screenpos.y) / float(0xffffff);\n"); + out.Write("\tdepth = float(" I_ZSLOPE".z + " I_ZSLOPE".x * screenpos.x + " I_ZSLOPE".y * screenpos.y) / float(0xFFFFFF);\n"); } else { diff --git a/Source/Core/VideoCommon/PixelShaderManager.cpp b/Source/Core/VideoCommon/PixelShaderManager.cpp index a94dfaf991..f80fc114ce 100644 --- a/Source/Core/VideoCommon/PixelShaderManager.cpp +++ b/Source/Core/VideoCommon/PixelShaderManager.cpp @@ -50,7 +50,7 @@ void PixelShaderManager::Dirty() SetZTextureBias(); SetViewportChanged(); SetEfbScaleChanged(); - SetZSlope(0, 0, 1); + SetZSlope(0, 0, (float)0xFFFFFF); SetIndTexScaleChanged(false); SetIndTexScaleChanged(true); SetIndMatrixChanged(0); @@ -116,7 +116,8 @@ void PixelShaderManager::SetConstants() s_bViewPortChanged = false; } - if (s_bEFBScaleChanged) { + if (s_bEFBScaleChanged) + { constants.efbscale[0] = 1.0f / float(Renderer::EFBToScaledXf(1)); constants.efbscale[1] = 1.0f / float(Renderer::EFBToScaledYf(1)); dirty = true; diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index bcdaf466a4..23eb770c6d 100644 --- 
a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -25,6 +25,8 @@ u8 *VertexManager::s_pEndBufferPointer; PrimitiveType VertexManager::current_primitive_type; +Slope VertexManager::ZSlope; + bool VertexManager::IsFlushed; static const PrimitiveType primitive_from_gx[8] = { @@ -246,6 +248,8 @@ void VertexManager::CalculateZSlope(u32 stride) { float vtx[9]; float out[12]; + float viewOffset[2] = { xfmem.viewport.xOrig - bpmem.scissorOffset.x * 2, + xfmem.viewport.yOrig - bpmem.scissorOffset.y * 2}; // Lookup vertices of the last rendered triangle and software-transform them // This allows us to determine the depth slope, which will be used if zfreeze @@ -260,9 +264,11 @@ void VertexManager::CalculateZSlope(u32 stride) VertexShaderManager::TransformToClipSpace(&vtx[i * 3], &out[i * 4]); // Transform to Screenspace - out[0 + i * 4] = out[0 + i * 4] / out[3 + i * 4] * xfmem.viewport.wd + (xfmem.viewport.xOrig - 342); - out[1 + i * 4] = out[1 + i * 4] / out[3 + i * 4] * xfmem.viewport.ht + (xfmem.viewport.yOrig - 342); - out[2 + i * 4] = out[2 + i * 4] / out[3 + i * 4] * xfmem.viewport.zRange + xfmem.viewport.farZ; + float w = out[3 + i * 4]; + + out[0 + i * 4] = out[0 + i * 4] / w * xfmem.viewport.wd + viewOffset[0]; + out[1 + i * 4] = out[1 + i * 4] / w * xfmem.viewport.ht + viewOffset[1]; + out[2 + i * 4] = out[2 + i * 4] / w * xfmem.viewport.zRange + xfmem.viewport.farZ; } float dx31 = out[8] - out[0]; @@ -276,9 +282,11 @@ void VertexManager::CalculateZSlope(u32 stride) float b = dx31 * DF21 + dx12 * DF31; float c = -dx12 * dy31 - dx31 * -dy12; - float slope_dfdx = -a / c; - float slope_dfdy = -b / c; - float slope_f0 = out[2] - (out[0] * slope_dfdx + out[1] * slope_dfdy); + // Stop divide by zero + if (c == 0) + return; - PixelShaderManager::SetZSlope(slope_dfdx, slope_dfdy, slope_f0); + ZSlope.dfdx = -a / c; + ZSlope.dfdy = -b / c; + ZSlope.f0 = out[2] - (out[0] * ZSlope.dfdx + out[1] * ZSlope.dfdy); } diff --git 
a/Source/Core/VideoCommon/VertexManagerBase.h b/Source/Core/VideoCommon/VertexManagerBase.h index 524f3e5a0c..143e6b811c 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.h +++ b/Source/Core/VideoCommon/VertexManagerBase.h @@ -14,6 +14,13 @@ enum PrimitiveType { PRIMITIVE_TRIANGLES, }; +struct Slope +{ + float dfdx; + float dfdy; + float f0; +}; + class VertexManager { private: @@ -41,8 +48,6 @@ public: static void DoState(PointerWrap& p); - static void CalculateZSlope(u32 stride); - protected: virtual void vDoState(PointerWrap& p) { } @@ -57,6 +62,9 @@ protected: static u32 GetRemainingSize(); static u32 GetRemainingIndices(int primitive); + static Slope ZSlope; + static void CalculateZSlope(u32 stride); + private: static bool IsFlushed; diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp index a745f7004f..5320e0af2e 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.cpp +++ b/Source/Core/VideoCommon/VertexShaderManager.cpp @@ -692,6 +692,7 @@ void VertexShaderManager::ResetView() void VertexShaderManager::TransformToClipSpace(const float* data, float *out) { + // Can we use constants.posnormalmatrix here instead? const float *world_matrix = (const float *)xfmem.posMatrices + g_main_cp_state.matrix_index_a.PosNormalMtxIdx * 4; const float *proj_matrix = &g_fProjectionMatrix[0]; @@ -700,8 +701,6 @@ void VertexShaderManager::TransformToClipSpace(const float* data, float *out) t[1] = data[0] * world_matrix[4] + data[1] * world_matrix[5] + data[2] * world_matrix[6] + world_matrix[7]; t[2] = data[0] * world_matrix[8] + data[1] * world_matrix[9] + data[2] * world_matrix[10] + world_matrix[11]; - // TODO: this requires g_fProjectionMatrix to be up to date, which is not really a good design decision. 
- out[0] = t[0] * proj_matrix[0] + t[1] * proj_matrix[1] + t[2] * proj_matrix[2] + proj_matrix[3]; out[1] = t[0] * proj_matrix[4] + t[1] * proj_matrix[5] + t[2] * proj_matrix[6] + proj_matrix[7]; out[2] = t[0] * proj_matrix[8] + t[1] * proj_matrix[9] + t[2] * proj_matrix[10] + proj_matrix[11];