From b170ef9651cda959a5bfa277bb533815f10de5d8 Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Thu, 10 Nov 2022 19:30:49 -0600 Subject: [PATCH] VideoCommon: Add class for quickly transforming and culling vertices on the CPU --- Source/Core/DolphinLib.props | 3 + Source/Core/VideoCommon/CMakeLists.txt | 3 + Source/Core/VideoCommon/CPUCull.cpp | 160 ++++ Source/Core/VideoCommon/CPUCull.h | 38 + Source/Core/VideoCommon/CPUCullImpl.h | 714 ++++++++++++++++++ Source/Core/VideoCommon/ConstantManager.h | 6 +- .../Core/VideoCommon/VertexShaderManager.cpp | 172 +++-- Source/Core/VideoCommon/VertexShaderManager.h | 3 + Source/Core/VideoCommon/XFMemory.h | 2 +- 9 files changed, 1017 insertions(+), 84 deletions(-) create mode 100644 Source/Core/VideoCommon/CPUCull.cpp create mode 100644 Source/Core/VideoCommon/CPUCull.h create mode 100644 Source/Core/VideoCommon/CPUCullImpl.h diff --git a/Source/Core/DolphinLib.props b/Source/Core/DolphinLib.props index a7354ee00b..7bb0144137 100644 --- a/Source/Core/DolphinLib.props +++ b/Source/Core/DolphinLib.props @@ -634,6 +634,8 @@ + + @@ -1238,6 +1240,7 @@ + diff --git a/Source/Core/VideoCommon/CMakeLists.txt b/Source/Core/VideoCommon/CMakeLists.txt index a7a52c5e40..4e8bff36fc 100644 --- a/Source/Core/VideoCommon/CMakeLists.txt +++ b/Source/Core/VideoCommon/CMakeLists.txt @@ -23,6 +23,9 @@ add_library(videocommon ConstantManager.h CPMemory.cpp CPMemory.h + CPUCull.cpp + CPUCull.h + CPUCullImpl.h DriverDetails.cpp DriverDetails.h Fifo.cpp diff --git a/Source/Core/VideoCommon/CPUCull.cpp b/Source/Core/VideoCommon/CPUCull.cpp new file mode 100644 index 0000000000..bb68376b09 --- /dev/null +++ b/Source/Core/VideoCommon/CPUCull.cpp @@ -0,0 +1,160 @@ +// Copyright 2022 Dolphin Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "VideoCommon/CPUCull.h" + +#include "Common/Assert.h" +#include "Common/CPUDetect.h" +#include "Common/MathUtil.h" +#include "Common/MemoryUtil.h" +#include "Core/System.h" + +#include "VideoCommon/CPMemory.h" +#include "VideoCommon/VertexManagerBase.h" +#include "VideoCommon/VertexShaderManager.h" +#include "VideoCommon/VideoConfig.h" +#include "VideoCommon/XFMemory.h" + +#if defined(_M_X86) || defined(_M_X86_64) +#define USE_SSE +#elif defined(_M_ARM_64) +#define USE_NEON +#else +#define NO_SIMD +#endif + +#if defined(USE_SSE) +#include +#elif defined(USE_NEON) +#include +#endif + +#include "VideoCommon/CPUCullImpl.h" +#ifdef USE_SSE +#define USE_SSE3 +#include "VideoCommon/CPUCullImpl.h" +#define USE_SSE41 +#include "VideoCommon/CPUCullImpl.h" +#define USE_AVX +#include "VideoCommon/CPUCullImpl.h" +#define USE_FMA +#include "VideoCommon/CPUCullImpl.h" +#endif + +#if defined(USE_SSE) +#if defined(__AVX__) && defined(__FMA__) +static constexpr int MIN_SSE = 51; +#elif defined(__AVX__) +static constexpr int MIN_SSE = 50; +#elif defined(__SSE4_1__) +static constexpr int MIN_SSE = 41; +#elif defined(__SSE3__) +static constexpr int MIN_SSE = 30; +#else +static constexpr int MIN_SSE = 0; +#endif +#endif + +template +static CPUCull::TransformFunction GetTransformFunction() +{ +#if defined(USE_SSE) + if (MIN_SSE >= 51 || (cpu_info.bAVX && cpu_info.bFMA)) + return CPUCull_FMA::TransformVertices; + else if (MIN_SSE >= 50 || cpu_info.bAVX) + return CPUCull_AVX::TransformVertices; + else if (PositionHas3Elems && PerVertexPosMtx && (MIN_SSE >= 41 || cpu_info.bSSE4_1)) + return CPUCull_SSE41::TransformVertices; + else if (PositionHas3Elems && (MIN_SSE >= 30 || cpu_info.bSSE3)) + return CPUCull_SSE3::TransformVertices; + else + return CPUCull_SSE::TransformVertices; +#elif defined(USE_NEON) + return CPUCull_NEON::TransformVertices; +#else + return CPUCull_Scalar::TransformVertices; +#endif +} + +template +static CPUCull::CullFunction GetCullFunction0() +{ +#if defined(USE_SSE) + // Note: AVX version only actually AVX on compilers that support __attribute__((target)) + // Sorry, MSVC + Sandy Bridge. (Ivy+ and AMD see very little benefit thanks to mov elimination) + if (MIN_SSE >= 50 || cpu_info.bAVX) + return CPUCull_AVX::AreAllVerticesCulled; + else if (MIN_SSE >= 30 || cpu_info.bSSE3) + return CPUCull_SSE3::AreAllVerticesCulled; + else + return CPUCull_SSE::AreAllVerticesCulled; +#elif defined(USE_NEON) + return CPUCull_NEON::AreAllVerticesCulled; +#else + return CPUCull_Scalar::AreAllVerticesCulled; +#endif +} + +template +static Common::EnumMap GetCullFunction1() +{ + return { + GetCullFunction0(), + GetCullFunction0(), + GetCullFunction0(), + GetCullFunction0(), + }; +} + +CPUCull::~CPUCull() = default; + +void CPUCull::Init() +{ + m_transform_table[false][false] = GetTransformFunction(); + m_transform_table[false][true] = GetTransformFunction(); + m_transform_table[true][false] = GetTransformFunction(); + m_transform_table[true][true] = GetTransformFunction(); + using Prim = OpcodeDecoder::Primitive; + m_cull_table[Prim::GX_DRAW_QUADS] = GetCullFunction1(); + m_cull_table[Prim::GX_DRAW_QUADS_2] = GetCullFunction1(); + m_cull_table[Prim::GX_DRAW_TRIANGLES] = GetCullFunction1(); + m_cull_table[Prim::GX_DRAW_TRIANGLE_STRIP] = GetCullFunction1(); + m_cull_table[Prim::GX_DRAW_TRIANGLE_FAN] = GetCullFunction1(); +} + +bool CPUCull::AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive, + const u8* src, u32 count) +{ + ASSERT_MSG(VIDEO, primitive < OpcodeDecoder::Primitive::GX_DRAW_LINES, + "CPUCull should not be called on lines or points"); + const u32 stride = loader->m_native_vtx_decl.stride; + const bool posHas3Elems = loader->m_native_vtx_decl.position.components >= 3; + const bool perVertexPosMtx = loader->m_native_vtx_decl.posmtx.enable; + if (m_transform_buffer_size < count) [[unlikely]] + { + u32 new_size = MathUtil::NextPowerOf2(count); + m_transform_buffer_size = new_size; + m_transform_buffer.reset(static_cast( + Common::AllocateAlignedMemory(new_size * sizeof(TransformedVertex), 32))); + } + + // transform functions need the projection matrix to tranform to clip space + Core::System::GetInstance().GetVertexShaderManager().SetProjectionMatrix(); + + static constexpr Common::EnumMap cullmode_invert = { + CullMode::None, CullMode::Front, CullMode::Back, CullMode::All}; + + CullMode cullmode = bpmem.genMode.cullmode; + if (xfmem.viewport.ht > 0) // See videosoftware Clipper.cpp:IsBackface + cullmode = cullmode_invert[cullmode]; + const TransformFunction transform = m_transform_table[posHas3Elems][perVertexPosMtx]; + transform(m_transform_buffer.get(), src, stride, count); + const CullFunction cull = m_cull_table[primitive][cullmode]; + return cull(m_transform_buffer.get(), count); +} + +template +void CPUCull::BufferDeleter::operator()(T* ptr) +{ + Common::FreeAlignedMemory(ptr); +} diff --git a/Source/Core/VideoCommon/CPUCull.h b/Source/Core/VideoCommon/CPUCull.h new file mode 100644 index 0000000000..40248035e8 --- /dev/null +++ b/Source/Core/VideoCommon/CPUCull.h @@ -0,0 +1,38 @@ +// Copyright 2022 Dolphin Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "VideoCommon/BPMemory.h" +#include "VideoCommon/DataReader.h" +#include "VideoCommon/OpcodeDecoding.h" + +class CPUCull +{ +public: + ~CPUCull(); + void Init(); + bool AreAllVerticesCulled(VertexLoaderBase* loader, OpcodeDecoder::Primitive primitive, + const u8* src, u32 count); + + struct alignas(16) TransformedVertex + { + float x, y, z, w; + }; + + using TransformFunction = void (*)(void*, const void*, u32, int); + using CullFunction = bool (*)(const CPUCull::TransformedVertex*, int); + +private: + template + struct BufferDeleter + { + void operator()(T* ptr); + }; + std::unique_ptr> m_transform_buffer; + u32 m_transform_buffer_size = 0; + std::array, 2> m_transform_table; + Common::EnumMap, + OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_FAN> + m_cull_table; +}; diff --git a/Source/Core/VideoCommon/CPUCullImpl.h b/Source/Core/VideoCommon/CPUCullImpl.h new file mode 100644 index 0000000000..e9a1d545c2 --- /dev/null +++ b/Source/Core/VideoCommon/CPUCullImpl.h @@ -0,0 +1,714 @@ +// Copyright 2022 Dolphin Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#if defined(USE_FMA) +#define VECTOR_NAMESPACE CPUCull_FMA +#elif defined(USE_AVX) +#define VECTOR_NAMESPACE CPUCull_AVX +#elif defined(USE_SSE41) +#define VECTOR_NAMESPACE CPUCull_SSE41 +#elif defined(USE_SSE3) +#define VECTOR_NAMESPACE CPUCull_SSE3 +#elif defined(USE_SSE) +#define VECTOR_NAMESPACE CPUCull_SSE +#elif defined(USE_NEON) +#define VECTOR_NAMESPACE CPUCull_NEON +#elif defined(NO_SIMD) +#define VECTOR_NAMESPACE CPUCull_Scalar +#else +#error This file is meant to be used by CPUCull.cpp only! +#endif + +#if defined(__GNUC__) && defined(USE_FMA) && !(defined(__AVX__) && defined(__FMA__)) +#define ATTR_TARGET __attribute__((target("avx,fma"))) +#elif defined(__GNUC__) && defined(USE_AVX) && !defined(__AVX__) +#define ATTR_TARGET __attribute__((target("avx"))) +#elif defined(__GNUC__) && defined(USE_SSE41) && !defined(__SSE4_1__) +#define ATTR_TARGET __attribute__((target("sse4.1"))) +#elif defined(__GNUC__) && defined(USE_SSE3) && !defined(__SSE3__) +#define ATTR_TARGET __attribute__((target("sse3"))) +#else +#define ATTR_TARGET +#endif + +namespace VECTOR_NAMESPACE +{ +#if defined(USE_SSE) +typedef __m128 Vector; +#elif defined(USE_NEON) +typedef float32x4_t Vector; +#else +struct alignas(16) Vector +{ + float x, y, z, w; +}; +#endif +static_assert(sizeof(Vector) == 16); + +#ifdef USE_NEON +ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector vsetr_f32(float x, float y, float z, float w) +{ + float tmp[4] = {x, y, z, w}; + return vld1q_f32(tmp); +} +ATTR_TARGET DOLPHIN_FORCE_INLINE static void vuzp12q_f32(Vector& a, Vector& b) +{ + Vector tmp = vuzp2q_f32(a, b); + a = vuzp1q_f32(a, b); + b = tmp; +} +#endif +#ifdef USE_SSE +template +ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector vector_broadcast(Vector v) +{ + return _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)); +} +#endif +#ifdef USE_AVX +template +ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 vector_broadcast(__m256 v) +{ + return _mm256_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)); +} +#endif + +#ifdef USE_AVX +ATTR_TARGET DOLPHIN_FORCE_INLINE static void TransposeYMM(__m256& o0, __m256& o1, // + __m256& o2, __m256& o3) +{ + __m256d tmp0 = _mm256_castps_pd(_mm256_unpacklo_ps(o0, o1)); + __m256d tmp1 = _mm256_castps_pd(_mm256_unpacklo_ps(o2, o3)); + __m256d tmp2 = _mm256_castps_pd(_mm256_unpackhi_ps(o0, o1)); + __m256d tmp3 = _mm256_castps_pd(_mm256_unpackhi_ps(o2, o3)); + o0 = _mm256_castpd_ps(_mm256_unpacklo_pd(tmp0, tmp1)); + o1 = _mm256_castpd_ps(_mm256_unpackhi_pd(tmp0, tmp1)); + o2 = _mm256_castpd_ps(_mm256_unpacklo_pd(tmp2, tmp3)); + o3 = _mm256_castpd_ps(_mm256_unpackhi_pd(tmp2, tmp3)); +} + +ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposedYMM(const void* source, __m256& o0, + __m256& o1, __m256& o2, __m256& o3) +{ + const Vector* vsource = static_cast(source); + o0 = _mm256_broadcast_ps(&vsource[0]); + o1 = _mm256_broadcast_ps(&vsource[1]); + o2 = _mm256_broadcast_ps(&vsource[2]); + o3 = _mm256_broadcast_ps(&vsource[3]); + TransposeYMM(o0, o1, o2, o3); +} + +ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadPosYMM(const void* sourcel, const void* sourceh, + __m256& o0, __m256& o1, __m256& o2) +{ + const Vector* vsourcel = static_cast(sourcel); + const Vector* vsourceh = static_cast(sourceh); + o0 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[0]), vsourceh[0], 1); + o1 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[1]), vsourceh[1], 1); + o2 = _mm256_insertf128_ps(_mm256_castps128_ps256(vsourcel[2]), vsourceh[2], 1); +} + +ATTR_TARGET DOLPHIN_FORCE_INLINE static void +LoadTransposedPosYMM(const void* source, __m256& o0, __m256& o1, __m256& o2, __m256& o3) +{ + const Vector* vsource = static_cast(source); + o0 = _mm256_broadcast_ps(&vsource[0]); + o1 = _mm256_broadcast_ps(&vsource[1]); + o2 = _mm256_broadcast_ps(&vsource[2]); + o3 = _mm256_setr_ps(0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f); + TransposeYMM(o0, o1, o2, o3); +} + +ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 ApplyMatrixYMM(__m256 v, __m256 m0, __m256 m1, + __m256 m2, __m256 m3) +{ + __m256 output = _mm256_mul_ps(vector_broadcast<0>(v), m0); +#ifdef USE_FMA + output = _mm256_fmadd_ps(vector_broadcast<1>(v), m1, output); + output = _mm256_fmadd_ps(vector_broadcast<2>(v), m2, output); + output = _mm256_fmadd_ps(vector_broadcast<3>(v), m3, output); +#else + output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<1>(v), m1)); + output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<2>(v), m2)); + output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<3>(v), m3)); +#endif + return output; +} + +ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 +TransformVertexNoTransposeYMM(__m256 vertex, __m256 pos0, __m256 pos1, __m256 pos2, // + __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3) +{ + __m256 mul0 = _mm256_mul_ps(vertex, pos0); + __m256 mul1 = _mm256_mul_ps(vertex, pos1); + __m256 mul2 = _mm256_mul_ps(vertex, pos2); + __m256 mul3 = _mm256_setr_ps(0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 1.0f); + __m256 output = _mm256_hadd_ps(_mm256_hadd_ps(mul0, mul1), _mm256_hadd_ps(mul2, mul3)); + return ApplyMatrixYMM(output, proj0, proj1, proj2, proj3); +} + +template +ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 +TransformVertexYMM(__m256 vertex, __m256 pos0, __m256 pos1, __m256 pos2, __m256 pos3, // + __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3) +{ + __m256 output = pos3; // vertex.w is always 1.0 +#ifdef USE_FMA + output = _mm256_fmadd_ps(vector_broadcast<0>(vertex), pos0, output); + output = _mm256_fmadd_ps(vector_broadcast<1>(vertex), pos1, output); + if constexpr (PositionHas3Elems) + output = _mm256_fmadd_ps(vector_broadcast<2>(vertex), pos2, output); +#else + output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<0>(vertex), pos0)); + output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<1>(vertex), pos1)); + if constexpr (PositionHas3Elems) + output = _mm256_add_ps(output, _mm256_mul_ps(vector_broadcast<2>(vertex), pos2)); +#endif + return ApplyMatrixYMM(output, proj0, proj1, proj2, proj3); +} + +template +ATTR_TARGET DOLPHIN_FORCE_INLINE static __m256 +LoadTransform2Vertices(const u8* v0data, const u8* v1data, // + __m256 pos0, __m256 pos1, __m256 pos2, __m256 pos3, // + __m256 proj0, __m256 proj1, __m256 proj2, __m256 proj3) +{ + __m256 v01; + if constexpr (PerVertexPosMtx) + { + // Vertex data layout always starts with posmtx data if available, then position data + // Convenient for us, that means offsets are always fixed + u32 v0idx = v0data[0] & 0x3f; + u32 v1idx = v1data[0] & 0x3f; + v0data += sizeof(u32); + v1data += sizeof(u32); + + const float* v0fdata = reinterpret_cast(v0data); + const float* v1fdata = reinterpret_cast(v1data); + + LoadPosYMM(&xfmem.posMatrices[v0idx * 4], &xfmem.posMatrices[v1idx * 4], pos0, pos1, pos2); + + if constexpr (PositionHas3Elems) + { + __m256 base = _mm256_set1_ps(1.0f); + v01 = _mm256_blend_ps(_mm256_loadu2_m128(v1fdata, v0fdata), base, 0x88); + } + else + { + __m256 base = _mm256_unpacklo_ps(_mm256_setzero_ps(), _mm256_set1_ps(1.0f)); + __m256 v1 = _mm256_castpd_ps(_mm256_broadcast_sd(reinterpret_cast(v1data))); + __m128 v0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast(v0data)); + v01 = _mm256_blend_ps(_mm256_castps128_ps256(v0), v1, 0x30); + v01 = _mm256_blend_ps(v01, base, 0xcc); + } + + v01 = TransformVertexNoTransposeYMM(v01, pos0, pos1, pos2, proj0, proj1, proj2, proj3); + } + else + { + const float* v0fdata = reinterpret_cast(v0data); + const float* v1fdata = reinterpret_cast(v1data); + if constexpr (PositionHas3Elems) + { + v01 = _mm256_loadu2_m128(v1fdata, v0fdata); + } + else + { + __m256 v1 = _mm256_castpd_ps(_mm256_broadcast_sd(reinterpret_cast(v1data))); + __m128 v0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast(v0data)); + v01 = _mm256_blend_ps(_mm256_castps128_ps256(v0), v1, 0x30); + } + +#ifdef __clang__ + // Clang's optimizer is dumb, yay + // It sees TransformVertexYMM doing broadcasts and is like + // "let's broadcast *before* we combine v0 and v1! Then we can use vbroadcastss!" + // Prevent it from "optimizing" here + asm("" : "+x"(v01)::); +#endif + + v01 = TransformVertexYMM(v01, pos0, pos1, pos2, pos3, // + proj0, proj1, proj2, proj3); + } + + return v01; +} + +#endif + +#ifndef USE_AVX +// Note: Assumes 16-byte aligned source +ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposed(const void* source, Vector& o0, + Vector& o1, Vector& o2, Vector& o3) +{ +#if defined(USE_SSE) + const Vector* vsource = static_cast(source); + o0 = vsource[0]; + o1 = vsource[1]; + o2 = vsource[2]; + o3 = vsource[3]; + _MM_TRANSPOSE4_PS(o0, o1, o2, o3); +#elif defined(USE_NEON) + float32x4x4_t ld = vld4q_f32(static_cast(source)); + o0 = ld.val[0]; + o1 = ld.val[1]; + o2 = ld.val[2]; + o3 = ld.val[3]; +#else + const Vector* vsource = static_cast(source); + // clang-format off + o0.x = vsource[0].x; o0.y = vsource[1].x; o0.z = vsource[2].x; o0.w = vsource[3].x; + o1.x = vsource[0].y; o1.y = vsource[1].y; o1.z = vsource[2].y; o1.w = vsource[3].y; + o2.x = vsource[0].z; o2.y = vsource[1].z; o2.z = vsource[2].z; o2.w = vsource[3].z; + o3.x = vsource[0].w; o3.y = vsource[1].w; o3.z = vsource[2].w; o3.w = vsource[3].w; + // clang-format on +#endif +} + +ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadTransposedPos(const void* source, Vector& o0, + Vector& o1, Vector& o2, Vector& o3) +{ + const Vector* vsource = static_cast(source); +#if defined(USE_SSE) + o0 = vsource[0]; + o1 = vsource[1]; + o2 = vsource[2]; + o3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f); + _MM_TRANSPOSE4_PS(o0, o1, o2, o3); +#elif defined(USE_NEON) + float32x4x2_t ld01 = vld2q_f32(static_cast(source)); + o0 = ld01.val[0]; + o1 = ld01.val[1]; + o2 = vsource[2]; + o3 = vsetr_f32(0.0f, 0.0f, 0.0f, 1.0f); + vuzp12q_f32(o2, o3); + vuzp12q_f32(o0, o2); + vuzp12q_f32(o1, o3); +#else + // clang-format off + o0.x = vsource[0].x; o0.y = vsource[1].x; o0.z = vsource[2].x; o0.w = 0.0f; + o1.x = vsource[0].y; o1.y = vsource[1].y; o1.z = vsource[2].y; o1.w = 0.0f; + o2.x = vsource[0].z; o2.y = vsource[1].z; o2.z = vsource[2].z; o2.w = 0.0f; + o3.x = vsource[0].w; o3.y = vsource[1].w; o3.z = vsource[2].w; o3.w = 1.0f; + // clang-format on +#endif +} +#endif + +#ifndef USE_NEON +ATTR_TARGET DOLPHIN_FORCE_INLINE static void LoadPos(const void* source, // + Vector& o0, Vector& o1, Vector& o2) +{ + const Vector* vsource = static_cast(source); + o0 = vsource[0]; + o1 = vsource[1]; + o2 = vsource[2]; +} +#endif + +ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector ApplyMatrix(Vector v, Vector m0, Vector m1, + Vector m2, Vector m3) +{ +#if defined(USE_SSE) + Vector output = _mm_mul_ps(vector_broadcast<0>(v), m0); +#ifdef USE_FMA + output = _mm_fmadd_ps(vector_broadcast<1>(v), m1, output); + output = _mm_fmadd_ps(vector_broadcast<2>(v), m2, output); + output = _mm_fmadd_ps(vector_broadcast<3>(v), m3, output); +#else + output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<1>(v), m1)); + output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<2>(v), m2)); + output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<3>(v), m3)); +#endif + return output; +#elif defined(USE_NEON) + Vector output = vmulq_laneq_f32(m0, v, 0); + output = vfmaq_laneq_f32(output, m1, v, 1); + output = vfmaq_laneq_f32(output, m2, v, 2); + output = vfmaq_laneq_f32(output, m3, v, 3); + return output; +#else + Vector output; + output.x = v.x * m0.x + v.y * m1.x + v.z * m2.x + v.w * m3.x; + output.y = v.x * m0.y + v.y * m1.y + v.z * m2.y + v.w * m3.y; + output.z = v.x * m0.z + v.y * m1.z + v.z * m2.z + v.w * m3.z; + output.w = v.x * m0.w + v.y * m1.w + v.z * m2.w + v.w * m3.w; + return output; +#endif +} + +#ifndef USE_NEON +ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector +TransformVertexNoTranspose(Vector vertex, Vector pos0, Vector pos1, Vector pos2, // + Vector proj0, Vector proj1, Vector proj2, Vector proj3) +{ +#ifdef USE_SSE + Vector mul0 = _mm_mul_ps(vertex, pos0); + Vector mul1 = _mm_mul_ps(vertex, pos1); + Vector mul2 = _mm_mul_ps(vertex, pos2); + Vector mul3 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f); +#ifdef USE_SSE3 + Vector output = _mm_hadd_ps(_mm_hadd_ps(mul0, mul1), _mm_hadd_ps(mul2, mul3)); +#else + Vector t0 = _mm_add_ps(_mm_unpacklo_ps(mul0, mul2), _mm_unpackhi_ps(mul0, mul2)); + Vector t1 = _mm_add_ps(_mm_unpacklo_ps(mul1, mul3), _mm_unpackhi_ps(mul1, mul3)); + Vector output = _mm_add_ps(_mm_unpacklo_ps(t0, t1), _mm_unpackhi_ps(t0, t1)); +#endif +#else + Vector output; + output.x = vertex.x * pos0.x + vertex.y * pos0.y + vertex.z * pos0.z + vertex.w * pos0.w; + output.y = vertex.x * pos1.x + vertex.y * pos1.y + vertex.z * pos1.z + vertex.w * pos1.w; + output.z = vertex.x * pos2.x + vertex.y * pos2.y + vertex.z * pos2.z + vertex.w * pos2.w; + output.w = 1.0f; +#endif + output = ApplyMatrix(output, proj0, proj1, proj2, proj3); + return output; +} +#endif + +template +ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector +TransformVertex(Vector vertex, Vector pos0, Vector pos1, Vector pos2, Vector pos3, // + Vector proj0, Vector proj1, Vector proj2, Vector proj3) +{ + Vector output = pos3; // vertex.w is always 1.0 +#if defined(USE_FMA) + output = _mm_fmadd_ps(vector_broadcast<0>(vertex), pos0, output); + output = _mm_fmadd_ps(vector_broadcast<1>(vertex), pos1, output); + if constexpr (PositionHas3Elems) + output = _mm_fmadd_ps(vector_broadcast<2>(vertex), pos2, output); +#elif defined(USE_SSE) + output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<0>(vertex), pos0)); + output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<1>(vertex), pos1)); + if constexpr (PositionHas3Elems) + output = _mm_add_ps(output, _mm_mul_ps(vector_broadcast<2>(vertex), pos2)); +#elif defined(USE_NEON) + output = vfmaq_laneq_f32(output, pos0, vertex, 0); + output = vfmaq_laneq_f32(output, pos1, vertex, 1); + if constexpr (PositionHas3Elems) + output = vfmaq_laneq_f32(output, pos2, vertex, 2); +#else + output.x += vertex.x * pos0.x + vertex.y * pos1.x; + output.y += vertex.x * pos0.y + vertex.y * pos1.y; + output.z += vertex.x * pos0.z + vertex.y * pos1.z; + if constexpr (PositionHas3Elems) + { + output.x += vertex.z * pos2.x; + output.y += vertex.z * pos2.y; + output.z += vertex.z * pos2.z; + } +#endif + output = ApplyMatrix(output, proj0, proj1, proj2, proj3); + return output; +} + +template +ATTR_TARGET DOLPHIN_FORCE_INLINE static Vector +LoadTransformVertex(const u8* data, Vector pos0, Vector pos1, Vector pos2, Vector pos3, + Vector proj0, Vector proj1, Vector proj2, Vector proj3) +{ + Vector vertex; + if constexpr (PerVertexPosMtx) + { + // Vertex data layout always starts with posmtx data if available, then position data + // Convenient for us, that means offsets are always fixed + u32 idx = data[0] & 0x3f; + data += sizeof(u32); + + const float* fdata = reinterpret_cast(data); + +#ifdef USE_NEON + LoadTransposedPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3); + + if constexpr (PositionHas3Elems) + { + vertex = vld1q_f32(fdata); + } + else + { + vertex = vcombine_f32(vld1_f32(fdata), vdup_n_f32(0.0f)); + } + + vertex = TransformVertex(vertex, pos0, pos1, pos2, pos3, // + proj0, proj1, proj2, proj3); +#else + LoadPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2); + + if constexpr (PositionHas3Elems) + { +#if defined(USE_SSE) +#ifdef USE_SSE41 + Vector base = _mm_set1_ps(1.0f); + vertex = _mm_blend_ps(_mm_loadu_ps(fdata), base, 8); +#else + Vector base = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f); + Vector mask = _mm_castsi128_ps(_mm_setr_epi32(-1, -1, -1, 0)); + vertex = _mm_or_ps(_mm_and_ps(_mm_loadu_ps(fdata), mask), base); +#endif +#else + vertex.x = fdata[0]; + vertex.y = fdata[1]; + vertex.z = fdata[2]; + vertex.w = 1.0f; +#endif + } + else + { +#if defined(USE_SSE) + Vector base = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f); + vertex = _mm_loadl_pi(base, reinterpret_cast(fdata)); +#else + vertex.x = fdata[0]; + vertex.y = fdata[1]; + vertex.z = 0.0f; + vertex.w = 1.0f; +#endif + } + + vertex = TransformVertexNoTranspose(vertex, pos0, pos1, pos2, proj0, proj1, proj2, proj3); +#endif + } + else + { + const float* fdata = reinterpret_cast(data); + if constexpr (PositionHas3Elems) + { +#if defined(USE_SSE) + vertex = _mm_loadu_ps(fdata); +#elif defined(USE_NEON) + vertex = vld1q_f32(fdata); +#else + vertex.x = fdata[0]; + vertex.y = fdata[1]; + vertex.z = fdata[2]; + vertex.w = 1.0f; +#endif + } + else + { +#if defined(USE_SSE) + vertex = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast(fdata)); +#elif defined(USE_NEON) + vertex = vcombine_f32(vld1_f32(fdata), vdup_n_f32(0.0f)); +#else + vertex.x = fdata[0]; + vertex.y = fdata[1]; + vertex.z = 0.0f; + vertex.w = 1.0f; +#endif + } + + vertex = TransformVertex(vertex, pos0, pos1, pos2, pos3, // + proj0, proj1, proj2, proj3); + } + + return vertex; +} + +template +ATTR_TARGET static void TransformVertices(void* output, const void* vertices, u32 stride, int count) +{ + const VertexShaderManager& vsmanager = Core::System::GetInstance().GetVertexShaderManager(); + const u8* cvertices = static_cast(vertices); + Vector* voutput = static_cast(output); + u32 idx = g_main_cp_state.matrix_index_a.PosNormalMtxIdx & 0x3f; +#ifdef USE_AVX + __m256 proj0, proj1, proj2, proj3; + __m256 pos0, pos1, pos2, pos3; + LoadTransposedYMM(vsmanager.constants.projection.data(), proj0, proj1, proj2, proj3); + LoadTransposedPosYMM(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3); + for (int i = 1; i < count; i += 2) + { + const u8* v0data = cvertices; + const u8* v1data = cvertices + stride; + __m256 v01 = LoadTransform2Vertices( + v0data, v1data, pos0, pos1, pos2, pos3, proj0, proj1, proj2, proj3); + _mm256_store_ps(reinterpret_cast(voutput), v01); + cvertices += stride * 2; + voutput += 2; + } + if (count & 1) + { + *voutput = LoadTransformVertex( + cvertices, // + _mm256_castps256_ps128(pos0), _mm256_castps256_ps128(pos1), // + _mm256_castps256_ps128(pos2), _mm256_castps256_ps128(pos3), // + _mm256_castps256_ps128(proj0), _mm256_castps256_ps128(proj1), // + _mm256_castps256_ps128(proj2), _mm256_castps256_ps128(proj3)); + } +#else + Vector proj0, proj1, proj2, proj3; + Vector pos0, pos1, pos2, pos3; + LoadTransposed(vsmanager.constants.projection.data(), proj0, proj1, proj2, proj3); + LoadTransposedPos(&xfmem.posMatrices[idx * 4], pos0, pos1, pos2, pos3); + for (int i = 0; i < count; i++) + { + *voutput = LoadTransformVertex( + cvertices, pos0, pos1, pos2, pos3, proj0, proj1, proj2, proj3); + cvertices += stride; + voutput += 1; + } +#endif +} + +template +ATTR_TARGET DOLPHIN_FORCE_INLINE static bool CullTriangle(const CPUCull::TransformedVertex& a, + const CPUCull::TransformedVertex& b, + const CPUCull::TransformedVertex& c) +{ + if (Mode == CullMode::All) + return true; + + Vector va = reinterpret_cast(a); + Vector vb = reinterpret_cast(b); + Vector vc = reinterpret_cast(c); + + // See videosoftware Clipper.cpp + +#if defined(USE_SSE) + Vector wxzya = _mm_shuffle_ps(va, va, _MM_SHUFFLE(1, 2, 0, 3)); + Vector wxzyc = _mm_shuffle_ps(vc, vc, _MM_SHUFFLE(1, 2, 0, 3)); + Vector ywzxb = _mm_shuffle_ps(vb, vb, _MM_SHUFFLE(0, 2, 3, 1)); + Vector part0 = _mm_mul_ps(va, wxzyc); + Vector part1 = _mm_mul_ps(vc, wxzya); + Vector part2 = _mm_mul_ps(_mm_sub_ps(part0, part1), ywzxb); +#ifdef USE_SSE3 + Vector part3 = _mm_movehdup_ps(part2); +#else + Vector part3 = vector_broadcast<1>(part2); +#endif + Vector part4 = vector_broadcast<3>(part2); + Vector part5 = _mm_add_ss(_mm_add_ss(part2, part3), part4); + float normal_z_dir; + _mm_store_ss(&normal_z_dir, part5); +#elif defined(USE_NEON) + Vector zero = vdupq_n_f32(0.0f); + Vector wx0ya = vextq_f32(va, vzip1q_f32(va, zero), 3); + Vector wx0yc = vextq_f32(vc, vzip1q_f32(vc, zero), 3); + Vector ywxxb = vuzp2q_f32(vb, vdupq_laneq_f32(vb, 0)); + Vector part0 = vmulq_f32(va, wx0yc); + Vector part1 = vmulq_f32(vc, wx0ya); + Vector part2 = vmulq_f32(vsubq_f32(part0, part1), ywxxb); + float normal_z_dir = vaddvq_f32(part2); +#else + float normal_z_dir = (c.w * a.x - a.w * c.x) * b.y + // + (c.x * a.y - a.x * c.y) * b.w + // + (c.y * a.w - a.y * c.w) * b.x; +#endif + bool cull = false; + switch (Mode) + { + case CullMode::None: + cull = normal_z_dir == 0; + break; + case CullMode::Front: + cull = normal_z_dir <= 0; + break; + case CullMode::Back: + cull = normal_z_dir >= 0; + break; + case CullMode::All: + cull = true; + break; + } + if (cull) + return true; + +#if defined(USE_SSE) + Vector xyab = _mm_unpacklo_ps(va, vb); + Vector zwab = _mm_unpackhi_ps(va, vb); + Vector allx = _mm_shuffle_ps(xyab, vc, _MM_SHUFFLE(0, 0, 1, 0)); + Vector ally = _mm_shuffle_ps(xyab, vc, _MM_SHUFFLE(1, 1, 3, 2)); + Vector allpw = _mm_shuffle_ps(zwab, vc, _MM_SHUFFLE(3, 3, 3, 2)); + Vector allnw = _mm_xor_ps(allpw, _mm_set1_ps(-0.0f)); + __m128i x_gt_pw = _mm_castps_si128(_mm_cmple_ps(allpw, allx)); + __m128i y_gt_pw = _mm_castps_si128(_mm_cmple_ps(allpw, ally)); + __m128i x_lt_nw = _mm_castps_si128(_mm_cmplt_ps(allx, allnw)); + __m128i y_lt_nw = _mm_castps_si128(_mm_cmplt_ps(ally, allnw)); + __m128i any_out_of_bounds = _mm_packs_epi16(_mm_packs_epi32(x_lt_nw, y_lt_nw), // + _mm_packs_epi32(x_gt_pw, y_gt_pw)); + cull |= 0 != _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_set1_epi32(~0), any_out_of_bounds)); +#elif defined(USE_NEON) + float64x2_t xyab = vreinterpretq_f64_f32(vzip1q_f32(va, vb)); + float64x2_t xycc = vreinterpretq_f64_f32(vzip1q_f32(vc, vc)); + float32x4_t allx = vreinterpretq_f32_f64(vzip1q_f64(xyab, xycc)); + float32x4_t ally = vreinterpretq_f32_f64(vzip2q_f64(xyab, xycc)); + float32x4_t allpw = vextq_f32(vzip2q_f32(va, vb), vdupq_laneq_f32(vc, 3), 2); + float32x4_t allnw = vnegq_f32(allpw); + uint16x8_t x_gt_pw = vreinterpretq_u16_u32(vcgtq_f32(allx, allpw)); + uint16x8_t y_gt_pw = vreinterpretq_u16_u32(vcgtq_f32(ally, allpw)); + uint16x8_t x_lt_nw = vreinterpretq_u16_u32(vcltq_f32(allx, allnw)); + uint16x8_t y_lt_nw = vreinterpretq_u16_u32(vcltq_f32(ally, allnw)); + uint8x16_t lt_nw = vreinterpretq_u8_u16(vuzp1q_u16(x_lt_nw, y_lt_nw)); + uint8x16_t gt_pw = vreinterpretq_u8_u16(vuzp1q_u16(x_gt_pw, y_gt_pw)); + uint32x4_t any_out_of_bounds = vreinterpretq_u32_u8(vuzp1q_u8(lt_nw, gt_pw)); + cull |= 0xFFFFFFFF == vmaxvq_u32(any_out_of_bounds); +#else + cull |= a.x < -a.w && b.x < -b.w && c.x < -c.w; + cull |= a.y < -a.w && b.y < -b.w && c.y < -c.w; + cull |= a.x > a.w && b.x > b.w && c.x > c.w; + cull |= a.y > a.w && b.y > b.w && c.y > c.w; +#endif + + return cull; +} + +template +ATTR_TARGET static bool AreAllVerticesCulled(const CPUCull::TransformedVertex* transformed, + int count) +{ + switch (Primitive) + { + case OpcodeDecoder::Primitive::GX_DRAW_QUADS: + case OpcodeDecoder::Primitive::GX_DRAW_QUADS_2: + { + int i = 3; + for (; i < count; i += 4) + { + if (!CullTriangle(transformed[i - 3], transformed[i - 2], transformed[i - 1])) + return false; + if (!CullTriangle(transformed[i - 3], transformed[i - 1], transformed[i - 0])) + return false; + } + // three vertices remaining, so render a triangle + if (i == count) + { + if (!CullTriangle(transformed[i - 3], transformed[i - 2], transformed[i - 1])) + return false; + } + break; + } + case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLES: + for (int i = 2; i < count; i += 3) + { + if (!CullTriangle(transformed[i - 2], transformed[i - 1], transformed[i - 0])) + return false; + } + break; + case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_STRIP: + { + bool wind = false; + for (int i = 2; i < count; ++i) + { + if (!CullTriangle(transformed[i - 2], transformed[i - !wind], transformed[i - wind])) + return false; + wind = !wind; + } + break; + } + case OpcodeDecoder::Primitive::GX_DRAW_TRIANGLE_FAN: + for (int i = 2; i < count; ++i) + { + if (!CullTriangle(transformed[0], transformed[i - 1], transformed[i])) + return false; + } + break; + } + + return true; +} + +} // namespace VECTOR_NAMESPACE + +#undef ATTR_TARGET +#undef VECTOR_NAMESPACE diff --git a/Source/Core/VideoCommon/ConstantManager.h b/Source/Core/VideoCommon/ConstantManager.h index 6e60929056..88c25a9823 100644 --- a/Source/Core/VideoCommon/ConstantManager.h +++ b/Source/Core/VideoCommon/ConstantManager.h @@ -17,7 +17,7 @@ enum class SrcBlendFactor : u32; enum class ZTexOp : u32; enum class LogicOp : u32; -struct PixelShaderConstants +struct alignas(16) PixelShaderConstants { std::array colors; std::array kcolors; @@ -60,7 +60,7 @@ struct PixelShaderConstants LogicOp logic_op_mode; }; -struct VertexShaderConstants +struct alignas(16) VertexShaderConstants { u32 components; // .x u32 xfmem_dualTexInfo; // .y @@ -109,7 +109,7 @@ enum class VSExpand : u32 Line, }; -struct GeometryShaderConstants +struct alignas(16) GeometryShaderConstants { float4 stereoparams; float4 lineptparams; diff --git a/Source/Core/VideoCommon/VertexShaderManager.cpp b/Source/Core/VideoCommon/VertexShaderManager.cpp index abc59b862c..deda07fd65 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.cpp +++ b/Source/Core/VideoCommon/VertexShaderManager.cpp @@ -65,6 +65,97 @@ void VertexShaderManager::Dirty() dirty = true; } +Common::Matrix44 VertexShaderManager::LoadProjectionMatrix() +{ + const auto& rawProjection = xfmem.projection.rawProjection; + + switch (xfmem.projection.type) + { + case ProjectionType::Perspective: + { + const Common::Vec2 fov_multiplier = g_freelook_camera.IsActive() ? + g_freelook_camera.GetFieldOfViewMultiplier() : + Common::Vec2{1, 1}; + m_projection_matrix[0] = rawProjection[0] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x; + m_projection_matrix[1] = 0.0f; + m_projection_matrix[2] = rawProjection[1] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x; + m_projection_matrix[3] = 0.0f; + + m_projection_matrix[4] = 0.0f; + m_projection_matrix[5] = rawProjection[2] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y; + m_projection_matrix[6] = rawProjection[3] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y; + m_projection_matrix[7] = 0.0f; + + m_projection_matrix[8] = 0.0f; + m_projection_matrix[9] = 0.0f; + m_projection_matrix[10] = rawProjection[4]; + m_projection_matrix[11] = rawProjection[5]; + + m_projection_matrix[12] = 0.0f; + m_projection_matrix[13] = 0.0f; + + m_projection_matrix[14] = -1.0f; + m_projection_matrix[15] = 0.0f; + + g_stats.gproj = m_projection_matrix; + } + break; + + case ProjectionType::Orthographic: + { + m_projection_matrix[0] = rawProjection[0]; + m_projection_matrix[1] = 0.0f; + m_projection_matrix[2] = 0.0f; + m_projection_matrix[3] = rawProjection[1]; + + m_projection_matrix[4] = 0.0f; + m_projection_matrix[5] = rawProjection[2]; + m_projection_matrix[6] = 0.0f; + m_projection_matrix[7] = rawProjection[3]; + + m_projection_matrix[8] = 0.0f; + m_projection_matrix[9] = 0.0f; + m_projection_matrix[10] = rawProjection[4]; + m_projection_matrix[11] = rawProjection[5]; + + m_projection_matrix[12] = 0.0f; + m_projection_matrix[13] = 0.0f; + + m_projection_matrix[14] = 0.0f; + m_projection_matrix[15] = 1.0f; + + g_stats.g2proj = m_projection_matrix; + g_stats.proj = rawProjection; + } + break; + + default: + ERROR_LOG_FMT(VIDEO, "Unknown projection type: {}", xfmem.projection.type); + } + + PRIM_LOG("Projection: {} {} {} {} {} {}", rawProjection[0], rawProjection[1], rawProjection[2], + rawProjection[3], rawProjection[4], rawProjection[5]); + + auto corrected_matrix = m_viewport_correction * Common::Matrix44::FromArray(m_projection_matrix); + + if (g_freelook_camera.IsActive() && xfmem.projection.type == ProjectionType::Perspective) + corrected_matrix *= g_freelook_camera.GetView(); + + g_freelook_camera.GetController()->SetClean(); + + return corrected_matrix; +} + +void VertexShaderManager::SetProjectionMatrix() +{ + if (m_projection_changed || g_freelook_camera.GetController()->IsDirty()) + { + m_projection_changed = false; + auto corrected_matrix = LoadProjectionMatrix(); + memcpy(constants.projection.data(), corrected_matrix.data.data(), 4 * sizeof(float4)); + } +} + // Syncs the shader constant buffers with xfmem // TODO: A cleaner way to control the matrices without making a mess in the parameters field void VertexShaderManager::SetConstants(const std::vector& textures) @@ -317,84 +408,7 @@ void VertexShaderManager::SetConstants(const std::vector& textures) m_projection_changed = false; m_projection_graphics_mod_change = !projection_actions.empty(); - const auto& rawProjection = xfmem.projection.rawProjection; - - switch (xfmem.projection.type) - { - case ProjectionType::Perspective: - { - const Common::Vec2 fov_multiplier = g_freelook_camera.IsActive() ? - g_freelook_camera.GetFieldOfViewMultiplier() : - Common::Vec2{1, 1}; - m_projection_matrix[0] = - rawProjection[0] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x; - m_projection_matrix[1] = 0.0f; - m_projection_matrix[2] = - rawProjection[1] * g_ActiveConfig.fAspectRatioHackW * fov_multiplier.x; - m_projection_matrix[3] = 0.0f; - - m_projection_matrix[4] = 0.0f; - m_projection_matrix[5] = - rawProjection[2] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y; - m_projection_matrix[6] = - rawProjection[3] * g_ActiveConfig.fAspectRatioHackH * fov_multiplier.y; - m_projection_matrix[7] = 0.0f; - - m_projection_matrix[8] = 0.0f; - m_projection_matrix[9] = 0.0f; - m_projection_matrix[10] = rawProjection[4]; - m_projection_matrix[11] = rawProjection[5]; - - m_projection_matrix[12] = 0.0f; - m_projection_matrix[13] = 0.0f; - - m_projection_matrix[14] = -1.0f; - m_projection_matrix[15] = 0.0f; - - g_stats.gproj = m_projection_matrix; - } - break; - - case ProjectionType::Orthographic: - { - m_projection_matrix[0] = rawProjection[0]; - m_projection_matrix[1] = 0.0f; - m_projection_matrix[2] = 0.0f; - m_projection_matrix[3] = rawProjection[1]; - - m_projection_matrix[4] = 0.0f; - m_projection_matrix[5] = rawProjection[2]; - m_projection_matrix[6] = 0.0f; - m_projection_matrix[7] = rawProjection[3]; - - m_projection_matrix[8] = 0.0f; - m_projection_matrix[9] = 0.0f; - m_projection_matrix[10] = rawProjection[4]; - m_projection_matrix[11] = rawProjection[5]; - - m_projection_matrix[12] = 0.0f; - m_projection_matrix[13] = 0.0f; - - m_projection_matrix[14] = 0.0f; - m_projection_matrix[15] = 1.0f; - - g_stats.g2proj = m_projection_matrix; - g_stats.proj = rawProjection; - } - break; - - default: - ERROR_LOG_FMT(VIDEO, "Unknown projection type: {}", xfmem.projection.type); - } - - PRIM_LOG("Projection: {} {} {} {} {} {}", rawProjection[0], rawProjection[1], rawProjection[2], - rawProjection[3], rawProjection[4], rawProjection[5]); - - auto corrected_matrix = - m_viewport_correction * Common::Matrix44::FromArray(m_projection_matrix); - - if (g_freelook_camera.IsActive() && xfmem.projection.type == ProjectionType::Perspective) - corrected_matrix *= g_freelook_camera.GetView(); + auto corrected_matrix = LoadProjectionMatrix(); GraphicsModActionData::Projection projection{&corrected_matrix}; for (auto action : projection_actions) @@ -404,8 +418,6 @@ void VertexShaderManager::SetConstants(const std::vector& textures) memcpy(constants.projection.data(), corrected_matrix.data.data(), 4 * sizeof(float4)); - g_freelook_camera.GetController()->SetClean(); - dirty = true; } diff --git a/Source/Core/VideoCommon/VertexShaderManager.h b/Source/Core/VideoCommon/VertexShaderManager.h index ce157bd596..9a150980da 100644 --- a/Source/Core/VideoCommon/VertexShaderManager.h +++ b/Source/Core/VideoCommon/VertexShaderManager.h @@ -24,6 +24,7 @@ public: void DoState(PointerWrap& p); // constant management + void SetProjectionMatrix(); void SetConstants(const std::vector& textures); void InvalidateXFRange(int start, int end); @@ -64,4 +65,6 @@ private: std::array m_minmax_lights_changed{}; Common::Matrix44 m_viewport_correction{}; + + Common::Matrix44 LoadProjectionMatrix(); }; diff --git a/Source/Core/VideoCommon/XFMemory.h b/Source/Core/VideoCommon/XFMemory.h index 71d38cb8bd..e0a2696317 100644 --- a/Source/Core/VideoCommon/XFMemory.h +++ b/Source/Core/VideoCommon/XFMemory.h @@ -423,7 +423,7 @@ struct Projection ProjectionType type; }; -struct XFMemory +struct alignas(16) XFMemory { float posMatrices[256]; // 0x0000 - 0x00ff u32 unk0[768]; // 0x0100 - 0x03ff