From 729498ab41c75240909611e4d002c86e488f9465 Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Thu, 14 Jul 2022 15:54:32 -0700 Subject: [PATCH 01/10] VertexLoaderTest: Add DirectAllComponents We have one that does a similar thing, but only to measure speed and uses indices. This one verifies accuracy (and uses the largest possible input size by using direct components). --- .../VideoCommon/VertexLoaderTest.cpp | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp index b12dad1661..13a20df3e2 100644 --- a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp +++ b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp @@ -366,3 +366,167 @@ TEST_F(VertexLoaderTest, LargeFloatVertexSpeed) for (int i = 0; i < 100; ++i) RunVertices(100000); } + +TEST_F(VertexLoaderTest, DirectAllComponents) +{ + m_vtx_desc.low.PosMatIdx = 1; + m_vtx_desc.low.Tex0MatIdx = 1; + m_vtx_desc.low.Tex1MatIdx = 1; + m_vtx_desc.low.Tex2MatIdx = 1; + m_vtx_desc.low.Tex3MatIdx = 1; + m_vtx_desc.low.Tex4MatIdx = 1; + m_vtx_desc.low.Tex5MatIdx = 1; + m_vtx_desc.low.Tex6MatIdx = 1; + m_vtx_desc.low.Tex7MatIdx = 1; + m_vtx_desc.low.Position = VertexComponentFormat::Direct; + m_vtx_desc.low.Normal = VertexComponentFormat::Direct; + m_vtx_desc.low.Color0 = VertexComponentFormat::Direct; + m_vtx_desc.low.Color1 = VertexComponentFormat::Direct; + m_vtx_desc.high.Tex0Coord = VertexComponentFormat::Direct; + m_vtx_desc.high.Tex1Coord = VertexComponentFormat::Direct; + m_vtx_desc.high.Tex2Coord = VertexComponentFormat::Direct; + m_vtx_desc.high.Tex3Coord = VertexComponentFormat::Direct; + m_vtx_desc.high.Tex4Coord = VertexComponentFormat::Direct; + m_vtx_desc.high.Tex5Coord = VertexComponentFormat::Direct; + m_vtx_desc.high.Tex6Coord = VertexComponentFormat::Direct; + m_vtx_desc.high.Tex7Coord = VertexComponentFormat::Direct; + + m_vtx_attr.g0.PosElements = CoordComponentCount::XYZ; + m_vtx_attr.g0.PosFormat = ComponentFormat::Float; + m_vtx_attr.g0.NormalElements = NormalComponentCount::NTB; + m_vtx_attr.g0.NormalFormat = ComponentFormat::Float; + m_vtx_attr.g0.Color0Elements = ColorComponentCount::RGBA; + m_vtx_attr.g0.Color0Comp = ColorFormat::RGBA8888; + m_vtx_attr.g0.Color1Elements = ColorComponentCount::RGBA; + m_vtx_attr.g0.Color1Comp = ColorFormat::RGBA8888; + m_vtx_attr.g0.Tex0CoordElements = TexComponentCount::ST; + m_vtx_attr.g0.Tex0CoordFormat = ComponentFormat::Float; + m_vtx_attr.g1.Tex1CoordElements = TexComponentCount::ST; + m_vtx_attr.g1.Tex1CoordFormat = ComponentFormat::Float; + m_vtx_attr.g1.Tex2CoordElements = TexComponentCount::ST; + m_vtx_attr.g1.Tex2CoordFormat = ComponentFormat::Float; + m_vtx_attr.g1.Tex3CoordElements = TexComponentCount::ST; + m_vtx_attr.g1.Tex3CoordFormat = ComponentFormat::Float; + m_vtx_attr.g1.Tex4CoordElements = TexComponentCount::ST; + m_vtx_attr.g1.Tex4CoordFormat = ComponentFormat::Float; + m_vtx_attr.g2.Tex5CoordElements = TexComponentCount::ST; + m_vtx_attr.g2.Tex5CoordFormat = ComponentFormat::Float; + m_vtx_attr.g2.Tex6CoordElements = TexComponentCount::ST; + m_vtx_attr.g2.Tex6CoordFormat = ComponentFormat::Float; + m_vtx_attr.g2.Tex7CoordElements = TexComponentCount::ST; + m_vtx_attr.g2.Tex7CoordFormat = ComponentFormat::Float; + + CreateAndCheckSizes(129, 39 * sizeof(float)); + + // Pos matrix idx + Input(20); + // Tex matrix idx + Input(0); + Input(1); + Input(2); + Input(3); + Input(4); + Input(5); + Input(6); + Input(7); + // Position + Input(-1.0f); + Input(-2.0f); + Input(-3.0f); + // Normal + Input(-4.0f); + Input(-5.0f); + Input(-6.0f); + // Tangent + Input(-7.0f); + Input(-8.0f); + Input(-9.0f); + // Binormal + Input(-10.0f); + Input(-11.0f); + Input(-12.0f); + // Colors + Input(0x01234567); + Input(0x89abcdef); + // Texture coordinates + Input(0.1f); + Input(-0.9f); + Input(1.1f); + Input(-1.9f); + Input(2.1f); + Input(-2.9f); + Input(3.1f); + Input(-3.9f); + Input(4.1f); + Input(-4.9f); + Input(5.1f); + Input(-5.9f); + Input(6.1f); + Input(-6.9f); + Input(7.1f); + Input(-7.9f); + + RunVertices(1); + + // Position matrix + ASSERT_EQ(m_loader->m_native_vtx_decl.posmtx.offset, 0 * sizeof(float)); + EXPECT_EQ((m_dst.Read()), 20u); + // Position + ASSERT_EQ(m_loader->m_native_vtx_decl.position.offset, 1 * sizeof(float)); + ExpectOut(-1.0f); + ExpectOut(-2.0f); + ExpectOut(-3.0f); + // Normal + ASSERT_EQ(m_loader->m_native_vtx_decl.normals[0].offset, 4 * sizeof(float)); + ExpectOut(-4.0f); + ExpectOut(-5.0f); + ExpectOut(-6.0f); + // Tangent + ASSERT_EQ(m_loader->m_native_vtx_decl.normals[1].offset, 7 * sizeof(float)); + ExpectOut(-7.0f); + ExpectOut(-8.0f); + ExpectOut(-9.0f); + // Binormal + ASSERT_EQ(m_loader->m_native_vtx_decl.normals[2].offset, 10 * sizeof(float)); + ExpectOut(-10.0f); + ExpectOut(-11.0f); + ExpectOut(-12.0f); + // Colors + ASSERT_EQ(m_loader->m_native_vtx_decl.colors[0].offset, 13 * sizeof(float)); + EXPECT_EQ((m_dst.Read()), 0x01234567u); + ASSERT_EQ(m_loader->m_native_vtx_decl.colors[1].offset, 14 * sizeof(float)); + EXPECT_EQ((m_dst.Read()), 0x89abcdefu); + // Texture coordinates and matrices (interleaved) + ASSERT_EQ(m_loader->m_native_vtx_decl.texcoords[0].offset, 15 * sizeof(float)); + ExpectOut(0.1f); // S + ExpectOut(-0.9f); // T + ExpectOut(0.0f); // matrix (yes, a float) + ASSERT_EQ(m_loader->m_native_vtx_decl.texcoords[1].offset, 18 * sizeof(float)); + ExpectOut(1.1f); + ExpectOut(-1.9f); + ExpectOut(1.0f); + ASSERT_EQ(m_loader->m_native_vtx_decl.texcoords[2].offset, 21 * sizeof(float)); + ExpectOut(2.1f); + ExpectOut(-2.9f); + ExpectOut(2.0f); + ASSERT_EQ(m_loader->m_native_vtx_decl.texcoords[3].offset, 24 * sizeof(float)); + ExpectOut(3.1f); + ExpectOut(-3.9f); + ExpectOut(3.0f); + ASSERT_EQ(m_loader->m_native_vtx_decl.texcoords[4].offset, 27 * sizeof(float)); + ExpectOut(4.1f); + ExpectOut(-4.9f); + ExpectOut(4.0f); + ASSERT_EQ(m_loader->m_native_vtx_decl.texcoords[5].offset, 30 * sizeof(float)); + ExpectOut(5.1f); + ExpectOut(-5.9f); + ExpectOut(5.0f); + ASSERT_EQ(m_loader->m_native_vtx_decl.texcoords[6].offset, 33 * sizeof(float)); + ExpectOut(6.1f); + ExpectOut(-6.9f); + ExpectOut(6.0f); + ASSERT_EQ(m_loader->m_native_vtx_decl.texcoords[7].offset, 36 * sizeof(float)); + ExpectOut(7.1f); + ExpectOut(-7.9f); + ExpectOut(7.0f); +} From 53ee1b50fe4ae75675e31a1f4892293d3c5cfc85 Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Thu, 14 Jul 2022 17:34:08 -0700 Subject: [PATCH 02/10] VertexLoaderTest: Add NormalAll This currently fails for direct with NormalIndex3 enabled (see https://bugs.dolphin-emu.org/issues/12952). The goal of this test is to be able to confidently say that that bug has been fixed. --- .../VideoCommon/VertexLoaderTest.cpp | 246 ++++++++++++++++++ 1 file changed, 246 insertions(+) diff --git a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp index 13a20df3e2..5e3898ed28 100644 --- a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp +++ b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp @@ -530,3 +530,249 @@ TEST_F(VertexLoaderTest, DirectAllComponents) ExpectOut(-7.9f); ExpectOut(7.0f); } + +class VertexLoaderNormalTest + : public VertexLoaderTest, + public ::testing::WithParamInterface< + std::tuple> +{ +}; +INSTANTIATE_TEST_CASE_P( + AllCombinations, VertexLoaderNormalTest, + ::testing::Combine( + ::testing::Values(VertexComponentFormat::NotPresent, VertexComponentFormat::Direct, + VertexComponentFormat::Index8, VertexComponentFormat::Index16), + ::testing::Values(ComponentFormat::UByte, ComponentFormat::Byte, ComponentFormat::UShort, + ComponentFormat::Short, ComponentFormat::Float), + ::testing::Values(NormalComponentCount::N, NormalComponentCount::NTB), + ::testing::Values(false, true))); + +TEST_P(VertexLoaderNormalTest, NormalAll) +{ + VertexComponentFormat addr; + ComponentFormat format; + NormalComponentCount elements; + bool index3; + std::tie(addr, format, elements, index3) = GetParam(); + + m_vtx_desc.low.Position = VertexComponentFormat::Direct; + m_vtx_attr.g0.PosFormat = ComponentFormat::Float; + m_vtx_attr.g0.PosElements = CoordComponentCount::XY; + m_vtx_attr.g0.PosFrac = 0; + m_vtx_desc.low.Normal = addr; + m_vtx_attr.g0.NormalFormat = format; + m_vtx_attr.g0.NormalElements = elements; + m_vtx_attr.g0.NormalIndex3 = index3; + + const u32 in_size = [&]() -> u32 { + if (addr == VertexComponentFormat::NotPresent) + return 0; + + if (IsIndexed(addr)) + { + const u32 base_size = (addr == VertexComponentFormat::Index8) ? 1 : 2; + if (elements == NormalComponentCount::NTB) + return (index3 ? 3 : 1) * base_size; + else + return 1 * base_size; + } + else + { + const u32 base_count = (elements == NormalComponentCount::NTB) ? 9 : 3; + const u32 base_size = GetElementSize(format); + return base_count * base_size; + } + }(); + const u32 out_size = [&]() -> u32 { + if (addr == VertexComponentFormat::NotPresent) + return 0; + + const u32 base_count = (elements == NormalComponentCount::NTB) ? 9 : 3; + return base_count * sizeof(float); + }(); + + CreateAndCheckSizes(2 * sizeof(float) + in_size, 2 * sizeof(float) + out_size); + + auto input_with_expected_type = [&](float value) { + switch (format) + { + case ComponentFormat::UByte: + Input(value * (1 << 7)); + break; + case ComponentFormat::Byte: + Input(value * (1 << 6)); + break; + case ComponentFormat::UShort: + Input(value * (1 << 15)); + break; + case ComponentFormat::Short: + Input(value * (1 << 14)); + break; + case ComponentFormat::Float: + default: + Input(value); + break; + } + }; + + auto create_normal = [&](int counter_base) { + if (addr == VertexComponentFormat::Direct) + { + input_with_expected_type(counter_base / 32.f); + input_with_expected_type((counter_base + 1) / 32.f); + input_with_expected_type((counter_base + 2) / 32.f); + } + else if (addr == VertexComponentFormat::Index8) + { + // We set up arrays so that this works + Input(counter_base); + } + else if (addr == VertexComponentFormat::Index16) + { + Input(counter_base); + } + // Do nothing for NotPresent + }; + auto create_tangent_and_binormal = [&](int counter_base) { + if (IsIndexed(addr)) + { + // With NormalIndex3, specifying the same index 3 times should give the same result + // as specifying one index in non-index3 mode (as the index is biased by bytes). + // If index3 is disabled, we don't want to write any more indices. + if (index3) + { + // Tangent + create_normal(counter_base); + // Binormal + create_normal(counter_base); + } + } + else + { + // Tangent + create_normal(counter_base + 3); + // Binormal + create_normal(counter_base + 6); + } + }; + + // Create our two vertices + // Position 1 + Input(4.0f); + Input(8.0f); + // Normal 1 + create_normal(1); + if (elements == NormalComponentCount::NTB) + { + create_tangent_and_binormal(1); + } + + // Position 2 + Input(6.0f); + Input(12.0f); + // Normal 1 + create_normal(10); + if (elements == NormalComponentCount::NTB) + { + create_tangent_and_binormal(10); + } + + // Create an array for indexed representations + for (int i = 0; i < NUM_VERTEX_COMPONENT_ARRAYS; i++) + { + VertexLoaderManager::cached_arraybases[static_cast(i)] = m_src.GetPointer(); + g_main_cp_state.array_strides[static_cast(i)] = GetElementSize(format); + } + + for (int i = 0; i < 32; i++) + input_with_expected_type(i / 32.f); + + // Pre-fill these values to detect if they're modified + VertexLoaderManager::binormal_cache = {42.f, 43.f, 44.f, 45.f}; + VertexLoaderManager::tangent_cache = {46.f, 47.f, 48.f, 49.f}; + + RunVertices(2); + + // First vertex, position + ExpectOut(4.0f); + ExpectOut(8.0f); + if (addr != VertexComponentFormat::NotPresent) + { + // Normal + ExpectOut(1 / 32.f); + ExpectOut(2 / 32.f); + ExpectOut(3 / 32.f); + if (elements == NormalComponentCount::NTB) + { + // Tangent + ExpectOut(4 / 32.f); + ExpectOut(5 / 32.f); + ExpectOut(6 / 32.f); + // Binormal + ExpectOut(7 / 32.f); + ExpectOut(8 / 32.f); + ExpectOut(9 / 32.f); + } + } + + // Second vertex, position + ExpectOut(6.0f); + ExpectOut(12.0f); + if (addr != VertexComponentFormat::NotPresent) + { + // Normal + ExpectOut(10 / 32.f); + ExpectOut(11 / 32.f); + ExpectOut(12 / 32.f); + if (elements == NormalComponentCount::NTB) + { + // Tangent + ExpectOut(13 / 32.f); + ExpectOut(14 / 32.f); + ExpectOut(15 / 32.f); + // Binormal + ExpectOut(16 / 32.f); + ExpectOut(17 / 32.f); + ExpectOut(18 / 32.f); + + EXPECT_EQ(VertexLoaderManager::tangent_cache[0], 13 / 32.f); + EXPECT_EQ(VertexLoaderManager::tangent_cache[1], 14 / 32.f); + EXPECT_EQ(VertexLoaderManager::tangent_cache[2], 15 / 32.f); + // Last index is padding/junk + EXPECT_EQ(VertexLoaderManager::binormal_cache[0], 16 / 32.f); + EXPECT_EQ(VertexLoaderManager::binormal_cache[1], 17 / 32.f); + EXPECT_EQ(VertexLoaderManager::binormal_cache[2], 18 / 32.f); + } + } + + if (addr == VertexComponentFormat::NotPresent || elements == NormalComponentCount::N) + { + // Expect these to not be written + EXPECT_EQ(VertexLoaderManager::binormal_cache[0], 42.f); + EXPECT_EQ(VertexLoaderManager::binormal_cache[1], 43.f); + EXPECT_EQ(VertexLoaderManager::binormal_cache[2], 44.f); + EXPECT_EQ(VertexLoaderManager::binormal_cache[3], 45.f); + EXPECT_EQ(VertexLoaderManager::tangent_cache[0], 46.f); + EXPECT_EQ(VertexLoaderManager::tangent_cache[1], 47.f); + EXPECT_EQ(VertexLoaderManager::tangent_cache[2], 48.f); + EXPECT_EQ(VertexLoaderManager::tangent_cache[3], 49.f); + } +} + +// For gtest, which doesn't know about our fmt::formatters by default +static void PrintTo(const VertexComponentFormat& t, std::ostream* os) +{ + *os << fmt::to_string(t); +} +static void PrintTo(const ComponentFormat& t, std::ostream* os) +{ + *os << fmt::to_string(t); +} +static void PrintTo(const CoordComponentCount& t, std::ostream* os) +{ + *os << fmt::to_string(t); +} +static void PrintTo(const NormalComponentCount& t, std::ostream* os) +{ + *os << fmt::to_string(t); +} From 5cc2f7729e9fd65a9e9aea1e8fd48dab2c2e07d5 Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Thu, 14 Jul 2022 19:27:39 -0700 Subject: [PATCH 03/10] VertexLoaderX64: Use EnumMap for normal scales --- Source/Core/VideoCommon/VertexLoaderX64.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp index 27e24e6f28..d46ce9ebf4 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp @@ -459,8 +459,9 @@ void VertexLoaderX64::GenerateVertexLoader() if (m_VtxDesc.low.Normal != VertexComponentFormat::NotPresent) { - static const u8 map[8] = {7, 6, 15, 14}; - const u8 scaling_exponent = map[u32(m_VtxAttr.g0.NormalFormat.Value())]; + static constexpr Common::EnumMap(7)> SCALE_MAP = {7, 6, 15, 14, + 0, 0, 0, 0}; + const u8 scaling_exponent = SCALE_MAP[m_VtxAttr.g0.NormalFormat]; const int limit = m_VtxAttr.g0.NormalElements == NormalComponentCount::NTB ? 3 : 1; for (int i = 0; i < limit; i++) From 200676f4e3ba96303b6951126dbbe1926adba337 Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Mon, 4 Jul 2022 11:21:38 -0700 Subject: [PATCH 04/10] VertexLoaderX64: Fix direct normal+tangent+binormal with index3 set Fixes https://bugs.dolphin-emu.org/issues/12952 --- Source/Core/VideoCommon/VertexLoaderX64.cpp | 51 +++++++++++++++------ Source/Core/VideoCommon/VertexLoaderX64.h | 6 +-- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp index d46ce9ebf4..b1474fd2fa 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp @@ -79,9 +79,10 @@ OpArg VertexLoaderX64::GetVertexAddr(CPArray array, VertexComponentFormat attrib } } -int VertexLoaderX64::ReadVertex(OpArg data, VertexComponentFormat attribute, ComponentFormat format, - int count_in, int count_out, bool dequantize, u8 scaling_exponent, - AttributeFormat* native_format) +void VertexLoaderX64::ReadVertex(OpArg data, VertexComponentFormat attribute, + ComponentFormat format, int count_in, int count_out, + bool dequantize, u8 scaling_exponent, + AttributeFormat* native_format) { static const __m128i shuffle_lut[5][3] = { {_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF00L), // 1x u8 @@ -276,8 +277,6 @@ int VertexLoaderX64::ReadVertex(OpArg data, VertexComponentFormat attribute, Com } write_zfreeze(); - - return load_bytes; } void VertexLoaderX64::ReadColor(OpArg data, VertexComponentFormat attribute, ColorFormat format) @@ -462,18 +461,42 @@ void VertexLoaderX64::GenerateVertexLoader() static constexpr Common::EnumMap(7)> SCALE_MAP = {7, 6, 15, 14, 0, 0, 0, 0}; const u8 scaling_exponent = SCALE_MAP[m_VtxAttr.g0.NormalFormat]; - const int limit = m_VtxAttr.g0.NormalElements == NormalComponentCount::NTB ? 3 : 1; - for (int i = 0; i < limit; i++) + // Normal + data = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal); + ReadVertex(data, m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, scaling_exponent, + &m_native_vtx_decl.normals[0]); + + if (m_VtxAttr.g0.NormalElements == NormalComponentCount::NTB) { - if (!i || m_VtxAttr.g0.NormalIndex3) - { + const bool index3 = IsIndexed(m_VtxDesc.low.Normal) && m_VtxAttr.g0.NormalIndex3; + const int elem_size = GetElementSize(m_VtxAttr.g0.NormalFormat); + const int load_bytes = elem_size * 3; + + // Tangent + // If in Index3 mode, and indexed components are used, replace the index with a new index. + if (index3) data = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal); - int elem_size = GetElementSize(m_VtxAttr.g0.NormalFormat); - data.AddMemOffset(i * elem_size * 3); - } - data.AddMemOffset(ReadVertex(data, m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, - true, scaling_exponent, &m_native_vtx_decl.normals[i])); + // The tangent comes after the normal; even in index3 mode, this offset is applied. + // Note that this is different from adding 1 to the index, as the stride for indices may be + // different from the size of the tangent itself. + data.AddMemOffset(load_bytes); + + ReadVertex(data, m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, + scaling_exponent, &m_native_vtx_decl.normals[1]); + + // Undo the offset above so that data points to the normal instead of the tangent. + // This way, we can add 2*elem_size below to always point to the binormal, even if we replace + // data with a new index (which would point to the normal). + data.AddMemOffset(-load_bytes); + + // Binormal + if (index3) + data = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal); + data.AddMemOffset(load_bytes * 2); + + ReadVertex(data, m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, + scaling_exponent, &m_native_vtx_decl.normals[2]); } } diff --git a/Source/Core/VideoCommon/VertexLoaderX64.h b/Source/Core/VideoCommon/VertexLoaderX64.h index 6a0cf7b785..8650c70c27 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.h +++ b/Source/Core/VideoCommon/VertexLoaderX64.h @@ -25,9 +25,9 @@ private: u32 m_dst_ofs = 0; Gen::FixupBranch m_skip_vertex; Gen::OpArg GetVertexAddr(CPArray array, VertexComponentFormat attribute); - int ReadVertex(Gen::OpArg data, VertexComponentFormat attribute, ComponentFormat format, - int count_in, int count_out, bool dequantize, u8 scaling_exponent, - AttributeFormat* native_format); + void ReadVertex(Gen::OpArg data, VertexComponentFormat attribute, ComponentFormat format, + int count_in, int count_out, bool dequantize, u8 scaling_exponent, + AttributeFormat* native_format); void ReadColor(Gen::OpArg data, VertexComponentFormat attribute, ColorFormat format); void GenerateVertexLoader(); }; From afe5adb74d4db1815e0aa20017f980314ca9027d Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Thu, 14 Jul 2022 19:27:44 -0700 Subject: [PATCH 05/10] VertexLoaderARM64: Use EnumMap for normal scales --- Source/Core/VideoCommon/VertexLoaderARM64.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.cpp b/Source/Core/VideoCommon/VertexLoaderARM64.cpp index 9388c21a7b..f5cd7c27d5 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderARM64.cpp @@ -470,8 +470,9 @@ void VertexLoaderARM64::GenerateVertexLoader() if (m_VtxDesc.low.Normal != VertexComponentFormat::NotPresent) { - static const u8 map[8] = {7, 6, 15, 14}; - const u8 scaling_exponent = map[u32(m_VtxAttr.g0.NormalFormat.Value())]; + static constexpr Common::EnumMap(7)> SCALE_MAP = {7, 6, 15, 14, + 0, 0, 0, 0}; + const u8 scaling_exponent = SCALE_MAP[m_VtxAttr.g0.NormalFormat]; const int limit = m_VtxAttr.g0.NormalElements == NormalComponentCount::NTB ? 3 : 1; s32 offset = -1; From a34d5e5960f9191f06ec6d31d3cea33a70067688 Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Thu, 14 Jul 2022 20:00:12 -0700 Subject: [PATCH 06/10] Arm64Emitter: Add additional alignment assertions Before, unaligned values would be silently ignored in most cases. --- Source/Core/Common/Arm64Emitter.cpp | 38 ++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 075987c3f4..013285329f 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -485,13 +485,22 @@ void ARM64XEmitter::EncodeLoadStorePairedInst(u32 op, ARM64Reg Rt, ARM64Reg Rt2, bool bVec = IsVector(Rt); if (b128Bit) + { + ASSERT_MSG(DYNA_REC, (imm & 0xf) == 0, "128-bit load/store must use aligned offset: {}", imm); imm >>= 4; + } else if (b64Bit) + { + ASSERT_MSG(DYNA_REC, (imm & 0x7) == 0, "64-bit load/store must use aligned offset: {}", imm); imm >>= 3; + } else + { + ASSERT_MSG(DYNA_REC, (imm & 0x3) == 0, "32-bit load/store must use aligned offset: {}", imm); imm >>= 2; + } - ASSERT_MSG(DYNA_REC, !(imm & ~0xF), "offset too large {}", imm); + ASSERT_MSG(DYNA_REC, (imm & ~0xF) == 0, "offset too large {}", imm); u32 opc = 0; if (b128Bit) @@ -524,11 +533,20 @@ void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, bool bVec = IsVector(Rt); if (size == 64) + { + ASSERT_MSG(DYNA_REC, (imm & 0x7) == 0, "64-bit load/store must use aligned offset: {}", imm); imm >>= 3; + } else if (size == 32) + { + ASSERT_MSG(DYNA_REC, (imm & 0x3) == 0, "32-bit load/store must use aligned offset: {}", imm); imm >>= 2; + } else if (size == 16) + { + ASSERT_MSG(DYNA_REC, (imm & 0x1) == 0, "16-bit load/store must use aligned offset: {}", imm); imm >>= 1; + } ASSERT_MSG(DYNA_REC, imm >= 0, "(IndexType::Unsigned): offset must be positive {}", imm); ASSERT_MSG(DYNA_REC, !(imm & ~0xFFF), "(IndexType::Unsigned): offset too large {}", imm); @@ -615,10 +633,12 @@ void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64R if (b64Bit) { op |= 0b10; + ASSERT_MSG(DYNA_REC, (imm & 0x7) == 0, "64-bit load/store must use aligned offset: {}", imm); imm >>= 3; } else { + ASSERT_MSG(DYNA_REC, (imm & 0x3) == 0, "32-bit load/store must use aligned offset: {}", imm); imm >>= 2; } @@ -2072,19 +2092,29 @@ void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, if (type == IndexType::Unsigned) { - ASSERT_MSG(DYNA_REC, !(imm & ((size - 1) >> 3)), - "(IndexType::Unsigned) immediate offset must be aligned to size! ({}) ({})", imm, - fmt::ptr(m_emit->GetCodePtr())); ASSERT_MSG(DYNA_REC, imm >= 0, "(IndexType::Unsigned) immediate offset must be positive! ({})", imm); if (size == 16) + { + ASSERT_MSG(DYNA_REC, (imm & 0x1) == 0, "16-bit load/store must use aligned offset: {}", imm); imm >>= 1; + } else if (size == 32) + { + ASSERT_MSG(DYNA_REC, (imm & 0x3) == 0, "32-bit load/store must use aligned offset: {}", imm); imm >>= 2; + } else if (size == 64) + { + ASSERT_MSG(DYNA_REC, (imm & 0x7) == 0, "64-bit load/store must use aligned offset: {}", imm); imm >>= 3; + } else if (size == 128) + { + ASSERT_MSG(DYNA_REC, (imm & 0xf) == 0, "128-bit load/store must use aligned offset: {}", imm); imm >>= 4; + } + ASSERT_MSG(DYNA_REC, imm <= 0xFFF, "Immediate value is too big: {}", imm); encoded_imm = (imm & 0xFFF); } else From 9a290c3d501acaf3eb4c3dceafec821d5a644fad Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Thu, 14 Jul 2022 20:02:55 -0700 Subject: [PATCH 07/10] VertexLoaderARM64: Always use unscaled load/store instructions The source and destination offsets will always be less than 255, so we can get rid of a lot of the complexity by doing this. --- Source/Core/VideoCommon/VertexLoaderARM64.cpp | 94 ++++--------------- 1 file changed, 17 insertions(+), 77 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.cpp b/Source/Core/VideoCommon/VertexLoaderARM64.cpp index f5cd7c27d5..59ca260fe2 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderARM64.cpp @@ -66,32 +66,12 @@ void VertexLoaderARM64::GetVertexAddr(CPArray array, VertexComponentFormat attri { if (attribute == VertexComponentFormat::Index8) { - if (m_src_ofs < 4096) - { - LDRB(IndexType::Unsigned, scratch1_reg, src_reg, m_src_ofs); - } - else - { - ADD(reg, src_reg, m_src_ofs); - LDRB(IndexType::Unsigned, scratch1_reg, reg, 0); - } + LDURB(scratch1_reg, src_reg, m_src_ofs); m_src_ofs += 1; } - else + else // Index16 { - if (m_src_ofs < 256) - { - LDURH(scratch1_reg, src_reg, m_src_ofs); - } - else if (m_src_ofs <= 8190 && !(m_src_ofs & 1)) - { - LDRH(IndexType::Unsigned, scratch1_reg, src_reg, m_src_ofs); - } - else - { - ADD(reg, src_reg, m_src_ofs); - LDRH(IndexType::Unsigned, scratch1_reg, reg, 0); - } + LDURH(scratch1_reg, src_reg, m_src_ofs); m_src_ofs += 2; REV16(scratch1_reg, scratch1_reg); } @@ -118,7 +98,7 @@ void VertexLoaderARM64::GetVertexAddr(CPArray array, VertexComponentFormat attri s32 VertexLoaderARM64::GetAddressImm(CPArray array, VertexComponentFormat attribute, Arm64Gen::ARM64Reg reg, u32 align) { - if (IsIndexed(attribute) || (m_src_ofs > 255 && (m_src_ofs & (align - 1)))) + if (IsIndexed(attribute)) GetVertexAddr(array, attribute, reg); else return m_src_ofs; @@ -145,13 +125,9 @@ int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentForm else m_float_emit.LD1(elem_size, 1, coords, EncodeRegTo64(scratch1_reg)); } - else if (offset & (load_size - 1)) // Not aligned - unscaled - { - m_float_emit.LDUR(load_size, coords, src_reg, offset); - } else { - m_float_emit.LDR(load_size, IndexType::Unsigned, coords, src_reg, offset); + m_float_emit.LDUR(load_size, coords, src_reg, offset); } if (format != ComponentFormat::Float) @@ -191,20 +167,7 @@ int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentForm } const u32 write_size = count_out == 3 ? 128 : count_out * 32; - const u32 mask = count_out == 3 ? 0xF : count_out == 2 ? 0x7 : 0x3; - if (m_dst_ofs < 256) - { - m_float_emit.STUR(write_size, coords, dst_reg, m_dst_ofs); - } - else if (!(m_dst_ofs & mask)) - { - m_float_emit.STR(write_size, IndexType::Unsigned, coords, dst_reg, m_dst_ofs); - } - else - { - ADD(EncodeRegTo64(scratch2_reg), dst_reg, m_dst_ofs); - m_float_emit.ST1(32, 1, coords, EncodeRegTo64(scratch2_reg)); - } + m_float_emit.STUR(write_size, coords, dst_reg, m_dst_ofs); // Z-Freeze if (native_format == &m_native_vtx_decl.position) @@ -253,10 +216,8 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f case ColorFormat::RGBA8888: if (offset == -1) LDR(IndexType::Unsigned, scratch2_reg, EncodeRegTo64(scratch1_reg), 0); - else if (offset & 3) // Not aligned - unscaled - LDUR(scratch2_reg, src_reg, offset); else - LDR(IndexType::Unsigned, scratch2_reg, src_reg, offset); + LDUR(scratch2_reg, src_reg, offset); if (format != ColorFormat::RGBA8888) ORR(scratch2_reg, scratch2_reg, LogicalImm(0xFF000000, 32)); @@ -269,10 +230,8 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR if (offset == -1) LDRH(IndexType::Unsigned, scratch3_reg, EncodeRegTo64(scratch1_reg), 0); - else if (offset & 1) // Not aligned - unscaled - LDURH(scratch3_reg, src_reg, offset); else - LDRH(IndexType::Unsigned, scratch3_reg, src_reg, offset); + LDURH(scratch3_reg, src_reg, offset); REV16(scratch3_reg, scratch3_reg); @@ -306,10 +265,8 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR if (offset == -1) LDRH(IndexType::Unsigned, scratch3_reg, EncodeRegTo64(scratch1_reg), 0); - else if (offset & 1) // Not aligned - unscaled - LDURH(scratch3_reg, src_reg, offset); else - LDRH(IndexType::Unsigned, scratch3_reg, src_reg, offset); + LDURH(scratch3_reg, src_reg, offset); // R UBFM(scratch1_reg, scratch3_reg, 4, 7); @@ -337,17 +294,9 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f // RRRRRRGG GGGGBBBB BBAAAAAA // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR if (offset == -1) - { LDUR(scratch3_reg, EncodeRegTo64(scratch1_reg), -1); - } else - { - offset -= 1; - if (offset & 3) // Not aligned - unscaled - LDUR(scratch3_reg, src_reg, offset); - else - LDR(IndexType::Unsigned, scratch3_reg, src_reg, offset); - } + LDUR(scratch3_reg, src_reg, offset - 1); REV32(scratch3_reg, scratch3_reg); @@ -385,6 +334,12 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f void VertexLoaderARM64::GenerateVertexLoader() { + // The largest input vertex (with the position matrix index and all texture matrix indices + // enabled, and all components set as direct) is 129 bytes (corresponding to a 156-byte + // output). This is small enough that we can always use the unscaled load/store instructions + // (which allow an offset from -256 to +255). + ASSERT(m_vertex_size <= 255); + // R0 - Source pointer // R1 - Destination pointer // R2 - Count @@ -568,22 +523,7 @@ void VertexLoaderARM64::GenerateVertexLoader() { m_native_vtx_decl.texcoords[i].offset = m_dst_ofs; - if (m_dst_ofs < 256) - { - STUR(ARM64Reg::SP, dst_reg, m_dst_ofs); - } - else if (!(m_dst_ofs & 7)) - { - // If m_dst_ofs isn't 8byte aligned we can't store an 8byte zero register - // So store two 4byte zero registers - // The destination is always 4byte aligned - STR(IndexType::Unsigned, ARM64Reg::WSP, dst_reg, m_dst_ofs); - STR(IndexType::Unsigned, ARM64Reg::WSP, dst_reg, m_dst_ofs + 4); - } - else - { - STR(IndexType::Unsigned, ARM64Reg::SP, dst_reg, m_dst_ofs); - } + STUR(ARM64Reg::SP, dst_reg, m_dst_ofs); m_float_emit.STR(32, IndexType::Unsigned, ARM64Reg::D31, dst_reg, m_dst_ofs + 8); m_dst_ofs += sizeof(float) * 3; From ad644d5e92a45aa2313f382374a5b6dcfe61abf8 Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Fri, 15 Jul 2022 13:02:55 -0700 Subject: [PATCH 08/10] VertexLoaderARM64: Merge GetAddressImm into GetVertexAddr This way it more closely matches VertexLoaderX64, and is in general easier to understand. --- Source/Core/VideoCommon/VertexLoaderARM64.cpp | 53 +++++-------------- Source/Core/VideoCommon/VertexLoaderARM64.h | 4 +- 2 files changed, 15 insertions(+), 42 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.cpp b/Source/Core/VideoCommon/VertexLoaderARM64.cpp index 59ca260fe2..de602e18df 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderARM64.cpp @@ -60,7 +60,7 @@ VertexLoaderARM64::VertexLoaderARM64(const TVtxDesc& vtx_desc, const VAT& vtx_at WriteProtect(); } -void VertexLoaderARM64::GetVertexAddr(CPArray array, VertexComponentFormat attribute, ARM64Reg reg) +s32 VertexLoaderARM64::GetVertexAddr(CPArray array, VertexComponentFormat attribute, ARM64Reg reg) { if (IsIndexed(attribute)) { @@ -90,19 +90,12 @@ void VertexLoaderARM64::GetVertexAddr(CPArray array, VertexComponentFormat attri LDR(IndexType::Unsigned, EncodeRegTo64(scratch2_reg), arraybase_reg, static_cast(array) * 8); ADD(EncodeRegTo64(reg), EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg)); + return -1; } else - ADD(reg, src_reg, m_src_ofs); -} - -s32 VertexLoaderARM64::GetAddressImm(CPArray array, VertexComponentFormat attribute, - Arm64Gen::ARM64Reg reg, u32 align) -{ - if (IsIndexed(attribute)) - GetVertexAddr(array, attribute, reg); - else + { return m_src_ofs; - return -1; + } } int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentFormat format, @@ -411,14 +404,10 @@ void VertexLoaderARM64::GenerateVertexLoader() // Position { - int elem_size = GetElementSize(m_VtxAttr.g0.PosFormat); - int pos_elements = m_VtxAttr.g0.PosElements == CoordComponentCount::XY ? 2 : 3; - int load_bytes = elem_size * pos_elements; - int load_size = GetLoadSize(load_bytes); - load_size <<= 3; + const int pos_elements = m_VtxAttr.g0.PosElements == CoordComponentCount::XY ? 2 : 3; - s32 offset = GetAddressImm(CPArray::Position, m_VtxDesc.low.Position, - EncodeRegTo64(scratch1_reg), load_size); + const s32 offset = + GetVertexAddr(CPArray::Position, m_VtxDesc.low.Position, EncodeRegTo64(scratch1_reg)); ReadVertex(m_VtxDesc.low.Position, m_VtxAttr.g0.PosFormat, pos_elements, pos_elements, m_VtxAttr.g0.ByteDequant, m_VtxAttr.g0.PosFrac, &m_native_vtx_decl.position, offset); } @@ -435,13 +424,9 @@ void VertexLoaderARM64::GenerateVertexLoader() { if (!i || m_VtxAttr.g0.NormalIndex3) { - int elem_size = GetElementSize(m_VtxAttr.g0.NormalFormat); + const int elem_size = GetElementSize(m_VtxAttr.g0.NormalFormat); - int load_bytes = elem_size * 3; - int load_size = GetLoadSize(load_bytes); - - offset = GetAddressImm(CPArray::Normal, m_VtxDesc.low.Normal, EncodeRegTo64(scratch1_reg), - load_size << 3); + offset = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal, EncodeRegTo64(scratch1_reg)); if (offset == -1) ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), i * elem_size * 3); @@ -466,13 +451,8 @@ void VertexLoaderARM64::GenerateVertexLoader() if (m_VtxDesc.low.Color[i] != VertexComponentFormat::NotPresent) { - u32 align = 4; - if (m_VtxAttr.GetColorFormat(i) == ColorFormat::RGB565 || - m_VtxAttr.GetColorFormat(i) == ColorFormat::RGBA4444) - align = 2; - - s32 offset = GetAddressImm(CPArray::Color0 + i, m_VtxDesc.low.Color[i], - EncodeRegTo64(scratch1_reg), align); + const s32 offset = + GetVertexAddr(CPArray::Color0 + i, m_VtxDesc.low.Color[i], EncodeRegTo64(scratch1_reg)); ReadColor(m_VtxDesc.low.Color[i], m_VtxAttr.GetColorFormat(i), offset); m_native_vtx_decl.colors[i].components = 4; m_native_vtx_decl.colors[i].enable = true; @@ -489,16 +469,11 @@ void VertexLoaderARM64::GenerateVertexLoader() m_native_vtx_decl.texcoords[i].type = ComponentFormat::Float; m_native_vtx_decl.texcoords[i].integer = false; - int elements = m_VtxAttr.GetTexElements(i) == TexComponentCount::S ? 1 : 2; + const int elements = m_VtxAttr.GetTexElements(i) == TexComponentCount::S ? 1 : 2; if (m_VtxDesc.high.TexCoord[i] != VertexComponentFormat::NotPresent) { - int elem_size = GetElementSize(m_VtxAttr.GetTexFormat(i)); - int load_bytes = elem_size * (elements + 2); - int load_size = GetLoadSize(load_bytes); - load_size <<= 3; - - s32 offset = GetAddressImm(CPArray::TexCoord0 + i, m_VtxDesc.high.TexCoord[i], - EncodeRegTo64(scratch1_reg), load_size); + s32 offset = GetVertexAddr(CPArray::TexCoord0 + i, m_VtxDesc.high.TexCoord[i], + EncodeRegTo64(scratch1_reg)); u8 scaling_exponent = m_VtxAttr.GetTexFrac(i); ReadVertex(m_VtxDesc.high.TexCoord[i], m_VtxAttr.GetTexFormat(i), elements, m_VtxDesc.low.TexMatIdx[i] ? 2 : elements, m_VtxAttr.g0.ByteDequant, diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.h b/Source/Core/VideoCommon/VertexLoaderARM64.h index eccf3f0ad8..293ee75a58 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.h +++ b/Source/Core/VideoCommon/VertexLoaderARM64.h @@ -26,9 +26,7 @@ private: u32 m_dst_ofs = 0; Arm64Gen::FixupBranch m_skip_vertex; Arm64Gen::ARM64FloatEmitter m_float_emit; - void GetVertexAddr(CPArray array, VertexComponentFormat attribute, Arm64Gen::ARM64Reg reg); - s32 GetAddressImm(CPArray array, VertexComponentFormat attribute, Arm64Gen::ARM64Reg reg, - u32 align); + s32 GetVertexAddr(CPArray array, VertexComponentFormat attribute, Arm64Gen::ARM64Reg reg); int ReadVertex(VertexComponentFormat attribute, ComponentFormat format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format, s32 offset = -1); From f148de161f3126d2b328d5c022acb04379f1da1f Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Fri, 15 Jul 2022 13:22:58 -0700 Subject: [PATCH 09/10] VertexLoaderARM64: Specify the register to use as a parameter to ReadVertex This also means that both a register and a vertex are always specified, though right now if the register is scratch1_reg the offset is always 0. --- Source/Core/VideoCommon/VertexLoaderARM64.cpp | 83 +++++++------------ Source/Core/VideoCommon/VertexLoaderARM64.h | 9 +- 2 files changed, 35 insertions(+), 57 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.cpp b/Source/Core/VideoCommon/VertexLoaderARM64.cpp index de602e18df..3d64e85cb7 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderARM64.cpp @@ -60,7 +60,11 @@ VertexLoaderARM64::VertexLoaderARM64(const TVtxDesc& vtx_desc, const VAT& vtx_at WriteProtect(); } -s32 VertexLoaderARM64::GetVertexAddr(CPArray array, VertexComponentFormat attribute, ARM64Reg reg) +// Returns the register to use as the base and an offset from that register. +// For indexed attributes, the index is read into scratch1_reg, and then scratch1_reg with no offset +// is returned. For direct attributes, an offset from src_reg is returned. +std::pair VertexLoaderARM64::GetVertexAddr(CPArray array, + VertexComponentFormat attribute) { if (IsIndexed(attribute)) { @@ -89,18 +93,18 @@ s32 VertexLoaderARM64::GetVertexAddr(CPArray array, VertexComponentFormat attrib LDR(IndexType::Unsigned, EncodeRegTo64(scratch2_reg), arraybase_reg, static_cast(array) * 8); - ADD(EncodeRegTo64(reg), EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg)); - return -1; + ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg)); + return {EncodeRegTo64(scratch1_reg), 0}; } else { - return m_src_ofs; + return {src_reg, m_src_ofs}; } } int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentFormat format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, - AttributeFormat* native_format, s32 offset) + AttributeFormat* native_format, ARM64Reg reg, u32 offset) { ARM64Reg coords = count_in == 3 ? ARM64Reg::Q31 : ARM64Reg::D31; ARM64Reg scale = count_in == 3 ? ARM64Reg::Q30 : ARM64Reg::D30; @@ -111,17 +115,7 @@ int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentForm load_size <<= 3; elem_size <<= 3; - if (offset == -1) - { - if (count_in == 1) - m_float_emit.LDR(elem_size, IndexType::Unsigned, coords, EncodeRegTo64(scratch1_reg), 0); - else - m_float_emit.LD1(elem_size, 1, coords, EncodeRegTo64(scratch1_reg)); - } - else - { - m_float_emit.LDUR(load_size, coords, src_reg, offset); - } + m_float_emit.LDUR(load_size, coords, reg, offset); if (format != ComponentFormat::Float) { @@ -199,7 +193,8 @@ int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentForm return load_bytes; } -void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat format, s32 offset) +void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat format, ARM64Reg reg, + u32 offset) { int load_bytes = 0; switch (format) @@ -207,10 +202,7 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f case ColorFormat::RGB888: case ColorFormat::RGB888x: case ColorFormat::RGBA8888: - if (offset == -1) - LDR(IndexType::Unsigned, scratch2_reg, EncodeRegTo64(scratch1_reg), 0); - else - LDUR(scratch2_reg, src_reg, offset); + LDUR(scratch2_reg, reg, offset); if (format != ColorFormat::RGBA8888) ORR(scratch2_reg, scratch2_reg, LogicalImm(0xFF000000, 32)); @@ -221,10 +213,7 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f case ColorFormat::RGB565: // RRRRRGGG GGGBBBBB // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR - if (offset == -1) - LDRH(IndexType::Unsigned, scratch3_reg, EncodeRegTo64(scratch1_reg), 0); - else - LDURH(scratch3_reg, src_reg, offset); + LDURH(scratch3_reg, reg, offset); REV16(scratch3_reg, scratch3_reg); @@ -256,10 +245,7 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f // BBBBAAAA RRRRGGGG // REV16 - RRRRGGGG BBBBAAAA // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR - if (offset == -1) - LDRH(IndexType::Unsigned, scratch3_reg, EncodeRegTo64(scratch1_reg), 0); - else - LDURH(scratch3_reg, src_reg, offset); + LDURH(scratch3_reg, reg, offset); // R UBFM(scratch1_reg, scratch3_reg, 4, 7); @@ -286,10 +272,7 @@ void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat f case ColorFormat::RGBA6666: // RRRRRRGG GGGGBBBB BBAAAAAA // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR - if (offset == -1) - LDUR(scratch3_reg, EncodeRegTo64(scratch1_reg), -1); - else - LDUR(scratch3_reg, src_reg, offset - 1); + LDUR(scratch3_reg, reg, offset - 1); REV32(scratch3_reg, scratch3_reg); @@ -406,10 +389,10 @@ void VertexLoaderARM64::GenerateVertexLoader() { const int pos_elements = m_VtxAttr.g0.PosElements == CoordComponentCount::XY ? 2 : 3; - const s32 offset = - GetVertexAddr(CPArray::Position, m_VtxDesc.low.Position, EncodeRegTo64(scratch1_reg)); + const auto [reg, offset] = GetVertexAddr(CPArray::Position, m_VtxDesc.low.Position); ReadVertex(m_VtxDesc.low.Position, m_VtxAttr.g0.PosFormat, pos_elements, pos_elements, - m_VtxAttr.g0.ByteDequant, m_VtxAttr.g0.PosFrac, &m_native_vtx_decl.position, offset); + m_VtxAttr.g0.ByteDequant, m_VtxAttr.g0.PosFrac, &m_native_vtx_decl.position, reg, + offset); } if (m_VtxDesc.low.Normal != VertexComponentFormat::NotPresent) @@ -419,27 +402,21 @@ void VertexLoaderARM64::GenerateVertexLoader() const u8 scaling_exponent = SCALE_MAP[m_VtxAttr.g0.NormalFormat]; const int limit = m_VtxAttr.g0.NormalElements == NormalComponentCount::NTB ? 3 : 1; - s32 offset = -1; + ARM64Reg reg{}; + u32 offset{}; for (int i = 0; i < limit; i++) { if (!i || m_VtxAttr.g0.NormalIndex3) { const int elem_size = GetElementSize(m_VtxAttr.g0.NormalFormat); - offset = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal, EncodeRegTo64(scratch1_reg)); - - if (offset == -1) - ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), i * elem_size * 3); - else - offset += i * elem_size * 3; + std::tie(reg, offset) = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal); + offset += i * elem_size * 3; } int bytes_read = ReadVertex(m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, - scaling_exponent, &m_native_vtx_decl.normals[i], offset); + scaling_exponent, &m_native_vtx_decl.normals[i], reg, offset); - if (offset == -1) - ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), bytes_read); - else - offset += bytes_read; + offset += bytes_read; } } @@ -451,9 +428,8 @@ void VertexLoaderARM64::GenerateVertexLoader() if (m_VtxDesc.low.Color[i] != VertexComponentFormat::NotPresent) { - const s32 offset = - GetVertexAddr(CPArray::Color0 + i, m_VtxDesc.low.Color[i], EncodeRegTo64(scratch1_reg)); - ReadColor(m_VtxDesc.low.Color[i], m_VtxAttr.GetColorFormat(i), offset); + const auto [reg, offset] = GetVertexAddr(CPArray::Color0 + i, m_VtxDesc.low.Color[i]); + ReadColor(m_VtxDesc.low.Color[i], m_VtxAttr.GetColorFormat(i), reg, offset); m_native_vtx_decl.colors[i].components = 4; m_native_vtx_decl.colors[i].enable = true; m_native_vtx_decl.colors[i].offset = m_dst_ofs; @@ -472,12 +448,11 @@ void VertexLoaderARM64::GenerateVertexLoader() const int elements = m_VtxAttr.GetTexElements(i) == TexComponentCount::S ? 1 : 2; if (m_VtxDesc.high.TexCoord[i] != VertexComponentFormat::NotPresent) { - s32 offset = GetVertexAddr(CPArray::TexCoord0 + i, m_VtxDesc.high.TexCoord[i], - EncodeRegTo64(scratch1_reg)); + const auto [reg, offset] = GetVertexAddr(CPArray::TexCoord0 + i, m_VtxDesc.high.TexCoord[i]); u8 scaling_exponent = m_VtxAttr.GetTexFrac(i); ReadVertex(m_VtxDesc.high.TexCoord[i], m_VtxAttr.GetTexFormat(i), elements, m_VtxDesc.low.TexMatIdx[i] ? 2 : elements, m_VtxAttr.g0.ByteDequant, - scaling_exponent, &m_native_vtx_decl.texcoords[i], offset); + scaling_exponent, &m_native_vtx_decl.texcoords[i], reg, offset); } if (m_VtxDesc.low.TexMatIdx[i]) { diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.h b/Source/Core/VideoCommon/VertexLoaderARM64.h index 293ee75a58..da68173f10 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.h +++ b/Source/Core/VideoCommon/VertexLoaderARM64.h @@ -3,6 +3,8 @@ #pragma once +#include + #include "Common/Arm64Emitter.h" #include "Common/CommonTypes.h" #include "VideoCommon/VertexLoaderBase.h" @@ -26,10 +28,11 @@ private: u32 m_dst_ofs = 0; Arm64Gen::FixupBranch m_skip_vertex; Arm64Gen::ARM64FloatEmitter m_float_emit; - s32 GetVertexAddr(CPArray array, VertexComponentFormat attribute, Arm64Gen::ARM64Reg reg); + std::pair GetVertexAddr(CPArray array, VertexComponentFormat attribute); int ReadVertex(VertexComponentFormat attribute, ComponentFormat format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, - AttributeFormat* native_format, s32 offset = -1); - void ReadColor(VertexComponentFormat attribute, ColorFormat format, s32 offset); + AttributeFormat* native_format, Arm64Gen::ARM64Reg reg, u32 offset); + void ReadColor(VertexComponentFormat attribute, ColorFormat format, Arm64Gen::ARM64Reg reg, + u32 offset); void GenerateVertexLoader(); }; From d80201a57f6433bf660be2a1aecd60b97ddb6599 Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Mon, 4 Jul 2022 19:20:15 -0700 Subject: [PATCH 10/10] VertexLoaderARM64: Fix direct normal+tangent+binormal with index3 set Fixes https://bugs.dolphin-emu.org/issues/12952 --- Source/Core/VideoCommon/VertexLoaderARM64.cpp | 45 +++++++++++-------- Source/Core/VideoCommon/VertexLoaderARM64.h | 6 +-- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.cpp b/Source/Core/VideoCommon/VertexLoaderARM64.cpp index 3d64e85cb7..99ba5b3357 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderARM64.cpp @@ -102,9 +102,10 @@ std::pair VertexLoaderARM64::GetVertexAddr(CPArray arra } } -int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentFormat format, - int count_in, int count_out, bool dequantize, u8 scaling_exponent, - AttributeFormat* native_format, ARM64Reg reg, u32 offset) +void VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentFormat format, + int count_in, int count_out, bool dequantize, + u8 scaling_exponent, AttributeFormat* native_format, + ARM64Reg reg, u32 offset) { ARM64Reg coords = count_in == 3 ? ARM64Reg::Q31 : ARM64Reg::D31; ARM64Reg scale = count_in == 3 ? ARM64Reg::Q30 : ARM64Reg::D30; @@ -113,7 +114,6 @@ int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentForm int load_bytes = elem_size * count_in; int load_size = GetLoadSize(load_bytes); load_size <<= 3; - elem_size <<= 3; m_float_emit.LDUR(load_size, coords, reg, offset); @@ -189,8 +189,6 @@ int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentForm if (attribute == VertexComponentFormat::Direct) m_src_ofs += load_bytes; - - return load_bytes; } void VertexLoaderARM64::ReadColor(VertexComponentFormat attribute, ColorFormat format, ARM64Reg reg, @@ -400,23 +398,32 @@ void VertexLoaderARM64::GenerateVertexLoader() static constexpr Common::EnumMap(7)> SCALE_MAP = {7, 6, 15, 14, 0, 0, 0, 0}; const u8 scaling_exponent = SCALE_MAP[m_VtxAttr.g0.NormalFormat]; - const int limit = m_VtxAttr.g0.NormalElements == NormalComponentCount::NTB ? 3 : 1; - ARM64Reg reg{}; - u32 offset{}; - for (int i = 0; i < limit; i++) + // Normal + auto [reg, offset] = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal); + ReadVertex(m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, scaling_exponent, + &m_native_vtx_decl.normals[0], reg, offset); + + if (m_VtxAttr.g0.NormalElements == NormalComponentCount::NTB) { - if (!i || m_VtxAttr.g0.NormalIndex3) - { - const int elem_size = GetElementSize(m_VtxAttr.g0.NormalFormat); + const bool index3 = IsIndexed(m_VtxDesc.low.Normal) && m_VtxAttr.g0.NormalIndex3; + const int elem_size = GetElementSize(m_VtxAttr.g0.NormalFormat); + const int load_bytes = elem_size * 3; + // Tangent + // If in Index3 mode, and indexed components are used, replace the index with a new index. + if (index3) std::tie(reg, offset) = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal); - offset += i * elem_size * 3; - } - int bytes_read = ReadVertex(m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, - scaling_exponent, &m_native_vtx_decl.normals[i], reg, offset); - - offset += bytes_read; + // The tangent comes after the normal; even in index3 mode, an extra offset of load_bytes is + // applied. Note that this is different from adding 1 to the index, as the stride for indices + // may be different from the size of the tangent itself. + ReadVertex(m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, scaling_exponent, + &m_native_vtx_decl.normals[1], reg, offset + load_bytes); + // Binormal + if (index3) + std::tie(reg, offset) = GetVertexAddr(CPArray::Normal, m_VtxDesc.low.Normal); + ReadVertex(m_VtxDesc.low.Normal, m_VtxAttr.g0.NormalFormat, 3, 3, true, scaling_exponent, + &m_native_vtx_decl.normals[2], reg, offset + load_bytes * 2); } } diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.h b/Source/Core/VideoCommon/VertexLoaderARM64.h index da68173f10..83ebbc1a7a 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.h +++ b/Source/Core/VideoCommon/VertexLoaderARM64.h @@ -29,9 +29,9 @@ private: Arm64Gen::FixupBranch m_skip_vertex; Arm64Gen::ARM64FloatEmitter m_float_emit; std::pair GetVertexAddr(CPArray array, VertexComponentFormat attribute); - int ReadVertex(VertexComponentFormat attribute, ComponentFormat format, int count_in, - int count_out, bool dequantize, u8 scaling_exponent, - AttributeFormat* native_format, Arm64Gen::ARM64Reg reg, u32 offset); + void ReadVertex(VertexComponentFormat attribute, ComponentFormat format, int count_in, + int count_out, bool dequantize, u8 scaling_exponent, + AttributeFormat* native_format, Arm64Gen::ARM64Reg reg, u32 offset); void ReadColor(VertexComponentFormat attribute, ColorFormat format, Arm64Gen::ARM64Reg reg, u32 offset); void GenerateVertexLoader();