From 873902b4a3cc40b6a03bef197e54c5165f98d842 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Mon, 19 Jan 2015 18:33:08 +0100 Subject: [PATCH 1/4] VertexLoader: remove non-JIT SSE code --- Source/Core/Core/FifoPlayer/FifoAnalyzer.cpp | 2 - Source/Core/VideoCommon/VertexLoader.cpp | 22 +-- Source/Core/VideoCommon/VertexLoader.h | 75 +--------- .../Core/VideoCommon/VertexLoader_Normal.cpp | 130 ------------------ .../VideoCommon/VertexLoader_Position.cpp | 70 +--------- .../VideoCommon/VertexLoader_TextCoord.cpp | 58 +------- 6 files changed, 8 insertions(+), 349 deletions(-) diff --git a/Source/Core/Core/FifoPlayer/FifoAnalyzer.cpp b/Source/Core/Core/FifoPlayer/FifoAnalyzer.cpp index aeecca8d00..31beb60107 100644 --- a/Source/Core/Core/FifoPlayer/FifoAnalyzer.cpp +++ b/Source/Core/Core/FifoPlayer/FifoAnalyzer.cpp @@ -17,8 +17,6 @@ namespace FifoAnalyzer void Init() { VertexLoader_Normal::Init(); - VertexLoader_Position::Init(); - VertexLoader_TextCoord::Init(); } u8 ReadFifo8(u8 *&data) diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index 1122a04f72..732f0db696 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -28,16 +28,6 @@ u8* g_video_buffer_read_ptr; u8* g_vertex_manager_write_ptr; -void* VertexLoader::operator new (size_t size) -{ - return AllocateAlignedMemory(size, 16); -} - -void VertexLoader::operator delete (void *p) -{ - FreeAlignedMemory(p); -} - static void LOADERDECL PosMtx_ReadDirect_UByte(VertexLoader* loader) { u8 posmtx = BoundingBox::posMtxIdx = DataReadU8() & 0x3f; @@ -66,15 +56,9 @@ static void LOADERDECL TexMtx_Write_Float2(VertexLoader* loader) static void LOADERDECL TexMtx_Write_Float3(VertexLoader* loader) { -#if _M_SSE >= 0x200 - __m128 output = _mm_cvtsi32_ss(_mm_castsi128_ps(_mm_setzero_si128()), loader->m_curtexmtx[loader->m_texmtxwrite++]); - _mm_storeu_ps((float*)g_vertex_manager_write_ptr, _mm_shuffle_ps(output, output, 0x45 /* 1, 1, 0, 1 */)); - g_vertex_manager_write_ptr += sizeof(float) * 3; -#else DataWrite(0.f); DataWrite(0.f); DataWrite(float(loader->m_curtexmtx[loader->m_texmtxwrite++])); -#endif } static void LOADERDECL SkipVertex(VertexLoader* loader) @@ -92,15 +76,13 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr) : VertexLoaderBase(vtx_desc, vtx_attr) { VertexLoader_Normal::Init(); - VertexLoader_Position::Init(); - VertexLoader_TextCoord::Init(); CompileVertexTranslator(); // generate frac factors - m_posScale[0] = m_posScale[1] = m_posScale[2] = m_posScale[3] = 1.0f / (1U << m_VtxAttr.PosFrac); + m_posScale = 1.0f / (1U << m_VtxAttr.PosFrac); for (int i = 0; i < 8; i++) - m_tcScale[i][0] = m_tcScale[i][1] = 1.0f / (1U << m_VtxAttr.texCoord[i].Frac); + m_tcScale[i] = 1.0f / (1U << m_VtxAttr.texCoord[i].Frac); for (int i = 0; i < 2; i++) m_colElements[i] = m_VtxAttr.color[i].Elements; diff --git a/Source/Core/VideoCommon/VertexLoader.h b/Source/Core/VideoCommon/VertexLoader.h index a7acd5dba1..247dce8f41 100644 --- a/Source/Core/VideoCommon/VertexLoader.h +++ b/Source/Core/VideoCommon/VertexLoader.h @@ -18,13 +18,6 @@ #include "VideoCommon/VertexLoaderBase.h" #include "VideoCommon/VertexLoaderUtils.h" -#if _M_SSE >= 0x401 -#include -#include -#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__) -#include -#endif - #ifdef WIN32 #define LOADERDECL __cdecl #else @@ -37,11 +30,6 @@ typedef void (LOADERDECL *TPipelineFunction)(VertexLoader* loader); class VertexLoader : public VertexLoaderBase { public: - // This class need a 16 byte alignment. As this is broken on - // MSVC right now (Dec 2014), we use custom allocation. - void* operator new (size_t size); - void operator delete (void *p); - VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr); int RunVertices(int primitive, int count, DataReader src, DataReader dst) override; @@ -49,9 +37,8 @@ public: bool IsInitialized() override { return true; } // This vertex loader supports all formats // They are used for the communication with the loader functions - // Duplicated (4x and 2x respectively) and used in SSE code in the vertex loader JIT - GC_ALIGNED128(float m_posScale[4]); - GC_ALIGNED64(float m_tcScale[8][2]); + float m_posScale; + float m_tcScale[8]; int m_tcIndex; int m_colIndex; int m_colElements[2]; @@ -73,61 +60,3 @@ private: void WriteCall(TPipelineFunction); }; - -#if _M_SSE >= 0x301 -static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L); -static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L); -static const __m128i kMaskSwap16to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFF0405L, 0xFFFF0203L, 0xFFFF0001L); -static const __m128i kMaskSwap16to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0203L, 0xFFFF0001L); -static const __m128i kMaskSwap16to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL); -static const __m128i kMaskSwap16to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL); -static const __m128i kMask8to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L); -static const __m128i kMask8to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L); -static const __m128i kMask8to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x02FFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL); -static const __m128i kMask8to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL); - -template -__forceinline void Vertex_Read_SSSE3(const T* pData, __m128 scale) -{ - __m128i coords, mask; - - int loadBytes = sizeof(T) * (2 + threeIn); - if (loadBytes > 8) - coords = _mm_loadu_si128((__m128i*)pData); - else if (loadBytes > 4) - coords = _mm_loadl_epi64((__m128i*)pData); - else - coords = _mm_cvtsi32_si128(*(u32*)pData); - - // Float case (no scaling) - if (sizeof(T) == 4) - { - coords = _mm_shuffle_epi8(coords, threeIn ? kMaskSwap32_3 : kMaskSwap32_2); - if (threeOut) - _mm_storeu_si128((__m128i*)g_vertex_manager_write_ptr, coords); - else - _mm_storel_epi64((__m128i*)g_vertex_manager_write_ptr, coords); - } - else - { - // Byte swap, unpack, and move to high bytes for sign extend. - if (std::is_unsigned::value) - mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32l_3 : kMaskSwap16to32l_2) : (threeIn ? kMask8to32l_3 : kMask8to32l_2); - else - mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32h_3 : kMaskSwap16to32h_2) : (threeIn ? kMask8to32h_3 : kMask8to32h_2); - coords = _mm_shuffle_epi8(coords, mask); - - // Sign extend - if (std::is_signed::value) - coords = _mm_srai_epi32(coords, 32 - sizeof(T) * 8); - - __m128 out = _mm_mul_ps(_mm_cvtepi32_ps(coords), scale); - if (threeOut) - _mm_storeu_ps((float*)g_vertex_manager_write_ptr, out); - else - _mm_storel_pi((__m64*)g_vertex_manager_write_ptr, out); - } - - g_vertex_manager_write_ptr += sizeof(float) * (2 + threeOut); -} -#endif diff --git a/Source/Core/VideoCommon/VertexLoader_Normal.cpp b/Source/Core/VideoCommon/VertexLoader_Normal.cpp index 0d794d9944..298d01ed01 100644 --- a/Source/Core/VideoCommon/VertexLoader_Normal.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Normal.cpp @@ -6,8 +6,6 @@ #include #include "Common/CommonTypes.h" -#include "Common/CPUDetect.h" - #include "VideoCommon/VertexLoader.h" #include "VideoCommon/VertexLoader_Normal.h" #include "VideoCommon/VertexManagerBase.h" @@ -102,63 +100,6 @@ struct Normal_Index_Indices3 static const int size = sizeof(I) * 3; }; -#if _M_SSE >= 0x301 -template -struct Normal_Direct_SSSE3 -{ - static void LOADERDECL function(VertexLoader* loader) - { - const T* pData = reinterpret_cast(DataGetPosition()); - const float frac = 1. / float(1u << (sizeof(T) * 8 - std::is_signed::value - 1)); - const __m128 scale = _mm_set_ps1(frac); - for (int i = 0; i < N; i++, pData += 3) - Vertex_Read_SSSE3(pData, scale); - DataSkip(); - } - - static const int size = sizeof(T) * N * 3; -}; - -template -__forceinline void Normal_Index_Offset_SSSE3() -{ - static_assert(std::is_unsigned::value, "Only unsigned I is sane!"); - - auto const index = DataRead(); - const T* pData = (const T*)(cached_arraybases[ARRAY_NORMAL] - + (index * g_main_cp_state.array_strides[ARRAY_NORMAL]) + sizeof(T) * 3 * Offset); - const float frac = 1. / float(1u << (sizeof(T) * 8 - std::is_signed::value - 1)); - const __m128 scale = _mm_set_ps1(frac); - for (int i = 0; i < N; i++, pData += 3) - Vertex_Read_SSSE3(pData, scale); -} - -template -struct Normal_Index_SSSE3 -{ - static void LOADERDECL function(VertexLoader* loader) - { - Normal_Index_Offset_SSSE3(); - } - - static const int size = sizeof(I); -}; - -template -struct Normal_Index_Indices3_SSSE3 -{ - static void LOADERDECL function(VertexLoader* loader) - { - Normal_Index_Offset_SSSE3(); - Normal_Index_Offset_SSSE3(); - Normal_Index_Offset_SSSE3(); - } - - static const int size = sizeof(I) * 3; -}; - -#endif - } void VertexLoader_Normal::Init() @@ -231,77 +172,6 @@ void VertexLoader_Normal::Init() m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3(); m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3(); m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3(); - -#if _M_SSE >= 0x301 - if (cpu_info.bSSSE3) - { - m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Direct_SSSE3(); - - m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Direct_SSSE3(); - m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Direct_SSSE3(); - - m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_SSSE3(); - - m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_Indices3_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Index_Indices3_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3_SSSE3(); - m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3_SSSE3(); - - m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_SSSE3(); - - m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_Indices3_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Index_Indices3_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3_SSSE3(); - m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3_SSSE3(); - } -#endif } unsigned int VertexLoader_Normal::GetSize(u64 _type, diff --git a/Source/Core/VideoCommon/VertexLoader_Position.cpp b/Source/Core/VideoCommon/VertexLoader_Position.cpp index 43e56119ea..3dac5a55b1 100644 --- a/Source/Core/VideoCommon/VertexLoader_Position.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Position.cpp @@ -5,8 +5,6 @@ #include #include "Common/CommonTypes.h" -#include "Common/CPUDetect.h" - #include "VideoCommon/VertexLoader.h" #include "VideoCommon/VertexLoader_Position.h" #include "VideoCommon/VertexManagerBase.h" @@ -28,7 +26,7 @@ template void LOADERDECL Pos_ReadDirect(VertexLoader* loader) { static_assert(N <= 3, "N > 3 is not sane!"); - auto const scale = loader->m_posScale[0];; + auto const scale = loader->m_posScale; DataReader dst(g_vertex_manager_write_ptr, nullptr); DataReader src(g_video_buffer_read_ptr, nullptr); @@ -49,7 +47,7 @@ void LOADERDECL Pos_ReadIndex(VertexLoader* loader) auto const index = DataRead(); loader->m_vertexSkip = index == std::numeric_limits::max(); auto const data = reinterpret_cast(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION])); - auto const scale = loader->m_posScale[0]; + auto const scale = loader->m_posScale; DataReader dst(g_vertex_manager_write_ptr, nullptr); for (int i = 0; i < 3; ++i) @@ -59,28 +57,6 @@ void LOADERDECL Pos_ReadIndex(VertexLoader* loader) LOG_VTX(); } -#if _M_SSE >= 0x301 -template -void LOADERDECL Pos_ReadDirect_SSSE3(VertexLoader* loader) -{ - const T* pData = reinterpret_cast(DataGetPosition()); - Vertex_Read_SSSE3(pData, *(__m128*)loader->m_posScale); - DataSkip<(2 + three) * sizeof(T)>(); - LOG_VTX(); -} - -template -void LOADERDECL Pos_ReadIndex_SSSE3(VertexLoader* loader) -{ - static_assert(std::is_unsigned::value, "Only unsigned I is sane!"); - auto const index = DataRead(); - loader->m_vertexSkip = index == std::numeric_limits::max(); - const T* pData = (const T*)(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION])); - Vertex_Read_SSSE3(pData, *(__m128*)loader->m_posScale); - LOG_VTX(); -} -#endif - static TPipelineFunction tableReadPosition[4][8][2] = { { {nullptr, nullptr,}, @@ -127,48 +103,6 @@ static int tableReadPositionVertexSize[4][8][2] = { }, }; - -void VertexLoader_Position::Init() -{ - -#if _M_SSE >= 0x301 - if (cpu_info.bSSSE3) - { - tableReadPosition[1][0][0] = Pos_ReadDirect_SSSE3; - tableReadPosition[1][0][1] = Pos_ReadDirect_SSSE3; - tableReadPosition[1][1][0] = Pos_ReadDirect_SSSE3; - tableReadPosition[1][1][1] = Pos_ReadDirect_SSSE3; - tableReadPosition[1][2][0] = Pos_ReadDirect_SSSE3; - tableReadPosition[1][2][1] = Pos_ReadDirect_SSSE3; - tableReadPosition[1][3][0] = Pos_ReadDirect_SSSE3; - tableReadPosition[1][3][1] = Pos_ReadDirect_SSSE3; - tableReadPosition[1][4][0] = Pos_ReadDirect_SSSE3; - tableReadPosition[1][4][1] = Pos_ReadDirect_SSSE3; - tableReadPosition[2][0][0] = Pos_ReadIndex_SSSE3; - tableReadPosition[2][0][1] = Pos_ReadIndex_SSSE3; - tableReadPosition[3][0][0] = Pos_ReadIndex_SSSE3; - tableReadPosition[3][0][1] = Pos_ReadIndex_SSSE3; - tableReadPosition[2][1][0] = Pos_ReadIndex_SSSE3; - tableReadPosition[2][1][1] = Pos_ReadIndex_SSSE3; - tableReadPosition[3][1][0] = Pos_ReadIndex_SSSE3; - tableReadPosition[3][1][1] = Pos_ReadIndex_SSSE3; - tableReadPosition[2][2][0] = Pos_ReadIndex_SSSE3; - tableReadPosition[2][2][1] = Pos_ReadIndex_SSSE3; - tableReadPosition[3][2][0] = Pos_ReadIndex_SSSE3; - tableReadPosition[3][2][1] = Pos_ReadIndex_SSSE3; - tableReadPosition[2][3][0] = Pos_ReadIndex_SSSE3; - tableReadPosition[2][3][1] = Pos_ReadIndex_SSSE3; - tableReadPosition[3][3][0] = Pos_ReadIndex_SSSE3; - tableReadPosition[3][3][1] = Pos_ReadIndex_SSSE3; - tableReadPosition[2][4][0] = Pos_ReadIndex_SSSE3; - tableReadPosition[2][4][1] = Pos_ReadIndex_SSSE3; - tableReadPosition[3][4][0] = Pos_ReadIndex_SSSE3; - tableReadPosition[3][4][1] = Pos_ReadIndex_SSSE3; - } -#endif - -} - unsigned int VertexLoader_Position::GetSize(u64 _type, unsigned int _format, unsigned int _elements) { return tableReadPositionVertexSize[_type][_format][_elements]; diff --git a/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp b/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp index 6e807a46e8..69e8ca155e 100644 --- a/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp +++ b/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp @@ -5,8 +5,6 @@ #include #include "Common/CommonTypes.h" -#include "Common/CPUDetect.h" - #include "VideoCommon/VertexLoader.h" #include "VideoCommon/VertexLoader_TextCoord.h" #include "VideoCommon/VertexManagerBase.h" @@ -49,7 +47,7 @@ float TCScale(float val, float scale) template void LOADERDECL TexCoord_ReadDirect(VertexLoader* loader) { - auto const scale = loader->m_tcScale[loader->m_tcIndex][0]; + auto const scale = loader->m_tcScale[loader->m_tcIndex]; DataReader dst(g_vertex_manager_write_ptr, nullptr); DataReader src(g_video_buffer_read_ptr, nullptr); @@ -71,7 +69,7 @@ void LOADERDECL TexCoord_ReadIndex(VertexLoader* loader) auto const index = DataRead(); auto const data = reinterpret_cast(cached_arraybases[ARRAY_TEXCOORD0 + loader->m_tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + loader->m_tcIndex])); - auto const scale = loader->m_tcScale[loader->m_tcIndex][0]; + auto const scale = loader->m_tcScale[loader->m_tcIndex]; DataReader dst(g_vertex_manager_write_ptr, nullptr); for (int i = 0; i != N; ++i) @@ -82,32 +80,6 @@ void LOADERDECL TexCoord_ReadIndex(VertexLoader* loader) ++loader->m_tcIndex; } -#if _M_SSE >= 0x301 -template -void LOADERDECL TexCoord_ReadDirect2_SSSE3(VertexLoader* loader) -{ - const T* pData = reinterpret_cast(DataGetPosition()); - __m128 scale = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)loader->m_tcScale[loader->m_tcIndex])); - Vertex_Read_SSSE3(pData, scale); - DataSkip<2 * sizeof(T)>(); - LOG_TEX<2>(); - loader->m_tcIndex++; -} - -template -void LOADERDECL TexCoord_ReadIndex2_SSSE3(VertexLoader* loader) -{ - static_assert(std::is_unsigned::value, "Only unsigned I is sane!"); - - auto const index = DataRead(); - const T* pData = (const T*)(cached_arraybases[ARRAY_TEXCOORD0 + loader->m_tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + loader->m_tcIndex])); - __m128 scale = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)loader->m_tcScale[loader->m_tcIndex])); - Vertex_Read_SSSE3(pData, scale); - LOG_TEX<2>(); - loader->m_tcIndex++; -} -#endif - static TPipelineFunction tableReadTexCoord[4][8][2] = { { {nullptr, nullptr,}, @@ -154,32 +126,6 @@ static int tableReadTexCoordVertexSize[4][8][2] = { }, }; -void VertexLoader_TextCoord::Init() -{ - -#if _M_SSE >= 0x301 - if (cpu_info.bSSSE3) - { - tableReadTexCoord[1][0][1] = TexCoord_ReadDirect2_SSSE3; - tableReadTexCoord[1][1][1] = TexCoord_ReadDirect2_SSSE3; - tableReadTexCoord[1][2][1] = TexCoord_ReadDirect2_SSSE3; - tableReadTexCoord[1][3][1] = TexCoord_ReadDirect2_SSSE3; - tableReadTexCoord[1][4][1] = TexCoord_ReadDirect2_SSSE3; - tableReadTexCoord[2][0][1] = TexCoord_ReadIndex2_SSSE3; - tableReadTexCoord[2][1][1] = TexCoord_ReadIndex2_SSSE3; - tableReadTexCoord[2][2][1] = TexCoord_ReadIndex2_SSSE3; - tableReadTexCoord[2][3][1] = TexCoord_ReadIndex2_SSSE3; - tableReadTexCoord[2][4][1] = TexCoord_ReadIndex2_SSSE3; - tableReadTexCoord[3][0][1] = TexCoord_ReadIndex2_SSSE3; - tableReadTexCoord[3][1][1] = TexCoord_ReadIndex2_SSSE3; - tableReadTexCoord[3][2][1] = TexCoord_ReadIndex2_SSSE3; - tableReadTexCoord[3][3][1] = TexCoord_ReadIndex2_SSSE3; - tableReadTexCoord[3][4][1] = TexCoord_ReadIndex2_SSSE3; - } -#endif - -} - unsigned int VertexLoader_TextCoord::GetSize(u64 _type, unsigned int _format, unsigned int _elements) { return tableReadTexCoordVertexSize[_type][_format][_elements]; From 80617ec6bd9c654835865780831e58fb434be90e Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Tue, 20 Jan 2015 00:24:18 +0100 Subject: [PATCH 2/4] VertexLoader: remove weird line --- Source/Core/VideoCommon/VertexLoader.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index 732f0db696..f764aae478 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -270,7 +270,6 @@ void VertexLoader::CompileVertexTranslator() } else { - components |= VB_HAS_UV0 << i; // have to include since using now m_native_vtx_decl.texcoords[i].components = 3; nat_offset += 12; WriteCall(TexMtx_Write_Float3); From 46ab5d63d62ccb70b3502fbdfffcbd04a3eefb65 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Tue, 20 Jan 2015 01:58:11 +0100 Subject: [PATCH 3/4] VertexLoader: never reset alpha in 8888 colors Fixes the opening menu of Xenoblade Chronicles. --- Source/Core/VideoCommon/VertexLoader.cpp | 3 --- Source/Core/VideoCommon/VertexLoader.h | 1 - Source/Core/VideoCommon/VertexLoaderX64.cpp | 7 +++---- Source/Core/VideoCommon/VertexLoaderX64.h | 2 +- Source/Core/VideoCommon/VertexLoader_Color.cpp | 11 +---------- 5 files changed, 5 insertions(+), 19 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index f764aae478..4edb6a89ef 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -83,9 +83,6 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr) m_posScale = 1.0f / (1U << m_VtxAttr.PosFrac); for (int i = 0; i < 8; i++) m_tcScale[i] = 1.0f / (1U << m_VtxAttr.texCoord[i].Frac); - - for (int i = 0; i < 2; i++) - m_colElements[i] = m_VtxAttr.color[i].Elements; } void VertexLoader::CompileVertexTranslator() diff --git a/Source/Core/VideoCommon/VertexLoader.h b/Source/Core/VideoCommon/VertexLoader.h index 247dce8f41..009c1ae006 100644 --- a/Source/Core/VideoCommon/VertexLoader.h +++ b/Source/Core/VideoCommon/VertexLoader.h @@ -41,7 +41,6 @@ public: float m_tcScale[8]; int m_tcIndex; int m_colIndex; - int m_colElements[2]; // Matrix components are first in GC format but later in PC format - we need to store it temporarily // when decoding each vertex. diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp index c9b435bae3..891643a592 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp @@ -145,7 +145,7 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count return load_bytes; } -void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format, int elements) +void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format) { int load_bytes = 0; switch (format) @@ -154,8 +154,7 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format, int eleme case FORMAT_32B_888x: case FORMAT_32B_8888: MOV(32, R(scratch1), data); - // See VertexLoader_Color.cpp for a comment on this condition. - if (format != FORMAT_32B_8888 || !elements) + if (format != FORMAT_32B_8888) OR(32, R(scratch1), Imm32(0xFF000000)); MOV(32, MDisp(dst_reg, m_dst_ofs), R(scratch1)); load_bytes = 3 + (format != FORMAT_24B_888); @@ -363,7 +362,7 @@ void VertexLoaderX64::GenerateVertexLoader() if (col[i]) { data = GetVertexAddr(ARRAY_COLOR + i, col[i]); - ReadColor(data, col[i], m_VtxAttr.color[i].Comp, m_VtxAttr.color[i].Elements); + ReadColor(data, col[i], m_VtxAttr.color[i].Comp); m_native_components |= VB_HAS_COL0 << i; m_native_vtx_decl.colors[i].components = 4; m_native_vtx_decl.colors[i].enable = true; diff --git a/Source/Core/VideoCommon/VertexLoaderX64.h b/Source/Core/VideoCommon/VertexLoaderX64.h index 3cf9d7c8ba..206da1cf30 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.h +++ b/Source/Core/VideoCommon/VertexLoaderX64.h @@ -17,6 +17,6 @@ private: Gen::FixupBranch m_skip_vertex; Gen::OpArg GetVertexAddr(int array, u64 attribute); int ReadVertex(Gen::OpArg data, u64 attribute, int format, int count_in, int count_out, u8 scaling_exponent, AttributeFormat* native_format); - void ReadColor(Gen::OpArg data, u64 attribute, int format, int elements); + void ReadColor(Gen::OpArg data, u64 attribute, int format); void GenerateVertexLoader(); }; diff --git a/Source/Core/VideoCommon/VertexLoader_Color.cpp b/Source/Core/VideoCommon/VertexLoader_Color.cpp index 5f2a8106c7..00255de00a 100644 --- a/Source/Core/VideoCommon/VertexLoader_Color.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Color.cpp @@ -92,18 +92,9 @@ void LOADERDECL Color_ReadDirect_24b_6666(VertexLoader* loader) _SetCol6666(loader, Common::swap32(DataGetPosition() - 1)); DataSkip(3); } -// F|RES: i am not 100 percent sure, but the colElements seems to be important for rendering only -// at least it fixes mario party 4 void LOADERDECL Color_ReadDirect_32b_8888(VertexLoader* loader) { - // TODO (mb2): check this - u32 col = DataReadU32Unswapped(); - - // "kill" the alpha - if (!loader->m_colElements[loader->m_colIndex]) - col |= AMASK; - - _SetCol(loader, col); + _SetCol(loader, DataReadU32Unswapped()); } template From 1dcf49237b81b850d5683887e3608fbab8c54b27 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Tue, 20 Jan 2015 02:12:03 +0100 Subject: [PATCH 4/4] VertexLoaderX64: support VAT.ByteDequant=0 --- Source/Core/VideoCommon/CPMemory.h | 2 +- Source/Core/VideoCommon/VertexLoaderBase.cpp | 5 ----- Source/Core/VideoCommon/VertexLoaderX64.cpp | 13 ++++++++----- Source/Core/VideoCommon/VertexLoaderX64.h | 2 +- Source/UnitTests/VideoCommon/VertexLoaderTest.cpp | 1 + 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/Source/Core/VideoCommon/CPMemory.h b/Source/Core/VideoCommon/CPMemory.h index 51e1ccc74f..e657464015 100644 --- a/Source/Core/VideoCommon/CPMemory.h +++ b/Source/Core/VideoCommon/CPMemory.h @@ -187,7 +187,7 @@ struct TVtxAttr u8 NormalFormat; ColorAttr color[2]; TexAttr texCoord[8]; - u8 ByteDequant; + bool ByteDequant; u8 NormalIndex3; }; diff --git a/Source/Core/VideoCommon/VertexLoaderBase.cpp b/Source/Core/VideoCommon/VertexLoaderBase.cpp index 815e162689..c0e8e533df 100644 --- a/Source/Core/VideoCommon/VertexLoaderBase.cpp +++ b/Source/Core/VideoCommon/VertexLoaderBase.cpp @@ -66,11 +66,6 @@ void VertexLoaderBase::SetVAT(const VAT& vat) m_VtxAttr.texCoord[7].Elements = vat.g2.Tex7CoordElements; m_VtxAttr.texCoord[7].Format = vat.g2.Tex7CoordFormat; m_VtxAttr.texCoord[7].Frac = vat.g2.Tex7Frac; - - if (!m_VtxAttr.ByteDequant) - { - ERROR_LOG(VIDEO, "ByteDequant is set to zero"); - } }; void VertexLoaderBase::AppendToString(std::string *dest) const diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp index 891643a592..de3b90b449 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp @@ -67,7 +67,7 @@ OpArg VertexLoaderX64::GetVertexAddr(int array, u64 attribute) } } -int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count_in, int count_out, u8 scaling_exponent, AttributeFormat* native_format) +int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format) { static const __m128i shuffle_lut[5][3] = { {_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF00L), // 1x u8 @@ -120,7 +120,7 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count CVTDQ2PS(coords, R(coords)); - if (scaling_exponent) + if (dequantize && scaling_exponent) MULPS(coords, M(&scale_factors[scaling_exponent])); } @@ -333,7 +333,8 @@ void VertexLoaderX64::GenerateVertexLoader() } OpArg data = GetVertexAddr(ARRAY_POSITION, m_VtxDesc.Position); - ReadVertex(data, m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements + 2, 3, m_VtxAttr.PosFrac, &m_native_vtx_decl.position); + ReadVertex(data, m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements + 2, 3, + m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position); if (m_VtxDesc.Normal) { @@ -348,7 +349,8 @@ void VertexLoaderX64::GenerateVertexLoader() int elem_size = 1 << (m_VtxAttr.NormalFormat / 2); data.offset += i * elem_size * 3; } - data.offset += ReadVertex(data, m_VtxDesc.Normal, m_VtxAttr.NormalFormat, 3, 3, scaling_exponent, &m_native_vtx_decl.normals[i]); + data.offset += ReadVertex(data, m_VtxDesc.Normal, m_VtxAttr.NormalFormat, 3, 3, + true, scaling_exponent, &m_native_vtx_decl.normals[i]); } m_native_components |= VB_HAS_NRM0; @@ -384,7 +386,8 @@ void VertexLoaderX64::GenerateVertexLoader() { data = GetVertexAddr(ARRAY_TEXCOORD0 + i, tc[i]); u8 scaling_exponent = m_VtxAttr.texCoord[i].Frac; - ReadVertex(data, tc[i], m_VtxAttr.texCoord[i].Format, elements, tm[i] ? 2 : elements, scaling_exponent, &m_native_vtx_decl.texcoords[i]); + ReadVertex(data, tc[i], m_VtxAttr.texCoord[i].Format, elements, tm[i] ? 2 : elements, + m_VtxAttr.ByteDequant, scaling_exponent, &m_native_vtx_decl.texcoords[i]); m_native_components |= VB_HAS_UV0 << i; } if (tm[i]) diff --git a/Source/Core/VideoCommon/VertexLoaderX64.h b/Source/Core/VideoCommon/VertexLoaderX64.h index 206da1cf30..da65fb2e34 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.h +++ b/Source/Core/VideoCommon/VertexLoaderX64.h @@ -16,7 +16,7 @@ private: u32 m_dst_ofs = 0; Gen::FixupBranch m_skip_vertex; Gen::OpArg GetVertexAddr(int array, u64 attribute); - int ReadVertex(Gen::OpArg data, u64 attribute, int format, int count_in, int count_out, u8 scaling_exponent, AttributeFormat* native_format); + int ReadVertex(Gen::OpArg data, u64 attribute, int format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format); void ReadColor(Gen::OpArg data, u64 attribute, int format); void GenerateVertexLoader(); }; diff --git a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp index 601e90dcc2..864e7de296 100644 --- a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp +++ b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp @@ -159,6 +159,7 @@ TEST_F(VertexLoaderTest, PositionDirectU16XY) // Test that scale works on U16 inputs. Input(42); Input(24); m_vtx_attr.g0.PosFrac = 1; + m_vtx_attr.g0.ByteDequant = 1; loader = VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr); count = loader->RunVertices(7, 1, src, dst); src.Skip(1 * loader->m_VertexSize);