From e3578683e3d7e674a0c6453225ed0884e5214e86 Mon Sep 17 00:00:00 2001
From: Fiora
Date: Sun, 30 Nov 2014 10:50:36 -0800
Subject: [PATCH] Vertex loader: optimize texmtx_write_float4

Seems to be pretty high in the profile in some geometry-heavy games like The Last Story, and the compiler-generated assembly is terrifyingly bad, so SSE-ize it.
---
 Source/Core/VideoCommon/VertexLoader.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp
index 3fd644ec97..3f7161b8f7 100644
--- a/Source/Core/VideoCommon/VertexLoader.cpp
+++ b/Source/Core/VideoCommon/VertexLoader.cpp
@@ -91,11 +91,17 @@ static void LOADERDECL TexMtx_Write_Float2()
 
 static void LOADERDECL TexMtx_Write_Float4()
 {
+#if _M_SSE >= 0x200
+	__m128 output = _mm_cvtsi32_ss(_mm_castsi128_ps(_mm_setzero_si128()), s_curtexmtx[s_texmtxwrite++]);
+	_mm_storeu_ps((float*)VertexManager::s_pCurBufferPointer, _mm_shuffle_ps(output, output, 0x45 /* 1, 1, 0, 1 */));
+	VertexManager::s_pCurBufferPointer += sizeof(float) * 4;
+#else
 	DataWrite(0.f);
 	DataWrite(0.f);
 	DataWrite(float(s_curtexmtx[s_texmtxwrite++]));
 	// Just to fill out with 0.
 	DataWrite(0.f);
+#endif
 }
 
 VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)