From f9e4e73e428ea0ce594fb2017272fc47d447f2fa Mon Sep 17 00:00:00 2001
From: xsacha
Date: Mon, 10 Jan 2011 13:14:56 +0000
Subject: [PATCH] Use SSSE3 shuffle for DataReader's DataReadU32xN in VideoCommon. The function is used for reading up to 16 u32's at a time (512-bits) and then converting endianness.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6802 8ced0084-cf51-0410-be5f-012b33b47a6e
---
Review notes (not part of the commit message):
 - OpcodeDecoder_Init() now installs all 16 SSSE3 readers. The original
   `*DataReadU32xFuncs = *DataReadU32xFuncs_SSSE3;` dereferenced both
   arrays and therefore replaced entry 0 only.
 - Restored `<tmmintrin.h>` and `template<int N>`, whose angle-bracketed
   text was missing from this copy of the patch.
 - Hunk line counts and the diffstat are unchanged.

 Source/Core/VideoCommon/Src/DLCache.cpp       |  2 -
 Source/Core/VideoCommon/Src/DataReader.h      | 52 +++++++++++++++++--
 .../Core/VideoCommon/Src/OpcodeDecoding.cpp   | 30 ++++++++++-
 .../VideoCommon/Src/VertexLoader_Position.cpp |  4 --
 4 files changed, 78 insertions(+), 10 deletions(-)

diff --git a/Source/Core/VideoCommon/Src/DLCache.cpp b/Source/Core/VideoCommon/Src/DLCache.cpp
index 4d76cfb290..8d1a9b829b 100644
--- a/Source/Core/VideoCommon/Src/DLCache.cpp
+++ b/Source/Core/VideoCommon/Src/DLCache.cpp
@@ -305,7 +305,6 @@ u8 AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
 					u32 Cmd2 = DataReadU32();
 					int transfer_size = ((Cmd2 >> 16) & 15) + 1;
 					u32 xf_address = Cmd2 & 0xFFFF;
-					// TODO - speed this up. pshufb?
 					u32 data_buffer[16];
 					DataReadU32xFuncs[transfer_size-1](data_buffer);
 					LoadXFReg(transfer_size, xf_address, data_buffer);
@@ -453,7 +452,6 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
 					u32 Cmd2 = DataReadU32();
 					int transfer_size = ((Cmd2 >> 16) & 15) + 1;
 					u32 xf_address = Cmd2 & 0xFFFF;
-					// TODO - speed this up. pshufb?
 					ReferencedDataRegion* NewRegion = new ReferencedDataRegion;
 					NewRegion->MustClean = true;
 					NewRegion->size = transfer_size * 4;
diff --git a/Source/Core/VideoCommon/Src/DataReader.h b/Source/Core/VideoCommon/Src/DataReader.h
index 933de18849..fb9a6bbe60 100644
--- a/Source/Core/VideoCommon/Src/DataReader.h
+++ b/Source/Core/VideoCommon/Src/DataReader.h
@@ -15,11 +15,17 @@
 // Official SVN repository and contact information can be found at
 // http://code.google.com/p/dolphin-emu/
 
+
+
 #ifndef _DATAREADER_H
 #define _DATAREADER_H
 
 extern u8* g_pVideoData;
 
+#if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
+#include <tmmintrin.h>
+#endif
+
 __forceinline void DataSkip(u32 skip)
 {
 	g_pVideoData += skip;
@@ -64,6 +70,49 @@ __forceinline u32 DataReadU32()
 	return tmp;
 }
 
+typedef void (*DataReadU32xNfunc)(u32 *buf);
+extern DataReadU32xNfunc DataReadU32xFuncs[16];
+
+#if _M_SSE >= 0x301
+
+const __m128i mask1 = _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,0,1,2,3);
+const __m128i mask2 = _mm_set_epi8(15,14,13,12,11,10,9,8,4,5,6,7,0,1,2,3);
+const __m128i mask3 = _mm_set_epi8(15,14,13,12,8,9,10,11,4,5,6,7,0,1,2,3);
+const __m128i mask4 = _mm_set_epi8(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
+
+template<int N>
+void DataReadU32xN_SSSE3(u32 *bufx16)
+{
+	__m128i* store = (__m128i *)bufx16;
+	__m128i* load = (__m128i *)g_pVideoData;
+	switch(N)
+	{
+	case 13: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 9: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 5: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 1: _mm_storeu_si128(store, _mm_shuffle_epi8(_mm_loadu_si128(load), mask1));
+		break;
+	case 14: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 10: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 6: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 2: _mm_storeu_si128(store, _mm_shuffle_epi8(_mm_loadu_si128(load), mask2));
+		break;
+	case 15: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 11: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 7: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 3: _mm_storeu_si128(store, _mm_shuffle_epi8(_mm_loadu_si128(load), mask3));
+		break;
+	case 16: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 12: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 8: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
+	case 4: _mm_storeu_si128(store, _mm_shuffle_epi8(_mm_loadu_si128(load), mask4));
+		break;
+	}
+	g_pVideoData += (sizeof(u32) * N);
+}
+
+#endif
+
 template<int N>
 void DataReadU32xN(u32 *bufx16)
 {
@@ -87,9 +136,6 @@ void DataReadU32xN(u32 *bufx16)
 	g_pVideoData += (sizeof(u32) * N);
 }
 
-typedef void (*DataReadU32xNfunc)(u32 *buf);
-extern DataReadU32xNfunc DataReadU32xFuncs[16];
-
 __forceinline u32 DataReadU32Unswapped()
 {
 	u32 tmp = *(u32*)g_pVideoData;
diff --git a/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp b/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp
index ff4d6d1d97..3555f1fba1 100644
--- a/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp
+++ b/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp
@@ -30,6 +30,7 @@
 #include "Profiler.h"
 #include "OpcodeDecoding.h"
 #include "CommandProcessor.h"
+#include "CPUDetect.h"
 
 #include "VertexLoaderManager.h"
 
@@ -47,6 +48,27 @@
 #include "VideoConfig.h"
 
 u8* g_pVideoData = 0;
+#if _M_SSE >= 0x301
+DataReadU32xNfunc DataReadU32xFuncs_SSSE3[16] = {
+	DataReadU32xN_SSSE3<1>,
+	DataReadU32xN_SSSE3<2>,
+	DataReadU32xN_SSSE3<3>,
+	DataReadU32xN_SSSE3<4>,
+	DataReadU32xN_SSSE3<5>,
+	DataReadU32xN_SSSE3<6>,
+	DataReadU32xN_SSSE3<7>,
+	DataReadU32xN_SSSE3<8>,
+	DataReadU32xN_SSSE3<9>,
+	DataReadU32xN_SSSE3<10>,
+	DataReadU32xN_SSSE3<11>,
+	DataReadU32xN_SSSE3<12>,
+	DataReadU32xN_SSSE3<13>,
+	DataReadU32xN_SSSE3<14>,
+	DataReadU32xN_SSSE3<15>,
+	DataReadU32xN_SSSE3<16>
+};
+#endif
+
 DataReadU32xNfunc DataReadU32xFuncs[16] = {
 	DataReadU32xN<1>,
 	DataReadU32xN<2>,
@@ -250,7 +272,6 @@ static void Decode()
 			u32 Cmd2 = DataReadU32();
 			int transfer_size = ((Cmd2 >> 16) & 15) + 1;
 			u32 xf_address = Cmd2 & 0xFFFF;
-			// TODO - speed this up. pshufb?
 			u32 data_buffer[16];
 			DataReadU32xFuncs[transfer_size-1](data_buffer);
 
@@ -401,6 +422,13 @@ void OpcodeDecoder_Init()
 {
 	g_pVideoData = FAKE_GetFifoStartPtr();
 
+#if _M_SSE >= 0x301
+	// Install every SSSE3 reader; dereferencing the arrays copies only entry 0.
+	if (cpu_info.bSSSE3)
+		for (int i = 0; i < 16; ++i)
+			DataReadU32xFuncs[i] = DataReadU32xFuncs_SSSE3[i];
+#endif
+
 	if (g_Config.bEnableOpenCL)
 	{
 		OpenCL::Initialize();
diff --git a/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp b/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp
index 6f5d7e680c..b8483b119c 100644
--- a/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp
+++ b/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp
@@ -22,10 +22,6 @@
 #include "VertexManagerBase.h"
 #include "CPUDetect.h"
 
-#if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
-#include <tmmintrin.h>
-#endif
-
 extern float posScale;
 extern TVtxAttr *pVtxAttr;
 