From b038df64bfad478c4e2605985809f58f351ec11c Mon Sep 17 00:00:00 2001 From: "james.jdunne" Date: Thu, 30 Dec 2010 19:17:08 +0000 Subject: [PATCH] TextureDecoder.cpp: new SSE2 optimized GX_TF_I8 decoder. Probably not ultimately optimal SSE2 code, but provably better (on my machine) than the memset version. Tested with __rdtsc counts in an independent project. I get about 6-7 FPS more on average during the intro movie playback in Mario Kart Wii. Hope this compiles for GCC okay. TextureDecoder.cpp: merged two functionally identical decode5A3RGBA and decode5A3rgba methods. OpcodeDecoding.cpp and DLCache.cpp: optimization for GX_LOAD_XF_REG. The PSHUFB solution sounds better for SSSE3, but this is a small win for the default case. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6692 8ced0084-cf51-0410-be5f-012b33b47a6e --- Source/Core/VideoCommon/Src/DLCache.cpp | 6 +- Source/Core/VideoCommon/Src/DataReader.h | 26 +++ .../Core/VideoCommon/Src/OpcodeDecoding.cpp | 34 +++- .../Core/VideoCommon/Src/TextureDecoder.cpp | 161 ++++++++++++------ Source/Core/VideoCommon/VideoCommon.vcproj | 3 + 5 files changed, 164 insertions(+), 66 deletions(-) diff --git a/Source/Core/VideoCommon/Src/DLCache.cpp b/Source/Core/VideoCommon/Src/DLCache.cpp index 932d561580..4d76cfb290 100644 --- a/Source/Core/VideoCommon/Src/DLCache.cpp +++ b/Source/Core/VideoCommon/Src/DLCache.cpp @@ -307,8 +307,7 @@ u8 AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl) u32 xf_address = Cmd2 & 0xFFFF; // TODO - speed this up. pshufb? 
u32 data_buffer[16]; - for (int i = 0; i < transfer_size; i++) - data_buffer[i] = DataReadU32(); + DataReadU32xFuncs[transfer_size-1](data_buffer); LoadXFReg(transfer_size, xf_address, data_buffer); INCSTAT(stats.thisFrame.numXFLoads); num_xf_reg++; @@ -462,8 +461,7 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl) NewRegion->hash = 0; dl->InsertRegion(NewRegion); u32 *data_buffer = (u32*)NewRegion->start_address; - for (int i = 0; i < transfer_size; i++) - data_buffer[i] = DataReadU32(); + DataReadU32xFuncs[transfer_size-1](data_buffer); LoadXFReg(transfer_size, xf_address, data_buffer); INCSTAT(stats.thisFrame.numXFLoads); // Compile diff --git a/Source/Core/VideoCommon/Src/DataReader.h b/Source/Core/VideoCommon/Src/DataReader.h index f98736c407..933de18849 100644 --- a/Source/Core/VideoCommon/Src/DataReader.h +++ b/Source/Core/VideoCommon/Src/DataReader.h @@ -64,6 +64,32 @@ __forceinline u32 DataReadU32() return tmp; } +template +void DataReadU32xN(u32 *bufx16) +{ + memcpy(bufx16, g_pVideoData, sizeof(u32) * N); + if (N >= 1) bufx16[0] = Common::swap32(bufx16[0]); + if (N >= 2) bufx16[1] = Common::swap32(bufx16[1]); + if (N >= 3) bufx16[2] = Common::swap32(bufx16[2]); + if (N >= 4) bufx16[3] = Common::swap32(bufx16[3]); + if (N >= 5) bufx16[4] = Common::swap32(bufx16[4]); + if (N >= 6) bufx16[5] = Common::swap32(bufx16[5]); + if (N >= 7) bufx16[6] = Common::swap32(bufx16[6]); + if (N >= 8) bufx16[7] = Common::swap32(bufx16[7]); + if (N >= 9) bufx16[8] = Common::swap32(bufx16[8]); + if (N >= 10) bufx16[9] = Common::swap32(bufx16[9]); + if (N >= 11) bufx16[10] = Common::swap32(bufx16[10]); + if (N >= 12) bufx16[11] = Common::swap32(bufx16[11]); + if (N >= 13) bufx16[12] = Common::swap32(bufx16[12]); + if (N >= 14) bufx16[13] = Common::swap32(bufx16[13]); + if (N >= 15) bufx16[14] = Common::swap32(bufx16[14]); + if (N >= 16) bufx16[15] = Common::swap32(bufx16[15]); + g_pVideoData += (sizeof(u32) * N); +} + +typedef void 
(*DataReadU32xNfunc)(u32 *buf); +extern DataReadU32xNfunc DataReadU32xFuncs[16]; + __forceinline u32 DataReadU32Unswapped() { u32 tmp = *(u32*)g_pVideoData; diff --git a/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp b/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp index 4fb9574478..fbb81fb40d 100644 --- a/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp +++ b/Source/Core/VideoCommon/Src/OpcodeDecoding.cpp @@ -49,6 +49,24 @@ #endif u8* g_pVideoData = 0; +DataReadU32xNfunc DataReadU32xFuncs[16] = { + DataReadU32xN<1>, + DataReadU32xN<2>, + DataReadU32xN<3>, + DataReadU32xN<4>, + DataReadU32xN<5>, + DataReadU32xN<6>, + DataReadU32xN<7>, + DataReadU32xN<8>, + DataReadU32xN<9>, + DataReadU32xN<10>, + DataReadU32xN<11>, + DataReadU32xN<12>, + DataReadU32xN<13>, + DataReadU32xN<14>, + DataReadU32xN<15>, + DataReadU32xN<16> +}; extern u8* FAKE_GetFifoStartPtr(); extern u8* FAKE_GetFifoEndPtr(); @@ -233,12 +251,13 @@ static void Decode() { u32 Cmd2 = DataReadU32(); int transfer_size = ((Cmd2 >> 16) & 15) + 1; - u32 xf_address = Cmd2 & 0xFFFF; + u32 xf_address = Cmd2 & 0xFFFF; // TODO - speed this up. pshufb? - u32 data_buffer[16]; - for (int i = 0; i < transfer_size; i++) - data_buffer[i] = DataReadU32(); + u32 data_buffer[16]; + DataReadU32xFuncs[transfer_size-1](data_buffer); + LoadXFReg(transfer_size, xf_address, data_buffer); + INCSTAT(stats.thisFrame.numXFLoads); } break; @@ -317,7 +336,7 @@ static void DecodeSemiNop() u8 sub_cmd = DataReadU8(); u32 value = DataReadU32(); LoadCPReg(sub_cmd, value); - INCSTAT(stats.thisFrame.numCPLoads); + INCSTAT(stats.thisFrame.numCPLoads); } break; @@ -328,10 +347,9 @@ static void DecodeSemiNop() u32 address = Cmd2 & 0xFFFF; // TODO - speed this up. pshufb? 
u32 data_buffer[16]; - for (int i = 0; i < transfer_size; i++) - data_buffer[i] = DataReadU32(); + DataReadU32xFuncs[transfer_size-1](data_buffer); LoadXFReg(transfer_size, address, data_buffer); - INCSTAT(stats.thisFrame.numXFLoads); + INCSTAT(stats.thisFrame.numXFLoads); } break; diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp index dd350d45c7..af29d6b4b4 100644 --- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp @@ -32,6 +32,7 @@ #if _M_SSE >= 0x401 #include +#include #elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__) #include #endif @@ -209,43 +210,6 @@ inline u32 decode5A3(u16 val) return (a << 24) | (r << 16) | (g << 8) | b; } -inline u32 decode5A3rgba(u16 val) -{ - int r,g,b,a; - if ((val & 0x8000)) - { - a = 0xFF; - r = Convert5To8((val >> 10) & 0x1F); - g = Convert5To8((val >> 5) & 0x1F); - b = Convert5To8(val & 0x1F); - } - else - { - a = Convert3To8((val >> 12) & 0x7); - r = Convert4To8((val >> 8) & 0xF); - g = Convert4To8((val >> 4) & 0xF); - b = Convert4To8(val & 0xF); - } - return (a << 24) | (b << 16) | (g << 8) | r; -} - -inline u32 decode565RGBA(u16 val) -{ - int r,g,b,a; - r=Convert5To8((val>>11) & 0x1f); - g=Convert6To8((val>>5 ) & 0x3f); - b=Convert5To8((val ) & 0x1f); - a=0xFF; - return r | (g<<8) | (b << 16) | (a << 24); -} - -inline u32 decodeIA8Swapped(u16 val) -{ - int a = val & 0xFF; - int i = val >> 8; - return i | (i<<8) | (i<<16) | (a<<24); -} - inline u32 decode5A3RGBA(u16 val) { int r,g,b,a; @@ -266,6 +230,23 @@ inline u32 decode5A3RGBA(u16 val) return r | (g<<8) | (b << 16) | (a << 24); } +inline u32 decode565RGBA(u16 val) +{ + int r,g,b,a; + r=Convert5To8((val>>11) & 0x1f); + g=Convert6To8((val>>5 ) & 0x3f); + b=Convert5To8((val ) & 0x1f); + a=0xFF; + return r | (g<<8) | (b << 16) | (a << 24); +} + +inline u32 decodeIA8Swapped(u16 val) +{ + int a = val & 0xFF; + int i = val >> 8; + return i | (i<<8) | 
(i<<16) | (a<<24); +} + struct DXTBlock @@ -293,8 +274,8 @@ inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr) for (int x = 0; x < 4; x++) { u8 val = src[x]; - *dst++ = decode5A3rgba(Common::swap16(tlut[val >> 4])); - *dst++ = decode5A3rgba(Common::swap16(tlut[val & 0xF])); + *dst++ = decode5A3RGBA(Common::swap16(tlut[val >> 4])); + *dst++ = decode5A3RGBA(Common::swap16(tlut[val & 0xF])); } } @@ -348,7 +329,7 @@ inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr) for (int x = 0; x < 8; x++) { u8 val = src[x]; - *dst++ = decode5A3rgba(Common::swap16(tlut[val])); + *dst++ = decode5A3RGBA(Common::swap16(tlut[val])); } } @@ -422,7 +403,7 @@ inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr) for (int x = 0; x < 4; x++) { u16 val = Common::swap16(src[x]); - *dst++ = decode5A3rgba(Common::swap16(tlut[(val & 0x3FFF)])); + *dst++ = decode5A3RGBA(Common::swap16(tlut[(val & 0x3FFF)])); } } @@ -481,23 +462,43 @@ inline void decodebytesIA4RGBA(u32 *dst, const u8 *src) inline void decodebytesRGB5A3(u32 *dst, const u16 *src) { +#if 0 for (int x = 0; x < 4; x++) dst[x] = decode5A3(Common::swap16(src[x])); +#else + dst[0] = decode5A3(Common::swap16(src[0])); + dst[1] = decode5A3(Common::swap16(src[1])); + dst[2] = decode5A3(Common::swap16(src[2])); + dst[3] = decode5A3(Common::swap16(src[3])); +#endif } inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src) { +#if 0 for (int x = 0; x < 4; x++) - dst[x] = decode5A3rgba(Common::swap16(src[x])); + dst[x] = decode5A3RGBA(Common::swap16(src[x])); +#else + dst[0] = decode5A3RGBA(Common::swap16(src[0])); + dst[1] = decode5A3RGBA(Common::swap16(src[1])); + dst[2] = decode5A3RGBA(Common::swap16(src[2])); + dst[3] = decode5A3RGBA(Common::swap16(src[3])); +#endif } // This one is used by many video formats. It'd therefore be good if it was fast. // Needs more speed. 
inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2) { - for (int x = 0; x < 4; x++) { +#if 0 + for (int x = 0; x < 4; x++) dst[x] = Common::swap32((src2[x] << 16) | src[x]); - } +#else + dst[0] = Common::swap32((src2[0] << 16) | src[0]); + dst[1] = Common::swap32((src2[1] << 16) | src[1]); + dst[2] = Common::swap32((src2[2] << 16) | src[2]); + dst[3] = Common::swap32((src2[3] << 16) | src[3]); +#endif // This can probably be done in a few SSE pack/unpack instructions + pshufb // some unpack instruction x2: @@ -508,11 +509,18 @@ inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2) // and we are done. } -inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 *src2) +inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2) { - for (int x = 0; x < 4; x++) { +#if 0 + for (int x = 0; x < 4; x++) { dst[x] = ((src[x] & 0xFF) << 24) | ((src[x] & 0xFF00)>>8) | (src2[x] << 8); - } + } +#else + dst[0] = ((src[0] & 0xFF) << 24) | ((src[0] & 0xFF00)>>8) | (src2[0] << 8); + dst[1] = ((src[1] & 0xFF) << 24) | ((src[1] & 0xFF00)>>8) | (src2[1] << 8); + dst[2] = ((src[2] & 0xFF) << 24) | ((src[2] & 0xFF00)>>8) | (src2[2] << 8); + dst[3] = ((src[3] & 0xFF) << 24) | ((src[3] & 0xFF00)>>8) | (src2[3] << 8); +#endif } inline u32 makecol(int r, int g, int b, int a) @@ -919,7 +927,7 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh -PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) +PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt) { switch (texformat) { @@ -966,9 +974,52 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int heig { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 4; iy++, src += 8) - for (int ix = 0; ix < 8; ix++) - memset(dst + (y + 
iy)*width+x+ ix, (src + ix)[0], 4); +#if _M_SSE >= 0x401 + + for (int iy = 0; iy < 4; ++iy, src += 8) + { + __m128i *quaddst = (__m128i *)(dst + (y + iy)*width + x); + const __m128i m0 = _mm_or_si128( + _mm_or_si128( + _mm_and_si128(_mm_set1_epi8(src[0]), _mm_set_epi32(0, 0, 0, (int)0xffffffffU)), + _mm_and_si128(_mm_set1_epi8(src[1]), _mm_set_epi32(0, 0, (int)0xffffffffU, 0)) + ), + _mm_or_si128( + _mm_and_si128(_mm_set1_epi8(src[2]), _mm_set_epi32(0, (int)0xffffffffU, 0, 0)), + _mm_and_si128(_mm_set1_epi8(src[3]), _mm_set_epi32((int)0xffffffffU, 0, 0, 0)) + ) + ); + _mm_store_si128(quaddst, m0); + + const __m128i m1 = _mm_or_si128( + _mm_or_si128( + _mm_and_si128(_mm_set1_epi8(src[4]), _mm_set_epi32(0, 0, 0, (int)0xffffffffU)), + _mm_and_si128(_mm_set1_epi8(src[5]), _mm_set_epi32(0, 0, (int)0xffffffffU, 0)) + ), + _mm_or_si128( + _mm_and_si128(_mm_set1_epi8(src[6]), _mm_set_epi32(0, (int)0xffffffffU, 0, 0)), + _mm_and_si128(_mm_set1_epi8(src[7]), _mm_set_epi32((int)0xffffffffU, 0, 0, 0)) + ) + ); + _mm_store_si128(quaddst+1, m1); + } +#else + for (int iy = 0; iy < 4; ++iy, src += 8) + { + u32 * newdst = dst + (y + iy)*width+x; + const u8 * newsrc = src; + u8 srcval; + + srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24); + srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24); + srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24); + srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24); + srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24); + srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24); + srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24); + srcval = newsrc[0]; newdst[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24); + } +#endif } 
break; case GX_TF_C8: @@ -1014,8 +1065,10 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int heig { u32 *ptr = dst + (y + iy) * width + x; u16 *s = (u16 *)src; - for(int j = 0; j < 4; j++) - *ptr++ = decodeIA8Swapped(*s++); + ptr[0] = decodeIA8Swapped(s[0]); + ptr[1] = decodeIA8Swapped(s[1]); + ptr[2] = decodeIA8Swapped(s[2]); + ptr[3] = decodeIA8Swapped(s[3]); } } @@ -1058,7 +1111,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int heig } break; case GX_TF_RGB5A3: - { + { // JSD: speed critical for Mario Kart Wii intro movie (at least) for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 4) for (int iy = 0; iy < 4; iy++, src += 8) diff --git a/Source/Core/VideoCommon/VideoCommon.vcproj b/Source/Core/VideoCommon/VideoCommon.vcproj index 7685d1bba8..e30905f34a 100644 --- a/Source/Core/VideoCommon/VideoCommon.vcproj +++ b/Source/Core/VideoCommon/VideoCommon.vcproj @@ -387,6 +387,9 @@ />