mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-10 08:09:26 +01:00
TextureDecoder.cpp: new SSE2 optimized GX_TF_I8 decoder. Probably not ultimately optimal SSE2 code, but provably better (on my machine) than the memset version. Tested with __rdtsc counts in an independent project. I get about 6-7 FPS more on average during the intro movie playback in Mario Kart Wii. Hope this compiles for GCC okay.
TextureDecoder.cpp: merged two functionally identical decode5A3RGBA and decode5A3rgba methods. OpcodeDecoding.cpp and DLCache.cpp: optimization for GX_LOAD_XF_REG. The PSHUFB solution sounds better for SSSE3, but this is a small win for the default case. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6692 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
6cf9b3688d
commit
b038df64bf
@ -307,8 +307,7 @@ u8 AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
|
||||
u32 xf_address = Cmd2 & 0xFFFF;
|
||||
// TODO - speed this up. pshufb?
|
||||
u32 data_buffer[16];
|
||||
for (int i = 0; i < transfer_size; i++)
|
||||
data_buffer[i] = DataReadU32();
|
||||
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
||||
LoadXFReg(transfer_size, xf_address, data_buffer);
|
||||
INCSTAT(stats.thisFrame.numXFLoads);
|
||||
num_xf_reg++;
|
||||
@ -462,8 +461,7 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
|
||||
NewRegion->hash = 0;
|
||||
dl->InsertRegion(NewRegion);
|
||||
u32 *data_buffer = (u32*)NewRegion->start_address;
|
||||
for (int i = 0; i < transfer_size; i++)
|
||||
data_buffer[i] = DataReadU32();
|
||||
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
||||
LoadXFReg(transfer_size, xf_address, data_buffer);
|
||||
INCSTAT(stats.thisFrame.numXFLoads);
|
||||
// Compile
|
||||
|
@ -64,6 +64,32 @@ __forceinline u32 DataReadU32()
|
||||
return tmp;
|
||||
}
|
||||
|
||||
template<unsigned int N>
|
||||
void DataReadU32xN(u32 *bufx16)
|
||||
{
|
||||
memcpy(bufx16, g_pVideoData, sizeof(u32) * N);
|
||||
if (N >= 1) bufx16[0] = Common::swap32(bufx16[0]);
|
||||
if (N >= 2) bufx16[1] = Common::swap32(bufx16[1]);
|
||||
if (N >= 3) bufx16[2] = Common::swap32(bufx16[2]);
|
||||
if (N >= 4) bufx16[3] = Common::swap32(bufx16[3]);
|
||||
if (N >= 5) bufx16[4] = Common::swap32(bufx16[4]);
|
||||
if (N >= 6) bufx16[5] = Common::swap32(bufx16[5]);
|
||||
if (N >= 7) bufx16[6] = Common::swap32(bufx16[6]);
|
||||
if (N >= 8) bufx16[7] = Common::swap32(bufx16[7]);
|
||||
if (N >= 9) bufx16[8] = Common::swap32(bufx16[8]);
|
||||
if (N >= 10) bufx16[9] = Common::swap32(bufx16[9]);
|
||||
if (N >= 11) bufx16[10] = Common::swap32(bufx16[10]);
|
||||
if (N >= 12) bufx16[11] = Common::swap32(bufx16[11]);
|
||||
if (N >= 13) bufx16[12] = Common::swap32(bufx16[12]);
|
||||
if (N >= 14) bufx16[13] = Common::swap32(bufx16[13]);
|
||||
if (N >= 15) bufx16[14] = Common::swap32(bufx16[14]);
|
||||
if (N >= 16) bufx16[15] = Common::swap32(bufx16[15]);
|
||||
g_pVideoData += (sizeof(u32) * N);
|
||||
}
|
||||
|
||||
typedef void (*DataReadU32xNfunc)(u32 *buf);
|
||||
extern DataReadU32xNfunc DataReadU32xFuncs[16];
|
||||
|
||||
__forceinline u32 DataReadU32Unswapped()
|
||||
{
|
||||
u32 tmp = *(u32*)g_pVideoData;
|
||||
|
@ -49,6 +49,24 @@
|
||||
#endif
|
||||
|
||||
u8* g_pVideoData = 0;
|
||||
// Table of DataReadU32xN instantiations: entry k reads k + 1 words, so a
// caller that wants `count` words (1..16) selects DataReadU32xFuncs[count - 1].
DataReadU32xNfunc DataReadU32xFuncs[16] = {
	DataReadU32xN<1>,  DataReadU32xN<2>,  DataReadU32xN<3>,  DataReadU32xN<4>,
	DataReadU32xN<5>,  DataReadU32xN<6>,  DataReadU32xN<7>,  DataReadU32xN<8>,
	DataReadU32xN<9>,  DataReadU32xN<10>, DataReadU32xN<11>, DataReadU32xN<12>,
	DataReadU32xN<13>, DataReadU32xN<14>, DataReadU32xN<15>, DataReadU32xN<16>
};
|
||||
|
||||
extern u8* FAKE_GetFifoStartPtr();
|
||||
extern u8* FAKE_GetFifoEndPtr();
|
||||
@ -236,9 +254,10 @@ static void Decode()
|
||||
u32 xf_address = Cmd2 & 0xFFFF;
|
||||
// TODO - speed this up. pshufb?
|
||||
u32 data_buffer[16];
|
||||
for (int i = 0; i < transfer_size; i++)
|
||||
data_buffer[i] = DataReadU32();
|
||||
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
||||
|
||||
LoadXFReg(transfer_size, xf_address, data_buffer);
|
||||
|
||||
INCSTAT(stats.thisFrame.numXFLoads);
|
||||
}
|
||||
break;
|
||||
@ -328,8 +347,7 @@ static void DecodeSemiNop()
|
||||
u32 address = Cmd2 & 0xFFFF;
|
||||
// TODO - speed this up. pshufb?
|
||||
u32 data_buffer[16];
|
||||
for (int i = 0; i < transfer_size; i++)
|
||||
data_buffer[i] = DataReadU32();
|
||||
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
||||
LoadXFReg(transfer_size, address, data_buffer);
|
||||
INCSTAT(stats.thisFrame.numXFLoads);
|
||||
}
|
||||
|
@ -32,6 +32,7 @@
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
#include <smmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
@ -209,43 +210,6 @@ inline u32 decode5A3(u16 val)
|
||||
return (a << 24) | (r << 16) | (g << 8) | b;
|
||||
}
|
||||
|
||||
// Decodes one 16-bit texel whose top bit selects between two layouts and
// packs the result with r in bits 0-7, g in bits 8-15, b in bits 16-23 and
// a in bits 24-31.
inline u32 decode5A3rgba(u16 val)
{
	int alpha, red, green, blue;
	const bool has_alpha_bits = (val & 0x8000) == 0;

	if (has_alpha_bits)
	{
		// Bit 15 clear: 3-bit alpha followed by 4 bits per colour channel.
		alpha = Convert3To8((val >> 12) & 0x7);
		red   = Convert4To8((val >> 8) & 0xF);
		green = Convert4To8((val >> 4) & 0xF);
		blue  = Convert4To8(val & 0xF);
	}
	else
	{
		// Bit 15 set: alpha is fixed at 0xFF, 5 bits per colour channel.
		alpha = 0xFF;
		red   = Convert5To8((val >> 10) & 0x1F);
		green = Convert5To8((val >> 5) & 0x1F);
		blue  = Convert5To8(val & 0x1F);
	}

	return red | (green << 8) | (blue << 16) | (alpha << 24);
}
|
||||
|
||||
// Expands a 16-bit 5:6:5 value into a 32-bit word with r in bits 0-7,
// g in bits 8-15, b in bits 16-23 and a constant 0xFF in bits 24-31.
inline u32 decode565RGBA(u16 val)
{
	const int red   = Convert5To8((val >> 11) & 0x1f);  // source bits 15..11
	const int green = Convert6To8((val >> 5) & 0x3f);   // source bits 10..5
	const int blue  = Convert5To8(val & 0x1f);          // source bits 4..0
	const int alpha = 0xFF;                             // format carries no alpha
	return (alpha << 24) | (blue << 16) | (green << 8) | red;
}
|
||||
|
||||
// Builds a 32-bit word from a 16-bit value: the high byte of val is
// replicated into bits 0-7, 8-15 and 16-23, and the low byte of val is placed
// in bits 24-31.
inline u32 decodeIA8Swapped(u16 val)
{
	const int top_byte = val & 0xFF;   // ends up in bits 24-31
	const int rep_byte = val >> 8;     // replicated across the lower three bytes
	const int replicated = rep_byte | (rep_byte << 8) | (rep_byte << 16);
	return replicated | (top_byte << 24);
}
|
||||
|
||||
inline u32 decode5A3RGBA(u16 val)
|
||||
{
|
||||
int r,g,b,a;
|
||||
@ -266,6 +230,23 @@ inline u32 decode5A3RGBA(u16 val)
|
||||
return r | (g<<8) | (b << 16) | (a << 24);
|
||||
}
|
||||
|
||||
// Expands a 16-bit 5:6:5 value into a 32-bit word with r in bits 0-7,
// g in bits 8-15, b in bits 16-23 and a constant 0xFF in bits 24-31.
inline u32 decode565RGBA(u16 val)
{
	const int red   = Convert5To8((val >> 11) & 0x1f);  // source bits 15..11
	const int green = Convert6To8((val >> 5) & 0x3f);   // source bits 10..5
	const int blue  = Convert5To8(val & 0x1f);          // source bits 4..0
	const int alpha = 0xFF;                             // format carries no alpha
	return (alpha << 24) | (blue << 16) | (green << 8) | red;
}
|
||||
|
||||
// Builds a 32-bit word from a 16-bit value: the high byte of val is
// replicated into bits 0-7, 8-15 and 16-23, and the low byte of val is placed
// in bits 24-31.
inline u32 decodeIA8Swapped(u16 val)
{
	const int top_byte = val & 0xFF;   // ends up in bits 24-31
	const int rep_byte = val >> 8;     // replicated across the lower three bytes
	const int replicated = rep_byte | (rep_byte << 8) | (rep_byte << 16);
	return replicated | (top_byte << 24);
}
|
||||
|
||||
|
||||
|
||||
struct DXTBlock
|
||||
@ -293,8 +274,8 @@ inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr)
|
||||
for (int x = 0; x < 4; x++)
|
||||
{
|
||||
u8 val = src[x];
|
||||
*dst++ = decode5A3rgba(Common::swap16(tlut[val >> 4]));
|
||||
*dst++ = decode5A3rgba(Common::swap16(tlut[val & 0xF]));
|
||||
*dst++ = decode5A3RGBA(Common::swap16(tlut[val >> 4]));
|
||||
*dst++ = decode5A3RGBA(Common::swap16(tlut[val & 0xF]));
|
||||
}
|
||||
}
|
||||
|
||||
@ -348,7 +329,7 @@ inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr)
|
||||
for (int x = 0; x < 8; x++)
|
||||
{
|
||||
u8 val = src[x];
|
||||
*dst++ = decode5A3rgba(Common::swap16(tlut[val]));
|
||||
*dst++ = decode5A3RGBA(Common::swap16(tlut[val]));
|
||||
}
|
||||
}
|
||||
|
||||
@ -422,7 +403,7 @@ inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr)
|
||||
for (int x = 0; x < 4; x++)
|
||||
{
|
||||
u16 val = Common::swap16(src[x]);
|
||||
*dst++ = decode5A3rgba(Common::swap16(tlut[(val & 0x3FFF)]));
|
||||
*dst++ = decode5A3RGBA(Common::swap16(tlut[(val & 0x3FFF)]));
|
||||
}
|
||||
}
|
||||
|
||||
@ -481,23 +462,43 @@ inline void decodebytesIA4RGBA(u32 *dst, const u8 *src)
|
||||
|
||||
// Decodes four consecutive 16-bit source values into four 32-bit outputs.
// Each source value goes through Common::swap16 and then decode5A3.
// Deliberately written as four straight-line statements rather than a
// counted loop (equivalent to: for x in 0..3, dst[x] = decode5A3(swap16(src[x]))).
inline void decodebytesRGB5A3(u32 *dst, const u16 *src)
{
	*dst++ = decode5A3(Common::swap16(*src++));
	*dst++ = decode5A3(Common::swap16(*src++));
	*dst++ = decode5A3(Common::swap16(*src++));
	*dst   = decode5A3(Common::swap16(*src));
}
|
||||
|
||||
// Decodes four consecutive 16-bit source values into four 32-bit outputs.
// Each source value goes through Common::swap16 and then decode5A3RGBA.
// Deliberately written as four straight-line statements rather than a
// counted loop (equivalent to: for x in 0..3, dst[x] = decode5A3RGBA(swap16(src[x]))).
inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src)
{
	*dst++ = decode5A3RGBA(Common::swap16(*src++));
	*dst++ = decode5A3RGBA(Common::swap16(*src++));
	*dst++ = decode5A3RGBA(Common::swap16(*src++));
	*dst   = decode5A3RGBA(Common::swap16(*src));
}
|
||||
|
||||
// This one is used by many video formats. It'd therefore be good if it was fast.
|
||||
// Needs more speed.
|
||||
inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2)
|
||||
{
|
||||
for (int x = 0; x < 4; x++) {
|
||||
#if 0
|
||||
for (int x = 0; x < 4; x++)
|
||||
dst[x] = Common::swap32((src2[x] << 16) | src[x]);
|
||||
}
|
||||
#else
|
||||
dst[0] = Common::swap32((src2[0] << 16) | src[0]);
|
||||
dst[1] = Common::swap32((src2[1] << 16) | src[1]);
|
||||
dst[2] = Common::swap32((src2[2] << 16) | src[2]);
|
||||
dst[3] = Common::swap32((src2[3] << 16) | src[3]);
|
||||
#endif
|
||||
|
||||
// This can probably be done in a few SSE pack/unpack instructions + pshufb
|
||||
// some unpack instruction x2:
|
||||
@ -508,11 +509,18 @@ inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2)
|
||||
// and we are done.
|
||||
}
|
||||
|
||||
// Combines four pairs of 16-bit values into four 32-bit outputs.
// For each index k in 0..3:
//   bits 24-31 of dst[k] = low byte of src[k]
//   bits  0-7  of dst[k] = high byte of src[k]
//   bits  8-23 of dst[k] = src2[k]
// Written as four straight-line stanzas rather than a counted loop.
inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2)
{
	u16 first, second;

	first = src[0]; second = src2[0];
	dst[0] = ((first & 0xFF) << 24) | ((first & 0xFF00) >> 8) | (second << 8);

	first = src[1]; second = src2[1];
	dst[1] = ((first & 0xFF) << 24) | ((first & 0xFF00) >> 8) | (second << 8);

	first = src[2]; second = src2[2];
	dst[2] = ((first & 0xFF) << 24) | ((first & 0xFF00) >> 8) | (second << 8);

	first = src[3]; second = src2[3];
	dst[3] = ((first & 0xFF) << 24) | ((first & 0xFF00) >> 8) | (second << 8);
}
|
||||
|
||||
inline u32 makecol(int r, int g, int b, int a)
|
||||
@ -919,7 +927,7 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
|
||||
|
||||
|
||||
|
||||
PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
|
||||
PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
|
||||
{
|
||||
switch (texformat)
|
||||
{
|
||||
@ -966,9 +974,52 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int heig
|
||||
{
|
||||
for (int y = 0; y < height; y += 4)
|
||||
for (int x = 0; x < width; x += 8)
|
||||
for (int iy = 0; iy < 4; iy++, src += 8)
|
||||
for (int ix = 0; ix < 8; ix++)
|
||||
memset(dst + (y + iy)*width+x+ ix, (src + ix)[0], 4);
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
for (int iy = 0; iy < 4; ++iy, src += 8)
|
||||
{
|
||||
__m128i *quaddst = (__m128i *)(dst + (y + iy)*width + x);
|
||||
const __m128i m0 = _mm_or_si128(
|
||||
_mm_or_si128(
|
||||
_mm_and_si128(_mm_set1_epi8(src[0]), _mm_set_epi32(0, 0, 0, (int)0xffffffffU)),
|
||||
_mm_and_si128(_mm_set1_epi8(src[1]), _mm_set_epi32(0, 0, (int)0xffffffffU, 0))
|
||||
),
|
||||
_mm_or_si128(
|
||||
_mm_and_si128(_mm_set1_epi8(src[2]), _mm_set_epi32(0, (int)0xffffffffU, 0, 0)),
|
||||
_mm_and_si128(_mm_set1_epi8(src[3]), _mm_set_epi32((int)0xffffffffU, 0, 0, 0))
|
||||
)
|
||||
);
|
||||
_mm_store_si128(quaddst, m0);
|
||||
|
||||
const __m128i m1 = _mm_or_si128(
|
||||
_mm_or_si128(
|
||||
_mm_and_si128(_mm_set1_epi8(src[4]), _mm_set_epi32(0, 0, 0, (int)0xffffffffU)),
|
||||
_mm_and_si128(_mm_set1_epi8(src[5]), _mm_set_epi32(0, 0, (int)0xffffffffU, 0))
|
||||
),
|
||||
_mm_or_si128(
|
||||
_mm_and_si128(_mm_set1_epi8(src[6]), _mm_set_epi32(0, (int)0xffffffffU, 0, 0)),
|
||||
_mm_and_si128(_mm_set1_epi8(src[7]), _mm_set_epi32((int)0xffffffffU, 0, 0, 0))
|
||||
)
|
||||
);
|
||||
_mm_store_si128(quaddst+1, m1);
|
||||
}
|
||||
#else
|
||||
for (int iy = 0; iy < 4; ++iy, src += 8)
|
||||
{
|
||||
u32 * newdst = dst + (y + iy)*width+x;
|
||||
const u8 * newsrc = src;
|
||||
u8 srcval;
|
||||
|
||||
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
|
||||
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
|
||||
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
|
||||
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
|
||||
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
|
||||
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
|
||||
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
|
||||
srcval = newsrc[0]; newdst[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
case GX_TF_C8:
|
||||
@ -1014,8 +1065,10 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int heig
|
||||
{
|
||||
u32 *ptr = dst + (y + iy) * width + x;
|
||||
u16 *s = (u16 *)src;
|
||||
for(int j = 0; j < 4; j++)
|
||||
*ptr++ = decodeIA8Swapped(*s++);
|
||||
ptr[0] = decodeIA8Swapped(s[0]);
|
||||
ptr[1] = decodeIA8Swapped(s[1]);
|
||||
ptr[2] = decodeIA8Swapped(s[2]);
|
||||
ptr[3] = decodeIA8Swapped(s[3]);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1058,7 +1111,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int heig
|
||||
}
|
||||
break;
|
||||
case GX_TF_RGB5A3:
|
||||
{
|
||||
{ // JSD: speed critical for Mario Kart Wii intro movie (at least)
|
||||
for (int y = 0; y < height; y += 4)
|
||||
for (int x = 0; x < width; x += 4)
|
||||
for (int iy = 0; iy < 4; iy++, src += 8)
|
||||
|
@ -387,6 +387,9 @@
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="2"
|
||||
InlineFunctionExpansion="1"
|
||||
EnableIntrinsicFunctions="true"
|
||||
AdditionalIncludeDirectories="../../PluginSpecs;../../../Externals/CLRun/include;../../../Externals/SOIL;../Common/Src;../Core/Src"
|
||||
PreprocessorDefinitions="WIN32;NDEBUG;_LIB;_SECURE_SCL=0;__WXMSW__;wxUSE_BASE=0;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE"
|
||||
RuntimeLibrary="0"
|
||||
|
Loading…
x
Reference in New Issue
Block a user