Enabled SSE2 optimization of GX_TF_CMPR decoder only for x86 builds. It can't compete with the x64 optimized reference C code.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6755 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
james.jdunne 2011-01-06 00:32:52 +00:00
parent 809670611b
commit 5ca3adde3c

View File

@ -1707,20 +1707,18 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
case GX_TF_CMPR: // speed critical
// The metroid games use this format almost exclusively.
{
#if !defined(_M_X64)
// JSD optimized with SSE2 intrinsics.
// Produces a ~40% improvement in speed over reference C implementation on an x86 Intel Core 2 Duo
// but also a -10% to 0% "improvement" on an x64 AMD Phenom II. Further optimization is required to
// ensure that all architectures and CPUs gain benefits.
// Produces a 30% improvement for x86 code only.
for (int y = 0; y < height; y += 8)
{
for (int x = 0; x < width; x += 8)
{
u32 *dst32 = dst + y * width + x;
// We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers.
// This is ideal because a single DXT block contains 2 RGBA colors when decoded from their 16-bit.
// Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is parallelizable
// at this level, so we do.
for (int z = 0; z < 2; ++z, src += sizeof(struct DXTBlock) * 2, dst32 += (width * 4))
for (int z = 0; z < 2; ++z, src += sizeof(struct DXTBlock) * 2)
{
// JSD NOTE: You may see many strange patterns of behavior in the below code, but they
// are for performance reasons. Sometimes, calculating what should be obvious hard-coded
@ -1820,12 +1818,6 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_slli_si128(_mm_srli_si128(rgb31, 4), 8 + 4)
);
// Copy the color arrays from the XMM registers to local variables in RAM:
GC_ALIGNED16(u32 colors0A[4]);
GC_ALIGNED16(u32 colors0B[4]);
_mm_store_si128((__m128i *)colors0A, rgb01230);
_mm_store_si128((__m128i *)colors0B, rgb01450);
// Create an array for color lookups for DXT1 so we can use the 2-bit indices:
const __m128i rgb01231 = _mm_or_si128(
_mm_or_si128(
@ -1842,12 +1834,6 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
_mm_slli_si128(_mm_srli_si128(rgb31, 8 + 4), 8 + 4)
);
// Copy the color arrays from the XMM registers to aligned local variables in RAM:
GC_ALIGNED16(u32 colors1A[4]);
GC_ALIGNED16(u32 colors1B[4]);
_mm_store_si128((__m128i *)colors1A, rgb01231);
_mm_store_si128((__m128i *)colors1B, rgb01451);
// The #ifdef CHECKs here and below are to compare correctness of output against the reference code.
// Don't use them in a normal build.
#ifdef CHECK
@ -1867,11 +1853,48 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// the assembly generated was swapping a lot of temporaries to RAM.
// Compare rgb0 to rgb1:
// Each 32-bit word will contain either 0xFFFFFFFF or 0x00000000 for true/false.
//const __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(rgb0, rgb1);
const __m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16);
// rgb1rgb1 rgb0rgb0 rgb1rgb1 rgb0rgb0
// c0 example: {0x 5ef05ef0 c003c003 3ff83ff8 e003e003} in BIG ENDIAN
// c0cmp contains rgb0 values in 64-bit words in LITTLE ENDIAN
// c0cmp: {0x 00000000 000003c0 00000000 000003e0}
// c0shr contains rgb1 values in 64-bit words in LITTLE ENDIAN
// c0cmp: {0x 00000000 0000f05e 00000000 0000f83f}
__m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16);
const __m128i c0shr = _mm_srli_epi64(c0cmp, 32);
const __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr);
c0cmp = _mm_srli_epi64(_mm_slli_epi64(c0cmp, 48), 48);
#if 0
// SSE4 only.
const __m128i cmprgb0rgb1 = _mm_cmpgt_epi64(c0cmp, c0shr);
#else
// We need 64-bit words full of 0xFF bytes for comparison masks. cmgt_epi64 would do that but it's only SSE4
// so we fake it on SSE2 by shuffling the 32-bit comparison masks to copy the 0xFF bytes to the high 32-bits of
// each 64-bit word.
__m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr);
cmprgb0rgb1 = _mm_shuffle_epi32(cmprgb0rgb1, _MM_SHUFFLE(2, 2, 0, 0));
#endif
// Now use the comparison mask from rgb0 > rgb1 to conditionally move colors from
// either rgb0123X or rgb0145X:
const __m128i cmp0 = _mm_unpacklo_epi64(cmprgb0rgb1, cmprgb0rgb1);
const __m128i cmp1 = _mm_unpackhi_epi64(cmprgb0rgb1, cmprgb0rgb1);
const __m128i mmcolors0 = _mm_or_si128(
_mm_and_si128(cmp0, rgb01230),
_mm_andnot_si128(cmp0, rgb01450)
);
const __m128i mmcolors1 = _mm_or_si128(
_mm_and_si128(cmp1, rgb01231),
_mm_andnot_si128(cmp1, rgb01451)
);
// Copy the color arrays from the XMM registers to local variables in RAM:
GC_ALIGNED16(u32 colors0[4]);
GC_ALIGNED16(u32 colors1[4]);
_mm_store_si128((__m128i *)colors0, mmcolors0);
_mm_store_si128((__m128i *)colors1, mmcolors1);
// Copy the 2-bit indices from each DXT block:
GC_ALIGNED16(u32 dxttmp[4]);
@ -1880,244 +1903,97 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
u32 dxt0sel = dxttmp[1];
u32 dxt1sel = dxttmp[3];
// Copy the comparison results:
GC_ALIGNED16(u16 cmptmp[8]);
_mm_store_si128((__m128i *)cmptmp, cmprgb0rgb1);
// Per each row written we alternate storing RGBA values from DXT0 and DXT1.
__m128i *dst128 = (__m128i *)( dst + (y + z*4) * width + x );
// Row 0:
__m128i col0, col1;
if (cmptmp[0])
{
// rgb0a > rgb1a
col0 = _mm_set_epi32(
colors0A[(dxt0sel >> ((0*8)+0)) & 3],
colors0A[(dxt0sel >> ((0*8)+2)) & 3],
colors0A[(dxt0sel >> ((0*8)+4)) & 3],
colors0A[(dxt0sel >> ((0*8)+6)) & 3]
colors0[(dxt0sel >> ((0*8)+0)) & 3],
colors0[(dxt0sel >> ((0*8)+2)) & 3],
colors0[(dxt0sel >> ((0*8)+4)) & 3],
colors0[(dxt0sel >> ((0*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 0), col0);
#ifdef CHECK
assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 );
#endif
}
else
{
// rgb0a <= rgb1a
col0 = _mm_set_epi32(
colors0B[(dxt0sel >> ((0*8)+0)) & 3],
colors0B[(dxt0sel >> ((0*8)+2)) & 3],
colors0B[(dxt0sel >> ((0*8)+4)) & 3],
colors0B[(dxt0sel >> ((0*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 0), col0);
#ifdef CHECK
assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 );
#endif
}
if (cmptmp[4])
{
// (rgb0b > rgb1b)
col1 = _mm_set_epi32(
colors1A[(dxt1sel >> ((0*8)+0)) & 3],
colors1A[(dxt1sel >> ((0*8)+2)) & 3],
colors1A[(dxt1sel >> ((0*8)+4)) & 3],
colors1A[(dxt1sel >> ((0*8)+6)) & 3]
colors1[(dxt1sel >> ((0*8)+0)) & 3],
colors1[(dxt1sel >> ((0*8)+2)) & 3],
colors1[(dxt1sel >> ((0*8)+4)) & 3],
colors1[(dxt1sel >> ((0*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1);
#ifdef CHECK
assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 );
#endif
}
else
{
// (rgb0b <= rgb1b)
col1 = _mm_set_epi32(
colors1B[(dxt1sel >> ((0*8)+0)) & 3],
colors1B[(dxt1sel >> ((0*8)+2)) & 3],
colors1B[(dxt1sel >> ((0*8)+4)) & 3],
colors1B[(dxt1sel >> ((0*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1);
#ifdef CHECK
assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 );
#endif
}
// Row 1:
if (cmptmp[0])
{
// rgb0a > rgb1a
col0 = _mm_set_epi32(
colors0A[(dxt0sel >> ((1*8)+0)) & 3],
colors0A[(dxt0sel >> ((1*8)+2)) & 3],
colors0A[(dxt0sel >> ((1*8)+4)) & 3],
colors0A[(dxt0sel >> ((1*8)+6)) & 3]
colors0[(dxt0sel >> ((1*8)+0)) & 3],
colors0[(dxt0sel >> ((1*8)+2)) & 3],
colors0[(dxt0sel >> ((1*8)+4)) & 3],
colors0[(dxt0sel >> ((1*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 1), col0);
#ifdef CHECK
assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 );
#endif
}
else
{
// rgb0a <= rgb1a
col0 = _mm_set_epi32(
colors0B[(dxt0sel >> ((1*8)+0)) & 3],
colors0B[(dxt0sel >> ((1*8)+2)) & 3],
colors0B[(dxt0sel >> ((1*8)+4)) & 3],
colors0B[(dxt0sel >> ((1*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 1), col0);
#ifdef CHECK
assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 );
#endif
}
if (cmptmp[4])
{
// (rgb0b > rgb1b)
col1 = _mm_set_epi32(
colors1A[(dxt1sel >> ((1*8)+0)) & 3],
colors1A[(dxt1sel >> ((1*8)+2)) & 3],
colors1A[(dxt1sel >> ((1*8)+4)) & 3],
colors1A[(dxt1sel >> ((1*8)+6)) & 3]
colors1[(dxt1sel >> ((1*8)+0)) & 3],
colors1[(dxt1sel >> ((1*8)+2)) & 3],
colors1[(dxt1sel >> ((1*8)+4)) & 3],
colors1[(dxt1sel >> ((1*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1);
#ifdef CHECK
assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 );
#endif
}
else
{
// (rgb0b <= rgb1b)
col1 = _mm_set_epi32(
colors1B[(dxt1sel >> ((1*8)+0)) & 3],
colors1B[(dxt1sel >> ((1*8)+2)) & 3],
colors1B[(dxt1sel >> ((1*8)+4)) & 3],
colors1B[(dxt1sel >> ((1*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1);
#ifdef CHECK
assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 );
#endif
}
// Row 2:
if (cmptmp[0])
{
// rgb0a > rgb1a
col0 = _mm_set_epi32(
colors0A[(dxt0sel >> ((2*8)+0)) & 3],
colors0A[(dxt0sel >> ((2*8)+2)) & 3],
colors0A[(dxt0sel >> ((2*8)+4)) & 3],
colors0A[(dxt0sel >> ((2*8)+6)) & 3]
colors0[(dxt0sel >> ((2*8)+0)) & 3],
colors0[(dxt0sel >> ((2*8)+2)) & 3],
colors0[(dxt0sel >> ((2*8)+4)) & 3],
colors0[(dxt0sel >> ((2*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 2), col0);
#ifdef CHECK
assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 );
#endif
}
else
{
// rgb0a <= rgb1a
col0 = _mm_set_epi32(
colors0B[(dxt0sel >> ((2*8)+0)) & 3],
colors0B[(dxt0sel >> ((2*8)+2)) & 3],
colors0B[(dxt0sel >> ((2*8)+4)) & 3],
colors0B[(dxt0sel >> ((2*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 2), col0);
#ifdef CHECK
assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 );
#endif
}
if (cmptmp[4])
{
// (rgb0b > rgb1b)
col1 = _mm_set_epi32(
colors1A[(dxt1sel >> ((2*8)+0)) & 3],
colors1A[(dxt1sel >> ((2*8)+2)) & 3],
colors1A[(dxt1sel >> ((2*8)+4)) & 3],
colors1A[(dxt1sel >> ((2*8)+6)) & 3]
colors1[(dxt1sel >> ((2*8)+0)) & 3],
colors1[(dxt1sel >> ((2*8)+2)) & 3],
colors1[(dxt1sel >> ((2*8)+4)) & 3],
colors1[(dxt1sel >> ((2*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1);
#ifdef CHECK
assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 );
#endif
}
else
{
// (rgb0b <= rgb1b)
col1 = _mm_set_epi32(
colors1B[(dxt1sel >> ((2*8)+0)) & 3],
colors1B[(dxt1sel >> ((2*8)+2)) & 3],
colors1B[(dxt1sel >> ((2*8)+4)) & 3],
colors1B[(dxt1sel >> ((2*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1);
#ifdef CHECK
assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 );
#endif
}
// Row 3:
if (cmptmp[0])
{
// rgb0a > rgb1a
col0 = _mm_set_epi32(
colors0A[(dxt0sel >> ((3*8)+0)) & 3],
colors0A[(dxt0sel >> ((3*8)+2)) & 3],
colors0A[(dxt0sel >> ((3*8)+4)) & 3],
colors0A[(dxt0sel >> ((3*8)+6)) & 3]
colors0[(dxt0sel >> ((3*8)+0)) & 3],
colors0[(dxt0sel >> ((3*8)+2)) & 3],
colors0[(dxt0sel >> ((3*8)+4)) & 3],
colors0[(dxt0sel >> ((3*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 3), col0);
#ifdef CHECK
assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 );
#endif
}
else
{
// rgb0a <= rgb1a
col0 = _mm_set_epi32(
colors0B[(dxt0sel >> ((3*8)+0)) & 3],
colors0B[(dxt0sel >> ((3*8)+2)) & 3],
colors0B[(dxt0sel >> ((3*8)+4)) & 3],
colors0B[(dxt0sel >> ((3*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 3), col0);
#ifdef CHECK
assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 );
#endif
}
if (cmptmp[4])
{
// (rgb0b > rgb1b)
col1 = _mm_set_epi32(
colors1A[(dxt1sel >> ((3*8)+0)) & 3],
colors1A[(dxt1sel >> ((3*8)+2)) & 3],
colors1A[(dxt1sel >> ((3*8)+4)) & 3],
colors1A[(dxt1sel >> ((3*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1);
#ifdef CHECK
assert( memcmp(&(tmp1[3]), dst128 + ((width / 4) * 3) + 1, 16) == 0 );
#endif
}
else
{
// (rgb0b <= rgb1b)
col1 = _mm_set_epi32(
colors1B[(dxt1sel >> ((3*8)+0)) & 3],
colors1B[(dxt1sel >> ((3*8)+2)) & 3],
colors1B[(dxt1sel >> ((3*8)+4)) & 3],
colors1B[(dxt1sel >> ((3*8)+6)) & 3]
colors1[(dxt1sel >> ((3*8)+0)) & 3],
colors1[(dxt1sel >> ((3*8)+2)) & 3],
colors1[(dxt1sel >> ((3*8)+4)) & 3],
colors1[(dxt1sel >> ((3*8)+6)) & 3]
);
_mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1);
#ifdef CHECK
@ -2126,8 +2002,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
}
}
}
}
#if 0
#else
for (int y = 0; y < height; y += 8)
{
for (int x = 0; x < width; x += 8)