From 5ca3adde3c30d4f0563c425282f24450c7791859 Mon Sep 17 00:00:00 2001 From: "james.jdunne" Date: Thu, 6 Jan 2011 00:32:52 +0000 Subject: [PATCH] Enabled SSE2 optimization of GX_TF_CMPR decoder only for x86 builds. It can't compete with the x64 optimized reference C code. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6755 8ced0084-cf51-0410-be5f-012b33b47a6e --- .../Core/VideoCommon/Src/TextureDecoder.cpp | 343 ++++++------------ 1 file changed, 109 insertions(+), 234 deletions(-) diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp index eb630cdee2..7e6c168cc7 100644 --- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp @@ -1707,20 +1707,18 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he case GX_TF_CMPR: // speed critical // The metroid games use this format almost exclusively. { +#if !defined(_M_X64) // JSD optimized with SSE2 intrinsics. - // Produces a ~40% improvement in speed over reference C implementation on an x86 Intel Core 2 Duo - // but also a -10% to 0% "improvement" on an x64 AMD Phenom II. Further optimization is required to - // ensure that all architectures and CPUs gain benefits. + // Produces a 30% improvement for x86 code only. for (int y = 0; y < height; y += 8) { for (int x = 0; x < width; x += 8) { - u32 *dst32 = dst + y * width + x; // We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers. // This is ideal because a single DXT block contains 2 RGBA colors when decoded from their 16-bit. // Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is parallelizable // at this level, so we do. - for (int z = 0; z < 2; ++z, src += sizeof(struct DXTBlock) * 2, dst32 += (width * 4)) + for (int z = 0; z < 2; ++z, src += sizeof(struct DXTBlock) * 2) { // JSD NOTE: You may see many strange patterns of behavior in the below code, but they // are for performance reasons. Sometimes, calculating what should be obvious hard-coded @@ -1820,12 +1818,6 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he _mm_slli_si128(_mm_srli_si128(rgb31, 4), 8 + 4) ); - // Copy the color arrays from the XMM registers to local variables in RAM: - GC_ALIGNED16(u32 colors0A[4]); - GC_ALIGNED16(u32 colors0B[4]); - _mm_store_si128((__m128i *)colors0A, rgb01230); - _mm_store_si128((__m128i *)colors0B, rgb01450); - // Create an array for color lookups for DXT1 so we can use the 2-bit indices: const __m128i rgb01231 = _mm_or_si128( _mm_or_si128( @@ -1842,12 +1834,6 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he _mm_slli_si128(_mm_srli_si128(rgb31, 8 + 4), 8 + 4) ); - // Copy the color arrays from the XMM registers to aligned local variables in RAM: - GC_ALIGNED16(u32 colors1A[4]); - GC_ALIGNED16(u32 colors1B[4]); - _mm_store_si128((__m128i *)colors1A, rgb01231); - _mm_store_si128((__m128i *)colors1B, rgb01451); - // The #ifdef CHECKs here and below are to compare correctness of output against the reference code. // Don't use them in a normal build. #ifdef CHECK @@ -1867,11 +1853,48 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he // the assembly generated was swapping a lot of temporaries to RAM. // Compare rgb0 to rgb1: - // Each 32-bit word will contain either 0xFFFFFFFF or 0x00000000 for true/false. - //const __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(rgb0, rgb1); - const __m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16); + + // rgb1rgb1 rgb0rgb0 rgb1rgb1 rgb0rgb0 + // c0 example: {0x 5ef05ef0 c003c003 3ff83ff8 e003e003} in BIG ENDIAN + // c0cmp contains rgb0 values in 64-bit words in LITTLE ENDIAN + // c0cmp: {0x 00000000 000003c0 00000000 000003e0} + // c0shr contains rgb1 values in 64-bit words in LITTLE ENDIAN + // c0cmp: {0x 00000000 0000f05e 00000000 0000f83f} + + __m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16); const __m128i c0shr = _mm_srli_epi64(c0cmp, 32); - const __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr); + c0cmp = _mm_srli_epi64(_mm_slli_epi64(c0cmp, 48), 48); + +#if 0 + // SSE4 only. + const __m128i cmprgb0rgb1 = _mm_cmpgt_epi64(c0cmp, c0shr); +#else + // We need 64-bit words full of 0xFF bytes for comparison masks. cmgt_epi64 would do that but it's only SSE4 + // so we fake it on SSE2 by shuffling the 32-bit comparison masks to copy the 0xFF bytes to the high 32-bits of + // each 64-bit word. + __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr); + cmprgb0rgb1 = _mm_shuffle_epi32(cmprgb0rgb1, _MM_SHUFFLE(2, 2, 0, 0)); +#endif + + // Now use the comparison mask from rgb0 > rgb1 to conditionally move colors from + // either rgb0123X or rgb0145X: + const __m128i cmp0 = _mm_unpacklo_epi64(cmprgb0rgb1, cmprgb0rgb1); + const __m128i cmp1 = _mm_unpackhi_epi64(cmprgb0rgb1, cmprgb0rgb1); + const __m128i mmcolors0 = _mm_or_si128( + _mm_and_si128(cmp0, rgb01230), + _mm_andnot_si128(cmp0, rgb01450) + ); + const __m128i mmcolors1 = _mm_or_si128( + _mm_and_si128(cmp1, rgb01231), + _mm_andnot_si128(cmp1, rgb01451) + ); + + // Copy the color arrays from the XMM registers to local variables in RAM: + GC_ALIGNED16(u32 colors0[4]); + GC_ALIGNED16(u32 colors1[4]); + + _mm_store_si128((__m128i *)colors0, mmcolors0); + _mm_store_si128((__m128i *)colors1, mmcolors1); // Copy the 2-bit indices from each DXT block: GC_ALIGNED16(u32 dxttmp[4]); @@ -1880,254 +1903,106 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he u32 dxt0sel = dxttmp[1]; u32 dxt1sel = dxttmp[3]; - // Copy the comparison results: - GC_ALIGNED16(u16 cmptmp[8]); - _mm_store_si128((__m128i *)cmptmp, cmprgb0rgb1); - // Per each row written we alternate storing RGBA values from DXT0 and DXT1. __m128i *dst128 = (__m128i *)( dst + (y + z*4) * width + x ); // Row 0: __m128i col0, col1; - if (cmptmp[0]) - { - // rgb0a > rgb1a - col0 = _mm_set_epi32( - colors0A[(dxt0sel >> ((0*8)+0)) & 3], - colors0A[(dxt0sel >> ((0*8)+2)) & 3], - colors0A[(dxt0sel >> ((0*8)+4)) & 3], - colors0A[(dxt0sel >> ((0*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 0), col0); + col0 = _mm_set_epi32( + colors0[(dxt0sel >> ((0*8)+0)) & 3], + colors0[(dxt0sel >> ((0*8)+2)) & 3], + colors0[(dxt0sel >> ((0*8)+4)) & 3], + colors0[(dxt0sel >> ((0*8)+6)) & 3] + ); + _mm_store_si128(dst128 + ((width / 4) * 0), col0); #ifdef CHECK - assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 ); + assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 ); #endif - } - else - { - // rgb0a <= rgb1a - col0 = _mm_set_epi32( - colors0B[(dxt0sel >> ((0*8)+0)) & 3], - colors0B[(dxt0sel >> ((0*8)+2)) & 3], - colors0B[(dxt0sel >> ((0*8)+4)) & 3], - colors0B[(dxt0sel >> ((0*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 0), col0); -#ifdef CHECK - assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 ); -#endif - } - if (cmptmp[4]) - { - // (rgb0b > rgb1b) - col1 = _mm_set_epi32( - colors1A[(dxt1sel >> ((0*8)+0)) & 3], - colors1A[(dxt1sel >> ((0*8)+2)) & 3], - colors1A[(dxt1sel >> ((0*8)+4)) & 3], - colors1A[(dxt1sel >> ((0*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1); + col1 = _mm_set_epi32( + colors1[(dxt1sel >> ((0*8)+0)) & 3], + colors1[(dxt1sel >> ((0*8)+2)) & 3], + colors1[(dxt1sel >> ((0*8)+4)) & 3], + colors1[(dxt1sel >> ((0*8)+6)) & 3] + ); + _mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1); #ifdef CHECK - assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 ); + assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 ); #endif - } - else - { - // (rgb0b <= rgb1b) - col1 = _mm_set_epi32( - colors1B[(dxt1sel >> ((0*8)+0)) & 3], - colors1B[(dxt1sel >> ((0*8)+2)) & 3], - colors1B[(dxt1sel >> ((0*8)+4)) & 3], - colors1B[(dxt1sel >> ((0*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1); -#ifdef CHECK - assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 ); -#endif - } // Row 1: - if (cmptmp[0]) - { - // rgb0a > rgb1a - col0 = _mm_set_epi32( - colors0A[(dxt0sel >> ((1*8)+0)) & 3], - colors0A[(dxt0sel >> ((1*8)+2)) & 3], - colors0A[(dxt0sel >> ((1*8)+4)) & 3], - colors0A[(dxt0sel >> ((1*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 1), col0); + col0 = _mm_set_epi32( + colors0[(dxt0sel >> ((1*8)+0)) & 3], + colors0[(dxt0sel >> ((1*8)+2)) & 3], + colors0[(dxt0sel >> ((1*8)+4)) & 3], + colors0[(dxt0sel >> ((1*8)+6)) & 3] + ); + _mm_store_si128(dst128 + ((width / 4) * 1), col0); #ifdef CHECK - assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 ); + assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 ); #endif - } - else - { - // rgb0a <= rgb1a - col0 = _mm_set_epi32( - colors0B[(dxt0sel >> ((1*8)+0)) & 3], - colors0B[(dxt0sel >> ((1*8)+2)) & 3], - colors0B[(dxt0sel >> ((1*8)+4)) & 3], - colors0B[(dxt0sel >> ((1*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 1), col0); -#ifdef CHECK - assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 ); -#endif - } - if (cmptmp[4]) - { - // (rgb0b > rgb1b) - col1 = _mm_set_epi32( - colors1A[(dxt1sel >> ((1*8)+0)) & 3], - colors1A[(dxt1sel >> ((1*8)+2)) & 3], - colors1A[(dxt1sel >> ((1*8)+4)) & 3], - colors1A[(dxt1sel >> ((1*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1); + col1 = _mm_set_epi32( + colors1[(dxt1sel >> ((1*8)+0)) & 3], + colors1[(dxt1sel >> ((1*8)+2)) & 3], + colors1[(dxt1sel >> ((1*8)+4)) & 3], + colors1[(dxt1sel >> ((1*8)+6)) & 3] + ); + _mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1); #ifdef CHECK - assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 ); + assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 ); #endif - } - else - { - // (rgb0b <= rgb1b) - col1 = _mm_set_epi32( - colors1B[(dxt1sel >> ((1*8)+0)) & 3], - colors1B[(dxt1sel >> ((1*8)+2)) & 3], - colors1B[(dxt1sel >> ((1*8)+4)) & 3], - colors1B[(dxt1sel >> ((1*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1); -#ifdef CHECK - assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 ); -#endif - } // Row 2: - if (cmptmp[0]) - { - // rgb0a > rgb1a - col0 = _mm_set_epi32( - colors0A[(dxt0sel >> ((2*8)+0)) & 3], - colors0A[(dxt0sel >> ((2*8)+2)) & 3], - colors0A[(dxt0sel >> ((2*8)+4)) & 3], - colors0A[(dxt0sel >> ((2*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 2), col0); + col0 = _mm_set_epi32( + colors0[(dxt0sel >> ((2*8)+0)) & 3], + colors0[(dxt0sel >> ((2*8)+2)) & 3], + colors0[(dxt0sel >> ((2*8)+4)) & 3], + colors0[(dxt0sel >> ((2*8)+6)) & 3] + ); + _mm_store_si128(dst128 + ((width / 4) * 2), col0); #ifdef CHECK - assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 ); + assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 ); #endif - } - else - { - // rgb0a <= rgb1a - col0 = _mm_set_epi32( - colors0B[(dxt0sel >> ((2*8)+0)) & 3], - colors0B[(dxt0sel >> ((2*8)+2)) & 3], - colors0B[(dxt0sel >> ((2*8)+4)) & 3], - colors0B[(dxt0sel >> ((2*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 2), col0); -#ifdef CHECK - assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 ); -#endif - } - if (cmptmp[4]) - { - // (rgb0b > rgb1b) - col1 = _mm_set_epi32( - colors1A[(dxt1sel >> ((2*8)+0)) & 3], - colors1A[(dxt1sel >> ((2*8)+2)) & 3], - colors1A[(dxt1sel >> ((2*8)+4)) & 3], - colors1A[(dxt1sel >> ((2*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1); + col1 = _mm_set_epi32( + colors1[(dxt1sel >> ((2*8)+0)) & 3], + colors1[(dxt1sel >> ((2*8)+2)) & 3], + colors1[(dxt1sel >> ((2*8)+4)) & 3], + colors1[(dxt1sel >> ((2*8)+6)) & 3] + ); + _mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1); #ifdef CHECK - assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 ); + assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 ); #endif - } - else - { - // (rgb0b <= rgb1b) - col1 = _mm_set_epi32( - colors1B[(dxt1sel >> ((2*8)+0)) & 3], - colors1B[(dxt1sel >> ((2*8)+2)) & 3], - colors1B[(dxt1sel >> ((2*8)+4)) & 3], - colors1B[(dxt1sel >> ((2*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1); -#ifdef CHECK - assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 ); -#endif - } // Row 3: - if (cmptmp[0]) - { - // rgb0a > rgb1a - col0 = _mm_set_epi32( - colors0A[(dxt0sel >> ((3*8)+0)) & 3], - colors0A[(dxt0sel >> ((3*8)+2)) & 3], - colors0A[(dxt0sel >> ((3*8)+4)) & 3], - colors0A[(dxt0sel >> ((3*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 3), col0); + col0 = _mm_set_epi32( + colors0[(dxt0sel >> ((3*8)+0)) & 3], + colors0[(dxt0sel >> ((3*8)+2)) & 3], + colors0[(dxt0sel >> ((3*8)+4)) & 3], + colors0[(dxt0sel >> ((3*8)+6)) & 3] + ); + _mm_store_si128(dst128 + ((width / 4) * 3), col0); #ifdef CHECK - assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 ); + assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 ); #endif - } - else - { - // rgb0a <= rgb1a - col0 = _mm_set_epi32( - colors0B[(dxt0sel >> ((3*8)+0)) & 3], - colors0B[(dxt0sel >> ((3*8)+2)) & 3], - colors0B[(dxt0sel >> ((3*8)+4)) & 3], - colors0B[(dxt0sel >> ((3*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 3), col0); -#ifdef CHECK - assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 ); -#endif - } - if (cmptmp[4]) - { - // (rgb0b > rgb1b) - col1 = _mm_set_epi32( - colors1A[(dxt1sel >> ((3*8)+0)) & 3], - colors1A[(dxt1sel >> ((3*8)+2)) & 3], - colors1A[(dxt1sel >> ((3*8)+4)) & 3], - colors1A[(dxt1sel >> ((3*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1); + col1 = _mm_set_epi32( + colors1[(dxt1sel >> ((3*8)+0)) & 3], + colors1[(dxt1sel >> ((3*8)+2)) & 3], + colors1[(dxt1sel >> ((3*8)+4)) & 3], + colors1[(dxt1sel >> ((3*8)+6)) & 3] + ); + _mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1); #ifdef CHECK - assert( memcmp(&(tmp1[3]), dst128 + ((width / 4) * 3) + 1, 16) == 0 ); + assert( memcmp(&(tmp1[3]), dst128 + ((width / 4) * 3) + 1, 16) == 0 ); #endif - } - else - { - // (rgb0b <= rgb1b) - col1 = _mm_set_epi32( - colors1B[(dxt1sel >> ((3*8)+0)) & 3], - colors1B[(dxt1sel >> ((3*8)+2)) & 3], - colors1B[(dxt1sel >> ((3*8)+4)) & 3], - colors1B[(dxt1sel >> ((3*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1); -#ifdef CHECK - assert( memcmp(&(tmp1[3]), dst128 + ((width / 4) * 3) + 1, 16) == 0 ); -#endif - } } } } -#if 0 +#else for (int y = 0; y < height; y += 8) { for (int x = 0; x < width; x += 8)