diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp index 3b7f93b6f1..76b2abaa08 100644 --- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp @@ -1444,201 +1444,251 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he // This is the hard-coded 0xFF alpha constant that is ORed in place after the RGB are calculated // for the RGB555 case when (s[x] & 0x8000) is true for all pixels. const __m128i aVxff00 = _mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L); - for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 4) for (int iy = 0; iy < 4; iy++, src += 8) { u32 *newdst = dst+(y+iy)*width+x; - const u16 *newsrc = (const u16*)src; - - // TODO: weak point - const u16 val0 = Common::swap16(newsrc[0]); - const u16 val1 = Common::swap16(newsrc[1]); - const u16 val2 = Common::swap16(newsrc[2]); - const u16 val3 = Common::swap16(newsrc[3]); - - // Need to check all 4 pixels' MSBs to ensure we can do data-parallelism: - if (((val0 & 0x8000) & (val1 & 0x8000) & (val2 & 0x8000) & (val3 & 0x8000)) == 0x8000) +#if _M_SSE >= 0x301 + // Produces a ~40% speed improvement over reference C implementation + if (cpu_info.bSSE3) { - // SSE2 case #1: all 4 pixels are in RGB555 and alpha = 0xFF. - - const __m128i valV = _mm_set_epi16(0, val3, 0, val2, 0, val1, 0, val0); - - // Swizzle bits: 00012345 -> 12345123 - - //r0 = (((val0>>10) & 0x1f) << 3) | (((val0>>10) & 0x1f) >> 2); - const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 10), kMask_x1f); - const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 3), _mm_srli_epi16(tmprV, 2) ); - - //newdst[0] = r0 | (_______) | (________) | (________); - __m128i final = rV; - - //g0 = (((val0>>5 ) & 0x1f) << 3) | (((val0>>5 ) & 0x1f) >> 2); - const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 5), kMask_x1f); - const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 3), _mm_srli_epi16(tmpgV, 2) ); - - //newdst[0] = r0 | (g0 << 8) | (________) | (________); - final = _mm_or_si128( - final, - _mm_slli_epi32(gV, 8) - ); - - //b0 = (((val0 ) & 0x1f) << 3) | (((val0 ) & 0x1f) >> 2); - const __m128i tmpbV = _mm_and_si128(valV, kMask_x1f); - const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 3), _mm_srli_epi16(tmpbV, 2) ); - - //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (________); - final = _mm_or_si128( - final, - _mm_slli_epi32(bV, 16) - ); - - // Alphas are ORed in as a constant __m128i. - //a0 = 0xFF; - - //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24); - final = _mm_or_si128( - final, - aVxff00 - ); - - // write the final result: - _mm_storeu_si128( (__m128i*)newdst, final ); - } - else if (((val0 & 0x8000) | (val1 & 0x8000) | (val2 & 0x8000) | (val3 & 0x8000)) == 0x0000) - { - // SSE2 case #2: all 4 pixels are in RGBA4443. - - const __m128i valV = _mm_set_epi16(0, val3, 0, val2, 0, val1, 0, val0); - - // Swizzle bits: 00001234 -> 12341234 - - //r0 = (((val0>>8 ) & 0xf) << 4) | ((val0>>8 ) & 0xf); - const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 8), kMask_x0f); - const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 4), tmprV ); - - //newdst[0] = r0 | (_______) | (________) | (________); - __m128i final = rV; - - //g0 = (((val0>>4 ) & 0xf) << 4) | ((val0>>4 ) & 0xf); - const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 4), kMask_x0f); - const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 4), tmpgV ); - - //newdst[0] = r0 | (g0 << 8) | (________) | (________); - final = _mm_or_si128( - final, - _mm_slli_epi32(gV, 8) - ); - - //b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf); - const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f); - const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 4), tmpbV ); - - //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (________); - final = _mm_or_si128( - final, - _mm_slli_epi32(bV, 16) - ); - - //a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1); - const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07); - const __m128i aV = _mm_or_si128( - _mm_slli_epi16(tmpaV, 5), - _mm_or_si128( - _mm_slli_epi16(tmpaV, 2), - _mm_srli_epi16(tmpaV, 1) - ) - ); - - //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24); - final = _mm_or_si128( - final, - _mm_slli_epi32(aV, 24) - ); - - // write the final result: - _mm_storeu_si128( (__m128i*)newdst, final ); - } - else - { - // Horrific fallback case, but hey at least it's inlined :D - // Maybe overkill? I see slight improvements on my machine as far as RDTSC - // counts and it's all done in registers (on x64). No temp memory moves! - int r0,g0,b0,a0; - int r1,g1,b1,a1; - int r2,g2,b2,a2; - int r3,g3,b3,a3; - - // Normal operation, no parallelism to take advantage of: - if (val0 & 0x8000) + const __m128i mask = _mm_set_epi8(128,128,6,7,128,128,4,5,128,128,2,3,128,128,0,1); + const __m128i valV = _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)src),mask); + int cmp = _mm_movemask_epi8(valV); //MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3 + if ((cmp&0x2222)==0x2222) // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF. { // Swizzle bits: 00012345 -> 12345123 - r0 = (((val0>>10) & 0x1f) << 3) | (((val0>>10) & 0x1f) >> 2); - g0 = (((val0>>5 ) & 0x1f) << 3) | (((val0>>5 ) & 0x1f) >> 2); - b0 = (((val0 ) & 0x1f) << 3) | (((val0 ) & 0x1f) >> 2); - a0 = 0xFF; + + //r0 = (((val0>>10) & 0x1f) << 3) | (((val0>>10) & 0x1f) >> 2); + const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 10), kMask_x1f); + const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 3), _mm_srli_epi16(tmprV, 2) ); + + //g0 = (((val0>>5 ) & 0x1f) << 3) | (((val0>>5 ) & 0x1f) >> 2); + const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 5), kMask_x1f); + const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 3), _mm_srli_epi16(tmpgV, 2) ); + + //b0 = (((val0 ) & 0x1f) << 3) | (((val0 ) & 0x1f) >> 2); + const __m128i tmpbV = _mm_and_si128(valV, kMask_x1f); + const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 3), _mm_srli_epi16(tmpbV, 2) ); + + //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24); + const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)), + _mm_or_si128(_mm_slli_epi32(bV, 16), aVxff00)); + _mm_storeu_si128( (__m128i*)newdst, final ); } - else + else if (!(cmp&0x2222)) // SSSE3 case #2: all 4 pixels are in RGBA4443. { - a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1); // Swizzle bits: 00001234 -> 12341234 - r0 = (((val0>>8 ) & 0xf) << 4) | ((val0>>8 ) & 0xf); - g0 = (((val0>>4 ) & 0xf) << 4) | ((val0>>4 ) & 0xf); - b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf); - } - newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24); - if (val1 & 0x8000) - { - // Swizzle bits: 00012345 -> 12345123 - r1 = (((val1>>10) & 0x1f) << 3) | (((val1>>10) & 0x1f) >> 2); - g1 = (((val1>>5 ) & 0x1f) << 3) | (((val1>>5 ) & 0x1f) >> 2); - b1 = (((val1 ) & 0x1f) << 3) | (((val1 ) & 0x1f) >> 2); - a1 = 0xFF; + //r0 = (((val0>>8 ) & 0xf) << 4) | ((val0>>8 ) & 0xf); + const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 8), kMask_x0f); + const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 4), tmprV ); + + //g0 = (((val0>>4 ) & 0xf) << 4) | ((val0>>4 ) & 0xf); + const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 4), kMask_x0f); + const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 4), tmpgV ); + + //b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf); + const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f); + const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 4), tmpbV ); + //a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1); + const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07); + const __m128i aV = _mm_or_si128( + _mm_slli_epi16(tmpaV, 5), + _mm_or_si128( + _mm_slli_epi16(tmpaV, 2), + _mm_srli_epi16(tmpaV, 1) + ) + ); + + //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24); + const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)), + _mm_or_si128(_mm_slli_epi32(bV, 16), _mm_slli_epi32(aV, 24))); + _mm_storeu_si128( (__m128i*)newdst, final ); } else { - a1 = (((val1>>12) & 0x7) << 5) | (((val1>>12) & 0x7) << 2) | (((val1>>12) & 0x7) >> 1); - r1 = (((val1>>8 ) & 0xf) << 4) | ((val1>>8 ) & 0xf); - g1 = (((val1>>4 ) & 0xf) << 4) | ((val1>>4 ) & 0xf); - b1 = (((val1 ) & 0xf) << 4) | ((val1 ) & 0xf); + // TODO: Vectorise (Either 4-way branch or do both and select is better than this) + unsigned __int32 *vals = (unsigned __int32*) &valV; + int r,g,b,a; + for (int i=0; i < 4; ++i) + { + if (vals[i] & 0x8000) + { + // Swizzle bits: 00012345 -> 12345123 + r = (((vals[i]>>10) & 0x1f) << 3) | (((vals[i]>>10) & 0x1f) >> 2); + g = (((vals[i]>>5 ) & 0x1f) << 3) | (((vals[i]>>5 ) & 0x1f) >> 2); + b = (((vals[i] ) & 0x1f) << 3) | (((vals[i] ) & 0x1f) >> 2); + a = 0xFF; + } + else + { + a = (((vals[i]>>12) & 0x7) << 5) | (((vals[i]>>12) & 0x7) << 2) | (((vals[i]>>12) & 0x7) >> 1); + // Swizzle bits: 00001234 -> 12341234 + r = (((vals[i]>>8 ) & 0xf) << 4) | ((vals[i]>>8 ) & 0xf); + g = (((vals[i]>>4 ) & 0xf) << 4) | ((vals[i]>>4 ) & 0xf); + b = (((vals[i] ) & 0xf) << 4) | ((vals[i] ) & 0xf); + } + newdst[i] = r | (g << 8) | (b << 16) | (a << 24); + } } - newdst[1] = r1 | (g1 << 8) | (b1 << 16) | (a1 << 24); + } else +#endif + { + const u16 *newsrc = (const u16*)src; - if (val2 & 0x8000) + // TODO: weak point + const u16 val0 = Common::swap16(newsrc[0]); + const u16 val1 = Common::swap16(newsrc[1]); + const u16 val2 = Common::swap16(newsrc[2]); + const u16 val3 = Common::swap16(newsrc[3]); + + // Need to check all 4 pixels' MSBs to ensure we can do data-parallelism: + if (((val0 & 0x8000) & (val1 & 0x8000) & (val2 & 0x8000) & (val3 & 0x8000)) == 0x8000) { + // SSE2 case #1: all 4 pixels are in RGB555 and alpha = 0xFF. + + const __m128i valV = _mm_set_epi16(0, val3, 0, val2, 0, val1, 0, val0); + // Swizzle bits: 00012345 -> 12345123 - r2 = (((val2>>10) & 0x1f) << 3) | (((val2>>10) & 0x1f) >> 2); - g2 = (((val2>>5 ) & 0x1f) << 3) | (((val2>>5 ) & 0x1f) >> 2); - b2 = (((val2 ) & 0x1f) << 3) | (((val2 ) & 0x1f) >> 2); - a2 = 0xFF; + + //r0 = (((val0>>10) & 0x1f) << 3) | (((val0>>10) & 0x1f) >> 2); + const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 10), kMask_x1f); + const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 3), _mm_srli_epi16(tmprV, 2) ); + + //g0 = (((val0>>5 ) & 0x1f) << 3) | (((val0>>5 ) & 0x1f) >> 2); + const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 5), kMask_x1f); + const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 3), _mm_srli_epi16(tmpgV, 2) ); + + //b0 = (((val0 ) & 0x1f) << 3) | (((val0 ) & 0x1f) >> 2); + const __m128i tmpbV = _mm_and_si128(valV, kMask_x1f); + const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 3), _mm_srli_epi16(tmpbV, 2) ); + + //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24); + const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)), + _mm_or_si128(_mm_slli_epi32(bV, 16), aVxff00)); + + // write the final result: + _mm_storeu_si128( (__m128i*)newdst, final ); + } + else if (((val0 & 0x8000) | (val1 & 0x8000) | (val2 & 0x8000) | (val3 & 0x8000)) == 0x0000) + { + // SSE2 case #2: all 4 pixels are in RGBA4443. + + const __m128i valV = _mm_set_epi16(0, val3, 0, val2, 0, val1, 0, val0); + + // Swizzle bits: 00001234 -> 12341234 + + //r0 = (((val0>>8 ) & 0xf) << 4) | ((val0>>8 ) & 0xf); + const __m128i tmprV = _mm_and_si128(_mm_srli_epi16(valV, 8), kMask_x0f); + const __m128i rV = _mm_or_si128( _mm_slli_epi16(tmprV, 4), tmprV ); + + //g0 = (((val0>>4 ) & 0xf) << 4) | ((val0>>4 ) & 0xf); + const __m128i tmpgV = _mm_and_si128(_mm_srli_epi16(valV, 4), kMask_x0f); + const __m128i gV = _mm_or_si128( _mm_slli_epi16(tmpgV, 4), tmpgV ); + + //b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf); + const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f); + const __m128i bV = _mm_or_si128( _mm_slli_epi16(tmpbV, 4), tmpbV ); + + //a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1); + const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07); + const __m128i aV = _mm_or_si128( + _mm_slli_epi16(tmpaV, 5), + _mm_or_si128( + _mm_slli_epi16(tmpaV, 2), + _mm_srli_epi16(tmpaV, 1) + ) + ); + + //newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24); + const __m128i final = _mm_or_si128( _mm_or_si128(rV,_mm_slli_epi32(gV, 8)), + _mm_or_si128(_mm_slli_epi32(bV, 16), _mm_slli_epi32(aV, 24))); + + // write the final result: + _mm_storeu_si128( (__m128i*)newdst, final ); } else { - a2 = (((val2>>12) & 0x7) << 5) | (((val2>>12) & 0x7) << 2) | (((val2>>12) & 0x7) >> 1); - r2 = (((val2>>8 ) & 0xf) << 4) | ((val2>>8 ) & 0xf); - g2 = (((val2>>4 ) & 0xf) << 4) | ((val2>>4 ) & 0xf); - b2 = (((val2 ) & 0xf) << 4) | ((val2 ) & 0xf); - } - newdst[2] = r2 | (g2 << 8) | (b2 << 16) | (a2 << 24); + // Horrific fallback case, but hey at least it's inlined :D + // Maybe overkill? I see slight improvements on my machine as far as RDTSC + // counts and it's all done in registers (on x64). No temp memory moves! + int r0,g0,b0,a0; + int r1,g1,b1,a1; + int r2,g2,b2,a2; + int r3,g3,b3,a3; - if (val3 & 0x8000) - { - // Swizzle bits: 00012345 -> 12345123 - r3 = (((val3>>10) & 0x1f) << 3) | (((val3>>10) & 0x1f) >> 2); - g3 = (((val3>>5 ) & 0x1f) << 3) | (((val3>>5 ) & 0x1f) >> 2); - b3 = (((val3 ) & 0x1f) << 3) | (((val3 ) & 0x1f) >> 2); - a3 = 0xFF; + // Normal operation, no parallelism to take advantage of: + if (val0 & 0x8000) + { + // Swizzle bits: 00012345 -> 12345123 + r0 = (((val0>>10) & 0x1f) << 3) | (((val0>>10) & 0x1f) >> 2); + g0 = (((val0>>5 ) & 0x1f) << 3) | (((val0>>5 ) & 0x1f) >> 2); + b0 = (((val0 ) & 0x1f) << 3) | (((val0 ) & 0x1f) >> 2); + a0 = 0xFF; + } + else + { + a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1); + // Swizzle bits: 00001234 -> 12341234 + r0 = (((val0>>8 ) & 0xf) << 4) | ((val0>>8 ) & 0xf); + g0 = (((val0>>4 ) & 0xf) << 4) | ((val0>>4 ) & 0xf); + b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf); + } + newdst[0] = r0 | (g0 << 8) | (b0 << 16) | (a0 << 24); + + if (val1 & 0x8000) + { + // Swizzle bits: 00012345 -> 12345123 + r1 = (((val1>>10) & 0x1f) << 3) | (((val1>>10) & 0x1f) >> 2); + g1 = (((val1>>5 ) & 0x1f) << 3) | (((val1>>5 ) & 0x1f) >> 2); + b1 = (((val1 ) & 0x1f) << 3) | (((val1 ) & 0x1f) >> 2); + a1 = 0xFF; + } + else + { + a1 = (((val1>>12) & 0x7) << 5) | (((val1>>12) & 0x7) << 2) | (((val1>>12) & 0x7) >> 1); + r1 = (((val1>>8 ) & 0xf) << 4) | ((val1>>8 ) & 0xf); + g1 = (((val1>>4 ) & 0xf) << 4) | ((val1>>4 ) & 0xf); + b1 = (((val1 ) & 0xf) << 4) | ((val1 ) & 0xf); + } + newdst[1] = r1 | (g1 << 8) | (b1 << 16) | (a1 << 24); + + if (val2 & 0x8000) + { + // Swizzle bits: 00012345 -> 12345123 + r2 = (((val2>>10) & 0x1f) << 3) | (((val2>>10) & 0x1f) >> 2); + g2 = (((val2>>5 ) & 0x1f) << 3) | (((val2>>5 ) & 0x1f) >> 2); + b2 = (((val2 ) & 0x1f) << 3) | (((val2 ) & 0x1f) >> 2); + a2 = 0xFF; + } + else + { + a2 = (((val2>>12) & 0x7) << 5) | (((val2>>12) & 0x7) << 2) | (((val2>>12) & 0x7) >> 1); + r2 = (((val2>>8 ) & 0xf) << 4) | ((val2>>8 ) & 0xf); + g2 = (((val2>>4 ) & 0xf) << 4) | ((val2>>4 ) & 0xf); + b2 = (((val2 ) & 0xf) << 4) | ((val2 ) & 0xf); + } + newdst[2] = r2 | (g2 << 8) | (b2 << 16) | (a2 << 24); + + if (val3 & 0x8000) + { + // Swizzle bits: 00012345 -> 12345123 + r3 = (((val3>>10) & 0x1f) << 3) | (((val3>>10) & 0x1f) >> 2); + g3 = (((val3>>5 ) & 0x1f) << 3) | (((val3>>5 ) & 0x1f) >> 2); + b3 = (((val3 ) & 0x1f) << 3) | (((val3 ) & 0x1f) >> 2); + a3 = 0xFF; + } + else + { + a3 = (((val3>>12) & 0x7) << 5) | (((val3>>12) & 0x7) << 2) | (((val3>>12) & 0x7) >> 1); + r3 = (((val3>>8 ) & 0xf) << 4) | ((val3>>8 ) & 0xf); + g3 = (((val3>>4 ) & 0xf) << 4) | ((val3>>4 ) & 0xf); + b3 = (((val3 ) & 0xf) << 4) | ((val3 ) & 0xf); + } + newdst[3] = r3 | (g3 << 8) | (b3 << 16) | (a3 << 24); } - else - { - a3 = (((val3>>12) & 0x7) << 5) | (((val3>>12) & 0x7) << 2) | (((val3>>12) & 0x7) >> 1); - r3 = (((val3>>8 ) & 0xf) << 4) | ((val3>>8 ) & 0xf); - g3 = (((val3>>4 ) & 0xf) << 4) | ((val3>>4 ) & 0xf); - b3 = (((val3 ) & 0xf) << 4) | ((val3 ) & 0xf); - } - newdst[3] = r3 | (g3 << 8) | (b3 << 16) | (a3 << 24); } } #if 0