From 34a4fee1c50b54bc408e3b056650c646d8d6eea3 Mon Sep 17 00:00:00 2001 From: hrydgard Date: Wed, 15 Oct 2008 21:29:44 +0000 Subject: [PATCH] Substantial XFB speedup. There's more to get though, for example by using a shader for color conversion instead - but I like having a fast CPU implementation too. Also adds some sanity checks. PAL games still have problems. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@885 8ced0084-cf51-0410-be5f-012b33b47a6e --- Source/Core/Core/Src/HW/VideoInterface.cpp | 19 ++- Source/Core/DolphinWX/Src/Frame.cpp | 5 +- Source/Core/VideoCommon/Src/XFBConvert.cpp | 121 ++++++++++++++---- Source/Core/VideoCommon/Src/XFBConvert.h | 2 + Source/Core/VideoCommon/VideoCommon.vcproj | 17 +++ Source/Plugins/Plugin_VideoDX9/Src/main.cpp | 3 +- .../Plugins/Plugin_VideoOGL/Src/OS/Win32.cpp | 2 + Source/Plugins/Plugin_VideoOGL/Src/XFB.cpp | 8 +- Source/Plugins/Plugin_VideoOGL/Src/main.cpp | 2 + 9 files changed, 145 insertions(+), 34 deletions(-) diff --git a/Source/Core/Core/Src/HW/VideoInterface.cpp b/Source/Core/Core/Src/HW/VideoInterface.cpp index 871bf878a2..b7423aeccf 100644 --- a/Source/Core/Core/Src/HW/VideoInterface.cpp +++ b/Source/Core/Core/Src/HW/VideoInterface.cpp @@ -508,25 +508,32 @@ void Update() VerticalBeamPos = 1; } - if(VerticalBeamPos == NextXFBRender) + if (VerticalBeamPos == NextXFBRender) { u8* xfbPtr = 0; int yOffset = 0; - if(NextXFBRender == 1) + if (NextXFBRender == 1) { NextXFBRender = LinesPerField; - xfbPtr = Memory::GetPointer(VideoInterface::m_FrameBufferTop.Hex); + // The & mask is a hack for mario kart + u32 addr = (VideoInterface::m_FrameBufferTop.Hex & 0xFFFFFFF) | 0x80000000; + if (addr >= 0x80000000 && + addr <= (0x81800000-640*480*2)) + xfbPtr = Memory::GetPointer(addr); } else { NextXFBRender = 1; - xfbPtr = Memory::GetPointer(VideoInterface::m_FrameBufferBottom.Hex); + u32 addr = (VideoInterface::m_FrameBufferBottom.Hex & 0xFFFFFFF) | 0x80000000; + if (addr >= 0x80000000 && + addr <= 
(0x81800000-640*480*2)) + xfbPtr = Memory::GetPointer(addr); yOffset = -1; } - if(xfbPtr && PluginVideo::IsLoaded()) + if (xfbPtr && PluginVideo::IsLoaded()) { int fbWidth = m_VIHorizontalStepping.FieldSteps * 16; int fbHeight = (m_VIHorizontalStepping.FbSteps / m_VIHorizontalStepping.FieldSteps) * m_VIVerticalTimingRegister.ACV; @@ -549,6 +556,6 @@ void Update() } } } - + } diff --git a/Source/Core/DolphinWX/Src/Frame.cpp b/Source/Core/DolphinWX/Src/Frame.cpp index fcb4f83e65..8233438f25 100644 --- a/Source/Core/DolphinWX/Src/Frame.cpp +++ b/Source/Core/DolphinWX/Src/Frame.cpp @@ -347,7 +347,7 @@ void CFrame::OnOpen(wxCommandEvent& WXUNUSED (event)) wxEmptyString, wxEmptyString, wxEmptyString, wxString::Format ( - _T("All GC/Wii files (elf, dol, gcm, iso)|*.elf;*.dol;*.gcm;*.iso|All files (%s)|%s"), + _T("All GC/Wii files (elf, dol, gcm, iso)|*.elf;*.dol;*.gcm;*.iso;*.gcz|All files (%s)|%s"), wxFileSelectorDefaultWildcardStr, wxFileSelectorDefaultWildcardStr ), @@ -607,6 +607,9 @@ void CFrame::OnToggleStatusbar(wxCommandEvent& event) void CFrame::OnKeyDown(wxKeyEvent& event) { + event.Skip(); + return; + if (((event.GetKeyCode() == WXK_RETURN) && (event.GetModifiers() == wxMOD_ALT)) || (event.GetKeyCode() == WXK_ESCAPE)) { diff --git a/Source/Core/VideoCommon/Src/XFBConvert.cpp b/Source/Core/VideoCommon/Src/XFBConvert.cpp index 7d08ecc924..c975d0965b 100644 --- a/Source/Core/VideoCommon/Src/XFBConvert.cpp +++ b/Source/Core/VideoCommon/Src/XFBConvert.cpp @@ -15,26 +15,39 @@ // Official SVN repository and contact information can be found at // http://code.google.com/p/dolphin-emu/ +#if _WIN32 +#include +#endif + +#include + #include "XFBConvert.h" #include "Common.h" -// TODO: Convert this thing into wicked fast SSE2. 
- namespace { -int bound(int i) +u8 bound_table[3*256]; +int y[256]; +int v1[256]; +int v2[256]; +int u1[256]; +int u2[256]; +u8 *bound_lut = bound_table + 256; + +inline int bound(int i) { - return (i>255)?255:((i<0)?0:i); + return bound_lut[i]; } -void yuv2rgb(int y, int u, int v, int &r, int &g, int &b) +inline void yuv2rgb(int y, int u, int v, int &r, int &g, int &b) { - b = bound((76283*(y - 16) + 132252*(u - 128))>>16); - g = bound((76283*(y - 16) - 53281 *(v - 128) - 25624*(u - 128))>>16); //last one u? - r = bound((76283*(y - 16) + 104595*(v - 128))>>16); + int gray = 76283*(y - 16); + b = bound((gray + 132252*(u - 128))>>16); + g = bound((gray - 53281 *(v - 128) - 25624*(u - 128))>>16); + r = bound((gray + 104595*(v - 128))>>16); } -void rgb2yuv(int r, int g, int b, int &y, int &u, int &v) +inline void rgb2yuv(int r, int g, int b, int &y, int &u, int &v) { y = (((16843 * r) + (33030 * g) + (6423 * b)) >> 16) + 16; v = (((28770 * r) - (24117 * g) - (4653 * b)) >> 16) + 128; @@ -43,24 +56,61 @@ void rgb2yuv(int r, int g, int b, int &y, int &u, int &v) } // namespace + +//const __m128i _bias1 = _mm_set_epi32(16 << 16, 128/2 << 16, 0, 128/2 << 16); +//const __m128i _bias2 = _mm_set_epi32(0, 128/2 << 16, 16 << 16, 128/2 << 16); + +const __m128i _bias1 = _mm_set_epi32(128/2 << 16, 0, 128/2 << 16, 16 << 16); +const __m128i _bias2 = _mm_set_epi32(128/2 << 16, 16 << 16, 128/2 << 16, 0); +__m128i _r1[256]; +__m128i _r2[256]; +__m128i _g1[256]; +__m128i _g2[256]; +__m128i _b1[256]; +__m128i _b2[256]; + +void InitXFBConvTables() +{ + for (int i = 0; i < 256; i++) + { + bound_table[i] = 0; + bound_table[256 + i] = i; + bound_table[512 + i] = 255; + + y[i] = 76283*(i - 16); + u1[i] = 132252 * (i - 128); + u2[i] = -25624 * (i - 128); + v1[i] = -53281 * (i - 128); + v2[i] = 104595 * (i - 128); + + _r1[i] = _mm_set_epi32( 28770 * i / 2, 0, -9699 * i / 2, 16843 * i); + _g1[i] = _mm_set_epi32(-24117 * i / 2, 0, -19071 * i / 2, 33030 * i); + _b1[i] = _mm_set_epi32( -4653 * i 
/ 2, 0, 28770 * i / 2, 6423 * i); + + _r2[i] = _mm_set_epi32( 28770 * i / 2, 16843 * i, -9699 * i / 2, 0); + _g2[i] = _mm_set_epi32(-24117 * i / 2, 33030 * i, -19071 * i / 2, 0); + _b2[i] = _mm_set_epi32( -4653 * i / 2, 6423 * i, 28770 * i / 2, 0); + } +} + void ConvertFromXFB(u32 *dst, const u8* _pXFB, int width, int height) { const unsigned char *src = _pXFB; - u32 numBlocks = (width * height) / 2; - for (u32 i = 0; i < numBlocks; i++, src += 4) { - int Y1 = src[0]; + int Y1 = y[src[0]]; + int Y2 = y[src[2]]; int U = src[1]; - int Y2 = src[2]; int V = src[3]; - - int r, g, b; - yuv2rgb(Y1,U,V, r,g,b); - *dst++ = 0xFF000000 | (r<<16) | (g<<8) | (b); - yuv2rgb(Y2,U,V, r,g,b); - *dst++ = 0xFF000000 | (r<<16) | (g<<8) | (b); + int b1 = bound((Y1 + u1[U]) >> 16); + int g1 = bound((Y1 + v1[V] + u2[U])>>16); + int r1 = bound((Y1 + v2[V]) >> 16); + int b2 = bound((Y2 + u1[U]) >> 16); + int g2 = bound((Y2 + v1[V] + u2[U]) >> 16); + int r2 = bound((Y2 + v2[V]) >> 16); + *dst++ = 0xFF000000 | (r1<<16) | (g1<<8) | (b1); + *dst++ = 0xFF000000 | (r2<<16) | (g2<<8) | (b2); } } @@ -69,18 +119,43 @@ void ConvertToXFB(u32 *dst, const u8* _pEFB, int width, int height) const unsigned char *src = _pEFB; u32 numBlocks = (width * height) / 2; - - for (u32 i = 0; i < numBlocks; i++) +#if 1 + __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < numBlocks / 4; i++) + { + __m128i yuyv[4]; + for (int j = 0; j < 4; j++) { + yuyv[j] = _mm_srai_epi32( + _mm_add_epi32( + _mm_add_epi32( + _mm_add_epi32(_r1[src[0]], _mm_add_epi32(_g1[src[1]], _b1[src[2]])), _bias1), + _mm_add_epi32( + _mm_add_epi32(_r2[src[4]], _mm_add_epi32(_g2[src[5]], _b2[src[6]])), _bias2) + ), 16); + src += 8; + } + __m128i four_dest = _mm_packus_epi16(_mm_packs_epi32(yuyv[0], yuyv[1]), _mm_packs_epi32(yuyv[2], yuyv[3])); + _mm_storeu_si128((__m128i *)dst, four_dest); + dst += 4; + } +#else + for (int i = 0; i < numBlocks; i++) { int y1 = (((16843 * src[0]) + (33030 * src[1]) + (6423 * src[2])) >> 16) + 16; int u1 = 
((-(9699 * src[0]) - (19071 * src[1]) + (28770 * src[2])) >> 16) + 128; + + int v1 = (((28770 * src[0]) - (24117 * src[1]) - (4653 * src[2])) >> 16) + 128; src += 4; + int u2 = ((-(9699 * src[0]) - (19071 * src[1]) + (28770 * src[2])) >> 16) + 128; int y2 = (((16843 * src[0]) + (33030 * src[1]) + (6423 * src[2])) >> 16) + 16; int v2 = (((28770 * src[0]) - (24117 * src[1]) - (4653 * src[2])) >> 16) + 128; src += 4; - *dst++ = (v2 << 24) | (y2 << 16) | (u1 << 8) | (y1); - } + int u = bound_lut[(u1 + u2) / 2]; + int v = bound_lut[(v1 + v2) / 2]; + *dst++ = (v << 24) | (y2 << 16) | (u << 8) | (y1); + } +#endif } diff --git a/Source/Core/VideoCommon/Src/XFBConvert.h b/Source/Core/VideoCommon/Src/XFBConvert.h index 7655f8ed09..e67d1d47b0 100644 --- a/Source/Core/VideoCommon/Src/XFBConvert.h +++ b/Source/Core/VideoCommon/Src/XFBConvert.h @@ -20,6 +20,8 @@ #include "Common.h" +void InitXFBConvTables(); + void ConvertFromXFB(u32 *dst, const u8* _pXFB, int width, int height); // converts 32-bit RGBA data to 16-bit 4:2:2 YUV data diff --git a/Source/Core/VideoCommon/VideoCommon.vcproj b/Source/Core/VideoCommon/VideoCommon.vcproj index b87b31b000..bc489c85af 100644 --- a/Source/Core/VideoCommon/VideoCommon.vcproj +++ b/Source/Core/VideoCommon/VideoCommon.vcproj @@ -173,10 +173,15 @@ /> + + +