// Copyright (C) 2003-2008 Dolphin Project. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 2.0. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License 2.0 for more details. // A copy of the GPL 2.0 should have been included with the program. // If not, see http://www.gnu.org/licenses/ // Official SVN repository and contact information can be found at // http://code.google.com/p/dolphin-emu/ #include "Common.h" //#include "VideoCommon.h" // to get debug logs #include "CPUDetect.h" #include "TextureDecoder.h" #include "LookUpTables.h" #include #ifdef __SSSE3__ #include #endif //Uncomment this to enable Texture Format ID overlays #define OVERLAY_TEXFMT #ifdef OVERLAY_TEXFMT bool TexFmt_Overlay_Enable=false; bool TexFmt_Overlay_Center=false; #endif // TRAM // STATE_TO_SAVE u8 texMem[TMEM_SIZE]; ////////////////////////////////////////////////////////////////////////// // Gamecube/Wii texture decoder ////////////////////////////////////////////////////////////////////////// // Decodes all known Gamecube/Wii texture formats. 
// by ector
//////////////////////////////////////////////////////////////////////////

// Returns the storage size of one texel of the given GX texture format,
// expressed in nibbles (4-bit units). Only the low six bits of `format`
// are examined; any format not listed falls back to 1 nibble.
int TexDecoder_GetTexelSizeInNibbles(int format)
{
    switch (format & 0x3f) {
    case GX_TF_I4: return 1;
    case GX_TF_I8: return 2;
    case GX_TF_IA4: return 2;
    case GX_TF_IA8: return 4;
    case GX_TF_RGB565: return 4;
    case GX_TF_RGB5A3: return 4;
    case GX_TF_RGBA8: return 8;
    case GX_TF_C4: return 1;
    case GX_TF_C8: return 2;
    case GX_TF_C14X2: return 4;
    case GX_TF_CMPR: return 1;
    default: return 1;
    }
}

// Returns the encoded size in bytes of a width x height texture of the given
// format: texel count times nibbles-per-texel, halved. The division is an
// integer division, so an odd nibble total is rounded down.
int TexDecoder_GetTextureSizeInBytes(int width, int height, int format)
{
    return (width * height * TexDecoder_GetTexelSizeInNibbles(format)) / 2;
}

// NOTE(review): the line below is damaged in this copy of the file. The body
// of TexDecoder_GetTlutHash breaks off inside a commented-out debug statement
// and runs straight into what looks like the tail of decode565() (a 5/6/5-bit
// colour expanded through lut5to8/lut6to8 with alpha forced to 0xFF). The
// missing text cannot be recovered from here, so the line is left untouched -
// restore it from version control before building.
u32 TexDecoder_GetTlutHash(const u8* src, int len) { //char str[40000], st[20]; str[0]='\0';for (int i=0;i>11) & 0x1f]; g=lut6to8[(val>>5 ) & 0x3f]; b=lut5to8[(val ) & 0x1f]; a=0xFF; return (a<<24) | (r<<16) | (g<<8) | b; }

// Expands a 16-bit intensity/alpha texel to a packed 32-bit value.
// The high byte of `val` becomes the top byte of the result (a); the low byte
// (i) is replicated into the three lower bytes.
// NOTE(review): `a<<24` is evaluated in int, so a >= 0x80 shifts into the
// sign bit before the result is converted to u32.
inline u32 decodeIA8(u16 val)
{
    int a=val>>8;
    int i=val&0xFF;
    return (a<<24) | (i<<16) | (i<<8) | i;
}

// Expands a 16-bit RGB5A3 texel to a packed 32-bit value laid out as
// (a<<24)|(r<<16)|(g<<8)|b.
//  - bit 15 set:   three 5-bit channels widened through lut5to8, a = 0xFF.
//  - bit 15 clear: a 3-bit alpha widened through lut3to8 and three 4-bit
//                  channels widened through lut4to8.
inline u32 decode5A3(u16 val)
{
    int r,g,b,a;
    if ((val&0x8000))
    {
        r=lut5to8[(val>>10) & 0x1f];
        g=lut5to8[(val>>5 ) & 0x1f];
        b=lut5to8[(val ) & 0x1f];
        a=0xFF;
    }
    else
    {
        a=lut3to8[(val>>12) & 0x7];
        r=lut4to8[(val>>8 ) & 0xf];
        g=lut4to8[(val>>4 ) & 0xf];
        b=lut4to8[(val ) & 0xf];
    }
    return (a<<24) | (r<<16) | (g<<8) | b;
}

// One compressed 4x4 sub-block as consumed by decodeDXTBlock(): two 16-bit
// endpoint colours (byte-swapped by the decoder before use) followed by one
// byte per row, each byte carrying four 2-bit colour selectors.
struct DXTBlock
{
    u16 color1;
    u16 color2;
    u8 lines[4];
};

// Decodes 4 source bytes of 4-bit palette indices into 8 packed 32-bit pixels.
// Each byte yields two pixels, high nibble first. The palette is read as an
// array of u16 starting at texMem + tlutaddr; every entry is byte-swapped with
// Common::swap16 before being expanded.
// tlutfmt selects the palette entry format: 0 -> decodeIA8, 1 -> decode565,
// 2 -> decode5A3, 3 -> the constant 0xFFFF00FF (flagged as an error case).
// For any other tlutfmt nothing is written and dst is not advanced.
//inline void decodebytesC4(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt)
inline void decodebytesC4(u32 *dst, const u8 *src, int tlutaddr, int tlutfmt)
{
    u16 *tlut = (u16*)(texMem + tlutaddr);
    for (int x = 0; x < 4; x++)
    {
        int val = src[x];
        switch (tlutfmt)
        {
        case 0:
            *dst++ = decodeIA8(Common::swap16(tlut[val >> 4]));
            *dst++ = decodeIA8(Common::swap16(tlut[val & 15]));
            break;
        case 1:
            *dst++ = decode565(Common::swap16(tlut[val >> 4]));
            *dst++ = decode565(Common::swap16(tlut[val & 15]));
            break;
        case 2:
            *dst++ = decode5A3(Common::swap16(tlut[val >> 4]));
            *dst++ = decode5A3(Common::swap16(tlut[val & 15]));
            break;
        case 3: //ERROR
            *dst++ = 0xFFFF00FF;
            *dst++ = 0xFFFF00FF;
            break;
        }
    }
}

// Decodes 8 source bytes of 8-bit palette indices into 8 packed 32-bit pixels.
// Palette location, byte-swapping and the meaning of tlutfmt are the same as
// in decodebytesC4: 0 -> decodeIA8, 1 -> decode565, 2 -> decode5A3,
// 3 -> constant 0xFFFF00FF; other values write nothing.
//inline void decodebytesC8(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt)
inline void decodebytesC8(u32 *dst, const u8 *src, int tlutaddr, int tlutfmt)
{
    u16 *tlut = (u16*)(texMem+tlutaddr);
    for (int x = 0; x < 8; x++)
    {
        int val = src[x];
        switch (tlutfmt)
        {
        case 0:
            *dst++ = decodeIA8(Common::swap16(tlut[val]));
            break;
        case 1:
            *dst++ = decode565(Common::swap16(tlut[val]));
            break;
        case 2:
            *dst++ = decode5A3(Common::swap16(tlut[val]));
            break;
        case 3: //ERROR
            *dst++ = 0xFFFF00FF;
            break;
        }
    }
}

// Decodes 4 source u16 palette indices into 4 packed 32-bit pixels. Each
// source value is byte-swapped first and only its low 14 bits (0x3FFF) are
// used as the palette index. Palette handling and tlutfmt are as in
// decodebytesC4/decodebytesC8.
//inline void decodebytesC14X2(u32 *dst, const u16 *src, int numpixels, int tlutaddr, int tlutfmt)
inline void decodebytesC14X2(u32 *dst, const u16 *src, int tlutaddr, int tlutfmt)
{
    u16 *tlut = (u16*)(texMem+tlutaddr);
    for (int x = 0; x < 4; x++)
    {
        int val = Common::swap16(src[x]);
        switch (tlutfmt)
        {
        case 0:
            *dst++ = decodeIA8(Common::swap16(tlut[(val&0x3FFF)]));
            break;
        case 1:
            *dst++ = decode565(Common::swap16(tlut[(val&0x3FFF)]));
            break;
        case 2:
            *dst++ = decode5A3(Common::swap16(tlut[(val&0x3FFF)]));
            break;
        case 3: //ERROR
            *dst++ = 0xFFFF00FF;
            break;
        }
    }
}

// Decodes 4 byte-swapped 16-bit texels through decode565 into 4 pixels.
//inline void decodebytesRGB565(u32 *dst, const u16 *src, int numpixels)
inline void decodebytesRGB565(u32 *dst, const u16 *src)
{
    for (int x = 0; x < 4; x++)
        *dst++ = decode565(Common::swap16(src[x]));
}

// Decodes 8 one-byte texels into 8 packed 32-bit pixels. The high nibble is
// widened through lut4to8 into the top byte (a); the low nibble is widened
// the same way and replicated into the three lower bytes.
//inline void decodebytesIA4(u32 *dst, const u8 *src, int numbytes)
inline void decodebytesIA4(u32 *dst, const u8 *src)
{
    for (int x = 0; x < 8; x++)
    {
        int val = src[x];
        int a = lut4to8[val>>4];
        int r = lut4to8[val&15];
        dst[x] = (a<<24) | (r<<16) | (r<<8) | r;
    }
}

// Decodes 4 byte-swapped 16-bit texels through decodeIA8 into 4 pixels.
//inline void decodebytesIA8(u32 *dst, const u16 *src, int numpixels)
inline void decodebytesIA8(u32 *dst, const u16 *src)
{
    for (int x = 0; x < 4; x++)
        dst[x] = decodeIA8(Common::swap16(src[x]));
}

// Decodes 4 byte-swapped 16-bit texels through decode5A3 into 4 pixels.
//inline void decodebytesRGB5A3(u32 *dst, const u16 *src, int numpixels)
inline void decodebytesRGB5A3(u32 *dst, const u16 *src)
{
    for (int x = 0; x < 4; x++)
        dst[x] = decode5A3(Common::swap16(src[x]));
}
// This one is used by many video formats. It'd therefore be good if it was fast. inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2) { for (int x = 0; x < 4; x++) { dst[x] = Common::swap32((src2[x] << 16) | src[x]); } // This can probably be done in a few SSE pack/unpack instructions + pshufb // some unpack instruction x2: // ABABABABABABABAB 1212121212121212 -> // AB12AB12AB12AB12 AB12AB12AB12AB12 // 2x pshufb-> // 21BA21BA21BA21BA 21BA21BA21BA21BA // and we are done. } inline u32 makecol(int r, int g, int b, int a) { return (a<<24)|(r<<16)|(g<<8)|b; } void decodeDXTBlock(u32 *dst, const DXTBlock *src, int pitch) { u16 c1 = Common::swap16(src->color1); u16 c2 = Common::swap16(src->color2); int blue1 = lut5to8[c1 & 0x1F]; int blue2 = lut5to8[c2 & 0x1F]; int green1 = lut6to8[(c1>>5) & 0x3F]; int green2 = lut6to8[(c2>>5) & 0x3F]; int red1 = lut5to8[(c1>>11) & 0x1F]; int red2 = lut5to8[(c2>>11) & 0x1F]; int colors[4]; if (c1 > c2) { colors[0] = makecol(red1, green1, blue1, 255); colors[1] = makecol(red2, green2, blue2, 255); colors[2] = makecol(red1+(red2-red1)/3, green1+(green2-green1)/3, blue1+(blue2-blue1)/3, 255); colors[3] = makecol(red2+(red1-red2)/3, green2+(green1-green2)/3, blue2+(blue1-blue2)/3, 255); } else { colors[0] = makecol(red1, green1, blue1, 255); colors[1] = makecol(red2, green2, blue2, 255); colors[2] = makecol((red1+red2)/2, (green1+green2)/2, (blue1+blue2)/2, 255); colors[3] = makecol(0,0,0,0); //transparent } for (int y = 0; y < 4; y++) { int val = src->lines[y]; for (int x = 0; x < 4; x++) { dst[x] = colors[(val >> 6) & 3]; val <<= 2; } dst += pitch; } } //switch endianness, unswizzle //TODO: to save memory, don't blindly convert everything to argb8888 //also ARGB order needs to be swapped later, to accommodate modern hardware better //need to add DXT support too #ifdef OVERLAY_TEXFMT PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) #else 
PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) #endif { switch (texformat) { case GX_TF_C4: { for (int y = 0; y < height; y += 8) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 8; iy++, src += 4) //decodebytesC4((u32*)dst+(y+iy)*width+x, src, 4, tlutaddr, tlutfmt); decodebytesC4((u32*)dst+(y+iy)*width+x, src, tlutaddr, tlutfmt); } return PC_TEX_FMT_BGRA32; case GX_TF_I4: { for (int y = 0; y < height; y += 8) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 8; iy++, src += 4) for (int ix = 0; ix < 4; ix++) { int val = src[ix]; dst[(y+iy)*width+x+ix*2] = lut4to8[val>>4]; dst[(y+iy)*width+x+ix*2+1] = lut4to8[val&15]; } } return PC_TEX_FMT_I8; case GX_TF_I8: // speed critical { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 4; iy++, src += 8) memcpy(dst+(y+iy)*width+x, src, 8); } return PC_TEX_FMT_I8; case GX_TF_C8: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesC8((u32*)dst+(y+iy)*width+x, src, 8, tlutaddr, tlutfmt); decodebytesC8((u32*)dst+(y+iy)*width+x, src, tlutaddr, tlutfmt); } return PC_TEX_FMT_BGRA32; case GX_TF_IA4: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesIA4((u32*)dst+(y+iy)*width+x, src, 8); decodebytesIA4((u32*)dst+(y+iy)*width+x, src); } return PC_TEX_FMT_BGRA32; case GX_TF_IA8: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 4) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesIA8((u32*)dst+(y+iy)*width+x, (u16*)src, 4); decodebytesIA8((u32*)dst+(y+iy)*width+x, (u16*)src); } return PC_TEX_FMT_BGRA32; case GX_TF_C14X2: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 4) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesC14X2((u32*)dst+(y+iy)*width+x, (u16*)src, 4, tlutaddr, tlutfmt); 
decodebytesC14X2((u32*)dst+(y+iy)*width+x, (u16*)src, tlutaddr, tlutfmt); } return PC_TEX_FMT_BGRA32; case GX_TF_RGB565: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 4) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesRGB565((u32*)dst+(y+iy)*width+x, (u16*)src, 4); decodebytesRGB565((u32*)dst+(y+iy)*width+x, (u16*)src); } return PC_TEX_FMT_BGRA32; case GX_TF_RGB5A3: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 4) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4); decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src); } return PC_TEX_FMT_BGRA32; case GX_TF_RGBA8: // speed critical { for (int y = 0; y < height; y += 4) { for (int x = 0; x < width; x += 4) { for (int iy = 0; iy < 4; iy++) { decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16); } src += 64; } } } return PC_TEX_FMT_BGRA32; case GX_TF_CMPR: // speed critical { // TODO: Shuffle to PC S3TC (DXTC) format instead of converting // 11111111 22222222 55555555 66666666 // 33333333 44444444 77777777 88888888 // The metroid games use this format almost exclusively. for (int y = 0; y < height; y += 8) for (int x = 0; x < width; x += 8) { decodeDXTBlock((u32*)dst+y*width+x, (DXTBlock*)src, width); src += sizeof(DXTBlock); decodeDXTBlock((u32*)dst+y*width+x+4, (DXTBlock*)src, width); src += sizeof(DXTBlock); decodeDXTBlock((u32*)dst+(y+4)*width+x, (DXTBlock*)src, width); src += sizeof(DXTBlock); decodeDXTBlock((u32*)dst+(y+4)*width+x+4, (DXTBlock*)src, width); src += sizeof(DXTBlock); } } return PC_TEX_FMT_BGRA32; } // The "copy" texture formats, too? 
return PC_TEX_FMT_NONE; } void TexDecoder_SetTexFmtOverlayOptions(bool enable, bool center) { #ifdef OVERLAY_TEXFMT TexFmt_Overlay_Enable = enable; TexFmt_Overlay_Center = center; #endif } #ifdef OVERLAY_TEXFMT extern const char* texfmt[]; extern const unsigned char sfont_map[]; extern const unsigned char sfont_raw[][9*10]; PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) { PC_TexFormat retval = TexDecoder_Decode_real(dst,src,width,height,texformat,tlutaddr,tlutfmt); if((!TexFmt_Overlay_Enable)||(retval==PC_TEX_FMT_NONE)) return retval; // assume ABGR/ARGB (32bit) int *dtp = (int*)dst; int w = min(width,40); int h = min(height,10); int xoff = (width-w)>>1; int yoff = (height-h)>>1; if(!TexFmt_Overlay_Center) { xoff=0; yoff=0; } const char* fmt = texfmt[texformat&15]; while(*fmt) { int xcnt = 0; int nchar = sfont_map[(int)*fmt]; const unsigned char *ptr = sfont_raw[nchar]; // each char is up to 9x10 for(int x=0;x<9;x++) { if(ptr[x]==0x78) break; xcnt++; } for(int y=0;y<10;y++) { for(int x=0;x