From 05ad8d01d3677fd5f9d4417fa3a56ae249f0608b Mon Sep 17 00:00:00 2001 From: Glenn Rice Date: Sun, 19 Dec 2010 14:59:57 +0000 Subject: [PATCH] Fix TextureDecoder.cl to work on both NVidia and ATI video cards. To do so I had to re-add the casting bloat removed in revision 6102. Also, for some odd reason the NVidia OpenCL drivers don't like 8-bit rotations, but are okay with two consecutive 4-bit rotations. These are apparently bugs in the NVidia drivers that are hopefully fixed in future versions. Also, on Linux make sure the TextureDecoder.cl file is copied from the shared data directory to the user's directory. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6611 8ced0084-cf51-0410-be5f-012b33b47a6e --- Data/User/OpenCL/TextureDecoder.cl | 94 ++++++++++--------- Source/Core/Common/Src/CommonPaths.h | 1 + Source/Core/Common/Src/FileUtil.cpp | 4 + Source/Core/Common/Src/FileUtil.h | 1 + Source/Core/DolphinWX/Src/Main.cpp | 1 + .../Src/OpenCL/OCLTextureDecoder.cpp | 6 +- 6 files changed, 58 insertions(+), 49 deletions(-) diff --git a/Data/User/OpenCL/TextureDecoder.cl b/Data/User/OpenCL/TextureDecoder.cl index daa4ed81e9..bc7be0a92a 100644 --- a/Data/User/OpenCL/TextureDecoder.cl +++ b/Data/User/OpenCL/TextureDecoder.cl @@ -24,9 +24,9 @@ kernel void DecodeI4(global uchar *dst, { uchar4 val = vload4(srcOffset, src); uchar8 res; - res.even = (val >> 4) & 0x0F; - res.odd = val & 0x0F; - res |= res << 4; + res.even = (val >> (uchar4)4) & (uchar4)0x0F; + res.odd = val & (uchar4)0x0F; + res |= res << (uchar8)4; vstore8(res, 0, dst + ((y + iy)*width + x)); srcOffset++; } @@ -41,9 +41,9 @@ kernel void DecodeI4_RGBA(global uint *dst, { uchar4 val = vload4(srcOffset, src); uchar8 res; - res.even = (val >> 4) & 0x0F; - res.odd = val & 0x0F; - res |= res << 4; + res.even = (val >> (uchar4)4) & (uchar4)0x0F; + res.odd = val & (uchar4)0x0F; + res |= res << (uchar8)4; vstore8(upsample(upsample(res,res),upsample(res,res)), 0, dst + ((y + iy)*width + x)); srcOffset++; } @@ -108,8 +108,8 
@@ kernel void DecodeIA4(global ushort *dst, for (int iy = 0; iy < 4; iy++) { val = vload8(srcOffset++, src); - res = upsample(val >> 4, val & 0xF); - res |= res << 4; + res = upsample(val >> (uchar8)4, val & (uchar8)0xF); + res |= res << (ushort8)4; vstore8(res, 0, dst + y*width + x); dst+=width; } @@ -125,10 +125,10 @@ kernel void DecodeIA4_RGBA(global uint *dst, for (int iy = 0; iy < 4; iy++) { val = vload8(srcOffset++, src); - uchar8 a = val >> 4; - uchar8 l = val & 0xF; + uchar8 a = val >> (uchar8)4; + uchar8 l = val & (uchar8)0xF; res = upsample(upsample(a, l), upsample(l,l)); - res |= res << 4; + res |= res << (uint8)4; vstore8(res, 0, dst + y*width + x); dst+=width; } @@ -142,7 +142,8 @@ kernel void DecodeRGBA8(global ushort *dst, for (int iy = 0; iy < 4; iy++) { ushort8 val = (ushort8)(vload4(srcOffset, src), vload4(srcOffset + 4, src)); - ushort8 bgra = rotate(val,8).s40516273; + ushort8 temp = rotate(val, (ushort8)4); + ushort8 bgra = rotate(temp, (ushort8)4).s40516273; vstore8(bgra, 0, dst + ((y + iy)*width + x) * 2); srcOffset++; } @@ -175,7 +176,8 @@ kernel void DecodeRGB565(global ushort *dst, dst += width*y + x; for (int iy = 0; iy < 4; iy++) { - vstore4(rotate(vload4(srcOffset++, src),8), 0, dst + iy*width); + ushort4 val = rotate(vload4(srcOffset++, src),(ushort4)4); + vstore4(rotate(val,(ushort4)4), 0, dst + iy*width); } } @@ -189,10 +191,10 @@ kernel void DecodeRGB565_RGBA(global uchar *dst, uchar8 val = vload8(srcOffset++, src); uchar16 res; - res.even.even = bitselect(val.even, val.even >> 5, 7); - res.odd.even = bitselect((val.odd >> 3) | (val.even << 5), val.even >> 1, 3); - res.even.odd = bitselect(val.odd << 3, val.odd >> 2, 7); - res.odd.odd = 0xFF; + res.even.even = bitselect(val.even, val.even >> (uchar4)5, (uchar4)7); + res.odd.even = bitselect((val.odd >> (uchar4)3) | (val.even << (uchar4)5), val.even >> (uchar4)1, (uchar4)3); + res.even.odd = bitselect(val.odd << (uchar4)3, val.odd >> (uchar4)2, (uchar4)7); + res.odd.odd = 
(uchar4)0xFF; vstore16(res, 0, dst + ((y + iy)*width + x) * 4); } @@ -207,16 +209,16 @@ kernel void DecodeRGB5A3(global uchar *dst, uchar16 resNoAlpha, resAlpha, choice; #define iterateRGB5A3() \ val = vload8(srcOffset++, src); \ - resNoAlpha.s26AE = val.even << 1; \ - resNoAlpha.s159D = val.even << 6 | val.odd >> 2; \ - resNoAlpha.s048C = val.odd << 3; \ - resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> 5, 0x3); \ - resNoAlpha.s37BF = 0xFF; \ - resAlpha.s26AE = bitselect(val.even << 4, val.even, 0xF); \ - resAlpha.s159D = bitselect(val.odd, val.odd >> 4, 0xF); \ - resAlpha.s048C = bitselect(val.odd << 4, val.odd, 0xF); \ - resAlpha.s37BF = bitselect(val.even << 1, val.even >> 2, 0x1C); \ - resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> 5, 0x3); \ + resNoAlpha.s26AE = val.even << (uchar4)1; \ + resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \ + resNoAlpha.s048C = val.odd << (uchar4)3; \ + resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> (uchar16)5, (uchar16)0x3); \ + resNoAlpha.s37BF = (uchar4)(0xFF); \ + resAlpha.s26AE = bitselect(val.even << (uchar4)4, val.even, (uchar4)0xF); \ + resAlpha.s159D = bitselect(val.odd, val.odd >> (uchar4)4, (uchar4)0xF); \ + resAlpha.s048C = bitselect(val.odd << (uchar4)4, val.odd, (uchar4)0xF); \ + resAlpha.s37BF = bitselect(val.even << (uchar4)1, val.even >> (uchar4)2, (uchar4)0x1C); \ + resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> (uchar4)5, (uchar4)0x3); \ choice = (uchar16)((uchar4)(val.even.s0), \ (uchar4)(val.even.s1), \ (uchar4)(val.even.s2), \ @@ -237,16 +239,16 @@ kernel void DecodeRGB5A3_RGBA(global uchar *dst, uchar16 resNoAlpha, resAlpha, choice; #define iterateRGB5A3_RGBA() \ val = vload8(srcOffset++, src); \ - resNoAlpha.s048C = val.even << 1; \ - resNoAlpha.s159D = val.even << 6 | val.odd >> 2; \ - resNoAlpha.s26AE = val.odd << 3; \ - resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> 5, 0x3); \ - resNoAlpha.s37BF = 0xFF; \ - resAlpha.s048C = bitselect(val.even << 4, val.even, 0xF); 
\ - resAlpha.s159D = bitselect(val.odd, val.odd >> 4, 0xF); \ - resAlpha.s26AE = bitselect(val.odd << 4, val.odd, 0xF); \ - resAlpha.s37BF = bitselect(val.even << 1, val.even >> 2, 0x1C); \ - resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> 5, 0x3); \ + resNoAlpha.s048C = val.even << (uchar4)1; \ + resNoAlpha.s159D = val.even << (uchar4)6 | val.odd >> (uchar4)2; \ + resNoAlpha.s26AE = val.odd << (uchar4)3; \ + resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> (uchar16)5, (uchar16)0x3); \ + resNoAlpha.s37BF = (uchar4)(0xFF); \ + resAlpha.s048C = bitselect(val.even << (uchar4)4, val.even, (uchar4)0xF); \ + resAlpha.s159D = bitselect(val.odd, val.odd >> (uchar4)4, (uchar4)0xF); \ + resAlpha.s26AE = bitselect(val.odd << (uchar4)4, val.odd, (uchar4)0xF); \ + resAlpha.s37BF = bitselect(val.even << (uchar4)1, val.even >> (uchar4)2, (uchar4)0x1C); \ + resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> (uchar4)5, (uchar4)0x3); \ choice = (uchar16)((uchar4)(val.even.s0), \ (uchar4)(val.even.s1), \ (uchar4)(val.even.s2), \ @@ -274,13 +276,13 @@ kernel void decodeCMPRBlock(global uchar *dst, uchar2 colora565 = (uchar2)(val.s1, val.s3); uchar2 colorb565 = (uchar2)(val.s0, val.s2); - uchar8 color32 = (uchar8)(bitselect(colora565 << 3, colora565 >> 2, 7), - bitselect((colora565 >> 3) | (colorb565 << 5), colorb565 >> 1, 3), - bitselect(colorb565, colorb565 >> 5, 7), + uchar8 color32 = (uchar8)(bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7), + bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3), + bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7), (uchar2)0xFF); ushort4 frac2 = convert_ushort4(color32.even) - convert_ushort4(color32.odd); - uchar4 frac = convert_uchar4((frac2 * 3) / 8); + uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8); ushort4 colorAlpha = upsample((uchar4)(color32.even.s0,color32.even.s1,color32.even.s2,0), rhadd(color32.odd, color32.even)); @@ -320,13 +322,13 @@ 
kernel void decodeCMPRBlock_RGBA(global uchar *dst, uchar2 colora565 = (uchar2)(val.s1, val.s3); uchar2 colorb565 = (uchar2)(val.s0, val.s2); - uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> 5, 7), - bitselect((colora565 >> 3) | (colorb565 << 5), colorb565 >> 1, 3), - bitselect(colora565 << 3, colora565 >> 2, 7), + uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)7), + bitselect((colora565 >> (uchar2)3) | (colorb565 << (uchar2)5), colorb565 >> (uchar2)1, (uchar2)3), + bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)7), (uchar2)0xFF); ushort4 frac2 = convert_ushort4(color32.even) - convert_ushort4(color32.odd); - uchar4 frac = convert_uchar4((frac2 * 3) / 8); + uchar4 frac = convert_uchar4((frac2 * (ushort4)3) / (ushort4)8); ushort4 colorAlpha = upsample((uchar4)(color32.even.s0,color32.even.s1,color32.even.s2,0), rhadd(color32.odd, color32.even)); @@ -356,4 +358,4 @@ kernel void DecodeCMPR_RGBA(global uchar *dst, decodeCMPRBlock_RGBA(dst + 16, src, width); src += 8; decodeCMPRBlock_RGBA(dst + 16 * width, src, width); src += 8; decodeCMPRBlock_RGBA(dst + 16 * (width + 1), src, width); -} \ No newline at end of file +} diff --git a/Source/Core/Common/Src/CommonPaths.h b/Source/Core/Common/Src/CommonPaths.h index ca0676ebb8..32f476d62e 100644 --- a/Source/Core/Common/Src/CommonPaths.h +++ b/Source/Core/Common/Src/CommonPaths.h @@ -100,6 +100,7 @@ #define SHADERCACHE_DIR "ShaderCache" #define STATESAVES_DIR "StateSaves" #define SCREENSHOTS_DIR "ScreenShots" +#define OPENCL_DIR "OpenCL" #define LOAD_DIR "Load" #define HIRES_TEXTURES_DIR LOAD_DIR DIR_SEP "Textures" #define DUMP_DIR "Dump" diff --git a/Source/Core/Common/Src/FileUtil.cpp b/Source/Core/Common/Src/FileUtil.cpp index 0a239b6551..3c820ef135 100644 --- a/Source/Core/Common/Src/FileUtil.cpp +++ b/Source/Core/Common/Src/FileUtil.cpp @@ -647,6 +647,7 @@ const char *GetUserPath(int DirIDX) static char ShadersDir[MAX_PATH] = {0}; static char 
StateSavesDir[MAX_PATH] = {0}; static char ScreenShotsDir[MAX_PATH] = {0}; + static char OpenCLDir[MAX_PATH] = {0}; static char HiresTexturesDir[MAX_PATH] = {0}; static char DumpDir[MAX_PATH] = {0}; static char DumpFramesDir[MAX_PATH] = {0}; @@ -689,6 +690,7 @@ const char *GetUserPath(int DirIDX) snprintf(ShadersDir, sizeof(ShadersDir), "%s" SHADERS_DIR DIR_SEP, UserDir); snprintf(StateSavesDir, sizeof(StateSavesDir), "%s" STATESAVES_DIR DIR_SEP, UserDir); snprintf(ScreenShotsDir, sizeof(ScreenShotsDir), "%s" SCREENSHOTS_DIR DIR_SEP, UserDir); + snprintf(OpenCLDir, sizeof(OpenCLDir), "%s" OPENCL_DIR DIR_SEP, UserDir); snprintf(HiresTexturesDir, sizeof(HiresTexturesDir), "%s" HIRES_TEXTURES_DIR DIR_SEP, UserDir); snprintf(DumpDir, sizeof(DumpDir), "%s" DUMP_DIR DIR_SEP, UserDir); snprintf(DumpFramesDir, sizeof(DumpFramesDir), "%s" DUMP_FRAMES_DIR DIR_SEP, UserDir); @@ -732,6 +734,8 @@ const char *GetUserPath(int DirIDX) return StateSavesDir; case D_SCREENSHOTS_IDX: return ScreenShotsDir; + case D_OPENCL_IDX: + return OpenCLDir; case D_HIRESTEXTURES_IDX: return HiresTexturesDir; case D_DUMP_IDX: diff --git a/Source/Core/Common/Src/FileUtil.h b/Source/Core/Common/Src/FileUtil.h index 69f2b8bf6b..1d7a24ee1e 100644 --- a/Source/Core/Common/Src/FileUtil.h +++ b/Source/Core/Common/Src/FileUtil.h @@ -38,6 +38,7 @@ enum { D_SHADERS_IDX, D_STATESAVES_IDX, D_SCREENSHOTS_IDX, + D_OPENCL_IDX, D_HIRESTEXTURES_IDX, D_DUMP_IDX, D_DUMPFRAMES_IDX, diff --git a/Source/Core/DolphinWX/Src/Main.cpp b/Source/Core/DolphinWX/Src/Main.cpp index d95cc6dabd..0659d3e50a 100644 --- a/Source/Core/DolphinWX/Src/Main.cpp +++ b/Source/Core/DolphinWX/Src/Main.cpp @@ -300,6 +300,7 @@ bool DolphinApp::OnInit() File::CopyDir(SHARED_USER_DIR MAPS_DIR DIR_SEP, File::GetUserPath(D_MAPS_IDX)); File::CopyDir(SHARED_USER_DIR SHADERS_DIR DIR_SEP, File::GetUserPath(D_SHADERS_IDX)); File::CopyDir(SHARED_USER_DIR WII_USER_DIR DIR_SEP, File::GetUserPath(D_WIIUSER_IDX)); + File::CopyDir(SHARED_USER_DIR OPENCL_DIR 
DIR_SEP, File::GetUserPath(D_OPENCL_IDX)); if (!File::Exists(File::GetUserPath(D_GCUSER_IDX))) File::CreateFullPath(File::GetUserPath(D_GCUSER_IDX)); diff --git a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp index 13034b2b50..cb0bb04866 100644 --- a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp @@ -103,7 +103,7 @@ void TexDecoder_OpenCL_Initialize() char **binaries = NULL; char filename[1024]; - sprintf(filename, "%sOpenCL/kernel.bin", File::GetUserPath(D_USER_IDX)); + sprintf(filename, "%skernel.bin", File::GetUserPath(D_OPENCL_IDX)); FILE *input = NULL; @@ -155,7 +155,7 @@ void TexDecoder_OpenCL_Initialize() if (err) { std::string code; - sprintf(filename, "%sOpenCL/TextureDecoder.cl", File::GetUserPath(D_USER_IDX)); + sprintf(filename, "%sTextureDecoder.cl", File::GetUserPath(D_OPENCL_IDX)); if (!File::ReadFileToString(true, filename, code)) { ERROR_LOG(VIDEO, "Failed to load OpenCL code %s - file is missing?", filename); @@ -204,7 +204,7 @@ void TexDecoder_OpenCL_Initialize() if (!err) { - sprintf(filename, "%sOpenCL/kernel.bin", File::GetUserPath(D_USER_IDX)); + sprintf(filename, "%skernel.bin", File::GetUserPath(D_OPENCL_IDX)); const char *current_rev = SVN_REV_STR + '\0'; FILE *output = NULL;