diff --git a/Data/User/OpenCL/TextureDecoder.cl b/Data/User/OpenCL/TextureDecoder.cl new file mode 100644 index 0000000000..052865021c --- /dev/null +++ b/Data/User/OpenCL/TextureDecoder.cl @@ -0,0 +1,252 @@ +// Copyright (C) 2003 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +kernel void DecodeI4(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 8; + int srcOffset = x + y * width / 8; + for (int iy = 0; iy < 8; iy++) + { + uchar4 val = vload4(srcOffset, src); + uchar8 res; + res.even = (val >> 4) & 0x0F; + res.odd = val & 0x0F; + res |= res << 4; + vstore8(res, 0, dst + ((y + iy)*width + x)); + srcOffset++; + } +} + +kernel void DecodeI8(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 8; + for (int iy = 0; iy < 4; iy++) + { + vstore8(vload8(srcOffset, src), + 0, dst + ((y + iy)*width + x)); + srcOffset++; + } +} + +kernel void DecodeIA8(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 4; + for (int iy = 0; iy < 4; iy++) + { + uchar8 val = vload8(srcOffset, src); + uchar8 res; + res.odd = val.even; + res.even = val.odd; + vstore8(res, 0, dst + ((y + iy)*width + x) * 2); + srcOffset++; + } +} + +kernel void DecodeIA4(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 4; + int srcOffset = ((x * 4) + (y * width)) / 8; + for (int iy = 0; iy < 4; iy++) + { + uchar8 val = vload8(srcOffset, src); + uchar16 res; + res.odd = (val >> 4) & 0x0F; + res.even = val & 0x0F; + res |= res << 4; + vstore16(res, 0, dst + ((y + iy)*width + x) * 2); + srcOffset++; + } +} + +kernel void DecodeRGBA8(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = (x * 2) + (y * width) / 2; + for (int iy = 0; iy < 4; iy++) + { + uchar8 ar = vload8(srcOffset, src); + uchar8 gb = vload8(srcOffset + 4, src); + uchar16 res; + res.even.even = gb.odd; + res.even.odd = ar.odd; + res.odd.even = gb.even; + res.odd.odd = ar.even; + vstore16(res, 0, dst + ((y + iy)*width + x) * 4); + srcOffset++; + } +} + +kernel void DecodeRGB565(global ushort *dst, + const global ushort *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = x + (y * width) / 4; + for (int iy = 0; iy < 4; iy++) + { + ushort4 val = vload4(srcOffset, src); + val = (val >> 8) | (val << 8); + vstore4(val, 0, dst + ((y + iy)*width + x)); + srcOffset++; + } +} + +kernel void DecodeRGB5A3(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = x + (y * width) / 4; + for (int iy = 0; iy < 4; iy++) + { + ushort8 val = convert_ushort8(vload8(srcOffset, src)); + ushort4 vs = val.odd | (val.even << 8); + + uchar16 resNoAlpha; + resNoAlpha.s26AE = convert_uchar4(vs >> 7); // R + resNoAlpha.s159D = convert_uchar4(vs >> 2); // G + resNoAlpha.s048C = convert_uchar4(vs << 3); // B + resNoAlpha &= 0xF8; + resNoAlpha |= (resNoAlpha >> 5) & 3; // 5 -> 8 + resNoAlpha.s37BF = (uchar4)(0xFF); + + uchar16 resAlpha; + resAlpha.s26AE = convert_uchar4(vs >> 8); // R + resAlpha.s159D = convert_uchar4(vs >> 4); // G + resAlpha.s048C = convert_uchar4(vs); // B + resAlpha &= 0x0F; + resAlpha |= (resAlpha << 4); + resAlpha.s37BF = convert_uchar4(vs >> 7) & 0xE0; + resAlpha.s37BF |= ((resAlpha.s37BF >> 3) & 0x1C) + | ((resAlpha.s37BF >> 6) & 0x3); + uchar16 choice = (uchar16)((uchar4)(vs.s0 >> 8), + (uchar4)(vs.s1 >> 8), + (uchar4)(vs.s2 >> 8), + (uchar4)(vs.s3 >> 8)); + uchar16 res; + res = select(resAlpha, resNoAlpha, choice); + vstore16(res, 0, dst + ((y + iy)*width + x) * 4); + srcOffset++; + } +} + +uint4 unpack2bits(uchar b) +{ + return (uint4)(b >> 6, + (b >> 4) & 3, + (b >> 2) & 3, + b & 3); +} + +/* +Lots of debug code there that I'm using to find the problems with CMPR decoding +I think blocks having no alpha are properly decoded, only the blocks with alpha +are problematic. This is WIP ! +*/ +kernel void decodeCMPRBlock(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 4, y = get_global_id(1) * 4; + int srcOffset = (x * 2 + y * width / 2) / 8; //x / 4 + y * width / 16; //(x * 4) + (y * width) / 16; + uchar8 val = vload8(0, src); + ushort2 color565 = (ushort2)((val.s1 & 0xFF) | (val.s0 << 8), (val.s3 & 0xFF) | (val.s2 << 8)); + uchar8 color32 = convert_uchar8((ushort8) + (((color565 << 3) & 0xF8) | ((color565 >> 2) & 0x7), + ((color565 >> 3) & 0xFC) | ((color565 >> 9) & 0x3), + ((color565 >> 8) & 0xF8) | ((color565 >> 13) & 0x7), + 0xFF, 0xFF)); + uint4 colors; + uint4 choice = (uint4)((color565.s0 - color565.s1) << 16); + uint4 colorNoAlpha; + //uchar4 frac = (color32.odd - color32.even) / 2; + //frac = frac - (frac / 4); + uchar4 frac = convert_uchar4((((convert_ushort4(color32.even) & 0xFF) - (convert_ushort4(color32.odd) & 0xFF)) * 3) / 8); + //colorNoAlpha = convert_uint4(frac); + colorNoAlpha = convert_uint4(color32.even - frac); + colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.odd + frac); + colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.odd); + colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.even); + + uint4 colorAlpha; + //uchar4 midpoint = rhadd(color32.odd, color32.even); + uchar4 midpoint = convert_uchar4((convert_ushort4(color32.odd) + convert_ushort4(color32.even) + 1) / 2); + midpoint.s3 = 0xFF; + //colorAlpha = convert_uint4(color32.odd); + colorAlpha = convert_uint4((uchar4)(0, 0, 0, 0)); + colorAlpha = (colorAlpha << 8) | convert_uint4(midpoint); + colorAlpha = (colorAlpha << 8) | convert_uint4(color32.odd); + colorAlpha = (colorAlpha << 8) | convert_uint4(color32.even); + + //colorNoAlpha = (uint4)(0xFFFFFFFF); + //colorAlpha = (uint4)(0, 0, 0, 0xFFFFFFFF); + + colors = select(colorNoAlpha, colorAlpha, choice); + + uint16 colorsFull = (uint16)(colors, colors, colors, colors); + + uint4 shift0 = unpack2bits(val.s4); + uint4 shift1 = unpack2bits(val.s5); + uint4 shift2 = unpack2bits(val.s6); + uint4 shift3 = unpack2bits(val.s7); + uint16 shifts = (uint16)((uint4)(shift3.s0), (uint4)(shift3.s1), (uint4)(shift3.s2), (uint4)(shift3.s3)); + shifts = (shifts << 8) | (uint16)((uint4)(shift2.s0), (uint4)(shift2.s1), (uint4)(shift2.s2), (uint4)(shift2.s3)); + shifts = (shifts << 8) | (uint16)((uint4)(shift1.s0), (uint4)(shift1.s1), (uint4)(shift1.s2), (uint4)(shift1.s3)); + shifts = (shifts << 8) | (uint16)((uint4)(shift0.s0), (uint4)(shift0.s1), (uint4)(shift0.s2), (uint4)(shift0.s3)); + shifts <<= 3; + + for (int iy = 0; iy < 4; iy++) + { + uchar16 res; + res = convert_uchar16(colorsFull >> (shifts & 0xFF)); + shifts >>= 8; + //uchar4 t = convert_uchar4((ushort4)(color565.s0 >> 8, color565.s0 & 0xFF, color565.s1 >> 8, color565.s1 & 0xFF)); + //res = (uchar16)(t, t, t, t); + //res = (uchar16)(frac, color32.even - color32.odd, (color32.even - color32.odd) / 2, (color32.even - color32.odd) / 2 - ((color32.even - color32.odd) / 8)); + //res = (uchar16)(color32.even, color32.odd, frac, convert_uchar4(choice)); + //res = convert_uchar16((uint16)(colorNoAlpha >> 24, colorNoAlpha >> 16, colorNoAlpha >> 8, colorNoAlpha)); + //res = convert_uchar16((uint16)(colorAlpha >> 24, colorAlpha >> 16, colorAlpha >> 8, colorAlpha)); + //res = convert_uchar16((uint16)(colors >> 24, colors >> 16, colors >> 8, colors)); + //res = convert_uchar16(shifts & 0xFF); + //res = convert_uchar16((uint16)(shift0, shift1, shift2, shift3)); + //res = (uchar16)(((x))) + (uchar16)(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3); + //res.lo = val; res.s8 = x >> 8; res.s9 = x; res.sA = (iy + y) >> 8; res.sB = y + iy; res.sC = width >> 8; res.sD = width; res.sE = srcOffset >> 8; res.sF = srcOffset; + vstore16(res, 0, dst); + dst += width * 4; + } +} + +kernel void DecodeCMPR(global uchar *dst, + const global uchar *src, int width) +{ + int x = get_global_id(0) * 8, y = get_global_id(1) * 8; + + src += x * 4 + (y * width) / 2; + + decodeCMPRBlock(dst + (y * width + x) * 4, src, width); + src += 8; + decodeCMPRBlock(dst + (y * width + x + 4) * 4, src, width); + src += 8; + decodeCMPRBlock(dst + ((y + 4) * width + x) * 4, src, width); + src += 8; + decodeCMPRBlock(dst + ((y + 4) * width + x + 4) * 4, src, width); + +} \ No newline at end of file diff --git a/Source/Core/Common/Src/OpenCL.cpp b/Source/Core/Common/Src/OpenCL.cpp index d24aa38972..7ed4c39349 100644 --- a/Source/Core/Common/Src/OpenCL.cpp +++ b/Source/Core/Common/Src/OpenCL.cpp @@ -45,7 +45,7 @@ bool Initialize() { err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { - PanicAlert("Error: Failed to create a device group!"); + HandleCLError(err, "Failed to create a device group!"); return false; } @@ -54,7 +54,7 @@ bool Initialize() { g_context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!g_context) { - PanicAlert("Error: Failed to create a compute context!"); + HandleCLError(err, "Failed to create a compute context!"); return false; } @@ -63,7 +63,7 @@ bool Initialize() { g_cmdq = clCreateCommandQueue(g_context, device_id, 0, &err); if (!g_cmdq) { - PanicAlert("Error: Failed to create a command commands!"); + HandleCLError(err, "Failed to create a command commands!"); return false; } //PanicAlert("Initialized OpenCL fine!"); @@ -88,7 +88,7 @@ cl_program CompileProgram(const char *Kernel) { program = clCreateProgramWithSource(OpenCL::g_context, 1, (const char **) & Kernel, NULL, &err); if (!program) { - printf("Error: Failed to create compute program!"); + HandleCLError(err, "Error: Failed to create compute program!"); return NULL; } @@ -114,7 +114,7 @@ cl_kernel CompileKernel(cl_program program, const char *Function) cl_kernel kernel = clCreateKernel(program, Function, &err); if (!kernel || err != CL_SUCCESS) { - PanicAlert("Error: Failed to create compute kernel!"); + HandleCLError(err, "Failed to create compute kernel!"); return NULL; } return kernel; @@ -130,5 +130,66 @@ void Destroy() { #endif } +void HandleCLError(cl_int error, char* str) +{ + char* name; + switch(error) + { +#define CL_ERROR(x) case (x): name = #x; break + CL_ERROR(CL_SUCCESS); + CL_ERROR(CL_DEVICE_NOT_FOUND); + CL_ERROR(CL_DEVICE_NOT_AVAILABLE); + CL_ERROR(CL_COMPILER_NOT_AVAILABLE); + CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE); + CL_ERROR(CL_OUT_OF_RESOURCES); + CL_ERROR(CL_OUT_OF_HOST_MEMORY); + CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE); + CL_ERROR(CL_MEM_COPY_OVERLAP); + CL_ERROR(CL_IMAGE_FORMAT_MISMATCH); + CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED); + CL_ERROR(CL_BUILD_PROGRAM_FAILURE); + CL_ERROR(CL_MAP_FAILURE); + CL_ERROR(CL_INVALID_VALUE); + CL_ERROR(CL_INVALID_DEVICE_TYPE); + CL_ERROR(CL_INVALID_PLATFORM); + CL_ERROR(CL_INVALID_DEVICE); + CL_ERROR(CL_INVALID_CONTEXT); + CL_ERROR(CL_INVALID_QUEUE_PROPERTIES); + CL_ERROR(CL_INVALID_COMMAND_QUEUE); + CL_ERROR(CL_INVALID_HOST_PTR); + CL_ERROR(CL_INVALID_MEM_OBJECT); + CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); + CL_ERROR(CL_INVALID_IMAGE_SIZE); + CL_ERROR(CL_INVALID_SAMPLER); + CL_ERROR(CL_INVALID_BINARY); + CL_ERROR(CL_INVALID_BUILD_OPTIONS); + CL_ERROR(CL_INVALID_PROGRAM); + CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE); + CL_ERROR(CL_INVALID_KERNEL_NAME); + CL_ERROR(CL_INVALID_KERNEL_DEFINITION); + CL_ERROR(CL_INVALID_KERNEL); + CL_ERROR(CL_INVALID_ARG_INDEX); + CL_ERROR(CL_INVALID_ARG_VALUE); + CL_ERROR(CL_INVALID_ARG_SIZE); + CL_ERROR(CL_INVALID_KERNEL_ARGS); + CL_ERROR(CL_INVALID_WORK_DIMENSION); + CL_ERROR(CL_INVALID_WORK_GROUP_SIZE); + CL_ERROR(CL_INVALID_WORK_ITEM_SIZE); + CL_ERROR(CL_INVALID_GLOBAL_OFFSET); + CL_ERROR(CL_INVALID_EVENT_WAIT_LIST); + CL_ERROR(CL_INVALID_EVENT); + CL_ERROR(CL_INVALID_OPERATION); + CL_ERROR(CL_INVALID_GL_OBJECT); + CL_ERROR(CL_INVALID_BUFFER_SIZE); + CL_ERROR(CL_INVALID_MIP_LEVEL); +#undef CL_ERROR + default: + name = "Unknown error code"; + } + if(!str) + str = ""; + PanicAlert("OpenCL error: %s %s (%d)", str, name, error); + +} }; diff --git a/Source/Core/Common/Src/OpenCL.h b/Source/Core/Common/Src/OpenCL.h index 1c7ae2e0e2..6e63ccb1e6 100644 --- a/Source/Core/Common/Src/OpenCL.h +++ b/Source/Core/Common/Src/OpenCL.h @@ -60,6 +60,7 @@ void Destroy(); cl_program CompileProgram(const char *Kernel); cl_kernel CompileKernel(cl_program program, const char *Function); +void HandleCLError(cl_int error, char* str = 0); }; diff --git a/Source/Core/VideoCommon/OpenCL.rules b/Source/Core/VideoCommon/OpenCL.rules new file mode 100644 index 0000000000..0f88d1efc5 --- /dev/null +++ b/Source/Core/VideoCommon/OpenCL.rules @@ -0,0 +1,18 @@ + + + + + + + + + diff --git a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp index cf8d79c85b..43179d9821 100644 --- a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp @@ -18,6 +18,7 @@ #include "OCLTextureDecoder.h" #include "OpenCL.h" +#include "FileUtil.h" #include #include @@ -26,159 +27,16 @@ #include #include #include - +#include + +//#define DEBUG_OPENCL + struct sDecoders { const char name[256]; // kernel name cl_kernel kernel; // compute kernel }; -const char *Kernel = " \n\ -kernel void DecodeI4(global uchar *dst, \n\ - const global uchar *src, int width) \n\ -{ \n\ - int x = get_global_id(0) * 8, y = get_global_id(1) * 8; \n\ - int srcOffset = x + y * width / 8; \n\ - for (int iy = 0; iy < 8; iy++) \n\ - { \n\ - uchar4 val = vload4(srcOffset, src); \n\ - uchar8 res; \n\ - res.even = (val >> 4) & 0x0F; \n\ - res.odd = val & 0x0F; \n\ - res |= res << 4; \n\ - vstore8(res, 0, dst + ((y + iy)*width + x)); \n\ - srcOffset++; \n\ - } \n\ -} \n\ - \n\ -kernel void DecodeI8(global uchar *dst, \n\ - const global uchar *src, int width) \n\ -{ \n\ - int x = get_global_id(0) * 8, y = get_global_id(1) * 4; \n\ - int srcOffset = ((x * 4) + (y * width)) / 8; \n\ - for (int iy = 0; iy < 4; iy++) \n\ - { \n\ - vstore8(vload8(srcOffset, src), \n\ - 0, dst + ((y + iy)*width + x)); \n\ - srcOffset++; \n\ - } \n\ -} \n\ - \n\ -kernel void DecodeIA8(global uchar *dst, \n\ - const global uchar *src, int width) \n\ -{ \n\ - int x = get_global_id(0) * 4, y = get_global_id(1) * 4; \n\ - int srcOffset = ((x * 4) + (y * width)) / 4; \n\ - for (int iy = 0; iy < 4; iy++) \n\ - { \n\ - uchar8 val = vload8(srcOffset, src); \n\ - uchar8 res; \n\ - res.odd = val.even; \n\ - res.even = val.odd; \n\ - vstore8(res, 0, dst + ((y + iy)*width + x) * 2); \n\ - srcOffset++; \n\ - } \n\ -} \n\ - \n\ -kernel void DecodeIA4(global uchar *dst, \n\ - const global uchar *src, int width) \n\ -{ \n\ - int x = get_global_id(0) * 8, y = get_global_id(1) * 4; \n\ - int srcOffset = ((x * 4) + (y * width)) / 8; \n\ - for (int iy = 0; iy < 4; iy++) \n\ - { \n\ - uchar8 val = vload8(srcOffset, src); \n\ - uchar16 res; \n\ - res.odd = (val >> 4) & 0x0F; \n\ - res.even = val & 0x0F; \n\ - res |= res << 4; \n\ - vstore16(res, 0, dst + ((y + iy)*width + x) * 2); \n\ - srcOffset++; \n\ - } \n\ -} \n\ - \n\ -kernel void DecodeRGBA8(global uchar *dst, \n\ - const global uchar *src, int width) \n\ -{ \n\ - int x = get_global_id(0) * 4, y = get_global_id(1) * 4; \n\ - int srcOffset = (x * 2) + (y * width) / 2; \n\ - for (int iy = 0; iy < 4; iy++) \n\ - { \n\ - uchar8 ar = vload8(srcOffset, src); \n\ - uchar8 gb = vload8(srcOffset + 4, src); \n\ - uchar16 res; \n\ - res.even.even = gb.odd; \n\ - res.even.odd = ar.odd; \n\ - res.odd.even = gb.even; \n\ - res.odd.odd = ar.even; \n\ - vstore16(res, 0, dst + ((y + iy)*width + x) * 4); \n\ - srcOffset++; \n\ - } \n\ -} \n\ - \n\ -kernel void DecodeRGB565(global ushort *dst, \n\ - const global ushort *src, int width) \n\ -{ \n\ - int x = get_global_id(0) * 4, y = get_global_id(1) * 4; \n\ - int srcOffset = x + (y * width) / 4; \n\ - for (int iy = 0; iy < 4; iy++) \n\ - { \n\ - ushort4 val = vload4(srcOffset, src); \n\ - val = (val >> 8) | (val << 8); \n\ - vstore4(val, 0, dst + ((y + iy)*width + x)); \n\ - srcOffset++; \n\ - } \n\ -} \n\ - \n\ -kernel void DecodeRGB5A3(global uchar *dst, \n\ - const global uchar *src, int width) \n\ -{ \n\ - int x = get_global_id(0) * 4, y = get_global_id(1) * 4; \n\ - int srcOffset = x + (y * width) / 4; \n\ - for (int iy = 0; iy < 4; iy++) \n\ - { \n\ - ushort8 val = convert_ushort8(vload8(srcOffset, src));\n\ - ushort4 vs = val.odd | (val.even << 8); \n\ - uchar16 resNoAlpha; \n\ - resNoAlpha.odd.odd = (uchar4)(0xFF); \n\ - resNoAlpha.even.odd = convert_uchar4(vs >> 7) & 0xF8; \n\ - resNoAlpha.odd.even = convert_uchar4(vs >> 2) & 0xF8; \n\ - resNoAlpha.even.even = convert_uchar4(vs << 3) & 0xF8;\n\ - // Better but cause color bleeding \n\ - //resNoAlpha |= resNoAlpha >> 5; \n\ - uchar16 resAlpha; \n\ - resAlpha.even.odd = convert_uchar4(vs >> 8) & 0x0F; \n\ - resAlpha.odd.even = convert_uchar4(vs >> 4) & 0x0F; \n\ - resAlpha.even.even = convert_uchar4(vs) & 0x0F; \n\ - resAlpha |= resNoAlpha << 4; \n\ - resAlpha.odd.odd = convert_uchar4(vs >> 7) & 0xE0; \n\ - resAlpha.odd.odd |= (resAlpha.odd.odd >> 3) \n\ - | (resAlpha.odd.odd >> 6); \n\ - uchar16 choice = (uchar16)((uchar4)(vs.s0 >> 8), \n\ - (uchar4)(vs.s1 >> 8), \n\ - (uchar4)(vs.s2 >> 8), \n\ - (uchar4)(vs.s3 >> 8)); \n\ - uchar16 res; \n\ - res = select(resAlpha, resNoAlpha, choice); \n\ - vstore16(res, 0, dst + ((y + iy)*width + x) * 4); \n\ - srcOffset++; \n\ - } \n\ -} \n\ - \n\ -kernel void DecodeDXT(global ulong *dst, \n\ - const global ulong *src, int width) \n\ -{ // TODO: PLEASE NOTE THAT THIS CODE DOES NOT WORK \n\ - int x = get_global_id(0) * 8, y = get_global_id(1) * 8; \n\ - int srcOffset = ((x * 4) + (y * width)) / 8; \n\ - for (int iy = 0; iy < 4; iy++) \n\ - { \n\ - vstore8(vload8(srcOffset, src), \n\ - 0, dst + ((y + iy)*width + x)); \n\ - srcOffset++; \n\ - } \n\ -} \n\ -"; - cl_program g_program; // NULL terminated set of kernels sDecoders Decoders[] = { @@ -189,22 +47,29 @@ sDecoders Decoders[] = { {"DecodeRGBA8", NULL}, {"DecodeRGB565", NULL}, {"DecodeRGB5A3", NULL}, -{"DecodeDXT", NULL}, +{"DecodeCMPR", NULL}, {"", NULL}, }; bool g_Inited = false; cl_mem g_clsrc, g_cldst; // texture buffer memory objects -void TexDecoder_OpenCL_Initialize() { -#if defined(HAVE_OPENCL) && HAVE_OPENCL +void TexDecoder_OpenCL_Initialize() { +#if defined(HAVE_OPENCL) && HAVE_OPENCL if(!g_Inited) { if(!OpenCL::Initialize()) return; + std::string code; + char* filename = "User/OpenCL/TextureDecoder.cl"; + if (!File::ReadFileToString(true, filename, code)) + { + ERROR_LOG(VIDEO, "Failed to load OpenCL code %s - file is missing?", filename); + return; + } - g_program = OpenCL::CompileProgram(Kernel); + g_program = OpenCL::CompileProgram(code.c_str()); int i = 0; while(strlen(Decoders[i].name) > 0) { @@ -213,16 +78,18 @@ void TexDecoder_OpenCL_Initialize() { } // Allocating maximal Wii texture size in advance, so that we don't have to allocate/deallocate per texture +#ifndef DEBUG_OPENCL g_clsrc = clCreateBuffer(OpenCL::GetContext(), CL_MEM_READ_ONLY , 1024 * 1024 * sizeof(u32), NULL, NULL); g_cldst = clCreateBuffer(OpenCL::GetContext(), CL_MEM_WRITE_ONLY, 1024 * 1024 * sizeof(u32), NULL, NULL); +#endif g_Inited = true; - } -#endif -} - + } +#endif +} + void TexDecoder_OpenCL_Shutdown() { -#if defined(HAVE_OPENCL) && HAVE_OPENCL +#if defined(HAVE_OPENCL) && HAVE_OPENCL && !defined(DEBUG_OPENCL) if(g_clsrc) clReleaseMemObject(g_clsrc); @@ -257,7 +124,6 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei formatResult = PC_TEX_FMT_I8; break; case GX_TF_IA4: - // Maybe a cleaner way is needed kernelToRun = Decoders[2].kernel; sizeOfSrc = sizeof(u8); sizeOfDst = sizeof(u16); @@ -295,9 +161,10 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei formatResult = PC_TEX_FMT_BGRA32; break; case GX_TF_CMPR: - return PC_TEX_FMT_NONE; // <-- TODO: Fix CMPR + return PC_TEX_FMT_NONE; // Remove to test CMPR kernelToRun = Decoders[7].kernel; - sizeOfSrc = sizeOfDst = sizeof(u32); + sizeOfSrc = sizeof(u8) / 2.0f; + sizeOfDst = sizeof(u32); xSkip = 8; ySkip = 8; formatResult = PC_TEX_FMT_BGRA32; @@ -306,6 +173,11 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei return PC_TEX_FMT_NONE; } +#ifdef DEBUG_OPENCL + g_clsrc = clCreateBuffer(OpenCL::GetContext(), CL_MEM_READ_ONLY , 1024 * 1024 * sizeof(u32), NULL, NULL); + g_cldst = clCreateBuffer(OpenCL::GetContext(), CL_MEM_WRITE_ONLY, 1024 * 1024 * sizeof(u32), NULL, NULL); +#endif + clEnqueueWriteBuffer(OpenCL::GetCommandQueue(), g_clsrc, CL_TRUE, 0, (size_t)(width * height * sizeOfSrc), src, 0, NULL, NULL); clSetKernelArg(kernelToRun, 0, sizeof(cl_mem), &g_cldst); @@ -324,125 +196,22 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei err = clEnqueueNDRangeKernel(OpenCL::GetCommandQueue(), kernelToRun, 2, NULL, global, NULL, 0, NULL, NULL); if(err) - PanicAlert("Error queueing kernel"); + OpenCL::HandleCLError(err, "Failed to enqueue kernel"); clFinish(OpenCL::GetCommandQueue()); clEnqueueReadBuffer(OpenCL::GetCommandQueue(), g_cldst, CL_TRUE, 0, (size_t)(width * height * sizeOfDst), dst, 0, NULL, NULL); - + +#ifdef DEBUG_OPENCL + clReleaseMemObject(g_clsrc); + clReleaseMemObject(g_cldst); +#endif + return formatResult; #else return PC_TEX_FMT_NONE; #endif - /* switch (texformat) - { - case GX_TF_C4: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - for (int y = 0; y < height; y += 8) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 8; iy++, src += 4) - decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr); - } - else - { - for (int y = 0; y < height; y += 8) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 8; iy++, src += 4) - decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr); - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_C8: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 4; iy++, src += 8) - decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr); - } - else - { - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 4; iy++, src += 8) - decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr); - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_C14X2: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 4) - for (int iy = 0; iy < 4; iy++, src += 8) - decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)src, tlutaddr); - } - else - { - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 4) - for (int iy = 0; iy < 4; iy++, src += 8) - decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x, (u16*)src, tlutaddr); - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_RGB565: - { - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 4) - for (int iy = 0; iy < 4; iy++, src += 8) - { - u16 *ptr = (u16 *)dst + (y + iy) * width + x; - u16 *s = (u16 *)src; - for(int j = 0; j < 4; j++) - *ptr++ = Common::swap16(*s++); - } - } - return PC_TEX_FMT_RGB565; - case GX_TF_RGB5A3: - { - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 4) - for (int iy = 0; iy < 4; iy++, src += 8) - //decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4); - decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src); - } - return PC_TEX_FMT_BGRA32; - case GX_TF_RGBA8: // speed critical - { - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 4) - { - for (int iy = 0; iy < 4; iy++) - decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16); - src += 64; - } - } - return PC_TEX_FMT_BGRA32; - case GX_TF_CMPR: // speed critical - // The metroid games use this format almost exclusively. - { - for (int y = 0; y < height; y += 8) - { - for (int x = 0; x < width; x += 8) - { - decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src, width); - src += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src, width); - src += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src, width); - src += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src, width); - src += sizeof(DXTBlock); - } - } - return PC_TEX_FMT_BGRA32; - } - } -*/ - // The "copy" texture formats, too? return PC_TEX_FMT_NONE; } diff --git a/Source/Core/VideoCommon/VideoCommon.vcproj b/Source/Core/VideoCommon/VideoCommon.vcproj index a74ddf1217..0d32a1b96d 100644 --- a/Source/Core/VideoCommon/VideoCommon.vcproj +++ b/Source/Core/VideoCommon/VideoCommon.vcproj @@ -1,7 +1,7 @@ + + @@ -95,6 +101,9 @@ + @@ -160,6 +169,9 @@ + @@ -226,6 +238,9 @@ + @@ -292,6 +307,9 @@ + @@ -353,6 +371,9 @@ + @@ -699,6 +720,10 @@ RelativePath=".\Src\OpenCL\OCLTextureDecoder.h" > + +