diff --git a/Data/User/OpenCL/TextureDecoder.cl b/Data/User/OpenCL/TextureDecoder.cl
new file mode 100644
index 0000000000..052865021c
--- /dev/null
+++ b/Data/User/OpenCL/TextureDecoder.cl
@@ -0,0 +1,252 @@
+// Copyright (C) 2003 Dolphin Project.
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, version 2.0.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License 2.0 for more details.
+
+// A copy of the GPL 2.0 should have been included with the program.
+// If not, see http://www.gnu.org/licenses/
+
+// Official SVN repository and contact information can be found at
+// http://code.google.com/p/dolphin-emu/
+
+kernel void DecodeI4(global uchar *dst,
+                     const global uchar *src, int width)
+{
+    int x = get_global_id(0) * 8, y = get_global_id(1) * 8;
+    int srcOffset = x + y * width / 8;
+    for (int iy = 0; iy < 8; iy++)
+    {
+        uchar4 val = vload4(srcOffset, src);
+        uchar8 res;
+        res.even = (val >> 4) & 0x0F;
+        res.odd = val & 0x0F;
+        res |= res << 4;
+        vstore8(res, 0, dst + ((y + iy)*width + x));
+        srcOffset++;
+    }
+}
+
+kernel void DecodeI8(global uchar *dst,
+                     const global uchar *src, int width)
+{
+    int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
+    int srcOffset = ((x * 4) + (y * width)) / 8;
+    for (int iy = 0; iy < 4; iy++)
+    {
+        vstore8(vload8(srcOffset, src),
+                0, dst + ((y + iy)*width + x));
+        srcOffset++;
+    }
+}
+
+kernel void DecodeIA8(global uchar *dst,
+                      const global uchar *src, int width)
+{
+    int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
+    int srcOffset = ((x * 4) + (y * width)) / 4;
+    for (int iy = 0; iy < 4; iy++)
+    {
+        uchar8 val = vload8(srcOffset, src);
+        uchar8 res;
+        res.odd = val.even;
+        res.even = val.odd;
+        vstore8(res, 0, dst + ((y + iy)*width + x) * 2);
+        srcOffset++;
+    }
+}
+
+kernel void DecodeIA4(global uchar *dst,
+                     const global uchar *src, int width)
+{
+    int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
+    int srcOffset = ((x * 4) + (y * width)) / 8;
+    for (int iy = 0; iy < 4; iy++)
+    {
+        uchar8 val = vload8(srcOffset, src);
+        uchar16 res;
+        res.odd = (val >> 4) & 0x0F;
+        res.even  = val & 0x0F;
+        res |= res << 4;
+        vstore16(res, 0, dst + ((y + iy)*width + x) * 2);
+        srcOffset++;
+    }
+}
+
+kernel void DecodeRGBA8(global uchar *dst,
+                      const global uchar *src, int width)
+{
+    int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
+    int srcOffset = (x * 2) + (y * width) / 2;
+    for (int iy = 0; iy < 4; iy++)
+    {
+        uchar8 ar = vload8(srcOffset, src);
+        uchar8 gb = vload8(srcOffset + 4, src);
+        uchar16 res;
+        res.even.even = gb.odd;
+        res.even.odd = ar.odd;
+        res.odd.even = gb.even;
+        res.odd.odd = ar.even;
+        vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
+        srcOffset++;
+    }
+}
+
+kernel void DecodeRGB565(global ushort *dst,
+                         const global ushort *src, int width)
+{
+    int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
+    int srcOffset = x + (y * width) / 4;
+    for (int iy = 0; iy < 4; iy++)
+    {
+        ushort4 val = vload4(srcOffset, src);
+        val = (val >> 8) | (val << 8);
+        vstore4(val, 0, dst + ((y + iy)*width + x));
+        srcOffset++;
+    }
+}
+
+kernel void DecodeRGB5A3(global uchar *dst,
+                         const global uchar *src, int width)
+{
+    int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
+    int srcOffset = x + (y * width) / 4;
+    for (int iy = 0; iy < 4; iy++)
+    {
+        ushort8 val = convert_ushort8(vload8(srcOffset, src));
+        ushort4 vs = val.odd | (val.even << 8);
+        
+        uchar16 resNoAlpha;
+        resNoAlpha.s26AE = convert_uchar4(vs >> 7); // R
+        resNoAlpha.s159D = convert_uchar4(vs >> 2); // G
+        resNoAlpha.s048C = convert_uchar4(vs << 3); // B
+        resNoAlpha &= 0xF8;        
+        resNoAlpha |= (resNoAlpha >> 5) & 3; // 5 -> 8
+        resNoAlpha.s37BF = (uchar4)(0xFF);
+        
+        uchar16 resAlpha;
+        resAlpha.s26AE = convert_uchar4(vs >> 8); // R
+        resAlpha.s159D = convert_uchar4(vs >> 4); // G
+        resAlpha.s048C = convert_uchar4(vs);      // B
+        resAlpha &= 0x0F;
+        resAlpha |= (resAlpha << 4);
+        resAlpha.s37BF = convert_uchar4(vs >> 7) & 0xE0;
+        resAlpha.s37BF |= ((resAlpha.s37BF >> 3) & 0x1C)
+                            | ((resAlpha.s37BF >> 6) & 0x3);
+        uchar16 choice = (uchar16)((uchar4)(vs.s0 >> 8),
+                                   (uchar4)(vs.s1 >> 8),
+                                   (uchar4)(vs.s2 >> 8),
+                                   (uchar4)(vs.s3 >> 8));
+        uchar16 res;
+        res = select(resAlpha, resNoAlpha, choice);
+        vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
+        srcOffset++;
+    }
+}
+
+uint4 unpack2bits(uchar b)
+{
+    return (uint4)(b >> 6,
+                  (b >> 4) & 3,
+                  (b >> 2) & 3,
+                  b & 3);
+}       
+
+/*
+Lots of debug code there that I'm using to find the problems with CMPR decoding
+I think blocks having no alpha are properly decoded, only the blocks with alpha
+are problematic. This is WIP !
+*/
+kernel void decodeCMPRBlock(global uchar *dst,
+                      const global uchar *src, int width)
+{
+    int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
+    int srcOffset = (x * 2 + y * width / 2) / 8; //x / 4 + y * width / 16; //(x * 4) + (y * width) / 16;
+    uchar8 val = vload8(0, src);
+    ushort2 color565 = (ushort2)((val.s1 & 0xFF) | (val.s0 << 8), (val.s3 & 0xFF) | (val.s2 << 8));
+    uchar8 color32 = convert_uchar8((ushort8)
+         (((color565 << 3) & 0xF8) | ((color565 >>  2) & 0x7),
+          ((color565 >> 3) & 0xFC) | ((color565 >>  9) & 0x3),
+          ((color565 >> 8) & 0xF8) | ((color565 >> 13) & 0x7),
+            0xFF, 0xFF));
+    uint4 colors;
+    uint4 choice = (uint4)((color565.s0 - color565.s1) << 16);
+    uint4 colorNoAlpha;
+    //uchar4 frac = (color32.odd - color32.even) / 2;
+    //frac = frac - (frac / 4);
+    uchar4 frac = convert_uchar4((((convert_ushort4(color32.even) & 0xFF) - (convert_ushort4(color32.odd) & 0xFF)) * 3) / 8);
+    //colorNoAlpha = convert_uint4(frac);
+    colorNoAlpha = convert_uint4(color32.even - frac);
+    colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.odd + frac);
+    colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.odd);
+    colorNoAlpha = (colorNoAlpha << 8) | convert_uint4(color32.even);
+
+    uint4 colorAlpha;
+    //uchar4 midpoint = rhadd(color32.odd, color32.even);
+    uchar4 midpoint = convert_uchar4((convert_ushort4(color32.odd) + convert_ushort4(color32.even) + 1) / 2);
+    midpoint.s3 = 0xFF;
+    //colorAlpha = convert_uint4(color32.odd);
+    colorAlpha = convert_uint4((uchar4)(0, 0, 0, 0));
+    colorAlpha = (colorAlpha << 8) | convert_uint4(midpoint);    
+    colorAlpha = (colorAlpha << 8) | convert_uint4(color32.odd);
+    colorAlpha = (colorAlpha << 8) | convert_uint4(color32.even);
+
+    //colorNoAlpha = (uint4)(0xFFFFFFFF);
+    //colorAlpha = (uint4)(0, 0, 0, 0xFFFFFFFF);
+
+    colors  = select(colorNoAlpha, colorAlpha, choice);
+
+    uint16 colorsFull = (uint16)(colors, colors, colors, colors);
+
+    uint4 shift0 = unpack2bits(val.s4);
+    uint4 shift1 = unpack2bits(val.s5);
+    uint4 shift2 = unpack2bits(val.s6);
+    uint4 shift3 = unpack2bits(val.s7);
+    uint16 shifts =          (uint16)((uint4)(shift3.s0), (uint4)(shift3.s1), (uint4)(shift3.s2), (uint4)(shift3.s3));
+    shifts = (shifts << 8) | (uint16)((uint4)(shift2.s0), (uint4)(shift2.s1), (uint4)(shift2.s2), (uint4)(shift2.s3));
+    shifts = (shifts << 8) | (uint16)((uint4)(shift1.s0), (uint4)(shift1.s1), (uint4)(shift1.s2), (uint4)(shift1.s3));
+    shifts = (shifts << 8) | (uint16)((uint4)(shift0.s0), (uint4)(shift0.s1), (uint4)(shift0.s2), (uint4)(shift0.s3));
+    shifts <<= 3;
+    
+    for (int iy = 0; iy < 4; iy++)
+    {
+        uchar16 res;
+        res = convert_uchar16(colorsFull >> (shifts & 0xFF));
+        shifts >>= 8;
+        //uchar4 t = convert_uchar4((ushort4)(color565.s0 >> 8, color565.s0 & 0xFF, color565.s1 >> 8, color565.s1 & 0xFF));
+        //res = (uchar16)(t, t, t, t);
+        //res = (uchar16)(frac, color32.even - color32.odd, (color32.even - color32.odd) / 2, (color32.even - color32.odd) / 2 - ((color32.even - color32.odd) / 8));
+        //res = (uchar16)(color32.even, color32.odd, frac, convert_uchar4(choice));
+        //res = convert_uchar16((uint16)(colorNoAlpha >> 24, colorNoAlpha >> 16, colorNoAlpha >> 8, colorNoAlpha));
+        //res = convert_uchar16((uint16)(colorAlpha >> 24, colorAlpha >> 16, colorAlpha >> 8, colorAlpha));
+        //res = convert_uchar16((uint16)(colors >> 24, colors >> 16, colors >> 8, colors));
+        //res = convert_uchar16(shifts & 0xFF);
+        //res = convert_uchar16((uint16)(shift0, shift1, shift2, shift3));
+        //res = (uchar16)(((x))) + (uchar16)(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3);
+        //res.lo = val; res.s8 = x >> 8; res.s9 = x; res.sA = (iy + y) >> 8; res.sB = y + iy; res.sC = width >> 8; res.sD = width; res.sE = srcOffset >> 8; res.sF = srcOffset;
+        vstore16(res, 0, dst);
+        dst += width * 4;
+    }
+}
+
+kernel void DecodeCMPR(global uchar *dst,
+                      const global uchar *src, int width)
+{
+    int x = get_global_id(0) * 8, y = get_global_id(1) * 8;
+    
+    src += x * 4 + (y * width) / 2;
+    
+    decodeCMPRBlock(dst + (y * width + x) * 4, src, width);
+    src += 8;
+    decodeCMPRBlock(dst + (y * width + x + 4) * 4, src, width);
+    src += 8;
+    decodeCMPRBlock(dst + ((y + 4) * width + x) * 4, src, width);
+    src += 8;
+    decodeCMPRBlock(dst + ((y + 4) * width + x + 4) * 4, src, width);
+
+}
\ No newline at end of file
diff --git a/Source/Core/Common/Src/OpenCL.cpp b/Source/Core/Common/Src/OpenCL.cpp
index d24aa38972..7ed4c39349 100644
--- a/Source/Core/Common/Src/OpenCL.cpp
+++ b/Source/Core/Common/Src/OpenCL.cpp
@@ -45,7 +45,7 @@ bool Initialize() {
     err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
     if (err != CL_SUCCESS)
     {
-        PanicAlert("Error: Failed to create a device group!");
+        HandleCLError(err, "Failed to create a device group!");
         return false;
     }
   
@@ -54,7 +54,7 @@ bool Initialize() {
     g_context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
     if (!g_context)
     {
-        PanicAlert("Error: Failed to create a compute context!");
+        HandleCLError(err, "Failed to create a compute context!");
         return false;
     }
 
@@ -63,7 +63,7 @@ bool Initialize() {
     g_cmdq = clCreateCommandQueue(g_context, device_id, 0, &err);
     if (!g_cmdq)
     {
-        PanicAlert("Error: Failed to create a command commands!");
+        HandleCLError(err, "Failed to create a command commands!");
         return false;
     }
 	//PanicAlert("Initialized OpenCL fine!");
@@ -88,7 +88,7 @@ cl_program CompileProgram(const char *Kernel) {
 		program = clCreateProgramWithSource(OpenCL::g_context, 1, (const char **) & Kernel, NULL, &err);
 		if (!program)
 		{
-			printf("Error: Failed to create compute program!");
+			HandleCLError(err, "Error: Failed to create compute program!");
 			return NULL;
 		}
 
@@ -114,7 +114,7 @@ cl_kernel CompileKernel(cl_program program, const char *Function)
 		cl_kernel kernel = clCreateKernel(program, Function, &err);
 		if (!kernel || err != CL_SUCCESS)
 		{
-			PanicAlert("Error: Failed to create compute kernel!");
+			HandleCLError(err, "Failed to create compute kernel!");
 			return NULL;
 		} 
 		return kernel;
@@ -130,5 +130,66 @@ void Destroy() {
 #endif
 }
 
+void HandleCLError(cl_int error, char* str)
+{
+    char* name;
+    switch(error)
+    {
+#define CL_ERROR(x) case (x): name = #x; break
+        CL_ERROR(CL_SUCCESS);
+        CL_ERROR(CL_DEVICE_NOT_FOUND);
+        CL_ERROR(CL_DEVICE_NOT_AVAILABLE);
+        CL_ERROR(CL_COMPILER_NOT_AVAILABLE);
+        CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE);
+        CL_ERROR(CL_OUT_OF_RESOURCES);
+        CL_ERROR(CL_OUT_OF_HOST_MEMORY);
+        CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE);
+        CL_ERROR(CL_MEM_COPY_OVERLAP);
+        CL_ERROR(CL_IMAGE_FORMAT_MISMATCH);
+        CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED);
+        CL_ERROR(CL_BUILD_PROGRAM_FAILURE);
+        CL_ERROR(CL_MAP_FAILURE);
+        CL_ERROR(CL_INVALID_VALUE);
+        CL_ERROR(CL_INVALID_DEVICE_TYPE);
+        CL_ERROR(CL_INVALID_PLATFORM);
+        CL_ERROR(CL_INVALID_DEVICE);
+        CL_ERROR(CL_INVALID_CONTEXT);
+        CL_ERROR(CL_INVALID_QUEUE_PROPERTIES);
+        CL_ERROR(CL_INVALID_COMMAND_QUEUE);
+        CL_ERROR(CL_INVALID_HOST_PTR);
+        CL_ERROR(CL_INVALID_MEM_OBJECT);
+        CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+        CL_ERROR(CL_INVALID_IMAGE_SIZE);
+        CL_ERROR(CL_INVALID_SAMPLER);
+        CL_ERROR(CL_INVALID_BINARY);
+        CL_ERROR(CL_INVALID_BUILD_OPTIONS);
+        CL_ERROR(CL_INVALID_PROGRAM);
+        CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE);
+        CL_ERROR(CL_INVALID_KERNEL_NAME);
+        CL_ERROR(CL_INVALID_KERNEL_DEFINITION);
+        CL_ERROR(CL_INVALID_KERNEL);
+        CL_ERROR(CL_INVALID_ARG_INDEX);
+        CL_ERROR(CL_INVALID_ARG_VALUE);
+        CL_ERROR(CL_INVALID_ARG_SIZE);
+        CL_ERROR(CL_INVALID_KERNEL_ARGS);
+        CL_ERROR(CL_INVALID_WORK_DIMENSION);
+        CL_ERROR(CL_INVALID_WORK_GROUP_SIZE);
+        CL_ERROR(CL_INVALID_WORK_ITEM_SIZE);
+        CL_ERROR(CL_INVALID_GLOBAL_OFFSET);
+        CL_ERROR(CL_INVALID_EVENT_WAIT_LIST);
+        CL_ERROR(CL_INVALID_EVENT);
+        CL_ERROR(CL_INVALID_OPERATION);
+        CL_ERROR(CL_INVALID_GL_OBJECT);
+        CL_ERROR(CL_INVALID_BUFFER_SIZE);
+        CL_ERROR(CL_INVALID_MIP_LEVEL);
+#undef CL_ERROR
+    default:
+        name = "Unknown error code";
+    }
+    if(!str)
+        str = "";
+    PanicAlert("OpenCL error: %s %s (%d)", str, name, error);
+
+}
 
 };
diff --git a/Source/Core/Common/Src/OpenCL.h b/Source/Core/Common/Src/OpenCL.h
index 1c7ae2e0e2..6e63ccb1e6 100644
--- a/Source/Core/Common/Src/OpenCL.h
+++ b/Source/Core/Common/Src/OpenCL.h
@@ -60,6 +60,7 @@ void Destroy();
 
 cl_program CompileProgram(const char *Kernel);
 cl_kernel CompileKernel(cl_program program, const char *Function);
+void HandleCLError(cl_int error, char* str = 0);
 
 };
 
diff --git a/Source/Core/VideoCommon/OpenCL.rules b/Source/Core/VideoCommon/OpenCL.rules
new file mode 100644
index 0000000000..0f88d1efc5
--- /dev/null
+++ b/Source/Core/VideoCommon/OpenCL.rules
@@ -0,0 +1,18 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<VisualStudioToolFile
+	Name="OpenCL"
+	Version="8,00"
+	>
+	<Rules>
+		<CustomBuildRule
+			Name="OpenCL source"
+			DisplayName="OpenCL"
+			CommandLine="xcopy /I /Y $(InputPath) $(SolutionDir)\..\Binary\$(PlatformName)\User\OpenCL"
+			Outputs="$(SolutionDir)\..\Binary\$(PlatformName)\User\OpenCL\$(InputFileName)"
+			FileExtensions="*.cl"
+			>
+			<Properties>
+			</Properties>
+		</CustomBuildRule>
+	</Rules>
+</VisualStudioToolFile>
diff --git a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp
index cf8d79c85b..43179d9821 100644
--- a/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp
+++ b/Source/Core/VideoCommon/Src/OpenCL/OCLTextureDecoder.cpp
@@ -18,6 +18,7 @@
 #include "OCLTextureDecoder.h"
 
 #include "OpenCL.h"
+#include "FileUtil.h"
 
 #include <fcntl.h>
 #include <stdio.h>
@@ -26,159 +27,16 @@
 #include <math.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-    
+#include <string>
+
+//#define DEBUG_OPENCL
+
 struct sDecoders
 {
     const char name[256];               // kernel name
     cl_kernel kernel;                   // compute kernel
 };
 
-const char *Kernel = "                                        \n\
-kernel void DecodeI4(global uchar *dst,                       \n\
-                     const global uchar *src, int width)      \n\
-{                                                             \n\
-    int x = get_global_id(0) * 8, y = get_global_id(1) * 8;   \n\
-    int srcOffset = x + y * width / 8;                        \n\
-    for (int iy = 0; iy < 8; iy++)                            \n\
-    {                                                         \n\
-        uchar4 val = vload4(srcOffset, src);                  \n\
-        uchar8 res;                                           \n\
-        res.even = (val >> 4) & 0x0F;                         \n\
-        res.odd = val & 0x0F;                                 \n\
-        res |= res << 4;                                      \n\
-        vstore8(res, 0, dst + ((y + iy)*width + x));          \n\
-        srcOffset++;                                          \n\
-    }                                                         \n\
-}                                                             \n\
-                                                              \n\
-kernel void DecodeI8(global uchar *dst,                       \n\
-                     const global uchar *src, int width)      \n\
-{                                                             \n\
-    int x = get_global_id(0) * 8, y = get_global_id(1) * 4;   \n\
-    int srcOffset = ((x * 4) + (y * width)) / 8;              \n\
-    for (int iy = 0; iy < 4; iy++)                            \n\
-    {                                                         \n\
-        vstore8(vload8(srcOffset, src),                       \n\
-                0, dst + ((y + iy)*width + x));               \n\
-        srcOffset++;                                          \n\
-    }                                                         \n\
-}                                                             \n\
-                                                              \n\
-kernel void DecodeIA8(global uchar *dst,                      \n\
-                      const global uchar *src, int width)     \n\
-{                                                             \n\
-    int x = get_global_id(0) * 4, y = get_global_id(1) * 4;   \n\
-    int srcOffset = ((x * 4) + (y * width)) / 4;              \n\
-    for (int iy = 0; iy < 4; iy++)                            \n\
-    {                                                         \n\
-        uchar8 val = vload8(srcOffset, src);                  \n\
-        uchar8 res;                                           \n\
-        res.odd = val.even;                                   \n\
-        res.even = val.odd;                                   \n\
-        vstore8(res, 0, dst + ((y + iy)*width + x) * 2);      \n\
-        srcOffset++;                                          \n\
-    }                                                         \n\
-}                                                             \n\
-                                                              \n\
-kernel void DecodeIA4(global uchar *dst,                      \n\
-                     const global uchar *src, int width)      \n\
-{                                                             \n\
-    int x = get_global_id(0) * 8, y = get_global_id(1) * 4;   \n\
-    int srcOffset = ((x * 4) + (y * width)) / 8;              \n\
-    for (int iy = 0; iy < 4; iy++)                            \n\
-    {                                                         \n\
-        uchar8 val = vload8(srcOffset, src);                  \n\
-        uchar16 res;                                          \n\
-        res.odd = (val >> 4) & 0x0F;                          \n\
-        res.even  = val & 0x0F;                               \n\
-        res |= res << 4;                                      \n\
-        vstore16(res, 0, dst + ((y + iy)*width + x) * 2);     \n\
-        srcOffset++;                                          \n\
-    }                                                         \n\
-}                                                             \n\
-                                                              \n\
-kernel void DecodeRGBA8(global uchar *dst,                    \n\
-                      const global uchar *src, int width)     \n\
-{                                                             \n\
-    int x = get_global_id(0) * 4, y = get_global_id(1) * 4;   \n\
-    int srcOffset = (x * 2) + (y * width) / 2;                \n\
-    for (int iy = 0; iy < 4; iy++)                            \n\
-    {                                                         \n\
-        uchar8 ar = vload8(srcOffset, src);                   \n\
-        uchar8 gb = vload8(srcOffset + 4, src);               \n\
-        uchar16 res;                                          \n\
-        res.even.even = gb.odd;                               \n\
-        res.even.odd = ar.odd;                                \n\
-        res.odd.even = gb.even;                               \n\
-        res.odd.odd = ar.even;                                \n\
-        vstore16(res, 0, dst + ((y + iy)*width + x) * 4);     \n\
-        srcOffset++;                                          \n\
-    }                                                         \n\
-}                                                             \n\
-                                                              \n\
-kernel void DecodeRGB565(global ushort *dst,                  \n\
-                         const global ushort *src, int width) \n\
-{                                                             \n\
-    int x = get_global_id(0) * 4, y = get_global_id(1) * 4;   \n\
-    int srcOffset = x + (y * width) / 4;                      \n\
-    for (int iy = 0; iy < 4; iy++)                            \n\
-    {                                                         \n\
-        ushort4 val = vload4(srcOffset, src);                 \n\
-        val = (val >> 8) | (val << 8);                        \n\
-        vstore4(val, 0, dst + ((y + iy)*width + x));          \n\
-        srcOffset++;                                          \n\
-    }                                                         \n\
-}                                                             \n\
-                                                              \n\
-kernel void DecodeRGB5A3(global uchar *dst,                   \n\
-                         const global uchar *src, int width)  \n\
-{                                                             \n\
-    int x = get_global_id(0) * 4, y = get_global_id(1) * 4;   \n\
-    int srcOffset = x + (y * width) / 4;                      \n\
-    for (int iy = 0; iy < 4; iy++)                            \n\
-    {                                                         \n\
-        ushort8 val = convert_ushort8(vload8(srcOffset, src));\n\
-        ushort4 vs = val.odd | (val.even << 8);               \n\
-        uchar16 resNoAlpha;                                   \n\
-        resNoAlpha.odd.odd = (uchar4)(0xFF);                  \n\
-        resNoAlpha.even.odd = convert_uchar4(vs >> 7) & 0xF8; \n\
-        resNoAlpha.odd.even = convert_uchar4(vs >> 2) & 0xF8; \n\
-        resNoAlpha.even.even = convert_uchar4(vs << 3) & 0xF8;\n\
-        // Better but cause color bleeding                    \n\
-        //resNoAlpha |= resNoAlpha >> 5;                      \n\
-        uchar16 resAlpha;                                     \n\
-        resAlpha.even.odd = convert_uchar4(vs >> 8) & 0x0F;   \n\
-        resAlpha.odd.even = convert_uchar4(vs >> 4) & 0x0F;   \n\
-        resAlpha.even.even = convert_uchar4(vs) & 0x0F;       \n\
-        resAlpha |= resNoAlpha << 4;                          \n\
-        resAlpha.odd.odd = convert_uchar4(vs >> 7) & 0xE0;    \n\
-        resAlpha.odd.odd |= (resAlpha.odd.odd >> 3)           \n\
-                            | (resAlpha.odd.odd >> 6);        \n\
-        uchar16 choice = (uchar16)((uchar4)(vs.s0 >> 8),      \n\
-                                   (uchar4)(vs.s1 >> 8),      \n\
-                                   (uchar4)(vs.s2 >> 8),      \n\
-                                   (uchar4)(vs.s3 >> 8));     \n\
-        uchar16 res;                                          \n\
-        res = select(resAlpha, resNoAlpha, choice);           \n\
-        vstore16(res, 0, dst + ((y + iy)*width + x) * 4);     \n\
-        srcOffset++;                                          \n\
-    }                                                         \n\
-}                                                             \n\
-                                                              \n\
-kernel void DecodeDXT(global ulong *dst,                      \n\
-                      const global ulong *src, int width)     \n\
-{   // TODO: PLEASE NOTE THAT THIS CODE DOES NOT WORK         \n\
-    int x = get_global_id(0) * 8, y = get_global_id(1) * 8;   \n\
-    int srcOffset = ((x * 4) + (y * width)) / 8;              \n\
-    for (int iy = 0; iy < 4; iy++)                            \n\
-    {                                                         \n\
-        vstore8(vload8(srcOffset, src),                       \n\
-                0, dst + ((y + iy)*width + x));               \n\
-        srcOffset++;                                          \n\
-    }                                                         \n\
-}                                                             \n\
-";
-
 cl_program g_program;
 // NULL terminated set of kernels
 sDecoders Decoders[] = {
@@ -189,22 +47,29 @@ sDecoders Decoders[] = {
 {"DecodeRGBA8", NULL},
 {"DecodeRGB565", NULL},
 {"DecodeRGB5A3", NULL},
-{"DecodeDXT", NULL},
+{"DecodeCMPR", NULL},
 {"", NULL},
 };
 
 bool g_Inited = false;
 cl_mem g_clsrc, g_cldst;                    // texture buffer memory objects
 
-void TexDecoder_OpenCL_Initialize() {
-#if defined(HAVE_OPENCL) && HAVE_OPENCL
+void TexDecoder_OpenCL_Initialize() {
+#if defined(HAVE_OPENCL) && HAVE_OPENCL
 	if(!g_Inited)
     {
 		if(!OpenCL::Initialize())
 			return;
 
+        std::string code;
+        char* filename = "User/OpenCL/TextureDecoder.cl";
+		if (!File::ReadFileToString(true, filename, code))
+		{
+			ERROR_LOG(VIDEO, "Failed to load OpenCL code %s - file is missing?", filename);
+            return;
+		}
 		
-		g_program = OpenCL::CompileProgram(Kernel);
+		g_program = OpenCL::CompileProgram(code.c_str());
 
 		int i = 0;
 		while(strlen(Decoders[i].name) > 0) {
@@ -213,16 +78,18 @@ void TexDecoder_OpenCL_Initialize() {
 		}
 
 		// Allocating maximal Wii texture size in advance, so that we don't have to allocate/deallocate per texture
+#ifndef DEBUG_OPENCL
 		g_clsrc = clCreateBuffer(OpenCL::GetContext(), CL_MEM_READ_ONLY , 1024 * 1024 * sizeof(u32), NULL, NULL);
 		g_cldst = clCreateBuffer(OpenCL::GetContext(), CL_MEM_WRITE_ONLY, 1024 * 1024 * sizeof(u32), NULL, NULL);
+#endif
 
 		g_Inited = true;
-	}
-#endif
-}
-
+	}
+#endif
+}
+
 void TexDecoder_OpenCL_Shutdown() {
-#if defined(HAVE_OPENCL) && HAVE_OPENCL
+#if defined(HAVE_OPENCL) && HAVE_OPENCL && !defined(DEBUG_OPENCL)
 	if(g_clsrc)
 		clReleaseMemObject(g_clsrc);
 
@@ -257,7 +124,6 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei
 			formatResult = PC_TEX_FMT_I8;
 			break;
 		case GX_TF_IA4:
-			// Maybe a cleaner way is needed
 			kernelToRun = Decoders[2].kernel;
 			sizeOfSrc = sizeof(u8);
 			sizeOfDst = sizeof(u16);
@@ -295,9 +161,10 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei
 			formatResult = PC_TEX_FMT_BGRA32;
 			break;
 		case GX_TF_CMPR:
-			return PC_TEX_FMT_NONE; // <-- TODO: Fix CMPR
+			return PC_TEX_FMT_NONE; // Remove to test CMPR
 			kernelToRun = Decoders[7].kernel;
-			sizeOfSrc = sizeOfDst = sizeof(u32);
+			sizeOfSrc = sizeof(u8) / 2.0f;
+            sizeOfDst = sizeof(u32);
 			xSkip = 8;
 			ySkip = 8;
 			formatResult = PC_TEX_FMT_BGRA32;
@@ -306,6 +173,11 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei
 			return PC_TEX_FMT_NONE;
 	}
 
+#ifdef DEBUG_OPENCL
+	g_clsrc = clCreateBuffer(OpenCL::GetContext(), CL_MEM_READ_ONLY , 1024 * 1024 * sizeof(u32), NULL, NULL);
+	g_cldst = clCreateBuffer(OpenCL::GetContext(), CL_MEM_WRITE_ONLY, 1024 * 1024 * sizeof(u32), NULL, NULL);
+#endif
+
 	clEnqueueWriteBuffer(OpenCL::GetCommandQueue(), g_clsrc, CL_TRUE, 0, (size_t)(width * height * sizeOfSrc), src, 0, NULL, NULL);
 
 	clSetKernelArg(kernelToRun, 0, sizeof(cl_mem), &g_cldst);
@@ -324,125 +196,22 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei
 
 	err = clEnqueueNDRangeKernel(OpenCL::GetCommandQueue(), kernelToRun, 2, NULL, global, NULL, 0, NULL, NULL);
 	if(err)
-		PanicAlert("Error queueing kernel");
+        OpenCL::HandleCLError(err, "Failed to enqueue kernel");
 
 	clFinish(OpenCL::GetCommandQueue());
 	
 	clEnqueueReadBuffer(OpenCL::GetCommandQueue(), g_cldst, CL_TRUE, 0, (size_t)(width * height * sizeOfDst), dst, 0, NULL, NULL);
-	
+
+#ifdef DEBUG_OPENCL
+	clReleaseMemObject(g_clsrc);
+	clReleaseMemObject(g_cldst);
+#endif
+
 	return formatResult;
 #else
 	return PC_TEX_FMT_NONE;
 #endif
 
-  /*  switch (texformat)
-    {
-    case GX_TF_C4:
-		if (tlutfmt == 2)
-        {
-			// Special decoding is required for TLUT format 5A3
-            for (int y = 0; y < height; y += 8)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 8; iy++, src += 4)
-                        decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr);
-        }
-		else
-		{
-            for (int y = 0; y < height; y += 8)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 8; iy++, src += 4)
-                        decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
-		}
-        return GetPCFormatFromTLUTFormat(tlutfmt);
-    case GX_TF_C8:
-		if (tlutfmt == 2)
-        {
-			// Special decoding is required for TLUT format 5A3
-            for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src, tlutaddr);
-        }
-		else
-		{
-            for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr);
-		}
-        return GetPCFormatFromTLUTFormat(tlutfmt);
-    case GX_TF_C14X2: 
-		if (tlutfmt == 2)
-        {
-			// Special decoding is required for TLUT format 5A3
-            for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)src, tlutaddr);
-        }
-		else
-		{
-            for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x, (u16*)src, tlutaddr);
-		}
-		return GetPCFormatFromTLUTFormat(tlutfmt);
-    case GX_TF_RGB565:
-		{
-			for (int y = 0; y < height; y += 4)
-				for (int x = 0; x < width; x += 4)
-					for (int iy = 0; iy < 4; iy++, src += 8)
-					{
-						u16 *ptr = (u16 *)dst + (y + iy) * width + x;
-						u16 *s = (u16 *)src;
-						for(int j = 0; j < 4; j++)
-							*ptr++ = Common::swap16(*s++);
-					}
-		}
-		return PC_TEX_FMT_RGB565;
-    case GX_TF_RGB5A3:
-        {
-            for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        //decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4);
-                        decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src);
-        }
-        return PC_TEX_FMT_BGRA32;
-    case GX_TF_RGBA8:  // speed critical
-        {
-			for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 4)
-                {
-					for (int iy = 0; iy < 4; iy++)
-                        decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16);
-					src += 64;
-                }
-        }
-        return PC_TEX_FMT_BGRA32;
-    case GX_TF_CMPR:  // speed critical
-        // The metroid games use this format almost exclusively.
-		{
-			for (int y = 0; y < height; y += 8)
-			{
-                for (int x = 0; x < width; x += 8)
-                {
-                    decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src, width);
-                                        src += sizeof(DXTBlock);
-                    decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src, width);
-                                        src += sizeof(DXTBlock);
-                    decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src, width);
-                                        src += sizeof(DXTBlock);
-                    decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src, width);
-                                        src += sizeof(DXTBlock);
-                }
-			}
-			return PC_TEX_FMT_BGRA32;
-		}
-    }
-*/
-	// The "copy" texture formats, too?
     return PC_TEX_FMT_NONE;
 }
 
diff --git a/Source/Core/VideoCommon/VideoCommon.vcproj b/Source/Core/VideoCommon/VideoCommon.vcproj
index a74ddf1217..0d32a1b96d 100644
--- a/Source/Core/VideoCommon/VideoCommon.vcproj
+++ b/Source/Core/VideoCommon/VideoCommon.vcproj
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="Windows-1252"?>
 <VisualStudioProject
 	ProjectType="Visual C++"
-	Version="9.00"
+	Version="9,00"
 	Name="VideoCommon"
 	ProjectGUID="{E5D1F0C0-AA07-4841-A4EB-4CF4DAA6B0FA}"
 	RootNamespace="VideoCommon"
@@ -17,6 +17,9 @@
 		/>
 	</Platforms>
 	<ToolFiles>
+		<ToolFile
+			RelativePath=".\OpenCL.rules"
+		/>
 	</ToolFiles>
 	<Configurations>
 		<Configuration
@@ -32,6 +35,9 @@
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
+			<Tool
+				Name="OpenCL source"
+			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
@@ -95,6 +101,9 @@
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
+			<Tool
+				Name="OpenCL source"
+			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
@@ -160,6 +169,9 @@
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
+			<Tool
+				Name="OpenCL source"
+			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
@@ -226,6 +238,9 @@
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
+			<Tool
+				Name="OpenCL source"
+			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
@@ -292,6 +307,9 @@
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
+			<Tool
+				Name="OpenCL source"
+			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
@@ -353,6 +371,9 @@
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
+			<Tool
+				Name="OpenCL source"
+			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
@@ -699,6 +720,10 @@
 					RelativePath=".\Src\OpenCL\OCLTextureDecoder.h"
 					>
 				</File>
+				<File
+					RelativePath="..\..\..\Data\User\OpenCL\TextureDecoder.cl"
+					>
+				</File>
 			</Filter>
 		</Filter>
 		<File