mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-25 15:31:17 +01:00
963ece2017
Stream 2.2 is minimum requirement for OpenCL 1.1 and binary kernels as well (I hope to implement these soon).Remove unnecessary casting bloat. Since AMD Stream 2.2 this is no longer a requirement. Sidenote: Stream 2.2/Catalyst 10.7b is minimum requirement for OpenCL 1.1 and binary kernels as well. I hope to implement these soon. For the latest ATI Drivers and SDK: http://support.amd.com/us/kbarticles/Pages/OpenCL11ATICat107UpdateDriver.aspx http://support.amd.com/us/kbarticles/Pages/OpenCL11ATICat107UpdateDriver.aspx git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6102 8ced0084-cf51-0410-be5f-012b33b47a6e
359 lines
13 KiB
Common Lisp
359 lines
13 KiB
Common Lisp
// Copyright (C) 2003 Dolphin Project.
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation, version 2.0.
|
|
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License 2.0 for more details.
|
|
|
|
// A copy of the GPL 2.0 should have been included with the program.
|
|
// If not, see http://www.gnu.org/licenses/
|
|
|
|
// Official SVN repository and contact information can be found at
|
|
// http://code.google.com/p/dolphin-emu/
|
|
|
|
kernel void DecodeI4(global uchar *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 8, y = get_global_id(1) * 8;
|
|
int srcOffset = x + y * width / 8;
|
|
for (int iy = 0; iy < 8; iy++)
|
|
{
|
|
uchar4 val = vload4(srcOffset, src);
|
|
uchar8 res;
|
|
res.even = (val >> 4) & 0x0F;
|
|
res.odd = val & 0x0F;
|
|
res |= res << 4;
|
|
vstore8(res, 0, dst + ((y + iy)*width + x));
|
|
srcOffset++;
|
|
}
|
|
}
|
|
|
|
kernel void DecodeI4_RGBA(global uint *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 8, y = get_global_id(1) * 8;
|
|
int srcOffset = x + y * width / 8;
|
|
for (int iy = 0; iy < 8; iy++)
|
|
{
|
|
uchar4 val = vload4(srcOffset, src);
|
|
uchar8 res;
|
|
res.even = (val >> 4) & 0x0F;
|
|
res.odd = val & 0x0F;
|
|
res |= res << 4;
|
|
vstore8(upsample(upsample(res,res),upsample(res,res)), 0, dst + ((y + iy)*width + x));
|
|
srcOffset++;
|
|
}
|
|
}
|
|
|
|
kernel void DecodeI8(global uchar *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
|
|
int srcOffset = ((x * 4) + (y * width)) / 8;
|
|
for (int iy = 0; iy < 4; iy++)
|
|
{
|
|
vstore8(vload8(srcOffset++, src),
|
|
0, dst + ((y + iy)*width + x));
|
|
}
|
|
}
|
|
|
|
kernel void DecodeI8_RGBA(global uint *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
|
|
int srcOffset = ((x * 4) + (y * width)) / 8;
|
|
for (int iy = 0; iy < 4; iy++)
|
|
{
|
|
uchar8 val = vload8(srcOffset++, src);
|
|
vstore8(upsample(upsample(val,val),upsample(val,val)),
|
|
0, dst + ((y + iy)*width + x));
|
|
}
|
|
}
|
|
|
|
kernel void DecodeIA8(global ushort *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
|
int srcOffset = ((x * 4) + (y * width)) / 4;
|
|
for (int iy = 0; iy < 4; iy++)
|
|
{
|
|
uchar8 val = vload8(srcOffset++, src);
|
|
vstore4(upsample(val.even, val.odd), 0, dst + ((y + iy)*width + x));
|
|
}
|
|
}
|
|
|
|
kernel void DecodeIA8_RGBA(global uint *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
|
int srcOffset = ((x * 4) + (y * width)) / 4;
|
|
for (int iy = 0; iy < 4; iy++)
|
|
{
|
|
uchar8 val = vload8(srcOffset++, src);
|
|
vstore4(upsample(upsample(val.even,val.odd),upsample(val.odd, val.odd)), 0, dst + ((y + iy)*width + x));
|
|
}
|
|
}
|
|
|
|
kernel void DecodeIA4(global ushort *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
|
|
int srcOffset = ((x * 4) + (y * width)) / 8;
|
|
uchar8 val;
|
|
ushort8 res;
|
|
for (int iy = 0; iy < 4; iy++)
|
|
{
|
|
val = vload8(srcOffset++, src);
|
|
res = upsample(val >> 4, val & 0xF);
|
|
res |= res << 4;
|
|
vstore8(res, 0, dst + y*width + x);
|
|
dst+=width;
|
|
}
|
|
}
|
|
|
|
kernel void DecodeIA4_RGBA(global uint *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 8, y = get_global_id(1) * 4;
|
|
int srcOffset = ((x * 4) + (y * width)) / 8;
|
|
uchar8 val;
|
|
uint8 res;
|
|
for (int iy = 0; iy < 4; iy++)
|
|
{
|
|
val = vload8(srcOffset++, src);
|
|
uchar8 a = val >> 4;
|
|
uchar8 l = val & 0xF;
|
|
res = upsample(upsample(a, l), upsample(l,l));
|
|
res |= res << 4;
|
|
vstore8(res, 0, dst + y*width + x);
|
|
dst+=width;
|
|
}
|
|
}
|
|
|
|
kernel void DecodeRGBA8(global ushort *dst,
|
|
const global ushort *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
|
int srcOffset = (x * 2) + (y * width) / 2;
|
|
for (int iy = 0; iy < 4; iy++)
|
|
{
|
|
ushort8 val = (ushort8)(vload4(srcOffset, src), vload4(srcOffset + 4, src));
|
|
ushort8 bgra = rotate(val,8).s40516273;
|
|
vstore8(bgra, 0, dst + ((y + iy)*width + x) * 2);
|
|
srcOffset++;
|
|
}
|
|
}
|
|
|
|
kernel void DecodeRGBA8_RGBA(global uchar *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
|
int srcOffset = (x * 2) + (y * width) / 2;
|
|
for (int iy = 0; iy < 4; iy++)
|
|
{
|
|
uchar8 ar = vload8(srcOffset, src);
|
|
uchar8 gb = vload8(srcOffset + 4, src);
|
|
uchar16 res;
|
|
res.even.even = ar.odd;
|
|
res.even.odd = gb.odd;
|
|
res.odd.even = gb.even;
|
|
res.odd.odd = ar.even;
|
|
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
|
|
srcOffset++;
|
|
}
|
|
}
|
|
|
|
kernel void DecodeRGB565(global ushort *dst,
|
|
const global ushort *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
|
int srcOffset = x + (y * width) / 4;
|
|
dst += width*y + x;
|
|
for (int iy = 0; iy < 4; iy++)
|
|
{
|
|
vstore4(rotate(vload4(srcOffset++, src),8), 0, dst + iy*width);
|
|
}
|
|
}
|
|
|
|
kernel void DecodeRGB565_RGBA(global uchar *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
|
int srcOffset = x + (y * width) / 4;
|
|
for (int iy = 0; iy < 4; iy++)
|
|
{
|
|
uchar8 val = vload8(srcOffset++, src);
|
|
|
|
uchar16 res;
|
|
res.even.even = bitselect(val.even, val.even >> 5, 7);
|
|
res.odd.even = bitselect((val.odd >> 3) | (val.even << 5), val.even >> 1, 3);
|
|
res.even.odd = bitselect(val.odd << 3, val.odd >> 2, 7);
|
|
res.odd.odd = 0xFF;
|
|
|
|
vstore16(res, 0, dst + ((y + iy)*width + x) * 4);
|
|
}
|
|
}
|
|
|
|
kernel void DecodeRGB5A3(global uchar *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
|
int srcOffset = x + (y * width) / 4;
|
|
uchar8 val;
|
|
uchar16 resNoAlpha, resAlpha, choice;
|
|
#define iterateRGB5A3() \
|
|
val = vload8(srcOffset++, src); \
|
|
resNoAlpha.s26AE = val.even << 1; \
|
|
resNoAlpha.s159D = val.even << 6 | val.odd >> 2; \
|
|
resNoAlpha.s048C = val.odd << 3; \
|
|
resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> 5, 0x3); \
|
|
resNoAlpha.s37BF = 0xFF; \
|
|
resAlpha.s26AE = bitselect(val.even << 4, val.even, 0xF); \
|
|
resAlpha.s159D = bitselect(val.odd, val.odd >> 4, 0xF); \
|
|
resAlpha.s048C = bitselect(val.odd << 4, val.odd, 0xF); \
|
|
resAlpha.s37BF = bitselect(val.even << 1, val.even >> 2, 0x1C); \
|
|
resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> 5, 0x3); \
|
|
choice = (uchar16)((uchar4)(val.even.s0), \
|
|
(uchar4)(val.even.s1), \
|
|
(uchar4)(val.even.s2), \
|
|
(uchar4)(val.even.s3)); \
|
|
vstore16(select(resAlpha, resNoAlpha, choice), 0, dst + (y * width + x) * 4);
|
|
iterateRGB5A3(); dst += width*4;
|
|
iterateRGB5A3(); dst += width*4;
|
|
iterateRGB5A3(); dst += width*4;
|
|
iterateRGB5A3();
|
|
}
|
|
|
|
kernel void DecodeRGB5A3_RGBA(global uchar *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
|
int srcOffset = x + (y * width) / 4;
|
|
uchar8 val;
|
|
uchar16 resNoAlpha, resAlpha, choice;
|
|
#define iterateRGB5A3_RGBA() \
|
|
val = vload8(srcOffset++, src); \
|
|
resNoAlpha.s048C = val.even << 1; \
|
|
resNoAlpha.s159D = val.even << 6 | val.odd >> 2; \
|
|
resNoAlpha.s26AE = val.odd << 3; \
|
|
resNoAlpha = bitselect(resNoAlpha, resNoAlpha >> 5, 0x3); \
|
|
resNoAlpha.s37BF = 0xFF; \
|
|
resAlpha.s048C = bitselect(val.even << 4, val.even, 0xF); \
|
|
resAlpha.s159D = bitselect(val.odd, val.odd >> 4, 0xF); \
|
|
resAlpha.s26AE = bitselect(val.odd << 4, val.odd, 0xF); \
|
|
resAlpha.s37BF = bitselect(val.even << 1, val.even >> 2, 0x1C); \
|
|
resAlpha.s37BF = bitselect(resAlpha.s37BF, val.even >> 5, 0x3); \
|
|
choice = (uchar16)((uchar4)(val.even.s0), \
|
|
(uchar4)(val.even.s1), \
|
|
(uchar4)(val.even.s2), \
|
|
(uchar4)(val.even.s3)); \
|
|
vstore16(select(resAlpha, resNoAlpha, choice), 0, dst + (y * width + x) * 4);
|
|
iterateRGB5A3_RGBA(); dst += width*4;
|
|
iterateRGB5A3_RGBA(); dst += width*4;
|
|
iterateRGB5A3_RGBA(); dst += width*4;
|
|
iterateRGB5A3_RGBA();
|
|
}
|
|
|
|
uint16 unpack(uchar b)
|
|
{
|
|
return (uint16)((uint4)(b >> 3 & 0x18),
|
|
(uint4)(b >> 1 & 0x18),
|
|
(uint4)(b << 1 & 0x18),
|
|
(uint4)(b << 3 & 0x18));
|
|
}
|
|
|
|
kernel void decodeCMPRBlock(global uchar *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
|
uchar8 val = vload8(0, src);
|
|
|
|
uchar2 colora565 = (uchar2)(val.s1, val.s3);
|
|
uchar2 colorb565 = (uchar2)(val.s0, val.s2);
|
|
uchar8 color32 = (uchar8)(bitselect(colora565 << 3, colora565 >> 2, 7),
|
|
bitselect((colora565 >> 3) | (colorb565 << 5), colorb565 >> 1, 3),
|
|
bitselect(colorb565, colorb565 >> 5, 7),
|
|
(uchar2)0xFF);
|
|
|
|
ushort4 frac2 = convert_ushort4(color32.even) - convert_ushort4(color32.odd);
|
|
uchar4 frac = convert_uchar4((frac2 * 3) / 8);
|
|
|
|
ushort4 colorAlpha = upsample((uchar4)(color32.even.s0,color32.even.s1,color32.even.s2,0),
|
|
rhadd(color32.odd, color32.even));
|
|
colorAlpha.s3 = 0xFF;
|
|
ushort4 colorNoAlpha = upsample(color32.odd + frac, color32.even - frac);
|
|
|
|
uint4 colors = upsample((upsample(val.s0,val.s1) > upsample(val.s2,val.s3))?colorNoAlpha:colorAlpha,
|
|
upsample(color32.odd, color32.even));
|
|
|
|
uint16 colorsFull = (uint16)(colors, colors, colors, colors);
|
|
|
|
vstore16(convert_uchar16(colorsFull >> unpack(val.s4)), 0, dst);
|
|
vstore16(convert_uchar16(colorsFull >> unpack(val.s5)), 0, dst+=width*4);
|
|
vstore16(convert_uchar16(colorsFull >> unpack(val.s6)), 0, dst+=width*4);
|
|
vstore16(convert_uchar16(colorsFull >> unpack(val.s7)), 0, dst+=width*4);
|
|
}
|
|
|
|
kernel void DecodeCMPR(global uchar *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 8, y = get_global_id(1) * 8;
|
|
|
|
src += x * 4 + (y * width) / 2;
|
|
dst += (y * width + x) * 4;
|
|
|
|
decodeCMPRBlock(dst, src, width); src += 8;
|
|
decodeCMPRBlock(dst + 16, src, width); src += 8;
|
|
decodeCMPRBlock(dst + 16 * width, src, width); src += 8;
|
|
decodeCMPRBlock(dst + 16 * (width + 1), src, width);
|
|
}
|
|
|
|
kernel void decodeCMPRBlock_RGBA(global uchar *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
|
|
uchar8 val = vload8(0, src);
|
|
|
|
uchar2 colora565 = (uchar2)(val.s1, val.s3);
|
|
uchar2 colorb565 = (uchar2)(val.s0, val.s2);
|
|
uchar8 color32 = (uchar8)(bitselect(colorb565, colorb565 >> 5, 7),
|
|
bitselect((colora565 >> 3) | (colorb565 << 5), colorb565 >> 1, 3),
|
|
bitselect(colora565 << 3, colora565 >> 2, 7),
|
|
(uchar2)0xFF);
|
|
|
|
ushort4 frac2 = convert_ushort4(color32.even) - convert_ushort4(color32.odd);
|
|
uchar4 frac = convert_uchar4((frac2 * 3) / 8);
|
|
|
|
ushort4 colorAlpha = upsample((uchar4)(color32.even.s0,color32.even.s1,color32.even.s2,0),
|
|
rhadd(color32.odd, color32.even));
|
|
colorAlpha.s3 = 0xFF;
|
|
ushort4 colorNoAlpha = upsample(color32.odd + frac, color32.even - frac);
|
|
|
|
uint4 colors = upsample((upsample(val.s0,val.s1) > upsample(val.s2,val.s3))?colorNoAlpha:colorAlpha,
|
|
upsample(color32.odd, color32.even));
|
|
|
|
uint16 colorsFull = (uint16)(colors, colors, colors, colors);
|
|
|
|
vstore16(convert_uchar16(colorsFull >> unpack(val.s4)), 0, dst);
|
|
vstore16(convert_uchar16(colorsFull >> unpack(val.s5)), 0, dst+=width*4);
|
|
vstore16(convert_uchar16(colorsFull >> unpack(val.s6)), 0, dst+=width*4);
|
|
vstore16(convert_uchar16(colorsFull >> unpack(val.s7)), 0, dst+=width*4);
|
|
}
|
|
|
|
kernel void DecodeCMPR_RGBA(global uchar *dst,
|
|
const global uchar *src, int width)
|
|
{
|
|
int x = get_global_id(0) * 8, y = get_global_id(1) * 8;
|
|
|
|
src += x * 4 + (y * width) / 2;
|
|
dst += (y * width + x) * 4;
|
|
|
|
decodeCMPRBlock_RGBA(dst, src, width); src += 8;
|
|
decodeCMPRBlock_RGBA(dst + 16, src, width); src += 8;
|
|
decodeCMPRBlock_RGBA(dst + 16 * width, src, width); src += 8;
|
|
decodeCMPRBlock_RGBA(dst + 16 * (width + 1), src, width);
|
|
} |