Some more OCL changes : Gives 10x speedup for RGB5A3 on pre-DX11 hardware. Minor speedup for CMPR. (code by xsacha)

plus a segfault fix for issue 2779

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5751 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
luisr142004 2010-06-20 05:02:26 +00:00
parent 6e83fe2416
commit be3c06f326
2 changed files with 69 additions and 68 deletions

View File

@ -112,36 +112,34 @@ kernel void DecodeRGB5A3(global uchar *dst,
{ {
int x = get_global_id(0) * 4, y = get_global_id(1) * 4; int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
int srcOffset = x + (y * width) / 4; int srcOffset = x + (y * width) / 4;
for (int iy = 0; iy < 4; iy++) uchar8 val;
{ uchar16 resNoAlpha, resAlpha, res, choice;
uchar8 val = vload8(srcOffset++, src); uchar4 gNoAlpha, aAlpha;
ushort4 vs = upsample(val.even, val.odd); #define iterateRGB5A3() \
val = vload8(srcOffset++, src); \
uchar16 resNoAlpha; gNoAlpha = (val.even << (uchar4)6) | (val.odd >> (uchar4)2); \
resNoAlpha.s26AE = (uchar4)(vs >> (ushort4)7); // R resNoAlpha.s26AE = bitselect(val.even >> (uchar4)4, val.even << (uchar4)1, (uchar4)0xFFF); \
resNoAlpha.s159D = (uchar4)(vs >> (ushort4)2); // G resNoAlpha.s159D = bitselect(gNoAlpha >> (uchar4)5, gNoAlpha, (uchar4)0xFFF); \
resNoAlpha.s048C = (uchar4)(vs << (ushort4)3); // B resNoAlpha.s048C = bitselect(val.odd >> (uchar4)2, val.odd << (uchar4)3, (uchar4)0xFFF); \
resNoAlpha &= (uchar16)0xF8; resNoAlpha.s37BF = (uchar4)(0xFF); \
resNoAlpha |= (uchar16)(resNoAlpha >> (uchar16)5); // 5 -> 8 resAlpha.s26AE = val.even; \
resNoAlpha.s37BF = (uchar4)(0xFF); resAlpha.s159D = val.odd >> (uchar4)4; \
resAlpha.s048C = val.odd; \
uchar16 resAlpha; resAlpha &= (uchar16)0x0F; \
resAlpha.s26AE = val.even; // R resAlpha |= (resAlpha << (uchar16)4); \
resAlpha.s159D = val.odd >> (uchar4)4; // G resAlpha.s37BF = val.even << (uchar4)1 & (uchar4)0xE0; \
resAlpha.s048C = val.odd; // B resAlpha.s37BF |= ((resAlpha.s37BF >> (uchar4)3) & (uchar4)0x1C) \
resAlpha &= (uchar16)0x0F; | ((resAlpha.s37BF >> (uchar4)6) & (uchar4)0x3); \
resAlpha |= (resAlpha << (uchar16)4); choice = (uchar16)((uchar4)(val.even.s0), \
resAlpha.s37BF = convert_uchar4(vs >> (ushort4)7) & (uchar4)0xE0; (uchar4)(val.even.s1), \
resAlpha.s37BF |= ((resAlpha.s37BF >> (uchar4)3) & (uchar4)0x1C) (uchar4)(val.even.s2), \
| ((resAlpha.s37BF >> (uchar4)6) & (uchar4)0x3); (uchar4)(val.even.s3)); \
uchar16 choice = (uchar16)((uchar4)(val.even.s0), vstore16(select(resAlpha, resNoAlpha, choice), 0, dst + (y * width + x) * 4); \
(uchar4)(val.even.s1), dst += width*4; // This may look ugly but unrolling loops is required for pre-DX11 hardware.
(uchar4)(val.even.s2), iterateRGB5A3();
(uchar4)(val.even.s3)); iterateRGB5A3();
uchar16 res; iterateRGB5A3();
res = select(resAlpha, resNoAlpha, choice); iterateRGB5A3();
vstore16(res, 0, dst + ((y + iy) * width + x) * 4);
}
} }
uint16 unpack(uchar b) uint16 unpack(uchar b)
@ -158,11 +156,11 @@ kernel void decodeCMPRBlock(global uchar *dst,
int x = get_global_id(0) * 4, y = get_global_id(1) * 4; int x = get_global_id(0) * 4, y = get_global_id(1) * 4;
uchar8 val = vload8(0, src); uchar8 val = vload8(0, src);
uchar2 colora565 = (uchar2)(val.s1, val.s3); uchar2 colora565 = (uchar2)(val.s1, val.s3);
uchar2 colorb565 = (uchar2)(val.s0, val.s2); uchar2 colorb565 = (uchar2)(val.s0, val.s2);
uchar8 color32 = (uchar8)((colora565 << (uchar2)3) | (colora565 >> (uchar2)2 & (uchar2)7), uchar8 color32 = (uchar8)(bitselect(colora565 << (uchar2)3, colora565 >> (uchar2)2, (uchar2)0xFFFFF000),
(colora565 >> (uchar2)3) | (colorb565 << (uchar2)5) | (colorb565 >> (uchar2)1 & (uchar2)3), colora565 >> (uchar2)3 | bitselect(colorb565 << (uchar2)5, colorb565 >> (uchar2)1, (uchar2)0xFFFFFF00),
(colorb565 & (uchar2)0xF8) | (colorb565 >> (uchar2)5 & (uchar2)7), bitselect(colorb565, colorb565 >> (uchar2)5, (uchar2)0xFFFFF000),
(uchar2)0xFF); (uchar2)0xFF);
uint4 colors; uint4 colors;
uint4 colorNoAlpha; uint4 colorNoAlpha;
@ -174,29 +172,29 @@ kernel void decodeCMPRBlock(global uchar *dst,
colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even); colorNoAlpha = (colorNoAlpha << (uint4)8) | convert_uint4(color32.even);
uint4 colorAlpha; uint4 colorAlpha;
uchar4 midpoint = convert_uchar4((convert_ushort4(color32.odd) + convert_ushort4(color32.even) + (ushort4)1) / (ushort4)2); uchar4 midpoint = rhadd(color32.odd, color32.even);
midpoint.s3 = 0xFF; midpoint.s3 = 0xFF;
colorAlpha = convert_uint4(midpoint); colorAlpha = convert_uint4(midpoint);
colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.odd); colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.odd);
colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.even); colorAlpha = (colorAlpha << (uint4)8) | convert_uint4(color32.even);
uint4 choice = isgreater(upsample(val.s0,val.s1),upsample(val.s2, val.s3)); uint4 choice = isgreater(upsample(val.s0,val.s1),upsample(val.s2, val.s3));
colors = select(colorNoAlpha, colorAlpha, choice); colors = bitselect(colorNoAlpha, colorAlpha, choice);
uint16 colorsFull = (uint16)(colors, colors, colors, colors); uint16 colorsFull = (uint16)(colors, colors, colors, colors);
uint16 shifts = (((unpack(val.s7) << (uint16)8 uint16 shifts = (((unpack(val.s7) << (uint16)8
| unpack(val.s6)) << (uint16)8 | unpack(val.s6)) << (uint16)8
| unpack(val.s5)) << (uint16)8 | unpack(val.s5)) << (uint16)8
| unpack(val.s4)) << (uint16)3; | unpack(val.s4)) << (uint16)3;
vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4); vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst);
shifts = shifts >> (uint16)8; shifts = shifts >> (uint16)8;
vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4); vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width*4);
shifts = shifts >> (uint16)8; shifts = shifts >> (uint16)8;
vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4); vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width*4);
shifts = shifts >> (uint16)8; shifts = shifts >> (uint16)8;
vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width * 4); vstore16(convert_uchar16(colorsFull >> (shifts & (uint16)0xFF)), 0, dst+=width*4);
} }
kernel void DecodeCMPR(global uchar *dst, kernel void DecodeCMPR(global uchar *dst,

View File

@ -25,30 +25,30 @@
void AOSound::SoundLoop() void AOSound::SoundLoop()
{ {
uint_32 numBytesToRender = 256; uint_32 numBytesToRender = 256;
ao_initialize(); ao_initialize();
default_driver = ao_default_driver_id(); default_driver = ao_default_driver_id();
format.bits = 16; format.bits = 16;
format.channels = 2; format.channels = 2;
format.rate = m_mixer->GetSampleRate(); format.rate = m_mixer->GetSampleRate();
format.byte_format = AO_FMT_LITTLE; format.byte_format = AO_FMT_LITTLE;
device = ao_open_live(default_driver, &format, NULL /* no options */); device = ao_open_live(default_driver, &format, NULL /* no options */);
if (!device) if (!device)
{ {
PanicAlert("AudioCommon: Error opening AO device.\n"); PanicAlert("AudioCommon: Error opening AO device.\n");
ao_shutdown(); ao_shutdown();
Stop(); Stop();
return; return;
} }
buf_size = format.bits/8 * format.channels * format.rate; buf_size = format.bits/8 * format.channels * format.rate;
while (!threadData) while (!threadData)
{ {
m_mixer->Mix(realtimeBuffer, numBytesToRender >> 2); m_mixer->Mix(realtimeBuffer, numBytesToRender >> 2);
soundCriticalSection.Enter(); soundCriticalSection.Enter();
ao_play(device, (char*)realtimeBuffer, numBytesToRender); ao_play(device, (char*)realtimeBuffer, numBytesToRender);
soundCriticalSection.Leave(); soundCriticalSection.Leave();
soundSyncEvent.Wait(); soundSyncEvent.Wait();
} }
@ -62,34 +62,37 @@ void *soundThread(void *args)
bool AOSound::Start() bool AOSound::Start()
{ {
memset(realtimeBuffer, 0, sizeof(realtimeBuffer)); memset(realtimeBuffer, 0, sizeof(realtimeBuffer));
soundSyncEvent.Init(); soundSyncEvent.Init();
thread = new Common::Thread(soundThread, (void *)this); thread = new Common::Thread(soundThread, (void *)this);
return true; return true;
} }
void AOSound::Update() void AOSound::Update()
{ {
soundSyncEvent.Set(); soundSyncEvent.Set();
} }
void AOSound::Stop() void AOSound::Stop()
{ {
threadData = 1; threadData = 1;
soundSyncEvent.Set(); soundSyncEvent.Set();
soundCriticalSection.Enter(); soundCriticalSection.Enter();
delete thread; delete thread;
thread = NULL; thread = NULL;
if (device)
ao_close(device);
ao_shutdown(); ao_shutdown();
ao_close(device);
device = NULL; device = NULL;
soundCriticalSection.Leave(); soundCriticalSection.Leave();
soundSyncEvent.Shutdown(); soundSyncEvent.Shutdown();
} }
AOSound::~AOSound() AOSound::~AOSound()