From c33f46406e60fb2a0cf6613fdd062c6934e4914b Mon Sep 17 00:00:00 2001 From: pierre Date: Mon, 27 Dec 2010 15:05:18 +0000 Subject: [PATCH] Core/DSPCore: Improve Interpreter address register add/sub, convert to assembler for JIT. Replace JIT ToMask() with a different variant. Remove superfluous zeroWriteBackLog calls (added by me). Core/Common: Don't bother creating a string and calling into a Log's trigger() when there is no one listening. Change AtomicLoadAcquire for gcc to just make the compiler not reorder memory accesses around it instead of doing a full memory barrier, per the comment in the win32 variant. Core/AudioCommon: Fix a use of uninitialized variable inside libalsa. Microbenchmarking results for ToMask variants (1 000 000 000 iterations): cpu\variant| shifts | bit scan intel mobile C2D@2.5GHz | 5.5s | 4.0s amd athlon64x2@3GHz | 6.1s | 6.4s (including some constant overhead identical to both variants) git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6667 8ced0084-cf51-0410-be5f-012b33b47a6e --- .../Core/AudioCommon/Src/AlsaSoundStream.cpp | 1 + Source/Core/Common/Src/Atomic_GCC.h | 8 +- Source/Core/Common/Src/LogManager.cpp | 2 +- Source/Core/Common/Src/LogManager.h | 1 + Source/Core/DSPCore/Src/DSPEmitter.h | 2 +- Source/Core/DSPCore/Src/DSPIntUtil.h | 13 +- Source/Core/DSPCore/Src/DSPTables.cpp | 4 +- Source/Core/DSPCore/Src/Jit/DSPJitMisc.cpp | 7 +- Source/Core/DSPCore/Src/Jit/DSPJitUtil.cpp | 370 +++++++++--------- 9 files changed, 214 insertions(+), 194 deletions(-) diff --git a/Source/Core/AudioCommon/Src/AlsaSoundStream.cpp b/Source/Core/AudioCommon/Src/AlsaSoundStream.cpp index da715e9811..1ba81faf62 100644 --- a/Source/Core/AudioCommon/Src/AlsaSoundStream.cpp +++ b/Source/Core/AudioCommon/Src/AlsaSoundStream.cpp @@ -123,6 +123,7 @@ bool AlsaSound::AlsaInit() return false; } + dir = 0; err = snd_pcm_hw_params_set_rate_near(handle, hwparams, &sample_rate, &dir); if (err < 0) { diff --git a/Source/Core/Common/Src/Atomic_GCC.h 
b/Source/Core/Common/Src/Atomic_GCC.h index 363cf7a4f7..8eb2f5d818 100644 --- a/Source/Core/Common/Src/Atomic_GCC.h +++ b/Source/Core/Common/Src/Atomic_GCC.h @@ -57,8 +57,12 @@ inline u32 AtomicLoad(volatile u32& src) { return src; // 32-bit reads are always atomic. } inline u32 AtomicLoadAcquire(volatile u32& src) { - __sync_synchronize(); // TODO: May not be necessary. - return src; + //keep the compiler from caching any memory references + u32 result = src; // 32-bit reads are always atomic. + //__sync_synchronize(); // TODO: May not be necessary. + // Compiler instruction only. x86 loads always have acquire semantics. + __asm__ __volatile__ ( "":::"memory" ); + return result; } inline void AtomicOr(volatile u32& target, u32 value) { diff --git a/Source/Core/Common/Src/LogManager.cpp b/Source/Core/Common/Src/LogManager.cpp index 651cb01ad2..cdf9a99acb 100644 --- a/Source/Core/Common/Src/LogManager.cpp +++ b/Source/Core/Common/Src/LogManager.cpp @@ -116,7 +116,7 @@ void LogManager::Log(LogTypes::LOG_LEVELS level, LogTypes::LOG_TYPE type, char msg[MAX_MSGLEN * 2]; LogContainer *log = m_Log[type]; - if (! log->isEnable() || level > log->getLevel()) + if (! log->isEnable() || level > log->getLevel() || ! 
log->hasListeners()) return; CharArrayFromFormatV(temp, MAX_MSGLEN, format, args); diff --git a/Source/Core/Common/Src/LogManager.h b/Source/Core/Common/Src/LogManager.h index 7e247925c4..37a0ca3c45 100644 --- a/Source/Core/Common/Src/LogManager.h +++ b/Source/Core/Common/Src/LogManager.h @@ -89,6 +89,7 @@ public: void setLevel(LogTypes::LOG_LEVELS level) { m_level = level; } + bool hasListeners() const { return listeners.size() > 0; } private: char m_fullName[128]; diff --git a/Source/Core/DSPCore/Src/DSPEmitter.h b/Source/Core/DSPCore/Src/DSPEmitter.h index 2e61764dce..83e5731b6c 100644 --- a/Source/Core/DSPCore/Src/DSPEmitter.h +++ b/Source/Core/DSPCore/Src/DSPEmitter.h @@ -259,7 +259,7 @@ private: void Update_SR_Register(Gen::X64Reg val = Gen::EAX); - void ToMask(Gen::X64Reg value_reg = Gen::EDI, Gen::X64Reg temp_reg = Gen::ESI); + void ToMask(Gen::X64Reg value_reg = Gen::EDI); void dsp_increment_one(Gen::X64Reg ar = Gen::EAX, Gen::X64Reg wr = Gen::EDX, Gen::X64Reg wr_pow = Gen::EDI, Gen::X64Reg temp_reg = Gen::ESI); void dsp_decrement_one(Gen::X64Reg ar = Gen::EAX, Gen::X64Reg wr = Gen::EDX, Gen::X64Reg wr_pow = Gen::EDI, Gen::X64Reg temp_reg = Gen::ESI); void get_long_prod(Gen::X64Reg long_prod = Gen::RAX); diff --git a/Source/Core/DSPCore/Src/DSPIntUtil.h b/Source/Core/DSPCore/Src/DSPIntUtil.h index 2130b62aca..21f919d30e 100644 --- a/Source/Core/DSPCore/Src/DSPIntUtil.h +++ b/Source/Core/DSPCore/Src/DSPIntUtil.h @@ -68,10 +68,10 @@ inline u16 dsp_increase_addr_reg(u16 reg, s16 ix) u16 m = ToMask(wr) | 1; u16 nar = ar+ix; if (ix >= 0) { - if((ar&m)+(ix&m) -m-1 >= 0) + if((ar&m) + (int)(ix&m) -(int)m-1 >= 0) nar -= wr+1; } else { - if((ar&m)+(ix&m) -m-1 < m-wr) + if((ar&m) + (int)(ix&m) -(int)m-1 < m-wr) nar += wr+1; } return nar; @@ -82,13 +82,12 @@ inline u16 dsp_decrease_addr_reg(u16 reg, s16 ix) u16 ar = g_dsp.r[reg]; u16 wr = g_dsp.r[reg+8]; u16 m = ToMask(wr) | 1; - ix = -ix-1; - u16 nar = ar+ix+1; - if (ix-1 >= 0) { - if((ar&m)+(ix&m) -m >= 0) + u16 
nar = ar-ix; + if ((u16)ix > 0x8000) { // equiv: ix < 0 && ix != -0x8000 + if((ar&m) - (int)(ix&m) >= 0) nar -= wr+1; } else { - if((ar&m)+(ix&m) -m < m-wr) + if((ar&m) - (int)(ix&m) < m-wr) nar += wr+1; } return nar; diff --git a/Source/Core/DSPCore/Src/DSPTables.cpp b/Source/Core/DSPCore/Src/DSPTables.cpp index 884d28a5da..109c83120e 100644 --- a/Source/Core/DSPCore/Src/DSPTables.cpp +++ b/Source/Core/DSPCore/Src/DSPTables.cpp @@ -37,8 +37,8 @@ const DSPOPCTemplate opcodes[] = {"DAR", 0x0004, 0xfffc, DSPInterpreter::dar, &DSPEmitter::dar, 1, 1, {{P_REG, 1, 0, 0, 0x0003}}, false, false, false, false, false}, {"IAR", 0x0008, 0xfffc, DSPInterpreter::iar, &DSPEmitter::iar, 1, 1, {{P_REG, 1, 0, 0, 0x0003}}, false, false, false, false, false}, - {"SUBARN", 0x000c, 0xfffc, DSPInterpreter::subarn, NULL/*&DSPEmitter::subarn*/, 1, 1, {{P_REG, 1, 0, 0, 0x0003}}, false, false, false, false, false}, - {"ADDARN", 0x0010, 0xfff0, DSPInterpreter::addarn, NULL/*&DSPEmitter::addarn*/, 1, 2, {{P_REG, 1, 0, 0, 0x0003}, {P_REG04, 1, 0, 2, 0x000c}}, false, false, false, false, false}, + {"SUBARN", 0x000c, 0xfffc, DSPInterpreter::subarn, &DSPEmitter::subarn, 1, 1, {{P_REG, 1, 0, 0, 0x0003}}, false, false, false, false, false}, + {"ADDARN", 0x0010, 0xfff0, DSPInterpreter::addarn, &DSPEmitter::addarn, 1, 2, {{P_REG, 1, 0, 0, 0x0003}, {P_REG04, 1, 0, 2, 0x000c}}, false, false, false, false, false}, {"HALT", 0x0021, 0xffff, DSPInterpreter::halt, NULL, 1, 0, {}, false, true, true, false, false}, diff --git a/Source/Core/DSPCore/Src/Jit/DSPJitMisc.cpp b/Source/Core/DSPCore/Src/Jit/DSPJitMisc.cpp index 543c64603d..79e72f7d5d 100644 --- a/Source/Core/DSPCore/Src/Jit/DSPJitMisc.cpp +++ b/Source/Core/DSPCore/Src/Jit/DSPJitMisc.cpp @@ -370,7 +370,6 @@ void DSPEmitter::nx(const UDSPInstruction opc) void DSPEmitter::dar(const UDSPInstruction opc) { // g_dsp.r[opc & 0x3] = dsp_decrement_addr_reg(opc & 0x3); - zeroWriteBackLog(opc); decrement_addr_reg(opc & 0x3); } @@ -381,7 +380,6 @@ void 
DSPEmitter::dar(const UDSPInstruction opc) void DSPEmitter::iar(const UDSPInstruction opc) { // g_dsp.r[opc & 0x3] = dsp_increment_addr_reg(opc & 0x3); - zeroWriteBackLog(opc); increment_addr_reg(opc & 0x3); } @@ -393,7 +391,6 @@ void DSPEmitter::subarn(const UDSPInstruction opc) { // u8 dreg = opc & 0x3; // g_dsp.r[dreg] = dsp_decrease_addr_reg(dreg, (s16)g_dsp.r[DSP_REG_IX0 + dreg]); - zeroWriteBackLog(opc); decrease_addr_reg(opc & 0x3); } @@ -408,7 +405,6 @@ void DSPEmitter::addarn(const UDSPInstruction opc) // g_dsp.r[dreg] = dsp_increase_addr_reg(dreg, (s16)g_dsp.r[DSP_REG_IX0 + sreg]); // From looking around it is always called with the matching index register - zeroWriteBackLog(opc); increase_addr_reg(opc & 0x3); } @@ -448,7 +444,6 @@ void DSPEmitter::sbclr(const UDSPInstruction opc) { u8 bit = (opc & 0x7) + 6; - zeroWriteBackLog(opc); clrCompileSR(1 << bit); } @@ -460,10 +455,10 @@ void DSPEmitter::sbset(const UDSPInstruction opc) { u8 bit = (opc & 0x7) + 6; - zeroWriteBackLog(opc); setCompileSR(1 << bit); } +// 1000 1bbb xxxx xxxx, bbb >= 010 // This is a bunch of flag setters, flipping bits in SR. So far so good, // but it's harder to know exactly what effect they have. 
void DSPEmitter::srbith(const UDSPInstruction opc) diff --git a/Source/Core/DSPCore/Src/Jit/DSPJitUtil.cpp b/Source/Core/DSPCore/Src/Jit/DSPJitUtil.cpp index 5ee0950086..6a30f5f922 100644 --- a/Source/Core/DSPCore/Src/Jit/DSPJitUtil.cpp +++ b/Source/Core/DSPCore/Src/Jit/DSPJitUtil.cpp @@ -27,126 +27,141 @@ using namespace Gen; // Performs the hashing required by increment/increase/decrease_addr_reg -void DSPEmitter::ToMask(X64Reg value_reg, X64Reg temp_reg) +// clobbers RCX +void DSPEmitter::ToMask(X64Reg value_reg) { - MOV(16, R(temp_reg), R(value_reg)); - SHR(16, R(temp_reg), Imm8(8)); - OR(16, R(value_reg), R(temp_reg)); - MOV(16, R(temp_reg), R(value_reg)); - SHR(16, R(temp_reg), Imm8(4)); - OR(16, R(value_reg), R(temp_reg)); - MOV(16, R(temp_reg), R(value_reg)); - SHR(16, R(temp_reg), Imm8(2)); - OR(16, R(value_reg), R(temp_reg)); - MOV(16, R(temp_reg), R(value_reg)); - SHR(16, R(temp_reg), Imm8(1)); - OR(16, R(value_reg), R(temp_reg)); -} +#if 0 + MOV(16, R(CX), R(value_reg)); + SHR(16, R(CX), Imm8(8)); + OR(16, R(value_reg), R(CX)); + MOV(16, R(CX), R(value_reg)); + SHR(16, R(CX), Imm8(4)); + OR(16, R(value_reg), R(CX)); + MOV(16, R(CX), R(value_reg)); + SHR(16, R(CX), Imm8(2)); + OR(16, R(value_reg), R(CX)); + MOV(16, R(CX), R(value_reg)); + SHR(16, R(CX), Imm8(1)); + OR(16, R(value_reg), R(CX)); + MOVZX(32,16,value_reg, R(value_reg)); +#else + BSR(16, CX, R(value_reg)); + FixupBranch undef = J_CC(CC_Z); //CX is written, but undefined -// HORRIBLE UGLINESS, someone please fix. 
-// See http://code.google.com/p/dolphin-emu/source/detail?r=3125 -void DSPEmitter::dsp_increment_one(X64Reg ar, X64Reg wr, X64Reg wr_pow, X64Reg temp_reg) -{ - // if ((tmp & tmb) == tmb) - MOV(16, R(temp_reg), R(ar)); - AND(16, R(temp_reg), R(wr_pow)); - CMP(16, R(temp_reg), R(wr_pow)); - FixupBranch not_equal = J_CC(CC_NE); + MOV(32, R(value_reg), Imm32(2)); + SHL(32, R(value_reg), R(CL)); + SUB(32, R(value_reg), Imm32(1)); + //don't waste an instruction on jumping over an effective noop - // tmp -= wr_reg - SUB(16, R(ar), R(wr)); - - FixupBranch end = J(); - SetJumpTarget(not_equal); - - // else tmp++ - ADD(16, R(ar), Imm16(1)); - SetJumpTarget(end); + SetJumpTarget(undef); +#endif + OR(16, R(value_reg), Imm16(1)); + XOR(64, R(RCX), R(RCX)); } // EAX = g_dsp.r[reg] // EDX = g_dsp.r[DSP_REG_WR0 + reg] +//clobbers RCX void DSPEmitter::increment_addr_reg(int reg) { + /* + u16 ar = g_dsp.r[reg]; + u16 wr = g_dsp.r[reg+8]; + u16 nar = ar+1; + //this works, because nar^ar will have all the bits from the highest + //changed bit downwards set(true only for +1!) 
+ //based on an idea by Mylek + if((nar^ar)>=((wr<<1)|1)) + nar -= wr+1; + */ + // s16 tmp = g_dsp.r[reg]; #ifdef _M_IX86 // All32 - MOV(16, R(EAX), M(&g_dsp.r[reg])); - MOV(16, R(EDX), M(&g_dsp.r[DSP_REG_WR0 + reg])); + MOV(16, R(AX), M(&g_dsp.r[reg])); + MOV(16, R(DX), M(&g_dsp.r[DSP_REG_WR0 + reg])); #else MOV(64, R(R11), ImmPtr(&g_dsp.r)); - MOV(16, R(EAX), MDisp(R11,reg*2)); - MOV(16, R(EDX), MDisp(R11,(DSP_REG_WR0 + reg)*2)); + MOV(16, R(AX), MDisp(R11,reg*2)); + MOV(16, R(DX), MDisp(R11,(DSP_REG_WR0 + reg)*2)); #endif - // ToMask(WR0), calculating it into EDI - MOV(16, R(EDI), R(EDX)); - ToMask(EDI); + MOV(16,R(DI), R(AX)); + ADD(16,R(AX), Imm16(1)); + XOR(16,R(DI), R(AX)); + MOV(16,R(SI), R(DX)); - dsp_increment_one(EAX, EDX, EDI); + SHL(16,R(SI), Imm8(1)); + OR(16,R(SI), Imm16(3)); + CMP(16,R(DI), R(SI)); + FixupBranch nowrap = J_CC(CC_L); + + SUB(16,R(AX), R(DX)); + SUB(16,R(AX), Imm16(1)); + + SetJumpTarget(nowrap); // g_dsp.r[reg] = tmp; #ifdef _M_IX86 // All32 - MOV(16, M(&g_dsp.r[reg]), R(EAX)); + MOV(16, M(&g_dsp.r[reg]), R(AX)); #else MOV(64, R(R11), ImmPtr(&g_dsp.r)); - MOV(16, MDisp(R11,reg*2), R(EAX)); + MOV(16, MDisp(R11,reg*2), R(AX)); #endif } -// See http://code.google.com/p/dolphin-emu/source/detail?r=3125 -void DSPEmitter::dsp_decrement_one(X64Reg ar, X64Reg wr, X64Reg wr_pow, X64Reg temp_reg) -{ - // compute min from wr_pow and ar - // min = (tmb+1-ar)&tmb; - LEA(16, temp_reg, MDisp(wr_pow, 1)); - SUB(16, R(temp_reg), R(ar)); - AND(16, R(temp_reg), R(wr_pow)); - - // wr < min - CMP(16, R(wr), R(temp_reg)); - FixupBranch wr_lt_min = J_CC(CC_B); - // !min - TEST(16, R(temp_reg), R(temp_reg)); - FixupBranch min_zero = J_CC(CC_Z); - - // ar--; - SUB(16, R(ar), Imm16(1)); - FixupBranch end = J(); - - // ar += wr; - SetJumpTarget(wr_lt_min); - SetJumpTarget(min_zero); - ADD(16, R(ar), R(wr)); - - SetJumpTarget(end); -} - // EAX = g_dsp.r[reg] // EDX = g_dsp.r[DSP_REG_WR0 + reg] +//clobbers RCX void DSPEmitter::decrement_addr_reg(int reg) { + /* 
+ u16 ar = g_dsp.r[reg]; + u16 wr = g_dsp.r[reg+8]; + u16 m = ToMask(wr) | 1; + u16 nar = ar-1; + if((ar&m) - 1 < m-wr) + nar += wr+1; + return nar; + */ + // s16 ar = g_dsp.r[reg]; #ifdef _M_IX86 // All32 - MOV(16, R(EAX), M(&g_dsp.r[reg])); - MOV(16, R(EDX), M(&g_dsp.r[DSP_REG_WR0 + reg])); + MOV(16, R(AX), M(&g_dsp.r[reg])); + MOVZX(32, 16, EDX, M(&g_dsp.r[DSP_REG_WR0 + reg])); #else MOV(64, R(R11), ImmPtr(&g_dsp.r)); - MOV(16, R(EAX), MDisp(R11,reg*2)); - MOV(16, R(EDX), MDisp(R11,(DSP_REG_WR0 + reg)*2)); + MOV(16, R(AX), MDisp(R11,reg*2)); + MOVZX(32, 16, EDX, MDisp(R11,(DSP_REG_WR0 + reg)*2)); #endif // ToMask(WR0), calculating it into EDI - MOV(16, R(EDI), R(EDX)); - ToMask(EDI); + //u16 m = ToMask(wr) | 1; + MOV(16, R(DI), R(DX)); + ToMask(DI); - dsp_decrement_one(EAX, EDX, EDI); - - // g_dsp.r[reg] = tmp; + //u16 nar = ar-1; + MOV(16, R(CX), R(AX)); + SUB(16, R(AX), Imm16(1)); + + //(ar&m) - 1 + AND(32, R(ECX), R(EDI)); + SUB(32, R(ECX), Imm32(1)); + + //m-wr + SUB(32, R(EDI), R(EDX)); + CMP(32, R(ECX), R(EDI)); + FixupBranch out1 = J_CC(CC_GE); + ADD(16,R(AX),R(DX)); + ADD(16,R(AX),Imm16(1)); + + SetJumpTarget(out1); + + // g_dsp.r[reg] = tmp; #ifdef _M_IX86 // All32 - MOV(16, M(&g_dsp.r[reg]), R(EAX)); + MOV(16, M(&g_dsp.r[reg]), R(AX)); #else MOV(64, R(R11), ImmPtr(&g_dsp.r)); - MOV(16, MDisp(R11,reg*2), R(EAX)); + MOV(16, MDisp(R11,reg*2), R(AX)); #endif } @@ -156,72 +171,71 @@ void DSPEmitter::decrement_addr_reg(int reg) // EDX = g_dsp.r[DSP_REG_WR0 + reg] // EDI = tomask(EDX) void DSPEmitter::increase_addr_reg(int reg) -{ -#ifdef _M_IX86 // All32 - MOVZX(32, 16, ECX, M(&g_dsp.r[DSP_REG_IX0 + reg])); -#else - MOV(64, R(R11), ImmPtr(&g_dsp.r)); - MOVZX(32, 16, ECX, MDisp(R11,(DSP_REG_IX0 + reg)*2)); -#endif - // IX0 == 0, bail out - - TEST(16, R(ECX), R(ECX)); - // code too long for a 5-byte jump - // TODO: optimize a bit, maybe merge loops? 
- FixupBranch end = J_CC(CC_Z, true); +{ + /* + u16 ar = g_dsp.r[reg]; + u16 wr = g_dsp.r[reg+8]; + u16 ix = g_dsp.r[reg+4]; + u16 m = ToMask(wr) | 1; + u16 nar = ar+ix; + if (ix >= 0) { + if((ar&m) + (ix&m) -(int)m-1 >= 0) + nar -= wr+1; + } else { + if((ar&m) + (ix&m) -(int)m-1 < m-wr) + nar += wr+1; + } + return nar; + */ #ifdef _M_IX86 // All32 - MOV(16, R(EAX), M(&g_dsp.r[reg])); - MOV(16, R(EDX), M(&g_dsp.r[DSP_REG_WR0 + reg])); + MOV(16, R(SI), M(&g_dsp.r[DSP_REG_IX0 + reg])); + MOV(16, R(AX), M(&g_dsp.r[reg])); + MOVZX(32, 16, EDX, M(&g_dsp.r[DSP_REG_WR0 + reg])); #else - MOV(16, R(EAX), MDisp(R11,reg*2)); - MOV(16, R(EDX), MDisp(R11,(DSP_REG_WR0 + reg)*2)); + MOV(64, R(R11), ImmPtr(&g_dsp.r)); + MOV(16, R(SI), MDisp(R11,(DSP_REG_IX0 + reg)*2)); + MOV(16, R(AX), MDisp(R11,reg*2)); + MOVZX(32, 16, EDX, MDisp(R11,(DSP_REG_WR0 + reg)*2)); #endif // ToMask(WR0), calculating it into EDI - MOV(16, R(EDI), R(EDX)); - ToMask(EDI); + //u16 m = ToMask(wr) | 1; + MOV(16, R(DI), R(DX)); + ToMask(DI); - // IX0 > 0 - // TODO: ToMask flushes flags set by TEST, - // needs another CMP here. 
- CMP(16, R(ECX), Imm16(0)); - FixupBranch negative = J_CC(CC_L); + //u16 nar = ar+ix; + MOV(16, R(CX), R(AX)); + ADD(16, R(AX), R(SI)); - JumpTarget loop_pos = GetCodePtr(); + //(ar&m) + (ix&m) -(int)m-1 + AND(32, R(ECX), R(EDI)); + AND(32, R(ESI), R(EDI)); + ADD(32, R(ECX), R(ESI)); + SUB(32, R(ECX), R(EDI)); + SUB(32, R(ECX), Imm32(1)); - // dsp_increment - dsp_increment_one(EAX, EDX, EDI); + TEST(16,R(SI), Imm16(0x8000)); + FixupBranch negative = J_CC(CC_NZ); - SUB(16, R(ECX), Imm16(1)); // value-- -#ifdef _M_IX86 // All32 - CMP(16, M(&g_dsp.r[DSP_REG_IX0 + reg]), Imm16(127)); -#else - MOV(64, R(R11), ImmPtr(&g_dsp.r)); - CMP(16, MDisp(R11,(DSP_REG_IX0 + reg)*2), Imm16(127)); -#endif - FixupBranch dbg = J_CC(CC_NE); - CMP(16, R(ECX), Imm16(1)); - FixupBranch dbg2 = J_CC(CC_NE); - INT3(); - SetJumpTarget(dbg2); - SetJumpTarget(dbg); - CMP(16, R(ECX), Imm16(0)); // value > 0 - J_CC(CC_G, loop_pos); - FixupBranch end_pos = J(); + CMP(32, R(ECX), Imm32(0)); + FixupBranch out1 = J_CC(CC_L); + SUB(16,R(AX),R(DX)); + SUB(16,R(AX),Imm16(1)); + FixupBranch out2 = J(); - // else, IX0 < 0 SetJumpTarget(negative); - JumpTarget loop_neg = GetCodePtr(); - // dsp_decrement - dsp_decrement_one(EAX, EDX, EDI); + //m-wr + SUB(32, R(EDI), R(EDX)); + CMP(32, R(ECX), R(EDI)); + FixupBranch out3 = J_CC(CC_GE); + ADD(16,R(AX),R(DX)); + ADD(16,R(AX),Imm16(1)); - ADD(16, R(ECX), Imm16(1)); // value++ - CMP(16, R(ECX), Imm16(0)); // value < 0 - J_CC(CC_L, loop_neg); - - SetJumpTarget(end_pos); + SetJumpTarget(out1); + SetJumpTarget(out2); + SetJumpTarget(out3); // g_dsp.r[reg] = tmp; #ifdef _M_IX86 // All32 @@ -230,8 +244,6 @@ void DSPEmitter::increase_addr_reg(int reg) MOV(64, R(R11), ImmPtr(&g_dsp.r)); MOV(16, MDisp(R11,reg*2), R(EAX)); #endif - - SetJumpTarget(end); } // Decrease addr register according to the correspond ix register @@ -241,58 +253,68 @@ void DSPEmitter::increase_addr_reg(int reg) // EDI = tomask(EDX) void DSPEmitter::decrease_addr_reg(int reg) { -#ifdef _M_IX86 // 
All32 - MOV(16, R(ECX), M(&g_dsp.r[DSP_REG_IX0 + reg])); -#else - MOV(64, R(R11), ImmPtr(&g_dsp.r)); - MOV(16, R(ECX), MDisp(R11,(DSP_REG_IX0 + reg)*2)); -#endif - // IX0 == 0, bail out - TEST(16, R(ECX), R(ECX)); - // code too long for a 5-byte jump - // TODO: optimize a bit, maybe merge loops? - FixupBranch end = J_CC(CC_Z, true); + /* + u16 ar = g_dsp.r[reg]; + u16 wr = g_dsp.r[reg+8]; + u16 ix = g_dsp.r[reg+4]; + u16 m = ToMask(wr) | 1; + u16 nar = ar-ix; //!! + if ((u16)ix > 0x8000) { // equiv: ix < 0 && ix != -0x8000 //!! + if((ar&m) - (int)(ix&m) >= 0) //!! + nar -= wr+1; + } else { + if((ar&m) - (int)(ix&m) < m-wr) //!! + nar += wr+1; + } + return nar; + */ #ifdef _M_IX86 // All32 - MOV(16, R(EAX), M(&g_dsp.r[reg])); - MOV(16, R(EDX), M(&g_dsp.r[DSP_REG_WR0 + reg])); + MOV(16, R(SI), M(&g_dsp.r[DSP_REG_IX0 + reg])); + MOV(16, R(AX), M(&g_dsp.r[reg])); + MOVZX(32, 16, EDX, M(&g_dsp.r[DSP_REG_WR0 + reg])); #else - MOV(16, R(EAX), MDisp(R11,reg*2)); - MOV(16, R(EDX), MDisp(R11,(DSP_REG_WR0 + reg)*2)); + MOV(64, R(R11), ImmPtr(&g_dsp.r)); + MOV(16, R(SI), MDisp(R11,(DSP_REG_IX0 + reg)*2)); + MOV(16, R(AX), MDisp(R11,reg*2)); + MOVZX(32, 16, EDX, MDisp(R11,(DSP_REG_WR0 + reg)*2)); #endif // ToMask(WR0), calculating it into EDI - MOV(16, R(EDI), R(EDX)); - ToMask(EDI); + //u16 m = ToMask(wr) | 1; + MOV(16, R(DI), R(DX)); + ToMask(DI); - // IX0 > 0 - // TODO: ToMask flushes flags set by TEST, - // needs another CMP here. 
- CMP(16, R(ECX), Imm16(0)); - FixupBranch negative = J_CC(CC_L); + //u16 nar = ar-ix; + MOV(16, R(CX), R(AX)); + SUB(16, R(AX), R(SI)); - JumpTarget loop_pos = GetCodePtr(); + //(ar&m) + (ix&m) + AND(32, R(ECX), R(EDI)); + AND(32, R(ESI), R(EDI)); + SUB(32, R(ECX), R(ESI)); - // dsp_decrement - dsp_decrement_one(EAX, EDX, EDI); + CMP(16,R(SI), Imm16(0x8000)); + FixupBranch negative = J_CC(CC_BE); - SUB(16, R(ECX), Imm16(1)); // value-- - CMP(16, R(ECX), Imm16(0)); // value > 0 - J_CC(CC_G, loop_pos); - FixupBranch end_pos = J(); + CMP(32, R(ECX), Imm32(0)); + FixupBranch out1 = J_CC(CC_L); + SUB(16,R(AX),R(DX)); + SUB(16,R(AX),Imm16(1)); + FixupBranch out2 = J(); - // else, IX0 < 0 SetJumpTarget(negative); - JumpTarget loop_neg = GetCodePtr(); - // dsp_increment - dsp_increment_one(EAX, EDX, EDI); + //m-wr + SUB(32, R(EDI), R(EDX)); + CMP(32, R(ECX), R(EDI)); + FixupBranch out3 = J_CC(CC_GE); + ADD(16,R(AX),R(DX)); + ADD(16,R(AX),Imm16(1)); - ADD(16, R(ECX), Imm16(1)); // value++ - CMP(16, R(ECX), Imm16(0)); // value < 0 - J_CC(CC_L, loop_neg); - - SetJumpTarget(end_pos); + SetJumpTarget(out1); + SetJumpTarget(out2); + SetJumpTarget(out3); // g_dsp.r[reg] = tmp; #ifdef _M_IX86 // All32 @@ -301,8 +323,6 @@ void DSPEmitter::decrease_addr_reg(int reg) MOV(64, R(R11), ImmPtr(&g_dsp.r)); MOV(16, MDisp(R11,reg*2), R(EAX)); #endif - - SetJumpTarget(end); }