From 49a4e0971952248c66ee8bbb2a92a8f1a0a6d5ab Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 29 Jan 2020 23:01:57 +0100 Subject: [PATCH 1/3] DSPJit: Replace/optimize LEAs - LEA is a bit silly when the source and the destination are the same. A simple ADD or SHL will do in those cases. 66 8D 04 45 00 00 00 00 lea ax,[rax*2] 66 03 C0 add ax,ax 48 8D 04 00 lea rax,[rax+rax] 48 03 C0 add rax,rax 66 8D 14 D5 00 00 00 00 lea dx,[rdx*8] 66 C1 E2 03 shl dx,3 - When scaling by 2, consider summing the register with itself instead. The former always needs a 32-bit displacement, so the sum is more compact. 66 8D 14 45 00 00 00 00 lea dx,[rax*2] 66 8D 14 00 lea dx,[rax+rax] --- Source/Core/Core/DSP/Jit/x64/DSPJitBranch.cpp | 6 +++--- Source/Core/Core/DSP/Jit/x64/DSPJitMultiplier.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Source/Core/Core/DSP/Jit/x64/DSPJitBranch.cpp b/Source/Core/Core/DSP/Jit/x64/DSPJitBranch.cpp index e08a6ac57e..339d81b4e3 100644 --- a/Source/Core/Core/DSP/Jit/x64/DSPJitBranch.cpp +++ b/Source/Core/Core/DSP/Jit/x64/DSPJitBranch.cpp @@ -38,7 +38,7 @@ void DSPEmitter::ReJitConditional(const UDSPInstruction opc, case 0x3: // LE - Less Equal LEA(16, EDX, MScaled(EAX, SCALE_4, 0)); XOR(16, R(EAX), R(EDX)); - LEA(16, EAX, MScaled(EAX, SCALE_2, 0)); + ADD(16, R(EAX), R(EAX)); OR(16, R(EAX), R(EDX)); TEST(16, R(EAX), Imm16(0x10)); break; @@ -56,9 +56,9 @@ void DSPEmitter::ReJitConditional(const UDSPInstruction opc, break; case 0xa: // ? case 0xb: // ? - LEA(16, EDX, MScaled(EAX, SCALE_2, 0)); + LEA(16, EDX, MRegSum(EAX, EAX)); OR(16, R(EAX), R(EDX)); - LEA(16, EDX, MScaled(EDX, SCALE_8, 0)); + SHL(16, R(EDX), Imm8(3)); NOT(16, R(EAX)); OR(16, R(EAX), R(EDX)); TEST(16, R(EAX), Imm16(0x20)); diff --git a/Source/Core/Core/DSP/Jit/x64/DSPJitMultiplier.cpp b/Source/Core/Core/DSP/Jit/x64/DSPJitMultiplier.cpp index 79e6252f12..63acadc559 100644 --- a/Source/Core/Core/DSP/Jit/x64/DSPJitMultiplier.cpp +++ b/Source/Core/Core/DSP/Jit/x64/DSPJitMultiplier.cpp @@ -30,7 +30,7 @@ void DSPEmitter::multiply() TEST(16, sr_reg, Imm16(SR_MUL_MODIFY)); FixupBranch noMult2 = J_CC(CC_NZ); // prod <<= 1; - LEA(64, RAX, MRegSum(RAX, RAX)); + ADD(64, R(RAX), R(RAX)); SetJumpTarget(noMult2); m_gpr.PutReg(DSP_REG_SR, false); // return prod; @@ -130,7 +130,7 @@ void DSPEmitter::multiply_mulx(u8 axh0, u8 axh1) TEST(16, sr_reg, Imm16(SR_MUL_MODIFY)); FixupBranch noMult2 = J_CC(CC_NZ); // prod <<= 1; - LEA(64, RAX, MRegSum(RAX, RAX)); + ADD(64, R(RAX), R(RAX)); SetJumpTarget(noMult2); m_gpr.PutReg(DSP_REG_SR, false); // return prod; From 618d261b910d1e90acbfcd504bc3a2b04f457f10 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Wed, 29 Jan 2020 23:17:18 +0100 Subject: [PATCH 2/3] DSPJitMultiplier: addpaxz - AND constant directly There's no need to load the 64-bit immediate into a temporary register. x64 will sign-extend 32-bit immediates to 64 bits, giving us the exact value we need in this case. 48 C7 C0 00 00 FF FF mov rax,0FFFFFFFFFFFF0000h 48 21 C2 and rdx,rax 48 81 E2 00 00 FF FF and rdx,0FFFFFFFFFFFF0000h --- Source/Core/Core/DSP/Jit/x64/DSPJitMultiplier.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Source/Core/Core/DSP/Jit/x64/DSPJitMultiplier.cpp b/Source/Core/Core/DSP/Jit/x64/DSPJitMultiplier.cpp index 63acadc559..5e102ab25f 100644 --- a/Source/Core/Core/DSP/Jit/x64/DSPJitMultiplier.cpp +++ b/Source/Core/Core/DSP/Jit/x64/DSPJitMultiplier.cpp @@ -252,8 +252,7 @@ void DSPEmitter::addpaxz(const UDSPInstruction opc) get_long_acx(sreg, tmp1); MOV(64, R(RDX), R(tmp1)); // s64 res = prod + (ax & ~0xffff); - MOV(64, R(RAX), Imm64(~0xffff)); - AND(64, R(RDX), R(RAX)); + AND(64, R(RDX), Imm32(~0xffff)); // s64 prod = dsp_get_long_prod_round_prodl(); get_long_prod_round_prodl(); ADD(64, R(RAX), R(RDX)); From 363f3f82bbedd05701a1c4cda6c440f833ef7f12 Mon Sep 17 00:00:00 2001 From: Sintendo Date: Mon, 3 Aug 2020 23:58:09 +0200 Subject: [PATCH 3/3] DSPJitRegCache: Simplify WriteReg The intent here is to generate a more compact instruction if a 32-bit immediate can be zero-extended to the desired 64-bit immediate. Nowadays the emitter is smart enough to do this for us, so this logic is redundant. --- Source/Core/Core/DSP/Jit/x64/DSPJitRegCache.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/Source/Core/Core/DSP/Jit/x64/DSPJitRegCache.cpp b/Source/Core/Core/DSP/Jit/x64/DSPJitRegCache.cpp index ed8af72355..2b2850fe36 100644 --- a/Source/Core/Core/DSP/Jit/x64/DSPJitRegCache.cpp +++ b/Source/Core/Core/DSP/Jit/x64/DSPJitRegCache.cpp @@ -835,14 +835,7 @@ void DSPJitRegCache::WriteReg(int dreg, OpArg arg) m_emitter.MOV(32, reg, Imm32(arg.Imm32())); break; case 8: - if ((u32)arg.Imm64() == arg.Imm64()) - { - m_emitter.MOV(64, reg, Imm32((u32)arg.Imm64())); - } - else - { - m_emitter.MOV(64, reg, Imm64(arg.Imm64())); - } + m_emitter.MOV(64, reg, Imm64(arg.Imm64())); break; default: ASSERT_MSG(DSPLLE, 0, "unsupported memory size");