Jit64: divwx - Optimize constant divisor

Optimize signed division by a constant into a multiplication by a
precomputed constant followed by shifts. This method is also used by GCC
and LLVM.

We also add dedicated paths for divisors 0, 1, and -1, because the
multiplication method does not work for them. These divisors don't occur
very often, but handling them explicitly is necessary for correctness.

- Division by 1
Before:
41 BF 01 00 00 00    mov         r15d,1
41 8B C5             mov         eax,r13d
45 85 FF             test        r15d,r15d
74 0D                je          overflow
3D 00 00 00 80       cmp         eax,80000000h
75 0E                jne         normal_path
41 83 FF FF          cmp         r15d,0FFFFFFFFh
75 08                jne         normal_path
overflow:
C1 F8 1F             sar         eax,1Fh
44 8B F8             mov         r15d,eax
EB 07                jmp         done
normal_path:
99                   cdq
41 F7 FF             idiv        eax,r15d
44 8B F8             mov         r15d,eax
done:

After:
45 8B FD             mov         r15d,r13d

- Division by 30307
Before:
41 BA 63 76 00 00    mov         r10d,7663h
41 8B C5             mov         eax,r13d
45 85 D2             test        r10d,r10d
74 0D                je          overflow
3D 00 00 00 80       cmp         eax,80000000h
75 0E                jne         normal_path
41 83 FA FF          cmp         r10d,0FFFFFFFFh
75 08                jne         normal_path
overflow:
C1 F8 1F             sar         eax,1Fh
44 8B C0             mov         r8d,eax
EB 07                jmp         done
normal_path:
99                   cdq
41 F7 FA             idiv        eax,r10d
44 8B C0             mov         r8d,eax
done:

After:
49 63 C5             movsxd      rax,r13d
48 69 C0 65 6B 32 45 imul        rax,rax,45326B65h
4C 8B C0             mov         r8,rax
48 C1 E8 3F          shr         rax,3Fh
49 C1 F8 2D          sar         r8,2Dh
44 03 C0             add         r8d,eax

- Division by 30323
Before:
41 BA 73 76 00 00    mov         r10d,7673h
41 8B C5             mov         eax,r13d
45 85 D2             test        r10d,r10d
74 0D                je          overflow
3D 00 00 00 80       cmp         eax,80000000h
75 0E                jne         normal_path
41 83 FA FF          cmp         r10d,0FFFFFFFFh
75 08                jne         normal_path
overflow:
C1 F8 1F             sar         eax,1Fh
44 8B C0             mov         r8d,eax
EB 07                jmp         done
normal_path:
99                   cdq
41 F7 FA             idiv        eax,r10d
44 8B C0             mov         r8d,eax
done:

After:
49 63 C5             movsxd      rax,r13d
4C 69 C0 19 25 52 8A imul        r8,rax,0FFFFFFFF8A522519h
49 C1 E8 20          shr         r8,20h
44 03 C0             add         r8d,eax
C1 E8 1F             shr         eax,1Fh
41 C1 F8 0E          sar         r8d,0Eh
44 03 C0             add         r8d,eax
This commit is contained in:
Sintendo 2021-03-04 20:17:50 +01:00
parent 5bb8798df6
commit 95698c5ae1

View File

@@ -16,10 +16,12 @@
#include "Core/PowerPC/Jit64/Jit.h"
#include "Core/PowerPC/Jit64/RegCache/JitRegCache.h"
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
#include "Core/PowerPC/JitCommon/DivUtils.h"
#include "Core/PowerPC/PPCAnalyst.h"
#include "Core/PowerPC/PowerPC.h"
using namespace Gen;
using namespace JitCommon;
void Jit64::GenerateConstantOverflow(s64 val)
{
@@ -1414,6 +1416,88 @@ void Jit64::divwx(UGeckoInstruction inst)
SetJumpTarget(done);
}
}
else if (gpr.IsImm(b))
{
// Constant divisor
const s32 divisor = gpr.SImm32(b);
RCOpArg Ra = gpr.Use(a, RCMode::Read);
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
RegCache::Realize(Ra, Rd);
// Handle 0, 1, and -1 explicitly
if (divisor == 0)
{
if (d != a)
MOV(32, Rd, Ra);
SAR(32, Rd, Imm8(31));
if (inst.OE)
GenerateConstantOverflow(true);
}
else if (divisor == 1)
{
if (d != a)
MOV(32, Rd, Ra);
if (inst.OE)
GenerateConstantOverflow(false);
}
else if (divisor == -1)
{
if (d != a)
MOV(32, Rd, Ra);
CMP(32, Rd, Imm32(0x80000000));
const FixupBranch normal = J_CC(CC_NE);
MOV(32, Rd, Imm32(0xFFFFFFFF));
if (inst.OE)
GenerateConstantOverflow(true);
const FixupBranch done = J();
SetJumpTarget(normal);
NEG(32, Rd);
if (inst.OE)
GenerateConstantOverflow(false);
SetJumpTarget(done);
}
else
{
// Optimize signed 32-bit integer division by a constant
Magic m = SignedDivisionConstants(divisor);
MOVSX(64, 32, RSCRATCH, Ra);
if (divisor > 0 && m.multiplier < 0)
{
IMUL(64, Rd, R(RSCRATCH), Imm32(m.multiplier));
SHR(64, Rd, Imm8(32));
ADD(32, Rd, R(RSCRATCH));
SHR(32, R(RSCRATCH), Imm8(31));
SAR(32, Rd, Imm8(m.shift));
}
else if (divisor < 0 && m.multiplier > 0)
{
IMUL(64, Rd, R(RSCRATCH), Imm32(m.multiplier));
SHR(64, R(RSCRATCH), Imm8(32));
SUB(32, R(RSCRATCH), Rd);
MOV(32, Rd, R(RSCRATCH));
SHR(32, Rd, Imm8(31));
SAR(32, R(RSCRATCH), Imm8(m.shift));
}
else
{
IMUL(64, RSCRATCH, R(RSCRATCH), Imm32(m.multiplier));
MOV(64, Rd, R(RSCRATCH));
SHR(64, R(RSCRATCH), Imm8(63));
SAR(64, R(Rd), Imm8(32 + m.shift));
}
ADD(32, Rd, R(RSCRATCH));
if (inst.OE)
GenerateConstantOverflow(false);
}
}
else
{
RCOpArg Ra = gpr.Use(a, RCMode::Read);