mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-10 08:09:26 +01:00
Merge pull request #12661 from Sintendo/arm64divwux
JitArm64: Optimize divwux
This commit is contained in:
commit
5f6a054ffc
@ -1451,12 +1451,10 @@ void Jit64::divwux(UGeckoInstruction inst)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
u32 shift = 31;
|
if (MathUtil::IsPow2(divisor))
|
||||||
while (!(divisor & (1 << shift)))
|
|
||||||
shift--;
|
|
||||||
|
|
||||||
if (divisor == (u32)(1 << shift))
|
|
||||||
{
|
{
|
||||||
|
u32 shift = MathUtil::IntLog2(divisor);
|
||||||
|
|
||||||
RCOpArg Ra = gpr.Use(a, RCMode::Read);
|
RCOpArg Ra = gpr.Use(a, RCMode::Read);
|
||||||
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
|
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
|
||||||
RegCache::Realize(Ra, Rd);
|
RegCache::Realize(Ra, Rd);
|
||||||
@ -1468,24 +1466,22 @@ void Jit64::divwux(UGeckoInstruction inst)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
u64 magic_dividend = 0x100000000ULL << shift;
|
UnsignedMagic m = UnsignedDivisionConstants(divisor);
|
||||||
u32 magic = (u32)(magic_dividend / divisor);
|
|
||||||
u32 max_quotient = magic >> shift;
|
|
||||||
|
|
||||||
// Test for failure in round-up method
|
// Test for failure in round-up method
|
||||||
if (((u64)(magic + 1) * (max_quotient * divisor - 1)) >> (shift + 32) != max_quotient - 1)
|
if (!m.fast)
|
||||||
{
|
{
|
||||||
// If failed, use slower round-down method
|
// If failed, use slower round-down method
|
||||||
RCOpArg Ra = gpr.Use(a, RCMode::Read);
|
RCOpArg Ra = gpr.Use(a, RCMode::Read);
|
||||||
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
|
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
|
||||||
RegCache::Realize(Ra, Rd);
|
RegCache::Realize(Ra, Rd);
|
||||||
|
|
||||||
MOV(32, R(RSCRATCH), Imm32(magic));
|
MOV(32, R(RSCRATCH), Imm32(m.multiplier));
|
||||||
if (d != a)
|
if (d != a)
|
||||||
MOV(32, Rd, Ra);
|
MOV(32, Rd, Ra);
|
||||||
IMUL(64, Rd, R(RSCRATCH));
|
IMUL(64, Rd, R(RSCRATCH));
|
||||||
ADD(64, Rd, R(RSCRATCH));
|
ADD(64, Rd, R(RSCRATCH));
|
||||||
SHR(64, Rd, Imm8(shift + 32));
|
SHR(64, Rd, Imm8(m.shift + 32));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -1494,32 +1490,23 @@ void Jit64::divwux(UGeckoInstruction inst)
|
|||||||
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
|
RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
|
||||||
RegCache::Realize(Ra, Rd);
|
RegCache::Realize(Ra, Rd);
|
||||||
|
|
||||||
magic++;
|
|
||||||
|
|
||||||
// Use smallest magic number and shift amount possible
|
|
||||||
while ((magic & 1) == 0 && shift > 0)
|
|
||||||
{
|
|
||||||
magic >>= 1;
|
|
||||||
shift--;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Three-operand IMUL sign extends the immediate to 64 bits, so we may only
|
// Three-operand IMUL sign extends the immediate to 64 bits, so we may only
|
||||||
// use it when the magic number has its most significant bit set to 0
|
// use it when the magic number has its most significant bit set to 0
|
||||||
if ((magic & 0x80000000) == 0)
|
if ((m.multiplier & 0x80000000) == 0)
|
||||||
{
|
{
|
||||||
IMUL(64, Rd, Ra, Imm32(magic));
|
IMUL(64, Rd, Ra, Imm32(m.multiplier));
|
||||||
}
|
}
|
||||||
else if (d == a)
|
else if (d == a)
|
||||||
{
|
{
|
||||||
MOV(32, R(RSCRATCH), Imm32(magic));
|
MOV(32, R(RSCRATCH), Imm32(m.multiplier));
|
||||||
IMUL(64, Rd, R(RSCRATCH));
|
IMUL(64, Rd, R(RSCRATCH));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MOV(32, Rd, Imm32(magic));
|
MOV(32, Rd, Imm32(m.multiplier));
|
||||||
IMUL(64, Rd, Ra);
|
IMUL(64, Rd, Ra);
|
||||||
}
|
}
|
||||||
SHR(64, Rd, Imm8(shift + 32));
|
SHR(64, Rd, Imm8(m.shift + 32));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (inst.OE)
|
if (inst.OE)
|
||||||
@ -1792,7 +1779,7 @@ void Jit64::divwx(UGeckoInstruction inst)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Optimize signed 32-bit integer division by a constant
|
// Optimize signed 32-bit integer division by a constant
|
||||||
Magic m = SignedDivisionConstants(divisor);
|
SignedMagic m = SignedDivisionConstants(divisor);
|
||||||
|
|
||||||
MOVSX(64, 32, RSCRATCH, Ra);
|
MOVSX(64, 32, RSCRATCH, Ra);
|
||||||
|
|
||||||
|
@ -1538,6 +1538,60 @@ void JitArm64::divwux(UGeckoInstruction inst)
|
|||||||
if (inst.Rc)
|
if (inst.Rc)
|
||||||
ComputeRC0(gpr.GetImm(d));
|
ComputeRC0(gpr.GetImm(d));
|
||||||
}
|
}
|
||||||
|
else if (gpr.IsImm(b))
|
||||||
|
{
|
||||||
|
const u32 divisor = gpr.GetImm(b);
|
||||||
|
|
||||||
|
if (divisor == 0)
|
||||||
|
{
|
||||||
|
gpr.SetImmediate(d, 0);
|
||||||
|
if (inst.Rc)
|
||||||
|
ComputeRC0(0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const bool allocate_reg = d == a;
|
||||||
|
gpr.BindToRegister(d, allocate_reg);
|
||||||
|
|
||||||
|
ARM64Reg RD = gpr.R(d);
|
||||||
|
ARM64Reg RA = gpr.R(a);
|
||||||
|
|
||||||
|
if (MathUtil::IsPow2(divisor))
|
||||||
|
{
|
||||||
|
int shift = MathUtil::IntLog2(divisor);
|
||||||
|
if (shift)
|
||||||
|
LSR(RD, RA, shift);
|
||||||
|
else if (d != a)
|
||||||
|
MOV(RD, RA);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
UnsignedMagic m = UnsignedDivisionConstants(divisor);
|
||||||
|
|
||||||
|
ARM64Reg WI = allocate_reg ? gpr.GetReg() : RD;
|
||||||
|
ARM64Reg XD = EncodeRegTo64(RD);
|
||||||
|
|
||||||
|
MOVI2R(WI, m.multiplier);
|
||||||
|
|
||||||
|
if (m.fast)
|
||||||
|
{
|
||||||
|
UMULL(XD, RA, WI);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
UMADDL(XD, RA, WI, EncodeRegTo64(WI));
|
||||||
|
}
|
||||||
|
|
||||||
|
LSR(XD, XD, 32 + m.shift);
|
||||||
|
|
||||||
|
if (allocate_reg)
|
||||||
|
gpr.Unlock(WI);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inst.Rc)
|
||||||
|
ComputeRC0(gpr.R(d));
|
||||||
|
}
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
gpr.BindToRegister(d, d == a || d == b);
|
gpr.BindToRegister(d, d == a || d == b);
|
||||||
@ -1675,7 +1729,7 @@ void JitArm64::divwx(UGeckoInstruction inst)
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Optimize signed 32-bit integer division by a constant
|
// Optimize signed 32-bit integer division by a constant
|
||||||
Magic m = SignedDivisionConstants(divisor);
|
SignedMagic m = SignedDivisionConstants(divisor);
|
||||||
|
|
||||||
ARM64Reg WA = gpr.GetReg();
|
ARM64Reg WA = gpr.GetReg();
|
||||||
ARM64Reg WB = gpr.GetReg();
|
ARM64Reg WB = gpr.GetReg();
|
||||||
|
@ -3,16 +3,18 @@
|
|||||||
|
|
||||||
#include "Core/PowerPC/JitCommon/DivUtils.h"
|
#include "Core/PowerPC/JitCommon/DivUtils.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <bit>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
|
||||||
namespace JitCommon
|
namespace JitCommon
|
||||||
{
|
{
|
||||||
Magic SignedDivisionConstants(s32 d)
|
SignedMagic SignedDivisionConstants(s32 divisor)
|
||||||
{
|
{
|
||||||
const u32 two31 = 2147483648;
|
const u32 two31 = 2147483648;
|
||||||
|
|
||||||
const u32 ad = std::abs(d);
|
const u32 ad = std::abs(divisor);
|
||||||
const u32 t = two31 - (d < 0);
|
const u32 t = two31 - (divisor < 0);
|
||||||
const u32 anc = t - 1 - t % ad;
|
const u32 anc = t - 1 - t % ad;
|
||||||
u32 q1 = two31 / anc;
|
u32 q1 = two31 / anc;
|
||||||
u32 r1 = two31 - q1 * anc;
|
u32 r1 = two31 - q1 * anc;
|
||||||
@ -44,13 +46,43 @@ Magic SignedDivisionConstants(s32 d)
|
|||||||
delta = ad - r2;
|
delta = ad - r2;
|
||||||
} while (q1 < delta || (q1 == delta && r1 == 0));
|
} while (q1 < delta || (q1 == delta && r1 == 0));
|
||||||
|
|
||||||
Magic mag;
|
SignedMagic mag;
|
||||||
mag.multiplier = q2 + 1;
|
mag.multiplier = q2 + 1;
|
||||||
if (d < 0)
|
if (divisor < 0)
|
||||||
mag.multiplier = -mag.multiplier;
|
mag.multiplier = -mag.multiplier;
|
||||||
mag.shift = p - 32;
|
mag.shift = p - 32;
|
||||||
|
|
||||||
return mag;
|
return mag;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
UnsignedMagic UnsignedDivisionConstants(u32 divisor)
|
||||||
|
{
|
||||||
|
u32 shift = 31 - std::countl_zero(divisor);
|
||||||
|
|
||||||
|
u64 magic_dividend = 0x100000000ULL << shift;
|
||||||
|
u32 multiplier = magic_dividend / divisor;
|
||||||
|
u32 max_quotient = multiplier >> shift;
|
||||||
|
|
||||||
|
// Test for failure in round-up method
|
||||||
|
u32 round_up = (u64(multiplier + 1) * (max_quotient * divisor - 1)) >> (shift + 32);
|
||||||
|
bool fast = round_up == max_quotient - 1;
|
||||||
|
|
||||||
|
if (fast)
|
||||||
|
{
|
||||||
|
multiplier++;
|
||||||
|
|
||||||
|
// Use smallest magic number and shift amount possible
|
||||||
|
u32 trailing_zeroes = std::min(shift, u32(std::countr_zero(multiplier)));
|
||||||
|
multiplier >>= trailing_zeroes;
|
||||||
|
shift -= trailing_zeroes;
|
||||||
|
}
|
||||||
|
|
||||||
|
UnsignedMagic mag;
|
||||||
|
mag.multiplier = multiplier;
|
||||||
|
mag.shift = shift;
|
||||||
|
mag.fast = fast;
|
||||||
|
|
||||||
|
return mag;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace JitCommon
|
} // namespace JitCommon
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
namespace JitCommon
|
namespace JitCommon
|
||||||
{
|
{
|
||||||
struct Magic
|
struct SignedMagic
|
||||||
{
|
{
|
||||||
s32 multiplier;
|
s32 multiplier;
|
||||||
u8 shift;
|
u8 shift;
|
||||||
@ -16,6 +16,27 @@ struct Magic
|
|||||||
// Calculate the constants required to optimize a signed 32-bit integer division.
|
// Calculate the constants required to optimize a signed 32-bit integer division.
|
||||||
// Taken from The PowerPC Compiler Writer's Guide and LLVM.
|
// Taken from The PowerPC Compiler Writer's Guide and LLVM.
|
||||||
// Divisor must not be -1, 0, 1 or INT_MIN.
|
// Divisor must not be -1, 0, 1 or INT_MIN.
|
||||||
Magic SignedDivisionConstants(s32 divisor);
|
SignedMagic SignedDivisionConstants(s32 divisor);
|
||||||
|
|
||||||
|
struct UnsignedMagic
|
||||||
|
{
|
||||||
|
u32 multiplier;
|
||||||
|
u8 shift;
|
||||||
|
bool fast;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Calculate the constants required to optimize an unsigned 32-bit integer
|
||||||
|
/// division.
|
||||||
|
/// Divisor must not be 0, 1, or a power of two.
|
||||||
|
///
|
||||||
|
/// Original implementation by calc84maniac.
|
||||||
|
/// Results are the same as the approach laid out in Hacker's Delight, with an
|
||||||
|
/// improvement for so-called uncooperative divisors (e.g. 7), as discovered by
|
||||||
|
/// ridiculousfish.
|
||||||
|
///
|
||||||
|
/// See also:
|
||||||
|
/// https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
|
||||||
|
/// https://rubenvannieuwpoort.nl/posts/division-by-constant-unsigned-integers
|
||||||
|
UnsignedMagic UnsignedDivisionConstants(u32 divisor);
|
||||||
|
|
||||||
} // namespace JitCommon
|
} // namespace JitCommon
|
||||||
|
@ -9,12 +9,12 @@ using namespace JitCommon;
|
|||||||
|
|
||||||
TEST(DivUtils, Signed)
|
TEST(DivUtils, Signed)
|
||||||
{
|
{
|
||||||
Magic m3 = SignedDivisionConstants(3);
|
SignedMagic m3 = SignedDivisionConstants(3);
|
||||||
Magic m5 = SignedDivisionConstants(5);
|
SignedMagic m5 = SignedDivisionConstants(5);
|
||||||
Magic m7 = SignedDivisionConstants(7);
|
SignedMagic m7 = SignedDivisionConstants(7);
|
||||||
Magic minus3 = SignedDivisionConstants(-3);
|
SignedMagic minus3 = SignedDivisionConstants(-3);
|
||||||
Magic minus5 = SignedDivisionConstants(-5);
|
SignedMagic minus5 = SignedDivisionConstants(-5);
|
||||||
Magic minus7 = SignedDivisionConstants(-7);
|
SignedMagic minus7 = SignedDivisionConstants(-7);
|
||||||
|
|
||||||
EXPECT_EQ(0x55555556, m3.multiplier);
|
EXPECT_EQ(0x55555556, m3.multiplier);
|
||||||
EXPECT_EQ(0, m3.shift);
|
EXPECT_EQ(0, m3.shift);
|
||||||
@ -30,3 +30,32 @@ TEST(DivUtils, Signed)
|
|||||||
EXPECT_EQ(0x6DB6DB6D, minus7.multiplier);
|
EXPECT_EQ(0x6DB6DB6D, minus7.multiplier);
|
||||||
EXPECT_EQ(2, minus7.shift);
|
EXPECT_EQ(2, minus7.shift);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(DivUtils, Unsigned)
|
||||||
|
{
|
||||||
|
UnsignedMagic m3 = UnsignedDivisionConstants(3);
|
||||||
|
UnsignedMagic m5 = UnsignedDivisionConstants(5);
|
||||||
|
UnsignedMagic m7 = UnsignedDivisionConstants(7);
|
||||||
|
UnsignedMagic m9 = UnsignedDivisionConstants(9);
|
||||||
|
UnsignedMagic m19 = UnsignedDivisionConstants(19);
|
||||||
|
|
||||||
|
EXPECT_EQ(0xAAAAAAABU, m3.multiplier);
|
||||||
|
EXPECT_EQ(1, m3.shift);
|
||||||
|
EXPECT_TRUE(m3.fast);
|
||||||
|
|
||||||
|
EXPECT_EQ(0xCCCCCCCDU, m5.multiplier);
|
||||||
|
EXPECT_EQ(2, m5.shift);
|
||||||
|
EXPECT_TRUE(m5.fast);
|
||||||
|
|
||||||
|
EXPECT_EQ(0x92492492U, m7.multiplier);
|
||||||
|
EXPECT_EQ(2, m7.shift);
|
||||||
|
EXPECT_FALSE(m7.fast);
|
||||||
|
|
||||||
|
EXPECT_EQ(0x38E38E39U, m9.multiplier);
|
||||||
|
EXPECT_EQ(1, m9.shift);
|
||||||
|
EXPECT_TRUE(m9.fast);
|
||||||
|
|
||||||
|
EXPECT_EQ(0xD79435E5U, m19.multiplier);
|
||||||
|
EXPECT_EQ(4, m19.shift);
|
||||||
|
EXPECT_FALSE(m19.fast);
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user