Merge pull request #12661 from Sintendo/arm64divwux

JitArm64: Optimize divwux
This commit is contained in:
JosJuice 2024-03-29 15:36:18 +01:00 committed by GitHub
commit 5f6a054ffc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 163 additions and 40 deletions

View File

@ -1451,12 +1451,10 @@ void Jit64::divwux(UGeckoInstruction inst)
} }
else else
{ {
u32 shift = 31; if (MathUtil::IsPow2(divisor))
while (!(divisor & (1 << shift)))
shift--;
if (divisor == (u32)(1 << shift))
{ {
u32 shift = MathUtil::IntLog2(divisor);
RCOpArg Ra = gpr.Use(a, RCMode::Read); RCOpArg Ra = gpr.Use(a, RCMode::Read);
RCX64Reg Rd = gpr.Bind(d, RCMode::Write); RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
RegCache::Realize(Ra, Rd); RegCache::Realize(Ra, Rd);
@ -1468,24 +1466,22 @@ void Jit64::divwux(UGeckoInstruction inst)
} }
else else
{ {
u64 magic_dividend = 0x100000000ULL << shift; UnsignedMagic m = UnsignedDivisionConstants(divisor);
u32 magic = (u32)(magic_dividend / divisor);
u32 max_quotient = magic >> shift;
// Test for failure in round-up method // Test for failure in round-up method
if (((u64)(magic + 1) * (max_quotient * divisor - 1)) >> (shift + 32) != max_quotient - 1) if (!m.fast)
{ {
// If failed, use slower round-down method // If failed, use slower round-down method
RCOpArg Ra = gpr.Use(a, RCMode::Read); RCOpArg Ra = gpr.Use(a, RCMode::Read);
RCX64Reg Rd = gpr.Bind(d, RCMode::Write); RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
RegCache::Realize(Ra, Rd); RegCache::Realize(Ra, Rd);
MOV(32, R(RSCRATCH), Imm32(magic)); MOV(32, R(RSCRATCH), Imm32(m.multiplier));
if (d != a) if (d != a)
MOV(32, Rd, Ra); MOV(32, Rd, Ra);
IMUL(64, Rd, R(RSCRATCH)); IMUL(64, Rd, R(RSCRATCH));
ADD(64, Rd, R(RSCRATCH)); ADD(64, Rd, R(RSCRATCH));
SHR(64, Rd, Imm8(shift + 32)); SHR(64, Rd, Imm8(m.shift + 32));
} }
else else
{ {
@ -1494,32 +1490,23 @@ void Jit64::divwux(UGeckoInstruction inst)
RCX64Reg Rd = gpr.Bind(d, RCMode::Write); RCX64Reg Rd = gpr.Bind(d, RCMode::Write);
RegCache::Realize(Ra, Rd); RegCache::Realize(Ra, Rd);
magic++;
// Use smallest magic number and shift amount possible
while ((magic & 1) == 0 && shift > 0)
{
magic >>= 1;
shift--;
}
// Three-operand IMUL sign extends the immediate to 64 bits, so we may only // Three-operand IMUL sign extends the immediate to 64 bits, so we may only
// use it when the magic number has its most significant bit set to 0 // use it when the magic number has its most significant bit set to 0
if ((magic & 0x80000000) == 0) if ((m.multiplier & 0x80000000) == 0)
{ {
IMUL(64, Rd, Ra, Imm32(magic)); IMUL(64, Rd, Ra, Imm32(m.multiplier));
} }
else if (d == a) else if (d == a)
{ {
MOV(32, R(RSCRATCH), Imm32(magic)); MOV(32, R(RSCRATCH), Imm32(m.multiplier));
IMUL(64, Rd, R(RSCRATCH)); IMUL(64, Rd, R(RSCRATCH));
} }
else else
{ {
MOV(32, Rd, Imm32(magic)); MOV(32, Rd, Imm32(m.multiplier));
IMUL(64, Rd, Ra); IMUL(64, Rd, Ra);
} }
SHR(64, Rd, Imm8(shift + 32)); SHR(64, Rd, Imm8(m.shift + 32));
} }
} }
if (inst.OE) if (inst.OE)
@ -1792,7 +1779,7 @@ void Jit64::divwx(UGeckoInstruction inst)
else else
{ {
// Optimize signed 32-bit integer division by a constant // Optimize signed 32-bit integer division by a constant
Magic m = SignedDivisionConstants(divisor); SignedMagic m = SignedDivisionConstants(divisor);
MOVSX(64, 32, RSCRATCH, Ra); MOVSX(64, 32, RSCRATCH, Ra);

View File

@ -1538,6 +1538,60 @@ void JitArm64::divwux(UGeckoInstruction inst)
if (inst.Rc) if (inst.Rc)
ComputeRC0(gpr.GetImm(d)); ComputeRC0(gpr.GetImm(d));
} }
else if (gpr.IsImm(b))
{
const u32 divisor = gpr.GetImm(b);
if (divisor == 0)
{
gpr.SetImmediate(d, 0);
if (inst.Rc)
ComputeRC0(0);
}
else
{
const bool allocate_reg = d == a;
gpr.BindToRegister(d, allocate_reg);
ARM64Reg RD = gpr.R(d);
ARM64Reg RA = gpr.R(a);
if (MathUtil::IsPow2(divisor))
{
int shift = MathUtil::IntLog2(divisor);
if (shift)
LSR(RD, RA, shift);
else if (d != a)
MOV(RD, RA);
}
else
{
UnsignedMagic m = UnsignedDivisionConstants(divisor);
ARM64Reg WI = allocate_reg ? gpr.GetReg() : RD;
ARM64Reg XD = EncodeRegTo64(RD);
MOVI2R(WI, m.multiplier);
if (m.fast)
{
UMULL(XD, RA, WI);
}
else
{
UMADDL(XD, RA, WI, EncodeRegTo64(WI));
}
LSR(XD, XD, 32 + m.shift);
if (allocate_reg)
gpr.Unlock(WI);
}
if (inst.Rc)
ComputeRC0(gpr.R(d));
}
}
else else
{ {
gpr.BindToRegister(d, d == a || d == b); gpr.BindToRegister(d, d == a || d == b);
@ -1675,7 +1729,7 @@ void JitArm64::divwx(UGeckoInstruction inst)
else else
{ {
// Optimize signed 32-bit integer division by a constant // Optimize signed 32-bit integer division by a constant
Magic m = SignedDivisionConstants(divisor); SignedMagic m = SignedDivisionConstants(divisor);
ARM64Reg WA = gpr.GetReg(); ARM64Reg WA = gpr.GetReg();
ARM64Reg WB = gpr.GetReg(); ARM64Reg WB = gpr.GetReg();

View File

@ -3,16 +3,18 @@
#include "Core/PowerPC/JitCommon/DivUtils.h" #include "Core/PowerPC/JitCommon/DivUtils.h"
#include <algorithm>
#include <bit>
#include <cstdlib> #include <cstdlib>
namespace JitCommon namespace JitCommon
{ {
Magic SignedDivisionConstants(s32 d) SignedMagic SignedDivisionConstants(s32 divisor)
{ {
const u32 two31 = 2147483648; const u32 two31 = 2147483648;
const u32 ad = std::abs(d); const u32 ad = std::abs(divisor);
const u32 t = two31 - (d < 0); const u32 t = two31 - (divisor < 0);
const u32 anc = t - 1 - t % ad; const u32 anc = t - 1 - t % ad;
u32 q1 = two31 / anc; u32 q1 = two31 / anc;
u32 r1 = two31 - q1 * anc; u32 r1 = two31 - q1 * anc;
@ -44,13 +46,43 @@ Magic SignedDivisionConstants(s32 d)
delta = ad - r2; delta = ad - r2;
} while (q1 < delta || (q1 == delta && r1 == 0)); } while (q1 < delta || (q1 == delta && r1 == 0));
Magic mag; SignedMagic mag;
mag.multiplier = q2 + 1; mag.multiplier = q2 + 1;
if (d < 0) if (divisor < 0)
mag.multiplier = -mag.multiplier; mag.multiplier = -mag.multiplier;
mag.shift = p - 32; mag.shift = p - 32;
return mag; return mag;
} }
UnsignedMagic UnsignedDivisionConstants(u32 divisor)
{
  // Position of the highest set bit of the divisor.
  const u32 top_bit = 31 - std::countl_zero(divisor);

  // Round-down candidate: 2^(32 + top_bit) / divisor, kept to its low 32 bits.
  const u32 round_down_magic = static_cast<u32>((0x100000000ULL << top_bit) / divisor);
  const u32 max_quotient = round_down_magic >> top_bit;

  // Check whether the round-up method (magic + 1, no post-multiply addition)
  // still yields the right quotient for the dividend just below the largest
  // multiple of the divisor.
  const u32 probe_dividend = max_quotient * divisor - 1;
  const u64 probe_product = u64(round_down_magic + 1) * probe_dividend;
  const u32 probe_quotient = static_cast<u32>(probe_product >> (top_bit + 32));

  UnsignedMagic result;
  result.fast = probe_quotient == max_quotient - 1;

  if (!result.fast)
  {
    // Round-up failed: hand back the round-down constants untouched.
    result.multiplier = round_down_magic;
    result.shift = top_bit;
    return result;
  }

  // Round-up succeeded. Strip common factors of two from the magic number and
  // the shift so that the smallest possible pair is returned.
  const u32 round_up_magic = round_down_magic + 1;
  const u32 removable = std::min(top_bit, u32(std::countr_zero(round_up_magic)));
  result.multiplier = round_up_magic >> removable;
  result.shift = top_bit - removable;
  return result;
}
} // namespace JitCommon } // namespace JitCommon

View File

@ -7,7 +7,7 @@
namespace JitCommon namespace JitCommon
{ {
struct Magic struct SignedMagic
{ {
s32 multiplier; s32 multiplier;
u8 shift; u8 shift;
@ -16,6 +16,27 @@ struct Magic
// Calculate the constants required to optimize a signed 32-bit integer division. // Calculate the constants required to optimize a signed 32-bit integer division.
// Taken from The PowerPC Compiler Writer's Guide and LLVM. // Taken from The PowerPC Compiler Writer's Guide and LLVM.
// Divisor must not be -1, 0, 1 or INT_MIN. // Divisor must not be -1, 0, 1 or INT_MIN.
Magic SignedDivisionConstants(s32 divisor); SignedMagic SignedDivisionConstants(s32 divisor);
// Constants describing how to replace an unsigned 32-bit division by a
// constant with a 64-bit multiplication followed by a right shift.
// Filled in by UnsignedDivisionConstants.
struct UnsignedMagic
{
// 32-bit magic number the dividend is multiplied with.
u32 multiplier;
// Additional right shift; the JITs shift the 64-bit product by 32 + shift.
u8 shift;
// True if the round-up method worked: the quotient is
// (dividend * multiplier) >> (32 + shift).
// False if the slower round-down method is required, in which case the JITs
// also add multiplier to the product before shifting.
bool fast;
};
/// Calculate the constants required to optimize an unsigned 32-bit integer
/// division.
/// Divisor must not be 0, 1, or a power of two.
///
/// The returned multiplier and shift are meant to be used as
/// (dividend * multiplier) >> (32 + shift) on a 64-bit product. When the
/// returned fast flag is false, the round-down method has to be used instead,
/// which additionally adds multiplier to the product before the shift.
///
/// Original implementation by calc84maniac.
/// Results are the same as the approach laid out in Hacker's Delight, with an
/// improvement for so-called uncooperative divisors (e.g. 7), as discovered by
/// ridiculousfish.
///
/// See also:
/// https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
/// https://rubenvannieuwpoort.nl/posts/division-by-constant-unsigned-integers
UnsignedMagic UnsignedDivisionConstants(u32 divisor);
} // namespace JitCommon } // namespace JitCommon

View File

@ -9,12 +9,12 @@ using namespace JitCommon;
TEST(DivUtils, Signed) TEST(DivUtils, Signed)
{ {
Magic m3 = SignedDivisionConstants(3); SignedMagic m3 = SignedDivisionConstants(3);
Magic m5 = SignedDivisionConstants(5); SignedMagic m5 = SignedDivisionConstants(5);
Magic m7 = SignedDivisionConstants(7); SignedMagic m7 = SignedDivisionConstants(7);
Magic minus3 = SignedDivisionConstants(-3); SignedMagic minus3 = SignedDivisionConstants(-3);
Magic minus5 = SignedDivisionConstants(-5); SignedMagic minus5 = SignedDivisionConstants(-5);
Magic minus7 = SignedDivisionConstants(-7); SignedMagic minus7 = SignedDivisionConstants(-7);
EXPECT_EQ(0x55555556, m3.multiplier); EXPECT_EQ(0x55555556, m3.multiplier);
EXPECT_EQ(0, m3.shift); EXPECT_EQ(0, m3.shift);
@ -30,3 +30,32 @@ TEST(DivUtils, Signed)
EXPECT_EQ(0x6DB6DB6D, minus7.multiplier); EXPECT_EQ(0x6DB6DB6D, minus7.multiplier);
EXPECT_EQ(2, minus7.shift); EXPECT_EQ(2, minus7.shift);
} }
// Checks the unsigned magic constants for a handful of small divisors,
// covering both the fast (round-up) and the slow (round-down) outcome.
TEST(DivUtils, Unsigned)
{
  const UnsignedMagic by_three = UnsignedDivisionConstants(3);
  const UnsignedMagic by_five = UnsignedDivisionConstants(5);
  const UnsignedMagic by_seven = UnsignedDivisionConstants(7);
  const UnsignedMagic by_nine = UnsignedDivisionConstants(9);
  const UnsignedMagic by_nineteen = UnsignedDivisionConstants(19);

  // 3: round-up method applies
  EXPECT_EQ(0xAAAAAAABU, by_three.multiplier);
  EXPECT_EQ(1, by_three.shift);
  EXPECT_TRUE(by_three.fast);

  // 5: round-up method applies
  EXPECT_EQ(0xCCCCCCCDU, by_five.multiplier);
  EXPECT_EQ(2, by_five.shift);
  EXPECT_TRUE(by_five.fast);

  // 7: falls back to the round-down method
  EXPECT_EQ(0x92492492U, by_seven.multiplier);
  EXPECT_EQ(2, by_seven.shift);
  EXPECT_FALSE(by_seven.fast);

  // 9: round-up method applies
  EXPECT_EQ(0x38E38E39U, by_nine.multiplier);
  EXPECT_EQ(1, by_nine.shift);
  EXPECT_TRUE(by_nine.fast);

  // 19: falls back to the round-down method
  EXPECT_EQ(0xD79435E5U, by_nineteen.multiplier);
  EXPECT_EQ(4, by_nineteen.shift);
  EXPECT_FALSE(by_nineteen.fast);
}