Merge pull request #11127 from JosJuice/aarch64-jitasm-macro-op

JitAsm64: Reorder instructions in routines to allow macro-op fusion
This commit is contained in:
Markus Wick 2022-10-22 10:31:54 +02:00 committed by GitHub
commit 7a2256469e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -239,8 +239,8 @@ void JitArm64::GenerateFres()
UBFX(ARM64Reg::X2, ARM64Reg::X1, 52, 11); // Grab the exponent UBFX(ARM64Reg::X2, ARM64Reg::X1, 52, 11); // Grab the exponent
m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0); m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0);
CMP(ARM64Reg::X2, 895);
AND(ARM64Reg::X3, ARM64Reg::X1, LogicalImm(Common::DOUBLE_SIGN, 64)); AND(ARM64Reg::X3, ARM64Reg::X1, LogicalImm(Common::DOUBLE_SIGN, 64));
CMP(ARM64Reg::X2, 895);
FixupBranch small_exponent = B(CCFlags::CC_LO); FixupBranch small_exponent = B(CCFlags::CC_LO);
MOVI2R(ARM64Reg::X4, 1148LL); MOVI2R(ARM64Reg::X4, 1148LL);
@ -291,8 +291,8 @@ void JitArm64::GenerateFrsqrte()
// inf, even the mantissa matches. But the mantissa does not match for most other inputs, so in // inf, even the mantissa matches. But the mantissa does not match for most other inputs, so in
// the normal case we calculate the mantissa using the table-based algorithm from the interpreter. // the normal case we calculate the mantissa using the table-based algorithm from the interpreter.
TST(ARM64Reg::X1, LogicalImm(Common::DOUBLE_EXP | Common::DOUBLE_FRAC, 64));
m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0); m_float_emit.FMOV(ARM64Reg::X0, ARM64Reg::D0);
TST(ARM64Reg::X1, LogicalImm(Common::DOUBLE_EXP | Common::DOUBLE_FRAC, 64));
FixupBranch zero = B(CCFlags::CC_EQ); FixupBranch zero = B(CCFlags::CC_EQ);
AND(ARM64Reg::X2, ARM64Reg::X1, LogicalImm(Common::DOUBLE_EXP, 64)); AND(ARM64Reg::X2, ARM64Reg::X1, LogicalImm(Common::DOUBLE_EXP, 64));
MOVI2R(ARM64Reg::X3, Common::DOUBLE_EXP); MOVI2R(ARM64Reg::X3, Common::DOUBLE_EXP);
@ -314,7 +314,7 @@ void JitArm64::GenerateFrsqrte()
LSR(ARM64Reg::X2, ARM64Reg::X2, 48); LSR(ARM64Reg::X2, ARM64Reg::X2, 48);
AND(ARM64Reg::X2, ARM64Reg::X2, LogicalImm(0x10, 64)); AND(ARM64Reg::X2, ARM64Reg::X2, LogicalImm(0x10, 64));
MOVP2R(ARM64Reg::X1, &Common::frsqrte_expected); MOVP2R(ARM64Reg::X1, &Common::frsqrte_expected);
ORR(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X8, ShiftType::LSR, 48)); ORR(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSR, 48));
EOR(ARM64Reg::X2, ARM64Reg::X2, LogicalImm(0x10, 64)); EOR(ARM64Reg::X2, ARM64Reg::X2, LogicalImm(0x10, 64));
ADD(ARM64Reg::X2, ARM64Reg::X1, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3)); ADD(ARM64Reg::X2, ARM64Reg::X1, ARM64Reg::X2, ArithOption(ARM64Reg::X2, ShiftType::LSL, 3));
LDP(IndexType::Signed, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::X2, 0); LDP(IndexType::Signed, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::X2, 0);
@ -351,9 +351,9 @@ void JitArm64::GenerateFrsqrte()
void JitArm64::GenerateConvertDoubleToSingle() void JitArm64::GenerateConvertDoubleToSingle()
{ {
UBFX(ARM64Reg::X2, ARM64Reg::X0, 52, 11); UBFX(ARM64Reg::X2, ARM64Reg::X0, 52, 11);
LSR(ARM64Reg::X1, ARM64Reg::X0, 32);
SUB(ARM64Reg::W3, ARM64Reg::W2, 874); SUB(ARM64Reg::W3, ARM64Reg::W2, 874);
CMP(ARM64Reg::W3, 896 - 874); CMP(ARM64Reg::W3, 896 - 874);
LSR(ARM64Reg::X1, ARM64Reg::X0, 32);
FixupBranch denormal = B(CCFlags::CC_LS); FixupBranch denormal = B(CCFlags::CC_LS);
BFXIL(ARM64Reg::X1, ARM64Reg::X0, 29, 30); BFXIL(ARM64Reg::X1, ARM64Reg::X0, 29, 30);
@ -397,8 +397,8 @@ void JitArm64::GenerateConvertSingleToDouble()
RET(); RET();
SetJumpTarget(normal_or_nan); SetJumpTarget(normal_or_nan);
CMP(ARM64Reg::W1, 0xff);
AND(ARM64Reg::W2, ARM64Reg::W0, LogicalImm(0x40000000, 32)); AND(ARM64Reg::W2, ARM64Reg::W0, LogicalImm(0x40000000, 32));
CMP(ARM64Reg::W1, 0xff);
CSET(ARM64Reg::W4, CCFlags::CC_NEQ); CSET(ARM64Reg::W4, CCFlags::CC_NEQ);
AND(ARM64Reg::W3, ARM64Reg::W0, LogicalImm(0xc0000000, 32)); AND(ARM64Reg::W3, ARM64Reg::W0, LogicalImm(0xc0000000, 32));
EOR(ARM64Reg::W2, ARM64Reg::W4, ARM64Reg::W2, ArithOption(ARM64Reg::W2, ShiftType::LSR, 30)); EOR(ARM64Reg::W2, ARM64Reg::W4, ARM64Reg::W2, ArithOption(ARM64Reg::W2, ShiftType::LSR, 30));
@ -440,14 +440,13 @@ void JitArm64::GenerateFPRF(bool single)
// First of all, start the load of the old FPSCR value, in case it takes a while // First of all, start the load of the old FPSCR value, in case it takes a while
LDR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr)); LDR(IndexType::Unsigned, fpscr_reg, PPC_REG, PPCSTATE_OFF(fpscr));
CMP(input_reg, 0); // Grab sign bit (conveniently the same bit for floats as for integers)
AND(exp_reg, input_reg, LogicalImm(input_exp_mask, input_size)); // Grab exponent
// Most branches handle the sign in the same way. Perform that handling before branching // Most branches handle the sign in the same way. Perform that handling before branching
MOVI2R(ARM64Reg::W3, Common::PPC_FPCLASS_PN); MOVI2R(ARM64Reg::W3, Common::PPC_FPCLASS_PN);
MOVI2R(ARM64Reg::W1, Common::PPC_FPCLASS_NN); MOVI2R(ARM64Reg::W1, Common::PPC_FPCLASS_NN);
CMP(input_reg, 0); // Grab sign bit (conveniently the same bit for floats as for integers)
CSEL(fprf_reg, ARM64Reg::W1, ARM64Reg::W3, CCFlags::CC_LT); CSEL(fprf_reg, ARM64Reg::W1, ARM64Reg::W3, CCFlags::CC_LT);
AND(exp_reg, input_reg, LogicalImm(input_exp_mask, input_size)); // Grab exponent
FixupBranch zero_or_denormal = CBZ(exp_reg); FixupBranch zero_or_denormal = CBZ(exp_reg);
// exp != 0 // exp != 0
@ -477,9 +476,9 @@ void JitArm64::GenerateFPRF(bool single)
// exp == EXP_MASK // exp == EXP_MASK
SetJumpTarget(nan_or_inf); SetJumpTarget(nan_or_inf);
TST(input_reg, LogicalImm(input_frac_mask, input_size));
ORR(ARM64Reg::W1, fprf_reg, LogicalImm(Common::PPC_FPCLASS_PINF & ~output_sign_mask, 32));
MOVI2R(ARM64Reg::W2, Common::PPC_FPCLASS_QNAN); MOVI2R(ARM64Reg::W2, Common::PPC_FPCLASS_QNAN);
ORR(ARM64Reg::W1, fprf_reg, LogicalImm(Common::PPC_FPCLASS_PINF & ~output_sign_mask, 32));
TST(input_reg, LogicalImm(input_frac_mask, input_size));
CSEL(fprf_reg, ARM64Reg::W1, ARM64Reg::W2, CCFlags::CC_EQ); CSEL(fprf_reg, ARM64Reg::W1, ARM64Reg::W2, CCFlags::CC_EQ);
B(write_fprf_and_ret); B(write_fprf_and_ret);
} }