Merge pull request #7428 from MerryMage/rm-j-GenFrsqrte

GenFrsqrte: Reduce branches in fast-path and inline most behavior
This commit is contained in:
Mat M 2018-09-28 13:41:23 -04:00 committed by GitHub
commit 0e0fd18d5e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 148 additions and 17 deletions

View File

@ -32,17 +32,14 @@ void CommonAsmRoutines::GenFrsqrte()
// This function clobbers all three RSCRATCH.
MOVQ_xmm(R(RSCRATCH), XMM0);
// Negative and zero inputs set an exception and take the complex path.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch zero = J_CC(CC_Z, true);
FixupBranch negative = J_CC(CC_S, true);
// Extract exponent
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
// Zero and max exponents (non-normal floats) take the complex path.
FixupBranch complex1 = J_CC(CC_Z, true);
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
FixupBranch complex2 = J_CC(CC_E, true);
// Negatives, zeros, denormals, infinities and NaNs take the complex path.
LEA(32, RSCRATCH2, MDisp(RSCRATCH_EXTRA, -1));
CMP(32, R(RSCRATCH2), Imm32(0x7FE));
FixupBranch complex = J_CC(CC_AE, true);
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
@ -75,24 +72,53 @@ void CommonAsmRoutines::GenFrsqrte()
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for zero input.
SetJumpTarget(zero);
SetJumpTarget(complex);
AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
FixupBranch nan_or_inf = J_CC(CC_E);
MOV(64, R(RSCRATCH2), R(RSCRATCH));
SHL(64, R(RSCRATCH2), Imm8(1));
FixupBranch nonzero = J_CC(CC_NZ);
// +0.0 or -0.0
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
FixupBranch complex3 = J();
SetJumpTarget(skip_set_fx1);
MOV(64, R(RSCRATCH2), Imm64(0x7FF0'0000'0000'0000));
OR(64, R(RSCRATCH2), R(RSCRATCH));
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for negative input.
// SNaN or QNaN or +Inf or -Inf
SetJumpTarget(nan_or_inf);
MOV(64, R(RSCRATCH2), R(RSCRATCH));
SHL(64, R(RSCRATCH2), Imm8(12));
FixupBranch inf = J_CC(CC_Z);
BTS(64, R(RSCRATCH), Imm8(51));
MOVQ_xmm(XMM0, R(RSCRATCH));
RET();
SetJumpTarget(inf);
BT(64, R(RSCRATCH), Imm8(63));
FixupBranch negative = J_CC(CC_C);
XORPD(XMM0, R(XMM0));
RET();
SetJumpTarget(nonzero);
FixupBranch denormal = J_CC(CC_NC);
// Negative sign
SetJumpTarget(negative);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
FixupBranch skip_set_fx2 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
SetJumpTarget(skip_set_fx1);
SetJumpTarget(skip_set_fx2);
SetJumpTarget(complex1);
SetJumpTarget(complex2);
SetJumpTarget(complex3);
MOV(64, R(RSCRATCH2), Imm64(0x7FF8'0000'0000'0000));
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
SetJumpTarget(denormal);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction(Common::ApproximateReciprocalSquareRoot);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);

View File

@ -13,3 +13,7 @@ add_dolphin_test(DSPAssemblyTest
add_dolphin_test(ESFormatsTest IOS/ES/FormatsTest.cpp IOS/ES/TestBinaryData.cpp)
add_dolphin_test(FileSystemTest IOS/FS/FileSystemTest.cpp)
# PowerPC/Jit64Common/Frsqrte.cpp is only added to the test suite when _M_X86 is set,
# i.e. the PowerPCTest target does not exist on other configurations.
if(_M_X86)
add_dolphin_test(PowerPCTest PowerPC/Jit64Common/Frsqrte.cpp)
endif()

View File

@ -0,0 +1,101 @@
// Copyright 2018 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <cstring>
#include <vector>
#include "Common/BitUtils.h"
#include "Common/CommonTypes.h"
#include "Common/FloatUtils.h"
#include "Common/x64ABI.h"
#include "Core/PowerPC/Gekko.h"
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
#include "Core/PowerPC/Jit64Common/Jit64Base.h"
#include "Core/PowerPC/Jit64Common/Jit64PowerPCState.h"
#include <gtest/gtest.h>
// Test harness around CommonAsmRoutines: emits the frsqrte routine produced by GenFrsqrte()
// into its own code space, followed by a small wrapper thunk that makes the routine callable
// from C++ through wrapped_frsqrte.
class TestCommonAsmRoutines : public CommonAsmRoutines
{
public:
// Emits all code up front; after construction wrapped_frsqrte points at the wrapper thunk.
TestCommonAsmRoutines()
{
using namespace Gen;
// Reserve the code region, and carve a child region out of it for the constant pool.
AllocCodeSpace(4096);
m_const_pool.Init(AllocChildCodeSpace(1024), 1024);
// Remember where the raw routine begins, then emit it. The raw routine takes its input
// in XMM0 and leaves its result in XMM0 (see the MOVQ_xmm calls around the call below).
const auto raw_frsqrte = reinterpret_cast<double (*)(double)>(AlignCode4());
GenFrsqrte();
// Everything emitted from here on is the wrapper thunk: (input bits, fpscr) -> result bits.
wrapped_frsqrte = reinterpret_cast<u64 (*)(u64, UReg_FPSCR&)>(AlignCode4());
ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
// We know the frsqrte implementation only accesses the fpscr. We manufacture a
// PPCSTATE pointer so we read/write to our provided fpscr argument instead.
// Steps: zero RPPCSTATE so that the LEA yields just the displacement PPCSTATE(fpscr) adds
// to the base, subtract that displacement from the second argument (presumably the address
// of the caller's UReg_FPSCR, since it is passed by reference), and use the result as the
// base. PPCSTATE(fpscr) then resolves to the second argument.
XOR(32, R(RPPCSTATE), R(RPPCSTATE));
LEA(64, RSCRATCH, PPCSTATE(fpscr));
SUB(64, R(ABI_PARAM2), R(RSCRATCH));
MOV(64, R(RPPCSTATE), R(ABI_PARAM2));
// Call: move the input bit pattern from the first argument into XMM0, invoke the raw
// routine, then move the result bits from XMM0 into the integer return register.
MOVQ_xmm(XMM0, R(ABI_PARAM1));
ABI_CallFunction(raw_frsqrte);
MOVQ_xmm(R(ABI_RETURN), XMM0);
// Undo the register save from the top of the thunk and return to the C++ caller.
ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8, 16);
RET();
}
// Entry point of the emitted wrapper thunk. Takes the raw 64-bit pattern of the input double
// and the FPSCR the routine should read/write; returns the raw 64-bit pattern of the result.
u64 (*wrapped_frsqrte)(u64, UReg_FPSCR&);
};
// Verifies that the emitted frsqrte routine returns, bit for bit, the same value as
// Common::ApproximateReciprocalSquareRoot for a fixed list of boundary inputs: zeros,
// denormals, smallest/largest normals, infinities, SNaNs and QNaNs, in both signs.
TEST(Jit64, Frsqrte)
{
  TestCommonAsmRoutines routines;

  const std::vector<u64> special_values{
      0x0000'0000'0000'0000,  // positive zero
      0x0000'0000'0000'0001,  // smallest positive denormal
      0x0000'0000'0100'0000,
      0x000F'FFFF'FFFF'FFFF,  // largest positive denormal
      0x0010'0000'0000'0000,  // smallest positive normal
      0x0010'0000'0000'0002,
      0x3FF0'0000'0000'0000,  // 1.0
      0x7FEF'FFFF'FFFF'FFFF,  // largest positive normal
      0x7FF0'0000'0000'0000,  // positive infinity
      0x7FF0'0000'0000'0001,  // first positive SNaN
      0x7FF7'FFFF'FFFF'FFFF,  // last positive SNaN
      0x7FF8'0000'0000'0000,  // first positive QNaN
      0x7FFF'FFFF'FFFF'FFFF,  // last positive QNaN
      0x8000'0000'0000'0000,  // negative zero
      0x8000'0000'0000'0001,  // smallest negative denormal
      0x8000'0000'0100'0000,
      0x800F'FFFF'FFFF'FFFF,  // largest negative denormal
      0x8010'0000'0000'0000,  // smallest negative normal
      0x8010'0000'0000'0002,
      0xBFF0'0000'0000'0000,  // -1.0
      0xFFEF'FFFF'FFFF'FFFF,  // largest negative normal
      0xFFF0'0000'0000'0000,  // negative infinity
      0xFFF0'0000'0000'0001,  // first negative SNaN
      0xFFF7'FFFF'FFFF'FFFF,  // last negative SNaN
      0xFFF8'0000'0000'0000,  // first negative QNaN
      0xFFFF'FFFF'FFFF'FFFF,  // last negative QNaN
  };

  // One FPSCR instance is shared by every call, so anything the routine writes to it while
  // handling one input is still visible when later inputs are processed.
  UReg_FPSCR fpscr;

  for (const u64 ivalue : special_values)
  {
    const double dvalue = Common::BitCast<double>(ivalue);

    const u64 expected = Common::BitCast<u64>(Common::ApproximateReciprocalSquareRoot(dvalue));
    const u64 actual = routines.wrapped_frsqrte(ivalue, fpscr);

    // "%llx" requires an unsigned long long argument. A 64-bit fixed-width integer is not
    // guaranteed to be that exact type (on LP64 targets it is commonly unsigned long), so
    // cast explicitly to keep the format string and its arguments in agreement.
    printf("%016llx -> %016llx == %016llx\n", static_cast<unsigned long long>(ivalue),
           static_cast<unsigned long long>(actual), static_cast<unsigned long long>(expected));

    EXPECT_EQ(expected, actual);
  }
}