Merge pull request #9458 from JosJuice/arm-fpu-round

JitArm64: Set flush-to-zero/rounding mode and improve float/double conversion accuracy
This commit is contained in:
JMC47 2021-04-25 10:23:19 -04:00 committed by GitHub
commit 5da85f3a25
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
25 changed files with 819 additions and 120 deletions

View File

@ -3601,6 +3601,14 @@ void ARM64FloatEmitter::FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{ {
Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn); Emit2RegMisc(IsQuad(Rd), 0, 2 | (size >> 6), 0xE, Rd, Rn);
} }
// Emits the three-register vector FACGE instruction (Rd = per-element result of comparing
// Rn against Rm). `size` is the element width in bits: `size >> 6` is 0 for 32 and 1 for 64.
// NOTE(review): per the Arm ISA this is "floating-point absolute compare greater than or
// equal" - confirm against the reference when touching the encoding.
void ARM64FloatEmitter::FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  const auto size_field = size >> 6;
  EmitThreeSame(1, size_field, 0x1D, Rd, Rn, Rm);
}
// Emits the three-register vector FACGT instruction. Uses the same opcode (0x1D) as FACGE
// but with bit 1 of the size field set; `size >> 6` (0 for 32-bit, 1 for 64-bit elements)
// supplies bit 0.
void ARM64FloatEmitter::FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
  const auto size_field = 2 | (size >> 6);
  EmitThreeSame(1, size_field, 0x1D, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond) void ARM64FloatEmitter::FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond)
{ {

View File

@ -1094,6 +1094,8 @@ public:
void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn); void FCMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FACGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FACGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
// Conditional select // Conditional select
void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond); void FCSEL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, CCFlags cond);

View File

@ -69,6 +69,7 @@ void CPUInfo::Detect()
CPU64bit = true; CPU64bit = true;
Mode64bit = true; Mode64bit = true;
vendor = CPUVendor::ARM; vendor = CPUVendor::ARM;
bFlushToZero = true;
#ifdef _WIN32 #ifdef _WIN32
num_cores = std::thread::hardware_concurrency(); num_cores = std::thread::hardware_concurrency();

View File

@ -0,0 +1,78 @@
// Copyright 2021 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include "Common/CommonTypes.h"
#include "Common/FPURoundMode.h"
#ifdef _MSC_VER
#include <intrin.h>
#endif
// Returns the current contents of the FPCR system register.
static u64 GetFPCR()
{
#ifdef _MSC_VER
  // MSVC builds go through the status-register intrinsic from <intrin.h>.
  return _ReadStatusReg(ARM64_FPCR);
#else
  // Other compilers read the register with an MRS instruction.
  u64 value;
  __asm__ __volatile__("mrs %0, fpcr" : "=r"(value));
  return value;
#endif
}
// Writes `fpcr` to the FPCR system register.
static void SetFPCR(u64 fpcr)
{
#ifdef _MSC_VER
  // MSVC builds go through the status-register intrinsic from <intrin.h>.
  _WriteStatusReg(ARM64_FPCR, fpcr);
#else
  // The operand must live in a general-purpose register: MSR to FPCR has no immediate
  // form, so an "i" alternative would produce unassemblable code whenever the compiler
  // manages to constant-fold the argument.
  __asm__ __volatile__("msr fpcr, %0" : : "r"(fpcr));
#endif
}
namespace FPURoundMode
{
static const u64 default_fpcr = GetFPCR();
static u64 saved_fpcr = default_fpcr;
void SetRoundMode(int mode)
{
// We don't need to do anything here since SetSIMDMode is always called after calling this
}
void SetPrecisionMode(PrecisionMode mode)
{
}
void SetSIMDMode(int rounding_mode, bool non_ieee_mode)
{
// Flush-To-Zero (non-IEEE mode: denormal outputs are set to +/- 0)
constexpr u32 FZ = 1 << 24;
// lookup table for FPSCR.RN-to-FPCR.RMode translation
constexpr u32 rounding_mode_table[] = {
(0 << 22), // nearest
(3 << 22), // zero
(1 << 22), // +inf
(2 << 22), // -inf
};
const u64 base = default_fpcr & ~(0b111 << 22);
SetFPCR(base | rounding_mode_table[rounding_mode] | (non_ieee_mode ? FZ : 0));
}
void SaveSIMDState()
{
saved_fpcr = GetFPCR();
}
void LoadSIMDState()
{
SetFPCR(saved_fpcr);
}
void LoadDefaultSIMDState()
{
SetFPCR(default_fpcr);
}
} // namespace FPURoundMode

View File

@ -199,7 +199,7 @@ if(_M_ARM_64)
Arm64Emitter.h Arm64Emitter.h
ArmCommon.h ArmCommon.h
ArmCPUDetect.cpp ArmCPUDetect.cpp
GenericFPURoundMode.cpp ArmFPURoundMode.cpp
) )
else() else()
if(_M_X86) #X86 if(_M_X86) #X86

View File

@ -982,6 +982,7 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
js.compilerPC = op.address; js.compilerPC = op.address;
js.op = &op; js.op = &op;
js.fpr_is_store_safe = op.fprIsStoreSafeBeforeInst;
js.instructionNumber = i; js.instructionNumber = i;
js.instructionsLeft = (code_block.m_num_instructions - 1) - i; js.instructionsLeft = (code_block.m_num_instructions - 1) - i;
const GekkoOPInfo* opinfo = op.opinfo; const GekkoOPInfo* opinfo = op.opinfo;
@ -1118,6 +1119,8 @@ bool Jit64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
CompileInstruction(op); CompileInstruction(op);
js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst;
if (jo.memcheck && (opinfo->flags & FL_LOADSTORE)) if (jo.memcheck && (opinfo->flags & FL_LOADSTORE))
{ {
// If we have a fastmem loadstore, we can omit the exception check and let fastmem handle // If we have a fastmem loadstore, we can omit the exception check and let fastmem handle

View File

@ -105,7 +105,7 @@ void Jit64::stfXXX(UGeckoInstruction inst)
if (single) if (single)
{ {
if (js.op->fprIsStoreSafe[s]) if (js.fpr_is_store_safe[s])
{ {
RCOpArg Rs = fpr.Use(s, RCMode::Read); RCOpArg Rs = fpr.Use(s, RCMode::Read);
RegCache::Realize(Rs); RegCache::Realize(Rs);

View File

@ -695,6 +695,7 @@ void JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
js.compilerPC = op.address; js.compilerPC = op.address;
js.op = &op; js.op = &op;
js.fpr_is_store_safe = op.fprIsStoreSafeBeforeInst;
js.instructionNumber = i; js.instructionNumber = i;
js.instructionsLeft = (code_block.m_num_instructions - 1) - i; js.instructionsLeft = (code_block.m_num_instructions - 1) - i;
const GekkoOPInfo* opinfo = op.opinfo; const GekkoOPInfo* opinfo = op.opinfo;
@ -830,6 +831,9 @@ void JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)
} }
CompileInstruction(op); CompileInstruction(op);
js.fpr_is_store_safe = op.fprIsStoreSafeAfterInst;
if (!CanMergeNextInstructions(1) || js.op[1].opinfo->type != ::OpType::Integer) if (!CanMergeNextInstructions(1) || js.op[1].opinfo->type != ::OpType::Integer)
FlushCarry(); FlushCarry();

View File

@ -152,7 +152,20 @@ public:
void psq_l(UGeckoInstruction inst); void psq_l(UGeckoInstruction inst);
void psq_st(UGeckoInstruction inst); void psq_st(UGeckoInstruction inst);
private: void ConvertDoubleToSingleLower(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
Arm64Gen::ARM64Reg src_reg);
void ConvertDoubleToSinglePair(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
Arm64Gen::ARM64Reg src_reg);
void ConvertSingleToDoubleLower(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
Arm64Gen::ARM64Reg src_reg,
Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
void ConvertSingleToDoublePair(size_t guest_reg, Arm64Gen::ARM64Reg dest_reg,
Arm64Gen::ARM64Reg src_reg,
Arm64Gen::ARM64Reg scratch_reg = Arm64Gen::ARM64Reg::INVALID_REG);
bool IsFPRStoreSafe(size_t guest_reg) const;
protected:
struct SlowmemHandler struct SlowmemHandler
{ {
Arm64Gen::ARM64Reg dest_reg; Arm64Gen::ARM64Reg dest_reg;
@ -184,14 +197,18 @@ private:
nearcode = GetWritableCodePtr(); nearcode = GetWritableCodePtr();
SetCodePtrUnsafe(farcode.GetWritableCodePtr()); SetCodePtrUnsafe(farcode.GetWritableCodePtr());
AlignCode16(); AlignCode16();
m_in_farcode = true;
} }
void SwitchToNearCode() void SwitchToNearCode()
{ {
farcode.SetCodePtrUnsafe(GetWritableCodePtr()); farcode.SetCodePtrUnsafe(GetWritableCodePtr());
SetCodePtrUnsafe(nearcode); SetCodePtrUnsafe(nearcode);
m_in_farcode = false;
} }
bool IsInFarCode() const { return m_in_farcode; }
// Dump a memory range of code // Dump a memory range of code
void DumpCode(const u8* start, const u8* end); void DumpCode(const u8* start, const u8* end);
@ -215,6 +232,9 @@ private:
// AsmRoutines // AsmRoutines
void GenerateAsm(); void GenerateAsm();
void GenerateCommonAsm(); void GenerateCommonAsm();
void GenerateConvertDoubleToSingle();
void GenerateConvertSingleToDouble();
void GenerateQuantizedLoadStores();
// Profiling // Profiling
void BeginTimeProfile(JitBlock* b); void BeginTimeProfile(JitBlock* b);
@ -254,6 +274,7 @@ private:
Arm64Gen::ARM64CodeBlock farcode; Arm64Gen::ARM64CodeBlock farcode;
u8* nearcode; // Backed up when we switch to far code. u8* nearcode; // Backed up when we switch to far code.
bool m_in_farcode = false;
bool m_enable_blr_optimization; bool m_enable_blr_optimization;
bool m_cleanup_after_stackfault = false; bool m_cleanup_after_stackfault = false;

View File

@ -61,23 +61,11 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT) if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT)
{ {
if (flags & BackPatchInfo::FLAG_SIZE_F32) if (flags & BackPatchInfo::FLAG_SIZE_F32)
{
m_float_emit.FCVT(32, 64, ARM64Reg::D0, RS);
m_float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0);
m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32I)
{ {
m_float_emit.REV32(8, ARM64Reg::D0, RS); m_float_emit.REV32(8, ARM64Reg::D0, RS);
m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr); m_float_emit.STR(32, ARM64Reg::D0, MEM_REG, addr);
} }
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2) else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
{
m_float_emit.FCVTN(32, ARM64Reg::D0, RS);
m_float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0);
m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I)
{ {
m_float_emit.REV32(8, ARM64Reg::D0, RS); m_float_emit.REV32(8, ARM64Reg::D0, RS);
m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr); m_float_emit.STR(64, ARM64Reg::Q0, MEM_REG, addr);
@ -184,37 +172,22 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT) if (flags & BackPatchInfo::FLAG_STORE && flags & BackPatchInfo::FLAG_MASK_FLOAT)
{ {
if (flags & BackPatchInfo::FLAG_SIZE_F32) if (flags & BackPatchInfo::FLAG_SIZE_F32)
{
m_float_emit.FCVT(32, 64, ARM64Reg::D0, RS);
m_float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::Q0, 0);
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32);
BLR(ARM64Reg::X8);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32I)
{ {
m_float_emit.UMOV(32, ARM64Reg::W0, RS, 0); m_float_emit.UMOV(32, ARM64Reg::W0, RS, 0);
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32); MOVP2R(ARM64Reg::X8, &PowerPC::Write_U32);
BLR(ARM64Reg::X8); BLR(ARM64Reg::X8);
} }
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2) else if (flags & BackPatchInfo::FLAG_SIZE_F32X2)
{
m_float_emit.FCVTN(32, ARM64Reg::D0, RS);
m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::D0, 0);
ROR(ARM64Reg::X0, ARM64Reg::X0, 32);
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
BLR(ARM64Reg::X8);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32X2I)
{ {
m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0); m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0);
ROR(ARM64Reg::X0, ARM64Reg::X0, 32);
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64); MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
ROR(ARM64Reg::X0, ARM64Reg::X0, 32);
BLR(ARM64Reg::X8); BLR(ARM64Reg::X8);
} }
else else
{ {
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0); m_float_emit.UMOV(64, ARM64Reg::X0, RS, 0);
MOVP2R(ARM64Reg::X8, &PowerPC::Write_U64);
BLR(ARM64Reg::X8); BLR(ARM64Reg::X8);
} }
} }

View File

@ -220,30 +220,28 @@ void JitArm64::fselx(UGeckoInstruction inst)
const u32 c = inst.FC; const u32 c = inst.FC;
const u32 d = inst.FD; const u32 d = inst.FD;
const bool a_single = fpr.IsSingle(a, true); const bool b_and_c_singles = fpr.IsSingle(b, true) && fpr.IsSingle(c, true);
if (a_single) const RegType b_and_c_type = b_and_c_singles ? RegType::LowerPairSingle : RegType::LowerPair;
{ const auto b_and_c_reg_encoder = b_and_c_singles ? EncodeRegToSingle : EncodeRegToDouble;
const ARM64Reg VA = fpr.R(a, RegType::LowerPairSingle);
m_float_emit.FCMPE(EncodeRegToSingle(VA));
}
else
{
const ARM64Reg VA = fpr.R(a, RegType::LowerPair);
m_float_emit.FCMPE(EncodeRegToDouble(VA));
}
const bool a_single = fpr.IsSingle(a, true) && (b_and_c_singles || (a != b && a != c));
const RegType a_type = a_single ? RegType::LowerPairSingle : RegType::LowerPair;
const auto a_reg_encoder = a_single ? EncodeRegToSingle : EncodeRegToDouble;
const ARM64Reg VA = fpr.R(a, a_type);
const ARM64Reg VB = fpr.R(b, b_and_c_type);
const ARM64Reg VC = fpr.R(c, b_and_c_type);
// If a == d, the RW call below may change the type of a to double. This is okay, because the
// actual value in the register is not altered by RW. So let's just assert before calling RW.
ASSERT_MSG(DYNA_REC, a_single == fpr.IsSingle(a, true), ASSERT_MSG(DYNA_REC, a_single == fpr.IsSingle(a, true),
"Register allocation turned singles into doubles in the middle of fselx"); "Register allocation turned singles into doubles in the middle of fselx");
const bool b_and_c_singles = fpr.IsSingle(b, true) && fpr.IsSingle(c, true); const ARM64Reg VD = fpr.RW(d, b_and_c_type);
const RegType type = b_and_c_singles ? RegType::LowerPairSingle : RegType::LowerPair;
const auto reg_encoder = b_and_c_singles ? EncodeRegToSingle : EncodeRegToDouble;
const ARM64Reg VB = fpr.R(b, type); m_float_emit.FCMPE(a_reg_encoder(VA));
const ARM64Reg VC = fpr.R(c, type); m_float_emit.FCSEL(b_and_c_reg_encoder(VD), b_and_c_reg_encoder(VC), b_and_c_reg_encoder(VB),
const ARM64Reg VD = fpr.RW(d, type); CC_GE);
m_float_emit.FCSEL(reg_encoder(VD), reg_encoder(VC), reg_encoder(VB), CC_GE);
ASSERT_MSG(DYNA_REC, b_and_c_singles == (fpr.IsSingle(b, true) && fpr.IsSingle(c, true)), ASSERT_MSG(DYNA_REC, b_and_c_singles == (fpr.IsSingle(b, true) && fpr.IsSingle(c, true)),
"Register allocation turned singles into doubles in the middle of fselx"); "Register allocation turned singles into doubles in the middle of fselx");
@ -260,7 +258,7 @@ void JitArm64::frspx(UGeckoInstruction inst)
const u32 d = inst.FD; const u32 d = inst.FD;
const bool single = fpr.IsSingle(b, true); const bool single = fpr.IsSingle(b, true);
if (single) if (single && js.fpr_is_store_safe[b])
{ {
// Source is already in single precision, so no need to do anything but to copy to PSR1. // Source is already in single precision, so no need to do anything but to copy to PSR1.
const ARM64Reg VB = fpr.R(b, RegType::LowerPairSingle); const ARM64Reg VB = fpr.R(b, RegType::LowerPairSingle);
@ -268,6 +266,9 @@ void JitArm64::frspx(UGeckoInstruction inst)
if (b != d) if (b != d)
m_float_emit.FMOV(EncodeRegToSingle(VD), EncodeRegToSingle(VB)); m_float_emit.FMOV(EncodeRegToSingle(VD), EncodeRegToSingle(VB));
ASSERT_MSG(DYNA_REC, fpr.IsSingle(b, true),
"Register allocation turned singles into doubles in the middle of frspx");
} }
else else
{ {
@ -276,9 +277,6 @@ void JitArm64::frspx(UGeckoInstruction inst)
m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB)); m_float_emit.FCVT(32, 64, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
} }
ASSERT_MSG(DYNA_REC, b == d || single == fpr.IsSingle(b, true),
"Register allocation turned singles into doubles in the middle of frspx");
} }
void JitArm64::fcmpX(UGeckoInstruction inst) void JitArm64::fcmpX(UGeckoInstruction inst)
@ -386,3 +384,196 @@ void JitArm64::fctiwzx(UGeckoInstruction inst)
ASSERT_MSG(DYNA_REC, b == d || single == fpr.IsSingle(b, true), ASSERT_MSG(DYNA_REC, b == d || single == fpr.IsSingle(b, true),
"Register allocation turned singles into doubles in the middle of fctiwzx"); "Register allocation turned singles into doubles in the middle of fctiwzx");
} }
// Since the following float conversion functions are used in non-arithmetic PPC float
// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
// into QNaNs. This means we can't just use FCVT/FCVTL/FCVTN.
// Emits code that converts the double in lane 0 of src_reg into a single in lane 0 of
// dest_reg.
//   guest_reg: guest FPR index, used to look up js.fpr_is_store_safe.
// When the guest register is store-safe a plain FCVT is emitted; otherwise the value is
// passed in X0 to the `cdts` routine and its result is taken from W1.
void JitArm64::ConvertDoubleToSingleLower(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg)
{
  const bool store_safe = js.fpr_is_store_safe[guest_reg];
  if (!store_safe)
  {
    FlushCarry();

    // Save X0-X3 and X30 around the call, but only those currently in use.
    const BitSet32 saved_gprs = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30};
    ABI_PushRegisters(saved_gprs);

    m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 0);
    BL(cdts);
    m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W1);

    ABI_PopRegisters(saved_gprs);
    return;
  }

  m_float_emit.FCVT(32, 64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
}
// Emits code that converts both doubles in src_reg into singles in lanes 0 and 1 of
// dest_reg.
//   guest_reg: guest FPR index, used to look up js.fpr_is_store_safe.
// When the guest register is store-safe a single FCVTN is emitted; otherwise each element
// is passed in X0 to the `cdts` routine and its result is taken from W1.
void JitArm64::ConvertDoubleToSinglePair(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg)
{
  const bool store_safe = js.fpr_is_store_safe[guest_reg];
  if (!store_safe)
  {
    FlushCarry();

    // Save X0-X3 and X30 around the calls, but only those currently in use.
    const BitSet32 saved_gprs = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 30};
    ABI_PushRegisters(saved_gprs);

    // Element 0 is handled first: its 32-bit result lands in bits 0-31 of dest_reg, which
    // leaves source element 1 (bits 64-127) readable even if dest_reg and src_reg alias.
    m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 0);
    BL(cdts);
    m_float_emit.INS(32, dest_reg, 0, ARM64Reg::W1);

    m_float_emit.UMOV(64, ARM64Reg::X0, src_reg, 1);
    BL(cdts);
    m_float_emit.INS(32, dest_reg, 1, ARM64Reg::W1);

    ABI_PopRegisters(saved_gprs);
    return;
  }

  m_float_emit.FCVTN(32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
}
// Emits code that converts the single in lane 0 of src_reg into a double in lane 0 of
// dest_reg.
//   guest_reg:   guest FPR index, used to look up js.fpr_is_store_safe.
//   scratch_reg: optional FPR that the emitted code may clobber; must differ from src_reg.
//                When it is INVALID_REG no inline check is emitted and the `cstd` routine
//                is always called.
// Three shapes of code can be emitted:
//   1. guest register is store-safe     -> a single FCVT.
//   2. scratch register available       -> inline test, FCVT on the fast path, call to
//                                          `cstd` on the slow path.
//   3. no scratch register              -> unconditional call to `cstd`.
void JitArm64::ConvertSingleToDoubleLower(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg,
ARM64Reg scratch_reg)
{
ASSERT(scratch_reg != src_reg);
if (js.fpr_is_store_safe[guest_reg])
{
m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
return;
}
// The slow path is placed in far code unless the caller is already emitting far code.
const bool switch_to_farcode = !IsInFarCode();
FlushCarry();
// Do we know that the input isn't NaN, and that the input isn't denormal or FPCR.FZ is not set?
// (This check unfortunately also catches zeroes)
FixupBranch fast;
if (scratch_reg != ARM64Reg::INVALID_REG)
{
// |src| > 0 (ordered) selects the fast path.
m_float_emit.FABS(EncodeRegToSingle(scratch_reg), EncodeRegToSingle(src_reg));
m_float_emit.FCMP(EncodeRegToSingle(scratch_reg));
fast = B(CCFlags::CC_GT);
if (switch_to_farcode)
{
// Jump from near code to the slow path, which is emitted in far code from here on.
FixupBranch slow = B();
SwitchToFarCode();
SetJumpTarget(slow);
}
}
// If no (or if we don't have a scratch register), call the bit-exact routine
// Save X0-X4 and X30 if they're in use
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
ABI_PushRegisters(gpr_saved);
// The routine takes the 32-bit input in W0 and its 64-bit result is read back from X0.
m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0);
BL(cstd);
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
ABI_PopRegisters(gpr_saved);
// If yes, do a fast conversion with FCVT
if (scratch_reg != ARM64Reg::INVALID_REG)
{
// End of the slow path: jump over the fast path to the common continuation.
FixupBranch continue1 = B();
if (switch_to_farcode)
SwitchToNearCode();
SetJumpTarget(fast);
m_float_emit.FCVT(64, 32, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
SetJumpTarget(continue1);
}
}
// Emits code that converts the two singles in lanes 0 and 1 of src_reg into two doubles
// in dest_reg.
//   guest_reg:   guest FPR index, used to look up js.fpr_is_store_safe.
//   scratch_reg: optional FPR that the emitted code may clobber; must differ from src_reg.
//                When it is INVALID_REG no inline check is emitted and the `cstd` routine
//                is always called for both elements.
// Three shapes of code can be emitted:
//   1. guest register is store-safe     -> a single FCVTL.
//   2. scratch register available       -> inline test, FCVTL on the fast path, two calls
//                                          to `cstd` on the slow path.
//   3. no scratch register              -> two unconditional calls to `cstd`.
void JitArm64::ConvertSingleToDoublePair(size_t guest_reg, ARM64Reg dest_reg, ARM64Reg src_reg,
ARM64Reg scratch_reg)
{
ASSERT(scratch_reg != src_reg);
if (js.fpr_is_store_safe[guest_reg])
{
m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
return;
}
// The slow path is placed in far code unless the caller is already emitting far code.
const bool switch_to_farcode = !IsInFarCode();
FlushCarry();
// Do we know that neither input is NaN, and that neither input is denormal or FPCR.FZ is not set?
// (This check unfortunately also catches zeroes)
FixupBranch fast;
if (scratch_reg != ARM64Reg::INVALID_REG)
{
// Set each 32-bit element of scratch_reg to 0x0000'0000 or 0xFFFF'FFFF depending on whether
// the absolute value of the corresponding element in src_reg compares greater than 0
m_float_emit.MOVI(8, EncodeRegToDouble(scratch_reg), 0);
m_float_emit.FACGT(32, EncodeRegToDouble(scratch_reg), EncodeRegToDouble(src_reg),
EncodeRegToDouble(scratch_reg));
// Copy byte 0 of scratch_reg into byte 7. The table shows each possible 64-bit pattern
// before and after, labelled by how it reads as a double: only the all-ones pattern
// (both elements passed the FACGT test) is still a NaN afterwards.
// 0x0000'0000'0000'0000 (zero) -> 0x0000'0000'0000'0000 (zero)
// 0x0000'0000'FFFF'FFFF (denormal) -> 0xFF00'0000'FFFF'FFFF (normal)
// 0xFFFF'FFFF'0000'0000 (NaN) -> 0x00FF'FFFF'0000'0000 (normal)
// 0xFFFF'FFFF'FFFF'FFFF (NaN) -> 0xFFFF'FFFF'FFFF'FFFF (NaN)
m_float_emit.INS(8, EncodeRegToDouble(scratch_reg), 7, EncodeRegToDouble(scratch_reg), 0);
// Is scratch_reg a NaN (0xFFFF'FFFF'FFFF'FFFF)?
// An unordered compare result (VS) therefore selects the fast path.
m_float_emit.FCMP(EncodeRegToDouble(scratch_reg));
fast = B(CCFlags::CC_VS);
if (switch_to_farcode)
{
// Jump from near code to the slow path, which is emitted in far code from here on.
FixupBranch slow = B();
SwitchToFarCode();
SetJumpTarget(slow);
}
}
// If no (or if we don't have a scratch register), call the bit-exact routine
// Save X0-X4 and X30 if they're in use
const BitSet32 gpr_saved = gpr.GetCallerSavedUsed() & BitSet32{0, 1, 2, 3, 4, 30};
ABI_PushRegisters(gpr_saved);
// Element 1 is converted first: its 64-bit result is written to bits 64-127 of dest_reg,
// which leaves the 32-bit source element 0 (bits 0-31) intact even when dest_reg and
// src_reg are the same register. Do not reorder these two groups.
m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 1);
BL(cstd);
m_float_emit.INS(64, dest_reg, 1, ARM64Reg::X0);
m_float_emit.UMOV(32, ARM64Reg::W0, src_reg, 0);
BL(cstd);
m_float_emit.INS(64, dest_reg, 0, ARM64Reg::X0);
ABI_PopRegisters(gpr_saved);
// If yes, do a fast conversion with FCVTL
if (scratch_reg != ARM64Reg::INVALID_REG)
{
// End of the slow path: jump over the fast path to the common continuation.
FixupBranch continue1 = B();
if (switch_to_farcode)
SwitchToNearCode();
SetJumpTarget(fast);
m_float_emit.FCVTL(64, EncodeRegToDouble(dest_reg), EncodeRegToDouble(src_reg));
SetJumpTarget(continue1);
}
}
// Reports the current js.fpr_is_store_safe flag for the given guest FPR index.
bool JitArm64::IsFPRStoreSafe(size_t guest_reg) const
{
  const bool store_safe = js.fpr_is_store_safe[guest_reg];
  return store_safe;
}

View File

@ -189,6 +189,7 @@ void JitArm64::stfXX(UGeckoInstruction inst)
u32 a = inst.RA, b = inst.RB; u32 a = inst.RA, b = inst.RB;
bool want_single = false;
s32 offset = inst.SIMM_16; s32 offset = inst.SIMM_16;
u32 flags = BackPatchInfo::FLAG_STORE; u32 flags = BackPatchInfo::FLAG_STORE;
bool update = false; bool update = false;
@ -200,10 +201,12 @@ void JitArm64::stfXX(UGeckoInstruction inst)
switch (inst.SUBOP10) switch (inst.SUBOP10)
{ {
case 663: // stfsx case 663: // stfsx
want_single = true;
flags |= BackPatchInfo::FLAG_SIZE_F32; flags |= BackPatchInfo::FLAG_SIZE_F32;
offset_reg = b; offset_reg = b;
break; break;
case 695: // stfsux case 695: // stfsux
want_single = true;
flags |= BackPatchInfo::FLAG_SIZE_F32; flags |= BackPatchInfo::FLAG_SIZE_F32;
update = true; update = true;
offset_reg = b; offset_reg = b;
@ -218,16 +221,19 @@ void JitArm64::stfXX(UGeckoInstruction inst)
offset_reg = b; offset_reg = b;
break; break;
case 983: // stfiwx case 983: // stfiwx
flags |= BackPatchInfo::FLAG_SIZE_F32I; // This instruction writes the lower 32 bits of a double. want_single must be false
flags |= BackPatchInfo::FLAG_SIZE_F32;
offset_reg = b; offset_reg = b;
break; break;
} }
break; break;
case 53: // stfsu case 53: // stfsu
want_single = true;
flags |= BackPatchInfo::FLAG_SIZE_F32; flags |= BackPatchInfo::FLAG_SIZE_F32;
update = true; update = true;
break; break;
case 52: // stfs case 52: // stfs
want_single = true;
flags |= BackPatchInfo::FLAG_SIZE_F32; flags |= BackPatchInfo::FLAG_SIZE_F32;
break; break;
case 55: // stfdu case 55: // stfdu
@ -242,19 +248,22 @@ void JitArm64::stfXX(UGeckoInstruction inst)
u32 imm_addr = 0; u32 imm_addr = 0;
bool is_immediate = false; bool is_immediate = false;
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0); fpr.Lock(ARM64Reg::Q0);
const bool single = (flags & BackPatchInfo::FLAG_SIZE_F32) && fpr.IsSingle(inst.FS, true); const bool have_single = fpr.IsSingle(inst.FS, true);
const ARM64Reg V0 = fpr.R(inst.FS, single ? RegType::LowerPairSingle : RegType::LowerPair); ARM64Reg V0 =
fpr.R(inst.FS, want_single && have_single ? RegType::LowerPairSingle : RegType::LowerPair);
if (single) if (want_single && !have_single)
{ {
flags &= ~BackPatchInfo::FLAG_SIZE_F32; const ARM64Reg single_reg = fpr.GetReg();
flags |= BackPatchInfo::FLAG_SIZE_F32I; ConvertDoubleToSingleLower(inst.FS, single_reg, V0);
V0 = single_reg;
} }
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
ARM64Reg addr_reg = ARM64Reg::W1; ARM64Reg addr_reg = ARM64Reg::W1;
if (update) if (update)
@ -359,19 +368,11 @@ void JitArm64::stfXX(UGeckoInstruction inst)
accessSize = 32; accessSize = 32;
LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr)); LDR(IndexType::Unsigned, ARM64Reg::X0, PPC_REG, PPCSTATE_OFF(gather_pipe_ptr));
if (flags & BackPatchInfo::FLAG_SIZE_F64) if (flags & BackPatchInfo::FLAG_SIZE_F64)
{
m_float_emit.REV64(8, ARM64Reg::Q0, V0); m_float_emit.REV64(8, ARM64Reg::Q0, V0);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32) else if (flags & BackPatchInfo::FLAG_SIZE_F32)
{
m_float_emit.FCVT(32, 64, ARM64Reg::D0, EncodeRegToDouble(V0));
m_float_emit.REV32(8, ARM64Reg::D0, ARM64Reg::D0);
}
else if (flags & BackPatchInfo::FLAG_SIZE_F32I)
{
m_float_emit.REV32(8, ARM64Reg::D0, V0); m_float_emit.REV32(8, ARM64Reg::D0, V0);
}
m_float_emit.STR(accessSize, IndexType::Post, accessSize == 64 ? ARM64Reg::Q0 : ARM64Reg::D0, m_float_emit.STR(accessSize, IndexType::Post, accessSize == 64 ? ARM64Reg::Q0 : ARM64Reg::D0,
ARM64Reg::X0, accessSize >> 3); ARM64Reg::X0, accessSize >> 3);
@ -399,6 +400,10 @@ void JitArm64::stfXX(UGeckoInstruction inst)
{ {
EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, V0, XA, regs_in_use, fprs_in_use); EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, V0, XA, regs_in_use, fprs_in_use);
} }
if (want_single && !have_single)
fpr.Unlock(V0);
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30); gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0); fpr.Unlock(ARM64Reg::Q0);
} }

View File

@ -116,13 +116,44 @@ void JitArm64::psq_st(UGeckoInstruction inst)
const bool update = inst.OPCD == 61; const bool update = inst.OPCD == 61;
const s32 offset = inst.SIMM_12; const s32 offset = inst.SIMM_12;
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1); fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1);
const bool single = fpr.IsSingle(inst.RS); const bool have_single = fpr.IsSingle(inst.RS);
ARM64Reg VS = fpr.R(inst.RS, have_single ? RegType::Single : RegType::Register);
if (js.assumeNoPairedQuantize)
{
if (!have_single)
{
const ARM64Reg single_reg = fpr.GetReg();
if (inst.W)
m_float_emit.FCVT(32, 64, EncodeRegToDouble(single_reg), EncodeRegToDouble(VS));
else
m_float_emit.FCVTN(32, EncodeRegToDouble(single_reg), EncodeRegToDouble(VS));
VS = single_reg;
}
}
else
{
if (have_single)
{
m_float_emit.ORR(ARM64Reg::D0, VS, VS);
}
else
{
if (inst.W)
m_float_emit.FCVT(32, 64, ARM64Reg::D0, VS);
else
m_float_emit.FCVTN(32, ARM64Reg::D0, VS);
}
}
gpr.Lock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
const ARM64Reg arm_addr = gpr.R(inst.RA); const ARM64Reg arm_addr = gpr.R(inst.RA);
const ARM64Reg VS = fpr.R(inst.RS, single ? RegType::Single : RegType::Register);
constexpr ARM64Reg scale_reg = ARM64Reg::W0; constexpr ARM64Reg scale_reg = ARM64Reg::W0;
constexpr ARM64Reg addr_reg = ARM64Reg::W1; constexpr ARM64Reg addr_reg = ARM64Reg::W1;
@ -157,9 +188,6 @@ void JitArm64::psq_st(UGeckoInstruction inst)
{ {
u32 flags = BackPatchInfo::FLAG_STORE; u32 flags = BackPatchInfo::FLAG_STORE;
if (single)
flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32I : BackPatchInfo::FLAG_SIZE_F32X2I);
else
flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2); flags |= (inst.W ? BackPatchInfo::FLAG_SIZE_F32 : BackPatchInfo::FLAG_SIZE_F32X2);
EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, VS, EncodeRegTo64(addr_reg), gprs_in_use, EmitBackpatchRoutine(flags, jo.fastmem, jo.fastmem, VS, EncodeRegTo64(addr_reg), gprs_in_use,
@ -167,18 +195,6 @@ void JitArm64::psq_st(UGeckoInstruction inst)
} }
else else
{ {
if (single)
{
m_float_emit.ORR(ARM64Reg::D0, VS, VS);
}
else
{
if (inst.W)
m_float_emit.FCVT(32, 64, ARM64Reg::D0, VS);
else
m_float_emit.FCVTN(32, ARM64Reg::D0, VS);
}
LDR(IndexType::Unsigned, scale_reg, PPC_REG, PPCSTATE_OFF_SPR(SPR_GQR0 + inst.I)); LDR(IndexType::Unsigned, scale_reg, PPC_REG, PPCSTATE_OFF_SPR(SPR_GQR0 + inst.I));
UBFM(type_reg, scale_reg, 0, 2); // Type UBFM(type_reg, scale_reg, 0, 2); // Type
UBFM(scale_reg, scale_reg, 8, 13); // Scale UBFM(scale_reg, scale_reg, 8, 13); // Scale
@ -212,6 +228,9 @@ void JitArm64::psq_st(UGeckoInstruction inst)
SetJumpTarget(continue1); SetJumpTarget(continue1);
} }
if (js.assumeNoPairedQuantize && !have_single)
fpr.Unlock(VS);
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30); gpr.Unlock(ARM64Reg::W0, ARM64Reg::W1, ARM64Reg::W2, ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1); fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1);
} }

View File

@ -17,9 +17,10 @@
using namespace Arm64Gen; using namespace Arm64Gen;
void Arm64RegCache::Init(ARM64XEmitter* emitter) void Arm64RegCache::Init(JitArm64* jit)
{ {
m_emit = emitter; m_jit = jit;
m_emit = jit;
m_float_emit.reset(new ARM64FloatEmitter(m_emit)); m_float_emit.reset(new ARM64FloatEmitter(m_emit));
GetAllocationOrder(); GetAllocationOrder();
} }
@ -467,7 +468,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
return host_reg; return host_reg;
// Else convert this register back to doubles. // Else convert this register back to doubles.
m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); const ARM64Reg tmp_reg = GetReg();
m_jit->ConvertSingleToDoublePair(preg, host_reg, host_reg, tmp_reg);
UnlockRegister(tmp_reg);
reg.Load(host_reg, RegType::Register); reg.Load(host_reg, RegType::Register);
[[fallthrough]]; [[fallthrough]];
} }
@ -482,7 +486,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
return host_reg; return host_reg;
// Else convert this register back to a double. // Else convert this register back to a double.
m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); const ARM64Reg tmp_reg = GetReg();
m_jit->ConvertSingleToDoubleLower(preg, host_reg, host_reg, tmp_reg);
UnlockRegister(tmp_reg);
reg.Load(host_reg, RegType::LowerPair); reg.Load(host_reg, RegType::LowerPair);
[[fallthrough]]; [[fallthrough]];
} }
@ -516,7 +523,10 @@ ARM64Reg Arm64FPRCache::R(size_t preg, RegType type)
return host_reg; return host_reg;
} }
m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); const ARM64Reg tmp_reg = GetReg();
m_jit->ConvertSingleToDoubleLower(preg, host_reg, host_reg, tmp_reg);
UnlockRegister(tmp_reg);
reg.Load(host_reg, RegType::Duplicated); reg.Load(host_reg, RegType::Duplicated);
[[fallthrough]]; [[fallthrough]];
} }
@ -584,7 +594,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
if ((type == RegType::LowerPair || type == RegType::LowerPairSingle) && was_dirty) if ((type == RegType::LowerPair || type == RegType::LowerPairSingle) && was_dirty)
{ {
// We must *not* change host_reg as this register might still be in use. So it's fine to // We must *not* change host_reg as this register might still be in use. So it's fine to
// store this register, but it's *not* fine to convert it to double. So for double convertion, // store this register, but it's *not* fine to convert it to double. So for double conversion,
// a temporary register needs to be used. // a temporary register needs to be used.
ARM64Reg host_reg = reg.GetReg(); ARM64Reg host_reg = reg.GetReg();
ARM64Reg flush_reg = host_reg; ARM64Reg flush_reg = host_reg;
@ -592,9 +602,27 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
switch (reg.GetType()) switch (reg.GetType())
{ {
case RegType::Single: case RegType::Single:
// For a store-safe register, conversion is just one instruction regardless of whether
// we're converting a pair, so ConvertSingleToDoublePair followed by a
// 128-bit store is faster than INS followed by ConvertSingleToDoubleLower and a
// 64-bit store. But for registers which are not store-safe, the latter is better.
flush_reg = GetReg(); flush_reg = GetReg();
m_float_emit->FCVTL(64, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg)); if (!m_jit->IsFPRStoreSafe(preg))
[[fallthrough]]; {
ARM64Reg scratch_reg = GetReg();
m_float_emit->INS(32, flush_reg, 0, host_reg, 1);
m_jit->ConvertSingleToDoubleLower(preg, flush_reg, flush_reg, scratch_reg);
m_float_emit->STR(64, IndexType::Unsigned, flush_reg, PPC_REG, u32(PPCSTATE_OFF_PS1(preg)));
Unlock(scratch_reg);
break;
}
else
{
m_jit->ConvertSingleToDoublePair(preg, flush_reg, host_reg, flush_reg);
m_float_emit->STR(128, IndexType::Unsigned, flush_reg, PPC_REG,
u32(PPCSTATE_OFF_PS0(preg)));
}
break;
case RegType::Register: case RegType::Register:
// We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit // We are doing a full 128bit store because it takes 2 cycles on a Cortex-A57 to do a 128bit
// store. // store.
@ -604,7 +632,7 @@ ARM64Reg Arm64FPRCache::RW(size_t preg, RegType type)
break; break;
case RegType::DuplicatedSingle: case RegType::DuplicatedSingle:
flush_reg = GetReg(); flush_reg = GetReg();
m_float_emit->FCVT(64, 32, EncodeRegToDouble(flush_reg), EncodeRegToDouble(host_reg)); m_jit->ConvertSingleToDoubleLower(preg, flush_reg, host_reg, flush_reg);
[[fallthrough]]; [[fallthrough]];
case RegType::Duplicated: case RegType::Duplicated:
// Store PSR1 (which is equal to PSR0) in memory. // Store PSR1 (which is equal to PSR0) in memory.
@ -708,17 +736,20 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
const bool dirty = reg.IsDirty(); const bool dirty = reg.IsDirty();
RegType type = reg.GetType(); RegType type = reg.GetType();
// If FlushRegister calls GetReg with all registers locked, we can get infinite recursion
const ARM64Reg tmp_reg = GetUnlockedRegisterCount() > 0 ? GetReg() : ARM64Reg::INVALID_REG;
// If we're in single mode, just convert it back to a double. // If we're in single mode, just convert it back to a double.
if (type == RegType::Single) if (type == RegType::Single)
{ {
if (dirty) if (dirty)
m_float_emit->FCVTL(64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); m_jit->ConvertSingleToDoublePair(preg, host_reg, host_reg, tmp_reg);
type = RegType::Register; type = RegType::Register;
} }
if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle) if (type == RegType::DuplicatedSingle || type == RegType::LowerPairSingle)
{ {
if (dirty) if (dirty)
m_float_emit->FCVT(64, 32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); m_jit->ConvertSingleToDoubleLower(preg, host_reg, host_reg, tmp_reg);
if (type == RegType::DuplicatedSingle) if (type == RegType::DuplicatedSingle)
type = RegType::Duplicated; type = RegType::Duplicated;
@ -770,6 +801,9 @@ void Arm64FPRCache::FlushRegister(size_t preg, bool maintain_state)
reg.Flush(); reg.Flush();
} }
} }
if (tmp_reg != ARM64Reg::INVALID_REG)
UnlockRegister(tmp_reg);
} }
void Arm64FPRCache::FlushRegisters(BitSet32 regs, bool maintain_state) void Arm64FPRCache::FlushRegisters(BitSet32 regs, bool maintain_state)
@ -806,7 +840,7 @@ void Arm64FPRCache::FixSinglePrecision(size_t preg)
m_float_emit->FCVT(32, 64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); m_float_emit->FCVT(32, 64, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
reg.Load(host_reg, RegType::DuplicatedSingle); reg.Load(host_reg, RegType::DuplicatedSingle);
break; break;
case RegType::Register: // PS0 and PS1 needs to be converted case RegType::Register: // PS0 and PS1 need to be converted
m_float_emit->FCVTN(32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg)); m_float_emit->FCVTN(32, EncodeRegToDouble(host_reg), EncodeRegToDouble(host_reg));
reg.Load(host_reg, RegType::Single); reg.Load(host_reg, RegType::Single);
break; break;

View File

@ -15,6 +15,8 @@
#include "Core/PowerPC/PPCAnalyst.h" #include "Core/PowerPC/PPCAnalyst.h"
#include "Core/PowerPC/PowerPC.h" #include "Core/PowerPC/PowerPC.h"
class JitArm64;
// Dedicated host registers // Dedicated host registers
// memory base register // memory base register
@ -150,7 +152,7 @@ public:
explicit Arm64RegCache(size_t guest_reg_count) : m_guest_registers(guest_reg_count) {} explicit Arm64RegCache(size_t guest_reg_count) : m_guest_registers(guest_reg_count) {}
virtual ~Arm64RegCache() = default; virtual ~Arm64RegCache() = default;
void Init(Arm64Gen::ARM64XEmitter* emitter); void Init(JitArm64* jit);
virtual void Start(PPCAnalyst::BlockRegStats& stats) {} virtual void Start(PPCAnalyst::BlockRegStats& stats) {}
void DiscardRegisters(BitSet32 regs); void DiscardRegisters(BitSet32 regs);
@ -166,6 +168,9 @@ public:
void UpdateLastUsed(BitSet32 regs_used); void UpdateLastUsed(BitSet32 regs_used);
// Get available host registers
u32 GetUnlockedRegisterCount() const;
// Locks a register so a cache cannot use it // Locks a register so a cache cannot use it
// Useful for function calls // Useful for function calls
template <typename T = Arm64Gen::ARM64Reg, typename... Args> template <typename T = Arm64Gen::ARM64Reg, typename... Args>
@ -209,15 +214,14 @@ protected:
void DiscardRegister(size_t preg); void DiscardRegister(size_t preg);
virtual void FlushRegister(size_t preg, bool maintain_state) = 0; virtual void FlushRegister(size_t preg, bool maintain_state) = 0;
// Get available host registers
u32 GetUnlockedRegisterCount() const;
void IncrementAllUsed() void IncrementAllUsed()
{ {
for (auto& reg : m_guest_registers) for (auto& reg : m_guest_registers)
reg.IncrementLastUsed(); reg.IncrementLastUsed();
} }
JitArm64* m_jit = nullptr;
// Code emitter // Code emitter
Arm64Gen::ARM64XEmitter* m_emit = nullptr; Arm64Gen::ARM64XEmitter* m_emit = nullptr;

View File

@ -194,6 +194,85 @@ void JitArm64::GenerateAsm()
} }
void JitArm64::GenerateCommonAsm() void JitArm64::GenerateCommonAsm()
{
GetAsmRoutines()->cdts = GetCodePtr();
GenerateConvertDoubleToSingle();
JitRegister::Register(GetAsmRoutines()->cdts, GetCodePtr(), "JIT_cdts");
GetAsmRoutines()->cstd = GetCodePtr();
GenerateConvertSingleToDouble();
JitRegister::Register(GetAsmRoutines()->cdts, GetCodePtr(), "JIT_cstd");
GenerateQuantizedLoadStores();
}
// Emits a routine that turns the bit pattern of a double into the bit pattern of a single
// using only integer instructions (no host FPU conversion instruction is emitted here).
// Two paths are emitted: a fast path that repacks the bits directly, and a path taken when
// the double's exponent field is in the range [874, 896], which produces a single-precision
// denormal by shifting the mantissa.
// NOTE(review): bitfield emitter arguments are read below as (lsb, width) - confirm against
// the Arm64Emitter declarations.
// Input in X0, output in W1, clobbers X0-X3 and flags.
void JitArm64::GenerateConvertDoubleToSingle()
{
// X2 = 11-bit exponent field of the input (bits 52-62).
UBFX(ARM64Reg::X2, ARM64Reg::X0, 52, 11);
// W3 = exponent - 874; the unsigned compare against 896 - 874 checks 874 <= exponent <= 896
// with a single comparison.
SUB(ARM64Reg::W3, ARM64Reg::W2, 874);
CMP(ARM64Reg::W3, 896 - 874);
// X1 = upper 32 bits of the input. Emitted between the CMP and the branch; the branch below
// still consumes the flags produced by the CMP.
LSR(ARM64Reg::X1, ARM64Reg::X0, 32);
FixupBranch denormal = B(CCFlags::CC_LS);
// Fast path: keep the top two bits of the upper word (sign and highest exponent bit)...
ANDI2R(ARM64Reg::X1, ARM64Reg::X1, 0xc0000000);
// ...and fill the low 30 bits of the result with input bits 29-58.
BFXIL(ARM64Reg::X1, ARM64Reg::X0, 29, 30);
RET();
SetJumpTarget(denormal);
// Denormal path: W3 = input bits 21-52, i.e. the upper mantissa bits with the lowest
// exponent bit in bit 31.
LSR(ARM64Reg::X3, ARM64Reg::X0, 21);
// X0 (the input) is overwritten from here on; it now holds the constant 905.
MOVZ(ARM64Reg::X0, 905);
// Force bit 31 to one so that it acts as the explicit leading one of the mantissa.
ORRI2R(ARM64Reg::W3, ARM64Reg::W3, 0x80000000);
// W2 = 905 - exponent, used as the right-shift amount for the mantissa.
SUB(ARM64Reg::W2, ARM64Reg::W0, ARM64Reg::W2);
LSRV(ARM64Reg::W2, ARM64Reg::W3, ARM64Reg::W2);
// X3 = sign bit taken from the upper word of the input (still held in X1).
ANDI2R(ARM64Reg::X3, ARM64Reg::X1, 0x80000000);
// Result = sign | shifted mantissa.
ORR(ARM64Reg::X1, ARM64Reg::X3, ARM64Reg::X2);
RET();
}
// Emits a routine that turns the bit pattern of a single into the bit pattern of a double
// using only integer instructions (no host FPU conversion instruction is emitted here).
// Three paths are emitted, selected by the input's exponent and mantissa fields:
// zero, denormal (exponent field zero, mantissa non-zero) and everything else
// (normal numbers, infinities and NaNs).
// NOTE(review): bitfield emitter arguments are read below as (lsb, width) - confirm against
// the Arm64Emitter declarations.
// Input in W0, output in X0, clobbers X0-X4 and flags.
void JitArm64::GenerateConvertSingleToDouble()
{
// W1 = 8-bit exponent field of the input (bits 23-30).
UBFX(ARM64Reg::W1, ARM64Reg::W0, 23, 8);
FixupBranch normal_or_nan = CBNZ(ARM64Reg::W1);
// Exponent field is zero: W1 = 23-bit mantissa field.
ANDI2R(ARM64Reg::W1, ARM64Reg::W0, 0x007fffff);
FixupBranch denormal = CBNZ(ARM64Reg::W1);
// Zero
// Exponent and mantissa are both zero here, so only the sign bit can be set; shifting left
// by 32 moves it from bit 31 to bit 63.
LSL(ARM64Reg::X0, ARM64Reg::X0, 32);
RET();
SetJumpTarget(denormal);
// W2 = sign bit of the input.
ANDI2R(ARM64Reg::W2, ARM64Reg::W0, 0x80000000);
// X3 = number of leading zero bits of the mantissa, counted in the 64-bit register.
CLZ(ARM64Reg::X3, ARM64Reg::X1);
// Move the sign bit to bit 63.
LSL(ARM64Reg::X2, ARM64Reg::X2, 32);
// X4 = X3 with every bit above the low six set.
ORRI2R(ARM64Reg::X4, ARM64Reg::X3, 0xffffffffffffffc0);
// Subtract the leading zero count at bit position 52, where the double's exponent field
// starts; the constant added at the end supplies the base exponent.
SUB(ARM64Reg::X2, ARM64Reg::X2, ARM64Reg::X3, ArithOption(ARM64Reg::X3, ShiftType::LSL, 52));
// X3 = left-shift amount used to normalize the mantissa.
ADD(ARM64Reg::X3, ARM64Reg::X4, 23);
LSLV(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::X3);
// Insert the low 22 bits of the normalized mantissa at bit 30 of the result.
BFI(ARM64Reg::X2, ARM64Reg::X1, 30, 22);
// Add the exponent base constant to form the final double.
MOVI2R(ARM64Reg::X1, 0x3a90000000000000);
ADD(ARM64Reg::X0, ARM64Reg::X2, ARM64Reg::X1);
RET();
SetJumpTarget(normal_or_nan);
// Flags: is the exponent field all ones (infinity or NaN)?
CMP(ARM64Reg::W1, 0xff);
// W2 = highest exponent bit (bit 30) of the input.
ANDI2R(ARM64Reg::W2, ARM64Reg::W0, 0x40000000);
// W4 = 1 if the exponent field is not all ones, else 0.
CSET(ARM64Reg::W4, CCFlags::CC_NEQ);
// W3 = sign bit and highest exponent bit.
ANDI2R(ARM64Reg::W3, ARM64Reg::W0, 0xc0000000);
// W2 = W4 XOR (bit 30 as 0/1): non-zero when the three extra exponent bits of the double
// must be filled with ones.
EOR(ARM64Reg::W2, ARM64Reg::W4, ARM64Reg::W2, ArithOption(ARM64Reg::W2, ShiftType::LSR, 30));
// X1 = the fill pattern (bits 59-61 set).
MOVI2R(ARM64Reg::X1, 0x3800000000000000);
// W4 = low 30 bits of the input (remaining exponent bits and the mantissa).
ANDI2R(ARM64Reg::W4, ARM64Reg::W0, 0x3fffffff);
// Move the sign bit and highest exponent bit to bits 63 and 62.
LSL(ARM64Reg::X3, ARM64Reg::X3, 32);
// Keep the fill pattern only if W2 is non-zero, otherwise use zero.
CMP(ARM64Reg::W2, 0);
CSEL(ARM64Reg::X1, ARM64Reg::X1, ARM64Reg::ZR, CCFlags::CC_NEQ);
// Insert the low 30 input bits at bit 29 of the result.
BFI(ARM64Reg::X3, ARM64Reg::X4, 29, 30);
// Result = sign/exponent/mantissa bits | exponent fill.
ORR(ARM64Reg::X0, ARM64Reg::X3, ARM64Reg::X1);
RET();
}
void JitArm64::GenerateQuantizedLoadStores()
{ {
// X0 is the scale // X0 is the scale
// X1 is address // X1 is address
@ -654,6 +733,4 @@ void JitArm64::GenerateCommonAsm()
paired_store_quantized[29] = storeSingleU16Slow; paired_store_quantized[29] = storeSingleU16Slow;
paired_store_quantized[30] = storeSingleS8Slow; paired_store_quantized[30] = storeSingleS8Slow;
paired_store_quantized[31] = storeSingleS16Slow; paired_store_quantized[31] = storeSingleS16Slow;
GetAsmRoutines()->mfcr = nullptr;
} }

View File

@ -16,14 +16,11 @@ struct BackPatchInfo
FLAG_SIZE_32 = (1 << 4), FLAG_SIZE_32 = (1 << 4),
FLAG_SIZE_F32 = (1 << 5), FLAG_SIZE_F32 = (1 << 5),
FLAG_SIZE_F32X2 = (1 << 6), FLAG_SIZE_F32X2 = (1 << 6),
FLAG_SIZE_F32X2I = (1 << 7), FLAG_SIZE_F64 = (1 << 7),
FLAG_SIZE_F64 = (1 << 8), FLAG_REVERSE = (1 << 8),
FLAG_REVERSE = (1 << 9), FLAG_EXTEND = (1 << 9),
FLAG_EXTEND = (1 << 10), FLAG_ZERO_256 = (1 << 10),
FLAG_SIZE_F32I = (1 << 11), FLAG_MASK_FLOAT = FLAG_SIZE_F32 | FLAG_SIZE_F32X2 | FLAG_SIZE_F64,
FLAG_ZERO_256 = (1 << 12),
FLAG_MASK_FLOAT =
FLAG_SIZE_F32 | FLAG_SIZE_F32X2 | FLAG_SIZE_F32X2I | FLAG_SIZE_F64 | FLAG_SIZE_F32I,
}; };
static u32 GetFlagSize(u32 flags) static u32 GetFlagSize(u32 flags)
@ -34,8 +31,10 @@ struct BackPatchInfo
return 16; return 16;
if (flags & FLAG_SIZE_32) if (flags & FLAG_SIZE_32)
return 32; return 32;
if (flags & FLAG_SIZE_F32 || flags & FLAG_SIZE_F32I) if (flags & FLAG_SIZE_F32)
return 32; return 32;
if (flags & FLAG_SIZE_F32X2)
return 64;
if (flags & FLAG_SIZE_F64) if (flags & FLAG_SIZE_F64)
return 64; return 64;
if (flags & FLAG_ZERO_256) if (flags & FLAG_ZERO_256)

View File

@ -26,6 +26,7 @@ struct CommonAsmRoutinesBase
const u8* fres; const u8* fres;
const u8* mfcr; const u8* mfcr;
const u8* cdts; const u8* cdts;
const u8* cstd;
// In: array index: GQR to use. // In: array index: GQR to use.
// In: ECX: Address to read from. // In: ECX: Address to read from.

View File

@ -8,6 +8,7 @@
#include <map> #include <map>
#include <unordered_set> #include <unordered_set>
#include "Common/BitSet.h"
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
#include "Common/x64Emitter.h" #include "Common/x64Emitter.h"
#include "Core/ConfigManager.h" #include "Core/ConfigManager.h"
@ -98,6 +99,7 @@ protected:
PPCAnalyst::BlockRegStats gpa; PPCAnalyst::BlockRegStats gpa;
PPCAnalyst::BlockRegStats fpa; PPCAnalyst::BlockRegStats fpa;
PPCAnalyst::CodeOp* op; PPCAnalyst::CodeOp* op;
BitSet32 fpr_is_store_safe;
JitBlock* curBlock; JitBlock* curBlock;

View File

@ -976,7 +976,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
op.fprIsSingle = fprIsSingle; op.fprIsSingle = fprIsSingle;
op.fprIsDuplicated = fprIsDuplicated; op.fprIsDuplicated = fprIsDuplicated;
op.fprIsStoreSafe = fprIsStoreSafe; op.fprIsStoreSafeBeforeInst = fprIsStoreSafe;
if (op.fregOut >= 0) if (op.fregOut >= 0)
{ {
if (op.opinfo->type == OpType::SingleFP) if (op.opinfo->type == OpType::SingleFP)
@ -1036,6 +1036,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, std:
(op.opinfo->type == OpType::SingleFP || op.opinfo->type == OpType::PS); (op.opinfo->type == OpType::SingleFP || op.opinfo->type == OpType::PS);
} }
} }
op.fprIsStoreSafeAfterInst = fprIsStoreSafe;
if (op.opinfo->type == OpType::StorePS || op.opinfo->type == OpType::LoadPS) if (op.opinfo->type == OpType::StorePS || op.opinfo->type == OpType::LoadPS)
{ {

View File

@ -66,7 +66,8 @@ struct CodeOp // 16B
// convert between single and double formats by just using the host machine's instruction for it. // convert between single and double formats by just using the host machine's instruction for it.
// (The reason why we can't always do this is because some games rely on the exact bits of // (The reason why we can't always do this is because some games rely on the exact bits of
// denormals and SNaNs being preserved as long as no arithmetic operation is performed on them.) // denormals and SNaNs being preserved as long as no arithmetic operation is performed on them.)
BitSet32 fprIsStoreSafe; BitSet32 fprIsStoreSafeBeforeInst;
BitSet32 fprIsStoreSafeAfterInst;
BitSet32 GetFregsOut() const BitSet32 GetFregsOut() const
{ {

View File

@ -13,7 +13,7 @@
<ItemGroup> <ItemGroup>
<ClCompile Include="Common\Arm64Emitter.cpp" /> <ClCompile Include="Common\Arm64Emitter.cpp" />
<ClCompile Include="Common\ArmCPUDetect.cpp" /> <ClCompile Include="Common\ArmCPUDetect.cpp" />
<ClCompile Include="Common\GenericFPURoundMode.cpp" /> <ClCompile Include="Common\ArmFPURoundMode.cpp" />
<ClCompile Include="Core\PowerPC\JitArm64\Jit_Util.cpp" /> <ClCompile Include="Core\PowerPC\JitArm64\Jit_Util.cpp" />
<ClCompile Include="Core\PowerPC\JitArm64\Jit.cpp" /> <ClCompile Include="Core\PowerPC\JitArm64\Jit.cpp" />
<ClCompile Include="Core\PowerPC\JitArm64\JitArm64_BackPatch.cpp" /> <ClCompile Include="Core\PowerPC\JitArm64\JitArm64_BackPatch.cpp" />

View File

@ -21,6 +21,7 @@ if(_M_X86)
) )
elseif(_M_ARM_64) elseif(_M_ARM_64)
add_dolphin_test(PowerPCTest add_dolphin_test(PowerPCTest
PowerPC/JitArm64/ConvertSingleDouble.cpp
PowerPC/JitArm64/MovI2R.cpp PowerPC/JitArm64/MovI2R.cpp
) )
endif() endif()

View File

@ -0,0 +1,273 @@
// Copyright 2021 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#include <functional>
#include <vector>
#include "Common/Arm64Emitter.h"
#include "Common/BitUtils.h"
#include "Common/CommonTypes.h"
#include "Common/FPURoundMode.h"
#include "Core/PowerPC/Interpreter/Interpreter_FPUtils.h"
#include "Core/PowerPC/JitArm64/Jit.h"
#include <fmt/format.h>
#include <gtest/gtest.h>
namespace
{
using namespace Arm64Gen;
// The ABI situation for returning an std::tuple seems annoying. Let's use this struct instead
// Plain aggregate of two values of the same type. It is used as the by-value return type of
// the JIT-generated paired conversion functions, so its members must stay in this order.
template <typename T>
struct Pair
{
// Result for the first input value.
T value1;
// Result for the second input value.
T value2;
};
// Test fixture that JIT-compiles four small callable thunks around JitArm64's
// single<->double conversion emitters (lower-only and paired variants) and exposes them as
// ordinary C++ functions operating on raw bit patterns.
class TestConversion : private JitArm64
{
public:
// Allocates code space, emits the conversion routines and the four thunks, then switches
// the host FPU into a deliberately awkward rounding/flush-to-zero mode for the lifetime of
// the object.
TestConversion()
{
AllocCodeSpace(4096);
AddChildCodeSpace(&farcode, 2048);
gpr.Init(this);
fpr.Init(this);
// No FPR is marked as store-safe.
// NOTE(review): presumably this forces the emitters onto their bit-exact conversion paths
// rather than the plain host conversion instructions - confirm in the Convert* emitters.
js.fpr_is_store_safe = BitSet32(0);
// Emit the shared conversion routines and record their entry points.
GetAsmRoutines()->cdts = GetCodePtr();
GenerateConvertDoubleToSingle();
GetAsmRoutines()->cstd = GetCodePtr();
GenerateConvertSingleToDouble();
// Lock the registers the thunks use explicitly so the register caches cannot hand them out.
gpr.Lock(ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1);
// Thunk: u32 single bits in W0 -> u64 double bits in X0 (lower element only).
convert_single_to_double_lower = Common::BitCast<u64 (*)(u32)>(GetCodePtr());
m_float_emit.INS(32, ARM64Reg::S0, 0, ARM64Reg::W0);
ConvertSingleToDoubleLower(0, ARM64Reg::D0, ARM64Reg::S0, ARM64Reg::Q1);
m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::D0, 0);
RET();
// Thunk: two u32 single bit patterns in W0/W1 -> two u64 double bit patterns in X0/X1.
convert_single_to_double_pair = Common::BitCast<Pair<u64> (*)(u32, u32)>(GetCodePtr());
m_float_emit.INS(32, ARM64Reg::D0, 0, ARM64Reg::W0);
m_float_emit.INS(32, ARM64Reg::D0, 1, ARM64Reg::W1);
ConvertSingleToDoublePair(0, ARM64Reg::Q0, ARM64Reg::D0, ARM64Reg::Q1);
m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::Q0, 0);
m_float_emit.UMOV(64, ARM64Reg::X1, ARM64Reg::Q0, 1);
RET();
// Thunk: u64 double bits in X0 -> u32 single bits in W0 (lower element only).
convert_double_to_single_lower = Common::BitCast<u32 (*)(u64)>(GetCodePtr());
m_float_emit.INS(64, ARM64Reg::D0, 0, ARM64Reg::X0);
ConvertDoubleToSingleLower(0, ARM64Reg::S0, ARM64Reg::D0);
m_float_emit.UMOV(32, ARM64Reg::W0, ARM64Reg::S0, 0);
RET();
// Thunk: two u64 double bit patterns in X0/X1 -> two u32 single bit patterns, moved out
// together as one 64-bit value in X0.
convert_double_to_single_pair = Common::BitCast<Pair<u32> (*)(u64, u64)>(GetCodePtr());
m_float_emit.INS(64, ARM64Reg::Q0, 0, ARM64Reg::X0);
m_float_emit.INS(64, ARM64Reg::Q0, 1, ARM64Reg::X1);
ConvertDoubleToSinglePair(0, ARM64Reg::D0, ARM64Reg::Q0);
m_float_emit.UMOV(64, ARM64Reg::X0, ARM64Reg::D0, 0);
RET();
gpr.Unlock(ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0, ARM64Reg::Q1);
// The freshly written code must be visible to instruction fetch before it is called.
FlushIcache();
// Set the rounding mode to something that's as annoying as possible to handle
// (flush-to-zero enabled, and rounding not symmetric about the origin)
FPURoundMode::SetSIMDMode(FPURoundMode::RoundMode::ROUND_UP, true);
}
// Restores the default host FPU state and releases the JIT code space.
~TestConversion() override
{
FPURoundMode::LoadDefaultSIMDState();
FreeCodeSpace();
}
// Converts one single-precision bit pattern to a double-precision bit pattern.
u64 ConvertSingleToDouble(u32 value) { return convert_single_to_double_lower(value); }
// Converts two single-precision bit patterns at once using the paired emitter.
Pair<u64> ConvertSingleToDouble(u32 value1, u32 value2)
{
return convert_single_to_double_pair(value1, value2);
}
// Converts one double-precision bit pattern to a single-precision bit pattern.
u32 ConvertDoubleToSingle(u64 value) { return convert_double_to_single_lower(value); }
// Converts two double-precision bit patterns at once using the paired emitter.
Pair<u32> ConvertDoubleToSingle(u64 value1, u64 value2)
{
return convert_double_to_single_pair(value1, value2);
}
private:
// Entry points of the JIT-generated thunks emitted by the constructor.
std::function<u64(u32)> convert_single_to_double_lower;
std::function<Pair<u64>(u32, u32)> convert_single_to_double_pair;
std::function<u32(u64)> convert_double_to_single_lower;
std::function<Pair<u32>(u64, u64)> convert_double_to_single_pair;
};
} // namespace
// Checks that the JIT-generated double-to-single conversion (both the lower-only and the
// paired variant) produces exactly the same bits as the reference ConvertToSingle for a
// table of special, boundary and typical inputs.
TEST(JitArm64, ConvertDoubleToSingle)
{
  TestConversion test;

  const std::vector<u64> double_bits{
      // Special values
      0x0000'0000'0000'0000,  // positive zero
      0x0000'0000'0000'0001,  // smallest positive denormal
      0x0000'0000'0100'0000,
      0x000F'FFFF'FFFF'FFFF,  // largest positive denormal
      0x0010'0000'0000'0000,  // smallest positive normal
      0x0010'0000'0000'0002,
      0x3FF0'0000'0000'0000,  // 1.0
      0x7FEF'FFFF'FFFF'FFFF,  // largest positive normal
      0x7FF0'0000'0000'0000,  // positive infinity
      0x7FF0'0000'0000'0001,  // first positive SNaN
      0x7FF7'FFFF'FFFF'FFFF,  // last positive SNaN
      0x7FF8'0000'0000'0000,  // first positive QNaN
      0x7FFF'FFFF'FFFF'FFFF,  // last positive QNaN
      0x8000'0000'0000'0000,  // negative zero
      0x8000'0000'0000'0001,  // smallest negative denormal
      0x8000'0000'0100'0000,
      0x800F'FFFF'FFFF'FFFF,  // largest negative denormal
      0x8010'0000'0000'0000,  // smallest negative normal
      0x8010'0000'0000'0002,
      0xBFF0'0000'0000'0000,  // -1.0
      0xFFEF'FFFF'FFFF'FFFF,  // largest negative normal
      0xFFF0'0000'0000'0000,  // negative infinity
      0xFFF0'0000'0000'0001,  // first negative SNaN
      0xFFF7'FFFF'FFFF'FFFF,  // last negative SNaN
      0xFFF8'0000'0000'0000,  // first negative QNaN
      0xFFFF'FFFF'FFFF'FFFF,  // last negative QNaN
      // (exp > 896) Boundary Case
      0x3800'0000'0000'0000,  // 2^(-127) = Denormal in single-prec
      0x3810'0000'0000'0000,  // 2^(-126) = Smallest single-prec normal
      0xB800'0000'0000'0000,  // -2^(-127) = Denormal in single-prec
      0xB810'0000'0000'0000,  // -2^(-126) = Smallest single-prec normal
      0x3800'1234'5678'9ABC, 0x3810'1234'5678'9ABC, 0xB800'1234'5678'9ABC, 0xB810'1234'5678'9ABC,
      // (exp >= 874) Boundary Case
      0x3680'0000'0000'0000,  // 2^(-150) = Unrepresentable in single-prec
      0x36A0'0000'0000'0000,  // 2^(-149) = Smallest single-prec denormal
      0x36B0'0000'0000'0000,  // 2^(-148) = Single-prec denormal
      0xB680'0000'0000'0000,  // -2^(-150) = Unrepresentable in single-prec
      0xB6A0'0000'0000'0000,  // -2^(-149) = Smallest single-prec denormal
      0xB6B0'0000'0000'0000,  // -2^(-148) = Single-prec denormal
      0x3680'1234'5678'9ABC, 0x36A0'1234'5678'9ABC, 0x36B0'1234'5678'9ABC, 0xB680'1234'5678'9ABC,
      0xB6A0'1234'5678'9ABC, 0xB6B0'1234'5678'9ABC,
      // Some typical numbers
      0x3FF8'0000'0000'0000,  // 1.5
      0x408F'4000'0000'0000,  // 1000
      0xC008'0000'0000'0000,  // -3
  };

  // Lower-only conversion: one value at a time.
  for (const u64 bits : double_bits)
  {
    const u32 expected = ConvertToSingle(bits);
    const u32 actual = test.ConvertDoubleToSingle(bits);

    // Print the offending input, since EXPECT_EQ alone only shows the two results.
    if (actual != expected)
      fmt::print("{:016x} -> {:08x} == {:08x}\n", bits, actual, expected);

    EXPECT_EQ(expected, actual);
  }

  // Paired conversion: every ordered combination of two table entries.
  for (const u64 first : double_bits)
  {
    // The reference result for the first element does not depend on the second.
    const u32 expected1 = ConvertToSingle(first);

    for (const u64 second : double_bits)
    {
      const u32 expected2 = ConvertToSingle(second);

      const Pair<u32> converted = test.ConvertDoubleToSingle(first, second);
      const u32 actual1 = converted.value1;
      const u32 actual2 = converted.value2;

      const bool matches = expected1 == actual1 && expected2 == actual2;
      if (!matches)
      {
        fmt::print("{:016x} -> {:08x} == {:08x},\n", first, actual1, expected1);
        fmt::print("{:016x} -> {:08x} == {:08x}\n", second, actual2, expected2);
      }

      EXPECT_EQ(expected1, actual1);
      EXPECT_EQ(expected2, actual2);
    }
  }
}
// Checks that the JIT-generated single-to-double conversion (both the lower-only and the
// paired variant) produces exactly the same bits as the reference ConvertToDouble for a
// table of special and typical inputs.
TEST(JitArm64, ConvertSingleToDouble)
{
  TestConversion test;

  const std::vector<u32> input_values{
      // Special values
      0x0000'0000,  // positive zero
      0x0000'0001,  // smallest positive denormal
      0x0000'1000,
      0x007F'FFFF,  // largest positive denormal
      0x0080'0000,  // smallest positive normal
      0x0080'0002,
      0x3F80'0000,  // 1.0
      0x7F7F'FFFF,  // largest positive normal
      0x7F80'0000,  // positive infinity
      0x7F80'0001,  // first positive SNaN
      0x7FBF'FFFF,  // last positive SNaN
      0x7FC0'0000,  // first positive QNaN
      0x7FFF'FFFF,  // last positive QNaN
      0x8000'0000,  // negative zero
      0x8000'0001,  // smallest negative denormal
      0x8000'1000,
      0x807F'FFFF,  // largest negative denormal
      0x8080'0000,  // smallest negative normal
      0x8080'0002,
      // Single-precision -1.0 is 0xBF80'0000 (sign bit plus the encoding of 1.0 above);
      // 0xBFF0'0000 is the upper word of the *double* -1.0 and decodes to -1.875 as a single.
      0xBF80'0000,  // -1.0
      0xFF7F'FFFF,  // largest negative normal
      0xFF80'0000,  // negative infinity
      0xFF80'0001,  // first negative SNaN
      0xFFBF'FFFF,  // last negative SNaN
      0xFFC0'0000,  // first negative QNaN
      0xFFFF'FFFF,  // last negative QNaN
      // Some typical numbers
      0x3FC0'0000,  // 1.5
      0x447A'0000,  // 1000
      0xC040'0000,  // -3
      0xBFF0'0000,  // -1.875 (kept so the previously tested bit pattern stays covered)
  };

  // Lower-only conversion: one value at a time.
  for (const u32 input : input_values)
  {
    const u64 expected = ConvertToDouble(input);
    const u64 actual = test.ConvertSingleToDouble(input);

    // Print the offending input, since EXPECT_EQ alone only shows the two results.
    if (expected != actual)
      fmt::print("{:08x} -> {:016x} == {:016x}\n", input, actual, expected);

    EXPECT_EQ(expected, actual);
  }

  // Paired conversion: every ordered combination of two table entries.
  for (const u32 input1 : input_values)
  {
    for (const u32 input2 : input_values)
    {
      const u64 expected1 = ConvertToDouble(input1);
      const u64 expected2 = ConvertToDouble(input2);
      const auto [actual1, actual2] = test.ConvertSingleToDouble(input1, input2);

      if (expected1 != actual1 || expected2 != actual2)
      {
        fmt::print("{:08x} -> {:016x} == {:016x},\n", input1, actual1, expected1);
        fmt::print("{:08x} -> {:016x} == {:016x}\n", input2, actual2, expected2);
      }

      EXPECT_EQ(expected1, actual1);
      EXPECT_EQ(expected2, actual2);
    }
  }
}

View File

@ -81,6 +81,7 @@
<ClCompile Include="Core\PowerPC\Jit64Common\Frsqrte.cpp" /> <ClCompile Include="Core\PowerPC\Jit64Common\Frsqrte.cpp" />
</ItemGroup> </ItemGroup>
<ItemGroup Condition="'$(Platform)'=='ARM64'"> <ItemGroup Condition="'$(Platform)'=='ARM64'">
<ClCompile Include="Core\PowerPC\JitArm64\ConvertSingleDouble.cpp" />
<ClCompile Include="Core\PowerPC\JitArm64\MovI2R.cpp" /> <ClCompile Include="Core\PowerPC\JitArm64\MovI2R.cpp" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>