JitArm64: Implement memcheck for psq_lXX/psq_stXX with update

This commit is contained in:
JosJuice 2021-10-12 20:09:59 +02:00
parent 61c73061e9
commit 5490797867
2 changed files with 42 additions and 26 deletions

View File

@ -36,8 +36,6 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
const int i = indexed ? inst.Ix : inst.I;
const int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(jo.memcheck && update);
gpr.Lock(ARM64Reg::W0, ARM64Reg::W30);
fpr.Lock(ARM64Reg::Q0);
if (!js.assumeNoPairedQuantize)
@ -68,7 +66,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
MOVI2R(addr_reg, (u32)offset);
}
if (update)
const bool early_update = !jo.memcheck;
if (update && early_update)
{
gpr.BindToRegister(inst.RA, false);
MOV(gpr.R(inst.RA), addr_reg);
@ -80,7 +79,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
// Wipe the registers we are using as temporaries
gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
if (!update || early_update)
gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
fprs_in_use[DecodeReg(ARM64Reg::Q0)] = false;
if (!jo.memcheck)
fprs_in_use[DecodeReg(VS)] = 0;
@ -116,6 +116,12 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
const ARM64Reg VS_again = fpr.RW(inst.RS, RegType::Single, true);
ASSERT(VS == VS_again);
if (update && !early_update)
{
gpr.BindToRegister(inst.RA, false);
MOV(gpr.R(inst.RA), addr_reg);
}
gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30);
fpr.Unlock(ARM64Reg::Q0);
if (!js.assumeNoPairedQuantize)
@ -144,8 +150,6 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
const int i = indexed ? inst.Ix : inst.I;
const int w = indexed ? inst.Wx : inst.W;
FALLBACK_IF(jo.memcheck && update);
if (!js.assumeNoPairedQuantize)
fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1);
@ -207,7 +211,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
MOVI2R(addr_reg, (u32)offset);
}
if (update)
const bool early_update = !jo.memcheck;
if (update && early_update)
{
gpr.BindToRegister(inst.RA, false);
MOV(gpr.R(inst.RA), addr_reg);
@ -220,7 +225,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
// Wipe the registers we are using as temporaries
gprs_in_use[DecodeReg(ARM64Reg::W0)] = false;
gprs_in_use[DecodeReg(ARM64Reg::W1)] = false;
if (!update || early_update)
gprs_in_use[DecodeReg(ARM64Reg::W1)] = false;
u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
if (!w)
@ -242,6 +248,12 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
WriteConditionalExceptionExit(EXCEPTION_DSI, ARM64Reg::X30, ARM64Reg::Q1);
}
if (update && !early_update)
{
gpr.BindToRegister(inst.RA, false);
MOV(gpr.R(inst.RA), addr_reg);
}
if (js.assumeNoPairedQuantize && !have_single)
fpr.Unlock(VS);

View File

@ -495,7 +495,9 @@ void JitArm64::GenerateQuantizedLoads()
// Q1 is a temporary
ARM64Reg addr_reg = ARM64Reg::X0;
ARM64Reg scale_reg = ARM64Reg::X1;
BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 2, 3};
BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{2, 3};
if (!jo.memcheck)
gprs_to_push &= ~BitSet32{0};
BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1};
ARM64FloatEmitter float_emit(this);
@ -524,8 +526,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@ -542,8 +544,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@ -559,8 +561,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@ -576,8 +578,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@ -605,8 +607,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@ -623,8 +625,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@ -640,8 +642,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@ -657,8 +659,8 @@ void JitArm64::GenerateQuantizedLoads()
float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
MOVP2R(addr_reg, &m_dequantizeTableS);
ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
MOVP2R(ARM64Reg::X2, &m_dequantizeTableS);
ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
RET(ARM64Reg::X30);
@ -701,7 +703,9 @@ void JitArm64::GenerateQuantizedStores()
// Q1 is a temporary
ARM64Reg scale_reg = ARM64Reg::X0;
ARM64Reg addr_reg = ARM64Reg::X1;
BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 1, 2};
BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 2};
if (!jo.memcheck)
gprs_to_push &= ~BitSet32{1};
BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1};
ARM64FloatEmitter float_emit(this);