diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp index 7f22e5e8fc..5a4daa58cc 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp @@ -36,8 +36,6 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) const int i = indexed ? inst.Ix : inst.I; const int w = indexed ? inst.Wx : inst.W; - FALLBACK_IF(jo.memcheck && update); - gpr.Lock(ARM64Reg::W0, ARM64Reg::W30); fpr.Lock(ARM64Reg::Q0); if (!js.assumeNoPairedQuantize) @@ -68,7 +66,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) MOVI2R(addr_reg, (u32)offset); } - if (update) + const bool early_update = !jo.memcheck; + if (update && early_update) { gpr.BindToRegister(inst.RA, false); MOV(gpr.R(inst.RA), addr_reg); @@ -80,7 +79,8 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) BitSet32 fprs_in_use = fpr.GetCallerSavedUsed(); // Wipe the registers we are using as temporaries - gprs_in_use[DecodeReg(ARM64Reg::W0)] = false; + if (!update || early_update) + gprs_in_use[DecodeReg(ARM64Reg::W0)] = false; fprs_in_use[DecodeReg(ARM64Reg::Q0)] = false; if (!jo.memcheck) fprs_in_use[DecodeReg(VS)] = 0; @@ -116,6 +116,12 @@ void JitArm64::psq_lXX(UGeckoInstruction inst) const ARM64Reg VS_again = fpr.RW(inst.RS, RegType::Single, true); ASSERT(VS == VS_again); + if (update && !early_update) + { + gpr.BindToRegister(inst.RA, false); + MOV(gpr.R(inst.RA), addr_reg); + } + gpr.Unlock(ARM64Reg::W0, ARM64Reg::W30); fpr.Unlock(ARM64Reg::Q0); if (!js.assumeNoPairedQuantize) @@ -144,8 +150,6 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) const int i = indexed ? inst.Ix : inst.I; const int w = indexed ? inst.Wx : inst.W; - FALLBACK_IF(jo.memcheck && update); - if (!js.assumeNoPairedQuantize) fpr.Lock(ARM64Reg::Q0, ARM64Reg::Q1); @@ -207,7 +211,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) MOVI2R(addr_reg, (u32)offset); } - if (update) + const bool early_update = !jo.memcheck; + if (update && early_update) { gpr.BindToRegister(inst.RA, false); MOV(gpr.R(inst.RA), addr_reg); @@ -220,7 +225,8 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) // Wipe the registers we are using as temporaries gprs_in_use[DecodeReg(ARM64Reg::W0)] = false; - gprs_in_use[DecodeReg(ARM64Reg::W1)] = false; + if (!update || early_update) + gprs_in_use[DecodeReg(ARM64Reg::W1)] = false; u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32; if (!w) @@ -242,6 +248,12 @@ void JitArm64::psq_stXX(UGeckoInstruction inst) WriteConditionalExceptionExit(EXCEPTION_DSI, ARM64Reg::X30, ARM64Reg::Q1); } + if (update && !early_update) + { + gpr.BindToRegister(inst.RA, false); + MOV(gpr.R(inst.RA), addr_reg); + } + if (js.assumeNoPairedQuantize && !have_single) fpr.Unlock(VS); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 5f5b8826fd..bf650baf15 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -495,7 +495,9 @@ void JitArm64::GenerateQuantizedLoads() // Q1 is a temporary ARM64Reg addr_reg = ARM64Reg::X0; ARM64Reg scale_reg = ARM64Reg::X1; - BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 2, 3}; + BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{2, 3}; + if (!jo.memcheck) + gprs_to_push &= ~BitSet32{0}; BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1}; ARM64FloatEmitter float_emit(this); @@ -524,8 +526,8 @@ void JitArm64::GenerateQuantizedLoads() float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); - MOVP2R(addr_reg, &m_dequantizeTableS); - ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + MOVP2R(ARM64Reg::X2, &m_dequantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); RET(ARM64Reg::X30); @@ -542,8 +544,8 @@ void JitArm64::GenerateQuantizedLoads() float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); - MOVP2R(addr_reg, &m_dequantizeTableS); - ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + MOVP2R(ARM64Reg::X2, &m_dequantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); RET(ARM64Reg::X30); @@ -559,8 +561,8 @@ void JitArm64::GenerateQuantizedLoads() float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); - MOVP2R(addr_reg, &m_dequantizeTableS); - ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + MOVP2R(ARM64Reg::X2, &m_dequantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); RET(ARM64Reg::X30); @@ -576,8 +578,8 @@ void JitArm64::GenerateQuantizedLoads() float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); - MOVP2R(addr_reg, &m_dequantizeTableS); - ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + MOVP2R(ARM64Reg::X2, &m_dequantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); RET(ARM64Reg::X30); @@ -605,8 +607,8 @@ void JitArm64::GenerateQuantizedLoads() float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); - MOVP2R(addr_reg, &m_dequantizeTableS); - ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + MOVP2R(ARM64Reg::X2, &m_dequantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); RET(ARM64Reg::X30); @@ -623,8 +625,8 @@ void JitArm64::GenerateQuantizedLoads() float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); - MOVP2R(addr_reg, &m_dequantizeTableS); - ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + MOVP2R(ARM64Reg::X2, &m_dequantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); RET(ARM64Reg::X30); @@ -640,8 +642,8 @@ void JitArm64::GenerateQuantizedLoads() float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0); - MOVP2R(addr_reg, &m_dequantizeTableS); - ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + MOVP2R(ARM64Reg::X2, &m_dequantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); RET(ARM64Reg::X30); @@ -657,8 +659,8 @@ void JitArm64::GenerateQuantizedLoads() float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0); float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0); - MOVP2R(addr_reg, &m_dequantizeTableS); - ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); + MOVP2R(ARM64Reg::X2, &m_dequantizeTableS); + ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3)); float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0); float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0); RET(ARM64Reg::X30); @@ -701,7 +703,9 @@ void JitArm64::GenerateQuantizedStores() // Q1 is a temporary ARM64Reg scale_reg = ARM64Reg::X0; ARM64Reg addr_reg = ARM64Reg::X1; - BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 1, 2}; + BitSet32 gprs_to_push = CALLER_SAVED_GPRS & ~BitSet32{0, 2}; + if (!jo.memcheck) + gprs_to_push &= ~BitSet32{1}; BitSet32 fprs_to_push = BitSet32(0xFFFFFFFF) & ~BitSet32{0, 1}; ARM64FloatEmitter float_emit(this);