From 96760093e9cdce17fad57c9efa0370a14b908b1e Mon Sep 17 00:00:00 2001
From: JosJuice
Date: Fri, 9 Jul 2021 12:13:58 +0200
Subject: [PATCH] JitArm64: Move psq_st address check to EmitBackpatchRoutine

This way the address check will take up less icache (since it's only
emitted once for each routine rather than once for each psq_st
instruction), and we also get address checking for psq_l. Matches
Jit64's approach.

The disadvantage: In the slowmem case, the routines have to push *every*
caller-saved register onto the stack, even though most callers probably
don't need it. But as long as the slowmem case isn't hit frequently,
this is fine.
---
 Source/Core/Core/PowerPC/JitArm64/Jit.h       |   4 +-
 .../PowerPC/JitArm64/JitArm64_BackPatch.cpp   |  70 ++--
 .../PowerPC/JitArm64/JitArm64_LoadStore.cpp   |  10 +
 .../JitArm64/JitArm64_LoadStorePaired.cpp     |  38 +--
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp  | 311 +++++++-----------
 5 files changed, 182 insertions(+), 251 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index bf266acfd3..34f25c04c3 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -225,10 +225,9 @@ protected:
   void DumpCode(const u8* start, const u8* end);
 
   // Backpatching routines
-  bool DisasmLoadStore(const u8* ptr, u32* flags, Arm64Gen::ARM64Reg* reg);
   void EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, Arm64Gen::ARM64Reg RS,
                             Arm64Gen::ARM64Reg addr, BitSet32 gprs_to_push = BitSet32(0),
-                            BitSet32 fprs_to_push = BitSet32(0));
+                            BitSet32 fprs_to_push = BitSet32(0), bool emitting_routine = false);
   // Loadstore routines
   void SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 offset, bool update);
   void SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s32 offset);
@@ -236,6 +235,7 @@ protected:
   // jumps to the returned FixupBranch. Clobbers tmp and the 17 lower bits of addr_out.
   Arm64Gen::FixupBranch BATAddressLookup(Arm64Gen::ARM64Reg addr_out, Arm64Gen::ARM64Reg addr_in,
                                          Arm64Gen::ARM64Reg tmp, const void* bat_table);
+  Arm64Gen::FixupBranch CheckIfSafeAddress(Arm64Gen::ARM64Reg addr);
 
   void DoJit(u32 em_address, JitBlock* b, u32 nextPC);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
index fa5ed3e67d..de42f5811f 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp
@@ -3,6 +3,7 @@
 #include
 #include
+#include <optional>
 #include
 
 #include "Common/BitSet.h"
@@ -51,13 +52,18 @@ void JitArm64::DoBacktrace(uintptr_t access_address, SContext* ctx)
 }
 
 void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, ARM64Reg RS,
-                                    ARM64Reg addr, BitSet32 gprs_to_push, BitSet32 fprs_to_push)
+                                    ARM64Reg addr, BitSet32 gprs_to_push, BitSet32 fprs_to_push,
+                                    bool emitting_routine)
 {
   bool in_far_code = false;
   const u8* fastmem_start = GetCodePtr();
+  std::optional<FixupBranch> slowmem_fixup;
 
   if (fastmem)
   {
+    if (do_farcode && emitting_routine)
+      slowmem_fixup = CheckIfSafeAddress(addr);
+
     if ((flags & BackPatchInfo::FLAG_STORE) && (flags & BackPatchInfo::FLAG_FLOAT))
     {
       ARM64Reg temp = ARM64Reg::D0;
@@ -110,34 +116,45 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
   {
     if (fastmem && do_farcode)
     {
-      SlowmemHandler handler;
-      handler.dest_reg = RS;
-      handler.addr_reg = addr;
-      handler.gprs = gprs_to_push;
-      handler.fprs = fprs_to_push;
-      handler.flags = flags;
-
-      FastmemArea* fastmem_area = &m_fault_to_handler[fastmem_end];
-      auto handler_loc_iter = m_handler_to_loc.find(handler);
-
-      if (handler_loc_iter == m_handler_to_loc.end())
+      if (emitting_routine)
       {
         in_far_code = true;
         SwitchToFarCode();
-        const u8* handler_loc = GetCodePtr();
-        m_handler_to_loc[handler] = handler_loc;
-        fastmem_area->fastmem_code = fastmem_start;
-        fastmem_area->slowmem_code = handler_loc;
       }
       else
       {
-        const u8* handler_loc = handler_loc_iter->second;
-        fastmem_area->fastmem_code = fastmem_start;
-        fastmem_area->slowmem_code = handler_loc;
-        return;
+        SlowmemHandler handler;
+        handler.dest_reg = RS;
+        handler.addr_reg = addr;
+        handler.gprs = gprs_to_push;
+        handler.fprs = fprs_to_push;
+        handler.flags = flags;
+
+        FastmemArea* fastmem_area = &m_fault_to_handler[fastmem_end];
+        auto handler_loc_iter = m_handler_to_loc.find(handler);
+
+        if (handler_loc_iter == m_handler_to_loc.end())
+        {
+          in_far_code = true;
+          SwitchToFarCode();
+          const u8* handler_loc = GetCodePtr();
+          m_handler_to_loc[handler] = handler_loc;
+          fastmem_area->fastmem_code = fastmem_start;
+          fastmem_area->slowmem_code = handler_loc;
+        }
+        else
+        {
+          const u8* handler_loc = handler_loc_iter->second;
+          fastmem_area->fastmem_code = fastmem_start;
+          fastmem_area->slowmem_code = handler_loc;
+          return;
+        }
       }
     }
 
+    if (slowmem_fixup)
+      SetJumpTarget(*slowmem_fixup);
+
     ABI_PushRegisters(gprs_to_push);
     m_float_emit.ABI_PushRegisters(fprs_to_push, ARM64Reg::X30);
@@ -229,8 +246,17 @@ void JitArm64::EmitBackpatchRoutine(u32 flags, bool fastmem, bool do_farcode, AR
 
   if (in_far_code)
   {
-    RET(ARM64Reg::X30);
-    SwitchToNearCode();
+    if (emitting_routine)
+    {
+      FixupBranch done = B();
+      SwitchToNearCode();
+      SetJumpTarget(done);
+    }
+    else
+    {
+      RET(ARM64Reg::X30);
+      SwitchToNearCode();
+    }
   }
 }
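// For readers following along: with the new emitting_routine parameter set, the code that
// EmitBackpatchRoutine emits for a shared quantized load/store routine is roughly the
// following (a simplified sketch of the hunks above, not verbatim emitter output):
//
//       TST   addr, #0x0c000000       ; CheckIfSafeAddress (FIXME: ignores the BAT config)
//       B.EQ  fastmem                 ; looks like a normal RAM address
//       B     slowmem                 ; otherwise, jump to the far-code fallback
//   fastmem:
//       ...load/store through the fastmem arena...
//   continue:
//       ...rest of the routine, then RET to the psq_l/psq_st call site
//
//   ; far code
//   slowmem:
//       push all caller-saved GPRs/FPRs   ; the cost noted in the commit message
//       call the slow C++ memory handler
//       pop the registers
//       B     continue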
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
index 0cedefac45..3d18b977bb 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
@@ -288,6 +288,16 @@ FixupBranch JitArm64::BATAddressLookup(ARM64Reg addr_out, ARM64Reg addr_in, ARM6
   return fail;
 }
 
+FixupBranch JitArm64::CheckIfSafeAddress(Arm64Gen::ARM64Reg addr)
+{
+  // FIXME: This doesn't correctly account for the BAT configuration.
+  TST(addr, LogicalImm(0x0c000000, 32));
+  FixupBranch pass = B(CC_EQ);
+  FixupBranch fail = B();
+  SetJumpTarget(pass);
+  return fail;
+}
+
 void JitArm64::lXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
index 305eb8a9f2..60e86225a4 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStorePaired.cpp
@@ -19,10 +19,10 @@ void JitArm64::psq_lXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStorePairedOff);
-  FALLBACK_IF(jo.memcheck || !jo.fastmem);
+  FALLBACK_IF(jo.memcheck);
 
-  // The asm routines assume address translation is on.
-  FALLBACK_IF(!MSR.DR);
+  // If we have a fastmem arena, the asm routines assume address translation is on.
+  FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
 
   // X30 is LR
   // X0 is the address
@@ -111,10 +111,10 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
 {
   INSTRUCTION_START
   JITDISABLE(bJITLoadStorePairedOff);
-  FALLBACK_IF(jo.memcheck || !jo.fastmem);
+  FALLBACK_IF(jo.memcheck);
 
-  // The asm routines assume address translation is on.
-  FALLBACK_IF(!MSR.DR);
+  // If we have a fastmem arena, the asm routines assume address translation is on.
+  FALLBACK_IF(!js.assumeNoPairedQuantize && jo.fastmem_arena && !MSR.DR);
 
   // X30 is LR
   // X0 contains the scale
@@ -213,33 +213,9 @@ void JitArm64::psq_stXX(UGeckoInstruction inst)
     UBFM(type_reg, scale_reg, 0, 2);    // Type
     UBFM(scale_reg, scale_reg, 8, 13);  // Scale
 
-    // Inline address check
-    // FIXME: This doesn't correctly account for the BAT configuration.
-    TST(addr_reg, LogicalImm(0x0c000000, 32));
-    FixupBranch pass = B(CC_EQ);
-    FixupBranch fail = B();
-
-    SwitchToFarCode();
-    SetJumpTarget(fail);
-
-    // Slow
-    MOVP2R(ARM64Reg::X30, &paired_store_quantized[16 + w * 8]);
-    LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
-
-    ABI_PushRegisters(gprs_in_use);
-    m_float_emit.ABI_PushRegisters(fprs_in_use, ARM64Reg::X30);
-    BLR(EncodeRegTo64(type_reg));
-    m_float_emit.ABI_PopRegisters(fprs_in_use, ARM64Reg::X30);
-    ABI_PopRegisters(gprs_in_use);
-
-    FixupBranch continue1 = B();
-    SwitchToNearCode();
-    SetJumpTarget(pass);
-
-    // Fast
-    MOVP2R(ARM64Reg::X30, &paired_store_quantized[w * 8]);
+    MOVP2R(ARM64Reg::X30, w ? single_store_quantized : paired_store_quantized);
     LDR(EncodeRegTo64(type_reg), ARM64Reg::X30, ArithOption(EncodeRegTo64(type_reg), true));
     BLR(EncodeRegTo64(type_reg));
-
-    SetJumpTarget(continue1);
   }
 
   if (js.assumeNoPairedQuantize && !have_single)
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index f083ba3dee..095fe14aff 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -506,8 +506,9 @@ void JitArm64::GenerateQuantizedLoads()
     constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
                           BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32;
 
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1},
-                         fprs_to_push);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push & ~BitSet32{1}, fprs_to_push);
+
     RET(ARM64Reg::X30);
   }
   const u8* loadPairedU8Two = GetCodePtr();
@@ -515,7 +516,8 @@ void JitArm64::GenerateQuantizedLoads()
     constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
                           BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
 
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
     float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
@@ -532,7 +534,8 @@ void JitArm64::GenerateQuantizedLoads()
     constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
                           BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
 
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
     float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
@@ -549,7 +552,8 @@ void JitArm64::GenerateQuantizedLoads()
     constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
                           BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
 
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
     float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
@@ -565,7 +569,8 @@ void JitArm64::GenerateQuantizedLoads()
     constexpr u32 flags = BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT |
                           BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
 
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
     float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
@@ -582,8 +587,9 @@ void JitArm64::GenerateQuantizedLoads()
     constexpr u32 flags =
        BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
 
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push & ~BitSet32{1},
-                         fprs_to_push);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push & ~BitSet32{1}, fprs_to_push);
+
     RET(ARM64Reg::X30);
   }
   const u8* loadPairedU8One = GetCodePtr();
@@ -591,7 +597,8 @@ void JitArm64::GenerateQuantizedLoads()
     constexpr u32 flags =
        BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
 
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
     float_emit.UXTL(8, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
@@ -608,7 +615,8 @@ void JitArm64::GenerateQuantizedLoads()
     constexpr u32 flags =
        BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
 
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
     float_emit.SXTL(8, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
@@ -625,7 +633,8 @@ void JitArm64::GenerateQuantizedLoads()
     constexpr u32 flags =
        BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
 
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
     float_emit.UXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.UCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
@@ -641,7 +650,8 @@ void JitArm64::GenerateQuantizedLoads()
     constexpr u32 flags =
        BackPatchInfo::FLAG_LOAD | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
 
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
     float_emit.SXTL(16, ARM64Reg::D0, ARM64Reg::D0);
     float_emit.SCVTF(32, ARM64Reg::D0, ARM64Reg::D0);
@@ -697,256 +707,181 @@ void JitArm64::GenerateQuantizedStores()
   const u8* start = GetCodePtr();
   const u8* storePairedIllegal = GetCodePtr();
   BRK(0x101);
-  const u8* storePairedFloat;
-  const u8* storePairedFloatSlow;
+  const u8* storePairedFloat = GetCodePtr();
   {
     constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
                           BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_32;
 
-    storePairedFloat = GetCodePtr();
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
-    RET(ARM64Reg::X30);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
-    storePairedFloatSlow = GetCodePtr();
-    EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
     RET(ARM64Reg::X30);
   }
-
-  const u8* storePairedU8;
-  const u8* storePairedU8Slow;
+  const u8* storePairedU8 = GetCodePtr();
   {
-    auto emit_quantize = [this, &float_emit, scale_reg]() {
-      MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
-      ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
-      float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
-      float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
+    MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
+    float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
 
-      float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
-    };
+    float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
 
     constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
                           BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
 
-    storePairedU8 = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
-    RET(ARM64Reg::X30);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
-    storePairedU8Slow = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
     RET(ARM64Reg::X30);
   }
-  const u8* storePairedS8;
-  const u8* storePairedS8Slow;
+  const u8* storePairedS8 = GetCodePtr();
   {
-    auto emit_quantize = [this, &float_emit, scale_reg]() {
-      MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
-      ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
-      float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
-      float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
+    MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
+    float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
 
-      float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
-    };
+    float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
 
     constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
                           BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_8;
 
-    storePairedS8 = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
-    RET(ARM64Reg::X30);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
-    storePairedS8Slow = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
     RET(ARM64Reg::X30);
   }
-
-  const u8* storePairedU16;
-  const u8* storePairedU16Slow;
+  const u8* storePairedU16 = GetCodePtr();
   {
-    auto emit_quantize = [this, &float_emit, scale_reg]() {
-      MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
-      ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
-      float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
-      float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
+    MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
+    float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
 
-      float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
-    };
+    float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
 
     constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
                           BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
 
-    storePairedU16 = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
-    RET(ARM64Reg::X30);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
-    storePairedU16Slow = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
     RET(ARM64Reg::X30);
   }
-  const u8* storePairedS16;  // Used by Viewtiful Joe's intro movie
-  const u8* storePairedS16Slow;
+  const u8* storePairedS16 = GetCodePtr();  // Used by Viewtiful Joe's intro movie
   {
-    auto emit_quantize = [this, &float_emit, scale_reg]() {
-      MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
-      ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
-      float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
-      float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
+    MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
+    float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1, 0);
 
-      float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
-    };
+    float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
 
     constexpr u32 flags = BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT |
                           BackPatchInfo::FLAG_PAIR | BackPatchInfo::FLAG_SIZE_16;
 
-    storePairedS16 = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
-    RET(ARM64Reg::X30);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
-    storePairedS16Slow = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
     RET(ARM64Reg::X30);
   }
-  const u8* storeSingleFloat;
-  const u8* storeSingleFloatSlow;
+  const u8* storeSingleFloat = GetCodePtr();
   {
     constexpr u32 flags =
        BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_32;
 
-    storeSingleFloat = GetCodePtr();
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
-    RET(ARM64Reg::X30);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push);
 
-    storeSingleFloatSlow = GetCodePtr();
-    EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
     RET(ARM64Reg::X30);
   }
-  const u8* storeSingleU8;  // Used by MKWii
-  const u8* storeSingleU8Slow;
+  const u8* storeSingleU8 = GetCodePtr();  // Used by MKWii
   {
-    auto emit_quantize = [this, &float_emit, scale_reg]() {
-      MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
-      ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
-      float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
-      float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
+    MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
+    float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
 
-      float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
-    };
+    float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.UQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
 
     constexpr u32 flags =
        BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
 
-    storeSingleU8 = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
-    RET(ARM64Reg::X30);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
-    storeSingleU8Slow = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
     RET(ARM64Reg::X30);
   }
-  const u8* storeSingleS8;
-  const u8* storeSingleS8Slow;
+  const u8* storeSingleS8 = GetCodePtr();
   {
-    auto emit_quantize = [this, &float_emit, scale_reg]() {
-      MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
-      ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
-      float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
-      float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
+    MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
+    float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
 
-      float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
-    };
+    float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.SQXTN(8, ARM64Reg::D0, ARM64Reg::D0);
 
     constexpr u32 flags =
        BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_8;
 
-    storeSingleS8 = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
-    RET(ARM64Reg::X30);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
-    storeSingleS8Slow = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
     RET(ARM64Reg::X30);
   }
-  const u8* storeSingleU16;  // Used by MKWii
-  const u8* storeSingleU16Slow;
+  const u8* storeSingleU16 = GetCodePtr();  // Used by MKWii
   {
-    auto emit_quantize = [this, &float_emit, scale_reg]() {
-      MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
-      ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
-      float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
-      float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
+    MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
+    float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
 
-      float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
-    };
+    float_emit.FCVTZU(32, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.UQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
 
     constexpr u32 flags =
        BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
 
-    storeSingleU16 = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
-    RET(ARM64Reg::X30);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
-    storeSingleU16Slow = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
     RET(ARM64Reg::X30);
   }
-  const u8* storeSingleS16;
-  const u8* storeSingleS16Slow;
+  const u8* storeSingleS16 = GetCodePtr();
   {
-    auto emit_quantize = [this, &float_emit, scale_reg]() {
-      MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
-      ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
-      float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
-      float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
+    MOVP2R(ARM64Reg::X2, &m_quantizeTableS);
+    ADD(scale_reg, ARM64Reg::X2, scale_reg, ArithOption(scale_reg, ShiftType::LSL, 3));
+    float_emit.LDR(32, IndexType::Unsigned, ARM64Reg::D1, scale_reg, 0);
+    float_emit.FMUL(32, ARM64Reg::D0, ARM64Reg::D0, ARM64Reg::D1);
 
-      float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
-      float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
-    };
+    float_emit.FCVTZS(32, ARM64Reg::D0, ARM64Reg::D0);
+    float_emit.SQXTN(16, ARM64Reg::D0, ARM64Reg::D0);
 
     constexpr u32 flags =
        BackPatchInfo::FLAG_STORE | BackPatchInfo::FLAG_FLOAT | BackPatchInfo::FLAG_SIZE_16;
 
-    storeSingleS16 = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, true, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
-    RET(ARM64Reg::X30);
+    EmitBackpatchRoutine(flags, jo.fastmem_arena, jo.fastmem_arena, ARM64Reg::D0, addr_reg,
+                         gprs_to_push, fprs_to_push, true);
 
-    storeSingleS16Slow = GetCodePtr();
-    emit_quantize();
-    EmitBackpatchRoutine(flags, false, false, ARM64Reg::D0, addr_reg, gprs_to_push, fprs_to_push);
     RET(ARM64Reg::X30);
   }
 
   JitRegister::Register(start, GetCodePtr(), "JIT_QuantizedStore");
 
   paired_store_quantized = reinterpret_cast<const u8**>(AlignCode16());
-  ReserveCodeSpace(32 * sizeof(u8*));
+  ReserveCodeSpace(8 * sizeof(u8*));
 
-  // Fast
   paired_store_quantized[0] = storePairedFloat;
   paired_store_quantized[1] = storePairedIllegal;
   paired_store_quantized[2] = storePairedIllegal;
@@ -956,31 +891,15 @@ void JitArm64::GenerateQuantizedStores()
   paired_store_quantized[6] = storePairedS8;
   paired_store_quantized[7] = storePairedS16;
 
-  paired_store_quantized[8] = storeSingleFloat;
-  paired_store_quantized[9] = storePairedIllegal;
-  paired_store_quantized[10] = storePairedIllegal;
-  paired_store_quantized[11] = storePairedIllegal;
-  paired_store_quantized[12] = storeSingleU8;
-  paired_store_quantized[13] = storeSingleU16;
-  paired_store_quantized[14] = storeSingleS8;
-  paired_store_quantized[15] = storeSingleS16;
+  single_store_quantized = reinterpret_cast<const u8**>(AlignCode16());
+  ReserveCodeSpace(8 * sizeof(u8*));
 
-  // Slow
-  paired_store_quantized[16] = storePairedFloatSlow;
-  paired_store_quantized[17] = storePairedIllegal;
-  paired_store_quantized[18] = storePairedIllegal;
-  paired_store_quantized[19] = storePairedIllegal;
-  paired_store_quantized[20] = storePairedU8Slow;
-  paired_store_quantized[21] = storePairedU16Slow;
-  paired_store_quantized[22] = storePairedS8Slow;
-  paired_store_quantized[23] = storePairedS16Slow;
-
-  paired_store_quantized[24] = storeSingleFloatSlow;
-  paired_store_quantized[25] = storePairedIllegal;
-  paired_store_quantized[26] = storePairedIllegal;
-  paired_store_quantized[27] = storePairedIllegal;
-  paired_store_quantized[28] = storeSingleU8Slow;
-  paired_store_quantized[29] = storeSingleU16Slow;
-  paired_store_quantized[30] = storeSingleS8Slow;
-  paired_store_quantized[31] = storeSingleS16Slow;
+  single_store_quantized[0] = storeSingleFloat;
+  single_store_quantized[1] = storePairedIllegal;
+  single_store_quantized[2] = storePairedIllegal;
+  single_store_quantized[3] = storePairedIllegal;
+  single_store_quantized[4] = storeSingleU8;
+  single_store_quantized[5] = storeSingleU16;
+  single_store_quantized[6] = storeSingleS8;
+  single_store_quantized[7] = storeSingleS16;
 }
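// With the tables split, a psq_st call site now dispatches roughly like this
// (a sketch of the code emitted by the psq_stXX hunk above):
//
//       UBFM  type_reg,  scale_reg, #0, #2     ; store type from the GQR
//       UBFM  scale_reg, scale_reg, #8, #13    ; store scale from the GQR
//       MOVP2R(X30, w ? single_store_quantized : paired_store_quantized);
//       LDR   Xtype, [X30, Xtype, LSL #3]      ; fetch the routine for this type
//       BLR   Xtype                            ; routine does its own fast/slow split
//
// The inline TST/branch/far-code sequence that used to be emitted at every
// psq_st site is gone; the address check now lives once in each shared routine.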