From 8b8310d28cd589fb371f425383c756699766c3a1 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sat, 28 Feb 2015 04:39:15 -0600
Subject: [PATCH 1/6] [AArch64] Optimize FPR pushing and popping.

Previously we emitted a single STR/LDR per quad FPR that we wanted to
push or pop. In most cases the VFP registers we push and pop are
consecutive, so they can be saved more efficiently with the NEON
multiple-structure loadstores, which handle up to four quad registers
per instruction. This can cut the push/pop sequences down to roughly a
quarter as many instructions when the registers are all consecutive.

On the Cortex-A57 this is basically just an icache improvement, but the
Nvidia Denver may optimize it into something more efficient. Either way
it's a win.
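
To make the win concrete, a sketch (register numbers and the float_emit
handle are illustrative, not part of this change): pushing the four
consecutive quads Q4-Q7 previously cost four pre-indexed stores, while
the bundled path costs a stack adjustment plus one ST1:

  // Before: one pre-indexed store per quad register
  //   str q4, [sp, #-16]!
  //   str q5, [sp, #-16]!
  //   str q6, [sp, #-16]!
  //   str q7, [sp, #-16]!
  // After: a single four-register multiple-structure store
  //   sub sp, sp, #64
  //   st1 {v4.2d-v7.2d}, [sp], #64   ; post-index walks SP back up
  //   sub sp, sp, #64                ; park SP back below the saved data
  BitSet32 fprs(0xF0);  // bits 4-7 -> Q4-Q7
  float_emit.ABI_PushRegisters(fprs);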
---
 Source/Core/Common/Arm64Emitter.cpp | 145 ++++++++++++++++++++++++++--
 Source/Core/Common/Arm64Emitter.h   |   5 +-
 2 files changed, 139 insertions(+), 11 deletions(-)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index b8a7ee78a8..a730560177 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -1891,6 +1891,27 @@ void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opc
 		(encoded_size << 10) | (Rn << 5) | Rt);
 }
 
+void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+	bool quad = IsQuad(Rt);
+	u32 encoded_size = 0;
+
+	if (size == 16)
+		encoded_size = 1;
+	else if (size == 32)
+		encoded_size = 2;
+	else if (size == 64)
+		encoded_size = 3;
+
+	Rt = DecodeReg(Rt);
+	Rn = DecodeReg(Rn);
+	Rm = DecodeReg(Rm);
+
+	Write32((quad << 30) | (0b11001 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) | \
+		(encoded_size << 10) | (Rn << 5) | Rt);
+
+}
+
 void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
 {
 	_assert_msg_(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__);
@@ -2234,6 +2255,22 @@ void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
 		opcode = 0b0010;
 	EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn);
 }
+void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+	_assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
+	_assert_msg_(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __FUNCTION__);
+
+	u32 opcode = 0;
+	if (count == 1)
+		opcode = 0b111;
+	else if (count == 2)
+		opcode = 0b1010;
+	else if (count == 3)
+		opcode = 0b0110;
+	else if (count == 4)
+		opcode = 0b0010;
+	EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm);
+}
 void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
 {
 	_assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
@@ -2248,6 +2285,22 @@ void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
 		opcode = 0b0010;
 	EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn);
 }
+void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+	_assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
+	_assert_msg_(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __FUNCTION__);
+
+	u32 opcode = 0;
+	if (count == 1)
+		opcode = 0b111;
+	else if (count == 2)
+		opcode = 0b1010;
+	else if (count == 3)
+		opcode = 0b0110;
+	else if (count == 4)
+		opcode = 0b0010;
+	EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm);
+}
 
 // Scalar - 1 Source
 void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn)
@@ -2761,21 +2814,93 @@ void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8
 
 void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers)
 {
-	for (auto it : registers)
-		STR(128, INDEX_PRE, (ARM64Reg)(Q0 + it), SP, -16);
-
-}
-void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
-{
-	for (int i = 31; i >= 0; --i)
+	bool bundled_loadstore = false;
+	for (int i = 0; i < 32; ++i)
 	{
 		if (!registers[i])
 			continue;
 
-		if (ignore_mask[i])
-			m_emit->ADD(SP, SP, 16);
-		else
+		int count = 0;
+		while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+		if (count > 1)
+		{
+			bundled_loadstore = true;
+			break;
+		}
+	}
+
+	if (!bundled_loadstore)
+	{
+		for (auto it : registers)
+			STR(128, INDEX_PRE, (ARM64Reg)(Q0 + it), SP, -16);
+	}
+	else
+	{
+		int num_regs = registers.Count();
+		// Violating the AAPCS64 never felt so right.
+		m_emit->SUB(SP, SP, num_regs * 16);
+		for (int i = 0; i < 32; ++i)
+		{
+			if (!registers[i])
+				continue;
+
+			int count = 0;
+
+			// 0 = true
+			// 1 < 4 && registers[i + 1] true!
+			// 2 < 4 && registers[i + 2] true!
+			// 3 < 4 && registers[i + 3] true!
+			// 4 < 4 && registers[i + 4] false!
+			while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+
+			ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
+
+			i += count - 1;
+		}
+		m_emit->SUB(SP, SP, num_regs * 16);
+	}
+}
+void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers)
+{
+	bool bundled_loadstore = false;
+	for (int i = 0; i < 32; ++i)
+	{
+		if (!registers[i])
+			continue;
+
+		int count = 0;
+		while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+		if (count > 1)
+		{
+			bundled_loadstore = true;
+			break;
+		}
+	}
+
+	if (!bundled_loadstore)
+	{
+		for (int i = 31; i >= 0; --i)
+		{
+			if (!registers[i])
+				continue;
+
 			LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
+		}
+	}
+	else
+	{
+		for (int i = 0; i < 32; ++i)
+		{
+			if (!registers[i])
+				continue;
+
+			int count = 0;
+			while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+
+			LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
+
+			i += count - 1;
+		}
 	}
 }
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index bb3bf770ce..77d72183c5 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -658,7 +658,9 @@ public:
 
 	// Loadstore multiple structure
 	void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+	void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
 	void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+	void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
 
 	// Scalar - 1 Source
 	void FABS(ARM64Reg Rd, ARM64Reg Rn);
@@ -748,7 +750,7 @@ public:
 
 	// ABI related
 	void ABI_PushRegisters(BitSet32 registers);
-	void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
+	void ABI_PopRegisters(BitSet32 registers);
 
 private:
 	ARM64XEmitter* m_emit;
@@ -770,6 +772,7 @@ private:
 	void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm);
 	void EmitShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
 	void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn);
+	void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
 	void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
 	void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);

From 39e357d62d4a0cf936d3a1cdd7ff306d168cc57d Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Mon, 2 Mar 2015 03:41:19 -0600
Subject: [PATCH 2/6] [AArch64] Implement VFP loadstore paired in the emitter.
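
Note that the 7-bit signed immediate is scaled by the access size, so
128-bit pairs can reach offsets from -1024 to +1008 in 16-byte steps.
Usage sketch (operands illustrative; float_emit is an ARM64FloatEmitter):

  // stp q0, q1, [sp, #-32]!   <- push two quads in one instruction
  float_emit.STP(128, INDEX_PRE, Q0, Q1, SP, -32);
  // ldp q0, q1, [sp], #32     <- pop them back
  float_emit.LDP(128, INDEX_POST, Q0, Q1, SP, 32);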
---
 Source/Core/Common/Arm64Emitter.cpp | 62 +++++++++++++++++++++++++++++
 Source/Core/Common/Arm64Emitter.h   |  7 ++++
 2 files changed, 69 insertions(+)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index a730560177..4fc7c38ae0 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -512,6 +512,9 @@ void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64R
 	case INDEX_PRE:
 		type_encode = 0b011;
 		break;
+	case INDEX_SIGNED:
+		_assert_msg_(DYNA_REC, false, "%s doesn't support INDEX_SIGNED!", __FUNCTION__);
+		break;
 	}
 
 	if (b64Bit)
@@ -1944,6 +1947,55 @@ void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM
 	Write32((size << 30) | (0b1111 << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt);
 }
 
+void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+	u32 type_encode = 0;
+	u32 opc = 0;
+
+	switch (type)
+	{
+	case INDEX_SIGNED:
+		type_encode = 0b010;
+		break;
+	case INDEX_POST:
+		type_encode = 0b001;
+		break;
+	case INDEX_PRE:
+		type_encode = 0b011;
+		break;
+	case INDEX_UNSIGNED:
+		_assert_msg_(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __FUNCTION__);
+		break;
+	}
+
+	if (size == 128)
+	{
+		_assert_msg_(DYNA_REC, !(imm & 0xF), "%s received invalid offset 0x%x!", __FUNCTION__, imm);
+		opc = 2;
+		imm >>= 4;
+	}
+	else if (size == 64)
+	{
+		_assert_msg_(DYNA_REC, !(imm & 0x7), "%s received invalid offset 0x%x!", __FUNCTION__, imm);
+		opc = 1;
+		imm >>= 3;
+	}
+	else if (size == 32)
+	{
+		_assert_msg_(DYNA_REC, !(imm & 0x3), "%s received invalid offset 0x%x!", __FUNCTION__, imm);
+		opc = 0;
+		imm >>= 2;
+	}
+
+	Rt = DecodeReg(Rt);
+	Rt2 = DecodeReg(Rt2);
+	Rn = DecodeReg(Rn);
+
+	Write32((opc << 30) | (0b1011 << 26) | (type_encode << 23) | (load << 22) | \
+		((imm & 0x7F) << 15) | (Rt2 << 10) | (Rn << 5) | Rt);
+
+}
+
 void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
 {
 	EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm);
@@ -2302,6 +2354,16 @@ void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM6
 	EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm);
 }
 
+// Loadstore paired
+void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+	EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm);
+}
+void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+	EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm);
+}
+
 // Scalar - 1 Source
 void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn)
 {
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index 77d72183c5..3fd3d390c7 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -109,6 +109,8 @@ enum IndexType
 	INDEX_UNSIGNED,
 	INDEX_POST,
 	INDEX_PRE,
+	// Only for VFP loadstore paired
+	INDEX_SIGNED,
 };
 
 enum ShiftAmount
@@ -662,6 +664,10 @@ public:
 	void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
 	void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
 
+	// Loadstore paired
+	void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+	void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+
 	// Scalar - 1 Source
 	void FABS(ARM64Reg Rd, ARM64Reg Rn);
 	void FNEG(ARM64Reg Rd, ARM64Reg Rn);
@@ -776,6 +782,7 @@ private:
 	void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
 	void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
+	void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
 };
 
 class ARM64CodeBlock : public CodeBlock

From fad46729b097ecd808508e9f57cc5609ed335054 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Mon, 2 Mar 2015 03:41:45 -0600
Subject: [PATCH 3/6] [AArch64] Implement paired pushing/popping for the VFP.

This is a bit more efficient when we are only pushing two VFP
registers. We can probably gain a bit more in the future by mixing
paired loadstores into the other paths as well.
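
Sketch of the new fast path (Q0/Q1 illustrative):

  // Before: str q0, [sp, #-16]!    After: stp q0, q1, [sp, #-32]!
  //         str q1, [sp, #-16]!
  STP(128, INDEX_PRE, regs[0], regs[1], SP, -32);   // push
  LDP(128, INDEX_POST, regs[0], regs[1], SP, 32);   // pop

STP places regs[0] at [sp] and regs[1] at [sp, #16], and the
post-indexed LDP reads the same slots back, so the round trip preserves
both registers.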
---
 Source/Core/Common/Arm64Emitter.cpp | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index 4fc7c38ae0..4cb4086d7c 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -2877,6 +2877,19 @@ void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8
 void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers)
 {
 	bool bundled_loadstore = false;
+	int num_regs = registers.Count();
+
+	if (num_regs == 2)
+	{
+		int i = 0;
+		ARM64Reg regs[2];
+		for (auto it : registers)
+			regs[i++] = (ARM64Reg)(Q0 + it);
+
+		STP(128, INDEX_PRE, regs[0], regs[1], SP, -32);
+		return;
+	}
+
 	for (int i = 0; i < 32; ++i)
 	{
 		if (!registers[i])
@@ -2898,7 +2911,6 @@ void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers)
 	}
 	else
 	{
-		int num_regs = registers.Count();
 		// Violating the AAPCS64 never felt so right.
 		m_emit->SUB(SP, SP, num_regs * 16);
 		for (int i = 0; i < 32; ++i)
@@ -2925,6 +2937,19 @@ void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers)
 void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers)
 {
 	bool bundled_loadstore = false;
+	int num_regs = registers.Count();
+
+	if (num_regs == 2)
+	{
+		int i = 0;
+		ARM64Reg regs[2];
+		for (auto it : registers)
+			regs[i++] = (ARM64Reg)(Q0 + it);
+
+		LDP(128, INDEX_POST, regs[0], regs[1], SP, 32);
+		return;
+	}
+
 	for (int i = 0; i < 32; ++i)
 	{
 		if (!registers[i])

From f1a9db9bdc13e0e706018ed44c0c731c0d076b2a Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Mon, 2 Mar 2015 11:21:15 -0600
Subject: [PATCH 4/6] [AArch64] Stop violating the AAPCS64 so much.
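
The bundled FPR path used SP itself as the post-incremented base, so
midway through the sequence the just-stored registers sat below SP,
where AAPCS64 permits them to be clobbered asynchronously (by a signal
handler, for instance). Instead, park SP below the whole save area up
front and walk a caller-provided temporary upward (sketch; x3 stands in
for the temporary, four quads assumed):

  sub sp, sp, #64               ; num_regs * 16
  add x3, sp, #0                ; cursor register; SP stays put
  st1 {v4.2d-v7.2d}, [x3], #64  ; bundles advance the cursor, not SP

If no temporary is available we fall back to the pre-indexed
STR-per-register path, which never leaves saved data below SP.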
---
 Source/Core/Common/Arm64Emitter.cpp          | 44 ++++++++++----------
 Source/Core/Common/Arm64Emitter.h            |  4 +-
 Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 40 +++++++++---------
 3 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index 4cb4086d7c..355275caf8 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -2874,7 +2874,7 @@ void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8
 	EmitVectorxElement(0, 2 | (size >> 6), L, 0b1001, H, Rd, Rn, Rm);
 }
 
-void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers)
+void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
 {
 	bool bundled_loadstore = false;
 	int num_regs = registers.Count();
@@ -2904,15 +2904,10 @@ void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers)
 		}
 	}
 
-	if (!bundled_loadstore)
+	if (bundled_loadstore && tmp != INVALID_REG)
 	{
-		for (auto it : registers)
-			STR(128, INDEX_PRE, (ARM64Reg)(Q0 + it), SP, -16);
-	}
-	else
-	{
-		// Violating the AAPCS64 never felt so right.
 		m_emit->SUB(SP, SP, num_regs * 16);
+		m_emit->ADD(tmp, SP, 0);
 		for (int i = 0; i < 32; ++i)
 		{
 			if (!registers[i])
@@ -2927,14 +2922,18 @@ void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers)
 			// 4 < 4 && registers[i + 4] false!
 			while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
 
-			ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
+			ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp);
 
 			i += count - 1;
 		}
-		m_emit->SUB(SP, SP, num_regs * 16);
+	}
+	else
+	{
+		for (auto it : registers)
+			STR(128, INDEX_PRE, (ARM64Reg)(Q0 + it), SP, -16);
 	}
 }
-void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers)
+void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
 {
 	bool bundled_loadstore = false;
 	int num_regs = registers.Count();
@@ -2964,18 +2963,9 @@ void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers)
 		}
 	}
 
-	if (!bundled_loadstore)
-	{
-		for (int i = 31; i >= 0; --i)
-		{
-			if (!registers[i])
-				continue;
-
-			LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
-		}
-	}
-	else
+	if (bundled_loadstore && tmp != INVALID_REG)
 	{
+		// The temporary register is only used to indicate that we can use this code path
 		for (int i = 0; i < 32; ++i)
 		{
 			if (!registers[i])
@@ -2989,6 +2979,16 @@ void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers)
 
 			i += count - 1;
 		}
 	}
+	else
+	{
+		for (int i = 31; i >= 0; --i)
+		{
+			if (!registers[i])
+				continue;
+
+			LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
+		}
+	}
 }
 
 }
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index 3fd3d390c7..ab933225c5 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -755,8 +755,8 @@ public:
 	void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
 
 	// ABI related
-	void ABI_PushRegisters(BitSet32 registers);
-	void ABI_PopRegisters(BitSet32 registers);
+	void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
+	void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
 
 private:
 	ARM64XEmitter* m_emit;
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
index 62cce8374b..e82367873b 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp
@@ -279,12 +279,12 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 
 		SetJumpTarget(argh);
 		ABI_PushRegisters(gprs);
-		float_emit.ABI_PushRegisters(fprs);
+		float_emit.ABI_PushRegisters(fprs, X3);
 		float_emit.UMOV(64, X0, Q0, 0);
 		ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
 		MOVI2R(X30, (u64)PowerPC::Write_U64);
 		BLR(X30);
-		float_emit.ABI_PopRegisters(fprs);
+		float_emit.ABI_PopRegisters(fprs, X3);
 		ABI_PopRegisters(gprs);
 		RET(X30);
 	}
@@ -310,12 +310,12 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 
 		SetJumpTarget(argh);
 		ABI_PushRegisters(gprs);
-		float_emit.ABI_PushRegisters(fprs);
+		float_emit.ABI_PushRegisters(fprs, X3);
 		float_emit.UMOV(16, W0, Q0, 0);
 		REV16(W0, W0);
 		MOVI2R(X30, (u64)PowerPC::Write_U16);
 		BLR(X30);
-		float_emit.ABI_PopRegisters(fprs);
+		float_emit.ABI_PopRegisters(fprs, X3);
 		ABI_PopRegisters(gprs);
 		RET(X30);
 	}
@@ -341,12 +341,12 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 
 		SetJumpTarget(argh);
 		ABI_PushRegisters(gprs);
-		float_emit.ABI_PushRegisters(fprs);
+		float_emit.ABI_PushRegisters(fprs, X3);
 		float_emit.UMOV(16, W0, Q0, 0);
 		REV16(W0, W0);
 		MOVI2R(X30, (u64)PowerPC::Write_U16);
 		BLR(X30);
-		float_emit.ABI_PopRegisters(fprs);
+		float_emit.ABI_PopRegisters(fprs, X3);
 		ABI_PopRegisters(gprs);
 		RET(X30);
 	}
@@ -372,12 +372,12 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 
 		SetJumpTarget(argh);
 		ABI_PushRegisters(gprs);
-		float_emit.ABI_PushRegisters(fprs);
+		float_emit.ABI_PushRegisters(fprs, X3);
 		float_emit.REV32(8, D0, D0);
 		float_emit.UMOV(32, W0, Q0, 0);
 		MOVI2R(X30, (u64)PowerPC::Write_U32);
 		BLR(X30);
-		float_emit.ABI_PopRegisters(fprs);
+		float_emit.ABI_PopRegisters(fprs, X3);
 		ABI_PopRegisters(gprs);
 		RET(X30);
 	}
@@ -402,12 +402,12 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 
 		SetJumpTarget(argh);
 		ABI_PushRegisters(gprs);
-		float_emit.ABI_PushRegisters(fprs);
+		float_emit.ABI_PushRegisters(fprs, X3);
 		float_emit.REV32(8, D0, D0);
 		float_emit.UMOV(32, W0, Q0, 0);
 		MOVI2R(X30, (u64)PowerPC::Write_U32);
 		BLR(X30);
-		float_emit.ABI_PopRegisters(fprs);
+		float_emit.ABI_PopRegisters(fprs, X3);
 		ABI_PopRegisters(gprs);
 		RET(X30);
 	}
@@ -428,11 +428,11 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 
 		SetJumpTarget(argh);
 		ABI_PushRegisters(gprs);
-		float_emit.ABI_PushRegisters(fprs);
+		float_emit.ABI_PushRegisters(fprs, X3);
 		float_emit.UMOV(32, W0, Q0, 0);
 		MOVI2R(X30, (u64)&PowerPC::Write_U32);
 		BLR(X30);
-		float_emit.ABI_PopRegisters(fprs);
+		float_emit.ABI_PopRegisters(fprs, X3);
 		ABI_PopRegisters(gprs);
 		RET(X30);
 	}
@@ -457,11 +457,11 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 
 		SetJumpTarget(argh);
 		ABI_PushRegisters(gprs);
-		float_emit.ABI_PushRegisters(fprs);
+		float_emit.ABI_PushRegisters(fprs, X3);
 		float_emit.UMOV(32, W0, Q0, 0);
 		MOVI2R(X30, (u64)&PowerPC::Write_U8);
 		BLR(X30);
-		float_emit.ABI_PopRegisters(fprs);
+		float_emit.ABI_PopRegisters(fprs, X3);
 		ABI_PopRegisters(gprs);
 		RET(X30);
 	}
@@ -486,11 +486,11 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 
 		SetJumpTarget(argh);
 		ABI_PushRegisters(gprs);
-		float_emit.ABI_PushRegisters(fprs);
+		float_emit.ABI_PushRegisters(fprs, X3);
 		float_emit.SMOV(32, W0, Q0, 0);
 		MOVI2R(X30, (u64)&PowerPC::Write_U8);
 		BLR(X30);
-		float_emit.ABI_PopRegisters(fprs);
+		float_emit.ABI_PopRegisters(fprs, X3);
 		ABI_PopRegisters(gprs);
 		RET(X30);
 	}
@@ -515,11 +515,11 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 
 		SetJumpTarget(argh);
 		ABI_PushRegisters(gprs);
-		float_emit.ABI_PushRegisters(fprs);
+		float_emit.ABI_PushRegisters(fprs, X3);
 		float_emit.UMOV(32, W0, Q0, 0);
 		MOVI2R(X30, (u64)&PowerPC::Write_U16);
 		BLR(X30);
-		float_emit.ABI_PopRegisters(fprs);
+		float_emit.ABI_PopRegisters(fprs, X3);
 		ABI_PopRegisters(gprs);
 		RET(X30);
 	}
@@ -544,11 +544,11 @@ void JitArm64AsmRoutineManager::GenerateCommon()
 
 		SetJumpTarget(argh);
 		ABI_PushRegisters(gprs);
-		float_emit.ABI_PushRegisters(fprs);
+		float_emit.ABI_PushRegisters(fprs, X3);
 		float_emit.SMOV(32, W0, Q0, 0);
 		MOVI2R(X30, (u64)&PowerPC::Write_U16);
 		BLR(X30);
-		float_emit.ABI_PopRegisters(fprs);
+		float_emit.ABI_PopRegisters(fprs, X3);
 		ABI_PopRegisters(gprs);
 		RET(X30);
 	}
From 81242dd4a7a219dfb0995a709c20ce4e2ee9f4a2 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Mon, 2 Mar 2015 21:38:09 -0600
Subject: [PATCH 5/6] [AArch64] Allow the other FPR push/pop routines to use a
 temporary register.
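
The temporary only needs to be a GPR whose value the surrounding
sequence can spare at that point: these call sites pass X30, which the
backpatch paths are about to clobber with BLR anyway, or X1, which the
Jit_Util lambdas immediately overwrite with MOVI2R. Call-site sketch:

  ABI_PushRegisters(regs_in_use);
  m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
  // ... emit the call ...
  m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
  ABI_PopRegisters(regs_in_use);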
---
 .../Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp     |  8 ++++----
 .../PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp  | 12 ++++++------
 Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp       |  8 ++++----
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
index 99fe6a1581..0e10fcbb52 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStore.cpp
@@ -167,12 +167,12 @@ void JitArm64::SafeLoadToReg(u32 dest, s32 addr, s32 offsetReg, u32 flags, s32 o
 		// Has a chance of being backpatched which will destroy our state
 		// push and pop everything in this instance
 		ABI_PushRegisters(regs_in_use);
-		m_float_emit.ABI_PushRegisters(fprs_in_use);
+		m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
 		EmitBackpatchRoutine(this, flags,
 			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
 			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
 			dest_reg, XA);
-		m_float_emit.ABI_PopRegisters(fprs_in_use);
+		m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
 		ABI_PopRegisters(regs_in_use);
 	}
@@ -318,12 +318,12 @@ void JitArm64::SafeStoreFromReg(s32 dest, u32 value, s32 regOffset, u32 flags, s
 		// Has a chance of being backpatched which will destroy our state
 		// push and pop everything in this instance
 		ABI_PushRegisters(regs_in_use);
-		m_float_emit.ABI_PushRegisters(fprs_in_use);
+		m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
 		EmitBackpatchRoutine(this, flags,
 			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
 			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
 			RS, XA);
-		m_float_emit.ABI_PopRegisters(fprs_in_use);
+		m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
 		ABI_PopRegisters(regs_in_use);
 	}
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
index 21da40da3f..1247dacd6d 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp
@@ -190,12 +190,12 @@ void JitArm64::lfXX(UGeckoInstruction inst)
 		// Has a chance of being backpatched which will destroy our state
 		// push and pop everything in this instance
 		ABI_PushRegisters(regs_in_use);
-		m_float_emit.ABI_PushRegisters(fprs_in_use);
+		m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
 		EmitBackpatchRoutine(this, flags,
 			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
 			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
 			VD, XA);
-		m_float_emit.ABI_PopRegisters(fprs_in_use);
+		m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
 		ABI_PopRegisters(regs_in_use);
 	}
@@ -406,9 +406,9 @@ void JitArm64::stfXX(UGeckoInstruction inst)
 	else
 	{
 		ABI_PushRegisters(regs_in_use);
-		m_float_emit.ABI_PushRegisters(fprs_in_use);
+		m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
 		EmitBackpatchRoutine(this, flags, false, false, V0, XA);
-		m_float_emit.ABI_PopRegisters(fprs_in_use);
+		m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
 		ABI_PopRegisters(regs_in_use);
 	}
 }
@@ -417,12 +417,12 @@ void JitArm64::stfXX(UGeckoInstruction inst)
 		// Has a chance of being backpatched which will destroy our state
 		// push and pop everything in this instance
 		ABI_PushRegisters(regs_in_use);
-		m_float_emit.ABI_PushRegisters(fprs_in_use);
+		m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
 		EmitBackpatchRoutine(this, flags,
 			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
 			SConfig::GetInstance().m_LocalCoreStartupParameter.bFastmem,
 			V0, XA);
-		m_float_emit.ABI_PopRegisters(fprs_in_use);
+		m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
 		ABI_PopRegisters(regs_in_use);
 	}
 	gpr.Unlock(W0, W1, W30);
diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp
index 6c35cc0649..20606a9554 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit_Util.cpp
@@ -79,11 +79,11 @@ private:
 		ARM64FloatEmitter float_emit(m_emit);
 
 		m_emit->ABI_PushRegisters(m_gprs_in_use);
-		float_emit.ABI_PushRegisters(m_fprs_in_use);
+		float_emit.ABI_PushRegisters(m_fprs_in_use, X1);
 		m_emit->MOVI2R(W1, m_address);
 		m_emit->MOV(W2, m_src_reg);
 		m_emit->BLR(m_emit->ABI_SetupLambda(lambda));
-		float_emit.ABI_PopRegisters(m_fprs_in_use);
+		float_emit.ABI_PopRegisters(m_fprs_in_use, X1);
 		m_emit->ABI_PopRegisters(m_gprs_in_use);
 	}
@@ -179,10 +179,10 @@ private:
 		ARM64FloatEmitter float_emit(m_emit);
 
 		m_emit->ABI_PushRegisters(m_gprs_in_use);
-		float_emit.ABI_PushRegisters(m_fprs_in_use);
+		float_emit.ABI_PushRegisters(m_fprs_in_use, X1);
 		m_emit->MOVI2R(W1, m_address);
 		m_emit->BLR(m_emit->ABI_SetupLambda(lambda));
-		float_emit.ABI_PopRegisters(m_fprs_in_use);
+		float_emit.ABI_PopRegisters(m_fprs_in_use, X1);
 		m_emit->ABI_PopRegisters(m_gprs_in_use);
 
 		if (m_sign_extend)

From fbdee7b15f5d123613736bed2bd965281bd0fdca Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Mon, 2 Mar 2015 23:50:33 -0600
Subject: [PATCH 6/6] [AArch64] Handle FPR island registers in a less dumb way.
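
An "island" is a set bit in the register mask with no set neighbour, so
it cannot join an ST1/LD1 bundle of two or more registers. Previously
every island fell back to its own 16-byte STR/LDR; now islands are
collected during the scan and paired up with STP/LDP afterwards, leaving
at most one register for a lone loadstore. Walking an illustrative mask:

  // registers = {Q0, Q2, Q4, Q5, Q6, Q7}, tmp cursor as in patch 4
  //   Q4-Q7 bundle       -> st1 {v4.2d-v7.2d}, [tmp], #64
  //   islands Q0 and Q2  -> stp q0, q2, [tmp], #32
  // An odd island count would end with a single str qN, [tmp], #16

The pop path mirrors the same layout, and the non-bundled path now pairs
registers as well, handling the odd register first when popping.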
---
 Source/Core/Common/Arm64Emitter.cpp | 94 ++++++++++++++++++++---------
 1 file changed, 67 insertions(+), 27 deletions(-)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index 355275caf8..69a680f7e3 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -2877,18 +2877,6 @@ void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8
 void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
 {
 	bool bundled_loadstore = false;
-	int num_regs = registers.Count();
-
-	if (num_regs == 2)
-	{
-		int i = 0;
-		ARM64Reg regs[2];
-		for (auto it : registers)
-			regs[i++] = (ARM64Reg)(Q0 + it);
-
-		STP(128, INDEX_PRE, regs[0], regs[1], SP, -32);
-		return;
-	}
 
 	for (int i = 0; i < 32; ++i)
 	{
 		if (!registers[i])
@@ -2906,8 +2894,10 @@ void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
 
 	if (bundled_loadstore && tmp != INVALID_REG)
 	{
+		int num_regs = registers.Count();
 		m_emit->SUB(SP, SP, num_regs * 16);
 		m_emit->ADD(tmp, SP, 0);
+		std::vector<ARM64Reg> island_regs;
 		for (int i = 0; i < 32; ++i)
 		{
 			if (!registers[i])
@@ -2922,15 +2912,42 @@ void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
 			// 4 < 4 && registers[i + 4] false!
 			while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
 
-			ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp);
+			if (count == 1)
+				island_regs.push_back((ARM64Reg)(Q0 + i));
+			else
+				ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp);
 
 			i += count - 1;
 		}
+
+		// Handle island registers
+		std::vector<ARM64Reg> pair_regs;
+		for (auto& it : island_regs)
+		{
+			pair_regs.push_back(it);
+			if (pair_regs.size() == 2)
+			{
+				STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32);
+				pair_regs.clear();
+			}
+		}
+		if (pair_regs.size())
+			STR(128, INDEX_POST, pair_regs[0], tmp, 16);
 	}
 	else
 	{
+		std::vector<ARM64Reg> pair_regs;
 		for (auto it : registers)
-			STR(128, INDEX_PRE, (ARM64Reg)(Q0 + it), SP, -16);
+		{
+			pair_regs.push_back((ARM64Reg)(Q0 + it));
+			if (pair_regs.size() == 2)
+			{
+				STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32);
+				pair_regs.clear();
+			}
+		}
+		if (pair_regs.size())
+			STR(128, INDEX_PRE, pair_regs[0], SP, -16);
 	}
 }
 void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
@@ -2938,17 +2955,6 @@ void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
 	bool bundled_loadstore = false;
 	int num_regs = registers.Count();
 
-	if (num_regs == 2)
-	{
-		int i = 0;
-		ARM64Reg regs[2];
-		for (auto it : registers)
-			regs[i++] = (ARM64Reg)(Q0 + it);
-
-		LDP(128, INDEX_POST, regs[0], regs[1], SP, 32);
-		return;
-	}
-
 	for (int i = 0; i < 32; ++i)
 	{
 		if (!registers[i])
@@ -2966,6 +2972,7 @@ void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
 	if (bundled_loadstore && tmp != INVALID_REG)
 	{
 		// The temporary register is only used to indicate that we can use this code path
+		std::vector<ARM64Reg> island_regs;
 		for (int i = 0; i < 32; ++i)
 		{
 			if (!registers[i])
@@ -2974,19 +2981,52 @@ void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
 			int count = 0;
 			while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
 
-			LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
+			if (count == 1)
+				island_regs.push_back((ARM64Reg)(Q0 + i));
+			else
+				LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
 
 			i += count - 1;
 		}
+
+		// Handle island registers
+		std::vector<ARM64Reg> pair_regs;
+		for (auto& it : island_regs)
+		{
+			pair_regs.push_back(it);
+			if (pair_regs.size() == 2)
+			{
+				LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32);
+				pair_regs.clear();
+			}
+		}
+		if (pair_regs.size())
+			LDR(128, INDEX_POST, pair_regs[0], SP, 16);
 	}
 	else
 	{
+		bool odd = num_regs % 2;
+		std::vector<ARM64Reg> pair_regs;
 		for (int i = 31; i >= 0; --i)
 		{
 			if (!registers[i])
 				continue;
 
-			LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
+			if (odd)
+			{
+				// First load must be a regular LDR if odd
+				odd = false;
+				LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
+			}
+			else
+			{
+				pair_regs.push_back((ARM64Reg)(Q0 + i));
+				if (pair_regs.size() == 2)
+				{
+					LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32);
+					pair_regs.clear();
+				}
+			}
 		}
 	}
 }