From e6af4970d8bfadd2ad72e69bd80e6fec6463e7a7 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sun, 8 Sep 2013 07:07:06 +0000
Subject: [PATCH] [ARM] Use NEON for loading the values from psq_l, gives a
 minimal performance increase. This change also begins a new NEONXEmitter for
 having cleaner support for NEON.

Review fixes folded into this revision of the patch:
- VABD, VSUB and VREVX shifted bit 4 of Vm (the M bit) to bit 6, which is
  the Q bit. It now goes to bit 5, the same as VADD already did.
- VREV64 and VREV16 passed each other's op value to VREVX (op 0b00 is
  VREV64, op 0b10 is VREV16). VREV32 was already correct.
- VSUB derives the Q bit from Vd and accepts D registers, like VADD.

Not changed, to be confirmed separately: VABD still emits opcode 0xD with
U=1, which appears to be the floating-point form; with that opcode only
I_32 looks like it yields a valid size field.
---
 Source/Core/Common/Src/ArmEmitter.cpp         | 150 ++++++++++++------
 Source/Core/Common/Src/ArmEmitter.h           |  79 +++++++--
 Source/Core/Core/Src/PowerPC/JitArm32/Jit.cpp |   3 +-
 .../Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp |  10 +-
 4 files changed, 174 insertions(+), 68 deletions(-)

diff --git a/Source/Core/Common/Src/ArmEmitter.cpp b/Source/Core/Common/Src/ArmEmitter.cpp
index fd62b0b2c9..e0f02244a8 100644
--- a/Source/Core/Common/Src/ArmEmitter.cpp
+++ b/Source/Core/Common/Src/ArmEmitter.cpp
@@ -892,54 +892,6 @@ ARMReg ARMXEmitter::SubBase(ARMReg Reg)
 	return Reg;
 }
 
-// NEON Specific
-void ARMXEmitter::VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
-{
-	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VABD(float)");
-	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VABD(float) when CPU doesn't support it");
-	bool register_quad = Vd >= Q0;
-
-	// Gets encoded as a double register
-	Vd = SubBase(Vd);
-	Vn = SubBase(Vn);
-	Vm = SubBase(Vm);
-
-	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
-		| ((Vd & 0xF) << 12) | (0xD << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
-		| ((Vm & 0x10) << 2) | (Vm & 0xF));
-}
-void ARMXEmitter::VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
-{
-	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VADD(integer)");
-	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VADD(integer) when CPU doesn't support it");
-
-	bool register_quad = Vd >= Q0;
-
-	// Gets encoded as a double register
-	Vd = SubBase(Vd);
-	Vn = SubBase(Vn);
-	Vm = SubBase(Vm);
-
-	Write32((0xF2 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
-		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
-		| ((Vm & 0x10) << 1) | (Vm & 0xF));
-
-}
-void ARMXEmitter::VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
-{
-	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to VSUB(integer)");
-	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VSUB(integer) when CPU doesn't support it");
-
-	// Gets encoded as a double register
-	Vd = SubBase(Vd);
-	Vn = SubBase(Vn);
-	Vm = SubBase(Vm);
-
-	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
-		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (1 << 6) \
-		| ((Vm & 0x10) << 2) | (Vm & 0xF));
-}
-
 // Double/single, Neon
 extern const VFPEnc VFPOps[16][2] = {
 	{{0xE0, 0xA0}, {0x20, 0xD1}}, // 0: VMLA
@@ -1269,4 +1221,106 @@ void ARMXEmitter::VCVT(ARMReg Dest, ARMReg Source, int flags)
 	}
 }
 
+// NEON instructions. Every emitter below writes one 32-bit encoding through the wrapped ARMXEmitter.
+void NEONXEmitter::VABD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VABD(float)");
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VABD(float) when CPU doesn't support it");
+	bool register_quad = Vd >= Q0;
+
+	// Gets encoded as a double register
+	Vd = SubBase(Vd);
+	Vn = SubBase(Vn);
+	Vm = SubBase(Vm);
+
+	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (encodedSize(Size) << 20) | ((Vn & 0xF) << 16) \
+		| ((Vd & 0xF) << 12) | (0xD << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
+		| ((Vm & 0x10) << 1) | (Vm & 0xF)); // M (bit 4 of Vm) goes to bit 5; bit 6 is Q
 }
+void NEONXEmitter::VADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VADD(integer)");
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VADD(integer) when CPU doesn't support it");
+
+	bool register_quad = Vd >= Q0;
+
+	// Gets encoded as a double register
+	Vd = SubBase(Vd);
+	Vn = SubBase(Vn);
+	Vm = SubBase(Vm);
+
+	Write32((0xF2 << 24) | ((Vd & 0x10) << 18) | (encodedSize(Size) << 20) | ((Vn & 0xF) << 16) \
+		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
+		| ((Vm & 0x10) << 1) | (Vm & 0xF));
+
+}
+void NEONXEmitter::VSUB(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
+{
+	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VSUB(integer)");
+	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VSUB(integer) when CPU doesn't support it");
+	bool register_quad = Vd >= Q0;
+
+	// Gets encoded as a double register
+	Vd = SubBase(Vd);
+	Vn = SubBase(Vn);
+	Vm = SubBase(Vm);
+
+	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (encodedSize(Size) << 20) | ((Vn & 0xF) << 16) \
+		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
+		| ((Vm & 0x10) << 1) | (Vm & 0xF)); // M (bit 4 of Vm) goes to bit 5; bit 6 is Q
+}
+
+// Loads one D register using Rn as the base register field. Rm fills bits 3:0; the default _PC is expected to select the no-writeback form.
+void NEONXEmitter::VLD1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
+{
+	u32 spacing = 0x7; // Only support loading to 1 reg
+	// Gets encoded as a double register
+	Vd = SubBase(Vd);
+
+	Write32((0xF4 << 24) | ((Vd & 0x10) << 18) | (1 << 21) | (Rn << 16)
+		| ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
+		| (align << 4) | Rm);
+}
+
+// Same field layout as VLD1, but with type field 0x8 (single-spaced registers).
+void NEONXEmitter::VLD2(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
+{
+	u32 spacing = 0x8; // Single spaced registers
+	// Gets encoded as a double register
+	Vd = SubBase(Vd);
+
+	Write32((0xF4 << 24) | ((Vd & 0x10) << 18) | (1 << 21) | (Rn << 16)
+		| ((Vd & 0xF) << 12) | (spacing << 8) | (encodedSize(Size) << 6)
+		| (align << 4) | Rm);
+}
+
+// Shared encoder for VREV16/32/64. 'size' is the op field placed at bits 8:7
+// (0 = VREV64, 1 = VREV32, 2 = VREV16); 'Size' is the element width being reversed.
+void NEONXEmitter::VREVX(u32 size, NEONElementType Size, ARMReg Vd, ARMReg Vm)
+{
+	bool register_quad = Vd >= Q0;
+	Vd = SubBase(Vd);
+	Vm = SubBase(Vm);
+
+	Write32((0xF3 << 24) | (1 << 23) | ((Vd & 0x10) << 18) | (0x3 << 20)
+		| (encodedSize(Size) << 18) | ((Vd & 0xF) << 12) | (size << 7)
+		| (register_quad << 6) | ((Vm & 0x10) << 1) | (Vm & 0xF)); // M goes to bit 5; bit 6 is Q
+}
+
+void NEONXEmitter::VREV64(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+{
+	VREVX(0, Size, Vd, Vm); // op = 0b00
+}
+
+void NEONXEmitter::VREV32(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+{
+	VREVX(1, Size, Vd, Vm); // op = 0b01
+}
+
+void NEONXEmitter::VREV16(NEONElementType Size, ARMReg Vd, ARMReg Vm)
+{
+	VREVX(2, Size, Vd, Vm); // op = 0b10
+}
+
+}
+
diff --git a/Source/Core/Common/Src/ArmEmitter.h b/Source/Core/Common/Src/ArmEmitter.h
index 1592da7d4f..79121e832a 100644
--- a/Source/Core/Common/Src/ArmEmitter.h
+++ b/Source/Core/Common/Src/ArmEmitter.h
@@ -104,13 +104,6 @@ enum ShiftType
 	ST_ROR = 3,
 	ST_RRX = 4
 };
-enum IntegerSize
-{
-	I_I8 = 0,
-	I_I16,
-	I_I32,
-	I_I64
-};
 
 enum
 {
@@ -349,6 +342,7 @@ typedef const u8* JumpTarget;
 class ARMXEmitter
 {
 	friend struct OpArg;  // for Write8 etc
+	friend class NEONXEmitter;
 private:
 	u8 *code, *startcode;
 	u8 *lastCacheFlushEnd;
@@ -533,11 +527,7 @@ public:
 
 	// Subtracts the base from the register to give us the real one
 	ARMReg SubBase(ARMReg Reg);
-	// NEON Only
-	void VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-	void VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
-
+	
 	// VFP Only
 	void VLDR(ARMReg Dest, ARMReg Base, s16 offset);
 	void VSTR(ARMReg Src, ARMReg Base, s16 offset);
@@ -584,6 +574,71 @@ public:
 
 };  // class ARMXEmitter
 
+// Element type flags for NEON instructions. The I_8..I_64 bits select the
+// element width and are translated by NEONXEmitter::encodedSize().
+enum NEONElementType
+{
+	I_8        = (1 << 0),
+	I_16       = (1 << 1),
+	I_32       = (1 << 2),
+	I_64       = (1 << 3),
+	I_SIGNED   = (1 << 4),
+	I_UNSIGNED = (1 << 5),
+	F_32       = (1 << 6)
+};
+
+// Alignment selector; shifted into bits 5:4 by the VLD1/VLD2 emitters.
+enum NEONAlignment
+{
+	ALIGN_NONE = 0,
+	ALIGN_64 = 1,
+	ALIGN_128 = 2,
+	ALIGN_256 = 3
+};
+
+
+// Emits NEON instructions through an existing ARMXEmitter, which declares
+// this class a friend. Holds only a non-owning pointer to that emitter.
+class NEONXEmitter
+{
+private:
+	ARMXEmitter *_emit;
+	ARMReg SubBase(ARMReg Reg) { return _emit->SubBase(Reg); }
+	inline void Write32(u32 value) { _emit->Write32(value); }
+
+	// Maps the width bits of a NEONElementType to the 2-bit size field (I_8 -> 0 ... I_64 -> 3).
+	inline u32 encodedSize(u32 value)
+	{
+		if (value & I_8)
+			return 0;
+		else if (value & I_16)
+			return 1;
+		else if (value & I_32)
+			return 2;
+		else if (value & I_64)
+			return 3;
+		else
+			_dbg_assert_msg_(DYNA_REC, false, "Passed invalid size to integer NEON instruction");
+		return 0;
+	}
+
+	void VREVX(u32 size, NEONElementType Size, ARMReg Vd, ARMReg Vm);
+
+public:
+	NEONXEmitter(ARMXEmitter *emit)
+		: _emit(emit)
+	{}
+
+	void VABD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VSUB(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
+	void VREV64(NEONElementType Size, ARMReg Vd, ARMReg Vm);
+	void VREV32(NEONElementType Size, ARMReg Vd, ARMReg Vm);
+	void VREV16(NEONElementType Size, ARMReg Vd, ARMReg Vm);
+
+	void VLD1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
+	void VLD2(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
+};
 
 // Everything that needs to generate X86 code should inherit from this.
 // You get memory management for free, plus, you can use all the MOV etc functions without
diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/Jit.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/Jit.cpp
index 83609af768..8b11840682 100644
--- a/Source/Core/Core/Src/PowerPC/JitArm32/Jit.cpp
+++ b/Source/Core/Core/Src/PowerPC/JitArm32/Jit.cpp
@@ -467,7 +467,8 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
 		MOVI2R(RB, (u32)&One);
 		VLDR(VA, RA, 0);
 		VLDR(VB, RB, 0);
-		VADD(I_I64, VA, VA, VB);
+		NEONXEmitter nemit(this);
+		nemit.VADD(I_64, VA, VA, VB);
 		VSTR(VA, RA, 0);
 		gpr.Unlock(RA, RB);
 		fpr.Unlock(VA);
diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp
index 6cf761c949..ca9073887f 100644
--- a/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp
+++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp
@@ -157,13 +157,9 @@ void JitArmAsmRoutineManager::GenerateCommon()
 	MOVI2R(R14, (u32)Memory::base);
 	ADD(R10, R10, R14);
 
-	LDR(R12, R10);
-	REV(R12, R12);
-	VMOV(S0, R12);
-
-	LDR(R12, R10, 4);
-	REV(R12, R12);
-	VMOV(S1, R12);
+	NEONXEmitter nemit(this);
+	nemit.VLD1(I_32, D0, R10);
+	nemit.VREV32(I_8, D0, D0);
 
 	POP(2, R12, _PC);
 	const u8* loadPairedFloatOne = GetCodePtr();