mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2025-01-27 00:05:34 +01:00
JitArm64: Optimize ps_mergeXX
1. In some cases, ps_merge01 can be implemented using one instruction. 2. When we need two instructions for ps_merge01, it's best to start with a MOV to avoid false dependencies on the destination register. 3. ps_merge10 can be implemented using a single EXT instruction.
This commit is contained in:
parent
0ef6d30a0d
commit
f45d3a6a2c
@ -2334,6 +2334,16 @@ void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn,
|
|||||||
(1 << 11) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
|
(1 << 11) | (DecodeReg(Rn) << 5) | DecodeReg(Rd));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Assembles one 32-bit instruction word for the "extract" encoding group and
// emits it through Write32. EXT() is the caller visible in this change.
//   imm4 - placed at bit 11 of the instruction word.
//   op   - placed at bit 22 of the instruction word.
//   Rd   - destination register; must not be a single-precision register.
//   Rn   - register encoded at bit 5.
//   Rm   - register encoded at bit 16.
// NOTE(review): imm4 and op are neither masked nor range-checked here, so an
// out-of-range value would spill into neighbouring fields — presumably callers
// keep imm4 within 4 bits; confirm.
void ARM64FloatEmitter::EmitExtract(u32 imm4, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
||||||
|
{
|
||||||
|
// Only non-single destination registers are accepted.
ASSERT_MSG(DYNA_REC, !IsSingle(Rd), "Singles are not supported!");
|
||||||
|
|
||||||
|
// Bit 30 of the instruction word is set when Rd is a quad register.
bool quad = IsQuad(Rd);
|
||||||
|
|
||||||
|
// Fixed opcode bits (23 << 25) combined with the variable fields described above.
Write32((quad << 30) | (23 << 25) | (op << 22) | (DecodeReg(Rm) << 16) | (imm4 << 11) |
|
||||||
|
(DecodeReg(Rn) << 5) | DecodeReg(Rd));
|
||||||
|
}
|
||||||
|
|
||||||
void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8)
|
void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8)
|
||||||
{
|
{
|
||||||
ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "Vector is not supported!");
|
ASSERT_MSG(DYNA_REC, !IsQuad(Rd), "Vector is not supported!");
|
||||||
@ -3540,6 +3550,12 @@ void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
|
|||||||
EmitPermute(size, 0b111, Rd, Rn, Rm);
|
EmitPermute(size, 0b111, Rd, Rn, Rm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extract
|
||||||
|
// Emits an EXT instruction by forwarding to EmitExtract with op fixed at 0.
// `index` is passed through unchanged as the imm4 field; the ps_merge10 path
// in this change passes `size >> 3`, so it is presumably a byte offset into
// the Rn/Rm pair — confirm against the Arm reference.
void ARM64FloatEmitter::EXT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 index)
|
||||||
|
{
|
||||||
|
EmitExtract(index, 0, Rd, Rn, Rm);
|
||||||
|
}
|
||||||
|
|
||||||
// Scalar shift by immediate
|
// Scalar shift by immediate
|
||||||
void ARM64FloatEmitter::SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift)
|
void ARM64FloatEmitter::SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift)
|
||||||
{
|
{
|
||||||
|
@ -1247,6 +1247,9 @@ public:
|
|||||||
void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
|
|
||||||
|
// Extract
|
||||||
|
void EXT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 index);
|
||||||
|
|
||||||
// Scalar shift by immediate
|
// Scalar shift by immediate
|
||||||
void SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift);
|
void SHL(ARM64Reg Rd, ARM64Reg Rn, u32 shift);
|
||||||
void URSHR(ARM64Reg Rd, ARM64Reg Rn, u32 shift);
|
void URSHR(ARM64Reg Rd, ARM64Reg Rn, u32 shift);
|
||||||
@ -1305,6 +1308,7 @@ private:
|
|||||||
void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm);
|
void EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
void EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
|
void EmitExtract(u32 imm4, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
|
||||||
void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8);
|
void EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm8);
|
||||||
void EmitShiftImm(bool Q, bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
|
void EmitShiftImm(bool Q, bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
|
||||||
void EmitScalarShiftImm(bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
|
void EmitScalarShiftImm(bool U, u32 imm, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
|
||||||
|
@ -33,9 +33,9 @@ void JitArm64::ps_mergeXX(UGeckoInstruction inst)
|
|||||||
const u8 size = singles ? 32 : 64;
|
const u8 size = singles ? 32 : 64;
|
||||||
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
|
const auto reg_encoder = singles ? EncodeRegToDouble : EncodeRegToQuad;
|
||||||
|
|
||||||
const ARM64Reg VA = fpr.R(a, type);
|
const ARM64Reg VA = reg_encoder(fpr.R(a, type));
|
||||||
const ARM64Reg VB = fpr.R(b, type);
|
const ARM64Reg VB = reg_encoder(fpr.R(b, type));
|
||||||
const ARM64Reg VD = fpr.RW(d, type);
|
const ARM64Reg VD = reg_encoder(fpr.RW(d, type));
|
||||||
|
|
||||||
switch (inst.SUBOP10)
|
switch (inst.SUBOP10)
|
||||||
{
|
{
|
||||||
@ -43,23 +43,20 @@ void JitArm64::ps_mergeXX(UGeckoInstruction inst)
|
|||||||
m_float_emit.TRN1(size, VD, VA, VB);
|
m_float_emit.TRN1(size, VD, VA, VB);
|
||||||
break;
|
break;
|
||||||
case 560: // 01
|
case 560: // 01
|
||||||
m_float_emit.INS(size, VD, 0, VA, 0);
|
if (d != b)
|
||||||
m_float_emit.INS(size, VD, 1, VB, 1);
|
{
|
||||||
|
if (d != a)
|
||||||
|
m_float_emit.MOV(VD, VA);
|
||||||
|
if (a != b)
|
||||||
|
m_float_emit.INS(size, VD, 1, VB, 1);
|
||||||
|
}
|
||||||
|
else if (d != a)
|
||||||
|
{
|
||||||
|
m_float_emit.INS(size, VD, 0, VA, 0);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case 592: // 10
|
case 592: // 10
|
||||||
if (d != a && d != b)
|
m_float_emit.EXT(VD, VA, VB, size >> 3);
|
||||||
{
|
|
||||||
m_float_emit.INS(size, VD, 0, VA, 1);
|
|
||||||
m_float_emit.INS(size, VD, 1, VB, 0);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
ARM64Reg V0 = fpr.GetReg();
|
|
||||||
m_float_emit.INS(size, V0, 0, VA, 1);
|
|
||||||
m_float_emit.INS(size, V0, 1, VB, 0);
|
|
||||||
m_float_emit.MOV(reg_encoder(VD), reg_encoder(V0));
|
|
||||||
fpr.Unlock(V0);
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
case 624: // 11
|
case 624: // 11
|
||||||
m_float_emit.TRN2(size, VD, VA, VB);
|
m_float_emit.TRN2(size, VD, VA, VB);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user