From c3e0c41da3cef647b8bea54f77103fbad85098ba Mon Sep 17 00:00:00 2001 From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com> Date: Tue, 19 Jan 2021 23:12:33 +0100 Subject: [PATCH] CPU (A64): Add Fmaxnmp & Fminnmp Scalar Inst.s, Fast & Slow Paths; with Tests. (#1894) --- ARMeilleure/CodeGen/X86/IntrinsicTable.cs | 1 + ARMeilleure/Decoders/OpCodeTable.cs | 2 + .../Instructions/InstEmitSimdArithmetic.cs | 58 ++++++++++++++----- .../Instructions/InstEmitSimdHelper.cs | 43 ++++++++++++++ ARMeilleure/Instructions/InstName.cs | 2 + .../IntermediateRepresentation/Intrinsic.cs | 1 + Ryujinx.Tests/Cpu/CpuTestSimd.cs | 29 ++++++---- 7 files changed, 111 insertions(+), 25 deletions(-) diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs index 9030be3c1..5deee349a 100644 --- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs +++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs @@ -119,6 +119,7 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Popcnt, new IntrinsicInfo(X86Instruction.Popcnt, IntrinsicType.PopCount)); Add(Intrinsic.X86Por, new IntrinsicInfo(X86Instruction.Por, IntrinsicType.Binary)); Add(Intrinsic.X86Pshufb, new IntrinsicInfo(X86Instruction.Pshufb, IntrinsicType.Binary)); + Add(Intrinsic.X86Pshufd, new IntrinsicInfo(X86Instruction.Pshufd, IntrinsicType.BinaryImm)); Add(Intrinsic.X86Pslld, new IntrinsicInfo(X86Instruction.Pslld, IntrinsicType.Binary)); Add(Intrinsic.X86Pslldq, new IntrinsicInfo(X86Instruction.Pslldq, IntrinsicType.Binary)); Add(Intrinsic.X86Psllq, new IntrinsicInfo(X86Instruction.Psllq, IntrinsicType.Binary)); diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs index b19124851..928d0e0d6 100644 --- a/ARMeilleure/Decoders/OpCodeTable.cs +++ b/ARMeilleure/Decoders/OpCodeTable.cs @@ -311,6 +311,7 @@ namespace ARMeilleure.Decoders SetA64("0>0011100<1xxxxx111101xxxxxxxxxx", InstName.Fmax_V, InstEmit.Fmax_V, OpCodeSimdReg.Create); SetA64("000111100x1xxxxx011010xxxxxxxxxx", InstName.Fmaxnm_S, InstEmit.Fmaxnm_S, OpCodeSimdReg.Create); SetA64("0>0011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnm_V, InstEmit.Fmaxnm_V, OpCodeSimdReg.Create); + SetA64("011111100x110000110010xxxxxxxxxx", InstName.Fmaxnmp_S, InstEmit.Fmaxnmp_S, OpCodeSimd.Create); SetA64("0>1011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnmp_V, InstEmit.Fmaxnmp_V, OpCodeSimdReg.Create); SetA64("0110111000110000110010xxxxxxxxxx", InstName.Fmaxnmv_V, InstEmit.Fmaxnmv_V, OpCodeSimd.Create); SetA64("0>1011100<1xxxxx111101xxxxxxxxxx", InstName.Fmaxp_V, InstEmit.Fmaxp_V, OpCodeSimdReg.Create); @@ -319,6 +320,7 @@ namespace ARMeilleure.Decoders SetA64("0>0011101<1xxxxx111101xxxxxxxxxx", InstName.Fmin_V, InstEmit.Fmin_V, OpCodeSimdReg.Create); SetA64("000111100x1xxxxx011110xxxxxxxxxx", InstName.Fminnm_S, InstEmit.Fminnm_S, OpCodeSimdReg.Create); SetA64("0>0011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnm_V, InstEmit.Fminnm_V, OpCodeSimdReg.Create); + SetA64("011111101x110000110010xxxxxxxxxx", InstName.Fminnmp_S, InstEmit.Fminnmp_S, OpCodeSimd.Create); SetA64("0>1011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnmp_V, InstEmit.Fminnmp_V, OpCodeSimdReg.Create); SetA64("0110111010110000110010xxxxxxxxxx", InstName.Fminnmv_V, InstEmit.Fminnmv_V, OpCodeSimd.Create); SetA64("0>1011101<1xxxxx111101xxxxxxxxxx", InstName.Fminp_V, InstEmit.Fminp_V, OpCodeSimdReg.Create); diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs index 88be07bdd..bd6a98bed 100644 --- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs +++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs @@ -347,19 +347,17 @@ namespace ARMeilleure.Instructions public static void Faddp_S(ArmEmitterContext context) { - OpCodeSimd op = (OpCodeSimd)context.CurrOp; - - int sizeF = op.Size & 1; - if (Optimizations.FastFP && Optimizations.UseSse3) { - if (sizeF == 0) + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + if ((op.Size & 1) == 0) { Operand res = context.AddIntrinsic(Intrinsic.X86Haddps, GetVec(op.Rn), GetVec(op.Rn)); context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); } - else /* if (sizeF == 1) */ + else /* if ((op.Size & 1) == 1) */ { Operand res = context.AddIntrinsic(Intrinsic.X86Haddpd, GetVec(op.Rn), GetVec(op.Rn)); @@ -368,14 +366,10 @@ namespace ARMeilleure.Instructions } else { - OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32; - - Operand ne0 = context.VectorExtract(type, GetVec(op.Rn), 0); - Operand ne1 = context.VectorExtract(type, GetVec(op.Rn), 1); - - Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), ne0, ne1); - - context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0)); + EmitScalarPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2); + }); } } @@ -552,6 +546,24 @@ namespace ARMeilleure.Instructions } } + public static void Fmaxnmp_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2ScalarPairwiseOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: true, op1, op2); + }); + } + else + { + EmitScalarPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), op1, op2); + }); + } + } + public static void Fmaxnmp_V(ArmEmitterContext context) { if (Optimizations.FastFP && Optimizations.UseSse41) @@ -708,6 +720,24 @@ namespace ARMeilleure.Instructions } } + public static void Fminnmp_S(ArmEmitterContext context) + { + if (Optimizations.FastFP && Optimizations.UseSse41) + { + EmitSse2ScalarPairwiseOpF(context, (op1, op2) => + { + return EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: true, op1, op2); + }); + } + else + { + EmitScalarPairwiseOpF(context, (op1, op2) => + { + return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), op1, op2); + }); + } + } + public static void Fminnmp_V(ArmEmitterContext context) { if (Optimizations.FastFP && Optimizations.UseSse41) diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs index eab891ec5..e9d5303c7 100644 --- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs +++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs @@ -1118,6 +1118,49 @@ namespace ARMeilleure.Instructions context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); } + public static void EmitScalarPairwiseOpF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + OperandType type = (op.Size & 1) != 0 ? OperandType.FP64 : OperandType.FP32; + + Operand ne0 = context.VectorExtract(type, GetVec(op.Rn), 0); + Operand ne1 = context.VectorExtract(type, GetVec(op.Rn), 1); + + Operand res = context.VectorInsert(context.VectorZero(), emit(ne0, ne1), 0); + + context.Copy(GetVec(op.Rd), res); + } + + public static void EmitSse2ScalarPairwiseOpF(ArmEmitterContext context, Func2I emit) + { + OpCodeSimd op = (OpCodeSimd)context.CurrOp; + + Operand n = GetVec(op.Rn); + + Operand op0, op1; + + if ((op.Size & 1) == 0) + { + const int sm0 = 2 << 6 | 2 << 4 | 2 << 2 | 0 << 0; + const int sm1 = 2 << 6 | 2 << 4 | 2 << 2 | 1 << 0; + + Operand zeroN = context.VectorZeroUpper64(n); + + op0 = context.AddIntrinsic(Intrinsic.X86Pshufd, zeroN, Const(sm0)); + op1 = context.AddIntrinsic(Intrinsic.X86Pshufd, zeroN, Const(sm1)); + } + else /* if ((op.Size & 1) == 1) */ + { + Operand zero = context.VectorZero(); + + op0 = context.AddIntrinsic(Intrinsic.X86Movlhps, n, zero); + op1 = context.AddIntrinsic(Intrinsic.X86Movhlps, zero, n); + } + + context.Copy(GetVec(op.Rd), emit(op0, op1)); + } + public static void EmitVectorPairwiseOpF(ArmEmitterContext context, Func2I emit) { OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp; diff --git a/ARMeilleure/Instructions/InstName.cs b/ARMeilleure/Instructions/InstName.cs index a0ec9dc39..990e4393f 100644 --- a/ARMeilleure/Instructions/InstName.cs +++ b/ARMeilleure/Instructions/InstName.cs @@ -212,6 +212,7 @@ namespace ARMeilleure.Instructions Fmax_V, Fmaxnm_S, Fmaxnm_V, + Fmaxnmp_S, Fmaxnmp_V, Fmaxnmv_V, Fmaxp_V, @@ -220,6 +221,7 @@ namespace ARMeilleure.Instructions Fmin_V, Fminnm_S, Fminnm_V, + Fminnmp_S, Fminnmp_V, Fminnmv_V, Fminp_V, diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs index e2989863b..1ddf93e5b 100644 --- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs +++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs @@ -108,6 +108,7 @@ namespace ARMeilleure.IntermediateRepresentation X86Popcnt, X86Por, X86Pshufb, + X86Pshufd, X86Pslld, X86Pslldq, X86Psllq, diff --git a/Ryujinx.Tests/Cpu/CpuTestSimd.cs b/Ryujinx.Tests/Cpu/CpuTestSimd.cs index 1371de4b7..89c285708 100644 --- a/Ryujinx.Tests/Cpu/CpuTestSimd.cs +++ b/Ryujinx.Tests/Cpu/CpuTestSimd.cs @@ -715,19 +715,23 @@ namespace Ryujinx.Tests.Cpu }; } - private static uint[] _F_Add_P_S_2SS_() + private static uint[] _F_Add_Max_Min_Nm_P_S_2SS_() { return new uint[] { - 0x7E30D820u // FADDP S0, V1.2S + 0x7E30D820u, // FADDP S0, V1.2S + 0x7E30C820u, // FMAXNMP S0, V1.2S + 0x7EB0C820u // FMINNMP S0, V1.2S }; } - private static uint[] _F_Add_P_S_2DD_() + private static uint[] _F_Add_Max_Min_Nm_P_S_2DD_() { return new uint[] { - 0x7E70D820u // FADDP D0, V1.2D + 0x7E70D820u, // FADDP D0, V1.2D + 0x7E70C820u, // FMAXNMP D0, V1.2D + 0x7EF0C820u // FMINNMP D0, V1.2D }; } @@ -1802,12 +1806,13 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] [Explicit] - public void F_Add_P_S_2SS([ValueSource("_F_Add_P_S_2SS_")] uint opcodes, - [ValueSource("_2S_F_")] ulong a) + public void F_Add_Max_Min_Nm_P_S_2SS([ValueSource("_F_Add_Max_Min_Nm_P_S_2SS_")] uint opcodes, + [ValueSource("_2S_F_")] ulong a) { ulong z = TestContext.CurrentContext.Random.NextULong(); + V128 v0 = MakeVectorE0E1(z, z); - V128 v1 = MakeVectorE0(a); + V128 v1 = MakeVectorE0E1(a, z); int rnd = (int)TestContext.CurrentContext.Random.NextUInt(); @@ -1820,12 +1825,14 @@ namespace Ryujinx.Tests.Cpu } [Test, Pairwise] [Explicit] - public void F_Add_P_S_2DD([ValueSource("_F_Add_P_S_2DD_")] uint opcodes, - [ValueSource("_1D_F_")] ulong a) + public void F_Add_Max_Min_Nm_P_S_2DD([ValueSource("_F_Add_Max_Min_Nm_P_S_2DD_")] uint opcodes, + [ValueSource("_1D_F_")] ulong a0, + [ValueSource("_1D_F_")] ulong a1) { ulong z = TestContext.CurrentContext.Random.NextULong(); - V128 v0 = MakeVectorE1(z); - V128 v1 = MakeVectorE0E1(a, a); + + V128 v0 = MakeVectorE0E1(z, z); + V128 v1 = MakeVectorE0E1(a0, a1); int rnd = (int)TestContext.CurrentContext.Random.NextUInt();