From 88af0e29660fa658797f9e7891630d9df8d8c425 Mon Sep 17 00:00:00 2001 From: gdkchan Date: Wed, 26 Sep 2018 23:30:21 -0300 Subject: [PATCH] Optimize BIC, BSL, BIT, BIF, XTN, ZIP, DUP (Gp), FMADD (Scalar) and FCVT (Scalar) using SSE intrinsics (#405) * Optimize BIC, BSL, BIT, BIF, XTN, ZIP, DUP (Gp), FMADD (Scalar) and FCVT (Scalar) using SSE intrinsics, some CQ improvements * Remove useless space * Address PR feedback * Revert EmitVectorZero32_128 changes --- Instruction/AInstEmitSimdArithmetic.cs | 67 +++- Instruction/AInstEmitSimdCmp.cs | 20 +- Instruction/AInstEmitSimdCvt.cs | 45 ++- Instruction/AInstEmitSimdHelper.cs | 228 +++++++++---- Instruction/AInstEmitSimdLogical.cs | 170 +++++++--- Instruction/AInstEmitSimdMove.cs | 204 +++++++++--- Instruction/AVectorHelper.cs | 430 ++++++++++++++++++------- Memory/AMemory.cs | 8 +- 8 files changed, 896 insertions(+), 276 deletions(-) diff --git a/Instruction/AInstEmitSimdArithmetic.cs b/Instruction/AInstEmitSimdArithmetic.cs index be54987..811730f 100644 --- a/Instruction/AInstEmitSimdArithmetic.cs +++ b/Instruction/AInstEmitSimdArithmetic.cs @@ -4,6 +4,7 @@ using ChocolArm64.Translation; using System; using System.Reflection; using System.Reflection.Emit; +using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using static ChocolArm64.Instruction.AInstEmitSimdHelper; @@ -31,7 +32,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse2) { - EmitSse2Call(Context, nameof(Sse2.Add)); + EmitSse2Op(Context, nameof(Sse2.Add)); } else { @@ -175,7 +176,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitScalarSseOrSse2CallF(Context, nameof(Sse.AddScalar)); + EmitScalarSseOrSse2OpF(Context, nameof(Sse.AddScalar)); } else { @@ -187,7 +188,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitVectorSseOrSse2CallF(Context, nameof(Sse.Add)); + EmitVectorSseOrSse2OpF(Context, nameof(Sse.Add)); } else { @@ 
-218,7 +219,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitScalarSseOrSse2CallF(Context, nameof(Sse.DivideScalar)); + EmitScalarSseOrSse2OpF(Context, nameof(Sse.DivideScalar)); } else { @@ -230,7 +231,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitVectorSseOrSse2CallF(Context, nameof(Sse.Divide)); + EmitVectorSseOrSse2OpF(Context, nameof(Sse.Divide)); } else { @@ -240,11 +241,49 @@ namespace ChocolArm64.Instruction public static void Fmadd_S(AILEmitterCtx Context) { - EmitScalarTernaryRaOpF(Context, () => + if (AOptimizations.UseSse2) { - Context.Emit(OpCodes.Mul); - Context.Emit(OpCodes.Add); - }); + AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp; + + if (Op.Size == 0) + { + Context.EmitLdvec(Op.Ra); + Context.EmitLdvec(Op.Rn); + Context.EmitLdvec(Op.Rm); + + Type[] Types = new Type[] { typeof(Vector128), typeof(Vector128) }; + + Context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.MultiplyScalar), Types)); + Context.EmitCall(typeof(Sse).GetMethod(nameof(Sse.AddScalar), Types)); + + Context.EmitStvec(Op.Rd); + + EmitVectorZero32_128(Context, Op.Rd); + } + else /* if (Op.Size == 1) */ + { + EmitLdvecWithCastToDouble(Context, Op.Ra); + EmitLdvecWithCastToDouble(Context, Op.Rn); + EmitLdvecWithCastToDouble(Context, Op.Rm); + + Type[] Types = new Type[] { typeof(Vector128), typeof(Vector128) }; + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MultiplyScalar), Types)); + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AddScalar), Types)); + + EmitStvecWithCastFromDouble(Context, Op.Rd); + + EmitVectorZeroUpper(Context, Op.Rd); + } + } + else + { + EmitScalarTernaryRaOpF(Context, () => + { + Context.Emit(OpCodes.Mul); + Context.Emit(OpCodes.Add); + }); + } } public static void Fmax_S(AILEmitterCtx Context) @@ -379,7 +418,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitScalarSseOrSse2CallF(Context, 
nameof(Sse.MultiplyScalar)); + EmitScalarSseOrSse2OpF(Context, nameof(Sse.MultiplyScalar)); } else { @@ -396,7 +435,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitVectorSseOrSse2CallF(Context, nameof(Sse.Multiply)); + EmitVectorSseOrSse2OpF(Context, nameof(Sse.Multiply)); } else { @@ -763,7 +802,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitScalarSseOrSse2CallF(Context, nameof(Sse.SubtractScalar)); + EmitScalarSseOrSse2OpF(Context, nameof(Sse.SubtractScalar)); } else { @@ -775,7 +814,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitVectorSseOrSse2CallF(Context, nameof(Sse.Subtract)); + EmitVectorSseOrSse2OpF(Context, nameof(Sse.Subtract)); } else { @@ -1103,7 +1142,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse2) { - EmitSse2Call(Context, nameof(Sse2.Subtract)); + EmitSse2Op(Context, nameof(Sse2.Subtract)); } else { diff --git a/Instruction/AInstEmitSimdCmp.cs b/Instruction/AInstEmitSimdCmp.cs index 6357396..97f7623 100644 --- a/Instruction/AInstEmitSimdCmp.cs +++ b/Instruction/AInstEmitSimdCmp.cs @@ -23,11 +23,11 @@ namespace ChocolArm64.Instruction { if (Op.Size < 3 && AOptimizations.UseSse2) { - EmitSse2Call(Context, nameof(Sse2.CompareEqual)); + EmitSse2Op(Context, nameof(Sse2.CompareEqual)); } else if (Op.Size == 3 && AOptimizations.UseSse41) { - EmitSse41Call(Context, nameof(Sse41.CompareEqual)); + EmitSse41Op(Context, nameof(Sse41.CompareEqual)); } else { @@ -61,11 +61,11 @@ namespace ChocolArm64.Instruction { if (Op.Size < 3 && AOptimizations.UseSse2) { - EmitSse2Call(Context, nameof(Sse2.CompareGreaterThan)); + EmitSse2Op(Context, nameof(Sse2.CompareGreaterThan)); } else if (Op.Size == 3 && AOptimizations.UseSse42) { - EmitSse42Call(Context, nameof(Sse42.CompareGreaterThan)); + EmitSse42Op(Context, nameof(Sse42.CompareGreaterThan)); } else { @@ -158,7 +158,7 @@ namespace 
ChocolArm64.Instruction if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitScalarSseOrSse2CallF(Context, nameof(Sse.CompareEqualScalar)); + EmitScalarSseOrSse2OpF(Context, nameof(Sse.CompareEqualScalar)); } else { @@ -171,7 +171,7 @@ namespace ChocolArm64.Instruction if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitVectorSseOrSse2CallF(Context, nameof(Sse.CompareEqual)); + EmitVectorSseOrSse2OpF(Context, nameof(Sse.CompareEqual)); } else { @@ -184,7 +184,7 @@ namespace ChocolArm64.Instruction if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitScalarSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThanOrEqualScalar)); + EmitScalarSseOrSse2OpF(Context, nameof(Sse.CompareGreaterThanOrEqualScalar)); } else { @@ -197,7 +197,7 @@ namespace ChocolArm64.Instruction if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitVectorSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThanOrEqual)); + EmitVectorSseOrSse2OpF(Context, nameof(Sse.CompareGreaterThanOrEqual)); } else { @@ -210,7 +210,7 @@ namespace ChocolArm64.Instruction if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitScalarSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThanScalar)); + EmitScalarSseOrSse2OpF(Context, nameof(Sse.CompareGreaterThanScalar)); } else { @@ -223,7 +223,7 @@ namespace ChocolArm64.Instruction if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse && AOptimizations.UseSse2) { - EmitVectorSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThan)); + EmitVectorSseOrSse2OpF(Context, nameof(Sse.CompareGreaterThan)); } else { diff --git a/Instruction/AInstEmitSimdCvt.cs b/Instruction/AInstEmitSimdCvt.cs index 231de0a..76d984a 100644 --- a/Instruction/AInstEmitSimdCvt.cs +++ b/Instruction/AInstEmitSimdCvt.cs @@ -3,6 +3,8 @@ using ChocolArm64.State; using 
ChocolArm64.Translation; using System; using System.Reflection.Emit; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; using static ChocolArm64.Instruction.AInstEmitSimdHelper; @@ -14,11 +16,48 @@ namespace ChocolArm64.Instruction { AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp; - EmitVectorExtractF(Context, Op.Rn, 0, Op.Size); + if (AOptimizations.UseSse2) + { + if (Op.Size == 1 && Op.Opc == 0) + { + //Double -> Single. + AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleZero)); - EmitFloatCast(Context, Op.Opc); + EmitLdvecWithCastToDouble(Context, Op.Rn); - EmitScalarSetF(Context, Op.Rd, Op.Opc); + Type[] Types = new Type[] { typeof(Vector128), typeof(Vector128) }; + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertScalarToVector128Single), Types)); + + Context.EmitStvec(Op.Rd); + } + else if (Op.Size == 0 && Op.Opc == 1) + { + //Single -> Double. + AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorDoubleZero)); + + Context.EmitLdvec(Op.Rn); + + Type[] Types = new Type[] { typeof(Vector128), typeof(Vector128) }; + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ConvertScalarToVector128Double), Types)); + + EmitStvecWithCastFromDouble(Context, Op.Rd); + } + else + { + //Invalid encoding. 
+ throw new InvalidOperationException(); + } + } + else + { + EmitVectorExtractF(Context, Op.Rn, 0, Op.Size); + + EmitFloatCast(Context, Op.Opc); + + EmitScalarSetF(Context, Op.Rd, Op.Opc); + } } public static void Fcvtas_Gp(AILEmitterCtx Context) diff --git a/Instruction/AInstEmitSimdHelper.cs b/Instruction/AInstEmitSimdHelper.cs index 171de43..381fc46 100644 --- a/Instruction/AInstEmitSimdHelper.cs +++ b/Instruction/AInstEmitSimdHelper.cs @@ -4,7 +4,6 @@ using ChocolArm64.Translation; using System; using System.Reflection; using System.Reflection.Emit; -using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -12,6 +11,38 @@ namespace ChocolArm64.Instruction { static class AInstEmitSimdHelper { + public static readonly Type[] IntTypesPerSizeLog2 = new Type[] + { + typeof(sbyte), + typeof(short), + typeof(int), + typeof(long) + }; + + public static readonly Type[] UIntTypesPerSizeLog2 = new Type[] + { + typeof(byte), + typeof(ushort), + typeof(uint), + typeof(ulong) + }; + + public static readonly Type[] VectorIntTypesPerSizeLog2 = new Type[] + { + typeof(Vector128), + typeof(Vector128), + typeof(Vector128), + typeof(Vector128) + }; + + public static readonly Type[] VectorUIntTypesPerSizeLog2 = new Type[] + { + typeof(Vector128), + typeof(Vector128), + typeof(Vector128), + typeof(Vector128) + }; + [Flags] public enum OperFlags { @@ -36,56 +67,32 @@ namespace ChocolArm64.Instruction return (8 << (Op.Size + 1)) - Op.Imm; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void EmitSse2Call(AILEmitterCtx Context, string Name) + public static void EmitSse2Op(AILEmitterCtx Context, string Name) { - EmitSseCall(Context, Name, typeof(Sse2)); + EmitSseOp(Context, Name, typeof(Sse2)); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void EmitSse41Call(AILEmitterCtx Context, string Name) + public static void EmitSse41Op(AILEmitterCtx Context, string Name) { - EmitSseCall(Context, 
Name, typeof(Sse41)); + EmitSseOp(Context, Name, typeof(Sse41)); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void EmitSse42Call(AILEmitterCtx Context, string Name) + public static void EmitSse42Op(AILEmitterCtx Context, string Name) { - EmitSseCall(Context, Name, typeof(Sse42)); + EmitSseOp(Context, Name, typeof(Sse42)); } - private static void EmitSseCall(AILEmitterCtx Context, string Name, Type Type) + private static void EmitSseOp(AILEmitterCtx Context, string Name, Type Type) { AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp; - void Ldvec(int Reg) - { - Context.EmitLdvec(Reg); + EmitLdvecWithSignedCast(Context, Op.Rn, Op.Size); - switch (Op.Size) - { - case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToSByte)); break; - case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt16)); break; - case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt32)); break; - case 3: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt64)); break; - } - } - - Ldvec(Op.Rn); - - Type BaseType = null; - - switch (Op.Size) - { - case 0: BaseType = typeof(Vector128); break; - case 1: BaseType = typeof(Vector128); break; - case 2: BaseType = typeof(Vector128); break; - case 3: BaseType = typeof(Vector128); break; - } + Type BaseType = VectorIntTypesPerSizeLog2[Op.Size]; if (Op is AOpCodeSimdReg BinOp) { - Ldvec(BinOp.Rm); + EmitLdvecWithSignedCast(Context, BinOp.Rm, Op.Size); Context.EmitCall(Type.GetMethod(Name, new Type[] { BaseType, BaseType })); } @@ -94,15 +101,7 @@ namespace ChocolArm64.Instruction Context.EmitCall(Type.GetMethod(Name, new Type[] { BaseType })); } - switch (Op.Size) - { - case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSByteToSingle)); break; - case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt16ToSingle)); break; - case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt32ToSingle)); break; - case 3: 
AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt64ToSingle)); break; - } - - Context.EmitStvec(Op.Rd); + EmitStvecWithSignedCast(Context, Op.Rd, Op.Size); if (Op.RegisterSize == ARegisterSize.SIMD64) { @@ -110,17 +109,91 @@ namespace ChocolArm64.Instruction } } - public static void EmitScalarSseOrSse2CallF(AILEmitterCtx Context, string Name) + public static void EmitLdvecWithSignedCast(AILEmitterCtx Context, int Reg, int Size) { - EmitSseOrSse2CallF(Context, Name, true); + Context.EmitLdvec(Reg); + + switch (Size) + { + case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToSByte)); break; + case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt16)); break; + case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt32)); break; + case 3: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToInt64)); break; + + default: throw new ArgumentOutOfRangeException(nameof(Size)); + } } - public static void EmitVectorSseOrSse2CallF(AILEmitterCtx Context, string Name) + public static void EmitLdvecWithCastToDouble(AILEmitterCtx Context, int Reg) { - EmitSseOrSse2CallF(Context, Name, false); + Context.EmitLdvec(Reg); + + AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToDouble)); } - public static void EmitSseOrSse2CallF(AILEmitterCtx Context, string Name, bool Scalar) + public static void EmitStvecWithCastFromDouble(AILEmitterCtx Context, int Reg) + { + AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorDoubleToSingle)); + + Context.EmitStvec(Reg); + } + + public static void EmitLdvecWithUnsignedCast(AILEmitterCtx Context, int Reg, int Size) + { + Context.EmitLdvec(Reg); + + switch (Size) + { + case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToByte)); break; + case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToUInt16)); break; + case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToUInt32)); 
break; + case 3: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleToUInt64)); break; + + default: throw new ArgumentOutOfRangeException(nameof(Size)); + } + } + + public static void EmitStvecWithSignedCast(AILEmitterCtx Context, int Reg, int Size) + { + switch (Size) + { + case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSByteToSingle)); break; + case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt16ToSingle)); break; + case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt32ToSingle)); break; + case 3: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt64ToSingle)); break; + + default: throw new ArgumentOutOfRangeException(nameof(Size)); + } + + Context.EmitStvec(Reg); + } + + public static void EmitStvecWithUnsignedCast(AILEmitterCtx Context, int Reg, int Size) + { + switch (Size) + { + case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorByteToSingle)); break; + case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorUInt16ToSingle)); break; + case 2: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorUInt32ToSingle)); break; + case 3: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorUInt64ToSingle)); break; + + default: throw new ArgumentOutOfRangeException(nameof(Size)); + } + + Context.EmitStvec(Reg); + } + + public static void EmitScalarSseOrSse2OpF(AILEmitterCtx Context, string Name) + { + EmitSseOrSse2OpF(Context, Name, true); + } + + public static void EmitVectorSseOrSse2OpF(AILEmitterCtx Context, string Name) + { + EmitSseOrSse2OpF(Context, Name, false); + } + + public static void EmitSseOrSse2OpF(AILEmitterCtx Context, string Name, bool Scalar) { AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp; @@ -1183,8 +1256,21 @@ namespace ChocolArm64.Instruction public static void EmitScalarSetF(AILEmitterCtx Context, int Reg, int Size) { - EmitVectorZeroAll(Context, Reg); - EmitVectorInsertF(Context, Reg, 0, Size); + if (AOptimizations.UseSse41 && 
Size == 0) + { + //If the type is float, we can perform insertion and + //zero the upper bits with a single instruction (INSERTPS); + Context.EmitLdvec(Reg); + + AVectorHelper.EmitCall(Context, nameof(AVectorHelper.Sse41VectorInsertScalarSingle)); + + Context.EmitStvec(Reg); + } + else + { + EmitVectorZeroAll(Context, Reg); + EmitVectorInsertF(Context, Reg, 0, Size); + } } public static void EmitVectorExtractSx(AILEmitterCtx Context, int Reg, int Index, int Size) @@ -1235,8 +1321,17 @@ namespace ChocolArm64.Instruction public static void EmitVectorZeroAll(AILEmitterCtx Context, int Rd) { - EmitVectorZeroLower(Context, Rd); - EmitVectorZeroUpper(Context, Rd); + if (AOptimizations.UseSse2) + { + AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSingleZero)); + + Context.EmitStvec(Rd); + } + else + { + EmitVectorZeroLower(Context, Rd); + EmitVectorZeroUpper(Context, Rd); + } } public static void EmitVectorZeroLower(AILEmitterCtx Context, int Rd) @@ -1249,9 +1344,32 @@ namespace ChocolArm64.Instruction EmitVectorInsertTmp(Context, 0, 3, 0); } - public static void EmitVectorZeroUpper(AILEmitterCtx Context, int Rd) + public static void EmitVectorZeroUpper(AILEmitterCtx Context, int Reg) { - EmitVectorInsert(Context, Rd, 1, 3, 0); + if (AOptimizations.UseSse2) + { + //TODO: Use MoveScalar once it is fixed, as of the + //time of writing it just crashes the JIT. 
+ EmitLdvecWithUnsignedCast(Context, Reg, 3); + + Type[] Types = new Type[] { typeof(Vector128), typeof(byte) }; + + //Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.MoveScalar), Types)); + + Context.EmitLdc_I4(8); + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftLeftLogical128BitLane), Types)); + + Context.EmitLdc_I4(8); + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), Types)); + + EmitStvecWithUnsignedCast(Context, Reg, 3); + } + else + { + EmitVectorInsert(Context, Reg, 1, 3, 0); + } } public static void EmitVectorZero32_128(AILEmitterCtx Context, int Reg) diff --git a/Instruction/AInstEmitSimdLogical.cs b/Instruction/AInstEmitSimdLogical.cs index 9f5af96..1aa8981 100644 --- a/Instruction/AInstEmitSimdLogical.cs +++ b/Instruction/AInstEmitSimdLogical.cs @@ -15,7 +15,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse2) { - EmitSse2Call(Context, nameof(Sse2.And)); + EmitSse2Op(Context, nameof(Sse2.And)); } else { @@ -25,11 +25,36 @@ namespace ChocolArm64.Instruction public static void Bic_V(AILEmitterCtx Context) { - EmitVectorBinaryOpZx(Context, () => + if (AOptimizations.UseSse2) { - Context.Emit(OpCodes.Not); - Context.Emit(OpCodes.And); - }); + AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp; + + EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size); + EmitLdvecWithUnsignedCast(Context, Op.Rn, Op.Size); + + Type[] Types = new Type[] + { + VectorUIntTypesPerSizeLog2[Op.Size], + VectorUIntTypesPerSizeLog2[Op.Size] + }; + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.AndNot), Types)); + + EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size); + + if (Op.RegisterSize == ARegisterSize.SIMD64) + { + EmitVectorZeroUpper(Context, Op.Rd); + } + } + else + { + EmitVectorBinaryOpZx(Context, () => + { + Context.Emit(OpCodes.Not); + Context.Emit(OpCodes.And); + }); + } } public static void Bic_Vi(AILEmitterCtx Context) @@ -55,59 +80,124 @@ namespace ChocolArm64.Instruction { AOpCodeSimdReg Op = 
(AOpCodeSimdReg)Context.CurrOp; - int Bytes = Op.GetBitsCount() >> 3; - int Elems = Bytes >> Op.Size; - - for (int Index = 0; Index < Elems; Index++) + if (AOptimizations.UseSse2) { - EmitVectorExtractZx(Context, Op.Rd, Index, Op.Size); - EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size); - - Context.Emit(OpCodes.Xor); - - EmitVectorExtractZx(Context, Op.Rm, Index, Op.Size); - - if (NotRm) + Type[] Types = new Type[] { - Context.Emit(OpCodes.Not); + VectorUIntTypesPerSizeLog2[Op.Size], + VectorUIntTypesPerSizeLog2[Op.Size] + }; + + EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size); + EmitLdvecWithUnsignedCast(Context, Op.Rd, Op.Size); + EmitLdvecWithUnsignedCast(Context, Op.Rn, Op.Size); + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), Types)); + + string Name = NotRm ? nameof(Sse2.AndNot) : nameof(Sse2.And); + + Context.EmitCall(typeof(Sse2).GetMethod(Name, Types)); + + EmitLdvecWithUnsignedCast(Context, Op.Rd, Op.Size); + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), Types)); + + EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size); + + if (Op.RegisterSize == ARegisterSize.SIMD64) + { + EmitVectorZeroUpper(Context, Op.Rd); + } + } + else + { + int Bytes = Op.GetBitsCount() >> 3; + int Elems = Bytes >> Op.Size; + + for (int Index = 0; Index < Elems; Index++) + { + EmitVectorExtractZx(Context, Op.Rd, Index, Op.Size); + EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size); + + Context.Emit(OpCodes.Xor); + + EmitVectorExtractZx(Context, Op.Rm, Index, Op.Size); + + if (NotRm) + { + Context.Emit(OpCodes.Not); + } + + Context.Emit(OpCodes.And); + + EmitVectorExtractZx(Context, Op.Rd, Index, Op.Size); + + Context.Emit(OpCodes.Xor); + + EmitVectorInsert(Context, Op.Rd, Index, Op.Size); } - Context.Emit(OpCodes.And); - - EmitVectorExtractZx(Context, Op.Rd, Index, Op.Size); - - Context.Emit(OpCodes.Xor); - - EmitVectorInsert(Context, Op.Rd, Index, Op.Size); - } - - if (Op.RegisterSize == ARegisterSize.SIMD64) - { - EmitVectorZeroUpper(Context, 
Op.Rd); + if (Op.RegisterSize == ARegisterSize.SIMD64) + { + EmitVectorZeroUpper(Context, Op.Rd); + } } } public static void Bsl_V(AILEmitterCtx Context) { - EmitVectorTernaryOpZx(Context, () => + if (AOptimizations.UseSse2) { - Context.EmitSttmp(); - Context.EmitLdtmp(); + AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp; - Context.Emit(OpCodes.Xor); - Context.Emit(OpCodes.And); + Type[] Types = new Type[] + { + VectorUIntTypesPerSizeLog2[Op.Size], + VectorUIntTypesPerSizeLog2[Op.Size] + }; - Context.EmitLdtmp(); + EmitLdvecWithUnsignedCast(Context, Op.Rn, Op.Size); + EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size); - Context.Emit(OpCodes.Xor); - }); + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), Types)); + + EmitLdvecWithUnsignedCast(Context, Op.Rd, Op.Size); + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), Types)); + + EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size); + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Xor), Types)); + + EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size); + + if (Op.RegisterSize == ARegisterSize.SIMD64) + { + EmitVectorZeroUpper(Context, Op.Rd); + } + } + else + { + EmitVectorTernaryOpZx(Context, () => + { + Context.EmitSttmp(); + Context.EmitLdtmp(); + + Context.Emit(OpCodes.Xor); + Context.Emit(OpCodes.And); + + Context.EmitLdtmp(); + + Context.Emit(OpCodes.Xor); + }); + } } public static void Eor_V(AILEmitterCtx Context) { if (AOptimizations.UseSse2) { - EmitSse2Call(Context, nameof(Sse2.Xor)); + EmitSse2Op(Context, nameof(Sse2.Xor)); } else { @@ -133,7 +223,7 @@ namespace ChocolArm64.Instruction { if (AOptimizations.UseSse2) { - EmitSse2Call(Context, nameof(Sse2.Or)); + EmitSse2Op(Context, nameof(Sse2.Or)); } else { diff --git a/Instruction/AInstEmitSimdMove.cs b/Instruction/AInstEmitSimdMove.cs index 3bf1e46..94097f4 100644 --- a/Instruction/AInstEmitSimdMove.cs +++ b/Instruction/AInstEmitSimdMove.cs @@ -3,6 +3,7 @@ using ChocolArm64.State; using ChocolArm64.Translation; using System; 
using System.Reflection.Emit; +using System.Runtime.Intrinsics.X86; using static ChocolArm64.Instruction.AInstEmitSimdHelper; @@ -14,19 +15,44 @@ namespace ChocolArm64.Instruction { AOpCodeSimdIns Op = (AOpCodeSimdIns)Context.CurrOp; - int Bytes = Op.GetBitsCount() >> 3; - int Elems = Bytes >> Op.Size; - - for (int Index = 0; Index < Elems; Index++) + if (AOptimizations.UseSse2) { Context.EmitLdintzr(Op.Rn); - EmitVectorInsert(Context, Op.Rd, Index, Op.Size); - } + switch (Op.Size) + { + case 0: Context.Emit(OpCodes.Conv_U1); break; + case 1: Context.Emit(OpCodes.Conv_U2); break; + case 2: Context.Emit(OpCodes.Conv_U4); break; + } - if (Op.RegisterSize == ARegisterSize.SIMD64) + Type[] Types = new Type[] { UIntTypesPerSizeLog2[Op.Size] }; + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), Types)); + + EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size); + + if (Op.RegisterSize == ARegisterSize.SIMD64) + { + EmitVectorZeroUpper(Context, Op.Rd); + } + } + else { - EmitVectorZeroUpper(Context, Op.Rd); + int Bytes = Op.GetBitsCount() >> 3; + int Elems = Bytes >> Op.Size; + + for (int Index = 0; Index < Elems; Index++) + { + Context.EmitLdintzr(Op.Rn); + + EmitVectorInsert(Context, Op.Rd, Index, Op.Size); + } + + if (Op.RegisterSize == ARegisterSize.SIMD64) + { + EmitVectorZeroUpper(Context, Op.Rd); + } } } @@ -295,25 +321,91 @@ namespace ChocolArm64.Instruction int Part = Op.RegisterSize == ARegisterSize.SIMD128 ? Elems : 0; - if (Part != 0) + if (AOptimizations.UseSse41 && Op.Size < 2) { - Context.EmitLdvec(Op.Rd); - Context.EmitStvectmp(); + void EmitZeroVector() + { + switch (Op.Size) + { + case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt16Zero)); break; + case 1: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorInt32Zero)); break; + } + } + + //For XTN, first operand is source, second operand is 0. + //For XTN2, first operand is 0, second operand is source. 
+ if (Part != 0) + { + EmitZeroVector(); + } + + EmitLdvecWithSignedCast(Context, Op.Rn, Op.Size + 1); + + //Set mask to discard the upper half of the wide elements. + switch (Op.Size) + { + case 0: Context.EmitLdc_I4(0x00ff); break; + case 1: Context.EmitLdc_I4(0x0000ffff); break; + } + + Type WideType = IntTypesPerSizeLog2[Op.Size + 1]; + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.SetAllVector128), new Type[] { WideType })); + + WideType = VectorIntTypesPerSizeLog2[Op.Size + 1]; + + Type[] WideTypes = new Type[] { WideType, WideType }; + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.And), WideTypes)); + + if (Part == 0) + { + EmitZeroVector(); + } + + //Pack values with unsigned saturation, the unsigned saturation shouldn't + //saturate anything since the upper bits were masked off. + Type SseType = Op.Size == 0 ? typeof(Sse2) : typeof(Sse41); + + Context.EmitCall(SseType.GetMethod(nameof(Sse2.PackUnsignedSaturate), WideTypes)); + + if (Part != 0) + { + //For XTN2, we additionally need to discard the upper bits + //of the target register and OR the result with it. 
+ EmitVectorZeroUpper(Context, Op.Rd); + + EmitLdvecWithUnsignedCast(Context, Op.Rd, Op.Size); + + Type NarrowType = VectorUIntTypesPerSizeLog2[Op.Size]; + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.Or), new Type[] { NarrowType, NarrowType })); + } + + EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size); } - - for (int Index = 0; Index < Elems; Index++) + else { - EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size + 1); + if (Part != 0) + { + Context.EmitLdvec(Op.Rd); + Context.EmitStvectmp(); + } - EmitVectorInsertTmp(Context, Part + Index, Op.Size); - } + for (int Index = 0; Index < Elems; Index++) + { + EmitVectorExtractZx(Context, Op.Rn, Index, Op.Size + 1); - Context.EmitLdvectmp(); - Context.EmitStvec(Op.Rd); + EmitVectorInsertTmp(Context, Part + Index, Op.Size); + } - if (Part == 0) - { - EmitVectorZeroUpper(Context, Op.Rd); + Context.EmitLdvectmp(); + Context.EmitStvec(Op.Rd); + + if (Part == 0) + { + EmitVectorZeroUpper(Context, Op.Rd); + } } } @@ -394,28 +486,64 @@ namespace ChocolArm64.Instruction { AOpCodeSimdReg Op = (AOpCodeSimdReg)Context.CurrOp; - int Words = Op.GetBitsCount() >> 4; - int Pairs = Words >> Op.Size; - - int Base = Part != 0 ? Pairs : 0; - - for (int Index = 0; Index < Pairs; Index++) + if (AOptimizations.UseSse2) { - int Idx = Index << 1; + EmitLdvecWithUnsignedCast(Context, Op.Rn, Op.Size); + EmitLdvecWithUnsignedCast(Context, Op.Rm, Op.Size); - EmitVectorExtractZx(Context, Op.Rn, Base + Index, Op.Size); - EmitVectorExtractZx(Context, Op.Rm, Base + Index, Op.Size); + Type[] Types = new Type[] + { + VectorUIntTypesPerSizeLog2[Op.Size], + VectorUIntTypesPerSizeLog2[Op.Size] + }; - EmitVectorInsertTmp(Context, Idx + 1, Op.Size); - EmitVectorInsertTmp(Context, Idx, Op.Size); + string Name = Part == 0 || (Part != 0 && Op.RegisterSize == ARegisterSize.SIMD64) + ? 
nameof(Sse2.UnpackLow) + : nameof(Sse2.UnpackHigh); + + Context.EmitCall(typeof(Sse2).GetMethod(Name, Types)); + + if (Op.RegisterSize == ARegisterSize.SIMD64 && Part != 0) + { + Context.EmitLdc_I4(8); + + Type[] ShTypes = new Type[] { VectorUIntTypesPerSizeLog2[Op.Size], typeof(byte) }; + + Context.EmitCall(typeof(Sse2).GetMethod(nameof(Sse2.ShiftRightLogical128BitLane), ShTypes)); + } + + EmitStvecWithUnsignedCast(Context, Op.Rd, Op.Size); + + if (Op.RegisterSize == ARegisterSize.SIMD64 && Part == 0) + { + EmitVectorZeroUpper(Context, Op.Rd); + } } - - Context.EmitLdvectmp(); - Context.EmitStvec(Op.Rd); - - if (Op.RegisterSize == ARegisterSize.SIMD64) + else { - EmitVectorZeroUpper(Context, Op.Rd); + int Words = Op.GetBitsCount() >> 4; + int Pairs = Words >> Op.Size; + + int Base = Part != 0 ? Pairs : 0; + + for (int Index = 0; Index < Pairs; Index++) + { + int Idx = Index << 1; + + EmitVectorExtractZx(Context, Op.Rn, Base + Index, Op.Size); + EmitVectorExtractZx(Context, Op.Rm, Base + Index, Op.Size); + + EmitVectorInsertTmp(Context, Idx + 1, Op.Size); + EmitVectorInsertTmp(Context, Idx, Op.Size); + } + + Context.EmitLdvectmp(); + Context.EmitStvec(Op.Rd); + + if (Op.RegisterSize == ARegisterSize.SIMD64) + { + EmitVectorZeroUpper(Context, Op.Rd); + } } } } diff --git a/Instruction/AVectorHelper.cs b/Instruction/AVectorHelper.cs index 3e4452a..7f9d98c 100644 --- a/Instruction/AVectorHelper.cs +++ b/Instruction/AVectorHelper.cs @@ -227,7 +227,16 @@ namespace ChocolArm64.Instruction [MethodImpl(MethodImplOptions.AggressiveInlining)] public static double VectorExtractDouble(Vector128 Vector, byte Index) { - return BitConverter.Int64BitsToDouble(VectorExtractIntSx(Vector, Index, 3)); + if (Sse41.IsSupported) + { + return BitConverter.Int64BitsToDouble(Sse41.Extract(Sse.StaticCast(Vector), Index)); + } + else if (Sse2.IsSupported) + { + return BitConverter.Int64BitsToDouble((long)VectorExtractIntZx(Vector, Index, 3)); + } + + throw new PlatformNotSupportedException(); 
} [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -235,41 +244,49 @@ namespace ChocolArm64.Instruction { if (Sse41.IsSupported) { - switch (Size) + if (Size == 0) { - case 0: - return (sbyte)Sse41.Extract(Sse.StaticCast(Vector), Index); - - case 1: - return (short)Sse2.Extract(Sse.StaticCast(Vector), Index); - - case 2: - return Sse41.Extract(Sse.StaticCast(Vector), Index); - - case 3: - return Sse41.Extract(Sse.StaticCast(Vector), Index); + return (sbyte)Sse41.Extract(Sse.StaticCast(Vector), Index); + } + else if (Size == 1) + { + return (short)Sse2.Extract(Sse.StaticCast(Vector), Index); + } + else if (Size == 2) + { + return Sse41.Extract(Sse.StaticCast(Vector), Index); + } + else if (Size == 3) + { + return Sse41.Extract(Sse.StaticCast(Vector), Index); + } + else + { + throw new ArgumentOutOfRangeException(nameof(Size)); } - - throw new ArgumentOutOfRangeException(nameof(Size)); } else if (Sse2.IsSupported) { - switch (Size) + if (Size == 0) { - case 0: - return (sbyte)VectorExtractIntZx(Vector, Index, Size); - - case 1: - return (short)VectorExtractIntZx(Vector, Index, Size); - - case 2: - return (int)VectorExtractIntZx(Vector, Index, Size); - - case 3: - return (long)VectorExtractIntZx(Vector, Index, Size); + return (sbyte)VectorExtractIntZx(Vector, Index, Size); + } + else if (Size == 1) + { + return (short)VectorExtractIntZx(Vector, Index, Size); + } + else if (Size == 2) + { + return (int)VectorExtractIntZx(Vector, Index, Size); + } + else if (Size == 3) + { + return (long)VectorExtractIntZx(Vector, Index, Size); + } + else + { + throw new ArgumentOutOfRangeException(nameof(Size)); } - - throw new ArgumentOutOfRangeException(nameof(Size)); } throw new PlatformNotSupportedException(); @@ -280,22 +297,26 @@ namespace ChocolArm64.Instruction { if (Sse41.IsSupported) { - switch (Size) + if (Size == 0) { - case 0: - return Sse41.Extract(Sse.StaticCast(Vector), Index); - - case 1: - return Sse2.Extract(Sse.StaticCast(Vector), Index); - - case 2: - return 
Sse41.Extract(Sse.StaticCast(Vector), Index); - - case 3: - return Sse41.Extract(Sse.StaticCast(Vector), Index); + return Sse41.Extract(Sse.StaticCast(Vector), Index); + } + else if (Size == 1) + { + return Sse2.Extract(Sse.StaticCast(Vector), Index); + } + else if (Size == 2) + { + return Sse41.Extract(Sse.StaticCast(Vector), Index); + } + else if (Size == 3) + { + return Sse41.Extract(Sse.StaticCast(Vector), Index); + } + else + { + throw new ArgumentOutOfRangeException(nameof(Size)); } - - throw new ArgumentOutOfRangeException(nameof(Size)); } else if (Sse2.IsSupported) { @@ -305,35 +326,35 @@ namespace ChocolArm64.Instruction ushort Value = Sse2.Extract(Sse.StaticCast(Vector), (byte)ShortIdx); - switch (Size) + if (Size == 0) { - case 0: - return (byte)(Value >> (Index & 1) * 8); - - case 1: - return Value; - - case 2: - case 3: - { - ushort Value1 = Sse2.Extract(Sse.StaticCast(Vector), (byte)(ShortIdx + 1)); - - if (Size == 2) - { - return (uint)(Value | (Value1 << 16)); - } - - ushort Value2 = Sse2.Extract(Sse.StaticCast(Vector), (byte)(ShortIdx + 2)); - ushort Value3 = Sse2.Extract(Sse.StaticCast(Vector), (byte)(ShortIdx + 3)); - - return ((ulong)Value << 0) | - ((ulong)Value1 << 16) | - ((ulong)Value2 << 32) | - ((ulong)Value3 << 48); - } + return (byte)(Value >> (Index & 1) * 8); } + else if (Size == 1) + { + return Value; + } + else if (Size == 2 || Size == 3) + { + ushort Value1 = Sse2.Extract(Sse.StaticCast(Vector), (byte)(ShortIdx + 1)); - throw new ArgumentOutOfRangeException(nameof(Size)); + if (Size == 2) + { + return (uint)(Value | (Value1 << 16)); + } + + ushort Value2 = Sse2.Extract(Sse.StaticCast(Vector), (byte)(ShortIdx + 2)); + ushort Value3 = Sse2.Extract(Sse.StaticCast(Vector), (byte)(ShortIdx + 3)); + + return ((ulong)Value << 0) | + ((ulong)Value1 << 16) | + ((ulong)Value2 << 32) | + ((ulong)Value3 << 48); + } + else + { + throw new ArgumentOutOfRangeException(nameof(Size)); + } } throw new PlatformNotSupportedException(); @@ -370,22 
+391,26 @@ namespace ChocolArm64.Instruction { if (Sse41.IsSupported) { - switch (Size) + if (Size == 0) { - case 0: - return Sse.StaticCast(Sse41.Insert(Sse.StaticCast(Vector), (byte)Value, Index)); - - case 1: - return Sse.StaticCast(Sse2.Insert(Sse.StaticCast(Vector), (ushort)Value, Index)); - - case 2: - return Sse.StaticCast(Sse41.Insert(Sse.StaticCast(Vector), (uint)Value, Index)); - - case 3: - return Sse.StaticCast(Sse41.Insert(Sse.StaticCast(Vector), Value, Index)); + return Sse.StaticCast(Sse41.Insert(Sse.StaticCast(Vector), (byte)Value, Index)); + } + else if (Size == 1) + { + return Sse.StaticCast(Sse2.Insert(Sse.StaticCast(Vector), (ushort)Value, Index)); + } + else if (Size == 2) + { + return Sse.StaticCast(Sse41.Insert(Sse.StaticCast(Vector), (uint)Value, Index)); + } + else if (Size == 3) + { + return Sse.StaticCast(Sse41.Insert(Sse.StaticCast(Vector), Value, Index)); + } + else + { + throw new ArgumentOutOfRangeException(nameof(Size)); } - - throw new ArgumentOutOfRangeException(nameof(Size)); } else if (Sse2.IsSupported) { @@ -395,41 +420,39 @@ namespace ChocolArm64.Instruction ? 
Index >> 1 : Index << (Size - 1); - switch (Size) + if (Size == 0) { - case 0: - { - ushort ShortVal = Sse2.Extract(Sse.StaticCast(Vector), (byte)ShortIdx); + ushort ShortVal = Sse2.Extract(Sse.StaticCast(Vector), (byte)ShortIdx); - int Shift = (Index & 1) * 8; + int Shift = (Index & 1) * 8; - ShortVal &= (ushort)(0xff00 >> Shift); + ShortVal &= (ushort)(0xff00 >> Shift); - ShortVal |= (ushort)((byte)Value << Shift); + ShortVal |= (ushort)((byte)Value << Shift); - return Sse.StaticCast(Sse2.Insert(ShortVector, ShortVal, (byte)ShortIdx)); - } - - case 1: - return Sse.StaticCast(Sse2.Insert(Sse.StaticCast(Vector), (ushort)Value, Index)); - - case 2: - case 3: - { - ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 0), (byte)(ShortIdx + 0)); - ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 16), (byte)(ShortIdx + 1)); - - if (Size == 3) - { - ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 32), (byte)(ShortIdx + 2)); - ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 48), (byte)(ShortIdx + 3)); - } - - return Sse.StaticCast(ShortVector); - } + return Sse.StaticCast(Sse2.Insert(ShortVector, ShortVal, (byte)ShortIdx)); } + else if (Size == 1) + { + return Sse.StaticCast(Sse2.Insert(Sse.StaticCast(Vector), (ushort)Value, Index)); + } + else if (Size == 2 || Size == 3) + { + ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 0), (byte)(ShortIdx + 0)); + ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 16), (byte)(ShortIdx + 1)); - throw new ArgumentOutOfRangeException(nameof(Size)); + if (Size == 3) + { + ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 32), (byte)(ShortIdx + 2)); + ShortVector = Sse2.Insert(ShortVector, (ushort)(Value >> 48), (byte)(ShortIdx + 3)); + } + + return Sse.StaticCast(ShortVector); + } + else + { + throw new ArgumentOutOfRangeException(nameof(Size)); + } } throw new PlatformNotSupportedException(); @@ -440,7 +463,29 @@ namespace ChocolArm64.Instruction { if (Sse41.IsSupported) { - return 
Sse41.Insert(Vector, Value, (byte)(Index << 4)); + //Note: The if/else if is necessary to enable the JIT to + //produce a single INSERTPS instruction instead of the + //jump table fallback. + if (Index == 0) + { + return Sse41.Insert(Vector, Value, 0x00); + } + else if (Index == 1) + { + return Sse41.Insert(Vector, Value, 0x10); + } + else if (Index == 2) + { + return Sse41.Insert(Vector, Value, 0x20); + } + else if (Index == 3) + { + return Sse41.Insert(Vector, Value, 0x30); + } + else + { + throw new ArgumentOutOfRangeException(nameof(Index)); + } } else if (Sse2.IsSupported) { @@ -460,6 +505,79 @@ namespace ChocolArm64.Instruction throw new PlatformNotSupportedException(); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 Sse41VectorInsertScalarSingle(float Value, Vector128 Vector) + { + //Note: 0b1110 is the mask to zero the upper bits. + return Sse41.Insert(Vector, Value, 0b1110); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorSByteZero() + { + if (Sse2.IsSupported) + { + return Sse2.SetZeroVector128(); + } + + throw new PlatformNotSupportedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorInt16Zero() + { + if (Sse2.IsSupported) + { + return Sse2.SetZeroVector128(); + } + + throw new PlatformNotSupportedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorInt32Zero() + { + if (Sse2.IsSupported) + { + return Sse2.SetZeroVector128(); + } + + throw new PlatformNotSupportedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorInt64Zero() + { + if (Sse2.IsSupported) + { + return Sse2.SetZeroVector128(); + } + + throw new PlatformNotSupportedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorSingleZero() + { + if (Sse.IsSupported) + { + return Sse.SetZeroVector128(); + } + + throw new 
PlatformNotSupportedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorDoubleZero() + { + if (Sse2.IsSupported) + { + return Sse2.SetZeroVector128(); + } + + throw new PlatformNotSupportedException(); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 VectorZero32_128(Vector128 Vector) { @@ -515,6 +633,50 @@ namespace ChocolArm64.Instruction throw new PlatformNotSupportedException(); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorSingleToByte(Vector128 Vector) + { + if (Sse.IsSupported) + { + return Sse.StaticCast(Vector); + } + + throw new PlatformNotSupportedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorSingleToUInt16(Vector128 Vector) + { + if (Sse.IsSupported) + { + return Sse.StaticCast(Vector); + } + + throw new PlatformNotSupportedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorSingleToUInt32(Vector128 Vector) + { + if (Sse.IsSupported) + { + return Sse.StaticCast(Vector); + } + + throw new PlatformNotSupportedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorSingleToUInt64(Vector128 Vector) + { + if (Sse.IsSupported) + { + return Sse.StaticCast(Vector); + } + + throw new PlatformNotSupportedException(); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 VectorSingleToDouble(Vector128 Vector) { @@ -570,6 +732,50 @@ namespace ChocolArm64.Instruction throw new PlatformNotSupportedException(); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorByteToSingle(Vector128 Vector) + { + if (Sse.IsSupported) + { + return Sse.StaticCast(Vector); + } + + throw new PlatformNotSupportedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorUInt16ToSingle(Vector128 Vector) + 
{ + if (Sse.IsSupported) + { + return Sse.StaticCast(Vector); + } + + throw new PlatformNotSupportedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorUInt32ToSingle(Vector128 Vector) + { + if (Sse.IsSupported) + { + return Sse.StaticCast(Vector); + } + + throw new PlatformNotSupportedException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 VectorUInt64ToSingle(Vector128 Vector) + { + if (Sse.IsSupported) + { + return Sse.StaticCast(Vector); + } + + throw new PlatformNotSupportedException(); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 VectorDoubleToSingle(Vector128 Vector) { diff --git a/Memory/AMemory.cs b/Memory/AMemory.cs index 2cb9b16..bb6a2b5 100644 --- a/Memory/AMemory.cs +++ b/Memory/AMemory.cs @@ -232,7 +232,7 @@ namespace ChocolArm64.Memory } } - [MethodImpl(MethodImplOptions.NoInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining)] public Vector128 ReadVector32(long Position) { if (Sse.IsSupported) @@ -245,7 +245,7 @@ namespace ChocolArm64.Memory } } - [MethodImpl(MethodImplOptions.NoInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining)] public Vector128 ReadVector64(long Position) { if (Sse2.IsSupported) @@ -365,7 +365,7 @@ namespace ChocolArm64.Memory } } - [MethodImpl(MethodImplOptions.NoInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining)] public void WriteVector32(long Position, Vector128 Value) { if (Sse.IsSupported) @@ -378,7 +378,7 @@ namespace ChocolArm64.Memory } } - [MethodImpl(MethodImplOptions.NoInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining)] public void WriteVector64(long Position, Vector128 Value) { if (Sse2.IsSupported)