From f8cd072b62808c8da06549807cc263003f0049b7 Mon Sep 17 00:00:00 2001 From: merry Date: Fri, 5 Jun 2020 11:58:27 +0100 Subject: [PATCH] Faster crc32 implementation (#1294) * Add Pclmulqdq intrinsic * Implement crc32 in terms of pclmulqdq * Address PR comments --- ARMeilleure/CodeGen/X86/Assembler.cs | 8 + ARMeilleure/CodeGen/X86/IntrinsicTable.cs | 1 + ARMeilleure/CodeGen/X86/X86Instruction.cs | 1 + ARMeilleure/Instructions/InstEmitHash.cs | 137 +++++++++++++++++- .../IntermediateRepresentation/Intrinsic.cs | 1 + ARMeilleure/Optimizations.cs | 38 ++--- 6 files changed, 160 insertions(+), 26 deletions(-) diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs index de361677b..5ad54289c 100644 --- a/ARMeilleure/CodeGen/X86/Assembler.cs +++ b/ARMeilleure/CodeGen/X86/Assembler.cs @@ -165,6 +165,7 @@ namespace ARMeilleure.CodeGen.X86 Add(X86Instruction.Pavgb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe0, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Pavgw, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe3, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Pblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3810, InstructionFlags.Prefix66)); + Add(X86Instruction.Pclmulqdq, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a44, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Pcmpeqb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f74, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Pcmpeqd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f76, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Pcmpeqq, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3829, InstructionFlags.Vex | InstructionFlags.Prefix66)); @@ -633,6 +634,13 @@ namespace ARMeilleure.CodeGen.X86 WriteInstruction(dest, source, type, X86Instruction.Or); } + public void Pclmulqdq(Operand dest, Operand source, byte imm) + { + WriteInstruction(dest, null, source, X86Instruction.Pclmulqdq); + + WriteByte(imm); + } + public void Pcmpeqw(Operand dest, Operand src1, Operand src2) { WriteInstruction(dest, src1, src2, X86Instruction.Pcmpeqw); diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs index 5382e3ead..bc07c6b09 100644 --- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs +++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs @@ -82,6 +82,7 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Pavgb, new IntrinsicInfo(X86Instruction.Pavgb, IntrinsicType.Binary)); Add(Intrinsic.X86Pavgw, new IntrinsicInfo(X86Instruction.Pavgw, IntrinsicType.Binary)); Add(Intrinsic.X86Pblendvb, new IntrinsicInfo(X86Instruction.Pblendvb, IntrinsicType.Ternary)); + Add(Intrinsic.X86Pclmulqdq, new IntrinsicInfo(X86Instruction.Pclmulqdq, IntrinsicType.TernaryImm)); Add(Intrinsic.X86Pcmpeqb, new IntrinsicInfo(X86Instruction.Pcmpeqb, IntrinsicType.Binary)); Add(Intrinsic.X86Pcmpeqd, new IntrinsicInfo(X86Instruction.Pcmpeqd, IntrinsicType.Binary)); Add(Intrinsic.X86Pcmpeqq, new IntrinsicInfo(X86Instruction.Pcmpeqq, IntrinsicType.Binary)); diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs index e4682595e..c3dffc62c 100644 --- a/ARMeilleure/CodeGen/X86/X86Instruction.cs +++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs @@ -98,6 +98,7 @@ namespace ARMeilleure.CodeGen.X86 Pavgb, Pavgw, Pblendvb, + Pclmulqdq, Pcmpeqb, Pcmpeqd, Pcmpeqq, diff --git a/ARMeilleure/Instructions/InstEmitHash.cs b/ARMeilleure/Instructions/InstEmitHash.cs index 0be8458e2..8a539666e 100644 --- a/ARMeilleure/Instructions/InstEmitHash.cs +++ b/ARMeilleure/Instructions/InstEmitHash.cs @@ -1,9 +1,13 @@ +// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + using ARMeilleure.Decoders; using ARMeilleure.IntermediateRepresentation; using ARMeilleure.Translation; using System; using static ARMeilleure.Instructions.InstEmitHelper; +using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.IntermediateRepresentation.OperandHelper; namespace ARMeilleure.Instructions { @@ -11,42 +15,159 @@ namespace ARMeilleure.Instructions { public static void Crc32b(ArmEmitterContext context) { - EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32b)); + if (Optimizations.UsePclmulqdq) + { + EmitCrc32Optimized(context, false, 8); + } + else + { + EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32b)); + } } public static void Crc32h(ArmEmitterContext context) { - EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32h)); + if (Optimizations.UsePclmulqdq) + { + EmitCrc32Optimized(context, false, 16); + } + else + { + EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32h)); + } } public static void Crc32w(ArmEmitterContext context) { - EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32w)); + if (Optimizations.UsePclmulqdq) + { + EmitCrc32Optimized(context, false, 32); + } + else + { + EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32w)); + } } public static void Crc32x(ArmEmitterContext context) { - EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32x)); + if (Optimizations.UsePclmulqdq) + { + EmitCrc32Optimized64(context, false); + } + else + { + EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32x)); + } } public static void Crc32cb(ArmEmitterContext context) { - EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32cb)); + if (Optimizations.UsePclmulqdq) + { + EmitCrc32Optimized(context, true, 8); + } + else + { + EmitCrc32Call(context, new _U32_U32_U8(SoftFallback.Crc32cb)); + } } public static void Crc32ch(ArmEmitterContext context) { - EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32ch)); + if (Optimizations.UsePclmulqdq) + { + EmitCrc32Optimized(context, true, 16); + } + else + { + EmitCrc32Call(context, new _U32_U32_U16(SoftFallback.Crc32ch)); + } } public static void Crc32cw(ArmEmitterContext context) { - EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32cw)); + if (Optimizations.UsePclmulqdq) + { + EmitCrc32Optimized(context, true, 32); + } + else + { + EmitCrc32Call(context, new _U32_U32_U32(SoftFallback.Crc32cw)); + } } public static void Crc32cx(ArmEmitterContext context) { - EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32cx)); + if (Optimizations.UsePclmulqdq) + { + EmitCrc32Optimized64(context, true); + } + else + { + EmitCrc32Call(context, new _U32_U32_U64(SoftFallback.Crc32cx)); + } + } + + private static void EmitCrc32Optimized(ArmEmitterContext context, bool castagnoli, int bitsize) + { + OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp; + + long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' + long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1 + + Operand crc = GetIntOrZR(context, op.Rn); + Operand data = GetIntOrZR(context, op.Rm); + + crc = context.VectorInsert(context.VectorZero(), crc, 0); + + switch (bitsize) + { + case 8: data = context.VectorInsert8(context.VectorZero(), data, 0); break; + case 16: data = context.VectorInsert16(context.VectorZero(), data, 0); break; + case 32: data = context.VectorInsert(context.VectorZero(), data, 0); break; + } + + Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); + tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + if (bitsize < 32) + { + crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8)); + tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc); + } + + SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2)); + } + + private static void EmitCrc32Optimized64(ArmEmitterContext context, bool castagnoli) + { + OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp; + + long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))' + long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1 + + Operand crc = GetIntOrZR(context, op.Rn); + Operand data = GetIntOrZR(context, op.Rm); + + crc = context.VectorInsert(context.VectorZero(), crc, 0); + data = context.VectorInsert(context.VectorZero(), data, 0); + + Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data); + Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res); + tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32)); + + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1)); + tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0)); + + SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2)); } private static void EmitCrc32Call(ArmEmitterContext context, Delegate dlg) diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs index 28ec9f32d..639ba7f92 100644 --- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs +++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs @@ -71,6 +71,7 @@ namespace ARMeilleure.IntermediateRepresentation X86Pavgb, X86Pavgw, X86Pblendvb, + X86Pclmulqdq, X86Pcmpeqb, X86Pcmpeqd, X86Pcmpeqq, diff --git a/ARMeilleure/Optimizations.cs b/ARMeilleure/Optimizations.cs index b486c5d20..fa06a4109 100644 --- a/ARMeilleure/Optimizations.cs +++ b/ARMeilleure/Optimizations.cs @@ -8,15 +8,16 @@ namespace ARMeilleure public static bool FastFP { get; set; } = true; - public static bool UseSseIfAvailable { get; set; } = true; - public static bool UseSse2IfAvailable { get; set; } = true; - public static bool UseSse3IfAvailable { get; set; } = true; - public static bool UseSsse3IfAvailable { get; set; } = true; - public static bool UseSse41IfAvailable { get; set; } = true; - public static bool UseSse42IfAvailable { get; set; } = true; - public static bool UsePopCntIfAvailable { get; set; } = true; - public static bool UseAvxIfAvailable { get; set; } = true; - public static bool UseAesniIfAvailable { get; set; } = true; + public static bool UseSseIfAvailable { get; set; } = true; + public static bool UseSse2IfAvailable { get; set; } = true; + public static bool UseSse3IfAvailable { get; set; } = true; + public static bool UseSsse3IfAvailable { get; set; } = true; + public static bool UseSse41IfAvailable { get; set; } = true; + public static bool UseSse42IfAvailable { get; set; } = true; + public static bool UsePopCntIfAvailable { get; set; } = true; + public static bool UseAvxIfAvailable { get; set; } = true; + public static bool UseAesniIfAvailable { get; set; } = true; + public static bool UsePclmulqdqIfAvailable { get; set; } = true; public static bool ForceLegacySse { @@ -24,14 +25,15 @@ namespace ARMeilleure set => HardwareCapabilities.ForceLegacySse = value; } - internal static bool UseSse => UseSseIfAvailable && HardwareCapabilities.SupportsSse; - internal static bool UseSse2 => UseSse2IfAvailable && HardwareCapabilities.SupportsSse2; - internal static bool UseSse3 => UseSse3IfAvailable && HardwareCapabilities.SupportsSse3; - internal static bool UseSsse3 => UseSsse3IfAvailable && HardwareCapabilities.SupportsSsse3; - internal static bool UseSse41 => UseSse41IfAvailable && HardwareCapabilities.SupportsSse41; - internal static bool UseSse42 => UseSse42IfAvailable && HardwareCapabilities.SupportsSse42; - internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt; - internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse; - internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni; + internal static bool UseSse => UseSseIfAvailable && HardwareCapabilities.SupportsSse; + internal static bool UseSse2 => UseSse2IfAvailable && HardwareCapabilities.SupportsSse2; + internal static bool UseSse3 => UseSse3IfAvailable && HardwareCapabilities.SupportsSse3; + internal static bool UseSsse3 => UseSsse3IfAvailable && HardwareCapabilities.SupportsSsse3; + internal static bool UseSse41 => UseSse41IfAvailable && HardwareCapabilities.SupportsSse41; + internal static bool UseSse42 => UseSse42IfAvailable && HardwareCapabilities.SupportsSse42; + internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt; + internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse; + internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni; + internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq; } } \ No newline at end of file