From 61d79facd1740264dadb6c62a0af21179bf6672b Mon Sep 17 00:00:00 2001 From: gdkchan Date: Mon, 9 Mar 2020 19:29:34 -0300 Subject: [PATCH] Optimize x64 loads and stores using complex addressing modes (#972) * Optimize x64 loads and stores using complex addressing modes * This was meant to be used for testing --- .../CodeGen/Optimizations/Optimizer.cs | 32 ++- .../RegisterAllocators/HybridAllocator.cs | 83 +++++- .../RegisterAllocators/LinearScanAllocator.cs | 28 ++ ARMeilleure/CodeGen/X86/Assembler.cs | 16 +- ARMeilleure/CodeGen/X86/CodeGenCommon.cs | 19 ++ ARMeilleure/CodeGen/X86/CodeGenerator.cs | 2 + ARMeilleure/CodeGen/X86/PreAllocator.cs | 18 +- ARMeilleure/CodeGen/X86/X86Optimizer.cs | 251 ++++++++++++++++++ .../IntermediateRepresentation/Node.cs | 152 ++++++++--- ARMeilleure/Translation/Compiler.cs | 23 +- 10 files changed, 538 insertions(+), 86 deletions(-) create mode 100644 ARMeilleure/CodeGen/X86/CodeGenCommon.cs create mode 100644 ARMeilleure/CodeGen/X86/X86Optimizer.cs diff --git a/ARMeilleure/CodeGen/Optimizations/Optimizer.cs b/ARMeilleure/CodeGen/Optimizations/Optimizer.cs index e3117d1f7..d3ffd185e 100644 --- a/ARMeilleure/CodeGen/Optimizations/Optimizer.cs +++ b/ARMeilleure/CodeGen/Optimizations/Optimizer.cs @@ -1,8 +1,6 @@ using ARMeilleure.IntermediateRepresentation; using ARMeilleure.Translation; -using System.Collections.Generic; using System.Diagnostics; -using System.Linq; namespace ARMeilleure.CodeGen.Optimizations { @@ -60,6 +58,36 @@ namespace ARMeilleure.CodeGen.Optimizations while (modified); } + public static void RemoveUnusedNodes(ControlFlowGraph cfg) + { + bool modified; + + do + { + modified = false; + + for (BasicBlock block = cfg.Blocks.First; block != null; block = block.ListNext) + { + Node node = block.Operations.First; + + while (node != null) + { + Node nextNode = node.ListNext; + + if (IsUnused(node)) + { + RemoveNode(block, node); + + modified = true; + } + + node = nextNode; + } + } + } + while (modified); + } + private static void PropagateCopy(Operation copyOp) { // Propagate copy source operand to all uses of the destination operand. diff --git a/ARMeilleure/CodeGen/RegisterAllocators/HybridAllocator.cs b/ARMeilleure/CodeGen/RegisterAllocators/HybridAllocator.cs index ed0e1ae1a..ce7936f91 100644 --- a/ARMeilleure/CodeGen/RegisterAllocators/HybridAllocator.cs +++ b/ARMeilleure/CodeGen/RegisterAllocators/HybridAllocator.cs @@ -110,6 +110,20 @@ namespace ARMeilleure.CodeGen.RegisterAllocators { locInfo[source.AsInt32() - 1].SetBlockIndex(block.Index); } + else if (source.Kind == OperandKind.Memory) + { + MemoryOperand memOp = (MemoryOperand)source; + + if (memOp.BaseAddress != null) + { + locInfo[memOp.BaseAddress.AsInt32() - 1].SetBlockIndex(block.Index); + } + + if (memOp.Index != null) + { + locInfo[memOp.Index.AsInt32() - 1].SetBlockIndex(block.Index); + } + } } for (int dstIndex = 0; dstIndex < node.DestinationsCount; dstIndex++) @@ -181,15 +195,8 @@ namespace ARMeilleure.CodeGen.RegisterAllocators int intLocalUse = 0; int vecLocalUse = 0; - for (int srcIndex = 0; srcIndex < node.SourcesCount; srcIndex++) + void AllocateRegister(Operand source, MemoryOperand memOp, int srcIndex) { - Operand source = node.GetSource(srcIndex); - - if (source.Kind != OperandKind.LocalVariable) - { - continue; - } - LocalInfo info = locInfo[source.AsInt32() - 1]; info.UseCount++; @@ -198,7 +205,23 @@ namespace ARMeilleure.CodeGen.RegisterAllocators if (info.Register != -1) { - node.SetSource(srcIndex, Register(info.Register, source.Type.ToRegisterType(), source.Type)); + Operand reg = Register(info.Register, source.Type.ToRegisterType(), source.Type); + + if (memOp != null) + { + if (srcIndex == 0) + { + memOp.BaseAddress = reg; + } + else /* if (srcIndex == 1) */ + { + memOp.Index = reg; + } + } + else + { + node.SetSource(srcIndex, reg); + } if (info.UseCount == info.Uses && !info.PreAllocated) { @@ -223,10 +246,24 @@ namespace ARMeilleure.CodeGen.RegisterAllocators : GetSpillTemp(source, vecSpillTempRegisters, ref vecLocalUse); info.Sequence = sequence; - info.Temp = temp; + info.Temp = temp; } - node.SetSource(srcIndex, temp); + if (memOp != null) + { + if (srcIndex == 0) + { + memOp.BaseAddress = temp; + } + else /* if (srcIndex == 1) */ + { + memOp.Index = temp; + } + } + else + { + node.SetSource(srcIndex, temp); + } Operation fillOp = new Operation(Instruction.Fill, temp, Const(info.SpillOffset)); @@ -234,6 +271,30 @@ namespace ARMeilleure.CodeGen.RegisterAllocators } } + for (int srcIndex = 0; srcIndex < node.SourcesCount; srcIndex++) + { + Operand source = node.GetSource(srcIndex); + + if (source.Kind == OperandKind.LocalVariable) + { + AllocateRegister(source, null, srcIndex); + } + else if (source.Kind == OperandKind.Memory) + { + MemoryOperand memOp = (MemoryOperand)source; + + if (memOp.BaseAddress != null) + { + AllocateRegister(memOp.BaseAddress, memOp, 0); + } + + if (memOp.Index != null) + { + AllocateRegister(memOp.Index, memOp, 1); + } + } + } + int intLocalAsg = 0; int vecLocalAsg = 0; diff --git a/ARMeilleure/CodeGen/RegisterAllocators/LinearScanAllocator.cs b/ARMeilleure/CodeGen/RegisterAllocators/LinearScanAllocator.cs index 1127ccd58..1dc6ad737 100644 --- a/ARMeilleure/CodeGen/RegisterAllocators/LinearScanAllocator.cs +++ b/ARMeilleure/CodeGen/RegisterAllocators/LinearScanAllocator.cs @@ -711,6 +711,20 @@ namespace ARMeilleure.CodeGen.RegisterAllocators { operation.SetSource(index, register); } + else if (source.Kind == OperandKind.Memory) + { + MemoryOperand memOp = (MemoryOperand)source; + + if (memOp.BaseAddress == current.Local) + { + memOp.BaseAddress = register; + } + + if (memOp.Index == current.Local) + { + memOp.Index = register; + } + } } for (int index = 0; index < operation.DestinationsCount; index++) @@ -1011,6 +1025,20 @@ namespace ARMeilleure.CodeGen.RegisterAllocators { yield return source; } + else if (source.Kind == OperandKind.Memory) + { + MemoryOperand memOp = (MemoryOperand)source; + + if (memOp.BaseAddress != null) + { + yield return memOp.BaseAddress; + } + + if (memOp.Index != null) + { + yield return memOp.Index; + } + } } } diff --git a/ARMeilleure/CodeGen/X86/Assembler.cs b/ARMeilleure/CodeGen/X86/Assembler.cs index 4568253ad..70130d90e 100644 --- a/ARMeilleure/CodeGen/X86/Assembler.cs +++ b/ARMeilleure/CodeGen/X86/Assembler.cs @@ -14,6 +14,8 @@ namespace ARMeilleure.CodeGen.X86 private const byte RexWPrefix = 0x48; private const byte LockPrefix = 0xf0; + private const int MaxRegNumber = 15; + [Flags] private enum InstructionFlags { @@ -842,10 +844,7 @@ namespace ARMeilleure.CodeGen.X86 { X86Register shiftReg = (X86Register)source.GetRegister().Index; - if (shiftReg != X86Register.Rcx) - { - throw new ArgumentException($"Invalid shift register \"{shiftReg}\"."); - } + Debug.Assert(shiftReg == X86Register.Rcx, $"Invalid shift register \"{shiftReg}\"."); source = null; } @@ -1080,6 +1079,8 @@ namespace ARMeilleure.CodeGen.X86 if (baseReg.Index >= 8) { + Debug.Assert((uint)baseReg.Index <= MaxRegNumber); + rexPrefix |= RexPrefix | (baseReg.Index >> 3); } @@ -1091,13 +1092,12 @@ namespace ARMeilleure.CodeGen.X86 { int indexReg = memOp.Index.GetRegister().Index; - if (indexReg == (int)X86Register.Rsp) - { - throw new ArgumentException("Using RSP as index register on the memory operand is not allowed."); - } + Debug.Assert(indexReg != (int)X86Register.Rsp, "Using RSP as index register on the memory operand is not allowed."); if (indexReg >= 8) { + Debug.Assert((uint)indexReg <= MaxRegNumber); + rexPrefix |= RexPrefix | (indexReg >> 3) << 1; } diff --git a/ARMeilleure/CodeGen/X86/CodeGenCommon.cs b/ARMeilleure/CodeGen/X86/CodeGenCommon.cs new file mode 100644 index 000000000..237ecee4e --- /dev/null +++ b/ARMeilleure/CodeGen/X86/CodeGenCommon.cs @@ -0,0 +1,19 @@ +using ARMeilleure.IntermediateRepresentation; + +namespace ARMeilleure.CodeGen.X86 +{ + static class CodeGenCommon + { + public static bool IsLongConst(Operand op) + { + long value = op.Type == OperandType.I32 ? op.AsInt32() : op.AsInt64(); + + return !ConstFitsOnS32(value); + } + + private static bool ConstFitsOnS32(long value) + { + return value == (int)value; + } + } +} diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs index d0cb77f81..32ca6a781 100644 --- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs +++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs @@ -109,6 +109,8 @@ namespace ARMeilleure.CodeGen.X86 Optimizer.RunPass(cfg); } + X86Optimizer.RunPass(cfg); + Logger.EndPass(PassName.Optimization, cfg); Logger.StartPass(PassName.PreAllocation); diff --git a/ARMeilleure/CodeGen/X86/PreAllocator.cs b/ARMeilleure/CodeGen/X86/PreAllocator.cs index d250f5e88..75844b099 100644 --- a/ARMeilleure/CodeGen/X86/PreAllocator.cs +++ b/ARMeilleure/CodeGen/X86/PreAllocator.cs @@ -184,7 +184,7 @@ namespace ARMeilleure.CodeGen.X86 operation.SetSource(1, src2); } - else if (!HasConstSrc2(inst) || IsLongConst(src2)) + else if (!HasConstSrc2(inst) || CodeGenCommon.IsLongConst(src2)) { src2 = AddCopy(nodes, node, src2); @@ -1046,7 +1046,7 @@ namespace ARMeilleure.CodeGen.X86 nodes.AddBefore(node, retCopyOp); } - operation.SetSources(new Operand[0]); + operation.SetSources(System.Array.Empty()); } private static void HandleReturnSystemVAbi(IntrusiveList nodes, Node node, Operation operation) @@ -1116,20 +1116,6 @@ namespace ARMeilleure.CodeGen.X86 return value; } - private static bool IsLongConst(Operand operand) - { - long value = operand.Type == OperandType.I32 - ? operand.AsInt32() - : operand.AsInt64(); - - return !ConstFitsOnS32(value); - } - - private static bool ConstFitsOnS32(long value) - { - return value == (int)value; - } - private static void Delete(IntrusiveList nodes, Node node, Operation operation) { operation.Destination = null; diff --git a/ARMeilleure/CodeGen/X86/X86Optimizer.cs b/ARMeilleure/CodeGen/X86/X86Optimizer.cs new file mode 100644 index 000000000..c52541ca9 --- /dev/null +++ b/ARMeilleure/CodeGen/X86/X86Optimizer.cs @@ -0,0 +1,251 @@ +using ARMeilleure.CodeGen.Optimizations; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.Translation; + +using static ARMeilleure.IntermediateRepresentation.OperandHelper; + +namespace ARMeilleure.CodeGen.X86 +{ + static class X86Optimizer + { + public static void RunPass(ControlFlowGraph cfg) + { + for (BasicBlock block = cfg.Blocks.First; block != null; block = block.ListNext) + { + Node nextNode; + + for (Node node = block.Operations.First; node != null; node = nextNode) + { + nextNode = node.ListNext; + + if (!(node is Operation operation)) + { + continue; + } + + // Insert copies for constants that can't fit on a 32-bits immediate. + // Doing this early unblocks a few optimizations. + if (operation.Instruction == Instruction.Add) + { + Operand src1 = operation.GetSource(0); + Operand src2 = operation.GetSource(1); + + if (src1.Kind == OperandKind.Constant && CodeGenCommon.IsLongConst(src1)) + { + Operand temp = Local(src1.Type); + + Operation copyOp = new Operation(Instruction.Copy, temp, src1); + + block.Operations.AddBefore(operation, copyOp); + + operation.SetSource(0, temp); + } + + if (src2.Kind == OperandKind.Constant && CodeGenCommon.IsLongConst(src2)) + { + Operand temp = Local(src2.Type); + + Operation copyOp = new Operation(Instruction.Copy, temp, src2); + + block.Operations.AddBefore(operation, copyOp); + + operation.SetSource(1, temp); + } + } + + // Try to fold something like: + // shl rbx, 2 + // add rax, rbx + // add rax, 0xcafe + // mov rax, [rax] + // Into: + // mov rax, [rax+rbx*4+0xcafe] + if (IsMemoryLoadOrStore(operation.Instruction)) + { + OperandType type; + + if (operation.Destination != null) + { + type = operation.Destination.Type; + } + else + { + type = operation.GetSource(1).Type; + } + + MemoryOperand memOp = GetMemoryOperandOrNull(operation.GetSource(0), type); + + if (memOp != null) + { + operation.SetSource(0, memOp); + } + } + } + } + + Optimizer.RemoveUnusedNodes(cfg); + } + + private static MemoryOperand GetMemoryOperandOrNull(Operand addr, OperandType type) + { + Operand baseOp = addr; + + // First we check if the address is the result of a local X with 32-bits immediate + // addition. If that is the case, then the baseOp is X, and the memory operand immediate + // becomes the addition immediate. Otherwise baseOp keeps being the address. + int imm = GetConstOp(ref baseOp); + + // Now we check if the baseOp is the result of a local Y with a local Z addition. + // If that is the case, we now set baseOp to Y and indexOp to Z. We further check + // if Z is the result of a left shift of local W by a value >= 0 and <= 3, if that + // is the case, we set indexOp to W and adjust the scale value of the memory operand + // to match that of the left shift. + // There is one missed case, which is the address being a shift result, but this is + // probably not worth optimizing as it should never happen. + (Operand indexOp, Multiplier scale) = GetIndexOp(ref baseOp); + + // If baseOp is still equal to address, then there's nothing that can be optimized. + if (baseOp == addr) + { + return null; + } + + return new MemoryOperand(type, baseOp, indexOp, scale, imm); + } + + private static int GetConstOp(ref Operand baseOp) + { + Operation operation = GetAsgOpWithInst(baseOp, Instruction.Add); + + if (operation == null) + { + return 0; + } + + Operand src1 = operation.GetSource(0); + Operand src2 = operation.GetSource(1); + + Operand constOp; + Operand otherOp; + + if (src1.Kind == OperandKind.Constant && src2.Kind == OperandKind.LocalVariable) + { + constOp = src1; + otherOp = src2; + } + else if (src1.Kind == OperandKind.LocalVariable && src2.Kind == OperandKind.Constant) + { + constOp = src2; + otherOp = src1; + } + else + { + return 0; + } + + // If we have addition by 64-bits constant, then we can't optimize it further, + // as we can't encode a 64-bits immediate on the memory operand. + if (CodeGenCommon.IsLongConst(constOp)) + { + return 0; + } + + baseOp = otherOp; + + return constOp.AsInt32(); + } + + private static (Operand, Multiplier) GetIndexOp(ref Operand baseOp) + { + Operand indexOp = null; + + Multiplier scale = Multiplier.x1; + + Operation addOp = GetAsgOpWithInst(baseOp, Instruction.Add); + + if (addOp == null) + { + return (indexOp, scale); + } + + Operand src1 = addOp.GetSource(0); + Operand src2 = addOp.GetSource(1); + + if (src1.Kind != OperandKind.LocalVariable || src2.Kind != OperandKind.LocalVariable) + { + return (indexOp, scale); + } + + baseOp = src1; + indexOp = src2; + + Operation shlOp = GetAsgOpWithInst(src1, Instruction.ShiftLeft); + + bool indexOnSrc2 = false; + + if (shlOp == null) + { + shlOp = GetAsgOpWithInst(src2, Instruction.ShiftLeft); + + indexOnSrc2 = true; + } + + if (shlOp != null) + { + Operand shSrc = shlOp.GetSource(0); + Operand shift = shlOp.GetSource(1); + + if (shSrc.Kind == OperandKind.LocalVariable && shift.Kind == OperandKind.Constant && shift.Value <= 3) + { + scale = shift.Value switch + { + 1 => Multiplier.x2, + 2 => Multiplier.x4, + 3 => Multiplier.x8, + _ => Multiplier.x1 + }; + + baseOp = indexOnSrc2 ? src1 : src2; + indexOp = shSrc; + } + } + + return (indexOp, scale); + } + + private static Operation GetAsgOpWithInst(Operand op, Instruction inst) + { + // If we have multiple assignments, folding is not safe + // as the value may be different depending on the + // control flow path. + if (op.Assignments.Count != 1) + { + return null; + } + + Node asgOp = op.Assignments[0]; + + if (!(asgOp is Operation operation)) + { + return null; + } + + if (operation.Instruction != inst) + { + return null; + } + + return operation; + } + + private static bool IsMemoryLoadOrStore(Instruction inst) + { + return inst == Instruction.Load || + inst == Instruction.Load16 || + inst == Instruction.Load8 || + inst == Instruction.Store || + inst == Instruction.Store16 || + inst == Instruction.Store8; + } + } +} diff --git a/ARMeilleure/IntermediateRepresentation/Node.cs b/ARMeilleure/IntermediateRepresentation/Node.cs index e1f8b11bb..37647c560 100644 --- a/ARMeilleure/IntermediateRepresentation/Node.cs +++ b/ARMeilleure/IntermediateRepresentation/Node.cs @@ -58,34 +58,18 @@ namespace ARMeilleure.IntermediateRepresentation public void SetDestination(int index, Operand destination) { - Operand oldOp = _destinations[index]; + RemoveAssignment(_destinations[index]); - if (oldOp != null && oldOp.Kind == OperandKind.LocalVariable) - { - oldOp.Assignments.Remove(this); - } - - if (destination != null && destination.Kind == OperandKind.LocalVariable) - { - destination.Assignments.Add(this); - } + AddAssignment(destination); _destinations[index] = destination; } public void SetSource(int index, Operand source) { - Operand oldOp = _sources[index]; + RemoveUse(_sources[index]); - if (oldOp != null && oldOp.Kind == OperandKind.LocalVariable) - { - oldOp.Uses.Remove(this); - } - - if (source != null && source.Kind == OperandKind.LocalVariable) - { - source.Uses.Add(this); - } + AddUse(source); _sources[index] = source; } @@ -96,12 +80,7 @@ namespace ARMeilleure.IntermediateRepresentation { for (int index = 0; index < _destinations.Length; index++) { - Operand oldOp = _destinations[index]; - - if (oldOp != null && oldOp.Kind == OperandKind.LocalVariable) - { - oldOp.Assignments.Remove(this); - } + RemoveAssignment(_destinations[index]); } _destinations = destinations; @@ -117,10 +96,7 @@ namespace ARMeilleure.IntermediateRepresentation _destinations[index] = newOp; - if (newOp.Kind == OperandKind.LocalVariable) - { - newOp.Assignments.Add(this); - } + AddAssignment(newOp); } } @@ -128,12 +104,7 @@ namespace ARMeilleure.IntermediateRepresentation { for (int index = 0; index < _sources.Length; index++) { - Operand oldOp = _sources[index]; - - if (oldOp != null && oldOp.Kind == OperandKind.LocalVariable) - { - oldOp.Uses.Remove(this); - } + RemoveUse(_sources[index]); } _sources = new Operand[sources.Length]; @@ -144,9 +115,114 @@ namespace ARMeilleure.IntermediateRepresentation _sources[index] = newOp; - if (newOp.Kind == OperandKind.LocalVariable) + AddUse(newOp); + } + } + + private void AddAssignment(Operand op) + { + if (op == null) + { + return; + } + + if (op.Kind == OperandKind.LocalVariable) + { + op.Assignments.Add(this); + } + else if (op.Kind == OperandKind.Memory) + { + MemoryOperand memOp = (MemoryOperand)op; + + if (memOp.BaseAddress != null) { - newOp.Uses.Add(this); + memOp.BaseAddress.Assignments.Add(this); + } + + if (memOp.Index != null) + { + memOp.Index.Assignments.Add(this); + } + } + } + + private void RemoveAssignment(Operand op) + { + if (op == null) + { + return; + } + + if (op.Kind == OperandKind.LocalVariable) + { + op.Assignments.Remove(this); + } + else if (op.Kind == OperandKind.Memory) + { + MemoryOperand memOp = (MemoryOperand)op; + + if (memOp.BaseAddress != null) + { + memOp.BaseAddress.Assignments.Remove(this); + } + + if (memOp.Index != null) + { + memOp.Index.Assignments.Remove(this); + } + } + } + + private void AddUse(Operand op) + { + if (op == null) + { + return; + } + + if (op.Kind == OperandKind.LocalVariable) + { + op.Uses.Add(this); + } + else if (op.Kind == OperandKind.Memory) + { + MemoryOperand memOp = (MemoryOperand)op; + + if (memOp.BaseAddress != null) + { + memOp.BaseAddress.Uses.Add(this); + } + + if (memOp.Index != null) + { + memOp.Index.Uses.Add(this); + } + } + } + + private void RemoveUse(Operand op) + { + if (op == null) + { + return; + } + + if (op.Kind == OperandKind.LocalVariable) + { + op.Uses.Remove(this); + } + else if (op.Kind == OperandKind.Memory) + { + MemoryOperand memOp = (MemoryOperand)op; + + if (memOp.BaseAddress != null) + { + memOp.BaseAddress.Uses.Remove(this); + } + + if (memOp.Index != null) + { + memOp.Index.Uses.Remove(this); } } } diff --git a/ARMeilleure/Translation/Compiler.cs b/ARMeilleure/Translation/Compiler.cs index 4075a7f06..c2a2c7462 100644 --- a/ARMeilleure/Translation/Compiler.cs +++ b/ARMeilleure/Translation/Compiler.cs @@ -9,11 +9,16 @@ namespace ARMeilleure.Translation { static class Compiler { - public static T Compile( - ControlFlowGraph cfg, - OperandType[] funcArgTypes, - OperandType funcReturnType, - CompilerOptions options) + public static T Compile(ControlFlowGraph cfg, OperandType[] argTypes, OperandType retType, CompilerOptions options) + { + CompiledFunction func = CompileAndGetCf(cfg, argTypes, retType, options); + + IntPtr codePtr = JitCache.Map(func); + + return Marshal.GetDelegateForFunctionPointer(codePtr); + } + + public static CompiledFunction CompileAndGetCf(ControlFlowGraph cfg, OperandType[] argTypes, OperandType retType, CompilerOptions options) { Logger.StartPass(PassName.Dominance); @@ -35,13 +40,9 @@ namespace ARMeilleure.Translation Logger.EndPass(PassName.SsaConstruction, cfg); - CompilerContext cctx = new CompilerContext(cfg, funcArgTypes, funcReturnType, options); + CompilerContext cctx = new CompilerContext(cfg, argTypes, retType, options); - CompiledFunction func = CodeGenerator.Generate(cctx); - - IntPtr codePtr = JitCache.Map(func); - - return Marshal.GetDelegateForFunctionPointer(codePtr); + return CodeGenerator.Generate(cctx); } } } \ No newline at end of file