Ryujinx/ARMeilleure/Translation/RegisterUsage.cs
riperiperi 8226997bc7
CodeGen Optimisations (LSRA and Translator) (#978)
* Start of JIT garbage collection improvements

- thread static pool for Operand, MemoryOperand, Operation
- Operands and Operations are always to be constructed via their static
helper classes, so they can be pooled.
- removing LinkedList from Node for sources/destinations (replaced with
List<>s for now, but probably could do arrays since size is bounded)
- removing params constructors from Node
- LinkedList<> to List<> with Clear() for Operand assignments/uses
- ThreadStaticPool is very simple and basically just exists for the
purpose of our specific translation allocation problem. Right now it
will stay at the worst case allocation count for that thread (so far) -
the pool can never shrink.

- Still some cases of Operand[] that haven't been removed yet. Will need
to evaluate them (eg. is there a reasonable max number of params for
Calls?)

* ConcurrentStack instead of ConcurrentQueue for Rejit

* Optimize some parts of LSRA

- BitMap now operates on 64-bit int rather than 32-bit
- BitMap is now pooled in a ThreadStatic pool (within lrsa)
- BitMap now is now its own iterator. Marginally speeds up iterating
through the bits.
- A few cases where enumerators were generated have been converted to
forms that generate less garbage.
- New data structure for sorting _usePositions in LiveIntervals. Much
faster split, NextUseAfter, initial insertion. Random insertion is
slightly slower.
- That last one is WIP since you need to insert the values backwards. It
would be ideal if it just flipped it for you, uncomplicating things on
the caller side.

* Use a static pool of thread static pools. (yes.)

Prevents each execution thread creating its own lowCq pool and making me cry.

* Move constant value to top, change naming convention.

* Fix iteration of memory operands.

* Increase max thread count.

* Address Feedback
2020-03-18 22:44:32 +11:00

421 lines
15 KiB
C#

using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.State;
using System;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
using static ARMeilleure.IntermediateRepresentation.OperationHelper;
namespace ARMeilleure.Translation
{
static class RegisterUsage
{
private const long CallerSavedIntRegistersMask = 0x7fL << 9;
private const long PStateNzcvFlagsMask = 0xfL << 60;
private const long FpStateNzcvFlagsMask = 0xfL << 60;
private const long CallerSavedVecRegistersMask = 0xffffL << 16;
private const int RegsCount = 32;
private const int RegsMask = RegsCount - 1;
private struct RegisterMask : IEquatable<RegisterMask>
{
public long IntMask { get; set; }
public long VecMask { get; set; }
public RegisterMask(long intMask, long vecMask)
{
IntMask = intMask;
VecMask = vecMask;
}
public static RegisterMask operator &(RegisterMask x, RegisterMask y)
{
return new RegisterMask(x.IntMask & y.IntMask, x.VecMask & y.VecMask);
}
public static RegisterMask operator |(RegisterMask x, RegisterMask y)
{
return new RegisterMask(x.IntMask | y.IntMask, x.VecMask | y.VecMask);
}
public static RegisterMask operator ~(RegisterMask x)
{
return new RegisterMask(~x.IntMask, ~x.VecMask);
}
public static bool operator ==(RegisterMask x, RegisterMask y)
{
return x.Equals(y);
}
public static bool operator !=(RegisterMask x, RegisterMask y)
{
return !x.Equals(y);
}
public override bool Equals(object obj)
{
return obj is RegisterMask regMask && Equals(regMask);
}
public bool Equals(RegisterMask other)
{
return IntMask == other.IntMask && VecMask == other.VecMask;
}
public override int GetHashCode()
{
return HashCode.Combine(IntMask, VecMask);
}
}
public static void RunPass(ControlFlowGraph cfg, ExecutionMode mode, bool isCompleteFunction)
{
// Compute local register inputs and outputs used inside blocks.
RegisterMask[] localInputs = new RegisterMask[cfg.Blocks.Count];
RegisterMask[] localOutputs = new RegisterMask[cfg.Blocks.Count];
for (BasicBlock block = cfg.Blocks.First; block != null; block = block.ListNext)
{
for (Node node = block.Operations.First; node != null; node = node.ListNext)
{
Operation operation = node as Operation;
for (int srcIndex = 0; srcIndex < operation.SourcesCount; srcIndex++)
{
Operand source = operation.GetSource(srcIndex);
if (source.Kind != OperandKind.Register)
{
continue;
}
Register register = source.GetRegister();
localInputs[block.Index] |= GetMask(register) & ~localOutputs[block.Index];
}
if (operation.Destination != null && operation.Destination.Kind == OperandKind.Register)
{
localOutputs[block.Index] |= GetMask(operation.Destination.GetRegister());
}
}
}
// Compute global register inputs and outputs used across blocks.
RegisterMask[] globalCmnOutputs = new RegisterMask[cfg.Blocks.Count];
RegisterMask[] globalInputs = new RegisterMask[cfg.Blocks.Count];
RegisterMask[] globalOutputs = new RegisterMask[cfg.Blocks.Count];
bool modified;
bool firstPass = true;
do
{
modified = false;
// Compute register outputs.
for (int index = cfg.PostOrderBlocks.Length - 1; index >= 0; index--)
{
BasicBlock block = cfg.PostOrderBlocks[index];
if (block.Predecessors.Count != 0 && !HasContextLoad(block))
{
BasicBlock predecessor = block.Predecessors[0];
RegisterMask cmnOutputs = localOutputs[predecessor.Index] | globalCmnOutputs[predecessor.Index];
RegisterMask outputs = globalOutputs[predecessor.Index];
for (int pIndex = 1; pIndex < block.Predecessors.Count; pIndex++)
{
predecessor = block.Predecessors[pIndex];
cmnOutputs &= localOutputs[predecessor.Index] | globalCmnOutputs[predecessor.Index];
outputs |= globalOutputs[predecessor.Index];
}
globalInputs[block.Index] |= outputs & ~cmnOutputs;
if (!firstPass)
{
cmnOutputs &= globalCmnOutputs[block.Index];
}
if (Exchange(globalCmnOutputs, block.Index, cmnOutputs))
{
modified = true;
}
outputs |= localOutputs[block.Index];
if (Exchange(globalOutputs, block.Index, globalOutputs[block.Index] | outputs))
{
modified = true;
}
}
else if (Exchange(globalOutputs, block.Index, localOutputs[block.Index]))
{
modified = true;
}
}
// Compute register inputs.
for (int index = 0; index < cfg.PostOrderBlocks.Length; index++)
{
BasicBlock block = cfg.PostOrderBlocks[index];
RegisterMask inputs = localInputs[block.Index];
if (block.Next != null)
{
inputs |= globalInputs[block.Next.Index];
}
if (block.Branch != null)
{
inputs |= globalInputs[block.Branch.Index];
}
inputs &= ~globalCmnOutputs[block.Index];
if (Exchange(globalInputs, block.Index, globalInputs[block.Index] | inputs))
{
modified = true;
}
}
firstPass = false;
}
while (modified);
// Insert load and store context instructions where needed.
for (BasicBlock block = cfg.Blocks.First; block != null; block = block.ListNext)
{
bool hasContextLoad = HasContextLoad(block);
if (hasContextLoad)
{
block.Operations.Remove(block.Operations.First);
}
// The only block without any predecessor should be the entry block.
// It always needs a context load as it is the first block to run.
if (block.Predecessors.Count == 0 || hasContextLoad)
{
LoadLocals(block, globalInputs[block.Index].VecMask, RegisterType.Vector, mode);
LoadLocals(block, globalInputs[block.Index].IntMask, RegisterType.Integer, mode);
}
bool hasContextStore = HasContextStore(block);
if (hasContextStore)
{
block.Operations.Remove(block.Operations.Last);
}
if (EndsWithReturn(block) || hasContextStore)
{
StoreLocals(block, globalOutputs[block.Index].IntMask, RegisterType.Integer, mode, isCompleteFunction);
StoreLocals(block, globalOutputs[block.Index].VecMask, RegisterType.Vector, mode, isCompleteFunction);
}
}
}
private static bool HasContextLoad(BasicBlock block)
{
return StartsWith(block, Instruction.LoadFromContext) && block.Operations.First.SourcesCount == 0;
}
private static bool HasContextStore(BasicBlock block)
{
return EndsWith(block, Instruction.StoreToContext) && block.GetLastOp().SourcesCount == 0;
}
private static bool StartsWith(BasicBlock block, Instruction inst)
{
if (block.Operations.Count == 0)
{
return false;
}
return block.Operations.First is Operation operation && operation.Instruction == inst;
}
private static bool EndsWith(BasicBlock block, Instruction inst)
{
if (block.Operations.Count == 0)
{
return false;
}
return block.Operations.Last is Operation operation && operation.Instruction == inst;
}
private static RegisterMask GetMask(Register register)
{
long intMask = 0;
long vecMask = 0;
switch (register.Type)
{
case RegisterType.Flag: intMask = (1L << RegsCount) << register.Index; break;
case RegisterType.Integer: intMask = 1L << register.Index; break;
case RegisterType.FpFlag: vecMask = (1L << RegsCount) << register.Index; break;
case RegisterType.Vector: vecMask = 1L << register.Index; break;
}
return new RegisterMask(intMask, vecMask);
}
private static bool Exchange(RegisterMask[] masks, int blkIndex, RegisterMask value)
{
RegisterMask oldValue = masks[blkIndex];
masks[blkIndex] = value;
return oldValue != value;
}
private static void LoadLocals(BasicBlock block, long inputs, RegisterType baseType, ExecutionMode mode)
{
Operand arg0 = Local(OperandType.I64);
for (int bit = 63; bit >= 0; bit--)
{
long mask = 1L << bit;
if ((inputs & mask) == 0)
{
continue;
}
Operand dest = GetRegFromBit(bit, baseType, mode);
long offset = NativeContext.GetRegisterOffset(dest.GetRegister());
Operand addr = Local(OperandType.I64);
Operation loadOp = Operation(Instruction.Load, dest, addr);
block.Operations.AddFirst(loadOp);
Operation calcOffsOp = Operation(Instruction.Add, addr, arg0, Const(offset));
block.Operations.AddFirst(calcOffsOp);
}
Operation loadArg0 = Operation(Instruction.LoadArgument, arg0, Const(0));
block.Operations.AddFirst(loadArg0);
}
private static void StoreLocals(BasicBlock block, long outputs, RegisterType baseType, ExecutionMode mode, bool isCompleteFunction)
{
if (Optimizations.AssumeStrictAbiCompliance && isCompleteFunction)
{
if (baseType == RegisterType.Integer || baseType == RegisterType.Flag)
{
outputs = ClearCallerSavedIntRegs(outputs);
}
else /* if (baseType == RegisterType.Vector || baseType == RegisterType.FpFlag) */
{
outputs = ClearCallerSavedVecRegs(outputs);
}
}
Operand arg0 = Local(OperandType.I64);
Operation loadArg0 = Operation(Instruction.LoadArgument, arg0, Const(0));
block.Append(loadArg0);
for (int bit = 0; bit < 64; bit++)
{
long mask = 1L << bit;
if ((outputs & mask) == 0)
{
continue;
}
Operand source = GetRegFromBit(bit, baseType, mode);
long offset = NativeContext.GetRegisterOffset(source.GetRegister());
Operand addr = Local(OperandType.I64);
Operation calcOffsOp = Operation(Instruction.Add, addr, arg0, Const(offset));
block.Append(calcOffsOp);
Operation storeOp = Operation(Instruction.Store, null, addr, source);
block.Append(storeOp);
}
}
private static Operand GetRegFromBit(int bit, RegisterType baseType, ExecutionMode mode)
{
if (bit < RegsCount)
{
return OperandHelper.Register(bit, baseType, GetOperandType(baseType, mode));
}
else if (baseType == RegisterType.Integer)
{
return OperandHelper.Register(bit & RegsMask, RegisterType.Flag, OperandType.I32);
}
else if (baseType == RegisterType.Vector)
{
return OperandHelper.Register(bit & RegsMask, RegisterType.FpFlag, OperandType.I32);
}
else
{
throw new ArgumentOutOfRangeException(nameof(bit));
}
}
private static OperandType GetOperandType(RegisterType type, ExecutionMode mode)
{
switch (type)
{
case RegisterType.Flag: return OperandType.I32;
case RegisterType.FpFlag: return OperandType.I32;
case RegisterType.Integer: return (mode == ExecutionMode.Aarch64) ? OperandType.I64 : OperandType.I32;
case RegisterType.Vector: return OperandType.V128;
}
throw new ArgumentException($"Invalid register type \"{type}\".");
}
private static bool EndsWithReturn(BasicBlock block)
{
if (!(block.GetLastOp() is Operation operation))
{
return false;
}
return operation.Instruction == Instruction.Return;
}
private static long ClearCallerSavedIntRegs(long mask)
{
// TODO: ARM32 support.
mask &= ~(CallerSavedIntRegistersMask | PStateNzcvFlagsMask);
return mask;
}
private static long ClearCallerSavedVecRegs(long mask)
{
// TODO: ARM32 support.
mask &= ~(CallerSavedVecRegistersMask | FpStateNzcvFlagsMask);
return mask;
}
}
}