PPCRec: Update spill cost calculation

This commit is contained in:
Exzap 2024-10-25 19:00:11 +02:00
parent 70c99fd626
commit 96d7c754f9
5 changed files with 103 additions and 90 deletions

View File

@ -3,9 +3,6 @@
#include "IMLInstruction.h"
#include "IMLSegment.h"
// analyzer
bool IMLAnalyzer_IsTightFiniteLoop(IMLSegment* imlSegment);
// optimizer passes
void IMLOptimizer_OptimizeDirectFloatCopies(struct ppcImlGenContext_t* ppcImlGenContext);
void IMLOptimizer_OptimizeDirectIntegerCopies(struct ppcImlGenContext_t* ppcImlGenContext);

View File

@ -3,53 +3,3 @@
#include "util/helpers/fixedSizeList.h"
#include "Cafe/HW/Espresso/Interpreter/PPCInterpreterInternal.h"
/*
* Analyzes a single segment and returns true if it is a finite loop
*
* NOTE: the analysis is currently disabled. The function returns false
* unconditionally, so everything after the first statement is unreachable.
*
* imlSegment - segment to inspect
* returns    - always false while disabled; with the early return removed it would
*              return true if the segment branches back to itself and matches one
*              of the heuristics below
*/
bool IMLAnalyzer_IsTightFiniteLoop(IMLSegment* imlSegment)
{
	return false; // !!! DISABLED !!!

	// base criteria, must jump to beginning of same segment
	if (imlSegment->nextSegmentBranchTaken != imlSegment)
		return false;
	// loops using BDNZ are assumed to always be finite
	// NOTE(review): the check below matches any R_S32 SUB instruction; presumably that is how BDNZ's counter decrement is emitted - confirm
	for(const IMLInstruction& instIt : imlSegment->imlList)
	{
		if (instIt.type == PPCREC_IML_TYPE_R_S32 && instIt.operation == PPCREC_IML_OP_SUB)
		{
			return true;
		}
	}
	// for non-BDNZ loops, check for common patterns
	// risky approach, look for ADD/SUB operations and assume that potential overflow means finite (does not include r_r_s32 ADD/SUB)
	// this catches most loops with load-update and store-update instructions, but also those with decrementing counters
	FixedSizeList<IMLReg, 64, true> list_modifiedRegisters;
	for (const IMLInstruction& instIt : imlSegment->imlList)
	{
		if (instIt.type == PPCREC_IML_TYPE_R_S32 && (instIt.operation == PPCREC_IML_OP_ADD || instIt.operation == PPCREC_IML_OP_SUB) )
		{
			list_modifiedRegisters.addUnique(instIt.op_r_immS32.regR);
		}
	}
	if (list_modifiedRegisters.count > 0)
	{
		// remove all registers from the list that are modified by non-ADD/SUB instructions
		// todo: We should also cover the case where ADD+SUB on the same register cancel the effect out
		IMLUsedRegisters registersUsed;
		for (const IMLInstruction& instIt : imlSegment->imlList)
		{
			if (instIt.type == PPCREC_IML_TYPE_R_S32 && (instIt.operation == PPCREC_IML_OP_ADD || instIt.operation == PPCREC_IML_OP_SUB))
				continue;
			instIt.CheckRegisterUsage(&registersUsed);
			registersUsed.ForEachWrittenGPR([&](IMLReg r) { list_modifiedRegisters.remove(r); });
		}
		// any register left is only ever changed by immediate ADD/SUB, which is treated as "finite"
		if (list_modifiedRegisters.count > 0)
		{
			return true;
		}
	}
	return false;
}

View File

@ -15,7 +15,6 @@
#define DEBUG_RA_EXTRA_VALIDATION 0 // if set to non-zero, additional expensive validation checks will be performed
#define DEBUG_RA_INSTRUCTION_GEN 0
struct IMLRARegAbstractLiveness // preliminary liveness info. One entry per register and segment
{
IMLRARegAbstractLiveness(IMLRegFormat regBaseFormat, sint32 usageStart, sint32 usageEnd)
@ -38,7 +37,7 @@ struct IMLRegisterAllocatorContext
IMLRegisterAllocatorParameters* raParam;
ppcImlGenContext_t* deprGenContext; // deprecated. Try to decouple IMLRA from other parts of IML/PPCRec
std::unordered_map<IMLRegID, IMLRegFormat> regIdToBaseFormat; // a vector would be more efficient but it also means that reg ids have to be continuous and not completely arbitrary
std::unordered_map<IMLRegID, IMLRegFormat> regIdToBaseFormat;
// first pass
std::vector<std::unordered_map<IMLRegID, IMLRARegAbstractLiveness>> perSegmentAbstractRanges;
@ -781,11 +780,11 @@ class RASpillStrategy_LocalRangeHoleCutting : public RASpillStrategy
cemu_assert_debug(currentRangeStart.IsInstructionIndex());
distance2 = std::min<sint32>(distance2, imlSegment->imlList.size() * 2 - currentRangeStart.GetRaw()); // limit distance to end of segment
// calculate split cost of candidate
sint32 cost = PPCRecRARange_estimateAdditionalCostAfterSplit(candidate, currentRangeStart + distance2);
sint32 cost = IMLRA_CalculateAdditionalCostAfterSplit(candidate, currentRangeStart + distance2);
// calculate additional split cost of currentRange if hole is not large enough
if (distance2 < requiredSize2)
{
cost += PPCRecRARange_estimateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance2);
cost += IMLRA_CalculateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance2);
// we also slightly increase cost in relation to the remaining length (in order to make the algorithm prefer larger holes)
cost += (requiredSize2 - distance2) / 10;
}
@ -889,7 +888,7 @@ class RASpillStrategy_AvailableRegisterHole : public RASpillStrategy
continue;
// calculate additional cost due to split
cemu_assert_debug(distance < requiredSize2); // should always be true otherwise previous step would have selected this register?
sint32 cost = PPCRecRARange_estimateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance);
sint32 cost = IMLRA_CalculateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance);
// add small additional cost for the remaining range (prefer larger holes)
cost += ((requiredSize2 - distance) / 2) / 10;
if (cost < strategyCost)
@ -959,11 +958,11 @@ class RASpillStrategy_ExplodeRange : public RASpillStrategy
IMLRA_MakeSafeSplitDistance(imlSegment, currentRangeStart, distance);
if (distance < 2)
continue;
sint32 cost = PPCRecRARange_estimateCostAfterRangeExplode(candidate);
sint32 cost = IMLRA_CalculateAdditionalCostOfRangeExplode(candidate);
// if the hole is not large enough, add cost of splitting current subrange
if (distance < requiredSize2)
{
cost += PPCRecRARange_estimateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance);
cost += IMLRA_CalculateAdditionalCostAfterSplit(currentRange, currentRangeStart + distance);
// add small additional cost for the remaining range (prefer larger holes)
cost += ((requiredSize2 - distance) / 2) / 10;
}
@ -1032,7 +1031,7 @@ class RASpillStrategy_ExplodeRangeInter : public RASpillStrategy
if (!allowedRegs.IsAvailable(candidate->GetPhysicalRegister()))
continue;
sint32 cost;
cost = PPCRecRARange_estimateCostAfterRangeExplode(candidate);
cost = IMLRA_CalculateAdditionalCostOfRangeExplode(candidate);
// compare with current best candidate for this strategy
if (cost < strategyCost)
{
@ -1043,7 +1042,7 @@ class RASpillStrategy_ExplodeRangeInter : public RASpillStrategy
}
// add current range as a candidate too
sint32 ownCost;
ownCost = PPCRecRARange_estimateCostAfterRangeExplode(currentRange);
ownCost = IMLRA_CalculateAdditionalCostOfRangeExplode(currentRange);
if (ownCost < strategyCost)
{
strategyCost = ownCost;
@ -1859,7 +1858,7 @@ static void IMLRA_AnalyzeRangeDataFlow(raLivenessRange* subrange)
if (subrangeItr->hasStore)
continue; // this ending already stores, no extra cost
alreadyStoredInAllEndings = false;
sint32 storeCost = PPCRecRARange_getReadWriteCost(subrangeItr->imlSegment);
sint32 storeCost = IMLRA_GetSegmentReadWriteCost(subrangeItr->imlSegment);
delayStoreCost = std::max(storeCost, delayStoreCost);
}
if (alreadyStoredInAllEndings)
@ -1867,7 +1866,7 @@ static void IMLRA_AnalyzeRangeDataFlow(raLivenessRange* subrange)
subrange->hasStore = false;
subrange->hasStoreDelayed = true;
}
else if (delayStoreCost <= PPCRecRARange_getReadWriteCost(subrange->imlSegment))
else if (delayStoreCost <= IMLRA_GetSegmentReadWriteCost(subrange->imlSegment))
{
subrange->hasStore = false;
subrange->hasStoreDelayed = true;

View File

@ -642,7 +642,7 @@ void PPCRecRA_updateOrAddSubrangeLocation(raLivenessRange* subrange, sint32 inde
subrange->list_locations.emplace_back(index, isRead, isWrite);
}
sint32 PPCRecRARange_getReadWriteCost(IMLSegment* imlSegment)
sint32 IMLRA_GetSegmentReadWriteCost(IMLSegment* imlSegment)
{
sint32 v = imlSegment->loopDepth + 1;
v *= 5;
@ -668,13 +668,13 @@ sint32 PPCRecRARange_estimateTotalCost(std::span<raLivenessRange*> ranges)
if (!subrange->interval2.ExtendsPreviousSegment())
{
//cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment);
mostExpensiveRead = std::max(mostExpensiveRead, PPCRecRARange_getReadWriteCost(subrange->imlSegment));
mostExpensiveRead = std::max(mostExpensiveRead, IMLRA_GetSegmentReadWriteCost(subrange->imlSegment));
readCount++;
}
if (!subrange->interval2.ExtendsIntoNextSegment())
{
//cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment);
mostExpensiveWrite = std::max(mostExpensiveWrite, PPCRecRARange_getReadWriteCost(subrange->imlSegment));
mostExpensiveWrite = std::max(mostExpensiveWrite, IMLRA_GetSegmentReadWriteCost(subrange->imlSegment));
writeCount++;
}
}
@ -683,21 +683,34 @@ sint32 PPCRecRARange_estimateTotalCost(std::span<raLivenessRange*> ranges)
return cost;
}
// calculate cost of range that it would have after calling PPCRecRA_explodeRange() on it
sint32 PPCRecRARange_estimateCostAfterRangeExplode(raLivenessRange* subrange)
// calculate additional cost of range that it would have after calling _ExplodeRange() on it
// Walks every subrange in the cluster of the given subrange. Subranges without any access location are skipped.
// For the others, the segment's load/store cost (IMLRA_GetSegmentReadWriteCost) is added once for the
// "extends previous segment" case and once for the "extends into next segment" case, subject to the conditions below.
// Returns the accumulated cost (an approximation, see todo at the end).
// NOTE(review): this span shows pre-change and post-change lines interleaved (two declarations of `cost`, two `continue;`)
sint32 IMLRA_CalculateAdditionalCostOfRangeExplode(raLivenessRange* subrange)
{
auto ranges = subrange->GetAllSubrangesInCluster();
sint32 cost = -PPCRecRARange_estimateTotalCost(ranges);
sint32 cost = 0;//-PPCRecRARange_estimateTotalCost(ranges);
// NOTE(review): the loop variable below shadows the function parameter `subrange`
for (auto& subrange : ranges)
{
if (subrange->list_locations.empty())
continue;
cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment) * 2; // we assume a read and a store
continue; // this range would be deleted and thus has no cost
sint32 segmentLoadStoreCost = IMLRA_GetSegmentReadWriteCost(subrange->imlSegment);
bool hasAdditionalLoad = subrange->interval2.ExtendsPreviousSegment();
bool hasAdditionalStore = subrange->interval2.ExtendsIntoNextSegment();
// NOTE(review): the condition adds the load cost exactly when the first location is write-only,
// while the trailing comment says no load is necessary in that case - the condition looks inverted; confirm intent
if(hasAdditionalLoad && !subrange->list_locations.front().isRead && subrange->list_locations.front().isWrite) // if written before read, then a load isn't necessary
{
cost += segmentLoadStoreCost;
}
if(hasAdditionalStore)
{
// true if at least one location in this subrange writes the value
bool hasWrite = std::find_if(subrange->list_locations.begin(), subrange->list_locations.end(), [](const raLivenessLocation_t& loc) { return loc.isWrite; }) != subrange->list_locations.end();
// NOTE(review): the store cost is added when there is NO write, while the trailing comment says
// unmodified ranges do not need a store - the condition looks inverted; confirm intent
if(!hasWrite) // ranges which don't modify their value do not need to be stored
cost += segmentLoadStoreCost;
}
}
// todo - properly calculating all the data-flow dependency based costs is more complex so this currently is an approximation
return cost;
}
sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, raInstructionEdge splitPosition)
sint32 IMLRA_CalculateAdditionalCostAfterSplit(raLivenessRange* subrange, raInstructionEdge splitPosition)
{
// validation
#ifdef CEMU_DEBUG_ASSERT
@ -719,9 +732,53 @@ sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange,
if (splitInstructionIndex > subrange->list_locations.back().index)
return 0;
// todo - determine exact cost of split subranges
// this can be optimized, but we should change list_locations to track instruction edges instead of instruction indices
std::vector<raLivenessLocation_t> headLocations;
std::vector<raLivenessLocation_t> tailLocations;
for (auto& location : subrange->list_locations)
{
if(location.GetReadPos() < splitPosition || location.GetWritePos() < splitPosition)
headLocations.push_back(location);
if(location.GetReadPos() >= splitPosition || location.GetWritePos() >= splitPosition)
tailLocations.push_back(location);
}
// fixup locations
if(!headLocations.empty() && headLocations.back().GetWritePos() >= splitPosition)
{
headLocations.back().isWrite = false;
if(!headLocations.back().isRead && !headLocations.back().isWrite)
headLocations.pop_back();
}
if(!tailLocations.empty() && tailLocations.front().GetReadPos() < splitPosition)
{
tailLocations.front().isRead = false;
if(!tailLocations.front().isRead && !tailLocations.front().isWrite)
tailLocations.erase(tailLocations.begin());
}
cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment) * 2; // currently we assume that the additional region will require a read and a store
// based on
sint32 segmentLoadStoreCost = IMLRA_GetSegmentReadWriteCost(subrange->imlSegment);
auto CalculateCostFromLocationRange = [segmentLoadStoreCost](const std::vector<raLivenessLocation_t>& locations, bool trackLoadCost = true, bool trackStoreCost = true) -> sint32
{
if(locations.empty())
return 0;
sint32 cost = 0;
if(locations.front().isRead && trackLoadCost)
cost += segmentLoadStoreCost; // not overwritten, so there is a load cost
bool hasWrite = std::find_if(locations.begin(), locations.end(), [](const raLivenessLocation_t& loc) { return loc.isWrite; }) != locations.end();
if(hasWrite && trackStoreCost)
cost += segmentLoadStoreCost; // modified, so there is a store cost
return cost;
};
sint32 baseCost = CalculateCostFromLocationRange(subrange->list_locations);
bool tailOverwritesValue = !tailLocations.empty() && !tailLocations.front().isRead && tailLocations.front().isWrite;
sint32 newCost = CalculateCostFromLocationRange(headLocations) + CalculateCostFromLocationRange(tailLocations, !tailOverwritesValue, true);
cemu_assert_debug(newCost >= baseCost);
cost = newCost - baseCost;
return cost;
}

View File

@ -1,18 +1,6 @@
#pragma once
#include "IMLRegisterAllocator.h"
// Describes one access to a value inside a liveness range: where it happens and whether it reads and/or writes.
struct raLivenessLocation_t
{
	sint32 index;
	bool isRead;
	bool isWrite;

	// leaves all fields uninitialized when default-initialized
	raLivenessLocation_t() = default;

	// builds a location from its position and read/write flags
	raLivenessLocation_t(sint32 index, bool isRead, bool isWrite)
	{
		this->index = index;
		this->isRead = isRead;
		this->isWrite = isWrite;
	}
};
struct raLivenessSubrangeLink
{
struct raLivenessRange* prev;
@ -167,6 +155,28 @@ private:
};
// Describes one access to a value inside a liveness range: the instruction index at which it
// happens and whether the instruction reads and/or writes the value.
struct raLivenessLocation_t
{
	sint32 index; // instruction index of the access
	bool isRead; // the value is read at this location
	bool isWrite; // the value is written at this location

	// leaves all fields uninitialized when default-initialized
	raLivenessLocation_t() = default;

	raLivenessLocation_t(sint32 index, bool isRead, bool isWrite)
		: index(index), isRead(isRead), isWrite(isWrite) {}

	// Position of the read as an instruction edge, built as raInstructionEdge(index, true).
	// NOTE(review): presumably `true` selects the input edge of the instruction - confirm against raInstructionEdge
	// const so it can be called on locations reached through const references
	raInstructionEdge GetReadPos() const
	{
		return raInstructionEdge(index, true);
	}

	// Position of the write as an instruction edge, built as raInstructionEdge(index, false).
	// NOTE(review): presumably `false` selects the output edge of the instruction - confirm against raInstructionEdge
	// const so it can be called on locations reached through const references
	raInstructionEdge GetWritePos() const
	{
		return raInstructionEdge(index, false);
	}
};
struct raInterval
{
raInterval()
@ -354,7 +364,7 @@ void PPCRecRA_updateOrAddSubrangeLocation(raLivenessRange* subrange, sint32 inde
void PPCRecRA_debugValidateSubrange(raLivenessRange* subrange);
// cost estimation
sint32 PPCRecRARange_getReadWriteCost(IMLSegment* imlSegment);
sint32 PPCRecRARange_estimateCostAfterRangeExplode(raLivenessRange* subrange);
sint32 IMLRA_GetSegmentReadWriteCost(IMLSegment* imlSegment);
sint32 IMLRA_CalculateAdditionalCostOfRangeExplode(raLivenessRange* subrange);
//sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, sint32 splitIndex);
sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessRange* subrange, raInstructionEdge splitPosition);
sint32 IMLRA_CalculateAdditionalCostAfterSplit(raLivenessRange* subrange, raInstructionEdge splitPosition);