From b6a74380537305ef33ce12e952bde0155e4a57b3 Mon Sep 17 00:00:00 2001 From: comex Date: Thu, 16 Oct 2014 21:49:48 -0400 Subject: [PATCH] Add BitSet and, as a test, convert some JitRegCache stuff to it. This is a higher level, more concise wrapper for bitsets which supports efficiently counting and iterating over set bits. It's similar to std::bitset, but the latter does not support efficient iteration (and at least in libc++, the count algorithm is subpar, not that it really matters). The converted uses include both bitsets and, notably, considerably less efficient regular arrays (for in/out registers in PPCAnalyst). Unfortunately, this may slightly pessimize unoptimized builds. --- Source/Core/Common/BitSet.h | 156 ++++++++++++++++++ Source/Core/Common/Common.vcxproj | 3 +- Source/Core/Common/Common.vcxproj.filters | 3 +- Source/Core/Core/PowerPC/Jit64/Jit.cpp | 27 ++- .../Core/Core/PowerPC/Jit64/JitRegCache.cpp | 39 ++--- Source/Core/Core/PowerPC/Jit64/JitRegCache.h | 12 +- Source/Core/Core/PowerPC/PPCAnalyst.cpp | 90 ++++------ Source/Core/Core/PowerPC/PPCAnalyst.h | 15 +- 8 files changed, 236 insertions(+), 109 deletions(-) create mode 100644 Source/Core/Common/BitSet.h diff --git a/Source/Core/Common/BitSet.h b/Source/Core/Common/BitSet.h new file mode 100644 index 0000000000..b2cf577b50 --- /dev/null +++ b/Source/Core/Common/BitSet.h @@ -0,0 +1,156 @@ +// This file is under the public domain. 
+ +#pragma once + +#include <initializer_list> +#include <type_traits> +#include "CommonTypes.h" + +// Helper functions: + +#ifdef _WIN32 +template <typename T> +static inline int CountSetBits(T v) +{ + // from https://graphics.stanford.edu/~seander/bithacks.html + // GCC has this built in, but MSVC's intrinsic will only emit the actual + // POPCNT instruction, which we're not depending on + v = v - ((v >> 1) & (T)~(T)0/3); + v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); + v = (v + (v >> 4)) & (T)~(T)0/255*15; + return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8; +} +static inline int LeastSignificantSetBit(u32 val) +{ + unsigned long index; + _BitScanForward(&index, val); + return (int)index; +} +static inline int LeastSignificantSetBit(u64 val) +{ + unsigned long index; + _BitScanForward64(&index, val); + return (int)index; +} +#else +static inline int CountSetBits(u32 val) { return __builtin_popcount(val); } +static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); } +static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); } +static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); } +#endif + + +// Similar to std::bitset, this is a class which encapsulates a bitset, i.e. +// using the set bits of an integer to represent a set of integers. Like that +// class, it acts like an array of bools: +// BitSet32 bs; // use BitSet{32,64} instead of the template directly +// bs[1] = true; +// but also like the underlying integer ([0] = least significant bit): +// BitSet32 bs2 = ...; +// bs = (bs ^ bs2) & BitSet32(0xffff); +// The following additional functionality is provided: +// - Construction using an initializer list. +// BitSet bs { 1, 2, 4, 8 }; +// - Efficiently iterating through the set bits: +// for (int i : bs) +// [i is the *index* of a set bit] +// (This uses the appropriate CPU instruction to find the next set bit in one +// operation.) +// - Counting set bits using .Count() - see comment on that method.
+ +// TODO: use constexpr when MSVC gets out of the Dark Ages + +template <typename IntTy> +class BitSet +{ + static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types"); +public: + // A reference to a particular bit, returned from operator[]. + class Ref + { + public: + Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {} + Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {} + operator bool() const { return (m_bs->m_val & m_mask) != 0; } + bool operator=(bool set) + { + m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0); + return set; + } + private: + BitSet* m_bs; + IntTy m_mask; + }; + + // A STL-like iterator is required to be able to use range-based for loops. + class Iterator + { + public: + Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {} + Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {} + Iterator& operator=(Iterator other) { new (this) Iterator(other); return *this; } + int operator*() { return m_bit; } + Iterator& operator++() + { + if (m_val == 0) + { + m_bit = -1; + } + else + { + int bit = LeastSignificantSetBit(m_val); + m_val &= ~((IntTy)1 << bit); + m_bit = bit; + } + return *this; + } + Iterator operator++(int _) + { + Iterator other(*this); + ++*this; + return other; + } + bool operator==(Iterator other) const { return m_bit == other.m_bit; } + bool operator!=(Iterator other) const { return m_bit != other.m_bit; } + private: + IntTy m_val; + int m_bit; + }; + + BitSet() : m_val(0) {} + explicit BitSet(IntTy val) : m_val(val) {} + BitSet(std::initializer_list<int> init) + { + m_val = 0; + for (int bit : init) + m_val |= (IntTy)1 << bit; + } + + Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); } + const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; } + bool operator==(BitSet other) const { return m_val == other.m_val; } + bool operator!=(BitSet other) const { return m_val != other.m_val; } + BitSet operator|(BitSet other) const { return BitSet(m_val |
other.m_val); } + BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); } + BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); } + BitSet operator~() const { return BitSet(~m_val); } + BitSet& operator|=(BitSet other) { return *this = *this | other; } + BitSet& operator&=(BitSet other) { return *this = *this & other; } + BitSet& operator^=(BitSet other) { return *this = *this ^ other; } + operator u32() = delete; + operator bool() { return m_val != 0; } + + // Warning: Even though on modern CPUs this is a single fast instruction, + // Dolphin's official builds do not currently assume POPCNT support on x86, + // so slower explicit bit twiddling is generated. Still should generally + // be faster than a loop. + unsigned int Count() const { return CountSetBits(m_val); } + + Iterator begin() const { Iterator it(m_val, 0); return ++it; } + Iterator end() const { return Iterator(m_val, -1); } + + IntTy m_val; +}; + +typedef BitSet<u32> BitSet32; +typedef BitSet<u64> BitSet64; diff --git a/Source/Core/Common/Common.vcxproj b/Source/Core/Common/Common.vcxproj index 814ac4f04f..5d7a31904d 100644 --- a/Source/Core/Common/Common.vcxproj +++ b/Source/Core/Common/Common.vcxproj @@ -39,6 +39,7 @@ + @@ -137,4 +138,4 @@ - \ No newline at end of file + diff --git a/Source/Core/Common/Common.vcxproj.filters b/Source/Core/Common/Common.vcxproj.filters index 84997f7441..ac5b5c454f 100644 --- a/Source/Core/Common/Common.vcxproj.filters +++ b/Source/Core/Common/Common.vcxproj.filters @@ -13,6 +13,7 @@ + @@ -118,4 +119,4 @@ - \ No newline at end of file + diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.cpp b/Source/Core/Core/PowerPC/Jit64/Jit.cpp index d091db4ff3..99633dc57d 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit.cpp @@ -736,29 +736,28 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc // output, which needs to be bound in the actual instruction compilation.
// TODO: make this smarter in the case that we're actually register-starved, i.e. // prioritize the more important registers. - for (int k = 0; k < 3 && gpr.NumFreeRegisters() >= 2; k++) + for (int reg : ops[i].regsIn) { - int reg = ops[i].regsIn[k]; - if (reg >= 0 && (ops[i].gprInReg & (1 << reg)) && !gpr.R(reg).IsImm()) + if (gpr.NumFreeRegisters() < 2) + break; + if (ops[i].gprInReg[reg] && !gpr.R(reg).IsImm()) gpr.BindToRegister(reg, true, false); } - for (int k = 0; k < 4 && fpr.NumFreeRegisters() >= 2; k++) + for (int reg : ops[i].fregsIn) { - int reg = ops[i].fregsIn[k]; - if (reg >= 0 && (ops[i].fprInXmm & (1 << reg))) - fpr.BindToRegister(reg, true, false); + if (fpr.NumFreeRegisters() < 2) + break; + if (ops[i].fprInXmm[reg]) + fpr.BindToRegister(reg, true, false); } Jit64Tables::CompileInstruction(ops[i]); // If we have a register that will never be used again, flush it. - for (int j = 0; j < 32; j++) - { - if (!(ops[i].gprInUse & (1 << j))) - gpr.StoreFromRegister(j); - if (!(ops[i].fprInUse & (1 << j))) - fpr.StoreFromRegister(j); - } + for (int j : ~ops[i].gprInUse) + gpr.StoreFromRegister(j); + for (int j : ~ops[i].fprInUse) + fpr.StoreFromRegister(j); if (js.memcheck && (opinfo->flags & FL_LOADSTORE)) { diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp index bb1b77371d..c7b0dd1db4 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.cpp @@ -95,42 +95,38 @@ void RegCache::UnlockAllX() xreg.locked = false; } -u32 GPRRegCache::GetRegUtilization() +BitSet32 GPRRegCache::GetRegUtilization() { return jit->js.op->gprInReg; } -u32 FPURegCache::GetRegUtilization() +BitSet32 FPURegCache::GetRegUtilization() { return jit->js.op->fprInXmm; } -u32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead) +BitSet32 GPRRegCache::CountRegsIn(size_t preg, u32 lookahead) { - u32 regsUsed = 0; + BitSet32 regsUsed; for (u32 i = 1; i < lookahead; i++) { - for
(int j = 0; j < 3; j++) - if (jit->js.op[i].regsIn[j] >= 0) - regsUsed |= 1 << jit->js.op[i].regsIn[j]; - for (int j = 0; j < 3; j++) - if ((size_t)jit->js.op[i].regsIn[j] == preg) - return regsUsed; + BitSet32 regsIn = jit->js.op[i].regsIn; + regsUsed |= regsIn; + if (regsIn[preg]) + return regsUsed; } return regsUsed; } -u32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead) +BitSet32 FPURegCache::CountRegsIn(size_t preg, u32 lookahead) { - u32 regsUsed = 0; + BitSet32 regsUsed; for (u32 i = 1; i < lookahead; i++) { - for (int j = 0; j < 4; j++) - if (jit->js.op[i].fregsIn[j] >= 0) - regsUsed |= 1 << jit->js.op[i].fregsIn[j]; - for (int j = 0; j < 4; j++) - if ((size_t)jit->js.op[i].fregsIn[j] == preg) - return regsUsed; + BitSet32 regsIn = jit->js.op[i].fregsIn; + regsUsed |= regsIn; + if (regsIn[preg]) + return regsUsed; } return regsUsed; } @@ -151,17 +147,14 @@ float RegCache::ScoreRegister(X64Reg xr) // If the register isn't actually needed in a physical register for a later instruction, // writing it back to the register file isn't quite as bad. - if (GetRegUtilization() & (1 << preg)) + if (GetRegUtilization()[preg]) { // Don't look too far ahead; we don't want to have quadratic compilation times for // enormous block sizes! // This actually improves register allocation a tiny bit; I'm not sure why. u32 lookahead = std::min(jit->js.instructionsLeft, 64); // Count how many other registers are going to be used before we need this one again. - u32 regs_in = CountRegsIn(preg, lookahead); - u32 regs_in_count = 0; - for (int i = 0; i < 32; i++) - regs_in_count += !!(regs_in & (1 << i)); + u32 regs_in_count = CountRegsIn(preg, lookahead).Count(); // Totally ad-hoc heuristic to bias based on how many other registers we'll need // before this one gets used again. 
score += 1 + 2 * (5 - log2f(1 + (float)regs_in_count)); diff --git a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h index 244ac1be59..3943e83852 100644 --- a/Source/Core/Core/PowerPC/Jit64/JitRegCache.h +++ b/Source/Core/Core/PowerPC/Jit64/JitRegCache.h @@ -44,8 +44,8 @@ protected: virtual const int *GetAllocationOrder(size_t& count) = 0; - virtual u32 GetRegUtilization() = 0; - virtual u32 CountRegsIn(size_t preg, u32 lookahead) = 0; + virtual BitSet32 GetRegUtilization() = 0; + virtual BitSet32 CountRegsIn(size_t preg, u32 lookahead) = 0; Gen::XEmitter *emit; @@ -137,8 +137,8 @@ public: Gen::OpArg GetDefaultLocation(size_t reg) const override; const int* GetAllocationOrder(size_t& count) override; void SetImmediate32(size_t preg, u32 immValue); - u32 GetRegUtilization(); - u32 CountRegsIn(size_t preg, u32 lookahead); + BitSet32 GetRegUtilization() override; + BitSet32 CountRegsIn(size_t preg, u32 lookahead) override; }; @@ -149,6 +149,6 @@ public: void LoadRegister(size_t preg, Gen::X64Reg newLoc) override; const int* GetAllocationOrder(size_t& count) override; Gen::OpArg GetDefaultLocation(size_t reg) const override; - u32 GetRegUtilization(); - u32 CountRegsIn(size_t preg, u32 lookahead); + BitSet32 GetRegUtilization() override; + BitSet32 CountRegsIn(size_t preg, u32 lookahead) override; }; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.cpp b/Source/Core/Core/PowerPC/PPCAnalyst.cpp index 4bd5b60a14..acc5d372ed 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.cpp +++ b/Source/Core/Core/PowerPC/PPCAnalyst.cpp @@ -249,21 +249,15 @@ static bool CanSwapAdjacentOps(const CodeOp &a, const CodeOp &b) // That is, check that none of b's outputs matches any of a's inputs, // and that none of a's outputs matches any of b's inputs. // The latter does not apply if a is a cmp, of course, but doesn't hurt to check. 
- for (int j = 0; j < 3; j++) - { - int regInA = a.regsIn[j]; - int regInB = b.regsIn[j]; - // register collision: b outputs to one of a's inputs - if (regInA >= 0 && (b.regsOut[0] == regInA || b.regsOut[1] == regInA)) - return false; - // register collision: a outputs to one of b's inputs - if (regInB >= 0 && (a.regsOut[0] == regInB || a.regsOut[1] == regInB)) - return false; - // register collision: b outputs to one of a's outputs (overwriting it) - for (int k = 0; k < 2; k++) - if (b.regsOut[k] >= 0 && (b.regsOut[k] == a.regsOut[0] || b.regsOut[k] == a.regsOut[1])) - return false; - } + // register collision: b outputs to one of a's inputs + if (b.regsOut & a.regsIn) + return false; + // register collision: a outputs to one of b's inputs + if (a.regsOut & b.regsIn) + return false; + // register collision: b outputs to one of a's outputs (overwriting it) + if (b.regsOut & a.regsOut) + return false; return true; } @@ -520,42 +514,41 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf if (code->inst.OPCD == 31 && code->inst.SUBOP10 == 467) // mtspr code->outputCA = ((code->inst.SPRU << 5) | (code->inst.SPRL & 0x1F)) == SPR_XER; - int numOut = 0; - int numIn = 0; - int numFloatIn = 0; + code->regsIn = BitSet32(0); + code->regsOut = BitSet32(0); if (opinfo->flags & FL_OUT_A) { - code->regsOut[numOut++] = code->inst.RA; + code->regsOut[code->inst.RA] = true; block->m_gpa->SetOutputRegister(code->inst.RA, index); } if (opinfo->flags & FL_OUT_D) { - code->regsOut[numOut++] = code->inst.RD; + code->regsOut[code->inst.RD] = true; block->m_gpa->SetOutputRegister(code->inst.RD, index); } if (opinfo->flags & FL_OUT_S) { - code->regsOut[numOut++] = code->inst.RS; + code->regsOut[code->inst.RS] = true; block->m_gpa->SetOutputRegister(code->inst.RS, index); } if ((opinfo->flags & FL_IN_A) || ((opinfo->flags & FL_IN_A0) && code->inst.RA != 0)) { - code->regsIn[numIn++] = code->inst.RA; + code->regsIn[code->inst.RA] = true; 
block->m_gpa->SetInputRegister(code->inst.RA, index); } if (opinfo->flags & FL_IN_B) { - code->regsIn[numIn++] = code->inst.RB; + code->regsIn[code->inst.RB] = true; block->m_gpa->SetInputRegister(code->inst.RB, index); } if (opinfo->flags & FL_IN_C) { - code->regsIn[numIn++] = code->inst.RC; + code->regsIn[code->inst.RC] = true; block->m_gpa->SetInputRegister(code->inst.RC, index); } if (opinfo->flags & FL_IN_S) { - code->regsIn[numIn++] = code->inst.RS; + code->regsIn[code->inst.RS] = true; block->m_gpa->SetInputRegister(code->inst.RS, index); } @@ -564,24 +557,17 @@ void PPCAnalyzer::SetInstructionStats(CodeBlock *block, CodeOp *code, GekkoOPInf code->fregOut = code->inst.FD; else if (opinfo->flags & FL_OUT_FLOAT_S) code->fregOut = code->inst.FS; + code->fregsIn = BitSet32(0); if (opinfo->flags & FL_IN_FLOAT_A) - code->fregsIn[numFloatIn++] = code->inst.FA; + code->fregsIn[code->inst.FA] = true; if (opinfo->flags & FL_IN_FLOAT_B) - code->fregsIn[numFloatIn++] = code->inst.FB; + code->fregsIn[code->inst.FB] = true; if (opinfo->flags & FL_IN_FLOAT_C) - code->fregsIn[numFloatIn++] = code->inst.FC; + code->fregsIn[code->inst.FC] = true; if (opinfo->flags & FL_IN_FLOAT_D) - code->fregsIn[numFloatIn++] = code->inst.FD; + code->fregsIn[code->inst.FD] = true; if (opinfo->flags & FL_IN_FLOAT_S) - code->fregsIn[numFloatIn++] = code->inst.FS; - - // Set remaining register slots as unused (-1) - for (int j = numIn; j < 3; j++) - code->regsIn[j] = -1; - for (int j = numOut; j < 2; j++) - code->regsOut[j] = -1; - for (int j = numFloatIn; j < 4; j++) - code->fregsIn[j] = -1; + code->fregsIn[code->inst.FS] = true; switch (opinfo->type) { @@ -797,7 +783,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 // Scan for flag dependencies; assume the next block (or any branch that can leave the block) // wants flags, to be safe. 
bool wantsCR0 = true, wantsCR1 = true, wantsFPRF = true, wantsCA = true; - u32 fprInUse = 0, gprInUse = 0, gprInReg = 0, fprInXmm = 0; + BitSet32 fprInUse, gprInUse, gprInReg, fprInXmm; for (int i = block->m_num_instructions - 1; i >= 0; i--) { bool opWantsCR0 = code[i].wantsCR0; @@ -822,30 +808,20 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock *block, CodeBuffer *buffer, u32 code[i].fprInXmm = fprInXmm; // TODO: if there's no possible endblocks or exceptions in between, tell the regcache // we can throw away a register if it's going to be overwritten later. - for (int j = 0; j < 3; j++) - if (code[i].regsIn[j] >= 0) - { - gprInUse |= 1 << code[i].regsIn[j]; - gprInReg |= 1 << code[i].regsIn[j]; - } - for (int j = 0; j < 4; j++) - if (code[i].fregsIn[j] >= 0) - { - fprInUse |= 1 << code[i].fregsIn[j]; - if (strncmp(code[i].opinfo->opname, "stfd", 4)) - fprInXmm |= 1 << code[i].fregsIn[j]; - } + gprInUse |= code[i].regsIn; + gprInReg |= code[i].regsIn; + fprInUse |= code[i].fregsIn; + if (strncmp(code[i].opinfo->opname, "stfd", 4)) + fprInXmm |= code[i].fregsIn; // For now, we need to count output registers as "used" though; otherwise the flush // will result in a redundant store (e.g. store to regcache, then store again to // the same location later). 
- for (int j = 0; j < 2; j++) - if (code[i].regsOut[j] >= 0) - gprInUse |= 1 << code[i].regsOut[j]; + gprInUse |= code[i].regsOut; if (code[i].fregOut >= 0) { - fprInUse |= 1 << code[i].fregOut; + fprInUse[code[i].fregOut] = true; if (strncmp(code[i].opinfo->opname, "stfd", 4)) - fprInXmm |= 1 << code[i].fregOut; + fprInXmm[code[i].fregOut] = true; } } return address; diff --git a/Source/Core/Core/PowerPC/PPCAnalyst.h b/Source/Core/Core/PowerPC/PPCAnalyst.h index 4ae6e6ded3..8b3f4bc85a 100644 --- a/Source/Core/Core/PowerPC/PPCAnalyst.h +++ b/Source/Core/Core/PowerPC/PPCAnalyst.h @@ -10,6 +10,7 @@ #include #include +#include "Common/BitSet.h" #include "Common/CommonTypes.h" #include "Core/PowerPC/PPCTables.h" @@ -26,10 +27,10 @@ struct CodeOp //16B u32 address; u32 branchTo; //if 0, not a branch int branchToIndex; //index of target block - s8 regsOut[2]; - s8 regsIn[3]; + BitSet32 regsOut; + BitSet32 regsIn; + BitSet32 fregsIn; s8 fregOut; - s8 fregsIn[4]; bool isBranchTarget; bool wantsCR0; bool wantsCR1; @@ -43,13 +44,13 @@ struct CodeOp //16B bool canEndBlock; bool skip; // followed BL-s for example // which registers are still needed after this instruction in this block - u32 fprInUse; - u32 gprInUse; + BitSet32 fprInUse; + BitSet32 gprInUse; // just because a register is in use doesn't mean we actually need or want it in an x86 register. - u32 gprInReg; + BitSet32 gprInReg; // we do double stores from GPRs, so we don't want to load a PowerPC floating point register into // an XMM only to move it again to a GPR afterwards. - u32 fprInXmm; + BitSet32 fprInXmm; }; struct BlockStats