From ada8bbb3b49622e19deccb7358b1c804a766baab Mon Sep 17 00:00:00 2001
From: Exzap <13877693+Exzap@users.noreply.github.com>
Date: Fri, 14 Oct 2022 13:45:40 +0200
Subject: [PATCH] Linux/MacOS: Greatly improve performance (#370)

On libstdc++, std::unordered_set::clear() invokes memset on the entire
bucket array, which makes frequent clearing expensive even when the set
holds only a few entries. To get the best performance across all
platforms, this replaces the unordered_set with a custom
high-performance sparse bitset.
---
 src/Cafe/HW/Latte/Core/LatteBufferCache.cpp | 73 ++++++++++++++++++---
 1 file changed, 65 insertions(+), 8 deletions(-)

diff --git a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp
index a70ff888..1e2c43b1 100644
--- a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp
@@ -1005,8 +1005,67 @@ void LatteBufferCache_getStats(uint32& heapSize, uint32& allocationSize, uint32&
 }
 
 FSpinlock g_spinlockDCFlushQueue;
-std::unordered_set<uint32>* g_DCFlushQueue = new std::unordered_set<uint32>(); // queued pages
-std::unordered_set<uint32>* g_DCFlushQueueAlternate = new std::unordered_set<uint32>();
+
+class SparseBitset
+{
+	static inline constexpr size_t TABLE_MASK = 0xFF; // 256 buckets, selected by the low 8 bits of the index
+
+public:
+	bool Empty() const
+	{
+		return m_numNonEmptyVectors == 0;
+	}
+
+	void Set(uint32 index)
+	{
+		auto& v = m_bits[index & TABLE_MASK];
+		if (std::find(v.cbegin(), v.cend(), index) != v.cend())
+			return; // already queued
+		if (v.empty())
+		{
+			m_nonEmptyVectors[m_numNonEmptyVectors] = &v;
+			m_numNonEmptyVectors++;
+		}
+		v.emplace_back(index);
+	}
+
+	template<typename TFunc>
+	void ForAllAndClear(TFunc callbackFunc)
+	{
+		auto vCurrent = m_nonEmptyVectors + 0;
+		auto vEnd = m_nonEmptyVectors + m_numNonEmptyVectors;
+		while (vCurrent < vEnd)
+		{
+			std::vector<uint32>* vec = *vCurrent;
+			vCurrent++;
+			for (const auto& it : *vec)
+				callbackFunc(it);
+			vec->clear();
+		}
+		m_numNonEmptyVectors = 0;
+	}
+
+	void Clear()
+	{
+		auto vCurrent = m_nonEmptyVectors + 0;
+		auto vEnd = m_nonEmptyVectors + m_numNonEmptyVectors;
+		while (vCurrent < vEnd)
+		{
+			std::vector<uint32>* vec = *vCurrent;
+			vCurrent++;
+			vec->clear();
+		}
+		m_numNonEmptyVectors = 0;
+	}
+
+private:
+	std::vector<uint32> m_bits[TABLE_MASK + 1]; // per-bucket lists of queued page indices
+	std::vector<uint32>* m_nonEmptyVectors[TABLE_MASK + 1]; // dense list of buckets that currently hold entries
+	size_t m_numNonEmptyVectors{ 0 };
+};
+
+SparseBitset* s_DCFlushQueue = new SparseBitset();
+SparseBitset* s_DCFlushQueueAlternate = new SparseBitset();
 
 void LatteBufferCache_notifyDCFlush(MPTR address, uint32 size)
 {
@@ -1017,20 +1076,18 @@ void LatteBufferCache_notifyDCFlush(MPTR address, uint32 size)
 	uint32 lastPage = (address + size - 1) / CACHE_PAGE_SIZE;
 	g_spinlockDCFlushQueue.acquire();
 	for (uint32 i = firstPage; i <= lastPage; i++)
-		g_DCFlushQueue->emplace(i);
+		s_DCFlushQueue->Set(i);
 	g_spinlockDCFlushQueue.release();
 }
 
 void LatteBufferCache_processDCFlushQueue()
 {
-	if (g_DCFlushQueue->empty()) // accessing this outside of the lock is technically undefined/unsafe behavior but on all known implementations this is fine and we can avoid the spinlock
+	if (s_DCFlushQueue->Empty()) // quick check to avoid locking if there is no work to do
 		return;
 	g_spinlockDCFlushQueue.acquire();
-	std::swap(g_DCFlushQueue, g_DCFlushQueueAlternate);
+	std::swap(s_DCFlushQueue, s_DCFlushQueueAlternate);
 	g_spinlockDCFlushQueue.release();
-	for (auto& itr : *g_DCFlushQueueAlternate)
-		LatteBufferCache_invalidatePage(itr * CACHE_PAGE_SIZE);
-	g_DCFlushQueueAlternate->clear();
+	s_DCFlushQueueAlternate->ForAllAndClear([](uint32 index) { LatteBufferCache_invalidatePage(index * CACHE_PAGE_SIZE); });
 }
 
 void LatteBufferCache_notifyDrawDone()
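
For reference, the pattern above can be exercised as a standalone program. The
sketch below is a compilable approximation, not engine code: std::mutex stands
in for Cemu's FSpinlock, uint32_t for its uint32 typedef, the g_queue globals
mirror s_DCFlushQueue/s_DCFlushQueueAlternate, and the main() driver is
invented purely for illustration.

// sparse_bitset_demo.cpp - standalone sketch of the SparseBitset pattern
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <mutex>
#include <utility>
#include <vector>

class SparseBitset
{
	static constexpr size_t TABLE_MASK = 0xFF; // 256 buckets

public:
	bool Empty() const { return m_numNonEmptyVectors == 0; }

	void Set(uint32_t index)
	{
		auto& v = m_bits[index & TABLE_MASK]; // bucket chosen by the low 8 bits
		if (std::find(v.cbegin(), v.cend(), index) != v.cend())
			return; // deduplicate, as unordered_set::emplace did
		if (v.empty())
			m_nonEmptyVectors[m_numNonEmptyVectors++] = &v; // first entry registers the bucket
		v.emplace_back(index);
	}

	template<typename TFunc>
	void ForAllAndClear(TFunc callbackFunc)
	{
		// Visit only the buckets known to be non-empty, so the cost scales
		// with the number of queued entries rather than the table size.
		for (size_t i = 0; i < m_numNonEmptyVectors; i++)
		{
			for (uint32_t entry : *m_nonEmptyVectors[i])
				callbackFunc(entry);
			m_nonEmptyVectors[i]->clear(); // keeps capacity for the next round
		}
		m_numNonEmptyVectors = 0;
	}

private:
	std::vector<uint32_t> m_bits[TABLE_MASK + 1];
	std::vector<uint32_t>* m_nonEmptyVectors[TABLE_MASK + 1];
	size_t m_numNonEmptyVectors{ 0 };
};

std::mutex g_queueLock;                              // stand-in for FSpinlock
SparseBitset* g_queue = new SparseBitset();          // mirrors s_DCFlushQueue
SparseBitset* g_queueAlternate = new SparseBitset(); // mirrors s_DCFlushQueueAlternate

int main()
{
	// Producer side (cf. LatteBufferCache_notifyDCFlush): queue pages under the lock.
	{
		std::lock_guard<std::mutex> lock(g_queueLock);
		for (uint32_t page : {7u, 300u, 7u, 4096u}) // the duplicate 7 is dropped by Set()
			g_queue->Set(page);
	}
	// Consumer side (cf. LatteBufferCache_processDCFlushQueue): swap the
	// buffers under the lock, then drain without holding it.
	if (!g_queue->Empty())
	{
		{
			std::lock_guard<std::mutex> lock(g_queueLock);
			std::swap(g_queue, g_queueAlternate);
		}
		g_queueAlternate->ForAllAndClear([](uint32_t page)
			{ std::printf("flush page %u\n", static_cast<unsigned>(page)); });
	}
	return 0;
}

The swap-then-drain shape keeps the lock held only for Set() calls and one
pointer swap; the per-page callbacks run unlocked. Since the cleared vectors
retain their capacity, steady-state frames also allocate nothing, whereas
libstdc++'s unordered_set::clear() memsets its entire bucket array no matter
how few pages were queued.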