From ada8bbb3b49622e19deccb7358b1c804a766baab Mon Sep 17 00:00:00 2001
From: Exzap <13877693+Exzap@users.noreply.github.com>
Date: Fri, 14 Oct 2022 13:45:40 +0200
Subject: [PATCH] Linux/MacOS: Greatly improve performance (#370)

On libstdc++, std::unordered_set::clear() invokes memset on the entire
bucket array, which makes frequent clearing expensive even when the set
holds only a few entries. To get the best performance across all
platforms, this replaces the unordered_set with a custom
high-performance sparse bitset.
---
 src/Cafe/HW/Latte/Core/LatteBufferCache.cpp | 73 ++++++++++++++++++---
 1 file changed, 65 insertions(+), 8 deletions(-)

diff --git a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp
index a70ff888..1e2c43b1 100644
--- a/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteBufferCache.cpp
@@ -1005,8 +1005,67 @@ void LatteBufferCache_getStats(uint32& heapSize, uint32& allocationSize, uint32&
 }
 
 FSpinlock g_spinlockDCFlushQueue;
-std::unordered_set<uint32>* g_DCFlushQueue = new std::unordered_set<uint32>(); // queued pages
-std::unordered_set<uint32>* g_DCFlushQueueAlternate = new std::unordered_set<uint32>();
+
+class SparseBitset
+{
+	static inline constexpr size_t TABLE_MASK = 0xFF; // 256 buckets, selected by the low 8 bits of the index
+
+public:
+	bool Empty() const
+	{
+		return m_numNonEmptyVectors == 0;
+	}
+
+	void Set(uint32 index)
+	{
+		auto& v = m_bits[index & TABLE_MASK];
+		if (std::find(v.cbegin(), v.cend(), index) != v.cend())
+			return; // already queued
+		if (v.empty())
+		{
+			m_nonEmptyVectors[m_numNonEmptyVectors] = &v;
+			m_numNonEmptyVectors++;
+		}
+		v.emplace_back(index);
+	}
+
+	template<typename TFunc>
+	void ForAllAndClear(TFunc callbackFunc)
+	{
+		auto vCurrent = m_nonEmptyVectors + 0;
+		auto vEnd = m_nonEmptyVectors + m_numNonEmptyVectors;
+		while (vCurrent < vEnd)
+		{
+			std::vector<uint32>* vec = *vCurrent;
+			vCurrent++;
+			for (const auto& it : *vec)
+				callbackFunc(it);
+			vec->clear();
+		}
+		m_numNonEmptyVectors = 0;
+	}
+
+	void Clear()
+	{
+		auto vCurrent = m_nonEmptyVectors + 0;
+		auto vEnd = m_nonEmptyVectors + m_numNonEmptyVectors;
+		while (vCurrent < vEnd)
+		{
+			std::vector<uint32>* vec = *vCurrent;
+			vCurrent++;
+			vec->clear();
+		}
+		m_numNonEmptyVectors = 0;
+	}
+
+private:
+	std::vector<uint32> m_bits[TABLE_MASK + 1]; // per-bucket lists of queued page indices
+	std::vector<uint32>* m_nonEmptyVectors[TABLE_MASK + 1]; // dense list of buckets that currently hold entries
+	size_t m_numNonEmptyVectors{ 0 };
+};
+
+SparseBitset* s_DCFlushQueue = new SparseBitset();
+SparseBitset* s_DCFlushQueueAlternate = new SparseBitset();
 
 void LatteBufferCache_notifyDCFlush(MPTR address, uint32 size)
 {
@@ -1017,20 +1076,18 @@ void LatteBufferCache_notifyDCFlush(MPTR address, uint32 size)
 	uint32 lastPage = (address + size - 1) / CACHE_PAGE_SIZE;
 	g_spinlockDCFlushQueue.acquire();
 	for (uint32 i = firstPage; i <= lastPage; i++)
-		g_DCFlushQueue->emplace(i);
+		s_DCFlushQueue->Set(i);
 	g_spinlockDCFlushQueue.release();
 }
 
 void LatteBufferCache_processDCFlushQueue()
 {
-	if (g_DCFlushQueue->empty()) // accessing this outside of the lock is technically undefined/unsafe behavior but on all known implementations this is fine and we can avoid the spinlock
+	if (s_DCFlushQueue->Empty()) // quick check to avoid locking if there is no work to do
 		return;
 	g_spinlockDCFlushQueue.acquire();
-	std::swap(g_DCFlushQueue, g_DCFlushQueueAlternate);
+	std::swap(s_DCFlushQueue, s_DCFlushQueueAlternate);
 	g_spinlockDCFlushQueue.release();
-	for (auto& itr : *g_DCFlushQueueAlternate)
-		LatteBufferCache_invalidatePage(itr * CACHE_PAGE_SIZE);
-	g_DCFlushQueueAlternate->clear();
+	s_DCFlushQueueAlternate->ForAllAndClear([](uint32 index) { LatteBufferCache_invalidatePage(index * CACHE_PAGE_SIZE); });
 }
 
 void LatteBufferCache_notifyDrawDone()
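
For reference, the pattern above can be exercised as a standalone program. The
sketch below is a compilable approximation, not engine code: std::mutex stands
in for Cemu's FSpinlock, uint32_t for its uint32 typedef, the g_queue globals
mirror s_DCFlushQueue/s_DCFlushQueueAlternate, and the main() driver is
invented purely for illustration.

// sparse_bitset_demo.cpp - standalone sketch of the SparseBitset pattern
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <mutex>
#include <utility>
#include <vector>

class SparseBitset
{
	static constexpr size_t TABLE_MASK = 0xFF; // 256 buckets

public:
	bool Empty() const { return m_numNonEmptyVectors == 0; }

	void Set(uint32_t index)
	{
		auto& v = m_bits[index & TABLE_MASK]; // bucket chosen by the low 8 bits
		if (std::find(v.cbegin(), v.cend(), index) != v.cend())
			return; // deduplicate, as unordered_set::emplace did
		if (v.empty())
			m_nonEmptyVectors[m_numNonEmptyVectors++] = &v; // first entry registers the bucket
		v.emplace_back(index);
	}

	template<typename TFunc>
	void ForAllAndClear(TFunc callbackFunc)
	{
		// Visit only the buckets known to be non-empty, so the cost scales
		// with the number of queued entries rather than the table size.
		for (size_t i = 0; i < m_numNonEmptyVectors; i++)
		{
			for (uint32_t entry : *m_nonEmptyVectors[i])
				callbackFunc(entry);
			m_nonEmptyVectors[i]->clear(); // keeps capacity for the next round
		}
		m_numNonEmptyVectors = 0;
	}

private:
	std::vector<uint32_t> m_bits[TABLE_MASK + 1];
	std::vector<uint32_t>* m_nonEmptyVectors[TABLE_MASK + 1];
	size_t m_numNonEmptyVectors{ 0 };
};

std::mutex g_queueLock;                              // stand-in for FSpinlock
SparseBitset* g_queue = new SparseBitset();          // mirrors s_DCFlushQueue
SparseBitset* g_queueAlternate = new SparseBitset(); // mirrors s_DCFlushQueueAlternate

int main()
{
	// Producer side (cf. LatteBufferCache_notifyDCFlush): queue pages under the lock.
	{
		std::lock_guard<std::mutex> lock(g_queueLock);
		for (uint32_t page : {7u, 300u, 7u, 4096u}) // the duplicate 7 is dropped by Set()
			g_queue->Set(page);
	}
	// Consumer side (cf. LatteBufferCache_processDCFlushQueue): swap the
	// buffers under the lock, then drain without holding it.
	if (!g_queue->Empty())
	{
		{
			std::lock_guard<std::mutex> lock(g_queueLock);
			std::swap(g_queue, g_queueAlternate);
		}
		g_queueAlternate->ForAllAndClear([](uint32_t page)
			{ std::printf("flush page %u\n", static_cast<unsigned>(page)); });
	}
	return 0;
}

The swap-then-drain shape keeps the lock held only for Set() calls and one
pointer swap; the per-page callbacks run unlocked. Since the cleared vectors
retain their capacity, steady-state frames also allocate nothing, whereas
libstdc++'s unordered_set::clear() memsets its entire bucket array no matter
how few pages were queued.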