Latte: Implement better index caching (#1443)

2025-03-10 20:37:37 +01:00 · 2025-01-12 12:39:02 +01:00 · 2025-01-12 12:39:02 +01:00 · 8dd809d725
commit 8dd809d725
parent 1923b7a7c4
16 changed files with 526 additions and 191 deletions
--- a/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp
@ -141,6 +141,14 @@ private:

 void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx);

+// Called whenever the GPU runs out of commands or hits a wait condition (WAIT_REG_MEM, semaphore waits, HLE scanbuffer swap / wait-for-flip).
+void LatteCP_signalEnterWait()
+{
+	// Assumption: games do not swap out index buffer data in the middle of an uninterrupted sequence of drawcalls,
+	// so the index cache is only flushed here, i.e. when the GPU goes idle or has to wait for an operation.
+	LatteIndices_invalidateAll();
+}
+
 /*
 * Read a U32 from the command buffer
 * If no data is available then wait in a busy loop
@ -466,6 +474,8 @@ LatteCMDPtr LatteCP_itWaitRegMem(LatteCMDPtr cmd, uint32 nWords)
 	const uint32 GPU7_WAIT_MEM_OP_GREATER = 6;
 	const uint32 GPU7_WAIT_MEM_OP_NEVER = 7;

+	LatteCP_signalEnterWait();
+
 	bool stalls = false;
 	if ((word0 & 0x10) != 0)
 	{
@ -594,6 +604,7 @@ LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords)
 	else if(SEM_SIGNAL == 7)
 	{
 		// wait
+		LatteCP_signalEnterWait();
 		size_t loopCount = 0;
 		while (true)
 		{
@ -1305,11 +1316,13 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
 				}
 				case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
 				{
+					LatteCP_signalEnterWait();
 					LatteCP_itHLESwapScanBuffer(cmdData, nWords);
 					break;
 				}
 				case IT_HLE_WAIT_FOR_FLIP:
 				{
+					LatteCP_signalEnterWait();
 					LatteCP_itHLEWaitForFlip(cmdData, nWords);
 					break;
 				}
@ -1594,12 +1607,14 @@ void LatteCP_ProcessRingbuffer()
 			}
 			case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
 			{
+				LatteCP_signalEnterWait();
 				LatteCP_itHLESwapScanBuffer(cmd, nWords);
 				timerRecheck += CP_TIMER_RECHECK / 64;
 				break;
 			}
 			case IT_HLE_WAIT_FOR_FLIP:
 			{
+				LatteCP_signalEnterWait();
 				LatteCP_itHLEWaitForFlip(cmd, nWords);
 				timerRecheck += CP_TIMER_RECHECK / 1;
 				break;
--- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp
@ -1,6 +1,7 @@
 #include "Cafe/HW/Latte/Core/LatteConst.h"
 #include "Cafe/HW/Latte/Renderer/Renderer.h"
 #include "Cafe/HW/Latte/ISA/RegDefines.h"
+#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
 #include "Common/cpu_features.h"

 #if defined(ARCH_X86_64) && defined(__GNUC__)
@ -9,32 +10,53 @@

 struct  
 {
-	const void* lastPtr;
-	uint32 lastCount;
-	LattePrimitiveMode lastPrimitiveMode;
-	LatteIndexType lastIndexType;
-	// output
-	uint32 indexMin;
-	uint32 indexMax;
-	Renderer::INDEX_TYPE renderIndexType;
-	uint32 outputCount;
-	uint32 indexBufferOffset;
-	uint32 indexBufferIndex;
+	struct CacheEntry
+	{
+		// input data
+		const void* lastPtr;
+		uint32 lastCount;
+		LattePrimitiveMode lastPrimitiveMode;
+		LatteIndexType lastIndexType;
+		uint64 lastUsed;
+		// output
+		uint32 indexMin;
+		uint32 indexMax;
+		Renderer::INDEX_TYPE renderIndexType;
+		uint32 outputCount;
+		Renderer::IndexAllocation indexAllocation;
+	};
+	std::array<CacheEntry, 8> entry;
+	uint64 currentUsageCounter{0};
 }LatteIndexCache{};

 void LatteIndices_invalidate(const void* memPtr, uint32 size)
 {
-	if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) )
+	for(auto& entry : LatteIndexCache.entry)
 	{
-		LatteIndexCache.lastPtr = nullptr;
-		LatteIndexCache.lastCount = 0;
+		if (entry.lastPtr >= memPtr && (entry.lastPtr < ((uint8*)memPtr + size)) )
+		{
+			if(entry.lastPtr != nullptr)
+				g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
+			entry.lastPtr = nullptr;
+			entry.lastCount = 0;
+		}
 	}
 }

 void LatteIndices_invalidateAll()
 {
-	LatteIndexCache.lastPtr = nullptr;
-	LatteIndexCache.lastCount = 0;
+	for(auto& entry : LatteIndexCache.entry)
+	{
+		if (entry.lastPtr != nullptr)
+			g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
+		entry.lastPtr = nullptr;
+		entry.lastCount = 0;
+	}
+}
+
+uint64 LatteIndices_GetNextUsageIndex()
+{
+	return LatteIndexCache.currentUsageCounter++;
 }

 uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count)
@ -532,7 +554,7 @@ void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIn
 	}
 }

-void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex)
+void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation)
 {
 	// what this should do:
 	// [x] use fast SIMD-based index decoding
@ -542,17 +564,18 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
 	// [ ] better cache implementation, allow to cache across frames

 	// reuse from cache if data didn't change
-	if (LatteIndexCache.lastPtr == indexData &&
-		LatteIndexCache.lastCount == count &&
-		LatteIndexCache.lastPrimitiveMode == primitiveMode &&
-		LatteIndexCache.lastIndexType == indexType)
+	auto cacheEntry = std::find_if(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [indexData, count, primitiveMode, indexType](const auto& entry)
 	{
-		indexMin = LatteIndexCache.indexMin;
-		indexMax = LatteIndexCache.indexMax;
-		renderIndexType = LatteIndexCache.renderIndexType;
-		outputCount = LatteIndexCache.outputCount;
-		indexBufferOffset = LatteIndexCache.indexBufferOffset;
-		indexBufferIndex = LatteIndexCache.indexBufferIndex;
+		return entry.lastPtr == indexData && entry.lastCount == count && entry.lastPrimitiveMode == primitiveMode && entry.lastIndexType == indexType;
+	});
+	if (cacheEntry != LatteIndexCache.entry.end())
+	{
+		indexMin = cacheEntry->indexMin;
+		indexMax = cacheEntry->indexMax;
+		renderIndexType = cacheEntry->renderIndexType;
+		outputCount = cacheEntry->outputCount;
+		indexAllocation = cacheEntry->indexAllocation;
+		cacheEntry->lastUsed = LatteIndices_GetNextUsageIndex();
 		return;
 	}

@ -576,10 +599,12 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
 		indexMin = 0;
 		indexMax = std::max(count, 1u)-1;
 		renderIndexType = Renderer::INDEX_TYPE::NONE;
+		indexAllocation = {};
 		return; // no indices
 	}
 	// query index buffer from renderer
-	void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex);
+	indexAllocation = g_renderer->indexData_reserveIndexMemory(indexOutputSize);
+	void* indexOutputPtr = indexAllocation.mem;

 	// decode indices
 	indexMin = std::numeric_limits<uint32>::max();
@ -704,16 +729,25 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
 		// recalculate index range but filter out primitive restart index
 		LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax);
 	}
-	g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize);
+	g_renderer->indexData_uploadIndexMemory(indexAllocation);
+	performanceMonitor.cycle[performanceMonitor.cycleIndex].indexDataUploaded += indexOutputSize;
+	// get least recently used cache entry
+	auto lruEntry = std::min_element(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [](const auto& a, const auto& b)
+	{
+		return a.lastUsed < b.lastUsed;
+	});
+	// invalidate previous allocation
+	if(lruEntry->lastPtr != nullptr)
+		g_renderer->indexData_releaseIndexMemory(lruEntry->indexAllocation);
 	// update cache
-	LatteIndexCache.lastPtr = indexData;
-	LatteIndexCache.lastCount = count;
-	LatteIndexCache.lastPrimitiveMode = primitiveMode;
-	LatteIndexCache.lastIndexType = indexType;
-	LatteIndexCache.indexMin = indexMin;
-	LatteIndexCache.indexMax = indexMax;
-	LatteIndexCache.renderIndexType = renderIndexType;
-	LatteIndexCache.outputCount = outputCount;
-	LatteIndexCache.indexBufferOffset = indexBufferOffset;
-	LatteIndexCache.indexBufferIndex = indexBufferIndex;
+	lruEntry->lastPtr = indexData;
+	lruEntry->lastCount = count;
+	lruEntry->lastPrimitiveMode = primitiveMode;
+	lruEntry->lastIndexType = indexType;
+	lruEntry->indexMin = indexMin;
+	lruEntry->indexMax = indexMax;
+	lruEntry->renderIndexType = renderIndexType;
+	lruEntry->outputCount = outputCount;
+	lruEntry->indexAllocation = indexAllocation;
+	lruEntry->lastUsed = LatteIndices_GetNextUsageIndex();
 }
--- a/src/Cafe/HW/Latte/Core/LatteIndices.h
+++ b/src/Cafe/HW/Latte/Core/LatteIndices.h
@ -4,4 +4,4 @@

 void LatteIndices_invalidate(const void* memPtr, uint32 size);
 void LatteIndices_invalidateAll();
-void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex);
+void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation);
--- a/src/Cafe/HW/Latte/Core/LatteOverlay.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteOverlay.cpp
@ -107,7 +107,13 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
 				ImGui::Text("VRAM: %dMB / %dMB", g_state.vramUsage, g_state.vramTotal);

 			if (config.overlay.debug)
+			{
+				// general debug info
+				ImGui::Text("--- Debug info ---");
+				ImGui::Text("IndexUploadPerFrame: %dKB", (performanceMonitor.stats.indexDataUploadPerFrame+1023)/1024);
+				// backend specific info
 				g_renderer->AppendOverlayDebugInfo();
+			}

 			position.y += (ImGui::GetWindowSize().y + 10.0f) * direction;
 		}
--- a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp
+++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp
@ -74,7 +74,6 @@ void LattePerformanceMonitor_frameEnd()
 		uniformBankDataUploadedPerFrame /= 1024ULL;
 		uint32 uniformBankCountUploadedPerFrame = (uint32)(uniformBankUploadedCount / (uint64)elapsedFrames);
 		uint64 indexDataUploadPerFrame = (indexDataUploaded / (uint64)elapsedFrames);
-		indexDataUploadPerFrame /= 1024ULL;

 		double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
 		uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;
@ -82,7 +81,7 @@ void LattePerformanceMonitor_frameEnd()
 		uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
 		uint32 tlps = (uint32)((uint64)threadLeaveCount * 1000ULL / (uint64)totalElapsedTime);
 		// set stats
-
+		performanceMonitor.stats.indexDataUploadPerFrame = indexDataUploadPerFrame;
 		// next counter cycle
 		sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
 		performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;
--- a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h
+++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h
@ -132,6 +132,12 @@ typedef struct
 		LattePerfStatCounter numDrawBarriersPerFrame;
 		LattePerfStatCounter numBeginRenderpassPerFrame;
 	}vk;
+
+	// calculated stats (per frame)
+	struct
+	{
+		uint32 indexDataUploadPerFrame;
+	}stats;
 }performanceMonitor_t;

 extern performanceMonitor_t performanceMonitor;
--- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp
@ -11,7 +11,6 @@
 #include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
 #include "Cafe/GraphicPack/GraphicPack2.h"
 #include "config/ActiveSettings.h"
-#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h"
 #include "gui/guiWrapper.h"
 #include "Cafe/OS/libs/erreula/erreula.h"
 #include "input/InputManager.h"
--- a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h
+++ b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h
@ -102,16 +102,21 @@ public:
 	static void SetAttributeArrayState(uint32 index, bool isEnabled, sint32 aluDivisor);
 	static void SetArrayElementBuffer(GLuint arrayElementBuffer);

-	// index
-	void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override
+	// index (not used by OpenGL renderer yet)
+	IndexAllocation indexData_reserveIndexMemory(uint32 size) override
 	{
-		assert_dbg();
-		return nullptr;
+		cemu_assert_unimplemented();
+		return {};
 	}

-	void indexData_uploadIndexMemory(uint32 offset, uint32 size) override
+	void indexData_releaseIndexMemory(IndexAllocation& allocation) override
 	{
-		assert_dbg();
+		cemu_assert_unimplemented();
+	}
+
+	void indexData_uploadIndexMemory(IndexAllocation& allocation) override
+	{
+		cemu_assert_unimplemented();
 	}

 	// uniform
--- a/src/Cafe/HW/Latte/Renderer/Renderer.h
+++ b/src/Cafe/HW/Latte/Renderer/Renderer.h
@ -138,8 +138,15 @@ public:
 	virtual void draw_endSequence() = 0;

 	// index
-	virtual void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) = 0;
-	virtual void indexData_uploadIndexMemory(uint32 offset, uint32 size) = 0;
+	struct IndexAllocation
+	{
+		void* mem; // pointer to index data inside buffer
+		void* rendererInternal; // for renderer use
+	};
+
+	virtual IndexAllocation indexData_reserveIndexMemory(uint32 size) = 0;
+	virtual void indexData_releaseIndexMemory(IndexAllocation& allocation) = 0;
+	virtual void indexData_uploadIndexMemory(IndexAllocation& allocation) = 0;

 	// occlusion queries
 	virtual LatteQueryObject* occlusionQuery_create() = 0;
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp
@ -23,11 +23,11 @@ void VKRSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeReq
 	AllocatorBuffer_t newBuffer{};
 	newBuffer.writeIndex = 0;
 	newBuffer.basePtr = nullptr;
-	if (m_bufferType == BUFFER_TYPE::STAGING)
+	if (m_bufferType == VKR_BUFFER_TYPE::STAGING)
 		m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
-	else if (m_bufferType == BUFFER_TYPE::INDEX)
+	else if (m_bufferType == VKR_BUFFER_TYPE::INDEX)
 		m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
-	else if (m_bufferType == BUFFER_TYPE::STRIDE)
+	else if (m_bufferType == VKR_BUFFER_TYPE::STRIDE)
 		m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
 	else
 		cemu_assert_debug(false);
@ -53,7 +53,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato
 		uint32 distanceToSyncPoint;
 		if (!itr.queue_syncPoints.empty())
 		{
-			if(itr.queue_syncPoints.front().offset < itr.writeIndex)
+			if (itr.queue_syncPoints.front().offset < itr.writeIndex)
 				distanceToSyncPoint = 0xFFFFFFFF;
 			else
 				distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex;
@ -100,7 +100,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato

 void VKRSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation)
 {
-	cemu_assert_debug(m_bufferType == BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent
+	cemu_assert_debug(m_bufferType == VKR_BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent
 	// todo - use nonCoherentAtomSize for flush size (instead of hardcoded constant)
 	VkMappedMemoryRange flushedRange{};
 	flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
@ -167,6 +167,70 @@ void VKRSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBuf
 	}
 }

+/* VKRSynchronizedHeapAllocator */
+
+VKRSynchronizedHeapAllocator::VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize)
+	: m_vkrMemMgr(vkMemoryManager), m_chunkedHeap(bufferType, minimumBufferAllocSize) {};
+
+VKRSynchronizedHeapAllocator::AllocatorReservation* VKRSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment) // reserves a range in the chunked heap and returns a pooled reservation describing it
+{
+	CHAddr addr = m_chunkedHeap.alloc(size, alignment); // NOTE(review): result is used unchecked - confirm how ChunkedHeap::alloc reports failure
+	m_activeAllocations.emplace_back(addr); // tracked so FreeReservation() can find the allocation again by chunk index + offset
+	AllocatorReservation* res = m_poolAllocatorReservation.allocObj();
+	res->bufferIndex = addr.chunkIndex;
+	res->bufferOffset = addr.offset;
+	res->size = size;
+	res->memPtr = m_chunkedHeap.GetChunkPtr(addr.chunkIndex) + addr.offset; // GetChunkPtr() yields nullptr for an out-of-range chunk index; presumably cannot happen for a fresh allocation - verify
+	m_chunkedHeap.GetChunkVkMemInfo(addr.chunkIndex, res->vkBuffer, res->vkMem); // fills in the VkBuffer / VkDeviceMemory of the chunk that holds the allocation
+	return res; // released again via FreeReservation()
+}
+
+// Releases a reservation previously returned by AllocateBufferMemory().
+// The heap range is not freed immediately: it is queued under the id of the current
+// command buffer and only handed back to the heap by CleanupBuffer() once that
+// command buffer is reported as finished.
+// A reservation that is not tracked as active (unknown or already released) is ignored.
+void VKRSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation)
+{
+	// locate the tracked allocation by chunk index + offset
+	auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [uploadReservation](const TrackedAllocation& allocation) {
+		return allocation.allocation.chunkIndex == uploadReservation->bufferIndex && allocation.allocation.offset == uploadReservation->bufferOffset;
+	});
+	if (it == m_activeAllocations.end())
+	{
+		// the debug assert alone does not protect non-debug builds: dereferencing/erasing end() is undefined behaviour
+		cemu_assert_debug(false);
+		return;
+	}
+	// put the allocation on a delayed release queue for the current command buffer
+	uint64 currentCommandBufferId = VulkanRenderer::GetInstance()->GetCurrentCommandBufferId();
+	m_releaseQueue[currentCommandBufferId].emplace_back(it->allocation);
+	m_activeAllocations.erase(it);
+	m_poolAllocatorReservation.freeObj(uploadReservation);
+}
+
+// Flushes the mapped memory range of a reservation, but only if the chunk it lives in requires explicit flushes.
+void VKRSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation)
+{
+	// chunks created with host-coherent memory do not need a flush
+	if (!m_chunkedHeap.RequiresFlush(uploadReservation->bufferIndex))
+		return;
+	VkMappedMemoryRange rangeToFlush{};
+	rangeToFlush.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
+	rangeToFlush.memory = uploadReservation->vkMem;
+	rangeToFlush.offset = uploadReservation->bufferOffset;
+	rangeToFlush.size = uploadReservation->size;
+	vkFlushMappedMemoryRanges(VulkanRenderer::GetInstance()->GetLogicalDevice(), 1, &rangeToFlush);
+}
+
+// Returns all queued allocations to the heap whose command buffer id is <= latestFinishedCommandBufferId.
+void VKRSynchronizedHeapAllocator::CleanupBuffer(uint64 latestFinishedCommandBufferId)
+{
+	for (auto queueIt = m_releaseQueue.begin(); queueIt != m_releaseQueue.end();)
+	{
+		if (queueIt->first > latestFinishedCommandBufferId)
+		{
+			// command buffer not reported as finished yet, keep its allocations queued
+			queueIt++;
+			continue;
+		}
+		// release allocations
+		for (auto& releasedAddr : queueIt->second)
+			m_chunkedHeap.free(releasedAddr);
+		queueIt = m_releaseQueue.erase(queueIt);
+	}
+}
+
+void VKRSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
+{
+	m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize);
+}
+
 /* VkTextureChunkedHeap */

 uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
@ -175,7 +239,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 	m_list_chunkInfo.resize(m_list_chunkInfo.size() + 1);

 	// pad minimumAllocationSize to 32KB alignment
-	minimumAllocationSize = (minimumAllocationSize + (32*1024-1)) & ~(32 * 1024 - 1);
+	minimumAllocationSize = (minimumAllocationSize + (32 * 1024 - 1)) & ~(32 * 1024 - 1);

 	uint32 allocationSize = 1024 * 1024 * 128;
 	if (chunkIndex == 0)
@ -189,8 +253,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 	std::vector<uint32> deviceLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
 	std::vector<uint32> hostLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, 0);
 	// remove device local memory types from host local vector
-	auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) ->bool
-	{
+	auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) -> bool {
 		return std::find(deviceLocalMemoryTypeIndices.begin(), deviceLocalMemoryTypeIndices.end(), v) != deviceLocalMemoryTypeIndices.end();
 	};
 	hostLocalMemoryTypeIndices.erase(std::remove_if(hostLocalMemoryTypeIndices.begin(), hostLocalMemoryTypeIndices.end(), pred), hostLocalMemoryTypeIndices.end());
@ -206,7 +269,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 			allocInfo.memoryTypeIndex = memType;

 			VkDeviceMemory imageMemory;
-			VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory);
+			VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory);
 			if (r != VK_SUCCESS)
 				continue;
 			m_list_chunkInfo[chunkIndex].mem = imageMemory;
@ -221,7 +284,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 			allocInfo.memoryTypeIndex = memType;

 			VkDeviceMemory imageMemory;
-			VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory);
+			VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory);
 			if (r != VK_SUCCESS)
 				continue;
 			m_list_chunkInfo[chunkIndex].mem = imageMemory;
@ -238,6 +301,68 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 	return 0;
 }

+/* VkBufferChunkedHeap */
+
+// Creates a Vulkan buffer of the given type together with its backing memory.
+// bufferType selects the buffer usage flag (transfer source, index or vertex buffer),
+// properties are the requested memory property flags.
+// If the memory is host visible it is mapped for the lifetime of the returned object.
+// Returns nullptr if the buffer type is unknown, the allocation fails or the mapping fails.
+VKRBuffer* VKRBuffer::Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties)
+{
+	auto* memMgr = VulkanRenderer::GetInstance()->GetMemoryManager();
+	VkBuffer buffer = VK_NULL_HANDLE;
+	VkDeviceMemory bufferMemory = VK_NULL_HANDLE;
+	// must start out false: the debug assert in the final else branch is the only guard against an unknown type
+	bool allocSuccess = false;
+	if (bufferType == VKR_BUFFER_TYPE::STAGING)
+		allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, properties, buffer, bufferMemory);
+	else if (bufferType == VKR_BUFFER_TYPE::INDEX)
+		allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, properties, buffer, bufferMemory);
+	else if (bufferType == VKR_BUFFER_TYPE::STRIDE)
+		allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, properties, buffer, bufferMemory);
+	else
+		cemu_assert_debug(false);
+	if (!allocSuccess)
+		return nullptr;
+
+	VKRBuffer* bufferObj = new VKRBuffer(buffer, bufferMemory);
+	// the constructor does not set this member, give it a defined value before anything can read it
+	bufferObj->m_mappedMemory = nullptr;
+	// if host visible, then map buffer
+	if (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
+	{
+		void* data = nullptr;
+		VkResult mapResult = vkMapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), bufferMemory, 0, bufferSize, 0, &data);
+		if (mapResult != VK_SUCCESS)
+		{
+			// report the failure like an allocation failure instead of handing out a buffer without a usable pointer
+			delete bufferObj; // destructor releases the memory and the buffer
+			return nullptr;
+		}
+		bufferObj->m_mappedMemory = (uint8*)data;
+		bufferObj->m_requiresFlush = !HAS_FLAG(properties, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
+	}
+	return bufferObj;
+}
+
+VKRBuffer::~VKRBuffer() // unmaps (if mapped) and releases the Vulkan memory and buffer owned by this object
+{
+	if (m_mappedMemory) // only set when Create() mapped the memory
+		vkUnmapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory);
+	if (m_bufferMemory != VK_NULL_HANDLE)
+		vkFreeMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory, nullptr); // NOTE(review): memory is freed before the buffer is destroyed - confirm this order is intended
+	if (m_buffer != VK_NULL_HANDLE)
+		vkDestroyBuffer(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_buffer, nullptr);
+}
+
+VkBufferChunkedHeap::~VkBufferChunkedHeap()
+{
+	for (auto& chunk : m_chunkBuffers)
+		delete chunk;
+}
+
+uint32 VkBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) // creates the buffer backing a new chunk and returns the chunk size in bytes
+{
+	size_t allocationSize = std::max<size_t>(m_minimumBufferAllocationSize, minimumAllocationSize); // never allocate less than the configured minimum chunk size
+	VKRBuffer* buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); // first try host-visible + host-coherent memory
+	if(!buffer)
+		buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); // fall back to host-visible only; such a buffer reports RequiresFlush()
+	if(!buffer)
+		VulkanRenderer::GetInstance()->UnrecoverableError("Failed to allocate buffer memory for VkBufferChunkedHeap");
+	cemu_assert_debug(buffer);
+	cemu_assert_debug(m_chunkBuffers.size() == chunkIndex); // the new chunk is appended, so its index has to equal the current chunk count
+	m_chunkBuffers.emplace_back(buffer);
+	// todo - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT might be worth it?
+	return allocationSize; // NOTE(review): size_t is narrowed to uint32 here
+}
+
 uint32_t VKRMemoryManager::FindMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) const
 {
 	VkPhysicalDeviceMemoryProperties memProperties;
@ -423,7 +548,7 @@ bool VKRMemoryManager::CreateBufferFromHostMemory(void* hostPointer, VkDeviceSiz
 	importHostMem.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT;
 	importHostMem.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
 	importHostMem.pHostPointer = hostPointer;
-	// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or 
+	// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or
 	// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT
 	// whats the difference ?

@ -469,7 +594,7 @@ VkImageMemAllocation* VKRMemoryManager::imageMemoryAllocate(VkImage image)
 	auto it = map_textureHeap.find(typeFilter);
 	if (it == map_textureHeap.end())
 	{
-		texHeap = new VkTextureChunkedHeap(this, typeFilter, m_vkr->GetLogicalDevice());
+		texHeap = new VkTextureChunkedHeap(this, typeFilter);
 		map_textureHeap.emplace(typeFilter, texHeap);
 	}
 	else
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
@ -2,6 +2,36 @@
 #include "Cafe/HW/Latte/Renderer/Renderer.h"
 #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
 #include "util/ChunkedHeap/ChunkedHeap.h"
+#include "util/helpers/MemoryPool.h"
+
+enum class VKR_BUFFER_TYPE
+{
+	STAGING, // staging upload buffer
+	INDEX, // buffer for index data
+	STRIDE, // buffer for stride-adjusted vertex data
+};
+
+// Owns a Vulkan buffer together with its device memory and, if host visible, the mapping of that memory.
+// Instances are created through Create() and release their Vulkan objects in the destructor.
+class VKRBuffer
+{
+  public:
+	// returns nullptr if the buffer could not be created
+	static VKRBuffer* Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties);
+	~VKRBuffer();
+
+	// the destructor releases the raw Vulkan handles, a copy would release them a second time
+	VKRBuffer(const VKRBuffer&) = delete;
+	VKRBuffer& operator=(const VKRBuffer&) = delete;
+
+	VkBuffer GetVkBuffer() const { return m_buffer; }
+	VkDeviceMemory GetVkBufferMemory() const { return m_bufferMemory; }
+
+	// pointer to the mapped buffer memory, nullptr if the memory is not mapped
+	uint8* GetPtr() const { return m_mappedMemory; }
+
+	// true if the mapped memory is not host coherent
+	bool RequiresFlush() const { return m_requiresFlush; }
+
+  private:
+	VKRBuffer(VkBuffer buffer, VkDeviceMemory bufferMem) : m_buffer(buffer), m_bufferMemory(bufferMem) { }
+
+	VkBuffer m_buffer;
+	VkDeviceMemory m_bufferMemory;
+	uint8* m_mappedMemory{nullptr}; // read by the destructor, so it must never be left uninitialized
+	bool m_requiresFlush{false};
+};

 struct VkImageMemAllocation
 {
@ -14,18 +44,16 @@ struct VkImageMemAllocation
 	uint32 getAllocationSize() { return allocationSize; }
 };

-class VkTextureChunkedHeap : private ChunkedHeap
+class VkTextureChunkedHeap : private ChunkedHeap<>
 {
 public:
-	VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter, VkDevice device) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter), m_device(device) { };
+	VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { };

 	struct ChunkInfo
 	{
 		VkDeviceMemory mem;
 	};

-	uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
-
 	CHAddr allocMem(uint32 size, uint32 alignment)
 	{
 		if (alignment < 4)
@ -43,11 +71,6 @@ public:
 		this->free(addr);
 	}

-	void setDevice(VkDevice dev)
-	{
-		m_device = dev;
-	}
-
 	VkDeviceMemory getChunkMem(uint32 index)
 	{
 		if (index >= m_list_chunkInfo.size())
@ -57,28 +80,73 @@ public:

 	void getStatistics(uint32& totalHeapSize, uint32& allocatedBytes) const
 	{
-		totalHeapSize = numHeapBytes;
-		allocatedBytes = numAllocatedBytes;
+		totalHeapSize = m_numHeapBytes;
+		allocatedBytes = m_numAllocatedBytes;
 	}

-	VkDevice m_device;
+  private:
+	uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
+
 	uint32 m_typeFilter{ 0xFFFFFFFF };
 	class VKRMemoryManager* m_vkrMemoryManager;
 	std::vector<ChunkInfo> m_list_chunkInfo;
 };

+class VkBufferChunkedHeap : private ChunkedHeap<>
+{
+  public:
+	VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { };
+	~VkBufferChunkedHeap();
+
+	using ChunkedHeap::alloc;
+	using ChunkedHeap::free;
+
+	uint8* GetChunkPtr(uint32 index) const
+	{
+		if (index >= m_chunkBuffers.size())
+			return nullptr;
+		return m_chunkBuffers[index]->GetPtr();
+	}
+
+	void GetChunkVkMemInfo(uint32 index, VkBuffer& buffer, VkDeviceMemory& mem)
+	{
+		if (index >= m_chunkBuffers.size())
+		{
+			buffer = VK_NULL_HANDLE;
+			mem = VK_NULL_HANDLE;
+			return;
+		}
+		buffer = m_chunkBuffers[index]->GetVkBuffer();
+		mem = m_chunkBuffers[index]->GetVkBufferMemory();
+	}
+
+	void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
+	{
+		numBuffers = m_chunkBuffers.size();
+		totalBufferSize = m_numHeapBytes;
+		freeBufferSize = m_numHeapBytes - m_numAllocatedBytes;
+	}
+
+	bool RequiresFlush(uint32 index) const
+	{
+		if (index >= m_chunkBuffers.size())
+			return false;
+		return m_chunkBuffers[index]->RequiresFlush();
+	}
+
+  private:
+	uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
+
+	VKR_BUFFER_TYPE m_bufferType;
+	std::vector<VKRBuffer*> m_chunkBuffers;
+	size_t m_minimumBufferAllocationSize;
+};
+
 // a circular ring-buffer which tracks and releases memory per command-buffer
 class VKRSynchronizedRingAllocator
 {
 public:
-	enum class BUFFER_TYPE
-	{
-		STAGING, // staging upload buffer
-		INDEX, // buffer for index data
-		STRIDE, // buffer for stride-adjusted vertex data
-	};
-
-	VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
+	VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
 	VKRSynchronizedRingAllocator(const VKRSynchronizedRingAllocator&) = delete; // disallow copy

 	struct BufferSyncPoint_t
@ -126,13 +194,53 @@ private:

 	const class VulkanRenderer* m_vkr;
 	const class VKRMemoryManager* m_vkrMemMgr;
-	const BUFFER_TYPE m_bufferType;
+	const VKR_BUFFER_TYPE m_bufferType;
 	const uint32 m_minimumBufferAllocSize;

 	std::vector<AllocatorBuffer_t> m_buffers;

 };

+// Heap-style allocator for buffer memory. Memory released through FreeReservation() is not reused right away:
+// it is queued per command buffer id and only returned to the heap by CleanupBuffer() once that command buffer has finished.
+class VKRSynchronizedHeapAllocator
+{
+	struct TrackedAllocation
+	{
+		TrackedAllocation(CHAddr allocation) : allocation(allocation) {};
+		CHAddr allocation; // chunk index + offset of the allocation inside the chunked heap
+	};
+
+  public:
+	VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize);
+	VKRSynchronizedHeapAllocator(const VKRSynchronizedHeapAllocator&) = delete; // disallow copy
+
+	struct AllocatorReservation
+	{
+		VkBuffer vkBuffer; // buffer of the chunk which contains the reservation
+		VkDeviceMemory vkMem; // memory of the chunk which contains the reservation
+		uint8* memPtr; // mapped chunk pointer + bufferOffset
+		uint32 bufferOffset; // offset of the reservation within the chunk
+		uint32 size; // size as passed to AllocateBufferMemory()
+		uint32 bufferIndex; // index of the chunk
+	};
+
+	AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment);
+	void FreeReservation(AllocatorReservation* uploadReservation); // queues the memory for release once the current command buffer has finished
+	void FlushReservation(AllocatorReservation* uploadReservation); // flushes the mapped range if the chunk requires explicit flushes
+
+	void CleanupBuffer(uint64 latestFinishedCommandBufferId); // frees queued memory of all command buffers with id <= latestFinishedCommandBufferId
+
+	void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const;
+  private:
+	const class VKRMemoryManager* m_vkrMemMgr;
+	VkBufferChunkedHeap m_chunkedHeap;
+	// allocations
+	std::vector<TrackedAllocation> m_activeAllocations; // allocations handed out and not yet passed to FreeReservation()
+	MemoryPool<AllocatorReservation> m_poolAllocatorReservation{32};
+	// release queue
+	std::unordered_map<uint64, std::vector<CHAddr>> m_releaseQueue; // command buffer id -> allocations to free once that command buffer finished
+};
+
 void LatteIndices_invalidateAll();

 class VKRMemoryManager
@ -140,9 +248,9 @@ class VKRMemoryManager
 	friend class VKRSynchronizedRingAllocator;
 public:
 	VKRMemoryManager(class VulkanRenderer* renderer) :
-			m_stagingBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STAGING, 32u * 1024 * 1024),
-			m_indexBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::INDEX, 4u * 1024 * 1024),
-			m_vertexStrideMetalBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STRIDE, 4u * 1024 * 1024)
+			m_stagingBuffer(renderer, this, VKR_BUFFER_TYPE::STAGING, 32u * 1024 * 1024),
+			m_indexBuffer(this, VKR_BUFFER_TYPE::INDEX, 4u * 1024 * 1024),
+			m_vertexStrideMetalBuffer(renderer, this, VKR_BUFFER_TYPE::STRIDE, 4u * 1024 * 1024)
 	{
 		m_vkr = renderer;
 	}
@ -167,7 +275,7 @@ public:
 	}

 	VKRSynchronizedRingAllocator& getStagingAllocator() { return m_stagingBuffer; }; // allocator for texture/attribute/uniform uploads
-	VKRSynchronizedRingAllocator& getIndexAllocator() { return m_indexBuffer; }; // allocator for index data
+	VKRSynchronizedHeapAllocator& GetIndexAllocator() { return m_indexBuffer; }; // allocator for index data
 	VKRSynchronizedRingAllocator& getMetalStrideWorkaroundAllocator() { return m_vertexStrideMetalBuffer; }; // allocator for stride-adjusted vertex data

 	void cleanupBuffers(uint64 latestFinishedCommandBufferId)
@ -202,6 +310,6 @@ public:
 	private:
 		class VulkanRenderer* m_vkr;
 		VKRSynchronizedRingAllocator m_stagingBuffer;
-		VKRSynchronizedRingAllocator m_indexBuffer;
+		VKRSynchronizedHeapAllocator m_indexBuffer;
 		VKRSynchronizedRingAllocator m_vertexStrideMetalBuffer;
 };
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp
@ -681,6 +681,9 @@ VulkanRenderer::~VulkanRenderer()
 		vkDestroyDebugUtilsMessengerEXT(m_instance, m_debugCallback, nullptr);
 	}

+	// destroy memory manager
+	delete memoryManager;
+
 	// destroy instance, devices
 	if (m_instance != VK_NULL_HANDLE)
 	{
@ -692,9 +695,6 @@ VulkanRenderer::~VulkanRenderer()
 		vkDestroyInstance(m_instance, nullptr);
 	}

-	// destroy memory manager
-	delete memoryManager;
-
 	// crashes?
 	//glslang::FinalizeProcess();
 }
@ -3701,7 +3701,7 @@ void VulkanRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uin

 void VulkanRenderer::AppendOverlayDebugInfo()
 {
-	ImGui::Text("--- Vulkan info ---");
+	ImGui::Text("--- Vulkan debug info ---");
 	ImGui::Text("GfxPipelines   %u", performanceMonitor.vk.numGraphicPipelines.get());
 	ImGui::Text("DescriptorSets %u", performanceMonitor.vk.numDescriptorSets.get());
 	ImGui::Text("DS ImgSamplers %u", performanceMonitor.vk.numDescriptorSamplerTextures.get());
@ -3719,7 +3719,7 @@ void VulkanRenderer::AppendOverlayDebugInfo()

 	ImGui::Text("BeginRP/f      %u", performanceMonitor.vk.numBeginRenderpassPerFrame.get());
 	ImGui::Text("Barriers/f     %u", performanceMonitor.vk.numDrawBarriersPerFrame.get());
-	ImGui::Text("--- Cache info ---");
+	ImGui::Text("--- Cache debug info ---");

 	uint32 bufferCacheHeapSize = 0;
 	uint32 bufferCacheAllocationSize = 0;
@ -3739,7 +3739,7 @@ void VulkanRenderer::AppendOverlayDebugInfo()
 	ImGui::SameLine(60.0f);
 	ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);

-	memoryManager->getIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
+	memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
 	ImGui::Text("Index");
 	ImGui::SameLine(60.0f);
 	ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h
@ -328,8 +328,9 @@ public:

 	RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) override;

-	void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override;
-	void indexData_uploadIndexMemory(uint32 offset, uint32 size) override;
+	IndexAllocation indexData_reserveIndexMemory(uint32 size) override;
+	void indexData_releaseIndexMemory(IndexAllocation& allocation) override;
+	void indexData_uploadIndexMemory(IndexAllocation& allocation) override;

 	// externally callable
 	void GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, sint32 width, sint32 height, FormatInfoVK* formatInfoOut);
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp
@ -357,18 +357,20 @@ PipelineInfo* VulkanRenderer::draw_getOrCreateGraphicsPipeline(uint32 indexCount
 	return draw_createGraphicsPipeline(indexCount);
 }

-void* VulkanRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex)
+// Reserves index memory from the index heap allocator (32 byte alignment).
+// The returned allocation carries the mapped pointer plus the reservation itself as renderer-internal handle.
+Renderer::IndexAllocation VulkanRenderer::indexData_reserveIndexMemory(uint32 size)
 {
-	auto& indexAllocator = this->memoryManager->getIndexAllocator();
-	auto resv = indexAllocator.AllocateBufferMemory(size, 32);
-	offset = resv.bufferOffset;
-	bufferIndex = resv.bufferIndex;
-	return resv.memPtr;
+	VKRSynchronizedHeapAllocator& indexAllocator = memoryManager->GetIndexAllocator();
+	auto* reservation = indexAllocator.AllocateBufferMemory(size, 32);
+	return { reservation->memPtr, reservation };
 }

-void VulkanRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size)
+// Returns a reservation obtained from indexData_reserveIndexMemory to the index allocator.
+void VulkanRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation)
 {
-	// does nothing since the index buffer memory is coherent
+	// rendererInternal holds the allocator reservation stored by indexData_reserveIndexMemory
+	auto* reservation = (VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal;
+	memoryManager->GetIndexAllocator().FreeReservation(reservation);
+}
+
+// Hands the reservation behind the allocation to the index allocator's flush.
+void VulkanRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation)
+{
+	auto* reservation = (VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal;
+	memoryManager->GetIndexAllocator().FlushReservation(reservation);
 }

 float s_vkUniformData[512 * 4];
@ -1413,14 +1415,15 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
 	uint32 hostIndexCount;
 	uint32 indexMin = 0;
 	uint32 indexMax = 0;
-	uint32 indexBufferOffset = 0;
-	uint32 indexBufferIndex = 0;
-	LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex);
-
+	Renderer::IndexAllocation indexAllocation;
+	LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation);
+	VKRSynchronizedHeapAllocator::AllocatorReservation* indexReservation = (VKRSynchronizedHeapAllocator::AllocatorReservation*)indexAllocation.rendererInternal;
 	// update index binding
 	bool isPrevIndexData = false;
 	if (hostIndexType != INDEX_TYPE::NONE)
 	{
+		uint32 indexBufferIndex = indexReservation->bufferIndex;
+		uint32 indexBufferOffset = indexReservation->bufferOffset;
 		if (m_state.activeIndexBufferOffset != indexBufferOffset || m_state.activeIndexBufferIndex != indexBufferIndex || m_state.activeIndexType != hostIndexType)
 		{
 			m_state.activeIndexType = hostIndexType;
@ -1433,7 +1436,7 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
 				vkType = VK_INDEX_TYPE_UINT32;
 			else
 				cemu_assert(false);
-			vkCmdBindIndexBuffer(m_state.currentCommandBuffer, memoryManager->getIndexAllocator().GetBufferByIndex(indexBufferIndex), indexBufferOffset, vkType);
+			vkCmdBindIndexBuffer(m_state.currentCommandBuffer, indexReservation->vkBuffer, indexBufferOffset, vkType);
 		}
 		else
 			isPrevIndexData = true;
--- a/src/Common/precompiled.h
+++ b/src/Common/precompiled.h
@ -274,6 +274,25 @@ inline uint64 _udiv128(uint64 highDividend, uint64 lowDividend, uint64 divisor,
 	#define NOEXPORT __attribute__ ((visibility ("hidden")))
 #endif

+// FORCE_INLINE requests forced inlining where the compiler supports it.
+// Every variant carries inline semantics, so functions declared with it must not repeat the `inline` keyword.
+#if defined(_MSC_VER)
+#define FORCE_INLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__)
+#define FORCE_INLINE inline __attribute__((always_inline))
+#else
+#define FORCE_INLINE inline
+#endif
+
+// returns index of first bit set, counting from LSB. If v is 0 then result is undefined
+FORCE_INLINE int BSF(uint32 v)
+{
+#if defined(_MSC_VER)
+	return (int)_tzcnt_u32(v); // TZCNT requires BMI1. But if not supported it will execute as BSF
+#elif defined(__GNUC__) || defined(__clang__)
+	return __builtin_ctz(v);
+#else
+	return std::countr_zero(v);
+#endif
+}
+
+
 // On aarch64 we handle some of the x86 intrinsics by implementing them as wrappers
 #if defined(__aarch64__)

--- a/src/util/ChunkedHeap/ChunkedHeap.h
+++ b/src/util/ChunkedHeap/ChunkedHeap.h
@ -1,35 +1,39 @@
 #pragma once

+#include <util/helpers/MemoryPool.h>
+
 struct CHAddr
 {
 	uint32 offset;
 	uint32 chunkIndex;
+	// opaque pointer to the heap's AllocRange entry; read back by ChunkedHeap::_free
+	// default-initialized so that a default-constructed (invalid) address never carries a garbage pointer
+	void* internal{nullptr}; // AllocRange
 
-	CHAddr(uint32 _offset, uint32 _chunkIndex) : offset(_offset), chunkIndex(_chunkIndex) {};
+	CHAddr(uint32 _offset, uint32 _chunkIndex, void* internal = nullptr) : offset(_offset), chunkIndex(_chunkIndex), internal(internal) {};
 	CHAddr() : offset(0xFFFFFFFF), chunkIndex(0xFFFFFFFF) {};
 
 	bool isValid() { return chunkIndex != 0xFFFFFFFF; };
 	static CHAddr getInvalid() { return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); };
 };

+template<uint32 TMinimumAlignment = 32>
 class ChunkedHeap
 {
-	struct allocRange_t
+	struct AllocRange
 	{
-		allocRange_t* nextFree{};
-		allocRange_t* prevFree{};
-		allocRange_t* prevOrdered{};
-		allocRange_t* nextOrdered{};
+		AllocRange* nextFree{};
+		AllocRange* prevFree{};
+		AllocRange* prevOrdered{};
+		AllocRange* nextOrdered{};
 		uint32 offset;
 		uint32 chunkIndex;
 		uint32 size;
 		bool isFree;
-		allocRange_t(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
+		AllocRange(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
 	};

-	struct chunk_t
+	struct Chunk
 	{
-		std::unordered_map<uint32, allocRange_t*> map_allocatedRange;
+		uint32 size;
 	};

 public:
@ -47,45 +51,32 @@ public:
 		_free(addr);
 	}

-	virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
-	{
-		return 0;
-	}
+	virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) = 0;

 private:
 	unsigned ulog2(uint32 v)
 	{
-		static const unsigned MUL_DE_BRUIJN_BIT[] =
-		{
-		   0,  9,  1, 10, 13, 21,  2, 29, 11, 14, 16, 18, 22, 25,  3, 30,
-		   8, 12, 20, 28, 15, 17, 24,  7, 19, 27, 23,  6, 26,  5,  4, 31
-		};
-
-		v |= v >> 1;
-		v |= v >> 2;
-		v |= v >> 4;
-		v |= v >> 8;
-		v |= v >> 16;
-
-		return MUL_DE_BRUIJN_BIT[(v * 0x07C4ACDDu) >> 27];
+		// index of the highest set bit; callers must not pass zero
+		cemu_assert_debug(v != 0);
+		const int leadingZeroBits = std::countl_zero(v);
+		return 31 - leadingZeroBits;
 	}

-	void trackFreeRange(allocRange_t* range)
+	// pushes a free range onto the front of the free list of its size bucket and marks that bucket as non-empty
+	void trackFreeRange(AllocRange* range)
 	{
 		// get index of msb
 		cemu_assert_debug(range->size != 0); // size of zero is not allowed
 		uint32 bucketIndex = ulog2(range->size);
-		range->nextFree = bucketFreeRange[bucketIndex];
-		if (bucketFreeRange[bucketIndex])
-			bucketFreeRange[bucketIndex]->prevFree = range;
+		AllocRange*& bucketHead = m_bucketFreeRange[bucketIndex];
+		range->nextFree = bucketHead;
+		if (bucketHead != nullptr)
+			bucketHead->prevFree = range;
 		range->prevFree = nullptr;
-		bucketFreeRange[bucketIndex] = range;
+		bucketHead = range;
+		m_bucketUseMask |= (1u << bucketIndex);
 	}

-	void forgetFreeRange(allocRange_t* range, uint32 bucketIndex)
+	void forgetFreeRange(AllocRange* range, uint32 bucketIndex)
 	{
-		allocRange_t* prevRange = range->prevFree;
-		allocRange_t* nextRange = range->nextFree;
+		AllocRange* prevRange = range->prevFree;
+		AllocRange* nextRange = range->nextFree;
 		if (prevRange)
 		{
 			prevRange->nextFree = nextRange;
@ -94,36 +85,42 @@ private:
 		}
 		else
 		{
-			if (bucketFreeRange[bucketIndex] != range)
-				assert_dbg();
-			bucketFreeRange[bucketIndex] = nextRange;
+			cemu_assert_debug(m_bucketFreeRange[bucketIndex] == range);
+			m_bucketFreeRange[bucketIndex] = nextRange;
 			if (nextRange)
 				nextRange->prevFree = nullptr;
+			else
+				m_bucketUseMask &= ~(1u << bucketIndex);
 		}
 	}

 	bool allocateChunk(uint32 minimumAllocationSize)
 	{
-		uint32 chunkIndex = (uint32)list_chunks.size();
-		list_chunks.emplace_back(new chunk_t());
+		// Requests a new chunk from the derived class via allocateNewChunk() and registers it as one large free range.
+		// Returns false if the derived class reports a chunk size of zero.
+		uint32 chunkIndex = (uint32)m_chunks.size();
+		m_chunks.emplace_back();
 		uint32 chunkSize = allocateNewChunk(chunkIndex, minimumAllocationSize);
+		cemu_assert_debug((chunkSize%TMinimumAlignment) == 0); // chunk size should be a multiple of the minimum alignment
 		if (chunkSize == 0)
+		{
+			// no chunk was created -> drop the placeholder entry so m_chunks only lists existing chunks
+			m_chunks.pop_back();
 			return false;
+		}
-		allocRange_t* range = new allocRange_t(0, chunkIndex, chunkSize, true);
+		cemu_assert_debug(chunkSize < 0x80000000u); // chunk size must be below 2GB
+		// remember the chunk size, otherwise Chunk::size stays at its zero-initialized value
+		m_chunks[chunkIndex].size = chunkSize;
+		AllocRange* range = m_allocEntriesPool.allocObj(0, chunkIndex, chunkSize, true);
 		trackFreeRange(range);
-		numHeapBytes += chunkSize;
+		m_numHeapBytes += chunkSize;
 		return true;
 	}

-	void _allocFrom(allocRange_t* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
+	void _allocFrom(AllocRange* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
 	{
+		cemu_assert_debug(allocSize > 0);
 		// remove the range from the chain of free ranges
 		forgetFreeRange(range, bucketIndex);
 		// split head, allocation and tail into separate ranges
-		if (allocOffset > range->offset)
+		uint32 headBytes = allocOffset - range->offset;
+		if (headBytes > 0)
 		{
 			// alignment padding -> create free range
-			allocRange_t* head = new allocRange_t(range->offset, range->chunkIndex, allocOffset - range->offset, true);
+			cemu_assert_debug(headBytes >= TMinimumAlignment);
+			AllocRange* head = m_allocEntriesPool.allocObj(range->offset, range->chunkIndex, headBytes, true);
 			trackFreeRange(head);
 			if (range->prevOrdered)
 				range->prevOrdered->nextOrdered = head;
@ -131,10 +128,12 @@ private:
 			head->nextOrdered = range;
 			range->prevOrdered = head;
 		}
-		if ((allocOffset + allocSize) < (range->offset + range->size)) // todo - create only if it's more than a couple of bytes?
+		uint32 tailBytes = (range->offset + range->size) - (allocOffset + allocSize);
+		if (tailBytes > 0)
 		{
 			// tail -> create free range
-			allocRange_t* tail = new allocRange_t((allocOffset + allocSize), range->chunkIndex, (range->offset + range->size) - (allocOffset + allocSize), true);
+			cemu_assert_debug(tailBytes >= TMinimumAlignment);
+			AllocRange* tail = m_allocEntriesPool.allocObj((allocOffset + allocSize), range->chunkIndex, tailBytes, true);
 			trackFreeRange(tail);
 			if (range->nextOrdered)
 				range->nextOrdered->prevOrdered = tail;
@ -149,36 +148,51 @@ private:

 	CHAddr _alloc(uint32 size, uint32 alignment)
 	{
+		cemu_assert_debug(size <= (0x7FFFFFFFu-TMinimumAlignment));
+		// make sure size is not zero and align it
+		if(size == 0) [[unlikely]]
+			size = TMinimumAlignment;
+		else
+			size = (size + (TMinimumAlignment - 1)) & ~(TMinimumAlignment - 1);
 		// find smallest bucket to scan
 		uint32 alignmentM1 = alignment - 1;
 		uint32 bucketIndex = ulog2(size);
-		while (bucketIndex < 32)
+		// check if the bucket is available
+		if( !(m_bucketUseMask & (1u << bucketIndex)) )
 		{
-			allocRange_t* range = bucketFreeRange[bucketIndex];
+			// skip to next non-empty bucket
+			uint32 nextIndex = BSF(m_bucketUseMask>>bucketIndex);
+			bucketIndex += nextIndex;
+		}
+		while (bucketIndex < 31)
+		{
+			AllocRange* range = m_bucketFreeRange[bucketIndex];
 			while (range)
 			{
 				if (range->size >= size)
 				{
 					// verify if aligned allocation fits
 					uint32 alignedOffset = (range->offset + alignmentM1) & ~alignmentM1;
-					uint32 alignmentLoss = alignedOffset - range->offset;
-					if (alignmentLoss < range->size && (range->size - alignmentLoss) >= size)
+					uint32 endOffset = alignedOffset + size;
+					if((range->offset+range->size) >= endOffset)
 					{
 						_allocFrom(range, bucketIndex, alignedOffset, size);
-						list_chunks[range->chunkIndex]->map_allocatedRange.emplace(alignedOffset, range);
-						numAllocatedBytes += size;
-						return CHAddr(alignedOffset, range->chunkIndex);
+						m_numAllocatedBytes += size;
+						return CHAddr(alignedOffset, range->chunkIndex, range);
 					}
 				}
 				range = range->nextFree;
 			}
-			bucketIndex++; // try higher bucket
+			// check next non-empty bucket or skip to end
+			bucketIndex++;
+			uint32 emptyBuckets = BSF(m_bucketUseMask>>bucketIndex);
+			bucketIndex += emptyBuckets;
 		}
-		if(allocationLimitReached)
+		if(m_allocationLimitReached)
 			return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
 		if (!allocateChunk(size))
 		{
-			allocationLimitReached = true;
+			m_allocationLimitReached = true;
 			return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
 		}
 		return _alloc(size, alignment);
@ -186,24 +200,16 @@ private:

 	void _free(CHAddr addr)
 	{
-		auto it = list_chunks[addr.chunkIndex]->map_allocatedRange.find(addr.offset);
-		if (it == list_chunks[addr.chunkIndex]->map_allocatedRange.end())
+		if(!addr.internal)
 		{
 			cemuLog_log(LogType::Force, "Internal heap error. {:08x} {:08x}", addr.chunkIndex, addr.offset);
-			cemuLog_log(LogType::Force, "Debug info:");
-			for (auto& rangeItr : list_chunks[addr.chunkIndex]->map_allocatedRange)
-			{
-				cemuLog_log(LogType::Force, "{:08x} {:08x}", rangeItr.second->offset, rangeItr.second->size);
-			}
 			return;
 		}
-
-		allocRange_t* range = it->second;
-		numAllocatedBytes -= it->second->size;
-		list_chunks[range->chunkIndex]->map_allocatedRange.erase(it);
+		AllocRange* range = (AllocRange*)addr.internal;
+		m_numAllocatedBytes -= range->size;
 		// try merge left or right
-		allocRange_t* prevRange = range->prevOrdered;
-		allocRange_t* nextRange = range->nextOrdered;
+		AllocRange* prevRange = range->prevOrdered;
+		AllocRange* nextRange = range->nextOrdered;
 		if (prevRange && prevRange->isFree)
 		{
 			if (nextRange && nextRange->isFree)
@ -216,8 +222,8 @@ private:
 				forgetFreeRange(prevRange, ulog2(prevRange->size));
 				prevRange->size = newSize;
 				trackFreeRange(prevRange);
-				delete range;
-				delete nextRange;
+				m_allocEntriesPool.freeObj(range);
+				m_allocEntriesPool.freeObj(nextRange);
 			}
 			else
 			{
@ -228,7 +234,7 @@ private:
 				forgetFreeRange(prevRange, ulog2(prevRange->size));
 				prevRange->size = newSize;
 				trackFreeRange(prevRange);
-				delete range;
+				m_allocEntriesPool.freeObj(range);
 			}
 		}
 		else if (nextRange && nextRange->isFree)
@ -242,7 +248,7 @@ private:
 				range->prevOrdered->nextOrdered = nextRange;
 			nextRange->prevOrdered = range->prevOrdered;
 			trackFreeRange(nextRange);
-			delete range;
+			m_allocEntriesPool.freeObj(range);
 		}
 		else
 		{
@ -265,7 +271,7 @@ private:

 		for (uint32 i = 0; i < 32; i++)
 		{
-			allocRange_t* ar = bucketFreeRange[i];
+			AllocRange* ar = m_bucketFreeRange[i];
 			while (ar)
 			{
 				availableRange_t dbgRange;
@ -278,7 +284,7 @@ private:
 					if (itr.chunkIndex != dbgRange.chunkIndex)
 						continue;
 					if (itr.offset < (dbgRange.offset + dbgRange.size) && (itr.offset + itr.size) >(dbgRange.offset))
-						assert_dbg();
+						cemu_assert_error();
 				}

 				availRanges.emplace_back(dbgRange);
@ -290,14 +296,16 @@ private:
 	}

 private:
-	std::vector<chunk_t*> list_chunks;
-	allocRange_t* bucketFreeRange[32]{};
-	bool allocationLimitReached = false;
+	std::vector<Chunk> m_chunks;
+	uint32 m_bucketUseMask{0x80000000}; // bitmask indicating non-empty buckets. MSB always set to provide an upper bound for BSF instruction
+	AllocRange* m_bucketFreeRange[32]{}; // we are only using 31 entries since the MSB is reserved (thus chunks equal or larger than 2^31 are not allowed)
+	bool m_allocationLimitReached = false;
+	MemoryPool<AllocRange> m_allocEntriesPool{64};

 public:
 	// statistics
-	uint32 numHeapBytes{}; // total size of the heap
-	uint32 numAllocatedBytes{};
+	uint32 m_numHeapBytes{}; // total size of the heap
+	uint32 m_numAllocatedBytes{};
 };

 class VGenericHeap
@ -633,7 +641,7 @@ public:

 	uint32 getCurrentBlockOffset() const { return m_currentBlockOffset; }
 	uint8* getCurrentBlockPtr() const { return m_currentBlockPtr; }
-	
+
 private:
 	void allocateAdditionalChunk()
 	{