Latte: Implement better index caching (#1443)

2025-02-02 11:52:35 +01:00 · 2025-01-12 12:39:02 +01:00 · 2025-01-12 12:39:02 +01:00 · 8dd809d725
commit 8dd809d725
parent 1923b7a7c4
16 changed files with 526 additions and 191 deletions
--- a/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp
@ -141,6 +141,14 @@ private:
 void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx);
 // Called whenever the GPU runs out of commands or hits a wait condition (semaphores, HLE waits).
 // Drops every cached index buffer entry by forwarding to LatteIndices_invalidateAll().
 void LatteCP_signalEnterWait()
 {
 	// Based on the assumption that games won't swap out buffer data in the middle of an uninterrupted sequence of drawcalls,
 	// we only flush caches when the GPU goes idle or has to wait for any operation
 	LatteIndices_invalidateAll();
 }
 /*
 * Read a U32 from the command buffer
 * If no data is available then wait in a busy loop
@ -466,6 +474,8 @@ LatteCMDPtr LatteCP_itWaitRegMem(LatteCMDPtr cmd, uint32 nWords)
 	const uint32 GPU7_WAIT_MEM_OP_GREATER = 6;
 	const uint32 GPU7_WAIT_MEM_OP_NEVER = 7;
 	LatteCP_signalEnterWait();
 	bool stalls = false;
 	if ((word0 & 0x10) != 0)
 	{
@ -594,6 +604,7 @@ LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords)
 	else if(SEM_SIGNAL == 7)
 	{
 		// wait
 		LatteCP_signalEnterWait();
 		size_t loopCount = 0;
 		while (true)
 		{
@ -1305,11 +1316,13 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
 				}
 				case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
 				{
 					LatteCP_signalEnterWait();
 					LatteCP_itHLESwapScanBuffer(cmdData, nWords);
 					break;
 				}
 				case IT_HLE_WAIT_FOR_FLIP:
 				{
 					LatteCP_signalEnterWait();
 					LatteCP_itHLEWaitForFlip(cmdData, nWords);
 					break;
 				}
@ -1594,12 +1607,14 @@ void LatteCP_ProcessRingbuffer()
 			}
 			case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
 			{
 				LatteCP_signalEnterWait();
 				LatteCP_itHLESwapScanBuffer(cmd, nWords);
 				timerRecheck += CP_TIMER_RECHECK / 64;
 				break;
 			}
 			case IT_HLE_WAIT_FOR_FLIP:
 			{
 				LatteCP_signalEnterWait();
 				LatteCP_itHLEWaitForFlip(cmd, nWords);
 				timerRecheck += CP_TIMER_RECHECK / 1;
 				break;
--- a/src/Cafe/HW/Latte/Core/LatteIndices.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteIndices.cpp
@ -1,6 +1,7 @@
 #include "Cafe/HW/Latte/Core/LatteConst.h"
 #include "Cafe/HW/Latte/Renderer/Renderer.h"
 #include "Cafe/HW/Latte/ISA/RegDefines.h"
 #include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
 #include "Common/cpu_features.h"
 #if defined(ARCH_X86_64) && defined(__GNUC__)
@ -9,32 +10,53 @@
 struct  
 {
-	const void* lastPtr;
+	struct CacheEntry
-	uint32 lastCount;
+	{
-	LattePrimitiveMode lastPrimitiveMode;
+		// input data
-	LatteIndexType lastIndexType;
+		const void* lastPtr;
-	// output
+		uint32 lastCount;
-	uint32 indexMin;
+		LattePrimitiveMode lastPrimitiveMode;
-	uint32 indexMax;
+		LatteIndexType lastIndexType;
-	Renderer::INDEX_TYPE renderIndexType;
+		uint64 lastUsed;
-	uint32 outputCount;
+		// output
-	uint32 indexBufferOffset;
+		uint32 indexMin;
-	uint32 indexBufferIndex;
+		uint32 indexMax;
 		Renderer::INDEX_TYPE renderIndexType;
 		uint32 outputCount;
 		Renderer::IndexAllocation indexAllocation;
 	};
 	std::array<CacheEntry, 8> entry;
 	uint64 currentUsageCounter{0};
 }LatteIndexCache{};
 void LatteIndices_invalidate(const void* memPtr, uint32 size)
 {
-	if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) )
+	for(auto& entry : LatteIndexCache.entry)
 	{
-		LatteIndexCache.lastPtr = nullptr;
+		if (entry.lastPtr >= memPtr && (entry.lastPtr < ((uint8*)memPtr + size)) )
-		LatteIndexCache.lastCount = 0;
+		{
 			if(entry.lastPtr != nullptr)
 				g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
 			entry.lastPtr = nullptr;
 			entry.lastCount = 0;
 		}
 	}
 }
 void LatteIndices_invalidateAll()
 {
-	LatteIndexCache.lastPtr = nullptr;
+	for(auto& entry : LatteIndexCache.entry)
-	LatteIndexCache.lastCount = 0;
+	{
 		if (entry.lastPtr != nullptr)
 			g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
 		entry.lastPtr = nullptr;
 		entry.lastCount = 0;
 	}
 }
 // Hands out the current value of the cache-wide usage counter and advances the counter by one.
 // The returned value is stored as an entry's lastUsed stamp; entries with the smallest stamp are evicted first.
 uint64 LatteIndices_GetNextUsageIndex()
 {
 	const uint64 usageIndex = LatteIndexCache.currentUsageCounter;
 	LatteIndexCache.currentUsageCounter = usageIndex + 1;
 	return usageIndex;
 }
 uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count)
@ -532,7 +554,7 @@ void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIn
 	}
 }
-void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex)
+void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation)
 {
 	// what this should do:
 	// [x] use fast SIMD-based index decoding
@ -542,17 +564,18 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
 	// [ ] better cache implementation, allow to cache across frames
 	// reuse from cache if data didn't change
-	if (LatteIndexCache.lastPtr == indexData &&
+	auto cacheEntry = std::find_if(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [indexData, count, primitiveMode, indexType](const auto& entry)
 		LatteIndexCache.lastCount == count &&
 		LatteIndexCache.lastPrimitiveMode == primitiveMode &&
 		LatteIndexCache.lastIndexType == indexType)
 	{
-		indexMin = LatteIndexCache.indexMin;
+		return entry.lastPtr == indexData && entry.lastCount == count && entry.lastPrimitiveMode == primitiveMode && entry.lastIndexType == indexType;
-		indexMax = LatteIndexCache.indexMax;
+	});
-		renderIndexType = LatteIndexCache.renderIndexType;
+	if (cacheEntry != LatteIndexCache.entry.end())
-		outputCount = LatteIndexCache.outputCount;
+	{
-		indexBufferOffset = LatteIndexCache.indexBufferOffset;
+		indexMin = cacheEntry->indexMin;
-		indexBufferIndex = LatteIndexCache.indexBufferIndex;
+		indexMax = cacheEntry->indexMax;
 		renderIndexType = cacheEntry->renderIndexType;
 		outputCount = cacheEntry->outputCount;
 		indexAllocation = cacheEntry->indexAllocation;
 		cacheEntry->lastUsed = LatteIndices_GetNextUsageIndex();
 		return;
 	}
@ -576,10 +599,12 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
 		indexMin = 0;
 		indexMax = std::max(count, 1u)-1;
 		renderIndexType = Renderer::INDEX_TYPE::NONE;
 		indexAllocation = {};
 		return; // no indices
 	}
 	// query index buffer from renderer
-	void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex);
+	indexAllocation = g_renderer->indexData_reserveIndexMemory(indexOutputSize);
 	void* indexOutputPtr = indexAllocation.mem;
 	// decode indices
 	indexMin = std::numeric_limits<uint32>::max();
@ -704,16 +729,25 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
 		// recalculate index range but filter out primitive restart index
 		LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax);
 	}
-	g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize);
+	g_renderer->indexData_uploadIndexMemory(indexAllocation);
 	performanceMonitor.cycle[performanceMonitor.cycleIndex].indexDataUploaded += indexOutputSize;
 	// get least recently used cache entry
 	auto lruEntry = std::min_element(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [](const auto& a, const auto& b)
 	{
 		return a.lastUsed < b.lastUsed;
 	});
 	// invalidate previous allocation
 	if(lruEntry->lastPtr != nullptr)
 		g_renderer->indexData_releaseIndexMemory(lruEntry->indexAllocation);
 	// update cache
-	LatteIndexCache.lastPtr = indexData;
+	lruEntry->lastPtr = indexData;
-	LatteIndexCache.lastCount = count;
+	lruEntry->lastCount = count;
-	LatteIndexCache.lastPrimitiveMode = primitiveMode;
+	lruEntry->lastPrimitiveMode = primitiveMode;
-	LatteIndexCache.lastIndexType = indexType;
+	lruEntry->lastIndexType = indexType;
-	LatteIndexCache.indexMin = indexMin;
+	lruEntry->indexMin = indexMin;
-	LatteIndexCache.indexMax = indexMax;
+	lruEntry->indexMax = indexMax;
-	LatteIndexCache.renderIndexType = renderIndexType;
+	lruEntry->renderIndexType = renderIndexType;
-	LatteIndexCache.outputCount = outputCount;
+	lruEntry->outputCount = outputCount;
-	LatteIndexCache.indexBufferOffset = indexBufferOffset;
+	lruEntry->indexAllocation = indexAllocation;
-	LatteIndexCache.indexBufferIndex = indexBufferIndex;
+	lruEntry->lastUsed = LatteIndices_GetNextUsageIndex();
 }
--- a/src/Cafe/HW/Latte/Core/LatteIndices.h
+++ b/src/Cafe/HW/Latte/Core/LatteIndices.h
@ -4,4 +4,4 @@
 void LatteIndices_invalidate(const void* memPtr, uint32 size);
 void LatteIndices_invalidateAll();
-void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex);
+void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation);
--- a/src/Cafe/HW/Latte/Core/LatteOverlay.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteOverlay.cpp
@ -107,7 +107,13 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
 				ImGui::Text("VRAM: %dMB / %dMB", g_state.vramUsage, g_state.vramTotal);
 			if (config.overlay.debug)
 			{
 				// general debug info
 				ImGui::Text("--- Debug info ---");
 				ImGui::Text("IndexUploadPerFrame: %dKB", (performanceMonitor.stats.indexDataUploadPerFrame+1023)/1024);
 				// backend specific info
 				g_renderer->AppendOverlayDebugInfo();
 			}
 			position.y += (ImGui::GetWindowSize().y + 10.0f) * direction;
 		}
--- a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp
+++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp
@ -74,7 +74,6 @@ void LattePerformanceMonitor_frameEnd()
 		uniformBankDataUploadedPerFrame /= 1024ULL;
 		uint32 uniformBankCountUploadedPerFrame = (uint32)(uniformBankUploadedCount / (uint64)elapsedFrames);
 		uint64 indexDataUploadPerFrame = (indexDataUploaded / (uint64)elapsedFrames);
 		indexDataUploadPerFrame /= 1024ULL;
 		double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
 		uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;
@ -82,7 +81,7 @@ void LattePerformanceMonitor_frameEnd()
 		uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
 		uint32 tlps = (uint32)((uint64)threadLeaveCount * 1000ULL / (uint64)totalElapsedTime);
 		// set stats
-
+		performanceMonitor.stats.indexDataUploadPerFrame = indexDataUploadPerFrame;
 		// next counter cycle
 		sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
 		performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;
--- a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h
+++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h
@ -132,6 +132,12 @@ typedef struct
 		LattePerfStatCounter numDrawBarriersPerFrame;
 		LattePerfStatCounter numBeginRenderpassPerFrame;
 	}vk;
 	// calculated stats (per frame)
 	struct
 	{
 		uint32 indexDataUploadPerFrame;
 	}stats;
 }performanceMonitor_t;
 extern performanceMonitor_t performanceMonitor;
--- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp
+++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp
@ -11,7 +11,6 @@
 #include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
 #include "Cafe/GraphicPack/GraphicPack2.h"
 #include "config/ActiveSettings.h"
 #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h"
 #include "gui/guiWrapper.h"
 #include "Cafe/OS/libs/erreula/erreula.h"
 #include "input/InputManager.h"
--- a/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h
+++ b/src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h
@ -102,16 +102,21 @@ public:
 	static void SetAttributeArrayState(uint32 index, bool isEnabled, sint32 aluDivisor);
 	static void SetArrayElementBuffer(GLuint arrayElementBuffer);
-	// index
+	// index (not used by OpenGL renderer yet)
-	void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override
+	IndexAllocation indexData_reserveIndexMemory(uint32 size) override
 	{
-		assert_dbg();
+		cemu_assert_unimplemented();
-		return nullptr;
+		return {};
 	}
-	void indexData_uploadIndexMemory(uint32 offset, uint32 size) override
+	void indexData_releaseIndexMemory(IndexAllocation& allocation) override
 	{
-		assert_dbg();
+		cemu_assert_unimplemented();
 	}
 	// Index upload hook of the renderer interface. The OpenGL renderer does not use it yet,
 	// so the body only raises the "unimplemented" assertion.
 	void indexData_uploadIndexMemory(IndexAllocation& allocation) override
 	{
 		cemu_assert_unimplemented();
 	}
 	// uniform
--- a/src/Cafe/HW/Latte/Renderer/Renderer.h
+++ b/src/Cafe/HW/Latte/Renderer/Renderer.h
@ -138,8 +138,15 @@ public:
 	virtual void draw_endSequence() = 0;
 	// index
-	virtual void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) = 0;
+	struct IndexAllocation
-	virtual void indexData_uploadIndexMemory(uint32 offset, uint32 size) = 0;
+	{
 		void* mem; // pointer to index data inside buffer
 		void* rendererInternal; // for renderer use
 	};
 	virtual IndexAllocation indexData_reserveIndexMemory(uint32 size) = 0;
 	virtual void indexData_releaseIndexMemory(IndexAllocation& allocation) = 0;
 	virtual void indexData_uploadIndexMemory(IndexAllocation& allocation) = 0;
 	// occlusion queries
 	virtual LatteQueryObject* occlusionQuery_create() = 0;
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.cpp
@ -23,11 +23,11 @@ void VKRSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeReq
 	AllocatorBuffer_t newBuffer{};
 	newBuffer.writeIndex = 0;
 	newBuffer.basePtr = nullptr;
-	if (m_bufferType == BUFFER_TYPE::STAGING)
+	if (m_bufferType == VKR_BUFFER_TYPE::STAGING)
 		m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
-	else if (m_bufferType == BUFFER_TYPE::INDEX)
+	else if (m_bufferType == VKR_BUFFER_TYPE::INDEX)
 		m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
-	else if (m_bufferType == BUFFER_TYPE::STRIDE)
+	else if (m_bufferType == VKR_BUFFER_TYPE::STRIDE)
 		m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
 	else
 		cemu_assert_debug(false);
@ -53,7 +53,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato
 		uint32 distanceToSyncPoint;
 		if (!itr.queue_syncPoints.empty())
 		{
-			if(itr.queue_syncPoints.front().offset < itr.writeIndex)
+			if (itr.queue_syncPoints.front().offset < itr.writeIndex)
 				distanceToSyncPoint = 0xFFFFFFFF;
 			else
 				distanceToSyncPoint = itr.queue_syncPoints.front().offset - itr.writeIndex;
@ -100,7 +100,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato
 void VKRSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation)
 {
-	cemu_assert_debug(m_bufferType == BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent
+	cemu_assert_debug(m_bufferType == VKR_BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent
 	// todo - use nonCoherentAtomSize for flush size (instead of hardcoded constant)
 	VkMappedMemoryRange flushedRange{};
 	flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
@ -167,6 +167,70 @@ void VKRSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBuf
 	}
 }
 /* VKRSynchronizedHeapAllocator */
 // Stores the memory manager and sets up the backing chunked heap for the given buffer type.
 // minimumBufferAllocSize is forwarded to the heap as its minimum buffer allocation size.
 VKRSynchronizedHeapAllocator::VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize)
 	: m_vkrMemMgr(vkMemoryManager),
 	  m_chunkedHeap(bufferType, minimumBufferAllocSize)
 {
 }
 // Reserves `size` bytes (with the requested alignment) inside the chunked heap and returns a
 // pool-allocated reservation describing the range: chunk index, offset, size, mapped pointer
 // and the Vulkan buffer/memory handles of the chunk.
 // The reservation must be handed back through FreeReservation().
 VKRSynchronizedHeapAllocator::AllocatorReservation* VKRSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment)
 {
 	// carve the range out of the heap and remember it as a live allocation
 	const CHAddr heapAddr = m_chunkedHeap.alloc(size, alignment);
 	m_activeAllocations.emplace_back(heapAddr);
 	// describe the range for the caller
 	AllocatorReservation* reservation = m_poolAllocatorReservation.allocObj();
 	reservation->bufferIndex = heapAddr.chunkIndex;
 	reservation->bufferOffset = heapAddr.offset;
 	reservation->size = size;
 	uint8* const chunkBase = m_chunkedHeap.GetChunkPtr(heapAddr.chunkIndex);
 	reservation->memPtr = chunkBase + heapAddr.offset;
 	m_chunkedHeap.GetChunkVkMemInfo(heapAddr.chunkIndex, reservation->vkBuffer, reservation->vkMem);
 	return reservation;
 }
 // Releases a reservation obtained from AllocateBufferMemory().
 // The heap range is not freed right away: it is queued under the id of the current command buffer
 // and handed back to the heap by CleanupBuffer() once that command buffer is reported as finished.
 // The reservation object itself is returned to the pool immediately.
 // A reservation that is not tracked as active (e.g. freed twice) is ignored.
 void VKRSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation)
 {
 	// put the allocation on a delayed release queue for the current command buffer
 	uint64 currentCommandBufferId = VulkanRenderer::GetInstance()->GetCurrentCommandBufferId();
 	auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [&uploadReservation](const TrackedAllocation& allocation) { return allocation.allocation.chunkIndex == uploadReservation->bufferIndex && allocation.allocation.offset == uploadReservation->bufferOffset; });
 	cemu_assert_debug(it != m_activeAllocations.end());
 	// never dereference or erase end(): the assert above is presumably compiled out in non-debug builds,
 	// and handing the object back to the pool a second time would corrupt the pool as well
 	if (it == m_activeAllocations.end())
 		return;
 	m_releaseQueue[currentCommandBufferId].emplace_back(it->allocation);
 	m_activeAllocations.erase(it);
 	m_poolAllocatorReservation.freeObj(uploadReservation);
 }
 // Makes CPU writes to the reserved range visible to the device.
 // Only does work when the chunk holding the reservation reports that it requires explicit flushes.
 void VKRSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation)
 {
 	if (!m_chunkedHeap.RequiresFlush(uploadReservation->bufferIndex))
 		return; // nothing to do for this chunk
 	// NOTE(review): Vulkan expects flushed ranges to respect nonCoherentAtomSize; offset and size are passed through as-is here - confirm
 	VkMappedMemoryRange rangeToFlush{};
 	rangeToFlush.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
 	rangeToFlush.memory = uploadReservation->vkMem;
 	rangeToFlush.offset = uploadReservation->bufferOffset;
 	rangeToFlush.size = uploadReservation->size;
 	vkFlushMappedMemoryRanges(VulkanRenderer::GetInstance()->GetLogicalDevice(), 1, &rangeToFlush);
 }
 // Returns all queued heap ranges whose command buffer id is at or below latestFinishedCommandBufferId
 // to the chunked heap and removes the corresponding release queue entries.
 void VKRSynchronizedHeapAllocator::CleanupBuffer(uint64 latestFinishedCommandBufferId)
 {
 	for (auto queueIt = m_releaseQueue.begin(); queueIt != m_releaseQueue.end();)
 	{
 		if (queueIt->first > latestFinishedCommandBufferId)
 		{
 			// command buffer not finished yet, keep its allocations queued
 			queueIt++;
 			continue;
 		}
 		// release allocations
 		for (auto& releasedAddr : queueIt->second)
 			m_chunkedHeap.free(releasedAddr);
 		queueIt = m_releaseQueue.erase(queueIt);
 	}
 }
 // Reports usage statistics of the backing chunked heap: number of chunk buffers,
 // total heap size in bytes and the number of bytes not currently allocated.
 void VKRSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
 {
 	m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize);
 }
 /* VkTextureChunkedHeap */
 uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
@ -175,7 +239,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 	m_list_chunkInfo.resize(m_list_chunkInfo.size() + 1);
 	// pad minimumAllocationSize to 32KB alignment
-	minimumAllocationSize = (minimumAllocationSize + (32*1024-1)) & ~(32 * 1024 - 1);
+	minimumAllocationSize = (minimumAllocationSize + (32 * 1024 - 1)) & ~(32 * 1024 - 1);
 	uint32 allocationSize = 1024 * 1024 * 128;
 	if (chunkIndex == 0)
@ -189,8 +253,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 	std::vector<uint32> deviceLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
 	std::vector<uint32> hostLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, 0);
 	// remove device local memory types from host local vector
-	auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) ->bool
+	auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) -> bool {
 	{
 		return std::find(deviceLocalMemoryTypeIndices.begin(), deviceLocalMemoryTypeIndices.end(), v) != deviceLocalMemoryTypeIndices.end();
 	};
 	hostLocalMemoryTypeIndices.erase(std::remove_if(hostLocalMemoryTypeIndices.begin(), hostLocalMemoryTypeIndices.end(), pred), hostLocalMemoryTypeIndices.end());
@ -206,7 +269,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 			allocInfo.memoryTypeIndex = memType;
 			VkDeviceMemory imageMemory;
-			VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory);
+			VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory);
 			if (r != VK_SUCCESS)
 				continue;
 			m_list_chunkInfo[chunkIndex].mem = imageMemory;
@ -221,7 +284,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 			allocInfo.memoryTypeIndex = memType;
 			VkDeviceMemory imageMemory;
-			VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory);
+			VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory);
 			if (r != VK_SUCCESS)
 				continue;
 			m_list_chunkInfo[chunkIndex].mem = imageMemory;
@ -238,6 +301,68 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 	return 0;
 }
 /* VkBufferChunkedHeap */
 // Creates a Vulkan buffer of bufferSize bytes whose usage flag is derived from bufferType
 // (STAGING -> transfer source, INDEX -> index buffer, STRIDE -> vertex buffer) and whose memory
 // is allocated with the requested property flags.
 // If the memory is host visible it is mapped for the whole buffer size; when the memory is not
 // host coherent the buffer is marked as requiring explicit flushes.
 // Returns nullptr if the buffer type is unknown, the allocation fails or the mapping fails.
 // The caller owns the returned object and releases it with delete.
 VKRBuffer* VKRBuffer::Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties)
 {
 	auto* memMgr = VulkanRenderer::GetInstance()->GetMemoryManager();
 	VkBuffer buffer = VK_NULL_HANDLE;
 	VkDeviceMemory bufferMemory = VK_NULL_HANDLE;
 	// must start out as false: for an unknown buffer type no branch below assigns it
 	bool allocSuccess = false;
 	if (bufferType == VKR_BUFFER_TYPE::STAGING)
 		allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, properties, buffer, bufferMemory);
 	else if (bufferType == VKR_BUFFER_TYPE::INDEX)
 		allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, properties, buffer, bufferMemory);
 	else if (bufferType == VKR_BUFFER_TYPE::STRIDE)
 		allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, properties, buffer, bufferMemory);
 	else
 		cemu_assert_debug(false);
 	if (!allocSuccess)
 		return nullptr;
 	VKRBuffer* bufferObj = new VKRBuffer(buffer, bufferMemory);
 	// the destructor inspects m_mappedMemory, so give it a defined value before anything can fail
 	bufferObj->m_mappedMemory = nullptr;
 	// if host visible, then map buffer
 	if (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
 	{
 		void* data = nullptr;
 		if (vkMapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), bufferMemory, 0, bufferSize, 0, &data) != VK_SUCCESS)
 		{
 			// an unmapped host-visible buffer is unusable for uploads; release it and report failure
 			delete bufferObj;
 			return nullptr;
 		}
 		bufferObj->m_mappedMemory = (uint8*)data;
 		bufferObj->m_requiresFlush = !HAS_FLAG(properties, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
 	}
 	return bufferObj;
 }
 // Releases everything the buffer holds: the mapping (if any), the device memory and the buffer handle.
 VKRBuffer::~VKRBuffer()
 {
 	// the logical device is looked up on demand, only for the resources that actually exist
 	const auto logicalDevice = []() { return VulkanRenderer::GetInstance()->GetLogicalDevice(); };
 	// teardown order: unmap, free the memory, destroy the buffer
 	if (m_mappedMemory != nullptr)
 		vkUnmapMemory(logicalDevice(), m_bufferMemory);
 	if (m_bufferMemory != VK_NULL_HANDLE)
 		vkFreeMemory(logicalDevice(), m_bufferMemory, nullptr);
 	if (m_buffer != VK_NULL_HANDLE)
 		vkDestroyBuffer(logicalDevice(), m_buffer, nullptr);
 }
 // Destroys every chunk buffer that was created by allocateNewChunk().
 VkBufferChunkedHeap::~VkBufferChunkedHeap()
 {
 	for (VKRBuffer* chunkBuffer : m_chunkBuffers)
 	{
 		delete chunkBuffer;
 	}
 }
 // Creates the backing buffer for a new heap chunk and appends it to the chunk list.
 // The chunk is at least m_minimumBufferAllocationSize bytes and never smaller than minimumAllocationSize.
 // Returns the size of the newly created chunk.
 uint32 VkBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
 {
 	const size_t chunkSize = std::max<size_t>(m_minimumBufferAllocationSize, minimumAllocationSize);
 	// first choice is host visible + coherent memory, the fallback is plain host visible memory
 	VKRBuffer* chunkBuffer = VKRBuffer::Create(m_bufferType, chunkSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
 	if (chunkBuffer == nullptr)
 	{
 		chunkBuffer = VKRBuffer::Create(m_bufferType, chunkSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
 		if (chunkBuffer == nullptr)
 			VulkanRenderer::GetInstance()->UnrecoverableError("Failed to allocate buffer memory for VkBufferChunkedHeap");
 	}
 	cemu_assert_debug(chunkBuffer);
 	// chunks are expected to be created in order, so the new chunk lands exactly at chunkIndex
 	cemu_assert_debug(m_chunkBuffers.size() == chunkIndex);
 	m_chunkBuffers.emplace_back(chunkBuffer);
 	// todo - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT might be worth it?
 	return chunkSize;
 }
 uint32_t VKRMemoryManager::FindMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) const
 {
 	VkPhysicalDeviceMemoryProperties memProperties;
@ -423,7 +548,7 @@ bool VKRMemoryManager::CreateBufferFromHostMemory(void* hostPointer, VkDeviceSiz
 	importHostMem.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT;
 	importHostMem.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
 	importHostMem.pHostPointer = hostPointer;
-	// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or 
+	// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT or
 	// VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT
 	// whats the difference ?
@ -469,7 +594,7 @@ VkImageMemAllocation* VKRMemoryManager::imageMemoryAllocate(VkImage image)
 	auto it = map_textureHeap.find(typeFilter);
 	if (it == map_textureHeap.end())
 	{
-		texHeap = new VkTextureChunkedHeap(this, typeFilter, m_vkr->GetLogicalDevice());
+		texHeap = new VkTextureChunkedHeap(this, typeFilter);
 		map_textureHeap.emplace(typeFilter, texHeap);
 	}
 	else
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VKRMemoryManager.h
@ -2,6 +2,36 @@
 #include "Cafe/HW/Latte/Renderer/Renderer.h"
 #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
 #include "util/ChunkedHeap/ChunkedHeap.h"
 #include "util/helpers/MemoryPool.h"
 // Purpose of a renderer-owned Vulkan buffer; selects the usage flag the buffer is created with
 enum class VKR_BUFFER_TYPE
 {
 	STAGING, // staging upload buffer (created with VK_BUFFER_USAGE_TRANSFER_SRC_BIT)
 	INDEX, // buffer for index data (created with VK_BUFFER_USAGE_INDEX_BUFFER_BIT)
 	STRIDE, // buffer for stride-adjusted vertex data (created with VK_BUFFER_USAGE_VERTEX_BUFFER_BIT)
 };
 // Owns a single Vulkan buffer together with its device memory and, if the memory was mapped, the mapping.
 // Instances are only created through Create() and release their Vulkan resources in the destructor.
 class VKRBuffer
 {
  public:
 	// Creates a buffer of the given type and size; returns nullptr on failure. The caller owns the result.
 	static VKRBuffer* Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties);
 	~VKRBuffer();

 	// the object owns raw Vulkan handles, a copy would release them a second time
 	VKRBuffer(const VKRBuffer&) = delete; // disallow copy
 	VKRBuffer& operator=(const VKRBuffer&) = delete; // disallow copy-assignment

 	VkBuffer GetVkBuffer() const { return m_buffer; }
 	VkDeviceMemory GetVkBufferMemory() const { return m_bufferMemory; }
 	// pointer to the mapped memory, nullptr if the buffer is not mapped
 	uint8* GetPtr() const { return m_mappedMemory; }
 	// true if writes through GetPtr() have to be flushed explicitly (memory is not host coherent)
 	bool RequiresFlush() const { return m_requiresFlush; }

  private:
 	VKRBuffer(VkBuffer buffer, VkDeviceMemory bufferMem) : m_buffer(buffer), m_bufferMemory(bufferMem) { };

 	VkBuffer m_buffer;
 	VkDeviceMemory m_bufferMemory;
 	uint8* m_mappedMemory{nullptr}; // initialized so the destructor never reads an indeterminate pointer
 	bool m_requiresFlush{false};
 };
 struct VkImageMemAllocation
 {
@ -14,18 +44,16 @@ struct VkImageMemAllocation
 	uint32 getAllocationSize() { return allocationSize; }
 };
-class VkTextureChunkedHeap : private ChunkedHeap
+class VkTextureChunkedHeap : private ChunkedHeap<>
 {
 public:
-	VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter, VkDevice device) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter), m_device(device) { };
+	VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { };
 	struct ChunkInfo
 	{
 		VkDeviceMemory mem;
 	};
 	uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
 	CHAddr allocMem(uint32 size, uint32 alignment)
 	{
 		if (alignment < 4)
@ -43,11 +71,6 @@ public:
 		this->free(addr);
 	}
 	void setDevice(VkDevice dev)
 	{
 		m_device = dev;
 	}
 	VkDeviceMemory getChunkMem(uint32 index)
 	{
 		if (index >= m_list_chunkInfo.size())
@ -57,28 +80,73 @@ public:
 	void getStatistics(uint32& totalHeapSize, uint32& allocatedBytes) const
 	{
-		totalHeapSize = numHeapBytes;
+		totalHeapSize = m_numHeapBytes;
-		allocatedBytes = numAllocatedBytes;
+		allocatedBytes = m_numAllocatedBytes;
 	}
-	VkDevice m_device;
+  private:
 	uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
 	uint32 m_typeFilter{ 0xFFFFFFFF };
 	class VKRMemoryManager* m_vkrMemoryManager;
 	std::vector<ChunkInfo> m_list_chunkInfo;
 };
 // Chunked heap whose chunks are backed by VKRBuffer objects of a single buffer type.
 // Exposes alloc/free of the underlying ChunkedHeap plus accessors for the per-chunk Vulkan objects.
 class VkBufferChunkedHeap : private ChunkedHeap<>
 {
  public:
 	VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize)
 		: m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize)
 	{
 	}
 	~VkBufferChunkedHeap();

 	using ChunkedHeap::alloc;
 	using ChunkedHeap::free;

 	// mapped base pointer of the chunk, nullptr for an unknown chunk index
 	uint8* GetChunkPtr(uint32 index) const
 	{
 		return (index < m_chunkBuffers.size()) ? m_chunkBuffers[index]->GetPtr() : nullptr;
 	}

 	// Vulkan buffer and memory handle of the chunk, both VK_NULL_HANDLE for an unknown chunk index
 	void GetChunkVkMemInfo(uint32 index, VkBuffer& buffer, VkDeviceMemory& mem)
 	{
 		if (index < m_chunkBuffers.size())
 		{
 			VKRBuffer* const chunkBuffer = m_chunkBuffers[index];
 			buffer = chunkBuffer->GetVkBuffer();
 			mem = chunkBuffer->GetVkBufferMemory();
 		}
 		else
 		{
 			buffer = VK_NULL_HANDLE;
 			mem = VK_NULL_HANDLE;
 		}
 	}

 	// number of chunks, total heap bytes and bytes that are currently not allocated
 	void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
 	{
 		totalBufferSize = m_numHeapBytes;
 		freeBufferSize = m_numHeapBytes - m_numAllocatedBytes;
 		numBuffers = static_cast<uint32>(m_chunkBuffers.size());
 	}

 	// whether writes into the chunk need an explicit flush, false for an unknown chunk index
 	bool RequiresFlush(uint32 index) const
 	{
 		return index < m_chunkBuffers.size() && m_chunkBuffers[index]->RequiresFlush();
 	}

  private:
 	uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;

 	VKR_BUFFER_TYPE m_bufferType;
 	std::vector<VKRBuffer*> m_chunkBuffers;
 	size_t m_minimumBufferAllocationSize;
 };
 // a circular ring-buffer which tracks and releases memory per command-buffer
 class VKRSynchronizedRingAllocator
 {
 public:
-	enum class BUFFER_TYPE
+	VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
 	{
 		STAGING, // staging upload buffer
 		INDEX, // buffer for index data
 		STRIDE, // buffer for stride-adjusted vertex data
 	};
 	VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
 	VKRSynchronizedRingAllocator(const VKRSynchronizedRingAllocator&) = delete; // disallow copy
 	struct BufferSyncPoint_t
@ -126,13 +194,53 @@ private:
 	const class VulkanRenderer* m_vkr;
 	const class VKRMemoryManager* m_vkrMemMgr;
-	const BUFFER_TYPE m_bufferType;
+	const VKR_BUFFER_TYPE m_bufferType;
 	const uint32 m_minimumBufferAllocSize;
 	std::vector<AllocatorBuffer_t> m_buffers;
 };
 // heap style allocator with released memory being freed after the current command buffer finishes
 class VKRSynchronizedHeapAllocator
 {
 	// heap address of an allocation that has been handed out and not yet passed to FreeReservation()
 	struct TrackedAllocation
 	{
 		TrackedAllocation(CHAddr allocation) : allocation(allocation) {};
 		CHAddr allocation;
 	};
  public:
 	// bufferType and minimumBufferAllocSize are forwarded to the backing chunked heap
 	VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize);
 	VKRSynchronizedHeapAllocator(const VKRSynchronizedHeapAllocator&) = delete; // disallow copy
 	// describes one allocated range; objects are created by AllocateBufferMemory() and recycled by FreeReservation()
 	struct AllocatorReservation
 	{
 		VkBuffer vkBuffer; // buffer of the chunk containing the range
 		VkDeviceMemory vkMem; // device memory of the chunk containing the range
 		uint8* memPtr; // chunk base pointer plus bufferOffset
 		uint32 bufferOffset; // offset of the range inside the chunk
 		uint32 size; // size that was requested from AllocateBufferMemory()
 		uint32 bufferIndex; // index of the chunk inside the heap
 	};
 	// allocates a range from the heap and returns a reservation describing it
 	AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment);
 	// queues the range for release under the current command buffer id and recycles the reservation object
 	void FreeReservation(AllocatorReservation* uploadReservation);
 	// flushes the mapped range if the chunk holding it requires explicit flushes
 	void FlushReservation(AllocatorReservation* uploadReservation);
 	// frees all queued ranges whose command buffer id is <= latestFinishedCommandBufferId
 	void CleanupBuffer(uint64 latestFinishedCommandBufferId);
 	// forwards to the statistics of the backing chunked heap
 	void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const;
  private:
 	const class VKRMemoryManager* m_vkrMemMgr;
 	VkBufferChunkedHeap m_chunkedHeap;
 	// allocations
 	std::vector<TrackedAllocation> m_activeAllocations;
 	MemoryPool<AllocatorReservation> m_poolAllocatorReservation{32}; // NOTE(review): 32 is presumably the pool's initial/block capacity - confirm against MemoryPool
 	// release queue
 	// maps a command buffer id to the heap ranges that were released while it was current
 	std::unordered_map<uint64, std::vector<CHAddr>> m_releaseQueue;
 };
 void LatteIndices_invalidateAll();
 class VKRMemoryManager
@ -140,9 +248,9 @@ class VKRMemoryManager
 	friend class VKRSynchronizedRingAllocator;
 public:
 	VKRMemoryManager(class VulkanRenderer* renderer) :
-			m_stagingBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STAGING, 32u * 1024 * 1024),
+			m_stagingBuffer(renderer, this, VKR_BUFFER_TYPE::STAGING, 32u * 1024 * 1024),
-			m_indexBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::INDEX, 4u * 1024 * 1024),
+			m_indexBuffer(this, VKR_BUFFER_TYPE::INDEX, 4u * 1024 * 1024),
-			m_vertexStrideMetalBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STRIDE, 4u * 1024 * 1024)
+			m_vertexStrideMetalBuffer(renderer, this, VKR_BUFFER_TYPE::STRIDE, 4u * 1024 * 1024)
 	{
 		m_vkr = renderer;
 	}
@ -167,7 +275,7 @@ public:
 	}
 	VKRSynchronizedRingAllocator& getStagingAllocator() { return m_stagingBuffer; }; // allocator for texture/attribute/uniform uploads
-	VKRSynchronizedRingAllocator& getIndexAllocator() { return m_indexBuffer; }; // allocator for index data
+	VKRSynchronizedHeapAllocator& GetIndexAllocator() { return m_indexBuffer; }; // allocator for index data
 	VKRSynchronizedRingAllocator& getMetalStrideWorkaroundAllocator() { return m_vertexStrideMetalBuffer; }; // allocator for stride-adjusted vertex data
 	void cleanupBuffers(uint64 latestFinishedCommandBufferId)
@ -202,6 +310,6 @@ public:
 	private:
 		class VulkanRenderer* m_vkr;
 		VKRSynchronizedRingAllocator m_stagingBuffer;
-		VKRSynchronizedRingAllocator m_indexBuffer;
+		VKRSynchronizedHeapAllocator m_indexBuffer;
 		VKRSynchronizedRingAllocator m_vertexStrideMetalBuffer;
 };
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp
@ -681,6 +681,9 @@ VulkanRenderer::~VulkanRenderer()
 		vkDestroyDebugUtilsMessengerEXT(m_instance, m_debugCallback, nullptr);
 	}
 	// destroy memory manager
 	delete memoryManager;
 	// destroy instance, devices
 	if (m_instance != VK_NULL_HANDLE)
 	{
@ -692,9 +695,6 @@ VulkanRenderer::~VulkanRenderer()
 		vkDestroyInstance(m_instance, nullptr);
 	}
 	// destroy memory manager
 	delete memoryManager;
 	// crashes?
 	//glslang::FinalizeProcess();
 }
@ -3701,7 +3701,7 @@ void VulkanRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uin
 void VulkanRenderer::AppendOverlayDebugInfo()
 {
-	ImGui::Text("--- Vulkan info ---");
+	ImGui::Text("--- Vulkan debug info ---");
 	ImGui::Text("GfxPipelines   %u", performanceMonitor.vk.numGraphicPipelines.get());
 	ImGui::Text("DescriptorSets %u", performanceMonitor.vk.numDescriptorSets.get());
 	ImGui::Text("DS ImgSamplers %u", performanceMonitor.vk.numDescriptorSamplerTextures.get());
@ -3719,7 +3719,7 @@ void VulkanRenderer::AppendOverlayDebugInfo()
 	ImGui::Text("BeginRP/f      %u", performanceMonitor.vk.numBeginRenderpassPerFrame.get());
 	ImGui::Text("Barriers/f     %u", performanceMonitor.vk.numDrawBarriersPerFrame.get());
-	ImGui::Text("--- Cache info ---");
+	ImGui::Text("--- Cache debug info ---");
 	uint32 bufferCacheHeapSize = 0;
 	uint32 bufferCacheAllocationSize = 0;
@ -3739,7 +3739,7 @@ void VulkanRenderer::AppendOverlayDebugInfo()
 	ImGui::SameLine(60.0f);
 	ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
-	memoryManager->getIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
+	memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
 	ImGui::Text("Index");
 	ImGui::SameLine(60.0f);
 	ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h
@ -328,8 +328,9 @@ public:
 	RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) override;
-	void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override;
+	IndexAllocation indexData_reserveIndexMemory(uint32 size) override;
-	void indexData_uploadIndexMemory(uint32 offset, uint32 size) override;
+	void indexData_releaseIndexMemory(IndexAllocation& allocation) override;
 	void indexData_uploadIndexMemory(IndexAllocation& allocation) override;
 	// externally callable
 	void GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, sint32 width, sint32 height, FormatInfoVK* formatInfoOut);
--- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp
+++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRendererCore.cpp
@ -357,18 +357,20 @@ PipelineInfo* VulkanRenderer::draw_getOrCreateGraphicsPipeline(uint32 indexCount
 	return draw_createGraphicsPipeline(indexCount);
 }
-void* VulkanRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex)
+Renderer::IndexAllocation VulkanRenderer::indexData_reserveIndexMemory(uint32 size)
 {
-	auto& indexAllocator = this->memoryManager->getIndexAllocator();
+	VKRSynchronizedHeapAllocator::AllocatorReservation* resv = memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 32);
-	auto resv = indexAllocator.AllocateBufferMemory(size, 32);
+	return { resv->memPtr, resv };
 	offset = resv.bufferOffset;
 	bufferIndex = resv.bufferIndex;
 	return resv.memPtr;
 }
-void VulkanRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size)
+void VulkanRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation)
 {
-	// does nothing since the index buffer memory is coherent
+	memoryManager->GetIndexAllocator().FreeReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal);
 }
 // Hands the reservation stored in the index allocation handle to the index heap allocator
 // so the data written by the caller gets flushed.
 // allocation.rendererInternal is the AllocatorReservation* stored by indexData_reserveIndexMemory().
 void VulkanRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation)
 {
 	using Reservation = VKRSynchronizedHeapAllocator::AllocatorReservation;
 	Reservation* reservation = (Reservation*)allocation.rendererInternal;
 	auto& indexAllocator = memoryManager->GetIndexAllocator();
 	indexAllocator.FlushReservation(reservation);
 }
 float s_vkUniformData[512 * 4];
@ -1413,14 +1415,15 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
 	uint32 hostIndexCount;
 	uint32 indexMin = 0;
 	uint32 indexMax = 0;
-	uint32 indexBufferOffset = 0;
+	Renderer::IndexAllocation indexAllocation;
-	uint32 indexBufferIndex = 0;
+	LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation);
-	LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex);
+	VKRSynchronizedHeapAllocator::AllocatorReservation* indexReservation = (VKRSynchronizedHeapAllocator::AllocatorReservation*)indexAllocation.rendererInternal;
 	// update index binding
 	bool isPrevIndexData = false;
 	if (hostIndexType != INDEX_TYPE::NONE)
 	{
 		uint32 indexBufferIndex = indexReservation->bufferIndex;
 		uint32 indexBufferOffset = indexReservation->bufferOffset;
 		if (m_state.activeIndexBufferOffset != indexBufferOffset || m_state.activeIndexBufferIndex != indexBufferIndex || m_state.activeIndexType != hostIndexType)
 		{
 			m_state.activeIndexType = hostIndexType;
@ -1433,7 +1436,7 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
 				vkType = VK_INDEX_TYPE_UINT32;
 			else
 				cemu_assert(false);
-			vkCmdBindIndexBuffer(m_state.currentCommandBuffer, memoryManager->getIndexAllocator().GetBufferByIndex(indexBufferIndex), indexBufferOffset, vkType);
+			vkCmdBindIndexBuffer(m_state.currentCommandBuffer, indexReservation->vkBuffer, indexBufferOffset, vkType);
 		}
 		else
 			isPrevIndexData = true;
--- a/src/Common/precompiled.h
+++ b/src/Common/precompiled.h
@ -274,6 +274,25 @@ inline uint64 _udiv128(uint64 highDividend, uint64 lowDividend, uint64 divisor,
 	#define NOEXPORT __attribute__ ((visibility ("hidden")))
 #endif
 // FORCE_INLINE: requests inlining as strongly as the compiler allows.
 // Every variant already implies `inline`, so it must not be combined with an additional `inline` specifier
 // (a repeated decl-specifier is ill-formed and diagnosed by GCC/Clang).
 #if defined(_MSC_VER)
 #define FORCE_INLINE __forceinline
 #elif defined(__GNUC__) || defined(__clang__)
 #define FORCE_INLINE inline __attribute__((always_inline))
 #else
 #define FORCE_INLINE inline // unknown compiler: plain inline keeps header-defined functions ODR-safe
 #endif
 // Bit scan forward.
 // v: value to scan, must be non-zero
 // returns the index of the first bit set, counting from LSB. If v is 0 then result is undefined
 FORCE_INLINE int BSF(uint32 v)
 {
 #if defined(_MSC_VER)
 	return _tzcnt_u32(v); // TZCNT requires BMI1. But if not supported it will execute as BSF
 #elif defined(__GNUC__) || defined(__clang__)
 	return __builtin_ctz(v);
 #else
 	return std::countr_zero(v);
 #endif
 }
 // On aarch64 we handle some of the x86 intrinsics by implementing them as wrappers
 #if defined(__aarch64__)
--- a/src/util/ChunkedHeap/ChunkedHeap.h
+++ b/src/util/ChunkedHeap/ChunkedHeap.h
@ -1,35 +1,39 @@
 #pragma once
 #include <util/helpers/MemoryPool.h>
 // Address of an allocation inside a chunked heap.
 // An address with chunkIndex 0xFFFFFFFF is the invalid/"no allocation" value.
 struct CHAddr
 {
 	uint32 offset; // offset of the allocation inside its chunk
 	uint32 chunkIndex; // index of the chunk, 0xFFFFFFFF if invalid
 	void* internal; // AllocRange (set by the heap on allocation and read back when freeing), nullptr if none
-	CHAddr(uint32 _offset, uint32 _chunkIndex) : offset(_offset), chunkIndex(_chunkIndex) {};
+	CHAddr(uint32 _offset, uint32 _chunkIndex, void* internal = nullptr) : offset(_offset), chunkIndex(_chunkIndex), internal(internal) {};
 	// default state is the invalid address; internal is cleared as well so the heap never reads an indeterminate pointer
 	CHAddr() : offset(0xFFFFFFFF), chunkIndex(0xFFFFFFFF), internal(nullptr) {};
 	bool isValid() const { return chunkIndex != 0xFFFFFFFF; };
 	static CHAddr getInvalid() { return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); };
 };
 template<uint32 TMinimumAlignment = 32>
 class ChunkedHeap
 {
-	struct allocRange_t
+	struct AllocRange
 	{
-		allocRange_t* nextFree{};
+		AllocRange* nextFree{};
-		allocRange_t* prevFree{};
+		AllocRange* prevFree{};
-		allocRange_t* prevOrdered{};
+		AllocRange* prevOrdered{};
-		allocRange_t* nextOrdered{};
+		AllocRange* nextOrdered{};
 		uint32 offset;
 		uint32 chunkIndex;
 		uint32 size;
 		bool isFree;
-		allocRange_t(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
+		AllocRange(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
 	};
-	struct chunk_t
+	struct Chunk
 	{
-		std::unordered_map<uint32, allocRange_t*> map_allocatedRange;
+		uint32 size;
 	};
 public:
@ -47,45 +51,32 @@ public:
 		_free(addr);
 	}
-	virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
+	virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) = 0;
 	{
 		return 0;
 	}
 private:
 	unsigned ulog2(uint32 v)
 	{
-		static const unsigned MUL_DE_BRUIJN_BIT[] =
+		cemu_assert_debug(v != 0);
-		{
+		return 31 - std::countl_zero(v);
 		   0,  9,  1, 10, 13, 21,  2, 29, 11, 14, 16, 18, 22, 25,  3, 30,
 		   8, 12, 20, 28, 15, 17, 24,  7, 19, 27, 23,  6, 26,  5,  4, 31
 		};
 		v |= v >> 1;
 		v |= v >> 2;
 		v |= v >> 4;
 		v |= v >> 8;
 		v |= v >> 16;
 		return MUL_DE_BRUIJN_BIT[(v * 0x07C4ACDDu) >> 27];
 	}
-	void trackFreeRange(allocRange_t* range)
+	void trackFreeRange(AllocRange* range)
 	{
 		// Inserts a free range at the head of the free list of its size bucket.
 		// The bucket is chosen by ulog2(size), the bucket heads live in m_bucketFreeRange and
 		// the matching bit in m_bucketUseMask marks the bucket as non-empty.
 		// get index of msb
 		cemu_assert_debug(range->size != 0); // size of zero is not allowed
 		uint32 bucketIndex = ulog2(range->size);
 		// link in front of the current bucket head (if any)
-		range->nextFree = bucketFreeRange[bucketIndex];
+		range->nextFree = m_bucketFreeRange[bucketIndex];
-		if (bucketFreeRange[bucketIndex])
+		if (m_bucketFreeRange[bucketIndex])
-			bucketFreeRange[bucketIndex]->prevFree = range;
+			m_bucketFreeRange[bucketIndex]->prevFree = range;
 		range->prevFree = nullptr;
 		// the range becomes the new head of the bucket
-		bucketFreeRange[bucketIndex] = range;
+		m_bucketFreeRange[bucketIndex] = range;
 		m_bucketUseMask |= (1u << bucketIndex);
 	}
-	void forgetFreeRange(allocRange_t* range, uint32 bucketIndex)
+	void forgetFreeRange(AllocRange* range, uint32 bucketIndex)
 	{
-		allocRange_t* prevRange = range->prevFree;
+		AllocRange* prevRange = range->prevFree;
-		allocRange_t* nextRange = range->nextFree;
+		AllocRange* nextRange = range->nextFree;
 		if (prevRange)
 		{
 			prevRange->nextFree = nextRange;
@ -94,36 +85,42 @@ private:
 		}
 		else
 		{
-			if (bucketFreeRange[bucketIndex] != range)
+			cemu_assert_debug(m_bucketFreeRange[bucketIndex] == range);
-				assert_dbg();
+			m_bucketFreeRange[bucketIndex] = nextRange;
 			bucketFreeRange[bucketIndex] = nextRange;
 			if (nextRange)
 				nextRange->prevFree = nullptr;
 			else
 				m_bucketUseMask &= ~(1u << bucketIndex);
 		}
 	}
 	// Adds a new chunk to the heap.
 	// minimumAllocationSize is forwarded unchanged to allocateNewChunk(), which reports the size of the new chunk.
 	// Returns false if allocateNewChunk() reported a size of zero, otherwise the whole chunk is registered
 	// as a single free range and true is returned.
 	bool allocateChunk(uint32 minimumAllocationSize)
 	{
-		uint32 chunkIndex = (uint32)list_chunks.size();
+		uint32 chunkIndex = (uint32)m_chunks.size();
 		// NOTE(review): the chunk entry is appended before allocateNewChunk() runs and is not removed again on failure - confirm this is intended
-		list_chunks.emplace_back(new chunk_t());
+		m_chunks.emplace_back();
 		uint32 chunkSize = allocateNewChunk(chunkIndex, minimumAllocationSize);
 		cemu_assert_debug((chunkSize%TMinimumAlignment) == 0); // chunk size should be a multiple of the minimum alignment
 		if (chunkSize == 0)
 			return false;
-		allocRange_t* range = new allocRange_t(0, chunkIndex, chunkSize, true);
 		// sizes of 2^31 and above would fall into bucket 31, which is reserved (see m_bucketUseMask / m_bucketFreeRange)
+		cemu_assert_debug(chunkSize < 0x80000000u); // chunk size must be below 2GB
 		// a single free range starting at offset 0 covers the whole new chunk
 		AllocRange* range = m_allocEntriesPool.allocObj(0, chunkIndex, chunkSize, true);
 		trackFreeRange(range);
-		numHeapBytes += chunkSize;
+		m_numHeapBytes += chunkSize;
 		return true;
 	}
-	void _allocFrom(allocRange_t* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
+	void _allocFrom(AllocRange* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
 	{
 		cemu_assert_debug(allocSize > 0);
 		// remove the range from the chain of free ranges
 		forgetFreeRange(range, bucketIndex);
 		// split head, allocation and tail into separate ranges
-		if (allocOffset > range->offset)
+		uint32 headBytes = allocOffset - range->offset;
 		if (headBytes > 0)
 		{
 			// alignment padding -> create free range
-			allocRange_t* head = new allocRange_t(range->offset, range->chunkIndex, allocOffset - range->offset, true);
+			cemu_assert_debug(headBytes >= TMinimumAlignment);
 			AllocRange* head = m_allocEntriesPool.allocObj(range->offset, range->chunkIndex, headBytes, true);
 			trackFreeRange(head);
 			if (range->prevOrdered)
 				range->prevOrdered->nextOrdered = head;
@ -131,10 +128,12 @@ private:
 			head->nextOrdered = range;
 			range->prevOrdered = head;
 		}
-		if ((allocOffset + allocSize) < (range->offset + range->size)) // todo - create only if it's more than a couple of bytes?
+		uint32 tailBytes = (range->offset + range->size) - (allocOffset + allocSize);
 		if (tailBytes > 0)
 		{
 			// tail -> create free range
-			allocRange_t* tail = new allocRange_t((allocOffset + allocSize), range->chunkIndex, (range->offset + range->size) - (allocOffset + allocSize), true);
+			cemu_assert_debug(tailBytes >= TMinimumAlignment);
 			AllocRange* tail = m_allocEntriesPool.allocObj((allocOffset + allocSize), range->chunkIndex, tailBytes, true);
 			trackFreeRange(tail);
 			if (range->nextOrdered)
 				range->nextOrdered->prevOrdered = tail;
@ -149,36 +148,51 @@ private:
 	CHAddr _alloc(uint32 size, uint32 alignment)
 	{
 		cemu_assert_debug(size <= (0x7FFFFFFFu-TMinimumAlignment));
 		// make sure size is not zero and align it
 		if(size == 0) [[unlikely]]
 			size = TMinimumAlignment;
 		else
 			size = (size + (TMinimumAlignment - 1)) & ~(TMinimumAlignment - 1);
 		// find smallest bucket to scan
 		uint32 alignmentM1 = alignment - 1;
 		uint32 bucketIndex = ulog2(size);
-		while (bucketIndex < 32)
+		// check if the bucket is available
 		if( !(m_bucketUseMask & (1u << bucketIndex)) )
 		{
-			allocRange_t* range = bucketFreeRange[bucketIndex];
+			// skip to next non-empty bucket
 			uint32 nextIndex = BSF(m_bucketUseMask>>bucketIndex);
 			bucketIndex += nextIndex;
 		}
 		while (bucketIndex < 31)
 		{
 			AllocRange* range = m_bucketFreeRange[bucketIndex];
 			while (range)
 			{
 				if (range->size >= size)
 				{
 					// verify if aligned allocation fits
 					uint32 alignedOffset = (range->offset + alignmentM1) & ~alignmentM1;
-					uint32 alignmentLoss = alignedOffset - range->offset;
+					uint32 endOffset = alignedOffset + size;
-					if (alignmentLoss < range->size && (range->size - alignmentLoss) >= size)
+					if((range->offset+range->size) >= endOffset)
 					{
 						_allocFrom(range, bucketIndex, alignedOffset, size);
-						list_chunks[range->chunkIndex]->map_allocatedRange.emplace(alignedOffset, range);
+						m_numAllocatedBytes += size;
-						numAllocatedBytes += size;
+						return CHAddr(alignedOffset, range->chunkIndex, range);
 						return CHAddr(alignedOffset, range->chunkIndex);
 					}
 				}
 				range = range->nextFree;
 			}
-			bucketIndex++; // try higher bucket
+			// check next non-empty bucket or skip to end
 			bucketIndex++;
 			uint32 emptyBuckets = BSF(m_bucketUseMask>>bucketIndex);
 			bucketIndex += emptyBuckets;
 		}
-		if(allocationLimitReached)
+		if(m_allocationLimitReached)
 			return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
 		if (!allocateChunk(size))
 		{
-			allocationLimitReached = true;
+			m_allocationLimitReached = true;
 			return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
 		}
 		return _alloc(size, alignment);
@ -186,24 +200,16 @@ private:
 	void _free(CHAddr addr)
 	{
-		auto it = list_chunks[addr.chunkIndex]->map_allocatedRange.find(addr.offset);
+		if(!addr.internal)
 		if (it == list_chunks[addr.chunkIndex]->map_allocatedRange.end())
 		{
 			cemuLog_log(LogType::Force, "Internal heap error. {:08x} {:08x}", addr.chunkIndex, addr.offset);
 			cemuLog_log(LogType::Force, "Debug info:");
 			for (auto& rangeItr : list_chunks[addr.chunkIndex]->map_allocatedRange)
 			{
 				cemuLog_log(LogType::Force, "{:08x} {:08x}", rangeItr.second->offset, rangeItr.second->size);
 			}
 			return;
 		}
-
+		AllocRange* range = (AllocRange*)addr.internal;
-		allocRange_t* range = it->second;
+		m_numAllocatedBytes -= range->size;
 		numAllocatedBytes -= it->second->size;
 		list_chunks[range->chunkIndex]->map_allocatedRange.erase(it);
 		// try merge left or right
-		allocRange_t* prevRange = range->prevOrdered;
+		AllocRange* prevRange = range->prevOrdered;
-		allocRange_t* nextRange = range->nextOrdered;
+		AllocRange* nextRange = range->nextOrdered;
 		if (prevRange && prevRange->isFree)
 		{
 			if (nextRange && nextRange->isFree)
@ -216,8 +222,8 @@ private:
 				forgetFreeRange(prevRange, ulog2(prevRange->size));
 				prevRange->size = newSize;
 				trackFreeRange(prevRange);
-				delete range;
+				m_allocEntriesPool.freeObj(range);
-				delete nextRange;
+				m_allocEntriesPool.freeObj(nextRange);
 			}
 			else
 			{
@ -228,7 +234,7 @@ private:
 				forgetFreeRange(prevRange, ulog2(prevRange->size));
 				prevRange->size = newSize;
 				trackFreeRange(prevRange);
-				delete range;
+				m_allocEntriesPool.freeObj(range);
 			}
 		}
 		else if (nextRange && nextRange->isFree)
@ -242,7 +248,7 @@ private:
 				range->prevOrdered->nextOrdered = nextRange;
 			nextRange->prevOrdered = range->prevOrdered;
 			trackFreeRange(nextRange);
-			delete range;
+			m_allocEntriesPool.freeObj(range);
 		}
 		else
 		{
@ -265,7 +271,7 @@ private:
 		for (uint32 i = 0; i < 32; i++)
 		{
-			allocRange_t* ar = bucketFreeRange[i];
+			AllocRange* ar = m_bucketFreeRange[i];
 			while (ar)
 			{
 				availableRange_t dbgRange;
@ -278,7 +284,7 @@ private:
 					if (itr.chunkIndex != dbgRange.chunkIndex)
 						continue;
 					if (itr.offset < (dbgRange.offset + dbgRange.size) && (itr.offset + itr.size) >(dbgRange.offset))
-						assert_dbg();
+						cemu_assert_error();
 				}
 				availRanges.emplace_back(dbgRange);
@ -290,14 +296,16 @@ private:
 	}
 private:
-	std::vector<chunk_t*> list_chunks;
+	std::vector<Chunk> m_chunks;
-	allocRange_t* bucketFreeRange[32]{};
+	uint32 m_bucketUseMask{0x80000000}; // bitmask indicating non-empty buckets. MSB always set to provide an upper bound for BSF instruction
-	bool allocationLimitReached = false;
+	AllocRange* m_bucketFreeRange[32]{}; // we are only using 31 entries since the MSB is reserved (thus chunks equal or larger than 2^31 are not allowed)
 	bool m_allocationLimitReached = false;
 	MemoryPool<AllocRange> m_allocEntriesPool{64};
 public:
 	// statistics
-	uint32 numHeapBytes{}; // total size of the heap
+	uint32 m_numHeapBytes{}; // total size of the heap
-	uint32 numAllocatedBytes{};
+	uint32 m_numAllocatedBytes{};
 };
 class VGenericHeap
@ -633,7 +641,7 @@ public:
 	uint32 getCurrentBlockOffset() const { return m_currentBlockOffset; }
 	uint8* getCurrentBlockPtr() const { return m_currentBlockPtr; }
-	
+
 private:
 	void allocateAdditionalChunk()
 	{