Mirror of https://github.com/cemu-project/Cemu.git (synced 2025-02-02 11:52:35 +01:00)

Latte: Implement better index caching (#1443)

parent 1923b7a7c4
commit 8dd809d725
@@ -141,6 +141,14 @@ private:

 void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx);

+// called whenever the GPU runs out of commands or hits a wait condition (semaphores, HLE waits)
+void LatteCP_signalEnterWait()
+{
+	// based on the assumption that games won't do a rugpull and swap out buffer data in the middle of an uninterrupted sequence of drawcalls,
+	// we only flush caches when the GPU goes idle or has to wait for any operation
+	LatteIndices_invalidateAll();
+}
+
 /*
  * Read a U32 from the command buffer
  * If no data is available then wait in a busy loop

@@ -466,6 +474,8 @@ LatteCMDPtr LatteCP_itWaitRegMem(LatteCMDPtr cmd, uint32 nWords)
 	const uint32 GPU7_WAIT_MEM_OP_GREATER = 6;
 	const uint32 GPU7_WAIT_MEM_OP_NEVER = 7;

+	LatteCP_signalEnterWait();
+
 	bool stalls = false;
 	if ((word0 & 0x10) != 0)
 	{

@@ -594,6 +604,7 @@ LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords)
 	else if(SEM_SIGNAL == 7)
 	{
 		// wait
+		LatteCP_signalEnterWait();
 		size_t loopCount = 0;
 		while (true)
 		{

@@ -1305,11 +1316,13 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
 		}
 		case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
 		{
+			LatteCP_signalEnterWait();
 			LatteCP_itHLESwapScanBuffer(cmdData, nWords);
 			break;
 		}
 		case IT_HLE_WAIT_FOR_FLIP:
 		{
+			LatteCP_signalEnterWait();
 			LatteCP_itHLEWaitForFlip(cmdData, nWords);
 			break;
 		}

@@ -1594,12 +1607,14 @@ void LatteCP_ProcessRingbuffer()
 		}
 		case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
 		{
+			LatteCP_signalEnterWait();
 			LatteCP_itHLESwapScanBuffer(cmd, nWords);
 			timerRecheck += CP_TIMER_RECHECK / 64;
 			break;
 		}
 		case IT_HLE_WAIT_FOR_FLIP:
 		{
+			LatteCP_signalEnterWait();
 			LatteCP_itHLEWaitForFlip(cmd, nWords);
 			timerRecheck += CP_TIMER_RECHECK / 1;
 			break;
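Taken together, these hooks mean the index cache is only dropped at points where the command stream is guaranteed not to be in the middle of an uninterrupted run of draw calls: wait-reg-mem packets, semaphore waits, scan-buffer swaps and wait-for-flip. A minimal sketch of that pattern, using hypothetical type and function names rather than the actual Cemu ones:

#include <cstdint>

// Sketch (not Cemu code): a cache that is only flushed when the command
// processor reaches a wait point, mirroring what LatteCP_signalEnterWait() does.
struct IndexCacheSketch
{
	uint64_t generation = 0;          // bumping this invalidates all cached entries
	void InvalidateAll() { generation++; }
};

enum class Packet { Draw, WaitRegMem, SemaphoreWait, SwapScanBuffer, WaitForFlip };

inline void ProcessPacket(IndexCacheSketch& cache, Packet p)
{
	switch (p)
	{
	case Packet::WaitRegMem:
	case Packet::SemaphoreWait:
	case Packet::SwapScanBuffer:
	case Packet::WaitForFlip:
		cache.InvalidateAll(); // safe point: no draw sequence spans across a wait
		break;
	case Packet::Draw:
		break; // the cache stays valid across consecutive draws
	}
}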
@@ -1,6 +1,7 @@
 #include "Cafe/HW/Latte/Core/LatteConst.h"
 #include "Cafe/HW/Latte/Renderer/Renderer.h"
 #include "Cafe/HW/Latte/ISA/RegDefines.h"
+#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
 #include "Common/cpu_features.h"

 #if defined(ARCH_X86_64) && defined(__GNUC__)

@@ -9,32 +10,53 @@

 struct
 {
+	struct CacheEntry
+	{
+	// input data
 	const void* lastPtr;
 	uint32 lastCount;
 	LattePrimitiveMode lastPrimitiveMode;
 	LatteIndexType lastIndexType;
+	uint64 lastUsed;
 	// output
 	uint32 indexMin;
 	uint32 indexMax;
 	Renderer::INDEX_TYPE renderIndexType;
 	uint32 outputCount;
-	uint32 indexBufferOffset;
-	uint32 indexBufferIndex;
+	Renderer::IndexAllocation indexAllocation;
+	};
+	std::array<CacheEntry, 8> entry;
+	uint64 currentUsageCounter{0};
 }LatteIndexCache{};

 void LatteIndices_invalidate(const void* memPtr, uint32 size)
 {
-	if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) )
+	for(auto& entry : LatteIndexCache.entry)
 	{
-		LatteIndexCache.lastPtr = nullptr;
-		LatteIndexCache.lastCount = 0;
+		if (entry.lastPtr >= memPtr && (entry.lastPtr < ((uint8*)memPtr + size)) )
+		{
+			if(entry.lastPtr != nullptr)
+				g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
+			entry.lastPtr = nullptr;
+			entry.lastCount = 0;
+		}
 	}
 }

 void LatteIndices_invalidateAll()
 {
-	LatteIndexCache.lastPtr = nullptr;
-	LatteIndexCache.lastCount = 0;
+	for(auto& entry : LatteIndexCache.entry)
+	{
+		if (entry.lastPtr != nullptr)
+			g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
+		entry.lastPtr = nullptr;
+		entry.lastCount = 0;
+	}
+}
+
+uint64 LatteIndices_GetNextUsageIndex()
+{
+	return LatteIndexCache.currentUsageCounter++;
 }

 uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count)

@@ -532,7 +554,7 @@ void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIn
 	}
 }

-void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex)
+void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation)
 {
 	// what this should do:
 	// [x] use fast SIMD-based index decoding

@@ -542,17 +564,18 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
 	// [ ] better cache implementation, allow to cache across frames

 	// reuse from cache if data didn't change
-	if (LatteIndexCache.lastPtr == indexData &&
-		LatteIndexCache.lastCount == count &&
-		LatteIndexCache.lastPrimitiveMode == primitiveMode &&
-		LatteIndexCache.lastIndexType == indexType)
+	auto cacheEntry = std::find_if(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [indexData, count, primitiveMode, indexType](const auto& entry)
 	{
-		indexMin = LatteIndexCache.indexMin;
-		indexMax = LatteIndexCache.indexMax;
-		renderIndexType = LatteIndexCache.renderIndexType;
-		outputCount = LatteIndexCache.outputCount;
-		indexBufferOffset = LatteIndexCache.indexBufferOffset;
-		indexBufferIndex = LatteIndexCache.indexBufferIndex;
+		return entry.lastPtr == indexData && entry.lastCount == count && entry.lastPrimitiveMode == primitiveMode && entry.lastIndexType == indexType;
+	});
+	if (cacheEntry != LatteIndexCache.entry.end())
+	{
+		indexMin = cacheEntry->indexMin;
+		indexMax = cacheEntry->indexMax;
+		renderIndexType = cacheEntry->renderIndexType;
+		outputCount = cacheEntry->outputCount;
+		indexAllocation = cacheEntry->indexAllocation;
+		cacheEntry->lastUsed = LatteIndices_GetNextUsageIndex();
 		return;
 	}

@@ -576,10 +599,12 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
 		indexMin = 0;
 		indexMax = std::max(count, 1u)-1;
 		renderIndexType = Renderer::INDEX_TYPE::NONE;
+		indexAllocation = {};
 		return; // no indices
 	}
 	// query index buffer from renderer
-	void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex);
+	indexAllocation = g_renderer->indexData_reserveIndexMemory(indexOutputSize);
+	void* indexOutputPtr = indexAllocation.mem;

 	// decode indices
 	indexMin = std::numeric_limits<uint32>::max();

@@ -704,16 +729,25 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
 		// recalculate index range but filter out primitive restart index
 		LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax);
 	}
-	g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize);
+	g_renderer->indexData_uploadIndexMemory(indexAllocation);
+	performanceMonitor.cycle[performanceMonitor.cycleIndex].indexDataUploaded += indexOutputSize;
+	// get least recently used cache entry
+	auto lruEntry = std::min_element(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [](const auto& a, const auto& b)
+	{
+		return a.lastUsed < b.lastUsed;
+	});
+	// invalidate previous allocation
+	if(lruEntry->lastPtr != nullptr)
+		g_renderer->indexData_releaseIndexMemory(lruEntry->indexAllocation);
 	// update cache
-	LatteIndexCache.lastPtr = indexData;
-	LatteIndexCache.lastCount = count;
-	LatteIndexCache.lastPrimitiveMode = primitiveMode;
-	LatteIndexCache.lastIndexType = indexType;
-	LatteIndexCache.indexMin = indexMin;
-	LatteIndexCache.indexMax = indexMax;
-	LatteIndexCache.renderIndexType = renderIndexType;
-	LatteIndexCache.outputCount = outputCount;
-	LatteIndexCache.indexBufferOffset = indexBufferOffset;
-	LatteIndexCache.indexBufferIndex = indexBufferIndex;
+	lruEntry->lastPtr = indexData;
+	lruEntry->lastCount = count;
+	lruEntry->lastPrimitiveMode = primitiveMode;
+	lruEntry->lastIndexType = indexType;
+	lruEntry->indexMin = indexMin;
+	lruEntry->indexMax = indexMax;
+	lruEntry->renderIndexType = renderIndexType;
+	lruEntry->outputCount = outputCount;
+	lruEntry->indexAllocation = indexAllocation;
+	lruEntry->lastUsed = LatteIndices_GetNextUsageIndex();
 }
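The cache itself grows from a single entry to eight entries managed as an LRU set: every hit or refill stamps the entry with a monotonically increasing usage counter, lookup is a linear std::find_if over the array, and the refill path evicts the entry with the smallest stamp via std::min_element. The same pattern in isolation, as a hedged sketch rather than the real LatteIndexCache:

#include <algorithm>
#include <array>
#include <cstdint>

// Minimal sketch of a fixed-size LRU cache driven by a usage counter (illustrative only).
struct SmallLruCache
{
	struct Entry
	{
		const void* key = nullptr; // identifies the cached input
		uint64_t lastUsed = 0;     // usage stamp, 0 = never used
		int value = 0;             // whatever derived data is being cached
	};

	std::array<Entry, 8> entries{};
	uint64_t usageCounter = 1;

	Entry* Lookup(const void* key)
	{
		auto it = std::find_if(entries.begin(), entries.end(),
			[key](const Entry& e) { return e.key == key; });
		if (it == entries.end())
			return nullptr;
		it->lastUsed = usageCounter++; // refresh recency on a hit
		return &*it;
	}

	Entry& EvictAndReuse()
	{
		// least recently used = smallest lastUsed stamp (never-used entries evict first)
		auto it = std::min_element(entries.begin(), entries.end(),
			[](const Entry& a, const Entry& b) { return a.lastUsed < b.lastUsed; });
		*it = Entry{};
		it->lastUsed = usageCounter++;
		return *it;
	}
};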
@@ -4,4 +4,4 @@

 void LatteIndices_invalidate(const void* memPtr, uint32 size);
 void LatteIndices_invalidateAll();
-void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex);
+void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation);

@@ -107,7 +107,13 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
 	ImGui::Text("VRAM: %dMB / %dMB", g_state.vramUsage, g_state.vramTotal);

 	if (config.overlay.debug)
+	{
+		// general debug info
+		ImGui::Text("--- Debug info ---");
+		ImGui::Text("IndexUploadPerFrame: %dKB", (performanceMonitor.stats.indexDataUploadPerFrame+1023)/1024);
+		// backend specific info
 		g_renderer->AppendOverlayDebugInfo();
+	}

 	position.y += (ImGui::GetWindowSize().y + 10.0f) * direction;
 }

@@ -74,7 +74,6 @@ void LattePerformanceMonitor_frameEnd()
 	uniformBankDataUploadedPerFrame /= 1024ULL;
 	uint32 uniformBankCountUploadedPerFrame = (uint32)(uniformBankUploadedCount / (uint64)elapsedFrames);
 	uint64 indexDataUploadPerFrame = (indexDataUploaded / (uint64)elapsedFrames);
-	indexDataUploadPerFrame /= 1024ULL;

 	double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
 	uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;

@@ -82,7 +81,7 @@ void LattePerformanceMonitor_frameEnd()
 	uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
 	uint32 tlps = (uint32)((uint64)threadLeaveCount * 1000ULL / (uint64)totalElapsedTime);
 	// set stats
+	performanceMonitor.stats.indexDataUploadPerFrame = indexDataUploadPerFrame;
 	// next counter cycle
 	sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
 	performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;

@@ -132,6 +132,12 @@ typedef struct
 	LattePerfStatCounter numDrawBarriersPerFrame;
 	LattePerfStatCounter numBeginRenderpassPerFrame;
 	}vk;

+	// calculated stats (per frame)
+	struct
+	{
+		uint32 indexDataUploadPerFrame;
+	}stats;
 }performanceMonitor_t;

 extern performanceMonitor_t performanceMonitor;

@@ -11,7 +11,6 @@
 #include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
 #include "Cafe/GraphicPack/GraphicPack2.h"
 #include "config/ActiveSettings.h"
-#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h"
 #include "gui/guiWrapper.h"
 #include "Cafe/OS/libs/erreula/erreula.h"
 #include "input/InputManager.h"

@@ -102,16 +102,21 @@ public:
 	static void SetAttributeArrayState(uint32 index, bool isEnabled, sint32 aluDivisor);
 	static void SetArrayElementBuffer(GLuint arrayElementBuffer);

-	// index
-	void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override
+	// index (not used by OpenGL renderer yet)
+	IndexAllocation indexData_reserveIndexMemory(uint32 size) override
 	{
-		assert_dbg();
-		return nullptr;
+		cemu_assert_unimplemented();
+		return {};
 	}

-	void indexData_uploadIndexMemory(uint32 offset, uint32 size) override
+	void indexData_releaseIndexMemory(IndexAllocation& allocation) override
 	{
-		assert_dbg();
+		cemu_assert_unimplemented();
+	}
+
+	void indexData_uploadIndexMemory(IndexAllocation& allocation) override
+	{
+		cemu_assert_unimplemented();
 	}

 	// uniform

@@ -138,8 +138,15 @@ public:
 	virtual void draw_endSequence() = 0;

 	// index
-	virtual void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) = 0;
-	virtual void indexData_uploadIndexMemory(uint32 offset, uint32 size) = 0;
+	struct IndexAllocation
+	{
+		void* mem; // pointer to index data inside buffer
+		void* rendererInternal; // for renderer use
+	};
+
+	virtual IndexAllocation indexData_reserveIndexMemory(uint32 size) = 0;
+	virtual void indexData_releaseIndexMemory(IndexAllocation& allocation) = 0;
+	virtual void indexData_uploadIndexMemory(IndexAllocation& allocation) = 0;

 	// occlusion queries
 	virtual LatteQueryObject* occlusionQuery_create() = 0;
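The renderer interface change replaces the raw (offset, bufferIndex) pair with an opaque IndexAllocation that carries a CPU-visible pointer plus a renderer-internal handle, and splits the lifecycle into reserve, upload and release. A rough caller-side sketch against the declarations above (illustrative only; LatteIndices_decode and the Vulkan backend are the real callers, and the helper below is hypothetical):

#include <cstdint>
#include <cstring>

// Assumes the Renderer interface declared above is available.
void UploadIndicesExample(Renderer* renderer, const uint16_t* srcIndices, uint32_t count)
{
	uint32_t byteSize = count * sizeof(uint16_t);
	Renderer::IndexAllocation alloc = renderer->indexData_reserveIndexMemory(byteSize);
	memcpy(alloc.mem, srcIndices, byteSize);       // write decoded indices into the buffer
	renderer->indexData_uploadIndexMemory(alloc);  // flush if the memory is not host-coherent
	// ... record the draw that consumes these indices ...
	renderer->indexData_releaseIndexMemory(alloc); // return the range once the cache drops it
}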
@@ -23,11 +23,11 @@ void VKRSynchronizedRingAllocator::allocateAdditionalUploadBuffer(uint32 sizeReq
 	AllocatorBuffer_t newBuffer{};
 	newBuffer.writeIndex = 0;
 	newBuffer.basePtr = nullptr;
-	if (m_bufferType == BUFFER_TYPE::STAGING)
+	if (m_bufferType == VKR_BUFFER_TYPE::STAGING)
 		m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
-	else if (m_bufferType == BUFFER_TYPE::INDEX)
+	else if (m_bufferType == VKR_BUFFER_TYPE::INDEX)
 		m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
-	else if (m_bufferType == BUFFER_TYPE::STRIDE)
+	else if (m_bufferType == VKR_BUFFER_TYPE::STRIDE)
 		m_vkrMemMgr->CreateBuffer(bufferAllocSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, newBuffer.vk_buffer, newBuffer.vk_mem);
 	else
 		cemu_assert_debug(false);

@@ -100,7 +100,7 @@ VKRSynchronizedRingAllocator::AllocatorReservation_t VKRSynchronizedRingAllocato

 void VKRSynchronizedRingAllocator::FlushReservation(AllocatorReservation_t& uploadReservation)
 {
-	cemu_assert_debug(m_bufferType == BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent
+	cemu_assert_debug(m_bufferType == VKR_BUFFER_TYPE::STAGING); // only the staging buffer isn't coherent
 	// todo - use nonCoherentAtomSize for flush size (instead of hardcoded constant)
 	VkMappedMemoryRange flushedRange{};
 	flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;

@@ -167,6 +167,70 @@ void VKRSynchronizedRingAllocator::GetStats(uint32& numBuffers, size_t& totalBuf
 	}
 }

+/* VKRSynchronizedHeapAllocator */
+
+VKRSynchronizedHeapAllocator::VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize)
+	: m_vkrMemMgr(vkMemoryManager), m_chunkedHeap(bufferType, minimumBufferAllocSize) {};
+
+VKRSynchronizedHeapAllocator::AllocatorReservation* VKRSynchronizedHeapAllocator::AllocateBufferMemory(uint32 size, uint32 alignment)
+{
+	CHAddr addr = m_chunkedHeap.alloc(size, alignment);
+	m_activeAllocations.emplace_back(addr);
+	AllocatorReservation* res = m_poolAllocatorReservation.allocObj();
+	res->bufferIndex = addr.chunkIndex;
+	res->bufferOffset = addr.offset;
+	res->size = size;
+	res->memPtr = m_chunkedHeap.GetChunkPtr(addr.chunkIndex) + addr.offset;
+	m_chunkedHeap.GetChunkVkMemInfo(addr.chunkIndex, res->vkBuffer, res->vkMem);
+	return res;
+}
+
+void VKRSynchronizedHeapAllocator::FreeReservation(AllocatorReservation* uploadReservation)
+{
+	// put the allocation on a delayed release queue for the current command buffer
+	uint64 currentCommandBufferId = VulkanRenderer::GetInstance()->GetCurrentCommandBufferId();
+	auto it = std::find_if(m_activeAllocations.begin(), m_activeAllocations.end(), [&uploadReservation](const TrackedAllocation& allocation) { return allocation.allocation.chunkIndex == uploadReservation->bufferIndex && allocation.allocation.offset == uploadReservation->bufferOffset; });
+	cemu_assert_debug(it != m_activeAllocations.end());
+	m_releaseQueue[currentCommandBufferId].emplace_back(it->allocation);
+	m_activeAllocations.erase(it);
+	m_poolAllocatorReservation.freeObj(uploadReservation);
+}
+
+void VKRSynchronizedHeapAllocator::FlushReservation(AllocatorReservation* uploadReservation)
+{
+	if (m_chunkedHeap.RequiresFlush(uploadReservation->bufferIndex))
+	{
+		VkMappedMemoryRange flushedRange{};
+		flushedRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
+		flushedRange.memory = uploadReservation->vkMem;
+		flushedRange.offset = uploadReservation->bufferOffset;
+		flushedRange.size = uploadReservation->size;
+		vkFlushMappedMemoryRanges(VulkanRenderer::GetInstance()->GetLogicalDevice(), 1, &flushedRange);
+	}
+}
+
+void VKRSynchronizedHeapAllocator::CleanupBuffer(uint64 latestFinishedCommandBufferId)
+{
+	auto it = m_releaseQueue.begin();
+	while (it != m_releaseQueue.end())
+	{
+		if (it->first <= latestFinishedCommandBufferId)
+		{
+			// release allocations
+			for(auto& addr : it->second)
+				m_chunkedHeap.free(addr);
+			it = m_releaseQueue.erase(it);
+			continue;
+		}
+		it++;
+	}
+}
+
+void VKRSynchronizedHeapAllocator::GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
+{
+	m_chunkedHeap.GetStats(numBuffers, totalBufferSize, freeBufferSize);
+}
+
 /* VkTextureChunkedHeap */

 uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
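The key synchronization idea in VKRSynchronizedHeapAllocator is that a freed range cannot be reused immediately, because a command buffer still in flight may read from it; the range is instead queued under the current command buffer id and only returned to the heap once that command buffer is known to have finished. A generic sketch of that deferred-release scheme (a simplified assumption, not the Cemu classes):

#include <cstdint>
#include <map>
#include <vector>

// Frees are queued under the id of the command buffer that may still read the memory,
// and only handed back to the underlying heap once that command buffer has retired.
template<typename Allocation>
class DeferredReleaseQueue
{
public:
	void QueueRelease(uint64_t currentCommandBufferId, const Allocation& alloc)
	{
		m_pending[currentCommandBufferId].push_back(alloc);
	}

	template<typename FreeFn>
	void Collect(uint64_t latestFinishedCommandBufferId, FreeFn&& freeFn)
	{
		auto it = m_pending.begin();
		while (it != m_pending.end() && it->first <= latestFinishedCommandBufferId)
		{
			for (const Allocation& a : it->second)
				freeFn(a); // safe to reuse now
			it = m_pending.erase(it);
		}
	}

private:
	std::map<uint64_t, std::vector<Allocation>> m_pending; // ordered by command buffer id
};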
@@ -189,8 +253,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 	std::vector<uint32> deviceLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
 	std::vector<uint32> hostLocalMemoryTypeIndices = m_vkrMemoryManager->FindMemoryTypes(m_typeFilter, 0);
 	// remove device local memory types from host local vector
-	auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) ->bool
-	{
+	auto pred = [&deviceLocalMemoryTypeIndices](const uint32& v) -> bool {
 		return std::find(deviceLocalMemoryTypeIndices.begin(), deviceLocalMemoryTypeIndices.end(), v) != deviceLocalMemoryTypeIndices.end();
 	};
 	hostLocalMemoryTypeIndices.erase(std::remove_if(hostLocalMemoryTypeIndices.begin(), hostLocalMemoryTypeIndices.end(), pred), hostLocalMemoryTypeIndices.end());

@@ -206,7 +269,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 		allocInfo.memoryTypeIndex = memType;

 		VkDeviceMemory imageMemory;
-		VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory);
+		VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory);
 		if (r != VK_SUCCESS)
 			continue;
 		m_list_chunkInfo[chunkIndex].mem = imageMemory;

@@ -221,7 +284,7 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 		allocInfo.memoryTypeIndex = memType;

 		VkDeviceMemory imageMemory;
-		VkResult r = vkAllocateMemory(m_device, &allocInfo, nullptr, &imageMemory);
+		VkResult r = vkAllocateMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), &allocInfo, nullptr, &imageMemory);
 		if (r != VK_SUCCESS)
 			continue;
 		m_list_chunkInfo[chunkIndex].mem = imageMemory;

@@ -238,6 +301,68 @@ uint32 VkTextureChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumA
 	return 0;
 }

+/* VkBufferChunkedHeap */
+
+VKRBuffer* VKRBuffer::Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties)
+{
+	auto* memMgr = VulkanRenderer::GetInstance()->GetMemoryManager();
+	VkBuffer buffer;
+	VkDeviceMemory bufferMemory;
+	bool allocSuccess;
+	if (bufferType == VKR_BUFFER_TYPE::STAGING)
+		allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, properties, buffer, bufferMemory);
+	else if (bufferType == VKR_BUFFER_TYPE::INDEX)
+		allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_INDEX_BUFFER_BIT, properties, buffer, bufferMemory);
+	else if (bufferType == VKR_BUFFER_TYPE::STRIDE)
+		allocSuccess = memMgr->CreateBuffer2(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, properties, buffer, bufferMemory);
+	else
+		cemu_assert_debug(false);
+	if (!allocSuccess)
+		return nullptr;
+
+	VKRBuffer* bufferObj = new VKRBuffer(buffer, bufferMemory);
+	// if host visible, then map buffer
+	void* data = nullptr;
+	if (properties & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
+	{
+		vkMapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), bufferMemory, 0, bufferSize, 0, &data);
+		bufferObj->m_requiresFlush = !HAS_FLAG(properties, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
+	}
+	bufferObj->m_mappedMemory = (uint8*)data;
+	return bufferObj;
+}
+
+VKRBuffer::~VKRBuffer()
+{
+	if (m_mappedMemory)
+		vkUnmapMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory);
+	if (m_bufferMemory != VK_NULL_HANDLE)
+		vkFreeMemory(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_bufferMemory, nullptr);
+	if (m_buffer != VK_NULL_HANDLE)
+		vkDestroyBuffer(VulkanRenderer::GetInstance()->GetLogicalDevice(), m_buffer, nullptr);
+}
+
+VkBufferChunkedHeap::~VkBufferChunkedHeap()
+{
+	for (auto& chunk : m_chunkBuffers)
+		delete chunk;
+}
+
+uint32 VkBufferChunkedHeap::allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
+{
+	size_t allocationSize = std::max<size_t>(m_minimumBufferAllocationSize, minimumAllocationSize);
+	VKRBuffer* buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
+	if(!buffer)
+		buffer = VKRBuffer::Create(m_bufferType, allocationSize, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+	if(!buffer)
+		VulkanRenderer::GetInstance()->UnrecoverableError("Failed to allocate buffer memory for VkBufferChunkedHeap");
+	cemu_assert_debug(buffer);
+	cemu_assert_debug(m_chunkBuffers.size() == chunkIndex);
+	m_chunkBuffers.emplace_back(buffer);
+	// todo - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT might be worth it?
+	return allocationSize;
+}
+
 uint32_t VKRMemoryManager::FindMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) const
 {
 	VkPhysicalDeviceMemoryProperties memProperties;
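VkBufferChunkedHeap::allocateNewChunk first requests HOST_VISIBLE | HOST_COHERENT memory and falls back to plain HOST_VISIBLE; in the fallback case the buffer is flagged so that writers must call vkFlushMappedMemoryRanges after CPU writes. A small sketch of what that flag implies for writers (assumes a valid VkDevice and an already mapped allocation; real code should also round the flushed range to nonCoherentAtomSize):

#include <vulkan/vulkan.h>

struct MappedBuffer
{
	VkDeviceMemory memory = VK_NULL_HANDLE;
	void* hostPtr = nullptr;
	bool requiresFlush = false; // true when the memory type is not HOST_COHERENT
};

inline void FlushWrites(VkDevice device, const MappedBuffer& buf, VkDeviceSize offset, VkDeviceSize size)
{
	if (!buf.requiresFlush)
		return; // coherent memory: writes become visible without an explicit flush
	VkMappedMemoryRange range{};
	range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
	range.memory = buf.memory;
	range.offset = offset; // caution: should be aligned to nonCoherentAtomSize in production code
	range.size = size;
	vkFlushMappedMemoryRanges(device, 1, &range);
}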
@@ -469,7 +594,7 @@ VkImageMemAllocation* VKRMemoryManager::imageMemoryAllocate(VkImage image)
 	auto it = map_textureHeap.find(typeFilter);
 	if (it == map_textureHeap.end())
 	{
-		texHeap = new VkTextureChunkedHeap(this, typeFilter, m_vkr->GetLogicalDevice());
+		texHeap = new VkTextureChunkedHeap(this, typeFilter);
 		map_textureHeap.emplace(typeFilter, texHeap);
 	}
 	else

@@ -2,6 +2,36 @@
 #include "Cafe/HW/Latte/Renderer/Renderer.h"
 #include "Cafe/HW/Latte/Renderer/Vulkan/VulkanAPI.h"
 #include "util/ChunkedHeap/ChunkedHeap.h"
+#include "util/helpers/MemoryPool.h"
+
+enum class VKR_BUFFER_TYPE
+{
+	STAGING, // staging upload buffer
+	INDEX, // buffer for index data
+	STRIDE, // buffer for stride-adjusted vertex data
+};
+
+class VKRBuffer
+{
+public:
+	static VKRBuffer* Create(VKR_BUFFER_TYPE bufferType, size_t bufferSize, VkMemoryPropertyFlags properties);
+	~VKRBuffer();
+
+	VkBuffer GetVkBuffer() const { return m_buffer; }
+	VkDeviceMemory GetVkBufferMemory() const { return m_bufferMemory; }
+
+	uint8* GetPtr() const { return m_mappedMemory; }
+
+	bool RequiresFlush() const { return m_requiresFlush; }
+
+private:
+	VKRBuffer(VkBuffer buffer, VkDeviceMemory bufferMem) : m_buffer(buffer), m_bufferMemory(bufferMem) { };
+
+	VkBuffer m_buffer;
+	VkDeviceMemory m_bufferMemory;
+	uint8* m_mappedMemory;
+	bool m_requiresFlush{false};
+};
+
 struct VkImageMemAllocation
 {

@@ -14,18 +44,16 @@ struct VkImageMemAllocation
 	uint32 getAllocationSize() { return allocationSize; }
 };

-class VkTextureChunkedHeap : private ChunkedHeap
+class VkTextureChunkedHeap : private ChunkedHeap<>
 {
 public:
-	VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter, VkDevice device) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter), m_device(device) { };
+	VkTextureChunkedHeap(class VKRMemoryManager* memoryManager, uint32 typeFilter) : m_vkrMemoryManager(memoryManager), m_typeFilter(typeFilter) { };

 	struct ChunkInfo
 	{
 		VkDeviceMemory mem;
 	};

-	uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
-
 	CHAddr allocMem(uint32 size, uint32 alignment)
 	{
 		if (alignment < 4)

@@ -43,11 +71,6 @@ public:
 		this->free(addr);
 	}

-	void setDevice(VkDevice dev)
-	{
-		m_device = dev;
-	}
-
 	VkDeviceMemory getChunkMem(uint32 index)
 	{
 		if (index >= m_list_chunkInfo.size())

@@ -57,28 +80,73 @@ public:

 	void getStatistics(uint32& totalHeapSize, uint32& allocatedBytes) const
 	{
-		totalHeapSize = numHeapBytes;
-		allocatedBytes = numAllocatedBytes;
+		totalHeapSize = m_numHeapBytes;
+		allocatedBytes = m_numAllocatedBytes;
 	}

-	VkDevice m_device;
+private:
+	uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
+
 	uint32 m_typeFilter{ 0xFFFFFFFF };
 	class VKRMemoryManager* m_vkrMemoryManager;
 	std::vector<ChunkInfo> m_list_chunkInfo;
 };

+class VkBufferChunkedHeap : private ChunkedHeap<>
+{
+public:
+	VkBufferChunkedHeap(VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocationSize) : m_bufferType(bufferType), m_minimumBufferAllocationSize(minimumBufferAllocationSize) { };
+	~VkBufferChunkedHeap();
+
+	using ChunkedHeap::alloc;
+	using ChunkedHeap::free;
+
+	uint8* GetChunkPtr(uint32 index) const
+	{
+		if (index >= m_chunkBuffers.size())
+			return nullptr;
+		return m_chunkBuffers[index]->GetPtr();
+	}
+
+	void GetChunkVkMemInfo(uint32 index, VkBuffer& buffer, VkDeviceMemory& mem)
+	{
+		if (index >= m_chunkBuffers.size())
+		{
+			buffer = VK_NULL_HANDLE;
+			mem = VK_NULL_HANDLE;
+			return;
+		}
+		buffer = m_chunkBuffers[index]->GetVkBuffer();
+		mem = m_chunkBuffers[index]->GetVkBufferMemory();
+	}
+
+	void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const
+	{
+		numBuffers = m_chunkBuffers.size();
+		totalBufferSize = m_numHeapBytes;
+		freeBufferSize = m_numHeapBytes - m_numAllocatedBytes;
+	}
+
+	bool RequiresFlush(uint32 index) const
+	{
+		if (index >= m_chunkBuffers.size())
+			return false;
+		return m_chunkBuffers[index]->RequiresFlush();
+	}
+
+private:
+	uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) override;
+
+	VKR_BUFFER_TYPE m_bufferType;
+	std::vector<VKRBuffer*> m_chunkBuffers;
+	size_t m_minimumBufferAllocationSize;
+};
+
 // a circular ring-buffer which tracks and releases memory per command-buffer
 class VKRSynchronizedRingAllocator
 {
 public:
-	enum class BUFFER_TYPE
-	{
-		STAGING, // staging upload buffer
-		INDEX, // buffer for index data
-		STRIDE, // buffer for stride-adjusted vertex data
-	};
-
-	VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
+	VKRSynchronizedRingAllocator(class VulkanRenderer* vkRenderer, class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, uint32 minimumBufferAllocSize) : m_vkr(vkRenderer), m_vkrMemMgr(vkMemoryManager), m_bufferType(bufferType), m_minimumBufferAllocSize(minimumBufferAllocSize) {};
 	VKRSynchronizedRingAllocator(const VKRSynchronizedRingAllocator&) = delete; // disallow copy

 	struct BufferSyncPoint_t

@@ -126,13 +194,53 @@ private:

 	const class VulkanRenderer* m_vkr;
 	const class VKRMemoryManager* m_vkrMemMgr;
-	const BUFFER_TYPE m_bufferType;
+	const VKR_BUFFER_TYPE m_bufferType;
 	const uint32 m_minimumBufferAllocSize;

 	std::vector<AllocatorBuffer_t> m_buffers;

 };

+// heap style allocator with released memory being freed after the current command buffer finishes
+class VKRSynchronizedHeapAllocator
+{
+	struct TrackedAllocation
+	{
+		TrackedAllocation(CHAddr allocation) : allocation(allocation) {};
+		CHAddr allocation;
+	};
+
+public:
+	VKRSynchronizedHeapAllocator(class VKRMemoryManager* vkMemoryManager, VKR_BUFFER_TYPE bufferType, size_t minimumBufferAllocSize);
+	VKRSynchronizedHeapAllocator(const VKRSynchronizedHeapAllocator&) = delete; // disallow copy
+
+	struct AllocatorReservation
+	{
+		VkBuffer vkBuffer;
+		VkDeviceMemory vkMem;
+		uint8* memPtr;
+		uint32 bufferOffset;
+		uint32 size;
+		uint32 bufferIndex;
+	};
+
+	AllocatorReservation* AllocateBufferMemory(uint32 size, uint32 alignment);
+	void FreeReservation(AllocatorReservation* uploadReservation);
+	void FlushReservation(AllocatorReservation* uploadReservation);
+
+	void CleanupBuffer(uint64 latestFinishedCommandBufferId);
+
+	void GetStats(uint32& numBuffers, size_t& totalBufferSize, size_t& freeBufferSize) const;
+private:
+	const class VKRMemoryManager* m_vkrMemMgr;
+	VkBufferChunkedHeap m_chunkedHeap;
+	// allocations
+	std::vector<TrackedAllocation> m_activeAllocations;
+	MemoryPool<AllocatorReservation> m_poolAllocatorReservation{32};
+	// release queue
+	std::unordered_map<uint64, std::vector<CHAddr>> m_releaseQueue;
+};
+
 void LatteIndices_invalidateAll();

 class VKRMemoryManager

@@ -140,9 +248,9 @@ class VKRMemoryManager
 	friend class VKRSynchronizedRingAllocator;
 public:
 	VKRMemoryManager(class VulkanRenderer* renderer) :
-		m_stagingBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STAGING, 32u * 1024 * 1024),
-		m_indexBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::INDEX, 4u * 1024 * 1024),
-		m_vertexStrideMetalBuffer(renderer, this, VKRSynchronizedRingAllocator::BUFFER_TYPE::STRIDE, 4u * 1024 * 1024)
+		m_stagingBuffer(renderer, this, VKR_BUFFER_TYPE::STAGING, 32u * 1024 * 1024),
+		m_indexBuffer(this, VKR_BUFFER_TYPE::INDEX, 4u * 1024 * 1024),
+		m_vertexStrideMetalBuffer(renderer, this, VKR_BUFFER_TYPE::STRIDE, 4u * 1024 * 1024)
 	{
 		m_vkr = renderer;
 	}

@@ -167,7 +275,7 @@ public:
 	}

 	VKRSynchronizedRingAllocator& getStagingAllocator() { return m_stagingBuffer; }; // allocator for texture/attribute/uniform uploads
-	VKRSynchronizedRingAllocator& getIndexAllocator() { return m_indexBuffer; }; // allocator for index data
+	VKRSynchronizedHeapAllocator& GetIndexAllocator() { return m_indexBuffer; }; // allocator for index data
 	VKRSynchronizedRingAllocator& getMetalStrideWorkaroundAllocator() { return m_vertexStrideMetalBuffer; }; // allocator for stride-adjusted vertex data

 	void cleanupBuffers(uint64 latestFinishedCommandBufferId)

@@ -202,6 +310,6 @@ public:
 private:
 	class VulkanRenderer* m_vkr;
 	VKRSynchronizedRingAllocator m_stagingBuffer;
-	VKRSynchronizedRingAllocator m_indexBuffer;
+	VKRSynchronizedHeapAllocator m_indexBuffer;
 	VKRSynchronizedRingAllocator m_vertexStrideMetalBuffer;
 };

@@ -681,6 +681,9 @@ VulkanRenderer::~VulkanRenderer()
 		vkDestroyDebugUtilsMessengerEXT(m_instance, m_debugCallback, nullptr);
 	}

+	// destroy memory manager
+	delete memoryManager;
+
 	// destroy instance, devices
 	if (m_instance != VK_NULL_HANDLE)
 	{

@@ -692,9 +695,6 @@ VulkanRenderer::~VulkanRenderer()
 		vkDestroyInstance(m_instance, nullptr);
 	}

-	// destroy memory manager
-	delete memoryManager;
-
 	// crashes?
 	//glslang::FinalizeProcess();
 }

@@ -3701,7 +3701,7 @@ void VulkanRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uin

 void VulkanRenderer::AppendOverlayDebugInfo()
 {
-	ImGui::Text("--- Vulkan info ---");
+	ImGui::Text("--- Vulkan debug info ---");
 	ImGui::Text("GfxPipelines %u", performanceMonitor.vk.numGraphicPipelines.get());
 	ImGui::Text("DescriptorSets %u", performanceMonitor.vk.numDescriptorSets.get());
 	ImGui::Text("DS ImgSamplers %u", performanceMonitor.vk.numDescriptorSamplerTextures.get());

@@ -3719,7 +3719,7 @@ void VulkanRenderer::AppendOverlayDebugInfo()

 	ImGui::Text("BeginRP/f %u", performanceMonitor.vk.numBeginRenderpassPerFrame.get());
 	ImGui::Text("Barriers/f %u", performanceMonitor.vk.numDrawBarriersPerFrame.get());
-	ImGui::Text("--- Cache info ---");
+	ImGui::Text("--- Cache debug info ---");

 	uint32 bufferCacheHeapSize = 0;
 	uint32 bufferCacheAllocationSize = 0;

@@ -3739,7 +3739,7 @@ void VulkanRenderer::AppendOverlayDebugInfo()
 	ImGui::SameLine(60.0f);
 	ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);

-	memoryManager->getIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
+	memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
 	ImGui::Text("Index");
 	ImGui::SameLine(60.0f);
 	ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);

@@ -328,8 +328,9 @@ public:

 	RendererShader* shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader) override;

-	void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override;
-	void indexData_uploadIndexMemory(uint32 offset, uint32 size) override;
+	IndexAllocation indexData_reserveIndexMemory(uint32 size) override;
+	void indexData_releaseIndexMemory(IndexAllocation& allocation) override;
+	void indexData_uploadIndexMemory(IndexAllocation& allocation) override;

 	// externally callable
 	void GetTextureFormatInfoVK(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, sint32 width, sint32 height, FormatInfoVK* formatInfoOut);

@@ -357,18 +357,20 @@ PipelineInfo* VulkanRenderer::draw_getOrCreateGraphicsPipeline(uint32 indexCount
 	return draw_createGraphicsPipeline(indexCount);
 }

-void* VulkanRenderer::indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex)
+Renderer::IndexAllocation VulkanRenderer::indexData_reserveIndexMemory(uint32 size)
 {
-	auto& indexAllocator = this->memoryManager->getIndexAllocator();
-	auto resv = indexAllocator.AllocateBufferMemory(size, 32);
-	offset = resv.bufferOffset;
-	bufferIndex = resv.bufferIndex;
-	return resv.memPtr;
+	VKRSynchronizedHeapAllocator::AllocatorReservation* resv = memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 32);
+	return { resv->memPtr, resv };
 }

-void VulkanRenderer::indexData_uploadIndexMemory(uint32 offset, uint32 size)
+void VulkanRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation)
 {
-	// does nothing since the index buffer memory is coherent
+	memoryManager->GetIndexAllocator().FreeReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal);
+}
+
+void VulkanRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation)
+{
+	memoryManager->GetIndexAllocator().FlushReservation((VKRSynchronizedHeapAllocator::AllocatorReservation*)allocation.rendererInternal);
 }

 float s_vkUniformData[512 * 4];

@@ -1413,14 +1415,15 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
 	uint32 hostIndexCount;
 	uint32 indexMin = 0;
 	uint32 indexMax = 0;
-	uint32 indexBufferOffset = 0;
-	uint32 indexBufferIndex = 0;
-	LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexBufferOffset, indexBufferIndex);
+	Renderer::IndexAllocation indexAllocation;
+	LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation);
+	VKRSynchronizedHeapAllocator::AllocatorReservation* indexReservation = (VKRSynchronizedHeapAllocator::AllocatorReservation*)indexAllocation.rendererInternal;

 	// update index binding
 	bool isPrevIndexData = false;
 	if (hostIndexType != INDEX_TYPE::NONE)
 	{
+		uint32 indexBufferIndex = indexReservation->bufferIndex;
+		uint32 indexBufferOffset = indexReservation->bufferOffset;
 		if (m_state.activeIndexBufferOffset != indexBufferOffset || m_state.activeIndexBufferIndex != indexBufferIndex || m_state.activeIndexType != hostIndexType)
 		{
 			m_state.activeIndexType = hostIndexType;

@@ -1433,7 +1436,7 @@ void VulkanRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32
 				vkType = VK_INDEX_TYPE_UINT32;
 			else
 				cemu_assert(false);
-			vkCmdBindIndexBuffer(m_state.currentCommandBuffer, memoryManager->getIndexAllocator().GetBufferByIndex(indexBufferIndex), indexBufferOffset, vkType);
+			vkCmdBindIndexBuffer(m_state.currentCommandBuffer, indexReservation->vkBuffer, indexBufferOffset, vkType);
 		}
 		else
 			isPrevIndexData = true;

@@ -274,6 +274,25 @@ inline uint64 _udiv128(uint64 highDividend, uint64 lowDividend, uint64 divisor,
 #define NOEXPORT __attribute__ ((visibility ("hidden")))
 #endif

+#if defined(_MSC_VER)
+#define FORCE_INLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__)
+#define FORCE_INLINE inline __attribute__((always_inline))
+#else
+#define FORCE_INLINE
+#endif
+
+FORCE_INLINE inline int BSF(uint32 v) // returns index of first bit set, counting from LSB. If v is 0 then result is undefined
+{
+#if defined(_MSC_VER)
+	return _tzcnt_u32(v); // TZCNT requires BMI1. But if not supported it will execute as BSF
+#elif defined(__GNUC__) || defined(__clang__)
+	return __builtin_ctz(v);
+#else
+	return std::countr_zero(v);
+#endif
+}
+
 // On aarch64 we handle some of the x86 intrinsics by implementing them as wrappers
 #if defined(__aarch64__)
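The FORCE_INLINE/BSF helpers added here pair with the m_bucketUseMask bitmask introduced in ChunkedHeap below: rather than walking every free-list bucket, the allocator can keep one bit per non-empty bucket and jump straight to the smallest usable one with a find-first-set instruction. A hedged sketch of that lookup (not the actual ChunkedHeap code):

#include <bit>
#include <cstdint>

// Find the smallest non-empty free-list bucket whose index is >= minBucket by masking
// off lower bits and taking the index of the lowest set bit (what BSF/__builtin_ctz computes).
inline int FindFirstUsableBucket(uint32_t bucketUseMask, int minBucket)
{
	uint32_t candidates = bucketUseMask & ~((1u << minBucket) - 1u); // keep buckets >= minBucket
	if (candidates == 0)
		return -1; // no free range large enough; a new chunk must be allocated
	return std::countr_zero(candidates); // index of first set bit, counting from LSB
}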
@ -1,35 +1,39 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <util/helpers/MemoryPool.h>
|
||||||
|
|
||||||
struct CHAddr
|
struct CHAddr
|
||||||
{
|
{
|
||||||
uint32 offset;
|
uint32 offset;
|
||||||
uint32 chunkIndex;
|
uint32 chunkIndex;
|
||||||
|
void* internal; // AllocRange
|
||||||
|
|
||||||
CHAddr(uint32 _offset, uint32 _chunkIndex) : offset(_offset), chunkIndex(_chunkIndex) {};
|
CHAddr(uint32 _offset, uint32 _chunkIndex, void* internal = nullptr) : offset(_offset), chunkIndex(_chunkIndex), internal(internal) {};
|
||||||
CHAddr() : offset(0xFFFFFFFF), chunkIndex(0xFFFFFFFF) {};
|
CHAddr() : offset(0xFFFFFFFF), chunkIndex(0xFFFFFFFF) {};
|
||||||
|
|
||||||
bool isValid() { return chunkIndex != 0xFFFFFFFF; };
|
bool isValid() { return chunkIndex != 0xFFFFFFFF; };
|
||||||
static CHAddr getInvalid() { return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); };
|
static CHAddr getInvalid() { return CHAddr(0xFFFFFFFF, 0xFFFFFFFF); };
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<uint32 TMinimumAlignment = 32>
|
||||||
class ChunkedHeap
|
class ChunkedHeap
|
||||||
{
|
{
|
||||||
struct allocRange_t
|
struct AllocRange
|
||||||
{
|
{
|
||||||
allocRange_t* nextFree{};
|
AllocRange* nextFree{};
|
||||||
allocRange_t* prevFree{};
|
AllocRange* prevFree{};
|
||||||
allocRange_t* prevOrdered{};
|
AllocRange* prevOrdered{};
|
||||||
allocRange_t* nextOrdered{};
|
AllocRange* nextOrdered{};
|
||||||
uint32 offset;
|
uint32 offset;
|
||||||
uint32 chunkIndex;
|
uint32 chunkIndex;
|
||||||
uint32 size;
|
uint32 size;
|
||||||
bool isFree;
|
bool isFree;
|
||||||
allocRange_t(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
|
AllocRange(uint32 _offset, uint32 _chunkIndex, uint32 _size, bool _isFree) : offset(_offset), chunkIndex(_chunkIndex), size(_size), isFree(_isFree), nextFree(nullptr) {};
|
||||||
};
|
};
|
||||||
|
|
||||||
struct chunk_t
|
struct Chunk
|
||||||
{
|
{
|
||||||
std::unordered_map<uint32, allocRange_t*> map_allocatedRange;
|
uint32 size;
|
||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@@ -47,45 +51,32 @@ public:
 		_free(addr);
 	}
 
-	virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize)
-	{
-		return 0;
-	}
+	virtual uint32 allocateNewChunk(uint32 chunkIndex, uint32 minimumAllocationSize) = 0;
 
 private:
 	unsigned ulog2(uint32 v)
 	{
-		static const unsigned MUL_DE_BRUIJN_BIT[] =
-		{
-			0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
-			8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
-		};
-
-		v |= v >> 1;
-		v |= v >> 2;
-		v |= v >> 4;
-		v |= v >> 8;
-		v |= v >> 16;
-
-		return MUL_DE_BRUIJN_BIT[(v * 0x07C4ACDDu) >> 27];
+		cemu_assert_debug(v != 0);
+		return 31 - std::countl_zero(v);
 	}
 
-	void trackFreeRange(allocRange_t* range)
+	void trackFreeRange(AllocRange* range)
 	{
 		// get index of msb
 		cemu_assert_debug(range->size != 0); // size of zero is not allowed
 		uint32 bucketIndex = ulog2(range->size);
-		range->nextFree = bucketFreeRange[bucketIndex];
-		if (bucketFreeRange[bucketIndex])
-			bucketFreeRange[bucketIndex]->prevFree = range;
+		range->nextFree = m_bucketFreeRange[bucketIndex];
+		if (m_bucketFreeRange[bucketIndex])
+			m_bucketFreeRange[bucketIndex]->prevFree = range;
 		range->prevFree = nullptr;
-		bucketFreeRange[bucketIndex] = range;
+		m_bucketFreeRange[bucketIndex] = range;
+		m_bucketUseMask |= (1u << bucketIndex);
 	}
 
-	void forgetFreeRange(allocRange_t* range, uint32 bucketIndex)
+	void forgetFreeRange(AllocRange* range, uint32 bucketIndex)
 	{
-		allocRange_t* prevRange = range->prevFree;
-		allocRange_t* nextRange = range->nextFree;
+		AllocRange* prevRange = range->prevFree;
+		AllocRange* nextRange = range->nextFree;
 		if (prevRange)
 		{
 			prevRange->nextFree = nextRange;
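The rewritten ulog2 and trackFreeRange above pair each free range with a power-of-two bucket (bucket index = position of the size's most significant bit) and mirror bucket occupancy in m_bucketUseMask. A small sketch of that bookkeeping, not taken from the diff, with made-up sizes:

#include <bit>
#include <cstdint>
#include <cassert>

using uint32 = std::uint32_t;

// floor(log2(v)), mirrors ulog2() above
inline unsigned ulog2(uint32 v) { return 31 - std::countl_zero(v); }

int main()
{
	uint32 bucketUseMask = 0x80000000u; // MSB stays set as a sentinel for the BSF skip

	// tracking a free range of 3000 bytes lands it in bucket 11 (2048..4095)
	unsigned bucket = ulog2(3000);
	assert(bucket == 11);
	bucketUseMask |= (1u << bucket);

	// forgetting the last range of that bucket clears its bit again
	bucketUseMask &= ~(1u << bucket);
	assert(bucketUseMask == 0x80000000u);
	return 0;
}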
@@ -94,36 +85,42 @@ private:
 		}
 		else
 		{
-			if (bucketFreeRange[bucketIndex] != range)
-				assert_dbg();
-			bucketFreeRange[bucketIndex] = nextRange;
+			cemu_assert_debug(m_bucketFreeRange[bucketIndex] == range);
+			m_bucketFreeRange[bucketIndex] = nextRange;
 			if (nextRange)
 				nextRange->prevFree = nullptr;
+			else
+				m_bucketUseMask &= ~(1u << bucketIndex);
 		}
 	}
 
 	bool allocateChunk(uint32 minimumAllocationSize)
 	{
-		uint32 chunkIndex = (uint32)list_chunks.size();
-		list_chunks.emplace_back(new chunk_t());
+		uint32 chunkIndex = (uint32)m_chunks.size();
+		m_chunks.emplace_back();
 		uint32 chunkSize = allocateNewChunk(chunkIndex, minimumAllocationSize);
+		cemu_assert_debug((chunkSize%TMinimumAlignment) == 0); // chunk size should be a multiple of the minimum alignment
 		if (chunkSize == 0)
 			return false;
-		allocRange_t* range = new allocRange_t(0, chunkIndex, chunkSize, true);
+		cemu_assert_debug(chunkSize < 0x80000000u); // chunk size must be below 2GB
+		AllocRange* range = m_allocEntriesPool.allocObj(0, chunkIndex, chunkSize, true);
 		trackFreeRange(range);
-		numHeapBytes += chunkSize;
+		m_numHeapBytes += chunkSize;
 		return true;
 	}
 
-	void _allocFrom(allocRange_t* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
+	void _allocFrom(AllocRange* range, uint32 bucketIndex, uint32 allocOffset, uint32 allocSize)
 	{
+		cemu_assert_debug(allocSize > 0);
 		// remove the range from the chain of free ranges
 		forgetFreeRange(range, bucketIndex);
 		// split head, allocation and tail into separate ranges
-		if (allocOffset > range->offset)
+		uint32 headBytes = allocOffset - range->offset;
+		if (headBytes > 0)
 		{
 			// alignment padding -> create free range
-			allocRange_t* head = new allocRange_t(range->offset, range->chunkIndex, allocOffset - range->offset, true);
+			cemu_assert_debug(headBytes >= TMinimumAlignment);
+			AllocRange* head = m_allocEntriesPool.allocObj(range->offset, range->chunkIndex, headBytes, true);
 			trackFreeRange(head);
 			if (range->prevOrdered)
 				range->prevOrdered->nextOrdered = head;
@@ -131,10 +128,12 @@ private:
 			head->nextOrdered = range;
 			range->prevOrdered = head;
 		}
-		if ((allocOffset + allocSize) < (range->offset + range->size)) // todo - create only if it's more than a couple of bytes?
+		uint32 tailBytes = (range->offset + range->size) - (allocOffset + allocSize);
+		if (tailBytes > 0)
 		{
 			// tail -> create free range
-			allocRange_t* tail = new allocRange_t((allocOffset + allocSize), range->chunkIndex, (range->offset + range->size) - (allocOffset + allocSize), true);
+			cemu_assert_debug(tailBytes >= TMinimumAlignment);
+			AllocRange* tail = m_allocEntriesPool.allocObj((allocOffset + allocSize), range->chunkIndex, tailBytes, true);
 			trackFreeRange(tail);
 			if (range->nextOrdered)
 				range->nextOrdered->prevOrdered = tail;
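To make the head/allocation/tail split in _allocFrom concrete, here is a worked example with made-up numbers, assuming the template default TMinimumAlignment = 32 (the values are illustrative only, not from the commit):

#include <cstdint>
#include <cassert>

int main()
{
	// free range: offset 96, size 4000; request: 100 bytes aligned to 256
	const uint32_t rangeOffset = 96, rangeSize = 4000;
	uint32_t size = 100;
	const uint32_t alignment = 256;

	size = (size + 31u) & ~31u;                                                // round up to TMinimumAlignment -> 128
	uint32_t alignedOffset = (rangeOffset + alignment - 1) & ~(alignment - 1); // 256
	uint32_t headBytes = alignedOffset - rangeOffset;                          // 160 -> becomes a new free range
	uint32_t tailBytes = (rangeOffset + rangeSize) - (alignedOffset + size);   // 3712 -> another free range

	assert(size == 128 && alignedOffset == 256);
	assert(headBytes == 160 && tailBytes == 3712);
	// both leftovers are multiples of 32, satisfying the cemu_assert_debug checks in the diff
	return 0;
}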
@@ -149,36 +148,51 @@ private:
 
 	CHAddr _alloc(uint32 size, uint32 alignment)
 	{
+		cemu_assert_debug(size <= (0x7FFFFFFFu-TMinimumAlignment));
+		// make sure size is not zero and align it
+		if(size == 0) [[unlikely]]
+			size = TMinimumAlignment;
+		else
+			size = (size + (TMinimumAlignment - 1)) & ~(TMinimumAlignment - 1);
 		// find smallest bucket to scan
 		uint32 alignmentM1 = alignment - 1;
 		uint32 bucketIndex = ulog2(size);
-		while (bucketIndex < 32)
+		// check if the bucket is available
+		if( !(m_bucketUseMask & (1u << bucketIndex)) )
 		{
-			allocRange_t* range = bucketFreeRange[bucketIndex];
+			// skip to next non-empty bucket
+			uint32 nextIndex = BSF(m_bucketUseMask>>bucketIndex);
+			bucketIndex += nextIndex;
+		}
+		while (bucketIndex < 31)
+		{
+			AllocRange* range = m_bucketFreeRange[bucketIndex];
 			while (range)
 			{
 				if (range->size >= size)
 				{
 					// verify if aligned allocation fits
 					uint32 alignedOffset = (range->offset + alignmentM1) & ~alignmentM1;
-					uint32 alignmentLoss = alignedOffset - range->offset;
-					if (alignmentLoss < range->size && (range->size - alignmentLoss) >= size)
+					uint32 endOffset = alignedOffset + size;
+					if((range->offset+range->size) >= endOffset)
 					{
 						_allocFrom(range, bucketIndex, alignedOffset, size);
-						list_chunks[range->chunkIndex]->map_allocatedRange.emplace(alignedOffset, range);
-						numAllocatedBytes += size;
-						return CHAddr(alignedOffset, range->chunkIndex);
+						m_numAllocatedBytes += size;
+						return CHAddr(alignedOffset, range->chunkIndex, range);
 					}
 				}
 				range = range->nextFree;
 			}
-			bucketIndex++; // try higher bucket
+			// check next non-empty bucket or skip to end
+			bucketIndex++;
+			uint32 emptyBuckets = BSF(m_bucketUseMask>>bucketIndex);
+			bucketIndex += emptyBuckets;
 		}
-		if(allocationLimitReached)
+		if(m_allocationLimitReached)
 			return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
 		if (!allocateChunk(size))
 		{
-			allocationLimitReached = true;
+			m_allocationLimitReached = true;
 			return CHAddr(0xFFFFFFFF, 0xFFFFFFFF);
 		}
 		return _alloc(size, alignment);
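The rewritten _alloc avoids stepping through empty buckets one by one: it shifts m_bucketUseMask down by the current bucket index and uses BSF to jump straight to the next occupied bucket, with the always-set MSB acting as a sentinel so the shifted mask is never zero and the while (bucketIndex < 31) loop terminates. A self-contained sketch of that skip, not from the commit, with invented bucket contents:

#include <bit>
#include <cstdint>
#include <cassert>

using uint32 = std::uint32_t;

int main()
{
	// buckets 9 and 14 currently hold free ranges; bit 31 is the sentinel
	uint32 bucketUseMask = 0x80000000u | (1u << 9) | (1u << 14);

	uint32 bucketIndex = 5; // the request's minimum bucket
	// jump directly to the next non-empty bucket
	bucketIndex += std::countr_zero(bucketUseMask >> bucketIndex);
	assert(bucketIndex == 9);

	// if bucket 9 has nothing that fits, advance and skip again
	bucketIndex++;
	bucketIndex += std::countr_zero(bucketUseMask >> bucketIndex);
	assert(bucketIndex == 14);

	// with no suitable bucket left, the skip lands on the sentinel index 31 and the scan ends
	return 0;
}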
@@ -186,24 +200,16 @@ private:
 
 	void _free(CHAddr addr)
 	{
-		auto it = list_chunks[addr.chunkIndex]->map_allocatedRange.find(addr.offset);
-		if (it == list_chunks[addr.chunkIndex]->map_allocatedRange.end())
+		if(!addr.internal)
 		{
 			cemuLog_log(LogType::Force, "Internal heap error. {:08x} {:08x}", addr.chunkIndex, addr.offset);
-			cemuLog_log(LogType::Force, "Debug info:");
-			for (auto& rangeItr : list_chunks[addr.chunkIndex]->map_allocatedRange)
-			{
-				cemuLog_log(LogType::Force, "{:08x} {:08x}", rangeItr.second->offset, rangeItr.second->size);
-			}
 			return;
 		}
-		allocRange_t* range = it->second;
-		numAllocatedBytes -= it->second->size;
-		list_chunks[range->chunkIndex]->map_allocatedRange.erase(it);
+		AllocRange* range = (AllocRange*)addr.internal;
+		m_numAllocatedBytes -= range->size;
 		// try merge left or right
-		allocRange_t* prevRange = range->prevOrdered;
-		allocRange_t* nextRange = range->nextOrdered;
+		AllocRange* prevRange = range->prevOrdered;
+		AllocRange* nextRange = range->nextOrdered;
 		if (prevRange && prevRange->isFree)
 		{
 			if (nextRange && nextRange->isFree)
@@ -216,8 +222,8 @@ private:
 				forgetFreeRange(prevRange, ulog2(prevRange->size));
 				prevRange->size = newSize;
 				trackFreeRange(prevRange);
-				delete range;
-				delete nextRange;
+				m_allocEntriesPool.freeObj(range);
+				m_allocEntriesPool.freeObj(nextRange);
 			}
 			else
 			{
@@ -228,7 +234,7 @@ private:
 				forgetFreeRange(prevRange, ulog2(prevRange->size));
 				prevRange->size = newSize;
 				trackFreeRange(prevRange);
-				delete range;
+				m_allocEntriesPool.freeObj(range);
 			}
 		}
 		else if (nextRange && nextRange->isFree)
@@ -242,7 +248,7 @@ private:
 			range->prevOrdered->nextOrdered = nextRange;
 			nextRange->prevOrdered = range->prevOrdered;
 			trackFreeRange(nextRange);
-			delete range;
+			m_allocEntriesPool.freeObj(range);
 		}
 		else
 		{
@@ -265,7 +271,7 @@ private:
 
 		for (uint32 i = 0; i < 32; i++)
 		{
-			allocRange_t* ar = bucketFreeRange[i];
+			AllocRange* ar = m_bucketFreeRange[i];
 			while (ar)
 			{
 				availableRange_t dbgRange;
@@ -278,7 +284,7 @@ private:
 				if (itr.chunkIndex != dbgRange.chunkIndex)
 					continue;
 				if (itr.offset < (dbgRange.offset + dbgRange.size) && (itr.offset + itr.size) >(dbgRange.offset))
-					assert_dbg();
+					cemu_assert_error();
 			}
 
 			availRanges.emplace_back(dbgRange);
@@ -290,14 +296,16 @@ private:
 	}
 
 private:
-	std::vector<chunk_t*> list_chunks;
-	allocRange_t* bucketFreeRange[32]{};
-	bool allocationLimitReached = false;
+	std::vector<Chunk> m_chunks;
+	uint32 m_bucketUseMask{0x80000000}; // bitmask indicating non-empty buckets. MSB always set to provide an upper bound for BSF instruction
+	AllocRange* m_bucketFreeRange[32]{}; // we are only using 31 entries since the MSB is reserved (thus chunks equal or larger than 2^31 are not allowed)
+	bool m_allocationLimitReached = false;
+	MemoryPool<AllocRange> m_allocEntriesPool{64};
 
 public:
 	// statistics
-	uint32 numHeapBytes{}; // total size of the heap
-	uint32 numAllocatedBytes{};
+	uint32 m_numHeapBytes{}; // total size of the heap
+	uint32 m_numAllocatedBytes{};
 };
 
 class VGenericHeap
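Since allocateNewChunk is now pure virtual, a concrete heap has to supply the backing storage. The following is a hypothetical subclass written for illustration (it is not one of Cemu's actual heap implementations) and only shows the expected contract: return the usable size of the chunk that now backs chunkIndex, as a multiple of the minimum alignment, or 0 when no further chunk can be provided:

#include <cstdint>
#include <vector>

// stand-in for the ChunkedHeap<> interface shown above (not repeated here)
class ChunkedHeapBase
{
public:
	virtual ~ChunkedHeapBase() = default;
	virtual uint32_t allocateNewChunk(uint32_t chunkIndex, uint32_t minimumAllocationSize) = 0;
};

class VectorBackedHeap : public ChunkedHeapBase
{
public:
	uint32_t allocateNewChunk(uint32_t chunkIndex, uint32_t minimumAllocationSize) override
	{
		// grow in fixed power-of-two steps so the result stays a multiple of the minimum alignment
		uint32_t chunkSize = 1024u * 1024u;
		while (chunkSize < minimumAllocationSize)
			chunkSize *= 2;
		if (chunkSize >= 0x80000000u)
			return 0; // signal "allocation limit reached" instead of exceeding the 2GB chunk limit
		m_storage.resize(chunkIndex + 1);
		m_storage[chunkIndex].resize(chunkSize);
		return chunkSize;
	}

private:
	std::vector<std::vector<uint8_t>> m_storage; // one byte buffer per chunk
};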