release unused buffers

This commit is contained in:
Samuliak 2024-08-28 16:42:55 +02:00
parent 35740c5c8e
commit a6c8d83436
2 changed files with 109 additions and 113 deletions

View File

@ -6,15 +6,23 @@
struct MetalBufferRange
{
uint32 bufferIndex;
size_t offset;
size_t size;
};
constexpr size_t BASE_ALLOCATION_SIZE = 8 * 1024 * 1024;
template<typename BufferT>
class MetalBufferAllocator
{
public:
struct Buffer
{
MTL::Buffer* m_buffer;
std::vector<MetalBufferRange> m_freeRanges;
BufferT m_data;
};
MetalBufferAllocator(class MetalRenderer* metalRenderer, MTL::ResourceOptions storageMode) : m_mtlr{metalRenderer} {
m_isCPUAccessible = (storageMode == MTL::ResourceStorageModeShared) || (storageMode == MTL::ResourceStorageModeManaged);
@ -33,9 +41,8 @@ public:
void ResetAllocations()
{
m_freeBufferRanges.clear();
for (uint32_t i = 0; i < m_buffers.size(); i++)
m_freeBufferRanges.push_back({i, 0, m_buffers[i].m_buffer->length()});
for (uint32 i = 0; i < m_buffers.size(); i++)
FreeBuffer(i);
}
MTL::Buffer* GetBuffer(uint32 bufferIndex)
@ -49,63 +56,62 @@ public:
size = Align(size, 128);
// First, try to find a free range
for (uint32 i = 0; i < m_freeBufferRanges.size(); i++)
for (uint32 i = 0; i < m_buffers.size(); i++)
{
auto& range = m_freeBufferRanges[i];
if (size <= range.size)
auto& buffer = m_buffers[i];
for (uint32 j = 0; j < buffer.m_freeRanges.size(); j++)
{
auto& buffer = m_buffers[range.bufferIndex];
MetalBufferAllocation allocation;
allocation.bufferIndex = range.bufferIndex;
allocation.offset = range.offset;
allocation.size = size;
allocation.data = (m_isCPUAccessible ? (uint8*)buffer.m_buffer->contents() + range.offset : nullptr);
range.offset += size;
range.size -= size;
if (range.size == 0)
auto& range = buffer.m_freeRanges[j];
if (size <= range.size)
{
m_freeBufferRanges.erase(m_freeBufferRanges.begin() + i);
}
MetalBufferAllocation allocation;
allocation.bufferIndex = i;
allocation.offset = range.offset;
allocation.size = size;
allocation.data = (m_isCPUAccessible ? (uint8*)buffer.m_buffer->contents() + range.offset : nullptr);
return allocation;
range.offset += size;
range.size -= size;
if (range.size == 0)
{
buffer.m_freeRanges.erase(buffer.m_freeRanges.begin() + j);
}
return allocation;
}
}
}
// If no free range was found, allocate a new buffer
m_allocationSize = std::max(m_allocationSize, size);
MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, m_options);
size_t allocationSize = BASE_ALLOCATION_SIZE * (1u << m_buffers.size());
allocationSize = std::max(allocationSize, size);
MTL::Buffer* mtlBuffer = m_mtlr->GetDevice()->newBuffer(allocationSize, m_options);
#ifdef CEMU_DEBUG_ASSERT
buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer));
mtlBuffer->setLabel(GetLabel("Buffer from buffer allocator", mtlBuffer));
#endif
MetalBufferAllocation allocation;
allocation.bufferIndex = m_buffers.size();
allocation.offset = 0;
allocation.size = size;
allocation.data = (m_isCPUAccessible ? buffer->contents() : nullptr);
allocation.data = (m_isCPUAccessible ? mtlBuffer->contents() : nullptr);
m_buffers.push_back({buffer});
m_buffers.push_back({mtlBuffer});
auto& buffer = m_buffers.back();
// If the buffer is larger than the requested size, add the remaining space to the free buffer ranges
if (size < m_allocationSize)
if (size < allocationSize)
{
MetalBufferRange range;
range.bufferIndex = allocation.bufferIndex;
range.offset = size;
range.size = m_allocationSize - size;
range.size = allocationSize - size;
m_freeBufferRanges.push_back(range);
buffer.m_freeRanges.push_back(range);
}
// Debug
m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory += m_allocationSize;
// Increase the allocation size for the next buffer
if (m_allocationSize < 128 * 1024 * 1024)
m_allocationSize *= 2;
m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory += allocationSize;
return allocation;
}
@ -113,24 +119,24 @@ public:
void FreeAllocation(MetalBufferAllocation& allocation)
{
MetalBufferRange range;
range.bufferIndex = allocation.bufferIndex;
range.offset = allocation.offset;
range.size = allocation.size;
allocation.offset = INVALID_OFFSET;
// Find the correct position to insert the free range
for (uint32 i = 0; i < m_freeBufferRanges.size(); i++)
auto& buffer = m_buffers[allocation.bufferIndex];
for (uint32 i = 0; i < buffer.m_freeRanges.size(); i++)
{
auto& freeRange = m_freeBufferRanges[i];
if (freeRange.bufferIndex == range.bufferIndex && freeRange.offset + freeRange.size == range.offset)
auto& freeRange = buffer.m_freeRanges[i];
if (freeRange.offset + freeRange.size == range.offset)
{
freeRange.size += range.size;
return;
}
}
m_freeBufferRanges.push_back(range);
buffer.m_freeRanges.push_back(range);
}
protected:
@ -138,22 +144,22 @@ protected:
bool m_isCPUAccessible;
MTL::ResourceOptions m_options;
size_t m_allocationSize = 8 * 1024 * 1024;
std::vector<Buffer> m_buffers;
std::vector<BufferT> m_buffers;
std::vector<MetalBufferRange> m_freeBufferRanges;
void FreeBuffer(uint32 bufferIndex)
{
auto& buffer = m_buffers[bufferIndex];
buffer.m_freeRanges.clear();
buffer.m_freeRanges.reserve(1);
buffer.m_freeRanges.push_back({0, m_buffers[bufferIndex].m_buffer->length()});
}
};
struct MetalBuffer
{
MTL::Buffer* m_buffer;
};
typedef MetalBufferAllocator<MetalBuffer> MetalDefaultBufferAllocator;
struct Empty {};
typedef MetalBufferAllocator<Empty> MetalDefaultBufferAllocator;
struct MetalSyncedBuffer
{
MTL::Buffer* m_buffer;
std::vector<MTL::CommandBuffer*> m_commandBuffers;
uint32 m_lock = 0;
@ -163,7 +169,7 @@ struct MetalSyncedBuffer
}
};
constexpr uint16 MAX_COMMAND_BUFFER_FRAMES = 8;
constexpr uint16 BUFFER_RELEASE_FRAME_TRESHOLD = 1024;
class MetalTemporaryBufferAllocator : public MetalBufferAllocator<MetalSyncedBuffer>
{
@ -172,65 +178,72 @@ public:
void LockBuffer(uint32 bufferIndex)
{
m_buffers[bufferIndex].m_lock++;
m_buffers[bufferIndex].m_data.m_lock++;
}
void UnlockBuffer(uint32 bufferIndex)
{
auto& buffer = m_buffers[bufferIndex];
buffer.m_lock--;
buffer.m_data.m_lock--;
// TODO: is this really necessary?
// Release the buffer if it wasn't released due to the lock
if (!buffer.IsLocked() && buffer.m_commandBuffers.empty())
if (!buffer.m_data.IsLocked() && buffer.m_data.m_commandBuffers.empty())
FreeBuffer(bufferIndex);
}
void UnlockAllBuffers()
void EndFrame()
{
CheckForCompletedCommandBuffers();
// Unlock all buffers
for (uint32_t i = 0; i < m_buffers.size(); i++)
{
auto& buffer = m_buffers[i];
if (buffer.m_lock != 0)
if (buffer.m_data.m_lock != 0)
{
if (buffer.m_commandBuffers.empty())
if (buffer.m_data.m_commandBuffers.empty())
FreeBuffer(i);
buffer.m_lock = 0;
buffer.m_data.m_lock = 0;
}
}
/*
auto it = m_commandBuffersFrames.begin();
while (it != m_commandBuffersFrames.end())
// TODO: do this for other buffer allocators as well?
// Track how many frames have passed since the last access to the back buffer
if (!m_buffers.empty())
{
it->second++;
if (it->second > MAX_COMMAND_BUFFER_FRAMES)
auto& backBuffer = m_buffers.back();
if (backBuffer.m_data.m_commandBuffers.empty())
{
debug_printf("command buffer %p remained unfinished for more than %u frames\n", it->first, MAX_COMMAND_BUFFER_FRAMES);
// Release the back buffer if it hasn't been accessed for a while
if (m_framesSinceBackBufferAccess >= BUFFER_RELEASE_FRAME_TRESHOLD)
{
// Debug
m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory -= backBuffer.m_buffer->length();
// Pretend like the command buffer has finished
CommandBufferFinished(it->first, false);
backBuffer.m_buffer->release();
m_buffers.pop_back();
it = m_commandBuffersFrames.erase(it);
m_framesSinceBackBufferAccess = 0;
}
else
{
m_framesSinceBackBufferAccess++;
}
}
else
{
it++;
m_framesSinceBackBufferAccess = 0;
}
}
*/
}
void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer)
{
m_activeCommandBuffer = commandBuffer;
//if (commandBuffer)
// m_commandBuffersFrames[commandBuffer] = 0;
}
void CheckForCompletedCommandBuffers(/*MTL::CommandBuffer* commandBuffer, bool erase = true*/)
@ -238,24 +251,24 @@ public:
for (uint32_t i = 0; i < m_buffers.size(); i++)
{
auto& buffer = m_buffers[i];
for (uint32_t j = 0; j < buffer.m_commandBuffers.size(); j++)
for (uint32_t j = 0; j < buffer.m_data.m_commandBuffers.size(); j++)
{
if (m_mtlr->CommandBufferCompleted(buffer.m_commandBuffers[j]))
if (m_mtlr->CommandBufferCompleted(buffer.m_data.m_commandBuffers[j]))
{
if (buffer.m_commandBuffers.size() == 1)
if (buffer.m_data.m_commandBuffers.size() == 1)
{
if (!buffer.IsLocked())
if (!buffer.m_data.IsLocked())
{
// All command buffers using it have finished execution, we can use it again
FreeBuffer(i);
}
buffer.m_commandBuffers.clear();
buffer.m_data.m_commandBuffers.clear();
break;
}
else
{
buffer.m_commandBuffers.erase(buffer.m_commandBuffers.begin() + j);
buffer.m_data.m_commandBuffers.erase(buffer.m_data.m_commandBuffers.begin() + j);
j--;
}
}
@ -271,8 +284,8 @@ public:
cemu_assert_debug(m_activeCommandBuffer);
auto& buffer = m_buffers[bufferIndex];
if (buffer.m_commandBuffers.empty() || buffer.m_commandBuffers.back() != m_activeCommandBuffer/*std::find(buffer.m_commandBuffers.begin(), buffer.m_commandBuffers.end(), m_activeCommandBuffer) == buffer.m_commandBuffers.end()*/)
buffer.m_commandBuffers.push_back(m_activeCommandBuffer);
if (buffer.m_data.m_commandBuffers.empty() || buffer.m_data.m_commandBuffers.back() != m_activeCommandBuffer/*std::find(buffer.m_commandBuffers.begin(), buffer.m_commandBuffers.end(), m_activeCommandBuffer) == buffer.m_commandBuffers.end()*/)
buffer.m_data.m_commandBuffers.push_back(m_activeCommandBuffer);
return buffer.m_buffer;
}
@ -298,33 +311,34 @@ public:
}
*/
// For debugging
/*
void LogInfo()
{
debug_printf("BUFFERS:\n");
for (auto& buffer : m_buffers)
{
debug_printf(" %p -> size: %lu, command buffers: %zu\n", buffer.m_buffer, buffer.m_buffer->length(), buffer.m_commandBuffers.size());
debug_printf(" %p -> size: %lu, command buffers: %zu\n", buffer.m_buffer, buffer.m_buffer->length(), buffer.m_data.m_commandBuffers.size());
uint32 same = 0;
uint32 completed = 0;
for (uint32 i = 0; i < buffer.m_commandBuffers.size(); i++)
for (uint32 i = 0; i < buffer.m_data.m_commandBuffers.size(); i++)
{
if (m_mtlr->CommandBufferCompleted(buffer.m_commandBuffers[i]))
if (m_mtlr->CommandBufferCompleted(buffer.m_data.m_commandBuffers[i]))
completed++;
for (uint32 j = 0; j < buffer.m_commandBuffers.size(); j++)
for (uint32 j = 0; j < buffer.m_data.m_commandBuffers.size(); j++)
{
if (i != j && buffer.m_commandBuffers[i] == buffer.m_commandBuffers[j])
if (i != j && buffer.m_data.m_commandBuffers[i] == buffer.m_data.m_commandBuffers[j])
same++;
}
}
debug_printf(" same: %u\n", same);
debug_printf(" completed: %u\n", completed);
}
debug_printf("FREE RANGES:\n");
for (auto& range : m_freeBufferRanges)
{
debug_printf(" %u -> offset: %zu, size: %zu\n", range.bufferIndex, range.offset, range.size);
debug_printf(" FREE RANGES:\n");
for (auto& range : buffer.m_freeRanges)
{
debug_printf(" offset: %zu, size: %zu\n", range.offset, range.size);
}
}
}
*/
@ -332,20 +346,5 @@ public:
private:
MTL::CommandBuffer* m_activeCommandBuffer = nullptr;
//std::map<MTL::CommandBuffer*, uint16> m_commandBuffersFrames;
void FreeBuffer(uint32 bufferIndex)
{
// First remove any free ranges that use this buffer
for (uint32 k = 0; k < m_freeBufferRanges.size(); k++)
{
if (m_freeBufferRanges[k].bufferIndex == bufferIndex)
{
m_freeBufferRanges.erase(m_freeBufferRanges.begin() + k);
k--;
}
}
m_freeBufferRanges.push_back({bufferIndex, 0, m_buffers[bufferIndex].m_buffer->length()});
}
uint16 m_framesSinceBackBufferAccess = 0;
};

View File

@ -264,10 +264,7 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC)
m_memoryManager->GetFramePersistentBufferAllocator().ResetAllocations();
// Unlock all temporary buffers
m_memoryManager->GetTemporaryBufferAllocator().UnlockAllBuffers();
// Check for completed command buffers
m_memoryManager->GetTemporaryBufferAllocator().CheckForCompletedCommandBuffers();
m_memoryManager->GetTemporaryBufferAllocator().EndFrame();
}
// TODO: use `shader` for drawing
@ -504,7 +501,7 @@ void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIn
}
}
// TODO: do a GPU blit even on unified memory? That would mean we could use private storage mode for all textures
// TODO: do a cpu copy on Apple Silicon?
void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize)
{
auto textureMtl = (LatteTextureMtl*)hostTexture;