diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h index 702278ca..2e264b53 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalBufferAllocator.h @@ -6,15 +6,23 @@ struct MetalBufferRange { - uint32 bufferIndex; size_t offset; size_t size; }; +constexpr size_t BASE_ALLOCATION_SIZE = 8 * 1024 * 1024; + template class MetalBufferAllocator { public: + struct Buffer + { + MTL::Buffer* m_buffer; + std::vector m_freeRanges; + BufferT m_data; + }; + MetalBufferAllocator(class MetalRenderer* metalRenderer, MTL::ResourceOptions storageMode) : m_mtlr{metalRenderer} { m_isCPUAccessible = (storageMode == MTL::ResourceStorageModeShared) || (storageMode == MTL::ResourceStorageModeManaged); @@ -33,9 +41,8 @@ public: void ResetAllocations() { - m_freeBufferRanges.clear(); - for (uint32_t i = 0; i < m_buffers.size(); i++) - m_freeBufferRanges.push_back({i, 0, m_buffers[i].m_buffer->length()}); + for (uint32 i = 0; i < m_buffers.size(); i++) + FreeBuffer(i); } MTL::Buffer* GetBuffer(uint32 bufferIndex) @@ -49,63 +56,62 @@ public: size = Align(size, 128); // First, try to find a free range - for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) + for (uint32 i = 0; i < m_buffers.size(); i++) { - auto& range = m_freeBufferRanges[i]; - if (size <= range.size) + auto& buffer = m_buffers[i]; + for (uint32 j = 0; j < buffer.m_freeRanges.size(); j++) { - auto& buffer = m_buffers[range.bufferIndex]; - - MetalBufferAllocation allocation; - allocation.bufferIndex = range.bufferIndex; - allocation.offset = range.offset; - allocation.size = size; - allocation.data = (m_isCPUAccessible ? (uint8*)buffer.m_buffer->contents() + range.offset : nullptr); - - range.offset += size; - range.size -= size; - - if (range.size == 0) + auto& range = buffer.m_freeRanges[j]; + if (size <= range.size) { - m_freeBufferRanges.erase(m_freeBufferRanges.begin() + i); - } + MetalBufferAllocation allocation; + allocation.bufferIndex = i; + allocation.offset = range.offset; + allocation.size = size; + allocation.data = (m_isCPUAccessible ? (uint8*)buffer.m_buffer->contents() + range.offset : nullptr); - return allocation; + range.offset += size; + range.size -= size; + + if (range.size == 0) + { + buffer.m_freeRanges.erase(buffer.m_freeRanges.begin() + j); + } + + return allocation; + } } } // If no free range was found, allocate a new buffer - m_allocationSize = std::max(m_allocationSize, size); - MTL::Buffer* buffer = m_mtlr->GetDevice()->newBuffer(m_allocationSize, m_options); + size_t allocationSize = BASE_ALLOCATION_SIZE * (1u << m_buffers.size()); + allocationSize = std::max(allocationSize, size); + MTL::Buffer* mtlBuffer = m_mtlr->GetDevice()->newBuffer(allocationSize, m_options); #ifdef CEMU_DEBUG_ASSERT - buffer->setLabel(GetLabel("Buffer from buffer allocator", buffer)); + mtlBuffer->setLabel(GetLabel("Buffer from buffer allocator", mtlBuffer)); #endif MetalBufferAllocation allocation; allocation.bufferIndex = m_buffers.size(); allocation.offset = 0; allocation.size = size; - allocation.data = (m_isCPUAccessible ? buffer->contents() : nullptr); + allocation.data = (m_isCPUAccessible ? mtlBuffer->contents() : nullptr); - m_buffers.push_back({buffer}); + m_buffers.push_back({mtlBuffer}); + auto& buffer = m_buffers.back(); // If the buffer is larger than the requested size, add the remaining space to the free buffer ranges - if (size < m_allocationSize) + if (size < allocationSize) { MetalBufferRange range; - range.bufferIndex = allocation.bufferIndex; range.offset = size; - range.size = m_allocationSize - size; + range.size = allocationSize - size; - m_freeBufferRanges.push_back(range); + buffer.m_freeRanges.push_back(range); } // Debug - m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory += m_allocationSize; - - // Increase the allocation size for the next buffer - if (m_allocationSize < 128 * 1024 * 1024) - m_allocationSize *= 2; + m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory += allocationSize; return allocation; } @@ -113,24 +119,24 @@ public: void FreeAllocation(MetalBufferAllocation& allocation) { MetalBufferRange range; - range.bufferIndex = allocation.bufferIndex; range.offset = allocation.offset; range.size = allocation.size; allocation.offset = INVALID_OFFSET; // Find the correct position to insert the free range - for (uint32 i = 0; i < m_freeBufferRanges.size(); i++) + auto& buffer = m_buffers[allocation.bufferIndex]; + for (uint32 i = 0; i < buffer.m_freeRanges.size(); i++) { - auto& freeRange = m_freeBufferRanges[i]; - if (freeRange.bufferIndex == range.bufferIndex && freeRange.offset + freeRange.size == range.offset) + auto& freeRange = buffer.m_freeRanges[i]; + if (freeRange.offset + freeRange.size == range.offset) { freeRange.size += range.size; return; } } - m_freeBufferRanges.push_back(range); + buffer.m_freeRanges.push_back(range); } protected: @@ -138,22 +144,22 @@ protected: bool m_isCPUAccessible; MTL::ResourceOptions m_options; - size_t m_allocationSize = 8 * 1024 * 1024; + std::vector m_buffers; - std::vector m_buffers; - std::vector m_freeBufferRanges; + void FreeBuffer(uint32 bufferIndex) + { + auto& buffer = m_buffers[bufferIndex]; + buffer.m_freeRanges.clear(); + buffer.m_freeRanges.reserve(1); + buffer.m_freeRanges.push_back({0, m_buffers[bufferIndex].m_buffer->length()}); + } }; -struct MetalBuffer -{ - MTL::Buffer* m_buffer; -}; - -typedef MetalBufferAllocator MetalDefaultBufferAllocator; +struct Empty {}; +typedef MetalBufferAllocator MetalDefaultBufferAllocator; struct MetalSyncedBuffer { - MTL::Buffer* m_buffer; std::vector m_commandBuffers; uint32 m_lock = 0; @@ -163,7 +169,7 @@ struct MetalSyncedBuffer } }; -constexpr uint16 MAX_COMMAND_BUFFER_FRAMES = 8; +constexpr uint16 BUFFER_RELEASE_FRAME_TRESHOLD = 1024; class MetalTemporaryBufferAllocator : public MetalBufferAllocator { @@ -172,65 +178,72 @@ public: void LockBuffer(uint32 bufferIndex) { - m_buffers[bufferIndex].m_lock++; + m_buffers[bufferIndex].m_data.m_lock++; } void UnlockBuffer(uint32 bufferIndex) { auto& buffer = m_buffers[bufferIndex]; - buffer.m_lock--; + buffer.m_data.m_lock--; // TODO: is this really necessary? // Release the buffer if it wasn't released due to the lock - if (!buffer.IsLocked() && buffer.m_commandBuffers.empty()) + if (!buffer.m_data.IsLocked() && buffer.m_data.m_commandBuffers.empty()) FreeBuffer(bufferIndex); } - void UnlockAllBuffers() + void EndFrame() { + CheckForCompletedCommandBuffers(); + + // Unlock all buffers for (uint32_t i = 0; i < m_buffers.size(); i++) { auto& buffer = m_buffers[i]; - if (buffer.m_lock != 0) + if (buffer.m_data.m_lock != 0) { - if (buffer.m_commandBuffers.empty()) + if (buffer.m_data.m_commandBuffers.empty()) FreeBuffer(i); - buffer.m_lock = 0; + buffer.m_data.m_lock = 0; } } - /* - auto it = m_commandBuffersFrames.begin(); - while (it != m_commandBuffersFrames.end()) + // TODO: do this for other buffer allocators as well? + // Track how many frames have passed since the last access to the back buffer + if (!m_buffers.empty()) { - it->second++; - - if (it->second > MAX_COMMAND_BUFFER_FRAMES) + auto& backBuffer = m_buffers.back(); + if (backBuffer.m_data.m_commandBuffers.empty()) { - debug_printf("command buffer %p remained unfinished for more than %u frames\n", it->first, MAX_COMMAND_BUFFER_FRAMES); + // Release the back buffer if it hasn't been accessed for a while + if (m_framesSinceBackBufferAccess >= BUFFER_RELEASE_FRAME_TRESHOLD) + { + // Debug + m_mtlr->GetPerformanceMonitor().m_bufferAllocatorMemory -= backBuffer.m_buffer->length(); - // Pretend like the command buffer has finished - CommandBufferFinished(it->first, false); + backBuffer.m_buffer->release(); + m_buffers.pop_back(); - it = m_commandBuffersFrames.erase(it); + m_framesSinceBackBufferAccess = 0; + } + else + { + m_framesSinceBackBufferAccess++; + } } else { - it++; + m_framesSinceBackBufferAccess = 0; } } - */ } void SetActiveCommandBuffer(MTL::CommandBuffer* commandBuffer) { m_activeCommandBuffer = commandBuffer; - - //if (commandBuffer) - // m_commandBuffersFrames[commandBuffer] = 0; } void CheckForCompletedCommandBuffers(/*MTL::CommandBuffer* commandBuffer, bool erase = true*/) @@ -238,24 +251,24 @@ public: for (uint32_t i = 0; i < m_buffers.size(); i++) { auto& buffer = m_buffers[i]; - for (uint32_t j = 0; j < buffer.m_commandBuffers.size(); j++) + for (uint32_t j = 0; j < buffer.m_data.m_commandBuffers.size(); j++) { - if (m_mtlr->CommandBufferCompleted(buffer.m_commandBuffers[j])) + if (m_mtlr->CommandBufferCompleted(buffer.m_data.m_commandBuffers[j])) { - if (buffer.m_commandBuffers.size() == 1) + if (buffer.m_data.m_commandBuffers.size() == 1) { - if (!buffer.IsLocked()) + if (!buffer.m_data.IsLocked()) { // All command buffers using it have finished execution, we can use it again FreeBuffer(i); } - buffer.m_commandBuffers.clear(); + buffer.m_data.m_commandBuffers.clear(); break; } else { - buffer.m_commandBuffers.erase(buffer.m_commandBuffers.begin() + j); + buffer.m_data.m_commandBuffers.erase(buffer.m_data.m_commandBuffers.begin() + j); j--; } } @@ -271,8 +284,8 @@ public: cemu_assert_debug(m_activeCommandBuffer); auto& buffer = m_buffers[bufferIndex]; - if (buffer.m_commandBuffers.empty() || buffer.m_commandBuffers.back() != m_activeCommandBuffer/*std::find(buffer.m_commandBuffers.begin(), buffer.m_commandBuffers.end(), m_activeCommandBuffer) == buffer.m_commandBuffers.end()*/) - buffer.m_commandBuffers.push_back(m_activeCommandBuffer); + if (buffer.m_data.m_commandBuffers.empty() || buffer.m_data.m_commandBuffers.back() != m_activeCommandBuffer/*std::find(buffer.m_commandBuffers.begin(), buffer.m_commandBuffers.end(), m_activeCommandBuffer) == buffer.m_commandBuffers.end()*/) + buffer.m_data.m_commandBuffers.push_back(m_activeCommandBuffer); return buffer.m_buffer; } @@ -298,33 +311,34 @@ public: } */ + // For debugging /* void LogInfo() { debug_printf("BUFFERS:\n"); for (auto& buffer : m_buffers) { - debug_printf(" %p -> size: %lu, command buffers: %zu\n", buffer.m_buffer, buffer.m_buffer->length(), buffer.m_commandBuffers.size()); + debug_printf(" %p -> size: %lu, command buffers: %zu\n", buffer.m_buffer, buffer.m_buffer->length(), buffer.m_data.m_commandBuffers.size()); uint32 same = 0; uint32 completed = 0; - for (uint32 i = 0; i < buffer.m_commandBuffers.size(); i++) + for (uint32 i = 0; i < buffer.m_data.m_commandBuffers.size(); i++) { - if (m_mtlr->CommandBufferCompleted(buffer.m_commandBuffers[i])) + if (m_mtlr->CommandBufferCompleted(buffer.m_data.m_commandBuffers[i])) completed++; - for (uint32 j = 0; j < buffer.m_commandBuffers.size(); j++) + for (uint32 j = 0; j < buffer.m_data.m_commandBuffers.size(); j++) { - if (i != j && buffer.m_commandBuffers[i] == buffer.m_commandBuffers[j]) + if (i != j && buffer.m_data.m_commandBuffers[i] == buffer.m_data.m_commandBuffers[j]) same++; } } debug_printf(" same: %u\n", same); debug_printf(" completed: %u\n", completed); - } - debug_printf("FREE RANGES:\n"); - for (auto& range : m_freeBufferRanges) - { - debug_printf(" %u -> offset: %zu, size: %zu\n", range.bufferIndex, range.offset, range.size); + debug_printf(" FREE RANGES:\n"); + for (auto& range : buffer.m_freeRanges) + { + debug_printf(" offset: %zu, size: %zu\n", range.offset, range.size); + } } } */ @@ -332,20 +346,5 @@ public: private: MTL::CommandBuffer* m_activeCommandBuffer = nullptr; - //std::map m_commandBuffersFrames; - - void FreeBuffer(uint32 bufferIndex) - { - // First remove any free ranges that use this buffer - for (uint32 k = 0; k < m_freeBufferRanges.size(); k++) - { - if (m_freeBufferRanges[k].bufferIndex == bufferIndex) - { - m_freeBufferRanges.erase(m_freeBufferRanges.begin() + k); - k--; - } - } - - m_freeBufferRanges.push_back({bufferIndex, 0, m_buffers[bufferIndex].m_buffer->length()}); - } + uint16 m_framesSinceBackBufferAccess = 0; }; diff --git a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp index 2e541c2b..b604cdd5 100644 --- a/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Metal/MetalRenderer.cpp @@ -264,10 +264,7 @@ void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC) m_memoryManager->GetFramePersistentBufferAllocator().ResetAllocations(); // Unlock all temporary buffers - m_memoryManager->GetTemporaryBufferAllocator().UnlockAllBuffers(); - - // Check for completed command buffers - m_memoryManager->GetTemporaryBufferAllocator().CheckForCompletedCommandBuffers(); + m_memoryManager->GetTemporaryBufferAllocator().EndFrame(); } // TODO: use `shader` for drawing @@ -504,7 +501,7 @@ void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIn } } -// TODO: do a GPU blit even on unified memory? That would mean we could use private storage mode for all textures +// TODO: do a cpu copy on Apple Silicon? void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) { auto textureMtl = (LatteTextureMtl*)hostTexture;