diff --git a/app/src/main/cpp/skyline/gpu/buffer.cpp b/app/src/main/cpp/skyline/gpu/buffer.cpp index f53fc311..18fd24a5 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.cpp +++ b/app/src/main/cpp/skyline/gpu/buffer.cpp @@ -8,6 +8,15 @@ #include "buffer.h" namespace skyline::gpu { + void Buffer::ResetMegabufferState() { + if (megaBufferTableUsed) + megaBufferTableValidity.reset(); + + megaBufferTableUsed = false; + megaBufferViewAccumulatedSize = 0; + unifiedMegaBuffer = {}; + } + void Buffer::SetupGuestMappings() { u8 *alignedData{util::AlignDown(guest->data(), constant::PageSize)}; size_t alignedSize{static_cast(util::AlignUp(guest->data() + guest->size(), constant::PageSize) - alignedData)}; @@ -285,39 +294,60 @@ namespace skyline::gpu { return {}; } - BufferBinding Buffer::TryMegaBufferView(const std::shared_ptr &pCycle, MegaBufferAllocator &allocator, size_t executionNumber, + BufferBinding Buffer::TryMegaBufferView(const std::shared_ptr &pCycle, MegaBufferAllocator &allocator, u32 executionNumber, vk::DeviceSize offset, vk::DeviceSize size) { if (!everHadInlineUpdate && sequenceNumber < FrequentlySyncedThreshold) // Don't megabuffer buffers that have never had inline updates and are not frequently synced since performance is only going to be harmed as a result of the constant copying and there wont be any benefit since there are no GPU inline updates that would be avoided return {}; - if (size > MegaBufferingDisableThreshold) - return {}; - // We are safe to check dirty state here since it will only ever be set GPU dirty with the buffer locked and from the active GPFIFO thread.
This helps with perf since the lock ends up being slightly expensive if (dirtyState == DirtyState::GpuDirty && !SynchronizeGuest(false, true)) // Bail out if buffer cannot be synced, we don't know the contents ahead of time so the sequence is indeterminate return {}; + // If the active execution has changed all previous allocations are now invalid, the new execution number is recorded so that this reset only happens once per execution rather than on every call + if (executionNumber != lastExecutionNumber) [[unlikely]] { + ResetMegabufferState(); + lastExecutionNumber = executionNumber; + } + + // If more than half the buffer has been megabuffered in chunks within the same execution assume this will generally be the case for this buffer and just megabuffer the whole thing without chunking + if (unifiedMegaBufferEnabled || megaBufferViewAccumulatedSize > (backing.size() / 2)) { + if (!unifiedMegaBuffer) { + unifiedMegaBuffer = allocator.Push(pCycle, mirror, true); + unifiedMegaBufferEnabled = true; + } + + return BufferBinding{unifiedMegaBuffer.buffer, unifiedMegaBuffer.offset + offset, size}; + } + + if (size > MegaBufferingDisableThreshold && sequenceNumber < FrequentlySyncedThresholdHigh) + return {}; + size_t entryIdx{offset >> megaBufferTableShift}; size_t bufferEntryOffset{entryIdx << megaBufferTableShift}; size_t entryViewOffset{offset - bufferEntryOffset}; - auto &entry{megaBufferTable[entryIdx]}; - // If the cached allocation is invalid or not up to date, allocate a new one - if (!entry.allocation || entry.executionNumber != executionNumber || - entry.sequenceNumber != sequenceNumber || entry.allocation.region.size() + entryViewOffset < size) { + if (entryIdx >= megaBufferTable.size()) + return {}; + + auto &allocation{megaBufferTable[entryIdx]}; + + // If the cached allocation is invalid or too small, allocate a new one + if (!megaBufferTableValidity.test(entryIdx) || allocation.region.size() < (size + entryViewOffset)) { // Use max(oldSize, newSize) to avoid redundant reallocations within an execution if a larger allocation comes along later - auto mirrorAllocationRegion{mirror.subspan(bufferEntryOffset,
std::max(entryViewOffset + size, entry.allocation.region.size()))}; - entry.allocation = allocator.Push(pCycle, mirrorAllocationRegion, true); - entry.executionNumber = executionNumber; - entry.sequenceNumber = sequenceNumber; + auto mirrorAllocationRegion{mirror.subspan(bufferEntryOffset, std::max(entryViewOffset + size, allocation.region.size()))}; + allocation = allocator.Push(pCycle, mirrorAllocationRegion, true); + megaBufferTableValidity.set(entryIdx); + megaBufferViewAccumulatedSize += mirrorAllocationRegion.size(); + megaBufferTableUsed = true; } - return {entry.allocation.buffer, entry.allocation.offset + entryViewOffset, size}; + return {allocation.buffer, allocation.offset + entryViewOffset, size}; } void Buffer::AdvanceSequence() { + ResetMegabufferState(); sequenceNumber++; } @@ -408,7 +438,7 @@ namespace skyline::gpu { return GetBuffer()->Write(isFirstUsage, flushHostCallback, data, writeOffset + GetOffset(), gpuCopyCallback); } - BufferBinding BufferView::TryMegaBuffer(const std::shared_ptr &pCycle, MegaBufferAllocator &allocator, size_t executionNumber, size_t sizeOverride) const { + BufferBinding BufferView::TryMegaBuffer(const std::shared_ptr &pCycle, MegaBufferAllocator &allocator, u32 executionNumber, size_t sizeOverride) const { return GetBuffer()->TryMegaBufferView(pCycle, allocator, executionNumber, GetOffset(), sizeOverride ?
sizeOverride : size); } diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h index 342f6ffe..321f9e64 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.h +++ b/app/src/main/cpp/skyline/gpu/buffer.h @@ -67,31 +67,35 @@ namespace skyline::gpu { } backingImmutability{}; //!< Describes how the buffer backing should be accessed by the current context RecursiveSpinLock stateMutex; //!< Synchronizes access to the dirty state and backing immutability - bool everHadInlineUpdate{}; //!< Whether the buffer has ever had an inline update since it was created, if this is set then megabuffering will be attempted by views to avoid the cost of inline GPU updates + static constexpr u32 InitialSequenceNumber{1}; //!< Sequence number that all buffers start off with + static constexpr u32 FrequentlySyncedThreshold{6}; //!< Threshold for the sequence number after which the buffer is considered eligible for megabuffering + static constexpr u32 FrequentlySyncedThresholdHigh{16}; //!< Threshold for the sequence number after which the buffer is considered eligible for megabuffering irrespective of view size + u32 sequenceNumber{InitialSequenceNumber}; //!< Sequence number that is incremented after all modifications to the host side `backing` buffer, used to prevent redundant copies of the buffer being stored in the megabuffer by views - static constexpr u64 InitialSequenceNumber{1}; //!< Sequence number that all buffers start off with - static constexpr u64 FrequentlySyncedThreshold{15}; //!< Threshold for the sequence number after which the buffer is considered elegible for megabuffering - u64 sequenceNumber{InitialSequenceNumber}; //!< Sequence number that is incremented after all modifications to the host side `backing` buffer, used to prevent redundant copies of the buffer being stored in the megabuffer by views + constexpr static vk::DeviceSize MegaBufferingDisableThreshold{1024 * 256}; //!< The threshold at which a view is considered to be too
large to be megabuffered (256KiB) unless the buffer's sequence number has reached `FrequentlySyncedThresholdHigh` - constexpr static vk::DeviceSize MegaBufferingDisableThreshold{1024 * 128}; //!< The threshold at which a view is considered to be too large to be megabuffered (128KiB) - - /** - * @brief Holds a single megabuffer copy with sequencing information for an offset within the buffer - */ - struct MegaBufferTableEntry { - MegaBufferAllocator::Allocation allocation{}; //!< The allocation in the megabuffer for the entry, can be any size - size_t executionNumber; //!< Execution number of when the allocation was made - size_t sequenceNumber; //!< Sequence number of when the allocation was made - }; - - static constexpr int MegaBufferTableShiftMin{std::countr_zero(0x80U)}; //!< The minimum shift for megabuffer table entries, giving an alignment of at least 128 bytes + static constexpr int MegaBufferTableShiftMin{std::countr_zero(0x100U)}; //!< The minimum shift for megabuffer table entries, giving an alignment of at least 256 bytes static constexpr size_t MegaBufferTableMaxEntries{0x500U}; //!< Maximum number of entries in the megabuffer table, `megaBufferTableShift` is set based on this and the total buffer size int megaBufferTableShift; //!< Shift to apply to buffer offsets to get their megabuffer table index - std::vector megaBufferTable; //!< Table of megabuffer allocations for regions of the buffer + std::vector megaBufferTable; //!< Table of megabuffer allocations for regions of the buffer + std::bitset megaBufferTableValidity{}; //!< Bitset keeping track of which entries in the megabuffer table are valid + bool megaBufferTableUsed{}; //!< If the megabuffer table has been used at all since the last time it was cleared + bool unifiedMegaBufferEnabled{}; //!< If the unified megabuffer is enabled for this buffer and should be used instead of the table + bool everHadInlineUpdate{}; //!< Whether the buffer has ever had an inline update since it was created, if this is set then megabuffering will be attempted by views to avoid the cost of
inline GPU updates + + u32 lastExecutionNumber{}; //!< The execution number of the last time megabuffer data was updated + + size_t megaBufferViewAccumulatedSize{}; //!< Total size of the table allocations pushed to the megabuffer since the last megabuffer state reset, once this exceeds half of the backing size the unified megabuffer is used instead of the table + MegaBufferAllocator::Allocation unifiedMegaBuffer{}; //!< An optional full-size mirror of the buffer in the megabuffer for use when the buffer is frequently updated and *all* of the buffer is frequently used. Replaces all uses of the table when active static constexpr size_t FrequentlyLockedThreshold{2}; //!< Threshold for the number of times a buffer can be locked (not from context locks, only normal) before it should be considered frequently locked size_t accumulatedCpuLockCounter{}; //!< Number of times buffer has been locked through non-ContextLocks + /** + * @brief Resets the cached megabuffer allocations: table validity, accumulated view size and the unified megabuffer allocation (`unifiedMegaBufferEnabled` and `lastExecutionNumber` are left untouched) + */ + void ResetMegabufferState(); + private: BufferDelegate *delegate; @@ -307,7 +311,7 @@ namespace skyline::gpu { * @return A binding to the megabuffer allocation for the view, may be invalid if megabuffering is not beneficial * @note The buffer **must** be locked prior to calling this */ - BufferBinding TryMegaBufferView(const std::shared_ptr &pCycle, MegaBufferAllocator &allocator, size_t executionNumber, + BufferBinding TryMegaBufferView(const std::shared_ptr &pCycle, MegaBufferAllocator &allocator, u32 executionNumber, vk::DeviceSize offset, vk::DeviceSize size); /** @@ -436,14 +440,13 @@ namespace skyline::gpu { bool Write(bool isFirstUsage, const std::shared_ptr &cycle, const std::function &flushHostCallback, span data, vk::DeviceSize writeOffset, const std::function &gpuCopyCallback = {}) const; - /* * @brief If megabuffering is determined to be beneficial for the underlying buffer, allocates and copies this view into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region * @param sizeOverride If non-zero, specifies the size of the megabuffer region to allocate and copy to, *MUST* be smaller than the size of the view * @note The view **must** be
locked prior to calling this * @note See Buffer::TryMegaBufferView */ - BufferBinding TryMegaBuffer(const std::shared_ptr &pCycle, MegaBufferAllocator &allocator, size_t executionNumber, size_t sizeOverride = 0) const; + BufferBinding TryMegaBuffer(const std::shared_ptr &pCycle, MegaBufferAllocator &allocator, u32 executionNumber, size_t sizeOverride = 0) const; /** * @return A span of the backing buffer contents