From 98c0cc3e7f42a596317dfe318dc19f34f16d4ace Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Sun, 9 Oct 2022 13:23:42 +0100 Subject: [PATCH] Impl preserve attached buffers/textures to avoid GPFIFO lock thrashing When profiling SMO, it became obvious that the constant locking of textures and buffers in SyncDescriptors took up a large amount of CPU time (3-5%), a precious resource in intensive areas like Metro. This commit implements somewhat of a workaround to avoid constant relocking, if a buffer is frequently attached on the GPU and almost never used on the CPU we can keep the lock held between executions. Of course it's not that simple though, if the guest tries to lock a texture for the first time which has already been locked as preserve on the GPFIFO we need to avoid a deadlock. This is acheived through a combination of two things: first we periodically clear the locked attachments every 2*SlotCount submissions, preventing a complete deadlock on the CPU (just a long wait instead) and meaning that the next time the resource is attached on the GPU it will not be marked for preservation due to having been locked on the guest before; second, we always need to unlock everything when the GPU thread runs out of work, as the perioding clearing will not execute in this case which would otherwise leave the textures locked on the GPFIFO thread forever (if guest was waiting on a lock to submit work). It should be noted that we don't clear preserve attached resources in the latter scenario, only unlock them and then relock when more work is available. --- app/src/main/cpp/skyline/gpu/buffer.cpp | 2 +- app/src/main/cpp/skyline/gpu/buffer.h | 7 +- .../gpu/interconnect/command_executor.cpp | 84 +++++++++++++++---- .../gpu/interconnect/command_executor.h | 19 ++++- .../main/cpp/skyline/gpu/texture/texture.h | 5 +- 5 files changed, 94 insertions(+), 23 deletions(-) diff --git a/app/src/main/cpp/skyline/gpu/buffer.cpp b/app/src/main/cpp/skyline/gpu/buffer.cpp index 42862bbc..f53fc311 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.cpp +++ b/app/src/main/cpp/skyline/gpu/buffer.cpp @@ -345,7 +345,7 @@ namespace skyline::gpu { void Buffer::unlock() { tag = ContextTag{}; - backingImmutability = BackingImmutability::None; + AllowAllBackingWrites(); mutex.unlock(); } diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h index f2d94461..342f6ffe 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.h +++ b/app/src/main/cpp/skyline/gpu/buffer.h @@ -43,7 +43,7 @@ namespace skyline::gpu { class Buffer : public std::enable_shared_from_this { private: GPU &gpu; - SpinLock mutex; //!< Synchronizes any mutations to the buffer or its backing + RecursiveSpinLock mutex; //!< Synchronizes any mutations to the buffer or its backing std::atomic tag{}; //!< The tag associated with the last lock call memory::Buffer backing; std::optional guest; @@ -190,6 +190,11 @@ namespace skyline::gpu { backingImmutability = BackingImmutability::AllWrites; } + void AllowAllBackingWrites() { + std::scoped_lock lock{stateMutex}; + backingImmutability = BackingImmutability::None; + } + /** * @return If sequenced writes to the backing must not occur on the CPU * @note The buffer **must** be locked prior to calling this diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp index 4ce188bb..ee35fac5 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MPL-2.0 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/) +#include #include #include #include "command_executor.h" @@ -234,8 +235,12 @@ namespace skyline::gpu::interconnect { textureManagerLock.emplace(gpu.texture); bool didLock{view->LockWithTag(tag)}; - if (didLock) - attachedTextures.emplace_back(view->texture); + if (didLock) { + if (view->texture->FrequentlyLocked()) + attachedTextures.emplace_back(view->texture); + else + preserveAttachedTextures.emplace_back(view->texture); + } return didLock; } @@ -259,8 +264,12 @@ namespace skyline::gpu::interconnect { bufferManagerLock.emplace(gpu.buffer); bool didLock{view.LockWithTag(tag)}; - if (didLock) - attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this()); + if (didLock) { + if (view.GetBuffer()->FrequentlyLocked()) + attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this()); + else + preserveAttachedBuffers.emplace_back(view.GetBuffer()->shared_from_this()); + } return didLock; } @@ -271,14 +280,20 @@ namespace skyline::gpu::interconnect { if (lock.OwnsLock()) { // Transfer ownership to executor so that the resource will stay locked for the period it is used on the GPU - attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this()); + if (view.GetBuffer()->FrequentlyLocked()) + attachedBuffers.emplace_back(view.GetBuffer()->shared_from_this()); + else + preserveAttachedBuffers.emplace_back(view.GetBuffer()->shared_from_this()); lock.Release(); // The executor will handle unlocking the lock so it doesn't need to be handled here } } void CommandExecutor::AttachLockedBuffer(std::shared_ptr buffer, ContextLock &&lock) { if (lock.OwnsLock()) { - attachedBuffers.emplace_back(std::move(buffer)); + if (buffer->FrequentlyLocked()) + attachedBuffers.emplace_back(std::move(buffer)); + else + preserveAttachedBuffers.emplace_back(std::move(buffer)); lock.Release(); // See AttachLockedBufferView(...) } } @@ -381,24 +396,25 @@ namespace skyline::gpu::interconnect { }, {}, {} ); - for (const auto &texture : attachedTextures) + boost::container::small_vector chainedCycles; + for (const auto &texture : ranges::views::concat(attachedTextures, preserveAttachedTextures)) { texture->SynchronizeHostInline(slot->commandBuffer, cycle, true); + // We don't need to attach the Texture to the cycle as a TextureView will already be attached + if (ranges::find(chainedCycles, texture->cycle.get()) == chainedCycles.end()) { + cycle->ChainCycle(texture->cycle); + chainedCycles.emplace_back(texture->cycle.get()); + } + + texture->cycle = cycle; + } } - for (const auto &attachedBuffer : attachedBuffers) - if (attachedBuffer->SequencedCpuBackingWritesBlocked()) + for (const auto &attachedBuffer : ranges::views::concat(attachedBuffers, preserveAttachedBuffers)) { + if (attachedBuffer->RequiresCycleAttach()) { attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer - - for (const auto &attachedTexture : attachedTextures) { - // We don't need to attach the Texture to the cycle as a TextureView will already be attached - cycle->ChainCycle(attachedTexture->cycle); - attachedTexture->cycle = cycle; - } - - for (const auto &attachedBuffer : attachedBuffers) { - if (attachedBuffer->RequiresCycleAttach() ) { cycle->AttachObject(attachedBuffer.buffer); attachedBuffer->UpdateCycle(cycle); + attachedBuffer->AllowAllBackingWrites(); } } @@ -412,6 +428,12 @@ namespace skyline::gpu::interconnect { bufferManagerLock.reset(); megaBufferAllocatorLock.reset(); allocator->Reset(); + + // Periodically clear preserve attachments just in case there are new waiters which would otherwise end up waiting forever + if ((submissionNumber % CommandRecordThread::ActiveRecordSlots * 2) == 0) { + preserveAttachedBuffers.clear(); + preserveAttachedTextures.clear(); + } } void CommandExecutor::Submit() { @@ -423,7 +445,33 @@ namespace skyline::gpu::interconnect { if (!slot->nodes.empty()) { TRACE_EVENT("gpu", "CommandExecutor::Submit"); SubmitInternal(); + submissionNumber++; } + ResetInternal(); } + + void CommandExecutor::LockPreserve() { + if (!preserveLocked) { + preserveLocked = true; + + for (auto &buffer : preserveAttachedBuffers) + buffer->LockWithTag(tag); + + for (auto &texture : preserveAttachedTextures) + texture->LockWithTag(tag); + } + } + + void CommandExecutor::UnlockPreserve() { + if (preserveLocked) { + for (auto &buffer : preserveAttachedBuffers) + buffer->unlock(); + + for (auto &texture : preserveAttachedTextures) + texture->unlock(); + + preserveLocked = false; + } + } } diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h index 9f7a9d5e..874426fb 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h +++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h @@ -15,6 +15,8 @@ namespace skyline::gpu::interconnect { */ class CommandRecordThread { public: + static constexpr size_t ActiveRecordSlots{6}; //!< Maximum number of simultaneously active slots + /** * @brief Single execution slot, buffered back and forth between the GPFIFO thread and the record thread */ @@ -40,7 +42,6 @@ namespace skyline::gpu::interconnect { const DeviceState &state; std::thread thread; - static constexpr size_t ActiveRecordSlots{6}; //!< Maximum number of simultaneously active slots CircularQueue incoming{ActiveRecordSlots}; //!< Slots pending recording CircularQueue outgoing{ActiveRecordSlots}; //!< Slots that have been submitted, may still be active on the GPU @@ -76,6 +77,7 @@ namespace skyline::gpu::interconnect { std::optional> textureManagerLock; //!< The lock on the texture manager, this is locked for the duration of the command execution from the first usage inside an execution to the submission std::optional> bufferManagerLock; //!< The lock on the buffer manager, see above for details std::optional> megaBufferAllocatorLock; //!< The lock on the megabuffer allocator, see above for details + bool preserveLocked{}; /** * @brief A wrapper of a Texture object that has been locked beforehand and must be unlocked afterwards @@ -94,6 +96,7 @@ namespace skyline::gpu::interconnect { ~LockedTexture(); }; + std::vector preserveAttachedTextures; std::vector attachedTextures; //!< All textures that are attached to the current execution /** @@ -113,6 +116,7 @@ namespace skyline::gpu::interconnect { ~LockedBuffer(); }; + std::vector preserveAttachedBuffers; std::vector attachedBuffers; //!< All textures that are attached to the current execution @@ -154,6 +158,7 @@ namespace skyline::gpu::interconnect { std::shared_ptr cycle; //!< The fence cycle that this command executor uses to wait for the GPU to finish executing commands LinearAllocatorState<> *allocator; ContextTag tag; //!< The tag associated with this command executor, any tagged resource locking must utilize this tag + size_t submissionNumber{}; u32 executionNumber{}; CommandExecutor(const DeviceState &state); @@ -258,5 +263,17 @@ namespace skyline::gpu::interconnect { * @brief Execute all the nodes and submit the resulting command buffer to the GPU */ void Submit(); + + /** + * @brief Locks all preserve attached buffers/textures + * @note This **MUST** be called before attaching any buffers/textures to an execution + */ + void LockPreserve(); + + /** + * @brief Unlocks all preserve attached buffers/textures + * @note This **MUST** be called when there is no GPU work left to be done to avoid deadlocks where the guest will try to lock a buffer/texture but the GPFIFO thread has no work so won't periodically unlock it + */ + void UnlockPreserve(); }; } diff --git a/app/src/main/cpp/skyline/gpu/texture/texture.h b/app/src/main/cpp/skyline/gpu/texture/texture.h index 000a2122..f1e8da30 100644 --- a/app/src/main/cpp/skyline/gpu/texture/texture.h +++ b/app/src/main/cpp/skyline/gpu/texture/texture.h @@ -3,6 +3,7 @@ #pragma once +#include #include #include #include @@ -356,9 +357,9 @@ namespace skyline::gpu { class Texture : public std::enable_shared_from_this { private: GPU &gpu; - std::mutex mutex; //!< Synchronizes any mutations to the texture or its backing + RecursiveSpinLock mutex; //!< Synchronizes any mutations to the texture or its backing std::atomic tag{}; //!< The tag associated with the last lock call - std::condition_variable backingCondition; //!< Signalled when a valid backing has been swapped in + std::condition_variable_any backingCondition; //!< Signalled when a valid backing has been swapped in using BackingType = std::variant; BackingType backing; //!< The Vulkan image that backs this texture, it is nullable