Avoid locking Texture/Buffer in trap handler

We generally don't need to lock the `Texture`/`Buffer` in the trap handler. Locking there is particularly problematic now that the lock is held for the entire duration of a workload submission: it causes heavy contention and stalls the signal handler even when the resource may simply be `Clean` and could be switched over to `CpuDirty` without locking, by using atomics. That is what this commit addresses.
PixelyIon 2022-07-02 19:42:26 +05:30
parent a60d6ec58f
commit b0910e7b1a
4 changed files with 101 additions and 76 deletions
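
The fast path that replaces the lock in the trap handler boils down to a single compare-and-swap on an atomic dirty-state enum. The sketch below illustrates the idea as a simplified, stand-alone example; `OnCpuWriteTrap` and `LockAndSynchronizeGuest` are placeholder names for illustration, not the actual Skyline functions.

```cpp
// Minimal sketch of the lock-free fast path introduced by this commit.
#include <atomic>

enum class DirtyState { Clean, CpuDirty, GpuDirty };

// dirtyState must be lock-free so it is safe to use from the signal (trap) handler
std::atomic<DirtyState> dirtyState{DirtyState::CpuDirty};
static_assert(std::atomic<DirtyState>::is_always_lock_free);

void LockAndSynchronizeGuest() {
    // Placeholder for the slow path: take the resource lock and write the GPU copy
    // back to the guest mappings before the CPU write is allowed to proceed
}

void OnCpuWriteTrap() {
    DirtyState expected{DirtyState::Clean};
    // Clean -> CpuDirty is a single CAS, and an already-CpuDirty resource needs no work,
    // so both cases return without ever taking the resource lock
    if (dirtyState.compare_exchange_strong(expected, DirtyState::CpuDirty, std::memory_order_relaxed) || expected == DirtyState::CpuDirty)
        return;

    // Only a GpuDirty resource still pays for the lock and the guest synchronization
    LockAndSynchronizeGuest();
    dirtyState = DirtyState::CpuDirty; // Assume the guest dirties the mappings after the sync
}
```

The `static_assert` mirrors the one added in the headers below: a non-lock-free atomic would reintroduce blocking inside the signal handler, defeating the purpose of the change.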


@@ -20,6 +20,10 @@ namespace skyline::gpu {
            SynchronizeGuest(true); // We can skip trapping since the caller will do it
            WaitOnFence();
        }, [this] {
+           DirtyState expectedState{DirtyState::Clean};
+           if (dirtyState.compare_exchange_strong(expectedState, DirtyState::CpuDirty, std::memory_order_relaxed) || expectedState == DirtyState::CpuDirty)
+               return; // If we can transition the buffer to CPU dirty (from Clean) or if it already is CPU dirty then we can just return, we only need to do the lock and corresponding sync if the buffer is GPU dirty
+
            std::scoped_lock lock{*this};
            SynchronizeGuest(true);
            dirtyState = DirtyState::CpuDirty; // We need to assume the buffer is dirty since we don't know what the guest is writing
@@ -97,12 +101,17 @@ namespace skyline::gpu {
    }

    void Buffer::MarkGpuDirty() {
-       if (dirtyState == DirtyState::GpuDirty || !guest)
+       if (!guest)
            return;

+       auto currentState{dirtyState.load(std::memory_order_relaxed)};
+       do {
+           if (currentState == DirtyState::GpuDirty)
+               return;
+       } while (!dirtyState.compare_exchange_strong(currentState, DirtyState::GpuDirty, std::memory_order_relaxed));
+
        AdvanceSequence(); // The GPU will modify buffer contents so advance to the next sequence
        gpu.state.nce->RetrapRegions(*trapHandle, false);
-       dirtyState = DirtyState::GpuDirty;
    }

    void Buffer::WaitOnFence() {
@@ -123,42 +132,47 @@ namespace skyline::gpu {
    }

    void Buffer::SynchronizeHost(bool rwTrap) {
-       if (dirtyState != DirtyState::CpuDirty || !guest)
-           return; // If the buffer has not been modified on the CPU or there's no guest buffer, there is no need to synchronize it
+       if (!guest)
+           return;

-       WaitOnFence();
+       auto currentState{dirtyState.load(std::memory_order_relaxed)};
+       do {
+           if (currentState != DirtyState::CpuDirty || !guest)
+               return; // If the buffer has not been modified on the CPU, there is no need to synchronize it
+       } while (!dirtyState.compare_exchange_strong(currentState, rwTrap ? DirtyState::GpuDirty : DirtyState::Clean, std::memory_order_relaxed));

        TRACE_EVENT("gpu", "Buffer::SynchronizeHost");

        AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
-       std::memcpy(backing.data(), mirror.data(), mirror.size());
+       WaitOnFence();

-       if (rwTrap) {
-           gpu.state.nce->RetrapRegions(*trapHandle, false);
-           dirtyState = DirtyState::GpuDirty;
-       } else {
-           gpu.state.nce->RetrapRegions(*trapHandle, true);
-           dirtyState = DirtyState::Clean;
-       }
+       std::memcpy(backing.data(), mirror.data(), mirror.size());
+       gpu.state.nce->RetrapRegions(*trapHandle, !rwTrap); // Trap any future CPU reads (optionally) + writes to this buffer
    }

    void Buffer::SynchronizeGuest(bool skipTrap, bool nonBlocking) {
-       if (dirtyState != DirtyState::GpuDirty || !guest)
-           return; // If the buffer has not been used on the GPU or there's no guest buffer, there is no need to synchronize it
+       if (!guest)
+           return;
+
+       auto currentState{dirtyState.load(std::memory_order_relaxed)};
+       do {
+           if (currentState != DirtyState::GpuDirty)
+               return; // If the buffer has not been used on the GPU, there is no need to synchronize it
+       } while (!dirtyState.compare_exchange_strong(currentState, DirtyState::Clean, std::memory_order_relaxed));

        if (nonBlocking && !PollFence())
            return;
-       else if (!nonBlocking)
-           WaitOnFence();

        TRACE_EVENT("gpu", "Buffer::SynchronizeGuest");

-       std::memcpy(mirror.data(), backing.data(), mirror.size());
-
        if (!skipTrap)
            gpu.state.nce->RetrapRegions(*trapHandle, true);
-       dirtyState = DirtyState::Clean;
+
+       if (!nonBlocking)
+           WaitOnFence();
+
+       std::memcpy(mirror.data(), backing.data(), mirror.size());
    }

    void Buffer::SynchronizeGuestImmediate(bool isFirstUsage, const std::function<void()> &flushHostCallback) {
@@ -186,8 +200,7 @@ namespace skyline::gpu {
        else if (dirtyState == DirtyState::GpuDirty)
            SynchronizeGuestImmediate(isFirstUsage, flushHostCallback);

-       if (dirtyState != DirtyState::Clean)
-           Logger::Error("Attempting to write to a dirty buffer"); // This should never happen since we do syncs in both directions above
+       // It's possible that the guest will arbitrarily modify the buffer contents on the CPU after the syncs and trigger the signal handler which would set the dirty state to CPU dirty, this is acceptable as there is no requirement to make writes visible immediately

        std::memcpy(mirror.data() + offset, data.data(), data.size()); // Always copy to mirror since any CPU side reads will need the up-to-date contents
@@ -267,7 +280,7 @@ namespace skyline::gpu {
    bool Buffer::BufferDelegate::LockWithTag(ContextTag pTag) {
        bool result{};
-       buffer.Lock([pTag, &result](Buffer* pBuffer) {
+       buffer.Lock([pTag, &result](Buffer *pBuffer) {
            result = pBuffer->LockWithTag(pTag);
        });
        return result;


@@ -36,7 +36,9 @@ namespace skyline::gpu {
            Clean, //!< The CPU mappings are in sync with the GPU buffer
            CpuDirty, //!< The CPU mappings have been modified but the GPU buffer is not up to date
            GpuDirty, //!< The GPU buffer has been modified but the CPU mappings have not been updated
-       } dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU buffer
+       };
+       std::atomic<DirtyState> dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU buffer
+       static_assert(std::atomic<DirtyState>::is_always_lock_free);

        bool everHadInlineUpdate{}; //!< Whether the buffer has ever had an inline update since it was created, if this is set then megabuffering will be attempted by views to avoid the cost of inline GPU updates


@@ -98,7 +98,7 @@ namespace skyline::gpu {
    bool TextureView::LockWithTag(ContextTag tag) {
        bool result{};
-       texture.Lock([tag, &result](Texture* pTexture) {
+       texture.Lock([tag, &result](Texture *pTexture) {
            result = pTexture->LockWithTag(tag);
        });
        return result;
@@ -148,6 +148,10 @@ namespace skyline::gpu {
            SynchronizeGuest(true); // We can skip trapping since the caller will do it
            WaitOnFence();
        }, [this] {
+           DirtyState expectedState{DirtyState::Clean};
+           if (dirtyState.compare_exchange_strong(expectedState, DirtyState::CpuDirty, std::memory_order_relaxed) || expectedState == DirtyState::CpuDirty)
+               return; // If we can transition the texture to CPU dirty (from Clean) or if it already is CPU dirty then we can just return, we only need to do the lock and corresponding sync if the texture is GPU dirty
+
            std::scoped_lock lock{*this};
            SynchronizeGuest(true);
            dirtyState = DirtyState::CpuDirty; // We need to assume the texture is dirty since we don't know what the guest is writing
@@ -156,9 +160,7 @@ namespace skyline::gpu {
    }

    std::shared_ptr<memory::StagingBuffer> Texture::SynchronizeHostImpl() {
-       if (!guest)
-           throw exception("Synchronization of host textures requires a valid guest texture to synchronize from");
-       else if (guest->dimensions != dimensions)
+       if (guest->dimensions != dimensions)
            throw exception("Guest and host dimensions being different is not supported currently");

        auto pointer{mirror.data()};
@@ -576,10 +578,16 @@ namespace skyline::gpu {
    }

    void Texture::MarkGpuDirty() {
-       if (dirtyState == DirtyState::GpuDirty || !guest || format != guest->format)
-           return; // In addition to other checks, we also need to skip GPU dirty if the host format and guest format differ as we don't support re-encoding compressed textures which is when this generally occurs
+       if (!guest || format != guest->format)
+           return; // We need to skip GPU dirty if the host format and guest format differ as we don't support re-encoding compressed textures which is when this generally occurs
+
+       auto currentState{dirtyState.load(std::memory_order_relaxed)};
+       do {
+           if (currentState == DirtyState::GpuDirty)
+               return;
+       } while (!dirtyState.compare_exchange_strong(currentState, DirtyState::GpuDirty, std::memory_order_relaxed));
+
        gpu.state.nce->RetrapRegions(*trapHandle, false);
-       dirtyState = DirtyState::GpuDirty;
    }

    bool Texture::WaitOnBacking() {
@@ -642,8 +650,14 @@ namespace skyline::gpu {
    }

    void Texture::SynchronizeHost(bool rwTrap) {
-       if (dirtyState != DirtyState::CpuDirty || !guest)
-           return; // If the texture has not been modified on the CPU or has no mappings, there is no need to synchronize it
+       if (!guest)
+           return;
+
+       auto currentState{dirtyState.load(std::memory_order_relaxed)};
+       do {
+           if (currentState != DirtyState::CpuDirty)
+               return; // If the texture has not been modified on the CPU or has no mappings, there is no need to synchronize it
+       } while (!dirtyState.compare_exchange_strong(currentState, rwTrap ? DirtyState::GpuDirty : DirtyState::Clean, std::memory_order_relaxed));

        TRACE_EVENT("gpu", "Texture::SynchronizeHost");
@@ -657,19 +671,19 @@ namespace skyline::gpu {
            cycle = lCycle;
        }

-       if (rwTrap) {
-           gpu.state.nce->RetrapRegions(*trapHandle, false);
-           dirtyState = DirtyState::GpuDirty;
-       } else {
-           gpu.state.nce->RetrapRegions(*trapHandle, true); // Trap any future CPU writes to this texture
-           dirtyState = DirtyState::Clean;
-       }
+       gpu.state.nce->RetrapRegions(*trapHandle, !rwTrap); // Trap any future CPU reads (optionally) + writes to this texture
    }

    void Texture::SynchronizeHostWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &pCycle, bool rwTrap) {
-       if (dirtyState != DirtyState::CpuDirty || !guest)
+       if (!guest)
            return;

+       auto currentState{dirtyState.load(std::memory_order_relaxed)};
+       do {
+           if (currentState != DirtyState::CpuDirty)
+               return;
+       } while (!dirtyState.compare_exchange_strong(currentState, rwTrap ? DirtyState::GpuDirty : DirtyState::Clean, std::memory_order_relaxed));

        TRACE_EVENT("gpu", "Texture::SynchronizeHostWithBuffer");

        auto stagingBuffer{SynchronizeHostImpl()};
@@ -680,35 +694,29 @@ namespace skyline::gpu {
            cycle = pCycle;
        }

-       if (rwTrap) {
-           gpu.state.nce->RetrapRegions(*trapHandle, false);
-           dirtyState = DirtyState::GpuDirty;
-       } else {
-           gpu.state.nce->RetrapRegions(*trapHandle, true); // Trap any future CPU writes to this texture
-           dirtyState = DirtyState::Clean;
-       }
+       gpu.state.nce->RetrapRegions(*trapHandle, !rwTrap); // Trap any future CPU reads (optionally) + writes to this texture
    }

    void Texture::SynchronizeGuest(bool skipTrap) {
-       if (dirtyState != DirtyState::GpuDirty || layout == vk::ImageLayout::eUndefined || !guest) {
-           // We can skip syncing in three cases:
-           // * If the texture has not been used on the GPU, there is no need to synchronize it
-           // * If the state of the host texture is undefined then so can the guest
-           // * If there is no guest texture to synchronise
+       if (!guest)
            return;
-       }

-       if (layout == vk::ImageLayout::eUndefined || format != guest->format) {
-           // If the state of the host texture is undefined then so can the guest
-           // If the texture has differing formats on the guest and host, we don't support converting back in that case as it may involve recompression of a decompressed texture
-           if (!skipTrap)
-               gpu.state.nce->RetrapRegions(*trapHandle, true);
-           dirtyState = DirtyState::Clean;
-           return;
-       }
+       auto currentState{dirtyState.load(std::memory_order_relaxed)};
+       do {
+           if (currentState != DirtyState::GpuDirty)
+               return; // If the buffer has not been used on the GPU, there is no need to synchronize it
+       } while (!dirtyState.compare_exchange_strong(currentState, DirtyState::Clean, std::memory_order_relaxed));

        TRACE_EVENT("gpu", "Texture::SynchronizeGuest");

+       if (!skipTrap)
+           gpu.state.nce->RetrapRegions(*trapHandle, true);
+
+       if (layout == vk::ImageLayout::eUndefined || format != guest->format)
+           // If the state of the host texture is undefined then so can the guest
+           // If the texture has differing formats on the guest and host, we don't support converting back in that case as it may involve recompression of a decompressed texture
+           return;
+
        WaitOnBacking();

        if (tiling == vk::ImageTiling::eOptimal || !std::holds_alternative<memory::Image>(backing)) {
@@ -727,25 +735,27 @@ namespace skyline::gpu {
        } else {
            throw exception("Host -> Guest synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
        }
-
-       if (!skipTrap)
-           gpu.state.nce->RetrapRegions(*trapHandle, true);
-       dirtyState = DirtyState::Clean;
    }

    void Texture::SynchronizeGuestWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &pCycle) {
-       if (dirtyState != DirtyState::GpuDirty || !guest)
+       if (!guest)
            return;

-       if (layout == vk::ImageLayout::eUndefined || format != guest->format) {
-           // If the state of the host texture is undefined then so can the guest
-           // If the texture has differing formats on the guest and host, we don't support converting back in that case as it may involve recompression of a decompressed texture
-           dirtyState = DirtyState::Clean;
-           return;
-       }
+       auto currentState{dirtyState.load(std::memory_order_relaxed)};
+       do {
+           if (currentState != DirtyState::GpuDirty)
+               return; // If the buffer has not been used on the GPU, there is no need to synchronize it
+       } while (!dirtyState.compare_exchange_strong(currentState, DirtyState::Clean, std::memory_order_relaxed));

        TRACE_EVENT("gpu", "Texture::SynchronizeGuestWithBuffer");

+       gpu.state.nce->RetrapRegions(*trapHandle, true);
+
+       if (layout == vk::ImageLayout::eUndefined || format != guest->format)
+           // If the state of the host texture is undefined then so can the guest
+           // If the texture has differing formats on the guest and host, we don't support converting back in that case as it may involve recompression of a decompressed texture
+           return;
+
        WaitOnBacking();

        pCycle->ChainCycle(cycle);
@@ -762,8 +772,6 @@ namespace skyline::gpu {
        } else {
            throw exception("Host -> Guest synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
        }
-
-       dirtyState = DirtyState::Clean;
    }

    std::shared_ptr<TextureView> Texture::GetView(vk::ImageViewType type, vk::ImageSubresourceRange range, texture::Format pFormat, vk::ComponentMapping mapping) {


@@ -367,7 +367,9 @@ namespace skyline::gpu {
            Clean, //!< The CPU mappings are in sync with the GPU texture
            CpuDirty, //!< The CPU mappings have been modified but the GPU texture is not up to date
            GpuDirty, //!< The GPU texture has been modified but the CPU mappings have not been updated
-       } dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU texture
+       };
+       std::atomic<DirtyState> dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU texture
+       static_assert(std::atomic<DirtyState>::is_always_lock_free);

        /**
         * @brief Storage for all metadata about a specific view into the buffer, used to prevent redundant view creation and duplication of VkBufferView(s)