Introduce usage tracker for dirty tracking within an execution

This is necessary because, for example, shaders can be updated through a mirror and never hit modification traps. By tracking which addresses have sequenced writes applied, the shader manager can then correctly detect if a given shader has been modified by the GPU.
This commit is contained in:
Billy Laws 2023-03-04 20:11:34 +00:00
parent f64860c93e
commit 090151f0c3
16 changed files with 103 additions and 44 deletions

View File

@ -194,13 +194,15 @@ namespace skyline::gpu {
return isDirect ? ValidateMegaBufferViewImplDirect(size) : ValidateMegaBufferViewImplStaged(size);
}
void Buffer::CopyFromImplDirect(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
void Buffer::CopyFromImplDirect(vk::DeviceSize dstOffset,
Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
everHadInlineUpdate = true;
bool needsGpuTracking{src->RefreshGpuWritesActiveDirect() || RefreshGpuWritesActiveDirect()};
bool needsCpuTracking{RefreshGpuReadsActiveDirect() && !needsGpuTracking};
if (needsGpuTracking || needsCpuTracking) {
if (needsGpuTracking) // Force buffer to be dirty for this cycle if either of the sources are dirty, this is needed as otherwise it could have just been dirty from the previous cycle
MarkGpuDirty();
MarkGpuDirty(usageTracker);
gpuCopyCallback();
if (needsCpuTracking)
@ -210,7 +212,9 @@ namespace skyline::gpu {
}
}
void Buffer::CopyFromImplStaged(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
void Buffer::CopyFromImplStaged(vk::DeviceSize dstOffset,
Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
std::scoped_lock lock{stateMutex, src->stateMutex}; // Fine even if src and dst are same since recursive mutex
if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked())
@ -230,18 +234,19 @@ namespace skyline::gpu {
else
gpuCopyCallback();
} else {
MarkGpuDirty();
MarkGpuDirty(usageTracker);
gpuCopyCallback();
}
}
bool Buffer::WriteImplDirect(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) {
bool Buffer::WriteImplDirect(span<u8> data, vk::DeviceSize offset,
UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
// If the buffer is GPU dirty do the write on the GPU and we're done
if (RefreshGpuWritesActiveDirect()) {
if (gpuCopyCallback) {
// Propagate dirtiness to the current cycle, since if this is only dirty in a previous cycle that could change at any time and we would need to have the write saved somewhere for CPU reads
// By propagating the dirtiness to the current cycle we can avoid this and force a wait on any reads
MarkGpuDirty();
MarkGpuDirty(usageTracker);
gpuCopyCallback();
return false;
} else {
@ -349,6 +354,15 @@ namespace skyline::gpu {
AdvanceSequence(); // The GPU will modify buffer contents so advance to the next sequence
}
/**
 * Flags the buffer as GPU dirty for the current execution and forwards to the
 * direct or staged implementation depending on `isDirect`.
 * Unlike `MarkGpuDirty`, this performs no guest check and does not touch any usage tracker.
 */
void Buffer::MarkGpuDirtyImpl() {
    currentExecutionGpuDirty = true;

    // Staged buffers take the staged path; everything else is handled by the direct path
    if (!isDirect) {
        MarkGpuDirtyImplStaged();
        return;
    }

    MarkGpuDirtyImplDirect();
}
Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, GuestBuffer guest, size_t id, bool direct)
: gpu{gpu},
guest{guest},
@ -382,16 +396,12 @@ namespace skyline::gpu {
WaitOnFence();
}
void Buffer::MarkGpuDirty() {
void Buffer::MarkGpuDirty(UsageTracker &usageTracker) {
if (!guest)
return;
currentExecutionGpuDirty = true;
if (isDirect)
MarkGpuDirtyImplDirect();
else
MarkGpuDirtyImplStaged();
usageTracker.dirtyIntervals.Insert(*guest);
MarkGpuDirtyImpl();
}
void Buffer::WaitOnFence() {
@ -493,24 +503,30 @@ namespace skyline::gpu {
ReadImplStaged(isFirstUsage, flushHostCallback, data, offset);
}
bool Buffer::Write(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) {
bool Buffer::Write(span<u8> data, vk::DeviceSize offset, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
everHadInlineUpdate = true;
usageTracker.sequencedIntervals.Insert(*guest);
if (isDirect)
return WriteImplDirect(data, offset, gpuCopyCallback);
return WriteImplDirect(data, offset, usageTracker, gpuCopyCallback);
else
return WriteImplStaged(data, offset, gpuCopyCallback);
}
void Buffer::CopyFrom(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
void Buffer::CopyFrom(vk::DeviceSize dstOffset,
Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
everHadInlineUpdate = true;
usageTracker.sequencedIntervals.Insert(*guest);
if (isDirect)
CopyFromImplDirect(dstOffset, src, srcOffset, size, gpuCopyCallback);
CopyFromImplDirect(dstOffset, src, srcOffset, size, usageTracker, gpuCopyCallback);
else
CopyFromImplStaged(dstOffset, src, srcOffset, size, gpuCopyCallback);
CopyFromImplStaged(dstOffset, src, srcOffset, size, usageTracker, gpuCopyCallback);
}
BufferView Buffer::GetView(vk::DeviceSize offset, vk::DeviceSize size) {
@ -676,8 +692,8 @@ namespace skyline::gpu {
GetBuffer()->Read(isFirstUsage, flushHostCallback, data, readOffset + GetOffset());
}
bool BufferView::Write(span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback) const {
return GetBuffer()->Write(data, writeOffset + GetOffset(), gpuCopyCallback);
bool BufferView::Write(span<u8> data, vk::DeviceSize writeOffset, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) const {
return GetBuffer()->Write(data, writeOffset + GetOffset(), usageTracker, gpuCopyCallback);
}
BufferBinding BufferView::TryMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, ContextTag executionTag, size_t sizeOverride) const {
@ -689,9 +705,9 @@ namespace skyline::gpu {
return backing.subspan(GetOffset(), size);
}
void BufferView::CopyFrom(BufferView src, const std::function<void()> &gpuCopyCallback) {
void BufferView::CopyFrom(BufferView src, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback) {
if (src.size != size)
throw exception("Copy size mismatch!");
return GetBuffer()->CopyFrom(GetOffset(), src.GetBuffer(), src.GetOffset(), size, gpuCopyCallback);
return GetBuffer()->CopyFrom(GetOffset(), src.GetBuffer(), src.GetOffset(), size, usageTracker, gpuCopyCallback);
}
}

View File

@ -8,6 +8,7 @@
#include <common/spin_lock.h>
#include <nce.h>
#include <gpu/tag_allocator.h>
#include "usage_tracker.h"
#include "megabuffer.h"
#include "memory_manager.h"
@ -146,11 +147,16 @@ namespace skyline::gpu {
*/
bool ValidateMegaBufferView(vk::DeviceSize size);
void CopyFromImplDirect(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback);
void CopyFromImplDirect(vk::DeviceSize dstOffset,
Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback);
void CopyFromImplStaged(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback);
void CopyFromImplStaged(vk::DeviceSize dstOffset,
Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback);
bool WriteImplDirect(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback = {});
bool WriteImplDirect(span<u8> data, vk::DeviceSize offset,
UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback = {});
bool WriteImplStaged(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback = {});
@ -162,6 +168,8 @@ namespace skyline::gpu {
void MarkGpuDirtyImplStaged();
void MarkGpuDirtyImpl();
public:
void UpdateCycle(const std::shared_ptr<FenceCycle> &newCycle) {
newCycle->ChainCycle(cycle);
@ -227,7 +235,7 @@ namespace skyline::gpu {
* @note This **must** be called after syncing the buffer to the GPU not before
* @note The buffer **must** be locked prior to calling this
*/
void MarkGpuDirty();
void MarkGpuDirty(UsageTracker &usageTracker);
/**
* @brief Prevents sequenced writes to this buffer's backing from occurring on the CPU, forcing sequencing on the GPU instead for the duration of the context. Unsequenced writes such as those from the guest can still occur however.
@ -365,13 +373,15 @@ namespace skyline::gpu {
* @param gpuCopyCallback Optional callback to perform a GPU-side copy for this Write if necessary, if such a copy is needed and this is not supplied `true` will be returned to indicate that the write needs to be repeated with the callback present
* @return Whether the write needs to be repeated with `gpuCopyCallback` provided, always false if `gpuCopyCallback` is provided
*/
bool Write(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback = {});
bool Write(span<u8> data, vk::DeviceSize offset, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback = {});
/**
* @brief Copies a region of the src buffer into a region of this buffer
* @note The src/dst buffers **must** be locked prior to calling this
*/
void CopyFrom(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback);
void CopyFrom(vk::DeviceSize dstOffset,
Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size,
UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback);
/**
* @return A view into this buffer with the supplied attributes
@ -528,7 +538,7 @@ namespace skyline::gpu {
* @note The view **must** be locked prior to calling this
* @note See Buffer::Write
*/
bool Write(span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback = {}) const;
bool Write(span<u8> data, vk::DeviceSize writeOffset, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback = {}) const;
/*
* @brief If megabuffering is determined to be beneficial for the underlying buffer, allocates and copies this view into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region
@ -550,7 +560,7 @@ namespace skyline::gpu {
* @brief Copies the contents of one view into this one
* @note The src/dst views **must** be locked prior to calling this
*/
void CopyFrom(BufferView src, const std::function<void()> &gpuCopyCallback);
void CopyFrom(BufferView src, UsageTracker &usageTracker, const std::function<void()> &gpuCopyCallback);
constexpr operator bool() {
return delegate != nullptr;

View File

@ -113,7 +113,7 @@ namespace skyline::gpu {
if (srcBuffer.lock.IsFirstUsage() && newBuffer->dirtyState != Buffer::DirtyState::GpuDirty)
copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->mirror.data(), srcBuffer->backing->data());
else
newBuffer->MarkGpuDirty();
newBuffer->MarkGpuDirtyImpl();
// Since we don't synchost source buffers and the source buffers here are GPU dirty their mirrors will be out of date, meaning the backing contents of this source buffer's region in the new buffer from the initial synchost call will be incorrect. By copying backings directly here we can ensure that no writes are lost and that if the newly created buffer needs to turn GPU dirty during recreation no copies need to be done since the backing is as up to date as the mirror at a minimum.
copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->backing->data(), srcBuffer->backing->data());
@ -126,7 +126,7 @@ namespace skyline::gpu {
}
} else {
if (srcBuffer->directGpuWritesActive) {
newBuffer->MarkGpuDirty();
newBuffer->MarkGpuDirtyImpl();
} else if (srcBuffer->directTrackedShadowActive) {
newBuffer->EnableTrackedShadowDirect();
copyBuffer(*newBuffer->guest, *srcBuffer->guest, newBuffer->directTrackedShadow.data(), srcBuffer->directTrackedShadow.data());

View File

@ -562,6 +562,7 @@ namespace skyline::gpu::interconnect {
attachedBuffers.clear();
allocator->Reset();
renderPassIndex = 0;
usageTracker.sequencedIntervals.Clear();
// Periodically clear preserve attachments just in case there are new waiters which would otherwise end up waiting forever
if ((submissionNumber % (2U << *state.settings->executorSlotCountScale)) == 0) {
@ -586,7 +587,6 @@ namespace skyline::gpu::interconnect {
SubmitInternal();
submissionNumber++;
} else {
if (callback && *state.settings->useDirectMemoryImport)
waiterThread.Queue(nullptr, std::move(callback));
@ -598,6 +598,8 @@ namespace skyline::gpu::interconnect {
ResetInternal();
if (wait) {
usageTracker.dirtyIntervals.Clear();
std::condition_variable cv;
std::mutex mutex;
bool gpuDone{};

View File

@ -6,6 +6,7 @@
#include <boost/container/stable_vector.hpp>
#include <renderdoc_app.h>
#include <common/linear_allocator.h>
#include <gpu/usage_tracker.h>
#include <gpu/megabuffer.h>
#include "command_nodes.h"
#include "common/spin_lock.h"
@ -217,6 +218,7 @@ namespace skyline::gpu::interconnect {
size_t submissionNumber{};
ContextTag executionTag{};
bool captureNextExecution{};
UsageTracker usageTracker;
CommandExecutor(const DeviceState &state);

View File

@ -62,7 +62,7 @@ namespace skyline::gpu::interconnect {
dstStageMask |= dstStage;
}
view.GetBuffer()->MarkGpuDirty();
view.GetBuffer()->MarkGpuDirty(ctx.executor.usageTracker);
} else {
if (auto megaBufferBinding{view.TryMegaBuffer(ctx.executor.cycle, ctx.gpu.megaBufferAllocator, ctx.executor.executionTag)})
return megaBufferBinding;

View File

@ -53,13 +53,13 @@ namespace skyline::gpu::interconnect {
mirrorBlock = blockMapping;
}
if (entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber) {
entry->channelSequenceNumber = ctx.channelCtx.channelSequenceNumber;
if (entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->executionTag != ctx.executor.executionTag) {
entry->executionTag = ctx.executor.executionTag;
entry->dirty = true;
}
// If the mirror entry has been written to, clear its shader binary cache and retrap to catch any future writes
if (entry->dirty) {
if (entry->dirty || ctx.executor.usageTracker.sequencedIntervals.Intersect(blockMapping.subspan(blockOffset))) {
entry->cache.clear();
entry->dirty = false;
@ -129,7 +129,7 @@ namespace skyline::gpu::interconnect {
if (programBase != lastProgramBase || programOffset != lastProgramOffset)
return true;
if (entry && entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->channelSequenceNumber != ctx.channelCtx.channelSequenceNumber)
if (entry && entry->trapCount > MirrorEntry::SkipTrapThreshold && entry->executionTag != ctx.executor.executionTag)
return true;
else if (entry && entry->dirty)
return true;

View File

@ -22,7 +22,7 @@ namespace skyline::gpu::interconnect {
static constexpr u32 SkipTrapThreshold{20}; //!< Threshold for the number of times a mirror trap needs to be hit before we fallback to always hashing
u32 trapCount{}; //!< The number of times the trap has been hit, used to avoid trapping in cases where the constant retraps would harm performance
size_t channelSequenceNumber{}; //!< For the case where `trapCount > SkipTrapThreshold`, the memory sequence number number used to clear the cache after every access
ContextTag executionTag{}; //!< For the case where `trapCount > SkipTrapThreshold`, the tag of the last execution in which this entry was forcibly marked dirty, so that the cache is cleared the first time the entry is accessed in each new execution
bool dirty{}; //!< If the trap has been hit and the cache needs to be cleared
MirrorEntry(span<u8> alignedMirror) : mirror{alignedMirror} {}

View File

@ -123,6 +123,7 @@ namespace skyline::gpu::interconnect {
auto dstTextureView{gpu.texture.FindOrCreate(dstGuestTexture, executor.tag)};
executor.AttachDependency(dstTextureView);
executor.AttachTexture(dstTextureView.get());
dstTextureView->texture->MarkGpuDirty(executor.usageTracker);
// Blit shader always samples from centre so adjust if necessary
float centredSrcRectX{sampleOrigin == SampleModeOrigin::Corner ? srcRectX - 0.5f : srcRectX};

View File

@ -22,7 +22,7 @@ namespace skyline::gpu::interconnect {
ContextLock dstBufLock{executor.tag, dstBuf};
dstBuf.Write(src, 0, [&]() {
dstBuf.Write(src, 0, executor.usageTracker, [&]() {
executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
// This will prevent any CPU accesses to backing for the duration of the usage
dstBuf.GetBuffer()->BlockAllCpuBackingWrites();

View File

@ -206,7 +206,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
dstStageMask |= vk::PipelineStageFlagBits::eTransformFeedbackEXT;
}
view->GetBuffer()->MarkGpuDirty();
view->GetBuffer()->MarkGpuDirty(ctx.executor.usageTracker);
builder.SetTransformFeedbackBuffer(index, *view);
return;
} else {

View File

@ -46,7 +46,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
ContextLock lock{ctx.executor.tag, view};
// First attempt the write without setting up the gpu copy callback as a fast path
if (view.Write(srcCpuBuf, offset)) [[unlikely]] {
if (view.Write(srcCpuBuf, offset, ctx.executor.usageTracker)) [[unlikely]] {
// Store callback data in a stack allocated struct to avoid heap allocation for the gpu copy callback lambda
struct GpuCopyCallbackData {
InterconnectContext &ctx;
@ -56,7 +56,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
BufferView &view;
} callbackData{ctx, srcCpuBuf, offset, lock, view};
view.Write(srcCpuBuf, offset, [&callbackData]() {
view.Write(srcCpuBuf, offset, ctx.executor.usageTracker, [&callbackData]() {
callbackData.ctx.executor.AttachLockedBufferView(callbackData.view, std::move(callbackData.lock));
// This will prevent any CPU accesses to backing for the duration of the usage
callbackData.view.GetBuffer()->BlockAllCpuBackingWrites();

View File

@ -24,7 +24,7 @@ namespace skyline::gpu::interconnect {
})};
ContextLock dstBufLock{executor.tag, dstBuf};
dstBuf.CopyFrom(srcBuf, [&]() {
dstBuf.CopyFrom(srcBuf, executor.usageTracker, [&]() {
executor.AttachLockedBufferView(srcBuf, std::move(srcBufLock));
executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
// This will prevent any CPU accesses to backing for the duration of the usage

View File

@ -725,6 +725,12 @@ namespace skyline::gpu {
}
}
/**
 * Records every valid guest mapping of this texture in the dirty intervals of the supplied usage tracker.
 * A texture without a guest has no mappings to record, so the call is a no-op in that case; this matches
 * the `!guest` guard used by Texture::SynchronizeHost and Buffer::MarkGpuDirty.
 * @param usageTracker The tracker whose `dirtyIntervals` receives the texture's mappings
 */
void Texture::MarkGpuDirty(UsageTracker &usageTracker) {
    if (!guest)
        return; // Nothing to track without guest mappings, and dereferencing an empty guest would be invalid

    for (auto mapping : guest->mappings)
        if (mapping.valid()) // Only valid mappings are inserted into the tracker
            usageTracker.dirtyIntervals.Insert(mapping);
}
void Texture::SynchronizeHost(bool gpuDirty) {
if (!guest)
return;

View File

@ -10,6 +10,7 @@
#include <nce.h>
#include <gpu/tag_allocator.h>
#include <gpu/memory_manager.h>
#include <gpu/usage_tracker.h>
namespace skyline::gpu {
namespace texture {
@ -560,6 +561,11 @@ namespace skyline::gpu {
*/
void TransitionLayout(vk::ImageLayout layout);
/**
* @brief Records the texture's valid guest mappings as GPU-dirty intervals in the supplied usage tracker
* @param usageTracker The tracker whose `dirtyIntervals` the mappings are inserted into
*/
void MarkGpuDirty(UsageTracker &usageTracker);
/**
* @brief Synchronizes the host texture with the guest after it has been modified
* @param gpuDirty If true, the texture will be transitioned to being GpuDirty by this call

View File

@ -0,0 +1,16 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <common/interval_list.h>
namespace skyline::gpu {
/**
* @brief Tracks the usage of GPU memory and buffers to allow for fine-grained flushing
*/
struct UsageTracker {
IntervalList<u8 *> dirtyIntervals; //!< Intervals of GPU-dirty contents that requires a flush before accessing
IntervalList<u8 *> sequencedIntervals; //!< Intervals of GPFIFO-sequenced writes that occur within an execution
};
}