Be more careful about potentially-unneeded GPU->CPU syncs

These syncs can be especially expensive, so they should be avoided as much as possible.
This commit is contained in:
Billy Laws 2022-10-25 20:48:38 +01:00
parent e6530e2386
commit 0b5d9308c4
3 changed files with 15 additions and 16 deletions

View File

@ -247,16 +247,20 @@ namespace skyline::gpu {
std::memcpy(data.data(), mirror.data() + offset, data.size()); std::memcpy(data.data(), mirror.data() + offset, data.size());
} }
bool Buffer::Write(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) { bool Buffer::Write(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) {
AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
everHadInlineUpdate = true; everHadInlineUpdate = true;
// We cannot have *ANY* state changes for the duration of this function, if the buffer became CPU dirty partway through the GPU writes would mismatch the CPU writes // We cannot have *ANY* state changes for the duration of this function, if the buffer became CPU dirty partway through the GPU writes would mismatch the CPU writes
std::scoped_lock lock{stateMutex}; std::scoped_lock lock{stateMutex};
// Syncs in both directions to ensure correct ordering of writes // If the buffer is GPU dirty do the write on the GPU and we're done
if (dirtyState == DirtyState::GpuDirty) if (dirtyState == DirtyState::GpuDirty) {
SynchronizeGuestImmediate(isFirstUsage, flushHostCallback); if (gpuCopyCallback)
gpuCopyCallback();
else
return true;
}
if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked()) if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked())
// If the buffer is used in sequence directly on the GPU, SynchronizeHost before modifying the mirror contents to ensure proper sequencing. This write will then be sequenced on the GPU instead (the buffer will be kept clean for the rest of the execution due to gpuCopyCallback blocking all writes) // If the buffer is used in sequence directly on the GPU, SynchronizeHost before modifying the mirror contents to ensure proper sequencing. This write will then be sequenced on the GPU instead (the buffer will be kept clean for the rest of the execution due to gpuCopyCallback blocking all writes)
@ -301,7 +305,7 @@ namespace skyline::gpu {
return {}; return {};
// We are safe to check dirty state here since it will only ever be set GPU dirty with the buffer locked and from the active GPFIFO thread. This helps with perf since the lock ends up being slightly expensive // We are safe to check dirty state here since it will only ever be set GPU dirty with the buffer locked and from the active GPFIFO thread. This helps with perf since the lock ends up being slightly expensive
if (dirtyState == DirtyState::GpuDirty && !SynchronizeGuest(false, true)) if (dirtyState == DirtyState::GpuDirty)
// Bail out if buffer cannot be synced, we don't know the contents ahead of time so the sequence is indeterminate // Bail out if buffer cannot be synced, we don't know the contents ahead of time so the sequence is indeterminate
return {}; return {};
@ -431,9 +435,8 @@ namespace skyline::gpu {
GetBuffer()->Read(isFirstUsage, flushHostCallback, data, readOffset + GetOffset()); GetBuffer()->Read(isFirstUsage, flushHostCallback, data, readOffset + GetOffset());
} }
bool BufferView::Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, bool BufferView::Write(span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback) const {
span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback) const { return GetBuffer()->Write(data, writeOffset + GetOffset(), gpuCopyCallback);
return GetBuffer()->Write(isFirstUsage, flushHostCallback, data, writeOffset + GetOffset(), gpuCopyCallback);
} }
BufferBinding BufferView::TryMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, u32 executionNumber, size_t sizeOverride) const { BufferBinding BufferView::TryMegaBuffer(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, u32 executionNumber, size_t sizeOverride) const {

View File

@ -285,12 +285,10 @@ namespace skyline::gpu {
/** /**
* @brief Writes data at the specified offset in the buffer, falling back to GPU side copies if the buffer is host immutable * @brief Writes data at the specified offset in the buffer, falling back to GPU side copies if the buffer is host immutable
* @param isFirstUsage If this is the first usage of this resource in the context as returned from LockWithTag(...)
* @param flushHostCallback Callback to flush and execute all pending GPU work to allow for synchronisation of GPU dirty buffers
* @param gpuCopyCallback Optional callback to perform a GPU-side copy for this Write if necessary, if such a copy is needed and this is not supplied `true` will be returned to indicate that the write needs to be repeated with the callback present * @param gpuCopyCallback Optional callback to perform a GPU-side copy for this Write if necessary, if such a copy is needed and this is not supplied `true` will be returned to indicate that the write needs to be repeated with the callback present
* @return Whether the write needs to be repeated with `gpuCopyCallback` provided, always false if `gpuCopyCallback` is provided * @return Whether the write needs to be repeated with `gpuCopyCallback` provided, always false if `gpuCopyCallback` is provided
*/ */
bool Write(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback = {}); bool Write(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback = {});
/** /**
* @return A view into this buffer with the supplied attributes * @return A view into this buffer with the supplied attributes
@ -304,7 +302,6 @@ namespace skyline::gpu {
*/ */
BufferView TryGetView(span<u8> mapping); BufferView TryGetView(span<u8> mapping);
/* /*
* @brief If megabuffering is determined to be beneficial for this buffer, allocates and copies the given view of buffer into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region * @brief If megabuffering is determined to be beneficial for this buffer, allocates and copies the given view of buffer into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region
* @return A binding to the megabuffer allocation for the view, may be invalid if megabuffering is not beneficial * @return A binding to the megabuffer allocation for the view, may be invalid if megabuffering is not beneficial
@ -436,8 +433,7 @@ namespace skyline::gpu {
* @note The view **must** be locked prior to calling this * @note The view **must** be locked prior to calling this
* @note See Buffer::Write * @note See Buffer::Write
*/ */
bool Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &cycle, const std::function<void()> &flushHostCallback, bool Write(span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback = {}) const;
span<u8> data, vk::DeviceSize writeOffset, const std::function<void()> &gpuCopyCallback = {}) const;
/* /*
* @brief If megabuffering is determined to be beneficial for the underlying buffer, allocates and copies this view into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region * @brief If megabuffering is determined to be beneficial for the underlying buffer, allocates and copies this view into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region

View File

@ -56,7 +56,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
ContextLock lock{ctx.executor.tag, view}; ContextLock lock{ctx.executor.tag, view};
// First attempt the write without setting up the gpu copy callback as a fast path // First attempt the write without setting up the gpu copy callback as a fast path
if (view.Write(lock.IsFirstUsage(), ctx.executor.cycle, FlushHostCallback, srcCpuBuf, offset)) [[unlikely]] { if (view.Write(srcCpuBuf, offset)) [[unlikely]] {
// Store callback data in a stack allocated struct to avoid heap allocation for the gpu copy callback lambda // Store callback data in a stack allocated struct to avoid heap allocation for the gpu copy callback lambda
struct GpuCopyCallbackData { struct GpuCopyCallbackData {
InterconnectContext &ctx; InterconnectContext &ctx;
@ -66,7 +66,7 @@ namespace skyline::gpu::interconnect::maxwell3d {
BufferView &view; BufferView &view;
} callbackData{ctx, srcCpuBuf, offset, lock, view}; } callbackData{ctx, srcCpuBuf, offset, lock, view};
view.Write(lock.IsFirstUsage(), ctx.executor.cycle, FlushHostCallback, srcCpuBuf, offset, [&callbackData]() { view.Write(srcCpuBuf, offset, [&callbackData]() {
callbackData.ctx.executor.AttachLockedBufferView(callbackData.view, std::move(callbackData.lock)); callbackData.ctx.executor.AttachLockedBufferView(callbackData.view, std::move(callbackData.lock));
// This will prevent any CPU accesses to backing for the duration of the usage // This will prevent any CPU accesses to backing for the duration of the usage
callbackData.view.GetBuffer()->BlockAllCpuBackingWrites(); callbackData.view.GetBuffer()->BlockAllCpuBackingWrites();