From 0b5d9308c48811e1108c3a8743906d2303cef2c8 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Tue, 25 Oct 2022 20:48:38 +0100 Subject: [PATCH] Be more careful about potentially-unneeded GPU->CPU syncs These can be especially expensive so should be avoided as much as possible. --- app/src/main/cpp/skyline/gpu/buffer.cpp | 19 +++++++++++-------- app/src/main/cpp/skyline/gpu/buffer.h | 8 ++------ .../maxwell_3d/constant_buffers.cpp | 4 ++-- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/app/src/main/cpp/skyline/gpu/buffer.cpp b/app/src/main/cpp/skyline/gpu/buffer.cpp index 18fd24a5..d0ffd7ba 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.cpp +++ b/app/src/main/cpp/skyline/gpu/buffer.cpp @@ -247,16 +247,20 @@ namespace skyline::gpu { std::memcpy(data.data(), mirror.data() + offset, data.size()); } - bool Buffer::Write(bool isFirstUsage, const std::function &flushHostCallback, span data, vk::DeviceSize offset, const std::function &gpuCopyCallback) { + bool Buffer::Write(span data, vk::DeviceSize offset, const std::function &gpuCopyCallback) { AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence everHadInlineUpdate = true; // We cannot have *ANY* state changes for the duration of this function, if the buffer became CPU dirty partway through the GPU writes would mismatch the CPU writes std::scoped_lock lock{stateMutex}; - // Syncs in both directions to ensure correct ordering of writes - if (dirtyState == DirtyState::GpuDirty) - SynchronizeGuestImmediate(isFirstUsage, flushHostCallback); + // If the buffer is GPU dirty do the write on the GPU and we're done + if (dirtyState == DirtyState::GpuDirty) { + if (gpuCopyCallback) + gpuCopyCallback(); + else + return true; + } if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked()) // If the buffer is used in sequence directly on the GPU, SynchronizeHost before modifying the mirror contents to ensure proper sequencing. This write will then be sequenced on the GPU instead (the buffer will be kept clean for the rest of the execution due to gpuCopyCallback blocking all writes) @@ -301,7 +305,7 @@ namespace skyline::gpu { return {}; // We are safe to check dirty state here since it will only ever be set GPU dirty with the buffer locked and from the active GPFIFO thread. This helps with perf since the lock ends up being slightly expensive - if (dirtyState == DirtyState::GpuDirty && !SynchronizeGuest(false, true)) + if (dirtyState == DirtyState::GpuDirty) // Bail out if buffer cannot be synced, we don't know the contents ahead of time so the sequence is indeterminate return {}; @@ -431,9 +435,8 @@ namespace skyline::gpu { GetBuffer()->Read(isFirstUsage, flushHostCallback, data, readOffset + GetOffset()); } - bool BufferView::Write(bool isFirstUsage, const std::shared_ptr &pCycle, const std::function &flushHostCallback, - span data, vk::DeviceSize writeOffset, const std::function &gpuCopyCallback) const { - return GetBuffer()->Write(isFirstUsage, flushHostCallback, data, writeOffset + GetOffset(), gpuCopyCallback); + bool BufferView::Write(span data, vk::DeviceSize writeOffset, const std::function &gpuCopyCallback) const { + return GetBuffer()->Write(data, writeOffset + GetOffset(), gpuCopyCallback); } BufferBinding BufferView::TryMegaBuffer(const std::shared_ptr &pCycle, MegaBufferAllocator &allocator, u32 executionNumber, size_t sizeOverride) const { diff --git a/app/src/main/cpp/skyline/gpu/buffer.h b/app/src/main/cpp/skyline/gpu/buffer.h index 691cb364..ee5e2289 100644 --- a/app/src/main/cpp/skyline/gpu/buffer.h +++ b/app/src/main/cpp/skyline/gpu/buffer.h @@ -285,12 +285,10 @@ namespace skyline::gpu { /** * @brief Writes data at the specified offset in the buffer, falling back to GPU side copies if the buffer is host immutable - * @param isFirstUsage If this is the first usage of this resource in the context as returned from LockWithTag(...) - * @param flushHostCallback Callback to flush and execute all pending GPU work to allow for synchronisation of GPU dirty buffers * @param gpuCopyCallback Optional callback to perform a GPU-side copy for this Write if necessary, if such a copy is needed and this is not supplied `true` will be returned to indicate that the write needs to be repeated with the callback present * @return Whether the write needs to be repeated with `gpuCopyCallback` provided, always false if `gpuCopyCallback` is provided */ - bool Write(bool isFirstUsage, const std::function &flushHostCallback, span data, vk::DeviceSize offset, const std::function &gpuCopyCallback = {}); + bool Write(span data, vk::DeviceSize offset, const std::function &gpuCopyCallback = {}); /** * @return A view into this buffer with the supplied attributes @@ -304,7 +302,6 @@ namespace skyline::gpu { */ BufferView TryGetView(span mapping); - /* * @brief If megabuffering is determined to be beneficial for this buffer, allocates and copies the given view of buffer into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region * @return A binding to the megabuffer allocation for the view, may be invalid if megabuffering is not beneficial @@ -436,8 +433,7 @@ namespace skyline::gpu { * @note The view **must** be locked prior to calling this * @note See Buffer::Write */ - bool Write(bool isFirstUsage, const std::shared_ptr &cycle, const std::function &flushHostCallback, - span data, vk::DeviceSize writeOffset, const std::function &gpuCopyCallback = {}) const; + bool Write(span data, vk::DeviceSize writeOffset, const std::function &gpuCopyCallback = {}) const; /* * @brief If megabuffering is determined to be beneficial for the underlying buffer, allocates and copies this view into the megabuffer (in case of cache miss), returning a binding of the allocated megabuffer region diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp index 0b3074fe..6c363041 100644 --- a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_3d/constant_buffers.cpp @@ -56,7 +56,7 @@ namespace skyline::gpu::interconnect::maxwell3d { ContextLock lock{ctx.executor.tag, view}; // First attempt the write without setting up the gpu copy callback as a fast path - if (view.Write(lock.IsFirstUsage(), ctx.executor.cycle, FlushHostCallback, srcCpuBuf, offset)) [[unlikely]] { + if (view.Write(srcCpuBuf, offset)) [[unlikely]] { // Store callback data in a stack allocated struct to avoid heap allocation for the gpu copy callback lambda struct GpuCopyCallbackData { InterconnectContext &ctx; @@ -66,7 +66,7 @@ namespace skyline::gpu::interconnect::maxwell3d { BufferView &view; } callbackData{ctx, srcCpuBuf, offset, lock, view}; - view.Write(lock.IsFirstUsage(), ctx.executor.cycle, FlushHostCallback, srcCpuBuf, offset, [&callbackData]() { + view.Write(srcCpuBuf, offset, [&callbackData]() { callbackData.ctx.executor.AttachLockedBufferView(callbackData.view, std::move(callbackData.lock)); // This will prevent any CPU accesses to backing for the duration of the usage callbackData.view.GetBuffer()->BlockAllCpuBackingWrites();