From cac287d9fdc38fa64d2d7286b16dcd62c778fc0b Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Tue, 25 Oct 2022 20:57:30 +0100 Subject: [PATCH] Implement accelerated uploads/copies through buffer manager Previously, both I2M uploads and DMA copies would force GPU serialisation if they happened to hit a trap or were used to copy GPU dirty buffers. By using the buffer manager to implement them on the host GPU we can avoid such slowdowns entirely. --- app/CMakeLists.txt | 2 + .../gpu/interconnect/inline2memory.cpp | 50 +++++++++++++++ .../skyline/gpu/interconnect/inline2memory.h | 36 +++++++++++ .../skyline/gpu/interconnect/maxwell_dma.cpp | 61 +++++++++++++++++++ .../skyline/gpu/interconnect/maxwell_dma.h | 36 +++++++++++ .../soc/gm20b/engines/inline2memory.cpp | 12 ++-- .../skyline/soc/gm20b/engines/inline2memory.h | 6 +- .../soc/gm20b/engines/kepler_compute.cpp | 2 +- .../skyline/soc/gm20b/engines/maxwell_dma.cpp | 10 +-- .../skyline/soc/gm20b/engines/maxwell_dma.h | 5 +- 10 files changed, 205 insertions(+), 15 deletions(-) create mode 100644 app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp create mode 100644 app/src/main/cpp/skyline/gpu/interconnect/inline2memory.h create mode 100644 app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp create mode 100644 app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.h diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index 0507f674..4392c74c 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -186,6 +186,8 @@ add_library(skyline SHARED ${source_DIR}/skyline/gpu/cache/renderpass_cache.cpp ${source_DIR}/skyline/gpu/cache/framebuffer_cache.cpp ${source_DIR}/skyline/gpu/interconnect/fermi_2d.cpp + ${source_DIR}/skyline/gpu/interconnect/maxwell_dma.cpp + ${source_DIR}/skyline/gpu/interconnect/inline2memory.cpp ${source_DIR}/skyline/gpu/interconnect/maxwell_3d/common.cpp ${source_DIR}/skyline/gpu/interconnect/maxwell_3d/active_state.cpp ${source_DIR}/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp 
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp b/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp new file mode 100644 index 00000000..a8432b7b --- /dev/null +++ b/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.cpp @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/) +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) + +#include +#include +#include +#include "inline2memory.h" + +namespace skyline::gpu::interconnect { + using IOVA = soc::gm20b::IOVA; + + Inline2Memory::Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx) + : gpu{gpu}, + channelCtx{channelCtx}, + executor{channelCtx.executor} {} + + void Inline2Memory::Upload(IOVA dst, span src) { + auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, src.size_bytes())}; + + if (dstMappings.size() > 1) + Logger::Warn("Split mapping are unsupported for DMA copies"); + + auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr buffer, ContextLock &&lock) { + executor.AttachLockedBuffer(buffer, std::move(lock)); + })}; + ContextLock dstBufLock{executor.tag, dstBuf}; + + + dstBuf.Write(src.cast(), 0, [&]() { + executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock)); + // This will prevent any CPU accesses to backing for the duration of the usage + dstBuf.GetBuffer()->BlockAllCpuBackingWrites(); + + auto srcGpuAllocation{gpu.megaBufferAllocator.Push(executor.cycle, src.cast())}; + executor.AddOutsideRpCommand([srcGpuAllocation, dstBuf, src](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr &, GPU &) { + vk::BufferCopy copyRegion{ + .size = src.size_bytes(), + .srcOffset = srcGpuAllocation.offset, + .dstOffset = dstBuf.GetOffset() + }; + commandBuffer.copyBuffer(srcGpuAllocation.buffer, dstBuf.GetBuffer()->GetBacking(), copyRegion); + 
commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{ + .srcAccessMask = vk::AccessFlagBits::eTransferWrite, + .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite + }, {}, {}); + }); + }); + } +} diff --git a/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.h b/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.h new file mode 100644 index 00000000..b24a4fee --- /dev/null +++ b/app/src/main/cpp/skyline/gpu/interconnect/inline2memory.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/) +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) + +#pragma once + +#include + +namespace skyline::gpu { + class GPU; +} + +namespace skyline::soc::gm20b { + struct ChannelContext; +} + +namespace skyline::gpu::interconnect { + class CommandExecutor; + + /** + * @brief Handles translating I2M operations to Vulkan + */ + class Inline2Memory { + private: + using IOVA = soc::gm20b::IOVA; + + GPU &gpu; + soc::gm20b::ChannelContext &channelCtx; + gpu::interconnect::CommandExecutor &executor; + + public: + Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx); + + void Upload(IOVA dst, span src); + }; +} diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp new file mode 100644 index 00000000..7887f7da --- /dev/null +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/) +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) + +#include +#include +#include +#include "maxwell_dma.h" + +namespace skyline::gpu::interconnect { + using IOVA = soc::gm20b::IOVA; + + MaxwellDma::MaxwellDma(GPU &gpu, 
soc::gm20b::ChannelContext &channelCtx) + : gpu{gpu}, + channelCtx{channelCtx}, + executor{channelCtx.executor} {} + + void MaxwellDma::Copy(IOVA dst, IOVA src, size_t size) { + auto srcMappings{channelCtx.asCtx->gmmu.TranslateRange(src, size)}; + auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, size)}; + + if (srcMappings.size() > 1 || dstMappings.size() > 1) + Logger::Warn("Split mapping are unsupported for DMA copies"); + + auto srcBuf{gpu.buffer.FindOrCreate(srcMappings.front(), executor.tag, [this](std::shared_ptr buffer, ContextLock &&lock) { + executor.AttachLockedBuffer(buffer, std::move(lock)); + })}; + ContextLock srcBufLock{executor.tag, srcBuf}; + + auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr buffer, ContextLock &&lock) { + executor.AttachLockedBuffer(buffer, std::move(lock)); + })}; + ContextLock dstBufLock{executor.tag, dstBuf}; + + dstBuf.CopyFrom(srcBuf, [&]() { + executor.AttachLockedBufferView(srcBuf, std::move(srcBufLock)); + executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock)); + // This will prevent any CPU accesses to backing for the duration of the usage + // GPU dirtiness will be handled on the CopyFrom end as it's not always necessary + srcBuf.GetBuffer()->BlockAllCpuBackingWrites(); + dstBuf.GetBuffer()->BlockAllCpuBackingWrites(); + + executor.AddOutsideRpCommand([srcBuf, dstBuf](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr &, GPU &) { + commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eTransfer, {}, vk::MemoryBarrier{ + .srcAccessMask = vk::AccessFlagBits::eMemoryRead, + .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite + }, {}, {}); + vk::BufferCopy copyRegion{ + .size = srcBuf.size, + .srcOffset = srcBuf.GetOffset(), + .dstOffset = dstBuf.GetOffset() + }; + commandBuffer.copyBuffer(srcBuf.GetBuffer()->GetBacking(), dstBuf.GetBuffer()->GetBacking(), copyRegion); + 
commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{ + .srcAccessMask = vk::AccessFlagBits::eTransferWrite, + .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite, + }, {}, {}); + }); + }); + } +} diff --git a/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.h b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.h new file mode 100644 index 00000000..16e815b4 --- /dev/null +++ b/app/src/main/cpp/skyline/gpu/interconnect/maxwell_dma.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/) +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) + +#pragma once + +#include + +namespace skyline::gpu { + class GPU; +} + +namespace skyline::soc::gm20b { + struct ChannelContext; +} + +namespace skyline::gpu::interconnect { + class CommandExecutor; + + /** + * @brief Handles translating Maxwell DMA operations to Vulkan + */ + class MaxwellDma { + private: + using IOVA = soc::gm20b::IOVA; + + GPU &gpu; + soc::gm20b::ChannelContext &channelCtx; + gpu::interconnect::CommandExecutor &executor; + + public: + MaxwellDma(GPU &gpu, soc::gm20b::ChannelContext &channelCtx); + + void Copy(IOVA dst, IOVA src, size_t size); + }; +} diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.cpp index efa59fbd..130f9744 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.cpp +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.cpp @@ -5,7 +5,9 @@ #include "inline2memory.h" namespace skyline::soc::gm20b::engine { - Inline2MemoryBackend::Inline2MemoryBackend(ChannelContext &channelCtx) : channelCtx(channelCtx) {} + Inline2MemoryBackend::Inline2MemoryBackend(const DeviceState &state, ChannelContext &channelCtx) + : interconnect{*state.gpu, channelCtx}, + 
channelCtx{channelCtx} {} void Inline2MemoryBackend::LaunchDma(Inline2MemoryBackend::RegisterState &state) { writeOffset = 0; @@ -17,13 +19,11 @@ namespace skyline::soc::gm20b::engine { if (state.launchDma.completion == RegisterState::DmaCompletionType::ReleaseSemaphore) throw exception("Semaphore release on I2M completion is not supported!"); - channelCtx.executor.Submit(); - if (state.launchDma.layout == RegisterState::DmaDstMemoryLayout::Pitch && state.lineCount == 1) { - // TODO: we can do this with the buffer manager to avoid some overhead in the future Logger::Debug("range: 0x{:X} -> 0x{:X}", u64{state.offsetOut}, u64{state.offsetOut} + buffer.size() * 0x4); - channelCtx.asCtx->gmmu.Write(state.offsetOut, span(buffer)); + interconnect.Upload(u64{state.offsetOut}, span{buffer}); } else { + channelCtx.executor.Submit(); Logger::Warn("Non-linear I2M uploads are not supported!"); } } @@ -49,7 +49,7 @@ namespace skyline::soc::gm20b::engine { CompleteDma(state); } - Inline2Memory::Inline2Memory(ChannelContext &channelCtx) : backend(channelCtx) {} + Inline2Memory::Inline2Memory(const DeviceState &state, ChannelContext &channelCtx) : backend{state, channelCtx} {} __attribute__((always_inline)) void Inline2Memory::CallMethod(u32 method, u32 argument) { Logger::Verbose("Called method in I2M: 0x{:X} args: 0x{:X}", method, argument); diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.h b/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.h index c2365d4d..827d37a3 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.h +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/inline2memory.h @@ -4,6 +4,7 @@ #pragma once #include +#include #include "engine.h" namespace skyline::soc::gm20b { @@ -18,6 +19,7 @@ namespace skyline::soc::gm20b::engine { private: std::vector buffer; //!< Temporary buffer to hold data being currently uploaded u32 writeOffset{}; //!< Current write offset in words into `buffer` + gpu::interconnect::Inline2Memory 
interconnect; ChannelContext &channelCtx; public: @@ -124,7 +126,7 @@ namespace skyline::soc::gm20b::engine { void CompleteDma(RegisterState &state); public: - Inline2MemoryBackend(ChannelContext &channelCtx); + Inline2MemoryBackend(const DeviceState &state, ChannelContext &channelCtx); /** * @brief Should be called when launchDma in `state` is written to @@ -164,7 +166,7 @@ namespace skyline::soc::gm20b::engine { } registers{}; public: - Inline2Memory(ChannelContext &channelCtx); + Inline2Memory(const DeviceState &state, ChannelContext &channelCtx); void CallMethod(u32 method, u32 argument); diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute.cpp index 865b9a7c..cea64b58 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute.cpp +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/kepler_compute.cpp @@ -8,7 +8,7 @@ namespace skyline::soc::gm20b::engine { KeplerCompute::KeplerCompute(const DeviceState &state, ChannelContext &channelCtx) - : syncpoints(state.soc->host1x.syncpoints), i2m(channelCtx) {} + : syncpoints{state.soc->host1x.syncpoints}, i2m{state, channelCtx} {} __attribute__((always_inline)) void KeplerCompute::CallMethod(u32 method, u32 argument) { Logger::Verbose("Called method in Kepler compute: 0x{:X} args: 0x{:X}", method, argument); diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp index c01aa7f2..20db7f8f 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.cpp @@ -11,8 +11,10 @@ #include "maxwell_dma.h" namespace skyline::soc::gm20b::engine { - MaxwellDma::MaxwellDma(const DeviceState &state, ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor) - : channelCtx(channelCtx), syncpoints(state.soc->host1x.syncpoints), executor(executor) {} + MaxwellDma::MaxwellDma(const 
DeviceState &state, ChannelContext &channelCtx) + : channelCtx{channelCtx}, + syncpoints{state.soc->host1x.syncpoints}, + interconnect{*state.gpu, channelCtx} {} __attribute__((always_inline)) void MaxwellDma::CallMethod(u32 method, u32 argument) { Logger::Verbose("Called method in Maxwell DMA: 0x{:X} args: 0x{:X}", method, argument); @@ -36,8 +38,8 @@ namespace skyline::soc::gm20b::engine { return; } - executor.Submit(); if (registers.launchDma->multiLineEnable) { + channelCtx.executor.Submit(); if (registers.launchDma->srcMemoryLayout == Registers::LaunchDma::MemoryLayout::Pitch && registers.launchDma->dstMemoryLayout == Registers::LaunchDma::MemoryLayout::BlockLinear) CopyPitchToBlockLinear(); @@ -51,7 +53,7 @@ namespace skyline::soc::gm20b::engine { // 1D buffer copy // TODO: implement swizzled 1D copies based on VMM 'kind' Logger::Debug("src: 0x{:X} dst: 0x{:X} size: 0x{:X}", u64{*registers.offsetIn}, u64{*registers.offsetOut}, *registers.lineLengthIn); - channelCtx.asCtx->gmmu.Copy(*registers.offsetOut, *registers.offsetIn, *registers.lineLengthIn); + interconnect.Copy(u64{*registers.offsetOut}, u64{*registers.offsetIn}, u64{*registers.lineLengthIn}); } ReleaseSemaphore(); diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h index 676cc3aa..e52e0b5c 100644 --- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h +++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_dma.h @@ -3,6 +3,7 @@ #pragma once +#include #include "engine.h" namespace skyline::gpu::interconnect { @@ -21,7 +22,7 @@ namespace skyline::soc::gm20b::engine { private: host1x::SyncpointSet &syncpoints; ChannelContext &channelCtx; - gpu::interconnect::CommandExecutor &executor; + gpu::interconnect::MaxwellDma interconnect; void HandleMethod(u32 method, u32 argument); @@ -253,7 +254,7 @@ namespace skyline::soc::gm20b::engine { static_assert(sizeof(Registers) == (EngineMethodsEnd * 0x4)); #pragma pack(pop) - 
MaxwellDma(const DeviceState &state, ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor); + MaxwellDma(const DeviceState &state, ChannelContext &channelCtx); void CallMethod(u32 method, u32 argument);