Implement accelerated uploads/copies through buffer manager

Previously, both I2M uploads and DMA copies would force GPU serialisation if they happened to hit a trap or were used to copy GPU dirty buffers. By using the buffer manager to implement them on the host GPU we can avoid such slowdowns entirely.
This commit is contained in:
Billy Laws 2022-10-25 20:57:30 +01:00
parent c5ec484d9a
commit cac287d9fd
10 changed files with 205 additions and 15 deletions

View File

@ -186,6 +186,8 @@ add_library(skyline SHARED
${source_DIR}/skyline/gpu/cache/renderpass_cache.cpp
${source_DIR}/skyline/gpu/cache/framebuffer_cache.cpp
${source_DIR}/skyline/gpu/interconnect/fermi_2d.cpp
${source_DIR}/skyline/gpu/interconnect/maxwell_dma.cpp
${source_DIR}/skyline/gpu/interconnect/inline2memory.cpp
${source_DIR}/skyline/gpu/interconnect/maxwell_3d/common.cpp
${source_DIR}/skyline/gpu/interconnect/maxwell_3d/active_state.cpp
${source_DIR}/skyline/gpu/interconnect/maxwell_3d/pipeline_state.cpp

View File

@ -0,0 +1,50 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu/buffer_manager.h>
#include <soc/gm20b/gmmu.h>
#include <soc/gm20b/channel.h>
#include "inline2memory.h"
namespace skyline::gpu::interconnect {
using IOVA = soc::gm20b::IOVA;
/**
 * @brief Binds the interconnect to the host GPU and to the channel that it records commands for
 * @note The executor reference is taken from `channelCtx.executor` rather than being passed in separately
 */
Inline2Memory::Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx) : gpu{gpu}, channelCtx{channelCtx}, executor{channelCtx.executor} {}
/**
 * @brief Uploads inline data to the GPU virtual address `dst` through the buffer manager
 * @param dst The GPU virtual address that the data should be written to
 * @param src The words to upload, `src.size_bytes()` bytes are translated and written
 * @note Only the first mapping of the translated range is used, a warning is logged if the range is split across several mappings
 * @note The callback passed to `Write` records a host GPU copy from a megabuffer allocation holding `src` into the destination buffer
 */
void Inline2Memory::Upload(IOVA dst, span<u32> src) {
    auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, src.size_bytes())};

    if (dstMappings.empty()) {
        // Calling `front()` on an empty container is undefined behaviour and there is nothing to write to
        Logger::Warn("I2M upload destination has no mappings");
        return;
    }

    if (dstMappings.size() > 1)
        Logger::Warn("Split mapping are unsupported for I2M uploads");

    auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
        executor.AttachLockedBuffer(buffer, std::move(lock));
    })};
    ContextLock dstBufLock{executor.tag, dstBuf};

    // NOTE(review): the callback captures locals by reference so it is presumably invoked (if at all) before `Write` returns - confirm against BufferView::Write
    dstBuf.Write(src.cast<u8>(), 0, [&]() {
        executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
        // This will prevent any CPU accesses to backing for the duration of the usage
        dstBuf.GetBuffer()->BlockAllCpuBackingWrites();

        auto srcGpuAllocation{gpu.megaBufferAllocator.Push(executor.cycle, src.cast<u8>())};
        // Only the byte count is captured as the recorded command never reads through `src` itself
        executor.AddOutsideRpCommand([srcGpuAllocation, dstBuf, size = src.size_bytes()](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
            // Designators are listed in the declaration order of VkBufferCopy
            vk::BufferCopy copyRegion{
                .srcOffset = srcGpuAllocation.offset,
                .dstOffset = dstBuf.GetOffset(),
                .size = size
            };
            commandBuffer.copyBuffer(srcGpuAllocation.buffer, dstBuf.GetBuffer()->GetBacking(), copyRegion);

            // Make the transfer write visible to any later access of the destination buffer
            commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
                .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
                .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
            }, {}, {});
        });
    });
}
}

View File

@ -0,0 +1,36 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <soc/gm20b/gmmu.h>
namespace skyline::gpu {
class GPU;
}
namespace skyline::soc::gm20b {
struct ChannelContext;
}
namespace skyline::gpu::interconnect {
class CommandExecutor;
/**
* @brief Handles translating I2M operations to Vulkan
*/
class Inline2Memory {
private:
using IOVA = soc::gm20b::IOVA;
GPU &gpu;
soc::gm20b::ChannelContext &channelCtx;
gpu::interconnect::CommandExecutor &executor;
public:
Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx);
void Upload(IOVA dst, span<u32> src);
};
}

View File

@ -0,0 +1,61 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu/buffer_manager.h>
#include <soc/gm20b/gmmu.h>
#include <soc/gm20b/channel.h>
#include "maxwell_dma.h"
namespace skyline::gpu::interconnect {
using IOVA = soc::gm20b::IOVA;
/**
 * @brief Binds the interconnect to the host GPU and to the channel that it records commands for
 * @note The executor reference is taken from `channelCtx.executor` rather than being passed in separately
 */
MaxwellDma::MaxwellDma(GPU &gpu, soc::gm20b::ChannelContext &channelCtx) : gpu{gpu}, channelCtx{channelCtx}, executor{channelCtx.executor} {}
/**
 * @brief Performs a linear copy between two GPU virtual address ranges through the buffer manager
 * @param dst The GPU virtual address to copy to
 * @param src The GPU virtual address to copy from
 * @param size The amount of bytes that are translated for both ranges
 * @note Only the first mapping of each translated range is used, a warning is logged if either range is split across several mappings
 * @note The callback passed to `CopyFrom` records the copy on the host GPU using the backings of both buffers
 */
void MaxwellDma::Copy(IOVA dst, IOVA src, size_t size) {
    auto srcMappings{channelCtx.asCtx->gmmu.TranslateRange(src, size)};
    auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, size)};

    if (srcMappings.empty() || dstMappings.empty()) {
        // Calling `front()` on an empty container is undefined behaviour and there is nothing to copy
        Logger::Warn("DMA copy source or destination has no mappings");
        return;
    }

    if (srcMappings.size() > 1 || dstMappings.size() > 1)
        Logger::Warn("Split mapping are unsupported for DMA copies");

    auto srcBuf{gpu.buffer.FindOrCreate(srcMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
        executor.AttachLockedBuffer(buffer, std::move(lock));
    })};
    ContextLock srcBufLock{executor.tag, srcBuf};

    auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
        executor.AttachLockedBuffer(buffer, std::move(lock));
    })};
    ContextLock dstBufLock{executor.tag, dstBuf};

    // NOTE(review): the callback captures locals by reference so it is presumably invoked (if at all) before `CopyFrom` returns - confirm against BufferView::CopyFrom
    dstBuf.CopyFrom(srcBuf, [&]() {
        executor.AttachLockedBufferView(srcBuf, std::move(srcBufLock));
        executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
        // This will prevent any CPU accesses to backing for the duration of the usage
        // GPU dirtiness will be handled on the CopyFrom end as it's not always necessary
        srcBuf.GetBuffer()->BlockAllCpuBackingWrites();
        dstBuf.GetBuffer()->BlockAllCpuBackingWrites();

        executor.AddOutsideRpCommand([srcBuf, dstBuf](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
            // Prior writes must be in the source access scope so that they are made available to the transfer, read-only source access makes nothing available
            commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eTransfer, {}, vk::MemoryBarrier{
                .srcAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
                .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite
            }, {}, {});

            // Designators are listed in the declaration order of VkBufferCopy
            vk::BufferCopy copyRegion{
                .srcOffset = srcBuf.GetOffset(),
                .dstOffset = dstBuf.GetOffset(),
                .size = srcBuf.size
            };
            commandBuffer.copyBuffer(srcBuf.GetBuffer()->GetBacking(), dstBuf.GetBuffer()->GetBacking(), copyRegion);

            // Make the transfer write visible to any later access of the destination buffer
            commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
                .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
                .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
            }, {}, {});
        });
    });
}
}

View File

@ -0,0 +1,36 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once
#include <soc/gm20b/gmmu.h>
namespace skyline::gpu {
class GPU;
}
namespace skyline::soc::gm20b {
struct ChannelContext;
}
namespace skyline::gpu::interconnect {
class CommandExecutor;
/**
* @brief Handles translating Maxwell DMA operations to Vulkan
*/
class MaxwellDma {
private:
using IOVA = soc::gm20b::IOVA;
GPU &gpu;
soc::gm20b::ChannelContext &channelCtx;
gpu::interconnect::CommandExecutor &executor;
public:
MaxwellDma(GPU &gpu, soc::gm20b::ChannelContext &channelCtx);
void Copy(IOVA dst, IOVA src, size_t size);
};
}

View File

@ -5,7 +5,9 @@
#include "inline2memory.h"
namespace skyline::soc::gm20b::engine {
Inline2MemoryBackend::Inline2MemoryBackend(ChannelContext &channelCtx) : channelCtx(channelCtx) {}
// Builds the host interconnect from the device's GPU and this channel, then stores a reference to the channel context
Inline2MemoryBackend::Inline2MemoryBackend(const DeviceState &state, ChannelContext &channelCtx) : interconnect{*state.gpu, channelCtx}, channelCtx{channelCtx} {}
void Inline2MemoryBackend::LaunchDma(Inline2MemoryBackend::RegisterState &state) {
writeOffset = 0;
@ -17,13 +19,11 @@ namespace skyline::soc::gm20b::engine {
if (state.launchDma.completion == RegisterState::DmaCompletionType::ReleaseSemaphore)
throw exception("Semaphore release on I2M completion is not supported!");
channelCtx.executor.Submit();
if (state.launchDma.layout == RegisterState::DmaDstMemoryLayout::Pitch && state.lineCount == 1) {
// TODO: we can do this with the buffer manager to avoid some overhead in the future
Logger::Debug("range: 0x{:X} -> 0x{:X}", u64{state.offsetOut}, u64{state.offsetOut} + buffer.size() * 0x4);
channelCtx.asCtx->gmmu.Write(state.offsetOut, span(buffer));
interconnect.Upload(u64{state.offsetOut}, span{buffer});
} else {
channelCtx.executor.Submit();
Logger::Warn("Non-linear I2M uploads are not supported!");
}
}
@ -49,7 +49,7 @@ namespace skyline::soc::gm20b::engine {
CompleteDma(state);
}
Inline2Memory::Inline2Memory(ChannelContext &channelCtx) : backend(channelCtx) {}
// Forwards both arguments to the `backend` member
Inline2Memory::Inline2Memory(const DeviceState &state, ChannelContext &channelCtx)
    : backend{state, channelCtx} {}
__attribute__((always_inline)) void Inline2Memory::CallMethod(u32 method, u32 argument) {
Logger::Verbose("Called method in I2M: 0x{:X} args: 0x{:X}", method, argument);

View File

@ -4,6 +4,7 @@
#pragma once
#include <common.h>
#include <gpu/interconnect/inline2memory.h>
#include "engine.h"
namespace skyline::soc::gm20b {
@ -18,6 +19,7 @@ namespace skyline::soc::gm20b::engine {
private:
std::vector<u32> buffer; //!< Temporary buffer to hold data being currently uploaded
u32 writeOffset{}; //!< Current write offset in words into `buffer`
gpu::interconnect::Inline2Memory interconnect;
ChannelContext &channelCtx;
public:
@ -124,7 +126,7 @@ namespace skyline::soc::gm20b::engine {
void CompleteDma(RegisterState &state);
public:
Inline2MemoryBackend(ChannelContext &channelCtx);
Inline2MemoryBackend(const DeviceState &state, ChannelContext &channelCtx);
/**
* @brief Should be called when launchDma in `state` is written to
@ -164,7 +166,7 @@ namespace skyline::soc::gm20b::engine {
} registers{};
public:
Inline2Memory(ChannelContext &channelCtx);
Inline2Memory(const DeviceState &state, ChannelContext &channelCtx);
void CallMethod(u32 method, u32 argument);

View File

@ -8,7 +8,7 @@
namespace skyline::soc::gm20b::engine {
KeplerCompute::KeplerCompute(const DeviceState &state, ChannelContext &channelCtx)
: syncpoints(state.soc->host1x.syncpoints), i2m(channelCtx) {}
: syncpoints{state.soc->host1x.syncpoints}, i2m{state, channelCtx} {}
__attribute__((always_inline)) void KeplerCompute::CallMethod(u32 method, u32 argument) {
Logger::Verbose("Called method in Kepler compute: 0x{:X} args: 0x{:X}", method, argument);

View File

@ -11,8 +11,10 @@
#include "maxwell_dma.h"
namespace skyline::soc::gm20b::engine {
MaxwellDma::MaxwellDma(const DeviceState &state, ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor)
: channelCtx(channelCtx), syncpoints(state.soc->host1x.syncpoints), executor(executor) {}
// Members are initialized in their declaration order (syncpoints, channelCtx, interconnect) regardless of how they are listed here,
// so the initializer list mirrors that order to avoid -Wreorder and to keep the real initialization sequence obvious.
// The interconnect is built from the `channelCtx` parameter and the device's GPU.
MaxwellDma::MaxwellDma(const DeviceState &state, ChannelContext &channelCtx)
    : syncpoints{state.soc->host1x.syncpoints},
      channelCtx{channelCtx},
      interconnect{*state.gpu, channelCtx} {}
__attribute__((always_inline)) void MaxwellDma::CallMethod(u32 method, u32 argument) {
Logger::Verbose("Called method in Maxwell DMA: 0x{:X} args: 0x{:X}", method, argument);
@ -36,8 +38,8 @@ namespace skyline::soc::gm20b::engine {
return;
}
executor.Submit();
if (registers.launchDma->multiLineEnable) {
channelCtx.executor.Submit();
if (registers.launchDma->srcMemoryLayout == Registers::LaunchDma::MemoryLayout::Pitch &&
registers.launchDma->dstMemoryLayout == Registers::LaunchDma::MemoryLayout::BlockLinear)
CopyPitchToBlockLinear();
@ -51,7 +53,7 @@ namespace skyline::soc::gm20b::engine {
// 1D buffer copy
// TODO: implement swizzled 1D copies based on VMM 'kind'
Logger::Debug("src: 0x{:X} dst: 0x{:X} size: 0x{:X}", u64{*registers.offsetIn}, u64{*registers.offsetOut}, *registers.lineLengthIn);
channelCtx.asCtx->gmmu.Copy(*registers.offsetOut, *registers.offsetIn, *registers.lineLengthIn);
interconnect.Copy(u64{*registers.offsetOut}, u64{*registers.offsetIn}, u64{*registers.lineLengthIn});
}
ReleaseSemaphore();

View File

@ -3,6 +3,7 @@
#pragma once
#include <gpu/interconnect/maxwell_dma.h>
#include "engine.h"
namespace skyline::gpu::interconnect {
@ -21,7 +22,7 @@ namespace skyline::soc::gm20b::engine {
private:
host1x::SyncpointSet &syncpoints;
ChannelContext &channelCtx;
gpu::interconnect::CommandExecutor &executor;
gpu::interconnect::MaxwellDma interconnect;
void HandleMethod(u32 method, u32 argument);
@ -253,7 +254,7 @@ namespace skyline::soc::gm20b::engine {
static_assert(sizeof(Registers) == (EngineMethodsEnd * 0x4));
#pragma pack(pop)
MaxwellDma(const DeviceState &state, ChannelContext &channelCtx, gpu::interconnect::CommandExecutor &executor);
MaxwellDma(const DeviceState &state, ChannelContext &channelCtx);
void CallMethod(u32 method, u32 argument);