Implement access-driven Texture synchronization

Textures were constantly being synchronized to and from the host redundantly because we were not aware of guest memory accesses. This is now avoided by tracking all accesses to the texture memory using the NCE Memory Trapping API and synchronizing only when required.
This commit is contained in:
PixelyIon 2022-03-06 20:59:09 +05:30
parent 3e33d49faf
commit 3268b3779a
3 changed files with 90 additions and 71 deletions

View File

@ -131,7 +131,7 @@ namespace skyline::gpu::interconnect {
});
for (auto texture : syncTextures)
texture->SynchronizeHostWithBuffer(commandBuffer, cycle);
texture->SynchronizeHostWithBuffer(commandBuffer, cycle, true);
for (auto buffer : syncBuffers)
buffer->SynchronizeHostWithCycle(cycle);
@ -162,9 +162,6 @@ namespace skyline::gpu::interconnect {
#undef NODE
}
for (auto texture : syncTextures)
texture->SynchronizeGuestWithBuffer(commandBuffer, cycle);
for (auto buffer : syncBuffers)
buffer->SynchronizeGuestWithCycle(cycle);

View File

@ -115,6 +115,17 @@ namespace skyline::gpu {
alignedMirror = gpu.state.process->memory.CreateMirrors(alignedMappings);
mirror = alignedMirror.subspan(static_cast<size_t>(frontMapping.data() - alignedData), totalSize);
}
trapHandle = gpu.state.nce->TrapRegions(mappings, true, [this] {
std::lock_guard lock(*this);
SynchronizeGuest(true); // We can skip trapping since the caller will do it
WaitOnFence();
}, [this] {
std::lock_guard lock(*this);
SynchronizeGuest(true);
dirtyState = DirtyState::CpuDirty; // We need to assume the texture is dirty since we don't know what the guest is writing
WaitOnFence();
});
}
std::shared_ptr<memory::StagingBuffer> Texture::SynchronizeHostImpl(const std::shared_ptr<FenceCycle> &pCycle) {
@ -266,22 +277,6 @@ namespace skyline::gpu {
texture->CopyToGuest(stagingBuffer ? stagingBuffer->data() : std::get<memory::Image>(texture->backing).data());
}
/**
 * @brief Constructs a texture that wraps the supplied backing and is associated with the supplied guest texture
 * @note Every attribute is taken verbatim from the arguments, nothing is allocated by this overload
 * @note The guest mappings are set up first and the host texture is then synchronized from the guest if GetBacking() evaluates to true
 */
Texture::Texture(GPU &gpu, BackingType &&backing, GuestTexture guest, texture::Dimensions dimensions, texture::Format format, vk::ImageLayout layout, vk::ImageTiling tiling, u32 mipLevels, u32 layerCount, vk::SampleCountFlagBits sampleCount)
: gpu(gpu),
backing(std::move(backing)),
layout(layout),
guest(std::move(guest)),
dimensions(dimensions),
format(format),
tiling(tiling),
mipLevels(mipLevels),
layerCount(layerCount),
sampleCount(sampleCount) {
SetupGuestMappings(); // Must happen prior to any synchronization as it relies on the guest mappings
if (GetBacking()) // NOTE(review): presumably false while the backing has not been supplied yet, confirm against GetBacking()
SynchronizeHost(); // Populate the host texture with the current guest contents
}
Texture::Texture(GPU &gpu, BackingType &&backing, texture::Dimensions dimensions, texture::Format format, vk::ImageLayout layout, vk::ImageTiling tiling, u32 mipLevels, u32 layerCount, vk::SampleCountFlagBits sampleCount)
: gpu(gpu),
backing(std::move(backing)),
@ -324,45 +319,26 @@ namespace skyline::gpu {
.initialLayout = layout,
};
backing = tiling != vk::ImageTiling::eLinear ? gpu.memory.AllocateImage(imageCreateInfo) : gpu.memory.AllocateMappedImage(imageCreateInfo);
TransitionLayout(vk::ImageLayout::eGeneral);
SetupGuestMappings();
}
/**
 * @brief Constructs a texture and allocates a new backing image for it from the supplied attributes
 * @param initialLayout The layout the image should be in once construction has finished, the image is transitioned to it if it cannot be created in it directly
 * @param usage Usage flags for the image, transfer source/destination usage is always added on top of these
 * @param tiling The tiling passed to image creation, a mapped image is allocated when it is eLinear
 * @note NOTE(review): The `tiling` member is forced to eOptimal while the body below reads the `tiling` parameter (which shadows the member), confirm this mismatch is intentional
 */
Texture::Texture(GPU &gpu, texture::Dimensions dimensions, texture::Format format, vk::ImageLayout initialLayout, vk::ImageUsageFlags usage, vk::ImageTiling tiling, u32 mipLevels, u32 layerCount, vk::SampleCountFlagBits sampleCount)
: gpu(gpu),
dimensions(dimensions),
format(format),
layout(initialLayout == vk::ImageLayout::ePreinitialized ? vk::ImageLayout::ePreinitialized : vk::ImageLayout::eUndefined), // The image is only ever created as ePreinitialized or eUndefined
tiling(vk::ImageTiling::eOptimal), // Same as above
mipLevels(mipLevels),
layerCount(layerCount),
sampleCount(sampleCount) {
vk::ImageCreateInfo imageCreateInfo{
.imageType = dimensions.GetType(),
.format = *format,
.extent = dimensions,
.mipLevels = mipLevels,
.arrayLayers = layerCount,
.samples = sampleCount,
.tiling = tiling,
.usage = usage | vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eTransferDst,
.sharingMode = vk::SharingMode::eExclusive,
.queueFamilyIndexCount = 1,
.pQueueFamilyIndices = &gpu.vkQueueFamilyIndex,
.initialLayout = layout, // This is the `layout` member initialized above, not the requested `initialLayout`
};
backing = tiling != vk::ImageTiling::eLinear ? gpu.memory.AllocateImage(imageCreateInfo) : gpu.memory.AllocateMappedImage(imageCreateInfo);
if (initialLayout != layout)
TransitionLayout(initialLayout); // The requested layout differs from the one the image was created in
SetupGuestMappings();
}
/**
 * @brief Tears down the texture: removes its guest memory trap, performs a final sync to the guest and unmaps the aligned mirror
 * @note The texture is locked for the entire duration of the destructor
 */
Texture::~Texture() {
std::lock_guard lock(*this);
if (trapHandle) // The handle is optional, there is nothing to delete if no trap was ever registered
gpu.state.nce->DeleteTrap(*trapHandle);
SynchronizeGuest(true); // Skip retrapping (skipTrap = true) as the trap has just been deleted
if (alignedMirror.valid())
munmap(alignedMirror.data(), alignedMirror.size()); // Release the page-aligned mirror mapping of the guest memory
}
/**
 * @brief Marks the host texture as holding newer data than the guest mappings so it is written back on the next guest synchronization
 * @note This is a no-op if the texture is already GPU dirty or if it has no guest memory trap (`trapHandle` is empty)
 * @note The texture **must** be locked prior to calling this
 */
void Texture::MarkGpuDirty() {
    // Dereferencing an empty optional is undefined behavior; the destructor already treats the handle as possibly absent
    // A texture without a trap is also left out of the GpuDirty state, as SynchronizeGuest throws for GPU dirty textures that lack a guest
    if (dirtyState == DirtyState::GpuDirty || !trapHandle)
        return;

    gpu.state.nce->RetrapRegions(*trapHandle, false); // Trap both reads and writes so that any guest access triggers a sync back
    dirtyState = DirtyState::GpuDirty;
}
bool Texture::WaitOnBacking() {
TRACE_EVENT("gpu", "Texture::WaitOnBacking");
@ -420,7 +396,10 @@ namespace skyline::gpu {
});
}
void Texture::SynchronizeHost() {
void Texture::SynchronizeHost(bool rwTrap) {
if (dirtyState != DirtyState::CpuDirty)
return; // If the texture has not been modified on the CPU, there is no need to synchronize it
TRACE_EVENT("gpu", "Texture::SynchronizeHost");
auto stagingBuffer{SynchronizeHostImpl(nullptr)};
@ -431,9 +410,20 @@ namespace skyline::gpu {
lCycle->AttachObjects(stagingBuffer, shared_from_this());
cycle = lCycle;
}
if (rwTrap) {
gpu.state.nce->RetrapRegions(*trapHandle, false);
dirtyState = DirtyState::GpuDirty;
} else {
gpu.state.nce->RetrapRegions(*trapHandle, true); // Trap any future CPU writes to this texture
dirtyState = DirtyState::Clean;
}
}
void Texture::SynchronizeHostWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &pCycle) {
void Texture::SynchronizeHostWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &pCycle, bool rwTrap) {
if (dirtyState != DirtyState::CpuDirty)
return;
TRACE_EVENT("gpu", "Texture::SynchronizeHostWithBuffer");
auto stagingBuffer{SynchronizeHostImpl(pCycle)};
@ -442,19 +432,28 @@ namespace skyline::gpu {
pCycle->AttachObjects(stagingBuffer, shared_from_this());
cycle = pCycle;
}
if (rwTrap) {
gpu.state.nce->RetrapRegions(*trapHandle, false);
dirtyState = DirtyState::GpuDirty;
} else {
gpu.state.nce->RetrapRegions(*trapHandle, true); // Trap any future CPU writes to this texture
dirtyState = DirtyState::Clean;
}
}
void Texture::SynchronizeGuest() {
if (!guest)
void Texture::SynchronizeGuest(bool skipTrap) {
if (dirtyState != DirtyState::GpuDirty || layout == vk::ImageLayout::eUndefined) {
// We can skip syncing in two cases:
// * If the texture has not been used on the GPU, there is no need to synchronize it
// * If the state of the host texture is undefined then so can the guest
return;
} else if (!guest) {
throw exception("Synchronization of guest textures requires a valid guest texture to synchronize to");
else if (layout == vk::ImageLayout::eUndefined)
return; // If the state of the host texture is undefined then so can the guest
}
TRACE_EVENT("gpu", "Texture::SynchronizeGuest");
if (layout == vk::ImageLayout::eUndefined)
return; // We don't need to synchronize the image if it is in an undefined state on the host
WaitOnBacking();
WaitOnFence();
@ -473,9 +472,16 @@ namespace skyline::gpu {
} else {
throw exception("Host -> Guest synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
}
if (!skipTrap)
gpu.state.nce->RetrapRegions(*trapHandle, true);
dirtyState = DirtyState::Clean;
}
void Texture::SynchronizeGuestWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &pCycle) {
if (dirtyState != DirtyState::GpuDirty)
return;
if (!guest)
throw exception("Synchronization of guest textures requires a valid guest texture to synchronize to");
else if (layout == vk::ImageLayout::eUndefined)
@ -483,9 +489,6 @@ namespace skyline::gpu {
TRACE_EVENT("gpu", "Texture::SynchronizeGuestWithBuffer");
if (layout == vk::ImageLayout::eUndefined)
return;
WaitOnBacking();
if (cycle.lock() != pCycle)
WaitOnFence();
@ -504,6 +507,8 @@ namespace skyline::gpu {
} else {
throw exception("Host -> Guest synchronization of images tiled as '{}' isn't implemented", vk::to_string(tiling));
}
dirtyState = DirtyState::Clean;
}
std::shared_ptr<TextureView> Texture::GetView(vk::ImageViewType type, vk::ImageSubresourceRange range, texture::Format pFormat, vk::ComponentMapping mapping) {

View File

@ -3,6 +3,7 @@
#pragma once
#include <nce.h>
#include <gpu/memory_manager.h>
namespace skyline::gpu {
@ -321,6 +322,13 @@ namespace skyline::gpu {
span<u8> mirror{}; //!< A contiguous mirror of all the guest mappings to allow linear access on the CPU
span<u8> alignedMirror{}; //!< The mirror mapping aligned to page size to reflect the full mapping
std::optional<nce::NCE::TrapHandle> trapHandle{}; //!< The handle of the traps for the guest mappings
/**
 * @brief Describes which side (guest CPU mappings or host GPU texture) holds the most recent copy of the texture data
 * @note A texture starts out as CpuDirty
 */
enum class DirtyState {
Clean, //!< The CPU mappings are in sync with the GPU texture
CpuDirty, //!< The CPU mappings have been modified but the GPU texture is not up to date
GpuDirty, //!< The GPU texture has been modified but the CPU mappings have not been updated
} dirtyState{DirtyState::CpuDirty}; //!< The state of the CPU mappings with respect to the GPU texture
std::vector<std::weak_ptr<TextureView>> views; //!< TextureView(s) that are backed by this Texture, used for repointing to a new Texture on deletion
friend TextureManager;
@ -377,17 +385,16 @@ namespace skyline::gpu {
u32 layerCount; //!< The amount of array layers in the image, utilized for efficient binding (Not to be confused with the depth or faces in a cubemap)
vk::SampleCountFlagBits sampleCount;
Texture(GPU &gpu, BackingType &&backing, GuestTexture guest, texture::Dimensions dimensions, texture::Format format, vk::ImageLayout layout, vk::ImageTiling tiling, u32 mipLevels = 1, u32 layerCount = 1, vk::SampleCountFlagBits sampleCount = vk::SampleCountFlagBits::e1);
/**
* @brief Creates a texture object wrapping the supplied backing with the supplied attributes
* @param layout The initial layout of the texture, it **must** be eUndefined or ePreinitialized
*/
Texture(GPU &gpu, BackingType &&backing, texture::Dimensions dimensions, texture::Format format, vk::ImageLayout layout, vk::ImageTiling tiling, u32 mipLevels = 1, u32 layerCount = 1, vk::SampleCountFlagBits sampleCount = vk::SampleCountFlagBits::e1);
Texture(GPU &gpu, GuestTexture guest);
/**
* @brief Allocates memory for a new backing and creates a texture object wrapping it
* @param usage Usage flags that will be applied in addition to VK_IMAGE_USAGE_TRANSFER_SRC_BIT/VK_IMAGE_USAGE_TRANSFER_DST_BIT, which are mandatory
* @brief Creates a texture object wrapping the guest texture with a backing that can represent the guest texture data
*/
Texture(GPU &gpu, texture::Dimensions dimensions, texture::Format format, vk::ImageLayout initialLayout = vk::ImageLayout::eGeneral, vk::ImageUsageFlags usage = {}, vk::ImageTiling tiling = vk::ImageTiling::eOptimal, u32 mipLevels = 1, u32 layerCount = 1, vk::SampleCountFlagBits sampleCount = vk::SampleCountFlagBits::e1);
Texture(GPU &gpu, GuestTexture guest);
~Texture();
@ -426,6 +433,13 @@ namespace skyline::gpu {
return mutex.try_lock();
}
/**
* @brief Marks the texture as dirty on the GPU, it will be synced on the next call to SynchronizeGuest
* @note This **must** be called after syncing the texture to the GPU not before
* @note The texture **must** be locked prior to calling this
*/
void MarkGpuDirty();
/**
* @brief Waits on the texture backing to be a valid non-null Vulkan image
* @return If the mutex could be unlocked during the function
@ -458,25 +472,28 @@ namespace skyline::gpu {
/**
* @brief Synchronizes the host texture with the guest after it has been modified
* @param rwTrap If true, the guest buffer will be read/write trapped rather than only being write trapped which is more efficient than calling MarkGpuDirty directly after
* @note The texture **must** be locked prior to calling this
* @note The guest texture backing should exist prior to calling this
*/
void SynchronizeHost();
void SynchronizeHost(bool rwTrap = false);
/**
* @brief Same as SynchronizeHost but this records any commands into the supplied command buffer rather than creating one as necessary
* @param rwTrap If true, the guest buffer will be read/write trapped rather than only being write trapped which is more efficient than calling MarkGpuDirty directly after
* @note It is more efficient to call SynchronizeHost than allocating a command buffer purely for this function as it may conditionally not record any commands
* @note The texture **must** be locked prior to calling this
* @note The guest texture backing should exist prior to calling this
*/
void SynchronizeHostWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle);
void SynchronizeHostWithBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, bool rwTrap = false);
/**
* @brief Synchronizes the guest texture with the host texture after it has been modified
* @param skipTrap If true, setting up a CPU trap will be skipped and the dirty state will be Clean/CpuDirty
* @note The texture **must** be locked prior to calling this
* @note The guest texture should not be null prior to calling this
*/
void SynchronizeGuest();
void SynchronizeGuest(bool skipTrap = false);
/**
* @brief Synchronizes the guest texture with the host texture after it has been modified