Introduce FenceCycle Waiter Thread

A substantial amount of time is spent destroying dependencies by any threads waiting on or polling `FenceCycle`s. This is not optimal, as it blocks those threads from moving on to other tasks even though destruction is fundamentally an asynchronous task that can be deferred.

This commit solves this by introducing a thread that is dedicated to waiting on every `FenceCycle`, then signalling it and destroying all of its dependencies, which entirely fixes the issue of destruction blocking more important threads.
This commit is contained in:
PixelyIon 2022-08-01 22:08:32 +05:30
parent 5f8619f791
commit e1a4325137
No known key found for this signature in database
GPG Key ID: 11BC6C3201BC2C05
5 changed files with 118 additions and 42 deletions

View File

@ -368,7 +368,7 @@ namespace skyline::gpu {
vkDevice(CreateDevice(vkContext, vkPhysicalDevice, vkQueueFamilyIndex, traits)),
vkQueue(vkDevice, vkQueueFamilyIndex, 0),
memory(*this),
scheduler(*this),
scheduler(state, *this),
presentation(state, *this),
texture(*this),
buffer(*this),

View File

@ -2,19 +2,53 @@
// Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu.h>
#include <loader/loader.h>
#include "command_scheduler.h"
namespace skyline::gpu {
/**
 * @brief Entry point of the dedicated FenceCycle waiter thread: drains cycleQueue, waiting on each cycle and destroying its dependencies after it is signalled
 * @note Any uncaught exception on this thread kills the guest process when one exists, otherwise it is rethrown
 */
void CommandScheduler::WaiterThread() {
// Name the thread for debuggers/profilers; failure to do so is non-fatal
if (int result{pthread_setname_np(pthread_self(), "Sky-CycleWaiter")})
Logger::Warn("Failed to set the thread name: {}", strerror(result));
try {
// Convert fatal signals on this thread into C++ exceptions so they hit the handlers below
signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, signal::ExceptionalSignalHandler);
cycleQueue.Process([](const std::shared_ptr<FenceCycle> &cycle) {
cycle->Wait(true); // true => destroy the cycle's dependencies once the fence is signalled
}, [] {});
} catch (const signal::SignalException &e) {
// A signal was raised while waiting; log it with a stack trace then terminate the guest
Logger::Error("{}\nStack Trace:{}", e.what(), state.loader->GetStackTrace(e.frames));
if (state.process)
state.process->Kill(false);
else
std::rethrow_exception(std::current_exception());
} catch (const std::exception &e) {
Logger::Error(e.what());
if (state.process)
state.process->Kill(false);
else
std::rethrow_exception(std::current_exception());
}
}
/**
 * @brief Wraps a command buffer together with the fence and FenceCycle used to track its execution
 */
CommandScheduler::CommandBufferSlot::CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool)
: device(device),
commandBuffer(device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)),
fence(device, vk::FenceCreateInfo{}), // Created unsignalled; the FenceCycle below tracks its reset -> signal cycle
cycle(std::make_shared<FenceCycle>(device, *fence)) {}
CommandScheduler::CommandScheduler(GPU &pGpu) : gpu(pGpu), pool(std::ref(pGpu.vkDevice), vk::CommandPoolCreateInfo{
.flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
.queueFamilyIndex = pGpu.vkQueueFamilyIndex,
}) {}
/**
 * @brief Constructs the scheduler with its thread-local command pool state and starts the fence cycle waiter thread
 * @note The waiter thread is started in the constructor body rather than the member-initializer list: 'waiterThread' is declared before 'cycleQueue' in the class, so starting the thread during member initialization would let WaiterThread() touch a not-yet-constructed queue; the original list also named waiterThread before pool, diverging from declaration order (-Wreorder)
 */
CommandScheduler::CommandScheduler(const DeviceState &state, GPU &pGpu)
    : state{state},
      gpu{pGpu},
      pool{std::ref(pGpu.vkDevice), vk::CommandPoolCreateInfo{
          .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
          .queueFamilyIndex = pGpu.vkQueueFamilyIndex,
      }} {
    // Only launch the waiter once every member (notably cycleQueue) is fully constructed
    waiterThread = std::thread(&CommandScheduler::WaiterThread, this);
}
/**
 * @brief Joins the waiter thread prior to destruction of the members it uses
 */
CommandScheduler::~CommandScheduler() {
// NOTE(review): nothing visible here requests cycleQueue.Process(...) to return,
// so join() could block indefinitely — confirm CircularQueue's termination
// semantics (e.g. an interrupt on destruction) against its implementation
waiterThread.join();
}
CommandScheduler::ActiveCommandBuffer CommandScheduler::AllocateCommandBuffer() {
for (auto &slot : pool->buffers) {
@ -42,11 +76,15 @@ namespace skyline::gpu {
return {pool->buffers.emplace_back(gpu.vkDevice, commandBuffer, pool->vkCommandPool)};
}
void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, vk::Fence fence) {
std::scoped_lock lock(gpu.queueMutex);
gpu.vkQueue.submit(vk::SubmitInfo{
.commandBufferCount = 1,
.pCommandBuffers = &*commandBuffer,
}, fence);
/**
 * @brief Submits the supplied command buffer to the GPU queue, signalled by the cycle's fence, then hands the cycle to the waiter thread
 */
void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle) {
    {
        // The queue is shared with other threads (e.g. presentation), so submission is serialized
        std::scoped_lock queueLock{gpu.queueMutex};
        vk::SubmitInfo submitInfo{
            .commandBufferCount = 1,
            .pCommandBuffers = &*commandBuffer,
        };
        gpu.vkQueue.submit(submitInfo, cycle->fence);
    }
    // Queue the cycle outside the lock; the waiter thread will wait on it and destroy its dependencies
    cycleQueue.Push(cycle);
}
}

View File

@ -4,6 +4,7 @@
#pragma once
#include <common/thread_local.h>
#include <common/circular_queue.h>
#include "fence_cycle.h"
namespace skyline::gpu {
@ -25,6 +26,7 @@ namespace skyline::gpu {
CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool);
};
const DeviceState &state;
GPU &gpu;
/**
@ -40,6 +42,12 @@ namespace skyline::gpu {
};
ThreadLocal<CommandPool> pool;
std::thread waiterThread; //!< A thread that waits on and signals FenceCycle(s) then clears any associated resources
static constexpr size_t FenceCycleWaitCount{256}; //!< The amount of fence cycles the cycle queue can hold
CircularQueue<std::shared_ptr<FenceCycle>> cycleQueue{FenceCycleWaitCount}; //!< A circular queue containing all the active cycles that can be waited on
void WaiterThread();
public:
/**
* @brief An active command buffer occupies a slot and ensures that its status is updated correctly
@ -92,7 +100,9 @@ namespace skyline::gpu {
}
};
CommandScheduler(GPU &gpu);
CommandScheduler(const DeviceState &state, GPU &gpu);
~CommandScheduler();
/**
* @brief Allocates an existing or new primary command buffer from the pool
@ -100,9 +110,11 @@ namespace skyline::gpu {
ActiveCommandBuffer AllocateCommandBuffer();
/**
* @brief Submits a single command buffer to the GPU queue with an optional fence
* @brief Submits a single command buffer to the GPU queue while queuing it up to be waited on
* @note The supplied command buffer and cycle **must** be from AllocateCommandBuffer()
* @note Any cycle submitted via this method does not need to destroy dependencies manually, the waiter thread will handle this
*/
void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, vk::Fence fence = {});
void SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle);
/**
* @brief Submits a command buffer recorded with the supplied function synchronously
@ -116,8 +128,10 @@ namespace skyline::gpu {
});
recordFunction(*commandBuffer);
commandBuffer->end();
SubmitCommandBuffer(*commandBuffer, commandBuffer.GetFence());
return commandBuffer.GetFenceCycle();
auto cycle{commandBuffer.GetFenceCycle()};
SubmitCommandBuffer(*commandBuffer, cycle);
return cycle;
} catch (...) {
commandBuffer.GetFenceCycle()->Cancel();
std::rethrow_exception(std::current_exception());
@ -130,14 +144,16 @@ namespace skyline::gpu {
template<typename RecordFunction>
std::shared_ptr<FenceCycle> SubmitWithCycle(RecordFunction recordFunction) {
auto commandBuffer{AllocateCommandBuffer()};
auto cycle{commandBuffer.GetFenceCycle()};
try {
commandBuffer->begin(vk::CommandBufferBeginInfo{
.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit,
});
recordFunction(*commandBuffer, commandBuffer.GetFenceCycle());
recordFunction(*commandBuffer, cycle);
commandBuffer->end();
SubmitCommandBuffer(*commandBuffer, commandBuffer.GetFence());
return commandBuffer.GetFenceCycle();
SubmitCommandBuffer(*commandBuffer, cycle);
return cycle;
} catch (...) {
commandBuffer.GetFenceCycle()->Cancel();
std::rethrow_exception(std::current_exception());

View File

@ -8,7 +8,7 @@
#include <common/atomic_forward_list.h>
namespace skyline::gpu {
struct FenceCycle;
class CommandScheduler;
/**
* @brief A wrapper around a Vulkan Fence which only tracks a single reset -> signal cycle with the ability to attach lifetimes of objects to it
@ -17,19 +17,23 @@ namespace skyline::gpu {
*/
struct FenceCycle {
private:
std::atomic_flag signalled;
std::atomic_flag signalled{}; //!< If the underlying fence has been signalled since the creation of this FenceCycle, this doesn't necessarily mean the dependencies have been destroyed
std::atomic_flag alreadyDestroyed{}; //!< If the cycle's dependencies are already destroyed, this prevents multiple destructions
const vk::raii::Device &device;
vk::Fence fence;
friend CommandScheduler;
AtomicForwardList<std::shared_ptr<void>> dependencies; //!< A list of all dependencies on this fence cycle
AtomicForwardList<std::shared_ptr<FenceCycle>> chainedCycles; //!< A list of all chained FenceCycles, this is used to express multi-fence dependencies
/**
 * @brief Destroys all dependencies attached to this cycle, at most once across all callers
 * @note Chained cycles are deliberately left untouched as they may be iterated over concurrently; they are only deleted when this cycle itself is destructed
 */
void DestroyDependencies() {
    if (alreadyDestroyed.test_and_set(std::memory_order_release))
        return; // Another (or an earlier) call has already cleared the list
    dependencies.Clear();
}
public:
@ -45,19 +49,23 @@ namespace skyline::gpu {
* @brief Signals this fence regardless of if the underlying fence has been signalled or not
*/
void Cancel() {
// Mark the cycle as signalled irrespective of the underlying fence's real state
signalled.test_and_set(std::memory_order_release);
// Unconditionally attempt destruction; DestroyDependencies itself is idempotent via the alreadyDestroyed flag
DestroyDependencies();
}
/**
* @brief Wait on a fence cycle till it has been signalled
* @param shouldDestroy If true, the dependencies of this cycle will be destroyed after the fence is signalled
*/
void Wait() {
if (signalled.test(std::memory_order_consume))
void Wait(bool shouldDestroy = false) {
if (signalled.test(std::memory_order_consume)) {
if (shouldDestroy)
DestroyDependencies();
return;
}
chainedCycles.Iterate([](auto &cycle) {
cycle->Wait();
chainedCycles.Iterate([shouldDestroy](auto &cycle) {
cycle->Wait(shouldDestroy);
});
vk::Result waitResult;
@ -73,21 +81,26 @@ namespace skyline::gpu {
throw exception("An error occurred while waiting for fence 0x{:X}: {}", static_cast<VkFence>(fence), vk::to_string(waitResult));
}
if (!signalled.test_and_set(std::memory_order_release))
signalled.test_and_set(std::memory_order_release);
if (shouldDestroy)
DestroyDependencies();
}
/**
* @brief Wait on a fence cycle with a timeout in nanoseconds
* @param shouldDestroy If true, the dependencies of this cycle will be destroyed after the fence is signalled
* @return If the wait was successful or timed out
*/
bool Wait(i64 timeoutNs) {
if (signalled.test(std::memory_order_consume))
bool Wait(i64 timeoutNs, bool shouldDestroy = false) {
if (signalled.test(std::memory_order_consume)) {
if (shouldDestroy)
DestroyDependencies();
return true;
}
i64 startTime{util::GetTimeNs()}, initialTimeout{timeoutNs};
if (!chainedCycles.AllOf([&](auto &cycle) {
if (!cycle->Wait(timeoutNs))
if (!cycle->Wait(timeoutNs, shouldDestroy))
return false;
timeoutNs = std::max<i64>(0, initialTimeout - (util::GetTimeNs() - startTime));
return true;
@ -108,7 +121,8 @@ namespace skyline::gpu {
}
if (waitResult == vk::Result::eSuccess) {
if (!signalled.test_and_set(std::memory_order_release))
signalled.test_and_set(std::memory_order_release);
if (shouldDestroy)
DestroyDependencies();
return true;
} else {
@ -116,23 +130,31 @@ namespace skyline::gpu {
}
}
bool Wait(std::chrono::duration<i64, std::nano> timeout) {
return Wait(timeout.count());
/**
 * @brief Wait on the fence cycle with a std::chrono timeout, forwarding to the nanosecond-count overload
 * @param shouldDestroy If true, the dependencies of this cycle will be destroyed after the fence is signalled
 * @return If the wait was successful or timed out
 */
bool Wait(std::chrono::duration<i64, std::nano> timeout, bool shouldDestroy = false) {
    const auto timeoutNs{timeout.count()};
    return Wait(timeoutNs, shouldDestroy);
}
/**
* @param quick Skips the call to check the fence's status, just checking the signalled flag
* @return If the fence is signalled currently or not
*/
bool Poll() {
if (signalled.test(std::memory_order_consume))
bool Poll(bool quick = true, bool shouldDestroy = false) {
if (signalled.test(std::memory_order_consume)) {
if (shouldDestroy)
DestroyDependencies();
return true;
}
if (!chainedCycles.AllOf([](auto &cycle) { return cycle->Poll(); }))
if (quick)
return false; // We need to return early if we're not waiting on the fence
if (!chainedCycles.AllOf([=](auto &cycle) { return cycle->Poll(quick, shouldDestroy); }))
return false;
auto status{(*device).getFenceStatus(fence, *device.getDispatcher())};
if (status == vk::Result::eSuccess) {
if (!signalled.test_and_set(std::memory_order_release))
signalled.test_and_set(std::memory_order_release);
if (shouldDestroy)
DestroyDependencies();
return true;
} else {

View File

@ -289,7 +289,7 @@ namespace skyline::gpu::interconnect {
for (const auto &attachedBuffer : attachedBuffers)
attachedBuffer->SynchronizeHost(); // Synchronize attached buffers from the CPU without using a staging buffer, this is done directly prior to submission to prevent stalls
gpu.scheduler.SubmitCommandBuffer(commandBuffer, activeCommandBuffer.GetFence());
gpu.scheduler.SubmitCommandBuffer(commandBuffer, cycle);
nodes.clear();