Allow for tying guest GPU sync operations to host GPU sync

This is necessary for the upcoming direct buffer support: in order to use guest buffers directly without trapping, we need to recreate any guest GPU sync on the host GPU. This prevents the guest from assuming work has completed when it hasn't, and consequently overwriting buffer contents that are still in use.
This commit is contained in:
Billy Laws 2022-12-27 18:15:44 +00:00
parent 89c6fab1cb
commit b3f7e990cc
5 changed files with 116 additions and 33 deletions

View File

@ -192,6 +192,42 @@ namespace skyline::gpu::interconnect {
incoming.Push(slot);
}
void ExecutionWaiterThread::Run() {
signal::SetSignalHandler({SIGSEGV}, nce::NCE::HostSignalHandler); // We may access NCE trapped memory
while (true) {
std::pair<std::shared_ptr<FenceCycle>, std::function<void()>> item{};
{
std::unique_lock lock{mutex};
idle = true;
condition.wait(lock, [this] { return !pendingSignalQueue.empty(); });
idle = false;
item = std::move(pendingSignalQueue.front());
pendingSignalQueue.pop();
}
{
TRACE_EVENT("gpu", "GPU");
if (item.first)
item.first->Wait();
}
if (item.second)
item.second();
}
}
ExecutionWaiterThread::ExecutionWaiterThread() : thread{&ExecutionWaiterThread::Run, this} {}
/**
 * @brief Whether the waiter thread is currently parked with no pending work
 */
bool ExecutionWaiterThread::IsIdle() const {
    return idle.load();
}
/**
 * @brief Enqueues a fence/callback pair for the waiter thread and wakes it
 */
void ExecutionWaiterThread::Queue(std::shared_ptr<FenceCycle> cycle, std::function<void()> &&callback) {
    std::scoped_lock lock{mutex};
    // Construct the pair in place; notify while still holding the lock so the
    // waiter's predicate check can't race with the push
    pendingSignalQueue.emplace(std::move(cycle), std::move(callback));
    condition.notify_all();
}
CommandExecutor::CommandExecutor(const DeviceState &state)
: state{state},
gpu{*state.gpu},
@ -501,18 +537,31 @@ namespace skyline::gpu::interconnect {
}
}
void CommandExecutor::Submit() {
for (const auto &callback : flushCallbacks)
callback();
/**
 * @brief Submits the recorded command stream and optionally ties a guest-visible callback to host GPU completion
 * @param callback When direct memory import is enabled, run by the waiter thread after this submission's fence signals; otherwise run synchronously before reset
 */
void CommandExecutor::Submit(std::function<void()> &&callback) {
    // Run all registered flush callbacks before submitting
    for (const auto &flushCallback : flushCallbacks)
        flushCallback();

    executionTag = AllocateTag();

    if (!slot->nodes.empty()) {
        TRACE_EVENT("gpu", "CommandExecutor::Submit");
        if (callback && *state.settings->useDirectMemoryImport)
            // Defer the callback until the host GPU signals this submission's fence
            waiterThread.Queue(cycle, std::move(callback));
        else
            // Queue the cycle with no callback — presumably so the waiter still tracks this fence and keeps later null-cycle callbacks ordered behind it; TODO confirm
            waiterThread.Queue(cycle, {});
        SubmitInternal();
        submissionNumber++;
    } else {
        // Nothing to submit: a null cycle makes the waiter run the callback once all previously queued cycles have signalled
        if (callback && *state.settings->useDirectMemoryImport)
            waiterThread.Queue(nullptr, std::move(callback));
    }

    // Without direct memory import there's no need to track host completion, so call back immediately
    if (callback && !*state.settings->useDirectMemoryImport)
        callback();

    ResetInternal();
}

View File

@ -92,6 +92,30 @@ namespace skyline::gpu::interconnect {
void ReleaseSlot(Slot *slot);
};
/**
 * @brief Thread responsible for notifying the guest of the completion of GPU operations
 */
class ExecutionWaiterThread {
  private:
    std::thread thread;
    std::mutex mutex;
    std::condition_variable condition;
    std::queue<std::pair<std::shared_ptr<FenceCycle>, std::function<void()>>> pendingSignalQueue; //!< Queue of callbacks to be executed when their corresponding fence is signalled
    std::atomic<bool> idle{};

    void Run();

  public:
    ExecutionWaiterThread();

    /**
     * @brief Whether the waiter thread is currently parked with no pending work
     */
    bool IsIdle() const;

    /**
     * @brief Queues `callback` to be executed when `cycle` is signalled; null values are valid for either, with a null cycle representing an immediate callback (dependent only on previously queued cycles) and a null callback representing a wait with no callback
     */
    void Queue(std::shared_ptr<FenceCycle> cycle, std::function<void()> &&callback);
};
/**
* @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
* @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
@ -102,6 +126,7 @@ namespace skyline::gpu::interconnect {
GPU &gpu;
CommandRecordThread recordThread;
CommandRecordThread::Slot *slot{};
ExecutionWaiterThread waiterThread;
node::RenderPassNode *renderPass{};
size_t subpassCount{}; //!< The number of subpasses in the current render pass
u32 renderPassIndex{};
@ -274,8 +299,9 @@ namespace skyline::gpu::interconnect {
/**
* @brief Execute all the nodes and submit the resulting command buffer to the GPU
* @param callback A function to call upon GPU completion of the submission
*/
void Submit();
void Submit(std::function<void()> &&callback = {});
/**
* @brief Locks all preserve attached buffers/textures

View File

@ -19,8 +19,9 @@ namespace skyline::soc::gm20b::engine {
ENGINE_STRUCT_CASE(syncpoint, action, {
if (action.operation == Registers::Syncpoint::Operation::Incr) {
Logger::Debug("Increment syncpoint: {}", +action.index);
channelCtx.executor.Submit();
syncpoints.at(action.index).Increment();
channelCtx.executor.Submit([=, syncpoints = &this->syncpoints, index = action.index]() {
syncpoints->at(index).Increment();
});
} else if (action.operation == Registers::Syncpoint::Operation::Wait) {
Logger::Debug("Wait syncpoint: {}, thresh: {}", +action.index, registers.syncpoint->payload);
@ -36,12 +37,6 @@ namespace skyline::soc::gm20b::engine {
ENGINE_STRUCT_CASE(semaphore, action, {
u64 address{registers.semaphore->address};
// Write timestamp first to ensure ordering
if (action.releaseSize == Registers::Semaphore::ReleaseSize::SixteenBytes) {
channelCtx.asCtx->gmmu.Write<u32>(address + 4, 0);
channelCtx.asCtx->gmmu.Write(address + 8, GetGpuTimeTicks());
}
switch (action.operation) {
case Registers::Semaphore::Operation::Acquire:
Logger::Debug("Acquire semaphore: 0x{:X} payload: {}", address, registers.semaphore->payload);
@ -54,7 +49,16 @@ namespace skyline::soc::gm20b::engine {
channelCtx.Lock();
break;
case Registers::Semaphore::Operation::Release:
channelCtx.asCtx->gmmu.Write(address, registers.semaphore->payload);
channelCtx.executor.Submit([this, action, address, payload = registers.semaphore->payload] () {
// Write timestamp first to ensure ordering
if (action.releaseSize == Registers::Semaphore::ReleaseSize::SixteenBytes) {
channelCtx.asCtx->gmmu.Write<u32>(address + 4, 0);
channelCtx.asCtx->gmmu.Write(address + 8, GetGpuTimeTicks());
}
channelCtx.asCtx->gmmu.Write(address, payload);
});
Logger::Debug("SemaphoreRelease: address: 0x{:X} payload: {}", address, registers.semaphore->payload);
break;
case Registers::Semaphore::Operation::AcqGeq :

View File

@ -218,8 +218,9 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
ENGINE_CASE(syncpointAction, {
Logger::Debug("Increment syncpoint: {}", static_cast<u16>(syncpointAction.id));
channelCtx.executor.Submit();
syncpoints.at(syncpointAction.id).Increment();
channelCtx.executor.Submit([=, syncpoints = &this->syncpoints, index = syncpointAction.id]() {
syncpoints->at(index).Increment();
});
})
ENGINE_CASE(clearSurface, {
@ -338,14 +339,17 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
switch (info.op) {
case type::SemaphoreInfo::Op::Release:
channelCtx.executor.Submit();
WriteSemaphoreResult(registers.semaphore->payload);
channelCtx.executor.Submit([=, this, semaphore = *registers.semaphore]() {
WriteSemaphoreResult(semaphore, semaphore.payload);
});
break;
case type::SemaphoreInfo::Op::Counter: {
switch (info.counterType) {
case type::SemaphoreInfo::CounterType::Zero:
WriteSemaphoreResult(registers.semaphore->payload);
channelCtx.executor.Submit([=, this, semaphore = *registers.semaphore]() {
WriteSemaphoreResult(semaphore, semaphore.payload);
});
break;
default:
@ -390,21 +394,19 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
}
}
void Maxwell3D::WriteSemaphoreResult(u64 result) {
u64 address{registers.semaphore->address};
switch (registers.semaphore->info.structureSize) {
void Maxwell3D::WriteSemaphoreResult(const Registers::Semaphore &semaphore, u64 result) {
switch (semaphore.info.structureSize) {
case type::SemaphoreInfo::StructureSize::OneWord:
channelCtx.asCtx->gmmu.Write(address, static_cast<u32>(result));
Logger::Debug("address: 0x{:X} payload: {}", address, result);
channelCtx.asCtx->gmmu.Write(semaphore.address, static_cast<u32>(result));
Logger::Debug("address: 0x{:X} payload: {}", semaphore.address, result);
break;
case type::SemaphoreInfo::StructureSize::FourWords: {
// Write timestamp first to ensure correct ordering
u64 timestamp{GetGpuTimeTicks()};
channelCtx.asCtx->gmmu.Write(address + 8, timestamp);
channelCtx.asCtx->gmmu.Write(address, result);
Logger::Debug("address: 0x{:X} payload: {} timestamp: {}", address, result, timestamp);
channelCtx.asCtx->gmmu.Write(semaphore.address + 8, timestamp);
channelCtx.asCtx->gmmu.Write(semaphore.address, result);
Logger::Debug("address: 0x{:X} payload: {} timestamp: {}", semaphore.address, result, timestamp);
break;
}

View File

@ -78,12 +78,6 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
*/
void HandleMethod(u32 method, u32 argument);
/**
* @brief Writes back a semaphore result to the guest with an auto-generated timestamp (if required)
* @note If the semaphore is OneWord then the result will be downcasted to a 32-bit unsigned integer
*/
void WriteSemaphoreResult(u64 result);
public:
/**
* @url https://github.com/devkitPro/deko3d/blob/master/source/maxwell/engine_3d.def
@ -421,6 +415,14 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
static_assert(sizeof(Registers) == (EngineMethodsEnd * sizeof(u32)));
#pragma pack(pop)
private:
/**
* @brief Writes back a semaphore result to the guest with an auto-generated timestamp (if required)
* @note If the semaphore is OneWord then the result will be downcasted to a 32-bit unsigned integer
*/
void WriteSemaphoreResult(const Registers::Semaphore &semaphore, u64 result);
public:
Registers registers{};
Registers shadowRegisters{}; //!< A shadow-copy of the registers, their function is controlled by the 'shadowRamControl' register