Allow for tying guest GPU sync operations to host GPU sync

This is necessary for the upcoming direct buffer support: in order to use guest buffers directly without trapping, we need to recreate any guest GPU sync on the host GPU. This prevents the guest from assuming work has completed when it hasn't, and consequently overwriting buffer contents that are still in use.
This commit is contained in:
Billy Laws 2022-12-27 18:15:44 +00:00
parent 89c6fab1cb
commit b3f7e990cc
5 changed files with 116 additions and 33 deletions

View File

@ -192,6 +192,42 @@ namespace skyline::gpu::interconnect {
incoming.Push(slot); incoming.Push(slot);
} }
/**
 * @brief Main loop of the waiter thread: dequeues (cycle, callback) pairs, waits on the cycle's fence (if any), then invokes the callback (if any)
 * @note Runs forever; the queue is drained strictly in FIFO order so callbacks observe the completion order of their submissions
 */
void ExecutionWaiterThread::Run() {
signal::SetSignalHandler({SIGSEGV}, nce::NCE::HostSignalHandler); // We may access NCE trapped memory
while (true) {
std::pair<std::shared_ptr<FenceCycle>, std::function<void()>> item{};
{
std::unique_lock lock{mutex};
// Mark idle before blocking so IsIdle() reports true while we sleep on the condition variable
idle = true;
condition.wait(lock, [this] { return !pendingSignalQueue.empty(); });
idle = false;
// Take ownership of the front entry under the lock, then release the lock before the potentially long fence wait
item = std::move(pendingSignalQueue.front());
pendingSignalQueue.pop();
}
{
TRACE_EVENT("gpu", "GPU");
// A null cycle represents an immediate callback (ordered only after previously queued cycles)
if (item.first)
item.first->Wait();
}
// A null callback represents a pure wait with nothing to notify
if (item.second)
item.second();
}
}
// Starts the waiter thread immediately; it executes Run() for the lifetime of the object
ExecutionWaiterThread::ExecutionWaiterThread() : thread{&ExecutionWaiterThread::Run, this} {}
/**
 * @brief Reports whether the waiter thread is currently blocked with no pending work
 * @note This is inherently racy: the thread may pick up new work immediately after this returns
 */
bool ExecutionWaiterThread::IsIdle() const {
    // Explicit load of the atomic flag (sequentially-consistent, same as the implicit conversion)
    return idle.load();
}
/**
 * @brief Enqueues a (cycle, callback) pair for the waiter thread to process in FIFO order
 * @note Either element may be null: a null cycle requests an immediate callback, a null callback requests a bare wait
 */
void ExecutionWaiterThread::Queue(std::shared_ptr<FenceCycle> cycle, std::function<void()> &&callback) {
    std::scoped_lock guard{mutex};
    // Construct the pair in place at the tail of the queue
    pendingSignalQueue.emplace(std::move(cycle), std::move(callback));
    condition.notify_all();
}
CommandExecutor::CommandExecutor(const DeviceState &state) CommandExecutor::CommandExecutor(const DeviceState &state)
: state{state}, : state{state},
gpu{*state.gpu}, gpu{*state.gpu},
@ -501,18 +537,31 @@ namespace skyline::gpu::interconnect {
} }
} }
void CommandExecutor::Submit() { void CommandExecutor::Submit(std::function<void()> &&callback) {
for (const auto &callback : flushCallbacks) for (const auto &flushCallback : flushCallbacks)
callback(); flushCallback();
executionTag = AllocateTag(); executionTag = AllocateTag();
if (!slot->nodes.empty()) { if (!slot->nodes.empty()) {
TRACE_EVENT("gpu", "CommandExecutor::Submit"); TRACE_EVENT("gpu", "CommandExecutor::Submit");
if (callback && *state.settings->useDirectMemoryImport)
waiterThread.Queue(cycle, std::move(callback));
else
waiterThread.Queue(cycle, {});
SubmitInternal(); SubmitInternal();
submissionNumber++; submissionNumber++;
} else {
if (callback && *state.settings->useDirectMemoryImport)
waiterThread.Queue(nullptr, std::move(callback));
} }
if (callback && !*state.settings->useDirectMemoryImport)
callback();
ResetInternal(); ResetInternal();
} }

View File

@ -92,6 +92,30 @@ namespace skyline::gpu::interconnect {
void ReleaseSlot(Slot *slot); void ReleaseSlot(Slot *slot);
}; };
/**
 * @brief Thread responsible for notifying the guest of the completion of GPU operations
 * @note Entries are processed strictly in queue order, so guest-visible signals occur in submission order
 */
class ExecutionWaiterThread {
private:
std::thread thread; //!< The underlying worker thread, started on construction
std::mutex mutex; //!< Protects access to `pendingSignalQueue`
std::condition_variable condition; //!< Signalled when a new entry is pushed onto the queue
std::queue<std::pair<std::shared_ptr<FenceCycle>, std::function<void()>>> pendingSignalQueue; //!< Queue of callbacks to be executed when their corresponding fence is signalled
std::atomic<bool> idle{}; //!< Whether the thread is currently blocked waiting for new queue entries
void Run();
public:
ExecutionWaiterThread();
bool IsIdle() const;
/**
 * @brief Queues `callback` to be executed when `cycle` is signalled; null values are valid for either: a null cycle represents an immediate callback (ordered after previously queued cycles) and a null callback represents a wait with no callback
 */
void Queue(std::shared_ptr<FenceCycle> cycle, std::function<void()> &&callback);
};
/** /**
* @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph * @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
* @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread * @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
@ -102,6 +126,7 @@ namespace skyline::gpu::interconnect {
GPU &gpu; GPU &gpu;
CommandRecordThread recordThread; CommandRecordThread recordThread;
CommandRecordThread::Slot *slot{}; CommandRecordThread::Slot *slot{};
ExecutionWaiterThread waiterThread;
node::RenderPassNode *renderPass{}; node::RenderPassNode *renderPass{};
size_t subpassCount{}; //!< The number of subpasses in the current render pass size_t subpassCount{}; //!< The number of subpasses in the current render pass
u32 renderPassIndex{}; u32 renderPassIndex{};
@ -274,8 +299,9 @@ namespace skyline::gpu::interconnect {
/** /**
* @brief Execute all the nodes and submit the resulting command buffer to the GPU * @brief Execute all the nodes and submit the resulting command buffer to the GPU
* @param callback A function to call upon GPU completion of the submission
*/ */
void Submit(); void Submit(std::function<void()> &&callback = {});
/** /**
* @brief Locks all preserve attached buffers/textures * @brief Locks all preserve attached buffers/textures

View File

@ -19,8 +19,9 @@ namespace skyline::soc::gm20b::engine {
ENGINE_STRUCT_CASE(syncpoint, action, { ENGINE_STRUCT_CASE(syncpoint, action, {
if (action.operation == Registers::Syncpoint::Operation::Incr) { if (action.operation == Registers::Syncpoint::Operation::Incr) {
Logger::Debug("Increment syncpoint: {}", +action.index); Logger::Debug("Increment syncpoint: {}", +action.index);
channelCtx.executor.Submit(); channelCtx.executor.Submit([=, syncpoints = &this->syncpoints, index = action.index]() {
syncpoints.at(action.index).Increment(); syncpoints->at(index).Increment();
});
} else if (action.operation == Registers::Syncpoint::Operation::Wait) { } else if (action.operation == Registers::Syncpoint::Operation::Wait) {
Logger::Debug("Wait syncpoint: {}, thresh: {}", +action.index, registers.syncpoint->payload); Logger::Debug("Wait syncpoint: {}, thresh: {}", +action.index, registers.syncpoint->payload);
@ -36,12 +37,6 @@ namespace skyline::soc::gm20b::engine {
ENGINE_STRUCT_CASE(semaphore, action, { ENGINE_STRUCT_CASE(semaphore, action, {
u64 address{registers.semaphore->address}; u64 address{registers.semaphore->address};
// Write timestamp first to ensure ordering
if (action.releaseSize == Registers::Semaphore::ReleaseSize::SixteenBytes) {
channelCtx.asCtx->gmmu.Write<u32>(address + 4, 0);
channelCtx.asCtx->gmmu.Write(address + 8, GetGpuTimeTicks());
}
switch (action.operation) { switch (action.operation) {
case Registers::Semaphore::Operation::Acquire: case Registers::Semaphore::Operation::Acquire:
Logger::Debug("Acquire semaphore: 0x{:X} payload: {}", address, registers.semaphore->payload); Logger::Debug("Acquire semaphore: 0x{:X} payload: {}", address, registers.semaphore->payload);
@ -54,7 +49,16 @@ namespace skyline::soc::gm20b::engine {
channelCtx.Lock(); channelCtx.Lock();
break; break;
case Registers::Semaphore::Operation::Release: case Registers::Semaphore::Operation::Release:
channelCtx.asCtx->gmmu.Write(address, registers.semaphore->payload); channelCtx.executor.Submit([this, action, address, payload = registers.semaphore->payload] () {
// Write timestamp first to ensure ordering
if (action.releaseSize == Registers::Semaphore::ReleaseSize::SixteenBytes) {
channelCtx.asCtx->gmmu.Write<u32>(address + 4, 0);
channelCtx.asCtx->gmmu.Write(address + 8, GetGpuTimeTicks());
}
channelCtx.asCtx->gmmu.Write(address, payload);
});
Logger::Debug("SemaphoreRelease: address: 0x{:X} payload: {}", address, registers.semaphore->payload); Logger::Debug("SemaphoreRelease: address: 0x{:X} payload: {}", address, registers.semaphore->payload);
break; break;
case Registers::Semaphore::Operation::AcqGeq : case Registers::Semaphore::Operation::AcqGeq :

View File

@ -218,8 +218,9 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
ENGINE_CASE(syncpointAction, { ENGINE_CASE(syncpointAction, {
Logger::Debug("Increment syncpoint: {}", static_cast<u16>(syncpointAction.id)); Logger::Debug("Increment syncpoint: {}", static_cast<u16>(syncpointAction.id));
channelCtx.executor.Submit(); channelCtx.executor.Submit([=, syncpoints = &this->syncpoints, index = syncpointAction.id]() {
syncpoints.at(syncpointAction.id).Increment(); syncpoints->at(index).Increment();
});
}) })
ENGINE_CASE(clearSurface, { ENGINE_CASE(clearSurface, {
@ -338,14 +339,17 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
switch (info.op) { switch (info.op) {
case type::SemaphoreInfo::Op::Release: case type::SemaphoreInfo::Op::Release:
channelCtx.executor.Submit(); channelCtx.executor.Submit([=, this, semaphore = *registers.semaphore]() {
WriteSemaphoreResult(registers.semaphore->payload); WriteSemaphoreResult(semaphore, semaphore.payload);
});
break; break;
case type::SemaphoreInfo::Op::Counter: { case type::SemaphoreInfo::Op::Counter: {
switch (info.counterType) { switch (info.counterType) {
case type::SemaphoreInfo::CounterType::Zero: case type::SemaphoreInfo::CounterType::Zero:
WriteSemaphoreResult(registers.semaphore->payload); channelCtx.executor.Submit([=, this, semaphore = *registers.semaphore]() {
WriteSemaphoreResult(semaphore, semaphore.payload);
});
break; break;
default: default:
@ -390,21 +394,19 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
} }
} }
void Maxwell3D::WriteSemaphoreResult(u64 result) { void Maxwell3D::WriteSemaphoreResult(const Registers::Semaphore &semaphore, u64 result) {
u64 address{registers.semaphore->address}; switch (semaphore.info.structureSize) {
switch (registers.semaphore->info.structureSize) {
case type::SemaphoreInfo::StructureSize::OneWord: case type::SemaphoreInfo::StructureSize::OneWord:
channelCtx.asCtx->gmmu.Write(address, static_cast<u32>(result)); channelCtx.asCtx->gmmu.Write(semaphore.address, static_cast<u32>(result));
Logger::Debug("address: 0x{:X} payload: {}", address, result); Logger::Debug("address: 0x{:X} payload: {}", semaphore.address, result);
break; break;
case type::SemaphoreInfo::StructureSize::FourWords: { case type::SemaphoreInfo::StructureSize::FourWords: {
// Write timestamp first to ensure correct ordering // Write timestamp first to ensure correct ordering
u64 timestamp{GetGpuTimeTicks()}; u64 timestamp{GetGpuTimeTicks()};
channelCtx.asCtx->gmmu.Write(address + 8, timestamp); channelCtx.asCtx->gmmu.Write(semaphore.address + 8, timestamp);
channelCtx.asCtx->gmmu.Write(address, result); channelCtx.asCtx->gmmu.Write(semaphore.address, result);
Logger::Debug("address: 0x{:X} payload: {} timestamp: {}", address, result, timestamp); Logger::Debug("address: 0x{:X} payload: {} timestamp: {}", semaphore.address, result, timestamp);
break; break;
} }

View File

@ -78,12 +78,6 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
*/ */
void HandleMethod(u32 method, u32 argument); void HandleMethod(u32 method, u32 argument);
/**
* @brief Writes back a semaphore result to the guest with an auto-generated timestamp (if required)
* @note If the semaphore is OneWord then the result will be downcasted to a 32-bit unsigned integer
*/
void WriteSemaphoreResult(u64 result);
public: public:
/** /**
* @url https://github.com/devkitPro/deko3d/blob/master/source/maxwell/engine_3d.def * @url https://github.com/devkitPro/deko3d/blob/master/source/maxwell/engine_3d.def
@ -421,6 +415,14 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
static_assert(sizeof(Registers) == (EngineMethodsEnd * sizeof(u32))); static_assert(sizeof(Registers) == (EngineMethodsEnd * sizeof(u32)));
#pragma pack(pop) #pragma pack(pop)
private:
/**
* @brief Writes back a semaphore result to the guest with an auto-generated timestamp (if required)
* @note If the semaphore is OneWord then the result will be downcasted to a 32-bit unsigned integer
*/
void WriteSemaphoreResult(const Registers::Semaphore &semaphore, u64 result);
public:
Registers registers{}; Registers registers{};
Registers shadowRegisters{}; //!< A shadow-copy of the registers, their function is controlled by the 'shadowRamControl' register Registers shadowRegisters{}; //!< A shadow-copy of the registers, their function is controlled by the 'shadowRamControl' register