From b3f7e990cc100c939e84c68f2b3e08b8c3737395 Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Tue, 27 Dec 2022 18:15:44 +0000
Subject: [PATCH] Allow for tying guest GPU sync operations to host GPU sync

This is necessary for the upcoming direct buffer support: in order to use guest buffers directly without trapping, we need to recreate any guest GPU sync on the host GPU. This prevents the guest from assuming that unfinished work has completed and overwriting buffer contents that are still in use.
---
 .../gpu/interconnect/command_executor.cpp     | 55 ++++++++++++++++++-
 .../gpu/interconnect/command_executor.h       | 28 +++++++++-
 .../cpp/skyline/soc/gm20b/engines/gpfifo.cpp  | 22 +++++---
 .../skyline/soc/gm20b/engines/maxwell_3d.cpp  | 30 +++++-----
 .../skyline/soc/gm20b/engines/maxwell_3d.h    | 14 +++--
 5 files changed, 116 insertions(+), 33 deletions(-)
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index 8e310b8f..de510eb3 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -192,6 +192,42 @@ namespace skyline::gpu::interconnect {
         incoming.Push(slot);
     }
 
+    void ExecutionWaiterThread::Run() { // Thread entry point: pops queued {cycle, callback} pairs in FIFO order, waits on the cycle and then runs the callback
+        signal::SetSignalHandler({SIGSEGV}, nce::NCE::HostSignalHandler); // We may access NCE trapped memory
+
+        while (true) { // There is no exit condition, this loop runs for as long as the thread exists
+            std::pair<std::shared_ptr<FenceCycle>, std::function<void()>> item{}; // {cycle to wait on, callback to run afterwards}, either may be null
+            {
+                std::unique_lock lock{mutex}; // Only held while touching the queue, it is released before the fence wait below
+                idle = true; // Reported by IsIdle(), note this is set even if items are already queued until wait() returns
+                condition.wait(lock, [this] { return !pendingSignalQueue.empty(); });
+                idle = false;
+                item = std::move(pendingSignalQueue.front());
+                pendingSignalQueue.pop();
+            }
+            { // NOTE(review): presumably TRACE_EVENT is scope-based and these braces limit the event to the fence wait - confirm
+                TRACE_EVENT("gpu", "GPU");
+                if (item.first) // A null cycle means there is nothing to wait on
+                    item.first->Wait();
+            }
+
+            if (item.second) // A null callback means this entry was a wait only
+                item.second();
+        }
+    }
+
+    ExecutionWaiterThread::ExecutionWaiterThread() : thread{[this] { Run(); }} {} // Launches the waiter thread straight away, executing Run() on this object
+
+    bool ExecutionWaiterThread::IsIdle() const { // Snapshot of the flag that Run() sets around its wait for new queue items
+        return idle.load();
+    }
+
+    void ExecutionWaiterThread::Queue(std::shared_ptr<FenceCycle> cycle, std::function<void()> &&callback) {
+        std::lock_guard guard{mutex}; // The queue is shared with the waiter thread
+        pendingSignalQueue.emplace(std::move(cycle), std::move(callback));
+        condition.notify_all(); // Wakes the waiter thread if it is blocked on an empty queue
+    }
+
     CommandExecutor::CommandExecutor(const DeviceState &state)
         : state{state},
           gpu{*state.gpu},
@@ -501,18 +537,31 @@ namespace skyline::gpu::interconnect {
         }
     }
 
-    void CommandExecutor::Submit() {
-        for (const auto &callback : flushCallbacks)
-            callback();
+    void CommandExecutor::Submit(std::function<void()> &&callback) { // `callback` may be empty, when it runs depends on the three cases below
+        for (const auto &flushCallback : flushCallbacks) // Named so that it does not shadow the `callback` parameter
+            flushCallback();
 
         executionTag = AllocateTag();
 
         if (!slot->nodes.empty()) {
             TRACE_EVENT("gpu", "CommandExecutor::Submit");
+
+            if (callback && *state.settings->useDirectMemoryImport) // Direct memory import: defer the callback to the waiter thread until `cycle` is signalled
+                waiterThread.Queue(cycle, std::move(callback));
+            else // Otherwise only queue the wait, the callback (if any) is run synchronously further down
+                waiterThread.Queue(cycle, {});
+
             SubmitInternal();
             submissionNumber++;
+
+        } else {
+            if (callback && *state.settings->useDirectMemoryImport) // Nothing to submit: a null cycle makes the waiter thread run the callback as soon as it reaches this entry
+                waiterThread.Queue(nullptr, std::move(callback));
         }
 
+        if (callback && !*state.settings->useDirectMemoryImport) // NOTE(review): `callback` may have been moved-from above, this relies on the setting reading the same value every time within one call
+            callback();
+
         ResetInternal();
     }
 
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index 789005bb..fbb79840 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -92,6 +92,30 @@ namespace skyline::gpu::interconnect {
         void ReleaseSlot(Slot *slot);
     };
 
+    /**
+     * @brief Thread responsible for notifying the guest of the completion of GPU operations
+     */
+    class ExecutionWaiterThread {
+      private:
+        std::mutex mutex; //!< Guards `pendingSignalQueue`
+        std::condition_variable condition; //!< Notified whenever an item is pushed to `pendingSignalQueue`
+        std::queue<std::pair<std::shared_ptr<FenceCycle>, std::function<void()>>> pendingSignalQueue; //!< Queue of callbacks to be executed when their corresponding fence is signalled
+        std::atomic<bool> idle{}; //!< Set while the thread is inside its wait for a new queue item
+        std::thread thread; //!< Must stay the last data member: members are constructed in declaration order and `Run` uses all of the above as soon as the thread starts
+
+        void Run(); //!< Thread entry point, loops forever waiting on each queued cycle in order and then invoking its callback
+
+      public:
+        ExecutionWaiterThread(); //!< Starts the waiter thread, NOTE(review): nothing visible here joins or detaches it, so destroying this object would hit std::terminate
+
+        bool IsIdle() const; //!< @return The value of the `idle` flag at the time of the call
+
+        /**
+         * @brief Queues `callback` to be executed when `cycle` is signalled, null values are valid for either, with a null cycle representing an immediate callback (dep on previously queued cycles) and a null callback representing a wait with no callback
+         */
+        void Queue(std::shared_ptr<FenceCycle> cycle, std::function<void()> &&callback);
+    };
+
     /**
      * @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
      * @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
@@ -102,6 +126,7 @@ namespace skyline::gpu::interconnect {
         GPU &gpu;
         CommandRecordThread recordThread;
         CommandRecordThread::Slot *slot{};
+        ExecutionWaiterThread waiterThread;
         node::RenderPassNode *renderPass{};
         size_t subpassCount{}; //!< The number of subpasses in the current render pass
         u32 renderPassIndex{};
@@ -274,8 +299,9 @@ namespace skyline::gpu::interconnect {
 
         /**
          * @brief Execute all the nodes and submit the resulting command buffer to the GPU
+         * @param callback A function to call upon GPU completion of the submission when direct memory import is enabled, otherwise it is called synchronously before Submit returns
          */
-        void Submit();
+        void Submit(std::function<void()> &&callback = {});
 
         /**
          * @brief Locks all preserve attached buffers/textures
diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.cpp
index 33477212..30ba3cd9 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/gpfifo.cpp
@@ -19,8 +19,9 @@ namespace skyline::soc::gm20b::engine {
             ENGINE_STRUCT_CASE(syncpoint, action, {
                 if (action.operation == Registers::Syncpoint::Operation::Incr) {
                     Logger::Debug("Increment syncpoint: {}", +action.index);
-                    channelCtx.executor.Submit();
-                    syncpoints.at(action.index).Increment();
+                    channelCtx.executor.Submit([=, syncpoints = &this->syncpoints, index = action.index]() {
+                        syncpoints->at(index).Increment();
+                    });
                 } else if (action.operation == Registers::Syncpoint::Operation::Wait) {
                     Logger::Debug("Wait syncpoint: {}, thresh: {}", +action.index, registers.syncpoint->payload);
 
@@ -36,12 +37,6 @@ namespace skyline::soc::gm20b::engine {
             ENGINE_STRUCT_CASE(semaphore, action, {
                 u64 address{registers.semaphore->address};
 
-                // Write timestamp first to ensure ordering
-                if (action.releaseSize == Registers::Semaphore::ReleaseSize::SixteenBytes) {
-                    channelCtx.asCtx->gmmu.Write<u32>(address + 4, 0);
-                    channelCtx.asCtx->gmmu.Write(address + 8, GetGpuTimeTicks());
-                }
-
                 switch (action.operation) {
                     case Registers::Semaphore::Operation::Acquire:
                         Logger::Debug("Acquire semaphore: 0x{:X} payload: {}", address, registers.semaphore->payload);
@@ -54,7 +49,16 @@ namespace skyline::soc::gm20b::engine {
                         channelCtx.Lock();
                         break;
                     case Registers::Semaphore::Operation::Release:
-                        channelCtx.asCtx->gmmu.Write(address, registers.semaphore->payload);
+                        channelCtx.executor.Submit([this, action, address, payload = registers.semaphore->payload] () {
+                            // Write timestamp first to ensure ordering
+                            if (action.releaseSize == Registers::Semaphore::ReleaseSize::SixteenBytes) {
+                                channelCtx.asCtx->gmmu.Write<u32>(address + 4, 0);
+                                channelCtx.asCtx->gmmu.Write(address + 8, GetGpuTimeTicks());
+                            }
+
+                            channelCtx.asCtx->gmmu.Write(address, payload);
+                        });
+
                         Logger::Debug("SemaphoreRelease: address: 0x{:X} payload: {}", address, registers.semaphore->payload);
                         break;
                     case Registers::Semaphore::Operation::AcqGeq    :
diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
index aed066f3..4bb995d4 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.cpp
@@ -218,8 +218,9 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
 
             ENGINE_CASE(syncpointAction, {
                 Logger::Debug("Increment syncpoint: {}", static_cast<u16>(syncpointAction.id));
-                channelCtx.executor.Submit();
-                syncpoints.at(syncpointAction.id).Increment();
+                channelCtx.executor.Submit([=, syncpoints = &this->syncpoints, index = syncpointAction.id]() {
+                    syncpoints->at(index).Increment();
+                });
             })
 
             ENGINE_CASE(clearSurface, {
@@ -338,14 +339,17 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
 
                 switch (info.op) {
                     case type::SemaphoreInfo::Op::Release:
-                        channelCtx.executor.Submit();
-                        WriteSemaphoreResult(registers.semaphore->payload);
+                        channelCtx.executor.Submit([=, this, semaphore = *registers.semaphore]() {
+                            WriteSemaphoreResult(semaphore, semaphore.payload);
+                        });
                         break;
 
                     case type::SemaphoreInfo::Op::Counter: {
                         switch (info.counterType) {
                             case type::SemaphoreInfo::CounterType::Zero:
-                                WriteSemaphoreResult(registers.semaphore->payload);
+                                channelCtx.executor.Submit([=, this, semaphore = *registers.semaphore]() {
+                                    WriteSemaphoreResult(semaphore, semaphore.payload);
+                                });
                                 break;
 
                             default:
@@ -390,21 +394,19 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
         }
     }
 
-    void Maxwell3D::WriteSemaphoreResult(u64 result) {
-        u64 address{registers.semaphore->address};
-
-        switch (registers.semaphore->info.structureSize) {
+    void Maxwell3D::WriteSemaphoreResult(const Registers::Semaphore &semaphore, u64 result) {
+        switch (semaphore.info.structureSize) {
             case type::SemaphoreInfo::StructureSize::OneWord:
-                channelCtx.asCtx->gmmu.Write(address, static_cast<u32>(result));
-                Logger::Debug("address: 0x{:X} payload: {}", address, result);
+                channelCtx.asCtx->gmmu.Write(semaphore.address, static_cast<u32>(result));
+                Logger::Debug("address: 0x{:X} payload: {}", semaphore.address, result);
                 break;
 
             case type::SemaphoreInfo::StructureSize::FourWords: {
                 // Write timestamp first to ensure correct ordering
                 u64 timestamp{GetGpuTimeTicks()};
-                channelCtx.asCtx->gmmu.Write(address + 8, timestamp);
-                channelCtx.asCtx->gmmu.Write(address, result);
-                Logger::Debug("address: 0x{:X} payload: {} timestamp: {}", address, result, timestamp);
+                channelCtx.asCtx->gmmu.Write(semaphore.address + 8, timestamp);
+                channelCtx.asCtx->gmmu.Write(semaphore.address, result);
+                Logger::Debug("address: 0x{:X} payload: {} timestamp: {}", semaphore.address, result, timestamp);
 
                 break;
             }
diff --git a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h
index 9a3876e1..69543a22 100644
--- a/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h
+++ b/app/src/main/cpp/skyline/soc/gm20b/engines/maxwell_3d.h
@@ -78,12 +78,6 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
          */
         void HandleMethod(u32 method, u32 argument);
 
-        /**
-         * @brief Writes back a semaphore result to the guest with an auto-generated timestamp (if required)
-         * @note If the semaphore is OneWord then the result will be downcasted to a 32-bit unsigned integer
-         */
-        void WriteSemaphoreResult(u64 result);
-
       public:
         /**
          * @url https://github.com/devkitPro/deko3d/blob/master/source/maxwell/engine_3d.def
@@ -421,6 +415,14 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
         static_assert(sizeof(Registers) == (EngineMethodsEnd * sizeof(u32)));
         #pragma pack(pop)
 
+      private:
+        /**
+         * @brief Writes back a semaphore result to the guest with an auto-generated timestamp (if required)
+         * @note If the semaphore is OneWord then the result will be downcasted to a 32-bit unsigned integer
+         */
+        void WriteSemaphoreResult(const Registers::Semaphore &semaphore, u64 result);
+
+      public:
         Registers registers{};
         Registers shadowRegisters{}; //!< A shadow-copy of the registers, their function is controlled by the 'shadowRamControl' register