Allow for tying guest GPU sync operations to host GPU sync

This is necessary for the upcoming direct buffer support: in order to use guest buffers directly without trapping, we need to recreate any guest GPU sync on the host GPU. This prevents the guest from assuming work has completed when it hasn't, and consequently overwriting buffer contents that are still in use.
This commit is contained in:
Billy Laws 2022-12-27 18:15:44 +00:00
parent 89c6fab1cb
commit b3f7e990cc
5 changed files with 116 additions and 33 deletions

View File

@ -192,6 +192,42 @@ namespace skyline::gpu::interconnect {
incoming.Push(slot);
}
void ExecutionWaiterThread::Run() {
signal::SetSignalHandler({SIGSEGV}, nce::NCE::HostSignalHandler); // We may access NCE trapped memory
while (true) {
std::pair<std::shared_ptr<FenceCycle>, std::function<void()>> item{};
{
std::unique_lock lock{mutex};
idle = true;
condition.wait(lock, [this] { return !pendingSignalQueue.empty(); });
idle = false;
item = std::move(pendingSignalQueue.front());
pendingSignalQueue.pop();
}
{
TRACE_EVENT("gpu", "GPU");
if (item.first)
item.first->Wait();
}
if (item.second)
item.second();
}
}
ExecutionWaiterThread::ExecutionWaiterThread() : thread{&ExecutionWaiterThread::Run, this} {}
/**
 * @brief Whether the waiter thread is currently parked with no pending work
 */
bool ExecutionWaiterThread::IsIdle() const {
    return idle.load();
}
/**
 * @brief Enqueues a fence/callback pair for the waiter thread and wakes it
 */
void ExecutionWaiterThread::Queue(std::shared_ptr<FenceCycle> cycle, std::function<void()> &&callback) {
    std::scoped_lock lock{mutex};
    // Construct the pair in place; notify while still holding the lock so the
    // waiter's predicate check can't race with the push
    pendingSignalQueue.emplace(std::move(cycle), std::move(callback));
    condition.notify_all();
}
CommandExecutor::CommandExecutor(const DeviceState &state)
: state{state},
gpu{*state.gpu},
@ -501,18 +537,31 @@ namespace skyline::gpu::interconnect {
}
}
void CommandExecutor::Submit() {
for (const auto &callback : flushCallbacks)
callback();
/**
 * @brief Submits the recorded command stream and optionally ties a guest-visible callback to host GPU completion
 * @param callback When direct memory import is enabled, run by the waiter thread after this submission's fence signals; otherwise run synchronously before reset
 */
void CommandExecutor::Submit(std::function<void()> &&callback) {
    // Run all registered flush callbacks before submitting
    for (const auto &flushCallback : flushCallbacks)
        flushCallback();

    executionTag = AllocateTag();

    if (!slot->nodes.empty()) {
        TRACE_EVENT("gpu", "CommandExecutor::Submit");
        if (callback && *state.settings->useDirectMemoryImport)
            // Defer the callback until the host GPU signals this submission's fence
            waiterThread.Queue(cycle, std::move(callback));
        else
            // Queue the cycle with no callback — presumably so the waiter still tracks this fence and keeps later null-cycle callbacks ordered behind it; TODO confirm
            waiterThread.Queue(cycle, {});
        SubmitInternal();
        submissionNumber++;
    } else {
        // Nothing to submit: a null cycle makes the waiter run the callback once all previously queued cycles have signalled
        if (callback && *state.settings->useDirectMemoryImport)
            waiterThread.Queue(nullptr, std::move(callback));
    }

    // Without direct memory import there's no need to track host completion, so call back immediately
    if (callback && !*state.settings->useDirectMemoryImport)
        callback();

    ResetInternal();
}

View File

@ -92,6 +92,30 @@ namespace skyline::gpu::interconnect {
void ReleaseSlot(Slot *slot);
};
/**
 * @brief Thread responsible for notifying the guest of the completion of GPU operations
 */
class ExecutionWaiterThread {
  private:
    std::thread thread;
    std::mutex mutex;
    std::condition_variable condition;
    std::queue<std::pair<std::shared_ptr<FenceCycle>, std::function<void()>>> pendingSignalQueue; //!< Queue of callbacks to be executed when their corresponding fence is signalled
    std::atomic<bool> idle{};

    void Run();

  public:
    ExecutionWaiterThread();

    /**
     * @brief Whether the waiter thread is currently parked with no pending work
     */
    bool IsIdle() const;

    /**
     * @brief Queues `callback` to be executed when `cycle` is signalled; null values are valid for either, with a null cycle representing an immediate callback (dependent only on previously queued cycles) and a null callback representing a wait with no callback
     */
    void Queue(std::shared_ptr<FenceCycle> cycle, std::function<void()> &&callback);
};
/**
* @brief Assembles a Vulkan command stream with various nodes and manages execution of the produced graph
* @note This class is **NOT** thread-safe and should **ONLY** be utilized by a single thread
@ -102,6 +126,7 @@ namespace skyline::gpu::interconnect {
GPU &gpu;
CommandRecordThread recordThread;
CommandRecordThread::Slot *slot{};
ExecutionWaiterThread waiterThread;
node::RenderPassNode *renderPass{};
size_t subpassCount{}; //!< The number of subpasses in the current render pass
u32 renderPassIndex{};
@ -274,8 +299,9 @@ namespace skyline::gpu::interconnect {
/**
* @brief Execute all the nodes and submit the resulting command buffer to the GPU
* @param callback A function to call upon GPU completion of the submission
*/
void Submit();
void Submit(std::function<void()> &&callback = {});
/**
* @brief Locks all preserve attached buffers/textures

View File

@ -19,8 +19,9 @@ namespace skyline::soc::gm20b::engine {
ENGINE_STRUCT_CASE(syncpoint, action, {
if (action.operation == Registers::Syncpoint::Operation::Incr) {
Logger::Debug("Increment syncpoint: {}", +action.index);
channelCtx.executor.Submit();
syncpoints.at(action.index).Increment();
channelCtx.executor.Submit([=, syncpoints = &this->syncpoints, index = action.index]() {
syncpoints->at(index).Increment();
});
} else if (action.operation == Registers::Syncpoint::Operation::Wait) {
Logger::Debug("Wait syncpoint: {}, thresh: {}", +action.index, registers.syncpoint->payload);
@ -36,12 +37,6 @@ namespace skyline::soc::gm20b::engine {
ENGINE_STRUCT_CASE(semaphore, action, {
u64 address{registers.semaphore->address};
// Write timestamp first to ensure ordering
if (action.releaseSize == Registers::Semaphore::ReleaseSize::SixteenBytes) {
channelCtx.asCtx->gmmu.Write<u32>(address + 4, 0);
channelCtx.asCtx->gmmu.Write(address + 8, GetGpuTimeTicks());
}
switch (action.operation) {
case Registers::Semaphore::Operation::Acquire:
Logger::Debug("Acquire semaphore: 0x{:X} payload: {}", address, registers.semaphore->payload);
@ -54,7 +49,16 @@ namespace skyline::soc::gm20b::engine {
channelCtx.Lock();
break;
case Registers::Semaphore::Operation::Release:
channelCtx.asCtx->gmmu.Write(address, registers.semaphore->payload);
channelCtx.executor.Submit([this, action, address, payload = registers.semaphore->payload] () {
// Write timestamp first to ensure ordering
if (action.releaseSize == Registers::Semaphore::ReleaseSize::SixteenBytes) {
channelCtx.asCtx->gmmu.Write<u32>(address + 4, 0);
channelCtx.asCtx->gmmu.Write(address + 8, GetGpuTimeTicks());
}
channelCtx.asCtx->gmmu.Write(address, payload);
});
Logger::Debug("SemaphoreRelease: address: 0x{:X} payload: {}", address, registers.semaphore->payload);
break;
case Registers::Semaphore::Operation::AcqGeq :

View File

@ -218,8 +218,9 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
ENGINE_CASE(syncpointAction, {
Logger::Debug("Increment syncpoint: {}", static_cast<u16>(syncpointAction.id));
channelCtx.executor.Submit();
syncpoints.at(syncpointAction.id).Increment();
channelCtx.executor.Submit([=, syncpoints = &this->syncpoints, index = syncpointAction.id]() {
syncpoints->at(index).Increment();
});
})
ENGINE_CASE(clearSurface, {
@ -338,14 +339,17 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
switch (info.op) {
case type::SemaphoreInfo::Op::Release:
channelCtx.executor.Submit();
WriteSemaphoreResult(registers.semaphore->payload);
channelCtx.executor.Submit([=, this, semaphore = *registers.semaphore]() {
WriteSemaphoreResult(semaphore, semaphore.payload);
});
break;
case type::SemaphoreInfo::Op::Counter: {
switch (info.counterType) {
case type::SemaphoreInfo::CounterType::Zero:
WriteSemaphoreResult(registers.semaphore->payload);
channelCtx.executor.Submit([=, this, semaphore = *registers.semaphore]() {
WriteSemaphoreResult(semaphore, semaphore.payload);
});
break;
default:
@ -390,21 +394,19 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
}
}
void Maxwell3D::WriteSemaphoreResult(u64 result) {
u64 address{registers.semaphore->address};
switch (registers.semaphore->info.structureSize) {
void Maxwell3D::WriteSemaphoreResult(const Registers::Semaphore &semaphore, u64 result) {
switch (semaphore.info.structureSize) {
case type::SemaphoreInfo::StructureSize::OneWord:
channelCtx.asCtx->gmmu.Write(address, static_cast<u32>(result));
Logger::Debug("address: 0x{:X} payload: {}", address, result);
channelCtx.asCtx->gmmu.Write(semaphore.address, static_cast<u32>(result));
Logger::Debug("address: 0x{:X} payload: {}", semaphore.address, result);
break;
case type::SemaphoreInfo::StructureSize::FourWords: {
// Write timestamp first to ensure correct ordering
u64 timestamp{GetGpuTimeTicks()};
channelCtx.asCtx->gmmu.Write(address + 8, timestamp);
channelCtx.asCtx->gmmu.Write(address, result);
Logger::Debug("address: 0x{:X} payload: {} timestamp: {}", address, result, timestamp);
channelCtx.asCtx->gmmu.Write(semaphore.address + 8, timestamp);
channelCtx.asCtx->gmmu.Write(semaphore.address, result);
Logger::Debug("address: 0x{:X} payload: {} timestamp: {}", semaphore.address, result, timestamp);
break;
}

View File

@ -78,12 +78,6 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
*/
void HandleMethod(u32 method, u32 argument);
/**
* @brief Writes back a semaphore result to the guest with an auto-generated timestamp (if required)
* @note If the semaphore is OneWord then the result will be downcasted to a 32-bit unsigned integer
*/
void WriteSemaphoreResult(u64 result);
public:
/**
* @url https://github.com/devkitPro/deko3d/blob/master/source/maxwell/engine_3d.def
@ -421,6 +415,14 @@ namespace skyline::soc::gm20b::engine::maxwell3d {
static_assert(sizeof(Registers) == (EngineMethodsEnd * sizeof(u32)));
#pragma pack(pop)
private:
/**
* @brief Writes back a semaphore result to the guest with an auto-generated timestamp (if required)
* @note If the semaphore is OneWord then the result will be downcasted to a 32-bit unsigned integer
*/
void WriteSemaphoreResult(const Registers::Semaphore &semaphore, u64 result);
public:
Registers registers{};
Registers shadowRegisters{}; //!< A shadow-copy of the registers, their function is controlled by the 'shadowRamControl' register