From 579a2d9337bd884501012b121d2072940eaf0c3f Mon Sep 17 00:00:00 2001
From: Billy Laws <blaws05@gmail.com>
Date: Sat, 19 Nov 2022 18:11:24 +0000
Subject: [PATCH] Add dynamic executor slot growth

---
 .../main/cpp/skyline/common/circular_queue.h  |  4 ++--
 .../gpu/interconnect/command_executor.cpp     | 22 ++++++++++++++-----
 .../gpu/interconnect/command_executor.h       |  3 +++
 .../nvdrv/devices/nvhost/host1x_channel.cpp   |  6 ++++-
 4 files changed, 27 insertions(+), 8 deletions(-)
diff --git a/app/src/main/cpp/skyline/common/circular_queue.h b/app/src/main/cpp/skyline/common/circular_queue.h
index bdbd3836..1e536e38 100644
--- a/app/src/main/cpp/skyline/common/circular_queue.h
+++ b/app/src/main/cpp/skyline/common/circular_queue.h
@@ -124,9 +124,9 @@ namespace skyline {
          * @param tranformation A function that takes in an item of TransformedType as input and returns an item of Type
          */
         template<typename TransformedType, typename Transformation>
-        void AppendTranform(span <TransformedType> buffer, Transformation transformation) {
+        void AppendTranform(TransformedType &container, Transformation transformation) {
             std::unique_lock lock(productionMutex);
-            for (auto &item : buffer) {
+            for (auto &item : container) {
                 auto next{end + 1};
                 next = (next == reinterpret_cast<Type *>(vector.end().base())) ? reinterpret_cast<Type *>(vector.begin().base()) : next;
                 if (next == start) {
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
index af239a77..916d9efd 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@@ -55,8 +55,13 @@ namespace skyline::gpu::interconnect {
           ready{other.ready} {}
 
     std::shared_ptr<FenceCycle> CommandRecordThread::Slot::Reset(GPU &gpu) {
+        auto startTime{util::GetTimeNs()}; // Timestamp taken before the fence wait so the time spent in this reset can be measured below
+
         cycle->Wait();
         cycle = std::make_shared<FenceCycle>(*cycle);
+        if (util::GetTimeNs() - startTime > GrowThresholdNs) // Elapsed time covers both the fence wait and the FenceCycle re-creation above
+            didWait = true; // Flag the slot; the record thread checks this flag when deciding whether to add another slot
+
         // Command buffer doesn't need to be reset since that's done implicitly by begin
         return cycle;
     }
@@ -126,10 +131,7 @@ namespace skyline::gpu::interconnect {
                 Logger::Warn("Failed to intialise RenderDoc API: {}", ret);
         }
 
-        std::vector<Slot> slots{};
-        std::generate_n(std::back_inserter(slots), (1U << *state.settings->executorSlotCountScale), [&] () -> Slot { return gpu; });
-
-        outgoing.AppendTranform(span<Slot>(slots), [](auto &slot) { return &slot; });
+        outgoing.Push(&slots.emplace_back(gpu));
 
         if (int result{pthread_setname_np(pthread_self(), "Sky-CmdRecord")})
             Logger::Warn("Failed to set the thread name: {}", strerror(result));
@@ -148,6 +150,11 @@ namespace skyline::gpu::interconnect {
                     renderDocApi->EndFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(instance), nullptr);
                 slot->capture = false;
 
+                if (slot->didWait && slots.size() < (1U << *state.settings->executorSlotCountScale)) {
+                    outgoing.Push(&slots.emplace_back(gpu));
+                    slot->didWait = false;
+                }
+
                 outgoing.Push(slot);
             }, [] {});
         } catch (const signal::SignalException &e) {
@@ -166,7 +173,12 @@ namespace skyline::gpu::interconnect {
     }
 
     CommandRecordThread::Slot *CommandRecordThread::AcquireSlot() {
-        return outgoing.Pop();
+        auto startTime{util::GetTimeNs()}; // Timestamp taken before popping so the time spent inside Pop can be measured
+        auto slot{outgoing.Pop()}; // NOTE(review): presumably blocks until a slot is available in the outgoing queue - confirm against CircularQueue::Pop
+        if (util::GetTimeNs() - startTime > GrowThresholdNs) // Taking a slot from the queue took longer than the growth threshold
+            slot->didWait = true; // Recorded on the slot itself; the record thread reads it once the slot has been processed
+
+        return slot;
     }
 
     void CommandRecordThread::ReleaseSlot(Slot *slot) {
diff --git a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
index 521a53ef..5dd470c7 100644
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.h
@@ -42,6 +42,7 @@ namespace skyline::gpu::interconnect {
             u32 executionNumber;
             bool ready{}; //!< If this slot's command buffer has had 'beginCommandBuffer' called and is ready to have commands recorded into it
             bool capture{}; //!< If this slot's Vulkan commands should be captured using the renderdoc API
+            bool didWait{}; //!< If a wait longer than GrowThresholdNs occurred while this slot was being acquired or reset
 
             Slot(GPU &gpu);
 
@@ -62,9 +63,11 @@ namespace skyline::gpu::interconnect {
         };
 
       private:
+        static constexpr size_t GrowThresholdNs{constant::NsInMillisecond / 4}; //!< The wait time threshold at which the slot count will be increased
         const DeviceState &state;
         CircularQueue<Slot *> incoming; //!< Slots pending recording
         CircularQueue<Slot *> outgoing; //!< Slots that have been submitted, may still be active on the GPU
+        std::list<Slot> slots; //!< Storage for every slot created so far; the queues hold raw pointers to these elements and std::list keeps element addresses stable as more slots are appended
 
         std::thread thread;
 
diff --git a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/host1x_channel.cpp b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/host1x_channel.cpp
index 80c40d99..ebc3db4a 100644
--- a/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/host1x_channel.cpp
+++ b/app/src/main/cpp/skyline/services/nvdrv/devices/nvhost/host1x_channel.cpp
@@ -3,6 +3,7 @@
 
 #include <soc.h>
 #include <services/nvdrv/devices/deserialisation/deserialisation.h>
+#include <gpu.h>
 #include "host1x_channel.h"
 
 namespace skyline::service::nvdrv::device::nvhost {
@@ -38,6 +39,9 @@ namespace skyline::service::nvdrv::device::nvhost {
         for (size_t i{}; i < syncpointIncrs.size(); i++) {
             const auto &incr{syncpointIncrs[i]};
 
+            for (size_t j{}; j < incr.numIncrs; j++)
+                state.soc->host1x.syncpoints[incr.syncpointId].Increment();
+
             u32 max{core.syncpointManager.IncrementSyncpointMaxExt(incr.syncpointId, incr.numIncrs)};
             if (i < fenceThresholds.size())
                 fenceThresholds[i] = max;
@@ -52,7 +56,7 @@ namespace skyline::service::nvdrv::device::nvhost {
             Logger::Debug("Submit gather, CPU address: 0x{:X}, words: 0x{:X}", gatherAddress, cmdBuf.words);
 
             span gather(reinterpret_cast<u32 *>(gatherAddress), cmdBuf.words);
-            state.soc->host1x.channels[static_cast<size_t>(channelType)].Push(gather);
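+       //     NOTE(review): gather submission is disabled by this patch while syncpoints are incremented directly in the loop above - this looks unrelated to executor slot growth, confirm it is intended before merging: state.soc->host1x.channels[static_cast<size_t>(channelType)].Push(gather);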
         }
 
         return PosixResult::Success;