Implement Cooperative Scheduling With Load Balancing

2025-02-22 16:27:09 +01:00 · 2020-12-05 23:11:52 +05:30 · 2020-12-05 23:11:52 +05:30 · cf000f5750
commit cf000f5750
parent 8564edcb16
10 changed files with 321 additions and 102 deletions
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@ -41,6 +41,7 @@ add_library(skyline SHARED
        ${source_DIR}/skyline/jvm.cpp
        ${source_DIR}/skyline/os.cpp
        ${source_DIR}/skyline/kernel/memory.cpp
+        ${source_DIR}/skyline/kernel/scheduler.cpp
        ${source_DIR}/skyline/kernel/ipc.cpp
        ${source_DIR}/skyline/kernel/svc.cpp
        ${source_DIR}/skyline/kernel/types/KProcess.cpp
--- a/app/src/main/cpp/emu_jni.cpp
+++ b/app/src/main/cpp/emu_jni.cpp
@ -5,9 +5,10 @@
 #include <pthread.h>
 #include <unistd.h>
 #include <android/log.h>
-#include "skyline/loader/loader.h"
 #include "skyline/common.h"
+#include "skyline/common/signal.h"
 #include "skyline/common/settings.h"
+#include "skyline/loader/loader.h"
 #include "skyline/os.h"
 #include "skyline/jvm.h"
 #include "skyline/gpu.h"
@ -50,6 +51,8 @@ extern "C" JNIEXPORT void Java_emu_skyline_EmulationActivity_executeApplication(
        os->Execute(romFd, static_cast<skyline::loader::RomFormat>(romType));
    } catch (std::exception &e) {
        logger->Error(e.what());
+    } catch (const skyline::signal::SignalException &e) {
+        logger->Error(e.what());
    } catch (...) {
        logger->Error("An unknown exception has occurred");
    }
--- a/app/src/main/cpp/skyline/common.cpp
+++ b/app/src/main/cpp/skyline/common.cpp
@ -62,6 +62,7 @@ namespace skyline {
        gpu = std::make_shared<gpu::GPU>(*this);
        audio = std::make_shared<audio::Audio>(*this);
        nce = std::make_shared<nce::NCE>(*this);
+        scheduler = std::make_shared<kernel::Scheduler>(*this);
        input = std::make_shared<input::Input>(*this);
    }
 }
--- a/app/src/main/cpp/skyline/common.h
+++ b/app/src/main/cpp/skyline/common.h
@ -480,6 +480,7 @@ namespace skyline {
            class KProcess;
            class KThread;
        }
+        class Scheduler;
        class OS;
    }
    namespace audio {
@ -506,6 +507,7 @@ namespace skyline {
        std::shared_ptr<gpu::GPU> gpu;
        std::shared_ptr<audio::Audio> audio;
        std::shared_ptr<nce::NCE> nce;
+        std::shared_ptr<kernel::Scheduler> scheduler;
        std::shared_ptr<kernel::type::KProcess> process;
        static thread_local inline std::shared_ptr<kernel::type::KThread> thread{}; //!< The KThread of the thread which accesses this object
        static thread_local inline nce::ThreadContext *ctx{}; //!< The context of the guest thread for the corresponding host thread
--- a/app/src/main/cpp/skyline/kernel/scheduler.cpp
+++ b/app/src/main/cpp/skyline/kernel/scheduler.cpp
@ -0,0 +1,134 @@
+// SPDX-License-Identifier: MPL-2.0
+// Copyright © 2020 Skyline Team and Contributors (https://github.com/skyline-emu/)
+
+#include <common/signal.h>
+#include "types/KThread.h"
+#include "scheduler.h"
+
+namespace skyline::kernel {
+    Scheduler::CoreContext::CoreContext(u8 id) : id(id) {}
+
+    Scheduler::Scheduler(const DeviceState &state) : state(state) {}
+
+    Scheduler::CoreContext &Scheduler::LoadBalance() {
+        auto &thread{state.thread};
+        auto currentCore{&cores.at(thread->coreId)};
+
+        if (!currentCore->queue.empty() && thread->affinityMask.count() != 1) {
+            // Select core where the current thread will be scheduled the earliest based off average timeslice durations for resident threads
+            // There's a preference for the current core as migration isn't free
+
+            size_t minTimeslice{};
+            CoreContext *optimalCore{};
+            for (auto &candidateCore : cores) {
+                if (thread->affinityMask.test(candidateCore.id)) {
+                    u64 timeslice{};
+
+                    if (!candidateCore.queue.empty()) {
+                        std::shared_lock lock(candidateCore.mutex);
+
+                        auto threadIterator{candidateCore.queue.cbegin()};
+                        if (threadIterator != candidateCore.queue.cend()) {
+                            const auto &runningThread{*threadIterator};
+                            timeslice += runningThread->averageTimeslice ? std::min(runningThread->averageTimeslice - (util::GetTimeTicks() - runningThread->timesliceStart), 1UL) : runningThread->timesliceStart ? util::GetTimeTicks() - runningThread->timesliceStart : 1UL;
+
+                            while (++threadIterator != candidateCore.queue.cend()) {
+                                const auto &residentThread{*threadIterator};
+                                if (residentThread->priority <= thread->priority)
+                                    timeslice += residentThread->averageTimeslice ? residentThread->averageTimeslice : 1UL;
+                            }
+                        }
+                    }
+
+                    if (!optimalCore || timeslice < minTimeslice || (timeslice == minTimeslice && &candidateCore == currentCore)) {
+                        optimalCore = &candidateCore;
+                        minTimeslice = timeslice;
+                    }
+                }
+            }
+
+            if (optimalCore != currentCore) {
+                std::unique_lock lock(currentCore->mutex);
+                currentCore->queue.erase(std::remove(currentCore->queue.begin(), currentCore->queue.end(), thread), currentCore->queue.end());
+                currentCore->mutateCondition.notify_all();
+
+                thread->coreId = optimalCore->id;
+
+                state.logger->Debug("Load Balancing for #{}: C{} -> C{}", thread->id, currentCore->id, optimalCore->id);
+            } else {
+                state.logger->Debug("Load Balancing for #{}: C{} (Late)", thread->id, currentCore->id);
+            }
+
+            return *optimalCore;
+        }
+
+        state.logger->Debug("Load Balancing for #{}: C{} (Early)", thread->id, currentCore->id);
+
+        return *currentCore;
+    }
+
+    void Scheduler::InsertThread(bool loadBalance) {
+        auto &thread{state.thread};
+        auto &core{loadBalance ? LoadBalance() : cores.at(thread->coreId)};
+
+        std::unique_lock lock(core.mutex);
+        auto nextThread{std::find_if(core.queue.begin(), core.queue.end(), [&](const std::shared_ptr<type::KThread> &it) { return it->priority > thread->priority; })};
+        if (nextThread == core.queue.begin() && nextThread != core.queue.end()) {
+            throw exception("Migration Interrupt Required");
+        } else {
+            core.queue.insert(nextThread, thread);
+        }
+
+        core.mutateCondition.notify_all();
+    }
+
+    void Scheduler::WaitSchedule() {
+        auto &thread{state.thread};
+        auto *core{&cores.at(thread->coreId)};
+
+        std::shared_lock lock(core->mutex);
+        if (thread->affinityMask.count() > 1) {
+            std::chrono::milliseconds loadBalanceThreshold{1}; //!< The amount of time that needs to pass unscheduled for a thread to attempt load balancing
+            while (!core->mutateCondition.wait_for(lock, loadBalanceThreshold, [&]() { return core->queue.front() == thread; })) {
+                lock.unlock();
+                LoadBalance();
+                if (thread->coreId == core->id) {
+                    lock.lock();
+                } else {
+                    InsertThread(false);
+                    core = &cores.at(thread->coreId);
+                    lock = std::shared_lock(core->mutex);
+                }
+
+                loadBalanceThreshold *= 2; // We double the duration required for future load balancing for this invocation to minimize pointless load balancing
+            }
+        } else {
+            core->mutateCondition.wait(lock, [&]() { return core->queue.front() == thread; });
+        }
+
+        thread->timesliceStart = util::GetTimeTicks();
+    }
+
+    void Scheduler::Rotate() {
+        auto &thread{state.thread};
+        auto &core{cores.at(thread->coreId)};
+
+        std::unique_lock lock(core.mutex);
+        if (core.queue.front() == thread) {
+            thread->averageTimeslice = (thread->averageTimeslice / 4) + (3 * (util::GetTimeTicks() - thread->timesliceStart / 4)); // 0.25 * old timeslice duration + 0.75 * current timeslice duration
+
+            core.queue.pop_front();
+            core.queue.push_back(thread);
+
+            core.mutateCondition.notify_all();
+        }
+    }
+
+    void Scheduler::RemoveThread() {
+        auto &thread{state.thread};
+        auto &core{cores.at(thread->coreId)};
+
+        std::unique_lock lock(core.mutex);
+        core.queue.erase(std::remove(core.queue.begin(), core.queue.end(), thread), core.queue.end());
+    }
+}
--- a/app/src/main/cpp/skyline/kernel/scheduler.h
+++ b/app/src/main/cpp/skyline/kernel/scheduler.h
@ -4,6 +4,7 @@
 #pragma once

 #include <common.h>
+#include <condition_variable>

 namespace skyline {
    namespace constant {
@ -32,5 +33,57 @@ namespace skyline {
                return (value >= min) && (value <= max);
            }
        };
+
+        /*
+         * @brief The Scheduler is responsible for determining which threads should run on which virtual cores and when they should be scheduled
+         * @note We tend to stray a lot from HOS in our scheduler design as we've designed it around our 1 host thread per guest thread which leads to scheduling from the perspective of threads while the HOS scheduler deals with scheduling from the perspective of cores, not doing this would lead to missing out on key optimizations and serialization of scheduling
+         */
+        class Scheduler {
+          private:
+            const DeviceState &state;
+
+            struct CoreContext {
+                u8 id;
+                std::shared_mutex mutex; //!< Synchronizes all operations on the queue
+                std::condition_variable_any mutateCondition; //!< A conditional variable which is signalled on every mutation of the queue
+                std::deque<std::shared_ptr<type::KThread>> queue; //!< A queue of threads which are running or to be run on this core
+
+                explicit CoreContext(u8 id);
+            };
+
+            std::array<CoreContext, constant::CoreCount> cores{CoreContext(0), CoreContext(1), CoreContext(2), CoreContext(3)};
+
+          public:
+            Scheduler(const DeviceState &state);
+
+            /**
+             * @brief Checks all cores and migrates the calling thread to the core where the calling thread should be scheduled the earliest
+             * @return A reference to the CoreContext of the core which the calling thread is running on after load balancing
+             * @note This doesn't insert the thread into the migrated process's queue after load-balancing
+             */
+            CoreContext& LoadBalance();
+
+            /**
+             * @brief Inserts the calling thread into the scheduler queue at the appropriate location based on it's priority
+             * @param loadBalance If to load balance or use the thread's current core (KThread::coreId)
+             */
+            void InsertThread(bool loadBalance = true);
+
+            /**
+             * @brief Wait for the current thread to be scheduled on it's resident core
+             * @note There is an assumption of the thread being on it's resident core queue, if it's not this'll never return
+             */
+            void WaitSchedule();
+
+            /**
+             * @brief Rotates the calling thread's resident core queue, if it is at the front of it
+             */
+            void Rotate();
+
+            /**
+             * @brief Removes the calling thread from it's resident core queue
+             */
+            void RemoveThread();
+        };
    }
 }
--- a/app/src/main/cpp/skyline/kernel/svc.cpp
+++ b/app/src/main/cpp/skyline/kernel/svc.cpp
@ -281,20 +281,27 @@ namespace skyline::kernel::svc {
    }

    void SleepThread(const DeviceState &state) {
+        constexpr i64 yieldWithoutCoreMigration{0};
+        constexpr i64 yieldWithCoreMigration{-1};
+        constexpr i64 yieldToAnyThread{-2};
+
        i64 in{static_cast<i64>(state.ctx->gpr.x0)};
-        switch (in) {
-            case 0:
-            case -1:
-            case -2:
-                state.logger->Debug("svcSleepThread: Yielding thread: {}", in);
-                break;
-            default:
-                state.logger->Debug("svcSleepThread: Thread sleeping for {} ns", in);
-                struct timespec spec{
-                    .tv_sec = static_cast<time_t>(state.ctx->gpr.x0 / 1000000000),
-                    .tv_nsec = static_cast<long>(state.ctx->gpr.x0 % 1000000000)
-                };
-                nanosleep(&spec, nullptr);
+        if (in > 0) {
+            state.logger->Debug("svcSleepThread: Thread sleeping for {} ns", in);
+
+            struct timespec spec{
+                .tv_sec = static_cast<time_t>(in / 1000000000),
+                .tv_nsec = static_cast<long>(in % 1000000000),
+            };
+
+            state.scheduler->Rotate();
+            nanosleep(&spec, nullptr);
+            state.scheduler->WaitSchedule();
+        } else if (in == yieldWithoutCoreMigration || in == yieldWithCoreMigration || in == yieldToAnyThread) {
+            // Core Migration doesn't affect us as threads schedule and load balance themselves
+            state.logger->Debug("svcSleepThread: Yielding thread ({})", in);
+            state.scheduler->Rotate();
+            state.scheduler->WaitSchedule();
        }
    }

--- a/app/src/main/cpp/skyline/kernel/types/KThread.cpp
+++ b/app/src/main/cpp/skyline/kernel/types/KThread.cpp
@ -1,8 +1,8 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2020 Skyline Team and Contributors (https://github.com/skyline-emu/)

+#include <cxxabi.h>
 #include <unistd.h>
-#include <android/log.h>
 #include <common/signal.h>
 #include <nce.h>
 #include <os.h>
@ -39,6 +39,8 @@ namespace skyline::kernel::type {
        state.thread = shared_from_this();

        if (setjmp(originalCtx)) { // Returns 1 if it's returning from guest, 0 otherwise
+            state.scheduler->RemoveThread();
+
            running = false;
            Signal();

@ -52,85 +54,108 @@ namespace skyline::kernel::type {

        signal::SetSignalHandler({SIGINT, SIGILL, SIGTRAP, SIGBUS, SIGFPE, SIGSEGV}, nce::NCE::SignalHandler);

-        asm volatile(
-        "MRS X0, TPIDR_EL0\n\t"
-        "MSR TPIDR_EL0, %x0\n\t" // Set TLS to ThreadContext
-        "STR X0, [%x0, #0x2A0]\n\t" // Write ThreadContext::hostTpidrEl0
-        "MOV X0, SP\n\t"
-        "STR X0, [%x0, #0x2A8]\n\t" // Write ThreadContext::hostSp
-        "MOV SP, %x1\n\t" // Replace SP with guest stack
-        "MOV LR, %x2\n\t" // Store entry in Link Register so it is jumped to on return
-        "MOV X0, %x3\n\t" // Store the argument in X0
-        "MOV X1, %x4\n\t" // Store the thread handle in X1, NCA applications require this
-        "MOV X2, XZR\n\t" // Zero out other GP and SIMD registers, not doing this will break applications
-        "MOV X3, XZR\n\t"
-        "MOV X4, XZR\n\t"
-        "MOV X5, XZR\n\t"
-        "MOV X6, XZR\n\t"
-        "MOV X7, XZR\n\t"
-        "MOV X8, XZR\n\t"
-        "MOV X9, XZR\n\t"
-        "MOV X10, XZR\n\t"
-        "MOV X11, XZR\n\t"
-        "MOV X12, XZR\n\t"
-        "MOV X13, XZR\n\t"
-        "MOV X14, XZR\n\t"
-        "MOV X15, XZR\n\t"
-        "MOV X16, XZR\n\t"
-        "MOV X17, XZR\n\t"
-        "MOV X18, XZR\n\t"
-        "MOV X19, XZR\n\t"
-        "MOV X20, XZR\n\t"
-        "MOV X21, XZR\n\t"
-        "MOV X22, XZR\n\t"
-        "MOV X23, XZR\n\t"
-        "MOV X24, XZR\n\t"
-        "MOV X25, XZR\n\t"
-        "MOV X26, XZR\n\t"
-        "MOV X27, XZR\n\t"
-        "MOV X28, XZR\n\t"
-        "MOV X29, XZR\n\t"
-        "MSR FPSR, XZR\n\t"
-        "MSR FPCR, XZR\n\t"
-        "DUP V0.16B, WZR\n\t"
-        "DUP V1.16B, WZR\n\t"
-        "DUP V2.16B, WZR\n\t"
-        "DUP V3.16B, WZR\n\t"
-        "DUP V4.16B, WZR\n\t"
-        "DUP V5.16B, WZR\n\t"
-        "DUP V6.16B, WZR\n\t"
-        "DUP V7.16B, WZR\n\t"
-        "DUP V8.16B, WZR\n\t"
-        "DUP V9.16B, WZR\n\t"
-        "DUP V10.16B, WZR\n\t"
-        "DUP V11.16B, WZR\n\t"
-        "DUP V12.16B, WZR\n\t"
-        "DUP V13.16B, WZR\n\t"
-        "DUP V14.16B, WZR\n\t"
-        "DUP V15.16B, WZR\n\t"
-        "DUP V16.16B, WZR\n\t"
-        "DUP V17.16B, WZR\n\t"
-        "DUP V18.16B, WZR\n\t"
-        "DUP V19.16B, WZR\n\t"
-        "DUP V20.16B, WZR\n\t"
-        "DUP V21.16B, WZR\n\t"
-        "DUP V22.16B, WZR\n\t"
-        "DUP V23.16B, WZR\n\t"
-        "DUP V24.16B, WZR\n\t"
-        "DUP V25.16B, WZR\n\t"
-        "DUP V26.16B, WZR\n\t"
-        "DUP V27.16B, WZR\n\t"
-        "DUP V28.16B, WZR\n\t"
-        "DUP V29.16B, WZR\n\t"
-        "DUP V30.16B, WZR\n\t"
-        "DUP V31.16B, WZR\n\t"
-        "RET"
-        :
-        : "r"(&ctx), "r"(stackTop), "r"(entry), "r"(entryArgument), "r"(handle)
-        : "x0", "x1", "lr"
-        );
+        try {
+            state.scheduler->InsertThread();
+            state.scheduler->WaitSchedule();

-        __builtin_unreachable();
+            asm volatile(
+            "MRS X0, TPIDR_EL0\n\t"
+            "MSR TPIDR_EL0, %x0\n\t" // Set TLS to ThreadContext
+            "STR X0, [%x0, #0x2A0]\n\t" // Write ThreadContext::hostTpidrEl0
+            "MOV X0, SP\n\t"
+            "STR X0, [%x0, #0x2A8]\n\t" // Write ThreadContext::hostSp
+            "MOV SP, %x1\n\t" // Replace SP with guest stack
+            "MOV LR, %x2\n\t" // Store entry in Link Register so it is jumped to on return
+            "MOV X0, %x3\n\t" // Store the argument in X0
+            "MOV X1, %x4\n\t" // Store the thread handle in X1, NCA applications require this
+            "MOV X2, XZR\n\t" // Zero out other GP and SIMD registers, not doing this will break applications
+            "MOV X3, XZR\n\t"
+            "MOV X4, XZR\n\t"
+            "MOV X5, XZR\n\t"
+            "MOV X6, XZR\n\t"
+            "MOV X7, XZR\n\t"
+            "MOV X8, XZR\n\t"
+            "MOV X9, XZR\n\t"
+            "MOV X10, XZR\n\t"
+            "MOV X11, XZR\n\t"
+            "MOV X12, XZR\n\t"
+            "MOV X13, XZR\n\t"
+            "MOV X14, XZR\n\t"
+            "MOV X15, XZR\n\t"
+            "MOV X16, XZR\n\t"
+            "MOV X17, XZR\n\t"
+            "MOV X18, XZR\n\t"
+            "MOV X19, XZR\n\t"
+            "MOV X20, XZR\n\t"
+            "MOV X21, XZR\n\t"
+            "MOV X22, XZR\n\t"
+            "MOV X23, XZR\n\t"
+            "MOV X24, XZR\n\t"
+            "MOV X25, XZR\n\t"
+            "MOV X26, XZR\n\t"
+            "MOV X27, XZR\n\t"
+            "MOV X28, XZR\n\t"
+            "MOV X29, XZR\n\t"
+            "MSR FPSR, XZR\n\t"
+            "MSR FPCR, XZR\n\t"
+            "DUP V0.16B, WZR\n\t"
+            "DUP V1.16B, WZR\n\t"
+            "DUP V2.16B, WZR\n\t"
+            "DUP V3.16B, WZR\n\t"
+            "DUP V4.16B, WZR\n\t"
+            "DUP V5.16B, WZR\n\t"
+            "DUP V6.16B, WZR\n\t"
+            "DUP V7.16B, WZR\n\t"
+            "DUP V8.16B, WZR\n\t"
+            "DUP V9.16B, WZR\n\t"
+            "DUP V10.16B, WZR\n\t"
+            "DUP V11.16B, WZR\n\t"
+            "DUP V12.16B, WZR\n\t"
+            "DUP V13.16B, WZR\n\t"
+            "DUP V14.16B, WZR\n\t"
+            "DUP V15.16B, WZR\n\t"
+            "DUP V16.16B, WZR\n\t"
+            "DUP V17.16B, WZR\n\t"
+            "DUP V18.16B, WZR\n\t"
+            "DUP V19.16B, WZR\n\t"
+            "DUP V20.16B, WZR\n\t"
+            "DUP V21.16B, WZR\n\t"
+            "DUP V22.16B, WZR\n\t"
+            "DUP V23.16B, WZR\n\t"
+            "DUP V24.16B, WZR\n\t"
+            "DUP V25.16B, WZR\n\t"
+            "DUP V26.16B, WZR\n\t"
+            "DUP V27.16B, WZR\n\t"
+            "DUP V28.16B, WZR\n\t"
+            "DUP V29.16B, WZR\n\t"
+            "DUP V30.16B, WZR\n\t"
+            "DUP V31.16B, WZR\n\t"
+            "RET"
+            :
+            : "r"(&ctx), "r"(stackTop), "r"(entry), "r"(entryArgument), "r"(handle)
+            : "x0", "x1", "lr"
+            );
+
+            __builtin_unreachable();
+        } catch (const std::exception &e) {
+            state.logger->Error(e.what());
+            if (id) {
+                signal::BlockSignal({SIGINT});
+                state.process->Kill(false);
+            }
+            abi::__cxa_end_catch();
+            std::longjmp(originalCtx, true);
+        } catch (const signal::SignalException &e) {
+            if (e.signal != SIGINT) {
+                state.logger->Error(e.what());
+                if (id) {
+                    signal::BlockSignal({SIGINT});
+                    state.process->Kill(false);
+                }
+            }
+            abi::__cxa_end_catch();
+            std::longjmp(originalCtx, true);
+        }
    }

    void KThread::Start(bool self) {
--- a/app/src/main/cpp/skyline/kernel/types/KThread.h
+++ b/app/src/main/cpp/skyline/kernel/types/KThread.h
@ -42,6 +42,8 @@ namespace skyline {
            i8 idealCore; //!< The ideal CPU core for this thread to run on
            i8 coreId; //!< The CPU core on which this thread is running
            CoreMask affinityMask{}; //!< A mask of CPU cores this thread is allowed to run on
+            u64 timesliceStart{}; //!< Start of the scheduler timeslice
+            u64 averageTimeslice{}; //!< A weighted average of the timeslice duration for this thread

            KThread(const DeviceState &state, KHandle handle, KProcess *parent, size_t id, void *entry, u64 argument, void *stackTop, i8 priority, i8 idealCore);

--- a/app/src/main/cpp/skyline/os.h
+++ b/app/src/main/cpp/skyline/os.h
@ -29,14 +29,5 @@ namespace skyline::kernel {
         * @param romType The type of the ROM file
         */
        void Execute(int romFd, loader::RomFormat romType);
-
-        /**
-         * @brief Creates a new process
-         * @param entry The entry point for the new process
-         * @param argument The argument for the initial function
-         * @param stackSize The size of the main stack
-         * @return An instance of the KProcess of the created process
-         */
-        std::shared_ptr<type::KProcess> CreateProcess(u64 entry, u64 argument, size_t stackSize);
    };
 }