5 files changed, 181 insertions, 9 deletions
diff --git a/src/common/thread_queue_list.h b/src/common/thread_queue_list.h
index 133122c5f..e7594db68 100644
--- a/src/common/thread_queue_list.h
+++ b/src/common/thread_queue_list.h
@@ -49,6 +49,22 @@ struct ThreadQueueList {
         return T();
     }
 
+    /// Returns the first queued item for which `filter` returns true, or T() if there is none.
+    template <typename UnaryPredicate>
+    T get_first_filter(UnaryPredicate filter) const {
+        // Walk the chain of queues linked through next_nonempty, starting at `first`.
+        for (const Queue* queue = first; queue != nullptr; queue = queue->next_nonempty) {
+            // An empty queue yields no iterations, so no separate emptiness test is needed.
+            for (const auto& entry : queue->data) {
+                if (filter(entry)) {
+                    return entry;
+                }
+            }
+        }
+
+        return T();
+    }
+
     T pop_first() {
         Queue* cur = first;
         while (cur != nullptr) {
diff --git a/src/core/hle/kernel/scheduler.cpp b/src/core/hle/kernel/scheduler.cpp
index 5a5f4cef1..df4d6cf0a 100644
--- a/src/core/hle/kernel/scheduler.cpp
+++ b/src/core/hle/kernel/scheduler.cpp
@@ -9,6 +9,7 @@
 #include "common/logging/log.h"
 #include "core/arm/arm_interface.h"
 #include "core/core.h"
+#include "core/core_cpu.h"
 #include "core/core_timing.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/process.h"
@@ -179,4 +180,69 @@ void Scheduler::SetThreadPriority(Thread* thread, u32 priority) {
         ready_queue.prepare(priority);
 }
 
+Thread* Scheduler::GetNextSuggestedThread(u32 core, u32 maximum_priority) const {
+    std::lock_guard<std::mutex> lock(scheduler_mutex);
+    // Accept only threads allowed on `core` whose priority value is below maximum_priority.
+    const u32 core_bit = 1U << core;
+    return ready_queue.get_first_filter([core_bit, maximum_priority](Thread const* cand) {
+        return (cand->GetAffinityMask() & core_bit) != 0 && cand->GetPriority() < maximum_priority;
+    });
+}
+
+void Scheduler::YieldWithoutLoadBalancing(Thread* thread) {
+    ASSERT(thread != nullptr);
+    // Avoid yielding if the thread isn't even running.
+    ASSERT(thread->GetStatus() == ThreadStatus::Running);
+
+    // Sanity check that the priority is below the number of priority levels.
+    ASSERT(thread->GetPriority() < THREADPRIO_COUNT);
+    // Yield -- sleep for zero time and wake immediately to force a reschedule.
+    // NOTE(review): this acts on GetCurrentThread(), not `thread` -- confirm they always match.
+    WaitCurrentThread_Sleep();
+    GetCurrentThread()->WakeAfterDelay(0);
+}
+
+// Yields via a zero-time sleep, then migrates the best candidate from another core to this core.
+void Scheduler::YieldWithLoadBalancing(Thread* thread) {
+    ASSERT(thread != nullptr);
+    const auto priority = thread->GetPriority();
+    const auto core = static_cast<u32>(thread->GetProcessorID());
+
+    // Avoid yielding if the thread isn't even running.
+    ASSERT(thread->GetStatus() == ThreadStatus::Running);
+
+    // Sanity check that the priority is valid
+    ASSERT(priority < THREADPRIO_COUNT);
+
+    // Sleep for zero time to be able to force reschedule to different thread
+    WaitCurrentThread_Sleep();
+    GetCurrentThread()->WakeAfterDelay(0);
+
+    Thread* suggested_thread = nullptr;
+    // Search through all of the cpu cores (except this one) for a suggested thread,
+    // keeping the candidate with the lowest priority value seen so far.
+    for (unsigned cur_core = 0; cur_core < Core::NUM_CPU_CORES; ++cur_core) {
+        // Skip the yielding thread's own core; candidates must come from another core.
+        if (cur_core == core)
+            continue;
+
+        const auto res =
+            Core::System::GetInstance().CpuCore(cur_core).Scheduler().GetNextSuggestedThread(
+                core, priority);
+        // Keep the candidate if it is the first valid one or beats the current suggestion.
+        if (res != nullptr && (suggested_thread == nullptr ||
+                               suggested_thread->GetPriority() > res->GetPriority())) {
+            suggested_thread = res;
+        }
+    }
+
+    // If a suggested thread was found, queue that for this core
+    if (suggested_thread != nullptr)
+        suggested_thread->ChangeCore(core, suggested_thread->GetAffinityMask());
+}
+// Not implemented yet: calling this only triggers UNIMPLEMENTED_MSG; `thread` is unused.
+void Scheduler::YieldAndWaitForLoadBalancing(Thread* thread) {
+    UNIMPLEMENTED_MSG("Wait for load balancing thread yield type is not implemented!");
+}
+
 } // namespace Kernel
diff --git a/src/core/hle/kernel/scheduler.h b/src/core/hle/kernel/scheduler.h
index c63032b7d..97ced4dfc 100644
--- a/src/core/hle/kernel/scheduler.h
+++ b/src/core/hle/kernel/scheduler.h
@@ -51,6 +51,75 @@ public:
     /// Sets the priority of a thread in the scheduler
     void SetThreadPriority(Thread* thread, u32 priority);
 
+    /// Gets the first ready thread that may run on `core` with a priority value below maximum_priority
+    Thread* GetNextSuggestedThread(u32 core, u32 maximum_priority) const;
+
+    /**
+     * YieldWithoutLoadBalancing -- analogous to normal yield on a system
+     * Moves the thread to the end of the ready queue for its priority, and then reschedules the
+     * system to the new head of the queue.
+     *
+     * Example (Single Core -- but can be extrapolated to multi):
+     * ready_queue[prio=0]: ThreadA, ThreadB, ThreadC (->exec order->)
+     * Currently Running: ThreadR
+     *
+     * ThreadR calls YieldWithoutLoadBalancing
+     *
+     * ThreadR is moved to the end of ready_queue[prio=0]:
+     * ready_queue[prio=0]: ThreadA, ThreadB, ThreadC, ThreadR (->exec order->)
+     * Currently Running: Nothing
+     *
+     * System is rescheduled (ThreadA is popped off of queue):
+     * ready_queue[prio=0]: ThreadB, ThreadC, ThreadR (->exec order->)
+     * Currently Running: ThreadA
+     *
+     * If the queue is empty at time of call, no yielding occurs (NOTE(review): not checked
+     * explicitly by the implementation -- confirm). This never crosses cores or priorities.
+     */
+    void YieldWithoutLoadBalancing(Thread* thread);
+
+    /**
+     * YieldWithLoadBalancing -- yield but with better selection of the new running thread
+     * Moves the current thread to the end of the ready queue for its priority, then selects a
+     * 'suggested thread' (a thread on a different core that could run on this core) from the
+     * scheduler, changes its core, and reschedules the current core to that thread.
+     *
+     * Example (Dual Core -- can be extrapolated to Quad Core, this is just normal yield if it were
+     * single core):
+     * ready_queue[core=0][prio=0]: ThreadA, ThreadB (affinities not pictured as irrelevant)
+     * ready_queue[core=1][prio=0]: ThreadC[affinity=both], ThreadD[affinity=core1only]
+     * Currently Running: ThreadQ on Core 0 || ThreadP on Core 1
+     *
+     * ThreadQ calls YieldWithLoadBalancing
+     *
+     * ThreadQ is moved to the end of ready_queue[core=0][prio=0]:
+     * ready_queue[core=0][prio=0]: ThreadA, ThreadB, ThreadQ
+     * ready_queue[core=1][prio=0]: ThreadC[affinity=both], ThreadD[affinity=core1only]
+     * Currently Running: None on Core 0 || ThreadP on Core 1
+     *
+     * A list of suggested threads for each core is compiled
+     * Suggested Threads: {ThreadC on Core 1}
+     * If this were quad core (as the Switch is), there could be between 0 and 3 threads in this
+     * list. If there is more than one, the thread with the highest priority is selected.
+     *
+     * ThreadC is core changed to Core 0:
+     * ready_queue[core=0][prio=0]: ThreadC, ThreadA, ThreadB, ThreadQ
+     * ready_queue[core=1][prio=0]: ThreadD
+     * Currently Running: None on Core 0 || ThreadP on Core 1
+     *
+     * System is rescheduled (ThreadC is popped off of queue):
+     * ready_queue[core=0][prio=0]: ThreadA, ThreadB, ThreadQ
+     * ready_queue[core=1][prio=0]: ThreadD
+     * Currently Running: ThreadC on Core 0 || ThreadP on Core 1
+     *
+     * If no suggested threads can be found this will behave just as normal yield. If there are
+     * multiple candidates for the suggested thread on a core, the highest priority one is taken.
+     */
+    void YieldWithLoadBalancing(Thread* thread);
+
+    /// Behavior not yet understood -- only reports itself as unimplemented when called
+    void YieldAndWaitForLoadBalancing(Thread* thread);
+
     /// Returns a list of all threads managed by the scheduler
     const std::vector<SharedPtr<Thread>>& GetThreadList() const {
         return thread_list;
diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp
index 5d36792ca..348a22904 100644
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -1208,18 +1208,38 @@ static void ExitThread() {
 static void SleepThread(s64 nanoseconds) {
     LOG_TRACE(Kernel_SVC, "called nanoseconds={}", nanoseconds);
 
-    // Don't attempt to yield execution if there are no available threads to run,
-    // this way we avoid a useless reschedule to the idle thread.
-    if (nanoseconds == 0 && !Core::System::GetInstance().CurrentScheduler().HaveReadyThreads())
-        return;
+    enum class SleepType : s64 {
+        YieldWithoutLoadBalancing = 0,
+        YieldWithLoadBalancing = -1,
+        YieldAndWaitForLoadBalancing = -2,
+    };
 
-    // Sleep current thread and check for next thread to schedule
-    WaitCurrentThread_Sleep();
+    if (nanoseconds <= 0) {
+        auto& scheduler{Core::System::GetInstance().CurrentScheduler()};
+        switch (static_cast<SleepType>(nanoseconds)) {
+        case SleepType::YieldWithoutLoadBalancing:
+            scheduler.YieldWithoutLoadBalancing(GetCurrentThread());
+            break;
+        case SleepType::YieldWithLoadBalancing:
+            scheduler.YieldWithLoadBalancing(GetCurrentThread());
+            break;
+        case SleepType::YieldAndWaitForLoadBalancing:
+            scheduler.YieldAndWaitForLoadBalancing(GetCurrentThread());
+            break;
+        default:
+            UNREACHABLE_MSG("Unimplemented sleep yield type '{:016X}'!", nanoseconds);
+        }
+    } else {
+        // Sleep current thread and check for next thread to schedule
+        WaitCurrentThread_Sleep();
 
-    // Create an event to wake the thread up after the specified nanosecond delay has passed
-    GetCurrentThread()->WakeAfterDelay(nanoseconds);
+        // Create an event to wake the thread up after the specified nanosecond delay has passed
+        GetCurrentThread()->WakeAfterDelay(nanoseconds);
+    }
 
-    Core::System::GetInstance().PrepareReschedule();
+    // Request a reschedule on every CPU core, not only the current one
+    for (std::size_t i = 0; i < Core::NUM_CPU_CORES; ++i)
+        Core::System::GetInstance().CpuCore(i).PrepareReschedule();
 }
 
 /// Wait process wide key atomic
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index d384d50db..77aec099a 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -26,6 +26,7 @@ enum ThreadPriority : u32 {
     THREADPRIO_USERLAND_MAX = 24, ///< Highest thread priority for userland apps
     THREADPRIO_DEFAULT = 44,      ///< Default thread priority for userland apps
     THREADPRIO_LOWEST = 63,       ///< Lowest thread priority
+    THREADPRIO_COUNT = 64,        ///< Total number of possible thread priorities.
 };
 
 enum ThreadProcessorId : s32 {