9 files changed, 88 insertions, 47 deletions
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
index cea7f0fb1..c8f6dc765 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -128,6 +128,7 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
     if (page_table) {
         config.page_table = reinterpret_cast<std::array<std::uint8_t*, NUM_PAGE_TABLE_ENTRIES>*>(
             page_table->pointers.data());
+        config.fastmem_pointer = page_table->fastmem_arena;
     }
     config.absolute_offset_page_table = true;
     config.page_table_pointer_mask_bits = Common::PageTable::ATTRIBUTE_BITS;
@@ -143,7 +144,7 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
 
     // Code cache size
     config.code_cache_size = 512 * 1024 * 1024;
-    config.far_code_offset = 256 * 1024 * 1024;
+    config.far_code_offset = 400 * 1024 * 1024;
 
     // Safe optimizations
     if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::DebugMode) {
@@ -171,6 +172,9 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
         if (!Settings::values.cpuopt_reduce_misalign_checks) {
             config.only_detect_misalignment_via_page_table_on_page_boundary = false;
         }
+        if (!Settings::values.cpuopt_fastmem) {
+            config.fastmem_pointer = nullptr;
+        }
     }
 
     // Unsafe optimizations
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
index 63193dcb1..ba524cd05 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -160,6 +160,10 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
         config.absolute_offset_page_table = true;
         config.detect_misaligned_access_via_page_table = 16 | 32 | 64 | 128;
         config.only_detect_misalignment_via_page_table_on_page_boundary = true;
+
+        config.fastmem_pointer = page_table->fastmem_arena;
+        config.fastmem_address_space_bits = address_space_bits;
+        config.silently_mirror_fastmem = false;
     }
 
     // Multi-process state
@@ -181,7 +185,7 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
 
     // Code cache size
     config.code_cache_size = 512 * 1024 * 1024;
-    config.far_code_offset = 256 * 1024 * 1024;
+    config.far_code_offset = 400 * 1024 * 1024;
 
     // Safe optimizations
     if (Settings::values.cpu_accuracy.GetValue() == Settings::CPUAccuracy::DebugMode) {
@@ -209,6 +213,9 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
         if (!Settings::values.cpuopt_reduce_misalign_checks) {
             config.only_detect_misalignment_via_page_table_on_page_boundary = false;
         }
+        if (!Settings::values.cpuopt_fastmem) {
+            config.fastmem_pointer = nullptr;
+        }
     }
 
     // Unsafe optimizations
@@ -223,6 +230,9 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
         if (Settings::values.cpuopt_unsafe_inaccurate_nan.GetValue()) {
             config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
         }
+        if (Settings::values.cpuopt_unsafe_fastmem_check.GetValue()) {
+            config.fastmem_address_space_bits = 64;
+        }
     }
 
     return std::make_shared<Dynarmic::A64::Jit>(config);
diff --git a/src/core/device_memory.cpp b/src/core/device_memory.cpp
index 0c4b440ed..f19c0515f 100644
--- a/src/core/device_memory.cpp
+++ b/src/core/device_memory.cpp
@@ -6,7 +6,7 @@
 
 namespace Core {
 
-DeviceMemory::DeviceMemory() : buffer{DramMemoryMap::Size} {}
+DeviceMemory::DeviceMemory() : buffer{DramMemoryMap::Size, 1ULL << 39} {}
 DeviceMemory::~DeviceMemory() = default;
 
 } // namespace Core
diff --git a/src/core/device_memory.h b/src/core/device_memory.h
index 5b1ae28f3..c4d17705f 100644
--- a/src/core/device_memory.h
+++ b/src/core/device_memory.h
@@ -5,7 +5,7 @@
 #pragma once
 
 #include "common/common_types.h"
-#include "common/virtual_buffer.h"
+#include "common/host_memory.h"
 
 namespace Core {
 
@@ -21,27 +21,30 @@ enum : u64 {
 };
 }; // namespace DramMemoryMap
 
-class DeviceMemory : NonCopyable {
+class DeviceMemory {
 public:
     explicit DeviceMemory();
     ~DeviceMemory();
 
+    DeviceMemory& operator=(const DeviceMemory&) = delete;
+    DeviceMemory(const DeviceMemory&) = delete;
+
     template <typename T>
     PAddr GetPhysicalAddr(const T* ptr) const {
-        return (reinterpret_cast<uintptr_t>(ptr) - reinterpret_cast<uintptr_t>(buffer.data())) +
+        return (reinterpret_cast<uintptr_t>(ptr) -
+                reinterpret_cast<uintptr_t>(buffer.BackingBasePointer())) +
                DramMemoryMap::Base;
     }
 
     u8* GetPointer(PAddr addr) {
-        return buffer.data() + (addr - DramMemoryMap::Base);
+        return buffer.BackingBasePointer() + (addr - DramMemoryMap::Base);
     }
 
     const u8* GetPointer(PAddr addr) const {
-        return buffer.data() + (addr - DramMemoryMap::Base);
+        return buffer.BackingBasePointer() + (addr - DramMemoryMap::Base);
     }
 
-private:
-    Common::VirtualBuffer<u8> buffer;
+    Common::HostMemory buffer; // Public: Memory::Impl calls Map/Unmap/Protect on it.
 };
 
 } // namespace Core
diff --git a/src/core/hle/kernel/k_light_condition_variable.h b/src/core/hle/kernel/k_light_condition_variable.h
index ca2e539a7..a95fa41f3 100644
--- a/src/core/hle/kernel/k_light_condition_variable.h
+++ b/src/core/hle/kernel/k_light_condition_variable.h
@@ -18,41 +18,58 @@ class KernelCore;
 
 class KLightConditionVariable {
 public:
-    explicit KLightConditionVariable(KernelCore& kernel_)
-        : thread_queue(kernel_), kernel(kernel_) {}
+    explicit KLightConditionVariable(KernelCore& kernel_) : kernel{kernel_} {}
 
-    void Wait(KLightLock* lock, s64 timeout = -1) {
-        WaitImpl(lock, timeout);
-        lock->Lock();
+    void Wait(KLightLock* lock, s64 timeout = -1, bool allow_terminating_thread = true) {
+        WaitImpl(lock, timeout, allow_terminating_thread);
     }
 
     void Broadcast() {
         KScopedSchedulerLock lk{kernel};
-        while (thread_queue.WakeupFrontThread() != nullptr) {
-            // We want to signal all threads, and so should continue waking up until there's nothing
-            // to wake.
+
+        // Wake every waiter; each one erases itself from wait_list in WaitImpl().
+        for (auto& thread : wait_list) {
+            thread.SetState(ThreadState::Runnable);
         }
     }
 
 private:
-    void WaitImpl(KLightLock* lock, s64 timeout) {
+    void WaitImpl(KLightLock* lock, s64 timeout, bool allow_terminating_thread) {
         KThread* owner = GetCurrentThreadPointer(kernel);
 
         // Sleep the thread.
         {
-            KScopedSchedulerLockAndSleep lk(kernel, owner, timeout);
-            lock->Unlock();
+            KScopedSchedulerLockAndSleep lk{kernel, owner, timeout};
 
-            if (!thread_queue.SleepThread(owner)) {
+            if (!allow_terminating_thread && owner->IsTerminationRequested()) {
                 lk.CancelSleep();
                 return;
             }
+
+            lock->Unlock();
+
+            // Set the thread as waiting.
+            GetCurrentThread(kernel).SetState(ThreadState::Waiting);
+
+            // Add the thread to the wait list so Broadcast() can find it.
+            wait_list.push_back(GetCurrentThread(kernel));
+        }
+
+        // Remove the thread from the wait list.
+        {
+            KScopedSchedulerLock sl{kernel};
+
+            wait_list.erase(wait_list.iterator_to(GetCurrentThread(kernel)));
         }
 
         // Cancel the task that the sleep setup.
         kernel.TimeManager().UnscheduleTimeEvent(owner);
+
+        // Re-acquire the lock.
+        lock->Lock();
     }
-    KThreadQueue thread_queue;
+
     KernelCore& kernel;
+    KThread::WaiterList wait_list{};
 };
 } // namespace Kernel
diff --git a/src/core/hle/kernel/k_light_lock.cpp b/src/core/hle/kernel/k_light_lock.cpp
index f974022e8..0896e705f 100644
--- a/src/core/hle/kernel/k_light_lock.cpp
+++ b/src/core/hle/kernel/k_light_lock.cpp
@@ -59,11 +59,7 @@ void KLightLock::LockSlowPath(uintptr_t _owner, uintptr_t _cur_thread) {
         owner_thread->AddWaiter(cur_thread);
 
         // Set thread states.
-        if (cur_thread->GetState() == ThreadState::Runnable) {
-            cur_thread->SetState(ThreadState::Waiting);
-        } else {
-            KScheduler::SetSchedulerUpdateNeeded(kernel);
-        }
+        cur_thread->SetState(ThreadState::Waiting);
 
         if (owner_thread->IsSuspended()) {
             owner_thread->ContinueIfHasKernelWaiters();
@@ -73,10 +69,9 @@ void KLightLock::LockSlowPath(uintptr_t _owner, uintptr_t _cur_thread) {
     // We're no longer waiting on the lock owner.
     {
         KScopedSchedulerLock sl{kernel};
-        KThread* owner_thread = cur_thread->GetLockOwner();
-        if (owner_thread) {
+
+        if (KThread* owner_thread = cur_thread->GetLockOwner(); owner_thread != nullptr) {
             owner_thread->RemoveWaiter(cur_thread);
-            KScheduler::SetSchedulerUpdateNeeded(kernel);
         }
     }
 }
@@ -95,17 +90,13 @@ void KLightLock::UnlockSlowPath(uintptr_t _cur_thread) {
 
         // Pass the lock to the next owner.
         uintptr_t next_tag = 0;
-        if (next_owner) {
+        if (next_owner != nullptr) {
             next_tag = reinterpret_cast<uintptr_t>(next_owner);
             if (num_waiters > 1) {
                 next_tag |= 0x1;
             }
 
-            if (next_owner->GetState() == ThreadState::Waiting) {
-                next_owner->SetState(ThreadState::Runnable);
-            } else {
-                KScheduler::SetSchedulerUpdateNeeded(kernel);
-            }
+            next_owner->SetState(ThreadState::Runnable);
 
             if (next_owner->IsSuspended()) {
                 next_owner->ContinueIfHasKernelWaiters();
diff --git a/src/core/hle/kernel/k_process.cpp b/src/core/hle/kernel/k_process.cpp
index 06b8ce151..d1bd98051 100644
--- a/src/core/hle/kernel/k_process.cpp
+++ b/src/core/hle/kernel/k_process.cpp
@@ -201,17 +201,15 @@ bool KProcess::ReleaseUserException(KThread* thread) {
 
         // Remove waiter thread.
         s32 num_waiters{};
-        KThread* next = thread->RemoveWaiterByKey(
-            std::addressof(num_waiters),
-            reinterpret_cast<uintptr_t>(std::addressof(exception_thread)));
-        if (next != nullptr) {
-            if (next->GetState() == ThreadState::Waiting) {
-                next->SetState(ThreadState::Runnable);
-            } else {
-                KScheduler::SetSchedulerUpdateNeeded(kernel);
-            }
+        if (KThread* next = thread->RemoveWaiterByKey(
+                std::addressof(num_waiters),
+                reinterpret_cast<uintptr_t>(std::addressof(exception_thread)));
+            next != nullptr) {
+            next->SetState(ThreadState::Runnable);
         }
 
+        KScheduler::SetSchedulerUpdateNeeded(kernel);
+
         return true;
     } else {
         return false;
diff --git a/src/core/hle/kernel/k_resource_limit.cpp b/src/core/hle/kernel/k_resource_limit.cpp
index f91cb65dc..da88f35bc 100644
--- a/src/core/hle/kernel/k_resource_limit.cpp
+++ b/src/core/hle/kernel/k_resource_limit.cpp
@@ -117,7 +117,7 @@ bool KResourceLimit::Reserve(LimitableResource which, s64 value, s64 timeout) {
         if (current_hints[index] + value <= limit_values[index] &&
             (timeout < 0 || core_timing->GetGlobalTimeNs().count() < timeout)) {
             waiter_count++;
-            cond_var.Wait(&lock, timeout);
+            cond_var.Wait(&lock, timeout, false);
             waiter_count--;
         } else {
             break;
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 9857278f6..f285c6f63 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -12,6 +12,7 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "common/page_table.h"
+#include "common/settings.h"
 #include "common/swap.h"
 #include "core/arm/arm_interface.h"
 #include "core/core.h"
@@ -32,6 +33,7 @@ struct Memory::Impl {
 
     void SetCurrentPageTable(Kernel::KProcess& process, u32 core_id) {
         current_page_table = &process.PageTable().PageTableImpl();
+        current_page_table->fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer();
 
         const std::size_t address_space_width = process.PageTable().GetAddressSpaceWidth();
 
@@ -41,13 +43,23 @@ struct Memory::Impl {
     void MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, PAddr target) {
         ASSERT_MSG((size & PAGE_MASK) == 0, "non-page aligned size: {:016X}", size);
         ASSERT_MSG((base & PAGE_MASK) == 0, "non-page aligned base: {:016X}", base);
+        ASSERT_MSG(target >= DramMemoryMap::Base && target < DramMemoryMap::End,
+                   "Out of bounds target: {:016X}", target);
         MapPages(page_table, base / PAGE_SIZE, size / PAGE_SIZE, target, Common::PageType::Memory);
+
+        if (Settings::IsFastmemEnabled()) { // Mirror the mapping on the host side for fastmem.
+            system.DeviceMemory().buffer.Map(base, target - DramMemoryMap::Base, size);
+        }
     }
 
     void UnmapRegion(Common::PageTable& page_table, VAddr base, u64 size) {
         ASSERT_MSG((size & PAGE_MASK) == 0, "non-page aligned size: {:016X}", size);
         ASSERT_MSG((base & PAGE_MASK) == 0, "non-page aligned base: {:016X}", base);
         MapPages(page_table, base / PAGE_SIZE, size / PAGE_SIZE, 0, Common::PageType::Unmapped);
+
+        if (Settings::IsFastmemEnabled()) { // Keep the host-side fastmem mapping in sync.
+            system.DeviceMemory().buffer.Unmap(base, size);
+        }
     }
 
     bool IsValidVirtualAddress(const Kernel::KProcess& process, const VAddr vaddr) const {
@@ -466,6 +478,12 @@ struct Memory::Impl {
         if (vaddr == 0) {
             return;
         }
+
+        if (Settings::IsFastmemEnabled()) {
+            const bool is_read_enable = Settings::IsGPULevelHigh() || !cached;
+            system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached);
+        }
+
         // Iterate over a contiguous CPU address space, which corresponds to the specified GPU
         // address space, marking the region as un/cached. The region is marked un/cached at a
         // granularity of CPU pages, hence why we iterate on a CPU page basis (note: GPU page size