109 files changed, 1985 insertions, 728 deletions
diff --git a/.ci/templates/build-single.yml b/.ci/templates/build-single.yml index 77eeb96b5..357731eb9 100644 --- a/.ci/templates/build-single.yml +++ b/.ci/templates/build-single.yml @@ -1,20 +1,22 @@ parameters: artifactSource: 'true' + cache: 'false' steps: - task: DockerInstaller@0 displayName: 'Prepare Environment' inputs: dockerVersion: '17.09.0-ce' -- task: CacheBeta@0 - displayName: 'Cache Build System' - inputs: - key: yuzu-v1-$(BuildName)-$(BuildSuffix)-$(CacheSuffix) - path: $(System.DefaultWorkingDirectory)/ccache - cacheHitVar: CACHE_RESTORED +- ${{ if eq(parameters.cache, 'true') }}: + - task: CacheBeta@0 + displayName: 'Cache Build System' + inputs: + key: yuzu-v1-$(BuildName)-$(BuildSuffix)-$(CacheSuffix) + path: $(System.DefaultWorkingDirectory)/ccache + cacheHitVar: CACHE_RESTORED - script: chmod a+x ./.ci/scripts/$(ScriptFolder)/exec.sh && ./.ci/scripts/$(ScriptFolder)/exec.sh displayName: 'Build' -- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && ./.ci/scripts/$(ScriptFolder)/upload.sh +- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && RELEASE_NAME=$(BuildName) ./.ci/scripts/$(ScriptFolder)/upload.sh displayName: 'Package Artifacts' - publish: artifacts artifact: 'yuzu-$(BuildName)-$(BuildSuffix)' diff --git a/.ci/templates/build-standard.yml b/.ci/templates/build-standard.yml index 9975f5c49..aa180894e 100644 --- a/.ci/templates/build-standard.yml +++ b/.ci/templates/build-standard.yml @@ -3,7 +3,7 @@ jobs: displayName: 'standard' pool: vmImage: ubuntu-latest - strategy: + strategy: maxParallel: 10 matrix: windows: @@ -19,4 +19,5 @@ jobs: needSubmodules: 'true' - template: ./build-single.yml parameters: - artifactSource: 'false'
\ No newline at end of file + artifactSource: 'false' + cache: $(parameters.cache)
\ No newline at end of file diff --git a/.ci/templates/build-testing.yml b/.ci/templates/build-testing.yml index 101e52996..a307addfd 100644 --- a/.ci/templates/build-testing.yml +++ b/.ci/templates/build-testing.yml @@ -3,19 +3,21 @@ jobs: displayName: 'testing' pool: vmImage: ubuntu-latest - strategy: - maxParallel: 10 + strategy: + maxParallel: 5 matrix: windows: BuildSuffix: 'windows-testing' ScriptFolder: 'windows' steps: + - script: sudo apt upgrade python3-pip && pip install requests urllib3 + displayName: 'Prepare Environment' - task: PythonScript@0 condition: eq(variables['Build.Reason'], 'PullRequest') displayName: 'Determine Testing Status' inputs: scriptSource: 'filePath' - scriptPath: '../scripts/merge/check-label-presence.py' + scriptPath: '.ci/scripts/merge/check-label-presence.py' arguments: '$(System.PullRequest.PullRequestNumber) create-testing-build' - ${{ if eq(variables.enabletesting, 'true') }}: - template: ./sync-source.yml @@ -27,4 +29,5 @@ jobs: matchLabel: 'testing-merge' - template: ./build-single.yml parameters: - artifactSource: 'false'
\ No newline at end of file + artifactSource: 'false' + cache: 'false' diff --git a/.ci/templates/release.yml b/.ci/templates/release.yml deleted file mode 100644 index 60bebd2aa..000000000 --- a/.ci/templates/release.yml +++ /dev/null @@ -1,29 +0,0 @@ -steps: - - task: DownloadPipelineArtifact@2 - displayName: 'Download Windows Release' - inputs: - artifactName: 'yuzu-$(BuildName)-windows-mingw' - buildType: 'current' - targetPath: '$(Build.ArtifactStagingDirectory)' - - task: DownloadPipelineArtifact@2 - displayName: 'Download Linux Release' - inputs: - artifactName: 'yuzu-$(BuildName)-linux' - buildType: 'current' - targetPath: '$(Build.ArtifactStagingDirectory)' - - task: DownloadPipelineArtifact@2 - displayName: 'Download Release Point' - inputs: - artifactName: 'yuzu-$(BuildName)-release-point' - buildType: 'current' - targetPath: '$(Build.ArtifactStagingDirectory)' - - script: echo '##vso[task.setvariable variable=tagcommit]' && cat $(Build.ArtifactStagingDirectory)/tag-commit.sha - displayName: 'Calculate Release Point' - - task: GitHubRelease@0 - inputs: - gitHubConnection: $(GitHubReleaseConnectionName) - repositoryName: '$(GitHubReleaseRepoName)' - action: 'create' - target: $(variables.tagcommit) - title: 'yuzu $(BuildName) #$(Build.BuildId)' - assets: '$(Build.ArtifactStagingDirectory)/*' diff --git a/.ci/yuzu-mainline.yml b/.ci/yuzu-mainline.yml index 164bcb165..2930a8564 100644 --- a/.ci/yuzu-mainline.yml +++ b/.ci/yuzu-mainline.yml @@ -21,3 +21,5 @@ stages: dependsOn: format jobs: - template: ./templates/build-standard.yml + parameters: + cache: 'true' diff --git a/.ci/yuzu-verify.yml b/.ci/yuzu-verify.yml index d01c1feed..5492e696a 100644 --- a/.ci/yuzu-verify.yml +++ b/.ci/yuzu-verify.yml @@ -15,4 +15,6 @@ stages: dependsOn: format jobs: - template: ./templates/build-standard.yml - - template: ./templates/build-testing.yml
\ No newline at end of file + parameters: + cache: 'false' + - template: ./templates/build-testing.yml @@ -2,6 +2,7 @@ yuzu emulator ============= [![Travis CI Build Status](https://travis-ci.org/yuzu-emu/yuzu.svg?branch=master)](https://travis-ci.org/yuzu-emu/yuzu) [![AppVeyor CI Build Status](https://ci.appveyor.com/api/projects/status/77k97svb2usreu68?svg=true)](https://ci.appveyor.com/project/bunnei/yuzu) +[![Azure Mainline CI Build Status](https://dev.azure.com/yuzu-emu/yuzu/_apis/build/status/yuzu%20mainline?branchName=master)](https://dev.azure.com/yuzu-emu/yuzu/) yuzu is an experimental open-source emulator for the Nintendo Switch from the creators of [Citra](https://citra-emu.org/). diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index f4325f0f8..5462decee 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -111,6 +111,8 @@ add_library(core STATIC frontend/scope_acquire_window_context.h gdbstub/gdbstub.cpp gdbstub/gdbstub.h + hardware_interrupt_manager.cpp + hardware_interrupt_manager.h hle/ipc.h hle/ipc_helpers.h hle/kernel/address_arbiter.cpp @@ -372,6 +374,7 @@ add_library(core STATIC hle/service/nvdrv/devices/nvmap.h hle/service/nvdrv/interface.cpp hle/service/nvdrv/interface.h + hle/service/nvdrv/nvdata.h hle/service/nvdrv/nvdrv.cpp hle/service/nvdrv/nvdrv.h hle/service/nvdrv/nvmemp.cpp diff --git a/src/core/core.cpp b/src/core/core.cpp index 4aceee785..20d64f3b0 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -19,6 +19,7 @@ #include "core/file_sys/vfs_concat.h" #include "core/file_sys/vfs_real.h" #include "core/gdbstub/gdbstub.h" +#include "core/hardware_interrupt_manager.h" #include "core/hle/kernel/client_port.h" #include "core/hle/kernel/kernel.h" #include "core/hle/kernel/process.h" @@ -151,7 +152,7 @@ struct System::Impl { if (!renderer->Init()) { return ResultStatus::ErrorVideoCore; } - + interrupt_manager = std::make_unique<Core::Hardware::InterruptManager>(system); gpu_core = VideoCore::CreateGPU(system); is_powered_on = true; @@ -298,6 +299,7 @@ struct System::Impl { std::unique_ptr<VideoCore::RendererBase> renderer; std::unique_ptr<Tegra::GPU> gpu_core; std::shared_ptr<Tegra::DebugContext> debug_context; + std::unique_ptr<Core::Hardware::InterruptManager> interrupt_manager; CpuCoreManager cpu_core_manager; bool is_powered_on = false; @@ -444,6 +446,14 @@ const Tegra::GPU& System::GPU() const { return *impl->gpu_core; } +Core::Hardware::InterruptManager& System::InterruptManager() { + return *impl->interrupt_manager; +} + +const Core::Hardware::InterruptManager& System::InterruptManager() const { + return *impl->interrupt_manager; +} + VideoCore::RendererBase& System::Renderer() { return *impl->renderer; } diff --git a/src/core/core.h b/src/core/core.h index 8ebb385ac..0138d93b0 100644 --- a/src/core/core.h +++ b/src/core/core.h @@ -70,6 +70,10 @@ namespace Core::Timing { class CoreTiming; } +namespace Core::Hardware { +class InterruptManager; +} + namespace Core { class ARM_Interface; @@ -234,6 +238,12 @@ public: /// Provides a constant reference to the core timing instance. const Timing::CoreTiming& CoreTiming() const; + /// Provides a reference to the interrupt manager instance. + Core::Hardware::InterruptManager& InterruptManager(); + + /// Provides a constant reference to the interrupt manager instance. + const Core::Hardware::InterruptManager& InterruptManager() const; + /// Provides a reference to the kernel instance. 
Kernel::KernelCore& Kernel(); diff --git a/src/core/hardware_interrupt_manager.cpp b/src/core/hardware_interrupt_manager.cpp new file mode 100644 index 000000000..c2115db2d --- /dev/null +++ b/src/core/hardware_interrupt_manager.cpp @@ -0,0 +1,30 @@ +// Copyright 2019 Yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "core/core.h" +#include "core/core_timing.h" +#include "core/hardware_interrupt_manager.h" +#include "core/hle/service/nvdrv/interface.h" +#include "core/hle/service/sm/sm.h" + +namespace Core::Hardware { + +InterruptManager::InterruptManager(Core::System& system_in) : system(system_in) { + gpu_interrupt_event = + system.CoreTiming().RegisterEvent("GPUInterrupt", [this](u64 message, s64) { + auto nvdrv = system.ServiceManager().GetService<Service::Nvidia::NVDRV>("nvdrv"); + const u32 syncpt = static_cast<u32>(message >> 32); + const u32 value = static_cast<u32>(message); + nvdrv->SignalGPUInterruptSyncpt(syncpt, value); + }); +} + +InterruptManager::~InterruptManager() = default; + +void InterruptManager::GPUInterruptSyncpt(const u32 syncpoint_id, const u32 value) { + const u64 msg = (static_cast<u64>(syncpoint_id) << 32ULL) | value; + system.CoreTiming().ScheduleEvent(10, gpu_interrupt_event, msg); +} + +} // namespace Core::Hardware diff --git a/src/core/hardware_interrupt_manager.h b/src/core/hardware_interrupt_manager.h new file mode 100644 index 000000000..494db883a --- /dev/null +++ b/src/core/hardware_interrupt_manager.h @@ -0,0 +1,31 @@ +// Copyright 2019 Yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/common_types.h" + +namespace Core { +class System; +} + +namespace Core::Timing { +struct EventType; +} + +namespace Core::Hardware { + +class InterruptManager { +public: + explicit InterruptManager(Core::System& system); + ~InterruptManager(); + + void GPUInterruptSyncpt(u32 syncpoint_id, u32 value); + +private: + Core::System& system; + Core::Timing::EventType* gpu_interrupt_event{}; +}; + +} // namespace Core::Hardware diff --git a/src/core/hle/service/nvdrv/devices/nvdevice.h b/src/core/hle/service/nvdrv/devices/nvdevice.h index 4f6042b00..5b8248433 100644 --- a/src/core/hle/service/nvdrv/devices/nvdevice.h +++ b/src/core/hle/service/nvdrv/devices/nvdevice.h @@ -8,6 +8,11 @@ #include "common/bit_field.h" #include "common/common_types.h" #include "common/swap.h" +#include "core/hle/service/nvdrv/nvdata.h" + +namespace Core { +class System; +} namespace Service::Nvidia::Devices { @@ -15,7 +20,7 @@ namespace Service::Nvidia::Devices { /// implement the ioctl interface. class nvdevice { public: - nvdevice() = default; + explicit nvdevice(Core::System& system) : system{system} {}; virtual ~nvdevice() = default; union Ioctl { u32_le raw; @@ -33,7 +38,11 @@ public: * @param output A buffer where the output data will be written to. * @returns The result code of the ioctl. 
*/ - virtual u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) = 0; + virtual u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) = 0; + +protected: + Core::System& system; }; } // namespace Service::Nvidia::Devices diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp index 20c7c39aa..76494f0b7 100644 --- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp +++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp @@ -13,10 +13,12 @@ namespace Service::Nvidia::Devices { -nvdisp_disp0::nvdisp_disp0(std::shared_ptr<nvmap> nvmap_dev) : nvmap_dev(std::move(nvmap_dev)) {} +nvdisp_disp0::nvdisp_disp0(Core::System& system, std::shared_ptr<nvmap> nvmap_dev) + : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {} nvdisp_disp0 ::~nvdisp_disp0() = default; -u32 nvdisp_disp0::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) { +u32 nvdisp_disp0::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) { UNIMPLEMENTED_MSG("Unimplemented ioctl"); return 0; } @@ -34,9 +36,8 @@ void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u3 addr, offset, width, height, stride, static_cast<PixelFormat>(format), transform, crop_rect}; - auto& instance = Core::System::GetInstance(); - instance.GetPerfStats().EndGameFrame(); - instance.GPU().SwapBuffers(framebuffer); + system.GetPerfStats().EndGameFrame(); + system.GPU().SwapBuffers(framebuffer); } } // namespace Service::Nvidia::Devices diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h index 12f3ef825..e79e490ff 100644 --- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h +++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h @@ -17,10 +17,11 @@ class nvmap; class nvdisp_disp0 final : public nvdevice { public: - explicit nvdisp_disp0(std::shared_ptr<nvmap> nvmap_dev); + explicit nvdisp_disp0(Core::System& system, std::shared_ptr<nvmap> nvmap_dev); ~nvdisp_disp0() override; - u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override; + u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) override; /// Performs a screen flip, drawing the buffer pointed to by the handle. 
void flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u32 height, u32 stride, diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp index af62d33d2..24ab3f2e9 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp @@ -22,10 +22,12 @@ enum { }; } -nvhost_as_gpu::nvhost_as_gpu(std::shared_ptr<nvmap> nvmap_dev) : nvmap_dev(std::move(nvmap_dev)) {} +nvhost_as_gpu::nvhost_as_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev) + : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {} nvhost_as_gpu::~nvhost_as_gpu() = default; -u32 nvhost_as_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) { +u32 nvhost_as_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) { LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}", command.raw, input.size(), output.size()); @@ -65,7 +67,7 @@ u32 nvhost_as_gpu::AllocateSpace(const std::vector<u8>& input, std::vector<u8>& LOG_DEBUG(Service_NVDRV, "called, pages={:X}, page_size={:X}, flags={:X}", params.pages, params.page_size, params.flags); - auto& gpu = Core::System::GetInstance().GPU(); + auto& gpu = system.GPU(); const u64 size{static_cast<u64>(params.pages) * static_cast<u64>(params.page_size)}; if (params.flags & 1) { params.offset = gpu.MemoryManager().AllocateSpace(params.offset, size, 1); @@ -85,7 +87,7 @@ u32 nvhost_as_gpu::Remap(const std::vector<u8>& input, std::vector<u8>& output) std::vector<IoctlRemapEntry> entries(num_entries); std::memcpy(entries.data(), input.data(), input.size()); - auto& gpu = Core::System::GetInstance().GPU(); + auto& gpu = system.GPU(); for (const auto& entry : entries) { LOG_WARNING(Service_NVDRV, "remap entry, offset=0x{:X} handle=0x{:X} pages=0x{:X}", entry.offset, entry.nvmap_handle, entry.pages); @@ -136,7 +138,7 @@ u32 nvhost_as_gpu::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& ou // case to prevent unexpected behavior. 
ASSERT(object->id == params.nvmap_handle); - auto& gpu = Core::System::GetInstance().GPU(); + auto& gpu = system.GPU(); if (params.flags & 1) { params.offset = gpu.MemoryManager().MapBufferEx(object->addr, params.offset, object->size); @@ -173,8 +175,7 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& ou return 0; } - params.offset = Core::System::GetInstance().GPU().MemoryManager().UnmapBuffer(params.offset, - itr->second.size); + params.offset = system.GPU().MemoryManager().UnmapBuffer(params.offset, itr->second.size); buffer_mappings.erase(itr->second.offset); std::memcpy(output.data(), &params, output.size()); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h index eb14b1da8..30ca5f4c3 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h @@ -17,10 +17,11 @@ class nvmap; class nvhost_as_gpu final : public nvdevice { public: - explicit nvhost_as_gpu(std::shared_ptr<nvmap> nvmap_dev); + explicit nvhost_as_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev); ~nvhost_as_gpu() override; - u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override; + u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) override; private: enum class IoctlCommand : u32_le { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp index b39fb9ef9..9a66a5f88 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp @@ -7,14 +7,20 @@ #include "common/assert.h" #include "common/logging/log.h" +#include "core/core.h" +#include "core/hle/kernel/readable_event.h" +#include "core/hle/kernel/writable_event.h" #include "core/hle/service/nvdrv/devices/nvhost_ctrl.h" +#include "video_core/gpu.h" namespace Service::Nvidia::Devices { -nvhost_ctrl::nvhost_ctrl() = default; +nvhost_ctrl::nvhost_ctrl(Core::System& system, EventInterface& events_interface) + : nvdevice(system), events_interface{events_interface} {} nvhost_ctrl::~nvhost_ctrl() = default; -u32 nvhost_ctrl::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) { +u32 nvhost_ctrl::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) { LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}", command.raw, input.size(), output.size()); @@ -22,11 +28,15 @@ u32 nvhost_ctrl::ioctl(Ioctl command, const std::vector<u8>& input, std::vector< case IoctlCommand::IocGetConfigCommand: return NvOsGetConfigU32(input, output); case IoctlCommand::IocCtrlEventWaitCommand: - return IocCtrlEventWait(input, output, false); + return IocCtrlEventWait(input, output, false, ctrl); case IoctlCommand::IocCtrlEventWaitAsyncCommand: - return IocCtrlEventWait(input, output, true); + return IocCtrlEventWait(input, output, true, ctrl); case IoctlCommand::IocCtrlEventRegisterCommand: return IocCtrlEventRegister(input, output); + case IoctlCommand::IocCtrlEventUnregisterCommand: + return IocCtrlEventUnregister(input, output); + case IoctlCommand::IocCtrlEventSignalCommand: + return IocCtrlEventSignal(input, output); } UNIMPLEMENTED_MSG("Unimplemented ioctl"); return 0; @@ -41,23 +51,137 @@ u32 nvhost_ctrl::NvOsGetConfigU32(const std::vector<u8>& input, std::vector<u8>& } u32 nvhost_ctrl::IocCtrlEventWait(const std::vector<u8>& input,
std::vector<u8>& output, - bool is_async) { + bool is_async, IoctlCtrl& ctrl) { IocCtrlEventWaitParams params{}; std::memcpy(&params, input.data(), sizeof(params)); - LOG_WARNING(Service_NVDRV, - "(STUBBED) called, syncpt_id={}, threshold={}, timeout={}, is_async={}", - params.syncpt_id, params.threshold, params.timeout, is_async); + LOG_DEBUG(Service_NVDRV, "syncpt_id={}, threshold={}, timeout={}, is_async={}", + params.syncpt_id, params.threshold, params.timeout, is_async); - // TODO(Subv): Implement actual syncpt waiting. - params.value = 0; + if (params.syncpt_id >= MaxSyncPoints) { + return NvResult::BadParameter; + } + + auto& gpu = system.GPU(); + // This is mostly to take into account unimplemented features. As synced + // gpu is always synced. + if (!gpu.IsAsync()) { + return NvResult::Success; + } + auto lock = gpu.LockSync(); + const u32 current_syncpoint_value = gpu.GetSyncpointValue(params.syncpt_id); + const s32 diff = current_syncpoint_value - params.threshold; + if (diff >= 0) { + params.value = current_syncpoint_value; + std::memcpy(output.data(), &params, sizeof(params)); + return NvResult::Success; + } + const u32 target_value = current_syncpoint_value - diff; + + if (!is_async) { + params.value = 0; + } + + if (params.timeout == 0) { + std::memcpy(output.data(), &params, sizeof(params)); + return NvResult::Timeout; + } + + u32 event_id; + if (is_async) { + event_id = params.value & 0x00FF; + if (event_id >= MaxNvEvents) { + std::memcpy(output.data(), &params, sizeof(params)); + return NvResult::BadParameter; + } + } else { + if (ctrl.fresh_call) { + const auto result = events_interface.GetFreeEvent(); + if (result) { + event_id = *result; + } else { + LOG_CRITICAL(Service_NVDRV, "No Free Events available!"); + event_id = params.value & 0x00FF; + } + } else { + event_id = ctrl.event_id; + } + } + + EventState status = events_interface.status[event_id]; + if (event_id < MaxNvEvents || status == EventState::Free || status == EventState::Registered) { + events_interface.SetEventStatus(event_id, EventState::Waiting); + events_interface.assigned_syncpt[event_id] = params.syncpt_id; + events_interface.assigned_value[event_id] = target_value; + if (is_async) { + params.value = params.syncpt_id << 4; + } else { + params.value = ((params.syncpt_id & 0xfff) << 16) | 0x10000000; + } + params.value |= event_id; + events_interface.events[event_id].writable->Clear(); + gpu.RegisterSyncptInterrupt(params.syncpt_id, target_value); + if (!is_async && ctrl.fresh_call) { + ctrl.must_delay = true; + ctrl.timeout = params.timeout; + ctrl.event_id = event_id; + return NvResult::Timeout; + } + std::memcpy(output.data(), &params, sizeof(params)); + return NvResult::Timeout; + } std::memcpy(output.data(), &params, sizeof(params)); - return 0; + return NvResult::BadParameter; } u32 nvhost_ctrl::IocCtrlEventRegister(const std::vector<u8>& input, std::vector<u8>& output) { - LOG_WARNING(Service_NVDRV, "(STUBBED) called"); - // TODO(bunnei): Implement this.
- return 0; + IocCtrlEventRegisterParams params{}; + std::memcpy(&params, input.data(), sizeof(params)); + const u32 event_id = params.user_event_id & 0x00FF; + LOG_DEBUG(Service_NVDRV, " called, user_event_id: {:X}", event_id); + if (event_id >= MaxNvEvents) { + return NvResult::BadParameter; + } + if (events_interface.registered[event_id]) { + return NvResult::BadParameter; + } + events_interface.RegisterEvent(event_id); + return NvResult::Success; +} + +u32 nvhost_ctrl::IocCtrlEventUnregister(const std::vector<u8>& input, std::vector<u8>& output) { + IocCtrlEventUnregisterParams params{}; + std::memcpy(&params, input.data(), sizeof(params)); + const u32 event_id = params.user_event_id & 0x00FF; + LOG_DEBUG(Service_NVDRV, " called, user_event_id: {:X}", event_id); + if (event_id >= MaxNvEvents) { + return NvResult::BadParameter; + } + if (!events_interface.registered[event_id]) { + return NvResult::BadParameter; + } + events_interface.UnregisterEvent(event_id); + return NvResult::Success; +} + +u32 nvhost_ctrl::IocCtrlEventSignal(const std::vector<u8>& input, std::vector<u8>& output) { + IocCtrlEventSignalParams params{}; + std::memcpy(&params, input.data(), sizeof(params)); + // TODO(Blinkhawk): This is normally called when an NvEvents timeout on WaitSynchronization + // It is believed from RE to cancel the GPU Event. However, better research is required + u32 event_id = params.user_event_id & 0x00FF; + LOG_WARNING(Service_NVDRV, "(STUBBED) called, user_event_id: {:X}", event_id); + if (event_id >= MaxNvEvents) { + return NvResult::BadParameter; + } + if (events_interface.status[event_id] == EventState::Waiting) { + auto& gpu = system.GPU(); + if (gpu.CancelSyncptInterrupt(events_interface.assigned_syncpt[event_id], + events_interface.assigned_value[event_id])) { + events_interface.LiberateEvent(event_id); + events_interface.events[event_id].writable->Signal(); + } + } + return NvResult::Success; } } // namespace Service::Nvidia::Devices diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h index 6d0de2212..14e6e7e57 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h @@ -8,15 +8,17 @@ #include <vector> #include "common/common_types.h" #include "core/hle/service/nvdrv/devices/nvdevice.h" +#include "core/hle/service/nvdrv/nvdrv.h" namespace Service::Nvidia::Devices { class nvhost_ctrl final : public nvdevice { public: - nvhost_ctrl(); + explicit nvhost_ctrl(Core::System& system, EventInterface& events_interface); ~nvhost_ctrl() override; - u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override; + u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) override; private: enum class IoctlCommand : u32_le { @@ -132,9 +134,16 @@ private: u32 NvOsGetConfigU32(const std::vector<u8>& input, std::vector<u8>& output); - u32 IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>& output, bool is_async); + u32 IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>& output, bool is_async, + IoctlCtrl& ctrl); u32 IocCtrlEventRegister(const std::vector<u8>& input, std::vector<u8>& output); + + u32 IocCtrlEventUnregister(const std::vector<u8>& input, std::vector<u8>& output); + + u32 IocCtrlEventSignal(const std::vector<u8>& input, std::vector<u8>& output); + + EventInterface& events_interface; }; } // namespace Service::Nvidia::Devices diff --git
a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp index 0e28755bd..988effd90 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp @@ -12,10 +12,11 @@ namespace Service::Nvidia::Devices { -nvhost_ctrl_gpu::nvhost_ctrl_gpu() = default; +nvhost_ctrl_gpu::nvhost_ctrl_gpu(Core::System& system) : nvdevice(system) {} nvhost_ctrl_gpu::~nvhost_ctrl_gpu() = default; -u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) { +u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) { LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}", command.raw, input.size(), output.size()); @@ -185,7 +186,7 @@ u32 nvhost_ctrl_gpu::GetGpuTime(const std::vector<u8>& input, std::vector<u8>& o IoctlGetGpuTime params{}; std::memcpy(&params, input.data(), input.size()); - const auto ns = Core::Timing::CyclesToNs(Core::System::GetInstance().CoreTiming().GetTicks()); + const auto ns = Core::Timing::CyclesToNs(system.CoreTiming().GetTicks()); params.gpu_time = static_cast<u64_le>(ns.count()); std::memcpy(output.data(), &params, output.size()); return 0; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h index 240435eea..2b035ae3f 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h @@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices { class nvhost_ctrl_gpu final : public nvdevice { public: - nvhost_ctrl_gpu(); + explicit nvhost_ctrl_gpu(Core::System& system); ~nvhost_ctrl_gpu() override; - u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override; + u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) override; private: enum class IoctlCommand : u32_le { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 8ce7bc7a5..241dac881 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -13,10 +13,12 @@ namespace Service::Nvidia::Devices { -nvhost_gpu::nvhost_gpu(std::shared_ptr<nvmap> nvmap_dev) : nvmap_dev(std::move(nvmap_dev)) {} +nvhost_gpu::nvhost_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev) + : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {} nvhost_gpu::~nvhost_gpu() = default; -u32 nvhost_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) { +u32 nvhost_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) { LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}", command.raw, input.size(), output.size()); @@ -119,8 +121,10 @@ u32 nvhost_gpu::AllocGPFIFOEx2(const std::vector<u8>& input, std::vector<u8>& ou params.num_entries, params.flags, params.unk0, params.unk1, params.unk2, params.unk3); - params.fence_out.id = 0; - params.fence_out.value = 0; + auto& gpu = system.GPU(); + params.fence_out.id = assigned_syncpoints; + params.fence_out.value = gpu.GetSyncpointValue(assigned_syncpoints); + assigned_syncpoints++; std::memcpy(output.data(), &params, output.size()); return 0; } @@ -143,7 +147,7 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
IoctlSubmitGpfifo params{}; std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo)); LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}", - params.address, params.num_entries, params.flags); + params.address, params.num_entries, params.flags.raw); ASSERT_MSG(input.size() == sizeof(IoctlSubmitGpfifo) + params.num_entries * sizeof(Tegra::CommandListHeader), @@ -153,10 +157,18 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)], params.num_entries * sizeof(Tegra::CommandListHeader)); - Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries)); + UNIMPLEMENTED_IF(params.flags.add_wait.Value() != 0); + UNIMPLEMENTED_IF(params.flags.add_increment.Value() != 0); + + auto& gpu = system.GPU(); + u32 current_syncpoint_value = gpu.GetSyncpointValue(params.fence_out.id); + if (params.flags.increment.Value()) { + params.fence_out.value += current_syncpoint_value; + } else { + params.fence_out.value = current_syncpoint_value; + } + gpu.PushGPUEntries(std::move(entries)); - params.fence_out.id = 0; - params.fence_out.value = 0; std::memcpy(output.data(), &params, sizeof(IoctlSubmitGpfifo)); return 0; } @@ -168,16 +180,24 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output) IoctlSubmitGpfifo params{}; std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo)); LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}", - params.address, params.num_entries, params.flags); + params.address, params.num_entries, params.flags.raw); Tegra::CommandList entries(params.num_entries); Memory::ReadBlock(params.address, entries.data(), params.num_entries * sizeof(Tegra::CommandListHeader)); - Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries)); + UNIMPLEMENTED_IF(params.flags.add_wait.Value() != 0); + UNIMPLEMENTED_IF(params.flags.add_increment.Value() != 0); + + auto& gpu = system.GPU(); + u32 current_syncpoint_value = gpu.GetSyncpointValue(params.fence_out.id); + if (params.flags.increment.Value()) { + params.fence_out.value += current_syncpoint_value; + } else { + params.fence_out.value = current_syncpoint_value; + } + gpu.PushGPUEntries(std::move(entries)); - params.fence_out.id = 0; - params.fence_out.value = 0; std::memcpy(output.data(), &params, output.size()); return 0; } diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h index 62beb5c0c..d2e8fbae9 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h @@ -10,6 +10,7 @@ #include "common/common_types.h" #include "common/swap.h" #include "core/hle/service/nvdrv/devices/nvdevice.h" +#include "core/hle/service/nvdrv/nvdata.h" namespace Service::Nvidia::Devices { @@ -20,10 +21,11 @@ constexpr u32 NVGPU_IOCTL_CHANNEL_KICKOFF_PB(0x1b); class nvhost_gpu final : public nvdevice { public: - explicit nvhost_gpu(std::shared_ptr<nvmap> nvmap_dev); + explicit nvhost_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev); ~nvhost_gpu() override; - u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override; + u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) override; private: enum class IoctlCommand : u32_le { @@ -113,11 +115,7 @@ private: static_assert(sizeof(IoctlGetErrorNotification) == 16, "IoctlGetErrorNotification is incorrect size"); - struct IoctlFence { -
u32_le id; - u32_le value; - }; - static_assert(sizeof(IoctlFence) == 8, "IoctlFence is incorrect size"); + static_assert(sizeof(Fence) == 8, "Fence is incorrect size"); struct IoctlAllocGpfifoEx { u32_le num_entries; @@ -132,13 +130,13 @@ private: static_assert(sizeof(IoctlAllocGpfifoEx) == 32, "IoctlAllocGpfifoEx is incorrect size"); struct IoctlAllocGpfifoEx2 { - u32_le num_entries; // in - u32_le flags; // in - u32_le unk0; // in (1 works) - IoctlFence fence_out; // out - u32_le unk1; // in - u32_le unk2; // in - u32_le unk3; // in + u32_le num_entries; // in + u32_le flags; // in + u32_le unk0; // in (1 works) + Fence fence_out; // out + u32_le unk1; // in + u32_le unk2; // in + u32_le unk3; // in }; static_assert(sizeof(IoctlAllocGpfifoEx2) == 32, "IoctlAllocGpfifoEx2 is incorrect size"); @@ -153,10 +151,16 @@ private: struct IoctlSubmitGpfifo { u64_le address; // pointer to gpfifo entry structs u32_le num_entries; // number of fence objects being submitted - u32_le flags; - IoctlFence fence_out; // returned new fence object for others to wait on - }; - static_assert(sizeof(IoctlSubmitGpfifo) == 16 + sizeof(IoctlFence), + union { + u32_le raw; + BitField<0, 1, u32_le> add_wait; // append a wait sync_point to the list + BitField<1, 1, u32_le> add_increment; // append an increment to the list + BitField<2, 1, u32_le> new_hw_format; // Mostly ignored + BitField<8, 1, u32_le> increment; // increment the returned fence + } flags; + Fence fence_out; // returned new fence object for others to wait on + }; + static_assert(sizeof(IoctlSubmitGpfifo) == 16 + sizeof(Fence), "IoctlSubmitGpfifo is incorrect size"); struct IoctlGetWaitbase { @@ -184,6 +188,7 @@ private: u32 ChannelSetTimeout(const std::vector<u8>& input, std::vector<u8>& output); std::shared_ptr<nvmap> nvmap_dev; + u32 assigned_syncpoints{}; }; } // namespace Service::Nvidia::Devices diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp index f5e8ea7c3..f572ad30f 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp @@ -10,10 +10,11 @@ namespace Service::Nvidia::Devices { -nvhost_nvdec::nvhost_nvdec() = default; +nvhost_nvdec::nvhost_nvdec(Core::System& system) : nvdevice(system) {} nvhost_nvdec::~nvhost_nvdec() = default; -u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) { +u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) { LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}", command.raw, input.size(), output.size()); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h index 0e7b284f8..2710f0511 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h @@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices { class nvhost_nvdec final : public nvdevice { public: - nvhost_nvdec(); + explicit nvhost_nvdec(Core::System& system); ~nvhost_nvdec() override; - u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override; + u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) override; private: enum class IoctlCommand : u32_le { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp index 
3e0951ab0..38282956f 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp @@ -10,10 +10,11 @@ namespace Service::Nvidia::Devices { -nvhost_nvjpg::nvhost_nvjpg() = default; +nvhost_nvjpg::nvhost_nvjpg(Core::System& system) : nvdevice(system) {} nvhost_nvjpg::~nvhost_nvjpg() = default; -u32 nvhost_nvjpg::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) { +u32 nvhost_nvjpg::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) { LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}", command.raw, input.size(), output.size()); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h index 89fd5e95e..379766693 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h @@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices { class nvhost_nvjpg final : public nvdevice { public: - nvhost_nvjpg(); + explicit nvhost_nvjpg(Core::System& system); ~nvhost_nvjpg() override; - u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override; + u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) override; private: enum class IoctlCommand : u32_le { diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp index d544f0f31..70e8091db 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp @@ -10,10 +10,11 @@ namespace Service::Nvidia::Devices { -nvhost_vic::nvhost_vic() = default; +nvhost_vic::nvhost_vic(Core::System& system) : nvdevice(system) {} nvhost_vic::~nvhost_vic() = default; -u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) { +u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) { LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}", command.raw, input.size(), output.size()); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.h b/src/core/hle/service/nvdrv/devices/nvhost_vic.h index fc24c3f9c..7d111977e 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h @@ -13,10 +13,11 @@ namespace Service::Nvidia::Devices { class nvhost_vic final : public nvdevice { public: - nvhost_vic(); + explicit nvhost_vic(Core::System& system); ~nvhost_vic() override; - u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override; + u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) override; private: enum class IoctlCommand : u32_le { diff --git a/src/core/hle/service/nvdrv/devices/nvmap.cpp b/src/core/hle/service/nvdrv/devices/nvmap.cpp index 1ec796fc6..223b496b7 100644 --- a/src/core/hle/service/nvdrv/devices/nvmap.cpp +++ b/src/core/hle/service/nvdrv/devices/nvmap.cpp @@ -18,7 +18,7 @@ enum { }; } -nvmap::nvmap() = default; +nvmap::nvmap(Core::System& system) : nvdevice(system) {} nvmap::~nvmap() = default; VAddr nvmap::GetObjectAddress(u32 handle) const { @@ -28,7 +28,8 @@ VAddr nvmap::GetObjectAddress(u32 handle) const { return object->addr; } -u32 nvmap::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) { +u32 nvmap::ioctl(Ioctl 
command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) { switch (static_cast<IoctlCommand>(command.raw)) { case IoctlCommand::Create: return IocCreate(input, output); diff --git a/src/core/hle/service/nvdrv/devices/nvmap.h b/src/core/hle/service/nvdrv/devices/nvmap.h index 396230c19..bf4a101c2 100644 --- a/src/core/hle/service/nvdrv/devices/nvmap.h +++ b/src/core/hle/service/nvdrv/devices/nvmap.h @@ -16,13 +16,14 @@ namespace Service::Nvidia::Devices { class nvmap final : public nvdevice { public: - nvmap(); + explicit nvmap(Core::System& system); ~nvmap() override; /// Returns the allocated address of an nvmap object given its handle. VAddr GetObjectAddress(u32 handle) const; - u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override; + u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) override; /// Represents an nvmap object. struct Object { diff --git a/src/core/hle/service/nvdrv/interface.cpp b/src/core/hle/service/nvdrv/interface.cpp index b60fc748b..d5be64ed2 100644 --- a/src/core/hle/service/nvdrv/interface.cpp +++ b/src/core/hle/service/nvdrv/interface.cpp @@ -8,12 +8,18 @@ #include "core/hle/ipc_helpers.h" #include "core/hle/kernel/kernel.h" #include "core/hle/kernel/readable_event.h" +#include "core/hle/kernel/thread.h" #include "core/hle/kernel/writable_event.h" #include "core/hle/service/nvdrv/interface.h" +#include "core/hle/service/nvdrv/nvdata.h" #include "core/hle/service/nvdrv/nvdrv.h" namespace Service::Nvidia { +void NVDRV::SignalGPUInterruptSyncpt(const u32 syncpoint_id, const u32 value) { + nvdrv->SignalSyncpt(syncpoint_id, value); +} + void NVDRV::Open(Kernel::HLERequestContext& ctx) { LOG_DEBUG(Service_NVDRV, "called"); @@ -36,11 +42,31 @@ void NVDRV::Ioctl(Kernel::HLERequestContext& ctx) { std::vector<u8> output(ctx.GetWriteBufferSize()); + IoctlCtrl ctrl{}; + + u32 result = nvdrv->Ioctl(fd, command, ctx.ReadBuffer(), output, ctrl); + + if (ctrl.must_delay) { + ctrl.fresh_call = false; + ctx.SleepClientThread( + "NVServices::DelayedResponse", ctrl.timeout, + [=](Kernel::SharedPtr<Kernel::Thread> thread, Kernel::HLERequestContext& ctx, + Kernel::ThreadWakeupReason reason) { + IoctlCtrl ctrl2{ctrl}; + std::vector<u8> output2 = output; + u32 result = nvdrv->Ioctl(fd, command, ctx.ReadBuffer(), output2, ctrl2); + ctx.WriteBuffer(output2); + IPC::ResponseBuilder rb{ctx, 3}; + rb.Push(RESULT_SUCCESS); + rb.Push(result); + }, + nvdrv->GetEventWriteable(ctrl.event_id)); + } else { + ctx.WriteBuffer(output); + } IPC::ResponseBuilder rb{ctx, 3}; rb.Push(RESULT_SUCCESS); - rb.Push(nvdrv->Ioctl(fd, command, ctx.ReadBuffer(), output)); - - ctx.WriteBuffer(output); + rb.Push(result); } void NVDRV::Close(Kernel::HLERequestContext& ctx) { @@ -66,13 +92,19 @@ void NVDRV::Initialize(Kernel::HLERequestContext& ctx) { void NVDRV::QueryEvent(Kernel::HLERequestContext& ctx) { IPC::RequestParser rp{ctx}; u32 fd = rp.Pop<u32>(); - u32 event_id = rp.Pop<u32>(); + // TODO(Blinkhawk): Figure the meaning of the flag at bit 16 + u32 event_id = rp.Pop<u32>() & 0x000000FF; LOG_WARNING(Service_NVDRV, "(STUBBED) called, fd={:X}, event_id={:X}", fd, event_id); IPC::ResponseBuilder rb{ctx, 3, 1}; rb.Push(RESULT_SUCCESS); - rb.PushCopyObjects(query_event.readable); - rb.Push<u32>(0); + if (event_id < MaxNvEvents) { + rb.PushCopyObjects(nvdrv->GetEvent(event_id)); + rb.Push<u32>(NvResult::Success); + } else { + rb.Push<u32>(0); + rb.Push<u32>(NvResult::BadParameter); + } } void 
NVDRV::SetClientPID(Kernel::HLERequestContext& ctx) { @@ -127,10 +159,6 @@ NVDRV::NVDRV(std::shared_ptr<Module> nvdrv, const char* name) {13, &NVDRV::FinishInitialize, "FinishInitialize"}, }; RegisterHandlers(functions); - - auto& kernel = Core::System::GetInstance().Kernel(); - query_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic, - "NVDRV::query_event"); } NVDRV::~NVDRV() = default; diff --git a/src/core/hle/service/nvdrv/interface.h b/src/core/hle/service/nvdrv/interface.h index 5b4889910..10a0ecd52 100644 --- a/src/core/hle/service/nvdrv/interface.h +++ b/src/core/hle/service/nvdrv/interface.h @@ -19,6 +19,8 @@ public: NVDRV(std::shared_ptr<Module> nvdrv, const char* name); ~NVDRV() override; + void SignalGPUInterruptSyncpt(const u32 syncpoint_id, const u32 value); + private: void Open(Kernel::HLERequestContext& ctx); void Ioctl(Kernel::HLERequestContext& ctx); @@ -33,8 +35,6 @@ private: std::shared_ptr<Module> nvdrv; u64 pid{}; - - Kernel::EventPair query_event; }; } // namespace Service::Nvidia diff --git a/src/core/hle/service/nvdrv/nvdata.h b/src/core/hle/service/nvdrv/nvdata.h new file mode 100644 index 000000000..ac03cbc23 --- /dev/null +++ b/src/core/hle/service/nvdrv/nvdata.h @@ -0,0 +1,48 @@ +#pragma once + +#include <array> +#include "common/common_types.h" + +namespace Service::Nvidia { + +constexpr u32 MaxSyncPoints = 192; +constexpr u32 MaxNvEvents = 64; + +struct Fence { + s32 id; + u32 value; +}; + +static_assert(sizeof(Fence) == 8, "Fence has wrong size"); + +struct MultiFence { + u32 num_fences; + std::array<Fence, 4> fences; +}; + +enum NvResult : u32 { + Success = 0, + BadParameter = 4, + Timeout = 5, + ResourceError = 15, +}; + +enum class EventState { + Free = 0, + Registered = 1, + Waiting = 2, + Busy = 3, +}; + +struct IoctlCtrl { + // First call done to the servioce for services that call itself again after a call. 
+ bool fresh_call{true}; + // Tells the Ioctl Wrapper that it must delay the IPC response and send the thread to sleep + bool must_delay{}; + // Timeout for the delay + s64 timeout{}; + // NV Event Id + s32 event_id{-1}; +}; + +} // namespace Service::Nvidia diff --git a/src/core/hle/service/nvdrv/nvdrv.cpp b/src/core/hle/service/nvdrv/nvdrv.cpp index 6e4b8f2c6..2011a226a 100644 --- a/src/core/hle/service/nvdrv/nvdrv.cpp +++ b/src/core/hle/service/nvdrv/nvdrv.cpp @@ -4,7 +4,10 @@ #include <utility> +#include <fmt/format.h> #include "core/hle/ipc_helpers.h" +#include "core/hle/kernel/readable_event.h" +#include "core/hle/kernel/writable_event.h" #include "core/hle/service/nvdrv/devices/nvdevice.h" #include "core/hle/service/nvdrv/devices/nvdisp_disp0.h" #include "core/hle/service/nvdrv/devices/nvhost_as_gpu.h" @@ -22,8 +25,9 @@ namespace Service::Nvidia { -void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger) { - auto module_ = std::make_shared<Module>(); +void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger, + Core::System& system) { + auto module_ = std::make_shared<Module>(system); std::make_shared<NVDRV>(module_, "nvdrv")->InstallAsService(service_manager); std::make_shared<NVDRV>(module_, "nvdrv:a")->InstallAsService(service_manager); std::make_shared<NVDRV>(module_, "nvdrv:s")->InstallAsService(service_manager); @@ -32,17 +36,25 @@ void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger nvflinger.SetNVDrvInstance(module_); } -Module::Module() { - auto nvmap_dev = std::make_shared<Devices::nvmap>(); - devices["/dev/nvhost-as-gpu"] = std::make_shared<Devices::nvhost_as_gpu>(nvmap_dev); - devices["/dev/nvhost-gpu"] = std::make_shared<Devices::nvhost_gpu>(nvmap_dev); - devices["/dev/nvhost-ctrl-gpu"] = std::make_shared<Devices::nvhost_ctrl_gpu>(); +Module::Module(Core::System& system) { + auto& kernel = system.Kernel(); + for (u32 i = 0; i < MaxNvEvents; i++) { + std::string event_label = fmt::format("NVDRV::NvEvent_{}", i); + events_interface.events[i] = Kernel::WritableEvent::CreateEventPair( + kernel, Kernel::ResetType::Automatic, event_label); + events_interface.status[i] = EventState::Free; + events_interface.registered[i] = false; + } + auto nvmap_dev = std::make_shared<Devices::nvmap>(system); + devices["/dev/nvhost-as-gpu"] = std::make_shared<Devices::nvhost_as_gpu>(system, nvmap_dev); + devices["/dev/nvhost-gpu"] = std::make_shared<Devices::nvhost_gpu>(system, nvmap_dev); + devices["/dev/nvhost-ctrl-gpu"] = std::make_shared<Devices::nvhost_ctrl_gpu>(system); devices["/dev/nvmap"] = nvmap_dev; - devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(nvmap_dev); - devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>(); - devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(); - devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>(); - devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(); + devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(system, nvmap_dev); + devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>(system, events_interface); + devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system); + devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>(system); + devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system); } Module::~Module() = default; @@ -59,12 +71,13 @@ u32 Module::Open(const std::string& 
device_name) { return fd; } -u32 Module::Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output) { +u32 Module::Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl) { auto itr = open_files.find(fd); ASSERT_MSG(itr != open_files.end(), "Tried to talk to an invalid device"); auto& device = itr->second; - return device->ioctl({command}, input, output); + return device->ioctl({command}, input, output, ctrl); } ResultCode Module::Close(u32 fd) { @@ -77,4 +90,22 @@ ResultCode Module::Close(u32 fd) { return RESULT_SUCCESS; } +void Module::SignalSyncpt(const u32 syncpoint_id, const u32 value) { + for (u32 i = 0; i < MaxNvEvents; i++) { + if (events_interface.assigned_syncpt[i] == syncpoint_id && + events_interface.assigned_value[i] == value) { + events_interface.LiberateEvent(i); + events_interface.events[i].writable->Signal(); + } + } +} + +Kernel::SharedPtr<Kernel::ReadableEvent> Module::GetEvent(const u32 event_id) const { + return events_interface.events[event_id].readable; +} + +Kernel::SharedPtr<Kernel::WritableEvent> Module::GetEventWriteable(const u32 event_id) const { + return events_interface.events[event_id].writable; +} + } // namespace Service::Nvidia diff --git a/src/core/hle/service/nvdrv/nvdrv.h b/src/core/hle/service/nvdrv/nvdrv.h index 53564f696..a339ab672 100644 --- a/src/core/hle/service/nvdrv/nvdrv.h +++ b/src/core/hle/service/nvdrv/nvdrv.h @@ -8,8 +8,14 @@ #include <unordered_map> #include <vector> #include "common/common_types.h" +#include "core/hle/kernel/writable_event.h" +#include "core/hle/service/nvdrv/nvdata.h" #include "core/hle/service/service.h" +namespace Core { +class System; +} + namespace Service::NVFlinger { class NVFlinger; } @@ -20,16 +26,72 @@ namespace Devices { class nvdevice; } -struct IoctlFence { - u32 id; - u32 value; +struct EventInterface { + // Mask representing currently busy events + u64 events_mask{}; + // Each kernel event associated to an NV event + std::array<Kernel::EventPair, MaxNvEvents> events; + // The status of the current NVEvent + std::array<EventState, MaxNvEvents> status{}; + // Tells if an NVEvent is registered or not + std::array<bool, MaxNvEvents> registered{}; + // When an NVEvent is waiting on GPU interrupt, this is the sync_point + // associated with it. + std::array<u32, MaxNvEvents> assigned_syncpt{}; + // This is the value of the GPU interrupt for which the NVEvent is waiting + // for. + std::array<u32, MaxNvEvents> assigned_value{}; + // Constant to denote an unasigned syncpoint. 
+ static constexpr u32 unassigned_syncpt = 0xFFFFFFFF; + std::optional<u32> GetFreeEvent() const { + u64 mask = events_mask; + for (u32 i = 0; i < MaxNvEvents; i++) { + const bool is_free = (mask & 0x1) == 0; + if (is_free) { + if (status[i] == EventState::Registered || status[i] == EventState::Free) { + return {i}; + } + } + mask = mask >> 1; + } + return {}; + } + void SetEventStatus(const u32 event_id, EventState new_status) { + EventState old_status = status[event_id]; + if (old_status == new_status) { + return; + } + status[event_id] = new_status; + if (new_status == EventState::Registered) { + registered[event_id] = true; + } + if (new_status == EventState::Waiting || new_status == EventState::Busy) { + events_mask |= (1ULL << event_id); + } + } + void RegisterEvent(const u32 event_id) { + registered[event_id] = true; + if (status[event_id] == EventState::Free) { + status[event_id] = EventState::Registered; + } + } + void UnregisterEvent(const u32 event_id) { + registered[event_id] = false; + if (status[event_id] == EventState::Registered) { + status[event_id] = EventState::Free; + } + } + void LiberateEvent(const u32 event_id) { + status[event_id] = registered[event_id] ? EventState::Registered : EventState::Free; + events_mask &= ~(1ULL << event_id); + assigned_syncpt[event_id] = unassigned_syncpt; + assigned_value[event_id] = 0; + } }; -static_assert(sizeof(IoctlFence) == 8, "IoctlFence has wrong size"); - class Module final { public: - Module(); + Module(Core::System& system); ~Module(); /// Returns a pointer to one of the available devices, identified by its name. @@ -44,10 +106,17 @@ public: /// Opens a device node and returns a file descriptor to it. u32 Open(const std::string& device_name); /// Sends an ioctl command to the specified file descriptor. - u32 Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output); + u32 Ioctl(u32 fd, u32 command, const std::vector<u8>& input, std::vector<u8>& output, + IoctlCtrl& ctrl); /// Closes a device file descriptor and returns operation success. ResultCode Close(u32 fd); + void SignalSyncpt(const u32 syncpoint_id, const u32 value); + + Kernel::SharedPtr<Kernel::ReadableEvent> GetEvent(u32 event_id) const; + + Kernel::SharedPtr<Kernel::WritableEvent> GetEventWriteable(u32 event_id) const; + private: /// Id to use for the next open file descriptor. u32 next_fd = 1; @@ -57,9 +126,12 @@ private: /// Mapping of device node names to their implementation. std::unordered_map<std::string, std::shared_ptr<Devices::nvdevice>> devices; + + EventInterface events_interface; }; /// Registers all NVDRV services with the specified service manager. 
-void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger); +void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger& nvflinger, + Core::System& system); } // namespace Service::Nvidia diff --git a/src/core/hle/service/nvflinger/buffer_queue.cpp b/src/core/hle/service/nvflinger/buffer_queue.cpp index 5731e815f..e1a07d3ee 100644 --- a/src/core/hle/service/nvflinger/buffer_queue.cpp +++ b/src/core/hle/service/nvflinger/buffer_queue.cpp @@ -34,7 +34,8 @@ void BufferQueue::SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer) buffer_wait_event.writable->Signal(); } -std::optional<u32> BufferQueue::DequeueBuffer(u32 width, u32 height) { +std::optional<std::pair<u32, Service::Nvidia::MultiFence*>> BufferQueue::DequeueBuffer(u32 width, + u32 height) { auto itr = std::find_if(queue.begin(), queue.end(), [&](const Buffer& buffer) { // Only consider free buffers. Buffers become free once again after they've been Acquired // and Released by the compositor, see the NVFlinger::Compose method. @@ -51,7 +52,7 @@ std::optional<u32> BufferQueue::DequeueBuffer(u32 width, u32 height) { } itr->status = Buffer::Status::Dequeued; - return itr->slot; + return {{itr->slot, &itr->multi_fence}}; } const IGBPBuffer& BufferQueue::RequestBuffer(u32 slot) const { @@ -63,7 +64,8 @@ const IGBPBuffer& BufferQueue::RequestBuffer(u32 slot) const { } void BufferQueue::QueueBuffer(u32 slot, BufferTransformFlags transform, - const Common::Rectangle<int>& crop_rect) { + const Common::Rectangle<int>& crop_rect, u32 swap_interval, + Service::Nvidia::MultiFence& multi_fence) { auto itr = std::find_if(queue.begin(), queue.end(), [&](const Buffer& buffer) { return buffer.slot == slot; }); ASSERT(itr != queue.end()); @@ -71,12 +73,21 @@ void BufferQueue::QueueBuffer(u32 slot, BufferTransformFlags transform, itr->status = Buffer::Status::Queued; itr->transform = transform; itr->crop_rect = crop_rect; + itr->swap_interval = swap_interval; + itr->multi_fence = multi_fence; + queue_sequence.push_back(slot); } std::optional<std::reference_wrapper<const BufferQueue::Buffer>> BufferQueue::AcquireBuffer() { - auto itr = std::find_if(queue.begin(), queue.end(), [](const Buffer& buffer) { - return buffer.status == Buffer::Status::Queued; - }); + auto itr = queue.end(); + // Iterate to find a queued buffer matching the requested slot. 
+ while (itr == queue.end() && !queue_sequence.empty()) { + u32 slot = queue_sequence.front(); + itr = std::find_if(queue.begin(), queue.end(), [&slot](const Buffer& buffer) { + return buffer.status == Buffer::Status::Queued && buffer.slot == slot; + }); + queue_sequence.pop_front(); + } if (itr == queue.end()) return {}; itr->status = Buffer::Status::Acquired; diff --git a/src/core/hle/service/nvflinger/buffer_queue.h b/src/core/hle/service/nvflinger/buffer_queue.h index e1ccb6171..356bedb81 100644 --- a/src/core/hle/service/nvflinger/buffer_queue.h +++ b/src/core/hle/service/nvflinger/buffer_queue.h @@ -4,6 +4,7 @@ #pragma once +#include <list> #include <optional> #include <vector> @@ -12,6 +13,7 @@ #include "common/swap.h" #include "core/hle/kernel/object.h" #include "core/hle/kernel/writable_event.h" +#include "core/hle/service/nvdrv/nvdata.h" namespace Service::NVFlinger { @@ -68,13 +70,17 @@ public: IGBPBuffer igbp_buffer; BufferTransformFlags transform; Common::Rectangle<int> crop_rect; + u32 swap_interval; + Service::Nvidia::MultiFence multi_fence; }; void SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer); - std::optional<u32> DequeueBuffer(u32 width, u32 height); + std::optional<std::pair<u32, Service::Nvidia::MultiFence*>> DequeueBuffer(u32 width, + u32 height); const IGBPBuffer& RequestBuffer(u32 slot) const; void QueueBuffer(u32 slot, BufferTransformFlags transform, - const Common::Rectangle<int>& crop_rect); + const Common::Rectangle<int>& crop_rect, u32 swap_interval, + Service::Nvidia::MultiFence& multi_fence); std::optional<std::reference_wrapper<const Buffer>> AcquireBuffer(); void ReleaseBuffer(u32 slot); u32 Query(QueryType type); @@ -92,6 +98,7 @@ private: u64 layer_id; std::vector<Buffer> queue; + std::list<u32> queue_sequence; Kernel::EventPair buffer_wait_event; }; diff --git a/src/core/hle/service/nvflinger/nvflinger.cpp b/src/core/hle/service/nvflinger/nvflinger.cpp index 3c5c53e24..f9db79370 100644 --- a/src/core/hle/service/nvflinger/nvflinger.cpp +++ b/src/core/hle/service/nvflinger/nvflinger.cpp @@ -37,15 +37,14 @@ NVFlinger::NVFlinger(Core::Timing::CoreTiming& core_timing) : core_timing{core_t displays.emplace_back(4, "Null"); // Schedule the screen composition events - const auto ticks = Settings::values.force_30fps_mode ? frame_ticks_30fps : frame_ticks; - - composition_event = core_timing.RegisterEvent( - "ScreenComposition", [this, ticks](u64 userdata, s64 cycles_late) { - Compose(); - this->core_timing.ScheduleEvent(ticks - cycles_late, composition_event); - }); - - core_timing.ScheduleEvent(ticks, composition_event); + composition_event = core_timing.RegisterEvent("ScreenComposition", [this](u64 userdata, + s64 cycles_late) { + Compose(); + const auto ticks = Settings::values.force_30fps_mode ? 
frame_ticks_30fps : GetNextTicks(); + this->core_timing.ScheduleEvent(std::max<s64>(0LL, ticks - cycles_late), composition_event); + }); + + core_timing.ScheduleEvent(frame_ticks, composition_event); } NVFlinger::~NVFlinger() { @@ -206,8 +205,14 @@ void NVFlinger::Compose() { igbp_buffer.width, igbp_buffer.height, igbp_buffer.stride, buffer->get().transform, buffer->get().crop_rect); + swap_interval = buffer->get().swap_interval; buffer_queue.ReleaseBuffer(buffer->get().slot); } } +s64 NVFlinger::GetNextTicks() const { + constexpr s64 max_hertz = 120LL; + return (Core::Timing::BASE_CLOCK_RATE * (1LL << swap_interval)) / max_hertz; +} + } // namespace Service::NVFlinger diff --git a/src/core/hle/service/nvflinger/nvflinger.h b/src/core/hle/service/nvflinger/nvflinger.h index c0a83fffb..988be8726 100644 --- a/src/core/hle/service/nvflinger/nvflinger.h +++ b/src/core/hle/service/nvflinger/nvflinger.h @@ -74,6 +74,8 @@ public: /// finished. void Compose(); + s64 GetNextTicks() const; + private: /// Finds the display identified by the specified ID. VI::Display* FindDisplay(u64 display_id); @@ -98,6 +100,8 @@ private: /// layers. u32 next_buffer_queue_id = 1; + u32 swap_interval = 1; + /// Event that handles screen composition. Core::Timing::EventType* composition_event; diff --git a/src/core/hle/service/service.cpp b/src/core/hle/service/service.cpp index d343392fd..3a0f8c3f6 100644 --- a/src/core/hle/service/service.cpp +++ b/src/core/hle/service/service.cpp @@ -236,7 +236,7 @@ void Init(std::shared_ptr<SM::ServiceManager>& sm, Core::System& system) { NIM::InstallInterfaces(*sm); NPNS::InstallInterfaces(*sm); NS::InstallInterfaces(*sm); - Nvidia::InstallInterfaces(*sm, *nv_flinger); + Nvidia::InstallInterfaces(*sm, *nv_flinger, system); PCIe::InstallInterfaces(*sm); PCTL::InstallInterfaces(*sm); PCV::InstallInterfaces(*sm); diff --git a/src/core/hle/service/vi/vi.cpp b/src/core/hle/service/vi/vi.cpp index f1fa6ccd1..199b30635 100644 --- a/src/core/hle/service/vi/vi.cpp +++ b/src/core/hle/service/vi/vi.cpp @@ -21,6 +21,7 @@ #include "core/hle/kernel/readable_event.h" #include "core/hle/kernel/thread.h" #include "core/hle/kernel/writable_event.h" +#include "core/hle/service/nvdrv/nvdata.h" #include "core/hle/service/nvdrv/nvdrv.h" #include "core/hle/service/nvflinger/buffer_queue.h" #include "core/hle/service/nvflinger/nvflinger.h" @@ -328,32 +329,22 @@ public: Data data; }; -struct BufferProducerFence { - u32 is_valid; - std::array<Nvidia::IoctlFence, 4> fences; -}; -static_assert(sizeof(BufferProducerFence) == 36, "BufferProducerFence has wrong size"); - class IGBPDequeueBufferResponseParcel : public Parcel { public: - explicit IGBPDequeueBufferResponseParcel(u32 slot) : slot(slot) {} + explicit IGBPDequeueBufferResponseParcel(u32 slot, Service::Nvidia::MultiFence& multi_fence) + : slot(slot), multi_fence(multi_fence) {} ~IGBPDequeueBufferResponseParcel() override = default; protected: void SerializeData() override { - // TODO(Subv): Find out how this Fence is used. 
- BufferProducerFence fence = {}; - fence.is_valid = 1; - for (auto& fence_ : fence.fences) - fence_.id = -1; - Write(slot); Write<u32_le>(1); - WriteObject(fence); + WriteObject(multi_fence); Write<u32_le>(0); } u32_le slot; + Service::Nvidia::MultiFence multi_fence; }; class IGBPRequestBufferRequestParcel : public Parcel { @@ -400,12 +391,6 @@ public: data = Read<Data>(); } - struct Fence { - u32_le id; - u32_le value; - }; - static_assert(sizeof(Fence) == 8, "Fence has wrong size"); - struct Data { u32_le slot; INSERT_PADDING_WORDS(3); @@ -418,15 +403,15 @@ public: s32_le scaling_mode; NVFlinger::BufferQueue::BufferTransformFlags transform; u32_le sticky_transform; - INSERT_PADDING_WORDS(2); - u32_le fence_is_valid; - std::array<Fence, 2> fences; + INSERT_PADDING_WORDS(1); + u32_le swap_interval; + Service::Nvidia::MultiFence multi_fence; Common::Rectangle<int> GetCropRect() const { return {crop_left, crop_top, crop_right, crop_bottom}; } }; - static_assert(sizeof(Data) == 80, "ParcelData has wrong size"); + static_assert(sizeof(Data) == 96, "ParcelData has wrong size"); Data data; }; @@ -547,11 +532,11 @@ private: IGBPDequeueBufferRequestParcel request{ctx.ReadBuffer()}; const u32 width{request.data.width}; const u32 height{request.data.height}; - std::optional<u32> slot = buffer_queue.DequeueBuffer(width, height); + auto result = buffer_queue.DequeueBuffer(width, height); - if (slot) { + if (result) { // Buffer is available - IGBPDequeueBufferResponseParcel response{*slot}; + IGBPDequeueBufferResponseParcel response{result->first, *result->second}; ctx.WriteBuffer(response.Serialize()); } else { // Wait the current thread until a buffer becomes available @@ -561,10 +546,10 @@ private: Kernel::ThreadWakeupReason reason) { // Repeat TransactParcel DequeueBuffer when a buffer is available auto& buffer_queue = nv_flinger->FindBufferQueue(id); - std::optional<u32> slot = buffer_queue.DequeueBuffer(width, height); - ASSERT_MSG(slot != std::nullopt, "Could not dequeue buffer."); + auto result = buffer_queue.DequeueBuffer(width, height); + ASSERT_MSG(result != std::nullopt, "Could not dequeue buffer."); - IGBPDequeueBufferResponseParcel response{*slot}; + IGBPDequeueBufferResponseParcel response{result->first, *result->second}; ctx.WriteBuffer(response.Serialize()); IPC::ResponseBuilder rb{ctx, 2}; rb.Push(RESULT_SUCCESS); @@ -582,7 +567,8 @@ private: IGBPQueueBufferRequestParcel request{ctx.ReadBuffer()}; buffer_queue.QueueBuffer(request.data.slot, request.data.transform, - request.data.GetCropRect()); + request.data.GetCropRect(), request.data.swap_interval, + request.data.multi_fence); IGBPQueueBufferResponseParcel response{1280, 720}; ctx.WriteBuffer(response.Serialize()); diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 3175579cc..bd036cbe8 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -22,7 +22,7 @@ void DmaPusher::DispatchCalls() { MICROPROFILE_SCOPE(DispatchCalls); // On entering GPU code, assume all memory may be touched by the ARM core. 
- gpu.Maxwell3D().dirty_flags.OnMemoryWrite(); + gpu.Maxwell3D().dirty.OnMemoryWrite(); dma_pushbuffer_subindex = 0; diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 7404a8163..08586d33c 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -37,7 +37,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { const bool is_last_call = method_call.IsLastCall(); upload_state.ProcessData(method_call.argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + system.GPU().Maxwell3D().dirty.OnMemoryWrite(); } break; } @@ -50,13 +50,14 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { } void KeplerCompute::ProcessLaunch() { - const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address(); memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description, LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32)); - const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start; - LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc); + const GPUVAddr code_addr = regs.code_loc.Address() + launch_description.program_start; + LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", code_addr); + + rasterizer.DispatchCompute(code_addr); } } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 0561f676c..44279de00 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -34,7 +34,7 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { const bool is_last_call = method_call.IsLastCall(); upload_state.ProcessData(method_call.argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + system.GPU().Maxwell3D().dirty.OnMemoryWrite(); } break; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 8755b8af4..125c53360 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -22,6 +22,7 @@ Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& raste MemoryManager& memory_manager) : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { + InitDirtySettings(); InitializeRegisterDefaults(); } @@ -69,6 +70,10 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.stencil_back_func_mask = 0xFFFFFFFF; regs.stencil_back_mask = 0xFFFFFFFF; + regs.depth_test_func = Regs::ComparisonOp::Always; + regs.cull.front_face = Regs::Cull::FrontFace::CounterClockWise; + regs.cull.cull_face = Regs::Cull::CullFace::Back; + // TODO(Rodrigo): Most games do not set a point size. I think this is a case of a // register carrying a default value. Assume it's OpenGL's default (1). 
regs.point_size = 1.0f; @@ -86,6 +91,159 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.rt_separate_frag_data = 1; } +#define DIRTY_REGS_POS(field_name) (offsetof(Maxwell3D::DirtyRegs, field_name)) + +void Maxwell3D::InitDirtySettings() { + const auto set_block = [this](const u32 start, const u32 range, const u8 position) { + const auto start_itr = dirty_pointers.begin() + start; + const auto end_itr = start_itr + range; + std::fill(start_itr, end_itr, position); + }; + dirty.regs.fill(true); + + // Init Render Targets + constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32); + constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt); + constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * 8; + u32 rt_dirty_reg = DIRTY_REGS_POS(render_target); + for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) { + set_block(rt_reg, registers_per_rt, rt_dirty_reg); + rt_dirty_reg++; + } + constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer); + dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag; + dirty_pointers[MAXWELL3D_REG_INDEX(zeta_width)] = depth_buffer_flag; + dirty_pointers[MAXWELL3D_REG_INDEX(zeta_height)] = depth_buffer_flag; + constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32); + constexpr u32 zeta_reg = MAXWELL3D_REG_INDEX(zeta); + set_block(zeta_reg, registers_in_zeta, depth_buffer_flag); + + // Init Vertex Arrays + constexpr u32 vertex_array_start = MAXWELL3D_REG_INDEX(vertex_array); + constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32); + constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays; + u32 va_reg = DIRTY_REGS_POS(vertex_array); + u32 vi_reg = DIRTY_REGS_POS(vertex_instance); + for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end; + vertex_reg += vertex_array_size) { + set_block(vertex_reg, 3, va_reg); + // The divisor concerns vertex array instances + dirty_pointers[vertex_reg + 3] = vi_reg; + va_reg++; + vi_reg++; + } + constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit); + constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32); + constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays; + va_reg = DIRTY_REGS_POS(vertex_array); + for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end; + vertex_reg += vertex_limit_size) { + set_block(vertex_reg, vertex_limit_size, va_reg); + va_reg++; + } + constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays); + constexpr u32 vertex_instance_size = + sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32); + constexpr u32 vertex_instance_end = + vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays; + vi_reg = DIRTY_REGS_POS(vertex_instance); + for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end; + vertex_reg += vertex_instance_size) { + set_block(vertex_reg, vertex_instance_size, vi_reg); + vi_reg++; + } + set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(), + DIRTY_REGS_POS(vertex_attrib_format)); + + // Init Shaders + constexpr u32 shader_registers_count = + sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32); + set_block(MAXWELL3D_REG_INDEX(shader_config[0]), shader_registers_count, + DIRTY_REGS_POS(shaders)); + + // State + + // Viewport + constexpr u32 viewport_dirty_reg = DIRTY_REGS_POS(viewport); + constexpr u32 viewport_start = 
MAXWELL3D_REG_INDEX(viewports); + constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32); + set_block(viewport_start, viewport_size, viewport_dirty_reg); + constexpr u32 view_volume_start = MAXWELL3D_REG_INDEX(view_volume_clip_control); + constexpr u32 view_volume_size = sizeof(regs.view_volume_clip_control) / sizeof(u32); + set_block(view_volume_start, view_volume_size, viewport_dirty_reg); + + // Viewport transformation + constexpr u32 viewport_trans_start = MAXWELL3D_REG_INDEX(viewport_transform); + constexpr u32 viewport_trans_size = sizeof(regs.viewport_transform) / sizeof(u32); + set_block(viewport_trans_start, viewport_trans_size, DIRTY_REGS_POS(viewport_transform)); + + // Cullmode + constexpr u32 cull_mode_start = MAXWELL3D_REG_INDEX(cull); + constexpr u32 cull_mode_size = sizeof(regs.cull) / sizeof(u32); + set_block(cull_mode_start, cull_mode_size, DIRTY_REGS_POS(cull_mode)); + + // Screen y control + dirty_pointers[MAXWELL3D_REG_INDEX(screen_y_control)] = DIRTY_REGS_POS(screen_y_control); + + // Primitive Restart + constexpr u32 primitive_restart_start = MAXWELL3D_REG_INDEX(primitive_restart); + constexpr u32 primitive_restart_size = sizeof(regs.primitive_restart) / sizeof(u32); + set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart)); + + // Depth Test + constexpr u32 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test); + dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg; + + // Stencil Test + constexpr u32 stencil_test_dirty_reg = DIRTY_REGS_POS(stencil_test); + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_enable)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_func)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_ref)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_mask)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_fail)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zfail)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zpass)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_mask)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_two_side_enable)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_func)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_ref)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_mask)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_fail)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zfail)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zpass)] = stencil_test_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg; + + // Color Mask + constexpr u32 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask); + dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg; + set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32), + color_mask_dirty_reg); + // Blend State + constexpr u32 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state); + set_block(MAXWELL3D_REG_INDEX(blend_color), 
sizeof(regs.blend_color) / sizeof(u32), + blend_state_dirty_reg); + dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg; + set_block(MAXWELL3D_REG_INDEX(blend), sizeof(regs.blend) / sizeof(u32), blend_state_dirty_reg); + set_block(MAXWELL3D_REG_INDEX(independent_blend), sizeof(regs.independent_blend) / sizeof(u32), + blend_state_dirty_reg); + + // Scissor State + constexpr u32 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test); + set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32), + scissor_test_dirty_reg); + + // Polygon Offset + constexpr u32 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset); + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_units)] = polygon_offset_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_factor)] = polygon_offset_dirty_reg; + dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg; +} + void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) { // Reset the current macro. executing_macro = 0; @@ -108,6 +266,14 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { const u32 method = method_call.method; + if (method == cb_data_state.current) { + regs.reg_array[method] = method_call.argument; + ProcessCBData(method_call.argument); + return; + } else if (cb_data_state.current != null_cb_data) { + FinishCBData(); + } + // It is an error to write to a register other than the current macro's ARG register before it // has finished execution. 
if (executing_macro != 0) { @@ -143,49 +309,19 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { if (regs.reg_array[method] != method_call.argument) { regs.reg_array[method] = method_call.argument; - // Color buffers - constexpr u32 first_rt_reg = MAXWELL3D_REG_INDEX(rt); - constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32); - if (method >= first_rt_reg && - method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) { - const std::size_t rt_index = (method - first_rt_reg) / registers_per_rt; - dirty_flags.color_buffer.set(rt_index); - } - - // Zeta buffer - constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32); - if (method == MAXWELL3D_REG_INDEX(zeta_enable) || - method == MAXWELL3D_REG_INDEX(zeta_width) || - method == MAXWELL3D_REG_INDEX(zeta_height) || - (method >= MAXWELL3D_REG_INDEX(zeta) && - method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) { - dirty_flags.zeta_buffer = true; - } - - // Shader - constexpr u32 shader_registers_count = - sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32); - if (method >= MAXWELL3D_REG_INDEX(shader_config[0]) && - method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) { - dirty_flags.shaders = true; - } - - // Vertex format - if (method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) && - method < MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) { - dirty_flags.vertex_attrib_format = true; - } - - // Vertex buffer - if (method >= MAXWELL3D_REG_INDEX(vertex_array) && - method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * Regs::NumVertexArrays) { - dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2); - } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) && - method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * Regs::NumVertexArrays) { - dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1); - } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) && - method < MAXWELL3D_REG_INDEX(instanced_arrays) + Regs::NumVertexArrays) { - dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays)); + const std::size_t dirty_reg = dirty_pointers[method]; + if (dirty_reg) { + dirty.regs[dirty_reg] = true; + if (dirty_reg >= DIRTY_REGS_POS(vertex_array) && + dirty_reg < DIRTY_REGS_POS(vertex_array_buffers)) { + dirty.vertex_array_buffers = true; + } else if (dirty_reg >= DIRTY_REGS_POS(vertex_instance) && + dirty_reg < DIRTY_REGS_POS(vertex_instances)) { + dirty.vertex_instances = true; + } else if (dirty_reg >= DIRTY_REGS_POS(render_target) && + dirty_reg < DIRTY_REGS_POS(render_settings)) { + dirty.render_settings = true; + } } } @@ -214,7 +350,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { - ProcessCBData(method_call.argument); + StartCBData(method); break; } case MAXWELL3D_REG_INDEX(cb_bind[0].raw_config): { @@ -249,6 +385,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { ProcessQueryGet(); break; } + case MAXWELL3D_REG_INDEX(condition.mode): { + ProcessQueryCondition(); + break; + } case MAXWELL3D_REG_INDEX(sync_info): { ProcessSyncPoint(); break; @@ -261,7 +401,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { const bool is_last_call = method_call.IsLastCall(); upload_state.ProcessData(method_call.argument, is_last_call); if (is_last_call) 
{ - dirty_flags.OnMemoryWrite(); + dirty.OnMemoryWrite(); } break; } @@ -302,6 +442,7 @@ void Maxwell3D::ProcessQueryGet() { result = regs.query.query_sequence; break; default: + result = 1; UNIMPLEMENTED_MSG("Unimplemented query select type {}", static_cast<u32>(regs.query.query_get.select.Value())); } @@ -333,7 +474,6 @@ void Maxwell3D::ProcessQueryGet() { query_result.timestamp = system.CoreTiming().GetTicks(); memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result)); } - dirty_flags.OnMemoryWrite(); break; } default: @@ -342,12 +482,52 @@ void Maxwell3D::ProcessQueryGet() { } } +void Maxwell3D::ProcessQueryCondition() { + const GPUVAddr condition_address{regs.condition.Address()}; + switch (regs.condition.mode) { + case Regs::ConditionMode::Always: { + execute_on = true; + break; + } + case Regs::ConditionMode::Never: { + execute_on = false; + break; + } + case Regs::ConditionMode::ResNonZero: { + Regs::QueryCompare cmp; + memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U; + break; + } + case Regs::ConditionMode::Equal: { + Regs::QueryCompare cmp; + memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + execute_on = + cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode; + break; + } + case Regs::ConditionMode::NotEqual: { + Regs::QueryCompare cmp; + memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp)); + execute_on = + cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode; + break; + } + default: { + UNIMPLEMENTED_MSG("Unimplemented Condition Mode!"); + execute_on = true; + break; + } + } +} + void Maxwell3D::ProcessSyncPoint() { const u32 sync_point = regs.sync_info.sync_point.Value(); const u32 increment = regs.sync_info.increment.Value(); const u32 cache_flush = regs.sync_info.unknown.Value(); - LOG_DEBUG(HW_GPU, "Syncpoint set {}, increment: {}, unk: {}", sync_point, increment, - cache_flush); + if (increment) { + system.GPU().IncrementSyncPoint(sync_point); + } } void Maxwell3D::DrawArrays() { @@ -405,23 +585,39 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) { } void Maxwell3D::ProcessCBData(u32 value) { + const u32 id = cb_data_state.id; + cb_data_state.buffer[id][cb_data_state.counter] = value; + // Increment the current buffer position. + regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4; + cb_data_state.counter++; +} + +void Maxwell3D::StartCBData(u32 method) { + constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]); + cb_data_state.start_pos = regs.const_buffer.cb_pos; + cb_data_state.id = method - first_cb_data; + cb_data_state.current = method; + cb_data_state.counter = 0; + ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]); +} + +void Maxwell3D::FinishCBData() { // Write the input value to the current const buffer at the current position. const GPUVAddr buffer_address = regs.const_buffer.BufferAddress(); ASSERT(buffer_address != 0); // Don't allow writing past the end of the buffer. 
- ASSERT(regs.const_buffer.cb_pos + sizeof(u32) <= regs.const_buffer.cb_size); - - const GPUVAddr address{buffer_address + regs.const_buffer.cb_pos}; + ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size); - u8* ptr{memory_manager.GetPointer(address)}; - rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32)); - memory_manager.Write<u32>(address, value); + const GPUVAddr address{buffer_address + cb_data_state.start_pos}; + const std::size_t size = regs.const_buffer.cb_pos - cb_data_state.start_pos; - dirty_flags.OnMemoryWrite(); + const u32 id = cb_data_state.id; + memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size); + dirty.OnMemoryWrite(); - // Increment the current buffer position. - regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4; + cb_data_state.id = null_cb_data; + cb_data_state.current = null_cb_data; } Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 8d15c8a48..1ee982b76 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -90,6 +90,20 @@ public: enum class QuerySelect : u32 { Zero = 0, + TimeElapsed = 2, + TransformFeedbackPrimitivesGenerated = 11, + PrimitivesGenerated = 18, + SamplesPassed = 21, + TransformFeedbackUnknown = 26, + }; + + struct QueryCompare { + u32 initial_sequence; + u32 initial_mode; + u32 unknown1; + u32 unknown2; + u32 current_sequence; + u32 current_mode; }; enum class QuerySyncCondition : u32 { @@ -97,6 +111,14 @@ public: GreaterThan = 1, }; + enum class ConditionMode : u32 { + Never = 0, + Always = 1, + ResNonZero = 2, + Equal = 3, + NotEqual = 4, + }; + enum class ShaderProgram : u32 { VertexA = 0, VertexB = 1, @@ -815,7 +837,18 @@ public: BitField<4, 1, u32> alpha_to_one; } multisample_control; - INSERT_PADDING_WORDS(0x7); + INSERT_PADDING_WORDS(0x4); + + struct { + u32 address_high; + u32 address_low; + ConditionMode mode; + + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } condition; struct { u32 tsc_address_high; @@ -1124,23 +1157,77 @@ public: State state{}; - struct DirtyFlags { - std::bitset<8> color_buffer{0xFF}; - std::bitset<32> vertex_array{0xFFFFFFFF}; + struct DirtyRegs { + static constexpr std::size_t NUM_REGS = 256; + union { + struct { + bool null_dirty; + + // Vertex Attributes + bool vertex_attrib_format; + + // Vertex Arrays + std::array<bool, 32> vertex_array; + + bool vertex_array_buffers; + + // Vertex Instances + std::array<bool, 32> vertex_instance; - bool vertex_attrib_format = true; - bool zeta_buffer = true; - bool shaders = true; + bool vertex_instances; + + // Render Targets + std::array<bool, 8> render_target; + bool depth_buffer; + + bool render_settings; + + // Shaders + bool shaders; + + // Rasterizer State + bool viewport; + bool clip_coefficient; + bool cull_mode; + bool primitive_restart; + bool depth_test; + bool stencil_test; + bool blend_state; + bool scissor_test; + bool transform_feedback; + bool color_mask; + bool polygon_offset; + + // Complementary + bool viewport_transform; + bool screen_y_control; + + bool memory_general; + }; + std::array<bool, NUM_REGS> regs; + }; + + void ResetVertexArrays() { + vertex_array.fill(true); + vertex_array_buffers = true; + } + + void ResetRenderTargets() { + depth_buffer = true; + render_target.fill(true); + render_settings = true; + } void OnMemoryWrite() { - zeta_buffer = true; shaders = true; - 
color_buffer.set(); - vertex_array.set(); + memory_general = true; + ResetRenderTargets(); + ResetVertexArrays(); } - }; - DirtyFlags dirty_flags; + } dirty{}; + + std::array<u8, Regs::NUM_REGS> dirty_pointers{}; /// Reads a register value located at the input method address u32 GetRegisterValue(u32 method) const; @@ -1169,6 +1256,10 @@ public: return macro_memory; } + bool ShouldExecute() const { + return execute_on; + } + private: void InitializeRegisterDefaults(); @@ -1192,14 +1283,27 @@ private: /// Interpreter for the macro codes uploaded to the GPU. MacroInterpreter macro_interpreter; + static constexpr u32 null_cb_data = 0xFFFFFFFF; + struct { + std::array<std::array<u32, 0x4000>, 16> buffer; + u32 current{null_cb_data}; + u32 id{null_cb_data}; + u32 start_pos{}; + u32 counter{}; + } cb_data_state; + Upload::State upload_state; + bool execute_on{true}; + /// Retrieves information about a specific TIC entry from the TIC buffer. Texture::TICEntry GetTICEntry(u32 tic_index) const; /// Retrieves information about a specific TSC entry from the TSC buffer. Texture::TSCEntry GetTSCEntry(u32 tsc_index) const; + void InitDirtySettings(); + /** * Call a macro on this engine. * @param method Method to call @@ -1219,11 +1323,16 @@ private: /// Handles a write to the QUERY_GET register. void ProcessQueryGet(); + // Handles Conditional Rendering + void ProcessQueryCondition(); + /// Handles writes to syncing register. void ProcessSyncPoint(); /// Handles a write to the CB_DATA[i] register. + void StartCBData(u32 method); void ProcessCBData(u32 value); + void FinishCBData(); /// Handles a write to the CB_BIND register. void ProcessCBBind(Regs::ShaderStage stage); @@ -1290,6 +1399,7 @@ ASSERT_REG_POSITION(clip_distance_enabled, 0x544); ASSERT_REG_POSITION(point_size, 0x546); ASSERT_REG_POSITION(zeta_enable, 0x54E); ASSERT_REG_POSITION(multisample_control, 0x54F); +ASSERT_REG_POSITION(condition, 0x554); ASSERT_REG_POSITION(tsc, 0x557); ASSERT_REG_POSITION(polygon_offset_factor, 0x55b); ASSERT_REG_POSITION(tic, 0x55D); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index afb9578d0..a28c04473 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -38,7 +38,7 @@ void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) { } void MaxwellDMA::HandleCopy() { - LOG_WARNING(HW_GPU, "Requested a DMA copy"); + LOG_TRACE(HW_GPU, "Requested a DMA copy"); const GPUVAddr source = regs.src_address.Address(); const GPUVAddr dest = regs.dst_address.Address(); @@ -58,7 +58,7 @@ void MaxwellDMA::HandleCopy() { } // All copies here update the main memory, so mark all rasterizer states as invalid. 
- system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + system.GPU().Maxwell3D().dirty.OnMemoryWrite(); if (regs.exec.is_dst_linear && regs.exec.is_src_linear) { // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 79d469b88..083ee3304 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -560,6 +560,11 @@ union Instruction { BitField<48, 16, u64> opcode; union { + BitField<8, 5, ConditionCode> cc; + BitField<13, 1, u64> trigger; + } nop; + + union { BitField<8, 8, Register> gpr; BitField<20, 24, s64> offset; } gmem; @@ -931,8 +936,6 @@ union Instruction { } csetp; union { - BitField<35, 4, PredCondition> cond; - BitField<49, 1, u64> h_and; BitField<6, 1, u64> ftz; BitField<45, 2, PredOperation> op; BitField<3, 3, u64> pred3; @@ -940,9 +943,21 @@ union Instruction { BitField<43, 1, u64> negate_a; BitField<44, 1, u64> abs_a; BitField<47, 2, HalfType> type_a; - BitField<31, 1, u64> negate_b; - BitField<30, 1, u64> abs_b; - BitField<28, 2, HalfType> type_b; + union { + BitField<35, 4, PredCondition> cond; + BitField<49, 1, u64> h_and; + BitField<31, 1, u64> negate_b; + BitField<30, 1, u64> abs_b; + BitField<28, 2, HalfType> type_b; + } reg; + union { + BitField<56, 1, u64> negate_b; + BitField<54, 1, u64> abs_b; + } cbuf; + union { + BitField<49, 4, PredCondition> cond; + BitField<53, 1, u64> h_and; + } cbuf_and_imm; BitField<42, 1, u64> neg_pred; BitField<39, 3, u64> pred39; } hsetp2; @@ -1506,6 +1521,7 @@ public: TMML, // Texture Mip Map Level SUST, // Surface Store EXIT, + NOP, IPA, OUT_R, // Emit vertex/primitive ISBERD, @@ -1548,7 +1564,9 @@ public: HFMA2_RC, HFMA2_RR, HFMA2_IMM_R, + HSETP2_C, HSETP2_R, + HSETP2_IMM, HSET2_R, POPC_C, POPC_R, @@ -1783,6 +1801,7 @@ private: INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"), INST("1101111101011---", Id::TMML, Type::Texture, "TMML"), INST("11101011001-----", Id::SUST, Type::Image, "SUST"), + INST("0101000010110---", Id::NOP, Type::Trivial, "NOP"), INST("11100000--------", Id::IPA, Type::Trivial, "IPA"), INST("1111101111100---", Id::OUT_R, Type::Trivial, "OUT_R"), INST("1110111111010---", Id::ISBERD, Type::Trivial, "ISBERD"), @@ -1831,7 +1850,9 @@ private: INST("01100---1-------", Id::HFMA2_RC, Type::Hfma2, "HFMA2_RC"), INST("0101110100000---", Id::HFMA2_RR, Type::Hfma2, "HFMA2_RR"), INST("01110---0-------", Id::HFMA2_IMM_R, Type::Hfma2, "HFMA2_R_IMM"), - INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP_R"), + INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"), + INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"), + INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"), INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 1b4975498..1622332a4 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -29,7 +29,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) { UNREACHABLE(); } -GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} { +GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async) + : system{system}, renderer{renderer}, is_async{is_async} { auto& 
rasterizer{renderer.Rasterizer()}; memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer); dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); @@ -50,6 +51,14 @@ const Engines::Maxwell3D& GPU::Maxwell3D() const { return *maxwell_3d; } +Engines::KeplerCompute& GPU::KeplerCompute() { + return *kepler_compute; +} + +const Engines::KeplerCompute& GPU::KeplerCompute() const { + return *kepler_compute; +} + MemoryManager& GPU::MemoryManager() { return *memory_manager; } @@ -66,6 +75,51 @@ const DmaPusher& GPU::DmaPusher() const { return *dma_pusher; } +void GPU::IncrementSyncPoint(const u32 syncpoint_id) { + syncpoints[syncpoint_id]++; + std::lock_guard lock{sync_mutex}; + if (!syncpt_interrupts[syncpoint_id].empty()) { + u32 value = syncpoints[syncpoint_id].load(); + auto it = syncpt_interrupts[syncpoint_id].begin(); + while (it != syncpt_interrupts[syncpoint_id].end()) { + if (value >= *it) { + TriggerCpuInterrupt(syncpoint_id, *it); + it = syncpt_interrupts[syncpoint_id].erase(it); + continue; + } + it++; + } + } +} + +u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const { + return syncpoints[syncpoint_id].load(); +} + +void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) { + auto& interrupt = syncpt_interrupts[syncpoint_id]; + bool contains = std::any_of(interrupt.begin(), interrupt.end(), + [value](u32 in_value) { return in_value == value; }); + if (contains) { + return; + } + syncpt_interrupts[syncpoint_id].emplace_back(value); +} + +bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { + std::lock_guard lock{sync_mutex}; + auto& interrupt = syncpt_interrupts[syncpoint_id]; + const auto iter = + std::find_if(interrupt.begin(), interrupt.end(), + [value](u32 interrupt_value) { return value == interrupt_value; }); + + if (iter == interrupt.end()) { + return false; + } + interrupt.erase(iter); + return true; +} + u32 RenderTargetBytesPerPixel(RenderTargetFormat format) { ASSERT(format != RenderTargetFormat::NONE); @@ -143,12 +197,12 @@ enum class BufferMethods { NotifyIntr = 0x8, WrcacheFlush = 0x9, Unk28 = 0xA, - Unk2c = 0xB, + UnkCacheFlush = 0xB, RefCnt = 0x14, SemaphoreAcquire = 0x1A, SemaphoreRelease = 0x1B, - Unk70 = 0x1C, - Unk74 = 0x1D, + FenceValue = 0x1C, + FenceAction = 0x1D, Unk78 = 0x1E, Unk7c = 0x1F, Yield = 0x20, @@ -194,6 +248,10 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { case BufferMethods::SemaphoreAddressLow: case BufferMethods::SemaphoreSequence: case BufferMethods::RefCnt: + case BufferMethods::UnkCacheFlush: + case BufferMethods::WrcacheFlush: + case BufferMethods::FenceValue: + case BufferMethods::FenceAction: break; case BufferMethods::SemaphoreTrigger: { ProcessSemaphoreTriggerMethod(); @@ -204,21 +262,11 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented"); break; } - case BufferMethods::WrcacheFlush: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method WrcacheFlush not implemented"); - break; - } case BufferMethods::Unk28: { // TODO(Kmather73): Research and implement this method. LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented"); break; } - case BufferMethods::Unk2c: { - // TODO(Kmather73): Research and implement this method. 
- LOG_ERROR(HW_GPU, "Special puller engine method Unk2c not implemented"); - break; - } case BufferMethods::SemaphoreAcquire: { ProcessSemaphoreAcquire(); break; diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index fe6628923..87c96f46b 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -5,8 +5,12 @@ #pragma once #include <array> +#include <atomic> +#include <list> #include <memory> +#include <mutex> #include "common/common_types.h" +#include "core/hle/service/nvdrv/nvdata.h" #include "core/hle/service/nvflinger/buffer_queue.h" #include "video_core/dma_pusher.h" @@ -127,7 +131,7 @@ class MemoryManager; class GPU { public: - explicit GPU(Core::System& system, VideoCore::RendererBase& renderer); + explicit GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async); virtual ~GPU(); @@ -155,6 +159,12 @@ public: /// Returns a const reference to the Maxwell3D GPU engine. const Engines::Maxwell3D& Maxwell3D() const; + /// Returns a reference to the KeplerCompute GPU engine. + Engines::KeplerCompute& KeplerCompute(); + + /// Returns a reference to the KeplerCompute GPU engine. + const Engines::KeplerCompute& KeplerCompute() const; + /// Returns a reference to the GPU memory manager. Tegra::MemoryManager& MemoryManager(); @@ -164,6 +174,22 @@ public: /// Returns a reference to the GPU DMA pusher. Tegra::DmaPusher& DmaPusher(); + void IncrementSyncPoint(u32 syncpoint_id); + + u32 GetSyncpointValue(u32 syncpoint_id) const; + + void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value); + + bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); + + std::unique_lock<std::mutex> LockSync() { + return std::unique_lock{sync_mutex}; + } + + bool IsAsync() const { + return is_async; + } + /// Returns a const reference to the GPU DMA pusher. 
const Tegra::DmaPusher& DmaPusher() const; @@ -194,7 +220,12 @@ public: u32 semaphore_acquire; u32 semaphore_release; - INSERT_PADDING_WORDS(0xE4); + u32 fence_value; + union { + BitField<4, 4, u32> operation; + BitField<8, 8, u32> id; + } fence_action; + INSERT_PADDING_WORDS(0xE2); // Puller state u32 acquire_mode; @@ -228,6 +259,9 @@ public: /// Notify rasterizer that any caches of the specified region should be flushed and invalidated virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0; +protected: + virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0; + private: void ProcessBindMethod(const MethodCall& method_call); void ProcessSemaphoreTriggerMethod(); @@ -246,6 +280,7 @@ private: protected: std::unique_ptr<Tegra::DmaPusher> dma_pusher; VideoCore::RendererBase& renderer; + Core::System& system; private: std::unique_ptr<Tegra::MemoryManager> memory_manager; @@ -262,6 +297,14 @@ private: std::unique_ptr<Engines::MaxwellDMA> maxwell_dma; /// Inline memory engine std::unique_ptr<Engines::KeplerMemory> kepler_memory; + + std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{}; + + std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; + + std::mutex sync_mutex; + + const bool is_async; }; #define ASSERT_REG_POSITION(field_name, position) \ @@ -274,6 +317,8 @@ ASSERT_REG_POSITION(semaphore_trigger, 0x7); ASSERT_REG_POSITION(reference_count, 0x14); ASSERT_REG_POSITION(semaphore_acquire, 0x1A); ASSERT_REG_POSITION(semaphore_release, 0x1B); +ASSERT_REG_POSITION(fence_value, 0x1C); +ASSERT_REG_POSITION(fence_action, 0x1D); ASSERT_REG_POSITION(acquire_mode, 0x100); ASSERT_REG_POSITION(acquire_source, 0x101); diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index d4e2553a9..ea67be831 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -2,6 +2,8 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include "core/core.h" +#include "core/hardware_interrupt_manager.h" #include "video_core/gpu_asynch.h" #include "video_core/gpu_thread.h" #include "video_core/renderer_base.h" @@ -9,7 +11,7 @@ namespace VideoCommon { GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer) - : GPU(system, renderer), gpu_thread{system} {} + : GPU(system, renderer, true), gpu_thread{system} {} GPUAsynch::~GPUAsynch() = default; @@ -38,4 +40,9 @@ void GPUAsynch::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { gpu_thread.FlushAndInvalidateRegion(addr, size); } +void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const { + auto& interrupt_manager = system.InterruptManager(); + interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); +} + } // namespace VideoCommon diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h index 30be74cba..36377d677 100644 --- a/src/video_core/gpu_asynch.h +++ b/src/video_core/gpu_asynch.h @@ -27,6 +27,9 @@ public: void InvalidateRegion(CacheAddr addr, u64 size) override; void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override; +protected: + void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override; + private: GPUThread::ThreadManager gpu_thread; }; diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp index 45e43b1dc..d4ead9c47 100644 --- a/src/video_core/gpu_synch.cpp +++ b/src/video_core/gpu_synch.cpp @@ -8,7 +8,7 @@ namespace VideoCommon { GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer) - : GPU(system, renderer) {} + : GPU(system, renderer, false) {} GPUSynch::~GPUSynch() = default; diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h index 3031fcf72..07bcc47f1 100644 --- a/src/video_core/gpu_synch.h +++ b/src/video_core/gpu_synch.h @@ -25,6 +25,10 @@ public: void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override; + +protected: + void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id, + [[maybe_unused]] u32 value) const override {} }; } // namespace VideoCommon diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 3f0939ec9..b441e92b0 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -21,7 +21,8 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p MicroProfileOnThreadCreate("GpuThread"); // Wait for first GPU command before acquiring the window context - state.WaitForCommands(); + while (state.queue.Empty()) + ; // If emulation was stopped during disk shader loading, abort before trying to acquire context if (!state.is_running) { @@ -32,7 +33,6 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p CommandDataContainer next; while (state.is_running) { - state.WaitForCommands(); while (!state.queue.Empty()) { state.queue.Pop(next); if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) { @@ -49,8 +49,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p } else { UNREACHABLE(); } - state.signaled_fence = next.fence; - state.TrySynchronize(); + state.signaled_fence.store(next.fence); } } } @@ -89,12 +88,7 @@ void ThreadManager::FlushRegion(CacheAddr addr, u64 size) { } void ThreadManager::InvalidateRegion(CacheAddr addr, u64 size) { - if (state.queue.Empty()) { - // It's quicker to invalidate a single region on the CPU if the queue is already 
empty - system.Renderer().Rasterizer().InvalidateRegion(addr, size); - } else { - PushCommand(InvalidateRegionCommand(addr, size)); - } + system.Renderer().Rasterizer().InvalidateRegion(addr, size); } void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { @@ -105,22 +99,13 @@ void ThreadManager::FlushAndInvalidateRegion(CacheAddr addr, u64 size) { u64 ThreadManager::PushCommand(CommandData&& command_data) { const u64 fence{++state.last_fence}; state.queue.Push(CommandDataContainer(std::move(command_data), fence)); - state.SignalCommands(); return fence; } MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); void SynchState::WaitForSynchronization(u64 fence) { - if (signaled_fence >= fence) { - return; - } - - // Wait for the GPU to be idle (all commands to be executed) - { - MICROPROFILE_SCOPE(GPU_wait); - std::unique_lock lock{synchronization_mutex}; - synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; }); - } + while (signaled_fence.load() < fence) + ; } } // namespace VideoCommon::GPUThread diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 05a168a72..1d9d0c39e 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -88,41 +88,9 @@ struct CommandDataContainer { /// Struct used to synchronize the GPU thread struct SynchState final { std::atomic_bool is_running{true}; - std::atomic_int queued_frame_count{}; - std::mutex synchronization_mutex; - std::mutex commands_mutex; - std::condition_variable commands_condition; - std::condition_variable synchronization_condition; - - /// Returns true if the gap in GPU commands is small enough that we can consider the CPU and GPU - /// synchronized. This is entirely empirical. - bool IsSynchronized() const { - constexpr std::size_t max_queue_gap{5}; - return queue.Size() <= max_queue_gap; - } - - void TrySynchronize() { - if (IsSynchronized()) { - std::lock_guard lock{synchronization_mutex}; - synchronization_condition.notify_one(); - } - } void WaitForSynchronization(u64 fence); - void SignalCommands() { - if (queue.Empty()) { - return; - } - - commands_condition.notify_one(); - } - - void WaitForCommands() { - std::unique_lock lock{commands_mutex}; - commands_condition.wait(lock, [this] { return !queue.Empty(); }); - } - using CommandQueue = Common::SPSCQueue<CommandDataContainer>; CommandQueue queue; u64 last_fence{}; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 2b7367568..9881df0d5 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -34,6 +34,9 @@ public: /// Clear the current framebuffer virtual void Clear() = 0; + /// Dispatches a compute shader invocation + virtual void DispatchCompute(GPUVAddr code_addr) = 0; + /// Notify rasterizer that all caches should be flushed to Switch memory virtual void FlushAll() = 0; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 0bb5c068c..c28ae795c 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -4,6 +4,7 @@ #include <algorithm> #include <array> +#include <bitset> #include <memory> #include <string> #include <string_view> @@ -19,6 +20,7 @@ #include "core/core.h" #include "core/hle/kernel/process.h" #include "core/settings.h" +#include "video_core/engines/kepler_compute.h" #include "video_core/engines/maxwell_3d.h" #include 
"video_core/memory_manager.h" #include "video_core/renderer_opengl/gl_rasterizer.h" @@ -105,6 +107,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind shader_program_manager = std::make_unique<GLShader::ProgramManager>(); state.draw.shader_program = 0; state.Apply(); + clear_framebuffer.Create(); LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here"); CheckExtensions(); @@ -124,10 +127,10 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { auto& gpu = system.GPU().Maxwell3D(); const auto& regs = gpu.regs; - if (!gpu.dirty_flags.vertex_attrib_format) { + if (!gpu.dirty.vertex_attrib_format) { return state.draw.vertex_array; } - gpu.dirty_flags.vertex_attrib_format = false; + gpu.dirty.vertex_attrib_format = false; MICROPROFILE_SCOPE(OpenGL_VAO); @@ -181,7 +184,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { } // Rebinding the VAO invalidates the vertex buffer bindings. - gpu.dirty_flags.vertex_array.set(); + gpu.dirty.ResetVertexArrays(); state.draw.vertex_array = vao_entry.handle; return vao_entry.handle; @@ -189,17 +192,20 @@ GLuint RasterizerOpenGL::SetupVertexFormat() { void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { auto& gpu = system.GPU().Maxwell3D(); - const auto& regs = gpu.regs; - - if (gpu.dirty_flags.vertex_array.none()) + if (!gpu.dirty.vertex_array_buffers) return; + gpu.dirty.vertex_array_buffers = false; + + const auto& regs = gpu.regs; MICROPROFILE_SCOPE(OpenGL_VB); // Upload all guest vertex arrays sequentially to our buffer for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - if (!gpu.dirty_flags.vertex_array[index]) + if (!gpu.dirty.vertex_array[index]) continue; + gpu.dirty.vertex_array[index] = false; + gpu.dirty.vertex_instance[index] = false; const auto& vertex_array = regs.vertex_array[index]; if (!vertex_array.IsEnabled()) @@ -224,8 +230,32 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) { glVertexArrayBindingDivisor(vao, index, 0); } } +} + +void RasterizerOpenGL::SetupVertexInstances(GLuint vao) { + auto& gpu = system.GPU().Maxwell3D(); + + if (!gpu.dirty.vertex_instances) + return; + gpu.dirty.vertex_instances = false; + + const auto& regs = gpu.regs; + // Upload all guest vertex arrays sequentially to our buffer + for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { + if (!gpu.dirty.vertex_instance[index]) + continue; + + gpu.dirty.vertex_instance[index] = false; - gpu.dirty_flags.vertex_array.reset(); + if (regs.instanced_arrays.IsInstancingEnabled(index) && + regs.vertex_array[index].divisor != 0) { + // Enable vertex buffer instancing with the specified divisor. + glVertexArrayBindingDivisor(vao, index, regs.vertex_array[index].divisor); + } else { + // Disable the vertex buffer instancing. 
+ glVertexArrayBindingDivisor(vao, index, 0); + } + } } GLintptr RasterizerOpenGL::SetupIndexBuffer() { @@ -298,9 +328,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { Shader shader{shader_cache.GetStageProgram(program)}; - const auto stage_enum{static_cast<Maxwell::ShaderStage>(stage)}; + const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage); SetupDrawConstBuffers(stage_enum, shader); - SetupGlobalRegions(stage_enum, shader); + SetupDrawGlobalMemory(stage_enum, shader); const auto texture_buffer_usage{SetupTextures(stage_enum, shader, base_bindings)}; const ProgramVariant variant{base_bindings, primitive_mode, texture_buffer_usage}; @@ -341,7 +371,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { SyncClipEnabled(clip_distances); - gpu.dirty_flags.shaders = false; + gpu.dirty.shaders = false; } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { @@ -424,13 +454,13 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents, single_color_target}; - if (fb_config_state == current_framebuffer_config_state && - gpu.dirty_flags.color_buffer.none() && !gpu.dirty_flags.zeta_buffer) { + if (fb_config_state == current_framebuffer_config_state && !gpu.dirty.render_settings) { // Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or // single color targets). This is done because the guest registers may not change but the // host framebuffer may contain different attachments return current_depth_stencil_usage; } + gpu.dirty.render_settings = false; current_framebuffer_config_state = fb_config_state; texture_cache.GuardRenderTargets(true); @@ -519,13 +549,71 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( return current_depth_stencil_usage = {static_cast<bool>(depth_surface), fbkey.stencil_enable}; } +void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, + bool using_depth_fb, bool using_stencil_fb) { + auto& gpu = system.GPU().Maxwell3D(); + const auto& regs = gpu.regs; + + texture_cache.GuardRenderTargets(true); + View color_surface{}; + if (using_color_fb) { + color_surface = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT, false); + } + View depth_surface{}; + if (using_depth_fb || using_stencil_fb) { + depth_surface = texture_cache.GetDepthBufferSurface(false); + } + texture_cache.GuardRenderTargets(false); + + current_state.draw.draw_framebuffer = clear_framebuffer.handle; + current_state.ApplyFramebufferState(); + + if (color_surface) { + color_surface->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER); + } else { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); + } + + if (depth_surface) { + const auto& params = depth_surface->GetSurfaceParams(); + switch (params.type) { + case VideoCore::Surface::SurfaceType::Depth: { + depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); + break; + } + case VideoCore::Surface::SurfaceType::DepthStencil: { + depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + break; + } + default: { UNIMPLEMENTED(); } + } + } else { + glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, + 0); + } +} + void RasterizerOpenGL::Clear() { - const auto& regs = system.GPU().Maxwell3D().regs; + const auto& maxwell3d = 
system.GPU().Maxwell3D(); + + if (!maxwell3d.ShouldExecute()) { + return; + } + + const auto& regs = maxwell3d.regs; bool use_color{}; bool use_depth{}; bool use_stencil{}; - OpenGLState clear_state; + OpenGLState prev_state{OpenGLState::GetCurState()}; + SCOPE_EXIT({ + prev_state.AllDirty(); + prev_state.Apply(); + }); + + OpenGLState clear_state{OpenGLState::GetCurState()}; + clear_state.SetDefaultViewports(); if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B || regs.clear_buffers.A) { use_color = true; @@ -545,6 +633,7 @@ void RasterizerOpenGL::Clear() { // true. clear_state.depth.test_enabled = true; clear_state.depth.test_func = GL_ALWAYS; + clear_state.depth.write_mask = GL_TRUE; } if (regs.clear_buffers.S) { ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!"); @@ -581,8 +670,9 @@ void RasterizerOpenGL::Clear() { return; } - const auto [clear_depth, clear_stencil] = ConfigureFramebuffers( - clear_state, use_color, use_depth || use_stencil, false, regs.clear_buffers.RT.Value()); + ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil); + + SyncViewport(clear_state); if (regs.clear_flags.scissor) { SyncScissorTest(clear_state); } @@ -591,21 +681,18 @@ void RasterizerOpenGL::Clear() { clear_state.EmulateViewportWithScissor(); } - clear_state.ApplyColorMask(); - clear_state.ApplyDepth(); - clear_state.ApplyStencilTest(); - clear_state.ApplyViewport(); - clear_state.ApplyFramebufferState(); + clear_state.AllDirty(); + clear_state.Apply(); if (use_color) { - glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); + glClearBufferfv(GL_COLOR, 0, regs.clear_color); } - if (clear_depth && clear_stencil) { + if (use_depth && use_stencil) { glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil); - } else if (clear_depth) { + } else if (use_depth) { glClearBufferfv(GL_DEPTH, 0, &regs.clear_depth); - } else if (clear_stencil) { + } else if (use_stencil) { glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil); } } @@ -616,6 +703,11 @@ void RasterizerOpenGL::DrawArrays() { MICROPROFILE_SCOPE(OpenGL_Drawing); auto& gpu = system.GPU().Maxwell3D(); + + if (!gpu.ShouldExecute()) { + return; + } + const auto& regs = gpu.regs; SyncColorMask(); @@ -661,6 +753,7 @@ void RasterizerOpenGL::DrawArrays() { // Upload vertex and index data. SetupVertexBuffer(vao); + SetupVertexInstances(vao); const GLintptr index_buffer_offset = SetupIndexBuffer(); // Setup draw parameters. It will automatically choose what glDraw* method to use. @@ -687,7 +780,7 @@ void RasterizerOpenGL::DrawArrays() { if (invalidate) { // As all cached buffers are invalidated, we need to recheck their state. 
- gpu.dirty_flags.vertex_array.set(); + gpu.dirty.ResetVertexArrays(); } shader_program_manager->ApplyTo(state); @@ -700,6 +793,46 @@ void RasterizerOpenGL::DrawArrays() { params.DispatchDraw(); accelerate_draw = AccelDraw::Disabled; + gpu.dirty.memory_general = false; +} + +void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { + if (!GLAD_GL_ARB_compute_variable_group_size) { + LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the " + "lack of GL_ARB_compute_variable_group_size"); + return; + } + + auto kernel = shader_cache.GetComputeKernel(code_addr); + const auto [program, next_bindings] = kernel->GetProgramHandle({}); + state.draw.shader_program = program; + state.draw.program_pipeline = 0; + + const std::size_t buffer_size = + Tegra::Engines::KeplerCompute::NumConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + buffer_cache.Map(buffer_size); + + bind_ubo_pushbuffer.Setup(0); + bind_ssbo_pushbuffer.Setup(0); + + SetupComputeConstBuffers(kernel); + SetupComputeGlobalMemory(kernel); + + // TODO(Rodrigo): Bind images and samplers + + buffer_cache.Unmap(); + + bind_ubo_pushbuffer.Bind(); + bind_ssbo_pushbuffer.Bind(); + + state.ApplyShaderProgram(); + state.ApplyProgramPipeline(); + + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + glDispatchComputeGroupSizeARB(launch_desc.grid_dim_x, launch_desc.grid_dim_y, + launch_desc.grid_dim_z, launch_desc.block_dim_x, + launch_desc.block_dim_y, launch_desc.block_dim_z); } void RasterizerOpenGL::FlushAll() {} @@ -775,12 +908,25 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader) { MICROPROFILE_SCOPE(OpenGL_UBO); - const auto stage_index = static_cast<std::size_t>(stage); - const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index]; - - // Upload only the enabled buffers from the 16 constbuffers of each shader stage + const auto& stages = system.GPU().Maxwell3D().state.shader_stages; + const auto& shader_stage = stages[static_cast<std::size_t>(stage)]; for (const auto& entry : shader->GetShaderEntries().const_buffers) { - SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry); + const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; + SetupConstBuffer(buffer, entry); + } +} + +void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { + MICROPROFILE_SCOPE(OpenGL_UBO); + const auto& launch_desc = system.GPU().KeplerCompute().launch_description; + for (const auto& entry : kernel->GetShaderEntries().const_buffers) { + const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; + const std::bitset<8> mask = launch_desc.memory_config.const_buffer_enable_mask.Value(); + Tegra::Engines::ConstBufferInfo buffer; + buffer.address = config.Address(); + buffer.size = config.size; + buffer.enabled = mask[entry.GetIndex()]; + SetupConstBuffer(buffer, entry); } } @@ -801,24 +947,39 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b bind_ubo_pushbuffer.Push(cbuf, offset, size); } -void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - const Shader& shader) { +void RasterizerOpenGL::SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, + const Shader& shader) { auto& gpu{system.GPU()}; auto& memory_manager{gpu.MemoryManager()}; const auto 
cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]}; - const auto alignment{device.GetShaderStorageBufferAlignment()}; - for (const auto& entry : shader->GetShaderEntries().global_memory_entries) { const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()}; - const auto actual_addr{memory_manager.Read<u64>(addr)}; + const auto gpu_addr{memory_manager.Read<u64>(addr)}; const auto size{memory_manager.Read<u32>(addr + 8)}; + SetupGlobalMemory(entry, gpu_addr, size); + } +} - const auto [ssbo, buffer_offset] = - buffer_cache.UploadMemory(actual_addr, size, alignment, true, entry.IsWritten()); - bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); +void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) { + auto& gpu{system.GPU()}; + auto& memory_manager{gpu.MemoryManager()}; + const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config}; + for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) { + const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; + const auto gpu_addr{memory_manager.Read<u64>(addr)}; + const auto size{memory_manager.Read<u32>(addr + 8)}; + SetupGlobalMemory(entry, gpu_addr, size); } } +void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, + GPUVAddr gpu_addr, std::size_t size) { + const auto alignment{device.GetShaderStorageBufferAlignment()}; + const auto [ssbo, buffer_offset] = + buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten()); + bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size)); +} + TextureBufferUsage RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader, BaseBindings base_bindings) { MICROPROFILE_SCOPE(OpenGL_Texture); @@ -907,10 +1068,11 @@ void RasterizerOpenGL::SyncClipCoef() { } void RasterizerOpenGL::SyncCullMode() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); - state.cull.enabled = regs.cull.enabled != 0; + const auto& regs = maxwell3d.regs; + state.cull.enabled = regs.cull.enabled != 0; if (state.cull.enabled) { state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face); state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face); @@ -943,16 +1105,21 @@ void RasterizerOpenGL::SyncDepthTestState() { state.depth.test_enabled = regs.depth_test_enable != 0; state.depth.write_mask = regs.depth_write_enabled ? 
GL_TRUE : GL_FALSE; - if (!state.depth.test_enabled) + if (!state.depth.test_enabled) { return; + } state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func); } void RasterizerOpenGL::SyncStencilTestState() { - const auto& regs = system.GPU().Maxwell3D().regs; - state.stencil.test_enabled = regs.stencil_enable != 0; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.stencil_test) { + return; + } + const auto& regs = maxwell3d.regs; + state.stencil.test_enabled = regs.stencil_enable != 0; if (!regs.stencil_enable) { return; } @@ -981,10 +1148,17 @@ void RasterizerOpenGL::SyncStencilTestState() { state.stencil.back.action_depth_fail = GL_KEEP; state.stencil.back.action_depth_pass = GL_KEEP; } + state.MarkDirtyStencilState(); + maxwell3d.dirty.stencil_test = false; } void RasterizerOpenGL::SyncColorMask() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.color_mask) { + return; + } + const auto& regs = maxwell3d.regs; + const std::size_t count = regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1; for (std::size_t i = 0; i < count; i++) { @@ -995,6 +1169,9 @@ void RasterizerOpenGL::SyncColorMask() { dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE; dest.alpha_enabled = (source.A == 0) ? GL_FALSE : GL_TRUE; } + + state.MarkDirtyColorMask(); + maxwell3d.dirty.color_mask = false; } void RasterizerOpenGL::SyncMultiSampleState() { @@ -1009,7 +1186,11 @@ void RasterizerOpenGL::SyncFragmentColorClampState() { } void RasterizerOpenGL::SyncBlendState() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.blend_state) { + return; + } + const auto& regs = maxwell3d.regs; state.blend_color.red = regs.blend_color.r; state.blend_color.green = regs.blend_color.g; @@ -1032,6 +1213,8 @@ void RasterizerOpenGL::SyncBlendState() { for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { state.blend[i].enabled = false; } + maxwell3d.dirty.blend_state = false; + state.MarkDirtyBlendState(); return; } @@ -1048,6 +1231,9 @@ void RasterizerOpenGL::SyncBlendState() { blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a); blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a); } + + state.MarkDirtyBlendState(); + maxwell3d.dirty.blend_state = false; } void RasterizerOpenGL::SyncLogicOpState() { @@ -1099,13 +1285,21 @@ void RasterizerOpenGL::SyncPointState() { } void RasterizerOpenGL::SyncPolygonOffset() { - const auto& regs = system.GPU().Maxwell3D().regs; + auto& maxwell3d = system.GPU().Maxwell3D(); + if (!maxwell3d.dirty.polygon_offset) { + return; + } + const auto& regs = maxwell3d.regs; + state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0; state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0; state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0; state.polygon_offset.units = regs.polygon_offset_units; state.polygon_offset.factor = regs.polygon_offset_factor; state.polygon_offset.clamp = regs.polygon_offset_clamp; + + state.MarkDirtyPolygonOffset(); + maxwell3d.dirty.polygon_offset = false; } void RasterizerOpenGL::SyncAlphaTest() { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 40b571d58..8b123c48d 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -58,6 +58,7 @@ 
public: void DrawArrays() override; void Clear() override; + void DispatchCompute(GPUVAddr code_addr) override; void FlushAll() override; void FlushRegion(CacheAddr addr, u64 size) override; void InvalidateRegion(CacheAddr addr, u64 size) override; @@ -108,17 +109,30 @@ private: OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true, bool preserve_contents = true, std::optional<std::size_t> single_color_target = {}); + void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb, + bool using_depth_fb, bool using_stencil_fb); + /// Configures the current constbuffers to use for the draw command. void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader); + /// Configures the current constbuffers to use for the kernel invocation. + void SetupComputeConstBuffers(const Shader& kernel); + /// Configures a constant buffer. void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer, const GLShader::ConstBufferEntry& entry); /// Configures the current global memory entries to use for the draw command. - void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, - const Shader& shader); + void SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, + const Shader& shader); + + /// Configures the current global memory entries to use for the kernel invocation. + void SetupComputeGlobalMemory(const Shader& kernel); + + /// Configures a constant buffer. + void SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr, + std::size_t size); /// Configures the current textures to use for the draw command. Returns shaders texture buffer /// usage. @@ -216,6 +230,7 @@ private: GLuint SetupVertexFormat(); void SetupVertexBuffer(GLuint vao); + void SetupVertexInstances(GLuint vao); GLintptr SetupIndexBuffer(); @@ -226,6 +241,8 @@ private: enum class AccelDraw { Disabled, Arrays, Indexed }; AccelDraw accelerate_draw = AccelDraw::Disabled; + OGLFramebuffer clear_framebuffer; + using CachedPageMap = boost::icl::interval_map<u64, int>; CachedPageMap cached_pages; }; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 32dd9eae7..1c90facc3 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -23,13 +23,13 @@ namespace OpenGL { using VideoCommon::Shader::ProgramCode; -// One UBO is always reserved for emulation values -constexpr u32 RESERVED_UBOS = 1; +// One UBO is always reserved for emulation values on staged shaders +constexpr u32 STAGE_RESERVED_UBOS = 1; struct UnspecializedShader { std::string code; GLShader::ShaderEntries entries; - Maxwell::ShaderProgram program_type; + ProgramType program_type; }; namespace { @@ -55,15 +55,17 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr g } /// Gets the shader type from a Maxwell program type -constexpr GLenum GetShaderType(Maxwell::ShaderProgram program_type) { +constexpr GLenum GetShaderType(ProgramType program_type) { switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: + case ProgramType::VertexA: + case ProgramType::VertexB: return GL_VERTEX_SHADER; - case Maxwell::ShaderProgram::Geometry: + case ProgramType::Geometry: return GL_GEOMETRY_SHADER; - case Maxwell::ShaderProgram::Fragment: + case ProgramType::Fragment: return GL_FRAGMENT_SHADER; + case ProgramType::Compute: + return 
GL_COMPUTE_SHADER; default: return GL_NONE; } @@ -100,6 +102,25 @@ constexpr std::tuple<const char*, const char*, u32> GetPrimitiveDescription(GLen } } +ProgramType GetProgramType(Maxwell::ShaderProgram program) { + switch (program) { + case Maxwell::ShaderProgram::VertexA: + return ProgramType::VertexA; + case Maxwell::ShaderProgram::VertexB: + return ProgramType::VertexB; + case Maxwell::ShaderProgram::TesselationControl: + return ProgramType::TessellationControl; + case Maxwell::ShaderProgram::TesselationEval: + return ProgramType::TessellationEval; + case Maxwell::ShaderProgram::Geometry: + return ProgramType::Geometry; + case Maxwell::ShaderProgram::Fragment: + return ProgramType::Fragment; + } + UNREACHABLE(); + return {}; +} + /// Calculates the size of a program stream std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) { constexpr std::size_t start_offset = 10; @@ -128,13 +149,13 @@ std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) { } /// Hashes one (or two) program streams -u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& code, +u64 GetUniqueIdentifier(ProgramType program_type, const ProgramCode& code, const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) { if (size_a == 0) { size_a = CalculateProgramSize(code); } u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a); - if (program_type != Maxwell::ShaderProgram::VertexA) { + if (program_type != ProgramType::VertexA) { return unique_identifier; } // VertexA programs include two programs @@ -152,12 +173,12 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& } /// Creates an unspecialized program from code streams -GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type, +GLShader::ProgramResult CreateProgram(const Device& device, ProgramType program_type, ProgramCode program_code, ProgramCode program_code_b) { GLShader::ShaderSetup setup(program_code); setup.program.size_a = CalculateProgramSize(program_code); setup.program.size_b = 0; - if (program_type == Maxwell::ShaderProgram::VertexA) { + if (program_type == ProgramType::VertexA) { // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders. // Conventional HW does not support this, so we combine VertexA and VertexB into one // stage here. 
@@ -168,22 +189,23 @@ GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgr program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b); switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: + case ProgramType::VertexA: + case ProgramType::VertexB: return GLShader::GenerateVertexShader(device, setup); - case Maxwell::ShaderProgram::Geometry: + case ProgramType::Geometry: return GLShader::GenerateGeometryShader(device, setup); - case Maxwell::ShaderProgram::Fragment: + case ProgramType::Fragment: return GLShader::GenerateFragmentShader(device, setup); + case ProgramType::Compute: + return GLShader::GenerateComputeShader(device, setup); default: - LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type)); - UNREACHABLE(); + UNIMPLEMENTED_MSG("Unimplemented program_type={}", static_cast<u32>(program_type)); return {}; } } CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEntries& entries, - Maxwell::ShaderProgram program_type, const ProgramVariant& variant, + ProgramType program_type, const ProgramVariant& variant, bool hint_retrievable = false) { auto base_bindings{variant.base_bindings}; const auto primitive_mode{variant.primitive_mode}; @@ -194,7 +216,14 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn if (entries.shader_viewport_layer_array) { source += "#extension GL_ARB_shader_viewport_layer_array : enable\n"; } - source += fmt::format("\n#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); + if (program_type == ProgramType::Compute) { + source += "#extension GL_ARB_compute_variable_group_size : require\n"; + } + source += '\n'; + + if (program_type != ProgramType::Compute) { + source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++); + } for (const auto& cbuf : entries.const_buffers) { source += @@ -221,13 +250,16 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i); } - if (program_type == Maxwell::ShaderProgram::Geometry) { + if (program_type == ProgramType::Geometry) { const auto [glsl_topology, debug_name, max_vertices] = GetPrimitiveDescription(primitive_mode); source += "layout (" + std::string(glsl_topology) + ") in;\n"; source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n'; } + if (program_type == ProgramType::Compute) { + source += "layout (local_size_variable) in;\n"; + } source += code; @@ -255,7 +287,7 @@ std::set<GLenum> GetSupportedFormats() { } // Anonymous namespace -CachedShader::CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type, +CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type, GLShader::ProgramResult result) : RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr}, unique_identifier{params.unique_identifier}, program_type{program_type}, @@ -268,29 +300,50 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, ProgramCode&& program_code_b) { const auto code_size{CalculateProgramSize(program_code)}; const auto code_size_b{CalculateProgramSize(program_code_b)}; - auto result{CreateProgram(params.device, program_type, program_code, program_code_b)}; + auto result{ + CreateProgram(params.device, GetProgramType(program_type), program_code, program_code_b)}; if (result.first.empty()) { // TODO(Rodrigo): Unimplemented 
shader stages hit here, avoid using these for now return {}; } params.disk_cache.SaveRaw(ShaderDiskCacheRaw( - params.unique_identifier, program_type, static_cast<u32>(code_size / sizeof(u64)), - static_cast<u32>(code_size_b / sizeof(u64)), std::move(program_code), - std::move(program_code_b))); + params.unique_identifier, GetProgramType(program_type), + static_cast<u32>(code_size / sizeof(u64)), static_cast<u32>(code_size_b / sizeof(u64)), + std::move(program_code), std::move(program_code_b))); - return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result))); + return std::shared_ptr<CachedShader>( + new CachedShader(params, GetProgramType(program_type), std::move(result))); } Shader CachedShader::CreateStageFromCache(const ShaderParameters& params, Maxwell::ShaderProgram program_type, GLShader::ProgramResult result) { - return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result))); + return std::shared_ptr<CachedShader>( + new CachedShader(params, GetProgramType(program_type), std::move(result))); +} + +Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code) { + auto result{CreateProgram(params.device, ProgramType::Compute, code, {})}; + + const auto code_size{CalculateProgramSize(code)}; + params.disk_cache.SaveRaw(ShaderDiskCacheRaw(params.unique_identifier, ProgramType::Compute, + static_cast<u32>(code_size / sizeof(u64)), 0, + std::move(code), {})); + + return std::shared_ptr<CachedShader>( + new CachedShader(params, ProgramType::Compute, std::move(result))); +} + +Shader CachedShader::CreateKernelFromCache(const ShaderParameters& params, + GLShader::ProgramResult result) { + return std::shared_ptr<CachedShader>( + new CachedShader(params, ProgramType::Compute, std::move(result))); } std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) { GLuint handle{}; - if (program_type == Maxwell::ShaderProgram::Geometry) { + if (program_type == ProgramType::Geometry) { handle = GetGeometryShader(variant); } else { const auto [entry, is_cache_miss] = programs.try_emplace(variant); @@ -308,8 +361,11 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVar handle = program->handle; } - auto base_bindings{variant.base_bindings}; - base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()) + RESERVED_UBOS; + auto base_bindings = variant.base_bindings; + base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()); + if (program_type != ProgramType::Compute) { + base_bindings.cbuf += STAGE_RESERVED_UBOS; + } base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size()); base_bindings.sampler += static_cast<u32>(entries.samplers.size()); @@ -572,7 +628,7 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia } Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { - if (!system.GPU().Maxwell3D().dirty_flags.shaders) { + if (!system.GPU().Maxwell3D().dirty.shaders) { return last_shaders[static_cast<std::size_t>(program)]; } @@ -589,13 +645,15 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { // No shader found - create a new one ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)}; ProgramCode program_code_b; - if (program == Maxwell::ShaderProgram::VertexA) { + const bool is_program_a{program == Maxwell::ShaderProgram::VertexA}; + if (is_program_a) { const GPUVAddr 
program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)}; program_code_b = GetShaderCode(memory_manager, program_addr_b, memory_manager.GetPointer(program_addr_b)); } - const auto unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); + const auto unique_identifier = + GetUniqueIdentifier(GetProgramType(program), program_code, program_code_b); const auto cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)}; const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr, host_ptr, unique_identifier}; @@ -612,4 +670,30 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { return last_shaders[static_cast<std::size_t>(program)] = shader; } +Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) { + auto& memory_manager{system.GPU().MemoryManager()}; + const auto host_ptr{memory_manager.GetPointer(code_addr)}; + auto kernel = TryGet(host_ptr); + if (kernel) { + return kernel; + } + + // No kernel found - create a new one + auto code{GetShaderCode(memory_manager, code_addr, host_ptr)}; + const auto unique_identifier{GetUniqueIdentifier(ProgramType::Compute, code, {})}; + const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)}; + const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr, + host_ptr, unique_identifier}; + + const auto found = precompiled_shaders.find(unique_identifier); + if (found == precompiled_shaders.end()) { + kernel = CachedShader::CreateKernelFromMemory(params, std::move(code)); + } else { + kernel = CachedShader::CreateKernelFromCache(params, found->second); + } + + Register(kernel); + return kernel; +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index bbb53cdf4..a3106a0ff 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -61,6 +61,11 @@ public: Maxwell::ShaderProgram program_type, GLShader::ProgramResult result); + static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code); + + static Shader CreateKernelFromCache(const ShaderParameters& params, + GLShader::ProgramResult result); + VAddr GetCpuAddr() const override { return cpu_addr; } @@ -78,7 +83,7 @@ public: std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant); private: - explicit CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type, + explicit CachedShader(const ShaderParameters& params, ProgramType program_type, GLShader::ProgramResult result); // Geometry programs. These are needed because GLSL needs an input topology but it's not @@ -104,7 +109,7 @@ private: u8* host_ptr{}; VAddr cpu_addr{}; u64 unique_identifier{}; - Maxwell::ShaderProgram program_type{}; + ProgramType program_type{}; ShaderDiskCacheOpenGL& disk_cache; const PrecompiledPrograms& precompiled_programs; @@ -132,6 +137,9 @@ public: /// Gets the current specified shader stage program Shader GetStageProgram(Maxwell::ShaderProgram program); + /// Gets a compute kernel in the passed address + Shader GetComputeKernel(GPUVAddr code_addr); + protected: // We do not have to flush this cache as things in it are never modified by us. 
void FlushObjectInner(const Shader& object) override {} diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 119073776..ffe26b241 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -37,7 +37,6 @@ using namespace std::string_literals; using namespace VideoCommon::Shader; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage; using Operation = const OperationNode&; enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat }; @@ -162,9 +161,13 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } +constexpr bool IsVertexShader(ProgramType stage) { + return stage == ProgramType::VertexA || stage == ProgramType::VertexB; +} + class GLSLDecompiler final { public: - explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage, + explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ProgramType stage, std::string suffix) : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} @@ -248,25 +251,21 @@ public: } entries.clip_distances = ir.GetClipDistances(); entries.shader_viewport_layer_array = - stage == ShaderStage::Vertex && (ir.UsesLayer() || ir.UsesViewportIndex()); + IsVertexShader(stage) && (ir.UsesLayer() || ir.UsesViewportIndex()); entries.shader_length = ir.GetLength(); return entries; } private: - using OperationDecompilerFn = std::string (GLSLDecompiler::*)(Operation); - using OperationDecompilersArray = - std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>; - void DeclareVertex() { - if (stage != ShaderStage::Vertex) + if (!IsVertexShader(stage)) return; DeclareVertexRedeclarations(); } void DeclareGeometry() { - if (stage != ShaderStage::Geometry) { + if (stage != ProgramType::Geometry) { return; } @@ -297,14 +296,14 @@ private: break; } } - if (stage != ShaderStage::Vertex || device.HasVertexViewportLayer()) { + if (!IsVertexShader(stage) || device.HasVertexViewportLayer()) { if (ir.UsesLayer()) { code.AddLine("int gl_Layer;"); } if (ir.UsesViewportIndex()) { code.AddLine("int gl_ViewportIndex;"); } - } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderStage::Vertex && + } else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && IsVertexShader(stage) && !device.HasVertexViewportLayer()) { LOG_ERROR( Render_OpenGL, @@ -341,11 +340,16 @@ private: } void DeclareLocalMemory() { - if (const u64 local_memory_size = header.GetLocalMemorySize(); local_memory_size > 0) { - const auto element_count = Common::AlignUp(local_memory_size, 4) / 4; - code.AddLine("float {}[{}];", GetLocalMemory(), element_count); - code.AddNewLine(); + // TODO(Rodrigo): Unstub kernel local memory size and pass it from a register at + // specialization time. + const u64 local_memory_size = + stage == ProgramType::Compute ? 
0x400 : header.GetLocalMemorySize(); + if (local_memory_size == 0) { + return; } + const auto element_count = Common::AlignUp(local_memory_size, 4) / 4; + code.AddLine("float {}[{}];", GetLocalMemory(), element_count); + code.AddNewLine(); } void DeclareInternalFlags() { @@ -399,12 +403,12 @@ private: const u32 location{GetGenericAttributeIndex(index)}; std::string name{GetInputAttribute(index)}; - if (stage == ShaderStage::Geometry) { + if (stage == ProgramType::Geometry) { name = "gs_" + name + "[]"; } std::string suffix; - if (stage == ShaderStage::Fragment) { + if (stage == ProgramType::Fragment) { const auto input_mode{header.ps.GetAttributeUse(location)}; if (skip_unused && input_mode == AttributeUse::Unused) { return; @@ -416,7 +420,7 @@ private: } void DeclareOutputAttributes() { - if (ir.HasPhysicalAttributes() && stage != ShaderStage::Fragment) { + if (ir.HasPhysicalAttributes() && stage != ProgramType::Fragment) { for (u32 i = 0; i < GetNumPhysicalVaryings(); ++i) { DeclareOutputAttribute(ToGenericAttribute(i)); } @@ -538,7 +542,7 @@ private: constexpr u32 element_stride{4}; const u32 address{generic_base + index * generic_stride + element * element_stride}; - const bool declared{stage != ShaderStage::Fragment || + const bool declared{stage != ProgramType::Fragment || header.ps.GetAttributeUse(index) != AttributeUse::Unused}; const std::string value{declared ? ReadAttribute(attribute, element) : "0"}; code.AddLine("case 0x{:x}: return {};", address, value); @@ -642,7 +646,7 @@ private: } if (const auto abuf = std::get_if<AbufNode>(&*node)) { - UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ShaderStage::Geometry, + UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ProgramType::Geometry, "Physical attributes in geometry shaders are not implemented"); if (abuf->IsPhysicalBuffer()) { return fmt::format("readPhysicalAttribute(ftou({}))", @@ -697,6 +701,9 @@ private: } if (const auto lmem = std::get_if<LmemNode>(&*node)) { + if (stage == ProgramType::Compute) { + LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders"); + } return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress())); } @@ -726,7 +733,7 @@ private: std::string ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) { const auto GeometryPass = [&](std::string_view name) { - if (stage == ShaderStage::Geometry && buffer) { + if (stage == ProgramType::Geometry && buffer) { // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games // set an 0x80000000 index for those and the shader fails to build. Find out why // this happens and what's its intent. @@ -738,10 +745,10 @@ private: switch (attribute) { case Attribute::Index::Position: switch (stage) { - case ShaderStage::Geometry: + case ProgramType::Geometry: return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer), GetSwizzle(element)); - case ShaderStage::Fragment: + case ProgramType::Fragment: return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element)); default: UNREACHABLE(); @@ -762,7 +769,7 @@ private: // TODO(Subv): Find out what the values are for the first two elements when inside a // vertex shader, and what's the value of the fourth element when inside a Tess Eval // shader. - ASSERT(stage == ShaderStage::Vertex); + ASSERT(IsVertexShader(stage)); switch (element) { case 2: // Config pack's first value is instance_id. 
@@ -774,7 +781,7 @@ private: return "0"; case Attribute::Index::FrontFacing: // TODO(Subv): Find out what the values are for the other elements. - ASSERT(stage == ShaderStage::Fragment); + ASSERT(stage == ProgramType::Fragment); switch (element) { case 3: return "itof(gl_FrontFacing ? -1 : 0)"; @@ -796,7 +803,7 @@ private: return value; } // There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders - const std::string precise = stage != ShaderStage::Fragment ? "precise " : ""; + const std::string precise = stage != ProgramType::Fragment ? "precise " : ""; const std::string temporary = code.GenerateTemporary(); code.AddLine("{}float {} = {};", precise, temporary, value); @@ -831,12 +838,12 @@ private: UNIMPLEMENTED(); return {}; case 1: - if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) { + if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { return {}; } return std::make_pair("gl_Layer", true); case 2: - if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) { + if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) { return {}; } return std::make_pair("gl_ViewportIndex", true); @@ -1073,6 +1080,9 @@ private: target = result->first; is_integer = result->second; } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) { + if (stage == ProgramType::Compute) { + LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders"); + } target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress())); } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { const std::string real = Visit(gmem->GetRealAddress()); @@ -1400,14 +1410,10 @@ private: return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint)); } - std::string LogicalAll2(Operation operation) { + std::string LogicalAnd2(Operation operation) { return GenerateUnary(operation, "all", Type::Bool, Type::Bool2); } - std::string LogicalAny2(Operation operation) { - return GenerateUnary(operation, "any", Type::Bool, Type::Bool2); - } - template <bool with_nan> std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) { const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2, @@ -1630,7 +1636,7 @@ private: } std::string Exit(Operation operation) { - if (stage != ShaderStage::Fragment) { + if (stage != ProgramType::Fragment) { code.AddLine("return;"); return {}; } @@ -1681,7 +1687,7 @@ private: } std::string EmitVertex(Operation operation) { - ASSERT_MSG(stage == ShaderStage::Geometry, + ASSERT_MSG(stage == ProgramType::Geometry, "EmitVertex is expected to be used in a geometry shader."); // If a geometry shader is attached, it will always flip (it's the last stage before @@ -1692,7 +1698,7 @@ private: } std::string EndPrimitive(Operation operation) { - ASSERT_MSG(stage == ShaderStage::Geometry, + ASSERT_MSG(stage == ProgramType::Geometry, "EndPrimitive is expected to be used in a geometry shader."); code.AddLine("EndPrimitive();"); @@ -1714,7 +1720,7 @@ private: return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')'; } - static constexpr OperationDecompilersArray operation_decompilers = { + static constexpr std::array operation_decompilers = { &GLSLDecompiler::Assign, &GLSLDecompiler::Select, @@ -1798,8 +1804,7 @@ private: &GLSLDecompiler::LogicalXor, &GLSLDecompiler::LogicalNegate, &GLSLDecompiler::LogicalPick2, - &GLSLDecompiler::LogicalAll2, - &GLSLDecompiler::LogicalAny2, + &GLSLDecompiler::LogicalAnd2, 
&GLSLDecompiler::LogicalLessThan<Type::Float>, &GLSLDecompiler::LogicalEqual<Type::Float>, @@ -1863,6 +1868,7 @@ private: &GLSLDecompiler::WorkGroupId<1>, &GLSLDecompiler::WorkGroupId<2>, }; + static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); std::string GetRegister(u32 index) const { return GetDeclarationWithSuffix(index, "gpr"); @@ -1927,7 +1933,7 @@ private: } u32 GetNumPhysicalInputAttributes() const { - return stage == ShaderStage::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); + return IsVertexShader(stage) ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings(); } u32 GetNumPhysicalAttributes() const { @@ -1940,7 +1946,7 @@ private: const Device& device; const ShaderIR& ir; - const ShaderStage stage; + const ProgramType stage; const std::string suffix; const Header header; @@ -1971,7 +1977,7 @@ std::string GetCommonDeclarations() { MAX_CONSTBUFFER_ELEMENTS); } -ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage, +ProgramResult Decompile(const Device& device, const ShaderIR& ir, ProgramType stage, const std::string& suffix) { GLSLDecompiler decompiler(device, ir, stage, suffix); decompiler.Decompile(); diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 02586736d..2ea02f5bf 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -12,14 +12,26 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/shader/shader_ir.h" -namespace OpenGL { -class Device; -} - namespace VideoCommon::Shader { class ShaderIR; } +namespace OpenGL { + +class Device; + +enum class ProgramType : u32 { + VertexA = 0, + VertexB = 1, + TessellationControl = 2, + TessellationEval = 3, + Geometry = 4, + Fragment = 5, + Compute = 6 +}; + +} // namespace OpenGL + namespace OpenGL::GLShader { struct ShaderEntries; @@ -85,6 +97,6 @@ struct ShaderEntries { std::string GetCommonDeclarations(); ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, - Maxwell::ShaderStage stage, const std::string& suffix); + ProgramType stage, const std::string& suffix); } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 7893d1e26..969fe9ced 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -51,7 +51,7 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() { } // namespace -ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type, +ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type, u32 program_code_size, u32 program_code_size_b, ProgramCode program_code, ProgramCode program_code_b) : unique_identifier{unique_identifier}, program_type{program_type}, diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index 4f296dda6..cc8bbd61e 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -18,7 +18,6 @@ #include "common/assert.h" #include "common/common_types.h" #include "core/file_sys/vfs_vector.h" -#include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_shader_gen.h" namespace Core { @@ -34,14 +33,11 @@ namespace 
OpenGL { struct ShaderDiskCacheUsage; struct ShaderDiskCacheDump; -using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>; - using ProgramCode = std::vector<u64>; -using Maxwell = Tegra::Engines::Maxwell3D::Regs; - +using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>; using TextureBufferUsage = std::bitset<64>; -/// Allocated bindings used by an OpenGL shader program. +/// Allocated bindings used by an OpenGL shader program struct BaseBindings { u32 cbuf{}; u32 gmem{}; @@ -126,7 +122,7 @@ namespace OpenGL { /// Describes a shader how it's used by the guest GPU class ShaderDiskCacheRaw { public: - explicit ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type, + explicit ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type, u32 program_code_size, u32 program_code_size_b, ProgramCode program_code, ProgramCode program_code_b); ShaderDiskCacheRaw(); @@ -141,30 +137,13 @@ public: } bool HasProgramA() const { - return program_type == Maxwell::ShaderProgram::VertexA; + return program_type == ProgramType::VertexA; } - Maxwell::ShaderProgram GetProgramType() const { + ProgramType GetProgramType() const { return program_type; } - Maxwell::ShaderStage GetProgramStage() const { - switch (program_type) { - case Maxwell::ShaderProgram::VertexA: - case Maxwell::ShaderProgram::VertexB: - return Maxwell::ShaderStage::Vertex; - case Maxwell::ShaderProgram::TesselationControl: - return Maxwell::ShaderStage::TesselationControl; - case Maxwell::ShaderProgram::TesselationEval: - return Maxwell::ShaderStage::TesselationEval; - case Maxwell::ShaderProgram::Geometry: - return Maxwell::ShaderStage::Geometry; - case Maxwell::ShaderProgram::Fragment: - return Maxwell::ShaderStage::Fragment; - } - UNREACHABLE(); - } - const ProgramCode& GetProgramCode() const { return program_code; } @@ -175,7 +154,7 @@ public: private: u64 unique_identifier{}; - Maxwell::ShaderProgram program_type{}; + ProgramType program_type{}; u32 program_code_size{}; u32 program_code_size_b{}; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index f9ee8429e..3a8d9e1da 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -14,7 +14,8 @@ using Tegra::Engines::Maxwell3D; using VideoCommon::Shader::ProgramCode; using VideoCommon::Shader::ShaderIR; -static constexpr u32 PROGRAM_OFFSET{10}; +static constexpr u32 PROGRAM_OFFSET = 10; +static constexpr u32 COMPUTE_OFFSET = 0; ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); @@ -29,17 +30,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config { }; )"; - const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex"); + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); + const auto stage = setup.IsDualProgram() ? 
ProgramType::VertexA : ProgramType::VertexB; + ProgramResult program = Decompile(device, program_ir, stage, "vertex"); out += program.first; if (setup.IsDualProgram()) { const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b); - ProgramResult program_b = - Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b"); - + ProgramResult program_b = Decompile(device, program_ir_b, ProgramType::VertexB, "vertex_b"); out += program_b.first; } @@ -80,9 +79,9 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config { }; )"; + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry"); + ProgramResult program = Decompile(device, program_ir, ProgramType::Geometry, "geometry"); out += program.first; out += R"( @@ -116,9 +115,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config { )"; const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); - ProgramResult program = - Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment"); - + ProgramResult program = Decompile(device, program_ir, ProgramType::Fragment, "fragment"); out += program.first; out += R"( @@ -130,4 +127,22 @@ void main() { return {std::move(out), std::move(program.second)}; } +ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup) { + const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); + + std::string out = "// Shader Unique Id: CS" + id + "\n\n"; + out += GetCommonDeclarations(); + + const ShaderIR program_ir(setup.program.code, COMPUTE_OFFSET, setup.program.size_a); + ProgramResult program = Decompile(device, program_ir, ProgramType::Compute, "compute"); + out += program.first; + + out += R"( +void main() { + execute_compute(); +} +)"; + return {std::move(out), std::move(program.second)}; +} + } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h index 7cbc590f8..3833e88ab 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ b/src/video_core/renderer_opengl/gl_shader_gen.h @@ -54,4 +54,7 @@ ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& se /// Generates the GLSL fragment shader program source code for the given FS program ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup); +/// Generates the GLSL compute shader program source code for the given CS program +ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup); + } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp index 5f3fe067e..9e74eda0d 100644 --- a/src/video_core/renderer_opengl/gl_shader_util.cpp +++ b/src/video_core/renderer_opengl/gl_shader_util.cpp @@ -10,21 +10,25 @@ namespace OpenGL::GLShader { -GLuint LoadShader(const char* source, GLenum type) { - const char* debug_type; +namespace { +const char* GetStageDebugName(GLenum type) { switch (type) { case GL_VERTEX_SHADER: - debug_type = "vertex"; - break; + return "vertex"; case GL_GEOMETRY_SHADER: - debug_type = "geometry"; - break; + return "geometry"; case GL_FRAGMENT_SHADER: - debug_type = "fragment"; - break; - default: - UNREACHABLE(); + return "fragment"; + case GL_COMPUTE_SHADER: + return "compute"; } + 
UNIMPLEMENTED(); + return "unknown"; +} +} // Anonymous namespace + +GLuint LoadShader(const char* source, GLenum type) { + const char* debug_type = GetStageDebugName(type); const GLuint shader_id = glCreateShader(type); glShaderSource(shader_id, 1, &source, nullptr); LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type); diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 0eae98afe..f4777d0b0 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -165,6 +165,25 @@ OpenGLState::OpenGLState() { alpha_test.ref = 0.0f; } +void OpenGLState::SetDefaultViewports() { + for (auto& item : viewports) { + item.x = 0; + item.y = 0; + item.width = 0; + item.height = 0; + item.depth_range_near = 0.0f; + item.depth_range_far = 1.0f; + item.scissor.enabled = false; + item.scissor.x = 0; + item.scissor.y = 0; + item.scissor.width = 0; + item.scissor.height = 0; + } + + depth_clamp.far_plane = false; + depth_clamp.near_plane = false; +} + void OpenGLState::ApplyDefaultState() { glEnable(GL_BLEND); glDisable(GL_FRAMEBUFFER_SRGB); @@ -526,7 +545,7 @@ void OpenGLState::ApplySamplers() const { } } -void OpenGLState::Apply() const { +void OpenGLState::Apply() { MICROPROFILE_SCOPE(OpenGL_State); ApplyFramebufferState(); ApplyVertexArrayState(); @@ -536,19 +555,31 @@ void OpenGLState::Apply() const { ApplyPointSize(); ApplyFragmentColorClamp(); ApplyMultisample(); + if (dirty.color_mask) { + ApplyColorMask(); + dirty.color_mask = false; + } ApplyDepthClamp(); - ApplyColorMask(); ApplyViewport(); - ApplyStencilTest(); + if (dirty.stencil_state) { + ApplyStencilTest(); + dirty.stencil_state = false; + } ApplySRgb(); ApplyCulling(); ApplyDepth(); ApplyPrimitiveRestart(); - ApplyBlending(); + if (dirty.blend_state) { + ApplyBlending(); + dirty.blend_state = false; + } ApplyLogicOp(); ApplyTextures(); ApplySamplers(); - ApplyPolygonOffset(); + if (dirty.polygon_offset) { + ApplyPolygonOffset(); + dirty.polygon_offset = false; + } ApplyAlphaTest(); } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index b0140495d..fdf9a8a12 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -195,8 +195,9 @@ public: s_rgb_used = false; } + void SetDefaultViewports(); /// Apply this state as the current OpenGL state - void Apply() const; + void Apply(); void ApplyFramebufferState() const; void ApplyVertexArrayState() const; @@ -237,11 +238,41 @@ public: /// Viewport does not affects glClearBuffer so emulate viewport using scissor test void EmulateViewportWithScissor(); + void MarkDirtyBlendState() { + dirty.blend_state = true; + } + + void MarkDirtyStencilState() { + dirty.stencil_state = true; + } + + void MarkDirtyPolygonOffset() { + dirty.polygon_offset = true; + } + + void MarkDirtyColorMask() { + dirty.color_mask = true; + } + + void AllDirty() { + dirty.blend_state = true; + dirty.stencil_state = true; + dirty.polygon_offset = true; + dirty.color_mask = true; + } + private: static OpenGLState cur_state; // Workaround for sRGB problems caused by QT not supporting srgb output static bool s_rgb_used; + struct { + bool blend_state; + bool stencil_state; + bool viewport_state; + bool polygon_offset; + bool color_mask; + } dirty{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index b1f6bc7c2..408332f90 100644 --- 
a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -137,7 +137,6 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) { ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size()); const auto& format{tex_format_tuples[static_cast<std::size_t>(pixel_format)]}; - ASSERT(component_type == format.component_type); return format; } @@ -485,11 +484,15 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, const auto& dst_params{dst_view->GetSurfaceParams()}; OpenGLState prev_state{OpenGLState::GetCurState()}; - SCOPE_EXIT({ prev_state.Apply(); }); + SCOPE_EXIT({ + prev_state.AllDirty(); + prev_state.Apply(); + }); OpenGLState state; state.draw.read_framebuffer = src_framebuffer.handle; state.draw.draw_framebuffer = dst_framebuffer.handle; + state.AllDirty(); state.Apply(); u32 buffers{}; diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 9ecdddb0d..a05cef3b9 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -108,6 +108,7 @@ void RendererOpenGL::SwapBuffers( // Maintain the rasterizer's state as a priority OpenGLState prev_state = OpenGLState::GetCurState(); + state.AllDirty(); state.Apply(); if (framebuffer) { @@ -140,6 +141,7 @@ void RendererOpenGL::SwapBuffers( system.GetPerfStats().BeginSystemFrame(); // Restore the rasterizer state + prev_state.AllDirty(); prev_state.Apply(); } @@ -206,6 +208,7 @@ void RendererOpenGL::InitOpenGLObjects() { // Link shaders and get variable locations shader.CreateFromSource(vertex_shader, nullptr, fragment_shader); state.draw.shader_program = shader.handle; + state.AllDirty(); state.Apply(); uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix"); uniform_color_texture = glGetUniformLocation(shader.handle, "color_texture"); @@ -338,12 +341,14 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x, // Workaround brigthness problems in SMO by enabling sRGB in the final output // if it has been used in the frame. 
Needed because of this bug in QT: QTBUG-50987 state.framebuffer_srgb.enabled = OpenGLState::GetsRGBUsed(); + state.AllDirty(); state.Apply(); glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data()); glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); // Restore default state state.framebuffer_srgb.enabled = false; state.texture_units[0].texture = 0; + state.AllDirty(); state.Apply(); // Clear sRGB state for the next frame OpenGLState::ClearsRGBUsed(); @@ -388,6 +393,7 @@ void RendererOpenGL::CaptureScreenshot() { GLuint old_read_fb = state.draw.read_framebuffer; GLuint old_draw_fb = state.draw.draw_framebuffer; state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle; + state.AllDirty(); state.Apply(); Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout}; @@ -407,6 +413,7 @@ void RendererOpenGL::CaptureScreenshot() { screenshot_framebuffer.Release(); state.draw.read_framebuffer = old_read_fb; state.draw.draw_framebuffer = old_draw_fb; + state.AllDirty(); state.Apply(); glDeleteRenderbuffers(1, &renderbuffer); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 9b2d8e987..d267712c9 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -205,10 +205,6 @@ public: } private: - using OperationDecompilerFn = Id (SPIRVDecompiler::*)(Operation); - using OperationDecompilersArray = - std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>; - static constexpr auto INTERNAL_FLAGS_COUNT = static_cast<std::size_t>(InternalFlag::Amount); void AllocateBindings() { @@ -804,12 +800,7 @@ private: return {}; } - Id LogicalAll2(Operation operation) { - UNIMPLEMENTED(); - return {}; - } - - Id LogicalAny2(Operation operation) { + Id LogicalAnd2(Operation operation) { UNIMPLEMENTED(); return {}; } @@ -1206,7 +1197,7 @@ private: return {}; } - static constexpr OperationDecompilersArray operation_decompilers = { + static constexpr std::array operation_decompilers = { &SPIRVDecompiler::Assign, &SPIRVDecompiler::Ternary<&Module::OpSelect, Type::Float, Type::Bool, Type::Float, @@ -1291,8 +1282,7 @@ private: &SPIRVDecompiler::Binary<&Module::OpLogicalNotEqual, Type::Bool>, &SPIRVDecompiler::Unary<&Module::OpLogicalNot, Type::Bool>, &SPIRVDecompiler::LogicalPick2, - &SPIRVDecompiler::LogicalAll2, - &SPIRVDecompiler::LogicalAny2, + &SPIRVDecompiler::LogicalAnd2, &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::Float>, &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::Float>, @@ -1357,6 +1347,7 @@ private: &SPIRVDecompiler::WorkGroupId<1>, &SPIRVDecompiler::WorkGroupId<2>, }; + static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); const VKDevice& device; const ShaderIR& ir; diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp index fdcc970ff..ec3a76690 100644 --- a/src/video_core/shader/control_flow.cpp +++ b/src/video_core/shader/control_flow.cpp @@ -15,7 +15,7 @@ #include "video_core/shader/shader_ir.h" namespace VideoCommon::Shader { - +namespace { using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; @@ -29,8 +29,7 @@ struct Query { struct BlockStack { BlockStack() = default; - BlockStack(const BlockStack& b) = default; - BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {} + explicit BlockStack(const Query& q) 
: ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {} std::stack<u32> ssy_stack{}; std::stack<u32> pbk_stack{}; }; @@ -58,7 +57,7 @@ struct BlockInfo { struct CFGRebuildState { explicit CFGRebuildState(const ProgramCode& program_code, const std::size_t program_size, const u32 start) - : program_code{program_code}, program_size{program_size}, start{start} {} + : start{start}, program_code{program_code}, program_size{program_size} {} u32 start{}; std::vector<BlockInfo> block_info{}; @@ -85,7 +84,7 @@ std::pair<BlockCollision, u32> TryGetBlock(CFGRebuildState& state, u32 address) return {BlockCollision::Inside, index}; } } - return {BlockCollision::None, -1}; + return {BlockCollision::None, 0xFFFFFFFF}; } struct ParseInfo { @@ -365,27 +364,29 @@ bool TryQuery(CFGRebuildState& state) { const auto gather_end = labels.upper_bound(block.end); while (gather_start != gather_end) { cc.push(gather_start->second); - gather_start++; + ++gather_start; } }; if (state.queries.empty()) { return false; } + Query& q = state.queries.front(); const u32 block_index = state.registered[q.address]; BlockInfo& block = state.block_info[block_index]; - // If the block is visted, check if the stacks match, else gather the ssy/pbk + // If the block is visited, check if the stacks match, else gather the ssy/pbk // labels into the current stack and look if the branch at the end of the block // consumes a label. Schedule new queries accordingly if (block.visited) { BlockStack& stack = state.stacks[q.address]; - const bool all_okay = (stack.ssy_stack.size() == 0 || q.ssy_stack == stack.ssy_stack) && - (stack.pbk_stack.size() == 0 || q.pbk_stack == stack.pbk_stack); + const bool all_okay = (stack.ssy_stack.empty() || q.ssy_stack == stack.ssy_stack) && + (stack.pbk_stack.empty() || q.pbk_stack == stack.pbk_stack); state.queries.pop_front(); return all_okay; } block.visited = true; - state.stacks[q.address] = BlockStack{q}; + state.stacks.insert_or_assign(q.address, BlockStack{q}); + Query q2(q); state.queries.pop_front(); gather_labels(q2.ssy_stack, state.ssy_labels, block); @@ -394,6 +395,7 @@ bool TryQuery(CFGRebuildState& state) { q2.address = block.end + 1; state.queries.push_back(q2); } + Query conditional_query{q2}; if (block.branch.is_sync) { if (block.branch.address == unassigned_branch) { @@ -408,13 +410,15 @@ bool TryQuery(CFGRebuildState& state) { conditional_query.pbk_stack.pop(); } conditional_query.address = block.branch.address; - state.queries.push_back(conditional_query); + state.queries.push_back(std::move(conditional_query)); return true; } +} // Anonymous namespace -std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size, - u32 start_address) { +std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, + std::size_t program_size, u32 start_address) { CFGRebuildState state{program_code, program_size, start_address}; + // Inspect Code and generate blocks state.labels.clear(); state.labels.emplace(start_address); @@ -424,10 +428,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u return {}; } } + // Decompile Stacks - Query start_query{}; - start_query.address = state.start; - state.queries.push_back(start_query); + state.queries.push_back(Query{state.start, {}, {}}); bool decompiled = true; while (!state.queries.empty()) { if (!TryQuery(state)) { @@ -435,14 +438,15 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u break; } } + // Sort and organize results 
std::sort(state.block_info.begin(), state.block_info.end(), - [](const BlockInfo& a, const BlockInfo& b) -> bool { return a.start < b.start; }); + [](const BlockInfo& a, const BlockInfo& b) { return a.start < b.start; }); ShaderCharacteristics result_out{}; result_out.decompilable = decompiled; result_out.start = start_address; result_out.end = start_address; - for (auto& block : state.block_info) { + for (const auto& block : state.block_info) { ShaderBlock new_block{}; new_block.start = block.start; new_block.end = block.end; @@ -457,8 +461,9 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u } if (result_out.decompilable) { result_out.labels = std::move(state.labels); - return {result_out}; + return {std::move(result_out)}; } + // If it's not decompilable, merge the unlabelled blocks together auto back = result_out.blocks.begin(); auto next = std::next(back); @@ -469,8 +474,8 @@ std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u continue; } back = next; - next++; + ++next; } - return {result_out}; + return {std::move(result_out)}; } } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h index 5e8ea3271..b0a5e4f8c 100644 --- a/src/video_core/shader/control_flow.h +++ b/src/video_core/shader/control_flow.h @@ -4,7 +4,6 @@ #pragma once -#include <cstring> #include <list> #include <optional> #include <unordered_set> @@ -26,27 +25,44 @@ struct Condition { bool IsUnconditional() const { return predicate == Pred::UnusedIndex && cc == ConditionCode::T; } + bool operator==(const Condition& other) const { return std::tie(predicate, cc) == std::tie(other.predicate, other.cc); } + + bool operator!=(const Condition& other) const { + return !operator==(other); + } }; struct ShaderBlock { - u32 start{}; - u32 end{}; - bool ignore_branch{}; struct Branch { Condition cond{}; bool kills{}; s32 address{}; + bool operator==(const Branch& b) const { return std::tie(cond, kills, address) == std::tie(b.cond, b.kills, b.address); } - } branch{}; + + bool operator!=(const Branch& b) const { + return !operator==(b); + } + }; + + u32 start{}; + u32 end{}; + bool ignore_branch{}; + Branch branch{}; + bool operator==(const ShaderBlock& sb) const { return std::tie(start, end, ignore_branch, branch) == std::tie(sb.start, sb.end, sb.ignore_branch, sb.branch); } + + bool operator!=(const ShaderBlock& sb) const { + return !operator==(sb); + } }; struct ShaderCharacteristics { @@ -57,7 +73,7 @@ struct ShaderCharacteristics { std::unordered_set<u32> labels{}; }; -std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size, - u32 start_address); +std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, + std::size_t program_size, u32 start_address); } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index afffd157f..b547d8323 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -47,14 +47,14 @@ void ShaderIR::Decode() { if (shader_info.decompilable) { disable_flow_stack = true; const auto insert_block = [this](NodeBlock& nodes, u32 label) { - if (label == exit_branch) { + if (label == static_cast<u32>(exit_branch)) { return; } basic_blocks.insert({label, nodes}); }; const auto& blocks = shader_info.blocks; NodeBlock current_block; - u32 current_label = exit_branch; + u32 current_label = static_cast<u32>(exit_branch); for (auto& block : 
blocks) { if (shader_info.labels.count(block.start) != 0) { insert_block(current_block, current_label); diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index 87d8fecaa..1473c282a 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -42,11 +42,14 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { case OpCode::Id::FMUL_R: case OpCode::Id::FMUL_IMM: { // FMUL does not have 'abs' bits and only the second operand has a 'neg' bit. - UNIMPLEMENTED_IF_MSG(instr.fmul.tab5cb8_2 != 0, "FMUL tab5cb8_2({}) is not implemented", - instr.fmul.tab5cb8_2.Value()); - UNIMPLEMENTED_IF_MSG( - instr.fmul.tab5c68_0 != 1, "FMUL tab5cb8_0({}) is not implemented", - instr.fmul.tab5c68_0.Value()); // SMO typical sends 1 here which seems to be the default + if (instr.fmul.tab5cb8_2 != 0) { + LOG_WARNING(HW_GPU, "FMUL tab5cb8_2({}) is not implemented", + instr.fmul.tab5cb8_2.Value()); + } + if (instr.fmul.tab5c68_0 != 1) { + LOG_WARNING(HW_GPU, "FMUL tab5cb8_0({}) is not implemented", + instr.fmul.tab5c68_0.Value()); + } op_b = GetOperandAbsNegFloat(op_b, false, instr.fmul.negate_b); diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp index 7bcf38f23..6466fc011 100644 --- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp @@ -23,7 +23,9 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) { LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); } } else { - UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None); + if (instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } } Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a); diff --git a/src/video_core/shader/decode/ffma.cpp b/src/video_core/shader/decode/ffma.cpp index 29be25ca3..ca2f39e8d 100644 --- a/src/video_core/shader/decode/ffma.cpp +++ b/src/video_core/shader/decode/ffma.cpp @@ -18,10 +18,12 @@ u32 ShaderIR::DecodeFfma(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); UNIMPLEMENTED_IF_MSG(instr.ffma.cc != 0, "FFMA cc not implemented"); - UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_0 != 1, "FFMA tab5980_0({}) not implemented", - instr.ffma.tab5980_0.Value()); // Seems to be 1 by default based on SMO - UNIMPLEMENTED_IF_MSG(instr.ffma.tab5980_1 != 0, "FFMA tab5980_1({}) not implemented", - instr.ffma.tab5980_1.Value()); + if (instr.ffma.tab5980_0 != 1) { + LOG_WARNING(HW_GPU, "FFMA tab5980_0({}) not implemented", instr.ffma.tab5980_0.Value()); + } + if (instr.ffma.tab5980_1 != 0) { + LOG_WARNING(HW_GPU, "FFMA tab5980_1({}) not implemented", instr.ffma.tab5980_1.Value()); + } const Node op_a = GetRegister(instr.gpr8); diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp index d59d15bd8..a6c082cc9 100644 --- a/src/video_core/shader/decode/half_set_predicate.cpp +++ b/src/video_core/shader/decode/half_set_predicate.cpp @@ -18,43 +18,56 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0); + DEBUG_ASSERT(instr.hsetp2.ftz == 0); Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), 
instr.hsetp2.type_a); op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a); - Node op_b = [&]() { - switch (opcode->get().GetId()) { - case OpCode::Id::HSETP2_R: - return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a, - instr.hsetp2.negate_b); - default: - UNREACHABLE(); - return Immediate(0); - } - }(); - op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b); - - // We can't use the constant predicate as destination. - ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex)); - - const Node second_pred = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred != 0); + Tegra::Shader::PredCondition cond{}; + bool h_and{}; + Node op_b{}; + switch (opcode->get().GetId()) { + case OpCode::Id::HSETP2_C: + cond = instr.hsetp2.cbuf_and_imm.cond; + h_and = instr.hsetp2.cbuf_and_imm.h_and; + op_b = GetOperandAbsNegHalf(GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset), + instr.hsetp2.cbuf.abs_b, instr.hsetp2.cbuf.negate_b); + break; + case OpCode::Id::HSETP2_IMM: + cond = instr.hsetp2.cbuf_and_imm.cond; + h_and = instr.hsetp2.cbuf_and_imm.h_and; + op_b = UnpackHalfImmediate(instr, true); + break; + case OpCode::Id::HSETP2_R: + cond = instr.hsetp2.reg.cond; + h_and = instr.hsetp2.reg.h_and; + op_b = + UnpackHalfFloat(GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.reg.abs_b, + instr.hsetp2.reg.negate_b), + instr.hsetp2.reg.type_b); + break; + default: + UNREACHABLE(); + op_b = Immediate(0); + } const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op); - const OperationCode pair_combiner = - instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2; - - const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b); - const Node first_pred = Operation(pair_combiner, comparison); + const Node combined_pred = GetPredicate(instr.hsetp2.pred3, instr.hsetp2.neg_pred); - // Set the primary predicate to the result of Predicate OP SecondPredicate - const Node value = Operation(combiner, first_pred, second_pred); - SetPredicate(bb, instr.hsetp2.pred3, value); + const auto Write = [&](u64 dest, Node src) { + SetPredicate(bb, dest, Operation(combiner, std::move(src), combined_pred)); + }; - if (instr.hsetp2.pred0 != static_cast<u64>(Pred::UnusedIndex)) { - // Set the secondary predicate to the result of !Predicate OP SecondPredicate, if enabled - const Node negated_pred = Operation(OperationCode::LogicalNegate, first_pred); - SetPredicate(bb, instr.hsetp2.pred0, Operation(combiner, negated_pred, second_pred)); + const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b); + const u64 first = instr.hsetp2.pred0; + const u64 second = instr.hsetp2.pred39; + if (h_and) { + const Node joined = Operation(OperationCode::LogicalAnd2, comparison); + Write(first, joined); + Write(second, Operation(OperationCode::LogicalNegate, joined)); + } else { + Write(first, Operation(OperationCode::LogicalPick2, comparison, Immediate(0u))); + Write(second, Operation(OperationCode::LogicalPick2, comparison, Immediate(1u))); } return pc; diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp index c3bcf1ae9..5b44cb79c 100644 --- a/src/video_core/shader/decode/hfma2.cpp +++ b/src/video_core/shader/decode/hfma2.cpp @@ -22,9 +22,9 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); if (opcode->get().GetId() == OpCode::Id::HFMA2_RR) { - UNIMPLEMENTED_IF(instr.hfma2.rr.precision != HalfPrecision::None); + 
DEBUG_ASSERT(instr.hfma2.rr.precision == HalfPrecision::None); } else { - UNIMPLEMENTED_IF(instr.hfma2.precision != HalfPrecision::None); + DEBUG_ASSERT(instr.hfma2.precision == HalfPrecision::None); } constexpr auto identity = HalfType::H0_H1; diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index c0f64d7a0..ac0e764d6 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -22,6 +22,12 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); switch (opcode->get().GetId()) { + case OpCode::Id::NOP: { + UNIMPLEMENTED_IF(instr.nop.cc != Tegra::Shader::ConditionCode::T); + UNIMPLEMENTED_IF(instr.nop.trigger != 0); + // With the previous preconditions, this instruction is a no-operation. + break; + } case OpCode::Id::EXIT: { const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "EXIT condition code used: {}", diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 7427ed896..715184d67 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -101,8 +101,7 @@ enum class OperationCode { LogicalXor, /// (bool a, bool b) -> bool LogicalNegate, /// (bool a) -> bool LogicalPick2, /// (bool2 pair, uint index) -> bool - LogicalAll2, /// (bool2 a) -> bool - LogicalAny2, /// (bool2 a) -> bool + LogicalAnd2, /// (bool2 a) -> bool LogicalFLessThan, /// (float a, float b) -> bool LogicalFEqual, /// (float a, float b) -> bool diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index a53e02253..55f5949e4 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -59,8 +59,8 @@ std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& co return TrackCbuf(source, code, new_cursor); } if (const auto operation = std::get_if<OperationNode>(&*tracked)) { - for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) { - if (auto found = TrackCbuf((*operation)[i], code, cursor); std::get<0>(found)) { + for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) { + if (auto found = TrackCbuf((*operation)[i - 1], code, cursor); std::get<0>(found)) { // Cbuf found in operand. 
return found; } diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 6af9044ca..683c49207 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -24,9 +24,8 @@ StagingCache::StagingCache() = default; StagingCache::~StagingCache() = default; SurfaceBaseImpl::SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params) - : params{params}, mipmap_sizes(params.num_levels), - mipmap_offsets(params.num_levels), gpu_addr{gpu_addr}, host_memory_size{ - params.GetHostSizeInBytes()} { + : params{params}, host_memory_size{params.GetHostSizeInBytes()}, gpu_addr{gpu_addr}, + mipmap_sizes(params.num_levels), mipmap_offsets(params.num_levels) { std::size_t offset = 0; for (u32 level = 0; level < params.num_levels; ++level) { const std::size_t mipmap_size{params.GetGuestMipmapSize(level)}; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 7f9623c62..a3a3770a7 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -116,10 +116,10 @@ public: std::lock_guard lock{mutex}; auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty_flags.zeta_buffer) { + if (!maxwell3d.dirty.depth_buffer) { return depth_buffer.view; } - maxwell3d.dirty_flags.zeta_buffer = false; + maxwell3d.dirty.depth_buffer = false; const auto& regs{maxwell3d.regs}; const auto gpu_addr{regs.zeta.Address()}; @@ -145,10 +145,10 @@ public: std::lock_guard lock{mutex}; ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets); auto& maxwell3d = system.GPU().Maxwell3D(); - if (!maxwell3d.dirty_flags.color_buffer[index]) { + if (!maxwell3d.dirty.render_target[index]) { return render_targets[index].view; } - maxwell3d.dirty_flags.color_buffer.reset(index); + maxwell3d.dirty.render_target[index] = false; const auto& regs{maxwell3d.regs}; if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 || @@ -274,10 +274,11 @@ protected: auto& maxwell3d = system.GPU().Maxwell3D(); const u32 index = surface->GetRenderTarget(); if (index == DEPTH_RT) { - maxwell3d.dirty_flags.zeta_buffer = true; + maxwell3d.dirty.depth_buffer = true; } else { - maxwell3d.dirty_flags.color_buffer.set(index, true); + maxwell3d.dirty.render_target[index] = true; } + maxwell3d.dirty.render_settings = true; } void Register(TSurface surface) { diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt index 3dc0e47d0..f051e17b4 100644 --- a/src/yuzu/CMakeLists.txt +++ b/src/yuzu/CMakeLists.txt @@ -1,5 +1,6 @@ set(CMAKE_AUTOMOC ON) set(CMAKE_AUTORCC ON) +set(CMAKE_AUTOUIC ON) set(CMAKE_INCLUDE_CURRENT_DIR ON) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/CMakeModules) @@ -7,6 +8,7 @@ add_executable(yuzu Info.plist about_dialog.cpp about_dialog.h + aboutdialog.ui applets/error.cpp applets/error.h applets/profile_select.cpp @@ -17,42 +19,59 @@ add_executable(yuzu applets/web_browser.h bootmanager.cpp bootmanager.h + compatdb.ui compatibility_list.cpp compatibility_list.h configuration/config.cpp configuration/config.h + configuration/configure.ui configuration/configure_audio.cpp configuration/configure_audio.h + configuration/configure_audio.ui configuration/configure_debug.cpp configuration/configure_debug.h + configuration/configure_debug.ui configuration/configure_dialog.cpp configuration/configure_dialog.h configuration/configure_gamelist.cpp configuration/configure_gamelist.h + 
configuration/configure_gamelist.ui configuration/configure_general.cpp configuration/configure_general.h + configuration/configure_general.ui configuration/configure_graphics.cpp configuration/configure_graphics.h + configuration/configure_graphics.ui configuration/configure_hotkeys.cpp configuration/configure_hotkeys.h + configuration/configure_hotkeys.ui configuration/configure_input.cpp configuration/configure_input.h + configuration/configure_input.ui configuration/configure_input_player.cpp configuration/configure_input_player.h + configuration/configure_input_player.ui configuration/configure_input_simple.cpp configuration/configure_input_simple.h + configuration/configure_input_simple.ui configuration/configure_mouse_advanced.cpp configuration/configure_mouse_advanced.h + configuration/configure_mouse_advanced.ui + configuration/configure_per_general.cpp + configuration/configure_per_general.h + configuration/configure_per_general.ui configuration/configure_profile_manager.cpp configuration/configure_profile_manager.h + configuration/configure_profile_manager.ui configuration/configure_system.cpp configuration/configure_system.h - configuration/configure_per_general.cpp - configuration/configure_per_general.h + configuration/configure_system.ui configuration/configure_touchscreen_advanced.cpp configuration/configure_touchscreen_advanced.h + configuration/configure_touchscreen_advanced.ui configuration/configure_web.cpp configuration/configure_web.h + configuration/configure_web.ui debugger/graphics/graphics_breakpoint_observer.cpp debugger/graphics/graphics_breakpoint_observer.h debugger/graphics/graphics_breakpoints.cpp @@ -72,12 +91,14 @@ add_executable(yuzu game_list_worker.h loading_screen.cpp loading_screen.h + loading_screen.ui hotkeys.cpp hotkeys.h main.cpp main.h - ui_settings.cpp - ui_settings.h + main.ui + uisettings.cpp + uisettings.h util/limitable_input_dialog.cpp util/limitable_input_dialog.h util/sequence_dialog/sequence_dialog.cpp @@ -89,44 +110,18 @@ add_executable(yuzu yuzu.rc ) -set(UIS - aboutdialog.ui - configuration/configure.ui - configuration/configure_audio.ui - configuration/configure_debug.ui - configuration/configure_gamelist.ui - configuration/configure_general.ui - configuration/configure_graphics.ui - configuration/configure_hotkeys.ui - configuration/configure_input.ui - configuration/configure_input_player.ui - configuration/configure_input_simple.ui - configuration/configure_mouse_advanced.ui - configuration/configure_per_general.ui - configuration/configure_profile_manager.ui - configuration/configure_system.ui - configuration/configure_touchscreen_advanced.ui - configuration/configure_web.ui - compatdb.ui - loading_screen.ui - main.ui -) - file(GLOB COMPAT_LIST ${PROJECT_BINARY_DIR}/dist/compatibility_list/compatibility_list.qrc ${PROJECT_BINARY_DIR}/dist/compatibility_list/compatibility_list.json) file(GLOB_RECURSE ICONS ${PROJECT_SOURCE_DIR}/dist/icons/*) file(GLOB_RECURSE THEMES ${PROJECT_SOURCE_DIR}/dist/qt_themes/*) -qt5_wrap_ui(UI_HDRS ${UIS}) target_sources(yuzu PRIVATE ${COMPAT_LIST} ${ICONS} ${THEMES} - ${UI_HDRS} - ${UIS} ) if (APPLE) diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index b7f3fdf75..5d0fb3f9f 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -11,7 +11,7 @@ #include "core/hle/service/hid/controllers/npad.h" #include "input_common/main.h" #include "yuzu/configuration/config.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" 
Config::Config() { // TODO: Don't hardcode the path; let the frontend decide where to put the config files. diff --git a/src/yuzu/configuration/configure_debug.cpp b/src/yuzu/configuration/configure_debug.cpp index 9a13bb797..5b7e03056 100644 --- a/src/yuzu/configuration/configure_debug.cpp +++ b/src/yuzu/configuration/configure_debug.cpp @@ -12,13 +12,13 @@ #include "ui_configure_debug.h" #include "yuzu/configuration/configure_debug.h" #include "yuzu/debugger/console.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" ConfigureDebug::ConfigureDebug(QWidget* parent) : QWidget(parent), ui(new Ui::ConfigureDebug) { ui->setupUi(this); SetConfiguration(); - connect(ui->open_log_button, &QPushButton::pressed, []() { + connect(ui->open_log_button, &QPushButton::clicked, []() { QString path = QString::fromStdString(FileUtil::GetUserPath(FileUtil::UserPath::LogDir)); QDesktopServices::openUrl(QUrl::fromLocalFile(path)); }); diff --git a/src/yuzu/configuration/configure_gamelist.cpp b/src/yuzu/configuration/configure_gamelist.cpp index d1724ba89..daedbc33e 100644 --- a/src/yuzu/configuration/configure_gamelist.cpp +++ b/src/yuzu/configuration/configure_gamelist.cpp @@ -9,7 +9,7 @@ #include "core/settings.h" #include "ui_configure_gamelist.h" #include "yuzu/configuration/configure_gamelist.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" namespace { constexpr std::array default_icon_sizes{ diff --git a/src/yuzu/configuration/configure_general.cpp b/src/yuzu/configuration/configure_general.cpp index 7a6e921cd..75fcbfea3 100644 --- a/src/yuzu/configuration/configure_general.cpp +++ b/src/yuzu/configuration/configure_general.cpp @@ -6,7 +6,7 @@ #include "core/settings.h" #include "ui_configure_general.h" #include "yuzu/configuration/configure_general.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" ConfigureGeneral::ConfigureGeneral(QWidget* parent) : QWidget(parent), ui(new Ui::ConfigureGeneral) { diff --git a/src/yuzu/configuration/configure_input.cpp b/src/yuzu/configuration/configure_input.cpp index 4dd775aab..7613197f2 100644 --- a/src/yuzu/configuration/configure_input.cpp +++ b/src/yuzu/configuration/configure_input.cpp @@ -79,7 +79,7 @@ ConfigureInput::ConfigureInput(QWidget* parent) LoadConfiguration(); UpdateUIEnabled(); - connect(ui->restore_defaults_button, &QPushButton::pressed, this, + connect(ui->restore_defaults_button, &QPushButton::clicked, this, &ConfigureInput::RestoreDefaults); for (auto* enabled : players_controller) { @@ -96,20 +96,20 @@ ConfigureInput::ConfigureInput(QWidget* parent) &ConfigureInput::UpdateUIEnabled); for (std::size_t i = 0; i < players_configure.size(); ++i) { - connect(players_configure[i], &QPushButton::pressed, this, + connect(players_configure[i], &QPushButton::clicked, this, [this, i] { CallConfigureDialog<ConfigureInputPlayer>(*this, i, false); }); } - connect(ui->handheld_configure, &QPushButton::pressed, this, + connect(ui->handheld_configure, &QPushButton::clicked, this, [this] { CallConfigureDialog<ConfigureInputPlayer>(*this, 8, false); }); - connect(ui->debug_configure, &QPushButton::pressed, this, + connect(ui->debug_configure, &QPushButton::clicked, this, [this] { CallConfigureDialog<ConfigureInputPlayer>(*this, 9, true); }); - connect(ui->mouse_advanced, &QPushButton::pressed, this, + connect(ui->mouse_advanced, &QPushButton::clicked, this, [this] { CallConfigureDialog<ConfigureMouseAdvanced>(*this); }); - connect(ui->touchscreen_advanced, &QPushButton::pressed, this, + connect(ui->touchscreen_advanced, 
&QPushButton::clicked, this, [this] { CallConfigureDialog<ConfigureTouchscreenAdvanced>(*this); }); } diff --git a/src/yuzu/configuration/configure_input_player.cpp b/src/yuzu/configuration/configure_input_player.cpp index 916baccc1..7b70f307c 100644 --- a/src/yuzu/configuration/configure_input_player.cpp +++ b/src/yuzu/configuration/configure_input_player.cpp @@ -244,7 +244,7 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i } button->setContextMenuPolicy(Qt::CustomContextMenu); - connect(button, &QPushButton::released, [=] { + connect(button, &QPushButton::clicked, [=] { HandleClick( button_map[button_id], [=](const Common::ParamPackage& params) { buttons_param[button_id] = params; }, @@ -273,7 +273,7 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i } analog_button->setContextMenuPolicy(Qt::CustomContextMenu); - connect(analog_button, &QPushButton::released, [=]() { + connect(analog_button, &QPushButton::clicked, [=]() { HandleClick(analog_map_buttons[analog_id][sub_button_id], [=](const Common::ParamPackage& params) { SetAnalogButton(params, analogs_param[analog_id], @@ -300,7 +300,7 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i menu_location)); }); } - connect(analog_map_stick[analog_id], &QPushButton::released, [=] { + connect(analog_map_stick[analog_id], &QPushButton::clicked, [=] { QMessageBox::information(this, tr("Information"), tr("After pressing OK, first move your joystick horizontally, " "and then vertically.")); @@ -311,8 +311,8 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i }); } - connect(ui->buttonClearAll, &QPushButton::released, [this] { ClearAll(); }); - connect(ui->buttonRestoreDefaults, &QPushButton::released, [this] { RestoreDefaults(); }); + connect(ui->buttonClearAll, &QPushButton::clicked, [this] { ClearAll(); }); + connect(ui->buttonRestoreDefaults, &QPushButton::clicked, [this] { RestoreDefaults(); }); timeout_timer->setSingleShot(true); connect(timeout_timer.get(), &QTimer::timeout, [this] { SetPollingResult({}, true); }); diff --git a/src/yuzu/configuration/configure_input_simple.cpp b/src/yuzu/configuration/configure_input_simple.cpp index 864803ea3..ab3a11d30 100644 --- a/src/yuzu/configuration/configure_input_simple.cpp +++ b/src/yuzu/configuration/configure_input_simple.cpp @@ -9,7 +9,7 @@ #include "yuzu/configuration/configure_input.h" #include "yuzu/configuration/configure_input_player.h" #include "yuzu/configuration/configure_input_simple.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" namespace { @@ -101,7 +101,7 @@ ConfigureInputSimple::ConfigureInputSimple(QWidget* parent) connect(ui->profile_combobox, QOverload<int>::of(&QComboBox::currentIndexChanged), this, &ConfigureInputSimple::OnSelectProfile); - connect(ui->profile_configure, &QPushButton::pressed, this, &ConfigureInputSimple::OnConfigure); + connect(ui->profile_configure, &QPushButton::clicked, this, &ConfigureInputSimple::OnConfigure); LoadConfiguration(); } diff --git a/src/yuzu/configuration/configure_mouse_advanced.cpp b/src/yuzu/configuration/configure_mouse_advanced.cpp index b7305e653..0a4abe34f 100644 --- a/src/yuzu/configuration/configure_mouse_advanced.cpp +++ b/src/yuzu/configuration/configure_mouse_advanced.cpp @@ -83,7 +83,7 @@ ConfigureMouseAdvanced::ConfigureMouseAdvanced(QWidget* parent) } button->setContextMenuPolicy(Qt::CustomContextMenu); - connect(button, &QPushButton::released, [=] { + connect(button, 
&QPushButton::clicked, [=] { HandleClick( button_map[button_id], [=](const Common::ParamPackage& params) { buttons_param[button_id] = params; }, @@ -104,8 +104,8 @@ ConfigureMouseAdvanced::ConfigureMouseAdvanced(QWidget* parent) }); } - connect(ui->buttonClearAll, &QPushButton::released, [this] { ClearAll(); }); - connect(ui->buttonRestoreDefaults, &QPushButton::released, [this] { RestoreDefaults(); }); + connect(ui->buttonClearAll, &QPushButton::clicked, [this] { ClearAll(); }); + connect(ui->buttonRestoreDefaults, &QPushButton::clicked, [this] { RestoreDefaults(); }); timeout_timer->setSingleShot(true); connect(timeout_timer.get(), &QTimer::timeout, [this] { SetPollingResult({}, true); }); diff --git a/src/yuzu/configuration/configure_per_general.cpp b/src/yuzu/configuration/configure_per_general.cpp index 90336e235..d7f259f12 100644 --- a/src/yuzu/configuration/configure_per_general.cpp +++ b/src/yuzu/configuration/configure_per_general.cpp @@ -23,7 +23,7 @@ #include "yuzu/configuration/config.h" #include "yuzu/configuration/configure_input.h" #include "yuzu/configuration/configure_per_general.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" #include "yuzu/util/util.h" ConfigurePerGameGeneral::ConfigurePerGameGeneral(QWidget* parent, u64 title_id) diff --git a/src/yuzu/configuration/configure_profile_manager.cpp b/src/yuzu/configuration/configure_profile_manager.cpp index c90f4cdd8..f53423440 100644 --- a/src/yuzu/configuration/configure_profile_manager.cpp +++ b/src/yuzu/configuration/configure_profile_manager.cpp @@ -108,10 +108,10 @@ ConfigureProfileManager ::ConfigureProfileManager(QWidget* parent) connect(tree_view, &QTreeView::clicked, this, &ConfigureProfileManager::SelectUser); - connect(ui->pm_add, &QPushButton::pressed, this, &ConfigureProfileManager::AddUser); - connect(ui->pm_rename, &QPushButton::pressed, this, &ConfigureProfileManager::RenameUser); - connect(ui->pm_remove, &QPushButton::pressed, this, &ConfigureProfileManager::DeleteUser); - connect(ui->pm_set_image, &QPushButton::pressed, this, &ConfigureProfileManager::SetUserImage); + connect(ui->pm_add, &QPushButton::clicked, this, &ConfigureProfileManager::AddUser); + connect(ui->pm_rename, &QPushButton::clicked, this, &ConfigureProfileManager::RenameUser); + connect(ui->pm_remove, &QPushButton::clicked, this, &ConfigureProfileManager::DeleteUser); + connect(ui->pm_set_image, &QPushButton::clicked, this, &ConfigureProfileManager::SetUserImage); scene = new QGraphicsScene; ui->current_user_icon->setScene(scene); diff --git a/src/yuzu/configuration/configure_touchscreen_advanced.cpp b/src/yuzu/configuration/configure_touchscreen_advanced.cpp index 8ced28c75..7d7cc00b7 100644 --- a/src/yuzu/configuration/configure_touchscreen_advanced.cpp +++ b/src/yuzu/configuration/configure_touchscreen_advanced.cpp @@ -11,7 +11,7 @@ ConfigureTouchscreenAdvanced::ConfigureTouchscreenAdvanced(QWidget* parent) : QDialog(parent), ui(std::make_unique<Ui::ConfigureTouchscreenAdvanced>()) { ui->setupUi(this); - connect(ui->restore_defaults_button, &QPushButton::pressed, this, + connect(ui->restore_defaults_button, &QPushButton::clicked, this, &ConfigureTouchscreenAdvanced::RestoreDefaults); LoadConfiguration(); diff --git a/src/yuzu/configuration/configure_web.cpp b/src/yuzu/configuration/configure_web.cpp index 5a70ef168..336b062b3 100644 --- a/src/yuzu/configuration/configure_web.cpp +++ b/src/yuzu/configuration/configure_web.cpp @@ -9,7 +9,7 @@ #include "core/telemetry_session.h" #include "ui_configure_web.h" #include 
"yuzu/configuration/configure_web.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" ConfigureWeb::ConfigureWeb(QWidget* parent) : QWidget(parent), ui(std::make_unique<Ui::ConfigureWeb>()) { diff --git a/src/yuzu/debugger/console.cpp b/src/yuzu/debugger/console.cpp index 320898f6a..207ff4d58 100644 --- a/src/yuzu/debugger/console.cpp +++ b/src/yuzu/debugger/console.cpp @@ -10,7 +10,7 @@ #include "common/logging/backend.h" #include "yuzu/debugger/console.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" namespace Debugger { void ToggleConsole() { diff --git a/src/yuzu/discord_impl.cpp b/src/yuzu/discord_impl.cpp index 9d87a41eb..ea0079353 100644 --- a/src/yuzu/discord_impl.cpp +++ b/src/yuzu/discord_impl.cpp @@ -9,7 +9,7 @@ #include "core/core.h" #include "core/loader/loader.h" #include "yuzu/discord_impl.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" namespace DiscordRPC { diff --git a/src/yuzu/game_list.cpp b/src/yuzu/game_list.cpp index 1885587af..d18b96519 100644 --- a/src/yuzu/game_list.cpp +++ b/src/yuzu/game_list.cpp @@ -23,7 +23,7 @@ #include "yuzu/game_list_p.h" #include "yuzu/game_list_worker.h" #include "yuzu/main.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" GameListSearchField::KeyReleaseEater::KeyReleaseEater(GameList* gamelist) : gamelist{gamelist} {} diff --git a/src/yuzu/game_list_p.h b/src/yuzu/game_list_p.h index 0b458ef48..ece534dd6 100644 --- a/src/yuzu/game_list_p.h +++ b/src/yuzu/game_list_p.h @@ -19,7 +19,7 @@ #include "common/common_types.h" #include "common/logging/log.h" #include "common/string_util.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" #include "yuzu/util/util.h" /** diff --git a/src/yuzu/game_list_worker.cpp b/src/yuzu/game_list_worker.cpp index 4f30e9147..77f358630 100644 --- a/src/yuzu/game_list_worker.cpp +++ b/src/yuzu/game_list_worker.cpp @@ -29,7 +29,7 @@ #include "yuzu/game_list.h" #include "yuzu/game_list_p.h" #include "yuzu/game_list_worker.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" namespace { diff --git a/src/yuzu/hotkeys.cpp b/src/yuzu/hotkeys.cpp index 4582e7f21..d4e97fa16 100644 --- a/src/yuzu/hotkeys.cpp +++ b/src/yuzu/hotkeys.cpp @@ -7,7 +7,7 @@ #include <QTreeWidgetItem> #include <QtGlobal> #include "yuzu/hotkeys.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" HotkeyRegistry::HotkeyRegistry() = default; HotkeyRegistry::~HotkeyRegistry() = default; diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index ae21f4753..a7c656fdb 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -100,7 +100,7 @@ static FileSys::VirtualFile VfsDirectoryCreateFileWrapper(const FileSys::Virtual #include "yuzu/hotkeys.h" #include "yuzu/loading_screen.h" #include "yuzu/main.h" -#include "yuzu/ui_settings.h" +#include "yuzu/uisettings.h" #ifdef USE_DISCORD_PRESENCE #include "yuzu/discord_impl.h" @@ -1843,13 +1843,14 @@ void GMainWindow::OnCoreError(Core::System::ResultStatus result, std::string det "data, or other bugs."); switch (result) { case Core::System::ResultStatus::ErrorSystemFiles: { - QString message = tr("yuzu was unable to locate a Switch system archive"); - if (!details.empty()) { - message.append(tr(": %1. ").arg(QString::fromStdString(details))); + QString message; + if (details.empty()) { + message = + tr("yuzu was unable to locate a Switch system archive. %1").arg(common_message); } else { - message.append(tr(". ")); + message = tr("yuzu was unable to locate a Switch system archive: %1. 
%2") + .arg(QString::fromStdString(details), common_message); } - message.append(common_message); answer = QMessageBox::question(this, tr("System Archive Not Found"), message, QMessageBox::Yes | QMessageBox::No, QMessageBox::No); @@ -1858,8 +1859,8 @@ void GMainWindow::OnCoreError(Core::System::ResultStatus result, std::string det } case Core::System::ResultStatus::ErrorSharedFont: { - QString message = tr("yuzu was unable to locate the Switch shared fonts. "); - message.append(common_message); + const QString message = + tr("yuzu was unable to locate the Switch shared fonts. %1").arg(common_message); answer = QMessageBox::question(this, tr("Shared Fonts Not Found"), message, QMessageBox::Yes | QMessageBox::No, QMessageBox::No); status_message = tr("Shared Font Missing"); diff --git a/src/yuzu/ui_settings.cpp b/src/yuzu/uisettings.cpp index 4bdc302e0..7f7d247a3 100644 --- a/src/yuzu/ui_settings.cpp +++ b/src/yuzu/uisettings.cpp @@ -2,7 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include "ui_settings.h" +#include "yuzu/uisettings.h" namespace UISettings { diff --git a/src/yuzu/ui_settings.h b/src/yuzu/uisettings.h index a62cd6911..a62cd6911 100644 --- a/src/yuzu/ui_settings.h +++ b/src/yuzu/uisettings.h |