76 files changed, 1395 insertions, 771 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3639b623c..59c610732 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
-# CMake 3.6 required for FindBoost to define IMPORTED libs properly on unknown Boost versions
-cmake_minimum_required(VERSION 3.6)
+# CMake 3.7 required for the GREATER_EQUAL operator of if(), used in the MSVC_VERSION checks
+cmake_minimum_required(VERSION 3.7)
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules")
 include(DownloadExternals)
@@ -66,10 +66,12 @@ if (NOT ENABLE_GENERIC)
         detect_architecture("_M_AMD64" x86_64)
         detect_architecture("_M_IX86" x86)
         detect_architecture("_M_ARM" ARM)
+        detect_architecture("_M_ARM64" ARM64)
     else()
         detect_architecture("__x86_64__" x86_64)
         detect_architecture("__i386__" x86)
         detect_architecture("__arm__" ARM)
+        detect_architecture("__aarch64__" ARM64)
     endif()
 endif()
 
@@ -187,8 +189,8 @@ find_package(Threads REQUIRED)
 if (ENABLE_SDL2)
     if (YUZU_USE_BUNDLED_SDL2)
         # Detect toolchain and platform
-        if (MSVC14 AND ARCHITECTURE_x86_64)
-            set(SDL2_VER "SDL2-2.0.5")
+        if ((MSVC_VERSION GREATER_EQUAL 1910 AND MSVC_VERSION LESS 1920) AND ARCHITECTURE_x86_64)
+            set(SDL2_VER "SDL2-2.0.8")
         else()
             message(FATAL_ERROR "No bundled SDL2 binaries for your toolchain. Disable YUZU_USE_BUNDLED_SDL2 and provide your own.")
         endif()
@@ -220,7 +222,7 @@ if (YUZU_USE_BUNDLED_UNICORN)
     if (MSVC)
         message(STATUS "unicorn not found, falling back to bundled")
         # Detect toolchain and platform
-        if (MSVC14 AND ARCHITECTURE_x86_64)
+        if ((MSVC_VERSION GREATER_EQUAL 1910 AND MSVC_VERSION LESS 1920) AND ARCHITECTURE_x86_64)
             set(UNICORN_VER "unicorn-yuzu")
         else()
             message(FATAL_ERROR "No bundled Unicorn binaries for your toolchain. Disable YUZU_USE_BUNDLED_UNICORN and provide your own.")
@@ -279,7 +281,7 @@ endif()
 
 if (ENABLE_QT)
     if (YUZU_USE_BUNDLED_QT)
-        if (MSVC14 AND ARCHITECTURE_x86_64)
+        if ((MSVC_VERSION GREATER_EQUAL 1910 AND MSVC_VERSION LESS 1920) AND ARCHITECTURE_x86_64)
             set(QT_VER qt-5.10.0-msvc2015_64)
         else()
             message(FATAL_ERROR "No bundled Qt binaries for your toolchain. Disable YUZU_USE_BUNDLED_QT and provide your own.")
@@ -303,7 +305,7 @@ endif()
 # ======================================
 
 IF (APPLE)
-    FIND_LIBRARY(COCOA_LIBRARY Cocoa)           # Umbrella framework for everything GUI-related
+    find_library(COCOA_LIBRARY Cocoa)           # Umbrella framework for everything GUI-related
     set(PLATFORM_LIBRARIES ${COCOA_LIBRARY} ${IOKIT_LIBRARY} ${COREVIDEO_LIBRARY})
 
     if (CMAKE_CXX_COMPILER_ID STREQUAL Clang)
diff --git a/src/audio_core/CMakeLists.txt b/src/audio_core/CMakeLists.txt
index ec71524a3..82e4850f7 100644
--- a/src/audio_core/CMakeLists.txt
+++ b/src/audio_core/CMakeLists.txt
@@ -1,4 +1,8 @@
 add_library(audio_core STATIC
+    algorithm/filter.cpp
+    algorithm/filter.h
+    algorithm/interpolate.cpp
+    algorithm/interpolate.h
     audio_out.cpp
     audio_out.h
     audio_renderer.cpp
@@ -7,12 +11,12 @@ add_library(audio_core STATIC
     codec.cpp
     codec.h
     null_sink.h
-    stream.cpp
-    stream.h
     sink.h
     sink_details.cpp
     sink_details.h
     sink_stream.h
+    stream.cpp
+    stream.h
 
     $<$<BOOL:${ENABLE_CUBEB}>:cubeb_sink.cpp cubeb_sink.h>
 )
diff --git a/src/audio_core/algorithm/filter.cpp b/src/audio_core/algorithm/filter.cpp
new file mode 100644
index 000000000..403b8503f
--- /dev/null
+++ b/src/audio_core/algorithm/filter.cpp
@@ -0,0 +1,79 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#define _USE_MATH_DEFINES
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <vector>
+#include "audio_core/algorithm/filter.h"
+#include "common/common_types.h"
+
+namespace AudioCore {
+
+/// Computes the coefficients of a second-order low-pass section.
+/// `cutoff` is scaled by 2*pi into an angular frequency; `Q` sets alpha = sin(w0) / (2 * Q).
+/// The returned filter stores them normalized by a0 (see the constructor).
+Filter Filter::LowPass(double cutoff, double Q) {
+    // Angular frequency of the cutoff and the term derived from Q.
+    const double omega = 2.0 * M_PI * cutoff;
+    const double cos_omega = std::cos(omega);
+    const double alpha = std::sin(omega) / (2 * Q);
+    const double one_minus_cos = 1 - cos_omega;
+
+    // Feedback terms (a0, a1, a2) come first, then the feed-forward terms (b0, b1, b2);
+    // b0 and b2 are both half of b1.
+    const double half = 0.5 * one_minus_cos;
+    return Filter{1 + alpha, -2.0 * cos_omega, 1 - alpha, half, one_minus_cos, half};
+}
+
+Filter::Filter() : Filter(1.0, 0.0, 0.0, 1.0, 0.0, 0.0) {}
+
+Filter::Filter(double a0, double a1, double a2, double b0, double b1, double b2)
+    : a1(a1 / a0), a2(a2 / a0), b0(b0 / a0), b1(b1 / a0), b2(b2 / a0) {}
+
+/// Filters interleaved samples in place, one frame of `channel_count` samples at a time.
+void Filter::Process(std::vector<s16>& signal) {
+    const size_t num_frames = signal.size() / channel_count;
+    for (size_t i = 0; i < num_frames; i++) {
+        // Shift both histories back by one frame; slot 0 is overwritten below.
+        std::rotate(in.begin(), in.end() - 1, in.end());
+        std::rotate(out.begin(), out.end() - 1, out.end());
+        for (size_t ch = 0; ch < channel_count; ch++) {
+            s16& sample = signal[i * channel_count + ch];
+            in[0][ch] = sample;
+            out[0][ch] = b0 * in[0][ch] + b1 * in[1][ch] + b2 * in[2][ch] - a1 * out[1][ch] -
+                         a2 * out[2][ch];
+            sample = static_cast<s16>(std::clamp(out[0][ch], -32768.0, 32767.0));
+        }
+    }
+}
+
+/// Calculates the appropriate Q for one biquad in a cascading filter: 1 / (2 * cos(pole)).
+/// @param total_count The total number of biquads to be cascaded (divides the pole angle).
+/// @param index 0-index of the biquad to calculate the Q value for.
+static double CascadingBiquadQ(size_t total_count, size_t index) {
+    const double pole = M_PI * (2 * index + 1) / (4.0 * total_count);
+    return 1.0 / (2.0 * std::cos(pole));
+}
+
+CascadingFilter CascadingFilter::LowPass(double cutoff, size_t cascade_size) {
+    std::vector<Filter> stages; // one low-pass biquad per stage, each with its own Q
+    stages.reserve(cascade_size);
+    for (size_t stage = 0; stage < cascade_size; ++stage)
+        stages.push_back(Filter::LowPass(cutoff, CascadingBiquadQ(cascade_size, stage)));
+    return CascadingFilter{std::move(stages)};
+}
+
+CascadingFilter::CascadingFilter() = default;
+CascadingFilter::CascadingFilter(std::vector<Filter> filters) : filters(std::move(filters)) {}
+/// Runs the signal through every filter in order; each stage modifies it in place.
+void CascadingFilter::Process(std::vector<s16>& signal) {
+    for (auto& filter : filters) {
+        filter.Process(signal);
+    }
+}
+
+} // namespace AudioCore
diff --git a/src/audio_core/algorithm/filter.h b/src/audio_core/algorithm/filter.h
new file mode 100644
index 000000000..a41beef98
--- /dev/null
+++ b/src/audio_core/algorithm/filter.h
@@ -0,0 +1,62 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <vector>
+#include "common/common_types.h"
+
+namespace AudioCore {
+
+/// Digital biquad filter:
+///
+///          b0 + b1 z^-1 + b2 z^-2
+///  H(z) = ------------------------
+///          a0 + a1 z^-1 + a2 z^-2
+class Filter {
+public:
+    /// Creates a low-pass filter.
+    /// @param cutoff Determines the cutoff frequency. A value from 0.0 to 1.0.
+    /// @param Q Determines the quality factor of this filter.
+    static Filter LowPass(double cutoff, double Q = 0.7071);
+
+    /// Passthrough filter.
+    Filter();
+    /// Builds a filter from raw coefficients; they are stored divided by a0.
+    Filter(double a0, double a1, double a2, double b0, double b1, double b2);
+
+    /// Filters interleaved stereo samples in place, updating the history below.
+    void Process(std::vector<s16>& signal);
+
+private:
+    static constexpr size_t channel_count = 2;
+
+    /// Coefficients are in normalized form (a0 = 1.0).
+    double a1, a2, b0, b1, b2;
+    /// Input History (zero-initialized so the first Process call reads defined values)
+    std::array<std::array<double, channel_count>, 3> in{};
+    /// Output History (zero-initialized so the first Process call reads defined values)
+    std::array<std::array<double, channel_count>, 3> out{};
+};
+
+/// Cascade filters to build up higher-order filters from lower-order ones.
+class CascadingFilter {
+public:
+    /// Creates a cascading low-pass filter.
+    /// @param cutoff Determines the cutoff frequency. A value from 0.0 to 1.0.
+    /// @param cascade_size Number of biquads in cascade.
+    static CascadingFilter LowPass(double cutoff, size_t cascade_size);
+
+    /// Passthrough: holds no filters, so Process leaves the signal unchanged.
+    CascadingFilter();
+    /// Takes ownership of the given filters; Process applies them in this order.
+    explicit CascadingFilter(std::vector<Filter> filters);
+    /// Applies every filter in sequence to the signal, in place.
+    void Process(std::vector<s16>& signal);
+
+private:
+    std::vector<Filter> filters;
+};
+
+} // namespace AudioCore
diff --git a/src/audio_core/algorithm/interpolate.cpp b/src/audio_core/algorithm/interpolate.cpp
new file mode 100644
index 000000000..11459821f
--- /dev/null
+++ b/src/audio_core/algorithm/interpolate.cpp
@@ -0,0 +1,71 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#define _USE_MATH_DEFINES
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+#include "audio_core/algorithm/interpolate.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+
+namespace AudioCore {
+
+/// The Lanczos kernel with parameter a, evaluated at x. Returns exactly 1 at x == 0.
+static double Lanczos(size_t a, double x) {
+    if (x == 0.0)
+        return 1.0;
+    const double px = M_PI * x;
+    return a * std::sin(px) * std::sin(px / a) / (px * px);
+}
+
+/// Resamples interleaved stereo input; see interpolate.h for the meaning of ratio.
+std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input, double ratio) {
+    if (input.size() < 2)
+        return {};
+    // Also reject NaN/infinity, which would otherwise persist in state.position.
+    if (!std::isfinite(ratio) || ratio <= 0) {
+        LOG_CRITICAL(Audio, "Nonsensical interpolation ratio {}", ratio);
+        ratio = 1.0;
+    }
+    // Rebuild the low-pass filter only when the ratio changes, so its history is kept.
+    if (ratio != state.current_ratio) {
+        const double cutoff_frequency = std::min(0.5 / ratio, 0.5 * ratio);
+        state.nyquist = CascadingFilter::LowPass(std::clamp(cutoff_frequency, 0.0, 0.4), 3);
+        state.current_ratio = ratio;
+    }
+    state.nyquist.Process(input);
+
+    constexpr size_t taps = InterpolationState::lanczos_taps;
+    const size_t num_frames = input.size() / 2;
+    std::vector<s16> output;
+    output.reserve(static_cast<size_t>(input.size() / ratio + 4));
+
+    double& pos = state.position;
+    auto& h = state.history;
+    for (size_t i = 0; i < num_frames; ++i) {
+        std::rotate(h.begin(), h.end() - 1, h.end());
+        h[0][0] = input[i * 2 + 0];
+        h[0][1] = input[i * 2 + 1];
+        // Emit output frames until the position moves past the current input frame.
+        while (pos <= 1.0) {
+            double l = 0.0;
+            double r = 0.0;
+            for (size_t j = 0; j < h.size(); j++) {
+                // The weight is shared by both channels, so evaluate the kernel once.
+                const double weight = Lanczos(taps, pos + j - taps + 1);
+                l += weight * h[j][0];
+                r += weight * h[j][1];
+            }
+            output.emplace_back(static_cast<s16>(std::clamp(l, -32768.0, 32767.0)));
+            output.emplace_back(static_cast<s16>(std::clamp(r, -32768.0, 32767.0)));
+            pos += ratio;
+        }
+        pos -= 1.0;
+    }
+    return output;
+}
+
+} // namespace AudioCore
diff --git a/src/audio_core/algorithm/interpolate.h b/src/audio_core/algorithm/interpolate.h
new file mode 100644
index 000000000..c79c2eef4
--- /dev/null
+++ b/src/audio_core/algorithm/interpolate.h
@@ -0,0 +1,43 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <vector>
+#include "audio_core/algorithm/filter.h"
+#include "common/common_types.h"
+
+namespace AudioCore {
+
+struct InterpolationState {
+    static constexpr size_t lanczos_taps = 4;
+    static constexpr size_t history_size = lanczos_taps * 2 - 1;
+
+    double current_ratio = 0.0; ///< Ratio the `nyquist` filter was last built for.
+    CascadingFilter nyquist;    ///< Low-pass filter applied to the input before resampling.
+    std::array<std::array<s16, 2>, history_size> history = {}; ///< Past frames, newest first.
+    double position = 0;        ///< Fractional read position carried over between calls.
+};
+
+/// Interpolates input signal to produce output signal.
+/// @param state Filter, history and position carried between calls; updated by the call.
+/// @param input The signal to interpolate.
+/// @param ratio Interpolation ratio. ratio > 1.0 results in fewer output samples,
+///              ratio < 1.0 results in more output samples.
+/// @returns Output signal.
+std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input, double ratio);
+
+/// Interpolates input signal to produce output signal.
+/// @param state Filter, history and position carried between calls; updated by the call.
+/// @param input The signal to interpolate.
+/// @param input_rate The sample rate of input.
+/// @param output_rate The desired sample rate of the output.
+/// @returns Output signal.
+inline std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input,
+                                    u32 input_rate, u32 output_rate) {
+    return Interpolate(state, std::move(input), static_cast<double>(input_rate) / output_rate);
+}
+
+} // namespace AudioCore
diff --git a/src/audio_core/audio_renderer.cpp b/src/audio_core/audio_renderer.cpp
index 282f345c5..397b107f5 100644
--- a/src/audio_core/audio_renderer.cpp
+++ b/src/audio_core/audio_renderer.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "audio_core/algorithm/interpolate.h"
 #include "audio_core/audio_renderer.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
@@ -26,6 +27,18 @@ AudioRenderer::AudioRenderer(AudioRendererParameter params,
     QueueMixedBuffer(2);
 }
 
+u32 AudioRenderer::GetSampleRate() const {
+    return worker_params.sample_rate;
+}
+
+u32 AudioRenderer::GetSampleCount() const {
+    return worker_params.sample_count;
+}
+
+u32 AudioRenderer::GetMixBufferCount() const {
+    return worker_params.mix_buffer_count;
+}
+
 std::vector<u8> AudioRenderer::UpdateAudioRenderer(const std::vector<u8>& input_params) {
     // Copy UpdateDataHeader struct
     UpdateDataHeader config{};
@@ -187,6 +200,8 @@ void AudioRenderer::VoiceState::RefreshBuffer() {
         break;
     }
 
+    samples = Interpolate(interp_state, std::move(samples), Info().sample_rate, STREAM_SAMPLE_RATE);
+
     is_refresh_pending = false;
 }
 
@@ -212,7 +227,7 @@ void AudioRenderer::QueueMixedBuffer(Buffer::Tag tag) {
                 break;
             }
 
-            samples_remaining -= samples.size();
+            samples_remaining -= samples.size() / stream->GetNumChannels();
 
             for (const auto& sample : samples) {
                 const s32 buffer_sample{buffer[offset]};
diff --git a/src/audio_core/audio_renderer.h b/src/audio_core/audio_renderer.h
index 6950a4681..eba67f28e 100644
--- a/src/audio_core/audio_renderer.h
+++ b/src/audio_core/audio_renderer.h
@@ -8,6 +8,7 @@
 #include <memory>
 #include <vector>
 
+#include "audio_core/algorithm/interpolate.h"
 #include "audio_core/audio_out.h"
 #include "audio_core/codec.h"
 #include "audio_core/stream.h"
@@ -26,7 +27,7 @@ enum class PlayState : u8 {
 struct AudioRendererParameter {
     u32_le sample_rate;
     u32_le sample_count;
-    u32_le unknown_8;
+    u32_le mix_buffer_count;
     u32_le unknown_c;
     u32_le voice_count;
     u32_le sink_count;
@@ -160,6 +161,9 @@ public:
     std::vector<u8> UpdateAudioRenderer(const std::vector<u8>& input_params);
     void QueueMixedBuffer(Buffer::Tag tag);
     void ReleaseAndQueueBuffers();
+    u32 GetSampleRate() const;
+    u32 GetSampleCount() const;
+    u32 GetMixBufferCount() const;
 
 private:
     class VoiceState {
@@ -191,6 +195,7 @@ private:
         size_t wave_index{};
         size_t offset{};
         Codec::ADPCMState adpcm_state{};
+        InterpolationState interp_state{};
         std::vector<s16> samples;
         VoiceOutStatus out_status{};
         VoiceInfo info{};
diff --git a/src/audio_core/cubeb_sink.cpp b/src/audio_core/cubeb_sink.cpp
index 1501ef1f4..5a1177d0c 100644
--- a/src/audio_core/cubeb_sink.cpp
+++ b/src/audio_core/cubeb_sink.cpp
@@ -4,6 +4,7 @@
 
 #include <algorithm>
 #include <cstring>
+#include <mutex>
 
 #include "audio_core/cubeb_sink.h"
 #include "audio_core/stream.h"
@@ -66,6 +67,8 @@ public:
             return;
         }
 
+        std::lock_guard lock{queue_mutex};
+
         queue.reserve(queue.size() + samples.size() * GetNumChannels());
 
         if (is_6_channel) {
@@ -94,6 +97,7 @@ private:
     u32 num_channels{};
     bool is_6_channel{};
 
+    std::mutex queue_mutex;
     std::vector<s16> queue;
 
     static long DataCallback(cubeb_stream* stream, void* user_data, const void* input_buffer,
@@ -153,6 +157,8 @@ long SinkStreamImpl::DataCallback(cubeb_stream* stream, void* user_data, const v
         return {};
     }
 
+    std::lock_guard lock{impl->queue_mutex};
+
     const size_t frames_to_write{
         std::min(impl->queue.size() / impl->GetNumChannels(), static_cast<size_t>(num_frames))};
 
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 2ad456864..d9424ea91 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -29,8 +29,6 @@ add_library(common STATIC
     assert.h
     bit_field.h
     bit_set.h
-    break_points.cpp
-    break_points.h
     cityhash.cpp
     cityhash.h
     color.h
diff --git a/src/common/break_points.cpp b/src/common/break_points.cpp
deleted file mode 100644
index fa367a4ca..000000000
--- a/src/common/break_points.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright 2013 Dolphin Emulator Project / 2014 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <algorithm>
-#include <sstream>
-#include "common/break_points.h"
-
-bool BreakPoints::IsAddressBreakPoint(u32 iAddress) const {
-    auto cond = [&iAddress](const TBreakPoint& bp) { return bp.iAddress == iAddress; };
-    auto it = std::find_if(m_BreakPoints.begin(), m_BreakPoints.end(), cond);
-    return it != m_BreakPoints.end();
-}
-
-bool BreakPoints::IsTempBreakPoint(u32 iAddress) const {
-    auto cond = [&iAddress](const TBreakPoint& bp) {
-        return bp.iAddress == iAddress && bp.bTemporary;
-    };
-    auto it = std::find_if(m_BreakPoints.begin(), m_BreakPoints.end(), cond);
-    return it != m_BreakPoints.end();
-}
-
-BreakPoints::TBreakPointsStr BreakPoints::GetStrings() const {
-    TBreakPointsStr bps;
-    for (auto breakpoint : m_BreakPoints) {
-        if (!breakpoint.bTemporary) {
-            std::stringstream bp;
-            bp << std::hex << breakpoint.iAddress << " " << (breakpoint.bOn ? "n" : "");
-            bps.push_back(bp.str());
-        }
-    }
-
-    return bps;
-}
-
-void BreakPoints::AddFromStrings(const TBreakPointsStr& bps) {
-    for (auto bps_item : bps) {
-        TBreakPoint bp;
-        std::stringstream bpstr;
-        bpstr << std::hex << bps_item;
-        bpstr >> bp.iAddress;
-        bp.bOn = bps_item.find("n") != bps_item.npos;
-        bp.bTemporary = false;
-        Add(bp);
-    }
-}
-
-void BreakPoints::Add(const TBreakPoint& bp) {
-    if (!IsAddressBreakPoint(bp.iAddress)) {
-        m_BreakPoints.push_back(bp);
-        // if (jit)
-        //    jit->GetBlockCache()->InvalidateICache(bp.iAddress, 4);
-    }
-}
-
-void BreakPoints::Add(u32 em_address, bool temp) {
-    if (!IsAddressBreakPoint(em_address)) // only add new addresses
-    {
-        TBreakPoint pt; // breakpoint settings
-        pt.bOn = true;
-        pt.bTemporary = temp;
-        pt.iAddress = em_address;
-
-        m_BreakPoints.push_back(pt);
-
-        // if (jit)
-        //    jit->GetBlockCache()->InvalidateICache(em_address, 4);
-    }
-}
-
-void BreakPoints::Remove(u32 em_address) {
-    auto cond = [&em_address](const TBreakPoint& bp) { return bp.iAddress == em_address; };
-    auto it = std::find_if(m_BreakPoints.begin(), m_BreakPoints.end(), cond);
-    if (it != m_BreakPoints.end())
-        m_BreakPoints.erase(it);
-}
-
-void BreakPoints::Clear() {
-    // if (jit)
-    //{
-    //    std::for_each(m_BreakPoints.begin(), m_BreakPoints.end(),
-    //        [](const TBreakPoint& bp)
-    //        {
-    //            jit->GetBlockCache()->InvalidateICache(bp.iAddress, 4);
-    //        }
-    //    );
-    //}
-
-    m_BreakPoints.clear();
-}
diff --git a/src/common/break_points.h b/src/common/break_points.h
deleted file mode 100644
index e15b9f842..000000000
--- a/src/common/break_points.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2013 Dolphin Emulator Project / 2014 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "common/common_types.h"
-
-class DebugInterface;
-
-struct TBreakPoint {
-    u32 iAddress;
-    bool bOn;
-    bool bTemporary;
-};
-
-// Code breakpoints.
-class BreakPoints {
-public:
-    typedef std::vector<TBreakPoint> TBreakPoints;
-    typedef std::vector<std::string> TBreakPointsStr;
-
-    const TBreakPoints& GetBreakPoints() {
-        return m_BreakPoints;
-    }
-
-    TBreakPointsStr GetStrings() const;
-    void AddFromStrings(const TBreakPointsStr& bps);
-
-    // is address breakpoint
-    bool IsAddressBreakPoint(u32 iAddress) const;
-    bool IsTempBreakPoint(u32 iAddress) const;
-
-    // Add BreakPoint
-    void Add(u32 em_address, bool temp = false);
-    void Add(const TBreakPoint& bp);
-
-    // Remove Breakpoint
-    void Remove(u32 iAddress);
-    void Clear();
-
-    void DeleteByAddress(u32 Address);
-
-private:
-    TBreakPoints m_BreakPoints;
-    u32 m_iBreakOnCount;
-};
diff --git a/src/common/logging/backend.cpp b/src/common/logging/backend.cpp
index e80784c3c..1323f8d0f 100644
--- a/src/common/logging/backend.cpp
+++ b/src/common/logging/backend.cpp
@@ -302,13 +302,14 @@ Backend* GetBackend(std::string_view backend_name) {
 void FmtLogMessageImpl(Class log_class, Level log_level, const char* filename,
                        unsigned int line_num, const char* function, const char* format,
                        const fmt::format_args& args) {
-    auto filter = Impl::Instance().GetGlobalFilter();
+    auto& instance = Impl::Instance();
+    const auto& filter = instance.GetGlobalFilter();
     if (!filter.CheckMessage(log_class, log_level))
         return;
 
     Entry entry =
         CreateEntry(log_class, log_level, filename, line_num, function, fmt::vformat(format, args));
 
-    Impl::Instance().PushEntry(std::move(entry));
+    instance.PushEntry(std::move(entry));
 }
 } // namespace Log
diff --git a/src/common/misc.cpp b/src/common/misc.cpp
index 217a87098..3fa8a3bc4 100644
--- a/src/common/misc.cpp
+++ b/src/common/misc.cpp
@@ -4,7 +4,7 @@
 
 #include <cstddef>
 #ifdef _WIN32
-#include <Windows.h>
+#include <windows.h>
 #else
 #include <cerrno>
 #include <cstring>
diff --git a/src/common/thread_queue_list.h b/src/common/thread_queue_list.h
index 38a450d69..133122c5f 100644
--- a/src/common/thread_queue_list.h
+++ b/src/common/thread_queue_list.h
@@ -16,7 +16,7 @@ struct ThreadQueueList {
     //               (dynamically resizable) circular buffers to remove their overhead when
     //               inserting and popping.
 
-    typedef unsigned int Priority;
+    using Priority = unsigned int;
 
     // Number of priority levels. (Valid levels are [0..NUM_QUEUES).)
     static const Priority NUM_QUEUES = N;
@@ -26,9 +26,9 @@ struct ThreadQueueList {
     }
 
     // Only for debugging, returns priority level.
-    Priority contains(const T& uid) {
+    Priority contains(const T& uid) const {
         for (Priority i = 0; i < NUM_QUEUES; ++i) {
-            Queue& cur = queues[i];
+            const Queue& cur = queues[i];
             if (std::find(cur.data.cbegin(), cur.data.cend(), uid) != cur.data.cend()) {
                 return i;
             }
@@ -37,8 +37,8 @@ struct ThreadQueueList {
         return -1;
     }
 
-    T get_first() {
-        Queue* cur = first;
+    T get_first() const {
+        const Queue* cur = first;
         while (cur != nullptr) {
             if (!cur->data.empty()) {
                 return cur->data.front();
diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h
index fd3fbdd4b..927da9187 100644
--- a/src/common/x64/xbyak_abi.h
+++ b/src/common/x64/xbyak_abi.h
@@ -9,10 +9,9 @@
 #include "common/assert.h"
 #include "common/bit_set.h"
 
-namespace Common {
-namespace X64 {
+namespace Common::X64 {
 
-int RegToIndex(const Xbyak::Reg& reg) {
+inline int RegToIndex(const Xbyak::Reg& reg) {
     using Kind = Xbyak::Reg::Kind;
     ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
                "RegSet only support GPRs and XMM registers.");
@@ -152,8 +151,8 @@ constexpr size_t ABI_SHADOW_SPACE = 0;
 
 #endif
 
-void ABI_CalculateFrameSize(BitSet32 regs, size_t rsp_alignment, size_t needed_frame_size,
-                            s32* out_subtraction, s32* out_xmm_offset) {
+inline void ABI_CalculateFrameSize(BitSet32 regs, size_t rsp_alignment, size_t needed_frame_size,
+                                   s32* out_subtraction, s32* out_xmm_offset) {
     int count = (regs & ABI_ALL_GPRS).Count();
     rsp_alignment -= count * 8;
     size_t subtraction = 0;
@@ -174,8 +173,8 @@ void ABI_CalculateFrameSize(BitSet32 regs, size_t rsp_alignment, size_t needed_f
     *out_xmm_offset = (s32)(subtraction - xmm_base_subtraction);
 }
 
-size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, BitSet32 regs,
-                                       size_t rsp_alignment, size_t needed_frame_size = 0) {
+inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, BitSet32 regs,
+                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
     s32 subtraction, xmm_offset;
     ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
 
@@ -195,8 +194,8 @@ size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, BitSet32 regs
     return ABI_SHADOW_SPACE;
 }
 
-void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, BitSet32 regs, size_t rsp_alignment,
-                                    size_t needed_frame_size = 0) {
+inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, BitSet32 regs,
+                                           size_t rsp_alignment, size_t needed_frame_size = 0) {
     s32 subtraction, xmm_offset;
     ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
 
@@ -217,5 +216,4 @@ void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, BitSet32 regs, s
     }
 }
 
-} // namespace X64
-} // namespace Common
+} // namespace Common::X64
diff --git a/src/common/x64/xbyak_util.h b/src/common/x64/xbyak_util.h
index ec76e0a47..02323a017 100644
--- a/src/common/x64/xbyak_util.h
+++ b/src/common/x64/xbyak_util.h
@@ -8,8 +8,7 @@
 #include <xbyak.h>
 #include "common/x64/xbyak_abi.h"
 
-namespace Common {
-namespace X64 {
+namespace Common::X64 {
 
 // Constants for use with cmpps/cmpss
 enum {
@@ -45,5 +44,4 @@ inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) {
     }
 }
 
-} // namespace X64
-} // namespace Common
+} // namespace Common::X64
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 8cf9fb405..67ad6109a 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -257,6 +257,10 @@ add_library(core STATIC
     hle/service/nvdrv/devices/nvhost_gpu.h
     hle/service/nvdrv/devices/nvhost_nvdec.cpp
     hle/service/nvdrv/devices/nvhost_nvdec.h
+    hle/service/nvdrv/devices/nvhost_nvjpg.cpp
+    hle/service/nvdrv/devices/nvhost_nvjpg.h
+    hle/service/nvdrv/devices/nvhost_vic.cpp
+    hle/service/nvdrv/devices/nvhost_vic.h
     hle/service/nvdrv/devices/nvmap.cpp
     hle/service/nvdrv/devices/nvmap.h
     hle/service/nvdrv/interface.cpp
diff --git a/src/core/arm/dynarmic/arm_dynarmic.cpp b/src/core/arm/dynarmic/arm_dynarmic.cpp
index ceb3f7683..20e5200a8 100644
--- a/src/core/arm/dynarmic/arm_dynarmic.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic.cpp
@@ -86,7 +86,16 @@ public:
     }
 
     void AddTicks(u64 ticks) override {
-        CoreTiming::AddTicks(ticks - num_interpreted_instructions);
+        // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
+        // rough approximation of the amount of executed ticks in the system, it may be thrown off
+        // if not all cores are doing a similar amount of work. Instead of doing this, we should
+        // devise a way so that timing is consistent across all cores without increasing the ticks 4
+        // times.
+        u64 amortized_ticks = (ticks - num_interpreted_instructions) / Core::NUM_CPU_CORES;
+        // Always execute at least one tick.
+        amortized_ticks = std::max<u64>(amortized_ticks, 1);
+
+        CoreTiming::AddTicks(amortized_ticks);
         num_interpreted_instructions = 0;
     }
     u64 GetTicksRemaining() override {
@@ -234,9 +243,7 @@ void ARM_Dynarmic::LoadContext(const ThreadContext& ctx) {
 }
 
 void ARM_Dynarmic::PrepareReschedule() {
-    if (jit->IsExecuting()) {
-        jit->HaltExecution();
-    }
+    jit->HaltExecution();
 }
 
 void ARM_Dynarmic::ClearInstructionCache() {
diff --git a/src/core/core_cpu.cpp b/src/core/core_cpu.cpp
index 46a522fcd..b042ee02b 100644
--- a/src/core/core_cpu.cpp
+++ b/src/core/core_cpu.cpp
@@ -14,6 +14,7 @@
 #include "core/core_timing.h"
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/lock.h"
 #include "core/settings.h"
 
 namespace Core {
@@ -90,6 +91,7 @@ void Cpu::RunLoop(bool tight_loop) {
         LOG_TRACE(Core, "Core-{} idling", core_index);
 
         if (IsMainCore()) {
+            // TODO(Subv): Only let CoreTiming idle if all 4 cores are idling.
             CoreTiming::Idle();
             CoreTiming::Advance();
         }
@@ -125,6 +127,8 @@ void Cpu::Reschedule() {
     }
 
     reschedule_pending = false;
+    // Lock the global kernel mutex when we manipulate the HLE state
+    std::lock_guard<std::recursive_mutex> lock(HLE::g_hle_lock);
     scheduler->Reschedule();
 }
 
diff --git a/src/core/core_cpu.h b/src/core/core_cpu.h
index 976952903..56cdae194 100644
--- a/src/core/core_cpu.h
+++ b/src/core/core_cpu.h
@@ -79,7 +79,7 @@ private:
     std::shared_ptr<CpuBarrier> cpu_barrier;
     std::shared_ptr<Kernel::Scheduler> scheduler;
 
-    bool reschedule_pending{};
+    std::atomic<bool> reschedule_pending = false;
     size_t core_index;
 };
 
diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index d3bb6f818..7953c8720 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -56,6 +56,9 @@ static u64 event_fifo_id;
 // to the event_queue by the emu thread
 static Common::MPSCQueue<Event, false> ts_queue;
 
+// The queue for unscheduling events from other threads in a thread-safe manner
+static Common::MPSCQueue<std::pair<const EventType*, u64>, false> unschedule_queue;
+
 constexpr int MAX_SLICE_LENGTH = 20000;
 
 static s64 idled_cycles;
@@ -135,11 +138,9 @@ void ClearPendingEvents() {
 void ScheduleEvent(s64 cycles_into_future, const EventType* event_type, u64 userdata) {
     ASSERT(event_type != nullptr);
     s64 timeout = GetTicks() + cycles_into_future;
-
     // If this event needs to be scheduled before the next advance(), force one early
     if (!is_global_timer_sane)
         ForceExceptionCheck(cycles_into_future);
-
     event_queue.emplace_back(Event{timeout, event_fifo_id++, userdata, event_type});
     std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
 }
@@ -160,6 +161,10 @@ void UnscheduleEvent(const EventType* event_type, u64 userdata) {
     }
 }
 
+void UnscheduleEventThreadsafe(const EventType* event_type, u64 userdata) {
+    unschedule_queue.Push(std::make_pair(event_type, userdata));
+}
+
 void RemoveEvent(const EventType* event_type) {
     auto itr = std::remove_if(event_queue.begin(), event_queue.end(),
                               [&](const Event& e) { return e.type == event_type; });
@@ -196,6 +201,9 @@ void MoveEvents() {
 
 void Advance() {
     MoveEvents();
+    for (std::pair<const EventType*, u64> ev; unschedule_queue.Pop(ev);) {
+        UnscheduleEvent(ev.first, ev.second);
+    }
 
     int cycles_executed = slice_length - downcount;
     global_timer += cycles_executed;
diff --git a/src/core/core_timing.h b/src/core/core_timing.h
index dfa161c0d..9ed757bd7 100644
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -65,6 +65,7 @@ void ScheduleEvent(s64 cycles_into_future, const EventType* event_type, u64 user
 void ScheduleEventThreadsafe(s64 cycles_into_future, const EventType* event_type, u64 userdata);
 
 void UnscheduleEvent(const EventType* event_type, u64 userdata);
+void UnscheduleEventThreadsafe(const EventType* event_type, u64 userdata);
 
 /// We only permit one event of each type in the queue at a time.
 void RemoveEvent(const EventType* event_type);
diff --git a/src/core/file_sys/card_image.cpp b/src/core/file_sys/card_image.cpp
index 093c625ff..1d7c7fb10 100644
--- a/src/core/file_sys/card_image.cpp
+++ b/src/core/file_sys/card_image.cpp
@@ -4,11 +4,14 @@
 
 #include <array>
 #include <string>
-#include <core/loader/loader.h>
+
+#include <fmt/ostream.h>
+
 #include "common/logging/log.h"
 #include "core/file_sys/card_image.h"
 #include "core/file_sys/partition_filesystem.h"
 #include "core/file_sys/vfs_offset.h"
+#include "core/loader/loader.h"
 
 namespace FileSys {
 
@@ -111,19 +114,19 @@ VirtualFile XCI::GetNCAFileByType(NCAContentType type) const {
     return nullptr;
 }
 
-std::vector<std::shared_ptr<VfsFile>> XCI::GetFiles() const {
+std::vector<VirtualFile> XCI::GetFiles() const {
     return {};
 }
 
-std::vector<std::shared_ptr<VfsDirectory>> XCI::GetSubdirectories() const {
-    return std::vector<std::shared_ptr<VfsDirectory>>();
+std::vector<VirtualDir> XCI::GetSubdirectories() const {
+    return {};
 }
 
 std::string XCI::GetName() const {
     return file->GetName();
 }
 
-std::shared_ptr<VfsDirectory> XCI::GetParentDirectory() const {
+VirtualDir XCI::GetParentDirectory() const {
     return file->GetContainingDirectory();
 }
 
@@ -146,7 +149,7 @@ Loader::ResultStatus XCI::AddNCAFromPartition(XCIPartition part) {
             const u16 error_id = static_cast<u16>(nca->GetStatus());
             LOG_CRITICAL(Loader, "Could not load NCA {}/{}, failed with error code {:04X} ({})",
                          partition_names[static_cast<size_t>(part)], nca->GetName(), error_id,
-                         Loader::GetMessageForResultStatus(nca->GetStatus()));
+                         nca->GetStatus());
         }
     }
 
diff --git a/src/core/file_sys/card_image.h b/src/core/file_sys/card_image.h
index 3514bdf6c..a03d5264e 100644
--- a/src/core/file_sys/card_image.h
+++ b/src/core/file_sys/card_image.h
@@ -72,13 +72,13 @@ public:
     std::shared_ptr<NCA> GetNCAByType(NCAContentType type) const;
     VirtualFile GetNCAFileByType(NCAContentType type) const;
 
-    std::vector<std::shared_ptr<VfsFile>> GetFiles() const override;
+    std::vector<VirtualFile> GetFiles() const override;
 
-    std::vector<std::shared_ptr<VfsDirectory>> GetSubdirectories() const override;
+    std::vector<VirtualDir> GetSubdirectories() const override;
 
     std::string GetName() const override;
 
-    std::shared_ptr<VfsDirectory> GetParentDirectory() const override;
+    VirtualDir GetParentDirectory() const override;
 
 protected:
     bool ReplaceFileWithSubdirectory(VirtualFile file, VirtualDir dir) override;
diff --git a/src/core/file_sys/partition_filesystem.h b/src/core/file_sys/partition_filesystem.h
index 7c7a75816..be7bc32a8 100644
--- a/src/core/file_sys/partition_filesystem.h
+++ b/src/core/file_sys/partition_filesystem.h
@@ -13,7 +13,7 @@
 #include "core/file_sys/vfs.h"
 
 namespace Loader {
-enum class ResultStatus;
+enum class ResultStatus : u16;
 }
 
 namespace FileSys {
diff --git a/src/core/file_sys/program_metadata.h b/src/core/file_sys/program_metadata.h
index 06a7315db..74a91052b 100644
--- a/src/core/file_sys/program_metadata.h
+++ b/src/core/file_sys/program_metadata.h
@@ -13,7 +13,7 @@
 #include "partition_filesystem.h"
 
 namespace Loader {
-enum class ResultStatus;
+enum class ResultStatus : u16;
 }
 
 namespace FileSys {
diff --git a/src/core/file_sys/vfs.cpp b/src/core/file_sys/vfs.cpp
index 24e158962..a5ec50b1a 100644
--- a/src/core/file_sys/vfs.cpp
+++ b/src/core/file_sys/vfs.cpp
@@ -74,15 +74,15 @@ VirtualFile VfsFilesystem::CopyFile(std::string_view old_path_, std::string_view
     return new_file;
 }
 
-VirtualFile VfsFilesystem::MoveFile(std::string_view old_path_, std::string_view new_path_) {
-    const auto old_path = FileUtil::SanitizePath(old_path_);
-    const auto new_path = FileUtil::SanitizePath(new_path_);
+VirtualFile VfsFilesystem::MoveFile(std::string_view old_path, std::string_view new_path) {
+    const auto sanitized_old_path = FileUtil::SanitizePath(old_path);
+    const auto sanitized_new_path = FileUtil::SanitizePath(new_path);
 
     // Again, non-default impls are highly encouraged to provide a more optimized version of this.
-    auto out = CopyFile(old_path_, new_path_);
+    auto out = CopyFile(sanitized_old_path, sanitized_new_path);
     if (out == nullptr)
         return nullptr;
-    if (DeleteFile(old_path))
+    if (DeleteFile(sanitized_old_path))
         return out;
     return nullptr;
 }
@@ -137,15 +137,15 @@ VirtualDir VfsFilesystem::CopyDirectory(std::string_view old_path_, std::string_
     return new_dir;
 }
 
-VirtualDir VfsFilesystem::MoveDirectory(std::string_view old_path_, std::string_view new_path_) {
-    const auto old_path = FileUtil::SanitizePath(old_path_);
-    const auto new_path = FileUtil::SanitizePath(new_path_);
+VirtualDir VfsFilesystem::MoveDirectory(std::string_view old_path, std::string_view new_path) {
+    const auto sanitized_old_path = FileUtil::SanitizePath(old_path);
+    const auto sanitized_new_path = FileUtil::SanitizePath(new_path);
 
     // Non-default impls are highly encouraged to provide a more optimized version of this.
-    auto out = CopyDirectory(old_path_, new_path_);
+    auto out = CopyDirectory(sanitized_old_path, sanitized_new_path);
     if (out == nullptr)
         return nullptr;
-    if (DeleteDirectory(old_path))
+    if (DeleteDirectory(sanitized_old_path))
         return out;
     return nullptr;
 }
diff --git a/src/core/file_sys/vfs.h b/src/core/file_sys/vfs.h
index 141a053ce..78a63c59b 100644
--- a/src/core/file_sys/vfs.h
+++ b/src/core/file_sys/vfs.h
@@ -15,9 +15,9 @@
 
 namespace FileSys {
 
-struct VfsFilesystem;
-struct VfsFile;
-struct VfsDirectory;
+class VfsDirectory;
+class VfsFile;
+class VfsFilesystem;
 
 // Convenience typedefs to use Vfs* interfaces
 using VirtualFilesystem = std::shared_ptr<VfsFilesystem>;
@@ -34,8 +34,9 @@ enum class VfsEntryType {
 // A class representing an abstract filesystem. A default implementation given the root VirtualDir
 // is provided for convenience, but if the Vfs implementation has any additional state or
 // functionality, they will need to override.
-struct VfsFilesystem : NonCopyable {
-    VfsFilesystem(VirtualDir root);
+class VfsFilesystem : NonCopyable {
+public:
+    explicit VfsFilesystem(VirtualDir root);
     virtual ~VfsFilesystem();
 
     // Gets the friendly name for the filesystem.
@@ -81,7 +82,8 @@ protected:
 };
 
 // A class representing a file in an abstract filesystem.
-struct VfsFile : NonCopyable {
+class VfsFile : NonCopyable {
+public:
     virtual ~VfsFile();
 
     // Retrieves the file name.
@@ -179,7 +181,8 @@ struct VfsFile : NonCopyable {
 };
 
 // A class representing a directory in an abstract filesystem.
-struct VfsDirectory : NonCopyable {
+class VfsDirectory : NonCopyable {
+public:
     virtual ~VfsDirectory();
 
     // Retrives the file located at path as if the current directory was root. Returns nullptr if
@@ -295,7 +298,8 @@ protected:
 
 // A convenience partial-implementation of VfsDirectory that stubs out methods that should only work
 // if writable. This is to avoid redundant empty methods everywhere.
-struct ReadOnlyVfsDirectory : public VfsDirectory {
+class ReadOnlyVfsDirectory : public VfsDirectory {
+public:
     bool IsWritable() const override;
     bool IsReadable() const override;
     std::shared_ptr<VfsDirectory> CreateSubdirectory(std::string_view name) override;
diff --git a/src/core/file_sys/vfs_offset.h b/src/core/file_sys/vfs_offset.h
index 235970dc5..cb92d1570 100644
--- a/src/core/file_sys/vfs_offset.h
+++ b/src/core/file_sys/vfs_offset.h
@@ -15,7 +15,8 @@ namespace FileSys {
 // Similar to seeking to an offset.
 // If the file is writable, operations that would write past the end of the offset file will expand
 // the size of this wrapper.
-struct OffsetVfsFile : public VfsFile {
+class OffsetVfsFile : public VfsFile {
+public:
     OffsetVfsFile(std::shared_ptr<VfsFile> file, size_t size, size_t offset = 0,
                   std::string new_name = "", VirtualDir new_parent = nullptr);
 
diff --git a/src/core/file_sys/vfs_vector.h b/src/core/file_sys/vfs_vector.h
index dc39c9f2f..179f62e4b 100644
--- a/src/core/file_sys/vfs_vector.h
+++ b/src/core/file_sys/vfs_vector.h
@@ -10,7 +10,8 @@ namespace FileSys {
 
 // An implementation of VfsDirectory that maintains two vectors for subdirectories and files.
 // Vector data is supplied upon construction.
-struct VectorVfsDirectory : public VfsDirectory {
+class VectorVfsDirectory : public VfsDirectory {
+public:
     explicit VectorVfsDirectory(std::vector<VirtualFile> files = {},
                                 std::vector<VirtualDir> dirs = {}, std::string name = "",
                                 VirtualDir parent = nullptr);
diff --git a/src/core/frontend/emu_window.h b/src/core/frontend/emu_window.h
index 384dc7822..7006a37b3 100644
--- a/src/core/frontend/emu_window.h
+++ b/src/core/frontend/emu_window.h
@@ -34,9 +34,9 @@ class EmuWindow {
 public:
     /// Data structure to store emuwindow configuration
     struct WindowConfig {
-        bool fullscreen;
-        int res_width;
-        int res_height;
+        bool fullscreen = false;
+        int res_width = 0;
+        int res_height = 0;
         std::pair<unsigned, unsigned> min_client_area_size;
     };
 
diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp
index 1b0cd0abf..8c19e86d3 100644
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -11,7 +11,7 @@
 
 namespace Kernel {
 
-unsigned int Object::next_object_id;
+std::atomic<u32> Object::next_object_id{0};
 
 /// Initialize the kernel
 void Init() {
diff --git a/src/core/hle/kernel/object.h b/src/core/hle/kernel/object.h
index 83df68dfd..526ac9cc3 100644
--- a/src/core/hle/kernel/object.h
+++ b/src/core/hle/kernel/object.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <atomic>
 #include <string>
 #include <utility>
 
@@ -42,8 +43,8 @@ public:
     virtual ~Object();
 
     /// Returns a unique identifier for the object. For debugging purposes only.
-    unsigned int GetObjectId() const {
-        return object_id;
+    u32 GetObjectId() const {
+        return object_id.load(std::memory_order_relaxed);
     }
 
     virtual std::string GetTypeName() const {
@@ -61,23 +62,23 @@ public:
     bool IsWaitable() const;
 
 public:
-    static unsigned int next_object_id;
+    static std::atomic<u32> next_object_id;
 
 private:
     friend void intrusive_ptr_add_ref(Object*);
     friend void intrusive_ptr_release(Object*);
 
-    unsigned int ref_count = 0;
-    unsigned int object_id = next_object_id++;
+    std::atomic<u32> ref_count{0};
+    std::atomic<u32> object_id{next_object_id++};
 };
 
 // Special functions used by boost::instrusive_ptr to do automatic ref-counting
 inline void intrusive_ptr_add_ref(Object* object) {
-    ++object->ref_count;
+    object->ref_count.fetch_add(1, std::memory_order_relaxed);
 }
 
 inline void intrusive_ptr_release(Object* object) {
-    if (--object->ref_count == 0) {
+    if (object->ref_count.fetch_sub(1, std::memory_order_acq_rel) == 1) {
         delete object;
     }
 }
diff --git a/src/core/hle/kernel/scheduler.cpp b/src/core/hle/kernel/scheduler.cpp
index 94065c736..e770b9103 100644
--- a/src/core/hle/kernel/scheduler.cpp
+++ b/src/core/hle/kernel/scheduler.cpp
@@ -25,7 +25,7 @@ Scheduler::~Scheduler() {
     }
 }
 
-bool Scheduler::HaveReadyThreads() {
+bool Scheduler::HaveReadyThreads() const {
     std::lock_guard<std::mutex> lock(scheduler_mutex);
     return ready_queue.get_first() != nullptr;
 }
diff --git a/src/core/hle/kernel/scheduler.h b/src/core/hle/kernel/scheduler.h
index 1a4ee8f36..6a61ef64e 100644
--- a/src/core/hle/kernel/scheduler.h
+++ b/src/core/hle/kernel/scheduler.h
@@ -21,7 +21,7 @@ public:
     ~Scheduler();
 
     /// Returns whether there are any threads that are ready to run.
-    bool HaveReadyThreads();
+    bool HaveReadyThreads() const;
 
     /// Reschedules to the next available thread (call after current thread is suspended)
     void Reschedule();
diff --git a/src/core/hle/kernel/server_session.cpp b/src/core/hle/kernel/server_session.cpp
index d09ca5992..51a1ec160 100644
--- a/src/core/hle/kernel/server_session.cpp
+++ b/src/core/hle/kernel/server_session.cpp
@@ -152,7 +152,7 @@ ResultCode ServerSession::HandleSyncRequest(SharedPtr<Thread> thread) {
     // Handle scenario when ConvertToDomain command was issued, as we must do the conversion at the
     // end of the command such that only commands following this one are handled as domains
     if (convert_to_domain) {
-        ASSERT_MSG(domain_request_handlers.empty(), "already a domain");
+        ASSERT_MSG(IsSession(), "ServerSession is already a domain instance.");
         domain_request_handlers = {hle_handler};
         convert_to_domain = false;
     }
diff --git a/src/core/hle/kernel/server_session.h b/src/core/hle/kernel/server_session.h
index 2bce54fee..1a88e66b9 100644
--- a/src/core/hle/kernel/server_session.h
+++ b/src/core/hle/kernel/server_session.h
@@ -97,7 +97,12 @@ public:
 
     /// Returns true if the session has been converted to a domain, otherwise False
     bool IsDomain() const {
-        return !domain_request_handlers.empty();
+        return !IsSession();
+    }
+
+    /// Returns true if this session has not been converted to a domain, otherwise false.
+    bool IsSession() const {
+        return domain_request_handlers.empty();
     }
 
     /// Converts the session to a domain at the end of the current command
diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp
index 5db2db687..6be5c474e 100644
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -250,8 +250,11 @@ static ResultCode ArbitrateUnlock(VAddr mutex_addr) {
 }
 
 /// Break program execution
-static void Break(u64 unk_0, u64 unk_1, u64 unk_2) {
-    LOG_CRITICAL(Debug_Emulated, "Emulated program broke execution!");
+static void Break(u64 reason, u64 info1, u64 info2) {
+    LOG_CRITICAL(
+        Debug_Emulated,
+        "Emulated program broke execution! reason=0x{:016X}, info1=0x{:016X}, info2=0x{:016X}",
+        reason, info1, info2);
     ASSERT(false);
 }
 
@@ -532,7 +535,6 @@ static ResultCode CreateThread(Handle* out_handle, VAddr entry_point, u64 arg, V
     CASCADE_RESULT(thread->guest_handle, g_handle_table.Create(thread));
     *out_handle = thread->guest_handle;
 
-    Core::System::GetInstance().PrepareReschedule();
     Core::System::GetInstance().CpuCore(thread->processor_id).PrepareReschedule();
 
     LOG_TRACE(Kernel_SVC,
@@ -706,8 +708,7 @@ static ResultCode SignalProcessWideKey(VAddr condition_variable_addr, s32 target
             Handle owner_handle = static_cast<Handle>(mutex_val & Mutex::MutexOwnerMask);
             auto owner = g_handle_table.Get<Thread>(owner_handle);
             ASSERT(owner);
-            ASSERT(thread->status != ThreadStatus::Running);
-            thread->status = ThreadStatus::WaitMutex;
+            ASSERT(thread->status == ThreadStatus::WaitMutex);
             thread->wakeup_callback = nullptr;
 
             owner->AddMutexWaiter(thread);
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index b9022feae..cf4f94822 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -23,6 +23,7 @@
 #include "core/hle/kernel/object.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/lock.h"
 #include "core/hle/result.h"
 #include "core/memory.h"
 
@@ -104,6 +105,10 @@ void ExitCurrentThread() {
  */
 static void ThreadWakeupCallback(u64 thread_handle, int cycles_late) {
     const auto proper_handle = static_cast<Handle>(thread_handle);
+
+    // Lock the global kernel mutex when we enter the kernel HLE.
+    std::lock_guard<std::recursive_mutex> lock(HLE::g_hle_lock);
+
     SharedPtr<Thread> thread = wakeup_callback_handle_table.Get<Thread>(proper_handle);
     if (thread == nullptr) {
         LOG_CRITICAL(Kernel, "Callback fired for invalid thread {:08X}", proper_handle);
@@ -155,12 +160,14 @@ void Thread::WakeAfterDelay(s64 nanoseconds) {
     if (nanoseconds == -1)
         return;
 
-    CoreTiming::ScheduleEvent(CoreTiming::nsToCycles(nanoseconds), ThreadWakeupEventType,
-                              callback_handle);
+    // This function might be called from any thread so we have to be cautious and use the
+    // thread-safe version of ScheduleEvent.
+    CoreTiming::ScheduleEventThreadsafe(CoreTiming::nsToCycles(nanoseconds), ThreadWakeupEventType,
+                                        callback_handle);
 }
 
 void Thread::CancelWakeupTimer() {
-    CoreTiming::UnscheduleEvent(ThreadWakeupEventType, callback_handle);
+    CoreTiming::UnscheduleEventThreadsafe(ThreadWakeupEventType, callback_handle);
 }
 
 static boost::optional<s32> GetNextProcessorId(u64 mask) {
@@ -419,12 +426,33 @@ VAddr Thread::GetCommandBufferAddress() const {
 }
 
 void Thread::AddMutexWaiter(SharedPtr<Thread> thread) {
+    if (thread->lock_owner == this) {
+        // If the thread is already waiting for this thread to release the mutex, ensure that the
+        // waiters list is consistent and return without doing anything.
+        auto itr = std::find(wait_mutex_threads.begin(), wait_mutex_threads.end(), thread);
+        ASSERT(itr != wait_mutex_threads.end());
+        return;
+    }
+
+    // A thread can't wait on two different mutexes at the same time.
+    ASSERT(thread->lock_owner == nullptr);
+
+    // Ensure that the thread is not already in the list of mutex waiters
+    auto itr = std::find(wait_mutex_threads.begin(), wait_mutex_threads.end(), thread);
+    ASSERT(itr == wait_mutex_threads.end());
+
     thread->lock_owner = this;
     wait_mutex_threads.emplace_back(std::move(thread));
     UpdatePriority();
 }
 
 void Thread::RemoveMutexWaiter(SharedPtr<Thread> thread) {
+    ASSERT(thread->lock_owner == this);
+
+    // Ensure that the thread is in the list of mutex waiters
+    auto itr = std::find(wait_mutex_threads.begin(), wait_mutex_threads.end(), thread);
+    ASSERT(itr != wait_mutex_threads.end());
+
     boost::remove_erase(wait_mutex_threads, thread);
     thread->lock_owner = nullptr;
     UpdatePriority();
diff --git a/src/core/hle/service/audio/audout_u.cpp b/src/core/hle/service/audio/audout_u.cpp
index 108a7c6eb..ce709ccf4 100644
--- a/src/core/hle/service/audio/audout_u.cpp
+++ b/src/core/hle/service/audio/audout_u.cpp
@@ -28,7 +28,7 @@ constexpr int DefaultSampleRate{48000};
 class IAudioOut final : public ServiceFramework<IAudioOut> {
 public:
     IAudioOut(AudoutParams audio_params, AudioCore::AudioOut& audio_core)
-        : ServiceFramework("IAudioOut"), audio_params(audio_params), audio_core(audio_core) {
+        : ServiceFramework("IAudioOut"), audio_core(audio_core), audio_params(audio_params) {
 
         static const FunctionInfo functions[] = {
             {0, &IAudioOut::GetAudioOutState, "GetAudioOutState"},
diff --git a/src/core/hle/service/audio/audren_u.cpp b/src/core/hle/service/audio/audren_u.cpp
index f99304de5..9e75eb3a6 100644
--- a/src/core/hle/service/audio/audren_u.cpp
+++ b/src/core/hle/service/audio/audren_u.cpp
@@ -20,9 +20,9 @@ public:
     explicit IAudioRenderer(AudioCore::AudioRendererParameter audren_params)
         : ServiceFramework("IAudioRenderer") {
         static const FunctionInfo functions[] = {
-            {0, nullptr, "GetAudioRendererSampleRate"},
-            {1, nullptr, "GetAudioRendererSampleCount"},
-            {2, nullptr, "GetAudioRendererMixBufferCount"},
+            {0, &IAudioRenderer::GetAudioRendererSampleRate, "GetAudioRendererSampleRate"},
+            {1, &IAudioRenderer::GetAudioRendererSampleCount, "GetAudioRendererSampleCount"},
+            {2, &IAudioRenderer::GetAudioRendererMixBufferCount, "GetAudioRendererMixBufferCount"},
             {3, nullptr, "GetAudioRendererState"},
             {4, &IAudioRenderer::RequestUpdateAudioRenderer, "RequestUpdateAudioRenderer"},
             {5, &IAudioRenderer::StartAudioRenderer, "StartAudioRenderer"},
@@ -45,6 +45,27 @@ private:
         system_event->Signal();
     }
 
+    void GetAudioRendererSampleRate(Kernel::HLERequestContext& ctx) {
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push<u32>(renderer->GetSampleRate());
+        LOG_DEBUG(Service_Audio, "called");
+    }
+
+    void GetAudioRendererSampleCount(Kernel::HLERequestContext& ctx) {
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push<u32>(renderer->GetSampleCount());
+        LOG_DEBUG(Service_Audio, "called");
+    }
+
+    void GetAudioRendererMixBufferCount(Kernel::HLERequestContext& ctx) {
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push<u32>(renderer->GetMixBufferCount());
+        LOG_DEBUG(Service_Audio, "called");
+    }
+
     void RequestUpdateAudioRenderer(Kernel::HLERequestContext& ctx) {
         ctx.WriteBuffer(renderer->UpdateAudioRenderer(ctx.ReadBuffer()));
         IPC::ResponseBuilder rb{ctx, 2};
@@ -169,7 +190,8 @@ AudRenU::AudRenU() : ServiceFramework("audren:u") {
         {1, &AudRenU::GetAudioRendererWorkBufferSize, "GetAudioRendererWorkBufferSize"},
         {2, &AudRenU::GetAudioDevice, "GetAudioDevice"},
         {3, nullptr, "OpenAudioRendererAuto"},
-        {4, nullptr, "GetAudioDeviceServiceWithRevisionInfo"},
+        {4, &AudRenU::GetAudioDeviceServiceWithRevisionInfo,
+         "GetAudioDeviceServiceWithRevisionInfo"},
     };
     RegisterHandlers(functions);
 }
@@ -189,7 +211,7 @@ void AudRenU::GetAudioRendererWorkBufferSize(Kernel::HLERequestContext& ctx) {
     IPC::RequestParser rp{ctx};
     auto params = rp.PopRaw<AudioCore::AudioRendererParameter>();
 
-    u64 buffer_sz = Common::AlignUp(4 * params.unknown_8, 0x40);
+    u64 buffer_sz = Common::AlignUp(4 * params.mix_buffer_count, 0x40);
     buffer_sz += params.unknown_c * 1024;
     buffer_sz += 0x940 * (params.unknown_c + 1);
     buffer_sz += 0x3F0 * params.voice_count;
@@ -197,7 +219,7 @@ void AudRenU::GetAudioRendererWorkBufferSize(Kernel::HLERequestContext& ctx) {
     buffer_sz += Common::AlignUp(8 * params.voice_count, 0x10);
     buffer_sz +=
         Common::AlignUp((0x3C0 * (params.sink_count + params.unknown_c) + 4 * params.sample_count) *
-                            (params.unknown_8 + 6),
+                            (params.mix_buffer_count + 6),
                         0x40);
 
     if (IsFeatureSupported(AudioFeatures::Splitter, params.revision)) {
@@ -253,6 +275,16 @@ void AudRenU::GetAudioDevice(Kernel::HLERequestContext& ctx) {
     LOG_DEBUG(Service_Audio, "called");
 }
 
+void AudRenU::GetAudioDeviceServiceWithRevisionInfo(Kernel::HLERequestContext& ctx) {
+    IPC::ResponseBuilder rb{ctx, 2, 0, 1};
+
+    rb.Push(RESULT_SUCCESS);
+    rb.PushIpcInterface<Audio::IAudioDevice>();
+
+    LOG_WARNING(Service_Audio, "(STUBBED) called"); // TODO(ogniK): Figure out what is different
+                                                    // based on the current revision
+}
+
 bool AudRenU::IsFeatureSupported(AudioFeatures feature, u32_le revision) const {
     u32_be version_num = (revision - Common::MakeMagic('R', 'E', 'V', '0')); // Byte swap
     switch (feature) {
diff --git a/src/core/hle/service/audio/audren_u.h b/src/core/hle/service/audio/audren_u.h
index 14907f8ae..8600ac6e4 100644
--- a/src/core/hle/service/audio/audren_u.h
+++ b/src/core/hle/service/audio/audren_u.h
@@ -22,6 +22,7 @@ private:
     void OpenAudioRenderer(Kernel::HLERequestContext& ctx);
     void GetAudioRendererWorkBufferSize(Kernel::HLERequestContext& ctx);
     void GetAudioDevice(Kernel::HLERequestContext& ctx);
+    void GetAudioDeviceServiceWithRevisionInfo(Kernel::HLERequestContext& ctx);
 
     enum class AudioFeatures : u32 {
         Splitter,
diff --git a/src/core/hle/service/friend/friend.cpp b/src/core/hle/service/friend/friend.cpp
index 2b642c32f..f2b0e509a 100644
--- a/src/core/hle/service/friend/friend.cpp
+++ b/src/core/hle/service/friend/friend.cpp
@@ -26,7 +26,7 @@ public:
             {10600, nullptr, "DeclareOpenOnlinePlaySession"},
             {10601, &IFriendService::DeclareCloseOnlinePlaySession,
              "DeclareCloseOnlinePlaySession"},
-            {10610, nullptr, "UpdateUserPresence"},
+            {10610, &IFriendService::UpdateUserPresence, "UpdateUserPresence"},
             {10700, nullptr, "GetPlayHistoryRegistrationKey"},
             {10701, nullptr, "GetPlayHistoryRegistrationKeyWithNetworkServiceAccountId"},
             {10702, nullptr, "AddPlayHistory"},
@@ -99,6 +99,13 @@ private:
         IPC::ResponseBuilder rb{ctx, 2};
         rb.Push(RESULT_SUCCESS);
     }
+
+    void UpdateUserPresence(Kernel::HLERequestContext& ctx) {
+        // Stub used by Retro City Rampage
+        LOG_WARNING(Service_ACC, "(STUBBED) called");
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
 };
 
 void Module::Interface::CreateFriendService(Kernel::HLERequestContext& ctx) {
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index dcdfa0e19..970942d3f 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -291,6 +291,7 @@ private:
 class Hid final : public ServiceFramework<Hid> {
 public:
     Hid() : ServiceFramework("hid") {
+        // clang-format off
         static const FunctionInfo functions[] = {
             {0, &Hid::CreateAppletResource, "CreateAppletResource"},
             {1, &Hid::ActivateDebugPad, "ActivateDebugPad"},
@@ -333,15 +334,13 @@ public:
             {102, &Hid::SetSupportedNpadIdType, "SetSupportedNpadIdType"},
             {103, &Hid::ActivateNpad, "ActivateNpad"},
             {104, nullptr, "DeactivateNpad"},
-            {106, &Hid::AcquireNpadStyleSetUpdateEventHandle,
-             "AcquireNpadStyleSetUpdateEventHandle"},
-            {107, nullptr, "DisconnectNpad"},
+            {106, &Hid::AcquireNpadStyleSetUpdateEventHandle, "AcquireNpadStyleSetUpdateEventHandle"},
+            {107, &Hid::DisconnectNpad, "DisconnectNpad"},
             {108, &Hid::GetPlayerLedPattern, "GetPlayerLedPattern"},
             {109, nullptr, "ActivateNpadWithRevision"},
             {120, &Hid::SetNpadJoyHoldType, "SetNpadJoyHoldType"},
             {121, &Hid::GetNpadJoyHoldType, "GetNpadJoyHoldType"},
-            {122, &Hid::SetNpadJoyAssignmentModeSingleByDefault,
-             "SetNpadJoyAssignmentModeSingleByDefault"},
+            {122, &Hid::SetNpadJoyAssignmentModeSingleByDefault, "SetNpadJoyAssignmentModeSingleByDefault"},
             {123, nullptr, "SetNpadJoyAssignmentModeSingleByDefault"},
             {124, &Hid::SetNpadJoyAssignmentModeDual, "SetNpadJoyAssignmentModeDual"},
             {125, &Hid::MergeSingleJoyAsDualJoy, "MergeSingleJoyAsDualJoy"},
@@ -398,6 +397,8 @@ public:
             {1000, nullptr, "SetNpadCommunicationMode"},
             {1001, nullptr, "GetNpadCommunicationMode"},
         };
+        // clang-format on
+
         RegisterHandlers(functions);
 
         event = Kernel::Event::Create(Kernel::ResetType::OneShot, "hid:EventHandle");
@@ -496,6 +497,12 @@ private:
         LOG_WARNING(Service_HID, "(STUBBED) called");
     }
 
+    void DisconnectNpad(Kernel::HLERequestContext& ctx) {
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+        LOG_WARNING(Service_HID, "(STUBBED) called");
+    }
+
     void GetPlayerLedPattern(Kernel::HLERequestContext& ctx) {
         IPC::ResponseBuilder rb{ctx, 2};
         rb.Push(RESULT_SUCCESS);
diff --git a/src/core/hle/service/lm/lm.cpp b/src/core/hle/service/lm/lm.cpp
index 2e99ddf51..098da2a41 100644
--- a/src/core/hle/service/lm/lm.cpp
+++ b/src/core/hle/service/lm/lm.cpp
@@ -92,7 +92,11 @@ private:
 
         // Parse out log metadata
         u32 line{};
-        std::string message, filename, function;
+        std::string module;
+        std::string message;
+        std::string filename;
+        std::string function;
+        std::string thread;
         while (addr < end_addr) {
             const Field field{static_cast<Field>(Memory::Read8(addr++))};
             const size_t length{Memory::Read8(addr++)};
@@ -102,6 +106,8 @@ private:
             }
 
             switch (field) {
+            case Field::Skip:
+                break;
             case Field::Message:
                 message = Memory::ReadCString(addr, length);
                 break;
@@ -114,6 +120,12 @@ private:
             case Field::Function:
                 function = Memory::ReadCString(addr, length);
                 break;
+            case Field::Module:
+                module = Memory::ReadCString(addr, length);
+                break;
+            case Field::Thread:
+                thread = Memory::ReadCString(addr, length);
+                break;
             }
 
             addr += length;
@@ -128,12 +140,18 @@ private:
         if (!filename.empty()) {
             log_stream << filename << ':';
         }
+        if (!module.empty()) {
+            log_stream << module << ':';
+        }
         if (!function.empty()) {
             log_stream << function << ':';
         }
         if (line) {
             log_stream << std::to_string(line) << ':';
         }
+        if (!thread.empty()) {
+            log_stream << thread << ':';
+        }
         if (log_stream.str().length() > 0 && log_stream.str().back() == ':') {
             log_stream << ' ';
         }
@@ -142,7 +160,7 @@ private:
         if (header.IsTailLog()) {
             switch (header.severity) {
             case MessageHeader::Severity::Trace:
-                LOG_TRACE(Debug_Emulated, "{}", log_stream.str());
+                LOG_DEBUG(Debug_Emulated, "{}", log_stream.str());
                 break;
             case MessageHeader::Severity::Info:
                 LOG_INFO(Debug_Emulated, "{}", log_stream.str());
diff --git a/src/core/hle/service/mm/mm_u.cpp b/src/core/hle/service/mm/mm_u.cpp
index 08f45b78a..7b91bb258 100644
--- a/src/core/hle/service/mm/mm_u.cpp
+++ b/src/core/hle/service/mm/mm_u.cpp
@@ -9,42 +9,63 @@
 
 namespace Service::MM {
 
-void InstallInterfaces(SM::ServiceManager& service_manager) {
-    std::make_shared<MM_U>()->InstallAsService(service_manager);
-}
+class MM_U final : public ServiceFramework<MM_U> { // service registered under the name "mm:u"
+public:
+    explicit MM_U() : ServiceFramework{"mm:u"} {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, &MM_U::Initialize, "InitializeOld"}, // entries 0-3 ("...Old") share the
+            {1, &MM_U::Finalize, "FinalizeOld"},     // handlers used by entries 4-7
+            {2, &MM_U::SetAndWait, "SetAndWaitOld"},
+            {3, &MM_U::Get, "GetOld"},
+            {4, &MM_U::Initialize, "Initialize"},
+            {5, &MM_U::Finalize, "Finalize"},
+            {6, &MM_U::SetAndWait, "SetAndWait"},
+            {7, &MM_U::Get, "Get"},
+        };
+        // clang-format on
 
-void MM_U::Initialize(Kernel::HLERequestContext& ctx) {
-    LOG_WARNING(Service_MM, "(STUBBED) called");
-    IPC::ResponseBuilder rb{ctx, 2};
-    rb.Push(RESULT_SUCCESS);
-}
+        RegisterHandlers(functions);
+    }
 
-void MM_U::SetAndWait(Kernel::HLERequestContext& ctx) {
-    IPC::RequestParser rp{ctx};
-    min = rp.Pop<u32>();
-    max = rp.Pop<u32>();
-    current = min;
+private:
+    void Initialize(Kernel::HLERequestContext& ctx) { // stub: warns, replies RESULT_SUCCESS
+        LOG_WARNING(Service_MM, "(STUBBED) called");
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
 
-    LOG_WARNING(Service_MM, "(STUBBED) called, min=0x{:X}, max=0x{:X}", min, max);
-    IPC::ResponseBuilder rb{ctx, 2};
-    rb.Push(RESULT_SUCCESS);
-}
+    void Finalize(Kernel::HLERequestContext& ctx) { // stub: warns, replies RESULT_SUCCESS
+        LOG_WARNING(Service_MM, "(STUBBED) called");
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
 
-void MM_U::Get(Kernel::HLERequestContext& ctx) {
-    LOG_WARNING(Service_MM, "(STUBBED) called");
-    IPC::ResponseBuilder rb{ctx, 3};
-    rb.Push(RESULT_SUCCESS);
-    rb.Push(current);
-}
+    void SetAndWait(Kernel::HLERequestContext& ctx) { // stub: no waiting is performed here
+        IPC::RequestParser rp{ctx};
+        min = rp.Pop<u32>(); // first u32 popped from the request
+        max = rp.Pop<u32>(); // second u32 popped from the request
+        current = min; // Get() reports this value afterwards
 
-MM_U::MM_U() : ServiceFramework("mm:u") {
-    static const FunctionInfo functions[] = {
-        {0, nullptr, "InitializeOld"},        {1, nullptr, "FinalizeOld"},
-        {2, nullptr, "SetAndWaitOld"},        {3, nullptr, "GetOld"},
-        {4, &MM_U::Initialize, "Initialize"}, {5, nullptr, "Finalize"},
-        {6, &MM_U::SetAndWait, "SetAndWait"}, {7, &MM_U::Get, "Get"},
-    };
-    RegisterHandlers(functions);
+        LOG_WARNING(Service_MM, "(STUBBED) called, min=0x{:X}, max=0x{:X}", min, max);
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void Get(Kernel::HLERequestContext& ctx) { // replies RESULT_SUCCESS followed by current
+        LOG_WARNING(Service_MM, "(STUBBED) called");
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push(current);
+    }
+
+    u32 min{0};     // written by SetAndWait
+    u32 max{0};     // written by SetAndWait; not read anywhere in this class
+    u32 current{0}; // set to min by SetAndWait, returned by Get
+};
+
+void InstallInterfaces(SM::ServiceManager& service_manager) {
+    std::make_shared<MM_U>()->InstallAsService(service_manager);
 }
 
 } // namespace Service::MM
diff --git a/src/core/hle/service/mm/mm_u.h b/src/core/hle/service/mm/mm_u.h
index 79eeedf9c..5439fa653 100644
--- a/src/core/hle/service/mm/mm_u.h
+++ b/src/core/hle/service/mm/mm_u.h
@@ -8,21 +8,6 @@
 
 namespace Service::MM {
 
-class MM_U final : public ServiceFramework<MM_U> {
-public:
-    MM_U();
-    ~MM_U() = default;
-
-private:
-    void Initialize(Kernel::HLERequestContext& ctx);
-    void SetAndWait(Kernel::HLERequestContext& ctx);
-    void Get(Kernel::HLERequestContext& ctx);
-
-    u32 min{0};
-    u32 max{0};
-    u32 current{0};
-};
-
 /// Registers all MM services with the specified service manager.
 void InstallInterfaces(SM::ServiceManager& service_manager);
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp
new file mode 100644
index 000000000..51f01077b
--- /dev/null
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp
@@ -0,0 +1,41 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstring>
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/hle/service/nvdrv/devices/nvhost_nvjpg.h"
+
+namespace Service::Nvidia::Devices {
+
+/// Dispatches an ioctl issued on this device to its handler and returns the handler's result.
+/// Commands without a handler reach UNIMPLEMENTED_MSG and return 0.
+u32 nvhost_nvjpg::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+    LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
+              command.raw, input.size(), output.size());
+
+    switch (static_cast<IoctlCommand>(command.raw)) {
+    case IoctlCommand::IocSetNVMAPfdCommand:
+        return SetNVMAPfd(input, output);
+    }
+
+    UNIMPLEMENTED_MSG("Unimplemented ioctl");
+    return 0;
+}
+
+/// Stores the nvmap fd carried in `input` and returns 0; `output` is not written.
+u32 nvhost_nvjpg::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
+    // Value-initialised, so any bytes a short input does not cover stay zero.
+    IoctlSetNvmapFD params{};
+    // Clamp the copy to the struct size: `input` is sized by the caller, and copying
+    // input.size() bytes unconditionally would write past `params` for a larger buffer.
+    std::memcpy(&params, input.data(), std::min(input.size(), sizeof(params)));
+    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
+    nvmap_fd = params.nvmap_fd;
+    return 0;
+}
+
+} // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h
new file mode 100644
index 000000000..2b0eb43ee
--- /dev/null
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h
@@ -0,0 +1,36 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+#include "common/common_types.h"
+#include "common/swap.h"
+#include "core/hle/service/nvdrv/devices/nvdevice.h"
+
+namespace Service::Nvidia::Devices {
+
+class nvhost_nvjpg final : public nvdevice {
+public:
+    nvhost_nvjpg() = default;
+    ~nvhost_nvjpg() override = default;
+    /// Entry point for ioctls on this device (overrides the nvdevice virtual).
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+
+private:
+    enum class IoctlCommand : u32_le { // the only command value this class declares
+        IocSetNVMAPfdCommand = 0x40044801,
+    };
+    /// Parameter block read by SetNVMAPfd; must stay exactly 4 bytes (asserted below).
+    struct IoctlSetNvmapFD {
+        u32_le nvmap_fd;
+    };
+    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
+
+    u32_le nvmap_fd{}; // fd received through SetNVMAPfd; value-initialised until then
+
+    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output); // see IoctlCommand
+};
+
+} // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
new file mode 100644
index 000000000..fcb488d50
--- /dev/null
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -0,0 +1,41 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstring>
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/hle/service/nvdrv/devices/nvhost_vic.h"
+
+namespace Service::Nvidia::Devices {
+
+/// Dispatches an ioctl issued on this device to its handler and returns the handler's result.
+/// Commands without a handler reach UNIMPLEMENTED_MSG and return 0.
+u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+    LOG_DEBUG(Service_NVDRV, "called, command=0x{:08X}, input_size=0x{:X}, output_size=0x{:X}",
+              command.raw, input.size(), output.size());
+
+    switch (static_cast<IoctlCommand>(command.raw)) {
+    case IoctlCommand::IocSetNVMAPfdCommand:
+        return SetNVMAPfd(input, output);
+    }
+
+    UNIMPLEMENTED_MSG("Unimplemented ioctl");
+    return 0;
+}
+
+/// Stores the nvmap fd carried in `input` and returns 0; `output` is not written.
+u32 nvhost_vic::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
+    // Value-initialised, so any bytes a short input does not cover stay zero.
+    IoctlSetNvmapFD params{};
+    // Clamp the copy to the struct size: `input` is sized by the caller, and copying
+    // input.size() bytes unconditionally would write past `params` for a larger buffer.
+    std::memcpy(&params, input.data(), std::min(input.size(), sizeof(params)));
+    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);
+    nvmap_fd = params.nvmap_fd;
+    return 0;
+}
+
+} // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.h b/src/core/hle/service/nvdrv/devices/nvhost_vic.h
new file mode 100644
index 000000000..c7d681e52
--- /dev/null
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h
@@ -0,0 +1,36 @@
+// Copyright 2018 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+#include "common/common_types.h"
+#include "common/swap.h"
+#include "core/hle/service/nvdrv/devices/nvdevice.h"
+
+namespace Service::Nvidia::Devices {
+
+class nvhost_vic final : public nvdevice {
+public:
+    nvhost_vic() = default;
+    ~nvhost_vic() override = default;
+    /// Entry point for ioctls on this device (overrides the nvdevice virtual).
+    u32 ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+
+private:
+    enum class IoctlCommand : u32_le { // the only command value this class declares
+        IocSetNVMAPfdCommand = 0x40044801,
+    };
+    /// Parameter block read by SetNVMAPfd; must stay exactly 4 bytes (asserted below).
+    struct IoctlSetNvmapFD {
+        u32_le nvmap_fd;
+    };
+    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
+
+    u32_le nvmap_fd{}; // fd received through SetNVMAPfd; value-initialised until then
+
+    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output); // see IoctlCommand
+};
+
+} // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/nvdrv.cpp b/src/core/hle/service/nvdrv/nvdrv.cpp
index 427f4b574..2de39822f 100644
--- a/src/core/hle/service/nvdrv/nvdrv.cpp
+++ b/src/core/hle/service/nvdrv/nvdrv.cpp
@@ -12,6 +12,8 @@
 #include "core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h"
 #include "core/hle/service/nvdrv/devices/nvhost_gpu.h"
 #include "core/hle/service/nvdrv/devices/nvhost_nvdec.h"
+#include "core/hle/service/nvdrv/devices/nvhost_nvjpg.h"
+#include "core/hle/service/nvdrv/devices/nvhost_vic.h"
 #include "core/hle/service/nvdrv/devices/nvmap.h"
 #include "core/hle/service/nvdrv/interface.h"
 #include "core/hle/service/nvdrv/nvdrv.h"
@@ -39,6 +41,8 @@ Module::Module() {
     devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(nvmap_dev);
     devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>();
     devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>();
+    devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>();
+    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>();
 }
 
 u32 Module::Open(const std::string& device_name) {
diff --git a/src/core/hle/service/service.h b/src/core/hle/service/service.h
index 8a294c0f2..cd9c74f3d 100644
--- a/src/core/hle/service/service.h
+++ b/src/core/hle/service/service.h
@@ -23,7 +23,7 @@ class HLERequestContext;
 } // namespace Kernel
 
 namespace FileSys {
-struct VfsFilesystem;
+class VfsFilesystem;
 }
 
 namespace Service {
diff --git a/src/core/hle/service/sm/controller.cpp b/src/core/hle/service/sm/controller.cpp
index 518a0cc46..1cef73216 100644
--- a/src/core/hle/service/sm/controller.cpp
+++ b/src/core/hle/service/sm/controller.cpp
@@ -10,7 +10,7 @@
 namespace Service::SM {
 
 void Controller::ConvertSessionToDomain(Kernel::HLERequestContext& ctx) {
-    ASSERT_MSG(!ctx.Session()->IsDomain(), "session is alread a domain");
+    ASSERT_MSG(ctx.Session()->IsSession(), "Session is already a domain");
     ctx.Session()->ConvertToDomain();
 
     IPC::ResponseBuilder rb{ctx, 3};
@@ -41,7 +41,7 @@ void Controller::DuplicateSessionEx(Kernel::HLERequestContext& ctx) {
 void Controller::QueryPointerBufferSize(Kernel::HLERequestContext& ctx) {
     IPC::ResponseBuilder rb{ctx, 3};
     rb.Push(RESULT_SUCCESS);
-    rb.Push<u32>(0x500);
+    rb.Push<u16>(0x500);
 
     LOG_WARNING(Service, "(STUBBED) called");
 }
diff --git a/src/core/loader/deconstructed_rom_directory.cpp b/src/core/loader/deconstructed_rom_directory.cpp
index de05f21d8..d575a9bea 100644
--- a/src/core/loader/deconstructed_rom_directory.cpp
+++ b/src/core/loader/deconstructed_rom_directory.cpp
@@ -118,7 +118,6 @@ ResultStatus AppLoader_DeconstructedRomDirectory::Load(
 
     process->program_id = metadata.GetTitleID();
     process->svc_access_mask.set();
-    process->address_mappings = default_address_mappings;
     process->resource_limit =
         Kernel::ResourceLimit::GetForCategory(Kernel::ResourceLimitCategory::APPLICATION);
     process->Run(Memory::PROCESS_IMAGE_VADDR, metadata.GetMainThreadPriority(),
diff --git a/src/core/loader/elf.cpp b/src/core/loader/elf.cpp
index 401cad3ab..6420a7f11 100644
--- a/src/core/loader/elf.cpp
+++ b/src/core/loader/elf.cpp
@@ -398,7 +398,6 @@ ResultStatus AppLoader_ELF::Load(Kernel::SharedPtr<Kernel::Process>& process) {
 
     process->LoadModule(codeset, codeset->entrypoint);
     process->svc_access_mask.set();
-    process->address_mappings = default_address_mappings;
 
     // Attach the default resource limit (APPLICATION) to the process
     process->resource_limit =
diff --git a/src/core/loader/loader.cpp b/src/core/loader/loader.cpp
index 0e690abb3..70ef5d240 100644
--- a/src/core/loader/loader.cpp
+++ b/src/core/loader/loader.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include <memory>
+#include <ostream>
 #include <string>
 #include "common/logging/log.h"
 #include "common/string_util.h"
@@ -17,12 +18,6 @@
 
 namespace Loader {
 
-const std::initializer_list<Kernel::AddressMapping> default_address_mappings = {
-    {0x1FF50000, 0x8000, true},    // part of DSP RAM
-    {0x1FF70000, 0x8000, true},    // part of DSP RAM
-    {0x1F000000, 0x600000, false}, // entire VRAM
-};
-
 FileType IdentifyFile(FileSys::VirtualFile file) {
     FileType type;
 
@@ -127,14 +122,9 @@ constexpr std::array<const char*, 36> RESULT_MESSAGES{
     "There is no control data available.",
 };
 
-std::string GetMessageForResultStatus(ResultStatus status) {
-    return GetMessageForResultStatus(static_cast<size_t>(status));
-}
-
-std::string GetMessageForResultStatus(u16 status) {
-    if (status >= 36)
-        return "";
-    return RESULT_MESSAGES[status];
+std::ostream& operator<<(std::ostream& os, ResultStatus status) {
+    // std::array::at throws std::out_of_range when status has no entry in RESULT_MESSAGES.
+    return os << RESULT_MESSAGES.at(static_cast<size_t>(status));
 }
 
 /**
diff --git a/src/core/loader/loader.h b/src/core/loader/loader.h
index cfdadbee3..b74cfbf8a 100644
--- a/src/core/loader/loader.h
+++ b/src/core/loader/loader.h
@@ -5,7 +5,7 @@
 #pragma once
 
 #include <algorithm>
-#include <initializer_list>
+#include <iosfwd>
 #include <memory>
 #include <string>
 #include <utility>
@@ -56,7 +56,7 @@ FileType GuessFromFilename(const std::string& name);
 std::string GetFileTypeString(FileType type);
 
 /// Return type for functions in Loader namespace
-enum class ResultStatus {
+enum class ResultStatus : u16 {
     Success,
     ErrorAlreadyLoaded,
     ErrorNotImplemented,
@@ -95,8 +95,7 @@ enum class ResultStatus {
     ErrorNoControl,
 };
 
-std::string GetMessageForResultStatus(ResultStatus status);
-std::string GetMessageForResultStatus(u16 status);
+std::ostream& operator<<(std::ostream& os, ResultStatus status);
 
 /// Interface for loading an application
 class AppLoader : NonCopyable {
@@ -208,12 +207,6 @@ protected:
 };
 
 /**
- * Common address mappings found in most games, used for binary formats that don't have this
- * information.
- */
-extern const std::initializer_list<Kernel::AddressMapping> default_address_mappings;
-
-/**
  * Identifies a bootable file and return a suitable loader
  * @param file The bootable file
  * @return the best loader for this file
diff --git a/src/core/loader/nro.cpp b/src/core/loader/nro.cpp
index 908d91eab..2179cf2ea 100644
--- a/src/core/loader/nro.cpp
+++ b/src/core/loader/nro.cpp
@@ -186,7 +186,6 @@ ResultStatus AppLoader_NRO::Load(Kernel::SharedPtr<Kernel::Process>& process) {
     }
 
     process->svc_access_mask.set();
-    process->address_mappings = default_address_mappings;
     process->resource_limit =
         Kernel::ResourceLimit::GetForCategory(Kernel::ResourceLimitCategory::APPLICATION);
     process->Run(base_addr, THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE);
diff --git a/src/core/loader/nso.cpp b/src/core/loader/nso.cpp
index fee7d58c6..a94558ac5 100644
--- a/src/core/loader/nso.cpp
+++ b/src/core/loader/nso.cpp
@@ -152,7 +152,6 @@ ResultStatus AppLoader_NSO::Load(Kernel::SharedPtr<Kernel::Process>& process) {
     LOG_DEBUG(Loader, "loaded module {} @ 0x{:X}", file->GetName(), Memory::PROCESS_IMAGE_VADDR);
 
     process->svc_access_mask.set();
-    process->address_mappings = default_address_mappings;
     process->resource_limit =
         Kernel::ResourceLimit::GetForCategory(Kernel::ResourceLimitCategory::APPLICATION);
     process->Run(Memory::PROCESS_IMAGE_VADDR, THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE);
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 9f64b248b..2526ebf28 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -200,6 +200,14 @@ enum class IMinMaxExchange : u64 {
     XHi = 3,
 };
 
+enum class XmadMode : u64 { // value type of the Instruction::xmad.mode bit field
+    None = 0,
+    CLo = 1,
+    CHi = 2,
+    CSfu = 3,
+    CBcc = 4,
+};
+
 enum class FlowCondition : u64 {
     Always = 0xF,
     Fcsm_Tr = 0x1C, // TODO(bunnei): What is this used for?
@@ -457,6 +465,18 @@ union Instruction {
     } bra;
 
     union {
+        BitField<20, 16, u64> imm20_16;
+        BitField<36, 1, u64> product_shift_left;
+        BitField<37, 1, u64> merge_37;
+        BitField<48, 1, u64> sign_a;
+        BitField<49, 1, u64> sign_b;
+        BitField<50, 3, XmadMode> mode;
+        BitField<52, 1, u64> high_b; // NOTE(review): if args are <pos, bits>, shares bit 52 with mode
+        BitField<53, 1, u64> high_a;
+        BitField<56, 1, u64> merge_56;
+    } xmad; // presumably the operand layout of the XMAD_* opcodes (Type::Xmad) - confirm
+
+    union {
         BitField<20, 14, u64> offset;
         BitField<34, 5, u64> index;
     } cbuf34;
@@ -593,6 +613,7 @@ public:
         IntegerSetPredicate,
         PredicateSetPredicate,
         Conversion,
+        Xmad,
         Unknown,
     };
 
@@ -782,10 +803,10 @@ private:
             INST("010010110101----", Id::ISET_C, Type::IntegerSet, "ISET_C"),
             INST("0011011-0101----", Id::ISET_IMM, Type::IntegerSet, "ISET_IMM"),
             INST("0101000010010---", Id::PSETP, Type::PredicateSetPredicate, "PSETP"),
-            INST("0011011-00------", Id::XMAD_IMM, Type::Arithmetic, "XMAD_IMM"),
-            INST("0100111---------", Id::XMAD_CR, Type::Arithmetic, "XMAD_CR"),
-            INST("010100010-------", Id::XMAD_RC, Type::Arithmetic, "XMAD_RC"),
-            INST("0101101100------", Id::XMAD_RR, Type::Arithmetic, "XMAD_RR"),
+            INST("0011011-00------", Id::XMAD_IMM, Type::Xmad, "XMAD_IMM"),
+            INST("0100111---------", Id::XMAD_CR, Type::Xmad, "XMAD_CR"),
+            INST("010100010-------", Id::XMAD_RC, Type::Xmad, "XMAD_RC"),
+            INST("0101101100------", Id::XMAD_RR, Type::Xmad, "XMAD_RR"),
         };
 #undef INST
         std::stable_sort(table.begin(), table.end(), [](const auto& a, const auto& b) {
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 19e7f1161..5a593c1f7 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -46,8 +46,11 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
     case RenderTargetFormat::RGBA32_FLOAT:
     case RenderTargetFormat::RGBA32_UINT:
         return 16;
+    case RenderTargetFormat::RGBA16_UINT:
+    case RenderTargetFormat::RGBA16_UNORM:
     case RenderTargetFormat::RGBA16_FLOAT:
     case RenderTargetFormat::RG32_FLOAT:
+    case RenderTargetFormat::RG32_UINT:
         return 8;
     case RenderTargetFormat::RGBA8_UNORM:
     case RenderTargetFormat::RGBA8_SNORM:
@@ -61,12 +64,14 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
     case RenderTargetFormat::RG16_FLOAT:
     case RenderTargetFormat::R32_FLOAT:
     case RenderTargetFormat::R11G11B10_FLOAT:
+    case RenderTargetFormat::R32_UINT:
         return 4;
     case RenderTargetFormat::R16_UNORM:
     case RenderTargetFormat::R16_SNORM:
     case RenderTargetFormat::R16_UINT:
     case RenderTargetFormat::R16_SINT:
     case RenderTargetFormat::R16_FLOAT:
+    case RenderTargetFormat::RG8_UNORM:
     case RenderTargetFormat::RG8_SNORM:
         return 2;
     case RenderTargetFormat::R8_UNORM:
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index e008d8f26..97dcccb92 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -20,8 +20,11 @@ enum class RenderTargetFormat : u32 {
     NONE = 0x0,
     RGBA32_FLOAT = 0xC0,
     RGBA32_UINT = 0xC2,
+    RGBA16_UNORM = 0xC6,
+    RGBA16_UINT = 0xC9,
     RGBA16_FLOAT = 0xCA,
     RG32_FLOAT = 0xCB,
+    RG32_UINT = 0xCD,
     BGRA8_UNORM = 0xCF,
     RGB10_A2_UNORM = 0xD1,
     RGBA8_UNORM = 0xD5,
@@ -33,8 +36,10 @@ enum class RenderTargetFormat : u32 {
     RG16_UINT = 0xDD,
     RG16_FLOAT = 0xDE,
     R11G11B10_FLOAT = 0xE0,
+    R32_UINT = 0xE4,
     R32_FLOAT = 0xE5,
     B5G6R5_UNORM = 0xE8,
+    RG8_UNORM = 0xEA,
     RG8_SNORM = 0xEB,
     R16_UNORM = 0xEE,
     R16_SNORM = 0xEF,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 38a7b1413..9d1549fe9 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -36,30 +36,21 @@ MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
 MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
 
-RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window) : emu_window{window} {
+RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window)
+    : emu_window{window}, stream_buffer(GL_ARRAY_BUFFER, STREAM_BUFFER_SIZE) {
     // Create sampler objects
     for (size_t i = 0; i < texture_samplers.size(); ++i) {
         texture_samplers[i].Create();
         state.texture_units[i].sampler = texture_samplers[i].sampler.handle;
     }
 
-    // Create SSBOs
-    for (size_t stage = 0; stage < ssbos.size(); ++stage) {
-        for (size_t buffer = 0; buffer < ssbos[stage].size(); ++buffer) {
-            ssbos[stage][buffer].Create();
-            state.draw.const_buffers[stage][buffer].ssbo = ssbos[stage][buffer].handle;
-        }
-    }
-
     GLint ext_num;
     glGetIntegerv(GL_NUM_EXTENSIONS, &ext_num);
     for (GLint i = 0; i < ext_num; i++) {
         const std::string_view extension{
             reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, i))};
 
-        if (extension == "GL_ARB_buffer_storage") {
-            has_ARB_buffer_storage = true;
-        } else if (extension == "GL_ARB_direct_state_access") {
+        if (extension == "GL_ARB_direct_state_access") {
             has_ARB_direct_state_access = true;
         } else if (extension == "GL_ARB_separate_shader_objects") {
             has_ARB_separate_shader_objects = true;
@@ -86,47 +77,31 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window) : emu_wind
 
     hw_vao.Create();
 
-    stream_buffer = OGLStreamBuffer::MakeBuffer(has_ARB_buffer_storage, GL_ARRAY_BUFFER);
-    stream_buffer->Create(STREAM_BUFFER_SIZE, STREAM_BUFFER_SIZE / 2);
-    state.draw.vertex_buffer = stream_buffer->GetHandle();
+    state.draw.vertex_buffer = stream_buffer.GetHandle();
 
     shader_program_manager = std::make_unique<GLShader::ProgramManager>();
     state.draw.shader_program = 0;
     state.draw.vertex_array = hw_vao.handle;
     state.Apply();
 
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, stream_buffer->GetHandle());
-
-    for (unsigned index = 0; index < uniform_buffers.size(); ++index) {
-        auto& buffer = uniform_buffers[index];
-        buffer.Create();
-        glBindBuffer(GL_UNIFORM_BUFFER, buffer.handle);
-        glBufferData(GL_UNIFORM_BUFFER, sizeof(GLShader::MaxwellUniformData), nullptr,
-                     GL_STREAM_COPY);
-        glBindBufferBase(GL_UNIFORM_BUFFER, index, buffer.handle);
-    }
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, stream_buffer.GetHandle());
 
     glEnable(GL_BLEND);
 
+    glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment);
+
     LOG_CRITICAL(Render_OpenGL, "Sync fixed function OpenGL state here!");
 }
 
-RasterizerOpenGL::~RasterizerOpenGL() {
-    if (stream_buffer != nullptr) {
-        state.draw.vertex_buffer = stream_buffer->GetHandle();
-        state.Apply();
-        stream_buffer->Release();
-    }
-}
+RasterizerOpenGL::~RasterizerOpenGL() {}
 
 std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
                                                              GLintptr buffer_offset) {
     MICROPROFILE_SCOPE(OpenGL_VAO);
     const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
-    const auto& memory_manager = Core::System::GetInstance().GPU().memory_manager;
 
     state.draw.vertex_array = hw_vao.handle;
-    state.draw.vertex_buffer = stream_buffer->GetHandle();
+    state.draw.vertex_buffer = stream_buffer.GetHandle();
     state.Apply();
 
     // Upload all guest vertex arrays sequentially to our buffer
@@ -141,16 +116,15 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
         ASSERT(end > start);
         u64 size = end - start + 1;
 
-        // Copy vertex array data
-        Memory::ReadBlock(*memory_manager->GpuToCpuAddress(start), array_ptr, size);
+        GLintptr vertex_buffer_offset;
+        std::tie(array_ptr, buffer_offset, vertex_buffer_offset) =
+            UploadMemory(array_ptr, buffer_offset, start, size);
 
         // Bind the vertex array to the buffer at the current offset.
-        glBindVertexBuffer(index, stream_buffer->GetHandle(), buffer_offset, vertex_array.stride);
+        glBindVertexBuffer(index, stream_buffer.GetHandle(), vertex_buffer_offset,
+                           vertex_array.stride);
 
         ASSERT_MSG(vertex_array.divisor == 0, "Vertex buffer divisor unimplemented");
-
-        array_ptr += size;
-        buffer_offset += size;
     }
 
     // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL.
@@ -201,22 +175,12 @@ static GLShader::ProgramCode GetShaderProgramCode(Maxwell::ShaderProgram program
     return program_code;
 }
 
-void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
-    // Helper function for uploading uniform data
-    const auto copy_buffer = [&](GLuint handle, GLintptr offset, GLsizeiptr size) {
-        if (has_ARB_direct_state_access) {
-            glCopyNamedBufferSubData(stream_buffer->GetHandle(), handle, offset, 0, size);
-        } else {
-            glBindBuffer(GL_COPY_WRITE_BUFFER, handle);
-            glCopyBufferSubData(GL_ARRAY_BUFFER, GL_COPY_WRITE_BUFFER, offset, 0, size);
-        }
-    };
-
+std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
     auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
 
     // Next available bindpoints to use when uploading the const buffers and textures to the GLSL
     // shaders. The constbuffer bindpoint starts after the shader stage configuration bind points.
-    u32 current_constbuffer_bindpoint = static_cast<u32>(uniform_buffers.size());
+    u32 current_constbuffer_bindpoint = Tegra::Engines::Maxwell3D::Regs::MaxShaderStage;
     u32 current_texture_bindpoint = 0;
 
     for (size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@@ -228,22 +192,21 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
             continue;
         }
 
+        std::tie(buffer_ptr, buffer_offset) =
+            AlignBuffer(buffer_ptr, buffer_offset, static_cast<size_t>(uniform_buffer_alignment));
+
         const size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5
 
         GLShader::MaxwellUniformData ubo{};
         ubo.SetFromRegs(gpu.state.shader_stages[stage]);
         std::memcpy(buffer_ptr, &ubo, sizeof(ubo));
 
-        // Flush the buffer so that the GPU can see the data we just wrote.
-        glFlushMappedBufferRange(GL_ARRAY_BUFFER, buffer_offset, sizeof(ubo));
-
-        // Upload uniform data as one UBO per stage
-        const GLintptr ubo_offset = buffer_offset;
-        copy_buffer(uniform_buffers[stage].handle, ubo_offset,
-                    sizeof(GLShader::MaxwellUniformData));
+        // Bind the buffer
+        glBindBufferRange(GL_UNIFORM_BUFFER, stage, stream_buffer.GetHandle(), buffer_offset,
+                          sizeof(ubo));
 
-        buffer_ptr += sizeof(GLShader::MaxwellUniformData);
-        buffer_offset += sizeof(GLShader::MaxwellUniformData);
+        buffer_ptr += sizeof(ubo);
+        buffer_offset += sizeof(ubo);
 
         GLShader::ShaderSetup setup{GetShaderProgramCode(program)};
         GLShader::ShaderEntries shader_resources;
@@ -282,9 +245,9 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
             static_cast<Maxwell::ShaderStage>(stage));
 
         // Configure the const buffers for this shader stage.
-        current_constbuffer_bindpoint =
-            SetupConstBuffers(static_cast<Maxwell::ShaderStage>(stage), gl_stage_program,
-                              current_constbuffer_bindpoint, shader_resources.const_buffer_entries);
+        std::tie(buffer_ptr, buffer_offset, current_constbuffer_bindpoint) = SetupConstBuffers(
+            buffer_ptr, buffer_offset, static_cast<Maxwell::ShaderStage>(stage), gl_stage_program,
+            current_constbuffer_bindpoint, shader_resources.const_buffer_entries);
 
         // Configure the textures for this shader stage.
         current_texture_bindpoint =
@@ -299,6 +262,8 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
     }
 
     shader_program_manager->UseTrivialGeometryShader();
+
+    return {buffer_ptr, buffer_offset};
 }
 
 size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
@@ -432,6 +397,31 @@ void RasterizerOpenGL::Clear() {
     }
 }
 
+std::pair<u8*, GLintptr> RasterizerOpenGL::AlignBuffer(u8* buffer_ptr, GLintptr buffer_offset,
+                                                       size_t alignment) {
+    // Align the offset (not the mapped pointer), then advance the pointer by the same delta.
+    GLintptr offset_aligned =
+        static_cast<GLintptr>(Common::AlignUp(static_cast<size_t>(buffer_offset), alignment));
+    return {buffer_ptr + (offset_aligned - buffer_offset), offset_aligned}; // {ptr, offset}
+}
+
+/// Copies size bytes at gpu_addr into the aligned buffer; returns {ptr, offset, copy offset}.
+std::tuple<u8*, GLintptr, GLintptr> RasterizerOpenGL::UploadMemory(u8* buffer_ptr,
+                                                                   GLintptr buffer_offset,
+                                                                   Tegra::GPUVAddr gpu_addr,
+                                                                   size_t size, size_t alignment) {
+    std::tie(buffer_ptr, buffer_offset) = AlignBuffer(buffer_ptr, buffer_offset, alignment);
+    GLintptr uploaded_offset = buffer_offset;
+
+    const auto& memory_manager = Core::System::GetInstance().GPU().memory_manager;
+    const boost::optional<VAddr> cpu_addr{memory_manager->GpuToCpuAddress(gpu_addr)};
+    ASSERT_MSG(cpu_addr, "Invalid GPU address"); // *cpu_addr on an empty optional is UB
+    Memory::ReadBlock(*cpu_addr, buffer_ptr, size);
+    buffer_ptr += size;
+    buffer_offset += size;
+    return {buffer_ptr, buffer_offset, uploaded_offset};
+}
+
 void RasterizerOpenGL::DrawArrays() {
     if (accelerate_draw == AccelDraw::Disabled)
         return;
@@ -456,7 +446,7 @@ void RasterizerOpenGL::DrawArrays() {
     const u64 index_buffer_size{regs.index_array.count * regs.index_array.FormatSizeInBytes()};
     const unsigned vertex_num{is_indexed ? regs.index_array.count : regs.vertex_buffer.count};
 
-    state.draw.vertex_buffer = stream_buffer->GetHandle();
+    state.draw.vertex_buffer = stream_buffer.GetHandle();
     state.Apply();
 
     size_t buffer_size = CalculateVertexArraysSize();
@@ -466,41 +456,31 @@ void RasterizerOpenGL::DrawArrays() {
     }
 
     // Uniform space for the 5 shader stages
-    buffer_size = Common::AlignUp<size_t>(buffer_size, 4) +
-                  sizeof(GLShader::MaxwellUniformData) * Maxwell::MaxShaderStage;
+    buffer_size =
+        Common::AlignUp<size_t>(buffer_size, 4) +
+        (sizeof(GLShader::MaxwellUniformData) + uniform_buffer_alignment) * Maxwell::MaxShaderStage;
+
+    // Add space for at least 18 constant buffers
+    buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment);
 
     u8* buffer_ptr;
     GLintptr buffer_offset;
-    std::tie(buffer_ptr, buffer_offset) =
-        stream_buffer->Map(static_cast<GLsizeiptr>(buffer_size), 4);
+    std::tie(buffer_ptr, buffer_offset, std::ignore) =
+        stream_buffer.Map(static_cast<GLsizeiptr>(buffer_size), 4);
+    u8* buffer_ptr_base = buffer_ptr;
 
-    u8* offseted_buffer;
-    std::tie(offseted_buffer, buffer_offset) = SetupVertexArrays(buffer_ptr, buffer_offset);
-
-    offseted_buffer =
-        reinterpret_cast<u8*>(Common::AlignUp(reinterpret_cast<size_t>(offseted_buffer), 4));
-    buffer_offset = Common::AlignUp<size_t>(buffer_offset, 4);
+    std::tie(buffer_ptr, buffer_offset) = SetupVertexArrays(buffer_ptr, buffer_offset);
 
     // If indexed mode, copy the index buffer
     GLintptr index_buffer_offset = 0;
     if (is_indexed) {
-        const auto& memory_manager = Core::System::GetInstance().GPU().memory_manager;
-        const boost::optional<VAddr> index_data_addr{
-            memory_manager->GpuToCpuAddress(regs.index_array.StartAddress())};
-        Memory::ReadBlock(*index_data_addr, offseted_buffer, index_buffer_size);
-
-        index_buffer_offset = buffer_offset;
-        offseted_buffer += index_buffer_size;
-        buffer_offset += index_buffer_size;
+        std::tie(buffer_ptr, buffer_offset, index_buffer_offset) = UploadMemory(
+            buffer_ptr, buffer_offset, regs.index_array.StartAddress(), index_buffer_size);
     }
 
-    offseted_buffer =
-        reinterpret_cast<u8*>(Common::AlignUp(reinterpret_cast<size_t>(offseted_buffer), 4));
-    buffer_offset = Common::AlignUp<size_t>(buffer_offset, 4);
-
-    SetupShaders(offseted_buffer, buffer_offset);
+    std::tie(buffer_ptr, buffer_offset) = SetupShaders(buffer_ptr, buffer_offset);
 
-    stream_buffer->Unmap();
+    stream_buffer.Unmap(buffer_ptr - buffer_ptr_base);
 
     shader_program_manager->ApplyTo(state);
     state.Apply();
@@ -647,45 +627,32 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr
     }
 }
 
-u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, GLuint program,
-                                        u32 current_bindpoint,
-                                        const std::vector<GLShader::ConstBufferEntry>& entries) {
+std::tuple<u8*, GLintptr, u32> RasterizerOpenGL::SetupConstBuffers(
+    u8* buffer_ptr, GLintptr buffer_offset, Maxwell::ShaderStage stage, GLuint program,
+    u32 current_bindpoint, const std::vector<GLShader::ConstBufferEntry>& entries) {
     const auto& gpu = Core::System::GetInstance().GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
 
-    // Reset all buffer draw state for this stage.
-    for (auto& buffer : state.draw.const_buffers[static_cast<size_t>(stage)]) {
-        buffer.bindpoint = 0;
-        buffer.enabled = false;
-    }
-
     // Upload only the enabled buffers from the 16 constbuffers of each shader stage
     const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<size_t>(stage)];
 
     for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
         const auto& used_buffer = entries[bindpoint];
         const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()];
-        auto& buffer_draw_state =
-            state.draw.const_buffers[static_cast<size_t>(stage)][used_buffer.GetIndex()];
 
         if (!buffer.enabled) {
             continue;
         }
 
-        buffer_draw_state.enabled = true;
-        buffer_draw_state.bindpoint = current_bindpoint + bindpoint;
-
-        boost::optional<VAddr> addr = gpu.memory_manager->GpuToCpuAddress(buffer.address);
-
         size_t size = 0;
 
         if (used_buffer.IsIndirect()) {
             // Buffer is accessed indirectly, so upload the entire thing
-            size = buffer.size * sizeof(float);
+            size = buffer.size;
 
             if (size > MaxConstbufferSize) {
-                LOG_ERROR(HW_GPU, "indirect constbuffer size {} exceeds maximum {}", size,
-                          MaxConstbufferSize);
+                LOG_CRITICAL(HW_GPU, "indirect constbuffer size {} exceeds maximum {}", size,
+                             MaxConstbufferSize);
                 size = MaxConstbufferSize;
             }
         } else {
@@ -698,25 +665,26 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, GLuint progr
         size = Common::AlignUp(size, sizeof(GLvec4));
         ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
 
-        std::vector<u8> data(size);
-        Memory::ReadBlock(*addr, data.data(), data.size());
+        GLintptr const_buffer_offset;
+        std::tie(buffer_ptr, buffer_offset, const_buffer_offset) =
+            UploadMemory(buffer_ptr, buffer_offset, buffer.address, size,
+                         static_cast<size_t>(uniform_buffer_alignment));
 
-        glBindBuffer(GL_UNIFORM_BUFFER, buffer_draw_state.ssbo);
-        glBufferData(GL_UNIFORM_BUFFER, data.size(), data.data(), GL_DYNAMIC_DRAW);
-        glBindBuffer(GL_UNIFORM_BUFFER, 0);
+        glBindBufferRange(GL_UNIFORM_BUFFER, current_bindpoint + bindpoint,
+                          stream_buffer.GetHandle(), const_buffer_offset, size);
 
         // Now configure the bindpoint of the buffer inside the shader
         const std::string buffer_name = used_buffer.GetName();
         const GLuint index =
             glGetProgramResourceIndex(program, GL_UNIFORM_BLOCK, buffer_name.c_str());
         if (index != GL_INVALID_INDEX) {
-            glUniformBlockBinding(program, index, buffer_draw_state.bindpoint);
+            glUniformBlockBinding(program, index, current_bindpoint + bindpoint);
         }
     }
 
     state.Apply();
 
-    return current_bindpoint + static_cast<u32>(entries.size());
+    return {buffer_ptr, buffer_offset, current_bindpoint + static_cast<u32>(entries.size())};
 }
 
 u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, GLuint program, u32 current_unit,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index bd01dc0ae..74307f626 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -7,6 +7,7 @@
 #include <array>
 #include <cstddef>
 #include <memory>
+#include <tuple>
 #include <utility>
 #include <vector>
 #include <glad/glad.h>
@@ -100,9 +101,10 @@ private:
      * @param entries Vector describing the buffers that are actually used in the guest shader.
      * @returns The next available bindpoint for use in the next shader stage.
      */
-    u32 SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, GLuint program,
-                          u32 current_bindpoint,
-                          const std::vector<GLShader::ConstBufferEntry>& entries);
+    std::tuple<u8*, GLintptr, u32> SetupConstBuffers(
+        u8* buffer_ptr, GLintptr buffer_offset, Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+        GLuint program, u32 current_bindpoint,
+        const std::vector<GLShader::ConstBufferEntry>& entries);
 
     /*
      * Configures the current textures to use for the draw command.
@@ -139,7 +141,6 @@ private:
     /// Syncs the blend state to match the guest state
     void SyncBlendState();
 
-    bool has_ARB_buffer_storage = false;
     bool has_ARB_direct_state_access = false;
     bool has_ARB_separate_shader_objects = false;
     bool has_ARB_vertex_attrib_binding = false;
@@ -155,22 +156,24 @@ private:
     OGLVertexArray hw_vao;
 
     std::array<SamplerInfo, GLShader::NumTextureSamplers> texture_samplers;
-    std::array<std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers>,
-               Tegra::Engines::Maxwell3D::Regs::MaxShaderStage>
-        ssbos;
 
     static constexpr size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
-    std::unique_ptr<OGLStreamBuffer> stream_buffer;
+    OGLStreamBuffer stream_buffer;
     OGLBuffer uniform_buffer;
     OGLFramebuffer framebuffer;
+    GLint uniform_buffer_alignment;
 
     size_t CalculateVertexArraysSize() const;
 
     std::pair<u8*, GLintptr> SetupVertexArrays(u8* array_ptr, GLintptr buffer_offset);
 
-    std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::MaxShaderStage> uniform_buffers;
+    std::pair<u8*, GLintptr> SetupShaders(u8* buffer_ptr, GLintptr buffer_offset);
 
-    void SetupShaders(u8* buffer_ptr, GLintptr buffer_offset);
+    std::pair<u8*, GLintptr> AlignBuffer(u8* buffer_ptr, GLintptr buffer_offset, size_t alignment);
+
+    std::tuple<u8*, GLintptr, GLintptr> UploadMemory(u8* buffer_ptr, GLintptr buffer_offset,
+                                                     Tegra::GPUVAddr gpu_addr, size_t size,
+                                                     size_t alignment = 4);
 
     enum class AccelDraw { Disabled, Arrays, Indexed };
     AccelDraw accelerate_draw = AccelDraw::Disabled;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 84c250c63..b6947b97b 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -94,13 +94,15 @@ struct FormatTuple {
 static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_format_tuples = {{
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U
     {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false},                     // ABGR8S
-    {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false},    // B5G6R5
+    {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false},    // B5G6R5U
     {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, ComponentType::UNorm,
-     false}, // A2B10G10R10
-    {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, ComponentType::UNorm, false}, // A1B5G5R5
-    {GL_R8, GL_RED, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},                    // R8
+     false}, // A2B10G10R10U
+    {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, ComponentType::UNorm, false}, // A1B5G5R5U
+    {GL_R8, GL_RED, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},                    // R8U
     {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false},           // R8UI
     {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, ComponentType::Float, false},                 // RGBA16F
+    {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, ComponentType::UNorm, false},              // RGBA16U
+    {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, ComponentType::UInt, false},     // RGBA16UI
     {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, ComponentType::Float,
      false},                                                                     // R11FG11FB10F
     {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // RGBA32UI
@@ -114,16 +116,17 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_COMPRESSED_RG_RGTC2, GL_RG, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
      true},                                                                     // DXN2UNORM
     {GL_COMPRESSED_SIGNED_RG_RGTC2, GL_RG, GL_INT, ComponentType::SNorm, true}, // DXN2SNORM
-    {GL_COMPRESSED_RGBA_BPTC_UNORM_ARB, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
+    {GL_COMPRESSED_RGBA_BPTC_UNORM_ARB, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
      true},                                                                    // BC7U
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},        // ASTC_2D_4X4
-    {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},            // G8R8
+    {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},            // G8R8U
+    {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false},                     // G8R8S
     {GL_RGBA8, GL_BGRA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},        // BGRA8
     {GL_RGBA32F, GL_RGBA, GL_FLOAT, ComponentType::Float, false},              // RGBA32F
     {GL_RG32F, GL_RG, GL_FLOAT, ComponentType::Float, false},                  // RG32F
     {GL_R32F, GL_RED, GL_FLOAT, ComponentType::Float, false},                  // R32F
     {GL_R16F, GL_RED, GL_HALF_FLOAT, ComponentType::Float, false},             // R16F
-    {GL_R16, GL_RED, GL_UNSIGNED_SHORT, ComponentType::UNorm, false},          // R16UNORM
+    {GL_R16, GL_RED, GL_UNSIGNED_SHORT, ComponentType::UNorm, false},          // R16U
     {GL_R16_SNORM, GL_RED, GL_SHORT, ComponentType::SNorm, false},             // R16S
     {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ComponentType::UInt, false}, // R16UI
     {GL_R16I, GL_RED_INTEGER, GL_SHORT, ComponentType::SInt, false},           // R16I
@@ -134,7 +137,10 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_RG16_SNORM, GL_RG, GL_SHORT, ComponentType::SNorm, false},             // RG16S
     {GL_RGB32F, GL_RGB, GL_FLOAT, ComponentType::Float, false},                // RGB32F
     {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // SRGBA8
+    {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},                       // RG8U
     {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false},                                // RG8S
+    {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},              // RG32UI
+    {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},              // R32UI
 
     // DepthStencil formats
     {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, ComponentType::UNorm,
@@ -234,40 +240,71 @@ void MortonCopy(u32 stride, u32 block_height, u32 height, std::vector<u8>& gl_bu
 static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPUVAddr),
                             SurfaceParams::MaxPixelFormat>
     morton_to_gl_fns = {
-        MortonCopy<true, PixelFormat::ABGR8U>,       MortonCopy<true, PixelFormat::ABGR8S>,
-        MortonCopy<true, PixelFormat::B5G6R5>,       MortonCopy<true, PixelFormat::A2B10G10R10>,
-        MortonCopy<true, PixelFormat::A1B5G5R5>,     MortonCopy<true, PixelFormat::R8>,
-        MortonCopy<true, PixelFormat::R8UI>,         MortonCopy<true, PixelFormat::RGBA16F>,
-        MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::RGBA32UI>,
-        MortonCopy<true, PixelFormat::DXT1>,         MortonCopy<true, PixelFormat::DXT23>,
-        MortonCopy<true, PixelFormat::DXT45>,        MortonCopy<true, PixelFormat::DXN1>,
-        MortonCopy<true, PixelFormat::DXN2UNORM>,    MortonCopy<true, PixelFormat::DXN2SNORM>,
-        MortonCopy<true, PixelFormat::BC7U>,         MortonCopy<true, PixelFormat::ASTC_2D_4X4>,
-        MortonCopy<true, PixelFormat::G8R8>,         MortonCopy<true, PixelFormat::BGRA8>,
-        MortonCopy<true, PixelFormat::RGBA32F>,      MortonCopy<true, PixelFormat::RG32F>,
-        MortonCopy<true, PixelFormat::R32F>,         MortonCopy<true, PixelFormat::R16F>,
-        MortonCopy<true, PixelFormat::R16UNORM>,     MortonCopy<true, PixelFormat::R16S>,
-        MortonCopy<true, PixelFormat::R16UI>,        MortonCopy<true, PixelFormat::R16I>,
-        MortonCopy<true, PixelFormat::RG16>,         MortonCopy<true, PixelFormat::RG16F>,
-        MortonCopy<true, PixelFormat::RG16UI>,       MortonCopy<true, PixelFormat::RG16I>,
-        MortonCopy<true, PixelFormat::RG16S>,        MortonCopy<true, PixelFormat::RGB32F>,
-        MortonCopy<true, PixelFormat::SRGBA8>,       MortonCopy<true, PixelFormat::RG8S>,
-        MortonCopy<true, PixelFormat::Z24S8>,        MortonCopy<true, PixelFormat::S8Z24>,
-        MortonCopy<true, PixelFormat::Z32F>,         MortonCopy<true, PixelFormat::Z16>,
+        // clang-format off
+        MortonCopy<true, PixelFormat::ABGR8U>,
+        MortonCopy<true, PixelFormat::ABGR8S>,
+        MortonCopy<true, PixelFormat::B5G6R5U>,
+        MortonCopy<true, PixelFormat::A2B10G10R10U>,
+        MortonCopy<true, PixelFormat::A1B5G5R5U>,
+        MortonCopy<true, PixelFormat::R8U>,
+        MortonCopy<true, PixelFormat::R8UI>,
+        MortonCopy<true, PixelFormat::RGBA16F>,
+        MortonCopy<true, PixelFormat::RGBA16U>,
+        MortonCopy<true, PixelFormat::RGBA16UI>,
+        MortonCopy<true, PixelFormat::R11FG11FB10F>,
+        MortonCopy<true, PixelFormat::RGBA32UI>,
+        MortonCopy<true, PixelFormat::DXT1>,
+        MortonCopy<true, PixelFormat::DXT23>,
+        MortonCopy<true, PixelFormat::DXT45>,
+        MortonCopy<true, PixelFormat::DXN1>,
+        MortonCopy<true, PixelFormat::DXN2UNORM>,
+        MortonCopy<true, PixelFormat::DXN2SNORM>,
+        MortonCopy<true, PixelFormat::BC7U>,
+        MortonCopy<true, PixelFormat::ASTC_2D_4X4>,
+        MortonCopy<true, PixelFormat::G8R8U>,
+        MortonCopy<true, PixelFormat::G8R8S>,
+        MortonCopy<true, PixelFormat::BGRA8>,
+        MortonCopy<true, PixelFormat::RGBA32F>,
+        MortonCopy<true, PixelFormat::RG32F>,
+        MortonCopy<true, PixelFormat::R32F>,
+        MortonCopy<true, PixelFormat::R16F>,
+        MortonCopy<true, PixelFormat::R16U>,
+        MortonCopy<true, PixelFormat::R16S>,
+        MortonCopy<true, PixelFormat::R16UI>,
+        MortonCopy<true, PixelFormat::R16I>,
+        MortonCopy<true, PixelFormat::RG16>,
+        MortonCopy<true, PixelFormat::RG16F>,
+        MortonCopy<true, PixelFormat::RG16UI>,
+        MortonCopy<true, PixelFormat::RG16I>,
+        MortonCopy<true, PixelFormat::RG16S>,
+        MortonCopy<true, PixelFormat::RGB32F>,
+        MortonCopy<true, PixelFormat::SRGBA8>,
+        MortonCopy<true, PixelFormat::RG8U>,
+        MortonCopy<true, PixelFormat::RG8S>,
+        MortonCopy<true, PixelFormat::RG32UI>,
+        MortonCopy<true, PixelFormat::R32UI>,
+        MortonCopy<true, PixelFormat::Z24S8>,
+        MortonCopy<true, PixelFormat::S8Z24>,
+        MortonCopy<true, PixelFormat::Z32F>,
+        MortonCopy<true, PixelFormat::Z16>,
         MortonCopy<true, PixelFormat::Z32FS8>,
+        // clang-format on
 };
 
 static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPUVAddr),
                             SurfaceParams::MaxPixelFormat>
     gl_to_morton_fns = {
+        // clang-format off
         MortonCopy<false, PixelFormat::ABGR8U>,
         MortonCopy<false, PixelFormat::ABGR8S>,
-        MortonCopy<false, PixelFormat::B5G6R5>,
-        MortonCopy<false, PixelFormat::A2B10G10R10>,
-        MortonCopy<false, PixelFormat::A1B5G5R5>,
-        MortonCopy<false, PixelFormat::R8>,
+        MortonCopy<false, PixelFormat::B5G6R5U>,
+        MortonCopy<false, PixelFormat::A2B10G10R10U>,
+        MortonCopy<false, PixelFormat::A1B5G5R5U>,
+        MortonCopy<false, PixelFormat::R8U>,
         MortonCopy<false, PixelFormat::R8UI>,
         MortonCopy<false, PixelFormat::RGBA16F>,
+        MortonCopy<false, PixelFormat::RGBA16U>,
+        MortonCopy<false, PixelFormat::RGBA16UI>,
         MortonCopy<false, PixelFormat::R11FG11FB10F>,
         MortonCopy<false, PixelFormat::RGBA32UI>,
         // TODO(Subv): Swizzling DXT1/DXT23/DXT45/DXN1/DXN2/BC7U/ASTC_2D_4X4 formats is not
@@ -280,13 +317,14 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPU
         nullptr,
         nullptr,
         nullptr,
-        MortonCopy<false, PixelFormat::G8R8>,
+        MortonCopy<false, PixelFormat::G8R8U>,
+        MortonCopy<false, PixelFormat::G8R8S>,
         MortonCopy<false, PixelFormat::BGRA8>,
         MortonCopy<false, PixelFormat::RGBA32F>,
         MortonCopy<false, PixelFormat::RG32F>,
         MortonCopy<false, PixelFormat::R32F>,
         MortonCopy<false, PixelFormat::R16F>,
-        MortonCopy<false, PixelFormat::R16UNORM>,
+        MortonCopy<false, PixelFormat::R16U>,
         MortonCopy<false, PixelFormat::R16S>,
         MortonCopy<false, PixelFormat::R16UI>,
         MortonCopy<false, PixelFormat::R16I>,
@@ -297,12 +335,16 @@ static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPU
         MortonCopy<false, PixelFormat::RG16S>,
         MortonCopy<false, PixelFormat::RGB32F>,
         MortonCopy<false, PixelFormat::SRGBA8>,
+        MortonCopy<false, PixelFormat::RG8U>,
         MortonCopy<false, PixelFormat::RG8S>,
+        MortonCopy<false, PixelFormat::RG32UI>,
+        MortonCopy<false, PixelFormat::R32UI>,
         MortonCopy<false, PixelFormat::Z24S8>,
         MortonCopy<false, PixelFormat::S8Z24>,
         MortonCopy<false, PixelFormat::Z32F>,
         MortonCopy<false, PixelFormat::Z16>,
         MortonCopy<false, PixelFormat::Z32FS8>,
+        // clang-format on
 };
 
 // Allocate an uninitialized texture of appropriate size and format for the surface
@@ -422,7 +464,7 @@ static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) {
 }
 
 static void ConvertG8R8ToR8G8(std::vector<u8>& data, u32 width, u32 height) {
-    const auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::G8R8)};
+    const auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::G8R8U)};
     for (size_t y = 0; y < height; ++y) {
         for (size_t x = 0; x < width; ++x) {
             const size_t offset{bpp * (y * width + x)};
@@ -454,7 +496,8 @@ static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelForma
         ConvertS8Z24ToZ24S8(data, width, height);
         break;
 
-    case PixelFormat::G8R8:
+    case PixelFormat::G8R8U:
+    case PixelFormat::G8R8S:
         // Convert the G8R8 color format to R8G8, as OpenGL does not support G8R8.
         ConvertG8R8ToR8G8(data, width, height);
         break;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 202257b58..55cf3782c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -25,49 +25,55 @@ struct SurfaceParams {
     enum class PixelFormat {
         ABGR8U = 0,
         ABGR8S = 1,
-        B5G6R5 = 2,
-        A2B10G10R10 = 3,
-        A1B5G5R5 = 4,
-        R8 = 5,
+        B5G6R5U = 2,
+        A2B10G10R10U = 3,
+        A1B5G5R5U = 4,
+        R8U = 5,
         R8UI = 6,
         RGBA16F = 7,
-        R11FG11FB10F = 8,
-        RGBA32UI = 9,
-        DXT1 = 10,
-        DXT23 = 11,
-        DXT45 = 12,
-        DXN1 = 13, // This is also known as BC4
-        DXN2UNORM = 14,
-        DXN2SNORM = 15,
-        BC7U = 16,
-        ASTC_2D_4X4 = 17,
-        G8R8 = 18,
-        BGRA8 = 19,
-        RGBA32F = 20,
-        RG32F = 21,
-        R32F = 22,
-        R16F = 23,
-        R16UNORM = 24,
-        R16S = 25,
-        R16UI = 26,
-        R16I = 27,
-        RG16 = 28,
-        RG16F = 29,
-        RG16UI = 30,
-        RG16I = 31,
-        RG16S = 32,
-        RGB32F = 33,
-        SRGBA8 = 34,
-        RG8S = 35,
+        RGBA16U = 8,
+        RGBA16UI = 9,
+        R11FG11FB10F = 10,
+        RGBA32UI = 11,
+        DXT1 = 12,
+        DXT23 = 13,
+        DXT45 = 14,
+        DXN1 = 15, // This is also known as BC4
+        DXN2UNORM = 16,
+        DXN2SNORM = 17,
+        BC7U = 18,
+        ASTC_2D_4X4 = 19,
+        G8R8U = 20,
+        G8R8S = 21,
+        BGRA8 = 22,
+        RGBA32F = 23,
+        RG32F = 24,
+        R32F = 25,
+        R16F = 26,
+        R16U = 27,
+        R16S = 28,
+        R16UI = 29,
+        R16I = 30,
+        RG16 = 31,
+        RG16F = 32,
+        RG16UI = 33,
+        RG16I = 34,
+        RG16S = 35,
+        RGB32F = 36,
+        SRGBA8 = 37,
+        RG8U = 38,
+        RG8S = 39,
+        RG32UI = 40,
+        R32UI = 41,
 
         MaxColorFormat,
 
         // DepthStencil formats
-        Z24S8 = 36,
-        S8Z24 = 37,
-        Z32F = 38,
-        Z16 = 39,
-        Z32FS8 = 40,
+        Z24S8 = 42,
+        S8Z24 = 43,
+        Z32F = 44,
+        Z16 = 45,
+        Z32FS8 = 46,
 
         MaxDepthStencilFormat,
 
@@ -107,12 +113,14 @@ struct SurfaceParams {
         constexpr std::array<u32, MaxPixelFormat> compression_factor_table = {{
             1, // ABGR8U
             1, // ABGR8S
-            1, // B5G6R5
-            1, // A2B10G10R10
-            1, // A1B5G5R5
-            1, // R8
+            1, // B5G6R5U
+            1, // A2B10G10R10U
+            1, // A1B5G5R5U
+            1, // R8U
             1, // R8UI
             1, // RGBA16F
+            1, // RGBA16U
+            1, // RGBA16UI
             1, // R11FG11FB10F
             1, // RGBA32UI
             4, // DXT1
@@ -123,13 +131,14 @@ struct SurfaceParams {
             4, // DXN2SNORM
             4, // BC7U
             4, // ASTC_2D_4X4
-            1, // G8R8
+            1, // G8R8U
+            1, // G8R8S
             1, // BGRA8
             1, // RGBA32F
             1, // RG32F
             1, // R32F
             1, // R16F
-            1, // R16UNORM
+            1, // R16U
             1, // R16S
             1, // R16UI
             1, // R16I
@@ -140,7 +149,10 @@ struct SurfaceParams {
             1, // RG16S
             1, // RGB32F
             1, // SRGBA8
+            1, // RG8U
             1, // RG8S
+            1, // RG32UI
+            1, // R32UI
             1, // Z24S8
             1, // S8Z24
             1, // Z32F
@@ -159,12 +171,14 @@ struct SurfaceParams {
         constexpr std::array<u32, MaxPixelFormat> bpp_table = {{
             32,  // ABGR8U
             32,  // ABGR8S
-            16,  // B5G6R5
-            32,  // A2B10G10R10
-            16,  // A1B5G5R5
-            8,   // R8
+            16,  // B5G6R5U
+            32,  // A2B10G10R10U
+            16,  // A1B5G5R5U
+            8,   // R8U
             8,   // R8UI
             64,  // RGBA16F
+            64,  // RGBA16U
+            64,  // RGBA16UI
             32,  // R11FG11FB10F
             128, // RGBA32UI
             64,  // DXT1
@@ -175,13 +189,14 @@ struct SurfaceParams {
             128, // DXN2SNORM
             128, // BC7U
             32,  // ASTC_2D_4X4
-            16,  // G8R8
+            16,  // G8R8U
+            16,  // G8R8S
             32,  // BGRA8
             128, // RGBA32F
             64,  // RG32F
             32,  // R32F
             16,  // R16F
-            16,  // R16UNORM
+            16,  // R16U
             16,  // R16S
             16,  // R16UI
             16,  // R16I
@@ -192,7 +207,10 @@ struct SurfaceParams {
             32,  // RG16S
             96,  // RGB32F
             32,  // SRGBA8
+            16,  // RG8U
             16,  // RG8S
+            64,  // RG32UI
+            32,  // R32UI
             32,  // Z24S8
             32,  // S8Z24
             32,  // Z32F
@@ -238,9 +256,13 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::BGRA8_UNORM:
             return PixelFormat::BGRA8;
         case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
-            return PixelFormat::A2B10G10R10;
+            return PixelFormat::A2B10G10R10U;
         case Tegra::RenderTargetFormat::RGBA16_FLOAT:
             return PixelFormat::RGBA16F;
+        case Tegra::RenderTargetFormat::RGBA16_UNORM:
+            return PixelFormat::RGBA16U;
+        case Tegra::RenderTargetFormat::RGBA16_UINT:
+            return PixelFormat::RGBA16UI;
         case Tegra::RenderTargetFormat::RGBA32_FLOAT:
             return PixelFormat::RGBA32F;
         case Tegra::RenderTargetFormat::RG32_FLOAT:
@@ -248,11 +270,11 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::R11G11B10_FLOAT:
             return PixelFormat::R11FG11FB10F;
         case Tegra::RenderTargetFormat::B5G6R5_UNORM:
-            return PixelFormat::B5G6R5;
+            return PixelFormat::B5G6R5U;
         case Tegra::RenderTargetFormat::RGBA32_UINT:
             return PixelFormat::RGBA32UI;
         case Tegra::RenderTargetFormat::R8_UNORM:
-            return PixelFormat::R8;
+            return PixelFormat::R8U;
         case Tegra::RenderTargetFormat::R8_UINT:
             return PixelFormat::R8UI;
         case Tegra::RenderTargetFormat::RG16_FLOAT:
@@ -265,12 +287,14 @@ struct SurfaceParams {
             return PixelFormat::RG16;
         case Tegra::RenderTargetFormat::RG16_SNORM:
             return PixelFormat::RG16S;
+        case Tegra::RenderTargetFormat::RG8_UNORM:
+            return PixelFormat::RG8U;
         case Tegra::RenderTargetFormat::RG8_SNORM:
             return PixelFormat::RG8S;
         case Tegra::RenderTargetFormat::R16_FLOAT:
             return PixelFormat::R16F;
         case Tegra::RenderTargetFormat::R16_UNORM:
-            return PixelFormat::R16UNORM;
+            return PixelFormat::R16U;
         case Tegra::RenderTargetFormat::R16_SNORM:
             return PixelFormat::R16S;
         case Tegra::RenderTargetFormat::R16_UINT:
@@ -279,6 +303,10 @@ struct SurfaceParams {
             return PixelFormat::R16I;
         case Tegra::RenderTargetFormat::R32_FLOAT:
             return PixelFormat::R32F;
+        case Tegra::RenderTargetFormat::R32_UINT:
+            return PixelFormat::R32UI;
+        case Tegra::RenderTargetFormat::RG32_UINT:
+            return PixelFormat::RG32UI;
         default:
             LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
             UNREACHABLE();
@@ -300,15 +328,33 @@ struct SurfaceParams {
                          static_cast<u32>(component_type));
             UNREACHABLE();
         case Tegra::Texture::TextureFormat::B5G6R5:
-            return PixelFormat::B5G6R5;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::UNORM:
+                return PixelFormat::B5G6R5U;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::A2B10G10R10:
-            return PixelFormat::A2B10G10R10;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::UNORM:
+                return PixelFormat::A2B10G10R10U;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::A1B5G5R5:
-            return PixelFormat::A1B5G5R5;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::UNORM:
+                return PixelFormat::A1B5G5R5U;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::R8:
             switch (component_type) {
             case Tegra::Texture::ComponentType::UNORM:
-                return PixelFormat::R8;
+                return PixelFormat::R8U;
             case Tegra::Texture::ComponentType::UINT:
                 return PixelFormat::R8UI;
             }
@@ -316,11 +362,33 @@ struct SurfaceParams {
                          static_cast<u32>(component_type));
             UNREACHABLE();
         case Tegra::Texture::TextureFormat::G8R8:
-            return PixelFormat::G8R8;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::UNORM:
+                return PixelFormat::G8R8U;
+            case Tegra::Texture::ComponentType::SNORM:
+                return PixelFormat::G8R8S;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::R16_G16_B16_A16:
-            return PixelFormat::RGBA16F;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::UNORM:
+                return PixelFormat::RGBA16U;
+            case Tegra::Texture::ComponentType::FLOAT:
+                return PixelFormat::RGBA16F;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::BF10GF11RF11:
-            return PixelFormat::R11FG11FB10F;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::FLOAT:
+                return PixelFormat::R11FG11FB10F;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::R32_G32_B32_A32:
             switch (component_type) {
             case Tegra::Texture::ComponentType::FLOAT:
@@ -332,15 +400,29 @@ struct SurfaceParams {
                          static_cast<u32>(component_type));
             UNREACHABLE();
         case Tegra::Texture::TextureFormat::R32_G32:
-            return PixelFormat::RG32F;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::FLOAT:
+                return PixelFormat::RG32F;
+            case Tegra::Texture::ComponentType::UINT:
+                return PixelFormat::RG32UI;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::R32_G32_B32:
-            return PixelFormat::RGB32F;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::FLOAT:
+                return PixelFormat::RGB32F;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::R16:
             switch (component_type) {
             case Tegra::Texture::ComponentType::FLOAT:
                 return PixelFormat::R16F;
             case Tegra::Texture::ComponentType::UNORM:
-                return PixelFormat::R16UNORM;
+                return PixelFormat::R16U;
             case Tegra::Texture::ComponentType::SNORM:
                 return PixelFormat::R16S;
             case Tegra::Texture::ComponentType::UINT:
@@ -352,9 +434,19 @@ struct SurfaceParams {
                          static_cast<u32>(component_type));
             UNREACHABLE();
         case Tegra::Texture::TextureFormat::R32:
-            return PixelFormat::R32F;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::FLOAT:
+                return PixelFormat::R32F;
+            case Tegra::Texture::ComponentType::UINT:
+                return PixelFormat::R32UI;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::ZF32:
             return PixelFormat::Z32F;
+        case Tegra::Texture::TextureFormat::Z16:
+            return PixelFormat::Z16;
         case Tegra::Texture::TextureFormat::Z24S8:
             return PixelFormat::Z24S8;
         case Tegra::Texture::TextureFormat::DXT1:
@@ -432,6 +524,8 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::RG16_UNORM:
         case Tegra::RenderTargetFormat::R16_UNORM:
         case Tegra::RenderTargetFormat::B5G6R5_UNORM:
+        case Tegra::RenderTargetFormat::RG8_UNORM:
+        case Tegra::RenderTargetFormat::RGBA16_UNORM:
             return ComponentType::UNorm;
         case Tegra::RenderTargetFormat::RGBA8_SNORM:
         case Tegra::RenderTargetFormat::RG16_SNORM:
@@ -447,9 +541,12 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::R32_FLOAT:
             return ComponentType::Float;
         case Tegra::RenderTargetFormat::RGBA32_UINT:
+        case Tegra::RenderTargetFormat::RGBA16_UINT:
         case Tegra::RenderTargetFormat::RG16_UINT:
         case Tegra::RenderTargetFormat::R8_UINT:
         case Tegra::RenderTargetFormat::R16_UINT:
+        case Tegra::RenderTargetFormat::RG32_UINT:
+        case Tegra::RenderTargetFormat::R32_UINT:
             return ComponentType::UInt;
         case Tegra::RenderTargetFormat::RG16_SINT:
         case Tegra::RenderTargetFormat::R16_SINT:
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 85297bd00..e0dfdbb9f 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -356,40 +356,43 @@ public:
      * @param reg The register to use as the source value.
      */
     void SetOutputAttributeToRegister(Attribute::Index attribute, u64 elem, const Register& reg) {
-        std::string dest = GetOutputAttribute(attribute) + GetSwizzle(elem);
+        std::string dest = GetOutputAttribute(attribute);
         std::string src = GetRegisterAsFloat(reg);
 
         if (!dest.empty()) {
             // Can happen with unknown/unimplemented output attributes, in which case we ignore the
             // instruction for now.
-            shader.AddLine(dest + " = " + src + ';');
+            shader.AddLine(dest + GetSwizzle(elem) + " = " + src + ';');
         }
     }
 
     /// Generates code representing a uniform (C buffer) register, interpreted as the input type.
-    std::string GetUniform(u64 index, u64 offset, GLSLRegister::Type type) {
+    std::string GetUniform(u64 index, u64 offset, GLSLRegister::Type type,
+                           Register::Size size = Register::Size::Word) {
         declr_const_buffers[index].MarkAsUsed(index, offset, stage);
         std::string value = 'c' + std::to_string(index) + '[' + std::to_string(offset / 4) + "][" +
                             std::to_string(offset % 4) + ']';
 
         if (type == GLSLRegister::Type::Float) {
-            return value;
+            // Do nothing, default
         } else if (type == GLSLRegister::Type::Integer) {
-            return "floatBitsToInt(" + value + ')';
+            value = "floatBitsToInt(" + value + ')';
+        } else if (type == GLSLRegister::Type::UnsignedInteger) {
+            value = "floatBitsToUint(" + value + ')';
         } else {
             UNREACHABLE();
         }
+
+        return ConvertIntegerSize(value, size);
     }
 
-    std::string GetUniformIndirect(u64 index, s64 offset, const Register& index_reg,
+    std::string GetUniformIndirect(u64 cbuf_index, s64 offset, const std::string& index_str,
                                    GLSLRegister::Type type) {
-        declr_const_buffers[index].MarkAsUsedIndirect(index, stage);
-
-        std::string final_offset = "((floatBitsToInt(" + GetRegister(index_reg, 0) + ") + " +
-                                   std::to_string(offset) + ") / 4)";
+        declr_const_buffers[cbuf_index].MarkAsUsedIndirect(cbuf_index, stage);
 
-        std::string value =
-            'c' + std::to_string(index) + '[' + final_offset + " / 4][" + final_offset + " % 4]";
+        std::string final_offset = fmt::format("({} + {})", index_str, offset / 4);
+        std::string value = 'c' + std::to_string(cbuf_index) + '[' + final_offset + " / 4][" +
+                            final_offset + " % 4]";
 
         if (type == GLSLRegister::Type::Float) {
             return value;
@@ -1247,20 +1250,41 @@ private:
                     op_a = "abs(" + op_a + ')';
                 }
 
+                if (instr.conversion.negate_a) {
+                    op_a = "-(" + op_a + ')';
+                }
+
                 regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1,
                                           1, instr.alu.saturate_d, 0, instr.conversion.dest_size);
                 break;
             }
-            case OpCode::Id::I2F_R: {
+            case OpCode::Id::I2F_R:
+            case OpCode::Id::I2F_C: {
                 ASSERT_MSG(instr.conversion.dest_size == Register::Size::Word, "Unimplemented");
                 ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
-                std::string op_a = regs.GetRegisterAsInteger(
-                    instr.gpr20, 0, instr.conversion.is_input_signed, instr.conversion.src_size);
+
+                std::string op_a{};
+
+                if (instr.is_b_gpr) {
+                    op_a =
+                        regs.GetRegisterAsInteger(instr.gpr20, 0, instr.conversion.is_input_signed,
+                                                  instr.conversion.src_size);
+                } else {
+                    op_a = regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                           instr.conversion.is_input_signed
+                                               ? GLSLRegister::Type::Integer
+                                               : GLSLRegister::Type::UnsignedInteger,
+                                           instr.conversion.src_size);
+                }
 
                 if (instr.conversion.abs_a) {
                     op_a = "abs(" + op_a + ')';
                 }
 
+                if (instr.conversion.negate_a) {
+                    op_a = "-(" + op_a + ')';
+                }
+
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
                 break;
             }
@@ -1269,6 +1293,14 @@ private:
                 ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented");
                 std::string op_a = regs.GetRegisterAsFloat(instr.gpr20);
 
+                if (instr.conversion.abs_a) {
+                    op_a = "abs(" + op_a + ')';
+                }
+
+                if (instr.conversion.negate_a) {
+                    op_a = "-(" + op_a + ')';
+                }
+
                 switch (instr.conversion.f2f.rounding) {
                 case Tegra::Shader::F2fRoundingOp::None:
                     break;
@@ -1291,21 +1323,29 @@ private:
                     break;
                 }
 
-                if (instr.conversion.abs_a) {
-                    op_a = "abs(" + op_a + ')';
-                }
-
                 regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1, instr.alu.saturate_d);
                 break;
             }
-            case OpCode::Id::F2I_R: {
+            case OpCode::Id::F2I_R:
+            case OpCode::Id::F2I_C: {
                 ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented");
-                std::string op_a = regs.GetRegisterAsFloat(instr.gpr20);
+                std::string op_a{};
+
+                if (instr.is_b_gpr) {
+                    op_a = regs.GetRegisterAsFloat(instr.gpr20);
+                } else {
+                    op_a = regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                           GLSLRegister::Type::Float);
+                }
 
                 if (instr.conversion.abs_a) {
                     op_a = "abs(" + op_a + ')';
                 }
 
+                if (instr.conversion.negate_a) {
+                    op_a = "-(" + op_a + ')';
+                }
+
                 switch (instr.conversion.f2i.rounding) {
                 case Tegra::Shader::F2iRoundingOp::None:
                     break;
@@ -1353,11 +1393,16 @@ private:
             case OpCode::Id::LD_C: {
                 ASSERT_MSG(instr.ld_c.unknown == 0, "Unimplemented");
 
+                // Add an extra scope and declare the index register inside to prevent
+                // overwriting it in case it is used as an output of the LD instruction.
+                shader.AddLine("{");
+                ++shader.scope;
+
+                shader.AddLine("uint index = (" + regs.GetRegisterAsInteger(instr.gpr8, 0, false) +
+                               " / 4) & (MAX_CONSTBUFFER_ELEMENTS - 1);");
+
                 std::string op_a =
-                    regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 0, instr.gpr8,
-                                            GLSLRegister::Type::Float);
-                std::string op_b =
-                    regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 4, instr.gpr8,
+                    regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 0, "index",
                                             GLSLRegister::Type::Float);
 
                 switch (instr.ld_c.type.Value()) {
@@ -1365,16 +1410,22 @@ private:
                     regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
                     break;
 
-                case Tegra::Shader::UniformType::Double:
+                case Tegra::Shader::UniformType::Double: {
+                    std::string op_b =
+                        regs.GetUniformIndirect(instr.cbuf36.index, instr.cbuf36.offset + 4,
+                                                "index", GLSLRegister::Type::Float);
                     regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
                     regs.SetRegisterToFloat(instr.gpr0.Value() + 1, 0, op_b, 1, 1);
                     break;
-
+                }
                 default:
                     LOG_CRITICAL(HW_GPU, "Unhandled type: {}",
                                  static_cast<unsigned>(instr.ld_c.type.Value()));
                     UNREACHABLE();
                 }
+
+                --shader.scope;
+                shader.AddLine("}");
                 break;
             }
             case OpCode::Id::ST_A: {
@@ -1630,6 +1681,99 @@ private:
             }
             break;
         }
+        case OpCode::Type::Xmad: {
+            ASSERT_MSG(!instr.xmad.sign_a, "Unimplemented");
+            ASSERT_MSG(!instr.xmad.sign_b, "Unimplemented");
+
+            std::string op_a{regs.GetRegisterAsInteger(instr.gpr8, 0, instr.xmad.sign_a)};
+            std::string op_b;
+            std::string op_c;
+
+            // TODO(bunnei): Needs to be fixed once op_a or op_b is signed
+            ASSERT_MSG(instr.xmad.sign_a == instr.xmad.sign_b, "Unimplemented");
+            const bool is_signed{instr.xmad.sign_a == 1};
+
+            bool is_merge{};
+            switch (opcode->GetId()) {
+            case OpCode::Id::XMAD_CR: {
+                is_merge = instr.xmad.merge_56;
+                op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                        instr.xmad.sign_b ? GLSLRegister::Type::Integer
+                                                          : GLSLRegister::Type::UnsignedInteger);
+                op_c += regs.GetRegisterAsInteger(instr.gpr39, 0, is_signed);
+                break;
+            }
+            case OpCode::Id::XMAD_RR: {
+                is_merge = instr.xmad.merge_37;
+                op_b += regs.GetRegisterAsInteger(instr.gpr20, 0, instr.xmad.sign_b);
+                op_c += regs.GetRegisterAsInteger(instr.gpr39, 0, is_signed);
+                break;
+            }
+            case OpCode::Id::XMAD_RC: {
+                op_b += regs.GetRegisterAsInteger(instr.gpr39, 0, instr.xmad.sign_b);
+                op_c += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                        is_signed ? GLSLRegister::Type::Integer
+                                                  : GLSLRegister::Type::UnsignedInteger);
+                break;
+            }
+            case OpCode::Id::XMAD_IMM: {
+                is_merge = instr.xmad.merge_37;
+                op_b += std::to_string(instr.xmad.imm20_16);
+                op_c += regs.GetRegisterAsInteger(instr.gpr39, 0, is_signed);
+                break;
+            }
+            default: {
+                LOG_CRITICAL(HW_GPU, "Unhandled XMAD instruction: {}", opcode->GetName());
+                UNREACHABLE();
+            }
+            }
+
+            // TODO(bunnei): Ensure this is right with signed operands
+            if (instr.xmad.high_a) {
+                op_a = "((" + op_a + ") >> 16)";
+            } else {
+                op_a = "((" + op_a + ") & 0xFFFF)";
+            }
+
+            std::string src2 = '(' + op_b + ')'; // Preserve original source 2
+            if (instr.xmad.high_b) {
+                op_b = '(' + src2 + " >> 16)";
+            } else {
+                op_b = '(' + src2 + " & 0xFFFF)";
+            }
+
+            std::string product = '(' + op_a + " * " + op_b + ')';
+            if (instr.xmad.product_shift_left) {
+                product = '(' + product + " << 16)";
+            }
+
+            switch (instr.xmad.mode) {
+            case Tegra::Shader::XmadMode::None:
+                break;
+            case Tegra::Shader::XmadMode::CLo:
+                op_c = "((" + op_c + ") & 0xFFFF)";
+                break;
+            case Tegra::Shader::XmadMode::CHi:
+                op_c = "((" + op_c + ") >> 16)";
+                break;
+            case Tegra::Shader::XmadMode::CBcc:
+                op_c = "((" + op_c + ") + (" + src2 + "<< 16))";
+                break;
+            default: {
+                LOG_CRITICAL(HW_GPU, "Unhandled XMAD mode: {}",
+                             static_cast<u32>(instr.xmad.mode.Value()));
+                UNREACHABLE();
+            }
+            }
+
+            std::string sum{'(' + product + " + " + op_c + ')'};
+            if (is_merge) {
+                sum = "((" + sum + " & 0xFFFF) | (" + src2 + "<< 16))";
+            }
+
+            regs.SetRegisterToInteger(instr.gpr0, is_signed, 0, sum, 1, 1);
+            break;
+        }
         default: {
             switch (opcode->GetId()) {
             case OpCode::Id::EXIT: {
@@ -1667,7 +1811,15 @@ private:
             }
             case OpCode::Id::KIL: {
                 ASSERT(instr.flow.cond == Tegra::Shader::FlowCondition::Always);
+
+                // Enclose "discard" in a conditional, so that GLSL compilation does not complain
+                // about unexecuted instructions that may follow this.
+                shader.AddLine("if (true) {");
+                ++shader.scope;
                 shader.AddLine("discard;");
+                --shader.scope;
+                shader.AddLine("}");
+
                 break;
             }
             case OpCode::Id::BRA: {
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 68bacd4c5..1d1975179 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -203,21 +203,6 @@ void OpenGLState::Apply() const {
         }
     }
 
-    // Constbuffers
-    for (std::size_t stage = 0; stage < draw.const_buffers.size(); ++stage) {
-        for (std::size_t buffer_id = 0; buffer_id < draw.const_buffers[stage].size(); ++buffer_id) {
-            const auto& current = cur_state.draw.const_buffers[stage][buffer_id];
-            const auto& new_state = draw.const_buffers[stage][buffer_id];
-
-            if (current.enabled != new_state.enabled || current.bindpoint != new_state.bindpoint ||
-                current.ssbo != new_state.ssbo) {
-                if (new_state.enabled) {
-                    glBindBufferBase(GL_UNIFORM_BUFFER, new_state.bindpoint, new_state.ssbo);
-                }
-            }
-        }
-    }
-
     // Framebuffer
     if (draw.read_framebuffer != cur_state.draw.read_framebuffer) {
         glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer);
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 5c7b636e4..bdb02ba25 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -119,12 +119,6 @@ public:
         GLuint uniform_buffer;   // GL_UNIFORM_BUFFER_BINDING
         GLuint shader_program;   // GL_CURRENT_PROGRAM
         GLuint program_pipeline; // GL_PROGRAM_PIPELINE_BINDING
-        struct ConstBufferConfig {
-            bool enabled = false;
-            GLuint bindpoint;
-            GLuint ssbo;
-        };
-        std::array<std::array<ConstBufferConfig, Regs::MaxConstBuffers>, 5> const_buffers;
     } draw;
 
     struct {
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index a2713e9f0..03a8ed8b7 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -9,174 +9,91 @@
 #include "video_core/renderer_opengl/gl_state.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
-class OrphanBuffer : public OGLStreamBuffer {
-public:
-    explicit OrphanBuffer(GLenum target) : OGLStreamBuffer(target) {}
-    ~OrphanBuffer() override;
-
-private:
-    void Create(size_t size, size_t sync_subdivide) override;
-    void Release() override;
-
-    std::pair<u8*, GLintptr> Map(size_t size, size_t alignment) override;
-    void Unmap() override;
-
-    std::vector<u8> data;
-};
-
-class StorageBuffer : public OGLStreamBuffer {
-public:
-    explicit StorageBuffer(GLenum target) : OGLStreamBuffer(target) {}
-    ~StorageBuffer() override;
-
-private:
-    void Create(size_t size, size_t sync_subdivide) override;
-    void Release() override;
-
-    std::pair<u8*, GLintptr> Map(size_t size, size_t alignment) override;
-    void Unmap() override;
-
-    struct Fence {
-        OGLSync sync;
-        size_t offset;
-    };
-    std::deque<Fence> head;
-    std::deque<Fence> tail;
-
-    u8* mapped_ptr;
-};
-
-OGLStreamBuffer::OGLStreamBuffer(GLenum target) {
-    gl_target = target;
-}
-
-GLuint OGLStreamBuffer::GetHandle() const {
-    return gl_buffer.handle;
-}
+OGLStreamBuffer::OGLStreamBuffer(GLenum target, GLsizeiptr size, bool prefer_coherent)
+    : gl_target(target), buffer_size(size) {
+    gl_buffer.Create();
+    glBindBuffer(gl_target, gl_buffer.handle);
 
-std::unique_ptr<OGLStreamBuffer> OGLStreamBuffer::MakeBuffer(bool storage_buffer, GLenum target) {
-    if (storage_buffer) {
-        return std::make_unique<StorageBuffer>(target);
+    GLsizeiptr allocate_size = size;
+    if (target == GL_ARRAY_BUFFER) {
+        // On AMD GPU there is a strange crash in indexed drawing. The crash happens when the buffer
+        // read position is near the end and is an out-of-bound access to the vertex buffer. This is
+        // probably a bug in the driver and is related to the usage of vec3<byte> attributes in the
+        // vertex array. Doubling the allocation size for the vertex buffer seems to avoid the
+        // crash.
+        allocate_size *= 2;
     }
-    return std::make_unique<OrphanBuffer>(target);
-}
 
-OrphanBuffer::~OrphanBuffer() {
-    Release();
+    if (GLAD_GL_ARB_buffer_storage) {
+        persistent = true;
+        coherent = prefer_coherent;
+        GLbitfield flags =
+            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
+        glBufferStorage(gl_target, allocate_size, nullptr, flags);
+        mapped_ptr = static_cast<u8*>(glMapBufferRange(
+            gl_target, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
+    } else {
+        glBufferData(gl_target, allocate_size, nullptr, GL_STREAM_DRAW);
+    }
 }
 
-void OrphanBuffer::Create(size_t size, size_t /*sync_subdivide*/) {
-    buffer_pos = 0;
-    buffer_size = size;
-    data.resize(buffer_size);
-
-    if (gl_buffer.handle == 0) {
-        gl_buffer.Create();
+OGLStreamBuffer::~OGLStreamBuffer() {
+    if (persistent) {
         glBindBuffer(gl_target, gl_buffer.handle);
+        glUnmapBuffer(gl_target);
     }
-
-    glBufferData(gl_target, static_cast<GLsizeiptr>(buffer_size), nullptr, GL_STREAM_DRAW);
-}
-
-void OrphanBuffer::Release() {
     gl_buffer.Release();
 }
 
-std::pair<u8*, GLintptr> OrphanBuffer::Map(size_t size, size_t alignment) {
-    buffer_pos = Common::AlignUp(buffer_pos, alignment);
-
-    if (buffer_pos + size > buffer_size) {
-        Create(std::max(buffer_size, size), 0);
-    }
-
-    mapped_size = size;
-    return std::make_pair(&data[buffer_pos], static_cast<GLintptr>(buffer_pos));
-}
-
-void OrphanBuffer::Unmap() {
-    glBufferSubData(gl_target, static_cast<GLintptr>(buffer_pos),
-                    static_cast<GLsizeiptr>(mapped_size), &data[buffer_pos]);
-    buffer_pos += mapped_size;
-}
-
-StorageBuffer::~StorageBuffer() {
-    Release();
+GLuint OGLStreamBuffer::GetHandle() const {
+    return gl_buffer.handle;
 }
 
-void StorageBuffer::Create(size_t size, size_t sync_subdivide) {
-    if (gl_buffer.handle != 0)
-        return;
-
-    buffer_pos = 0;
-    buffer_size = size;
-    buffer_sync_subdivide = std::max<size_t>(sync_subdivide, 1);
-
-    gl_buffer.Create();
-    glBindBuffer(gl_target, gl_buffer.handle);
-
-    glBufferStorage(gl_target, static_cast<GLsizeiptr>(buffer_size), nullptr,
-                    GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);
-    mapped_ptr = reinterpret_cast<u8*>(
-        glMapBufferRange(gl_target, 0, static_cast<GLsizeiptr>(buffer_size),
-                         GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_FLUSH_EXPLICIT_BIT));
+GLsizeiptr OGLStreamBuffer::GetSize() const {
+    return buffer_size;
 }
 
-void StorageBuffer::Release() {
-    if (gl_buffer.handle == 0)
-        return;
-
-    glUnmapBuffer(gl_target);
-
-    gl_buffer.Release();
-    head.clear();
-    tail.clear();
-}
-
-std::pair<u8*, GLintptr> StorageBuffer::Map(size_t size, size_t alignment) {
+std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
     ASSERT(size <= buffer_size);
+    ASSERT(alignment <= buffer_size);
+    mapped_size = size;
 
-    OGLSync sync;
-
-    buffer_pos = Common::AlignUp(buffer_pos, alignment);
-    size_t effective_offset = Common::AlignDown(buffer_pos, buffer_sync_subdivide);
-
-    if (!head.empty() &&
-        (effective_offset > head.back().offset || buffer_pos + size > buffer_size)) {
-        ASSERT(head.back().sync.handle == 0);
-        head.back().sync.Create();
+    if (alignment > 0) {
+        buffer_pos = Common::AlignUp<size_t>(buffer_pos, alignment);
     }
 
+    bool invalidate = false;
     if (buffer_pos + size > buffer_size) {
-        if (!tail.empty()) {
-            std::swap(sync, tail.back().sync);
-            tail.clear();
-        }
-        std::swap(tail, head);
         buffer_pos = 0;
-        effective_offset = 0;
-    }
+        invalidate = true;
 
-    while (!tail.empty() && buffer_pos + size > tail.front().offset) {
-        std::swap(sync, tail.front().sync);
-        tail.pop_front();
+        if (persistent) {
+            glUnmapBuffer(gl_target);
+        }
     }
 
-    if (sync.handle != 0) {
-        glClientWaitSync(sync.handle, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
-        sync.Release();
+    if (invalidate || !persistent) {
+        GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
+                           (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
+                           (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
+        mapped_ptr = static_cast<u8*>(
+            glMapBufferRange(gl_target, buffer_pos, buffer_size - buffer_pos, flags));
+        mapped_offset = buffer_pos;
     }
 
-    if (head.empty() || effective_offset > head.back().offset) {
-        head.emplace_back();
-        head.back().offset = effective_offset;
+    return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
+}
+
+void OGLStreamBuffer::Unmap(GLsizeiptr size) {
+    ASSERT(size <= mapped_size);
+
+    if (!coherent && size > 0) {
+        glFlushMappedBufferRange(gl_target, buffer_pos - mapped_offset, size);
     }
 
-    mapped_size = size;
-    return std::make_pair(&mapped_ptr[buffer_pos], static_cast<GLintptr>(buffer_pos));
-}
+    if (!persistent) {
+        glUnmapBuffer(gl_target);
+    }
 
-void StorageBuffer::Unmap() {
-    glFlushMappedBufferRange(gl_target, static_cast<GLintptr>(buffer_pos),
-                             static_cast<GLsizeiptr>(mapped_size));
-    buffer_pos += mapped_size;
+    buffer_pos += size;
 }
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index e78dc5784..45592daaf 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -2,35 +2,43 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
 
-#include <memory>
+#include <tuple>
 #include <glad/glad.h>
 #include "common/common_types.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLenum target);
-    virtual ~OGLStreamBuffer() = default;
-
-public:
-    static std::unique_ptr<OGLStreamBuffer> MakeBuffer(bool storage_buffer, GLenum target);
-
-    virtual void Create(size_t size, size_t sync_subdivide) = 0;
-    virtual void Release() {}
+    explicit OGLStreamBuffer(GLenum target, GLsizeiptr size, bool prefer_coherent = false);
+    ~OGLStreamBuffer();
 
     GLuint GetHandle() const;
+    GLsizeiptr GetSize() const;
+
+    /*
+     * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
+     * and the optional alignment requirement.
+     * If the buffer is full, the whole buffer is reallocated which invalidates old chunks.
+     * The return values are the pointer to the new chunk, the offset within the buffer,
+     * and the invalidation flag for previous chunks.
+     * The actual used size must be specified on unmapping the chunk.
+     */
+    std::tuple<u8*, GLintptr, bool> Map(GLsizeiptr size, GLintptr alignment = 0);
 
-    virtual std::pair<u8*, GLintptr> Map(size_t size, size_t alignment) = 0;
-    virtual void Unmap() = 0;
+    void Unmap(GLsizeiptr size);
 
-protected:
+private:
     OGLBuffer gl_buffer;
     GLenum gl_target;
 
-    size_t buffer_pos = 0;
-    size_t buffer_size = 0;
-    size_t buffer_sync_subdivide = 0;
-    size_t mapped_size = 0;
+    bool coherent = false;
+    bool persistent = false;
+
+    GLintptr buffer_pos = 0;
+    GLsizeiptr buffer_size = 0;
+    GLintptr mapped_offset = 0;
+    GLsizeiptr mapped_size = 0;
+    u8* mapped_ptr = nullptr;
 };
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 5afd20dbe..8f719fdd8 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -24,15 +24,25 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
     switch (attrib.type) {
+    case Maxwell::VertexAttribute::Type::UnsignedInt:
     case Maxwell::VertexAttribute::Type::UnsignedNorm: {
 
         switch (attrib.size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
             return GL_UNSIGNED_BYTE;
+        case Maxwell::VertexAttribute::Size::Size_16:
         case Maxwell::VertexAttribute::Size::Size_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
             return GL_UNSIGNED_SHORT;
+        case Maxwell::VertexAttribute::Size::Size_32:
+        case Maxwell::VertexAttribute::Size::Size_32_32:
+        case Maxwell::VertexAttribute::Size::Size_32_32_32:
+        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
+            return GL_UNSIGNED_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_UNSIGNED_INT_2_10_10_10_REV;
         }
@@ -42,16 +52,25 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         return {};
     }
 
+    case Maxwell::VertexAttribute::Type::SignedInt:
     case Maxwell::VertexAttribute::Type::SignedNorm: {
 
         switch (attrib.size) {
-        case Maxwell::VertexAttribute::Size::Size_32_32_32:
-            return GL_INT;
+        case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8_8:
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
             return GL_BYTE;
+        case Maxwell::VertexAttribute::Size::Size_16:
         case Maxwell::VertexAttribute::Size::Size_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
             return GL_SHORT;
+        case Maxwell::VertexAttribute::Size::Size_32:
+        case Maxwell::VertexAttribute::Size::Size_32_32:
+        case Maxwell::VertexAttribute::Size::Size_32_32_32:
+        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
+            return GL_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_INT_2_10_10_10_REV;
         }
@@ -61,9 +80,6 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         return {};
     }
 
-    case Maxwell::VertexAttribute::Type::UnsignedInt:
-        return GL_UNSIGNED_INT;
-
     case Maxwell::VertexAttribute::Type::Float:
         return GL_FLOAT;
     }
@@ -91,6 +107,8 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
     switch (topology) {
     case Maxwell::PrimitiveTopology::Points:
         return GL_POINTS;
+    case Maxwell::PrimitiveTopology::LineStrip:
+        return GL_LINE_STRIP;
     case Maxwell::PrimitiveTopology::Triangles:
         return GL_TRIANGLES;
     case Maxwell::PrimitiveTopology::TriangleStrip:
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index b5f97f332..f7eee7769 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -6,7 +6,10 @@
 #include <clocale>
 #include <memory>
 #include <thread>
+
+#include <fmt/ostream.h>
 #include <glad/glad.h>
+
 #define QT_NO_OPENGL
 #include <QDesktopWidget>
 #include <QFileDialog>
@@ -460,7 +463,7 @@ bool GMainWindow::LoadROM(const QString& filename) {
                         "While attempting to load the ROM requested, an error occured. Please "
                         "refer to the yuzu wiki for more information or the yuzu discord for "
                         "additional help.\n\nError Code: {:04X}-{:04X}\nError Description: {}",
-                        loader_id, error_id, Loader::GetMessageForResultStatus(error_id))));
+                        loader_id, error_id, static_cast<Loader::ResultStatus>(error_id))));
             } else {
                 QMessageBox::critical(
                     this, tr("Error while loading ROM!"),
diff --git a/src/yuzu_cmd/yuzu.cpp b/src/yuzu_cmd/yuzu.cpp
index e44a98311..9095cf27d 100644
--- a/src/yuzu_cmd/yuzu.cpp
+++ b/src/yuzu_cmd/yuzu.cpp
@@ -7,6 +7,8 @@
 #include <string>
 #include <thread>
 
+#include <fmt/ostream.h>
+
 #include "common/common_paths.h"
 #include "common/logging/backend.h"
 #include "common/logging/filter.h"
@@ -194,7 +196,7 @@ int main(int argc, char** argv) {
                          "While attempting to load the ROM requested, an error occured. Please "
                          "refer to the yuzu wiki for more information or the yuzu discord for "
                          "additional help.\n\nError Code: {:04X}-{:04X}\nError Description: {}",
-                         loader_id, error_id, Loader::GetMessageForResultStatus(error_id));
+                         loader_id, error_id, static_cast<Loader::ResultStatus>(error_id));
         }
     }