Diffstat (limited to 'src/video_core')
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h | 7
-rw-r--r--  src/video_core/cdma_pusher.cpp | 1
-rw-r--r--  src/video_core/cdma_pusher.h | 2
-rw-r--r--  src/video_core/command_classes/codecs/h264.cpp | 7
-rw-r--r--  src/video_core/command_classes/vic.cpp | 259
-rw-r--r--  src/video_core/command_classes/vic.h | 20
-rw-r--r--  src/video_core/engines/maxwell_3d.h | 1
-rw-r--r--  src/video_core/engines/maxwell_dma.cpp | 64
-rw-r--r--  src/video_core/engines/maxwell_dma.h | 2
-rw-r--r--  src/video_core/framebuffer_config.h | 20
-rw-r--r--  src/video_core/gpu.cpp | 1215
-rw-r--r--  src/video_core/gpu.h | 227
-rw-r--r--  src/video_core/gpu_thread.h | 3
-rw-r--r--  src/video_core/host_shaders/CMakeLists.txt | 1
-rw-r--r--  src/video_core/host_shaders/opengl_copy_bgra.comp | 15
-rw-r--r--  src/video_core/query_cache.h | 1
-rw-r--r--  src/video_core/rasterizer_accelerated.h | 2
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache.cpp | 3
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.cpp | 38
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.h | 23
-rw-r--r--  src/video_core/renderer_opengl/maxwell_to_gl.h | 4
-rw-r--r--  src/video_core/renderer_opengl/util_shaders.cpp | 76
-rw-r--r--  src/video_core/renderer_opengl/util_shaders.h | 22
-rw-r--r--  src/video_core/renderer_vulkan/renderer_vulkan.cpp | 13
-rw-r--r--  src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp | 16
-rw-r--r--  src/video_core/renderer_vulkan/vk_master_semaphore.h | 17
-rw-r--r--  src/video_core/renderer_vulkan/vk_scheduler.h | 2
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.cpp | 3
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.h | 2
-rw-r--r--  src/video_core/shader_environment.cpp | 1
-rw-r--r--  src/video_core/shader_environment.h | 4
-rw-r--r--  src/video_core/texture_cache/image_view_info.cpp | 1
-rw-r--r--  src/video_core/texture_cache/slot_vector.h | 4
-rw-r--r--  src/video_core/texture_cache/texture_cache.h | 5
-rw-r--r--  src/video_core/texture_cache/texture_cache_base.h | 8
-rw-r--r--  src/video_core/vulkan_common/vulkan_device.cpp | 25
-rw-r--r--  src/video_core/vulkan_common/vulkan_device.h | 6
37 files changed, 1167 insertions, 953 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 7bfd57369..d350c9b36 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -570,13 +570,12 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am
ForEachWrittenRange(*cpu_src_address, amount, mirror);
// This subtraction in this order is important for overlapping copies.
common_ranges.subtract(subtract_interval);
- bool atleast_1_download = tmp_intervals.size() != 0;
- for (const IntervalType add_interval : tmp_intervals) {
+ const bool has_new_downloads = tmp_intervals.size() != 0;
+ for (const IntervalType& add_interval : tmp_intervals) {
common_ranges.add(add_interval);
}
-
runtime.CopyBuffer(dest_buffer, src_buffer, copies);
- if (atleast_1_download) {
+ if (has_new_downloads) {
dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount);
}
std::vector<u8> tmp_buffer(amount);
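
Note on the hunk above: the ordering is the point — the copied span is subtracted from common_ranges before the mirrored sub-ranges are re-added, so a copy overlapping its own source does not resurrect stale ranges. A minimal standalone sketch of that bookkeeping, assuming a boost::icl interval set comparable to the cache's IntervalType (names here are illustrative, not yuzu's):

#include <cstdint>
#include <vector>
#include <boost/icl/interval_set.hpp>

using IntervalSet = boost::icl::interval_set<std::uint64_t>;
using Interval = boost::icl::interval<std::uint64_t>;

void TrackDmaCopy(IntervalSet& common_ranges, std::uint64_t dst_addr, std::uint64_t amount,
                  const std::vector<IntervalSet::interval_type>& mirrored_ranges) {
    // Subtract the whole destination span first...
    common_ranges.subtract(Interval::right_open(dst_addr, dst_addr + amount));
    // ...then re-add only the sub-ranges mirrored from the source.
    for (const auto& add_interval : mirrored_ranges) {
        common_ranges.add(add_interval);
    }
}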
diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp
index 8b86ad050..a8c4b4415 100644
--- a/src/video_core/cdma_pusher.cpp
+++ b/src/video_core/cdma_pusher.cpp
@@ -24,6 +24,7 @@
#include "command_classes/vic.h"
#include "video_core/cdma_pusher.h"
#include "video_core/command_classes/nvdec_common.h"
+#include "video_core/command_classes/sync_manager.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h
index 1bada44dd..87b49d6ea 100644
--- a/src/video_core/cdma_pusher.h
+++ b/src/video_core/cdma_pusher.h
@@ -9,13 +9,13 @@
#include "common/bit_field.h"
#include "common/common_types.h"
-#include "video_core/command_classes/sync_manager.h"
namespace Tegra {
class GPU;
class Host1x;
class Nvdec;
+class SyncptIncrManager;
class Vic;
enum class ChSubmissionMode : u32 {
diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp
index 51ee14c13..5519c4705 100644
--- a/src/video_core/command_classes/codecs/h264.cpp
+++ b/src/video_core/command_classes/codecs/h264.cpp
@@ -20,6 +20,8 @@
#include <array>
#include <bit>
+
+#include "common/settings.h"
#include "video_core/command_classes/codecs/h264.h"
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
@@ -96,7 +98,10 @@ const std::vector<u8>& H264::ComposeFrameHeader(const NvdecCommon::NvdecRegister
(context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
// TODO (ameerj): Where do we get this number, it seems to be particular for each stream
- writer.WriteUe(6); // Max number of reference frames
+ const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue();
+ const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::GPU;
+ const u32 max_num_ref_frames = uses_gpu_decoding ? 6u : 16u;
+ writer.WriteUe(max_num_ref_frames);
writer.WriteBit(false);
writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1);
writer.WriteUe(pic_height - 1);
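
For context on WriteUe above: SPS fields such as max_num_ref_frames are Exp-Golomb coded (H.264 ue(v)), written as n leading zeros followed by the (n + 1)-bit binary form of value + 1, so 6 and 16 cost 5 and 9 bits respectively. A standalone sketch of the length rule (ExpGolombBits is a hypothetical helper, not yuzu's bit writer):

#include <bit>
#include <cstdint>

int ExpGolombBits(std::uint32_t v) {
    const int len = std::bit_width(v + 1); // bit length of v + 1
    return 2 * len - 1;                    // (len - 1) leading zeros + len code bits
}
// ExpGolombBits(6)  == 5:  6 + 1 = 0b111   -> "00111"
// ExpGolombBits(16) == 9: 16 + 1 = 0b10001 -> "000010001"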
diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp
index 0ee07f398..051616124 100644
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@@ -16,6 +16,7 @@ extern "C" {
}
#include "common/assert.h"
+#include "common/bit_field.h"
#include "common/logging/log.h"
#include "video_core/command_classes/nvdec.h"
@@ -26,6 +27,25 @@ extern "C" {
#include "video_core/textures/decoders.h"
namespace Tegra {
+namespace {
+enum class VideoPixelFormat : u64_le {
+ RGBA8 = 0x1f,
+ BGRA8 = 0x20,
+ RGBX8 = 0x23,
+ YUV420 = 0x44,
+};
+} // Anonymous namespace
+
+union VicConfig {
+ u64_le raw{};
+ BitField<0, 7, VideoPixelFormat> pixel_format;
+ BitField<7, 2, u64_le> chroma_loc_horiz;
+ BitField<9, 2, u64_le> chroma_loc_vert;
+ BitField<11, 4, u64_le> block_linear_kind;
+ BitField<15, 4, u64_le> block_linear_height_log2;
+ BitField<32, 14, u64_le> surface_width_minus1;
+ BitField<46, 14, u64_le> surface_height_minus1;
+};
Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_)
: gpu(gpu_),
@@ -65,134 +85,155 @@ void Vic::Execute() {
if (!frame) {
return;
}
- const auto pixel_format = static_cast<VideoPixelFormat>(config.pixel_format.Value());
- switch (pixel_format) {
+ const u64 surface_width = config.surface_width_minus1 + 1;
+ const u64 surface_height = config.surface_height_minus1 + 1;
+ if (static_cast<u64>(frame->width) != surface_width ||
+ static_cast<u64>(frame->height) != surface_height) {
+ // TODO: Properly support multiple video streams with differing frame dimensions
+ LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}",
+ frame->width, frame->height, surface_width, surface_height);
+ }
+ switch (config.pixel_format) {
+ case VideoPixelFormat::RGBA8:
case VideoPixelFormat::BGRA8:
- case VideoPixelFormat::RGBA8: {
- LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
+ case VideoPixelFormat::RGBX8:
+ WriteRGBFrame(frame, config);
+ break;
+ case VideoPixelFormat::YUV420:
+ WriteYUVFrame(frame, config);
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value());
+ break;
+ }
+}
- if (scaler_ctx == nullptr || frame->width != scaler_width ||
- frame->height != scaler_height) {
- const AVPixelFormat target_format =
- (pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA;
+void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) {
+ LOG_TRACE(Service_NVDRV, "Writing RGB Frame");
+
+ if (!scaler_ctx || frame->width != scaler_width || frame->height != scaler_height) {
+ const AVPixelFormat target_format = [pixel_format = config.pixel_format]() {
+ switch (pixel_format) {
+ case VideoPixelFormat::RGBA8:
+ return AV_PIX_FMT_RGBA;
+ case VideoPixelFormat::BGRA8:
+ return AV_PIX_FMT_BGRA;
+ case VideoPixelFormat::RGBX8:
+ return AV_PIX_FMT_RGB0;
+ default:
+ return AV_PIX_FMT_RGBA;
+ }
+ }();
+
+ sws_freeContext(scaler_ctx);
+ // Frames are decoded into either YUV420 or NV12 formats. Convert to desired RGB format
+ scaler_ctx = sws_getContext(frame->width, frame->height,
+ static_cast<AVPixelFormat>(frame->format), frame->width,
+ frame->height, target_format, 0, nullptr, nullptr, nullptr);
+ scaler_width = frame->width;
+ scaler_height = frame->height;
+ converted_frame_buffer.reset();
+ }
+ if (!converted_frame_buffer) {
+ const size_t frame_size = frame->width * frame->height * 4;
+ converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(frame_size)), av_free};
+ }
+ const std::array<int, 4> converted_stride{frame->width * 4, frame->height * 4, 0, 0};
+ u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
+ sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, &converted_frame_buf_addr,
+ converted_stride.data());
+
+ // Use the minimum of surface/frame dimensions to avoid buffer overflow.
+ const u32 surface_width = static_cast<u32>(config.surface_width_minus1) + 1;
+ const u32 surface_height = static_cast<u32>(config.surface_height_minus1) + 1;
+ const u32 width = std::min(surface_width, static_cast<u32>(frame->width));
+ const u32 height = std::min(surface_height, static_cast<u32>(frame->height));
+ const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
+ if (blk_kind != 0) {
+ // swizzle pitch linear to block linear
+ const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
+ const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
+ luma_buffer.resize(size);
+ Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(),
+ converted_frame_buf_addr, block_height, 0, 0);
+
+ gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
+ } else {
+ // send pitch linear frame
+ const size_t linear_size = width * height * 4;
+ gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
+ linear_size);
+ }
+}
- sws_freeContext(scaler_ctx);
- scaler_ctx = nullptr;
+void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) {
+ LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
- // Frames are decoded into either YUV420 or NV12 formats. Convert to desired format
- scaler_ctx = sws_getContext(frame->width, frame->height,
- static_cast<AVPixelFormat>(frame->format), frame->width,
- frame->height, target_format, 0, nullptr, nullptr, nullptr);
+ const std::size_t surface_width = config.surface_width_minus1 + 1;
+ const std::size_t surface_height = config.surface_height_minus1 + 1;
+ const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL;
+ // Use the minimum of surface/frame dimensions to avoid buffer overflow.
+ const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width));
+ const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height));
- scaler_width = frame->width;
- scaler_height = frame->height;
- }
- // Get Converted frame
- const u32 width = static_cast<u32>(frame->width);
- const u32 height = static_cast<u32>(frame->height);
- const std::size_t linear_size = width * height * 4;
-
- // Only allocate frame_buffer once per stream, as the size is not expected to change
- if (!converted_frame_buffer) {
- converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(linear_size)), av_free};
+ const auto stride = static_cast<size_t>(frame->linesize[0]);
+
+ luma_buffer.resize(aligned_width * surface_height);
+ chroma_buffer.resize(aligned_width * surface_height / 2);
+
+ // Populate luma buffer
+ const u8* luma_src = frame->data[0];
+ for (std::size_t y = 0; y < frame_height; ++y) {
+ const std::size_t src = y * stride;
+ const std::size_t dst = y * aligned_width;
+ for (std::size_t x = 0; x < frame_width; ++x) {
+ luma_buffer[dst + x] = luma_src[src + x];
}
- const std::array<int, 4> converted_stride{frame->width * 4, frame->height * 4, 0, 0};
- u8* const converted_frame_buf_addr{converted_frame_buffer.get()};
-
- sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height,
- &converted_frame_buf_addr, converted_stride.data());
-
- const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
- if (blk_kind != 0) {
- // swizzle pitch linear to block linear
- const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
- const auto size =
- Tegra::Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
- luma_buffer.resize(size);
- Tegra::Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(),
- converted_frame_buffer.get(), block_height, 0, 0);
-
- gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
- } else {
- // send pitch linear frame
- gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
- linear_size);
+ }
+ gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
+ luma_buffer.size());
+
+ // Chroma
+ const std::size_t half_height = frame_height / 2;
+ const auto half_stride = static_cast<size_t>(frame->linesize[1]);
+
+ switch (frame->format) {
+ case AV_PIX_FMT_YUV420P: {
+ // Frame from FFmpeg software
+ // Populate chroma buffer from both channels with interleaving.
+ const std::size_t half_width = frame_width / 2;
+ const u8* chroma_b_src = frame->data[1];
+ const u8* chroma_r_src = frame->data[2];
+ for (std::size_t y = 0; y < half_height; ++y) {
+ const std::size_t src = y * half_stride;
+ const std::size_t dst = y * aligned_width;
+
+ for (std::size_t x = 0; x < half_width; ++x) {
+ chroma_buffer[dst + x * 2] = chroma_b_src[src + x];
+ chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x];
+ }
}
break;
}
- case VideoPixelFormat::Yuv420: {
- LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame");
-
- const std::size_t surface_width = config.surface_width_minus1 + 1;
- const std::size_t surface_height = config.surface_height_minus1 + 1;
- const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width));
- const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height));
- const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL;
-
- const auto stride = static_cast<size_t>(frame->linesize[0]);
-
- luma_buffer.resize(aligned_width * surface_height);
- chroma_buffer.resize(aligned_width * surface_height / 2);
-
- // Populate luma buffer
- const u8* luma_src = frame->data[0];
- for (std::size_t y = 0; y < frame_height; ++y) {
+ case AV_PIX_FMT_NV12: {
+ // Frame from VA-API hardware
+ // This is already interleaved so just copy
+ const u8* chroma_src = frame->data[1];
+ for (std::size_t y = 0; y < half_height; ++y) {
const std::size_t src = y * stride;
const std::size_t dst = y * aligned_width;
for (std::size_t x = 0; x < frame_width; ++x) {
- luma_buffer[dst + x] = luma_src[src + x];
- }
- }
- gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
- luma_buffer.size());
-
- // Chroma
- const std::size_t half_height = frame_height / 2;
- const auto half_stride = static_cast<size_t>(frame->linesize[1]);
-
- switch (frame->format) {
- case AV_PIX_FMT_YUV420P: {
- // Frame from FFmpeg software
- // Populate chroma buffer from both channels with interleaving.
- const std::size_t half_width = frame_width / 2;
- const u8* chroma_b_src = frame->data[1];
- const u8* chroma_r_src = frame->data[2];
- for (std::size_t y = 0; y < half_height; ++y) {
- const std::size_t src = y * half_stride;
- const std::size_t dst = y * aligned_width;
-
- for (std::size_t x = 0; x < half_width; ++x) {
- chroma_buffer[dst + x * 2] = chroma_b_src[src + x];
- chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x];
- }
+ chroma_buffer[dst + x] = chroma_src[src + x];
}
- break;
- }
- case AV_PIX_FMT_NV12: {
- // Frame from VA-API hardware
- // This is already interleaved so just copy
- const u8* chroma_src = frame->data[1];
- for (std::size_t y = 0; y < half_height; ++y) {
- const std::size_t src = y * stride;
- const std::size_t dst = y * aligned_width;
- for (std::size_t x = 0; x < frame_width; ++x) {
- chroma_buffer[dst + x] = chroma_src[src + x];
- }
- }
- break;
- }
- default:
- UNREACHABLE();
- break;
}
- gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(),
- chroma_buffer.size());
break;
}
default:
- UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value());
+ UNREACHABLE();
break;
}
+ gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(),
+ chroma_buffer.size());
}
} // namespace Tegra
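
The relocated VicConfig above now types pixel_format as VideoPixelFormat, letting Execute switch on it directly. A standalone sketch of the same bit layout using plain masks (the packed value is fabricated for illustration):

#include <cstdint>

constexpr std::uint64_t ExtractBits(std::uint64_t raw, int pos, int bits) {
    return (raw >> pos) & ((std::uint64_t{1} << bits) - 1);
}

// A 1280x720 YUV420 config word, packed per the BitField offsets above.
constexpr std::uint64_t raw = (std::uint64_t{719} << 46) |  // surface_height_minus1
                              (std::uint64_t{1279} << 32) | // surface_width_minus1
                              std::uint64_t{0x44};          // pixel_format = YUV420
static_assert(ExtractBits(raw, 0, 7) == 0x44);       // VideoPixelFormat::YUV420
static_assert(ExtractBits(raw, 32, 14) + 1 == 1280); // surface width
static_assert(ExtractBits(raw, 46, 14) + 1 == 720);  // surface height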
diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h
index 74246e08c..6d4cdfd57 100644
--- a/src/video_core/command_classes/vic.h
+++ b/src/video_core/command_classes/vic.h
@@ -6,7 +6,6 @@
#include <memory>
#include <vector>
-#include "common/bit_field.h"
#include "common/common_types.h"
struct SwsContext;
@@ -14,6 +13,7 @@ struct SwsContext;
namespace Tegra {
class GPU;
class Nvdec;
+union VicConfig;
class Vic {
public:
@@ -27,6 +27,7 @@ public:
};
explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor);
+
~Vic();
/// Write to the device state.
@@ -35,22 +36,9 @@ public:
private:
void Execute();
- enum class VideoPixelFormat : u64_le {
- RGBA8 = 0x1f,
- BGRA8 = 0x20,
- Yuv420 = 0x44,
- };
+ void WriteRGBFrame(const AVFrame* frame, const VicConfig& config);
- union VicConfig {
- u64_le raw{};
- BitField<0, 7, u64_le> pixel_format;
- BitField<7, 2, u64_le> chroma_loc_horiz;
- BitField<9, 2, u64_le> chroma_loc_vert;
- BitField<11, 4, u64_le> block_linear_kind;
- BitField<15, 4, u64_le> block_linear_height_log2;
- BitField<32, 14, u64_le> surface_width_minus1;
- BitField<46, 14, u64_le> surface_height_minus1;
- };
+ void WriteYUVFrame(const AVFrame* frame, const VicConfig& config);
GPU& gpu;
std::shared_ptr<Tegra::Nvdec> nvdec_processor;
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 7f4ca6282..f22342dfb 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -6,6 +6,7 @@
#include <array>
#include <bitset>
+#include <cmath>
#include <limits>
#include <optional>
#include <type_traits>
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index c7ec1eac9..67388d980 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -82,41 +82,41 @@ void MaxwellDMA::Launch() {
}
void MaxwellDMA::CopyPitchToPitch() {
- // When `multi_line_enable` bit is disabled the copy is performed as if we were copying a 1D
- // buffer of length `line_length_in`.
- // Otherwise we copy a 2D image of dimensions (line_length_in, line_count).
- auto& accelerate = rasterizer->AccessAccelerateDMA();
- if (!regs.launch_dma.multi_line_enable) {
- const bool is_buffer_clear = regs.launch_dma.remap_enable != 0 &&
- regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A;
- // TODO: allow multisized components.
- if (is_buffer_clear) {
- ASSERT(regs.remap_const.component_size_minus_one == 3);
- accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value);
- std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value);
- memory_manager.WriteBlockUnsafe(regs.offset_out,
- reinterpret_cast<u8*>(tmp_buffer.data()),
- regs.line_length_in * sizeof(u32));
- return;
- }
- UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
- if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) {
- std::vector<u8> tmp_buffer(regs.line_length_in);
- memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in);
- memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), regs.line_length_in);
+ // When `multi_line_enable` bit is enabled we copy a 2D image of dimensions
+ // (line_length_in, line_count).
+ // Otherwise the copy is performed as if we were copying a 1D buffer of length line_length_in.
+ const bool remap_enabled = regs.launch_dma.remap_enable != 0;
+ if (regs.launch_dma.multi_line_enable) {
+ UNIMPLEMENTED_IF(remap_enabled);
+
+ // Perform a line-by-line copy.
+ // We're going to take a subrect of size (line_length_in, line_count) from the source
+ // rectangle. There is no need to manually flush/invalidate the regions because CopyBlock
+ // does that for us.
+ for (u32 line = 0; line < regs.line_count; ++line) {
+ const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in;
+ const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out;
+ memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in);
}
return;
}
-
- UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
-
- // Perform a line-by-line copy.
- // We're going to take a subrect of size (line_length_in, line_count) from the source rectangle.
- // There is no need to manually flush/invalidate the regions because CopyBlock does that for us.
- for (u32 line = 0; line < regs.line_count; ++line) {
- const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in;
- const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out;
- memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in);
+ // TODO: allow multisized components.
+ auto& accelerate = rasterizer->AccessAccelerateDMA();
+ const bool is_const_a_dst = regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A;
+ const bool is_buffer_clear = remap_enabled && is_const_a_dst;
+ if (is_buffer_clear) {
+ ASSERT(regs.remap_const.component_size_minus_one == 3);
+ accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value);
+ std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value);
+ memory_manager.WriteBlockUnsafe(regs.offset_out, reinterpret_cast<u8*>(tmp_buffer.data()),
+ regs.line_length_in * sizeof(u32));
+ return;
+ }
+ UNIMPLEMENTED_IF(remap_enabled);
+ if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) {
+ std::vector<u8> tmp_buffer(regs.line_length_in);
+ memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in);
+ memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), regs.line_length_in);
}
}
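
The restructured CopyPitchToPitch above splits three cases: a 2D pitched copy when multi_line_enable is set, a constant fill when remap selects CONST_A, and a plain 1D copy otherwise. A standalone sketch of the 2D case (raw pointers stand in for GPU virtual addresses; MaxwellDMA goes through MemoryManager::CopyBlock instead):

#include <cstddef>
#include <cstdint>
#include <cstring>

void CopyPitched2D(std::uint8_t* dst, const std::uint8_t* src, std::size_t line_length_in,
                   std::size_t line_count, std::size_t pitch_in, std::size_t pitch_out) {
    for (std::size_t line = 0; line < line_count; ++line) {
        // Each row carries line_length_in bytes; consecutive rows are a pitch apart.
        std::memcpy(dst + line * pitch_out, src + line * pitch_in, line_length_in);
    }
}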
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 9e457ae16..a04514425 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -175,7 +175,7 @@ public:
static_assert(sizeof(LaunchDMA) == 4);
struct RemapConst {
- enum Swizzle : u32 {
+ enum class Swizzle : u32 {
SRC_X = 0,
SRC_Y = 1,
SRC_Z = 2,
diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h
index b86c3a757..b1d455e30 100644
--- a/src/video_core/framebuffer_config.h
+++ b/src/video_core/framebuffer_config.h
@@ -4,8 +4,10 @@
#pragma once
-namespace Tegra {
+#include "common/common_types.h"
+#include "common/math_util.h"
+namespace Tegra {
/**
* Struct describing framebuffer configuration
*/
@@ -16,6 +18,21 @@ struct FramebufferConfig {
B8G8R8A8_UNORM = 5,
};
+ enum class TransformFlags : u32 {
+ /// No transform flags are set
+ Unset = 0x00,
+ /// Flip source image horizontally (around the vertical axis)
+ FlipH = 0x01,
+ /// Flip source image vertically (around the horizontal axis)
+ FlipV = 0x02,
+ /// Rotate source image 90 degrees clockwise
+ Rotate90 = 0x04,
+ /// Rotate source image 180 degrees
+ Rotate180 = 0x03,
+ /// Rotate source image 270 degrees clockwise
+ Rotate270 = 0x07,
+ };
+
VAddr address{};
u32 offset{};
u32 width{};
@@ -23,7 +40,6 @@ struct FramebufferConfig {
u32 stride{};
PixelFormat pixel_format{};
- using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
TransformFlags transform_flags{};
Common::Rectangle<int> crop_rect;
};
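
A note on the non-obvious enumerators above: the values match the Android-style buffer transform bits this struct previously borrowed from Service::NVFlinger::BufferQueue, where the rotations compose from the flip bits. A standalone restatement:

#include <cstdint>

enum TransformBits : std::uint32_t {
    FlipH = 0x01,
    FlipV = 0x02,
    Rotate90 = 0x04,
};
static_assert((FlipH | FlipV) == 0x03);            // Rotate180
static_assert((FlipH | FlipV | Rotate90) == 0x07); // Rotate270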
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 2ae3639b5..ab7c21a49 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -2,540 +2,913 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include <array>
+#include <atomic>
#include <chrono>
+#include <condition_variable>
+#include <list>
+#include <memory>
#include "common/assert.h"
#include "common/microprofile.h"
#include "common/settings.h"
#include "core/core.h"
#include "core/core_timing.h"
-#include "core/core_timing_util.h"
#include "core/frontend/emu_window.h"
#include "core/hardware_interrupt_manager.h"
-#include "core/memory.h"
+#include "core/hle/service/nvdrv/nvdata.h"
+#include "core/hle/service/nvflinger/buffer_queue.h"
#include "core/perf_stats.h"
+#include "video_core/cdma_pusher.h"
+#include "video_core/dma_pusher.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/engines/kepler_compute.h"
#include "video_core/engines/kepler_memory.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/maxwell_dma.h"
#include "video_core/gpu.h"
+#include "video_core/gpu_thread.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_base.h"
#include "video_core/shader_notify.h"
-#include "video_core/video_core.h"
namespace Tegra {
MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
-GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
- : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
- dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, use_nvdec{use_nvdec_},
- maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
- fermi_2d{std::make_unique<Engines::Fermi2D>()},
- kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
- maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
- kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
- shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
- gpu_thread{system_, is_async_} {}
+struct GPU::Impl {
+ explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_)
+ : gpu{gpu_}, system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(
+ system)},
+ dma_pusher{std::make_unique<Tegra::DmaPusher>(system, gpu)}, use_nvdec{use_nvdec_},
+ maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
+ fermi_2d{std::make_unique<Engines::Fermi2D>()},
+ kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
+ maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
+ kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
+ shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
+ gpu_thread{system_, is_async_} {}
+
+ ~Impl() = default;
+
+ /// Binds a renderer to the GPU.
+ void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) {
+ renderer = std::move(renderer_);
+ rasterizer = renderer->ReadRasterizer();
+
+ memory_manager->BindRasterizer(rasterizer);
+ maxwell_3d->BindRasterizer(rasterizer);
+ fermi_2d->BindRasterizer(rasterizer);
+ kepler_compute->BindRasterizer(rasterizer);
+ maxwell_dma->BindRasterizer(rasterizer);
+ }
-GPU::~GPU() = default;
+ /// Calls a GPU method.
+ void CallMethod(const GPU::MethodCall& method_call) {
+ LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method,
+ method_call.subchannel);
+
+ ASSERT(method_call.subchannel < bound_engines.size());
+
+ if (ExecuteMethodOnEngine(method_call.method)) {
+ CallEngineMethod(method_call);
+ } else {
+ CallPullerMethod(method_call);
+ }
+ }
+
+ /// Calls a GPU multivalue method.
+ void CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
+ u32 methods_pending) {
+ LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel);
-void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) {
- renderer = std::move(renderer_);
- rasterizer = renderer->ReadRasterizer();
+ ASSERT(subchannel < bound_engines.size());
+
+ if (ExecuteMethodOnEngine(method)) {
+ CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending);
+ } else {
+ for (std::size_t i = 0; i < amount; i++) {
+ CallPullerMethod(GPU::MethodCall{
+ method,
+ base_start[i],
+ subchannel,
+ methods_pending - static_cast<u32>(i),
+ });
+ }
+ }
+ }
+
+ /// Flush all current written commands into the host GPU for execution.
+ void FlushCommands() {
+ rasterizer->FlushCommands();
+ }
+
+ /// Synchronizes CPU writes with Host GPU memory.
+ void SyncGuestHost() {
+ rasterizer->SyncGuestHost();
+ }
+
+ /// Signal the ending of command list.
+ void OnCommandListEnd() {
+ if (is_async) {
+ // This command only applies to asynchronous GPU mode
+ gpu_thread.OnCommandListEnd();
+ }
+ }
+
+ /// Request a host GPU memory flush from the CPU.
+ [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size) {
+ std::unique_lock lck{flush_request_mutex};
+ const u64 fence = ++last_flush_fence;
+ flush_requests.emplace_back(fence, addr, size);
+ return fence;
+ }
+
+ /// Obtains current flush request fence id.
+ [[nodiscard]] u64 CurrentFlushRequestFence() const {
+ return current_flush_fence.load(std::memory_order_relaxed);
+ }
+
+ /// Tick pending requests within the GPU.
+ void TickWork() {
+ std::unique_lock lck{flush_request_mutex};
+ while (!flush_requests.empty()) {
+ auto& request = flush_requests.front();
+ const u64 fence = request.fence;
+ const VAddr addr = request.addr;
+ const std::size_t size = request.size;
+ flush_requests.pop_front();
+ flush_request_mutex.unlock();
+ rasterizer->FlushRegion(addr, size);
+ current_flush_fence.store(fence);
+ flush_request_mutex.lock();
+ }
+ }
+
+ /// Returns a reference to the Maxwell3D GPU engine.
+ [[nodiscard]] Engines::Maxwell3D& Maxwell3D() {
+ return *maxwell_3d;
+ }
+
+ /// Returns a const reference to the Maxwell3D GPU engine.
+ [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const {
+ return *maxwell_3d;
+ }
+
+ /// Returns a reference to the KeplerCompute GPU engine.
+ [[nodiscard]] Engines::KeplerCompute& KeplerCompute() {
+ return *kepler_compute;
+ }
+
+ /// Returns a reference to the KeplerCompute GPU engine.
+ [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const {
+ return *kepler_compute;
+ }
+
+ /// Returns a reference to the GPU memory manager.
+ [[nodiscard]] Tegra::MemoryManager& MemoryManager() {
+ return *memory_manager;
+ }
+
+ /// Returns a const reference to the GPU memory manager.
+ [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const {
+ return *memory_manager;
+ }
+
+ /// Returns a reference to the GPU DMA pusher.
+ [[nodiscard]] Tegra::DmaPusher& DmaPusher() {
+ return *dma_pusher;
+ }
+
+ /// Returns a const reference to the GPU DMA pusher.
+ [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const {
+ return *dma_pusher;
+ }
+
+ /// Returns a reference to the GPU CDMA pusher.
+ [[nodiscard]] Tegra::CDmaPusher& CDmaPusher() {
+ return *cdma_pusher;
+ }
+
+ /// Returns a const reference to the GPU CDMA pusher.
+ [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const {
+ return *cdma_pusher;
+ }
+
+ /// Returns a reference to the underlying renderer.
+ [[nodiscard]] VideoCore::RendererBase& Renderer() {
+ return *renderer;
+ }
+
+ /// Returns a const reference to the underlying renderer.
+ [[nodiscard]] const VideoCore::RendererBase& Renderer() const {
+ return *renderer;
+ }
+
+ /// Returns a reference to the shader notifier.
+ [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() {
+ return *shader_notify;
+ }
+
+ /// Returns a const reference to the shader notifier.
+ [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const {
+ return *shader_notify;
+ }
+
+ /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
+ void WaitFence(u32 syncpoint_id, u32 value) {
+ // Synced GPU, is always in sync
+ if (!is_async) {
+ return;
+ }
+ if (syncpoint_id == UINT32_MAX) {
+ // TODO: Research what this does.
+ LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented");
+ return;
+ }
+ MICROPROFILE_SCOPE(GPU_wait);
+ std::unique_lock lock{sync_mutex};
+ sync_cv.wait(lock, [=, this] {
+ if (shutting_down.load(std::memory_order_relaxed)) {
+ // We're shutting down, ensure no threads continue to wait for the next syncpoint
+ return true;
+ }
+ return syncpoints.at(syncpoint_id).load() >= value;
+ });
+ }
+
+ void IncrementSyncPoint(u32 syncpoint_id) {
+ auto& syncpoint = syncpoints.at(syncpoint_id);
+ syncpoint++;
+ std::lock_guard lock{sync_mutex};
+ sync_cv.notify_all();
+ auto& interrupt = syncpt_interrupts.at(syncpoint_id);
+ if (!interrupt.empty()) {
+ u32 value = syncpoint.load();
+ auto it = interrupt.begin();
+ while (it != interrupt.end()) {
+ if (value >= *it) {
+ TriggerCpuInterrupt(syncpoint_id, *it);
+ it = interrupt.erase(it);
+ continue;
+ }
+ it++;
+ }
+ }
+ }
+
+ [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const {
+ return syncpoints.at(syncpoint_id).load();
+ }
+
+ void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) {
+ std::lock_guard lock{sync_mutex};
+ auto& interrupt = syncpt_interrupts.at(syncpoint_id);
+ bool contains = std::any_of(interrupt.begin(), interrupt.end(),
+ [value](u32 in_value) { return in_value == value; });
+ if (contains) {
+ return;
+ }
+ interrupt.emplace_back(value);
+ }
+
+ [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value) {
+ std::lock_guard lock{sync_mutex};
+ auto& interrupt = syncpt_interrupts.at(syncpoint_id);
+ const auto iter =
+ std::find_if(interrupt.begin(), interrupt.end(),
+ [value](u32 interrupt_value) { return value == interrupt_value; });
+
+ if (iter == interrupt.end()) {
+ return false;
+ }
+ interrupt.erase(iter);
+ return true;
+ }
+
+ [[nodiscard]] u64 GetTicks() const {
+ // This values were reversed engineered by fincs from NVN
+ // The gpu clock is reported in units of 385/625 nanoseconds
+ constexpr u64 gpu_ticks_num = 384;
+ constexpr u64 gpu_ticks_den = 625;
+
+ u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
+ if (Settings::values.use_fast_gpu_time.GetValue()) {
+ nanoseconds /= 256;
+ }
+ const u64 nanoseconds_num = nanoseconds / gpu_ticks_den;
+ const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den;
+ return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den;
+ }
+
+ [[nodiscard]] bool IsAsync() const {
+ return is_async;
+ }
+
+ [[nodiscard]] bool UseNvdec() const {
+ return use_nvdec;
+ }
+
+ void RendererFrameEndNotify() {
+ system.GetPerfStats().EndGameFrame();
+ }
+
+ /// Performs any additional setup necessary in order to begin GPU emulation.
+ /// This can be used to launch any necessary threads and register any necessary
+ /// core timing events.
+ void Start() {
+ gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher);
+ cpu_context = renderer->GetRenderWindow().CreateSharedContext();
+ cpu_context->MakeCurrent();
+ }
+
+ /// Obtain the CPU Context
+ void ObtainContext() {
+ cpu_context->MakeCurrent();
+ }
+
+ /// Release the CPU Context
+ void ReleaseContext() {
+ cpu_context->DoneCurrent();
+ }
+
+ /// Push GPU command entries to be processed
+ void PushGPUEntries(Tegra::CommandList&& entries) {
+ gpu_thread.SubmitList(std::move(entries));
+ }
+
+ /// Push GPU command buffer entries to be processed
+ void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
+ if (!use_nvdec) {
+ return;
+ }
+
+ if (!cdma_pusher) {
+ cdma_pusher = std::make_unique<Tegra::CDmaPusher>(gpu);
+ }
+
+ // SubmitCommandBuffer would make the nvdec operations async, this is not currently working
+ // TODO(ameerj): RE proper async nvdec operation
+ // gpu_thread.SubmitCommandBuffer(std::move(entries));
+
+ cdma_pusher->ProcessEntries(std::move(entries));
+ }
+
+ /// Frees the CDMAPusher instance to free up resources
+ void ClearCdmaInstance() {
+ cdma_pusher.reset();
+ }
+
+ /// Swap buffers (render frame)
+ void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+ gpu_thread.SwapBuffers(framebuffer);
+ }
+
+ /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
+ void FlushRegion(VAddr addr, u64 size) {
+ gpu_thread.FlushRegion(addr, size);
+ }
+
+ /// Notify rasterizer that any caches of the specified region should be invalidated
+ void InvalidateRegion(VAddr addr, u64 size) {
+ gpu_thread.InvalidateRegion(addr, size);
+ }
+
+ /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
+ void FlushAndInvalidateRegion(VAddr addr, u64 size) {
+ gpu_thread.FlushAndInvalidateRegion(addr, size);
+ }
+
+ void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const {
+ auto& interrupt_manager = system.InterruptManager();
+ interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
+ }
+
+ void ProcessBindMethod(const GPU::MethodCall& method_call) {
+ // Bind the current subchannel to the desired engine id.
+ LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel,
+ method_call.argument);
+ const auto engine_id = static_cast<EngineID>(method_call.argument);
+ bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id);
+ switch (engine_id) {
+ case EngineID::FERMI_TWOD_A:
+ dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel);
+ break;
+ case EngineID::MAXWELL_B:
+ dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel);
+ break;
+ case EngineID::KEPLER_COMPUTE_B:
+ dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel);
+ break;
+ case EngineID::MAXWELL_DMA_COPY_A:
+ dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel);
+ break;
+ case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+ dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel);
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);
+ }
+ }
- memory_manager->BindRasterizer(rasterizer);
- maxwell_3d->BindRasterizer(rasterizer);
- fermi_2d->BindRasterizer(rasterizer);
- kepler_compute->BindRasterizer(rasterizer);
- maxwell_dma->BindRasterizer(rasterizer);
+ void ProcessFenceActionMethod() {
+ switch (regs.fence_action.op) {
+ case GPU::FenceOperation::Acquire:
+ WaitFence(regs.fence_action.syncpoint_id, regs.fence_value);
+ break;
+ case GPU::FenceOperation::Increment:
+ IncrementSyncPoint(regs.fence_action.syncpoint_id);
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value());
+ }
+ }
+
+ void ProcessWaitForInterruptMethod() {
+ // TODO(bunnei) ImplementMe
+ LOG_WARNING(HW_GPU, "(STUBBED) called");
+ }
+
+ void ProcessSemaphoreTriggerMethod() {
+ const auto semaphoreOperationMask = 0xF;
+ const auto op =
+ static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask);
+ if (op == GpuSemaphoreOperation::WriteLong) {
+ struct Block {
+ u32 sequence;
+ u32 zeros = 0;
+ u64 timestamp;
+ };
+
+ Block block{};
+ block.sequence = regs.semaphore_sequence;
+ // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of
+ // CoreTiming
+ block.timestamp = GetTicks();
+ memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block,
+ sizeof(block));
+ } else {
+ const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())};
+ if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) ||
+ (op == GpuSemaphoreOperation::AcquireGequal &&
+ static_cast<s32>(word - regs.semaphore_sequence) > 0) ||
+ (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) {
+ // Nothing to do in this case
+ } else {
+ regs.acquire_source = true;
+ regs.acquire_value = regs.semaphore_sequence;
+ if (op == GpuSemaphoreOperation::AcquireEqual) {
+ regs.acquire_active = true;
+ regs.acquire_mode = false;
+ } else if (op == GpuSemaphoreOperation::AcquireGequal) {
+ regs.acquire_active = true;
+ regs.acquire_mode = true;
+ } else if (op == GpuSemaphoreOperation::AcquireMask) {
+ // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with
+ // semaphore_sequence, gives a non-0 result
+ LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented");
+ } else {
+ LOG_ERROR(HW_GPU, "Invalid semaphore operation");
+ }
+ }
+ }
+ }
+
+ void ProcessSemaphoreRelease() {
+ memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(),
+ regs.semaphore_release);
+ }
+
+ void ProcessSemaphoreAcquire() {
+ const u32 word = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress());
+ const auto value = regs.semaphore_acquire;
+ if (word != value) {
+ regs.acquire_active = true;
+ regs.acquire_value = value;
+ // TODO(kemathe73) figure out how to do the acquire_timeout
+ regs.acquire_mode = false;
+ regs.acquire_source = false;
+ }
+ }
+
+ /// Calls a GPU puller method.
+ void CallPullerMethod(const GPU::MethodCall& method_call) {
+ regs.reg_array[method_call.method] = method_call.argument;
+ const auto method = static_cast<BufferMethods>(method_call.method);
+
+ switch (method) {
+ case BufferMethods::BindObject: {
+ ProcessBindMethod(method_call);
+ break;
+ }
+ case BufferMethods::Nop:
+ case BufferMethods::SemaphoreAddressHigh:
+ case BufferMethods::SemaphoreAddressLow:
+ case BufferMethods::SemaphoreSequence:
+ case BufferMethods::UnkCacheFlush:
+ case BufferMethods::WrcacheFlush:
+ case BufferMethods::FenceValue:
+ break;
+ case BufferMethods::RefCnt:
+ rasterizer->SignalReference();
+ break;
+ case BufferMethods::FenceAction:
+ ProcessFenceActionMethod();
+ break;
+ case BufferMethods::WaitForInterrupt:
+ ProcessWaitForInterruptMethod();
+ break;
+ case BufferMethods::SemaphoreTrigger: {
+ ProcessSemaphoreTriggerMethod();
+ break;
+ }
+ case BufferMethods::NotifyIntr: {
+ // TODO(Kmather73): Research and implement this method.
+ LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented");
+ break;
+ }
+ case BufferMethods::Unk28: {
+ // TODO(Kmather73): Research and implement this method.
+ LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented");
+ break;
+ }
+ case BufferMethods::SemaphoreAcquire: {
+ ProcessSemaphoreAcquire();
+ break;
+ }
+ case BufferMethods::SemaphoreRelease: {
+ ProcessSemaphoreRelease();
+ break;
+ }
+ case BufferMethods::Yield: {
+ // TODO(Kmather73): Research and implement this method.
+ LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented");
+ break;
+ }
+ default:
+ LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method);
+ break;
+ }
+ }
+
+ /// Calls a GPU engine method.
+ void CallEngineMethod(const GPU::MethodCall& method_call) {
+ const EngineID engine = bound_engines[method_call.subchannel];
+
+ switch (engine) {
+ case EngineID::FERMI_TWOD_A:
+ fermi_2d->CallMethod(method_call.method, method_call.argument,
+ method_call.IsLastCall());
+ break;
+ case EngineID::MAXWELL_B:
+ maxwell_3d->CallMethod(method_call.method, method_call.argument,
+ method_call.IsLastCall());
+ break;
+ case EngineID::KEPLER_COMPUTE_B:
+ kepler_compute->CallMethod(method_call.method, method_call.argument,
+ method_call.IsLastCall());
+ break;
+ case EngineID::MAXWELL_DMA_COPY_A:
+ maxwell_dma->CallMethod(method_call.method, method_call.argument,
+ method_call.IsLastCall());
+ break;
+ case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+ kepler_memory->CallMethod(method_call.method, method_call.argument,
+ method_call.IsLastCall());
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented engine");
+ }
+ }
+
+ /// Calls a GPU engine multivalue method.
+ void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
+ u32 methods_pending) {
+ const EngineID engine = bound_engines[subchannel];
+
+ switch (engine) {
+ case EngineID::FERMI_TWOD_A:
+ fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending);
+ break;
+ case EngineID::MAXWELL_B:
+ maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending);
+ break;
+ case EngineID::KEPLER_COMPUTE_B:
+ kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending);
+ break;
+ case EngineID::MAXWELL_DMA_COPY_A:
+ maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending);
+ break;
+ case EngineID::KEPLER_INLINE_TO_MEMORY_B:
+ kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending);
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented engine");
+ }
+ }
+
+ /// Determines where the method should be executed.
+ [[nodiscard]] bool ExecuteMethodOnEngine(u32 method) {
+ const auto buffer_method = static_cast<BufferMethods>(method);
+ return buffer_method >= BufferMethods::NonPullerMethods;
+ }
+
+ struct Regs {
+ static constexpr size_t NUM_REGS = 0x40;
+
+ union {
+ struct {
+ INSERT_PADDING_WORDS_NOINIT(0x4);
+ struct {
+ u32 address_high;
+ u32 address_low;
+
+ [[nodiscard]] GPUVAddr SemaphoreAddress() const {
+ return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+ address_low);
+ }
+ } semaphore_address;
+
+ u32 semaphore_sequence;
+ u32 semaphore_trigger;
+ INSERT_PADDING_WORDS_NOINIT(0xC);
+
+ // The pusher and the puller share the reference counter, the pusher only has read
+ // access
+ u32 reference_count;
+ INSERT_PADDING_WORDS_NOINIT(0x5);
+
+ u32 semaphore_acquire;
+ u32 semaphore_release;
+ u32 fence_value;
+ GPU::FenceAction fence_action;
+ INSERT_PADDING_WORDS_NOINIT(0xE2);
+
+ // Puller state
+ u32 acquire_mode;
+ u32 acquire_source;
+ u32 acquire_active;
+ u32 acquire_timeout;
+ u32 acquire_value;
+ };
+ std::array<u32, NUM_REGS> reg_array;
+ };
+ } regs{};
+
+ GPU& gpu;
+ Core::System& system;
+ std::unique_ptr<Tegra::MemoryManager> memory_manager;
+ std::unique_ptr<Tegra::DmaPusher> dma_pusher;
+ std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
+ std::unique_ptr<VideoCore::RendererBase> renderer;
+ VideoCore::RasterizerInterface* rasterizer = nullptr;
+ const bool use_nvdec;
+
+ /// Mapping of command subchannels to their bound engine ids
+ std::array<EngineID, 8> bound_engines{};
+ /// 3D engine
+ std::unique_ptr<Engines::Maxwell3D> maxwell_3d;
+ /// 2D engine
+ std::unique_ptr<Engines::Fermi2D> fermi_2d;
+ /// Compute engine
+ std::unique_ptr<Engines::KeplerCompute> kepler_compute;
+ /// DMA engine
+ std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
+ /// Inline memory engine
+ std::unique_ptr<Engines::KeplerMemory> kepler_memory;
+ /// Shader build notifier
+ std::unique_ptr<VideoCore::ShaderNotify> shader_notify;
+ /// When true, we are about to shut down emulation session, so terminate outstanding tasks
+ std::atomic_bool shutting_down{};
+
+ std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
+
+ std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
+
+ std::mutex sync_mutex;
+ std::mutex device_mutex;
+
+ std::condition_variable sync_cv;
+
+ struct FlushRequest {
+ explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_)
+ : fence{fence_}, addr{addr_}, size{size_} {}
+ u64 fence;
+ VAddr addr;
+ std::size_t size;
+ };
+
+ std::list<FlushRequest> flush_requests;
+ std::atomic<u64> current_flush_fence{};
+ u64 last_flush_fence{};
+ std::mutex flush_request_mutex;
+
+ const bool is_async;
+
+ VideoCommon::GPUThread::ThreadManager gpu_thread;
+ std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
+
+#define ASSERT_REG_POSITION(field_name, position) \
+ static_assert(offsetof(Regs, field_name) == position * 4, \
+ "Field " #field_name " has invalid position")
+
+ ASSERT_REG_POSITION(semaphore_address, 0x4);
+ ASSERT_REG_POSITION(semaphore_sequence, 0x6);
+ ASSERT_REG_POSITION(semaphore_trigger, 0x7);
+ ASSERT_REG_POSITION(reference_count, 0x14);
+ ASSERT_REG_POSITION(semaphore_acquire, 0x1A);
+ ASSERT_REG_POSITION(semaphore_release, 0x1B);
+ ASSERT_REG_POSITION(fence_value, 0x1C);
+ ASSERT_REG_POSITION(fence_action, 0x1D);
+
+ ASSERT_REG_POSITION(acquire_mode, 0x100);
+ ASSERT_REG_POSITION(acquire_source, 0x101);
+ ASSERT_REG_POSITION(acquire_active, 0x102);
+ ASSERT_REG_POSITION(acquire_timeout, 0x103);
+ ASSERT_REG_POSITION(acquire_value, 0x104);
+
+#undef ASSERT_REG_POSITION
+
+ enum class GpuSemaphoreOperation {
+ AcquireEqual = 0x1,
+ WriteLong = 0x2,
+ AcquireGequal = 0x4,
+ AcquireMask = 0x8,
+ };
+};
+
+GPU::GPU(Core::System& system, bool is_async, bool use_nvdec)
+ : impl{std::make_unique<Impl>(*this, system, is_async, use_nvdec)} {}
+
+GPU::~GPU() = default;
+
+void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer) {
+ impl->BindRenderer(std::move(renderer));
}
-Engines::Maxwell3D& GPU::Maxwell3D() {
- return *maxwell_3d;
+void GPU::CallMethod(const MethodCall& method_call) {
+ impl->CallMethod(method_call);
}
-const Engines::Maxwell3D& GPU::Maxwell3D() const {
- return *maxwell_3d;
+void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
+ u32 methods_pending) {
+ impl->CallMultiMethod(method, subchannel, base_start, amount, methods_pending);
}
-Engines::KeplerCompute& GPU::KeplerCompute() {
- return *kepler_compute;
+void GPU::FlushCommands() {
+ impl->FlushCommands();
}
-const Engines::KeplerCompute& GPU::KeplerCompute() const {
- return *kepler_compute;
+void GPU::SyncGuestHost() {
+ impl->SyncGuestHost();
}
-MemoryManager& GPU::MemoryManager() {
- return *memory_manager;
+void GPU::OnCommandListEnd() {
+ impl->OnCommandListEnd();
}
-const MemoryManager& GPU::MemoryManager() const {
- return *memory_manager;
+u64 GPU::RequestFlush(VAddr addr, std::size_t size) {
+ return impl->RequestFlush(addr, size);
}
-DmaPusher& GPU::DmaPusher() {
- return *dma_pusher;
+u64 GPU::CurrentFlushRequestFence() const {
+ return impl->CurrentFlushRequestFence();
}
-Tegra::CDmaPusher& GPU::CDmaPusher() {
- return *cdma_pusher;
+void GPU::TickWork() {
+ impl->TickWork();
}
-const DmaPusher& GPU::DmaPusher() const {
- return *dma_pusher;
+Engines::Maxwell3D& GPU::Maxwell3D() {
+ return impl->Maxwell3D();
}
-const Tegra::CDmaPusher& GPU::CDmaPusher() const {
- return *cdma_pusher;
+const Engines::Maxwell3D& GPU::Maxwell3D() const {
+ return impl->Maxwell3D();
}
-void GPU::WaitFence(u32 syncpoint_id, u32 value) {
- // Synced GPU, is always in sync
- if (!is_async) {
- return;
- }
- if (syncpoint_id == UINT32_MAX) {
- // TODO: Research what this does.
- LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented");
- return;
- }
- MICROPROFILE_SCOPE(GPU_wait);
- std::unique_lock lock{sync_mutex};
- sync_cv.wait(lock, [=, this] {
- if (shutting_down.load(std::memory_order_relaxed)) {
- // We're shutting down, ensure no threads continue to wait for the next syncpoint
- return true;
- }
- return syncpoints.at(syncpoint_id).load() >= value;
- });
-}
-
-void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
- auto& syncpoint = syncpoints.at(syncpoint_id);
- syncpoint++;
- std::lock_guard lock{sync_mutex};
- sync_cv.notify_all();
- auto& interrupt = syncpt_interrupts.at(syncpoint_id);
- if (!interrupt.empty()) {
- u32 value = syncpoint.load();
- auto it = interrupt.begin();
- while (it != interrupt.end()) {
- if (value >= *it) {
- TriggerCpuInterrupt(syncpoint_id, *it);
- it = interrupt.erase(it);
- continue;
- }
- it++;
- }
- }
+Engines::KeplerCompute& GPU::KeplerCompute() {
+ return impl->KeplerCompute();
}
-u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const {
- return syncpoints.at(syncpoint_id).load();
+const Engines::KeplerCompute& GPU::KeplerCompute() const {
+ return impl->KeplerCompute();
}
-void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
- auto& interrupt = syncpt_interrupts.at(syncpoint_id);
- bool contains = std::any_of(interrupt.begin(), interrupt.end(),
- [value](u32 in_value) { return in_value == value; });
- if (contains) {
- return;
- }
- interrupt.emplace_back(value);
+Tegra::MemoryManager& GPU::MemoryManager() {
+ return impl->MemoryManager();
}
-bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) {
- std::lock_guard lock{sync_mutex};
- auto& interrupt = syncpt_interrupts.at(syncpoint_id);
- const auto iter =
- std::find_if(interrupt.begin(), interrupt.end(),
- [value](u32 interrupt_value) { return value == interrupt_value; });
+const Tegra::MemoryManager& GPU::MemoryManager() const {
+ return impl->MemoryManager();
+}
- if (iter == interrupt.end()) {
- return false;
- }
- interrupt.erase(iter);
- return true;
+Tegra::DmaPusher& GPU::DmaPusher() {
+ return impl->DmaPusher();
}
-u64 GPU::RequestFlush(VAddr addr, std::size_t size) {
- std::unique_lock lck{flush_request_mutex};
- const u64 fence = ++last_flush_fence;
- flush_requests.emplace_back(fence, addr, size);
- return fence;
+const Tegra::DmaPusher& GPU::DmaPusher() const {
+ return impl->DmaPusher();
}
-void GPU::TickWork() {
- std::unique_lock lck{flush_request_mutex};
- while (!flush_requests.empty()) {
- auto& request = flush_requests.front();
- const u64 fence = request.fence;
- const VAddr addr = request.addr;
- const std::size_t size = request.size;
- flush_requests.pop_front();
- flush_request_mutex.unlock();
- rasterizer->FlushRegion(addr, size);
- current_flush_fence.store(fence);
- flush_request_mutex.lock();
- }
+Tegra::CDmaPusher& GPU::CDmaPusher() {
+ return impl->CDmaPusher();
}
-u64 GPU::GetTicks() const {
- // This values were reversed engineered by fincs from NVN
- // The gpu clock is reported in units of 385/625 nanoseconds
- constexpr u64 gpu_ticks_num = 384;
- constexpr u64 gpu_ticks_den = 625;
+const Tegra::CDmaPusher& GPU::CDmaPusher() const {
+ return impl->CDmaPusher();
+}
- u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
- if (Settings::values.use_fast_gpu_time.GetValue()) {
- nanoseconds /= 256;
- }
- const u64 nanoseconds_num = nanoseconds / gpu_ticks_den;
- const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den;
- return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den;
+VideoCore::RendererBase& GPU::Renderer() {
+ return impl->Renderer();
}
-void GPU::RendererFrameEndNotify() {
- system.GetPerfStats().EndGameFrame();
+const VideoCore::RendererBase& GPU::Renderer() const {
+ return impl->Renderer();
}
-void GPU::FlushCommands() {
- rasterizer->FlushCommands();
+VideoCore::ShaderNotify& GPU::ShaderNotify() {
+ return impl->ShaderNotify();
}
-void GPU::SyncGuestHost() {
- rasterizer->SyncGuestHost();
+const VideoCore::ShaderNotify& GPU::ShaderNotify() const {
+ return impl->ShaderNotify();
}
-enum class GpuSemaphoreOperation {
- AcquireEqual = 0x1,
- WriteLong = 0x2,
- AcquireGequal = 0x4,
- AcquireMask = 0x8,
-};
+void GPU::WaitFence(u32 syncpoint_id, u32 value) {
+ impl->WaitFence(syncpoint_id, value);
+}
-void GPU::CallMethod(const MethodCall& method_call) {
- LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method,
- method_call.subchannel);
+void GPU::IncrementSyncPoint(u32 syncpoint_id) {
+ impl->IncrementSyncPoint(syncpoint_id);
+}
- ASSERT(method_call.subchannel < bound_engines.size());
+u32 GPU::GetSyncpointValue(u32 syncpoint_id) const {
+ return impl->GetSyncpointValue(syncpoint_id);
+}
- if (ExecuteMethodOnEngine(method_call.method)) {
- CallEngineMethod(method_call);
- } else {
- CallPullerMethod(method_call);
- }
+void GPU::RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) {
+ impl->RegisterSyncptInterrupt(syncpoint_id, value);
}
-void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
- u32 methods_pending) {
- LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel);
-
- ASSERT(subchannel < bound_engines.size());
-
- if (ExecuteMethodOnEngine(method)) {
- CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending);
- } else {
- for (std::size_t i = 0; i < amount; i++) {
- CallPullerMethod(MethodCall{
- method,
- base_start[i],
- subchannel,
- methods_pending - static_cast<u32>(i),
- });
- }
- }
+bool GPU::CancelSyncptInterrupt(u32 syncpoint_id, u32 value) {
+ return impl->CancelSyncptInterrupt(syncpoint_id, value);
}
-bool GPU::ExecuteMethodOnEngine(u32 method) {
- const auto buffer_method = static_cast<BufferMethods>(method);
- return buffer_method >= BufferMethods::NonPullerMethods;
-}
-
-void GPU::CallPullerMethod(const MethodCall& method_call) {
- regs.reg_array[method_call.method] = method_call.argument;
- const auto method = static_cast<BufferMethods>(method_call.method);
-
- switch (method) {
- case BufferMethods::BindObject: {
- ProcessBindMethod(method_call);
- break;
- }
- case BufferMethods::Nop:
- case BufferMethods::SemaphoreAddressHigh:
- case BufferMethods::SemaphoreAddressLow:
- case BufferMethods::SemaphoreSequence:
- case BufferMethods::UnkCacheFlush:
- case BufferMethods::WrcacheFlush:
- case BufferMethods::FenceValue:
- break;
- case BufferMethods::RefCnt:
- rasterizer->SignalReference();
- break;
- case BufferMethods::FenceAction:
- ProcessFenceActionMethod();
- break;
- case BufferMethods::WaitForInterrupt:
- ProcessWaitForInterruptMethod();
- break;
- case BufferMethods::SemaphoreTrigger: {
- ProcessSemaphoreTriggerMethod();
- break;
- }
- case BufferMethods::NotifyIntr: {
- // TODO(Kmather73): Research and implement this method.
- LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented");
- break;
- }
- case BufferMethods::Unk28: {
- // TODO(Kmather73): Research and implement this method.
- LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented");
- break;
- }
- case BufferMethods::SemaphoreAcquire: {
- ProcessSemaphoreAcquire();
- break;
- }
- case BufferMethods::SemaphoreRelease: {
- ProcessSemaphoreRelease();
- break;
- }
- case BufferMethods::Yield: {
- // TODO(Kmather73): Research and implement this method.
- LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented");
- break;
- }
- default:
- LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method);
- break;
- }
-}
-
-void GPU::CallEngineMethod(const MethodCall& method_call) {
- const EngineID engine = bound_engines[method_call.subchannel];
-
- switch (engine) {
- case EngineID::FERMI_TWOD_A:
- fermi_2d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall());
- break;
- case EngineID::MAXWELL_B:
- maxwell_3d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall());
- break;
- case EngineID::KEPLER_COMPUTE_B:
- kepler_compute->CallMethod(method_call.method, method_call.argument,
- method_call.IsLastCall());
- break;
- case EngineID::MAXWELL_DMA_COPY_A:
- maxwell_dma->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall());
- break;
- case EngineID::KEPLER_INLINE_TO_MEMORY_B:
- kepler_memory->CallMethod(method_call.method, method_call.argument,
- method_call.IsLastCall());
- break;
- default:
- UNIMPLEMENTED_MSG("Unimplemented engine");
- }
-}
-
-void GPU::CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
- u32 methods_pending) {
- const EngineID engine = bound_engines[subchannel];
-
- switch (engine) {
- case EngineID::FERMI_TWOD_A:
- fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending);
- break;
- case EngineID::MAXWELL_B:
- maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending);
- break;
- case EngineID::KEPLER_COMPUTE_B:
- kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending);
- break;
- case EngineID::MAXWELL_DMA_COPY_A:
- maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending);
- break;
- case EngineID::KEPLER_INLINE_TO_MEMORY_B:
- kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending);
- break;
- default:
- UNIMPLEMENTED_MSG("Unimplemented engine");
- }
-}
-
-void GPU::ProcessBindMethod(const MethodCall& method_call) {
- // Bind the current subchannel to the desired engine id.
- LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel,
- method_call.argument);
- const auto engine_id = static_cast<EngineID>(method_call.argument);
- bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id);
- switch (engine_id) {
- case EngineID::FERMI_TWOD_A:
- dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel);
- break;
- case EngineID::MAXWELL_B:
- dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel);
- break;
- case EngineID::KEPLER_COMPUTE_B:
- dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel);
- break;
- case EngineID::MAXWELL_DMA_COPY_A:
- dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel);
- break;
- case EngineID::KEPLER_INLINE_TO_MEMORY_B:
- dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel);
- break;
- default:
- UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id);
- }
-}
-
-void GPU::ProcessFenceActionMethod() {
- switch (regs.fence_action.op) {
- case FenceOperation::Acquire:
- WaitFence(regs.fence_action.syncpoint_id, regs.fence_value);
- break;
- case FenceOperation::Increment:
- IncrementSyncPoint(regs.fence_action.syncpoint_id);
- break;
- default:
- UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value());
- }
-}
-
-void GPU::ProcessWaitForInterruptMethod() {
- // TODO(bunnei) ImplementMe
- LOG_WARNING(HW_GPU, "(STUBBED) called");
-}
-
-void GPU::ProcessSemaphoreTriggerMethod() {
- const auto semaphoreOperationMask = 0xF;
- const auto op =
- static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask);
- if (op == GpuSemaphoreOperation::WriteLong) {
- struct Block {
- u32 sequence;
- u32 zeros = 0;
- u64 timestamp;
- };
+u64 GPU::GetTicks() const {
+ return impl->GetTicks();
+}
- Block block{};
- block.sequence = regs.semaphore_sequence;
- // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of
- // CoreTiming
- block.timestamp = GetTicks();
- memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block,
- sizeof(block));
- } else {
- const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())};
- if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) ||
- (op == GpuSemaphoreOperation::AcquireGequal &&
- static_cast<s32>(word - regs.semaphore_sequence) > 0) ||
- (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) {
- // Nothing to do in this case
- } else {
- regs.acquire_source = true;
- regs.acquire_value = regs.semaphore_sequence;
- if (op == GpuSemaphoreOperation::AcquireEqual) {
- regs.acquire_active = true;
- regs.acquire_mode = false;
- } else if (op == GpuSemaphoreOperation::AcquireGequal) {
- regs.acquire_active = true;
- regs.acquire_mode = true;
- } else if (op == GpuSemaphoreOperation::AcquireMask) {
- // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with
- // semaphore_sequence, gives a non-0 result
- LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented");
- } else {
- LOG_ERROR(HW_GPU, "Invalid semaphore operation");
- }
- }
- }
+bool GPU::IsAsync() const {
+ return impl->IsAsync();
}
-void GPU::ProcessSemaphoreRelease() {
- memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(), regs.semaphore_release);
+bool GPU::UseNvdec() const {
+ return impl->UseNvdec();
}
-void GPU::ProcessSemaphoreAcquire() {
- const u32 word = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress());
- const auto value = regs.semaphore_acquire;
- if (word != value) {
- regs.acquire_active = true;
- regs.acquire_value = value;
- // TODO(kemathe73) figure out how to do the acquire_timeout
- regs.acquire_mode = false;
- regs.acquire_source = false;
- }
+void GPU::RendererFrameEndNotify() {
+ impl->RendererFrameEndNotify();
}
void GPU::Start() {
- gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher);
- cpu_context = renderer->GetRenderWindow().CreateSharedContext();
- cpu_context->MakeCurrent();
+ impl->Start();
}
void GPU::ObtainContext() {
- cpu_context->MakeCurrent();
+ impl->ObtainContext();
}
void GPU::ReleaseContext() {
- cpu_context->DoneCurrent();
+ impl->ReleaseContext();
}
void GPU::PushGPUEntries(Tegra::CommandList&& entries) {
- gpu_thread.SubmitList(std::move(entries));
+ impl->PushGPUEntries(std::move(entries));
}
void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
- if (!use_nvdec) {
- return;
- }
-
- if (!cdma_pusher) {
- cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
- }
-
- // SubmitCommandBuffer would make the nvdec operations async, this is not currently working
- // TODO(ameerj): RE proper async nvdec operation
- // gpu_thread.SubmitCommandBuffer(std::move(entries));
-
- cdma_pusher->ProcessEntries(std::move(entries));
+ impl->PushCommandBuffer(entries);
}
void GPU::ClearCdmaInstance() {
- cdma_pusher.reset();
+ impl->ClearCdmaInstance();
}
void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
- gpu_thread.SwapBuffers(framebuffer);
+ impl->SwapBuffers(framebuffer);
}
void GPU::FlushRegion(VAddr addr, u64 size) {
- gpu_thread.FlushRegion(addr, size);
+ impl->FlushRegion(addr, size);
}
void GPU::InvalidateRegion(VAddr addr, u64 size) {
- gpu_thread.InvalidateRegion(addr, size);
+ impl->InvalidateRegion(addr, size);
}
void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) {
- gpu_thread.FlushAndInvalidateRegion(addr, size);
-}
-
-void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
- auto& interrupt_manager = system.InterruptManager();
- interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
-}
-
-void GPU::OnCommandListEnd() {
- if (is_async) {
- // This command only applies to asynchronous GPU mode
- gpu_thread.OnCommandListEnd();
- }
+ impl->FlushAndInvalidateRegion(addr, size);
}
} // namespace Tegra
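
Taken together, the gpu.cpp hunks above reduce every public GPU method to a one-line forwarder into a private implementation object; the header changes below move all state behind a forward-declared struct Impl. A minimal sketch of the pimpl idiom as applied here, with hypothetical names:

    // pimpl_sketch.cpp -- hypothetical, minimal illustration of the pattern used by GPU.
    #include <memory>

    class Widget {
    public:
        Widget();
        ~Widget();            // must be defined where Impl is a complete type
        void DoWork();        // public API stays in the header
    private:
        struct Impl;          // definition lives in the .cpp file
        std::unique_ptr<Impl> impl;
    };

    // In the .cpp file:
    struct Widget::Impl {
        int state = 0;
        void DoWork() { ++state; }
    };

    Widget::Widget() : impl{std::make_unique<Impl>()} {}
    Widget::~Widget() = default; // unique_ptr<Impl> destroys a complete type here
    void Widget::DoWork() { impl->DoWork(); }

The payoff shows up in gpu.h below: engine, syncpoint, and threading members leave the header, so including it no longer drags in dma_pusher.h, gpu_thread.h, or the nvdrv headers.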
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index e6a02a71b..05e5c94f3 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -4,28 +4,12 @@
#pragma once
-#include <array>
-#include <atomic>
-#include <condition_variable>
-#include <list>
#include <memory>
-#include <mutex>
+
+#include "common/bit_field.h"
#include "common/common_types.h"
-#include "core/hle/service/nvdrv/nvdata.h"
-#include "core/hle/service/nvflinger/buffer_queue.h"
#include "video_core/cdma_pusher.h"
-#include "video_core/dma_pusher.h"
#include "video_core/framebuffer_config.h"
-#include "video_core/gpu_thread.h"
-
-using CacheAddr = std::uintptr_t;
-[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) {
- return reinterpret_cast<CacheAddr>(host_ptr);
-}
-
-[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) {
- return reinterpret_cast<u8*>(cache_addr);
-}
namespace Core {
namespace Frontend {
@@ -40,6 +24,9 @@ class ShaderNotify;
} // namespace VideoCore
namespace Tegra {
+class DmaPusher;
+class CDmaPusher;
+struct CommandList;
enum class RenderTargetFormat : u32 {
NONE = 0x0,
@@ -138,7 +125,18 @@ public:
}
};
- explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_);
+ enum class FenceOperation : u32 {
+ Acquire = 0,
+ Increment = 1,
+ };
+
+ union FenceAction {
+ u32 raw;
+ BitField<0, 1, FenceOperation> op;
+ BitField<8, 24, u32> syncpoint_id;
+ };
+
+ explicit GPU(Core::System& system, bool is_async, bool use_nvdec);
~GPU();
/// Binds a renderer to the GPU.
@@ -162,9 +160,7 @@ public:
[[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size);
/// Obtains current flush request fence id.
- [[nodiscard]] u64 CurrentFlushRequestFence() const {
- return current_flush_fence.load(std::memory_order_relaxed);
- }
+ [[nodiscard]] u64 CurrentFlushRequestFence() const;
/// Tick pending requests within the GPU.
void TickWork();
@@ -200,24 +196,16 @@ public:
[[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const;
/// Returns a reference to the underlying renderer.
- [[nodiscard]] VideoCore::RendererBase& Renderer() {
- return *renderer;
- }
+ [[nodiscard]] VideoCore::RendererBase& Renderer();
/// Returns a const reference to the underlying renderer.
- [[nodiscard]] const VideoCore::RendererBase& Renderer() const {
- return *renderer;
- }
+ [[nodiscard]] const VideoCore::RendererBase& Renderer() const;
/// Returns a reference to the shader notifier.
- [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() {
- return *shader_notify;
- }
+ [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify();
/// Returns a const reference to the shader notifier.
- [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const {
- return *shader_notify;
- }
+ [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const;
/// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
void WaitFence(u32 syncpoint_id, u32 value);
@@ -232,80 +220,12 @@ public:
[[nodiscard]] u64 GetTicks() const;
- [[nodiscard]] std::unique_lock<std::mutex> LockSync() {
- return std::unique_lock{sync_mutex};
- }
-
- [[nodiscard]] bool IsAsync() const {
- return is_async;
- }
+ [[nodiscard]] bool IsAsync() const;
- [[nodiscard]] bool UseNvdec() const {
- return use_nvdec;
- }
+ [[nodiscard]] bool UseNvdec() const;
void RendererFrameEndNotify();
- enum class FenceOperation : u32 {
- Acquire = 0,
- Increment = 1,
- };
-
- union FenceAction {
- u32 raw;
- BitField<0, 1, FenceOperation> op;
- BitField<8, 24, u32> syncpoint_id;
-
- [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
- FenceAction result{};
- result.op.Assign(op);
- result.syncpoint_id.Assign(syncpoint_id);
- return {result.raw};
- }
- };
-
- struct Regs {
- static constexpr size_t NUM_REGS = 0x40;
-
- union {
- struct {
- INSERT_PADDING_WORDS_NOINIT(0x4);
- struct {
- u32 address_high;
- u32 address_low;
-
- [[nodiscard]] GPUVAddr SemaphoreAddress() const {
- return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
- address_low);
- }
- } semaphore_address;
-
- u32 semaphore_sequence;
- u32 semaphore_trigger;
- INSERT_PADDING_WORDS_NOINIT(0xC);
-
- // The pusher and the puller share the reference counter, the pusher only has read
- // access
- u32 reference_count;
- INSERT_PADDING_WORDS_NOINIT(0x5);
-
- u32 semaphore_acquire;
- u32 semaphore_release;
- u32 fence_value;
- FenceAction fence_action;
- INSERT_PADDING_WORDS_NOINIT(0xE2);
-
- // Puller state
- u32 acquire_mode;
- u32 acquire_source;
- u32 acquire_active;
- u32 acquire_timeout;
- u32 acquire_value;
- };
- std::array<u32, NUM_REGS> reg_array;
- };
- } regs{};
-
/// Performs any additional setup necessary in order to begin GPU emulation.
/// This can be used to launch any necessary threads and register any necessary
/// core timing events.
@@ -338,104 +258,9 @@ public:
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
void FlushAndInvalidateRegion(VAddr addr, u64 size);
-protected:
- void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const;
-
private:
- void ProcessBindMethod(const MethodCall& method_call);
- void ProcessFenceActionMethod();
- void ProcessWaitForInterruptMethod();
- void ProcessSemaphoreTriggerMethod();
- void ProcessSemaphoreRelease();
- void ProcessSemaphoreAcquire();
-
- /// Calls a GPU puller method.
- void CallPullerMethod(const MethodCall& method_call);
-
- /// Calls a GPU engine method.
- void CallEngineMethod(const MethodCall& method_call);
-
- /// Calls a GPU engine multivalue method.
- void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount,
- u32 methods_pending);
-
- /// Determines where the method should be executed.
- [[nodiscard]] bool ExecuteMethodOnEngine(u32 method);
-
-protected:
- Core::System& system;
- std::unique_ptr<Tegra::MemoryManager> memory_manager;
- std::unique_ptr<Tegra::DmaPusher> dma_pusher;
- std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
- std::unique_ptr<VideoCore::RendererBase> renderer;
- VideoCore::RasterizerInterface* rasterizer = nullptr;
- const bool use_nvdec;
-
-private:
- /// Mapping of command subchannels to their bound engine ids
- std::array<EngineID, 8> bound_engines = {};
- /// 3D engine
- std::unique_ptr<Engines::Maxwell3D> maxwell_3d;
- /// 2D engine
- std::unique_ptr<Engines::Fermi2D> fermi_2d;
- /// Compute engine
- std::unique_ptr<Engines::KeplerCompute> kepler_compute;
- /// DMA engine
- std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
- /// Inline memory engine
- std::unique_ptr<Engines::KeplerMemory> kepler_memory;
- /// Shader build notifier
- std::unique_ptr<VideoCore::ShaderNotify> shader_notify;
- /// When true, we are about to shut down emulation session, so terminate outstanding tasks
- std::atomic_bool shutting_down{};
-
- std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{};
-
- std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts;
-
- std::mutex sync_mutex;
- std::mutex device_mutex;
-
- std::condition_variable sync_cv;
-
- struct FlushRequest {
- explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_)
- : fence{fence_}, addr{addr_}, size{size_} {}
- u64 fence;
- VAddr addr;
- std::size_t size;
- };
-
- std::list<FlushRequest> flush_requests;
- std::atomic<u64> current_flush_fence{};
- u64 last_flush_fence{};
- std::mutex flush_request_mutex;
-
- const bool is_async;
-
- VideoCommon::GPUThread::ThreadManager gpu_thread;
- std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
+ struct Impl;
+ std::unique_ptr<Impl> impl;
};
-#define ASSERT_REG_POSITION(field_name, position) \
- static_assert(offsetof(GPU::Regs, field_name) == position * 4, \
- "Field " #field_name " has invalid position")
-
-ASSERT_REG_POSITION(semaphore_address, 0x4);
-ASSERT_REG_POSITION(semaphore_sequence, 0x6);
-ASSERT_REG_POSITION(semaphore_trigger, 0x7);
-ASSERT_REG_POSITION(reference_count, 0x14);
-ASSERT_REG_POSITION(semaphore_acquire, 0x1A);
-ASSERT_REG_POSITION(semaphore_release, 0x1B);
-ASSERT_REG_POSITION(fence_value, 0x1C);
-ASSERT_REG_POSITION(fence_action, 0x1D);
-
-ASSERT_REG_POSITION(acquire_mode, 0x100);
-ASSERT_REG_POSITION(acquire_source, 0x101);
-ASSERT_REG_POSITION(acquire_active, 0x102);
-ASSERT_REG_POSITION(acquire_timeout, 0x103);
-ASSERT_REG_POSITION(acquire_value, 0x104);
-
-#undef ASSERT_REG_POSITION
-
} // namespace Tegra
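
The FenceAction union hoisted into the public interface above packs a 1-bit operation and a 24-bit syncpoint id into one 32-bit word via BitField<0, 1, FenceOperation> and BitField<8, 24, u32>. A sketch of the equivalent layout in plain shifts (hypothetical helpers, for illustration only):

    #include <cstdint>

    // op lives in bit 0; syncpoint_id lives in bits 8..31.
    constexpr std::uint32_t PackFenceAction(std::uint32_t op, std::uint32_t syncpoint_id) {
        return (op & 0x1u) | ((syncpoint_id & 0xFFFFFFu) << 8);
    }

    constexpr std::uint32_t FenceOp(std::uint32_t raw) {
        return raw & 0x1u;
    }

    constexpr std::uint32_t FenceSyncpoint(std::uint32_t raw) {
        return (raw >> 8) & 0xFFFFFFu;
    }

    static_assert(FenceOp(PackFenceAction(1, 42)) == 1);
    static_assert(FenceSyncpoint(PackFenceAction(1, 42)) == 42);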
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 91bada925..00984188e 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -130,9 +130,6 @@ public:
/// Notify rasterizer that any caches of the specified region should be flushed and invalidated
void FlushAndInvalidateRegion(VAddr addr, u64 size);
- // Stops the GPU execution and waits for the GPU to finish working
- void ShutDown();
-
void OnCommandListEnd();
private:
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index c9cff7450..20d748c12 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -6,7 +6,6 @@ set(SHADER_FILES
convert_float_to_depth.frag
full_screen_triangle.vert
opengl_copy_bc4.comp
- opengl_copy_bgra.comp
opengl_present.frag
opengl_present.vert
pitch_unswizzle.comp
diff --git a/src/video_core/host_shaders/opengl_copy_bgra.comp b/src/video_core/host_shaders/opengl_copy_bgra.comp
deleted file mode 100644
index 2571a4abf..000000000
--- a/src/video_core/host_shaders/opengl_copy_bgra.comp
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2021 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#version 430 core
-
-layout (local_size_x = 4, local_size_y = 4) in;
-
-layout(binding = 0, rgba8) readonly uniform image2DArray bgr_input;
-layout(binding = 1, rgba8) writeonly uniform image2DArray bgr_output;
-
-void main() {
- vec4 color = imageLoad(bgr_input, ivec3(gl_GlobalInvocationID));
- imageStore(bgr_output, ivec3(gl_GlobalInvocationID), color.bgra);
-}
diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h
index a19b9f931..392f82eb7 100644
--- a/src/video_core/query_cache.h
+++ b/src/video_core/query_cache.h
@@ -8,6 +8,7 @@
#include <array>
#include <cstring>
#include <iterator>
+#include <list>
#include <memory>
#include <mutex>
#include <optional>
diff --git a/src/video_core/rasterizer_accelerated.h b/src/video_core/rasterizer_accelerated.h
index ea879bfdd..249644e50 100644
--- a/src/video_core/rasterizer_accelerated.h
+++ b/src/video_core/rasterizer_accelerated.h
@@ -42,7 +42,7 @@ private:
};
static_assert(sizeof(CacheEntry) == 8, "CacheEntry should be 8 bytes!");
- std::array<CacheEntry, 0x1000000> cached_pages;
+ std::array<CacheEntry, 0x2000000> cached_pages;
Core::Memory::Memory& cpu_memory;
};
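
The cached_pages doubling above grows the counter array from 2^24 to 2^25 entries. Assuming the layout implied by the static_assert (8-byte entries) holds four 16-bit counters, one per 4 KiB page — an assumption, since the counter layout itself is not shown in this hunk — the new size is exactly enough to cover a 39-bit guest address space:

    #include <cstdint>

    // Hypothetical arithmetic check for the assumed layout described above.
    constexpr std::uint64_t kEntries = 0x2000000;   // new array size
    constexpr std::uint64_t kPagesPerEntry = 4;     // assumed: four u16 counters per 8-byte entry
    constexpr std::uint64_t kPageSize = 0x1000;     // assumed 4 KiB pages
    static_assert(kEntries * kPagesPerEntry * kPageSize == (std::uint64_t{1} << 39));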
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 07a995f7d..187a28e4d 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -147,8 +147,7 @@ void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
void BufferCacheRuntime::ClearBuffer(Buffer& dest_buffer, u32 offset, size_t size, u32 value) {
glClearNamedBufferSubData(dest_buffer.Handle(), GL_R32UI, static_cast<GLintptr>(offset),
- static_cast<GLsizeiptr>(size / sizeof(u32)), GL_RED, GL_UNSIGNED_INT,
- &value);
+ static_cast<GLsizeiptr>(size), GL_RED, GL_UNSIGNED_INT, &value);
}
void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) {
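
The ClearBuffer fix above corrects a unit mismatch: glClearNamedBufferSubData takes its range in bytes, so the old division by sizeof(u32) cleared only a quarter of the requested region. A minimal sketch of the corrected call, assuming a GL 4.5 loader is available and mirroring the format/type pair used above:

    // Clear `size` BYTES of `buffer` starting at `offset` with a repeated 32-bit `value`.
    // (Hypothetical free function; the diff does this inside BufferCacheRuntime.)
    void ClearRange(GLuint buffer, GLintptr offset, GLsizeiptr size, GLuint value) {
        glClearNamedBufferSubData(buffer, GL_R32UI, offset, size, GL_RED, GL_UNSIGNED_INT,
                                  &value);
    }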
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index b0aee6cc1..8c3ca3d82 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -20,6 +20,7 @@
#include "video_core/surface.h"
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
+#include "video_core/texture_cache/util.h"
namespace OpenGL {
namespace {
@@ -461,7 +462,7 @@ bool TextureCacheRuntime::CanImageBeCopied(const Image& dst, const Image& src) {
if (dst.info.type == ImageType::e3D && dst.info.format == PixelFormat::BC4_UNORM) {
return false;
}
- if (IsPixelFormatBGR(dst.info.format) || IsPixelFormatBGR(src.info.format)) {
+ if (IsPixelFormatBGR(dst.info.format) != IsPixelFormatBGR(src.info.format)) {
return false;
}
return true;
@@ -473,7 +474,7 @@ void TextureCacheRuntime::EmulateCopyImage(Image& dst, Image& src,
ASSERT(src.info.type == ImageType::e3D);
util_shaders.CopyBC4(dst, src, copies);
} else if (IsPixelFormatBGR(dst.info.format) || IsPixelFormatBGR(src.info.format)) {
- util_shaders.CopyBGR(dst, src, copies);
+ bgr_copy_pass.CopyBGR(dst, src, copies);
} else {
UNREACHABLE();
}
@@ -1112,4 +1113,37 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
framebuffer.handle = handle;
}
+void BGRCopyPass::CopyBGR(Image& dst_image, Image& src_image,
+ std::span<const VideoCommon::ImageCopy> copies) {
+ static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0};
+ const u32 requested_pbo_size =
+ std::max(src_image.unswizzled_size_bytes, dst_image.unswizzled_size_bytes);
+
+ if (bgr_pbo_size < requested_pbo_size) {
+ bgr_pbo.Create();
+ bgr_pbo_size = requested_pbo_size;
+ glNamedBufferData(bgr_pbo.handle, bgr_pbo_size, nullptr, GL_STREAM_COPY);
+ }
+ for (const ImageCopy& copy : copies) {
+ ASSERT(copy.src_offset == zero_offset);
+ ASSERT(copy.dst_offset == zero_offset);
+
+ // Copy from source to PBO
+ glPixelStorei(GL_PACK_ALIGNMENT, 1);
+ glPixelStorei(GL_PACK_ROW_LENGTH, copy.extent.width);
+ glBindBuffer(GL_PIXEL_PACK_BUFFER, bgr_pbo.handle);
+ glGetTextureSubImage(src_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
+ copy.src_subresource.num_layers, src_image.GlFormat(),
+ src_image.GlType(), static_cast<GLsizei>(bgr_pbo_size), nullptr);
+
+ // Copy from PBO to destination in desired GL format
+ glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+ glPixelStorei(GL_UNPACK_ROW_LENGTH, copy.extent.width);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bgr_pbo.handle);
+ glTextureSubImage3D(dst_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
+ copy.dst_subresource.num_layers, dst_image.GlFormat(),
+ dst_image.GlType(), nullptr);
+ }
+}
+
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 4a4f6301c..1ca2c90be 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -12,6 +12,7 @@
#include "shader_recompiler/shader_info.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/util_shaders.h"
+#include "video_core/texture_cache/image_view_base.h"
#include "video_core/texture_cache/texture_cache_base.h"
namespace OpenGL {
@@ -47,6 +48,19 @@ struct FormatProperties {
bool is_compressed;
};
+class BGRCopyPass {
+public:
+ BGRCopyPass() = default;
+ ~BGRCopyPass() = default;
+
+ void CopyBGR(Image& dst_image, Image& src_image,
+ std::span<const VideoCommon::ImageCopy> copies);
+
+private:
+ OGLBuffer bgr_pbo;
+ size_t bgr_pbo_size{};
+};
+
class TextureCacheRuntime {
friend Framebuffer;
friend Image;
@@ -118,6 +132,7 @@ private:
const Device& device;
StateTracker& state_tracker;
UtilShaders util_shaders;
+ BGRCopyPass bgr_copy_pass;
std::array<std::unordered_map<GLenum, FormatProperties>, 3> format_properties;
bool has_broken_texture_view_formats = false;
@@ -162,6 +177,14 @@ public:
return texture.handle;
}
+ GLuint GlFormat() const noexcept {
+ return gl_format;
+ }
+
+ GLuint GlType() const noexcept {
+ return gl_type;
+ }
+
private:
void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 672f94bfc..39158aa3e 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -52,7 +52,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB
{GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT}, // BC6H_UFLOAT
{GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT}, // BC6H_SFLOAT
{GL_COMPRESSED_RGBA_ASTC_4x4_KHR}, // ASTC_2D_4X4_UNORM
- {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM
+ {GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV}, // B8G8R8A8_UNORM
{GL_RGBA32F, GL_RGBA, GL_FLOAT}, // R32G32B32A32_FLOAT
{GL_RGBA32I, GL_RGBA_INTEGER, GL_INT}, // R32G32B32A32_SINT
{GL_RG32F, GL_RG, GL_FLOAT}, // R32G32_FLOAT
@@ -81,7 +81,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB
{GL_COMPRESSED_RGBA_ASTC_8x8_KHR}, // ASTC_2D_8X8_UNORM
{GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5_UNORM
{GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4_UNORM
- {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE}, // B8G8R8A8_SRGB
+ {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV}, // B8G8R8A8_SRGB
{GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // BC1_RGBA_SRGB
{GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // BC2_SRGB
{GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // BC3_SRGB
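
The two FORMAT_TABLE rows changed above give B8G8R8A8 a client format/type that lets the driver reorder components during pixel transfers instead of pretending the data is RGBA. As a sketch of what the tuple means at upload time (texture handle, extent, and pixel pointer are hypothetical):

    // On little-endian hosts, GL_BGRA + GL_UNSIGNED_INT_8_8_8_8_REV describes
    // memory laid out B,G,R,A per pixel, i.e. exactly B8G8R8A8 guest data.
    glTextureSubImage2D(texture, 0, 0, 0, width, height, GL_BGRA,
                        GL_UNSIGNED_INT_8_8_8_8_REV, bgra_pixels);

This is also what makes the new BGRCopyPass above work: each image reports its own GlFormat()/GlType(), so a pack/unpack round trip through a PBO performs the component reordering without a compute shader.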
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index 333f35a1c..897c380b3 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -14,7 +14,6 @@
#include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h"
#include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h"
#include "video_core/host_shaders/opengl_copy_bc4_comp.h"
-#include "video_core/host_shaders/opengl_copy_bgra_comp.h"
#include "video_core/host_shaders/pitch_unswizzle_comp.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_shader_util.h"
@@ -44,11 +43,6 @@ namespace {
OGLProgram MakeProgram(std::string_view source) {
return CreateProgram(source, GL_COMPUTE_SHADER);
}
-
-size_t NumPixelsInCopy(const VideoCommon::ImageCopy& copy) {
- return static_cast<size_t>(copy.extent.width * copy.extent.height *
- copy.src_subresource.num_layers);
-}
} // Anonymous namespace
UtilShaders::UtilShaders(ProgramManager& program_manager_)
@@ -56,7 +50,6 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
block_linear_unswizzle_2d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_2D_COMP)),
block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)),
pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
- copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)),
copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
swizzle_table_buffer.Create();
@@ -255,43 +248,6 @@ void UtilShaders::CopyBC4(Image& dst_image, Image& src_image, std::span<const Im
program_manager.RestoreGuestCompute();
}
-void UtilShaders::CopyBGR(Image& dst_image, Image& src_image,
- std::span<const VideoCommon::ImageCopy> copies) {
- static constexpr GLuint BINDING_INPUT_IMAGE = 0;
- static constexpr GLuint BINDING_OUTPUT_IMAGE = 1;
- static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0};
- const u32 bytes_per_block = BytesPerBlock(dst_image.info.format);
- switch (bytes_per_block) {
- case 2:
- // BGR565 copy
- for (const ImageCopy& copy : copies) {
- ASSERT(copy.src_offset == zero_offset);
- ASSERT(copy.dst_offset == zero_offset);
- bgr_copy_pass.Execute(dst_image, src_image, copy);
- }
- break;
- case 4: {
- // BGRA8 copy
- program_manager.BindComputeProgram(copy_bgra_program.handle);
- constexpr GLenum FORMAT = GL_RGBA8;
- for (const ImageCopy& copy : copies) {
- ASSERT(copy.src_offset == zero_offset);
- ASSERT(copy.dst_offset == zero_offset);
- glBindImageTexture(BINDING_INPUT_IMAGE, src_image.StorageHandle(),
- copy.src_subresource.base_level, GL_FALSE, 0, GL_READ_ONLY, FORMAT);
- glBindImageTexture(BINDING_OUTPUT_IMAGE, dst_image.StorageHandle(),
- copy.dst_subresource.base_level, GL_FALSE, 0, GL_WRITE_ONLY, FORMAT);
- glDispatchCompute(copy.extent.width, copy.extent.height, copy.extent.depth);
- }
- program_manager.RestoreGuestCompute();
- break;
- }
- default:
- UNREACHABLE();
- break;
- }
-}
-
GLenum StoreFormat(u32 bytes_per_block) {
switch (bytes_per_block) {
case 1:
@@ -309,36 +265,4 @@ GLenum StoreFormat(u32 bytes_per_block) {
return GL_R8UI;
}
-void Bgr565CopyPass::Execute(const Image& dst_image, const Image& src_image,
- const ImageCopy& copy) {
- if (CopyBufferCreationNeeded(copy)) {
- CreateNewCopyBuffer(copy, GL_TEXTURE_2D_ARRAY, GL_RGB565);
- }
- // Copy from source to PBO
- glPixelStorei(GL_PACK_ALIGNMENT, 1);
- glPixelStorei(GL_PACK_ROW_LENGTH, copy.extent.width);
- glBindBuffer(GL_PIXEL_PACK_BUFFER, bgr16_pbo.handle);
- glGetTextureSubImage(src_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
- copy.src_subresource.num_layers, GL_RGB, GL_UNSIGNED_SHORT_5_6_5,
- static_cast<GLsizei>(bgr16_pbo_size), nullptr);
-
- // Copy from PBO to destination in reverse order
- glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
- glPixelStorei(GL_UNPACK_ROW_LENGTH, copy.extent.width);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bgr16_pbo.handle);
- glTextureSubImage3D(dst_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height,
- copy.dst_subresource.num_layers, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV,
- nullptr);
-}
-
-bool Bgr565CopyPass::CopyBufferCreationNeeded(const ImageCopy& copy) {
- return bgr16_pbo_size < NumPixelsInCopy(copy) * sizeof(u16);
-}
-
-void Bgr565CopyPass::CreateNewCopyBuffer(const ImageCopy& copy, GLenum target, GLuint format) {
- bgr16_pbo.Create();
- bgr16_pbo_size = NumPixelsInCopy(copy) * sizeof(u16);
- glNamedBufferData(bgr16_pbo.handle, bgr16_pbo_size, nullptr, GL_STREAM_COPY);
-}
-
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
index ef881e35f..5de95ea7a 100644
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -19,22 +19,6 @@ class ProgramManager;
struct ImageBufferMap;
-class Bgr565CopyPass {
-public:
- Bgr565CopyPass() = default;
- ~Bgr565CopyPass() = default;
-
- void Execute(const Image& dst_image, const Image& src_image,
- const VideoCommon::ImageCopy& copy);
-
-private:
- [[nodiscard]] bool CopyBufferCreationNeeded(const VideoCommon::ImageCopy& copy);
- void CreateNewCopyBuffer(const VideoCommon::ImageCopy& copy, GLenum target, GLuint format);
-
- OGLBuffer bgr16_pbo;
- size_t bgr16_pbo_size{};
-};
-
class UtilShaders {
public:
explicit UtilShaders(ProgramManager& program_manager);
@@ -55,9 +39,6 @@ public:
void CopyBC4(Image& dst_image, Image& src_image,
std::span<const VideoCommon::ImageCopy> copies);
- void CopyBGR(Image& dst_image, Image& src_image,
- std::span<const VideoCommon::ImageCopy> copies);
-
private:
ProgramManager& program_manager;
@@ -67,10 +48,7 @@ private:
OGLProgram block_linear_unswizzle_2d_program;
OGLProgram block_linear_unswizzle_3d_program;
OGLProgram pitch_unswizzle_program;
- OGLProgram copy_bgra_program;
OGLProgram copy_bc4_program;
-
- Bgr565CopyPass bgr_copy_pass;
};
GLenum StoreFormat(u32 bytes_per_block);
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index adb6b7a3b..74822814d 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -97,19 +97,14 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_,
Core::Frontend::EmuWindow& emu_window,
Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
std::unique_ptr<Core::Frontend::GraphicsContext> context_) try
- : RendererBase(emu_window, std::move(context_)),
- telemetry_session(telemetry_session_),
- cpu_memory(cpu_memory_),
- gpu(gpu_),
- library(OpenLibrary()),
+ : RendererBase(emu_window, std::move(context_)), telemetry_session(telemetry_session_),
+ cpu_memory(cpu_memory_), gpu(gpu_), library(OpenLibrary()),
instance(CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type,
true, Settings::values.renderer_debug.GetValue())),
debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr),
surface(CreateSurface(instance, render_window)),
- device(CreateDevice(instance, dld, *surface)),
- memory_allocator(device, false),
- state_tracker(gpu),
- scheduler(device, state_tracker),
+ device(CreateDevice(instance, dld, *surface)), memory_allocator(device, false),
+ state_tracker(gpu), scheduler(device, state_tracker),
swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width,
render_window.GetFramebufferLayout().height, false),
blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, scheduler,
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 7c0f91007..8634c3316 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -507,8 +507,9 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
vertex_attributes.push_back({
.location = static_cast<u32>(index),
.binding = 0,
- .format = type == 1 ? VK_FORMAT_R32_SFLOAT
- : type == 2 ? VK_FORMAT_R32_SINT : VK_FORMAT_R32_UINT,
+ .format = type == 1 ? VK_FORMAT_R32_SFLOAT
+ : type == 2 ? VK_FORMAT_R32_SINT
+ : VK_FORMAT_R32_UINT,
.offset = 0,
});
}
@@ -567,12 +568,21 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
if (!vertex_binding_divisors.empty()) {
vertex_input_ci.pNext = &input_divisor_ci;
}
+ const bool has_tess_stages = spv_modules[1] || spv_modules[2];
auto input_assembly_topology = MaxwellToVK::PrimitiveTopology(device, key.state.topology);
if (input_assembly_topology == VK_PRIMITIVE_TOPOLOGY_PATCH_LIST) {
- if (!spv_modules[1] && !spv_modules[2]) {
+ if (!has_tess_stages) {
LOG_WARNING(Render_Vulkan, "Patch topology used without tessellation, using points");
input_assembly_topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
}
+ } else {
+ if (has_tess_stages) {
+                // The Vulkan spec requires that patch list IA topology be used with
+                // tessellation shader stages. Forcing it here fixes a crash on some drivers.
+ LOG_WARNING(Render_Vulkan,
+ "Patch topology not used with tessellation, using patch list");
+ input_assembly_topology = VK_PRIMITIVE_TOPOLOGY_PATCH_LIST;
+ }
}
const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{
.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
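
The else branch added above enforces the Vulkan requirement in both directions: patch list without tessellation stages is demoted to points, and tessellation stages without patch list are promoted. The decision reduces to a small pure function; a sketch with a hypothetical name:

    VkPrimitiveTopology FixupTopology(VkPrimitiveTopology topology, bool has_tess_stages) {
        const bool is_patch_list = topology == VK_PRIMITIVE_TOPOLOGY_PATCH_LIST;
        if (is_patch_list && !has_tess_stages) {
            return VK_PRIMITIVE_TOPOLOGY_POINT_LIST; // patch list is invalid without tessellation
        }
        if (!is_patch_list && has_tess_stages) {
            return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST; // tessellation requires patch list
        }
        return topology;
    }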
diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h
index 4f8688118..0886b7da8 100644
--- a/src/video_core/renderer_vulkan/vk_master_semaphore.h
+++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h
@@ -21,12 +21,12 @@ public:
/// Returns the current logical tick.
[[nodiscard]] u64 CurrentTick() const noexcept {
- return current_tick.load(std::memory_order_relaxed);
+ return current_tick.load(std::memory_order_acquire);
}
/// Returns the last known GPU tick.
[[nodiscard]] u64 KnownGpuTick() const noexcept {
- return gpu_tick.load(std::memory_order_relaxed);
+ return gpu_tick.load(std::memory_order_acquire);
}
/// Returns the timeline semaphore handle.
@@ -41,12 +41,21 @@ public:
/// Advance to the logical tick and return the old one
[[nodiscard]] u64 NextTick() noexcept {
- return current_tick.fetch_add(1, std::memory_order::relaxed);
+ return current_tick.fetch_add(1, std::memory_order_release);
}
/// Refresh the known GPU tick
void Refresh() {
- gpu_tick.store(semaphore.GetCounter(), std::memory_order_relaxed);
+ u64 this_tick{};
+ u64 counter{};
+ do {
+ this_tick = gpu_tick.load(std::memory_order_acquire);
+ counter = semaphore.GetCounter();
+ if (counter < this_tick) {
+ return;
+ }
+ } while (!gpu_tick.compare_exchange_weak(this_tick, counter, std::memory_order_release,
+ std::memory_order_relaxed));
}
/// Waits for a tick to be hit on the GPU
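
The rewritten Refresh() above turns a blind relaxed store into a monotonic update: the cached GPU tick only ever moves forward, and the compare-exchange loop resolves races between threads refreshing at the same time. The same idiom in isolation (hypothetical name, standard atomics only):

    #include <atomic>
    #include <cstdint>

    // Advance `published` to `candidate` only if it increases the value; lose
    // gracefully to any thread that already published something newer.
    void UpdateMonotonic(std::atomic<std::uint64_t>& published, std::uint64_t candidate) {
        std::uint64_t seen = published.load(std::memory_order_acquire);
        while (seen < candidate &&
               !published.compare_exchange_weak(seen, candidate, std::memory_order_release,
                                                std::memory_order_relaxed)) {
            // compare_exchange_weak reloads `seen` on failure; retry while we still win.
        }
    }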
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index bd22e4e83..85fc1712f 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -212,7 +212,6 @@ private:
vk::CommandBuffer current_cmdbuf;
std::unique_ptr<CommandChunk> chunk;
- std::jthread worker_thread;
State state;
@@ -226,6 +225,7 @@ private:
std::mutex work_mutex;
std::condition_variable_any work_cv;
std::condition_variable wait_cv;
+ std::jthread worker_thread;
};
} // namespace Vulkan
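
Moving worker_thread below the synchronization members is a destruction-order fix: C++ destroys non-static data members in reverse declaration order, so a std::jthread declared last is stop-requested and joined before the mutex and condition variables it uses are torn down. In miniature (hypothetical type):

    #include <condition_variable>
    #include <mutex>
    #include <thread>

    struct Scheduler {
        std::mutex work_mutex;            // destroyed last
        std::condition_variable_any work_cv;
        std::jthread worker_thread;       // destroyed first: ~jthread() requests stop
                                          // and joins while work_mutex/work_cv still exist
    };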
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index ff979a7ac..06c5fb867 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -21,6 +21,7 @@
#include "video_core/renderer_vulkan/vk_texture_cache.h"
#include "video_core/texture_cache/formatter.h"
#include "video_core/texture_cache/samples_helper.h"
+#include "video_core/texture_cache/util.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
@@ -127,7 +128,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format);
VkImageCreateFlags flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;
if (info.type == ImageType::e2D && info.resources.layers >= 6 &&
- info.size.width == info.size.height) {
+        info.size.width == info.size.height && !device.HasBrokenCubeImageCompatibility()) {
flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT;
}
if (info.type == ImageType::e3D) {
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 6d5a68bfe..b09c468e4 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -4,11 +4,11 @@
#pragma once
-#include <compare>
#include <span>
#include "shader_recompiler/shader_info.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
+#include "video_core/texture_cache/image_view_base.h"
#include "video_core/texture_cache/texture_cache_base.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
diff --git a/src/video_core/shader_environment.cpp b/src/video_core/shader_environment.cpp
index 81a878bb2..05850afd0 100644
--- a/src/video_core/shader_environment.cpp
+++ b/src/video_core/shader_environment.cpp
@@ -16,6 +16,7 @@
#include "common/fs/fs.h"
#include "common/logging/log.h"
#include "shader_recompiler/environment.h"
+#include "video_core/engines/kepler_compute.h"
#include "video_core/memory_manager.h"
#include "video_core/shader_environment.h"
#include "video_core/textures/texture.h"
diff --git a/src/video_core/shader_environment.h b/src/video_core/shader_environment.h
index 2079979db..6640e53d0 100644
--- a/src/video_core/shader_environment.h
+++ b/src/video_core/shader_environment.h
@@ -5,13 +5,13 @@
#pragma once
#include <array>
-#include <atomic>
#include <filesystem>
#include <iosfwd>
#include <limits>
#include <memory>
#include <optional>
#include <span>
+#include <stop_token>
#include <type_traits>
#include <unordered_map>
#include <vector>
@@ -19,9 +19,7 @@
#include "common/common_types.h"
#include "common/unique_function.h"
#include "shader_recompiler/environment.h"
-#include "video_core/engines/kepler_compute.h"
#include "video_core/engines/maxwell_3d.h"
-#include "video_core/textures/texture.h"
namespace Tegra {
class MemoryManager;
diff --git a/src/video_core/texture_cache/image_view_info.cpp b/src/video_core/texture_cache/image_view_info.cpp
index 6527e14c8..e751f26c7 100644
--- a/src/video_core/texture_cache/image_view_info.cpp
+++ b/src/video_core/texture_cache/image_view_info.cpp
@@ -8,6 +8,7 @@
#include "video_core/texture_cache/image_view_info.h"
#include "video_core/texture_cache/texture_cache_base.h"
#include "video_core/texture_cache/types.h"
+#include "video_core/texture_cache/util.h"
#include "video_core/textures/texture.h"
namespace VideoCommon {
diff --git a/src/video_core/texture_cache/slot_vector.h b/src/video_core/texture_cache/slot_vector.h
index 74cd3c9d8..50df06409 100644
--- a/src/video_core/texture_cache/slot_vector.h
+++ b/src/video_core/texture_cache/slot_vector.h
@@ -31,8 +31,8 @@ struct SlotId {
};
template <class T>
-requires std::is_nothrow_move_assignable_v<T>&&
- std::is_nothrow_move_constructible_v<T> class SlotVector {
+requires std::is_nothrow_move_assignable_v<T> && std::is_nothrow_move_constructible_v<T>
+class SlotVector {
public:
class Iterator {
friend SlotVector<T>;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 24b809242..329df2e49 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -4,10 +4,15 @@
#pragma once
+#include <unordered_set>
+
#include "common/alignment.h"
#include "video_core/dirty_flags.h"
+#include "video_core/engines/kepler_compute.h"
+#include "video_core/texture_cache/image_view_base.h"
#include "video_core/texture_cache/samples_helper.h"
#include "video_core/texture_cache/texture_cache_base.h"
+#include "video_core/texture_cache/util.h"
namespace VideoCommon {
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index d7528ed24..2d1893c1c 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -4,13 +4,12 @@
#pragma once
-#include <array>
#include <mutex>
#include <span>
#include <type_traits>
#include <unordered_map>
-#include <unordered_set>
#include <vector>
+#include <queue>
#include "common/common_types.h"
#include "common/literals.h"
@@ -18,10 +17,6 @@
#include "video_core/compatible_formats.h"
#include "video_core/delayed_destruction_ring.h"
#include "video_core/engines/fermi_2d.h"
-#include "video_core/engines/kepler_compute.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/memory_manager.h"
-#include "video_core/rasterizer_interface.h"
#include "video_core/surface.h"
#include "video_core/texture_cache/descriptor_table.h"
#include "video_core/texture_cache/image_base.h"
@@ -30,7 +25,6 @@
#include "video_core/texture_cache/render_targets.h"
#include "video_core/texture_cache/slot_vector.h"
#include "video_core/texture_cache/types.h"
-#include "video_core/texture_cache/util.h"
#include "video_core/textures/texture.h"
namespace VideoCommon {
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index c2ec9f76a..6388ed2eb 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -588,22 +588,27 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
ext_extended_dynamic_state = false;
}
}
-
sets_per_pool = 64;
- if (driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE) {
+
+ const bool is_amd =
+ driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE;
+ if (is_amd) {
// AMD drivers need a higher number of Sets per Pool in certain circumstances, such as in XC2.
sets_per_pool = 96;
- }
-
- const bool is_amd = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY ||
- driver_id == VK_DRIVER_ID_MESA_RADV ||
- driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE;
- if (ext_sampler_filter_minmax && is_amd) {
- // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken.
+ // Disable VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT on AMD GCN4 and lower as it is broken.
if (!is_float16_supported) {
LOG_WARNING(
Render_Vulkan,
- "Blacklisting AMD GCN4 and lower for VK_EXT_SAMPLER_FILTER_MINMAX_EXTENSION_NAME");
+ "AMD GCN4 and earlier do not properly support VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT");
+ has_broken_cube_compatibility = true;
+ }
+ }
+ const bool is_amd_or_radv = is_amd || driver_id == VK_DRIVER_ID_MESA_RADV;
+ if (ext_sampler_filter_minmax && is_amd_or_radv) {
+ // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken.
+ if (!is_float16_supported) {
+ LOG_WARNING(Render_Vulkan,
+ "Blacklisting AMD GCN4 and earlier for VK_EXT_sampler_filter_minmax");
ext_sampler_filter_minmax = false;
}
}
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index bc180a32a..d9e74f1aa 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -309,6 +309,11 @@ public:
return has_renderdoc || has_nsight_graphics;
}
+ /// Returns true when the device does not properly support cube compatibility.
+ bool HasBrokenCubeImageCompatibility() const {
+ return has_broken_cube_compatibility;
+ }
+
/// Returns the vendor name reported from Vulkan.
std::string_view GetVendorName() const {
return vendor_name;
@@ -417,6 +422,7 @@ private:
bool ext_conservative_rasterization{}; ///< Support for VK_EXT_conservative_rasterization.
bool ext_provoking_vertex{}; ///< Support for VK_EXT_provoking_vertex.
bool nv_device_diagnostics_config{}; ///< Support for VK_NV_device_diagnostics_config.
+ bool has_broken_cube_compatibility{}; ///< Has broken cube compatibility bit
bool has_renderdoc{}; ///< Has RenderDoc attached
bool has_nsight_graphics{}; ///< Has Nsight Graphics attached