Diffstat (limited to 'src/video_core')
73 files changed, 2147 insertions, 1052 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 114bed20d..1e010e4da 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -3,6 +3,8 @@ add_library(video_core STATIC dma_pusher.h debug_utils/debug_utils.cpp debug_utils/debug_utils.h + engines/engine_upload.cpp + engines/engine_upload.h engines/fermi_2d.cpp engines/fermi_2d.h engines/kepler_compute.cpp @@ -36,6 +38,8 @@ add_library(video_core STATIC renderer_base.h renderer_opengl/gl_buffer_cache.cpp renderer_opengl/gl_buffer_cache.h + renderer_opengl/gl_device.cpp + renderer_opengl/gl_device.h renderer_opengl/gl_global_cache.cpp renderer_opengl/gl_global_cache.h renderer_opengl/gl_primitive_assembler.cpp @@ -46,6 +50,8 @@ add_library(video_core STATIC renderer_opengl/gl_rasterizer_cache.h renderer_opengl/gl_resource_manager.cpp renderer_opengl/gl_resource_manager.h + renderer_opengl/gl_sampler_cache.cpp + renderer_opengl/gl_sampler_cache.h renderer_opengl/gl_shader_cache.cpp renderer_opengl/gl_shader_cache.h renderer_opengl/gl_shader_decompiler.cpp @@ -67,6 +73,8 @@ add_library(video_core STATIC renderer_opengl/renderer_opengl.h renderer_opengl/utils.cpp renderer_opengl/utils.h + sampler_cache.cpp + sampler_cache.h shader/decode/arithmetic.cpp shader/decode/arithmetic_immediate.cpp shader/decode/bfe.cpp diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 046d047cb..3175579cc 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -40,6 +40,13 @@ bool DmaPusher::Step() { } const CommandList& command_list{dma_pushbuffer.front()}; + ASSERT_OR_EXECUTE(!command_list.empty(), { + // Somehow the command_list is empty; to avoid a crash, + // we ignore it and assume its size is 0. + dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + return true; + }); const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]}; GPUVAddr dma_get = command_list_header.addr; GPUVAddr dma_put = dma_get + command_list_header.size * sizeof(u32); @@ -57,8 +64,8 @@ bool DmaPusher::Step() { // Push buffer non-empty, read a word command_headers.resize(command_list_header.size); - gpu.MemoryManager().ReadBlock(dma_get, command_headers.data(), - command_list_header.size * sizeof(u32)); + gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), + command_list_header.size * sizeof(u32)); for (const CommandHeader& command_header : command_headers) { @@ -105,6 +112,8 @@ bool DmaPusher::Step() { dma_state.non_incrementing = false; dma_increment_once = true; break; + default: + break; } } } diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp new file mode 100644 index 000000000..082a40cd9 --- /dev/null +++ b/src/video_core/engines/engine_upload.cpp @@ -0,0 +1,52 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
+ +#include <cstring> + +#include "common/assert.h" +#include "video_core/engines/engine_upload.h" +#include "video_core/memory_manager.h" +#include "video_core/textures/decoders.h" + +namespace Tegra::Engines::Upload { + +State::State(MemoryManager& memory_manager, Registers& regs) + : regs{regs}, memory_manager{memory_manager} {} + +State::~State() = default; + +void State::ProcessExec(const bool is_linear) { + write_offset = 0; + copy_size = regs.line_length_in * regs.line_count; + inner_buffer.resize(copy_size); + this->is_linear = is_linear; +} + +void State::ProcessData(const u32 data, const bool is_last_call) { + const u32 sub_copy_size = std::min(4U, copy_size - write_offset); + std::memcpy(&inner_buffer[write_offset], &data, sub_copy_size); + write_offset += sub_copy_size; + if (!is_last_call) { + return; + } + const GPUVAddr address{regs.dest.Address()}; + if (is_linear) { + memory_manager.WriteBlock(address, inner_buffer.data(), copy_size); + } else { + UNIMPLEMENTED_IF(regs.dest.z != 0); + UNIMPLEMENTED_IF(regs.dest.depth != 1); + UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1); + UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1); + const std::size_t dst_size = Tegra::Texture::CalculateSize( + true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1); + tmp_buffer.resize(dst_size); + memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size); + Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y, + regs.dest.BlockHeight(), copy_size, inner_buffer.data(), + tmp_buffer.data()); + memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size); + } +} + +} // namespace Tegra::Engines::Upload diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h new file mode 100644 index 000000000..ef4f5839a --- /dev/null +++ b/src/video_core/engines/engine_upload.h @@ -0,0 +1,73 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <vector> +#include "common/bit_field.h" +#include "common/common_types.h" + +namespace Tegra { +class MemoryManager; +} + +namespace Tegra::Engines::Upload { + +struct Registers { + u32 line_length_in; + u32 line_count; + + struct { + u32 address_high; + u32 address_low; + u32 pitch; + union { + BitField<0, 4, u32> block_width; + BitField<4, 4, u32> block_height; + BitField<8, 4, u32> block_depth; + }; + u32 width; + u32 height; + u32 depth; + u32 z; + u32 x; + u32 y; + + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); + } + + u32 BlockWidth() const { + return 1U << block_width.Value(); + } + + u32 BlockHeight() const { + return 1U << block_height.Value(); + } + + u32 BlockDepth() const { + return 1U << block_depth.Value(); + } + } dest; +}; + +class State { +public: + State(MemoryManager& memory_manager, Registers& regs); + ~State(); + + void ProcessExec(bool is_linear); + void ProcessData(u32 data, bool is_last_call); + +private: + u32 write_offset = 0; + u32 copy_size = 0; + std::vector<u8> inner_buffer; + std::vector<u8> tmp_buffer; + bool is_linear = false; + Registers& regs; + MemoryManager& memory_manager; +}; + +} // namespace Tegra::Engines::Upload diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 2e51b7f13..45f59a4d9 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -21,6 +21,12 @@ class RasterizerInterface; namespace Tegra::Engines { +/** + * This Engine is known as G80_2D. Documentation can be found in: + * https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml + * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h + */ + #define FERMI2D_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32)) diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index b1d950460..7404a8163 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -4,12 +4,21 @@ #include "common/assert.h" #include "common/logging/log.h" +#include "core/core.h" #include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/renderer_base.h" +#include "video_core/textures/decoders.h" namespace Tegra::Engines { -KeplerCompute::KeplerCompute(MemoryManager& memory_manager) : memory_manager{memory_manager} {} +KeplerCompute::KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + MemoryManager& memory_manager) + : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, upload_state{ + memory_manager, + regs.upload} {} KeplerCompute::~KeplerCompute() = default; @@ -20,14 +29,34 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { regs.reg_array[method_call.method] = method_call.argument; switch (method_call.method) { + case KEPLER_COMPUTE_REG_INDEX(exec_upload): { + upload_state.ProcessExec(regs.exec_upload.linear != 0); + break; + } + case KEPLER_COMPUTE_REG_INDEX(data_upload): { + const bool is_last_call = method_call.IsLastCall(); + upload_state.ProcessData(method_call.argument, is_last_call); + if (is_last_call) { + system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + } + break; + } case KEPLER_COMPUTE_REG_INDEX(launch): - // Abort execution since compute shaders 
can be used to alter game memory (e.g. CUDA - // kernels) - UNREACHABLE_MSG("Compute shaders are not implemented"); + ProcessLaunch(); break; default: break; } } +void KeplerCompute::ProcessLaunch() { + + const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address(); + memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description, + LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32)); + + const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start; + LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc); +} + } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index fb6cdf432..5250b8d9b 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -6,22 +6,40 @@ #include <array> #include <cstddef> +#include <vector> +#include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/engines/engine_upload.h" #include "video_core/gpu.h" +namespace Core { +class System; +} + namespace Tegra { class MemoryManager; } +namespace VideoCore { +class RasterizerInterface; +} + namespace Tegra::Engines { +/** + * This Engine is known as GK104_Compute. Documentation can be found in: + * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_compute.xml + * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h + */ + #define KEPLER_COMPUTE_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32)) class KeplerCompute final { public: - explicit KeplerCompute(MemoryManager& memory_manager); + explicit KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + MemoryManager& memory_manager); ~KeplerCompute(); static constexpr std::size_t NumConstBuffers = 8; @@ -31,30 +49,181 @@ public: union { struct { - INSERT_PADDING_WORDS(0xAF); + INSERT_PADDING_WORDS(0x60); + + Upload::Registers upload; + + struct { + union { + BitField<0, 1, u32> linear; + }; + } exec_upload; + + u32 data_upload; + + INSERT_PADDING_WORDS(0x3F); + + struct { + u32 address; + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address) << 8)); + } + } launch_desc_loc; + + INSERT_PADDING_WORDS(0x1); u32 launch; - INSERT_PADDING_WORDS(0xC48); + INSERT_PADDING_WORDS(0x4A7); + + struct { + u32 address_high; + u32 address_low; + u32 limit; + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } tsc; + + INSERT_PADDING_WORDS(0x3); + + struct { + u32 address_high; + u32 address_low; + u32 limit; + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } tic; + + INSERT_PADDING_WORDS(0x22); + + struct { + u32 address_high; + u32 address_low; + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } code_loc; + + INSERT_PADDING_WORDS(0x3FE); + + u32 texture_const_buffer_index; + + INSERT_PADDING_WORDS(0x374); }; std::array<u32, NUM_REGS> reg_array; }; } regs{}; + + struct LaunchParams { + static constexpr std::size_t NUM_LAUNCH_PARAMETERS = 0x40; + + INSERT_PADDING_WORDS(0x8); + + u32 program_start; + + INSERT_PADDING_WORDS(0x2); + + BitField<30, 1, u32> linked_tsc; + + BitField<0, 31, u32> grid_dim_x; + union { + BitField<0, 16, u32> grid_dim_y; + BitField<16, 
16, u32> grid_dim_z; + }; + + INSERT_PADDING_WORDS(0x3); + + BitField<0, 16, u32> shared_alloc; + + BitField<0, 31, u32> block_dim_x; + union { + BitField<0, 16, u32> block_dim_y; + BitField<16, 16, u32> block_dim_z; + }; + + union { + BitField<0, 8, u32> const_buffer_enable_mask; + BitField<29, 2, u32> cache_layout; + } memory_config; + + INSERT_PADDING_WORDS(0x8); + + struct { + u32 address_low; + union { + BitField<0, 8, u32> address_high; + BitField<15, 17, u32> size; + }; + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) | + address_low); + } + } const_buffer_config[8]; + + union { + BitField<0, 20, u32> local_pos_alloc; + BitField<27, 5, u32> barrier_alloc; + }; + + union { + BitField<0, 20, u32> local_neg_alloc; + BitField<24, 5, u32> gpr_alloc; + }; + + INSERT_PADDING_WORDS(0x11); + } launch_description; + + struct { + u32 write_offset = 0; + u32 copy_size = 0; + std::vector<u8> inner_buffer; + } state{}; + static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "KeplerCompute Regs has wrong size"); + static_assert(sizeof(LaunchParams) == LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32), + "KeplerCompute LaunchParams has wrong size"); + /// Write the value to the register identified by method. void CallMethod(const GPU::MethodCall& method_call); private: + Core::System& system; + VideoCore::RasterizerInterface& rasterizer; MemoryManager& memory_manager; + Upload::State upload_state; + + void ProcessLaunch(); }; #define ASSERT_REG_POSITION(field_name, position) \ static_assert(offsetof(KeplerCompute::Regs, field_name) == position * 4, \ "Field " #field_name " has invalid position") +#define ASSERT_LAUNCH_PARAM_POSITION(field_name, position) \ + static_assert(offsetof(KeplerCompute::LaunchParams, field_name) == position * 4, \ + "Field " #field_name " has invalid position") + +ASSERT_REG_POSITION(upload, 0x60); +ASSERT_REG_POSITION(exec_upload, 0x6C); +ASSERT_REG_POSITION(data_upload, 0x6D); ASSERT_REG_POSITION(launch, 0xAF); +ASSERT_REG_POSITION(tsc, 0x557); +ASSERT_REG_POSITION(tic, 0x55D); +ASSERT_REG_POSITION(code_loc, 0x582); +ASSERT_REG_POSITION(texture_const_buffer_index, 0x982); +ASSERT_LAUNCH_PARAM_POSITION(program_start, 0x8); +ASSERT_LAUNCH_PARAM_POSITION(grid_dim_x, 0xC); +ASSERT_LAUNCH_PARAM_POSITION(shared_alloc, 0x11); +ASSERT_LAUNCH_PARAM_POSITION(block_dim_x, 0x12); +ASSERT_LAUNCH_PARAM_POSITION(memory_config, 0x14); +ASSERT_LAUNCH_PARAM_POSITION(const_buffer_config, 0x1D); #undef ASSERT_REG_POSITION diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index cd51a31d7..0561f676c 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -10,12 +10,12 @@ #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_base.h" +#include "video_core/textures/decoders.h" namespace Tegra::Engines { -KeplerMemory::KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager) - : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {} +KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager) + : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {} KeplerMemory::~KeplerMemory() = default; @@ -27,30 +27,18 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { switch (method_call.method) { case KEPLERMEMORY_REG_INDEX(exec): { - 
state.write_offset = 0; + upload_state.ProcessExec(regs.exec.linear != 0); break; } case KEPLERMEMORY_REG_INDEX(data): { - ProcessData(method_call.argument); + const bool is_last_call = method_call.IsLastCall(); + upload_state.ProcessData(method_call.argument, is_last_call); + if (is_last_call) { + system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + } break; } } } -void KeplerMemory::ProcessData(u32 data) { - ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported"); - ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0); - - // We have to invalidate the destination region to evict any outdated surfaces from the cache. - // We do this before actually writing the new data because the destination address might - // contain a dirty surface that will have to be written back to memory. - const GPUVAddr address{regs.dest.Address() + state.write_offset * sizeof(u32)}; - rasterizer.InvalidateRegion(ToCacheAddr(memory_manager.GetPointer(address)), sizeof(u32)); - memory_manager.Write<u32>(address, data); - - system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); - - state.write_offset++; -} - } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index 78b6c3e45..f3bc675a9 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -6,9 +6,11 @@ #include <array> #include <cstddef> +#include <vector> #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/engines/engine_upload.h" #include "video_core/gpu.h" namespace Core { @@ -19,19 +21,20 @@ namespace Tegra { class MemoryManager; } -namespace VideoCore { -class RasterizerInterface; -} - namespace Tegra::Engines { +/** + * This Engine is known as P2MF. Documentation can be found in: + * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_p2mf.xml + * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h + */ + #define KEPLERMEMORY_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32)) class KeplerMemory final { public: - KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager); + KeplerMemory(Core::System& system, MemoryManager& memory_manager); ~KeplerMemory(); /// Write the value to the register identified by method. 
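The refactor running through this hunk (and the dma_pusher and kepler_compute changes above) is the shared upload state machine: P2MF, the Kepler compute engine, and Maxwell 3D now all forward their EXEC/DATA methods to Tegra::Engines::Upload::State, which buffers incoming 32-bit words and flushes them to GPU memory, linearly or swizzled, on the last call. A minimal, self-contained sketch of that accumulate-then-flush shape; the names here are simplified for illustration and are not the engine's exact interface:

```cpp
// Sketch of the Upload::State accumulate-then-flush pattern (simplified).
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

class UploadBuffer {
public:
    // EXEC: latch the total copy size and reset the write cursor.
    void Exec(std::uint32_t line_length_in, std::uint32_t line_count) {
        copy_size = line_length_in * line_count;
        buffer.resize(copy_size);
        offset = 0;
    }

    // DATA: append one 32-bit word; the final word may be partial.
    void Data(std::uint32_t word, bool is_last_call) {
        const std::uint32_t amount = std::min<std::uint32_t>(4, copy_size - offset);
        std::memcpy(buffer.data() + offset, &word, amount);
        offset += amount;
        if (is_last_call) {
            Flush(); // Write the staged bytes out, linear or swizzled.
        }
    }

private:
    void Flush() { /* WriteBlock for linear, or swizzle then WriteBlock */ }

    std::vector<std::uint8_t> buffer;
    std::uint32_t copy_size = 0;
    std::uint32_t offset = 0;
};
```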
@@ -44,26 +47,7 @@ public: struct { INSERT_PADDING_WORDS(0x60); - u32 line_length_in; - u32 line_count; - - struct { - u32 address_high; - u32 address_low; - u32 pitch; - u32 block_dimensions; - u32 width; - u32 height; - u32 depth; - u32 z; - u32 x; - u32 y; - - GPUVAddr Address() const { - return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | - address_low); - } - } dest; + Upload::Registers upload; struct { union { @@ -79,25 +63,17 @@ public: }; } regs{}; - struct { - u32 write_offset = 0; - } state{}; - private: Core::System& system; - VideoCore::RasterizerInterface& rasterizer; MemoryManager& memory_manager; - - void ProcessData(u32 data); + Upload::State upload_state; }; #define ASSERT_REG_POSITION(field_name, position) \ static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4, \ "Field " #field_name " has invalid position") -ASSERT_REG_POSITION(line_length_in, 0x60); -ASSERT_REG_POSITION(line_count, 0x61); -ASSERT_REG_POSITION(dest, 0x62); +ASSERT_REG_POSITION(upload, 0x60); ASSERT_REG_POSITION(exec, 0x6C); ASSERT_REG_POSITION(data, 0x6D); #undef ASSERT_REG_POSITION diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 74403eed4..39968d403 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -20,8 +20,8 @@ constexpr u32 MacroRegistersStart = 0xE00; Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) - : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, macro_interpreter{ - *this} { + : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, + macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { InitializeRegisterDefaults(); } @@ -34,9 +34,9 @@ void Maxwell3D::InitializeRegisterDefaults() { // Depth range near/far is not always set, but is expected to be the default 0.0f, 1.0f. This is // needed for ARMS. 
- for (std::size_t viewport{}; viewport < Regs::NumViewports; ++viewport) { - regs.viewports[viewport].depth_range_near = 0.0f; - regs.viewports[viewport].depth_range_far = 1.0f; + for (auto& viewport : regs.viewports) { + viewport.depth_range_near = 0.0f; + viewport.depth_range_far = 1.0f; } // Doom and Bomberman seems to use the uninitialized registers and just enable blend @@ -47,13 +47,13 @@ void Maxwell3D::InitializeRegisterDefaults() { regs.blend.equation_a = Regs::Blend::Equation::Add; regs.blend.factor_source_a = Regs::Blend::Factor::One; regs.blend.factor_dest_a = Regs::Blend::Factor::Zero; - for (std::size_t blend_index = 0; blend_index < Regs::NumRenderTargets; blend_index++) { - regs.independent_blend[blend_index].equation_rgb = Regs::Blend::Equation::Add; - regs.independent_blend[blend_index].factor_source_rgb = Regs::Blend::Factor::One; - regs.independent_blend[blend_index].factor_dest_rgb = Regs::Blend::Factor::Zero; - regs.independent_blend[blend_index].equation_a = Regs::Blend::Equation::Add; - regs.independent_blend[blend_index].factor_source_a = Regs::Blend::Factor::One; - regs.independent_blend[blend_index].factor_dest_a = Regs::Blend::Factor::Zero; + for (auto& blend : regs.independent_blend) { + blend.equation_rgb = Regs::Blend::Equation::Add; + blend.factor_source_rgb = Regs::Blend::Factor::One; + blend.factor_dest_rgb = Regs::Blend::Factor::Zero; + blend.equation_a = Regs::Blend::Equation::Add; + blend.factor_source_a = Regs::Blend::Factor::One; + blend.factor_dest_a = Regs::Blend::Factor::Zero; } regs.stencil_front_op_fail = Regs::StencilOp::Keep; regs.stencil_front_op_zfail = Regs::StencilOp::Keep; @@ -75,11 +75,11 @@ void Maxwell3D::InitializeRegisterDefaults() { // TODO(bunnei): Some games do not initialize the color masks (e.g. Sonic Mania). Assuming a // default of enabled fixes rendering here. - for (std::size_t color_mask = 0; color_mask < Regs::NumRenderTargets; color_mask++) { - regs.color_mask[color_mask].R.Assign(1); - regs.color_mask[color_mask].G.Assign(1); - regs.color_mask[color_mask].B.Assign(1); - regs.color_mask[color_mask].A.Assign(1); + for (auto& color_mask : regs.color_mask) { + color_mask.R.Assign(1); + color_mask.G.Assign(1); + color_mask.B.Assign(1); + color_mask.A.Assign(1); } // Commercial games seem to assume this value is enabled and nouveau sets this value manually. 
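A recurring idiom in the engine headers touched by this diff is the REG_INDEX macro family (MAXWELL3D_REG_INDEX, KEPLER_COMPUTE_REG_INDEX, KEPLERMEMORY_REG_INDEX): each register file is a union of named fields and a flat u32 array, so offsetof divided by sizeof(u32) yields the method index, and the ASSERT_REG_POSITION checks pin every field to its hardware offset. A compilable sketch of the idea; the field layout below is invented for illustration, not the real Maxwell register file:

```cpp
// Sketch of the REG_INDEX / ASSERT_REG_POSITION idiom used by the engines.
// Anonymous structs in unions are a compiler extension the codebase relies on.
#include <array>
#include <cstddef>
#include <cstdint>

struct Regs {
    static constexpr std::size_t NUM_REGS = 0x100; // illustrative size
    union {
        struct {
            std::uint32_t padding0[0x6C]; // registers before the upload block
            std::uint32_t exec_upload;
            std::uint32_t data_upload;
            std::uint32_t padding1[NUM_REGS - 0x6E];
        };
        std::array<std::uint32_t, NUM_REGS> reg_array;
    };
};

#define REG_INDEX(field_name) (offsetof(Regs, field_name) / sizeof(std::uint32_t))

// A field's method index is its byte offset divided by the word size.
static_assert(REG_INDEX(exec_upload) == 0x6C, "exec_upload has invalid position");
static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(std::uint32_t), "Regs has wrong size");
```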
@@ -178,13 +178,13 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { // Vertex buffer if (method >= MAXWELL3D_REG_INDEX(vertex_array) && - method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * 32) { + method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * Regs::NumVertexArrays) { dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2); } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) && - method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * 32) { + method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * Regs::NumVertexArrays) { dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1); } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) && - method < MAXWELL3D_REG_INDEX(instanced_arrays) + 32) { + method < MAXWELL3D_REG_INDEX(instanced_arrays) + Regs::NumVertexArrays) { dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays)); } } @@ -253,6 +253,18 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { ProcessSyncPoint(); break; } + case MAXWELL3D_REG_INDEX(exec_upload): { + upload_state.ProcessExec(regs.exec_upload.linear != 0); + break; + } + case MAXWELL3D_REG_INDEX(data_upload): { + const bool is_last_call = method_call.IsLastCall(); + upload_state.ProcessData(method_call.argument, is_last_call); + if (is_last_call) { + dirty_flags.OnMemoryWrite(); + } + break; + } default: break; } @@ -418,7 +430,7 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)}; Texture::TICEntry tic_entry; - memory_manager.ReadBlock(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); + memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear || tic_entry.header_version == Texture::TICHeaderVersion::Pitch, @@ -430,7 +442,7 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { const auto a_type = tic_entry.a_type.Value(); // TODO(Subv): Different data types for separate components are not supported - ASSERT(r_type == g_type && r_type == b_type && r_type == a_type); + DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type); return tic_entry; } @@ -439,7 +451,7 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const { const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)}; Texture::TSCEntry tsc_entry; - memory_manager.ReadBlock(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry)); + memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry)); return tsc_entry; } @@ -482,19 +494,8 @@ std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderSt return textures; } -Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage, - std::size_t offset) const { - auto& shader = state.shader_stages[static_cast<std::size_t>(stage)]; - auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index]; - ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0); - - const GPUVAddr tex_info_address = - tex_info_buffer.address + offset * sizeof(Texture::TextureHandle); - - ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size); - - const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; - +Texture::FullTextureInfo Maxwell3D::GetTextureInfo(const Texture::TextureHandle tex_handle, + std::size_t offset) const { 
Texture::FullTextureInfo tex_info{}; tex_info.index = static_cast<u32>(offset); @@ -511,6 +512,22 @@ Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage, return tex_info; } +Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage, + std::size_t offset) const { + const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)]; + const auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index]; + ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0); + + const GPUVAddr tex_info_address = + tex_info_buffer.address + offset * sizeof(Texture::TextureHandle); + + ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size); + + const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; + + return GetTextureInfo(tex_handle, offset); +} + u32 Maxwell3D::GetRegisterValue(u32 method) const { ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register"); return regs.reg_array[method]; @@ -524,4 +541,12 @@ void Maxwell3D::ProcessClearBuffers() { rasterizer.Clear(); } +u32 Maxwell3D::AccessConstBuffer32(Regs::ShaderStage stage, u64 const_buffer, u64 offset) const { + const auto& shader_stage = state.shader_stages[static_cast<std::size_t>(stage)]; + const auto& buffer = shader_stage.const_buffers[const_buffer]; + u32 result; + std::memcpy(&result, memory_manager.GetPointer(buffer.address + offset), sizeof(u32)); + return result; +} + } // namespace Tegra::Engines diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 321af3297..48e4fec33 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -6,6 +6,7 @@ #include <array> #include <bitset> +#include <type_traits> #include <unordered_map> #include <vector> @@ -14,6 +15,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/math_util.h" +#include "video_core/engines/engine_upload.h" #include "video_core/gpu.h" #include "video_core/macro_interpreter.h" #include "video_core/textures/texture.h" @@ -32,6 +34,12 @@ class RasterizerInterface; namespace Tegra::Engines { +/** + * This Engine is known as GF100_3D. Documentation can be found in: + * https://github.com/envytools/envytools/blob/master/rnndb/graph/gf100_3d.xml + * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h + */ + #define MAXWELL3D_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::Maxwell3D::Regs, field_name) / sizeof(u32)) @@ -243,9 +251,10 @@ public: return "10_10_10_2"; case Size::Size_11_11_10: return "11_11_10"; + default: + UNREACHABLE(); + return {}; } - UNREACHABLE(); - return {}; } std::string TypeString() const { @@ -579,7 +588,18 @@ public: u32 bind; } macros; - INSERT_PADDING_WORDS(0x69); + INSERT_PADDING_WORDS(0x17); + + Upload::Registers upload; + struct { + union { + BitField<0, 1, u32> linear; + }; + } exec_upload; + + u32 data_upload; + + INSERT_PADDING_WORDS(0x44); struct { union { @@ -1088,6 +1108,7 @@ public: } regs{}; static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "Maxwell3D Regs has wrong size"); + static_assert(std::is_trivially_copyable_v<Regs>, "Maxwell3D Regs must be trivially copyable"); struct State { struct ConstBufferInfo { @@ -1131,12 +1152,18 @@ public: /// Write the value to the register identified by method. void CallMethod(const GPU::MethodCall& method_call); + /// Given a Texture Handle, returns the TSC and TIC entries. 
+ Texture::FullTextureInfo GetTextureInfo(const Texture::TextureHandle tex_handle, + std::size_t offset) const; + /// Returns a list of enabled textures for the specified shader stage. std::vector<Texture::FullTextureInfo> GetStageTextures(Regs::ShaderStage stage) const; /// Returns the texture information for a specific texture in a specific shader stage. Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, std::size_t offset) const; + u32 AccessConstBuffer32(Regs::ShaderStage stage, u64 const_buffer, u64 offset) const; + /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than /// we've seen used. using MacroMemory = std::array<u32, 0x40000>; @@ -1169,6 +1196,8 @@ private: /// Interpreter for the macro codes uploaded to the GPU. MacroInterpreter macro_interpreter; + Upload::State upload_state; + /// Retrieves information about a specific TIC entry from the TIC buffer. Texture::TICEntry GetTICEntry(u32 tic_index) const; @@ -1212,6 +1241,9 @@ private: "Field " #field_name " has invalid position") ASSERT_REG_POSITION(macros, 0x45); +ASSERT_REG_POSITION(upload, 0x60); +ASSERT_REG_POSITION(exec_upload, 0x6C); +ASSERT_REG_POSITION(data_upload, 0x6D); ASSERT_REG_POSITION(sync_info, 0xB2); ASSERT_REG_POSITION(tfb_enabled, 0x1D1); ASSERT_REG_POSITION(rt, 0x200); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 2426d0067..3a5dfef0c 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -83,57 +83,66 @@ void MaxwellDMA::HandleCopy() { ASSERT(regs.exec.enable_2d == 1); - const std::size_t copy_size = regs.x_count * regs.y_count; + if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { + ASSERT(regs.src_params.size_z == 1); + // If the input is tiled and the output is linear, deswizzle the input and copy it over. + const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x; + const std::size_t src_size = Texture::CalculateSize( + true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, + regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); - auto source_ptr{memory_manager.GetPointer(source)}; - auto dst_ptr{memory_manager.GetPointer(dest)}; + const std::size_t dst_size = regs.dst_pitch * regs.y_count; - if (!source_ptr) { - LOG_ERROR(HW_GPU, "source_ptr is invalid"); - return; - } + if (read_buffer.size() < src_size) { + read_buffer.resize(src_size); + } - if (!dst_ptr) { - LOG_ERROR(HW_GPU, "dst_ptr is invalid"); - return; - } + if (write_buffer.size() < dst_size) { + write_buffer.resize(dst_size); + } - const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) { - // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated - // copying. - rasterizer.FlushRegion(ToCacheAddr(source_ptr), src_size); + memory_manager.ReadBlock(source, read_buffer.data(), src_size); + memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); - // We have to invalidate the destination region to evict any outdated surfaces from the - // cache. We do this before actually writing the new data because the destination address - // might contain a dirty surface that will have to be written back to memory. 
- rasterizer.InvalidateRegion(ToCacheAddr(dst_ptr), dst_size); - }; + Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch, + regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(), + write_buffer.data(), regs.src_params.BlockHeight(), + regs.src_params.pos_x, regs.src_params.pos_y); - if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { - ASSERT(regs.src_params.size_z == 1); - // If the input is tiled and the output is linear, deswizzle the input and copy it over. + memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); + } else { + ASSERT(regs.dst_params.BlockDepth() == 1); - const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x; + const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count; - FlushAndInvalidate(regs.src_pitch * regs.src_params.size_y, - copy_size * src_bytes_per_pixel); + const std::size_t dst_size = Texture::CalculateSize( + true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, + regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); - Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch, - regs.src_params.size_x, src_bytes_per_pixel, source_ptr, dst_ptr, - regs.src_params.BlockHeight(), regs.src_params.pos_x, - regs.src_params.pos_y); - } else { - ASSERT(regs.dst_params.size_z == 1); - ASSERT(regs.src_pitch == regs.x_count); + const std::size_t dst_layer_size = Texture::CalculateSize( + true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1, + regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); - const u32 src_bpp = regs.src_pitch / regs.x_count; + const std::size_t src_size = regs.src_pitch * regs.y_count; - FlushAndInvalidate(regs.src_pitch * regs.y_count, - regs.dst_params.size_x * regs.dst_params.size_y * src_bpp); + if (read_buffer.size() < src_size) { + read_buffer.resize(src_size); + } + + if (write_buffer.size() < dst_size) { + write_buffer.resize(dst_size); + } + + memory_manager.ReadBlock(source, read_buffer.data(), src_size); + memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); // If the input is linear and the output is tiled, swizzle the input and copy it over. Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, - src_bpp, dst_ptr, source_ptr, regs.dst_params.BlockHeight()); + src_bytes_per_pixel, + write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, + read_buffer.data(), regs.dst_params.BlockHeight()); + + memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); } } diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index c6b649842..e5942f671 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -6,6 +6,7 @@ #include <array> #include <cstddef> +#include <vector> #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" @@ -25,6 +26,11 @@ class RasterizerInterface; namespace Tegra::Engines { +/** + * This Engine is known as GK104_Copy. 
Documentation can be found in: + * https://github.com/envytools/envytools/blob/master/rnndb/fifo/gk104_copy.xml + */ + class MaxwellDMA final { public: explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer, @@ -63,6 +69,16 @@ public: static_assert(sizeof(Parameters) == 24, "Parameters has wrong size"); + enum class ComponentMode : u32 { + Src0 = 0, + Src1 = 1, + Src2 = 2, + Src3 = 3, + Const0 = 4, + Const1 = 5, + Zero = 6, + }; + enum class CopyMode : u32 { None = 0, Unk1 = 1, @@ -128,7 +144,26 @@ public: u32 x_count; u32 y_count; - INSERT_PADDING_WORDS(0xBB); + INSERT_PADDING_WORDS(0xB8); + + u32 const0; + u32 const1; + union { + BitField<0, 4, ComponentMode> component0; + BitField<4, 4, ComponentMode> component1; + BitField<8, 4, ComponentMode> component2; + BitField<12, 4, ComponentMode> component3; + BitField<16, 2, u32> component_size; + BitField<20, 3, u32> src_num_components; + BitField<24, 3, u32> dst_num_components; + + u32 SrcBytePerPixel() const { + return src_num_components.Value() * component_size.Value(); + } + u32 DstBytePerPixel() const { + return dst_num_components.Value() * component_size.Value(); + } + } swizzle_config; Parameters dst_params; @@ -149,6 +184,9 @@ private: MemoryManager& memory_manager; + std::vector<u8> read_buffer; + std::vector<u8> write_buffer; + /// Performs the copy from the source buffer to the destination buffer as configured in the /// registers. void HandleCopy(); @@ -165,6 +203,9 @@ ASSERT_REG_POSITION(src_pitch, 0x104); ASSERT_REG_POSITION(dst_pitch, 0x105); ASSERT_REG_POSITION(x_count, 0x106); ASSERT_REG_POSITION(y_count, 0x107); +ASSERT_REG_POSITION(const0, 0x1C0); +ASSERT_REG_POSITION(const1, 0x1C1); +ASSERT_REG_POSITION(swizzle_config, 0x1C2); ASSERT_REG_POSITION(dst_params, 0x1C3); ASSERT_REG_POSITION(src_params, 0x1CA); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 2e1e96c81..e5b4eadea 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -387,6 +387,20 @@ enum class IpaSampleMode : u64 { Offset = 2, }; +enum class LmemLoadCacheManagement : u64 { + Default = 0, + LU = 1, + CI = 2, + CV = 3, +}; + +enum class LmemStoreCacheManagement : u64 { + Default = 0, + CG = 1, + CS = 2, + WT = 3, +}; + struct IpaMode { IpaInterpMode interpolation_mode; IpaSampleMode sampling_mode; @@ -782,7 +796,7 @@ union Instruction { } ld_l; union { - BitField<44, 2, u64> unknown; + BitField<44, 2, LmemStoreCacheManagement> cache_management; } st_l; union { @@ -792,6 +806,12 @@ union Instruction { } ldg; union { + BitField<48, 3, UniformType> type; + BitField<46, 2, u64> cache_mode; + BitField<20, 24, s64> immediate_offset; + } stg; + + union { BitField<0, 3, u64> pred0; BitField<3, 3, u64> pred3; BitField<7, 1, u64> abs_a; @@ -917,21 +937,34 @@ union Instruction { } iset; union { - BitField<8, 2, Register::Size> dest_size; - BitField<10, 2, Register::Size> src_size; - BitField<12, 1, u64> is_output_signed; - BitField<13, 1, u64> is_input_signed; - BitField<41, 2, u64> selector; + BitField<41, 2, u64> selector; // i2i and i2f only BitField<45, 1, u64> negate_a; BitField<49, 1, u64> abs_a; + BitField<10, 2, Register::Size> src_size; + BitField<13, 1, u64> is_input_signed; + BitField<8, 2, Register::Size> dst_size; + BitField<12, 1, u64> is_output_signed; + + union { + BitField<39, 2, u64> tab5cb8_2; + } i2f; union { BitField<39, 2, F2iRoundingOp> rounding; } f2i; union { - BitField<39, 4, F2fRoundingOp> rounding; + BitField<8, 2, 
Register::Size> src_size; + BitField<10, 2, Register::Size> dst_size; + BitField<39, 4, u64> rounding; + // H0, H1 extract for F16 missing + BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value + F2fRoundingOp GetRoundingMode() const { + constexpr u64 rounding_mask = 0x0B; + return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask); + } } f2f; + } conversion; union { @@ -967,6 +1000,38 @@ union Instruction { } tex; union { + BitField<28, 1, u64> array; + BitField<29, 2, TextureType> texture_type; + BitField<31, 4, u64> component_mask; + BitField<49, 1, u64> nodep_flag; + BitField<50, 1, u64> dc_flag; + BitField<36, 1, u64> aoffi_flag; + BitField<37, 3, TextureProcessMode> process_mode; + + bool IsComponentEnabled(std::size_t component) const { + return ((1ULL << component) & component_mask) != 0; + } + + TextureProcessMode GetTextureProcessMode() const { + return process_mode; + } + + bool UsesMiscMode(TextureMiscMode mode) const { + switch (mode) { + case TextureMiscMode::DC: + return dc_flag != 0; + case TextureMiscMode::NODEP: + return nodep_flag != 0; + case TextureMiscMode::AOFFI: + return aoffi_flag != 0; + default: + break; + } + return false; + } + } tex_b; + + union { BitField<22, 6, TextureQueryType> query_type; BitField<31, 4, u64> component_mask; BitField<49, 1, u64> nodep_flag; @@ -1312,7 +1377,9 @@ public: LDG, // Load from global memory STG, // Store in global memory TEX, + TEX_B, // Texture Load Bindless TXQ, // Texture Query + TXQ_B, // Texture Query Bindless TEXS, // Texture Fetch with scalar/non-vec4 source/destinations TLDS, // Texture Load with scalar/non-vec4 source/destinations TLD4, // Texture Load 4 @@ -1580,7 +1647,9 @@ private: INST("1110111011010---", Id::LDG, Type::Memory, "LDG"), INST("1110111011011---", Id::STG, Type::Memory, "STG"), INST("110000----111---", Id::TEX, Type::Texture, "TEX"), + INST("1101111010111---", Id::TEX_B, Type::Texture, "TEX_B"), INST("1101111101001---", Id::TXQ, Type::Texture, "TXQ"), + INST("1101111101010---", Id::TXQ_B, Type::Texture, "TXQ_B"), INST("1101-00---------", Id::TEXS, Type::Texture, "TEXS"), INST("1101101---------", Id::TLDS, Type::Texture, "TLDS"), INST("110010----111---", Id::TLD4, Type::Texture, "TLD4"), @@ -1678,7 +1747,7 @@ private: INST("0011100-00101---", Id::SHR_IMM, Type::Shift, "SHR_IMM"), INST("0100110011100---", Id::I2I_C, Type::Conversion, "I2I_C"), INST("0101110011100---", Id::I2I_R, Type::Conversion, "I2I_R"), - INST("01110001-1000---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"), + INST("0011101-11100---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"), INST("0100110010111---", Id::I2F_C, Type::Conversion, "I2F_C"), INST("0101110010111---", Id::I2F_R, Type::Conversion, "I2F_R"), INST("0011100-10111---", Id::I2F_IMM, Type::Conversion, "I2F_IMM"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 4461083ff..52706505b 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -35,9 +35,9 @@ GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{ren dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager); fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager); - kepler_compute = std::make_unique<Engines::KeplerCompute>(*memory_manager); + kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager); maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager); 
- kepler_memory = std::make_unique<Engines::KeplerMemory>(system, rasterizer, *memory_manager); + kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager); } GPU::~GPU() = default; diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index de30ea354..fe6628923 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -207,6 +207,11 @@ public: }; } regs{}; + /// Performs any additional setup necessary in order to begin GPU emulation. + /// This can be used to launch any necessary threads and register any necessary + /// core timing events. + virtual void Start() = 0; + /// Push GPU command entries to be processed virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp index db507cf04..d4e2553a9 100644 --- a/src/video_core/gpu_asynch.cpp +++ b/src/video_core/gpu_asynch.cpp @@ -9,10 +9,14 @@ namespace VideoCommon { GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer) - : Tegra::GPU(system, renderer), gpu_thread{system, renderer, *dma_pusher} {} + : GPU(system, renderer), gpu_thread{system} {} GPUAsynch::~GPUAsynch() = default; +void GPUAsynch::Start() { + gpu_thread.StartThread(renderer, *dma_pusher); +} + void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) { gpu_thread.SubmitList(std::move(entries)); } diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h index 1dcc61a6c..30be74cba 100644 --- a/src/video_core/gpu_asynch.h +++ b/src/video_core/gpu_asynch.h @@ -13,16 +13,13 @@ class RendererBase; namespace VideoCommon { -namespace GPUThread { -class ThreadManager; -} // namespace GPUThread - /// Implementation of GPU interface that runs the GPU asynchronously class GPUAsynch : public Tegra::GPU { public: explicit GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer); ~GPUAsynch() override; + void Start() override; void PushGPUEntries(Tegra::CommandList&& entries) override; void SwapBuffers( std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp index 2cfc900ed..45e43b1dc 100644 --- a/src/video_core/gpu_synch.cpp +++ b/src/video_core/gpu_synch.cpp @@ -8,10 +8,12 @@ namespace VideoCommon { GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer) - : Tegra::GPU(system, renderer) {} + : GPU(system, renderer) {} GPUSynch::~GPUSynch() = default; +void GPUSynch::Start() {} + void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) { dma_pusher->Push(std::move(entries)); dma_pusher->DispatchCalls(); diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h index 766b5631c..3031fcf72 100644 --- a/src/video_core/gpu_synch.h +++ b/src/video_core/gpu_synch.h @@ -18,6 +18,7 @@ public: explicit GPUSynch(Core::System& system, VideoCore::RendererBase& renderer); ~GPUSynch() override; + void Start() override; void PushGPUEntries(Tegra::CommandList&& entries) override; void SwapBuffers( std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index cc56cf467..1e2ff46b0 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -44,7 +44,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p renderer.Rasterizer().FlushRegion(data->addr, data->size); } else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) { 
renderer.Rasterizer().InvalidateRegion(data->addr, data->size); - } else if (const auto data = std::get_if<EndProcessingCommand>(&next.data)) { + } else if (std::holds_alternative<EndProcessingCommand>(next.data)) { return; } else { UNREACHABLE(); @@ -55,19 +55,24 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p } } -ThreadManager::ThreadManager(Core::System& system, VideoCore::RendererBase& renderer, - Tegra::DmaPusher& dma_pusher) - : system{system}, thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)} { - synchronization_event = system.CoreTiming().RegisterEvent( - "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); }); -} +ThreadManager::ThreadManager(Core::System& system) : system{system} {} ThreadManager::~ThreadManager() { + if (!thread.joinable()) { + return; + } + // Notify GPU thread that a shutdown is pending PushCommand(EndProcessingCommand()); thread.join(); } +void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher) { + thread = std::thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)}; + synchronization_event = system.CoreTiming().RegisterEvent( + "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); }); +} + void ThreadManager::SubmitList(Tegra::CommandList&& entries) { const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))}; const s64 synchronization_ticks{Core::Timing::usToCycles(9000)}; @@ -113,7 +118,7 @@ void SynchState::WaitForSynchronization(u64 fence) { // Wait for the GPU to be idle (all commands to be executed) { MICROPROFILE_SCOPE(GPU_wait); - std::unique_lock<std::mutex> lock{synchronization_mutex}; + std::unique_lock lock{synchronization_mutex}; synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; }); } } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 62bcea5bb..05a168a72 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -81,12 +81,6 @@ struct CommandDataContainer { CommandDataContainer(CommandData&& data, u64 next_fence) : data{std::move(data)}, fence{next_fence} {} - CommandDataContainer& operator=(const CommandDataContainer& t) { - data = std::move(t.data); - fence = t.fence; - return *this; - } - CommandData data; u64 fence{}; }; @@ -109,7 +103,7 @@ struct SynchState final { void TrySynchronize() { if (IsSynchronized()) { - std::lock_guard<std::mutex> lock{synchronization_mutex}; + std::lock_guard lock{synchronization_mutex}; synchronization_condition.notify_one(); } } @@ -138,10 +132,12 @@ struct SynchState final { /// Class used to manage the GPU thread class ThreadManager final { public: - explicit ThreadManager(Core::System& system, VideoCore::RendererBase& renderer, - Tegra::DmaPusher& dma_pusher); + explicit ThreadManager(Core::System& system); ~ThreadManager(); + /// Creates and starts the GPU thread. 
+ void StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher); + /// Push GPU command entries to be processed void SubmitList(Tegra::CommandList&& entries); diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp index 524d9ea5a..fbea107ca 100644 --- a/src/video_core/macro_interpreter.cpp +++ b/src/video_core/macro_interpreter.cpp @@ -118,10 +118,10 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) { static_cast<u32>(opcode.operation.Value())); } - if (opcode.is_exit) { + // An instruction with the Exit flag will not actually + // cause an exit if it's executed inside a delay slot. + if (opcode.is_exit && !is_delay_slot) { // Exit has a delay slot, execute the next instruction - // Note: Executing an exit during a branch delay slot will cause the instruction at the - // branch target to be executed before exiting. Step(offset, true); return false; } diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 0f4e820aa..5d8d126c1 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -25,6 +25,8 @@ MemoryManager::MemoryManager(VideoCore::RasterizerInterface& rasterizer) : raste UpdatePageTableForVMA(initial_vma); } +MemoryManager::~MemoryManager() = default; + GPUVAddr MemoryManager::AllocateSpace(u64 size, u64 align) { const u64 aligned_size{Common::AlignUp(size, page_size)}; const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)}; @@ -199,7 +201,15 @@ const u8* MemoryManager::GetPointer(GPUVAddr addr) const { return {}; } -void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const { +bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t size) const { + const GPUVAddr end = start + size; + const auto host_ptr_start = reinterpret_cast<std::uintptr_t>(GetPointer(start)); + const auto host_ptr_end = reinterpret_cast<std::uintptr_t>(GetPointer(end)); + const auto range = static_cast<std::size_t>(host_ptr_end - host_ptr_start); + return range == size; +} + +void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const { std::size_t remaining_size{size}; std::size_t page_index{src_addr >> page_bits}; std::size_t page_offset{src_addr & page_mask}; @@ -226,7 +236,30 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t } } -void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size) { +void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, + const std::size_t size) const { + std::size_t remaining_size{size}; + std::size_t page_index{src_addr >> page_bits}; + std::size_t page_offset{src_addr & page_mask}; + + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; + const u8* page_pointer = page_table.pointers[page_index]; + if (page_pointer) { + const u8* src_ptr{page_pointer + page_offset}; + std::memcpy(dest_buffer, src_ptr, copy_amount); + } else { + std::memset(dest_buffer, 0, copy_amount); + } + page_index++; + page_offset = 0; + dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount; + remaining_size -= copy_amount; + } +} + +void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) { std::size_t remaining_size{size}; std::size_t page_index{dest_addr >> page_bits}; std::size_t page_offset{dest_addr & page_mask}; @@ -253,7 +286,28 @@ void 
MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std:: } } -void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size) { +void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, + const std::size_t size) { + std::size_t remaining_size{size}; + std::size_t page_index{dest_addr >> page_bits}; + std::size_t page_offset{dest_addr & page_mask}; + + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; + u8* page_pointer = page_table.pointers[page_index]; + if (page_pointer) { + u8* dest_ptr{page_pointer + page_offset}; + std::memcpy(dest_ptr, src_buffer, copy_amount); + } + page_index++; + page_offset = 0; + src_buffer = static_cast<const u8*>(src_buffer) + copy_amount; + remaining_size -= copy_amount; + } +} + +void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { std::size_t remaining_size{size}; std::size_t page_index{src_addr >> page_bits}; std::size_t page_offset{src_addr & page_mask}; @@ -281,6 +335,12 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t } } +void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { + std::vector<u8> tmp_buffer(size); + ReadBlockUnsafe(src_addr, tmp_buffer.data(), size); + WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size); +} + void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type, VAddr backing_addr) { LOG_DEBUG(HW_GPU, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * page_size, diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 647cbf93a..113f9d8f3 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -47,7 +47,8 @@ struct VirtualMemoryArea { class MemoryManager final { public: - MemoryManager(VideoCore::RasterizerInterface& rasterizer); + explicit MemoryManager(VideoCore::RasterizerInterface& rasterizer); + ~MemoryManager(); GPUVAddr AllocateSpace(u64 size, u64 align); GPUVAddr AllocateSpace(GPUVAddr addr, u64 size, u64 align); @@ -65,10 +66,33 @@ public: u8* GetPointer(GPUVAddr addr); const u8* GetPointer(GPUVAddr addr) const; + /// Returns true if the block is contiguous in host memory, false otherwise + bool IsBlockContinuous(GPUVAddr start, std::size_t size) const; + + /** + * ReadBlock and WriteBlock are full read and write operations over virtual + * GPU Memory. It's important to use these when GPU memory may not be contiguous + * in its host memory counterpart. Note: these functions cause host GPU memory + * flushes and invalidations, respectively, for each operation. + */ void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); + /** + * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and + * WriteBlock respectively. In these versions, no flushing or invalidation is actually + * done, and their performance is similar to a memcpy. These functions can be used + * in either of these two scenarios instead of their safe counterparts: + * - Memory which is guaranteed never to be represented in the host GPU. + * - Memory managed by a cache manager. For example, texture flushing should use + * WriteBlockUnsafe instead of WriteBlock, since it shouldn't invalidate the texture + * being flushed.
+ */ + void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; + void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); + void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); + private: using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>; using VMAHandle = VMAMap::const_iterator; @@ -88,10 +112,10 @@ private: /** * Maps an unmanaged host memory pointer at a given address. * - * @param target The guest address to start the mapping at. - * @param memory The memory to be mapped. - * @param size Size of the mapping. - * @param state MemoryState tag to attach to the VMA. + * @param target The guest address to start the mapping at. + * @param memory The memory to be mapped. + * @param size Size of the mapping in bytes. + * @param backing_addr The base address of the range to back this mapping. */ VMAHandle MapBackingMemory(GPUVAddr target, u8* memory, u64 size, VAddr backing_addr); @@ -101,7 +125,7 @@ private: /// Converts a VMAHandle to a mutable VMAIter. VMAIter StripIterConstness(const VMAHandle& iter); - /// Marks as the specfied VMA as allocated. + /// Marks the specified VMA as allocated. VMAIter Allocate(VMAIter vma); /** diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h index 291772186..0c4ea1494 100644 --- a/src/video_core/rasterizer_cache.h +++ b/src/video_core/rasterizer_cache.h @@ -37,9 +37,6 @@ public: /// Gets the size of the shader in guest memory, required for cache management virtual std::size_t GetSizeInBytes() const = 0; - /// Wriets any cached resources back to memory - virtual void Flush() = 0; - /// Sets whether the cached object should be considered registered void SetIsRegistered(bool registered) { is_registered = registered; @@ -147,8 +144,9 @@ protected: object->SetIsRegistered(false); rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1); + const CacheAddr addr = object->GetCacheAddr(); interval_cache.subtract({GetInterval(object), ObjectSet{object}}); - map_cache.erase(object->GetCacheAddr()); + map_cache.erase(addr); } /// Returns a ticks counter used for tracking when cached objects were last modified @@ -158,6 +156,8 @@ protected: return ++modified_ticks; } + virtual void FlushObjectInner(const T& object) = 0; + /// Flushes the specified object, updating appropriate cache state as needed void FlushObject(const T& object) { std::lock_guard lock{mutex}; @@ -165,7 +165,7 @@ protected: if (!object->IsDirty()) { return; } - object->Flush(); + FlushObjectInner(object); object->MarkAsModified(false, *this); } diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index fc33aa433..f9247a40e 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -42,9 +42,6 @@ public: return alignment; } - // We do not have to flush this cache as things in it are never modified by us. - void Flush() override {} - private: VAddr cpu_addr{}; std::size_t size{}; @@ -75,6 +72,9 @@ public: protected: void AlignBuffer(std::size_t alignment); + // We do not have to flush this cache as things in it are never modified by us.
+ void FlushObjectInner(const std::shared_ptr<CachedBufferEntry>& object) override {} + private: OGLStreamBuffer stream_buffer; diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp new file mode 100644 index 000000000..b6d9e0ddb --- /dev/null +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -0,0 +1,45 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstddef> +#include <glad/glad.h> + +#include "common/logging/log.h" +#include "video_core/renderer_opengl/gl_device.h" + +namespace OpenGL { + +namespace { +template <typename T> +T GetInteger(GLenum pname) { + GLint temporary; + glGetIntegerv(pname, &temporary); + return static_cast<T>(temporary); +} +} // Anonymous namespace + +Device::Device() { + uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); + has_variable_aoffi = TestVariableAoffi(); +} + +bool Device::TestVariableAoffi() { + const GLchar* AOFFI_TEST = R"(#version 430 core +uniform sampler2D tex; +uniform ivec2 variable_offset; +void main() { + gl_Position = textureOffset(tex, vec2(0), variable_offset); +} +)"; + const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &AOFFI_TEST)}; + GLint link_status{}; + glGetProgramiv(shader, GL_LINK_STATUS, &link_status); + glDeleteProgram(shader); + + const bool supported{link_status == GL_TRUE}; + LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", supported); + return supported; +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h new file mode 100644 index 000000000..78ff5ee58 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_device.h @@ -0,0 +1,30 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
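+
+// A usage sketch (illustrative; the real consumers are RasterizerOpenGL and the
+// GLSL decompiler, as seen further below in this change):
+//
+//   const Device device;  // queries driver limits once at construction
+//   const std::size_t align{device.GetUniformBufferAlignment()};
+//   if (device.HasVariableAoffi()) { /* variable-offset texture fetches allowed */ }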
+ +#pragma once + +#include <cstddef> + +namespace OpenGL { + +class Device { +public: + Device(); + + std::size_t GetUniformBufferAlignment() const { + return uniform_buffer_alignment; + } + + bool HasVariableAoffi() const { + return has_variable_aoffi; + } + +private: + static bool TestVariableAoffi(); + + std::size_t uniform_buffer_alignment{}; + bool has_variable_aoffi{}; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp index 8d9ee81f1..ea4a593af 100644 --- a/src/video_core/renderer_opengl/gl_global_cache.cpp +++ b/src/video_core/renderer_opengl/gl_global_cache.cpp @@ -14,28 +14,28 @@ namespace OpenGL { -CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr) - : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size} { +CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size) + : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size}, + max_size{max_size} { buffer.Create(); - // Bind and unbind the buffer so it gets allocated by the driver - glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory"); } -void CachedGlobalRegion::Reload(u32 size_) { - constexpr auto max_size = static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize); +CachedGlobalRegion::~CachedGlobalRegion() = default; +void CachedGlobalRegion::Reload(u32 size_) { size = size_; if (size > max_size) { size = max_size; - LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the expected size {}!", size_, + LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_, max_size); } + glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW); +} - // TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer - glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle); - glBufferData(GL_SHADER_STORAGE_BUFFER, size, GetHostPtr(), GL_DYNAMIC_DRAW); +void CachedGlobalRegion::Flush() { + LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:016X}", size, cpu_addr); + glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr); } GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const { @@ -46,14 +46,16 @@ GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, return search->second; } -GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u32 size, - u8* host_ptr) { +GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, + u32 size) { GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)}; if (!region) { // No reserved surface available, create a new one and reserve it auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()}; - const auto cpu_addr = *memory_manager.GpuToCpuAddress(addr); - region = std::make_shared<CachedGlobalRegion>(cpu_addr, size, host_ptr); + const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)}; + ASSERT(cpu_addr); + + region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size); ReserveGlobalRegion(region); } region->Reload(size); @@ -65,7 +67,11 @@ void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) { } GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer) - : RasterizerCache{rasterizer} {} + :
RasterizerCache{rasterizer} { + GLint max_ssbo_size_; + glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_); + max_ssbo_size = static_cast<u32>(max_ssbo_size_); +} GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( const GLShader::GlobalMemoryEntry& global_region, @@ -73,7 +79,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( auto& gpu{Core::System::GetInstance().GPU()}; auto& memory_manager{gpu.MemoryManager()}; - const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<u64>(stage)]}; + const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]}; const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address + global_region.GetCbufOffset()}; const auto actual_addr{memory_manager.Read<u64>(addr)}; @@ -85,7 +91,7 @@ GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion( if (!region) { // No global region found - create a new one - region = GetUncachedGlobalRegion(actual_addr, size, host_ptr); + region = GetUncachedGlobalRegion(actual_addr, host_ptr, size); Register(region); } diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h index 5a21ab66f..2d467a240 100644 --- a/src/video_core/renderer_opengl/gl_global_cache.h +++ b/src/video_core/renderer_opengl/gl_global_cache.h @@ -19,7 +19,7 @@ namespace OpenGL { namespace GLShader { class GlobalMemoryEntry; -} // namespace GLShader +} class RasterizerOpenGL; class CachedGlobalRegion; @@ -27,7 +27,8 @@ using GlobalRegion = std::shared_ptr<CachedGlobalRegion>; class CachedGlobalRegion final : public RasterizerCacheObject { public: - explicit CachedGlobalRegion(VAddr cpu_addr, u32 size, u8* host_ptr); + explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size); + ~CachedGlobalRegion(); VAddr GetCpuAddr() const override { return cpu_addr; @@ -45,14 +46,14 @@ public: /// Reloads the global region from guest memory void Reload(u32 size_); - // TODO(Rodrigo): When global memory is written (STG), implement flushing - void Flush() override { - UNIMPLEMENTED(); - } + void Flush(); private: VAddr cpu_addr{}; + u8* host_ptr{}; u32 size{}; + u32 max_size{}; + OGLBuffer buffer; }; @@ -64,12 +65,18 @@ public: GlobalRegion GetGlobalRegion(const GLShader::GlobalMemoryEntry& descriptor, Tegra::Engines::Maxwell3D::Regs::ShaderStage stage); +protected: + void FlushObjectInner(const GlobalRegion& object) override { + object->Flush(); + } + private: GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const; - GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u32 size, u8* host_ptr); + GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size); void ReserveGlobalRegion(GlobalRegion region); std::unordered_map<CacheAddr, GlobalRegion> reserve; + u32 max_ssbo_size{}; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index d250d5cbb..dbd8049f5 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -99,22 +99,14 @@ struct FramebufferCacheKey { }; RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info) - : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, system{system}, + : res_cache{*this}, shader_cache{*this, system, device}, global_cache{*this}, system{system}, screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) { - // Create sampler objects - for (std::size_t i = 0; i < 
texture_samplers.size(); ++i) { - texture_samplers[i].Create(); - state.texture_units[i].sampler = texture_samplers[i].sampler.handle; - } - OpenGLState::ApplyDefaultState(); shader_program_manager = std::make_unique<GLShader::ProgramManager>(); state.draw.shader_program = 0; state.Apply(); - glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment); - LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here"); CheckExtensions(); } @@ -269,8 +261,8 @@ DrawParameters RasterizerOpenGL::SetupDraw() { // MakeQuadArray always generates u32 indexes params.index_format = GL_UNSIGNED_INT; params.count = (regs.vertex_buffer.count / 4) * 6; - params.index_buffer_offset = - primitive_assembler.MakeQuadArray(regs.vertex_buffer.first, params.count); + params.index_buffer_offset = primitive_assembler.MakeQuadArray( + regs.vertex_buffer.first, regs.vertex_buffer.count); } return params; } @@ -313,6 +305,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { case Maxwell::ShaderProgram::Geometry: shader_program_manager->UseTrivialGeometryShader(); break; + default: + break; } continue; } @@ -321,8 +315,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { GLShader::MaxwellUniformData ubo{}; ubo.SetFromRegs(gpu, stage); - const GLintptr offset = buffer_cache.UploadHostMemory( - &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment)); + const GLintptr offset = + buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); // Bind the emulation info buffer bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, @@ -582,9 +576,6 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers( } void RasterizerOpenGL::Clear() { - const auto prev_state{state}; - SCOPE_EXIT({ prev_state.Apply(); }); - const auto& regs = system.GPU().Maxwell3D().regs; bool use_color{}; bool use_depth{}; @@ -656,7 +647,10 @@ void RasterizerOpenGL::Clear() { clear_state.EmulateViewportWithScissor(); } - clear_state.Apply(); + clear_state.ApplyColorMask(); + clear_state.ApplyDepth(); + clear_state.ApplyStencilTest(); + clear_state.ApplyViewport(); if (use_color) { glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); @@ -706,23 +700,24 @@ void RasterizerOpenGL::DrawArrays() { // Add space for index buffer (keeping in mind non-core primitives) switch (regs.draw.topology) { case Maxwell::PrimitiveTopology::Quads: - buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + + buffer_size = Common::AlignUp(buffer_size, 4) + primitive_assembler.CalculateQuadSize(regs.vertex_buffer.count); break; default: if (is_indexed) { - buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + CalculateIndexBufferSize(); + buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize(); } break; } // Uniform space for the 5 shader stages - buffer_size = - Common::AlignUp<std::size_t>(buffer_size, 4) + - (sizeof(GLShader::MaxwellUniformData) + uniform_buffer_alignment) * Maxwell::MaxShaderStage; + buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + + (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * + Maxwell::MaxShaderStage; // Add space for at least 18 constant buffers - buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment); + buffer_size += + Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment()); const bool invalidate = buffer_cache.Map(buffer_size); if (invalidate) { @@ -756,6 +751,7 @@ void 
RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) { return; } res_cache.FlushRegion(addr, size); + global_cache.FlushRegion(addr, size); } void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) { @@ -812,92 +808,6 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, return true; } -void RasterizerOpenGL::SamplerInfo::Create() { - sampler.Create(); - mag_filter = Tegra::Texture::TextureFilter::Linear; - min_filter = Tegra::Texture::TextureFilter::Linear; - wrap_u = Tegra::Texture::WrapMode::Wrap; - wrap_v = Tegra::Texture::WrapMode::Wrap; - wrap_p = Tegra::Texture::WrapMode::Wrap; - use_depth_compare = false; - depth_compare_func = Tegra::Texture::DepthCompareFunc::Never; - - // OpenGL's default is GL_LINEAR_MIPMAP_LINEAR - glSamplerParameteri(sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glSamplerParameteri(sampler.handle, GL_TEXTURE_COMPARE_FUNC, GL_NEVER); - - // Other attributes have correct defaults -} - -void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntry& config) { - const GLuint sampler_id = sampler.handle; - if (mag_filter != config.mag_filter) { - mag_filter = config.mag_filter; - glSamplerParameteri( - sampler_id, GL_TEXTURE_MAG_FILTER, - MaxwellToGL::TextureFilterMode(mag_filter, Tegra::Texture::TextureMipmapFilter::None)); - } - if (min_filter != config.min_filter || mipmap_filter != config.mipmap_filter) { - min_filter = config.min_filter; - mipmap_filter = config.mipmap_filter; - glSamplerParameteri(sampler_id, GL_TEXTURE_MIN_FILTER, - MaxwellToGL::TextureFilterMode(min_filter, mipmap_filter)); - } - - if (wrap_u != config.wrap_u) { - wrap_u = config.wrap_u; - glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_S, MaxwellToGL::WrapMode(wrap_u)); - } - if (wrap_v != config.wrap_v) { - wrap_v = config.wrap_v; - glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(wrap_v)); - } - if (wrap_p != config.wrap_p) { - wrap_p = config.wrap_p; - glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(wrap_p)); - } - - if (const bool enabled = config.depth_compare_enabled == 1; use_depth_compare != enabled) { - use_depth_compare = enabled; - glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_MODE, - use_depth_compare ? 
GL_COMPARE_REF_TO_TEXTURE : GL_NONE); - } - - if (depth_compare_func != config.depth_compare_func) { - depth_compare_func = config.depth_compare_func; - glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_FUNC, - MaxwellToGL::DepthCompareFunc(depth_compare_func)); - } - - if (const auto new_border_color = config.GetBorderColor(); border_color != new_border_color) { - border_color = new_border_color; - glSamplerParameterfv(sampler_id, GL_TEXTURE_BORDER_COLOR, border_color.data()); - } - - if (const float anisotropic = config.GetMaxAnisotropy(); max_anisotropic != anisotropic) { - max_anisotropic = anisotropic; - if (GLAD_GL_ARB_texture_filter_anisotropic) { - glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY, max_anisotropic); - } else if (GLAD_GL_EXT_texture_filter_anisotropic) { - glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY_EXT, max_anisotropic); - } - } - - if (const float min = config.GetMinLod(); min_lod != min) { - min_lod = min; - glSamplerParameterf(sampler_id, GL_TEXTURE_MIN_LOD, min_lod); - } - if (const float max = config.GetMaxLod(); max_lod != max) { - max_lod = max; - glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_LOD, max_lod); - } - - if (const float bias = config.GetLodBias(); lod_bias != bias) { - lod_bias = bias; - glSamplerParameterf(sampler_id, GL_TEXTURE_LOD_BIAS, lod_bias); - } -} - void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader, GLuint program_handle, BaseBindings base_bindings) { @@ -939,8 +849,8 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader size = Common::AlignUp(size, sizeof(GLvec4)); ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big"); - const GLintptr const_buffer_offset = buffer_cache.UploadMemory( - buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment)); + const GLintptr const_buffer_offset = + buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment()); bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size); } @@ -953,6 +863,9 @@ void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::Shade for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { const auto& entry{entries[bindpoint]}; const auto& region{global_cache.GetGlobalRegion(entry, stage)}; + if (entry.IsWritten()) { + region->MarkAsModified(true, global_cache); + } bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0, static_cast<GLsizeiptr>(region->GetSizeInBytes())); } @@ -970,10 +883,18 @@ void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& s for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) { const auto& entry = entries[bindpoint]; - const auto texture = maxwell3d.GetStageTexture(stage, entry.GetOffset()); + Tegra::Texture::FullTextureInfo texture; + if (entry.IsBindless()) { + const auto cbuf = entry.GetBindlessCBuf(); + Tegra::Texture::TextureHandle tex_handle; + tex_handle.raw = maxwell3d.AccessConstBuffer32(stage, cbuf.first, cbuf.second); + texture = maxwell3d.GetTextureInfo(tex_handle, entry.GetOffset()); + } else { + texture = maxwell3d.GetStageTexture(stage, entry.GetOffset()); + } const u32 current_bindpoint = base_bindings.sampler + bindpoint; - texture_samplers[current_bindpoint].SyncWithConfig(texture.tsc); + state.texture_units[current_bindpoint].sampler = sampler_cache.GetSampler(texture.tsc); if (Surface surface = res_cache.GetTextureSurface(texture, entry); surface) { 
state.texture_units[current_bindpoint].texture = @@ -1001,8 +922,8 @@ void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) { viewport.y = viewport_rect.bottom; viewport.width = viewport_rect.GetWidth(); viewport.height = viewport_rect.GetHeight(); - viewport.depth_range_far = regs.viewports[i].depth_range_far; - viewport.depth_range_near = regs.viewports[i].depth_range_near; + viewport.depth_range_far = src.depth_range_far; + viewport.depth_range_near = src.depth_range_near; } state.depth_clamp.far_plane = regs.view_volume_clip_control.depth_clamp_far != 0; state.depth_clamp.near_plane = regs.view_volume_clip_control.depth_clamp_near != 0; @@ -1214,7 +1135,9 @@ void RasterizerOpenGL::SyncTransformFeedback() { void RasterizerOpenGL::SyncPointState() { const auto& regs = system.GPU().Maxwell3D().regs; - state.point.size = regs.point_size; + // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid + // in OpenGL). + state.point.size = std::max(1.0f, regs.point_size); } void RasterizerOpenGL::SyncPolygonOffset() { diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index e4c64ae71..71b9c5ead 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -21,10 +21,12 @@ #include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_global_cache.h" #include "video_core/renderer_opengl/gl_primitive_assembler.h" #include "video_core/renderer_opengl/gl_rasterizer_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_sampler_cache.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state.h" @@ -71,39 +73,7 @@ public: static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0, "The maximum size of a constbuffer must be a multiple of the size of GLvec4"); - static constexpr std::size_t MaxGlobalMemorySize = 0x10000; - static_assert(MaxGlobalMemorySize % sizeof(float) == 0, - "The maximum size of a global memory must be a multiple of the size of float"); - private: - class SamplerInfo { - public: - OGLSampler sampler; - - /// Creates the sampler object, initializing its state so that it's in sync with the - /// SamplerInfo struct. - void Create(); - /// Syncs the sampler object with the config, updating any necessary state. 
- void SyncWithConfig(const Tegra::Texture::TSCEntry& info); - - private: - Tegra::Texture::TextureFilter mag_filter = Tegra::Texture::TextureFilter::Nearest; - Tegra::Texture::TextureFilter min_filter = Tegra::Texture::TextureFilter::Nearest; - Tegra::Texture::TextureMipmapFilter mipmap_filter = - Tegra::Texture::TextureMipmapFilter::None; - Tegra::Texture::WrapMode wrap_u = Tegra::Texture::WrapMode::ClampToEdge; - Tegra::Texture::WrapMode wrap_v = Tegra::Texture::WrapMode::ClampToEdge; - Tegra::Texture::WrapMode wrap_p = Tegra::Texture::WrapMode::ClampToEdge; - bool use_depth_compare = false; - Tegra::Texture::DepthCompareFunc depth_compare_func = - Tegra::Texture::DepthCompareFunc::Always; - GLvec4 border_color = {}; - float min_lod = 0.0f; - float max_lod = 16.0f; - float lod_bias = 0.0f; - float max_anisotropic = 1.0f; - }; - struct FramebufferConfigState { bool using_color_fb{}; bool using_depth_fb{}; @@ -203,14 +173,15 @@ private: /// but are needed for correct emulation void CheckExtensions(); + const Device device; OpenGLState state; RasterizerCacheOpenGL res_cache; ShaderCacheOpenGL shader_cache; GlobalRegionCacheOpenGL global_cache; + SamplerCacheOpenGL sampler_cache; Core::System& system; - ScreenInfo& screen_info; std::unique_ptr<GLShader::ProgramManager> shader_program_manager; @@ -223,12 +194,9 @@ private: FramebufferConfigState current_framebuffer_config_state; std::pair<bool, bool> current_depth_stencil_usage{}; - std::array<SamplerInfo, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> texture_samplers; - static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; OGLBufferCache buffer_cache; PrimitiveAssembler primitive_assembler{buffer_cache}; - GLint uniform_buffer_alignment; BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index f2ffc4710..a7681902e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -281,10 +281,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool force_gl, bool layer_only, params.component_type = ComponentTypeFromRenderTarget(config.format); params.type = GetFormatType(params.pixel_format); params.width = config.width; - if (!params.is_tiled) { - const u32 bpp = params.GetFormatBpp() / 8; - params.pitch = config.width * bpp; - } + params.pitch = config.pitch; params.height = config.height; params.unaligned_height = config.height; params.target = SurfaceTarget::Texture2D; @@ -631,9 +628,11 @@ CachedSurface::CachedSurface(const SurfaceParams& params) } MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 192, 64)); -void CachedSurface::LoadGLBuffer() { +void CachedSurface::LoadGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem) { MICROPROFILE_SCOPE(OpenGL_SurfaceLoad); - gl_buffer.resize(params.max_mip_level); + auto& gl_buffer = res_cache_tmp_mem.gl_buffer; + if (gl_buffer.size() < params.max_mip_level) + gl_buffer.resize(params.max_mip_level); for (u32 i = 0; i < params.max_mip_level; i++) gl_buffer[i].resize(params.GetMipmapSizeGL(i)); if (params.is_tiled) { @@ -643,13 +642,16 @@ void CachedSurface::LoadGLBuffer() { SwizzleFunc(MortonSwizzleMode::MortonToLinear, params, gl_buffer[i], i); } else { const u32 bpp = params.GetFormatBpp() / 8; - const u32 copy_size = params.width * bpp; + const u32 copy_size = 
(params.width * bpp + GetDefaultBlockWidth(params.pixel_format) - 1) / + GetDefaultBlockWidth(params.pixel_format); if (params.pitch == copy_size) { std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl); } else { + const u32 height = (params.height + GetDefaultBlockHeight(params.pixel_format) - 1) / + GetDefaultBlockHeight(params.pixel_format); const u8* start{params.host_ptr}; u8* write_to = gl_buffer[0].data(); - for (u32 h = params.height; h > 0; h--) { + for (u32 h = height; h > 0; h--) { std::memcpy(write_to, start, copy_size); start += params.pitch; write_to += copy_size; @@ -671,13 +673,13 @@ void CachedSurface::LoadGLBuffer() { } MICROPROFILE_DEFINE(OpenGL_SurfaceFlush, "OpenGL", "Surface Flush", MP_RGB(128, 192, 64)); -void CachedSurface::FlushGLBuffer() { +void CachedSurface::FlushGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem) { MICROPROFILE_SCOPE(OpenGL_SurfaceFlush); ASSERT_MSG(!IsPixelFormatASTC(params.pixel_format), "Unimplemented"); + auto& gl_buffer = res_cache_tmp_mem.gl_buffer; // OpenGL temporary buffer needs to be big enough to store raw texture size - gl_buffer.resize(1); gl_buffer[0].resize(GetSizeInBytes()); const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type); @@ -713,10 +715,12 @@ void CachedSurface::FlushGLBuffer() { } } -void CachedSurface::UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle, - GLuint draw_fb_handle) { +void CachedSurface::UploadGLMipmapTexture(RasterizerTemporaryMemory& res_cache_tmp_mem, u32 mip_map, + GLuint read_fb_handle, GLuint draw_fb_handle) { const auto& rect{params.GetRect(mip_map)}; + auto& gl_buffer = res_cache_tmp_mem.gl_buffer; + // Load data from memory to the surface const auto x0 = static_cast<GLint>(rect.left); const auto y0 = static_cast<GLint>(rect.bottom); @@ -801,7 +805,6 @@ void CachedSurface::UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle, tuple.type, &gl_buffer[mip_map][buffer_offset]); break; case SurfaceTarget::TextureCubemap: { - std::size_t start = buffer_offset; for (std::size_t face = 0; face < params.depth; ++face) { glTextureSubImage3D(texture.handle, mip_map, x0, y0, static_cast<GLint>(face), static_cast<GLsizei>(rect.GetWidth()), @@ -845,11 +848,12 @@ void CachedSurface::EnsureTextureDiscrepantView() { } MICROPROFILE_DEFINE(OpenGL_TextureUL, "OpenGL", "Texture Upload", MP_RGB(128, 192, 64)); -void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle) { +void CachedSurface::UploadGLTexture(RasterizerTemporaryMemory& res_cache_tmp_mem, + GLuint read_fb_handle, GLuint draw_fb_handle) { MICROPROFILE_SCOPE(OpenGL_TextureUL); for (u32 i = 0; i < params.max_mip_level; i++) - UploadGLMipmapTexture(i, read_fb_handle, draw_fb_handle); + UploadGLMipmapTexture(res_cache_tmp_mem, i, read_fb_handle, draw_fb_handle); } void CachedSurface::UpdateSwizzle(Tegra::Texture::SwizzleSource swizzle_x, @@ -929,8 +933,8 @@ Surface RasterizerCacheOpenGL::GetColorBufferSurface(std::size_t index, bool pre } void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) { - surface->LoadGLBuffer(); - surface->UploadGLTexture(read_framebuffer.handle, draw_framebuffer.handle); + surface->LoadGLBuffer(temporal_memory); + surface->UploadGLTexture(temporal_memory, read_framebuffer.handle, draw_framebuffer.handle); surface->MarkAsModified(false, *this); surface->MarkForReload(false); } diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h index db280dbb3..6263ef3e7 100644 --- 
a/src/video_core/renderer_opengl/gl_rasterizer_cache.h +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h @@ -355,6 +355,12 @@ namespace OpenGL { class RasterizerOpenGL; +// This is used to store large temporary buffers in one place, +// so they are not created and destroyed all the time +struct RasterizerTemporaryMemory { + std::vector<std::vector<u8>> gl_buffer; +}; + class CachedSurface final : public RasterizerCacheObject { public: explicit CachedSurface(const SurfaceParams& params); @@ -371,10 +377,6 @@ public: return memory_size; } - void Flush() override { - FlushGLBuffer(); - } - const OGLTexture& Texture() const { return texture; } @@ -397,11 +399,12 @@ public: } // Read/Write data in Switch memory to/from gl_buffer - void LoadGLBuffer(); - void FlushGLBuffer(); + void LoadGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem); + void FlushGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem); // Upload data in gl_buffer to this surface's texture - void UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle); + void UploadGLTexture(RasterizerTemporaryMemory& res_cache_tmp_mem, GLuint read_fb_handle, + GLuint draw_fb_handle); void UpdateSwizzle(Tegra::Texture::SwizzleSource swizzle_x, Tegra::Texture::SwizzleSource swizzle_y, @@ -429,13 +432,13 @@ public: } private: - void UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle, GLuint draw_fb_handle); + void UploadGLMipmapTexture(RasterizerTemporaryMemory& res_cache_tmp_mem, u32 mip_map, + GLuint read_fb_handle, GLuint draw_fb_handle); void EnsureTextureDiscrepantView(); OGLTexture texture; OGLTexture discrepant_view; - std::vector<std::vector<u8>> gl_buffer; SurfaceParams params{}; GLenum gl_target{}; GLenum gl_internal_format{}; @@ -473,6 +476,11 @@ public: void SignalPreDrawCall(); void SignalPostDrawCall(); +protected: + void FlushObjectInner(const Surface& object) override { + object->FlushGLBuffer(temporal_memory); + } + private: void LoadSurface(const Surface& surface); Surface GetSurface(const SurfaceParams& params, bool preserve_contents = true); @@ -519,6 +527,8 @@ private: std::array<Surface, Maxwell::NumRenderTargets> current_color_buffers; Surface last_depth_buffer; + RasterizerTemporaryMemory temporal_memory; + using SurfaceIntervalCache = boost::icl::interval_map<CacheAddr, Surface>; using SurfaceInterval = typename SurfaceIntervalCache::interval_type; diff --git a/src/video_core/renderer_opengl/gl_sampler_cache.cpp b/src/video_core/renderer_opengl/gl_sampler_cache.cpp new file mode 100644 index 000000000..3ded5ecea --- /dev/null +++ b/src/video_core/renderer_opengl/gl_sampler_cache.cpp @@ -0,0 +1,52 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
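+
+// Note: this new sampler cache replaces the per-texture-unit SamplerInfo tracking
+// removed from RasterizerOpenGL above. Identical TSC entries can now resolve to a
+// single shared GL sampler object (lookup and deduplication presumably live in the
+// VideoCommon::SamplerCache base introduced by this change), instead of re-syncing
+// sampler parameters on every draw.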
+ +#include "common/logging/log.h" +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_sampler_cache.h" +#include "video_core/renderer_opengl/maxwell_to_gl.h" + +namespace OpenGL { + +SamplerCacheOpenGL::SamplerCacheOpenGL() = default; + +SamplerCacheOpenGL::~SamplerCacheOpenGL() = default; + +OGLSampler SamplerCacheOpenGL::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const { + OGLSampler sampler; + sampler.Create(); + + const GLuint sampler_id{sampler.handle}; + glSamplerParameteri( + sampler_id, GL_TEXTURE_MAG_FILTER, + MaxwellToGL::TextureFilterMode(tsc.mag_filter, Tegra::Texture::TextureMipmapFilter::None)); + glSamplerParameteri(sampler_id, GL_TEXTURE_MIN_FILTER, + MaxwellToGL::TextureFilterMode(tsc.min_filter, tsc.mipmap_filter)); + glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_S, MaxwellToGL::WrapMode(tsc.wrap_u)); + glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(tsc.wrap_v)); + glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(tsc.wrap_p)); + glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_MODE, + tsc.depth_compare_enabled == 1 ? GL_COMPARE_REF_TO_TEXTURE : GL_NONE); + glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_FUNC, + MaxwellToGL::DepthCompareFunc(tsc.depth_compare_func)); + glSamplerParameterfv(sampler_id, GL_TEXTURE_BORDER_COLOR, tsc.GetBorderColor().data()); + glSamplerParameterf(sampler_id, GL_TEXTURE_MIN_LOD, tsc.GetMinLod()); + glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_LOD, tsc.GetMaxLod()); + glSamplerParameterf(sampler_id, GL_TEXTURE_LOD_BIAS, tsc.GetLodBias()); + if (GLAD_GL_ARB_texture_filter_anisotropic) { + glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY, tsc.GetMaxAnisotropy()); + } else if (GLAD_GL_EXT_texture_filter_anisotropic) { + glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY_EXT, tsc.GetMaxAnisotropy()); + } else if (tsc.GetMaxAnisotropy() != 1) { + LOG_WARNING(Render_OpenGL, "Anisotropy not supported by host GPU driver"); + } + + return sampler; +} + +GLuint SamplerCacheOpenGL::ToSamplerType(const OGLSampler& sampler) const { + return sampler.handle; +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_sampler_cache.h b/src/video_core/renderer_opengl/gl_sampler_cache.h new file mode 100644 index 000000000..defbc2d81 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_sampler_cache.h @@ -0,0 +1,25 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
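+
+// Typical use from the rasterizer, mirroring the gl_rasterizer.cpp call site above
+// (GetSampler is inherited from VideoCommon::SamplerCache; tsc is the texture's
+// Tegra::Texture::TSCEntry):
+//
+//   SamplerCacheOpenGL sampler_cache;
+//   state.texture_units[unit].sampler = sampler_cache.GetSampler(tsc);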
+ +#pragma once + +#include <glad/glad.h> + +#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/sampler_cache.h" + +namespace OpenGL { + +class SamplerCacheOpenGL final : public VideoCommon::SamplerCache<GLuint, OGLSampler> { +public: + explicit SamplerCacheOpenGL(); + ~SamplerCacheOpenGL(); + +protected: + OGLSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const; + + GLuint ToSamplerType(const OGLSampler& sampler) const; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 99f67494c..f700dc89a 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -38,13 +38,15 @@ GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) { } /// Gets the shader program code from memory for the specified address -ProgramCode GetShaderCode(const u8* host_ptr) { +ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr, + const u8* host_ptr) { ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); ASSERT_OR_EXECUTE(host_ptr != nullptr, { std::fill(program_code.begin(), program_code.end(), 0); return program_code; }); - std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64)); + memory_manager.ReadBlockUnsafe(gpu_addr, program_code.data(), + program_code.size() * sizeof(u64)); return program_code; } @@ -134,8 +136,8 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& } /// Creates an unspecialized program from code streams -GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, ProgramCode program_code, - ProgramCode program_code_b) { +GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type, + ProgramCode program_code, ProgramCode program_code_b) { GLShader::ShaderSetup setup(program_code); if (program_type == Maxwell::ShaderProgram::VertexA) { // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders. 
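A note on the GetShaderCode change above: reading the program through the memory manager (ReadBlockUnsafe) copes with guest shader code that is not contiguous in host memory, and per the memory_manager.h documentation it avoids triggering host GPU flushes. A representative call, mirroring the one in GetStageProgram further below (same identifiers as that call site):

    const GPUVAddr addr_b{GetShaderAddress(Maxwell::ShaderProgram::VertexB)};
    const ProgramCode code_b{GetShaderCode(memory_manager, addr_b, memory_manager.GetPointer(addr_b))};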
@@ -149,11 +151,11 @@ GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, Progr switch (program_type) { case Maxwell::ShaderProgram::VertexA: case Maxwell::ShaderProgram::VertexB: - return GLShader::GenerateVertexShader(setup); + return GLShader::GenerateVertexShader(device, setup); case Maxwell::ShaderProgram::Geometry: - return GLShader::GenerateGeometryShader(setup); + return GLShader::GenerateGeometryShader(device, setup); case Maxwell::ShaderProgram::Fragment: - return GLShader::GenerateFragmentShader(setup); + return GLShader::GenerateFragmentShader(device, setup); default: LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type)); UNREACHABLE(); @@ -212,22 +214,20 @@ std::set<GLenum> GetSupportedFormats() { return supported_formats; } -} // namespace +} // Anonymous namespace -CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier, +CachedShader::CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, const PrecompiledPrograms& precompiled_programs, ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr) : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{precompiled_programs} { - - const std::size_t code_size = CalculateProgramSize(program_code); - const std::size_t code_size_b = - program_code_b.empty() ? 0 : CalculateProgramSize(program_code_b); - - GLShader::ProgramResult program_result = - CreateProgram(program_type, program_code, program_code_b); + const std::size_t code_size{CalculateProgramSize(program_code)}; + const std::size_t code_size_b{program_code_b.empty() ? 
0 + : CalculateProgramSize(program_code_b)}; + GLShader::ProgramResult program_result{ + CreateProgram(device, program_type, program_code, program_code_b)}; if (program_result.first.empty()) { // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now return; @@ -251,7 +251,6 @@ CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier, : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{ precompiled_programs} { - code = std::move(result.first); entries = result.second; shader_length = entries.shader_length; @@ -344,8 +343,9 @@ ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode, return {unique_identifier, base_bindings, primitive_mode}; } -ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system) - : RasterizerCache{rasterizer}, disk_cache{system} {} +ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, + const Device& device) + : RasterizerCache{rasterizer}, device{device}, disk_cache{system} {} void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { @@ -363,6 +363,10 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, if (stop_loading) return; + // Track if precompiled cache was altered during loading to know if we have to serialize the + // virtual precompiled cache file back to the hard drive + bool precompiled_cache_altered = false; + // Build shaders if (callback) callback(VideoCore::LoadCallbackStage::Build, 0, usages.size()); @@ -384,6 +388,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, if (!shader) { // Invalidate the precompiled cache if a dumped shader was rejected disk_cache.InvalidatePrecompiled(); + precompiled_cache_altered = true; dumps.clear(); } } @@ -405,8 +410,13 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, if (dumps.find(usage) == dumps.end()) { const auto& program = precompiled_programs.at(usage); disk_cache.SaveDump(usage, program->handle); + precompiled_cache_altered = true; } } + + if (precompiled_cache_altered) { + disk_cache.SaveVirtualPrecompiledFile(); + } } CachedProgram ShaderCacheOpenGL::GeneratePrecompiledProgram( @@ -439,17 +449,18 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia const std::unordered_map<u64, ShaderDiskCacheDecompiled>& decompiled) { std::unordered_map<u64, UnspecializedShader> unspecialized; - if (callback) + if (callback) { callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size()); + } for (std::size_t i = 0; i < raws.size(); ++i) { - if (stop_loading) + if (stop_loading) { return {}; - + } const auto& raw{raws[i]}; - const u64 unique_identifier = raw.GetUniqueIdentifier(); - const u64 calculated_hash = - GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB()); + const u64 unique_identifier{raw.GetUniqueIdentifier()}; + const u64 calculated_hash{ + GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB())}; if (unique_identifier != calculated_hash) { LOG_ERROR( Render_OpenGL, @@ -466,8 +477,8 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia result = {stored_decompiled.code, stored_decompiled.entries}; } else { // Otherwise decompile the shader at boot and save the result to the decompiled file - result = -
CreateProgram(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB()); + result = CreateProgram(device, raw.GetProgramType(), raw.GetProgramCode(), + raw.GetProgramCodeB()); disk_cache.SaveDecompiled(unique_identifier, result.first, result.second); } @@ -477,8 +488,9 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia {raw.GetUniqueIdentifier(), {std::move(result.first), std::move(result.second), raw.GetProgramType()}}); - if (callback) + if (callback) { callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size()); + } } return unspecialized; } @@ -497,11 +509,12 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { if (!shader) { // No shader found - create a new one - ProgramCode program_code{GetShaderCode(host_ptr)}; + ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)}; ProgramCode program_code_b; if (program == Maxwell::ShaderProgram::VertexA) { - program_code_b = GetShaderCode( - memory_manager.GetPointer(GetShaderAddress(Maxwell::ShaderProgram::VertexB))); + const GPUVAddr program_addr_b{GetShaderAddress(Maxwell::ShaderProgram::VertexB)}; + program_code_b = GetShaderCode(memory_manager, program_addr_b, + memory_manager.GetPointer(program_addr_b)); } const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)}; @@ -512,7 +525,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { precompiled_programs, found->second, host_ptr); } else { shader = std::make_shared<CachedShader>( - cpu_addr, unique_identifier, program, disk_cache, precompiled_programs, + device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs, std::move(program_code), std::move(program_code_b), host_ptr); } Register(shader); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 0cf8e0b3d..31b979987 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -27,6 +27,7 @@ class System; namespace OpenGL { class CachedShader; +class Device; class RasterizerOpenGL; struct UnspecializedShader; @@ -38,7 +39,7 @@ using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>; class CachedShader final : public RasterizerCacheObject { public: - explicit CachedShader(VAddr cpu_addr, u64 unique_identifier, + explicit CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, const PrecompiledPrograms& precompiled_programs, ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr); @@ -56,9 +57,6 @@ public: return shader_length; } - // We do not have to flush this cache as things in it are never modified by us. 
- void Flush() override {} - /// Gets the shader entries for the shader const GLShader::ShaderEntries& GetShaderEntries() const { return entries; @@ -112,7 +110,8 @@ private: class ShaderCacheOpenGL final : public RasterizerCache<Shader> { public: - explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system); + explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, + const Device& device); /// Loads disk cache for the current game void LoadDiskCache(const std::atomic_bool& stop_loading, @@ -121,6 +120,10 @@ public: /// Gets the current specified shader stage program Shader GetStageProgram(Maxwell::ShaderProgram program); +protected: + // We do not have to flush this cache as things in it are never modified by us. + void FlushObjectInner(const Shader& object) override {} + private: std::unordered_map<u64, UnspecializedShader> GenerateUnspecializedShaders( const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback, @@ -130,6 +133,8 @@ private: CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump, const std::set<GLenum>& supported_formats); + const Device& device; + std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; ShaderDiskCacheOpenGL disk_cache; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 28e490b3c..1a62795e1 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -15,6 +15,7 @@ #include "common/assert.h" #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/shader/shader_ir.h" @@ -45,8 +46,6 @@ using TextureIR = std::variant<TextureAoffi, TextureArgument>; enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 }; constexpr u32 MAX_CONSTBUFFER_ELEMENTS = static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float)); -constexpr u32 MAX_GLOBALMEMORY_ELEMENTS = - static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize) / sizeof(float); class ShaderWriter { public: @@ -121,14 +120,10 @@ std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { /// Returns true if an object has to be treated as precise bool IsPrecise(Operation operand) { - const auto& meta = operand.GetMeta(); - + const auto& meta{operand.GetMeta()}; if (const auto arithmetic = std::get_if<MetaArithmetic>(&meta)) { return arithmetic->precise; } - if (const auto half_arithmetic = std::get_if<MetaHalfArithmetic>(&meta)) { - return half_arithmetic->precise; - } return false; } @@ -141,8 +136,9 @@ bool IsPrecise(Node node) { class GLSLDecompiler final { public: - explicit GLSLDecompiler(const ShaderIR& ir, ShaderStage stage, std::string suffix) - : ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} + explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage, + std::string suffix) + : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} void Decompile() { DeclareVertex(); @@ -208,8 +204,10 @@ public: for (const auto& sampler : ir.GetSamplers()) { entries.samplers.emplace_back(sampler); } - for (const auto& gmem : ir.GetGlobalMemoryBases()) { - entries.global_memory_entries.emplace_back(gmem.cbuf_index, gmem.cbuf_offset); + for (const auto& 
gmem_pair : ir.GetGlobalMemory()) { + const auto& [base, usage] = gmem_pair; + entries.global_memory_entries.emplace_back(base.cbuf_index, base.cbuf_offset, + usage.is_read, usage.is_written); } entries.clip_distances = ir.GetClipDistances(); entries.shader_length = ir.GetLength(); @@ -380,12 +378,22 @@ private: } void DeclareGlobalMemory() { - for (const auto& entry : ir.GetGlobalMemoryBases()) { + for (const auto& gmem : ir.GetGlobalMemory()) { + const auto& [base, usage] = gmem; + + // Since we don't know how the shader will use the memory, hint the driver to disable as + // many optimizations as possible + std::string qualifier = "coherent volatile"; + if (usage.is_read && !usage.is_written) + qualifier += " readonly"; + else if (usage.is_written && !usage.is_read) + qualifier += " writeonly"; + const std::string binding = - fmt::format("GMEM_BINDING_{}_{}", entry.cbuf_index, entry.cbuf_offset); - code.AddLine("layout (std430, binding = " + binding + ") buffer " + - GetGlobalMemoryBlock(entry) + " {"); - code.AddLine(" float " + GetGlobalMemory(entry) + "[MAX_GLOBALMEMORY_ELEMENTS];"); + fmt::format("GMEM_BINDING_{}_{}", base.cbuf_index, base.cbuf_offset); + code.AddLine("layout (std430, binding = " + binding + ") " + qualifier + " buffer " + + GetGlobalMemoryBlock(base) + " {"); + code.AddLine(" float " + GetGlobalMemory(base) + "[];"); code.AddLine("};"); code.AddNewLine(); } @@ -617,28 +625,7 @@ private: } std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) { - std::string value = VisitOperand(operation, operand_index); - switch (type) { - case Type::HalfFloat: { - const auto half_meta = std::get_if<MetaHalfArithmetic>(&operation.GetMeta()); - if (!half_meta) { - value = "toHalf2(" + value + ')'; - } - - switch (half_meta->types.at(operand_index)) { - case Tegra::Shader::HalfType::H0_H1: - return "toHalf2(" + value + ')'; - case Tegra::Shader::HalfType::F32: - return "vec2(" + value + ')'; - case Tegra::Shader::HalfType::H0_H0: - return "vec2(toHalf2(" + value + ")[0])"; - case Tegra::Shader::HalfType::H1_H1: - return "vec2(toHalf2(" + value + ")[1])"; - } - } - default: - return CastOperand(value, type); - } + return CastOperand(VisitOperand(operation, operand_index), type); } std::string CastOperand(const std::string& value, Type type) const { @@ -652,9 +639,7 @@ private: case Type::Uint: return "ftou(" + value + ')'; case Type::HalfFloat: - // Can't be handled as a stand-alone value - UNREACHABLE(); - return value; + return "toHalf2(" + value + ')'; } UNREACHABLE(); return value; @@ -819,8 +804,12 @@ private: // Inline the string as an immediate integer in GLSL (AOFFI arguments are required // to be constant by the standard). expr += std::to_string(static_cast<s32>(immediate->GetValue())); - } else { + } else if (device.HasVariableAoffi()) { + // This device supports variable AOFFI, so emit the offset expression directly. expr += "ftoi(" + Visit(operand) + ')'; + } else { + // Insert 0 on devices not supporting variable AOFFI.
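+ // (Plain GLSL requires texture offsets to be constant expressions; whether the
+ // host driver tolerates a variable offset is probed once at startup by
+ // Device::TestVariableAoffi, so unsupported devices degrade to a zero offset
+ // here rather than emitting a shader that fails to compile.)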
+ expr += '0'; } if (index + 1 < aoffi.size()) { expr += ", "; @@ -868,6 +857,12 @@ private: } else if (const auto lmem = std::get_if<LmemNode>(dest)) { target = GetLocalMemory() + "[ftou(" + Visit(lmem->GetAddress()) + ") / 4]"; + } else if (const auto gmem = std::get_if<GmemNode>(dest)) { + const std::string real = Visit(gmem->GetRealAddress()); + const std::string base = Visit(gmem->GetBaseAddress()); + const std::string final_offset = "(ftou(" + real + ") - ftou(" + base + ")) / 4"; + target = fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset); + } else { UNREACHABLE_MSG("Assign called without a proper target"); } @@ -876,17 +871,6 @@ private: return {}; } - std::string Composite(Operation operation) { - std::string value = "vec4("; - for (std::size_t i = 0; i < 4; ++i) { - value += Visit(operation[i]); - if (i < 3) - value += ", "; - } - value += ')'; - return value; - } - template <Type type> std::string Add(Operation operation) { return GenerateBinaryInfix(operation, "+", type, type, type); @@ -1067,13 +1051,40 @@ private: return BitwiseCastResult(value, Type::HalfFloat); } + std::string HClamp(Operation operation) { + const std::string value = VisitOperand(operation, 0, Type::HalfFloat); + const std::string min = VisitOperand(operation, 1, Type::Float); + const std::string max = VisitOperand(operation, 2, Type::Float); + const std::string clamped = "clamp(" + value + ", vec2(" + min + "), vec2(" + max + "))"; + return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat)); + } + + std::string HUnpack(Operation operation) { + const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)}; + const auto value = [&]() -> std::string { + switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) { + case Tegra::Shader::HalfType::H0_H1: + return operand; + case Tegra::Shader::HalfType::F32: + return "vec2(fromHalf2(" + operand + "))"; + case Tegra::Shader::HalfType::H0_H0: + return "vec2(" + operand + "[0])"; + case Tegra::Shader::HalfType::H1_H1: + return "vec2(" + operand + "[1])"; + } + UNREACHABLE(); + return "0"; + }(); + return "fromHalf2(" + value + ')'; + } + std::string HMergeF32(Operation operation) { return "float(toHalf2(" + Visit(operation[0]) + ")[0])"; } std::string HMergeH0(Operation operation) { - return "fromHalf2(vec2(toHalf2(" + Visit(operation[0]) + ")[1], toHalf2(" + - Visit(operation[1]) + ")[0]))"; + return "fromHalf2(vec2(toHalf2(" + Visit(operation[1]) + ")[0], toHalf2(" + + Visit(operation[0]) + ")[1]))"; } std::string HMergeH1(Operation operation) { @@ -1173,34 +1184,46 @@ private: return GenerateUnary(operation, "any", Type::Bool, Type::Bool2); } + template <bool with_nan> + std::string GenerateHalfComparison(Operation operation, std::string compare_op) { + std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2, + Type::HalfFloat, Type::HalfFloat)}; + if constexpr (!with_nan) { + return comparison; + } + return "halfFloatNanComparison(" + comparison + ", " + + VisitOperand(operation, 0, Type::HalfFloat) + ", " + + VisitOperand(operation, 1, Type::HalfFloat) + ')'; + } + + template <bool with_nan> std::string Logical2HLessThan(Operation operation) { - return GenerateBinaryCall(operation, "lessThan", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "lessThan"); } + template <bool with_nan> std::string Logical2HEqual(Operation operation) { - return GenerateBinaryCall(operation, "equal", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); 
+ return GenerateHalfComparison<with_nan>(operation, "equal"); } + template <bool with_nan> std::string Logical2HLessEqual(Operation operation) { - return GenerateBinaryCall(operation, "lessThanEqual", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "lessThanEqual"); } + template <bool with_nan> std::string Logical2HGreaterThan(Operation operation) { - return GenerateBinaryCall(operation, "greaterThan", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "greaterThan"); } + template <bool with_nan> std::string Logical2HNotEqual(Operation operation) { - return GenerateBinaryCall(operation, "notEqual", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "notEqual"); } + template <bool with_nan> std::string Logical2HGreaterEqual(Operation operation) { - return GenerateBinaryCall(operation, "greaterThanEqual", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "greaterThanEqual"); } std::string Texture(Operation operation) { @@ -1489,6 +1512,8 @@ private: &GLSLDecompiler::Fma<Type::HalfFloat>, &GLSLDecompiler::Absolute<Type::HalfFloat>, &GLSLDecompiler::HNegate, + &GLSLDecompiler::HClamp, + &GLSLDecompiler::HUnpack, &GLSLDecompiler::HMergeF32, &GLSLDecompiler::HMergeH0, &GLSLDecompiler::HMergeH1, @@ -1525,12 +1550,18 @@ private: &GLSLDecompiler::LogicalNotEqual<Type::Uint>, &GLSLDecompiler::LogicalGreaterEqual<Type::Uint>, - &GLSLDecompiler::Logical2HLessThan, - &GLSLDecompiler::Logical2HEqual, - &GLSLDecompiler::Logical2HLessEqual, - &GLSLDecompiler::Logical2HGreaterThan, - &GLSLDecompiler::Logical2HNotEqual, - &GLSLDecompiler::Logical2HGreaterEqual, + &GLSLDecompiler::Logical2HLessThan<false>, + &GLSLDecompiler::Logical2HEqual<false>, + &GLSLDecompiler::Logical2HLessEqual<false>, + &GLSLDecompiler::Logical2HGreaterThan<false>, + &GLSLDecompiler::Logical2HNotEqual<false>, + &GLSLDecompiler::Logical2HGreaterEqual<false>, + &GLSLDecompiler::Logical2HLessThan<true>, + &GLSLDecompiler::Logical2HEqual<true>, + &GLSLDecompiler::Logical2HLessEqual<true>, + &GLSLDecompiler::Logical2HGreaterThan<true>, + &GLSLDecompiler::Logical2HNotEqual<true>, + &GLSLDecompiler::Logical2HGreaterEqual<true>, &GLSLDecompiler::Texture, &GLSLDecompiler::TextureLod, @@ -1609,6 +1640,7 @@ private: return name + '_' + std::to_string(index) + '_' + suffix; } + const Device& device; const ShaderIR& ir; const ShaderStage stage; const std::string suffix; @@ -1621,9 +1653,7 @@ private: std::string GetCommonDeclarations() { const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS); - const auto gmem = std::to_string(MAX_GLOBALMEMORY_ELEMENTS); return "#define MAX_CONSTBUFFER_ELEMENTS " + cbuf + "\n" + - "#define MAX_GLOBALMEMORY_ELEMENTS " + gmem + "\n" + "#define ftoi floatBitsToInt\n" "#define ftou floatBitsToUint\n" "#define itof intBitsToFloat\n" @@ -1633,11 +1663,18 @@ std::string GetCommonDeclarations() { "}\n\n" "vec2 toHalf2(float value) {\n" " return unpackHalf2x16(ftou(value));\n" + "}\n\n" + "bvec2 halfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {\n" + " bvec2 is_nan1 = isnan(pair1);\n" + " bvec2 is_nan2 = isnan(pair2);\n" + " return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || " + "is_nan2.y);\n" "}\n"; } -ProgramResult Decompile(const ShaderIR& ir, Maxwell::ShaderStage stage, const std::string& suffix) { - GLSLDecompiler decompiler(ir, stage, suffix); 
+ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage, + const std::string& suffix) { + GLSLDecompiler decompiler(device, ir, stage, suffix); decompiler.Decompile(); return {decompiler.GetResult(), decompiler.GetShaderEntries()}; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 4e04ab2f8..c1569e737 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -12,6 +12,10 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/shader/shader_ir.h" +namespace OpenGL { +class Device; +} + namespace VideoCommon::Shader { class ShaderIR; } @@ -39,8 +43,9 @@ private: class GlobalMemoryEntry { public: - explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset) - : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset} {} + explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, bool is_written) + : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{ + is_written} {} u32 GetCbufIndex() const { return cbuf_index; @@ -50,14 +55,25 @@ public: return cbuf_offset; } + bool IsRead() const { + return is_read; + } + + bool IsWritten() const { + return is_written; + } + private: u32 cbuf_index{}; u32 cbuf_offset{}; + bool is_read{}; + bool is_written{}; }; struct ShaderEntries { std::vector<ConstBufferEntry> const_buffers; std::vector<SamplerEntry> samplers; + std::vector<SamplerEntry> bindless_samplers; std::vector<GlobalMemoryEntry> global_memory_entries; std::array<bool, Maxwell::NumClipDistances> clip_distances{}; std::size_t shader_length{}; @@ -65,7 +81,7 @@ struct ShaderEntries { std::string GetCommonDeclarations(); -ProgramResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage, - const std::string& suffix); +ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + Maxwell::ShaderStage stage, const std::string& suffix); -} // namespace OpenGL::GLShader
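For reference, the is_read/is_written flags added to GlobalMemoryEntry are what allow a backend to pick tighter GLSL storage qualifiers for global memory SSBOs. A minimal sketch of how a consumer might map them, assuming only the interface declared above (MakeSsboQualifier is a hypothetical helper, not part of this change):

    #include <string>
    #include "video_core/renderer_opengl/gl_shader_decompiler.h"

    // Pick a GLSL storage qualifier from the recorded usage of the buffer.
    std::string MakeSsboQualifier(const OpenGL::GLShader::GlobalMemoryEntry& entry) {
        if (entry.IsRead() && entry.IsWritten()) {
            return "";          // read-write: no qualifier applies
        }
        return entry.IsWritten() ? "writeonly " : "readonly ";
    }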
\ No newline at end of file +} // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 8a43eb157..fba9c594a 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -106,6 +106,8 @@ bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const { ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {} +ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default; + std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>> ShaderDiskCacheOpenGL::LoadTransferable() { // Skip games without title id @@ -177,6 +179,7 @@ ShaderDiskCacheOpenGL::LoadTransferable() { return {}; } } + return {{raws, usages}}; } @@ -208,59 +211,64 @@ ShaderDiskCacheOpenGL::LoadPrecompiled() { std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>, std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>> ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { + // Read compressed file from disk and decompress to virtual precompiled cache file + std::vector<u8> compressed(file.GetSize()); + file.ReadBytes(compressed.data(), compressed.size()); + const std::vector<u8> decompressed = Common::Compression::DecompressDataZSTD(compressed); + SaveArrayToPrecompiled(decompressed.data(), decompressed.size()); + precompiled_cache_virtual_file_offset = 0; + ShaderCacheVersionHash file_hash{}; - if (file.ReadArray(file_hash.data(), file_hash.size()) != file_hash.size()) { + if (!LoadArrayFromPrecompiled(file_hash.data(), file_hash.size())) { + precompiled_cache_virtual_file_offset = 0; return {}; } if (GetShaderCacheVersionHash() != file_hash) { LOG_INFO(Render_OpenGL, "Precompiled cache is from another version of the emulator"); + precompiled_cache_virtual_file_offset = 0; return {}; } std::unordered_map<u64, ShaderDiskCacheDecompiled> decompiled; std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> dumps; - while (file.Tell() < file.GetSize()) { + while (precompiled_cache_virtual_file_offset < precompiled_cache_virtual_file.GetSize()) { PrecompiledEntryKind kind{}; - if (file.ReadBytes(&kind, sizeof(u32)) != sizeof(u32)) { + if (!LoadObjectFromPrecompiled(kind)) { return {}; } switch (kind) { case PrecompiledEntryKind::Decompiled: { u64 unique_identifier{}; - if (file.ReadBytes(&unique_identifier, sizeof(u64)) != sizeof(u64)) + if (!LoadObjectFromPrecompiled(unique_identifier)) { return {}; + } - const auto entry = LoadDecompiledEntry(file); - if (!entry) + auto entry = LoadDecompiledEntry(); + if (!entry) { return {}; + } decompiled.insert({unique_identifier, std::move(*entry)}); break; } case PrecompiledEntryKind::Dump: { ShaderDiskCacheUsage usage; - if (file.ReadBytes(&usage, sizeof(usage)) != sizeof(usage)) + if (!LoadObjectFromPrecompiled(usage)) { return {}; + } ShaderDiskCacheDump dump; - if (file.ReadBytes(&dump.binary_format, sizeof(u32)) != sizeof(u32)) - return {}; - - u32 binary_length{}; - u32 compressed_size{}; - if (file.ReadBytes(&binary_length, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&compressed_size, sizeof(u32)) != sizeof(u32)) { + if (!LoadObjectFromPrecompiled(dump.binary_format)) { return {}; } - std::vector<u8> compressed_binary(compressed_size); - if (file.ReadArray(compressed_binary.data(), compressed_binary.size()) != - compressed_binary.size()) { + u32 binary_length{}; + if 
(!LoadObjectFromPrecompiled(binary_length)) { return {}; } - dump.binary = Common::Compression::DecompressDataZSTD(compressed_binary); - if (dump.binary.empty()) { + dump.binary.resize(binary_length); + if (!LoadArrayFromPrecompiled(dump.binary.data(), dump.binary.size())) { return {}; } @@ -274,143 +282,151 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { return {{decompiled, dumps}}; } -std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEntry( - FileUtil::IOFile& file) { +std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEntry() { u32 code_size{}; - u32 compressed_code_size{}; - if (file.ReadBytes(&code_size, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&compressed_code_size, sizeof(u32)) != sizeof(u32)) { + if (!LoadObjectFromPrecompiled(code_size)) { return {}; } - std::vector<u8> compressed_code(compressed_code_size); - if (file.ReadArray(compressed_code.data(), compressed_code.size()) != compressed_code.size()) { + std::string code(code_size, '\0'); + if (!LoadArrayFromPrecompiled(code.data(), code.size())) { return {}; } - const std::vector<u8> code = Common::Compression::DecompressDataZSTD(compressed_code); - if (code.empty()) { - return {}; - } ShaderDiskCacheDecompiled entry; - entry.code = std::string(reinterpret_cast<const char*>(code.data()), code_size); + entry.code = std::move(code); u32 const_buffers_count{}; - if (file.ReadBytes(&const_buffers_count, sizeof(u32)) != sizeof(u32)) + if (!LoadObjectFromPrecompiled(const_buffers_count)) { return {}; + } + for (u32 i = 0; i < const_buffers_count; ++i) { u32 max_offset{}; u32 index{}; - u8 is_indirect{}; - if (file.ReadBytes(&max_offset, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&index, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&is_indirect, sizeof(u8)) != sizeof(u8)) { + bool is_indirect{}; + if (!LoadObjectFromPrecompiled(max_offset) || !LoadObjectFromPrecompiled(index) || + !LoadObjectFromPrecompiled(is_indirect)) { return {}; } - entry.entries.const_buffers.emplace_back(max_offset, is_indirect != 0, index); + entry.entries.const_buffers.emplace_back(max_offset, is_indirect, index); } u32 samplers_count{}; - if (file.ReadBytes(&samplers_count, sizeof(u32)) != sizeof(u32)) + if (!LoadObjectFromPrecompiled(samplers_count)) { return {}; + } + for (u32 i = 0; i < samplers_count; ++i) { u64 offset{}; u64 index{}; u32 type{}; - u8 is_array{}; - u8 is_shadow{}; - if (file.ReadBytes(&offset, sizeof(u64)) != sizeof(u64) || - file.ReadBytes(&index, sizeof(u64)) != sizeof(u64) || - file.ReadBytes(&type, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&is_array, sizeof(u8)) != sizeof(u8) || - file.ReadBytes(&is_shadow, sizeof(u8)) != sizeof(u8)) { + bool is_array{}; + bool is_shadow{}; + bool is_bindless{}; + if (!LoadObjectFromPrecompiled(offset) || !LoadObjectFromPrecompiled(index) || + !LoadObjectFromPrecompiled(type) || !LoadObjectFromPrecompiled(is_array) || + !LoadObjectFromPrecompiled(is_shadow) || !LoadObjectFromPrecompiled(is_bindless)) { return {}; } entry.entries.samplers.emplace_back( static_cast<std::size_t>(offset), static_cast<std::size_t>(index), - static_cast<Tegra::Shader::TextureType>(type), is_array != 0, is_shadow != 0); + static_cast<Tegra::Shader::TextureType>(type), is_array, is_shadow, is_bindless); } u32 global_memory_count{}; - if (file.ReadBytes(&global_memory_count, sizeof(u32)) != sizeof(u32)) + if (!LoadObjectFromPrecompiled(global_memory_count)) { return {}; + } + for (u32 i = 0; i < global_memory_count; ++i) { u32 
cbuf_index{}; u32 cbuf_offset{}; - if (file.ReadBytes(&cbuf_index, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32)) { + bool is_read{}; + bool is_written{}; + if (!LoadObjectFromPrecompiled(cbuf_index) || !LoadObjectFromPrecompiled(cbuf_offset) || + !LoadObjectFromPrecompiled(is_read) || !LoadObjectFromPrecompiled(is_written)) { return {}; } - entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset); + entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read, + is_written); } for (auto& clip_distance : entry.entries.clip_distances) { - u8 clip_distance_raw{}; - if (file.ReadBytes(&clip_distance_raw, sizeof(u8)) != sizeof(u8)) + if (!LoadObjectFromPrecompiled(clip_distance)) { return {}; - clip_distance = clip_distance_raw != 0; + } } u64 shader_length{}; - if (file.ReadBytes(&shader_length, sizeof(u64)) != sizeof(u64)) + if (!LoadObjectFromPrecompiled(shader_length)) { return {}; + } + entry.entries.shader_length = static_cast<std::size_t>(shader_length); return entry; } -bool ShaderDiskCacheOpenGL::SaveDecompiledFile(FileUtil::IOFile& file, u64 unique_identifier, - const std::string& code, - const std::vector<u8>& compressed_code, +bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std::string& code, const GLShader::ShaderEntries& entries) { - if (file.WriteObject(static_cast<u32>(PrecompiledEntryKind::Decompiled)) != 1 || - file.WriteObject(unique_identifier) != 1 || - file.WriteObject(static_cast<u32>(code.size())) != 1 || - file.WriteObject(static_cast<u32>(compressed_code.size())) != 1 || - file.WriteArray(compressed_code.data(), compressed_code.size()) != compressed_code.size()) { + if (!SaveObjectToPrecompiled(static_cast<u32>(PrecompiledEntryKind::Decompiled)) || + !SaveObjectToPrecompiled(unique_identifier) || + !SaveObjectToPrecompiled(static_cast<u32>(code.size())) || + !SaveArrayToPrecompiled(code.data(), code.size())) { return false; } - if (file.WriteObject(static_cast<u32>(entries.const_buffers.size())) != 1) + if (!SaveObjectToPrecompiled(static_cast<u32>(entries.const_buffers.size()))) { return false; + } for (const auto& cbuf : entries.const_buffers) { - if (file.WriteObject(static_cast<u32>(cbuf.GetMaxOffset())) != 1 || - file.WriteObject(static_cast<u32>(cbuf.GetIndex())) != 1 || - file.WriteObject(static_cast<u8>(cbuf.IsIndirect() ? 1 : 0)) != 1) { + if (!SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetMaxOffset())) || + !SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetIndex())) || + !SaveObjectToPrecompiled(cbuf.IsIndirect())) { return false; } } - if (file.WriteObject(static_cast<u32>(entries.samplers.size())) != 1) + if (!SaveObjectToPrecompiled(static_cast<u32>(entries.samplers.size()))) { return false; + } for (const auto& sampler : entries.samplers) { - if (file.WriteObject(static_cast<u64>(sampler.GetOffset())) != 1 || - file.WriteObject(static_cast<u64>(sampler.GetIndex())) != 1 || - file.WriteObject(static_cast<u32>(sampler.GetType())) != 1 || - file.WriteObject(static_cast<u8>(sampler.IsArray() ? 1 : 0)) != 1 || - file.WriteObject(static_cast<u8>(sampler.IsShadow() ? 
1 : 0)) != 1) {
+        if (!SaveObjectToPrecompiled(static_cast<u64>(sampler.GetOffset())) ||
+            !SaveObjectToPrecompiled(static_cast<u64>(sampler.GetIndex())) ||
+            !SaveObjectToPrecompiled(static_cast<u32>(sampler.GetType())) ||
+            !SaveObjectToPrecompiled(sampler.IsArray()) ||
+            !SaveObjectToPrecompiled(sampler.IsShadow()) ||
+            !SaveObjectToPrecompiled(sampler.IsBindless())) {
             return false;
         }
     }
 
-    if (file.WriteObject(static_cast<u32>(entries.global_memory_entries.size())) != 1)
+    if (!SaveObjectToPrecompiled(static_cast<u32>(entries.global_memory_entries.size()))) {
         return false;
+    }
+
     for (const auto& gmem : entries.global_memory_entries) {
-        if (file.WriteObject(static_cast<u32>(gmem.GetCbufIndex())) != 1 ||
-            file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1) {
+        if (!SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufIndex())) ||
+            !SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufOffset())) ||
+            !SaveObjectToPrecompiled(gmem.IsRead()) || !SaveObjectToPrecompiled(gmem.IsWritten())) {
             return false;
         }
     }
 
     for (const bool clip_distance : entries.clip_distances) {
-        if (file.WriteObject(static_cast<u8>(clip_distance ? 1 : 0)) != 1)
+        if (!SaveObjectToPrecompiled(clip_distance)) {
             return false;
+        }
     }
 
-    return file.WriteObject(static_cast<u64>(entries.shader_length)) == 1;
+    if (!SaveObjectToPrecompiled(static_cast<u64>(entries.shader_length))) {
+        return false;
+    }
+
+    return true;
 }
 
-void ShaderDiskCacheOpenGL::InvalidateTransferable() const {
+void ShaderDiskCacheOpenGL::InvalidateTransferable() {
     if (!FileUtil::Delete(GetTransferablePath())) {
         LOG_ERROR(Render_OpenGL, "Failed to invalidate transferable file={}",
                   GetTransferablePath());
@@ -418,7 +434,10 @@ void ShaderDiskCacheOpenGL::InvalidateTransferable() const {
     InvalidatePrecompiled();
 }
 
-void ShaderDiskCacheOpenGL::InvalidatePrecompiled() const {
+void ShaderDiskCacheOpenGL::InvalidatePrecompiled() {
+    // Clear virtual precompiled cache file
+    precompiled_cache_virtual_file.Resize(0);
+
     if (!FileUtil::Delete(GetPrecompiledPath())) {
         LOG_ERROR(Render_OpenGL, "Failed to invalidate precompiled file={}", GetPrecompiledPath());
     }
@@ -454,7 +473,10 @@ void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) {
     ASSERT_MSG(it != transferable.end(), "Saving shader usage without storing raw previously");
     auto& usages{it->second};
-    ASSERT(usages.find(usage) == usages.end());
+    if (usages.find(usage) != usages.end()) {
+        // Skip this variant since the shader is already stored.
+ return; + } usages.insert(usage); FileUtil::IOFile file = AppendTransferableFile(); @@ -474,22 +496,13 @@ void ShaderDiskCacheOpenGL::SaveDecompiled(u64 unique_identifier, const std::str if (!IsUsable()) return; - const std::vector<u8> compressed_code{Common::Compression::CompressDataZSTDDefault( - reinterpret_cast<const u8*>(code.data()), code.size())}; - if (compressed_code.empty()) { - LOG_ERROR(Render_OpenGL, "Failed to compress GLSL code - skipping shader {:016x}", - unique_identifier); - return; + if (precompiled_cache_virtual_file.GetSize() == 0) { + SavePrecompiledHeaderToVirtualPrecompiledCache(); } - FileUtil::IOFile file = AppendPrecompiledFile(); - if (!file.IsOpen()) - return; - - if (!SaveDecompiledFile(file, unique_identifier, code, compressed_code, entries)) { + if (!SaveDecompiledFile(unique_identifier, code, entries)) { LOG_ERROR(Render_OpenGL, "Failed to save decompiled entry to the precompiled file - removing"); - file.Close(); InvalidatePrecompiled(); } } @@ -505,28 +518,13 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p std::vector<u8> binary(binary_length); glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data()); - const std::vector<u8> compressed_binary = - Common::Compression::CompressDataZSTDDefault(binary.data(), binary.size()); - - if (compressed_binary.empty()) { - LOG_ERROR(Render_OpenGL, "Failed to compress binary program in shader={:016x}", - usage.unique_identifier); - return; - } - - FileUtil::IOFile file = AppendPrecompiledFile(); - if (!file.IsOpen()) - return; - - if (file.WriteObject(static_cast<u32>(PrecompiledEntryKind::Dump)) != 1 || - file.WriteObject(usage) != 1 || file.WriteObject(static_cast<u32>(binary_format)) != 1 || - file.WriteObject(static_cast<u32>(binary_length)) != 1 || - file.WriteObject(static_cast<u32>(compressed_binary.size())) != 1 || - file.WriteArray(compressed_binary.data(), compressed_binary.size()) != - compressed_binary.size()) { + if (!SaveObjectToPrecompiled(static_cast<u32>(PrecompiledEntryKind::Dump)) || + !SaveObjectToPrecompiled(usage) || + !SaveObjectToPrecompiled(static_cast<u32>(binary_format)) || + !SaveObjectToPrecompiled(static_cast<u32>(binary_length)) || + !SaveArrayToPrecompiled(binary.data(), binary.size())) { LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016x} - removing", usage.unique_identifier); - file.Close(); InvalidatePrecompiled(); return; } @@ -559,28 +557,33 @@ FileUtil::IOFile ShaderDiskCacheOpenGL::AppendTransferableFile() const { return file; } -FileUtil::IOFile ShaderDiskCacheOpenGL::AppendPrecompiledFile() const { - if (!EnsureDirectories()) - return {}; +void ShaderDiskCacheOpenGL::SavePrecompiledHeaderToVirtualPrecompiledCache() { + const auto hash{GetShaderCacheVersionHash()}; + if (!SaveArrayToPrecompiled(hash.data(), hash.size())) { + LOG_ERROR( + Render_OpenGL, + "Failed to write precompiled cache version hash to virtual precompiled cache file"); + } +} + +void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() { + precompiled_cache_virtual_file_offset = 0; + const std::vector<u8>& uncompressed = precompiled_cache_virtual_file.ReadAllBytes(); + const std::vector<u8>& compressed = + Common::Compression::CompressDataZSTDDefault(uncompressed.data(), uncompressed.size()); const auto precompiled_path{GetPrecompiledPath()}; - const bool existed = FileUtil::Exists(precompiled_path); + FileUtil::IOFile file(precompiled_path, "wb"); - FileUtil::IOFile file(precompiled_path, "ab"); if (!file.IsOpen()) { 
LOG_ERROR(Render_OpenGL, "Failed to open precompiled cache in path={}", precompiled_path);
-        return {};
+        return;
     }
-
-    if (!existed || file.GetSize() == 0) {
-        const auto hash{GetShaderCacheVersionHash()};
-        if (file.WriteArray(hash.data(), hash.size()) != hash.size()) {
-            LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version hash in path={}",
-                      precompiled_path);
-            return {};
-        }
+    if (file.WriteBytes(compressed.data(), compressed.size()) != compressed.size()) {
+        LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache in path={}",
+                  precompiled_path);
+        return;
+    }
-    return file;
 }
 
 bool ShaderDiskCacheOpenGL::EnsureDirectories() const {
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index 6be0c0547..2da0a4a23 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -16,6 +16,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
+#include "core/file_sys/vfs_vector.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"
@@ -69,14 +70,14 @@ namespace std {
 template <>
 struct hash<OpenGL::BaseBindings> {
-    std::size_t operator()(const OpenGL::BaseBindings& bindings) const {
+    std::size_t operator()(const OpenGL::BaseBindings& bindings) const noexcept {
         return bindings.cbuf | bindings.gmem << 8 | bindings.sampler << 16;
     }
 };
 
 template <>
 struct hash<OpenGL::ShaderDiskCacheUsage> {
-    std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const {
+    std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept {
         return static_cast<std::size_t>(usage.unique_identifier) ^
                std::hash<OpenGL::BaseBindings>()(usage.bindings) ^ usage.primitive << 16;
     }
@@ -161,6 +162,7 @@ struct ShaderDiskCacheDump {
 class ShaderDiskCacheOpenGL {
 public:
     explicit ShaderDiskCacheOpenGL(Core::System& system);
+    ~ShaderDiskCacheOpenGL();
 
     /// Loads transferable cache. If file has an old version or on failure, it deletes the file.
     std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>>
@@ -172,10 +174,10 @@ public:
     LoadPrecompiled();
 
     /// Removes the transferable (and precompiled) cache file.
-    void InvalidateTransferable() const;
+    void InvalidateTransferable();
 
-    /// Removes the precompiled cache file.
-    void InvalidatePrecompiled() const;
+    /// Removes the precompiled cache file and clears the virtual precompiled cache file.
+    void InvalidatePrecompiled();
 
     /// Saves a raw dump to the transferable file. Checks for collisions.
     void SaveRaw(const ShaderDiskCacheRaw& entry);
@@ -190,18 +192,21 @@ public:
     /// Saves a dump entry to the precompiled file. Does not check for collisions.
     void SaveDump(const ShaderDiskCacheUsage& usage, GLuint program);
 
+    /// Serializes the virtual precompiled shader cache file to the real file
+    void SaveVirtualPrecompiledFile();
+
 private:
     /// Loads the transferable cache. Returns empty on failure.
     std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>,
                             std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>>
     LoadPrecompiledFile(FileUtil::IOFile& file);
 
-    /// Loads a decompiled cache entry from the passed file. Returns empty on failure.
-    std::optional<ShaderDiskCacheDecompiled> LoadDecompiledEntry(FileUtil::IOFile& file);
+    /// Loads a decompiled cache entry from precompiled_cache_virtual_file. Returns empty on
+    /// failure.
+    std::optional<ShaderDiskCacheDecompiled> LoadDecompiledEntry();
 
     /// Saves a decompiled entry to the passed file. Returns true on success.
-    bool SaveDecompiledFile(FileUtil::IOFile& file, u64 unique_identifier, const std::string& code,
-                            const std::vector<u8>& compressed_code,
+    bool SaveDecompiledFile(u64 unique_identifier, const std::string& code,
                             const GLShader::ShaderEntries& entries);
 
     /// Returns if the cache can be used
@@ -210,8 +215,8 @@ private:
     /// Opens current game's transferable file and writes its header if it doesn't exist
     FileUtil::IOFile AppendTransferableFile() const;
 
-    /// Opens current game's precompiled file and write it's header if it doesn't exist
-    FileUtil::IOFile AppendPrecompiledFile() const;
+    /// Saves the precompiled header to the virtual precompiled cache file
+    void SavePrecompiledHeaderToVirtualPrecompiledCache();
 
     /// Create shader disk cache directories. Returns true on success.
     bool EnsureDirectories() const;
@@ -234,10 +239,57 @@ private:
     /// Get current game's title id
     std::string GetTitleID() const;
 
-    // Copre system
+    template <typename T>
+    bool SaveArrayToPrecompiled(const T* data, std::size_t length) {
+        const std::size_t write_length = precompiled_cache_virtual_file.WriteArray(
+            data, length, precompiled_cache_virtual_file_offset);
+        precompiled_cache_virtual_file_offset += write_length;
+        return write_length == sizeof(T) * length;
+    }
+
+    template <typename T>
+    bool LoadArrayFromPrecompiled(T* data, std::size_t length) {
+        const std::size_t read_length = precompiled_cache_virtual_file.ReadArray(
+            data, length, precompiled_cache_virtual_file_offset);
+        precompiled_cache_virtual_file_offset += read_length;
+        return read_length == sizeof(T) * length;
+    }
+
+    template <typename T>
+    bool SaveObjectToPrecompiled(const T& object) {
+        return SaveArrayToPrecompiled(&object, 1);
+    }
+
+    bool SaveObjectToPrecompiled(bool object) {
+        const auto value = static_cast<u8>(object);
+        return SaveArrayToPrecompiled(&value, 1);
+    }
+
+    template <typename T>
+    bool LoadObjectFromPrecompiled(T& object) {
+        return LoadArrayFromPrecompiled(&object, 1);
+    }
+
+    bool LoadObjectFromPrecompiled(bool& object) {
+        u8 value;
+        const bool read_ok = LoadArrayFromPrecompiled(&value, 1);
+        if (!read_ok) {
+            return false;
+        }
+
+        object = value != 0;
+        return true;
+    }
+
+    // Core system
     Core::System& system;
 
     // Stored transferable shaders
     std::map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable;
 
+    // Stores the whole precompiled cache which will be read from/saved to the precompiled cache file
+    FileSys::VectorVfsFile precompiled_cache_virtual_file;
+    // Stores the current offset of the precompiled cache file for IO purposes
+    std::size_t precompiled_cache_virtual_file_offset = 0;
+
     // The cache has been loaded at boot
     bool tried_to_load{};
 };
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 8763d9c71..7ab0b4553 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -16,7 +16,7 @@ using VideoCommon::Shader::ShaderIR;
 
 static constexpr u32 PROGRAM_OFFSET{10};
 
-ProgramResult GenerateVertexShader(const ShaderSetup& setup) {
+ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) {
     const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
 
     std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
@@ -33,15 +33,16 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
}; )"; - ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); - ProgramResult program = Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex"); + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); + ProgramResult program = + Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex"); out += program.first; if (setup.IsDualProgram()) { - ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET); + const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET); ProgramResult program_b = - Decompile(program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b"); + Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b"); out += program_b.first; } @@ -57,6 +58,9 @@ void main() { } out += R"( + + // Set Position Y direction + position.y *= utof(config_pack[2]); // Check if the flip stage is VertexB // Config pack's second value is flip_stage if (config_pack[1] == 1) { @@ -72,10 +76,10 @@ void main() { } })"; - return {out, program.second}; + return {std::move(out), std::move(program.second)}; } -ProgramResult GenerateGeometryShader(const ShaderSetup& setup) { +ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; @@ -93,9 +97,9 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config { }; )"; - ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); ProgramResult program = - Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry"); + Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry"); out += program.first; out += R"( @@ -103,10 +107,10 @@ void main() { execute_geometry(); };)"; - return {out, program.second}; + return {std::move(out), std::move(program.second)}; } -ProgramResult GenerateFragmentShader(const ShaderSetup& setup) { +ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; @@ -156,9 +160,9 @@ bool AlphaFunc(in float value) { } )"; - ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); + const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); ProgramResult program = - Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment"); + Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment"); out += program.first; @@ -168,7 +172,7 @@ void main() { } )"; - return {out, program.second}; + return {std::move(out), std::move(program.second)}; } } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h index fad346b48..0536c8a03 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ b/src/video_core/renderer_opengl/gl_shader_gen.h @@ -10,6 +10,10 @@ #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/shader/shader_ir.h" +namespace OpenGL { +class Device; +} + namespace OpenGL::GLShader { using VideoCommon::Shader::ProgramCode; @@ -39,22 +43,13 @@ private: bool has_program_b{}; }; -/** - * Generates the GLSL vertex shader program source code for the given VS program - * @returns String of the shader source code - */ -ProgramResult 
GenerateVertexShader(const ShaderSetup& setup); - -/** - * Generates the GLSL geometry shader program source code for the given GS program - * @returns String of the shader source code - */ -ProgramResult GenerateGeometryShader(const ShaderSetup& setup); - -/** - * Generates the GLSL fragment shader program source code for the given FS program - * @returns String of the shader source code - */ -ProgramResult GenerateFragmentShader(const ShaderSetup& setup); +/// Generates the GLSL vertex shader program source code for the given VS program +ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup); + +/// Generates the GLSL geometry shader program source code for the given GS program +ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup); + +/// Generates the GLSL fragment shader program source code for the given FS program +ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup); } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 52d569a1b..7425fbe5d 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -471,8 +471,9 @@ void OpenGLState::ApplyTextures() const { const auto& texture_unit = texture_units[i]; auto& cur_state_texture_unit = cur_state.texture_units[i]; textures[i] = texture_unit.texture; - if (cur_state_texture_unit.texture == textures[i]) + if (cur_state_texture_unit.texture == textures[i]) { continue; + } cur_state_texture_unit.texture = textures[i]; if (!has_delta) { first = i; @@ -493,10 +494,11 @@ void OpenGLState::ApplySamplers() const { std::array<GLuint, Maxwell::NumTextureSamplers> samplers; for (std::size_t i = 0; i < std::size(samplers); ++i) { - if (cur_state.texture_units[i].sampler == texture_units[i].sampler) + samplers[i] = texture_units[i].sampler; + if (cur_state.texture_units[i].sampler == texture_units[i].sampler) { continue; + } cur_state.texture_units[i].sampler = texture_units[i].sampler; - samplers[i] = texture_units[i].sampler; if (!has_delta) { first = i; has_delta = true; diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index a8833c06e..ed7b5cff0 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -27,8 +27,7 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs; inline GLenum VertexType(Maxwell::VertexAttribute attrib) { switch (attrib.type) { case Maxwell::VertexAttribute::Type::UnsignedInt: - case Maxwell::VertexAttribute::Type::UnsignedNorm: { - + case Maxwell::VertexAttribute::Type::UnsignedNorm: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: @@ -47,16 +46,13 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { return GL_UNSIGNED_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); + UNREACHABLE(); + return {}; } - - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); - return {}; - } - case Maxwell::VertexAttribute::Type::SignedInt: - case Maxwell::VertexAttribute::Type::SignedNorm: { - + case Maxwell::VertexAttribute::Type::SignedNorm: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case 
Maxwell::VertexAttribute::Size::Size_8_8: @@ -75,14 +71,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { return GL_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_INT_2_10_10_10_REV; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); + UNREACHABLE(); + return {}; } - - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); - return {}; - } - - case Maxwell::VertexAttribute::Type::Float: { + case Maxwell::VertexAttribute::Type::Float: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_16: case Maxwell::VertexAttribute::Size::Size_16_16: @@ -94,13 +88,16 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32: case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); + UNREACHABLE(); + return {}; } + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); + UNREACHABLE(); + return {}; } - } - - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); - UNREACHABLE(); - return {}; } inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { @@ -129,10 +126,13 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) { return GL_TRIANGLES; case Maxwell::PrimitiveTopology::TriangleStrip: return GL_TRIANGLE_STRIP; + case Maxwell::PrimitiveTopology::TriangleFan: + return GL_TRIANGLE_FAN; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology)); + UNREACHABLE(); + return {}; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology)); - UNREACHABLE(); - return {}; } inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, @@ -186,9 +186,10 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { } else { return GL_MIRROR_CLAMP_TO_EDGE; } + default: + LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); + return GL_REPEAT; } - LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); - return GL_REPEAT; } inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 34bf26ff2..9fe1e3280 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -62,9 +62,10 @@ vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode) { case Tegra::Texture::WrapMode::MirrorOnceBorder: UNIMPLEMENTED(); return vk::SamplerAddressMode::eMirrorClampToEdge; + default: + UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode)); + return {}; } - UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode)); - return {}; } vk::CompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func) { @@ -225,9 +226,10 @@ vk::PrimitiveTopology PrimitiveTopology(Maxwell::PrimitiveTopology topology) { return vk::PrimitiveTopology::eTriangleList; case Maxwell::PrimitiveTopology::TriangleStrip: return vk::PrimitiveTopology::eTriangleStrip; + default: + UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology)); + return {}; } - UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology)); - return {}; } 
vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) { diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 08b786aad..3edf460df 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -49,9 +49,6 @@ public: return alignment; } - // We do not have to flush this cache as things in it are never modified by us. - void Flush() override {} - private: VAddr cpu_addr{}; std::size_t size{}; @@ -87,6 +84,10 @@ public: return buffer_handle; } +protected: + // We do not have to flush this cache as things in it are never modified by us. + void FlushObjectInner(const std::shared_ptr<CachedBufferEntry>& object) override {} + private: void AlignBuffer(std::size_t alignment); diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp index ed3178f09..801826d3d 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp @@ -7,7 +7,6 @@ #include <unordered_map> #include "common/assert.h" -#include "common/cityhash.h" #include "video_core/renderer_vulkan/declarations.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_sampler_cache.h" @@ -28,39 +27,20 @@ static std::optional<vk::BorderColor> TryConvertBorderColor(std::array<float, 4> } } -std::size_t SamplerCacheKey::Hash() const { - static_assert(sizeof(raw) % sizeof(u64) == 0); - return static_cast<std::size_t>( - Common::CityHash64(reinterpret_cast<const char*>(raw.data()), sizeof(raw) / sizeof(u64))); -} - -bool SamplerCacheKey::operator==(const SamplerCacheKey& rhs) const { - return raw == rhs.raw; -} - VKSamplerCache::VKSamplerCache(const VKDevice& device) : device{device} {} VKSamplerCache::~VKSamplerCache() = default; -vk::Sampler VKSamplerCache::GetSampler(const Tegra::Texture::TSCEntry& tsc) { - const auto [entry, is_cache_miss] = cache.try_emplace(SamplerCacheKey{tsc}); - auto& sampler = entry->second; - if (is_cache_miss) { - sampler = CreateSampler(tsc); - } - return *sampler; -} - -UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) { - const float max_anisotropy = tsc.GetMaxAnisotropy(); - const bool has_anisotropy = max_anisotropy > 1.0f; +UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const { + const float max_anisotropy{tsc.GetMaxAnisotropy()}; + const bool has_anisotropy{max_anisotropy > 1.0f}; - const auto border_color = tsc.GetBorderColor(); - const auto vk_border_color = TryConvertBorderColor(border_color); + const auto border_color{tsc.GetBorderColor()}; + const auto vk_border_color{TryConvertBorderColor(border_color)}; UNIMPLEMENTED_IF_MSG(!vk_border_color, "Unimplemented border color {} {} {} {}", border_color[0], border_color[1], border_color[2], border_color[3]); - constexpr bool unnormalized_coords = false; + constexpr bool unnormalized_coords{false}; const vk::SamplerCreateInfo sampler_ci( {}, MaxwellToVK::Sampler::Filter(tsc.mag_filter), @@ -73,9 +53,13 @@ UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) tsc.GetMaxLod(), vk_border_color.value_or(vk::BorderColor::eFloatTransparentBlack), unnormalized_coords); - const auto& dld = device.GetDispatchLoader(); - const auto dev = device.GetLogical(); + const auto& dld{device.GetDispatchLoader()}; + const auto dev{device.GetLogical()}; return 
dev.createSamplerUnique(sampler_ci, nullptr, dld); } +vk::Sampler VKSamplerCache::ToSamplerType(const UniqueSampler& sampler) const { + return *sampler; +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.h b/src/video_core/renderer_vulkan/vk_sampler_cache.h index c6394dc87..771b05c73 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.h +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.h @@ -8,49 +8,25 @@ #include "common/common_types.h" #include "video_core/renderer_vulkan/declarations.h" +#include "video_core/sampler_cache.h" #include "video_core/textures/texture.h" namespace Vulkan { class VKDevice; -struct SamplerCacheKey final : public Tegra::Texture::TSCEntry { - std::size_t Hash() const; - - bool operator==(const SamplerCacheKey& rhs) const; - - bool operator!=(const SamplerCacheKey& rhs) const { - return !operator==(rhs); - } -}; - -} // namespace Vulkan - -namespace std { - -template <> -struct hash<Vulkan::SamplerCacheKey> { - std::size_t operator()(const Vulkan::SamplerCacheKey& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std - -namespace Vulkan { - -class VKSamplerCache { +class VKSamplerCache final : public VideoCommon::SamplerCache<vk::Sampler, UniqueSampler> { public: explicit VKSamplerCache(const VKDevice& device); ~VKSamplerCache(); - vk::Sampler GetSampler(const Tegra::Texture::TSCEntry& tsc); +protected: + UniqueSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const; -private: - UniqueSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc); + vk::Sampler ToSamplerType(const UniqueSampler& sampler) const; +private: const VKDevice& device; - std::unordered_map<SamplerCacheKey, UniqueSampler> cache; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index e0a6f5e87..a11000f6b 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -76,14 +76,10 @@ constexpr u32 GetGenericAttributeLocation(Attribute::Index attribute) { /// Returns true if an object has to be treated as precise bool IsPrecise(Operation operand) { - const auto& meta = operand.GetMeta(); - + const auto& meta{operand.GetMeta()}; if (std::holds_alternative<MetaArithmetic>(meta)) { return std::get<MetaArithmetic>(meta).precise; } - if (std::holds_alternative<MetaHalfArithmetic>(meta)) { - return std::get<MetaHalfArithmetic>(meta).precise; - } return false; } @@ -191,8 +187,9 @@ public: for (const auto& cbuf : ir.GetConstantBuffers()) { entries.const_buffers.emplace_back(cbuf.second, cbuf.first); } - for (const auto& gmem : ir.GetGlobalMemoryBases()) { - entries.global_buffers.emplace_back(gmem.cbuf_index, gmem.cbuf_offset); + for (const auto& gmem_pair : ir.GetGlobalMemory()) { + const auto& [base, usage] = gmem_pair; + entries.global_buffers.emplace_back(base.cbuf_index, base.cbuf_offset); } for (const auto& sampler : ir.GetSamplers()) { entries.samplers.emplace_back(sampler); @@ -225,7 +222,7 @@ private: return current_binding; }; const_buffers_base_binding = Allocate(ir.GetConstantBuffers().size()); - global_buffers_base_binding = Allocate(ir.GetGlobalMemoryBases().size()); + global_buffers_base_binding = Allocate(ir.GetGlobalMemory().size()); samplers_base_binding = Allocate(ir.GetSamplers().size()); ASSERT_MSG(binding_iterator - binding_base < STAGE_BINDING_STRIDE, @@ -318,7 +315,6 @@ private: constexpr std::array<const char*, INTERNAL_FLAGS_COUNT> 
names = {"zero", "sign", "carry", "overflow"}; for (std::size_t flag = 0; flag < INTERNAL_FLAGS_COUNT; ++flag) { - const auto flag_code = static_cast<InternalFlag>(flag); const Id id = OpVariable(t_prv_bool, spv::StorageClass::Private, v_false); internal_flags[flag] = AddGlobalVariable(Name(id, names[flag])); } @@ -390,14 +386,15 @@ private: void DeclareGlobalBuffers() { u32 binding = global_buffers_base_binding; - for (const auto& entry : ir.GetGlobalMemoryBases()) { + for (const auto& entry : ir.GetGlobalMemory()) { + const auto [base, usage] = entry; const Id id = OpVariable(t_gmem_ssbo, spv::StorageClass::StorageBuffer); AddGlobalVariable( - Name(id, fmt::format("gmem_{}_{}", entry.cbuf_index, entry.cbuf_offset))); + Name(id, fmt::format("gmem_{}_{}", base.cbuf_index, base.cbuf_offset))); Decorate(id, spv::Decoration::Binding, binding++); Decorate(id, spv::Decoration::DescriptorSet, DESCRIPTOR_SET); - global_buffers.emplace(entry, id); + global_buffers.emplace(base, id); } } @@ -744,6 +741,16 @@ private: return {}; } + Id HClamp(Operation operation) { + UNIMPLEMENTED(); + return {}; + } + + Id HUnpack(Operation operation) { + UNIMPLEMENTED(); + return {}; + } + Id HMergeF32(Operation operation) { UNIMPLEMENTED(); return {}; @@ -1216,6 +1223,8 @@ private: &SPIRVDecompiler::Ternary<&Module::OpFma, Type::HalfFloat>, &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>, &SPIRVDecompiler::HNegate, + &SPIRVDecompiler::HClamp, + &SPIRVDecompiler::HUnpack, &SPIRVDecompiler::HMergeF32, &SPIRVDecompiler::HMergeH0, &SPIRVDecompiler::HMergeH1, @@ -1258,6 +1267,13 @@ private: &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>, &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>, &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>, + // TODO(Rodrigo): Should these use the OpFUnord* variants? + &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdLessThanEqual, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>, &SPIRVDecompiler::Texture, &SPIRVDecompiler::TextureLod, diff --git a/src/video_core/sampler_cache.cpp b/src/video_core/sampler_cache.cpp new file mode 100644 index 000000000..53c7ef12d --- /dev/null +++ b/src/video_core/sampler_cache.cpp @@ -0,0 +1,21 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include "common/cityhash.h" +#include "common/common_types.h" +#include "video_core/sampler_cache.h" + +namespace VideoCommon { + +std::size_t SamplerCacheKey::Hash() const { + static_assert(sizeof(raw) % sizeof(u64) == 0); + return static_cast<std::size_t>( + Common::CityHash64(reinterpret_cast<const char*>(raw.data()), sizeof(raw) / sizeof(u64))); +} + +bool SamplerCacheKey::operator==(const SamplerCacheKey& rhs) const { + return raw == rhs.raw; +} + +} // namespace VideoCommon diff --git a/src/video_core/sampler_cache.h b/src/video_core/sampler_cache.h new file mode 100644 index 000000000..cbe3ad071 --- /dev/null +++ b/src/video_core/sampler_cache.h @@ -0,0 +1,60 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <cstddef> +#include <unordered_map> + +#include "video_core/textures/texture.h" + +namespace VideoCommon { + +struct SamplerCacheKey final : public Tegra::Texture::TSCEntry { + std::size_t Hash() const; + + bool operator==(const SamplerCacheKey& rhs) const; + + bool operator!=(const SamplerCacheKey& rhs) const { + return !operator==(rhs); + } +}; + +} // namespace VideoCommon + +namespace std { + +template <> +struct hash<VideoCommon::SamplerCacheKey> { + std::size_t operator()(const VideoCommon::SamplerCacheKey& k) const noexcept { + return k.Hash(); + } +}; + +} // namespace std + +namespace VideoCommon { + +template <typename SamplerType, typename SamplerStorageType> +class SamplerCache { +public: + SamplerType GetSampler(const Tegra::Texture::TSCEntry& tsc) { + const auto [entry, is_cache_miss] = cache.try_emplace(SamplerCacheKey{tsc}); + auto& sampler = entry->second; + if (is_cache_miss) { + sampler = CreateSampler(tsc); + } + return ToSamplerType(sampler); + } + +protected: + virtual SamplerStorageType CreateSampler(const Tegra::Texture::TSCEntry& tsc) const = 0; + + virtual SamplerType ToSamplerType(const SamplerStorageType& sampler) const = 0; + +private: + std::unordered_map<SamplerCacheKey, SamplerStorageType> cache; +}; + +} // namespace VideoCommon
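The new SamplerCache template factors the TSC-keyed memoization out of the Vulkan backend so OpenGL can share it. A minimal sketch of how a backend specializes it, assuming the header above; NullSamplerCache and its plain int handles are hypothetical stand-ins for the real gl_sampler_cache/vk_sampler_cache specializations:

    #include "video_core/sampler_cache.h"

    // Hypothetical backend: both the stored object and the handed-out handle are ints.
    class NullSamplerCache final : public VideoCommon::SamplerCache<int, int> {
    protected:
        // Invoked only on a cache miss, once per distinct TSC entry.
        int CreateSampler(const Tegra::Texture::TSCEntry& tsc) const override {
            return 0;
        }

        // Converts the stored object into the handle type GetSampler returns.
        int ToSamplerType(const int& sampler) const override {
            return sampler;
        }
    };

GetSampler deduplicates TSC entries through SamplerCacheKey's CityHash-based hash and only calls CreateSampler when try_emplace reports a miss.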
\ No newline at end of file diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index e4c438792..2da595c0d 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -116,6 +116,8 @@ ExitMethod ShaderIR::Scan(u32 begin, u32 end, std::set<u32>& labels) { // Continue scanning for an exit method. break; } + default: + break; } } return exit_method = ExitMethod::AlwaysReturn; @@ -206,4 +208,4 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) { return pc + 1; } -} // namespace VideoCommon::Shader
\ No newline at end of file +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp index baee89107..2098c1170 100644 --- a/src/video_core/shader/decode/arithmetic_half.cpp +++ b/src/video_core/shader/decode/arithmetic_half.cpp @@ -9,6 +9,7 @@ namespace VideoCommon::Shader { +using Tegra::Shader::HalfType; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; @@ -18,48 +19,50 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { if (opcode->get().GetId() == OpCode::Id::HADD2_C || opcode->get().GetId() == OpCode::Id::HADD2_R) { - UNIMPLEMENTED_IF(instr.alu_half.ftz != 0); + if (instr.alu_half.ftz != 0) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } } - UNIMPLEMENTED_IF_MSG(instr.alu_half.saturate != 0, "Half float saturation not implemented"); const bool negate_a = opcode->get().GetId() != OpCode::Id::HMUL2_R && instr.alu_half.negate_a != 0; const bool negate_b = opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0; - const Node op_a = GetOperandAbsNegHalf(GetRegister(instr.gpr8), instr.alu_half.abs_a, negate_a); - - // instr.alu_half.type_a + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a); + op_a = GetOperandAbsNegHalf(op_a, instr.alu_half.abs_a, negate_a); - Node op_b = [&]() { + auto [type_b, op_b] = [&]() -> std::tuple<HalfType, Node> { switch (opcode->get().GetId()) { case OpCode::Id::HADD2_C: case OpCode::Id::HMUL2_C: - return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + return {HalfType::F32, GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())}; case OpCode::Id::HADD2_R: case OpCode::Id::HMUL2_R: - return GetRegister(instr.gpr20); + return {instr.alu_half.type_b, GetRegister(instr.gpr20)}; default: UNREACHABLE(); - return Immediate(0); + return {HalfType::F32, Immediate(0)}; } }(); - op_b = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b); + op_b = UnpackHalfFloat(op_b, type_b); + // redeclaration to avoid a bug in clang with reusing local bindings in lambdas + Node op_b_alt = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b); Node value = [&]() { - MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a, instr.alu_half.type_b}}; switch (opcode->get().GetId()) { case OpCode::Id::HADD2_C: case OpCode::Id::HADD2_R: - return Operation(OperationCode::HAdd, meta, op_a, op_b); + return Operation(OperationCode::HAdd, PRECISE, op_a, op_b_alt); case OpCode::Id::HMUL2_C: case OpCode::Id::HMUL2_R: - return Operation(OperationCode::HMul, meta, op_a, op_b); + return Operation(OperationCode::HMul, PRECISE, op_a, op_b_alt); default: UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", opcode->get().GetName()); return Immediate(0); } }(); + value = GetSaturatedHalfFloat(value, instr.alu_half.saturate); value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half.merge); SetRegister(bb, instr.gpr0, value); @@ -67,4 +70,4 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { return pc; } -} // namespace VideoCommon::Shader
\ No newline at end of file +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp index c2164ba50..fbcd35b18 100644 --- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp @@ -17,34 +17,33 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); if (opcode->get().GetId() == OpCode::Id::HADD2_IMM) { - UNIMPLEMENTED_IF(instr.alu_half_imm.ftz != 0); + if (instr.alu_half_imm.ftz != 0) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } } else { UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None); } - UNIMPLEMENTED_IF_MSG(instr.alu_half_imm.saturate != 0, - "Half float immediate saturation not implemented"); - Node op_a = GetRegister(instr.gpr8); + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a); op_a = GetOperandAbsNegHalf(op_a, instr.alu_half_imm.abs_a, instr.alu_half_imm.negate_a); const Node op_b = UnpackHalfImmediate(instr, true); Node value = [&]() { - MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a}}; switch (opcode->get().GetId()) { case OpCode::Id::HADD2_IMM: - return Operation(OperationCode::HAdd, meta, op_a, op_b); + return Operation(OperationCode::HAdd, PRECISE, op_a, op_b); case OpCode::Id::HMUL2_IMM: - return Operation(OperationCode::HMul, meta, op_a, op_b); + return Operation(OperationCode::HMul, PRECISE, op_a, op_b); default: UNREACHABLE(); return Immediate(0); } }(); - value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge); + value = GetSaturatedHalfFloat(value, instr.alu_half_imm.saturate); + value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge); SetRegister(bb, instr.gpr0, value); - return pc; } diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp index 55a6fbbf2..b5ec9a6f5 100644 --- a/src/video_core/shader/decode/conversion.cpp +++ b/src/video_core/shader/decode/conversion.cpp @@ -18,13 +18,29 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); switch (opcode->get().GetId()) { - case OpCode::Id::I2I_R: { + case OpCode::Id::I2I_R: + case OpCode::Id::I2I_C: + case OpCode::Id::I2I_IMM: { UNIMPLEMENTED_IF(instr.conversion.selector); + UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word); + UNIMPLEMENTED_IF(instr.alu.saturate_d); const bool input_signed = instr.conversion.is_input_signed; const bool output_signed = instr.conversion.is_output_signed; - Node value = GetRegister(instr.gpr20); + Node value = [&]() { + switch (opcode->get().GetId()) { + case OpCode::Id::I2I_R: + return GetRegister(instr.gpr20); + case OpCode::Id::I2I_C: + return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::I2I_IMM: + return Immediate(instr.alu.GetSignedImm20_20()); + default: + UNREACHABLE(); + return Immediate(0); + } + }(); value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed); value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, instr.conversion.negate_a, @@ -38,17 +54,24 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::I2F_R: - case OpCode::Id::I2F_C: { - UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word); + case OpCode::Id::I2F_C: + case OpCode::Id::I2F_IMM: { + 
UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word); UNIMPLEMENTED_IF(instr.conversion.selector); UNIMPLEMENTED_IF_MSG(instr.generates_cc, "Condition codes generation in I2F is not implemented"); Node value = [&]() { - if (instr.is_b_gpr) { + switch (opcode->get().GetId()) { + case OpCode::Id::I2F_R: return GetRegister(instr.gpr20); - } else { + case OpCode::Id::I2F_C: return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::I2F_IMM: + return Immediate(instr.alu.GetSignedImm20_20()); + default: + UNREACHABLE(); + return Immediate(0); } }(); const bool input_signed = instr.conversion.is_input_signed; @@ -62,24 +85,31 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::F2F_R: - case OpCode::Id::F2F_C: { - UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word); - UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word); + case OpCode::Id::F2F_C: + case OpCode::Id::F2F_IMM: { + UNIMPLEMENTED_IF(instr.conversion.f2f.dst_size != Register::Size::Word); + UNIMPLEMENTED_IF(instr.conversion.f2f.src_size != Register::Size::Word); UNIMPLEMENTED_IF_MSG(instr.generates_cc, "Condition codes generation in F2F is not implemented"); Node value = [&]() { - if (instr.is_b_gpr) { + switch (opcode->get().GetId()) { + case OpCode::Id::F2F_R: return GetRegister(instr.gpr20); - } else { + case OpCode::Id::F2F_C: return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::F2F_IMM: + return GetImmediate19(instr); + default: + UNREACHABLE(); + return Immediate(0); } }(); value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a); value = [&]() { - switch (instr.conversion.f2f.rounding) { + switch (instr.conversion.f2f.GetRoundingMode()) { case Tegra::Shader::F2fRoundingOp::None: return value; case Tegra::Shader::F2fRoundingOp::Round: @@ -90,10 +120,11 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { return Operation(OperationCode::FCeil, PRECISE, value); case Tegra::Shader::F2fRoundingOp::Trunc: return Operation(OperationCode::FTrunc, PRECISE, value); + default: + UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}", + static_cast<u32>(instr.conversion.f2f.rounding.Value())); + return Immediate(0); } - UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}", - static_cast<u32>(instr.conversion.f2f.rounding.Value())); - return Immediate(0); }(); value = GetSaturatedFloat(value, instr.alu.saturate_d); @@ -102,15 +133,22 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::F2I_R: - case OpCode::Id::F2I_C: { + case OpCode::Id::F2I_C: + case OpCode::Id::F2I_IMM: { UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word); UNIMPLEMENTED_IF_MSG(instr.generates_cc, "Condition codes generation in F2I is not implemented"); Node value = [&]() { - if (instr.is_b_gpr) { + switch (opcode->get().GetId()) { + case OpCode::Id::F2I_R: return GetRegister(instr.gpr20); - } else { + case OpCode::Id::F2I_C: return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::F2I_IMM: + return GetImmediate19(instr); + default: + UNREACHABLE(); + return Immediate(0); } }(); @@ -134,7 +172,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { }(); const bool is_signed = instr.conversion.is_output_signed; value = SignedOperation(OperationCode::ICastFloat, is_signed, PRECISE, value); - value = ConvertIntegerSize(value, instr.conversion.dest_size, is_signed); + value = ConvertIntegerSize(value, 
instr.conversion.dst_size, is_signed); SetRegister(bb, instr.gpr0, value); break; diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp index 748368555..1dd94bf9d 100644 --- a/src/video_core/shader/decode/half_set.cpp +++ b/src/video_core/shader/decode/half_set.cpp @@ -18,11 +18,13 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.hset2.ftz != 0); + if (instr.hset2.ftz != 0) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } + + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); + op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a); - // instr.hset2.type_a - // instr.hset2.type_b - Node op_a = GetRegister(instr.gpr8); Node op_b = [&]() { switch (opcode->get().GetId()) { case OpCode::Id::HSET2_R: @@ -32,14 +34,12 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { return Immediate(0); } }(); - - op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a); + op_b = UnpackHalfFloat(op_b, instr.hset2.type_b); op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b); const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); - MetaHalfArithmetic meta{false, {instr.hset2.type_a, instr.hset2.type_b}}; - const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, meta, op_a, op_b); + const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b); const OperationCode combiner = GetPredicateCombiner(instr.hset2.op); diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp index e68512692..6e59eb650 100644 --- a/src/video_core/shader/decode/half_set_predicate.cpp +++ b/src/video_core/shader/decode/half_set_predicate.cpp @@ -19,10 +19,10 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0); - Node op_a = GetRegister(instr.gpr8); + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a); op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a); - const Node op_b = [&]() { + Node op_b = [&]() { switch (opcode->get().GetId()) { case OpCode::Id::HSETP2_R: return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a, @@ -32,6 +32,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { return Immediate(0); } }(); + op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b); // We can't use the constant predicate as destination. ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex)); @@ -42,8 +43,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { const OperationCode pair_combiner = instr.hsetp2.h_and ? 
OperationCode::LogicalAll2 : OperationCode::LogicalAny2; - MetaHalfArithmetic meta = {false, {instr.hsetp2.type_a, instr.hsetp2.type_b}}; - const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, meta, op_a, op_b); + const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b); const Node first_pred = Operation(pair_combiner, comparison); // Set the primary predicate to the result of Predicate OP SecondPredicate diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp index 7a07c5ec6..a425f9eb7 100644 --- a/src/video_core/shader/decode/hfma2.cpp +++ b/src/video_core/shader/decode/hfma2.cpp @@ -27,10 +27,6 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { } constexpr auto identity = HalfType::H0_H1; - - const HalfType type_a = instr.hfma2.type_a; - const Node op_a = GetRegister(instr.gpr8); - bool neg_b{}, neg_c{}; auto [saturate, type_b, op_b, type_c, op_c] = [&]() -> std::tuple<bool, HalfType, Node, HalfType, Node> { @@ -38,15 +34,14 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { case OpCode::Id::HFMA2_CR: neg_b = instr.hfma2.negate_b; neg_c = instr.hfma2.negate_c; - return {instr.hfma2.saturate, instr.hfma2.type_b, + return {instr.hfma2.saturate, HalfType::F32, GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()), instr.hfma2.type_reg39, GetRegister(instr.gpr39)}; case OpCode::Id::HFMA2_RC: neg_b = instr.hfma2.negate_b; neg_c = instr.hfma2.negate_c; return {instr.hfma2.saturate, instr.hfma2.type_reg39, GetRegister(instr.gpr39), - instr.hfma2.type_b, - GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())}; + HalfType::F32, GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())}; case OpCode::Id::HFMA2_RR: neg_b = instr.hfma2.rr.negate_b; neg_c = instr.hfma2.rr.negate_c; @@ -60,13 +55,13 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { return {false, identity, Immediate(0), identity, Immediate(0)}; } }(); - UNIMPLEMENTED_IF_MSG(saturate, "HFMA2 saturation is not implemented"); - op_b = GetOperandAbsNegHalf(op_b, false, neg_b); - op_c = GetOperandAbsNegHalf(op_c, false, neg_c); + const Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hfma2.type_a); + op_b = GetOperandAbsNegHalf(UnpackHalfFloat(op_b, type_b), false, neg_b); + op_c = GetOperandAbsNegHalf(UnpackHalfFloat(op_c, type_c), false, neg_c); - MetaHalfArithmetic meta{true, {type_a, type_b, type_c}}; - Node value = Operation(OperationCode::HFma, meta, op_a, op_b, op_c); + Node value = Operation(OperationCode::HFma, PRECISE, op_a, op_b, op_c); + value = GetSaturatedHalfFloat(value, saturate); value = HalfMerge(GetRegister(instr.gpr0), value, instr.hfma2.merge); SetRegister(bb, instr.gpr0, value); @@ -74,4 +69,4 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { return pc; } -} // namespace VideoCommon::Shader
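Aside: HSET2/HSETP2 now compare the unpacked pairs directly; the h_and bit only changes how the two lane results are reduced. A sketch of that bool2 path, again with plain floats standing in for the half lanes:

    #include <array>

    using f16vec2 = std::array<float, 2>;
    using bool2 = std::array<bool, 2>;

    // Logical2HLessThan and friends compare lane-wise and yield a bool pair.
    bool2 LessThan2H(f16vec2 a, f16vec2 b) {
        return {a[0] < b[0], a[1] < b[1]};
    }

    // h_and selects LogicalAll2 (both lanes) over LogicalAny2 (either lane).
    bool ReducePair(bool2 pair, bool h_and) {
        return h_and ? (pair[0] && pair[1]) : (pair[0] || pair[1]);
    }
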
\ No newline at end of file +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index ea3c71eed..ea1092db1 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -8,6 +8,7 @@ #include "common/assert.h" #include "common/common_types.h" +#include "common/logging/log.h" #include "video_core/engines/shader_bytecode.h" #include "video_core/shader/shader_ir.h" @@ -18,6 +19,23 @@ using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; using Tegra::Shader::Register; +namespace { +u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) { + switch (uniform_type) { + case Tegra::Shader::UniformType::Single: + return 1; + case Tegra::Shader::UniformType::Double: + return 2; + case Tegra::Shader::UniformType::Quad: + case Tegra::Shader::UniformType::UnsignedQuad: + return 4; + default: + UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type)); + return 1; + } +} +} // namespace + u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); @@ -85,8 +103,8 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::LD_L: { - UNIMPLEMENTED_IF_MSG(instr.ld_l.unknown == 1, "LD_L Unhandled mode: {}", - static_cast<u32>(instr.ld_l.unknown.Value())); + LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", + static_cast<u64>(instr.ld_l.unknown.Value())); const auto GetLmem = [&](s32 offset) { ASSERT(offset % 4 == 0); @@ -126,45 +144,15 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::LDG: { - const u32 count = [&]() { - switch (instr.ldg.type) { - case Tegra::Shader::UniformType::Single: - return 1; - case Tegra::Shader::UniformType::Double: - return 2; - case Tegra::Shader::UniformType::Quad: - case Tegra::Shader::UniformType::UnsignedQuad: - return 4; - default: - UNIMPLEMENTED_MSG("Unimplemented LDG size!"); - return 1; - } - }(); - - const Node addr_register = GetRegister(instr.gpr8); - const Node base_address = - TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size())); - const auto cbuf = std::get_if<CbufNode>(base_address); - ASSERT(cbuf != nullptr); - const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset()); - ASSERT(cbuf_offset_imm != nullptr); - const auto cbuf_offset = cbuf_offset_imm->GetValue(); - - bb.push_back(Comment( - fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset))); - - const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset}; - used_global_memory_bases.insert(descriptor); - - const Node immediate_offset = - Immediate(static_cast<u32>(instr.ldg.immediate_offset.Value())); - const Node base_real_address = - Operation(OperationCode::UAdd, NO_PRECISE, immediate_offset, addr_register); + const auto [real_address_base, base_address, descriptor] = + TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8), + static_cast<u32>(instr.ldg.immediate_offset.Value()), false); + const u32 count = GetUniformTypeElementsCount(instr.ldg.type); for (u32 i = 0; i < count; ++i) { const Node it_offset = Immediate(i * 4); const Node real_address = - Operation(OperationCode::UAdd, NO_PRECISE, base_real_address, it_offset); + Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset); const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor)); SetTemporal(bb, i, gmem); @@ -174,6 +162,28 @@ u32 
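Aside: GetUniformTypeElementsCount drives how many consecutive 32-bit loads an LDG expands to. A small sketch of the address expansion the loop performs (hypothetical helper, mirrors the base + 4 * i stride):

    #include <cstdint>
    #include <vector>

    // Single -> 1 word, Double -> 2 words, Quad/UnsignedQuad -> 4 words.
    std::vector<std::uint64_t> ExpandGlobalLoad(std::uint64_t base_address, unsigned count) {
        std::vector<std::uint64_t> addresses;
        addresses.reserve(count);
        for (unsigned i = 0; i < count; ++i) {
            addresses.push_back(base_address + 4 * i); // one u32 per element
        }
        return addresses;
    }
    // e.g. a Double load at 0x100 reads 0x100 and 0x104.
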
ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { } break; } + case OpCode::Id::STG: { + const auto [real_address_base, base_address, descriptor] = + TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8), + static_cast<u32>(instr.stg.immediate_offset.Value()), true); + + // Encode in temporary registers like this: real_base_address, {registers_to_be_written...} + SetTemporal(bb, 0, real_address_base); + + const u32 count = GetUniformTypeElementsCount(instr.stg.type); + for (u32 i = 0; i < count; ++i) { + SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i)); + } + for (u32 i = 0; i < count; ++i) { + const Node it_offset = Immediate(i * 4); + const Node real_address = + Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset); + const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor)); + + bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1))); + } + break; + } case OpCode::Id::ST_A: { UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex, "Indirect attribute loads are not supported"); @@ -205,8 +215,8 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::ST_L: { - UNIMPLEMENTED_IF_MSG(instr.st_l.unknown == 0, "ST_L Unhandled mode: {}", - static_cast<u32>(instr.st_l.unknown.Value())); + LOG_DEBUG(HW_GPU, "ST_L cache management mode: {}", + static_cast<u64>(instr.st_l.cache_management.Value())); const auto GetLmemAddr = [&](s32 offset) { ASSERT(offset % 4 == 0); @@ -236,4 +246,34 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { return pc; } +std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeBlock& bb, + Node addr_register, + u32 immediate_offset, + bool is_write) { + const Node base_address{ + TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))}; + const auto cbuf = std::get_if<CbufNode>(base_address); + ASSERT(cbuf != nullptr); + const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset()); + ASSERT(cbuf_offset_imm != nullptr); + const auto cbuf_offset = cbuf_offset_imm->GetValue(); + + bb.push_back( + Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset))); + + const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset}; + const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor); + auto& usage = entry->second; + if (is_write) { + usage.is_written = true; + } else { + usage.is_read = true; + } + + const auto real_address = + Operation(OperationCode::UAdd, NO_PRECISE, Immediate(immediate_offset), addr_register); + + return {real_address, base_address, descriptor}; +} + } // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index a775b402b..5b033126d 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -40,7 +40,7 @@ static std::size_t GetCoordCount(TextureType texture_type) { u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - + bool is_bindless = false; switch (opcode->get().GetId()) { case OpCode::Id::TEX: { if (instr.tex.UsesMiscMode(TextureMiscMode::NODEP)) { @@ -54,7 +54,25 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { const auto process_mode = instr.tex.GetTextureProcessMode(); WriteTexInstructionFloat( bb, instr, - GetTexCode(instr, texture_type, process_mode, depth_compare, is_array, is_aoffi)); + GetTexCode(instr, texture_type, 
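Aside: TrackAndGetGlobalMemory replaces the old set of bases with a map that records whether each global memory region is read, written, or both. A simplified sketch of that bookkeeping, with the types trimmed down from shader_ir.h:

    #include <map>
    #include <tuple>

    struct GlobalMemoryBase {
        unsigned cbuf_index{};
        unsigned cbuf_offset{};
        bool operator<(const GlobalMemoryBase& rhs) const {
            return std::tie(cbuf_index, cbuf_offset) <
                   std::tie(rhs.cbuf_index, rhs.cbuf_offset);
        }
    };

    struct GlobalMemoryUsage {
        bool is_read{};
        bool is_written{};
    };

    void MarkGlobalMemory(std::map<GlobalMemoryBase, GlobalMemoryUsage>& used,
                          const GlobalMemoryBase& base, bool is_write) {
        auto& usage = used[base]; // value-initialized flags on first insertion
        if (is_write) {
            usage.is_written = true;
        } else {
            usage.is_read = true;
        }
    }
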
process_mode, depth_compare, is_array, is_aoffi, {})); + break; + } + case OpCode::Id::TEX_B: { + UNIMPLEMENTED_IF_MSG(instr.tex.UsesMiscMode(TextureMiscMode::AOFFI), + "AOFFI is not implemented"); + + if (instr.tex.UsesMiscMode(TextureMiscMode::NODEP)) { + LOG_WARNING(HW_GPU, "TEX.NODEP implementation is incomplete"); + } + + const TextureType texture_type{instr.tex_b.texture_type}; + const bool is_array = instr.tex_b.array != 0; + const bool is_aoffi = instr.tex.UsesMiscMode(TextureMiscMode::AOFFI); + const bool depth_compare = instr.tex_b.UsesMiscMode(TextureMiscMode::DC); + const auto process_mode = instr.tex_b.GetTextureProcessMode(); + WriteTexInstructionFloat(bb, instr, + GetTexCode(instr, texture_type, process_mode, depth_compare, + is_array, is_aoffi, {instr.gpr20})); break; } case OpCode::Id::TEXS: { @@ -134,6 +152,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { WriteTexsInstructionFloat(bb, instr, values); break; } + case OpCode::Id::TXQ_B: + is_bindless = true; + [[fallthrough]]; case OpCode::Id::TXQ: { if (instr.txq.UsesMiscMode(TextureMiscMode::NODEP)) { LOG_WARNING(HW_GPU, "TXQ.NODEP implementation is incomplete"); @@ -143,7 +164,10 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { // Sadly, not all texture instructions specify the type of texture their sampler // uses. This must be fixed at a later instance. const auto& sampler = - GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false, false); + is_bindless + ? GetBindlessSampler(instr.gpr8, Tegra::Shader::TextureType::Texture2D, false, + false) + : GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false, false); u32 indexer = 0; switch (instr.txq.query_type) { @@ -154,7 +178,8 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { } MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element}; const Node value = - Operation(OperationCode::TextureQueryDimensions, meta, GetRegister(instr.gpr8)); + Operation(OperationCode::TextureQueryDimensions, meta, + GetRegister(instr.gpr8.Value() + (is_bindless ? 1 : 0))); SetTemporal(bb, indexer++, value); } for (u32 i = 0; i < indexer; ++i) { @@ -168,6 +193,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { } break; } + case OpCode::Id::TMML_B: + is_bindless = true; + [[fallthrough]]; case OpCode::Id::TMML: { UNIMPLEMENTED_IF_MSG(instr.tmml.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV), "NDV is not implemented"); @@ -178,7 +206,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { auto texture_type = instr.tmml.texture_type.Value(); const bool is_array = instr.tmml.array != 0; - const auto& sampler = GetSampler(instr.sampler, texture_type, is_array, false); + const auto& sampler = is_bindless + ? 
GetBindlessSampler(instr.gpr20, texture_type, is_array, false) + : GetSampler(instr.sampler, texture_type, is_array, false); std::vector<Node> coords; @@ -199,17 +229,19 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { coords.push_back(GetRegister(instr.gpr8.Value() + 1)); texture_type = TextureType::Texture2D; } - + u32 indexer = 0; for (u32 element = 0; element < 2; ++element) { + if (!instr.tmml.IsComponentEnabled(element)) { + continue; + } auto params = coords; MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element}; const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params)); - SetTemporal(bb, element, value); + SetTemporal(bb, indexer++, value); } - for (u32 element = 0; element < 2; ++element) { - SetRegister(bb, instr.gpr0.Value() + element, GetTemporal(element)); + for (u32 i = 0; i < indexer; ++i) { + SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i)); } - break; } case OpCode::Id::TLDS: { @@ -254,6 +286,34 @@ const Sampler& ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, Textu return *used_samplers.emplace(entry).first; } +const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg, TextureType type, + bool is_array, bool is_shadow) { + const Node sampler_register = GetRegister(reg); + const Node base_sampler = + TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size())); + const auto cbuf = std::get_if<CbufNode>(base_sampler); + const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset()); + ASSERT(cbuf_offset_imm != nullptr); + const auto cbuf_offset = cbuf_offset_imm->GetValue(); + const auto cbuf_index = cbuf->GetIndex(); + const auto cbuf_key = (static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset); + + // If this sampler has already been used, return the existing mapping. + const auto itr = + std::find_if(used_samplers.begin(), used_samplers.end(), + [&](const Sampler& entry) { return entry.GetOffset() == cbuf_key; }); + if (itr != used_samplers.end()) { + ASSERT(itr->GetType() == type && itr->IsArray() == is_array && + itr->IsShadow() == is_shadow); + return *itr; + } + + // Otherwise create a new mapping for this sampler + const std::size_t next_index = used_samplers.size(); + const Sampler entry{cbuf_index, cbuf_offset, next_index, type, is_array, is_shadow}; + return *used_samplers.emplace(entry).first; +} + void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const Node4& components) { u32 dest_elem = 0; for (u32 elem = 0; elem < 4; ++elem) { @@ -326,22 +386,27 @@ void ShaderIR::WriteTexsInstructionHalfFloat(NodeBlock& bb, Instruction instr, Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, TextureProcessMode process_mode, std::vector<Node> coords, Node array, Node depth_compare, u32 bias_offset, - std::vector<Node> aoffi) { + std::vector<Node> aoffi, + std::optional<Tegra::Shader::Register> bindless_reg) { const bool is_array = array; const bool is_shadow = depth_compare; + const bool is_bindless = bindless_reg.has_value(); UNIMPLEMENTED_IF_MSG((texture_type == TextureType::Texture3D && (is_array || is_shadow)) || (texture_type == TextureType::TextureCube && is_array && is_shadow), "This method is not supported."); - const auto& sampler = GetSampler(instr.sampler, texture_type, is_array, is_shadow); + const auto& sampler = is_bindless + ? 
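Aside: GetBindlessSampler keys bindless samplers by packing the const buffer index and offset into one 64-bit value, the same encoding Sampler::GetBindlessCBuf unpacks later in shader_ir.h. A sketch of the round trip:

    #include <cstdint>
    #include <utility>

    std::uint64_t PackBindlessKey(std::uint32_t cbuf_index, std::uint32_t cbuf_offset) {
        return (static_cast<std::uint64_t>(cbuf_index) << 32) |
               static_cast<std::uint64_t>(cbuf_offset);
    }

    std::pair<std::uint32_t, std::uint32_t> UnpackBindlessKey(std::uint64_t key) {
        return {static_cast<std::uint32_t>(key >> 32), static_cast<std::uint32_t>(key)};
    }
    // PackBindlessKey(3, 0x40) yields 0x300000040; unpacking returns {3, 0x40}.
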
GetBindlessSampler(*bindless_reg, texture_type, is_array, is_shadow) + : GetSampler(instr.sampler, texture_type, is_array, is_shadow); const bool lod_needed = process_mode == TextureProcessMode::LZ || process_mode == TextureProcessMode::LL || process_mode == TextureProcessMode::LLA; - // LOD selection (either via bias or explicit textureLod) not supported in GL for - // sampler2DArrayShadow and samplerCubeArrayShadow. + // LOD selection (either via bias or explicit textureLod) not + // supported in GL for sampler2DArrayShadow and + // samplerCubeArrayShadow. const bool gl_lod_supported = !((texture_type == Tegra::Shader::TextureType::Texture2D && is_array && is_shadow) || (texture_type == Tegra::Shader::TextureType::TextureCube && is_array && is_shadow)); @@ -359,8 +424,9 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, lod = Immediate(0.0f); break; case TextureProcessMode::LB: - // If present, lod or bias are always stored in the register indexed by the gpr20 - // field with an offset depending on the usage of the other registers + // If present, lod or bias are always stored in the register + // indexed by the gpr20 field with an offset depending on the + // usage of the other registers bias = GetRegister(instr.gpr20.Value() + bias_offset); break; case TextureProcessMode::LL: @@ -384,11 +450,18 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type, TextureProcessMode process_mode, bool depth_compare, bool is_array, - bool is_aoffi) { + bool is_aoffi, std::optional<Tegra::Shader::Register> bindless_reg) { const bool lod_bias_enabled{ (process_mode != TextureProcessMode::None && process_mode != TextureProcessMode::LZ)}; + const bool is_bindless = bindless_reg.has_value(); + u64 parameter_register = instr.gpr20.Value(); + if (is_bindless) { + ++parameter_register; + } + + const u32 bias_lod_offset = (is_bindless ? 1 : 0); if (lod_bias_enabled) { ++parameter_register; } @@ -423,7 +496,8 @@ Node4 ShaderIR::GetTexCode(Instruction instr, TextureType texture_type, dc = GetRegister(parameter_register++); } - return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, 0, aoffi); + return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_lod_offset, + aoffi, bindless_reg); } Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type, @@ -459,14 +533,13 @@ Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type, dc = GetRegister(depth_register); } - return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_offset, {}); + return GetTextureCode(instr, texture_type, process_mode, coords, array, dc, bias_offset, {}, + {}); } Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool depth_compare, bool is_array, bool is_aoffi) { const std::size_t coord_count = GetCoordCount(texture_type); - const std::size_t total_coord_count = coord_count + (is_array ? 1 : 0); - const std::size_t total_reg_count = total_coord_count + (depth_compare ? 
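Aside: in GetTexCode, the bindless handle and any lod/bias each consume one register slot after gpr20, which is why parameter_register is bumped before the remaining operands are gathered. The layout logic, pulled out into a standalone sketch:

    struct TexParameterLayout {
        unsigned parameter_register; // first register holding aoffi/depth-compare data
        unsigned bias_lod_offset;    // extra offset applied when reading lod or bias
    };

    TexParameterLayout LayoutTexParameters(unsigned gpr20, bool is_bindless,
                                           bool lod_bias_enabled) {
        unsigned reg = gpr20;
        if (is_bindless) {
            ++reg; // gpr20 itself carries the bindless sampler handle
        }
        const unsigned bias_lod_offset = is_bindless ? 1u : 0u;
        if (lod_bias_enabled) {
            ++reg; // lod or bias occupies the next slot
        }
        return {reg, bias_lod_offset};
    }
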
1 : 0); // If enabled arrays index is always stored in the gpr8 field const u64 array_register = instr.gpr8.Value(); diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp index db15c0718..04a776398 100644 --- a/src/video_core/shader/decode/xmad.cpp +++ b/src/video_core/shader/decode/xmad.cpp @@ -56,9 +56,10 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { instr.xmad.mode, Immediate(static_cast<u32>(instr.xmad.imm20_16)), GetRegister(instr.gpr39)}; + default: + UNIMPLEMENTED_MSG("Unhandled XMAD instruction: {}", opcode->get().GetName()); + return {false, false, false, Tegra::Shader::XmadMode::None, Immediate(0), Immediate(0)}; } - UNIMPLEMENTED_MSG("Unhandled XMAD instruction: {}", opcode->get().GetName()); - return {false, false, false, Tegra::Shader::XmadMode::None, Immediate(0), Immediate(0)}; }(); op_a = BitfieldExtract(op_a, instr.xmad.high_a ? 16 : 0, 16); diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index ac5112d78..196235e5d 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -21,6 +21,13 @@ using Tegra::Shader::PredCondition; using Tegra::Shader::PredOperation; using Tegra::Shader::Register; +ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset) + : program_code{program_code}, main_offset{main_offset} { + Decode(); +} + +ShaderIR::~ShaderIR() = default; + Node ShaderIR::StoreNode(NodeData&& node_data) { auto store = std::make_unique<NodeData>(node_data); const Node node = store.get(); @@ -189,7 +196,11 @@ Node ShaderIR::UnpackHalfImmediate(Instruction instr, bool has_negation) { const Node first_negate = GetPredicate(instr.half_imm.first_negate != 0); const Node second_negate = GetPredicate(instr.half_imm.second_negate != 0); - return Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, first_negate, second_negate); + return Operation(OperationCode::HNegate, NO_PRECISE, value, first_negate, second_negate); +} + +Node ShaderIR::UnpackHalfFloat(Node value, Tegra::Shader::HalfType type) { + return Operation(OperationCode::HUnpack, type, value); } Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) { @@ -209,17 +220,26 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) { Node ShaderIR::GetOperandAbsNegHalf(Node value, bool absolute, bool negate) { if (absolute) { - value = Operation(OperationCode::HAbsolute, HALF_NO_PRECISE, value); + value = Operation(OperationCode::HAbsolute, NO_PRECISE, value); } if (negate) { - value = Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, GetPredicate(true), + value = Operation(OperationCode::HNegate, NO_PRECISE, value, GetPredicate(true), GetPredicate(true)); } return value; } +Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) { + if (!saturate) { + return value; + } + const Node positive_zero = Immediate(std::copysignf(0, 1)); + const Node positive_one = Immediate(1.0f); + return Operation(OperationCode::HClamp, NO_PRECISE, value, positive_zero, positive_one); +} + Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) { - static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { + const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { {PredCondition::LessThan, OperationCode::LogicalFLessThan}, {PredCondition::Equal, OperationCode::LogicalFEqual}, {PredCondition::LessEqual, OperationCode::LogicalFLessEqual}, @@ -255,7 +275,7 @@ Node 
ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, N Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_signed, Node op_a, Node op_b) { - static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { + const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { {PredCondition::LessThan, OperationCode::LogicalILessThan}, {PredCondition::Equal, OperationCode::LogicalIEqual}, {PredCondition::LessEqual, OperationCode::LogicalILessEqual}, @@ -283,40 +303,32 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si return predicate; } -Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, - const MetaHalfArithmetic& meta, Node op_a, Node op_b) { - - UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan || - condition == PredCondition::NotEqualWithNan || - condition == PredCondition::LessEqualWithNan || - condition == PredCondition::GreaterThanWithNan || - condition == PredCondition::GreaterEqualWithNan, - "Unimplemented NaN comparison for half floats"); - - static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { +Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a, + Node op_b) { + const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { {PredCondition::LessThan, OperationCode::Logical2HLessThan}, {PredCondition::Equal, OperationCode::Logical2HEqual}, {PredCondition::LessEqual, OperationCode::Logical2HLessEqual}, {PredCondition::GreaterThan, OperationCode::Logical2HGreaterThan}, {PredCondition::NotEqual, OperationCode::Logical2HNotEqual}, {PredCondition::GreaterEqual, OperationCode::Logical2HGreaterEqual}, - {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThan}, - {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqual}, - {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqual}, - {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThan}, - {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqual}}; + {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThanWithNan}, + {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqualWithNan}, + {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqualWithNan}, + {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThanWithNan}, + {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqualWithNan}}; const auto comparison{PredicateComparisonTable.find(condition)}; UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(), "Unknown predicate comparison operation"); - const Node predicate = Operation(comparison->second, meta, op_a, op_b); + const Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b); return predicate; } OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) { - static const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = { + const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = { {PredOperation::And, OperationCode::LogicalAnd}, {PredOperation::Or, OperationCode::LogicalOr}, {PredOperation::Xor, OperationCode::LogicalXor}, @@ -434,11 +446,14 @@ Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) { return OperationCode::LogicalUGreaterEqual; case OperationCode::INegate: UNREACHABLE_MSG("Can't negate an unsigned integer"); + return {}; case 
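Aside: the new *WithNan opcodes let the backend emit comparisons that also pass when either operand is NaN, instead of silently reusing the ordered variants. A scalar sketch of the intended semantics, inferred from the opcode names rather than taken from decompiler code:

    #include <cmath>

    // Ordered comparison that additionally succeeds on NaN operands.
    bool LessThanWithNan(float a, float b) {
        return (a < b) || std::isnan(a) || std::isnan(b);
    }

    bool GreaterEqualWithNan(float a, float b) {
        return (a >= b) || std::isnan(a) || std::isnan(b);
    }
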
OperationCode::IAbsolute: UNREACHABLE_MSG("Can't apply absolute to an unsigned integer"); + return {}; + default: + UNREACHABLE_MSG("Unknown signed operation with code={}", static_cast<u32>(operation_code)); + return {}; } - UNREACHABLE_MSG("Unknown signed operation with code={}", static_cast<u32>(operation_code)); - return {}; } -} // namespace VideoCommon::Shader
\ No newline at end of file +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 4888998d3..e4253fdb3 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -109,11 +109,13 @@ enum class OperationCode { UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint UBitCount, /// (MetaArithmetic, uint) -> uint - HAdd, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 - HMul, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 - HFma, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2 + HAdd, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 + HMul, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 + HFma, /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2 HAbsolute, /// (f16vec2 a) -> f16vec2 HNegate, /// (f16vec2 a, bool first, bool second) -> f16vec2 + HClamp, /// (f16vec2 src, float min, float max) -> f16vec2 + HUnpack, /// (Tegra::Shader::HalfType, T value) -> f16vec2 HMergeF32, /// (f16vec2 src) -> float HMergeH0, /// (f16vec2 dest, f16vec2 src) -> f16vec2 HMergeH1, /// (f16vec2 dest, f16vec2 src) -> f16vec2 @@ -150,12 +152,18 @@ enum class OperationCode { LogicalUNotEqual, /// (uint a, uint b) -> bool LogicalUGreaterEqual, /// (uint a, uint b) -> bool - Logical2HLessThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HLessEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HGreaterThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HNotEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HGreaterEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HNotEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessThanWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterThanWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HNotEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 Texture, /// (MetaTexture, float[N] coords) -> float4 TextureLod, /// (MetaTexture, float[N] coords) -> float4 @@ -196,9 +204,23 @@ enum class ExitMethod { class Sampler { public: + // Use this constructor for bounded Samplers explicit Sampler(std::size_t offset, std::size_t index, Tegra::Shader::TextureType type, bool is_array, bool is_shadow) - : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow} {} + : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow}, + is_bindless{false} {} + + // Use this constructor for bindless Samplers + explicit Sampler(u32 cbuf_index, u32 cbuf_offset, std::size_t index, + Tegra::Shader::TextureType type, bool is_array, bool is_shadow) + : offset{(static_cast<u64>(cbuf_index) << 32) | 
cbuf_offset}, index{index}, type{type}, + is_array{is_array}, is_shadow{is_shadow}, is_bindless{true} {} + + // Use this only for serialization/deserialization + explicit Sampler(std::size_t offset, std::size_t index, Tegra::Shader::TextureType type, + bool is_array, bool is_shadow, bool is_bindless) + : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow}, + is_bindless{is_bindless} {} std::size_t GetOffset() const { return offset; @@ -220,9 +242,18 @@ public: return is_shadow; } + bool IsBindless() const { + return is_bindless; + } + + std::pair<u32, u32> GetBindlessCBuf() const { + return {static_cast<u32>(offset >> 32), static_cast<u32>(offset)}; + } + bool operator<(const Sampler& rhs) const { - return std::tie(offset, index, type, is_array, is_shadow) < - std::tie(rhs.offset, rhs.index, rhs.type, rhs.is_array, rhs.is_shadow); + return std::tie(index, offset, type, is_array, is_shadow, is_bindless) < + std::tie(rhs.index, rhs.offset, rhs.type, rhs.is_array, rhs.is_shadow, + rhs.is_bindless); } private: @@ -231,8 +262,9 @@ private: std::size_t offset{}; std::size_t index{}; ///< Value used to index into the generated GLSL sampler array. Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc) - bool is_array{}; ///< Whether the texture is being sampled as an array texture or not. - bool is_shadow{}; ///< Whether the texture is being sampled as a depth texture or not. + bool is_array{}; ///< Whether the texture is being sampled as an array texture or not. + bool is_shadow{}; ///< Whether the texture is being sampled as a depth texture or not. + bool is_bindless{}; ///< Whether this sampler belongs to a bindless texture or not. }; class ConstBuffer { @@ -276,15 +308,13 @@ struct GlobalMemoryBase { } }; -struct MetaArithmetic { - bool precise{}; +struct GlobalMemoryUsage { + bool is_read{}; + bool is_written{}; }; -struct MetaHalfArithmetic { +struct MetaArithmetic { bool precise{}; - std::array<Tegra::Shader::HalfType, 3> types = {Tegra::Shader::HalfType::H0_H1, - Tegra::Shader::HalfType::H0_H1, - Tegra::Shader::HalfType::H0_H1}; }; struct MetaTexture { @@ -300,39 +330,29 @@ struct MetaTexture { constexpr MetaArithmetic PRECISE = {true}; constexpr MetaArithmetic NO_PRECISE = {false}; -constexpr MetaHalfArithmetic HALF_NO_PRECISE = {false}; -using Meta = std::variant<MetaArithmetic, MetaHalfArithmetic, MetaTexture>; +using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>; /// Holds any kind of operation that can be done in the IR class OperationNode final { public: - template <typename... T> - explicit constexpr OperationNode(OperationCode code) : code{code}, meta{} {} + explicit OperationNode(OperationCode code) : code{code} {} - template <typename... T> - explicit constexpr OperationNode(OperationCode code, Meta&& meta) - : code{code}, meta{std::move(meta)} {} + explicit OperationNode(OperationCode code, Meta&& meta) : code{code}, meta{std::move(meta)} {} template <typename... T> - explicit constexpr OperationNode(OperationCode code, const T*... operands) + explicit OperationNode(OperationCode code, const T*... operands) : OperationNode(code, {}, operands...) {} template <typename... T> - explicit constexpr OperationNode(OperationCode code, Meta&& meta, const T*... 
operands_) - : code{code}, meta{std::move(meta)} { - - auto operands_list = {operands_...}; - for (auto& operand : operands_list) { - operands.push_back(operand); - } - } + explicit OperationNode(OperationCode code, Meta&& meta, const T*... operands_) + : code{code}, meta{std::move(meta)}, operands{operands_...} {} explicit OperationNode(OperationCode code, Meta&& meta, std::vector<Node>&& operands) : code{code}, meta{meta}, operands{std::move(operands)} {} explicit OperationNode(OperationCode code, std::vector<Node>&& operands) - : code{code}, meta{}, operands{std::move(operands)} {} + : code{code}, operands{std::move(operands)} {} OperationCode GetCode() const { return code; @@ -538,11 +558,8 @@ private: class ShaderIR final { public: - explicit ShaderIR(const ProgramCode& program_code, u32 main_offset) - : program_code{program_code}, main_offset{main_offset} { - - Decode(); - } + explicit ShaderIR(const ProgramCode& program_code, u32 main_offset); + ~ShaderIR(); const std::map<u32, NodeBlock>& GetBasicBlocks() const { return basic_blocks; @@ -578,8 +595,8 @@ public: return used_clip_distances; } - const std::set<GlobalMemoryBase>& GetGlobalMemoryBases() const { - return used_global_memory_bases; + const std::map<GlobalMemoryBase, GlobalMemoryUsage>& GetGlobalMemory() const { + return used_global_memory; } std::size_t GetLength() const { @@ -706,10 +723,14 @@ private: /// Unpacks a half immediate from an instruction Node UnpackHalfImmediate(Tegra::Shader::Instruction instr, bool has_negation); + /// Unpacks a binary value into a half float pair with a type format + Node UnpackHalfFloat(Node value, Tegra::Shader::HalfType type); /// Merges a half pair into another value Node HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge); /// Conditionally absolute/negated half float pair. Absolute is applied first Node GetOperandAbsNegHalf(Node value, bool absolute, bool negate); + /// Conditionally saturates a half float pair + Node GetSaturatedHalfFloat(Node value, bool saturate = true); /// Returns a predicate comparing two floats Node GetPredicateComparisonFloat(Tegra::Shader::PredCondition condition, Node op_a, Node op_b); @@ -717,8 +738,7 @@ private: Node GetPredicateComparisonInteger(Tegra::Shader::PredCondition condition, bool is_signed, Node op_a, Node op_b); /// Returns a predicate comparing two half floats. meta consumes how both pairs will be compared - Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, - const MetaHalfArithmetic& meta, Node op_a, Node op_b); + Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a, Node op_b); /// Returns a predicate combiner operation OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation); @@ -730,6 +750,11 @@ private: const Sampler& GetSampler(const Tegra::Shader::Sampler& sampler, Tegra::Shader::TextureType type, bool is_array, bool is_shadow); + // Accesses a texture sampler for a bindless texture. 
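Aside: the reworked OperationNode constructor above brace-initializes the operand vector straight from the parameter pack instead of copying out of an initializer_list in a loop. Reduced to its essentials with hypothetical minimal types:

    #include <vector>

    struct NodeData {};
    using Node = const NodeData*;

    struct OperationNode {
        template <typename... T>
        explicit OperationNode(const T*... operands_) : operands{operands_...} {}

        std::vector<Node> operands;
    };
    // OperationNode{&a, &b, &c} fills `operands` in one step, no push_back loop.
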
+ const Sampler& GetBindlessSampler(const Tegra::Shader::Register& reg, + Tegra::Shader::TextureType type, bool is_array, + bool is_shadow); + /// Extracts a sequence of bits from a node Node BitfieldExtract(Node value, u32 offset, u32 bits); @@ -743,7 +768,8 @@ private: Node4 GetTexCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type, Tegra::Shader::TextureProcessMode process_mode, bool depth_compare, - bool is_array, bool is_aoffi); + bool is_array, bool is_aoffi, + std::optional<Tegra::Shader::Register> bindless_reg); Node4 GetTexsCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type, Tegra::Shader::TextureProcessMode process_mode, bool depth_compare, @@ -763,7 +789,8 @@ private: Node4 GetTextureCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type, Tegra::Shader::TextureProcessMode process_mode, std::vector<Node> coords, - Node array, Node depth_compare, u32 bias_offset, std::vector<Node> aoffi); + Node array, Node depth_compare, u32 bias_offset, std::vector<Node> aoffi, + std::optional<Tegra::Shader::Register> bindless_reg); Node GetVideoOperand(Node op, bool is_chunk, bool is_signed, Tegra::Shader::VideoType type, u64 byte_height); @@ -775,11 +802,17 @@ private: void WriteLop3Instruction(NodeBlock& bb, Tegra::Shader::Register dest, Node op_a, Node op_b, Node op_c, Node imm_lut, bool sets_cc); - Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor); + Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const; + + std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const; - std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor); + std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, + s64 cursor) const; - std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor); + std::tuple<Node, Node, GlobalMemoryBase> TrackAndGetGlobalMemory(NodeBlock& bb, + Node addr_register, + u32 immediate_offset, + bool is_write); template <typename... T> Node Operation(OperationCode code, const T*... operands) { @@ -791,12 +824,10 @@ private: return StoreNode(OperationNode(code, std::move(meta), operands...)); } - template <typename... T> Node Operation(OperationCode code, std::vector<Node>&& operands) { return StoreNode(OperationNode(code, std::move(operands))); } - template <typename... 
T> Node Operation(OperationCode code, Meta&& meta, std::vector<Node>&& operands) { return StoreNode(OperationNode(code, std::move(meta), std::move(operands))); } @@ -834,7 +865,7 @@ private: std::map<u32, ConstBuffer> used_cbufs; std::set<Sampler> used_samplers; std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{}; - std::set<GlobalMemoryBase> used_global_memory_bases; + std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory; Tegra::Shader::Header header; }; diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp index 4505667ff..19ede1eb9 100644 --- a/src/video_core/shader/track.cpp +++ b/src/video_core/shader/track.cpp @@ -17,22 +17,24 @@ std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor, for (; cursor >= 0; --cursor) { const Node node = code.at(cursor); if (const auto operation = std::get_if<OperationNode>(node)) { - if (operation->GetCode() == operation_code) + if (operation->GetCode() == operation_code) { return {node, cursor}; + } } if (const auto conditional = std::get_if<ConditionalNode>(node)) { const auto& conditional_code = conditional->GetCode(); const auto [found, internal_cursor] = FindOperation( conditional_code, static_cast<s64>(conditional_code.size() - 1), operation_code); - if (found) + if (found) { return {found, cursor}; + } } } return {}; } } // namespace -Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) { +Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const { if (const auto cbuf = std::get_if<CbufNode>(tracked)) { // Cbuf found, but it has to be immediate return std::holds_alternative<ImmediateNode>(*cbuf->GetOffset()) ? tracked : nullptr; @@ -65,7 +67,7 @@ Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) { return nullptr; } -std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) { +std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const { // Reduce the cursor in one to avoid infinite loops when the instruction sets the same register // that it uses as operand const auto [found, found_cursor] = @@ -80,7 +82,7 @@ std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, } std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const NodeBlock& code, - s64 cursor) { + s64 cursor) const { for (; cursor >= 0; --cursor) { const auto [found_node, new_cursor] = FindOperation(code, cursor, OperationCode::Assign); if (!found_node) { diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 3b022a456..6384fa8d2 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -178,39 +178,44 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, return PixelFormat::ABGR8S; case Tegra::Texture::ComponentType::UINT: return PixelFormat::ABGR8UI; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::B5G6R5: switch (component_type) { case Tegra::Texture::ComponentType::UNORM: return PixelFormat::B5G6R5U; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::A2B10G10R10: switch (component_type) { case Tegra::Texture::ComponentType::UNORM: return PixelFormat::A2B10G10R10U; + default: + break; } - 
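Aside: TrackRegister and the other tracking helpers (now const-qualified) walk a block backwards from a cursor, looking for the last assignment that produced the value being tracked. The core idea, stripped down to integers:

    #include <cstddef>
    #include <optional>
    #include <vector>

    struct Assignment {
        int dest_register;
        int value;
    };

    // Scan backwards from `cursor` for the most recent write to `reg`.
    std::optional<int> TrackLastWrite(const std::vector<Assignment>& block, int reg,
                                      long cursor) {
        for (; cursor >= 0; --cursor) {
            const auto& assign = block[static_cast<std::size_t>(cursor)];
            if (assign.dest_register == reg) {
                return assign.value;
            }
        }
        return std::nullopt; // never written in this block
    }
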
LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::A1B5G5R5: switch (component_type) { case Tegra::Texture::ComponentType::UNORM: return PixelFormat::A1B5G5R5U; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R8: switch (component_type) { case Tegra::Texture::ComponentType::UNORM: return PixelFormat::R8U; case Tegra::Texture::ComponentType::UINT: return PixelFormat::R8UI; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::G8R8: // TextureFormat::G8R8 is actually ordered red then green, as such we can use // PixelFormat::RG8U and PixelFormat::RG8S. This was tested with The Legend of Zelda: Breath @@ -220,50 +225,55 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, return PixelFormat::RG8U; case Tegra::Texture::ComponentType::SNORM: return PixelFormat::RG8S; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R16_G16_B16_A16: switch (component_type) { case Tegra::Texture::ComponentType::UNORM: return PixelFormat::RGBA16U; case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::RGBA16F; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::BF10GF11RF11: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::R11FG11FB10F; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R32_G32_B32_A32: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::RGBA32F; case Tegra::Texture::ComponentType::UINT: return PixelFormat::RGBA32UI; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R32_G32: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::RG32F; case Tegra::Texture::ComponentType::UINT: return PixelFormat::RG32UI; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R32_G32_B32: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::RGB32F; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R16: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: @@ -276,18 +286,20 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, return PixelFormat::R16F; case Tegra::Texture::ComponentType::UNORM: return PixelFormat::R16U; case Tegra::Texture::ComponentType::SNORM: return PixelFormat::R16S; case Tegra::Texture::ComponentType::UINT: return PixelFormat::R16UI; case Tegra::Texture::ComponentType::SINT: return PixelFormat::R16I; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R32: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::R32F; case 
Tegra::Texture::ComponentType::UINT: return PixelFormat::R32UI; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::ZF32: return PixelFormat::Z32F; case Tegra::Texture::TextureFormat::Z16: @@ -310,9 +322,10 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, return PixelFormat::DXN2UNORM; case Tegra::Texture::ComponentType::SNORM: return PixelFormat::DXN2SNORM; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::BC7U: return is_srgb ? PixelFormat::BC7U_SRGB : PixelFormat::BC7U; case Tegra::Texture::TextureFormat::BC6H_UF16: @@ -343,15 +356,17 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, return PixelFormat::RG16UI; case Tegra::Texture::ComponentType::SINT: return PixelFormat::RG16I; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; default: - LOG_CRITICAL(HW_GPU, "Unimplemented format={}, component_type={}", static_cast<u32>(format), - static_cast<u32>(component_type)); - UNREACHABLE(); - return PixelFormat::ABGR8U; + break; } + LOG_CRITICAL(HW_GPU, "Unimplemented format={}, component_type={}", static_cast<u32>(format), + static_cast<u32>(component_type)); + UNREACHABLE(); + return PixelFormat::ABGR8U; } ComponentType ComponentTypeFromTexture(Tegra::Texture::ComponentType type) { @@ -513,8 +528,9 @@ bool IsFormatBCn(PixelFormat format) { case PixelFormat::DXT45_SRGB: case PixelFormat::BC7U_SRGB: return true; + default: + return false; } - return false; } } // namespace VideoCore::Surface diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index b508d64e9..a9b8f69af 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -25,8 +25,8 @@ class InputBitStream { public: - explicit InputBitStream(const unsigned char* ptr, int nBits = 0, int start_offset = 0) - : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {} + explicit InputBitStream(const unsigned char* ptr, int start_offset = 0) + : m_CurByte(ptr), m_NextBit(start_offset % 8) {} ~InputBitStream() = default; @@ -55,12 +55,9 @@ public: } private: - const int m_NumBits; const unsigned char* m_CurByte; int m_NextBit = 0; int m_BitsRead = 0; - - bool done = false; }; class OutputBitStream { @@ -114,7 +111,6 @@ private: const int m_NumBits; unsigned char* m_CurByte; int m_NextBit = 0; - int m_BitsRead = 0; bool done = false; }; @@ -1616,6 +1612,7 @@ namespace Tegra::Texture::ASTC { std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t height, uint32_t depth, uint32_t block_width, uint32_t block_height) { uint32_t blockIdx = 0; + std::size_t depth_offset = 0; std::vector<uint8_t> outData(height * width * depth * 4); for (uint32_t k = 0; k < depth; k++) { for (uint32_t j = 0; j < height; j += block_height) { @@ -1630,7 +1627,7 @@ std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t he uint32_t decompWidth = std::min(block_width, width - i); uint32_t decompHeight = std::min(block_height, height - j); - uint8_t* outRow = outData.data() + (j * width + i) * 4; + uint8_t* outRow = depth_offset + outData.data() + (j * width + i) * 4; for (uint32_t jj = 0; jj < decompHeight; jj++) { memcpy(outRow + jj * width * 4, uncompData + jj * 
block_width, decompWidth * 4); } @@ -1638,6 +1635,7 @@ std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t he blockIdx++; } } + depth_offset += height * width * 4; } return outData; diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 995d0e068..217805386 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -288,6 +288,29 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 } } +void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, + const u32 block_height, const std::size_t copy_size, const u8* source_data, + u8* swizzle_data) { + const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x}; + std::size_t count = 0; + for (std::size_t y = dst_y; y < height && count < copy_size; ++y) { + const std::size_t gob_address_y = + (y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + + ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size; + const auto& table = legacy_swizzle_table[y % gob_size_y]; + for (std::size_t x = dst_x; x < width && count < copy_size; ++x) { + const std::size_t gob_address = + gob_address_y + (x / gob_size_x) * gob_size * block_height; + const std::size_t swizzled_offset = gob_address + table[x % gob_size_x]; + const u8* source_line = source_data + count; + u8* dest_addr = swizzle_data + swizzled_offset; + count++; + + std::memcpy(dest_addr, source_line, 1); + } + } +} + std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width, u32 height) { std::vector<u8> rgba_data; diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index e078fa274..e072d8401 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -51,4 +51,8 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, u32 offset_x, u32 offset_y); +void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, + const u32 block_height, const std::size_t copy_size, const u8* source_data, + u8* swizzle_data); + } // namespace Tegra::Texture diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index cb82ecf3f..60cda0ca3 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -5,6 +5,8 @@ #include <memory> #include "core/core.h" #include "core/settings.h" +#include "video_core/gpu_asynch.h" +#include "video_core/gpu_synch.h" #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/renderer_opengl.h" #include "video_core/video_core.h" @@ -16,6 +18,14 @@ std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_wind return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system); } +std::unique_ptr<Tegra::GPU> CreateGPU(Core::System& system) { + if (Settings::values.use_asynchronous_gpu_emulation) { + return std::make_unique<VideoCommon::GPUAsynch>(system, system.Renderer()); + } + + return std::make_unique<VideoCommon::GPUSynch>(system, system.Renderer()); +} + u16 GetResolutionScaleFactor(const RendererBase& renderer) { return static_cast<u16>( Settings::values.resolution_factor diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h index 3c583f195..b8e0ac372 100644 --- a/src/video_core/video_core.h +++ b/src/video_core/video_core.h @@ -14,6 +14,10 @@ namespace 
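Aside: SwizzleKepler computes a destination offset per texel from the GOB layout. Pulled out of the loop, the address arithmetic looks like this (GOB constants assumed from decoders.cpp: 64 bytes wide, 8 rows, 512 bytes per GOB):

    #include <cstddef>

    constexpr std::size_t gob_size_x = 64;  // bytes per GOB row
    constexpr std::size_t gob_size_y = 8;   // rows per GOB
    constexpr std::size_t gob_size = 512;   // bytes per GOB

    std::size_t SwizzledGobOffset(std::size_t x, std::size_t y, std::size_t width_in_gobs,
                                  std::size_t block_height, std::size_t table_offset) {
        const std::size_t gob_address_y =
            (y / (gob_size_y * block_height)) * gob_size * block_height * width_in_gobs +
            ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size;
        const std::size_t gob_address =
            gob_address_y + (x / gob_size_x) * gob_size * block_height;
        return gob_address + table_offset; // table_offset: legacy_swizzle_table[y % 8][x % 64]
    }
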
Core::Frontend { class EmuWindow; } +namespace Tegra { +class GPU; +} + namespace VideoCore { class RendererBase; @@ -27,6 +31,9 @@ class RendererBase; std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window, Core::System& system); +/// Creates an emulated GPU instance using the given system context. +std::unique_ptr<Tegra::GPU> CreateGPU(Core::System& system); + u16 GetResolutionScaleFactor(const RendererBase& renderer); } // namespace VideoCore