44 files changed, 1190 insertions, 650 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 6821f275d..1e010e4da 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -3,6 +3,8 @@ add_library(video_core STATIC
     dma_pusher.h
     debug_utils/debug_utils.cpp
     debug_utils/debug_utils.h
+    engines/engine_upload.cpp
+    engines/engine_upload.h
     engines/fermi_2d.cpp
     engines/fermi_2d.h
     engines/kepler_compute.cpp
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 036e66f05..3175579cc 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -40,6 +40,13 @@ bool DmaPusher::Step() {
     }
 
     const CommandList& command_list{dma_pushbuffer.front()};
+    ASSERT_OR_EXECUTE(!command_list.empty(), {
+        // The command list is unexpectedly empty. To avoid a crash,
+        // discard it and continue as if it contained no commands.
+        dma_pushbuffer.pop();
+        dma_pushbuffer_subindex = 0;
+        return true;
+    });
     const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]};
     GPUVAddr dma_get = command_list_header.addr;
     GPUVAddr dma_put = dma_get + command_list_header.size * sizeof(u32);
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
new file mode 100644
index 000000000..082a40cd9
--- /dev/null
+++ b/src/video_core/engines/engine_upload.cpp
@@ -0,0 +1,65 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstring>
+
+#include "common/assert.h"
+#include "video_core/engines/engine_upload.h"
+#include "video_core/memory_manager.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra::Engines::Upload {
+
+/// Binds the upload state to the engine's upload registers and the GPU memory manager.
+State::State(MemoryManager& memory_manager, Registers& regs)
+    : regs{regs}, memory_manager{memory_manager} {}
+
+State::~State() = default;
+
+/// Starts a new upload: rewinds the write cursor and sizes the staging buffer to
+/// line_length_in * line_count bytes. is_linear selects the path taken on the final data word.
+void State::ProcessExec(const bool is_linear) {
+    write_offset = 0;
+    copy_size = regs.line_length_in * regs.line_count;
+    inner_buffer.resize(copy_size);
+    this->is_linear = is_linear;
+}
+
+/// Appends one 32-bit data word to the staging buffer. When is_last_call is set, the staged
+/// bytes are written to regs.dest: copied directly for linear uploads, swizzled otherwise.
+void State::ProcessData(const u32 data, const bool is_last_call) {
+    // Never store more than the bytes still missing; the final word may be partial.
+    const u32 sub_copy_size = std::min(4U, copy_size - write_offset);
+    if (sub_copy_size != 0) {
+        // Pointer arithmetic instead of operator[]: indexing at size() is out of bounds once the
+        // buffer is full (or empty), even when nothing would be copied.
+        std::memcpy(inner_buffer.data() + write_offset, &data, sub_copy_size);
+        write_offset += sub_copy_size;
+    }
+    if (!is_last_call) {
+        return;
+    }
+    const GPUVAddr address{regs.dest.Address()};
+    if (is_linear) {
+        memory_manager.WriteBlock(address, inner_buffer.data(), copy_size);
+    } else {
+        UNIMPLEMENTED_IF(regs.dest.z != 0);
+        UNIMPLEMENTED_IF(regs.dest.depth != 1);
+        UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1);
+        UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1);
+        const std::size_t dst_size = Tegra::Texture::CalculateSize(
+            true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1);
+        tmp_buffer.resize(dst_size);
+        // Read-modify-write: fetch the current destination, swizzle the staged bytes into it,
+        // then write the whole region back.
+        memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
+        Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y,
+                                      regs.dest.BlockHeight(), copy_size, inner_buffer.data(),
+                                      tmp_buffer.data());
+        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+    }
+}
+
+} // namespace Tegra::Engines::Upload
diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h
new file mode 100644
index 000000000..ef4f5839a
--- /dev/null
+++ b/src/video_core/engines/engine_upload.h
@@ -0,0 +1,73 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace Tegra::Engines::Upload {
+
+struct Registers {
+    u32 line_length_in; // multiplied by line_count to get the upload size in bytes
+    u32 line_count;
+
+    struct {
+        u32 address_high; // upper 32 bits of the destination GPU address
+        u32 address_low;  // lower 32 bits of the destination GPU address
+        u32 pitch;
+        union {
+            BitField<0, 4, u32> block_width;  // stored as log2, see BlockWidth()
+            BitField<4, 4, u32> block_height; // stored as log2, see BlockHeight()
+            BitField<8, 4, u32> block_depth;  // stored as log2, see BlockDepth()
+        };
+        u32 width;
+        u32 height;
+        u32 depth;
+        u32 z;
+        u32 x;
+        u32 y;
+
+        GPUVAddr Address() const { // joins address_high and address_low into one 64-bit address
+            return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low);
+        }
+
+        u32 BlockWidth() const { // decoded value: 1 << block_width
+            return 1U << block_width.Value();
+        }
+
+        u32 BlockHeight() const { // decoded value: 1 << block_height
+            return 1U << block_height.Value();
+        }
+
+        u32 BlockDepth() const { // decoded value: 1 << block_depth
+            return 1U << block_depth.Value();
+        }
+    } dest;
+};
+
+class State {
+public:
+    State(MemoryManager& memory_manager, Registers& regs); // both are kept by reference
+    ~State();
+
+    void ProcessExec(bool is_linear);              // starts an upload and resizes inner_buffer
+    void ProcessData(u32 data, bool is_last_call); // stages a word; writes out on the last call
+
+private:
+    u32 write_offset = 0;         // bytes already stored in inner_buffer
+    u32 copy_size = 0;            // total bytes expected, set by ProcessExec
+    std::vector<u8> inner_buffer; // staging area for incoming data words
+    std::vector<u8> tmp_buffer;   // scratch buffer for the swizzled (non-linear) path
+    bool is_linear = false;       // layout selected by the last ProcessExec
+    Registers& regs;
+    MemoryManager& memory_manager;
+};
+
+} // namespace Tegra::Engines::Upload
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 2e51b7f13..45f59a4d9 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -21,6 +21,12 @@ class RasterizerInterface;
 
 namespace Tegra::Engines {
 
+/**
+ * This Engine is known as G80_2D. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h
+ */
+
 #define FERMI2D_REG_INDEX(field_name)                                                              \
     (offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32))
 
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index b1d950460..7404a8163 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -4,12 +4,21 @@
 
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "video_core/engines/kepler_compute.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
+#include "video_core/textures/decoders.h"
 
 namespace Tegra::Engines {
 
-KeplerCompute::KeplerCompute(MemoryManager& memory_manager) : memory_manager{memory_manager} {}
+// Stores the engine's dependencies; upload_state is bound to memory_manager and regs.upload.
+KeplerCompute::KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                             MemoryManager& memory_manager)
+    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
+      upload_state{memory_manager, regs.upload} {}
 
 KeplerCompute::~KeplerCompute() = default;
 
@@ -20,14 +29,34 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
     regs.reg_array[method_call.method] = method_call.argument;
 
     switch (method_call.method) {
+    case KEPLER_COMPUTE_REG_INDEX(exec_upload): {
+        upload_state.ProcessExec(regs.exec_upload.linear != 0);
+        break;
+    }
+    case KEPLER_COMPUTE_REG_INDEX(data_upload): {
+        const bool is_last_call = method_call.IsLastCall();
+        upload_state.ProcessData(method_call.argument, is_last_call);
+        if (is_last_call) {
+            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+        }
+        break;
+    }
     case KEPLER_COMPUTE_REG_INDEX(launch):
-        // Abort execution since compute shaders can be used to alter game memory (e.g. CUDA
-        // kernels)
-        UNREACHABLE_MSG("Compute shaders are not implemented");
+        ProcessLaunch();
         break;
     default:
         break;
     }
 }
 
+void KeplerCompute::ProcessLaunch() {
+    // Read the launch description from the address held in launch_desc_loc.
+    const GPUVAddr descriptor_addr = regs.launch_desc_loc.Address();
+    memory_manager.ReadBlockUnsafe(descriptor_addr, &launch_description,
+                                   LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32));
+
+    const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start;
+    LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc);
+}
+
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index fb6cdf432..5250b8d9b 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -6,22 +6,40 @@
 
 #include <array>
 #include <cstddef>
+#include <vector>
+#include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
+#include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 
+namespace Core {
+class System;
+}
+
 namespace Tegra {
 class MemoryManager;
 }
 
+namespace VideoCore {
+class RasterizerInterface;
+}
+
 namespace Tegra::Engines {
 
+/**
+ * This Engine is known as GK104_Compute. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_compute.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h
+ */
+
 #define KEPLER_COMPUTE_REG_INDEX(field_name)                                                       \
     (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32))
 
 class KeplerCompute final {
 public:
-    explicit KeplerCompute(MemoryManager& memory_manager);
+    explicit KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                           MemoryManager& memory_manager);
     ~KeplerCompute();
 
     static constexpr std::size_t NumConstBuffers = 8;
@@ -31,30 +49,181 @@ public:
 
         union {
             struct {
-                INSERT_PADDING_WORDS(0xAF);
+                INSERT_PADDING_WORDS(0x60);
+
+                Upload::Registers upload;
+
+                struct {
+                    union {
+                        BitField<0, 1, u32> linear;
+                    };
+                } exec_upload;
+
+                u32 data_upload;
+
+                INSERT_PADDING_WORDS(0x3F);
+
+                struct {
+                    u32 address;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address) << 8));
+                    }
+                } launch_desc_loc;
+
+                INSERT_PADDING_WORDS(0x1);
 
                 u32 launch;
 
-                INSERT_PADDING_WORDS(0xC48);
+                INSERT_PADDING_WORDS(0x4A7);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    u32 limit;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } tsc;
+
+                INSERT_PADDING_WORDS(0x3);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    u32 limit;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } tic;
+
+                INSERT_PADDING_WORDS(0x22);
+
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } code_loc;
+
+                INSERT_PADDING_WORDS(0x3FE);
+
+                u32 texture_const_buffer_index;
+
+                INSERT_PADDING_WORDS(0x374);
             };
             std::array<u32, NUM_REGS> reg_array;
         };
     } regs{};
+
+    struct LaunchParams { // layout of the launch description read by ProcessLaunch()
+        static constexpr std::size_t NUM_LAUNCH_PARAMETERS = 0x40; // struct size in u32 words
+
+        INSERT_PADDING_WORDS(0x8);
+
+        u32 program_start; // offset added to code_loc.Address() to locate the kernel code
+
+        INSERT_PADDING_WORDS(0x2);
+
+        BitField<30, 1, u32> linked_tsc;
+
+        BitField<0, 31, u32> grid_dim_x;
+        union {
+            BitField<0, 16, u32> grid_dim_y;
+            BitField<16, 16, u32> grid_dim_z;
+        };
+
+        INSERT_PADDING_WORDS(0x3);
+
+        BitField<0, 16, u32> shared_alloc;
+
+        BitField<0, 31, u32> block_dim_x;
+        union {
+            BitField<0, 16, u32> block_dim_y;
+            BitField<16, 16, u32> block_dim_z;
+        };
+
+        union {
+            BitField<0, 8, u32> const_buffer_enable_mask;
+            BitField<29, 2, u32> cache_layout;
+        } memory_config;
+
+        INSERT_PADDING_WORDS(0x8);
+
+        struct {
+            u32 address_low; // lower 32 bits of the address
+            union {
+                BitField<0, 8, u32> address_high; // bits 32-39 of the address
+                BitField<15, 17, u32> size;
+            };
+            GPUVAddr Address() const { // 40-bit address built from address_high:address_low
+                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) |
+                                             address_low);
+            }
+        } const_buffer_config[8];
+
+        union {
+            BitField<0, 20, u32> local_pos_alloc;
+            BitField<27, 5, u32> barrier_alloc;
+        };
+
+        union {
+            BitField<0, 20, u32> local_neg_alloc;
+            BitField<24, 5, u32> gpr_alloc;
+        };
+
+        INSERT_PADDING_WORDS(0x11);
+    } launch_description; // refilled from GPU memory by ProcessLaunch()
+
+    struct {
+        u32 write_offset = 0;
+        u32 copy_size = 0;
+        std::vector<u8> inner_buffer;
+    } state{}; // NOTE(review): looks unused now that upload_state exists - confirm, then remove
+
     static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32),
                   "KeplerCompute Regs has wrong size");
 
+    static_assert(sizeof(LaunchParams) == LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32),
+                  "KeplerCompute LaunchParams has wrong size");
+
     /// Write the value to the register identified by method.
     void CallMethod(const GPU::MethodCall& method_call);
 
 private:
+    Core::System& system;
+    VideoCore::RasterizerInterface& rasterizer;
     MemoryManager& memory_manager;
+    Upload::State upload_state;
+
+    void ProcessLaunch();
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
     static_assert(offsetof(KeplerCompute::Regs, field_name) == position * 4,                       \
                   "Field " #field_name " has invalid position")
 
+#define ASSERT_LAUNCH_PARAM_POSITION(field_name, position)                                         \
+    static_assert(offsetof(KeplerCompute::LaunchParams, field_name) == position * 4,               \
+                  "Field " #field_name " has invalid position")
+
+ASSERT_REG_POSITION(upload, 0x60);
+ASSERT_REG_POSITION(exec_upload, 0x6C);
+ASSERT_REG_POSITION(data_upload, 0x6D);
 ASSERT_REG_POSITION(launch, 0xAF);
+ASSERT_REG_POSITION(tsc, 0x557);
+ASSERT_REG_POSITION(tic, 0x55D);
+ASSERT_REG_POSITION(code_loc, 0x582);
+ASSERT_REG_POSITION(texture_const_buffer_index, 0x982);
+ASSERT_LAUNCH_PARAM_POSITION(program_start, 0x8);
+ASSERT_LAUNCH_PARAM_POSITION(grid_dim_x, 0xC);
+ASSERT_LAUNCH_PARAM_POSITION(shared_alloc, 0x11);
+ASSERT_LAUNCH_PARAM_POSITION(block_dim_x, 0x12);
+ASSERT_LAUNCH_PARAM_POSITION(memory_config, 0x14);
+ASSERT_LAUNCH_PARAM_POSITION(const_buffer_config, 0x1D);
 
 #undef ASSERT_REG_POSITION
 
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 7387886a3..0561f676c 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -14,9 +14,8 @@
 
 namespace Tegra::Engines {
 
-KeplerMemory::KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                           MemoryManager& memory_manager)
-    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
+KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager)
+    : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {}
 
 KeplerMemory::~KeplerMemory() = default;
 
@@ -28,46 +27,18 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
 
     switch (method_call.method) {
     case KEPLERMEMORY_REG_INDEX(exec): {
-        ProcessExec();
+        upload_state.ProcessExec(regs.exec.linear != 0);
         break;
     }
     case KEPLERMEMORY_REG_INDEX(data): {
-        ProcessData(method_call.argument, method_call.IsLastCall());
+        const bool is_last_call = method_call.IsLastCall();
+        upload_state.ProcessData(method_call.argument, is_last_call);
+        if (is_last_call) {
+            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+        }
         break;
     }
     }
 }
 
-void KeplerMemory::ProcessExec() {
-    state.write_offset = 0;
-    state.copy_size = regs.line_length_in * regs.line_count;
-    state.inner_buffer.resize(state.copy_size);
-}
-
-void KeplerMemory::ProcessData(u32 data, bool is_last_call) {
-    const u32 sub_copy_size = std::min(4U, state.copy_size - state.write_offset);
-    std::memcpy(&state.inner_buffer[state.write_offset], &regs.data, sub_copy_size);
-    state.write_offset += sub_copy_size;
-    if (is_last_call) {
-        const GPUVAddr address{regs.dest.Address()};
-        if (regs.exec.linear != 0) {
-            memory_manager.WriteBlock(address, state.inner_buffer.data(), state.copy_size);
-        } else {
-            UNIMPLEMENTED_IF(regs.dest.z != 0);
-            UNIMPLEMENTED_IF(regs.dest.depth != 1);
-            UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1);
-            UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1);
-            const std::size_t dst_size = Tegra::Texture::CalculateSize(
-                true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1);
-            std::vector<u8> tmp_buffer(dst_size);
-            memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
-            Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x,
-                                          regs.dest.y, regs.dest.BlockHeight(), state.copy_size,
-                                          state.inner_buffer.data(), tmp_buffer.data());
-            memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
-        }
-        system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
-    }
-}
-
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index 5f892ddad..f3bc675a9 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -10,6 +10,7 @@
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
+#include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 
 namespace Core {
@@ -20,19 +21,20 @@ namespace Tegra {
 class MemoryManager;
 }
 
-namespace VideoCore {
-class RasterizerInterface;
-}
-
 namespace Tegra::Engines {
 
+/**
+ * This Engine is known as P2MF. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_p2mf.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h
+ */
+
 #define KEPLERMEMORY_REG_INDEX(field_name)                                                         \
     (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32))
 
 class KeplerMemory final {
 public:
-    KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                 MemoryManager& memory_manager);
+    KeplerMemory(Core::System& system, MemoryManager& memory_manager);
     ~KeplerMemory();
 
     /// Write the value to the register identified by method.
@@ -45,42 +47,7 @@ public:
             struct {
                 INSERT_PADDING_WORDS(0x60);
 
-                u32 line_length_in;
-                u32 line_count;
-
-                struct {
-                    u32 address_high;
-                    u32 address_low;
-                    u32 pitch;
-                    union {
-                        BitField<0, 4, u32> block_width;
-                        BitField<4, 4, u32> block_height;
-                        BitField<8, 4, u32> block_depth;
-                    };
-                    u32 width;
-                    u32 height;
-                    u32 depth;
-                    u32 z;
-                    u32 x;
-                    u32 y;
-
-                    GPUVAddr Address() const {
-                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                                     address_low);
-                    }
-
-                    u32 BlockWidth() const {
-                        return 1U << block_width.Value();
-                    }
-
-                    u32 BlockHeight() const {
-                        return 1U << block_height.Value();
-                    }
-
-                    u32 BlockDepth() const {
-                        return 1U << block_depth.Value();
-                    }
-                } dest;
+                Upload::Registers upload;
 
                 struct {
                     union {
@@ -96,28 +63,17 @@ public:
         };
     } regs{};
 
-    struct {
-        u32 write_offset = 0;
-        u32 copy_size = 0;
-        std::vector<u8> inner_buffer;
-    } state{};
-
 private:
     Core::System& system;
-    VideoCore::RasterizerInterface& rasterizer;
     MemoryManager& memory_manager;
-
-    void ProcessExec();
-    void ProcessData(u32 data, bool is_last_call);
+    Upload::State upload_state;
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
     static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4,                        \
                   "Field " #field_name " has invalid position")
 
-ASSERT_REG_POSITION(line_length_in, 0x60);
-ASSERT_REG_POSITION(line_count, 0x61);
-ASSERT_REG_POSITION(dest, 0x62);
+ASSERT_REG_POSITION(upload, 0x60);
 ASSERT_REG_POSITION(exec, 0x6C);
 ASSERT_REG_POSITION(data, 0x6D);
 #undef ASSERT_REG_POSITION
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 9780417f2..39968d403 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -20,8 +20,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
 
 Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                      MemoryManager& memory_manager)
-    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, macro_interpreter{
-                                                                                  *this} {
+    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
+      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
     InitializeRegisterDefaults();
 }
 
@@ -34,9 +34,9 @@ void Maxwell3D::InitializeRegisterDefaults() {
 
     // Depth range near/far is not always set, but is expected to be the default 0.0f, 1.0f. This is
     // needed for ARMS.
-    for (std::size_t viewport{}; viewport < Regs::NumViewports; ++viewport) {
-        regs.viewports[viewport].depth_range_near = 0.0f;
-        regs.viewports[viewport].depth_range_far = 1.0f;
+    for (auto& viewport : regs.viewports) {
+        viewport.depth_range_near = 0.0f;
+        viewport.depth_range_far = 1.0f;
     }
 
     // Doom and Bomberman seems to use the uninitialized registers and just enable blend
@@ -47,13 +47,13 @@ void Maxwell3D::InitializeRegisterDefaults() {
     regs.blend.equation_a = Regs::Blend::Equation::Add;
     regs.blend.factor_source_a = Regs::Blend::Factor::One;
     regs.blend.factor_dest_a = Regs::Blend::Factor::Zero;
-    for (std::size_t blend_index = 0; blend_index < Regs::NumRenderTargets; blend_index++) {
-        regs.independent_blend[blend_index].equation_rgb = Regs::Blend::Equation::Add;
-        regs.independent_blend[blend_index].factor_source_rgb = Regs::Blend::Factor::One;
-        regs.independent_blend[blend_index].factor_dest_rgb = Regs::Blend::Factor::Zero;
-        regs.independent_blend[blend_index].equation_a = Regs::Blend::Equation::Add;
-        regs.independent_blend[blend_index].factor_source_a = Regs::Blend::Factor::One;
-        regs.independent_blend[blend_index].factor_dest_a = Regs::Blend::Factor::Zero;
+    for (auto& blend : regs.independent_blend) {
+        blend.equation_rgb = Regs::Blend::Equation::Add;
+        blend.factor_source_rgb = Regs::Blend::Factor::One;
+        blend.factor_dest_rgb = Regs::Blend::Factor::Zero;
+        blend.equation_a = Regs::Blend::Equation::Add;
+        blend.factor_source_a = Regs::Blend::Factor::One;
+        blend.factor_dest_a = Regs::Blend::Factor::Zero;
     }
     regs.stencil_front_op_fail = Regs::StencilOp::Keep;
     regs.stencil_front_op_zfail = Regs::StencilOp::Keep;
@@ -75,11 +75,11 @@ void Maxwell3D::InitializeRegisterDefaults() {
 
     // TODO(bunnei): Some games do not initialize the color masks (e.g. Sonic Mania). Assuming a
     // default of enabled fixes rendering here.
-    for (std::size_t color_mask = 0; color_mask < Regs::NumRenderTargets; color_mask++) {
-        regs.color_mask[color_mask].R.Assign(1);
-        regs.color_mask[color_mask].G.Assign(1);
-        regs.color_mask[color_mask].B.Assign(1);
-        regs.color_mask[color_mask].A.Assign(1);
+    for (auto& color_mask : regs.color_mask) {
+        color_mask.R.Assign(1);
+        color_mask.G.Assign(1);
+        color_mask.B.Assign(1);
+        color_mask.A.Assign(1);
     }
 
     // Commercial games seem to assume this value is enabled and nouveau sets this value manually.
@@ -178,13 +178,13 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
 
         // Vertex buffer
         if (method >= MAXWELL3D_REG_INDEX(vertex_array) &&
-            method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * 32) {
+            method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * Regs::NumVertexArrays) {
             dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2);
         } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) &&
-                   method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * 32) {
+                   method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * Regs::NumVertexArrays) {
             dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1);
         } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) &&
-                   method < MAXWELL3D_REG_INDEX(instanced_arrays) + 32) {
+                   method < MAXWELL3D_REG_INDEX(instanced_arrays) + Regs::NumVertexArrays) {
             dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays));
         }
     }
@@ -253,6 +253,18 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
         ProcessSyncPoint();
         break;
     }
+    case MAXWELL3D_REG_INDEX(exec_upload): {
+        upload_state.ProcessExec(regs.exec_upload.linear != 0);
+        break;
+    }
+    case MAXWELL3D_REG_INDEX(data_upload): {
+        const bool is_last_call = method_call.IsLastCall();
+        upload_state.ProcessData(method_call.argument, is_last_call);
+        if (is_last_call) {
+            dirty_flags.OnMemoryWrite();
+        }
+        break;
+    }
     default:
         break;
     }
@@ -430,7 +442,7 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
     const auto a_type = tic_entry.a_type.Value();
 
     // TODO(Subv): Different data types for separate components are not supported
-    ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
+    DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
 
     return tic_entry;
 }
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 85d309d9b..f342c78e6 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <bitset>
+#include <type_traits>
 #include <unordered_map>
 #include <vector>
 
@@ -14,6 +15,7 @@
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/math_util.h"
+#include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 #include "video_core/macro_interpreter.h"
 #include "video_core/textures/texture.h"
@@ -32,6 +34,12 @@ class RasterizerInterface;
 
 namespace Tegra::Engines {
 
+/**
+ * This Engine is known as GF100_3D. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/graph/gf100_3d.xml
+ * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
+ */
+
 #define MAXWELL3D_REG_INDEX(field_name)                                                            \
     (offsetof(Tegra::Engines::Maxwell3D::Regs, field_name) / sizeof(u32))
 
@@ -51,6 +59,7 @@ public:
         static constexpr std::size_t NumCBData = 16;
         static constexpr std::size_t NumVertexArrays = 32;
         static constexpr std::size_t NumVertexAttributes = 32;
+        static constexpr std::size_t NumVaryings = 31;
         static constexpr std::size_t NumTextureSamplers = 32;
         static constexpr std::size_t NumClipDistances = 8;
         static constexpr std::size_t MaxShaderProgram = 6;
@@ -580,7 +589,18 @@ public:
                     u32 bind;
                 } macros;
 
-                INSERT_PADDING_WORDS(0x69);
+                INSERT_PADDING_WORDS(0x17);
+
+                Upload::Registers upload;
+                struct {
+                    union {
+                        BitField<0, 1, u32> linear;
+                    };
+                } exec_upload;
+
+                u32 data_upload;
+
+                INSERT_PADDING_WORDS(0x44);
 
                 struct {
                     union {
@@ -1089,6 +1109,7 @@ public:
     } regs{};
 
     static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "Maxwell3D Regs has wrong size");
+    static_assert(std::is_trivially_copyable_v<Regs>, "Maxwell3D Regs must be trivially copyable");
 
     struct State {
         struct ConstBufferInfo {
@@ -1176,6 +1197,8 @@ private:
     /// Interpreter for the macro codes uploaded to the GPU.
     MacroInterpreter macro_interpreter;
 
+    Upload::State upload_state;
+
     /// Retrieves information about a specific TIC entry from the TIC buffer.
     Texture::TICEntry GetTICEntry(u32 tic_index) const;
 
@@ -1219,6 +1242,9 @@ private:
                   "Field " #field_name " has invalid position")
 
 ASSERT_REG_POSITION(macros, 0x45);
+ASSERT_REG_POSITION(upload, 0x60);
+ASSERT_REG_POSITION(exec_upload, 0x6C);
+ASSERT_REG_POSITION(data_upload, 0x6D);
 ASSERT_REG_POSITION(sync_info, 0xB2);
 ASSERT_REG_POSITION(tfb_enabled, 0x1D1);
 ASSERT_REG_POSITION(rt, 0x200);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 2426d0067..3a5dfef0c 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -83,57 +83,66 @@ void MaxwellDMA::HandleCopy() {
 
     ASSERT(regs.exec.enable_2d == 1);
 
-    const std::size_t copy_size = regs.x_count * regs.y_count;
+    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
+        ASSERT(regs.src_params.size_z == 1);
+        // If the input is tiled and the output is linear, deswizzle the input and copy it over.
+        const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
+        const std::size_t src_size = Texture::CalculateSize(
+            true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
+            regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
 
-    auto source_ptr{memory_manager.GetPointer(source)};
-    auto dst_ptr{memory_manager.GetPointer(dest)};
+        const std::size_t dst_size = regs.dst_pitch * regs.y_count;
 
-    if (!source_ptr) {
-        LOG_ERROR(HW_GPU, "source_ptr is invalid");
-        return;
-    }
+        if (read_buffer.size() < src_size) {
+            read_buffer.resize(src_size);
+        }
 
-    if (!dst_ptr) {
-        LOG_ERROR(HW_GPU, "dst_ptr is invalid");
-        return;
-    }
+        if (write_buffer.size() < dst_size) {
+            write_buffer.resize(dst_size);
+        }
 
-    const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) {
-        // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
-        // copying.
-        rasterizer.FlushRegion(ToCacheAddr(source_ptr), src_size);
+        memory_manager.ReadBlock(source, read_buffer.data(), src_size);
+        memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
 
-        // We have to invalidate the destination region to evict any outdated surfaces from the
-        // cache. We do this before actually writing the new data because the destination address
-        // might contain a dirty surface that will have to be written back to memory.
-        rasterizer.InvalidateRegion(ToCacheAddr(dst_ptr), dst_size);
-    };
+        Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch,
+                                  regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(),
+                                  write_buffer.data(), regs.src_params.BlockHeight(),
+                                  regs.src_params.pos_x, regs.src_params.pos_y);
 
-    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
-        ASSERT(regs.src_params.size_z == 1);
-        // If the input is tiled and the output is linear, deswizzle the input and copy it over.
+        memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
+    } else {
+        ASSERT(regs.dst_params.BlockDepth() == 1);
 
-        const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
+        const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count;
 
-        FlushAndInvalidate(regs.src_pitch * regs.src_params.size_y,
-                           copy_size * src_bytes_per_pixel);
+        const std::size_t dst_size = Texture::CalculateSize(
+            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
+            regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
 
-        Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch,
-                                  regs.src_params.size_x, src_bytes_per_pixel, source_ptr, dst_ptr,
-                                  regs.src_params.BlockHeight(), regs.src_params.pos_x,
-                                  regs.src_params.pos_y);
-    } else {
-        ASSERT(regs.dst_params.size_z == 1);
-        ASSERT(regs.src_pitch == regs.x_count);
+        const std::size_t dst_layer_size = Texture::CalculateSize(
+            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
+            regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
 
-        const u32 src_bpp = regs.src_pitch / regs.x_count;
+        const std::size_t src_size = regs.src_pitch * regs.y_count;
 
-        FlushAndInvalidate(regs.src_pitch * regs.y_count,
-                           regs.dst_params.size_x * regs.dst_params.size_y * src_bpp);
+        if (read_buffer.size() < src_size) {
+            read_buffer.resize(src_size);
+        }
+
+        if (write_buffer.size() < dst_size) {
+            write_buffer.resize(dst_size);
+        }
+
+        memory_manager.ReadBlock(source, read_buffer.data(), src_size);
+        memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
 
         // If the input is linear and the output is tiled, swizzle the input and copy it over.
         Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
-                                src_bpp, dst_ptr, source_ptr, regs.dst_params.BlockHeight());
+                                src_bytes_per_pixel,
+                                write_buffer.data() + dst_layer_size * regs.dst_params.pos_z,
+                                read_buffer.data(), regs.dst_params.BlockHeight());
+
+        memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
     }
 }
 
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index c6b649842..e5942f671 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <cstddef>
+#include <vector>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@@ -25,6 +26,11 @@ class RasterizerInterface;
 
 namespace Tegra::Engines {
 
+/**
+ * This Engine is known as GK104_Copy. Documentation can be found in:
+ * https://github.com/envytools/envytools/blob/master/rnndb/fifo/gk104_copy.xml
+ */
+
 class MaxwellDMA final {
 public:
     explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
@@ -63,6 +69,16 @@ public:
 
         static_assert(sizeof(Parameters) == 24, "Parameters has wrong size");
 
+        enum class ComponentMode : u32 {
+            Src0 = 0,
+            Src1 = 1,
+            Src2 = 2,
+            Src3 = 3,
+            Const0 = 4,
+            Const1 = 5,
+            Zero = 6,
+        };
+
         enum class CopyMode : u32 {
             None = 0,
             Unk1 = 1,
@@ -128,7 +144,26 @@ public:
                 u32 x_count;
                 u32 y_count;
 
-                INSERT_PADDING_WORDS(0xBB);
+                INSERT_PADDING_WORDS(0xB8);
+
+                u32 const0;
+                u32 const1;
+                union {
+                    BitField<0, 4, ComponentMode> component0;
+                    BitField<4, 4, ComponentMode> component1;
+                    BitField<8, 4, ComponentMode> component2;
+                    BitField<12, 4, ComponentMode> component3;
+                    BitField<16, 2, u32> component_size;
+                    BitField<20, 3, u32> src_num_components;
+                    BitField<24, 3, u32> dst_num_components;
+
+                    u32 SrcBytePerPixel() const {
+                        return src_num_components.Value() * component_size.Value();
+                    }
+                    u32 DstBytePerPixel() const {
+                        return dst_num_components.Value() * component_size.Value();
+                    }
+                } swizzle_config;
 
                 Parameters dst_params;
 
@@ -149,6 +184,9 @@ private:
 
     MemoryManager& memory_manager;
 
+    std::vector<u8> read_buffer;
+    std::vector<u8> write_buffer;
+
     /// Performs the copy from the source buffer to the destination buffer as configured in the
     /// registers.
     void HandleCopy();
@@ -165,6 +203,9 @@ ASSERT_REG_POSITION(src_pitch, 0x104);
 ASSERT_REG_POSITION(dst_pitch, 0x105);
 ASSERT_REG_POSITION(x_count, 0x106);
 ASSERT_REG_POSITION(y_count, 0x107);
+ASSERT_REG_POSITION(const0, 0x1C0);
+ASSERT_REG_POSITION(const1, 0x1C1);
+ASSERT_REG_POSITION(swizzle_config, 0x1C2);
 ASSERT_REG_POSITION(dst_params, 0x1C3);
 ASSERT_REG_POSITION(src_params, 0x1CA);
 
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index e5b4eadea..7bbc556da 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -98,6 +98,10 @@ union Attribute {
         BitField<22, 2, u64> element;
         BitField<24, 6, Index> index;
         BitField<47, 3, AttributeSize> size;
+
+        bool IsPhysical() const {
+            return element == 0 && static_cast<u64>(index.Value()) == 0;
+        }
     } fmt20;
 
     union {
@@ -499,6 +503,11 @@ enum class SystemVariable : u64 {
     CircularQueueEntryAddressHigh = 0x63,
 };
 
+enum class PhysicalAttributeDirection : u64 {
+    Input = 0,
+    Output = 1,
+};
+
 union Instruction {
     Instruction& operator=(const Instruction& instr) {
         value = instr.value;
@@ -587,6 +596,7 @@ union Instruction {
     } alu;
 
     union {
+        BitField<38, 1, u64> idx;
         BitField<51, 1, u64> saturate;
         BitField<52, 2, IpaSampleMode> sample_mode;
         BitField<54, 2, IpaInterpMode> interp_mode;
@@ -812,6 +822,12 @@ union Instruction {
     } stg;
 
     union {
+        BitField<32, 1, PhysicalAttributeDirection> direction;
+        BitField<47, 3, AttributeSize> size;
+        BitField<20, 11, u64> address;
+    } al2p;
+
+    union {
         BitField<0, 3, u64> pred0;
         BitField<3, 3, u64> pred3;
         BitField<7, 1, u64> abs_a;
@@ -1374,8 +1390,9 @@ public:
         ST_A,
         ST_L,
         ST_S,
-        LDG, // Load from global memory
-        STG, // Store in global memory
+        LDG,  // Load from global memory
+        STG,  // Store in global memory
+        AL2P, // Transforms attribute memory into physical memory
         TEX,
         TEX_B,  // Texture Load Bindless
         TXQ,    // Texture Query
@@ -1646,6 +1663,7 @@ private:
             INST("1110111101010---", Id::ST_L, Type::Memory, "ST_L"),
             INST("1110111011010---", Id::LDG, Type::Memory, "LDG"),
             INST("1110111011011---", Id::STG, Type::Memory, "STG"),
+            INST("1110111110100---", Id::AL2P, Type::Memory, "AL2P"),
             INST("110000----111---", Id::TEX, Type::Texture, "TEX"),
             INST("1101111010111---", Id::TEX_B, Type::Texture, "TEX_B"),
             INST("1101111101001---", Id::TXQ, Type::Texture, "TXQ"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 4461083ff..52706505b 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -35,9 +35,9 @@ GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{ren
     dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
     maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
     fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
-    kepler_compute = std::make_unique<Engines::KeplerCompute>(*memory_manager);
+    kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager);
     maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager);
-    kepler_memory = std::make_unique<Engines::KeplerMemory>(system, rasterizer, *memory_manager);
+    kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);
 }
 
 GPU::~GPU() = default;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index c9a2077de..1e2ff46b0 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -44,7 +44,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
                 renderer.Rasterizer().FlushRegion(data->addr, data->size);
             } else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) {
                 renderer.Rasterizer().InvalidateRegion(data->addr, data->size);
-            } else if (const auto data = std::get_if<EndProcessingCommand>(&next.data)) {
+            } else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
                 return;
             } else {
                 UNREACHABLE();
@@ -118,7 +118,7 @@ void SynchState::WaitForSynchronization(u64 fence) {
     // Wait for the GPU to be idle (all commands to be executed)
     {
         MICROPROFILE_SCOPE(GPU_wait);
-        std::unique_lock<std::mutex> lock{synchronization_mutex};
+        std::unique_lock lock{synchronization_mutex};
         synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; });
     }
 }
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index cc14527c7..05a168a72 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -81,12 +81,6 @@ struct CommandDataContainer {
     CommandDataContainer(CommandData&& data, u64 next_fence)
         : data{std::move(data)}, fence{next_fence} {}
 
-    CommandDataContainer& operator=(const CommandDataContainer& t) {
-        data = std::move(t.data);
-        fence = t.fence;
-        return *this;
-    }
-
     CommandData data;
     u64 fence{};
 };
@@ -109,7 +103,7 @@ struct SynchState final {
 
     void TrySynchronize() {
         if (IsSynchronized()) {
-            std::lock_guard<std::mutex> lock{synchronization_mutex};
+            std::lock_guard lock{synchronization_mutex};
             synchronization_condition.notify_one();
         }
     }
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp
index 524d9ea5a..c766ed692 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro_interpreter.cpp
@@ -118,10 +118,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
                           static_cast<u32>(opcode.operation.Value()));
     }
 
+    // An instruction with the Exit flag will not actually
+    // cause an exit if it's executed inside a delay slot.
+    // TODO(Blinkhawk): Reverted to always exit. The delay-slot behavior described above requires
+    // further testing on the MME code.
     if (opcode.is_exit) {
         // Exit has a delay slot, execute the next instruction
-        // Note: Executing an exit during a branch delay slot will cause the instruction at the
-        // branch target to be executed before exiting.
         Step(offset, true);
         return false;
     }
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 6c98c6701..5d8d126c1 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -25,6 +25,8 @@ MemoryManager::MemoryManager(VideoCore::RasterizerInterface& rasterizer) : raste
     UpdatePageTableForVMA(initial_vma);
 }
 
+MemoryManager::~MemoryManager() = default;
+
 GPUVAddr MemoryManager::AllocateSpace(u64 size, u64 align) {
     const u64 aligned_size{Common::AlignUp(size, page_size)};
     const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)};
@@ -199,11 +201,11 @@ const u8* MemoryManager::GetPointer(GPUVAddr addr) const {
     return {};
 }
 
-bool MemoryManager::IsBlockContinous(const GPUVAddr start, const std::size_t size) {
+bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t size) const {
     const GPUVAddr end = start + size;
     const auto host_ptr_start = reinterpret_cast<std::uintptr_t>(GetPointer(start));
     const auto host_ptr_end = reinterpret_cast<std::uintptr_t>(GetPointer(end));
-    const std::size_t range = static_cast<std::size_t>(host_ptr_end - host_ptr_start);
+    const auto range = static_cast<std::size_t>(host_ptr_end - host_ptr_start);
     return range == size;
 }
 
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index e4f0c4bd6..113f9d8f3 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -47,7 +47,8 @@ struct VirtualMemoryArea {
 
 class MemoryManager final {
 public:
-    MemoryManager(VideoCore::RasterizerInterface& rasterizer);
+    explicit MemoryManager(VideoCore::RasterizerInterface& rasterizer);
+    ~MemoryManager();
 
     GPUVAddr AllocateSpace(u64 size, u64 align);
     GPUVAddr AllocateSpace(GPUVAddr addr, u64 size, u64 align);
@@ -65,18 +66,18 @@ public:
     u8* GetPointer(GPUVAddr addr);
     const u8* GetPointer(GPUVAddr addr) const;
 
-    // Returns true if the block is continous in host memory, false otherwise
-    bool IsBlockContinous(const GPUVAddr start, const std::size_t size);
+    /// Returns true if the block is continuous in host memory, false otherwise
+    bool IsBlockContinuous(GPUVAddr start, std::size_t size) const;
 
     /**
      * ReadBlock and WriteBlock are full read and write operations over virtual
-     * GPU Memory. It's important to use these when GPU memory may not be continous
+     * GPU Memory. It's important to use these when GPU memory may not be continuous
      * in the Host Memory counterpart. Note: This functions cause Host GPU Memory
      * Flushes and Invalidations, respectively to each operation.
      */
-    void ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const;
-    void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size);
-    void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size);
+    void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
 
     /**
      * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
@@ -88,9 +89,9 @@ public:
      * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture
      * being flushed.
      */
-    void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const;
-    void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size);
-    void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size);
+    void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
 
 private:
     using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>;
@@ -111,10 +112,10 @@ private:
     /**
      * Maps an unmanaged host memory pointer at a given address.
      *
-     * @param target The guest address to start the mapping at.
-     * @param memory The memory to be mapped.
-     * @param size Size of the mapping.
-     * @param state MemoryState tag to attach to the VMA.
+     * @param target       The guest address to start the mapping at.
+     * @param memory       The memory to be mapped.
+     * @param size         Size of the mapping in bytes.
+     * @param backing_addr The base address of the range to back this mapping.
      */
     VMAHandle MapBackingMemory(GPUVAddr target, u8* memory, u64 size, VAddr backing_addr);
 
@@ -124,7 +125,7 @@ private:
     /// Converts a VMAHandle to a mutable VMAIter.
     VMAIter StripIterConstness(const VMAHandle& iter);
 
-    /// Marks as the specfied VMA as allocated.
+    /// Marks the specified VMA as allocated.
     VMAIter Allocate(VMAIter vma);
 
     /**
diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h
index 291772186..0c4ea1494 100644
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -37,9 +37,6 @@ public:
     /// Gets the size of the shader in guest memory, required for cache management
     virtual std::size_t GetSizeInBytes() const = 0;
 
-    /// Wriets any cached resources back to memory
-    virtual void Flush() = 0;
-
     /// Sets whether the cached object should be considered registered
     void SetIsRegistered(bool registered) {
         is_registered = registered;
@@ -147,8 +144,9 @@ protected:
 
         object->SetIsRegistered(false);
         rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
+        const CacheAddr addr = object->GetCacheAddr();
         interval_cache.subtract({GetInterval(object), ObjectSet{object}});
-        map_cache.erase(object->GetCacheAddr());
+        map_cache.erase(addr);
     }
 
     /// Returns a ticks counter used for tracking when cached objects were last modified
@@ -158,6 +156,8 @@ protected:
         return ++modified_ticks;
     }
 
+    virtual void FlushObjectInner(const T& object) = 0;
+
     /// Flushes the specified object, updating appropriate cache state as needed
     void FlushObject(const T& object) {
         std::lock_guard lock{mutex};
@@ -165,7 +165,7 @@ protected:
         if (!object->IsDirty()) {
             return;
         }
-        object->Flush();
+        FlushObjectInner(object);
         object->MarkAsModified(false, *this);
     }
 
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index fc33aa433..f9247a40e 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -42,9 +42,6 @@ public:
         return alignment;
     }
 
-    // We do not have to flush this cache as things in it are never modified by us.
-    void Flush() override {}
-
 private:
     VAddr cpu_addr{};
     std::size_t size{};
@@ -75,6 +72,9 @@ public:
 protected:
     void AlignBuffer(std::size_t alignment);
 
+    // We do not have to flush this cache as things in it are never modified by us.
+    void FlushObjectInner(const std::shared_ptr<CachedBufferEntry>& object) override {}
+
 private:
     OGLStreamBuffer stream_buffer;
 
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index b6d9e0ddb..38497678a 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -21,9 +21,18 @@ T GetInteger(GLenum pname) {
 
 Device::Device() {
     uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
+    max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
+    max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
     has_variable_aoffi = TestVariableAoffi();
 }
 
+/// Constructs a Device from hard-coded values instead of querying them through OpenGL.
+/// NOTE(review): presumably meant for callers that have no usable GL context - confirm against
+/// the call sites. The std::nullptr_t argument only selects this overload and is never read.
+Device::Device(std::nullptr_t)
+    : uniform_buffer_alignment{0}, max_vertex_attributes{16}, max_varyings{15},
+      has_variable_aoffi{true} {}
+
 bool Device::TestVariableAoffi() {
     const GLchar* AOFFI_TEST = R"(#version 430 core
 uniform sampler2D tex;
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 78ff5ee58..de8490682 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -5,17 +5,27 @@
 #pragma once
 
 #include <cstddef>
+#include "common/common_types.h"
 
 namespace OpenGL {
 
 class Device {
 public:
-    Device();
+    explicit Device();
+    explicit Device(std::nullptr_t);
 
     std::size_t GetUniformBufferAlignment() const {
         return uniform_buffer_alignment;
     }
 
+    u32 GetMaxVertexAttributes() const {
+        return max_vertex_attributes;
+    }
+
+    u32 GetMaxVaryings() const {
+        return max_varyings;
+    }
+
     bool HasVariableAoffi() const {
         return has_variable_aoffi;
     }
@@ -24,6 +34,8 @@ private:
     static bool TestVariableAoffi();
 
     std::size_t uniform_buffer_alignment{};
+    u32 max_vertex_attributes{};
+    u32 max_varyings{};
     bool has_variable_aoffi{};
 };
 
diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h
index 196e6e278..2d467a240 100644
--- a/src/video_core/renderer_opengl/gl_global_cache.h
+++ b/src/video_core/renderer_opengl/gl_global_cache.h
@@ -46,7 +46,7 @@ public:
     /// Reloads the global region from guest memory
     void Reload(u32 size_);
 
-    void Flush() override;
+    void Flush();
 
 private:
     VAddr cpu_addr{};
@@ -65,6 +65,11 @@ public:
     GlobalRegion GetGlobalRegion(const GLShader::GlobalMemoryEntry& descriptor,
                                  Tegra::Engines::Maxwell3D::Regs::ShaderStage stage);
 
+protected:
+    void FlushObjectInner(const GlobalRegion& object) override {
+        object->Flush();
+    }
+
 private:
     GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const;
     GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index db73e746c..dbd8049f5 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -261,8 +261,8 @@ DrawParameters RasterizerOpenGL::SetupDraw() {
             // MakeQuadArray always generates u32 indexes
             params.index_format = GL_UNSIGNED_INT;
             params.count = (regs.vertex_buffer.count / 4) * 6;
-            params.index_buffer_offset =
-                primitive_assembler.MakeQuadArray(regs.vertex_buffer.first, params.count);
+            params.index_buffer_offset = primitive_assembler.MakeQuadArray(
+                regs.vertex_buffer.first, regs.vertex_buffer.count);
         }
         return params;
     }
@@ -922,8 +922,8 @@ void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) {
         viewport.y = viewport_rect.bottom;
         viewport.width = viewport_rect.GetWidth();
         viewport.height = viewport_rect.GetHeight();
-        viewport.depth_range_far = regs.viewports[i].depth_range_far;
-        viewport.depth_range_near = regs.viewports[i].depth_range_near;
+        viewport.depth_range_far = src.depth_range_far;
+        viewport.depth_range_near = src.depth_range_near;
     }
     state.depth_clamp.far_plane = regs.view_volume_clip_control.depth_clamp_far != 0;
     state.depth_clamp.near_plane = regs.view_volume_clip_control.depth_clamp_near != 0;
@@ -1135,7 +1135,9 @@ void RasterizerOpenGL::SyncTransformFeedback() {
 
 void RasterizerOpenGL::SyncPointState() {
     const auto& regs = system.GPU().Maxwell3D().regs;
-    state.point.size = regs.point_size;
+    // Clamp the point size to a minimum of 1, since nouveau sometimes sets a point size of 0
+    // (which is invalid in OpenGL).
+    state.point.size = std::max(1.0f, regs.point_size);
 }
 
 void RasterizerOpenGL::SyncPolygonOffset() {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 5a25f5b37..a7681902e 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -628,9 +628,11 @@ CachedSurface::CachedSurface(const SurfaceParams& params)
 }
 
 MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 192, 64));
-void CachedSurface::LoadGLBuffer() {
+void CachedSurface::LoadGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem) {
     MICROPROFILE_SCOPE(OpenGL_SurfaceLoad);
-    gl_buffer.resize(params.max_mip_level);
+    auto& gl_buffer = res_cache_tmp_mem.gl_buffer;
+    if (gl_buffer.size() < params.max_mip_level)
+        gl_buffer.resize(params.max_mip_level);
     for (u32 i = 0; i < params.max_mip_level; i++)
         gl_buffer[i].resize(params.GetMipmapSizeGL(i));
     if (params.is_tiled) {
@@ -671,13 +673,13 @@ void CachedSurface::LoadGLBuffer() {
 }
 
 MICROPROFILE_DEFINE(OpenGL_SurfaceFlush, "OpenGL", "Surface Flush", MP_RGB(128, 192, 64));
-void CachedSurface::FlushGLBuffer() {
+void CachedSurface::FlushGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem) {
     MICROPROFILE_SCOPE(OpenGL_SurfaceFlush);
 
     ASSERT_MSG(!IsPixelFormatASTC(params.pixel_format), "Unimplemented");
 
+    auto& gl_buffer = res_cache_tmp_mem.gl_buffer;
     // OpenGL temporary buffer needs to be big enough to store raw texture size
-    gl_buffer.resize(1);
     gl_buffer[0].resize(GetSizeInBytes());
 
     const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type);
@@ -713,10 +715,12 @@ void CachedSurface::FlushGLBuffer() {
     }
 }
 
-void CachedSurface::UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle,
-                                          GLuint draw_fb_handle) {
+void CachedSurface::UploadGLMipmapTexture(RasterizerTemporaryMemory& res_cache_tmp_mem, u32 mip_map,
+                                          GLuint read_fb_handle, GLuint draw_fb_handle) {
     const auto& rect{params.GetRect(mip_map)};
 
+    auto& gl_buffer = res_cache_tmp_mem.gl_buffer;
+
     // Load data from memory to the surface
     const auto x0 = static_cast<GLint>(rect.left);
     const auto y0 = static_cast<GLint>(rect.bottom);
@@ -801,7 +805,6 @@ void CachedSurface::UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle,
                                 tuple.type, &gl_buffer[mip_map][buffer_offset]);
             break;
         case SurfaceTarget::TextureCubemap: {
-            std::size_t start = buffer_offset;
             for (std::size_t face = 0; face < params.depth; ++face) {
                 glTextureSubImage3D(texture.handle, mip_map, x0, y0, static_cast<GLint>(face),
                                     static_cast<GLsizei>(rect.GetWidth()),
@@ -845,11 +848,12 @@ void CachedSurface::EnsureTextureDiscrepantView() {
 }
 
 MICROPROFILE_DEFINE(OpenGL_TextureUL, "OpenGL", "Texture Upload", MP_RGB(128, 192, 64));
-void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle) {
+void CachedSurface::UploadGLTexture(RasterizerTemporaryMemory& res_cache_tmp_mem,
+                                    GLuint read_fb_handle, GLuint draw_fb_handle) {
     MICROPROFILE_SCOPE(OpenGL_TextureUL);
 
     for (u32 i = 0; i < params.max_mip_level; i++)
-        UploadGLMipmapTexture(i, read_fb_handle, draw_fb_handle);
+        UploadGLMipmapTexture(res_cache_tmp_mem, i, read_fb_handle, draw_fb_handle);
 }
 
 void CachedSurface::UpdateSwizzle(Tegra::Texture::SwizzleSource swizzle_x,
@@ -929,8 +933,8 @@ Surface RasterizerCacheOpenGL::GetColorBufferSurface(std::size_t index, bool pre
 }
 
 void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) {
-    surface->LoadGLBuffer();
-    surface->UploadGLTexture(read_framebuffer.handle, draw_framebuffer.handle);
+    surface->LoadGLBuffer(temporal_memory);
+    surface->UploadGLTexture(temporal_memory, read_framebuffer.handle, draw_framebuffer.handle);
     surface->MarkAsModified(false, *this);
     surface->MarkForReload(false);
 }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index db280dbb3..6263ef3e7 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -355,6 +355,12 @@ namespace OpenGL {
 
 class RasterizerOpenGL;
 
+// Holds large temporary buffers so they can be reused
+// instead of being created and destroyed all the time.
+struct RasterizerTemporaryMemory {
+    std::vector<std::vector<u8>> gl_buffer;
+};
+
 class CachedSurface final : public RasterizerCacheObject {
 public:
     explicit CachedSurface(const SurfaceParams& params);
@@ -371,10 +377,6 @@ public:
         return memory_size;
     }
 
-    void Flush() override {
-        FlushGLBuffer();
-    }
-
     const OGLTexture& Texture() const {
         return texture;
     }
@@ -397,11 +399,12 @@ public:
     }
 
     // Read/Write data in Switch memory to/from gl_buffer
-    void LoadGLBuffer();
-    void FlushGLBuffer();
+    void LoadGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem);
+    void FlushGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem);
 
     // Upload data in gl_buffer to this surface's texture
-    void UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle);
+    void UploadGLTexture(RasterizerTemporaryMemory& res_cache_tmp_mem, GLuint read_fb_handle,
+                         GLuint draw_fb_handle);
 
     void UpdateSwizzle(Tegra::Texture::SwizzleSource swizzle_x,
                        Tegra::Texture::SwizzleSource swizzle_y,
@@ -429,13 +432,13 @@ public:
     }
 
 private:
-    void UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle, GLuint draw_fb_handle);
+    void UploadGLMipmapTexture(RasterizerTemporaryMemory& res_cache_tmp_mem, u32 mip_map,
+                               GLuint read_fb_handle, GLuint draw_fb_handle);
 
     void EnsureTextureDiscrepantView();
 
     OGLTexture texture;
     OGLTexture discrepant_view;
-    std::vector<std::vector<u8>> gl_buffer;
     SurfaceParams params{};
     GLenum gl_target{};
     GLenum gl_internal_format{};
@@ -473,6 +476,11 @@ public:
     void SignalPreDrawCall();
     void SignalPostDrawCall();
 
+protected:
+    void FlushObjectInner(const Surface& object) override {
+        object->FlushGLBuffer(temporal_memory);
+    }
+
 private:
     void LoadSurface(const Surface& surface);
     Surface GetSurface(const SurfaceParams& params, bool preserve_contents = true);
@@ -519,6 +527,8 @@ private:
     std::array<Surface, Maxwell::NumRenderTargets> current_color_buffers;
     Surface last_depth_buffer;
 
+    RasterizerTemporaryMemory temporal_memory;
+
     using SurfaceIntervalCache = boost::icl::interval_map<CacheAddr, Surface>;
     using SurfaceInterval = typename SurfaceIntervalCache::interval_type;
 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index b1c8f7c35..f700dc89a 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -345,7 +345,7 @@ ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode,
 
 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                      const Device& device)
-    : RasterizerCache{rasterizer}, disk_cache{system}, device{device} {}
+    : RasterizerCache{rasterizer}, device{device}, disk_cache{system} {}
 
 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                       const VideoCore::DiskResourceLoadCallback& callback) {
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index a332087f8..31b979987 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -57,9 +57,6 @@ public:
         return shader_length;
     }
 
-    // We do not have to flush this cache as things in it are never modified by us.
-    void Flush() override {}
-
     /// Gets the shader entries for the shader
     const GLShader::ShaderEntries& GetShaderEntries() const {
         return entries;
@@ -123,6 +120,10 @@ public:
     /// Gets the current specified shader stage program
     Shader GetStageProgram(Maxwell::ShaderProgram program);
 
+protected:
+    // We do not have to flush this cache as things in it are never modified by us.
+    void FlushObjectInner(const Shader& object) override {}
+
 private:
     std::unordered_map<u64, UnspecializedShader> GenerateUnspecializedShaders(
         const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback,
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index ef1a1995f..4c380677d 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -57,15 +57,14 @@ public:
         shader_source += text;
     }
 
-    void AddLine(std::string_view text) {
-        AddExpression(text);
-        AddNewLine();
-    }
-
-    void AddLine(char character) {
-        DEBUG_ASSERT(scope >= 0);
-        AppendIndentation();
-        shader_source += character;
+    // Forwards all arguments directly to libfmt.
+    // Note that all formatting requirements for fmt must be
+    // obeyed when using this function (e.g. {{ must be used when
+    // printing the character '{' is desired; ditto for }} and '}',
+    // etc).
+    template <typename... Args>
+    void AddLine(std::string_view text, Args&&... args) {
+        AddExpression(fmt::format(text, std::forward<Args>(args)...));
         AddNewLine();
     }
 
@@ -75,9 +74,7 @@ public:
     }
 
     std::string GenerateTemporary() {
-        std::string temporary = "tmp";
-        temporary += std::to_string(temporary_index++);
-        return temporary;
+        return fmt::format("tmp{}", temporary_index++);
     }
 
     std::string GetResult() {
@@ -134,6 +131,19 @@ bool IsPrecise(Node node) {
     return false;
 }
 
+constexpr bool IsGenericAttribute(Attribute::Index index) {
+    return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31;
+}
+
+constexpr Attribute::Index ToGenericAttribute(u32 value) {
+    return static_cast<Attribute::Index>(value + static_cast<u32>(Attribute::Index::Attribute_0));
+}
+
+u32 GetGenericAttributeIndex(Attribute::Index index) {
+    ASSERT(IsGenericAttribute(index));
+    return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
+}
+
 class GLSLDecompiler final {
 public:
     explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
@@ -152,42 +162,43 @@ public:
         DeclareConstantBuffers();
         DeclareGlobalMemory();
         DeclareSamplers();
+        DeclarePhysicalAttributeReader();
 
-        code.AddLine("void execute_" + suffix + "() {");
+        code.AddLine("void execute_{}() {{", suffix);
         ++code.scope;
 
         // VM's program counter
         const auto first_address = ir.GetBasicBlocks().begin()->first;
-        code.AddLine("uint jmp_to = " + std::to_string(first_address) + "u;");
+        code.AddLine("uint jmp_to = {}u;", first_address);
 
         // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
         // unlikely that shaders will use 20 nested SSYs and PBKs.
         constexpr u32 FLOW_STACK_SIZE = 20;
-        code.AddLine(fmt::format("uint flow_stack[{}];", FLOW_STACK_SIZE));
+        code.AddLine("uint flow_stack[{}];", FLOW_STACK_SIZE);
         code.AddLine("uint flow_stack_top = 0u;");
 
-        code.AddLine("while (true) {");
+        code.AddLine("while (true) {{");
         ++code.scope;
 
-        code.AddLine("switch (jmp_to) {");
+        code.AddLine("switch (jmp_to) {{");
 
         for (const auto& pair : ir.GetBasicBlocks()) {
             const auto [address, bb] = pair;
-            code.AddLine(fmt::format("case 0x{:x}u: {{", address));
+            code.AddLine("case 0x{:x}u: {{", address);
             ++code.scope;
 
             VisitBlock(bb);
 
             --code.scope;
-            code.AddLine('}');
+            code.AddLine("}}");
         }
 
         code.AddLine("default: return;");
-        code.AddLine('}');
+        code.AddLine("}}");
 
         for (std::size_t i = 0; i < 2; ++i) {
             --code.scope;
-            code.AddLine('}');
+            code.AddLine("}}");
         }
     }
 
@@ -227,12 +238,13 @@ private:
     }
 
     void DeclareGeometry() {
-        if (stage != ShaderStage::Geometry)
+        if (stage != ShaderStage::Geometry) {
             return;
+        }
 
         const auto topology = GetTopologyName(header.common3.output_topology);
-        const auto max_vertices = std::to_string(header.common4.max_output_vertices);
-        code.AddLine("layout (" + topology + ", max_vertices = " + max_vertices + ") out;");
+        const auto max_vertices = header.common4.max_output_vertices.Value();
+        code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_vertices);
         code.AddNewLine();
 
         DeclareVertexRedeclarations();
@@ -241,7 +253,7 @@ private:
     void DeclareVertexRedeclarations() {
         bool clip_distances_declared = false;
 
-        code.AddLine("out gl_PerVertex {");
+        code.AddLine("out gl_PerVertex {{");
         ++code.scope;
 
         code.AddLine("vec4 gl_Position;");
@@ -257,122 +269,143 @@ private:
         }
 
         --code.scope;
-        code.AddLine("};");
+        code.AddLine("}};");
         code.AddNewLine();
     }
 
     void DeclareRegisters() {
         const auto& registers = ir.GetRegisters();
         for (const u32 gpr : registers) {
-            code.AddLine("float " + GetRegister(gpr) + " = 0;");
+            code.AddLine("float {} = 0;", GetRegister(gpr));
         }
-        if (!registers.empty())
+        if (!registers.empty()) {
             code.AddNewLine();
+        }
     }
 
     void DeclarePredicates() {
         const auto& predicates = ir.GetPredicates();
         for (const auto pred : predicates) {
-            code.AddLine("bool " + GetPredicate(pred) + " = false;");
+            code.AddLine("bool {} = false;", GetPredicate(pred));
         }
-        if (!predicates.empty())
+        if (!predicates.empty()) {
             code.AddNewLine();
+        }
     }
 
     void DeclareLocalMemory() {
         if (const u64 local_memory_size = header.GetLocalMemorySize(); local_memory_size > 0) {
             const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
-            code.AddLine("float " + GetLocalMemory() + '[' + std::to_string(element_count) + "];");
+            code.AddLine("float {}[{}];", GetLocalMemory(), element_count);
             code.AddNewLine();
         }
     }
 
     void DeclareInternalFlags() {
         for (u32 flag = 0; flag < static_cast<u32>(InternalFlag::Amount); flag++) {
-            const InternalFlag flag_code = static_cast<InternalFlag>(flag);
-            code.AddLine("bool " + GetInternalFlag(flag_code) + " = false;");
+            const auto flag_code = static_cast<InternalFlag>(flag);
+            code.AddLine("bool {} = false;", GetInternalFlag(flag_code));
         }
         code.AddNewLine();
     }
 
     std::string GetInputFlags(AttributeUse attribute) {
-        std::string out;
-
         switch (attribute) {
-        case AttributeUse::Constant:
-            out += "flat ";
-            break;
-        case AttributeUse::ScreenLinear:
-            out += "noperspective ";
-            break;
         case AttributeUse::Perspective:
             // Default, Smooth
-            break;
+            return {};
+        case AttributeUse::Constant:
+            return "flat ";
+        case AttributeUse::ScreenLinear:
+            return "noperspective ";
+        case AttributeUse::Unused:
+            UNREACHABLE_MSG("Unused attribute being fetched");
+            return {};
         default:
-            LOG_CRITICAL(HW_GPU, "Unused attribute being fetched");
-            UNREACHABLE();
+            UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<u32>(attribute));
+            return {};
         }
-        return out;
     }
 
     void DeclareInputAttributes() {
-        const auto& attributes = ir.GetInputAttributes();
-        for (const auto element : attributes) {
-            const Attribute::Index index = element.first;
-            if (index < Attribute::Index::Attribute_0 || index > Attribute::Index::Attribute_31) {
-                // Skip when it's not a generic attribute
-                continue;
+        if (ir.HasPhysicalAttributes()) {
+            const u32 num_inputs{GetNumPhysicalInputAttributes()};
+            for (u32 i = 0; i < num_inputs; ++i) {
+                DeclareInputAttribute(ToGenericAttribute(i), true);
             }
+            code.AddNewLine();
+            return;
+        }
 
-            // TODO(bunnei): Use proper number of elements for these
-            u32 idx = static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
-            if (stage != ShaderStage::Vertex) {
-                // If inputs are varyings, add an offset
-                idx += GENERIC_VARYING_START_LOCATION;
+        const auto& attributes = ir.GetInputAttributes();
+        for (const auto index : attributes) {
+            if (IsGenericAttribute(index)) {
+                DeclareInputAttribute(index, false);
             }
+        }
+        if (!attributes.empty()) {
+            code.AddNewLine();
+        }
+    }
 
-            std::string attr = GetInputAttribute(index);
-            if (stage == ShaderStage::Geometry) {
-                attr = "gs_" + attr + "[]";
-            }
-            std::string suffix;
-            if (stage == ShaderStage::Fragment) {
-                const auto input_mode =
-                    header.ps.GetAttributeUse(idx - GENERIC_VARYING_START_LOCATION);
-                suffix = GetInputFlags(input_mode);
+    void DeclareInputAttribute(Attribute::Index index, bool skip_unused) {
+        const u32 generic_index{GetGenericAttributeIndex(index)};
+
+        std::string name{GetInputAttribute(index)};
+        if (stage == ShaderStage::Geometry) {
+            name = "gs_" + name + "[]";
+        }
+
+        std::string suffix;
+        if (stage == ShaderStage::Fragment) {
+            const auto input_mode{header.ps.GetAttributeUse(generic_index)};
+            if (skip_unused && input_mode == AttributeUse::Unused) {
+                return;
             }
-            code.AddLine("layout (location = " + std::to_string(idx) + ") " + suffix + "in vec4 " +
-                         attr + ';');
+            suffix = GetInputFlags(input_mode);
         }
-        if (!attributes.empty())
-            code.AddNewLine();
+
+        u32 location = generic_index;
+        if (stage != ShaderStage::Vertex) {
+            // If inputs are varyings, add an offset
+            location += GENERIC_VARYING_START_LOCATION;
+        }
+
+        code.AddLine("layout (location = {}) {} in vec4 {};", location, suffix, name);
     }
 
     void DeclareOutputAttributes() {
+        if (ir.HasPhysicalAttributes() && stage != ShaderStage::Fragment) {
+            for (u32 i = 0; i < GetNumPhysicalVaryings(); ++i) {
+                DeclareOutputAttribute(ToGenericAttribute(i));
+            }
+            code.AddNewLine();
+            return;
+        }
+
         const auto& attributes = ir.GetOutputAttributes();
         for (const auto index : attributes) {
-            if (index < Attribute::Index::Attribute_0 || index > Attribute::Index::Attribute_31) {
-                // Skip when it's not a generic attribute
-                continue;
+            if (IsGenericAttribute(index)) {
+                DeclareOutputAttribute(index);
             }
-            // TODO(bunnei): Use proper number of elements for these
-            const auto idx = static_cast<u32>(index) -
-                             static_cast<u32>(Attribute::Index::Attribute_0) +
-                             GENERIC_VARYING_START_LOCATION;
-            code.AddLine("layout (location = " + std::to_string(idx) + ") out vec4 " +
-                         GetOutputAttribute(index) + ';');
-        }
-        if (!attributes.empty())
+        }
+        if (!attributes.empty()) {
             code.AddNewLine();
+        }
+    }
+
+    void DeclareOutputAttribute(Attribute::Index index) {
+        const u32 location{GetGenericAttributeIndex(index) + GENERIC_VARYING_START_LOCATION};
+        code.AddLine("layout (location = {}) out vec4 {};", location, GetOutputAttribute(index));
     }
 
     void DeclareConstantBuffers() {
         for (const auto& entry : ir.GetConstantBuffers()) {
             const auto [index, size] = entry;
-            code.AddLine("layout (std140, binding = CBUF_BINDING_" + std::to_string(index) +
-                         ") uniform " + GetConstBufferBlock(index) + " {");
-            code.AddLine("    vec4 " + GetConstBuffer(index) + "[MAX_CONSTBUFFER_ELEMENTS];");
-            code.AddLine("};");
+            code.AddLine("layout (std140, binding = CBUF_BINDING_{}) uniform {} {{", index,
+                         GetConstBufferBlock(index));
+            code.AddLine("    vec4 {}[MAX_CONSTBUFFER_ELEMENTS];", GetConstBuffer(index));
+            code.AddLine("}};");
             code.AddNewLine();
         }
     }
@@ -384,17 +417,16 @@ private:
             // Since we don't know how the shader will use the shader, hint the driver to disable as
             // much optimizations as possible
             std::string qualifier = "coherent volatile";
-            if (usage.is_read && !usage.is_written)
+            if (usage.is_read && !usage.is_written) {
                 qualifier += " readonly";
-            else if (usage.is_written && !usage.is_read)
+            } else if (usage.is_written && !usage.is_read) {
                 qualifier += " writeonly";
+            }
 
-            const std::string binding =
-                fmt::format("GMEM_BINDING_{}_{}", base.cbuf_index, base.cbuf_offset);
-            code.AddLine("layout (std430, binding = " + binding + ") " + qualifier + " buffer " +
-                         GetGlobalMemoryBlock(base) + " {");
-            code.AddLine("    float " + GetGlobalMemory(base) + "[];");
-            code.AddLine("};");
+            code.AddLine("layout (std430, binding = GMEM_BINDING_{}_{}) {} buffer {} {{",
+                         base.cbuf_index, base.cbuf_offset, qualifier, GetGlobalMemoryBlock(base));
+            code.AddLine("    float {}[];", GetGlobalMemory(base));
+            code.AddLine("}};");
             code.AddNewLine();
         }
     }
@@ -402,7 +434,7 @@ private:
     void DeclareSamplers() {
         const auto& samplers = ir.GetSamplers();
         for (const auto& sampler : samplers) {
-            std::string sampler_type = [&]() {
+            std::string sampler_type = [&sampler] {
                 switch (sampler.GetType()) {
                 case Tegra::Shader::TextureType::Texture1D:
                     return "sampler1D";
@@ -417,16 +449,52 @@ private:
                     return "sampler2D";
                 }
             }();
-            if (sampler.IsArray())
+            if (sampler.IsArray()) {
                 sampler_type += "Array";
-            if (sampler.IsShadow())
+            }
+            if (sampler.IsShadow()) {
                 sampler_type += "Shadow";
+            }
 
-            code.AddLine("layout (binding = SAMPLER_BINDING_" + std::to_string(sampler.GetIndex()) +
-                         ") uniform " + sampler_type + ' ' + GetSampler(sampler) + ';');
+            code.AddLine("layout (binding = SAMPLER_BINDING_{}) uniform {} {};", sampler.GetIndex(),
+                         sampler_type, GetSampler(sampler));
         }
-        if (!samplers.empty())
+        if (!samplers.empty()) {
             code.AddNewLine();
+        }
+    }
+
+    void DeclarePhysicalAttributeReader() {
+        if (!ir.HasPhysicalAttributes()) {
+            return;
+        }
+        code.AddLine("float readPhysicalAttribute(uint physical_address) {{");
+        ++code.scope;
+        code.AddLine("switch (physical_address) {{");
+
+        // Just declare generic attributes for now.
+        const auto num_attributes{static_cast<u32>(GetNumPhysicalInputAttributes())};
+        for (u32 index = 0; index < num_attributes; ++index) {
+            const auto attribute{ToGenericAttribute(index)};
+            // Unused fragment inputs are never declared; this holds for every element.
+            const bool declared{stage != ShaderStage::Fragment ||
+                                header.ps.GetAttributeUse(index) != AttributeUse::Unused};
+            for (u32 element = 0; element < 4; ++element) {
+                constexpr u32 generic_base{0x80};
+                constexpr u32 generic_stride{16};
+                constexpr u32 element_stride{4};
+                const u32 address{generic_base + index * generic_stride + element * element_stride};
+                const std::string value{declared ? ReadAttribute(attribute, element) : "0"};
+                code.AddLine("case 0x{:x}: return {};", address, value);
+            }
+        }
+
+        code.AddLine("default: return 0;");
+
+        code.AddLine("}}");
+        --code.scope;
+        code.AddLine("}}");
+        code.AddNewLine();
     }
 
     void VisitBlock(const NodeBlock& bb) {
@@ -450,23 +518,26 @@ private:
                 return {};
             }
             return (this->*decompiler)(*operation);
+        }
 
-        } else if (const auto gpr = std::get_if<GprNode>(node)) {
+        if (const auto gpr = std::get_if<GprNode>(node)) {
             const u32 index = gpr->GetIndex();
             if (index == Register::ZeroIndex) {
                 return "0";
             }
             return GetRegister(index);
+        }
 
-        } else if (const auto immediate = std::get_if<ImmediateNode>(node)) {
+        if (const auto immediate = std::get_if<ImmediateNode>(node)) {
             const u32 value = immediate->GetValue();
             if (value < 10) {
                 // For eyecandy avoid using hex numbers on single digits
                 return fmt::format("utof({}u)", immediate->GetValue());
             }
             return fmt::format("utof(0x{:x}u)", immediate->GetValue());
+        }
 
-        } else if (const auto predicate = std::get_if<PredicateNode>(node)) {
+        if (const auto predicate = std::get_if<PredicateNode>(node)) {
             const auto value = [&]() -> std::string {
                 switch (const auto index = predicate->GetIndex(); index) {
                 case Tegra::Shader::Pred::UnusedIndex:
@@ -478,77 +549,22 @@ private:
                 }
             }();
             if (predicate->IsNegated()) {
-                return "!(" + value + ')';
+                return fmt::format("!({})", value);
             }
             return value;
+        }
 
-        } else if (const auto abuf = std::get_if<AbufNode>(node)) {
-            const auto attribute = abuf->GetIndex();
-            const auto element = abuf->GetElement();
-
-            const auto GeometryPass = [&](const std::string& name) {
-                if (stage == ShaderStage::Geometry && abuf->GetBuffer()) {
-                    // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
-                    // set an 0x80000000 index for those and the shader fails to build. Find out why
-                    // this happens and what's its intent.
-                    return "gs_" + name + "[ftou(" + Visit(abuf->GetBuffer()) +
-                           ") % MAX_VERTEX_INPUT]";
-                }
-                return name;
-            };
-
-            switch (attribute) {
-            case Attribute::Index::Position:
-                if (stage != ShaderStage::Fragment) {
-                    return GeometryPass("position") + GetSwizzle(element);
-                } else {
-                    return element == 3 ? "1.0f" : "gl_FragCoord" + GetSwizzle(element);
-                }
-            case Attribute::Index::PointCoord:
-                switch (element) {
-                case 0:
-                    return "gl_PointCoord.x";
-                case 1:
-                    return "gl_PointCoord.y";
-                case 2:
-                case 3:
-                    return "0";
-                }
-                UNREACHABLE();
-                return "0";
-            case Attribute::Index::TessCoordInstanceIDVertexID:
-                // TODO(Subv): Find out what the values are for the first two elements when inside a
-                // vertex shader, and what's the value of the fourth element when inside a Tess Eval
-                // shader.
-                ASSERT(stage == ShaderStage::Vertex);
-                switch (element) {
-                case 2:
-                    // Config pack's first value is instance_id.
-                    return "uintBitsToFloat(config_pack[0])";
-                case 3:
-                    return "uintBitsToFloat(gl_VertexID)";
-                }
-                UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
-                return "0";
-            case Attribute::Index::FrontFacing:
-                // TODO(Subv): Find out what the values are for the other elements.
-                ASSERT(stage == ShaderStage::Fragment);
-                switch (element) {
-                case 3:
-                    return "itof(gl_FrontFacing ? -1 : 0)";
-                }
-                UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element);
-                return "0";
-            default:
-                if (attribute >= Attribute::Index::Attribute_0 &&
-                    attribute <= Attribute::Index::Attribute_31) {
-                    return GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element);
-                }
-                break;
+        if (const auto abuf = std::get_if<AbufNode>(node)) {
+            UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ShaderStage::Geometry,
+                                 "Physical attributes in geometry shaders are not implemented");
+            if (abuf->IsPhysicalBuffer()) {
+                return fmt::format("readPhysicalAttribute(ftou({}))",
+                                   Visit(abuf->GetPhysicalAddress()));
             }
-            UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute));
+            return ReadAttribute(abuf->GetIndex(), abuf->GetElement(), abuf->GetBuffer());
+        }
 
-        } else if (const auto cbuf = std::get_if<CbufNode>(node)) {
+        if (const auto cbuf = std::get_if<CbufNode>(node)) {
             const Node offset = cbuf->GetOffset();
             if (const auto immediate = std::get_if<ImmediateNode>(offset)) {
                 // Direct access
@@ -556,48 +572,117 @@ private:
                 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
                 return fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
                                    offset_imm / (4 * 4), (offset_imm / 4) % 4);
+            }
 
-            } else if (std::holds_alternative<OperationNode>(*offset)) {
+            if (std::holds_alternative<OperationNode>(*offset)) {
                 // Indirect access
                 const std::string final_offset = code.GenerateTemporary();
-                code.AddLine("uint " + final_offset + " = (ftou(" + Visit(offset) + ") / 4);");
+                code.AddLine("uint {} = (ftou({}) / 4);", final_offset, Visit(offset));
                 return fmt::format("{}[{} / 4][{} % 4]", GetConstBuffer(cbuf->GetIndex()),
                                    final_offset, final_offset);
-
-            } else {
-                UNREACHABLE_MSG("Unmanaged offset node type");
             }
 
-        } else if (const auto gmem = std::get_if<GmemNode>(node)) {
+            UNREACHABLE_MSG("Unmanaged offset node type");
+        }
+
+        if (const auto gmem = std::get_if<GmemNode>(node)) {
             const std::string real = Visit(gmem->GetRealAddress());
             const std::string base = Visit(gmem->GetBaseAddress());
-            const std::string final_offset = "(ftou(" + real + ") - ftou(" + base + ")) / 4";
+            const std::string final_offset = fmt::format("(ftou({}) - ftou({})) / 4", real, base);
             return fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset);
+        }
 
-        } else if (const auto lmem = std::get_if<LmemNode>(node)) {
+        if (const auto lmem = std::get_if<LmemNode>(node)) {
             return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
+        }
 
-        } else if (const auto internal_flag = std::get_if<InternalFlagNode>(node)) {
+        if (const auto internal_flag = std::get_if<InternalFlagNode>(node)) {
             return GetInternalFlag(internal_flag->GetFlag());
+        }
 
-        } else if (const auto conditional = std::get_if<ConditionalNode>(node)) {
+        if (const auto conditional = std::get_if<ConditionalNode>(node)) {
             // It's invalid to call conditional on nested nodes, use an operation instead
-            code.AddLine("if (" + Visit(conditional->GetCondition()) + ") {");
+            code.AddLine("if ({}) {{", Visit(conditional->GetCondition()));
             ++code.scope;
 
             VisitBlock(conditional->GetCode());
 
             --code.scope;
-            code.AddLine('}');
+            code.AddLine("}}");
             return {};
+        }
 
-        } else if (const auto comment = std::get_if<CommentNode>(node)) {
+        if (const auto comment = std::get_if<CommentNode>(node)) {
             return "// " + comment->GetText();
         }
+
         UNREACHABLE();
         return {};
     }
 
+    std::string ReadAttribute(Attribute::Index attribute, u32 element, Node buffer = {}) {
+        const auto GeometryPass = [&](std::string_view name) {
+            if (stage == ShaderStage::Geometry && buffer) {
+                // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
+                // set an 0x80000000 index for those and the shader fails to build. Find out why
+                // this happens and what's its intent.
+                return fmt::format("gs_{}[ftou({}) % MAX_VERTEX_INPUT]", name, Visit(buffer));
+            }
+            return std::string(name);
+        };
+
+        switch (attribute) {
+        case Attribute::Index::Position:
+            if (stage != ShaderStage::Fragment) {
+                return GeometryPass("position") + GetSwizzle(element);
+            } else {
+                return element == 3 ? "1.0f" : "gl_FragCoord" + GetSwizzle(element);
+            }
+        case Attribute::Index::PointCoord:
+            switch (element) {
+            case 0:
+                return "gl_PointCoord.x";
+            case 1:
+                return "gl_PointCoord.y";
+            case 2:
+            case 3:
+                return "0";
+            }
+            UNREACHABLE();
+            return "0";
+        case Attribute::Index::TessCoordInstanceIDVertexID:
+            // TODO(Subv): Find out what the values are for the first two elements when inside a
+            // vertex shader, and what's the value of the fourth element when inside a Tess Eval
+            // shader.
+            ASSERT(stage == ShaderStage::Vertex);
+            switch (element) {
+            case 2:
+                // Config pack's first value is instance_id.
+                return "uintBitsToFloat(config_pack[0])";
+            case 3:
+                return "uintBitsToFloat(gl_VertexID)";
+            }
+            UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
+            return "0";
+        case Attribute::Index::FrontFacing:
+            // TODO(Subv): Find out what the values are for the other elements.
+            ASSERT(stage == ShaderStage::Fragment);
+            switch (element) {
+            case 3:
+                return "itof(gl_FrontFacing ? -1 : 0)";
+            }
+            UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element);
+            return "0";
+        default:
+            if (IsGenericAttribute(attribute)) {
+                return GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element);
+            }
+            break;
+        }
+        UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute));
+        return "0";
+    }
+
     std::string ApplyPrecise(Operation operation, const std::string& value) {
         if (!IsPrecise(operation)) {
             return value;
@@ -606,7 +691,7 @@ private:
         const std::string precise = stage != ShaderStage::Fragment ? "precise " : "";
 
         const std::string temporary = code.GenerateTemporary();
-        code.AddLine(precise + "float " + temporary + " = " + value + ';');
+        code.AddLine("{}float {} = {};", precise, temporary, value);
         return temporary;
     }
 
@@ -620,7 +705,7 @@ private:
         }
 
         const std::string temporary = code.GenerateTemporary();
-        code.AddLine("float " + temporary + " = " + Visit(operand) + ';');
+        code.AddLine("float {} = {};", temporary, Visit(operand));
         return temporary;
     }
 
@@ -635,31 +720,32 @@ private:
         case Type::Float:
             return value;
         case Type::Int:
-            return "ftoi(" + value + ')';
+            return fmt::format("ftoi({})", value);
         case Type::Uint:
-            return "ftou(" + value + ')';
+            return fmt::format("ftou({})", value);
         case Type::HalfFloat:
-            return "toHalf2(" + value + ')';
+            return fmt::format("toHalf2({})", value);
         }
         UNREACHABLE();
         return value;
     }
 
-    std::string BitwiseCastResult(std::string value, Type type, bool needs_parenthesis = false) {
+    std::string BitwiseCastResult(const std::string& value, Type type,
+                                  bool needs_parenthesis = false) {
         switch (type) {
         case Type::Bool:
         case Type::Bool2:
         case Type::Float:
             if (needs_parenthesis) {
-                return '(' + value + ')';
+                return fmt::format("({})", value);
             }
             return value;
         case Type::Int:
-            return "itof(" + value + ')';
+            return fmt::format("itof({})", value);
         case Type::Uint:
-            return "utof(" + value + ')';
+            return fmt::format("utof({})", value);
         case Type::HalfFloat:
-            return "fromHalf2(" + value + ')';
+            return fmt::format("fromHalf2({})", value);
         }
         UNREACHABLE();
         return value;
@@ -667,27 +753,27 @@ private:
 
     std::string GenerateUnary(Operation operation, const std::string& func, Type result_type,
                               Type type_a, bool needs_parenthesis = true) {
-        return ApplyPrecise(operation,
-                            BitwiseCastResult(func + '(' + VisitOperand(operation, 0, type_a) + ')',
-                                              result_type, needs_parenthesis));
+        const std::string op_str = fmt::format("{}({})", func, VisitOperand(operation, 0, type_a));
+
+        return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type, needs_parenthesis));
     }
 
     std::string GenerateBinaryInfix(Operation operation, const std::string& func, Type result_type,
                                     Type type_a, Type type_b) {
         const std::string op_a = VisitOperand(operation, 0, type_a);
         const std::string op_b = VisitOperand(operation, 1, type_b);
+        const std::string op_str = fmt::format("({} {} {})", op_a, func, op_b);
 
-        return ApplyPrecise(
-            operation, BitwiseCastResult('(' + op_a + ' ' + func + ' ' + op_b + ')', result_type));
+        return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type));
     }
 
     std::string GenerateBinaryCall(Operation operation, const std::string& func, Type result_type,
                                    Type type_a, Type type_b) {
         const std::string op_a = VisitOperand(operation, 0, type_a);
         const std::string op_b = VisitOperand(operation, 1, type_b);
+        const std::string op_str = fmt::format("{}({}, {})", func, op_a, op_b);
 
-        return ApplyPrecise(operation,
-                            BitwiseCastResult(func + '(' + op_a + ", " + op_b + ')', result_type));
+        return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type));
     }
 
     std::string GenerateTernary(Operation operation, const std::string& func, Type result_type,
@@ -695,10 +781,9 @@ private:
         const std::string op_a = VisitOperand(operation, 0, type_a);
         const std::string op_b = VisitOperand(operation, 1, type_b);
         const std::string op_c = VisitOperand(operation, 2, type_c);
+        const std::string op_str = fmt::format("{}({}, {}, {})", func, op_a, op_b, op_c);
 
-        return ApplyPrecise(
-            operation,
-            BitwiseCastResult(func + '(' + op_a + ", " + op_b + ", " + op_c + ')', result_type));
+        return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type));
     }
 
     std::string GenerateQuaternary(Operation operation, const std::string& func, Type result_type,
@@ -707,10 +792,9 @@ private:
         const std::string op_b = VisitOperand(operation, 1, type_b);
         const std::string op_c = VisitOperand(operation, 2, type_c);
         const std::string op_d = VisitOperand(operation, 3, type_d);
+        const std::string op_str = fmt::format("{}({}, {}, {}, {})", func, op_a, op_b, op_c, op_d);
 
-        return ApplyPrecise(operation, BitwiseCastResult(func + '(' + op_a + ", " + op_b + ", " +
-                                                             op_c + ", " + op_d + ')',
-                                                         result_type));
+        return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type));
     }
 
     std::string GenerateTexture(Operation operation, const std::string& function_suffix,
@@ -773,7 +857,7 @@ private:
                 // required to be constant)
                 expr += std::to_string(static_cast<s32>(immediate->GetValue()));
             } else {
-                expr += "ftoi(" + Visit(operand) + ')';
+                expr += fmt::format("ftoi({})", Visit(operand));
             }
             break;
         case Type::Float:
@@ -806,7 +890,7 @@ private:
                 expr += std::to_string(static_cast<s32>(immediate->GetValue()));
             } else if (device.HasVariableAoffi()) {
                 // Avoid using variable AOFFI on unsupported devices.
-                expr += "ftoi(" + Visit(operand) + ')';
+                expr += fmt::format("ftoi({})", Visit(operand));
             } else {
                 // Insert 0 on devices not supporting variable AOFFI.
                 expr += '0';
@@ -831,8 +915,9 @@ private:
                 return {};
             }
             target = GetRegister(gpr->GetIndex());
-
         } else if (const auto abuf = std::get_if<AbufNode>(dest)) {
+            UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer());
+
             target = [&]() -> std::string {
                 switch (const auto attribute = abuf->GetIndex(); abuf->GetIndex()) {
                 case Attribute::Index::Position:
@@ -840,12 +925,11 @@ private:
                 case Attribute::Index::PointSize:
                     return "gl_PointSize";
                 case Attribute::Index::ClipDistances0123:
-                    return "gl_ClipDistance[" + std::to_string(abuf->GetElement()) + ']';
+                    return fmt::format("gl_ClipDistance[{}]", abuf->GetElement());
                 case Attribute::Index::ClipDistances4567:
-                    return "gl_ClipDistance[" + std::to_string(abuf->GetElement() + 4) + ']';
+                    return fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4);
                 default:
-                    if (attribute >= Attribute::Index::Attribute_0 &&
-                        attribute <= Attribute::Index::Attribute_31) {
+                    if (IsGenericAttribute(attribute)) {
                         return GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement());
                     }
                     UNIMPLEMENTED_MSG("Unhandled output attribute: {}",
@@ -853,35 +937,21 @@ private:
                     return "0";
                 }
             }();
-
         } else if (const auto lmem = std::get_if<LmemNode>(dest)) {
-            target = GetLocalMemory() + "[ftou(" + Visit(lmem->GetAddress()) + ") / 4]";
-
+            target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
         } else if (const auto gmem = std::get_if<GmemNode>(dest)) {
             const std::string real = Visit(gmem->GetRealAddress());
             const std::string base = Visit(gmem->GetBaseAddress());
-            const std::string final_offset = "(ftou(" + real + ") - ftou(" + base + ")) / 4";
+            const std::string final_offset = fmt::format("(ftou({}) - ftou({})) / 4", real, base);
             target = fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset);
-
         } else {
             UNREACHABLE_MSG("Assign called without a proper target");
         }
 
-        code.AddLine(target + " = " + Visit(src) + ';');
+        code.AddLine("{} = {};", target, Visit(src));
         return {};
     }
 
-    std::string Composite(Operation operation) {
-        std::string value = "vec4(";
-        for (std::size_t i = 0; i < 4; ++i) {
-            value += Visit(operation[i]);
-            if (i < 3)
-                value += ", ";
-        }
-        value += ')';
-        return value;
-    }
-
     template <Type type>
     std::string Add(Operation operation) {
         return GenerateBinaryInfix(operation, "+", type, type, type);
@@ -931,8 +1001,9 @@ private:
         const std::string condition = Visit(operation[0]);
         const std::string true_case = Visit(operation[1]);
         const std::string false_case = Visit(operation[2]);
-        return ApplyPrecise(operation,
-                            '(' + condition + " ? " + true_case + " : " + false_case + ')');
+        const std::string op_str = fmt::format("({} ? {} : {})", condition, true_case, false_case);
+
+        return ApplyPrecise(operation, op_str);
     }
 
     std::string FCos(Operation operation) {
@@ -996,9 +1067,9 @@ private:
     std::string ILogicalShiftRight(Operation operation) {
         const std::string op_a = VisitOperand(operation, 0, Type::Uint);
         const std::string op_b = VisitOperand(operation, 1, Type::Uint);
+        const std::string op_str = fmt::format("int({} >> {})", op_a, op_b);
 
-        return ApplyPrecise(operation,
-                            BitwiseCastResult("int(" + op_a + " >> " + op_b + ')', Type::Int));
+        return ApplyPrecise(operation, BitwiseCastResult(op_str, Type::Int));
     }
 
     std::string IArithmeticShiftRight(Operation operation) {
@@ -1054,11 +1125,12 @@ private:
     }
 
     std::string HNegate(Operation operation) {
-        const auto GetNegate = [&](std::size_t index) -> std::string {
+        const auto GetNegate = [&](std::size_t index) {
             return VisitOperand(operation, index, Type::Bool) + " ? -1 : 1";
         };
-        const std::string value = '(' + VisitOperand(operation, 0, Type::HalfFloat) + " * vec2(" +
-                                  GetNegate(1) + ", " + GetNegate(2) + "))";
+        const std::string value =
+            fmt::format("({} * vec2({}, {}))", VisitOperand(operation, 0, Type::HalfFloat),
+                        GetNegate(1), GetNegate(2));
         return BitwiseCastResult(value, Type::HalfFloat);
     }
 
@@ -1066,7 +1138,8 @@ private:
         const std::string value = VisitOperand(operation, 0, Type::HalfFloat);
         const std::string min = VisitOperand(operation, 1, Type::Float);
         const std::string max = VisitOperand(operation, 2, Type::Float);
-        const std::string clamped = "clamp(" + value + ", vec2(" + min + "), vec2(" + max + "))";
+        const std::string clamped = fmt::format("clamp({}, vec2({}), vec2({}))", value, min, max);
+
         return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat));
     }
 
@@ -1077,34 +1150,35 @@ private:
             case Tegra::Shader::HalfType::H0_H1:
                 return operand;
             case Tegra::Shader::HalfType::F32:
-                return "vec2(fromHalf2(" + operand + "))";
+                return fmt::format("vec2(fromHalf2({}))", operand);
             case Tegra::Shader::HalfType::H0_H0:
-                return "vec2(" + operand + "[0])";
+                return fmt::format("vec2({}[0])", operand);
             case Tegra::Shader::HalfType::H1_H1:
-                return "vec2(" + operand + "[1])";
+                return fmt::format("vec2({}[1])", operand);
             }
             UNREACHABLE();
             return "0";
         }();
-        return "fromHalf2(" + value + ')';
+        return fmt::format("fromHalf2({})", value);
     }
 
     std::string HMergeF32(Operation operation) {
-        return "float(toHalf2(" + Visit(operation[0]) + ")[0])";
+        return fmt::format("float(toHalf2({})[0])", Visit(operation[0]));
     }
 
     std::string HMergeH0(Operation operation) {
-        return "fromHalf2(vec2(toHalf2(" + Visit(operation[1]) + ")[0], toHalf2(" +
-               Visit(operation[0]) + ")[1]))";
+        return fmt::format("fromHalf2(vec2(toHalf2({})[0], toHalf2({})[1]))", Visit(operation[1]),
+                           Visit(operation[0]));
     }
 
     std::string HMergeH1(Operation operation) {
-        return "fromHalf2(vec2(toHalf2(" + Visit(operation[0]) + ")[0], toHalf2(" +
-               Visit(operation[1]) + ")[1]))";
+        return fmt::format("fromHalf2(vec2(toHalf2({})[0], toHalf2({})[1]))", Visit(operation[0]),
+                           Visit(operation[1]));
     }
 
     std::string HPack2(Operation operation) {
-        return "utof(packHalf2x16(vec2(" + Visit(operation[0]) + ", " + Visit(operation[1]) + ")))";
+        return fmt::format("utof(packHalf2x16(vec2({}, {})))", Visit(operation[0]),
+                           Visit(operation[1]));
     }
 
     template <Type type>
@@ -1162,7 +1236,7 @@ private:
             target = GetInternalFlag(flag->GetFlag());
         }
 
-        code.AddLine(target + " = " + Visit(src) + ';');
+        code.AddLine("{} = {};", target, Visit(src));
         return {};
     }
 
@@ -1184,7 +1258,7 @@ private:
 
     std::string LogicalPick2(Operation operation) {
         const std::string pair = VisitOperand(operation, 0, Type::Bool2);
-        return pair + '[' + VisitOperand(operation, 1, Type::Uint) + ']';
+        return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint));
     }
 
     std::string LogicalAll2(Operation operation) {
@@ -1196,15 +1270,15 @@ private:
     }
 
     template <bool with_nan>
-    std::string GenerateHalfComparison(Operation operation, std::string compare_op) {
-        std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2,
-                                                  Type::HalfFloat, Type::HalfFloat)};
+    std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) {
+        const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2,
+                                                        Type::HalfFloat, Type::HalfFloat)};
         if constexpr (!with_nan) {
             return comparison;
         }
-        return "halfFloatNanComparison(" + comparison + ", " +
-               VisitOperand(operation, 0, Type::HalfFloat) + ", " +
-               VisitOperand(operation, 1, Type::HalfFloat) + ')';
+        return fmt::format("halfFloatNanComparison({}, {}, {})", comparison,
+                           VisitOperand(operation, 0, Type::HalfFloat),
+                           VisitOperand(operation, 1, Type::HalfFloat));
     }
 
     template <bool with_nan>
@@ -1281,12 +1355,12 @@ private:
         switch (meta->element) {
         case 0:
         case 1:
-            return "itof(int(textureSize(" + sampler + ", " + lod + ')' +
-                   GetSwizzle(meta->element) + "))";
+            return fmt::format("itof(int(textureSize({}, {}){}))", sampler, lod,
+                               GetSwizzle(meta->element));
         case 2:
             return "0";
         case 3:
-            return "itof(textureQueryLevels(" + sampler + "))";
+            return fmt::format("itof(textureQueryLevels({}))", sampler);
         }
         UNREACHABLE();
         return "0";
@@ -1297,8 +1371,9 @@ private:
         ASSERT(meta);
 
         if (meta->element < 2) {
-            return "itof(int((" + GenerateTexture(operation, "QueryLod", {}) + " * vec2(256))" +
-                   GetSwizzle(meta->element) + "))";
+            return fmt::format("itof(int(({} * vec2(256)){}))",
+                               GenerateTexture(operation, "QueryLod", {}),
+                               GetSwizzle(meta->element));
         }
         return "0";
     }
@@ -1337,7 +1412,7 @@ private:
         const auto target = std::get_if<ImmediateNode>(operation[0]);
         UNIMPLEMENTED_IF(!target);
 
-        code.AddLine(fmt::format("jmp_to = 0x{:x}u;", target->GetValue()));
+        code.AddLine("jmp_to = 0x{:x}u;", target->GetValue());
         code.AddLine("break;");
         return {};
     }
@@ -1346,7 +1421,7 @@ private:
         const auto target = std::get_if<ImmediateNode>(operation[0]);
         UNIMPLEMENTED_IF(!target);
 
-        code.AddLine(fmt::format("flow_stack[flow_stack_top++] = 0x{:x}u;", target->GetValue()));
+        code.AddLine("flow_stack[flow_stack_top++] = 0x{:x}u;", target->GetValue());
         return {};
     }
 
@@ -1372,7 +1447,7 @@ private:
 
         UNIMPLEMENTED_IF_MSG(header.ps.omap.sample_mask != 0, "Sample mask write is unimplemented");
 
-        code.AddLine("if (alpha_test[0] != 0) {");
+        code.AddLine("if (alpha_test[0] != 0) {{");
         ++code.scope;
         // We start on the register containing the alpha value in the first RT.
         u32 current_reg = 3;
@@ -1383,13 +1458,12 @@ private:
                 header.ps.IsColorComponentOutputEnabled(render_target, 1) ||
                 header.ps.IsColorComponentOutputEnabled(render_target, 2) ||
                 header.ps.IsColorComponentOutputEnabled(render_target, 3)) {
-                code.AddLine(
-                    fmt::format("if (!AlphaFunc({})) discard;", SafeGetRegister(current_reg)));
+                code.AddLine("if (!AlphaFunc({})) discard;", SafeGetRegister(current_reg));
                 current_reg += 4;
             }
         }
         --code.scope;
-        code.AddLine('}');
+        code.AddLine("}}");
 
         // Write the color outputs using the data in the shader registers, disabled
         // rendertargets/components are skipped in the register assignment.
@@ -1398,8 +1472,8 @@ private:
             // TODO(Subv): Figure out how dual-source blending is configured in the Switch.
             for (u32 component = 0; component < 4; ++component) {
                 if (header.ps.IsColorComponentOutputEnabled(render_target, component)) {
-                    code.AddLine(fmt::format("FragColor{}[{}] = {};", render_target, component,
-                                             SafeGetRegister(current_reg)));
+                    code.AddLine("FragColor{}[{}] = {};", render_target, component,
+                                 SafeGetRegister(current_reg));
                     ++current_reg;
                 }
             }
@@ -1408,7 +1482,7 @@ private:
         if (header.ps.omap.depth) {
             // The depth output is always 2 registers after the last color output, and current_reg
             // already contains one past the last color register.
-            code.AddLine("gl_FragDepth = " + SafeGetRegister(current_reg + 1) + ';');
+            code.AddLine("gl_FragDepth = {};", SafeGetRegister(current_reg + 1));
         }
 
         code.AddLine("return;");
@@ -1418,11 +1492,11 @@ private:
     std::string Discard(Operation operation) {
         // Enclose "discard" in a conditional, so that GLSL compilation does not complain
         // about unexecuted instructions that may follow this.
-        code.AddLine("if (true) {");
+        code.AddLine("if (true) {{"); // AddLine takes a fmt string: "{{" emits one literal '{'.
         ++code.scope;
         code.AddLine("discard;");
         --code.scope;
-        code.AddLine("}");
+        code.AddLine("}}"); // "}}" likewise emits a single literal '}'.
         return {};
     }
 
@@ -1602,15 +1676,11 @@ private:
     }
 
     std::string GetInputAttribute(Attribute::Index attribute) const {
-        const auto index{static_cast<u32>(attribute) -
-                         static_cast<u32>(Attribute::Index::Attribute_0)};
-        return GetDeclarationWithSuffix(index, "input_attr");
+        return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "input_attr");
     }
 
     std::string GetOutputAttribute(Attribute::Index attribute) const {
-        const auto index{static_cast<u32>(attribute) -
-                         static_cast<u32>(Attribute::Index::Attribute_0)};
-        return GetDeclarationWithSuffix(index, "output_attr");
+        return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "output_attr");
     }
 
     std::string GetConstBuffer(u32 index) const {
@@ -1640,7 +1710,7 @@ private:
         const auto index = static_cast<u32>(flag);
         ASSERT(index < static_cast<u32>(InternalFlag::Amount));
 
-        return std::string(InternalFlagNames[index]) + '_' + suffix;
+        return fmt::format("{}_{}", InternalFlagNames[index], suffix);
     }
 
     std::string GetSampler(const Sampler& sampler) const {
@@ -1648,7 +1718,20 @@ private:
     }
 
     std::string GetDeclarationWithSuffix(u32 index, const std::string& name) const {
-        return name + '_' + std::to_string(index) + '_' + suffix;
+        return fmt::format("{}_{}_{}", name, index, suffix);
+    }
+
+    u32 GetNumPhysicalInputAttributes() const { // Vertex stage: attributes; else: varyings.
+        return stage == ShaderStage::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings();
+    }
+
+    u32 GetNumPhysicalAttributes() const { // Device limit, capped at NumVertexAttributes.
+        return std::min<u32>(device.GetMaxVertexAttributes(), Maxwell::NumVertexAttributes);
+    }
+
+    u32 GetNumPhysicalVaryings() const { // Device limit minus the generic start, capped.
+        return std::min<u32>(device.GetMaxVaryings() - GENERIC_VARYING_START_LOCATION,
+                             Maxwell::NumVaryings);
     }
 
     const Device& device;
@@ -1663,24 +1746,25 @@ private:
 } // Anonymous namespace
 
 std::string GetCommonDeclarations() {
-    const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS);
-    return "#define MAX_CONSTBUFFER_ELEMENTS " + cbuf + "\n" +
-           "#define ftoi floatBitsToInt\n"
-           "#define ftou floatBitsToUint\n"
-           "#define itof intBitsToFloat\n"
-           "#define utof uintBitsToFloat\n\n"
-           "float fromHalf2(vec2 pair) {\n"
-           "    return utof(packHalf2x16(pair));\n"
-           "}\n\n"
-           "vec2 toHalf2(float value) {\n"
-           "    return unpackHalf2x16(ftou(value));\n"
-           "}\n\n"
-           "bvec2 halfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {\n"
-           "    bvec2 is_nan1 = isnan(pair1);\n"
-           "    bvec2 is_nan2 = isnan(pair2);\n"
-           "    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || "
-           "is_nan2.y);\n"
-           "}\n";
+    return fmt::format(
+        "#define MAX_CONSTBUFFER_ELEMENTS {}\n"
+        "#define ftoi floatBitsToInt\n"
+        "#define ftou floatBitsToUint\n"
+        "#define itof intBitsToFloat\n"
+        "#define utof uintBitsToFloat\n\n"
+        "float fromHalf2(vec2 pair) {{\n"
+        "    return utof(packHalf2x16(pair));\n"
+        "}}\n\n"
+        "vec2 toHalf2(float value) {{\n"
+        "    return unpackHalf2x16(ftou(value));\n"
+        "}}\n\n"
+        "bvec2 halfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{\n"
+        "    bvec2 is_nan1 = isnan(pair1);\n"
+        "    bvec2 is_nan2 = isnan(pair2);\n"
+        "    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || "
+        "is_nan2.y);\n"
+        "}}\n",
+        MAX_CONSTBUFFER_ELEMENTS); // Fills the sole "{}"; "{{" and "}}" are literal braces.
 }
 
 ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage,
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index ed7afc4a0..fba9c594a 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -104,8 +104,9 @@ bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const {
     return true;
 }
 
-ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system)
-    : system{system}, precompiled_cache_virtual_file_offset{0} {}
+ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {}
+
+ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default;
 
 std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>>
 ShaderDiskCacheOpenGL::LoadTransferable() {
@@ -243,7 +244,7 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
                 return {};
             }
 
-            const auto entry = LoadDecompiledEntry();
+            auto entry = LoadDecompiledEntry();
             if (!entry) {
                 return {};
             }
@@ -287,13 +288,13 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
         return {};
     }
 
-    std::vector<u8> code(code_size);
+    std::string code(code_size, '\0');
     if (!LoadArrayFromPrecompiled(code.data(), code.size())) {
         return {};
     }
 
     ShaderDiskCacheDecompiled entry;
-    entry.code = std::string(reinterpret_cast<const char*>(code.data()), code_size);
+    entry.code = std::move(code);
 
     u32 const_buffers_count{};
     if (!LoadObjectFromPrecompiled(const_buffers_count)) {
@@ -303,12 +304,12 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
     for (u32 i = 0; i < const_buffers_count; ++i) {
         u32 max_offset{};
         u32 index{};
-        u8 is_indirect{};
+        bool is_indirect{};
         if (!LoadObjectFromPrecompiled(max_offset) || !LoadObjectFromPrecompiled(index) ||
             !LoadObjectFromPrecompiled(is_indirect)) {
             return {};
         }
-        entry.entries.const_buffers.emplace_back(max_offset, is_indirect != 0, index);
+        entry.entries.const_buffers.emplace_back(max_offset, is_indirect, index);
     }
 
     u32 samplers_count{};
@@ -320,18 +321,17 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
         u64 offset{};
         u64 index{};
         u32 type{};
-        u8 is_array{};
-        u8 is_shadow{};
-        u8 is_bindless{};
+        bool is_array{};
+        bool is_shadow{};
+        bool is_bindless{};
         if (!LoadObjectFromPrecompiled(offset) || !LoadObjectFromPrecompiled(index) ||
             !LoadObjectFromPrecompiled(type) || !LoadObjectFromPrecompiled(is_array) ||
             !LoadObjectFromPrecompiled(is_shadow) || !LoadObjectFromPrecompiled(is_bindless)) {
             return {};
         }
-        entry.entries.samplers.emplace_back(static_cast<std::size_t>(offset),
-                                            static_cast<std::size_t>(index),
-                                            static_cast<Tegra::Shader::TextureType>(type),
-                                            is_array != 0, is_shadow != 0, is_bindless != 0);
+        entry.entries.samplers.emplace_back(
+            static_cast<std::size_t>(offset), static_cast<std::size_t>(index),
+            static_cast<Tegra::Shader::TextureType>(type), is_array, is_shadow, is_bindless);
     }
 
     u32 global_memory_count{};
@@ -342,21 +342,20 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
     for (u32 i = 0; i < global_memory_count; ++i) {
         u32 cbuf_index{};
         u32 cbuf_offset{};
-        u8 is_read{};
-        u8 is_written{};
+        bool is_read{};
+        bool is_written{};
         if (!LoadObjectFromPrecompiled(cbuf_index) || !LoadObjectFromPrecompiled(cbuf_offset) ||
             !LoadObjectFromPrecompiled(is_read) || !LoadObjectFromPrecompiled(is_written)) {
             return {};
         }
-        entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read != 0,
-                                                         is_written != 0);
+        entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read,
+                                                         is_written);
     }
 
     for (auto& clip_distance : entry.entries.clip_distances) {
-        u8 clip_distance_raw{};
-        if (!LoadObjectFromPrecompiled(clip_distance_raw))
+        if (!LoadObjectFromPrecompiled(clip_distance)) {
             return {};
-        clip_distance = clip_distance_raw != 0;
+        }
     }
 
     u64 shader_length{};
@@ -384,7 +383,7 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std:
     for (const auto& cbuf : entries.const_buffers) {
         if (!SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetMaxOffset())) ||
             !SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetIndex())) ||
-            !SaveObjectToPrecompiled(static_cast<u8>(cbuf.IsIndirect() ? 1 : 0))) {
+            !SaveObjectToPrecompiled(cbuf.IsIndirect())) {
             return false;
         }
     }
@@ -396,9 +395,9 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std:
         if (!SaveObjectToPrecompiled(static_cast<u64>(sampler.GetOffset())) ||
             !SaveObjectToPrecompiled(static_cast<u64>(sampler.GetIndex())) ||
             !SaveObjectToPrecompiled(static_cast<u32>(sampler.GetType())) ||
-            !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsArray() ? 1 : 0)) ||
-            !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsShadow() ? 1 : 0)) ||
-            !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsBindless() ? 1 : 0))) {
+            !SaveObjectToPrecompiled(sampler.IsArray()) ||
+            !SaveObjectToPrecompiled(sampler.IsShadow()) ||
+            !SaveObjectToPrecompiled(sampler.IsBindless())) {
             return false;
         }
     }
@@ -409,14 +408,13 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std:
     for (const auto& gmem : entries.global_memory_entries) {
         if (!SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufIndex())) ||
             !SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufOffset())) ||
-            !SaveObjectToPrecompiled(static_cast<u8>(gmem.IsRead() ? 1 : 0)) ||
-            !SaveObjectToPrecompiled(static_cast<u8>(gmem.IsWritten() ? 1 : 0))) {
+            !SaveObjectToPrecompiled(gmem.IsRead()) || !SaveObjectToPrecompiled(gmem.IsWritten())) {
             return false;
         }
     }
 
     for (const bool clip_distance : entries.clip_distances) {
-        if (!SaveObjectToPrecompiled(static_cast<u8>(clip_distance ? 1 : 0))) {
+        if (!SaveObjectToPrecompiled(clip_distance)) {
             return false;
         }
     }
@@ -475,7 +473,10 @@ void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) {
     ASSERT_MSG(it != transferable.end(), "Saving shader usage without storing raw previously");
 
     auto& usages{it->second};
-    ASSERT(usages.find(usage) == usages.end());
+    if (usages.find(usage) != usages.end()) {
+        // Skip this variant since the shader is already stored.
+        return;
+    }
     usages.insert(usage);
 
     FileUtil::IOFile file = AppendTransferableFile();
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index 0142b2e3b..2da0a4a23 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -70,14 +70,14 @@ namespace std {
 
 template <>
 struct hash<OpenGL::BaseBindings> {
-    std::size_t operator()(const OpenGL::BaseBindings& bindings) const {
+    std::size_t operator()(const OpenGL::BaseBindings& bindings) const noexcept {
         return bindings.cbuf | bindings.gmem << 8 | bindings.sampler << 16;
     }
 };
 
 template <>
 struct hash<OpenGL::ShaderDiskCacheUsage> {
-    std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const {
+    std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept {
         return static_cast<std::size_t>(usage.unique_identifier) ^
                std::hash<OpenGL::BaseBindings>()(usage.bindings) ^ usage.primitive << 16;
     }
@@ -162,6 +162,7 @@ struct ShaderDiskCacheDump {
 class ShaderDiskCacheOpenGL {
 public:
     explicit ShaderDiskCacheOpenGL(Core::System& system);
+    ~ShaderDiskCacheOpenGL();
 
     /// Loads transferable cache. If file has a old version or on failure, it deletes the file.
     std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>>
@@ -259,20 +260,35 @@ private:
         return SaveArrayToPrecompiled(&object, 1);
     }
 
+    bool SaveObjectToPrecompiled(bool object) {
+        const u8 encoded = object ? u8{1} : u8{0}; // bools are persisted as a single 0/1 byte
+        return SaveArrayToPrecompiled(&encoded, 1);
+    }
+
     template <typename T>
     bool LoadObjectFromPrecompiled(T& object) {
         return LoadArrayFromPrecompiled(&object, 1);
     }
 
-    // Copre system
+    bool LoadObjectFromPrecompiled(bool& object) {
+        // Bools are read back as one byte; any non-zero value decodes to true.
+        u8 raw_byte;
+        if (!LoadArrayFromPrecompiled(&raw_byte, 1)) {
+            // The output parameter is left untouched when the read fails.
+            return false;
+        }
+        object = raw_byte != 0;
+        return true;
+    }
+
+    // Core system
     Core::System& system;
     // Stored transferable shaders
     std::map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable;
-    // Stores whole precompiled cache which will be read from or saved to the precompiled chache
-    // file
+    // Stores whole precompiled cache which will be read from/saved to the precompiled cache file
     FileSys::VectorVfsFile precompiled_cache_virtual_file;
     // Stores the current offset of the precompiled cache file for IO purposes
-    std::size_t precompiled_cache_virtual_file_offset;
+    std::size_t precompiled_cache_virtual_file_offset = 0;
 
     // The cache has been loaded at boot
     bool tried_to_load{};
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 6abf948f8..7ab0b4553 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -33,14 +33,14 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
 };
 
 )";
-    ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
     ProgramResult program =
         Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");
 
     out += program.first;
 
     if (setup.IsDualProgram()) {
-        ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET);
+        const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET);
         ProgramResult program_b =
             Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");
 
@@ -76,7 +76,7 @@ void main() {
     }
 })";
 
-    return {out, program.second};
+    return {std::move(out), std::move(program.second)};
 }
 
 ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup) {
@@ -97,7 +97,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
 };
 
 )";
-    ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
     ProgramResult program =
         Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
     out += program.first;
@@ -107,7 +107,7 @@ void main() {
     execute_geometry();
 };)";
 
-    return {out, program.second};
+    return {std::move(out), std::move(program.second)};
 }
 
 ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup) {
@@ -160,7 +160,7 @@ bool AlphaFunc(in float value) {
 }
 
 )";
-    ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
     ProgramResult program =
         Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");
 
@@ -172,7 +172,7 @@ void main() {
 }
 
 )";
-    return {out, program.second};
+    return {std::move(out), std::move(program.second)};
 }
 
 } // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 95b773135..ed7b5cff0 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -126,6 +126,8 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
         return GL_TRIANGLES;
     case Maxwell::PrimitiveTopology::TriangleStrip:
         return GL_TRIANGLE_STRIP;
+    case Maxwell::PrimitiveTopology::TriangleFan:
+        return GL_TRIANGLE_FAN;
     default:
         LOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology));
         UNREACHABLE();
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 08b786aad..3edf460df 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -49,9 +49,6 @@ public:
         return alignment;
     }
 
-    // We do not have to flush this cache as things in it are never modified by us.
-    void Flush() override {}
-
 private:
     VAddr cpu_addr{};
     std::size_t size{};
@@ -87,6 +84,10 @@ public:
         return buffer_handle;
     }
 
+protected:
+    // We do not have to flush this cache as things in it are never modified by us.
+    void FlushObjectInner(const std::shared_ptr<CachedBufferEntry>& object) override {}
+
 private:
     void AlignBuffer(std::size_t alignment);
 
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 23d9b10db..b61a6d170 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -194,8 +194,8 @@ public:
         for (const auto& sampler : ir.GetSamplers()) {
             entries.samplers.emplace_back(sampler);
         }
-        for (const auto& attr : ir.GetInputAttributes()) {
-            entries.attributes.insert(GetGenericAttributeLocation(attr.first));
+        for (const auto& attribute : ir.GetInputAttributes()) {
+            entries.attributes.insert(GetGenericAttributeLocation(attribute));
         }
         entries.clip_distances = ir.GetClipDistances();
         entries.shader_length = ir.GetLength();
@@ -315,15 +315,13 @@ private:
         constexpr std::array<const char*, INTERNAL_FLAGS_COUNT> names = {"zero", "sign", "carry",
                                                                          "overflow"};
         for (std::size_t flag = 0; flag < INTERNAL_FLAGS_COUNT; ++flag) {
-            const auto flag_code = static_cast<InternalFlag>(flag);
             const Id id = OpVariable(t_prv_bool, spv::StorageClass::Private, v_false);
             internal_flags[flag] = AddGlobalVariable(Name(id, names[flag]));
         }
     }
 
     void DeclareInputAttributes() {
-        for (const auto element : ir.GetInputAttributes()) {
-            const Attribute::Index index = element.first;
+        for (const auto index : ir.GetInputAttributes()) {
             if (!IsGenericAttribute(index)) {
                 continue;
             }
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index ea1092db1..6a992c543 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -47,17 +47,20 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
                              "Indirect attribute loads are not supported");
         UNIMPLEMENTED_IF_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) != 0,
                              "Unaligned attribute loads are not supported");
+        UNIMPLEMENTED_IF_MSG(instr.attribute.fmt20.IsPhysical() &&
+                                 instr.attribute.fmt20.size != Tegra::Shader::AttributeSize::Word,
+                             "Non-32 bits PHYS reads are not implemented");
 
-        Tegra::Shader::IpaMode input_mode{Tegra::Shader::IpaInterpMode::Pass,
-                                          Tegra::Shader::IpaSampleMode::Default};
+        const Node buffer{GetRegister(instr.gpr39)};
 
         u64 next_element = instr.attribute.fmt20.element;
         auto next_index = static_cast<u64>(instr.attribute.fmt20.index.Value());
 
         const auto LoadNextElement = [&](u32 reg_offset) {
-            const Node buffer = GetRegister(instr.gpr39);
-            const Node attribute = GetInputAttribute(static_cast<Attribute::Index>(next_index),
-                                                     next_element, input_mode, buffer);
+            const Node attribute{instr.attribute.fmt20.IsPhysical()
+                                     ? GetPhysicalInputAttribute(instr.gpr8, buffer)
+                                     : GetInputAttribute(static_cast<Attribute::Index>(next_index),
+                                                         next_element, buffer)};
 
             SetRegister(bb, instr.gpr0.Value() + reg_offset, attribute);
 
@@ -239,6 +242,21 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
         }
         break;
     }
+    case OpCode::Id::AL2P: {
+        // Ignore al2p.direction since we don't care about it.
+
+        // Calculate emulation fake physical address.
+        const Node fixed_address{Immediate(static_cast<u32>(instr.al2p.address))};
+        const Node reg{GetRegister(instr.gpr8)};
+        const Node fake_address{Operation(OperationCode::IAdd, NO_PRECISE, reg, fixed_address)};
+
+        // Set the fake address to target register.
+        SetRegister(bb, instr.gpr0, fake_address);
+
+        // Signal the shader IR to declare all possible attributes and varyings
+        uses_physical_attributes = true;
+        break;
+    }
     default:
         UNIMPLEMENTED_MSG("Unhandled memory instruction: {}", opcode->get().GetName());
     }
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index d750a2936..fa17c45b5 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -130,15 +130,18 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         break;
     }
     case OpCode::Id::IPA: {
-        const auto& attribute = instr.attribute.fmt28;
+        const bool is_physical = instr.ipa.idx && instr.gpr8.Value() != 0xff;
+
+        const auto attribute = instr.attribute.fmt28;
         const Tegra::Shader::IpaMode input_mode{instr.ipa.interp_mode.Value(),
                                                 instr.ipa.sample_mode.Value()};
 
-        const Node attr = GetInputAttribute(attribute.index, attribute.element, input_mode);
-        Node value = attr;
+        Node value = is_physical ? GetPhysicalInputAttribute(instr.gpr8)
+                                 : GetInputAttribute(attribute.index, attribute.element);
         const Tegra::Shader::Attribute::Index index = attribute.index.Value();
-        if (index >= Tegra::Shader::Attribute::Index::Attribute_0 &&
-            index <= Tegra::Shader::Attribute::Index::Attribute_31) {
+        const bool is_generic = index >= Tegra::Shader::Attribute::Index::Attribute_0 &&
+                                index <= Tegra::Shader::Attribute::Index::Attribute_31;
+        if (is_generic || is_physical) {
             // TODO(Blinkhawk): There are cases where a perspective attribute use PASS.
             // In theory by setting them as perspective, OpenGL does the perspective correction.
             // A way must figured to reverse the last step of it.
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 819cc6131..5b033126d 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -540,8 +540,6 @@ Node4 ShaderIR::GetTexsCode(Instruction instr, TextureType texture_type,
 Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool depth_compare,
                             bool is_array, bool is_aoffi) {
     const std::size_t coord_count = GetCoordCount(texture_type);
-    const std::size_t total_coord_count = coord_count + (is_array ? 1 : 0);
-    const std::size_t total_reg_count = total_coord_count + (depth_compare ? 1 : 0);
 
     // If enabled arrays index is always stored in the gpr8 field
     const u64 array_register = instr.gpr8.Value();
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index e4eb0dfd9..153ad1fd0 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -21,6 +21,13 @@ using Tegra::Shader::PredCondition;
 using Tegra::Shader::PredOperation;
 using Tegra::Shader::Register;
 
+ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset)
+    : program_code{program_code}, main_offset{main_offset} {
+    Decode();
+}
+
+ShaderIR::~ShaderIR() = default;
+
 Node ShaderIR::StoreNode(NodeData&& node_data) {
     auto store = std::make_unique<NodeData>(node_data);
     const Node node = store.get();
@@ -89,13 +96,14 @@ Node ShaderIR::GetPredicate(bool immediate) {
     return GetPredicate(static_cast<u64>(immediate ? Pred::UnusedIndex : Pred::NeverExecute));
 }
 
-Node ShaderIR::GetInputAttribute(Attribute::Index index, u64 element,
-                                 const Tegra::Shader::IpaMode& input_mode, Node buffer) {
-    const auto [entry, is_new] =
-        used_input_attributes.emplace(std::make_pair(index, std::set<Tegra::Shader::IpaMode>{}));
-    entry->second.insert(input_mode);
+Node ShaderIR::GetInputAttribute(Attribute::Index index, u64 element, Node buffer) {
+    used_input_attributes.emplace(index);
+    return StoreNode(AbufNode(index, static_cast<u32>(element), buffer));
+}
 
-    return StoreNode(AbufNode(index, static_cast<u32>(element), input_mode, buffer));
+Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_address, Node buffer) {
+    uses_physical_attributes = true;
+    return StoreNode(AbufNode(GetRegister(physical_address), buffer));
 }
 
 Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buffer) {
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 65f1e1de9..0bf124252 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -328,40 +328,31 @@ struct MetaTexture {
     u32 element{};
 };
 
-inline constexpr MetaArithmetic PRECISE = {true};
-inline constexpr MetaArithmetic NO_PRECISE = {false};
+constexpr MetaArithmetic PRECISE = {true};
+constexpr MetaArithmetic NO_PRECISE = {false};
 
 using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>;
 
 /// Holds any kind of operation that can be done in the IR
 class OperationNode final {
 public:
-    template <typename... T>
-    explicit constexpr OperationNode(OperationCode code) : code{code}, meta{} {}
+    explicit OperationNode(OperationCode code) : code{code} {}
 
-    template <typename... T>
-    explicit constexpr OperationNode(OperationCode code, Meta&& meta)
-        : code{code}, meta{std::move(meta)} {}
+    explicit OperationNode(OperationCode code, Meta&& meta) : code{code}, meta{std::move(meta)} {}
 
     template <typename... T>
-    explicit constexpr OperationNode(OperationCode code, const T*... operands)
+    explicit OperationNode(OperationCode code, const T*... operands)
         : OperationNode(code, {}, operands...) {}
 
     template <typename... T>
-    explicit constexpr OperationNode(OperationCode code, Meta&& meta, const T*... operands_)
-        : code{code}, meta{std::move(meta)} {
-
-        auto operands_list = {operands_...};
-        for (auto& operand : operands_list) {
-            operands.push_back(operand);
-        }
-    }
+    explicit OperationNode(OperationCode code, Meta&& meta, const T*... operands_)
+        : code{code}, meta{std::move(meta)}, operands{operands_...} {}
 
     explicit OperationNode(OperationCode code, Meta&& meta, std::vector<Node>&& operands)
         : code{code}, meta{meta}, operands{std::move(operands)} {}
 
     explicit OperationNode(OperationCode code, std::vector<Node>&& operands)
-        : code{code}, meta{}, operands{std::move(operands)} {}
+        : code{code}, operands{std::move(operands)} {}
 
     OperationCode GetCode() const {
         return code;
@@ -465,17 +456,14 @@ private:
 /// Attribute buffer memory (known as attributes or varyings in GLSL terms)
 class AbufNode final {
 public:
-    explicit constexpr AbufNode(Tegra::Shader::Attribute::Index index, u32 element,
-                                const Tegra::Shader::IpaMode& input_mode, Node buffer = {})
-        : input_mode{input_mode}, buffer{buffer}, index{index}, element{element} {}
-
+    // Initialize for standard attributes (index is explicit).
     explicit constexpr AbufNode(Tegra::Shader::Attribute::Index index, u32 element,
                                 Node buffer = {})
-        : input_mode{}, buffer{buffer}, index{index}, element{element} {}
+        : buffer{buffer}, index{index}, element{element} {}
 
-    Tegra::Shader::IpaMode GetInputMode() const {
-        return input_mode;
-    }
+    // Initialize for physical attributes (index is a variable value).
+    explicit constexpr AbufNode(Node physical_address, Node buffer = {})
+        : physical_address{physical_address}, buffer{buffer} {}
 
     Tegra::Shader::Attribute::Index GetIndex() const {
         return index;
@@ -489,11 +477,19 @@ public:
         return buffer;
     }
 
+    bool IsPhysicalBuffer() const {
+        return physical_address != nullptr;
+    }
+
+    Node GetPhysicalAddress() const {
+        return physical_address;
+    }
+
 private:
-    const Tegra::Shader::IpaMode input_mode;
-    const Node buffer;
-    const Tegra::Shader::Attribute::Index index;
-    const u32 element;
+    Node physical_address{};
+    Node buffer{};
+    Tegra::Shader::Attribute::Index index{};
+    u32 element{};
 };
 
 /// Constant buffer node, usually mapped to uniform buffers in GLSL
@@ -567,11 +563,8 @@ private:
 
 class ShaderIR final {
 public:
-    explicit ShaderIR(const ProgramCode& program_code, u32 main_offset)
-        : program_code{program_code}, main_offset{main_offset} {
-
-        Decode();
-    }
+    explicit ShaderIR(const ProgramCode& program_code, u32 main_offset);
+    ~ShaderIR();
 
     const std::map<u32, NodeBlock>& GetBasicBlocks() const {
         return basic_blocks;
@@ -585,8 +578,7 @@ public:
         return used_predicates;
     }
 
-    const std::map<Tegra::Shader::Attribute::Index, std::set<Tegra::Shader::IpaMode>>&
-    GetInputAttributes() const {
+    const std::set<Tegra::Shader::Attribute::Index>& GetInputAttributes() const {
         return used_input_attributes;
     }
 
@@ -615,6 +607,10 @@ public:
         return static_cast<std::size_t>(coverage_end * sizeof(u64));
     }
 
+    bool HasPhysicalAttributes() const {
+        return uses_physical_attributes;
+    }
+
     const Tegra::Shader::Header& GetHeader() const {
         return header;
     }
@@ -696,8 +692,9 @@ private:
     /// Generates a predicate node for an immediate true or false value
     Node GetPredicate(bool immediate);
     /// Generates a node representing an input attribute. Keeps track of used attributes.
-    Node GetInputAttribute(Tegra::Shader::Attribute::Index index, u64 element,
-                           const Tegra::Shader::IpaMode& input_mode, Node buffer = {});
+    Node GetInputAttribute(Tegra::Shader::Attribute::Index index, u64 element, Node buffer = {});
+    /// Generates a node representing a physical input attribute.
+    Node GetPhysicalInputAttribute(Tegra::Shader::Register physical_address, Node buffer = {});
     /// Generates a node representing an output attribute. Keeps track of used attributes.
     Node GetOutputAttribute(Tegra::Shader::Attribute::Index index, u64 element, Node buffer);
     /// Generates a node representing an internal flag
@@ -814,11 +811,12 @@ private:
     void WriteLop3Instruction(NodeBlock& bb, Tegra::Shader::Register dest, Node op_a, Node op_b,
                               Node op_c, Node imm_lut, bool sets_cc);
 
-    Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor);
+    Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
 
-    std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor);
+    std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const;
 
-    std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code, s64 cursor);
+    std::pair<Node, s64> TrackRegister(const GprNode* tracked, const NodeBlock& code,
+                                       s64 cursor) const;
 
     std::tuple<Node, Node, GlobalMemoryBase> TrackAndGetGlobalMemory(NodeBlock& bb,
                                                                      Node addr_register,
@@ -835,12 +833,10 @@ private:
         return StoreNode(OperationNode(code, std::move(meta), operands...));
     }
 
-    template <typename... T>
     Node Operation(OperationCode code, std::vector<Node>&& operands) {
         return StoreNode(OperationNode(code, std::move(operands)));
     }
 
-    template <typename... T>
     Node Operation(OperationCode code, Meta&& meta, std::vector<Node>&& operands) {
         return StoreNode(OperationNode(code, std::move(meta), std::move(operands)));
     }
@@ -872,13 +868,13 @@ private:
 
     std::set<u32> used_registers;
     std::set<Tegra::Shader::Pred> used_predicates;
-    std::map<Tegra::Shader::Attribute::Index, std::set<Tegra::Shader::IpaMode>>
-        used_input_attributes;
+    std::set<Tegra::Shader::Attribute::Index> used_input_attributes;
     std::set<Tegra::Shader::Attribute::Index> used_output_attributes;
     std::map<u32, ConstBuffer> used_cbufs;
     std::set<Sampler> used_samplers;
     std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
     std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory;
+    bool uses_physical_attributes{}; // Shader uses AL2P or physical attribute read/writes
 
     Tegra::Shader::Header header;
 };
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index 4505667ff..19ede1eb9 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -17,22 +17,24 @@ std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
     for (; cursor >= 0; --cursor) {
         const Node node = code.at(cursor);
         if (const auto operation = std::get_if<OperationNode>(node)) {
-            if (operation->GetCode() == operation_code)
+            if (operation->GetCode() == operation_code) {
                 return {node, cursor};
+            }
         }
         if (const auto conditional = std::get_if<ConditionalNode>(node)) {
             const auto& conditional_code = conditional->GetCode();
             const auto [found, internal_cursor] = FindOperation(
                 conditional_code, static_cast<s64>(conditional_code.size() - 1), operation_code);
-            if (found)
+            if (found) {
                 return {found, cursor};
+            }
         }
     }
     return {};
 }
 } // namespace
 
-Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) {
+Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const {
     if (const auto cbuf = std::get_if<CbufNode>(tracked)) {
         // Cbuf found, but it has to be immediate
         return std::holds_alternative<ImmediateNode>(*cbuf->GetOffset()) ? tracked : nullptr;
@@ -65,7 +67,7 @@ Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) {
     return nullptr;
 }
 
-std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) {
+std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const {
     // Reduce the cursor in one to avoid infinite loops when the instruction sets the same register
     // that it uses as operand
     const auto [found, found_cursor] =
@@ -80,7 +82,7 @@ std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code,
 }
 
 std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const NodeBlock& code,
-                                             s64 cursor) {
+                                             s64 cursor) const {
     for (; cursor >= 0; --cursor) {
         const auto [found_node, new_cursor] = FindOperation(code, cursor, OperationCode::Assign);
         if (!found_node) {
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index b508d64e9..a9b8f69af 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -25,8 +25,8 @@
 
 class InputBitStream {
 public:
-    explicit InputBitStream(const unsigned char* ptr, int nBits = 0, int start_offset = 0)
-        : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {}
+    explicit InputBitStream(const unsigned char* ptr, int start_offset = 0)
+        : m_CurByte(ptr), m_NextBit(start_offset % 8) {}
 
     ~InputBitStream() = default;
 
@@ -55,12 +55,9 @@ public:
     }
 
 private:
-    const int m_NumBits;
     const unsigned char* m_CurByte;
     int m_NextBit = 0;
     int m_BitsRead = 0;
-
-    bool done = false;
 };
 
 class OutputBitStream {
@@ -114,7 +111,6 @@ private:
     const int m_NumBits;
     unsigned char* m_CurByte;
     int m_NextBit = 0;
-    int m_BitsRead = 0;
 
     bool done = false;
 };
@@ -1616,6 +1612,7 @@ namespace Tegra::Texture::ASTC {
 std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t height,
                                 uint32_t depth, uint32_t block_width, uint32_t block_height) {
     uint32_t blockIdx = 0;
+    std::size_t depth_offset = 0;
     std::vector<uint8_t> outData(height * width * depth * 4);
     for (uint32_t k = 0; k < depth; k++) {
         for (uint32_t j = 0; j < height; j += block_height) {
@@ -1630,7 +1627,7 @@ std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t he
                 uint32_t decompWidth = std::min(block_width, width - i);
                 uint32_t decompHeight = std::min(block_height, height - j);
 
-                uint8_t* outRow = outData.data() + (j * width + i) * 4;
+                uint8_t* outRow = depth_offset + outData.data() + (j * width + i) * 4;
                 for (uint32_t jj = 0; jj < decompHeight; jj++) {
                     memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4);
                 }
@@ -1638,6 +1635,7 @@ std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t he
                 blockIdx++;
             }
         }
+        depth_offset += height * width * 4;
     }
 
     return outData;