65 files changed, 1342 insertions, 1185 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 1e010e4da..f8b67cbe1 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -3,6 +3,7 @@ add_library(video_core STATIC
     dma_pusher.h
     debug_utils/debug_utils.cpp
     debug_utils/debug_utils.h
+    engines/const_buffer_info.h
     engines/engine_upload.cpp
     engines/engine_upload.h
     engines/fermi_2d.cpp
@@ -42,8 +43,6 @@ add_library(video_core STATIC
     renderer_opengl/gl_device.h
     renderer_opengl/gl_global_cache.cpp
     renderer_opengl/gl_global_cache.h
-    renderer_opengl/gl_primitive_assembler.cpp
-    renderer_opengl/gl_primitive_assembler.h
     renderer_opengl/gl_rasterizer.cpp
     renderer_opengl/gl_rasterizer.h
     renderer_opengl/gl_rasterizer_cache.cpp
@@ -102,6 +101,9 @@ add_library(video_core STATIC
     shader/decode/xmad.cpp
     shader/decode/other.cpp
     shader/decode.cpp
+    shader/node_helper.cpp
+    shader/node_helper.h
+    shader/node.h
     shader/shader_ir.cpp
     shader/shader_ir.h
     shader/track.cpp
diff --git a/src/video_core/engines/const_buffer_info.h b/src/video_core/engines/const_buffer_info.h
new file mode 100644
index 000000000..d8f672462
--- /dev/null
+++ b/src/video_core/engines/const_buffer_info.h
@@ -0,0 +1,21 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+
+/// Describes one guest constant buffer binding.
+struct ConstBufferInfo {
+    /// GPU virtual address where the buffer contents start.
+    GPUVAddr address;
+    /// Size of the buffer as programmed by the guest (presumably in bytes).
+    u32 size;
+    /// Whether the binding is valid; the renderer unbinds entries that are not enabled.
+    bool enabled;
+};
+
+} // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index 5250b8d9b..6a3309a2c 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -140,7 +140,7 @@ public:
 
         BitField<0, 16, u32> shared_alloc;
 
-        BitField<0, 31, u32> block_dim_x;
+        BitField<16, 16, u32> block_dim_x;
         union {
             BitField<0, 16, u32> block_dim_y;
             BitField<16, 16, u32> block_dim_z;
@@ -153,7 +153,7 @@ public:
 
         INSERT_PADDING_WORDS(0x8);
 
-        struct {
+        struct ConstBufferConfig {
             u32 address_low;
             union {
                 BitField<0, 8, u32> address_high;
@@ -163,7 +163,8 @@ public:
                 return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) |
                                              address_low);
             }
-        } const_buffer_config[8];
+        };
+        std::array<ConstBufferConfig, NumConstBuffers> const_buffer_config;
 
         union {
             BitField<0, 20, u32> local_pos_alloc;
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 39968d403..08d553696 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -396,12 +396,10 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
     auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
     auto& bind_data = regs.cb_bind[static_cast<std::size_t>(stage)];
 
-    auto& buffer = shader.const_buffers[bind_data.index];
-
     ASSERT(bind_data.index < Regs::MaxConstBuffers);
+    auto& buffer = shader.const_buffers[bind_data.index];
 
     buffer.enabled = bind_data.valid.Value() != 0;
-    buffer.index = bind_data.index;
     buffer.address = regs.const_buffer.BufferAddress();
     buffer.size = regs.const_buffer.cb_size;
 }
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index f342c78e6..13e314944 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -15,6 +15,7 @@
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/math_util.h"
+#include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 #include "video_core/macro_interpreter.h"
@@ -1112,13 +1113,6 @@ public:
     static_assert(std::is_trivially_copyable_v<Regs>, "Maxwell3D Regs must be trivially copyable");
 
     struct State {
-        struct ConstBufferInfo {
-            GPUVAddr address;
-            u32 index;
-            u32 size;
-            bool enabled;
-        };
-
         struct ShaderStageInfo {
             std::array<ConstBufferInfo, Regs::MaxConstBuffers> const_buffers;
         };
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index e83f25fa1..ffb3ec3e0 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -1663,6 +1663,7 @@ private:
             INST("111000100100----", Id::BRA, Type::Flow, "BRA"),
             INST("1111000011111---", Id::SYNC, Type::Flow, "SYNC"),
             INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
+            INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
             INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
             INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
             INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
@@ -1686,7 +1687,6 @@ private:
             INST("1101111100------", Id::TLD4S, Type::Texture, "TLD4S"),
             INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"),
             INST("1101111101011---", Id::TMML, Type::Texture, "TMML"),
-            INST("111000110000----", Id::EXIT, Type::Trivial, "EXIT"),
             INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),
             INST("1111101111100---", Id::OUT_R, Type::Trivial, "OUT_R"),
             INST("1110111111010---", Id::ISBERD, Type::Trivial, "ISBERD"),
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 1e2ff46b0..3f0939ec9 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -75,7 +75,7 @@ void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPus
 
 void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
     const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))};
-    const s64 synchronization_ticks{Core::Timing::usToCycles(9000)};
+    const s64 synchronization_ticks{Core::Timing::usToCycles(std::chrono::microseconds{9000})};
     system.CoreTiming().ScheduleEvent(synchronization_ticks, synchronization_event, fence);
 }
 
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 113f9d8f3..43a84bd52 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -163,8 +163,8 @@ private:
     static constexpr u64 page_size{1 << page_bits};
     static constexpr u64 page_mask{page_size - 1};
 
-    /// Address space in bits, this is fairly arbitrary but sufficiently large.
-    static constexpr u32 address_space_width{39};
+    /// Address space in bits, according to Tegra X1 TRM
+    static constexpr u32 address_space_width{40};
     /// Start address for mapping, this is fairly arbitrary but must be non-zero.
     static constexpr GPUVAddr address_space_base{0x100000};
     /// End of address space, based on address space in bits.
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 25652e794..48b86f3bd 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -71,16 +71,6 @@ GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t s
     return uploaded_offset;
 }
 
-std::tuple<u8*, GLintptr> OGLBufferCache::ReserveMemory(std::size_t size, std::size_t alignment) {
-    AlignBuffer(alignment);
-    u8* const uploaded_ptr = buffer_ptr;
-    const GLintptr uploaded_offset = buffer_offset;
-
-    buffer_ptr += size;
-    buffer_offset += size;
-    return std::make_tuple(uploaded_ptr, uploaded_offset);
-}
-
 bool OGLBufferCache::Map(std::size_t max_size) {
     bool invalidate;
     std::tie(buffer_ptr, buffer_offset_base, invalidate) =
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index f9247a40e..f2347581b 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -61,9 +61,6 @@ public:
     /// Uploads from a host memory. Returns host's buffer offset where it's been allocated.
     GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4);
 
-    /// Reserves memory to be used by host's CPU. Returns mapped address and offset.
-    std::tuple<u8*, GLintptr> ReserveMemory(std::size_t size, std::size_t alignment = 4);
-
     bool Map(std::size_t max_size);
     void Unmap();
 
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 1d1581f49..65a88b06c 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -2,11 +2,14 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <array>
 #include <cstddef>
 #include <glad/glad.h>
 
 #include "common/logging/log.h"
+#include "common/scope_exit.h"
 #include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
 
 namespace OpenGL {
 
@@ -24,6 +27,7 @@ Device::Device() {
     max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
     max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
     has_variable_aoffi = TestVariableAoffi();
+    has_component_indexing_bug = TestComponentIndexingBug();
 }
 
 Device::Device(std::nullptr_t) {
@@ -31,6 +35,7 @@ Device::Device(std::nullptr_t) {
     max_vertex_attributes = 16;
     max_varyings = 15;
     has_variable_aoffi = true;
+    has_component_indexing_bug = false;
 }
 
 bool Device::TestVariableAoffi() {
@@ -52,4 +57,65 @@ void main() {
     return supported;
 }
 
+// Detects drivers that miscompile dynamically indexed components of uniform buffer vectors.
+// A one-point draw runs a vertex shader that copies input_value[idx >> 2][idx & 3] into a
+// storage buffer; the value is read back and compared with the data uploaded to the uniform
+// buffer. Returns true as soon as one of the tested indices yields a mismatch.
+bool Device::TestComponentIndexingBug() {
+    constexpr char log_message[] = "Renderer_ComponentIndexingBug: {}";
+    const GLchar* COMPONENT_TEST = R"(#version 430 core
+layout (std430, binding = 0) buffer OutputBuffer {
+    uint output_value;
+};
+layout (std140, binding = 0) uniform InputBuffer {
+    uvec4 input_value[4096];
+};
+layout (location = 0) uniform uint idx;
+void main() {
+    output_value = input_value[idx >> 2][idx & 3];
+})";
+    const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &COMPONENT_TEST)};
+    SCOPE_EXIT({ glDeleteProgram(shader); });
+    glUseProgram(shader);
+
+    // Bind a vertex array object so the draw calls below are valid on core profile contexts.
+    OGLVertexArray vao;
+    vao.Create();
+    glBindVertexArray(vao.handle);
+
+    // Only elements 4..7 (the second uvec4) are non-zero and all four differ, so reading the
+    // wrong vector or the wrong component produces a detectable mismatch.
+    constexpr std::array<GLuint, 8> values{0, 0, 0, 0, 0x1236327, 0x985482, 0x872753, 0x2378432};
+    OGLBuffer ubo;
+    ubo.Create();
+    glNamedBufferData(ubo.handle, sizeof(values), values.data(), GL_STATIC_DRAW);
+    glBindBufferBase(GL_UNIFORM_BUFFER, 0, ubo.handle);
+
+    OGLBuffer ssbo;
+    ssbo.Create();
+    glNamedBufferStorage(ssbo.handle, sizeof(GLuint), nullptr, GL_CLIENT_STORAGE_BIT);
+
+    for (GLuint index = 4; index < 8; ++index) {
+        glInvalidateBufferData(ssbo.handle);
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo.handle);
+
+        glProgramUniform1ui(shader, 0, index);
+        glDrawArrays(GL_POINTS, 0, 1);
+
+        // Shader storage writes are incoherent with buffer read-back commands; without this
+        // barrier glGetNamedBufferSubData is not guaranteed to observe the shader's write.
+        glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+
+        // Zero-initialized so a failed read-back reports a mismatch deterministically.
+        GLuint result{};
+        glGetNamedBufferSubData(ssbo.handle, 0, sizeof(result), &result);
+        if (result != values.at(index)) {
+            LOG_INFO(Render_OpenGL, log_message, true);
+            return true;
+        }
+    }
+    LOG_INFO(Render_OpenGL, log_message, false);
+    return false;
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index de8490682..8c8c93760 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -30,13 +30,22 @@ public:
         return has_variable_aoffi;
     }
 
+    /// Returns whether the component indexing driver bug was flagged when the device was created.
+    bool HasComponentIndexingBug() const {
+        return has_component_indexing_bug;
+    }
+
 private:
     static bool TestVariableAoffi();
+    /// Probes the driver for the component indexing bug; true means the bug was detected.
+    static bool TestComponentIndexingBug();
 
     std::size_t uniform_buffer_alignment{};
     u32 max_vertex_attributes{};
     u32 max_varyings{};
     bool has_variable_aoffi{};
+    /// Result of TestComponentIndexingBug(); the nullptr constructor sets it to false.
+    bool has_component_indexing_bug{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
deleted file mode 100644
index c3e94d917..000000000
--- a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <algorithm>
-#include <array>
-#include "common/assert.h"
-#include "common/common_types.h"
-#include "core/core.h"
-#include "video_core/memory_manager.h"
-#include "video_core/renderer_opengl/gl_buffer_cache.h"
-#include "video_core/renderer_opengl/gl_primitive_assembler.h"
-
-namespace OpenGL {
-
-constexpr u32 TRIANGLES_PER_QUAD = 6;
-constexpr std::array<u32, TRIANGLES_PER_QUAD> QUAD_MAP = {0, 1, 2, 0, 2, 3};
-
-PrimitiveAssembler::PrimitiveAssembler(OGLBufferCache& buffer_cache) : buffer_cache(buffer_cache) {}
-
-PrimitiveAssembler::~PrimitiveAssembler() = default;
-
-std::size_t PrimitiveAssembler::CalculateQuadSize(u32 count) const {
-    ASSERT_MSG(count % 4 == 0, "Quad count is expected to be a multiple of 4");
-    return (count / 4) * TRIANGLES_PER_QUAD * sizeof(GLuint);
-}
-
-GLintptr PrimitiveAssembler::MakeQuadArray(u32 first, u32 count) {
-    const std::size_t size{CalculateQuadSize(count)};
-    auto [dst_pointer, index_offset] = buffer_cache.ReserveMemory(size);
-
-    for (u32 primitive = 0; primitive < count / 4; ++primitive) {
-        for (u32 i = 0; i < TRIANGLES_PER_QUAD; ++i) {
-            const u32 index = first + primitive * 4 + QUAD_MAP[i];
-            std::memcpy(dst_pointer, &index, sizeof(index));
-            dst_pointer += sizeof(index);
-        }
-    }
-
-    return index_offset;
-}
-
-GLintptr PrimitiveAssembler::MakeQuadIndexed(GPUVAddr gpu_addr, std::size_t index_size, u32 count) {
-    const std::size_t map_size{CalculateQuadSize(count)};
-    auto [dst_pointer, index_offset] = buffer_cache.ReserveMemory(map_size);
-
-    auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
-    const u8* source{memory_manager.GetPointer(gpu_addr)};
-
-    for (u32 primitive = 0; primitive < count / 4; ++primitive) {
-        for (std::size_t i = 0; i < TRIANGLES_PER_QUAD; ++i) {
-            const u32 index = primitive * 4 + QUAD_MAP[i];
-            const u8* src_offset = source + (index * index_size);
-
-            std::memcpy(dst_pointer, src_offset, index_size);
-            dst_pointer += index_size;
-        }
-    }
-
-    return index_offset;
-}
-
-} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.h b/src/video_core/renderer_opengl/gl_primitive_assembler.h
deleted file mode 100644
index 4e87ce4d6..000000000
--- a/src/video_core/renderer_opengl/gl_primitive_assembler.h
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <glad/glad.h>
-
-#include "common/common_types.h"
-
-namespace OpenGL {
-
-class OGLBufferCache;
-
-class PrimitiveAssembler {
-public:
-    explicit PrimitiveAssembler(OGLBufferCache& buffer_cache);
-    ~PrimitiveAssembler();
-
-    /// Calculates the size required by MakeQuadArray and MakeQuadIndexed.
-    std::size_t CalculateQuadSize(u32 count) const;
-
-    GLintptr MakeQuadArray(u32 first, u32 count);
-
-    GLintptr MakeQuadIndexed(GPUVAddr gpu_addr, std::size_t index_size, u32 count);
-
-private:
-    OGLBufferCache& buffer_cache;
-};
-
-} // namespace OpenGL
-\ No newline at end of file
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index f9b6dfeea..d77426067 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -246,29 +246,6 @@ DrawParameters RasterizerOpenGL::SetupDraw() {
     DrawParameters params{};
     params.current_instance = gpu.state.current_instance;
 
-    if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
-        MICROPROFILE_SCOPE(OpenGL_PrimitiveAssembly);
-
-        params.use_indexed = true;
-        params.primitive_mode = GL_TRIANGLES;
-
-        if (is_indexed) {
-            params.index_format = MaxwellToGL::IndexFormat(regs.index_array.format);
-            params.count = (regs.index_array.count / 4) * 6;
-            params.index_buffer_offset = primitive_assembler.MakeQuadIndexed(
-                regs.index_array.IndexStart(), regs.index_array.FormatSizeInBytes(),
-                regs.index_array.count);
-            params.base_vertex = static_cast<GLint>(regs.vb_element_base);
-        } else {
-            // MakeQuadArray always generates u32 indexes
-            params.index_format = GL_UNSIGNED_INT;
-            params.count = (regs.vertex_buffer.count / 4) * 6;
-            params.index_buffer_offset = primitive_assembler.MakeQuadArray(
-                regs.vertex_buffer.first, regs.vertex_buffer.count);
-        }
-        return params;
-    }
-
     params.use_indexed = is_indexed;
     params.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology);
 
@@ -345,9 +322,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
         }
 
         const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage);
-        SetupConstBuffers(stage_enum, shader, program_handle, base_bindings);
-        SetupGlobalRegions(stage_enum, shader, program_handle, base_bindings);
-        SetupTextures(stage_enum, shader, program_handle, base_bindings);
+        SetupDrawConstBuffers(stage_enum, shader);
+        SetupGlobalRegions(stage_enum, shader);
+        SetupTextures(stage_enum, shader, base_bindings);
 
         // Workaround for Intel drivers.
         // When a clip distance is enabled but not set in the shader it crops parts of the screen
@@ -686,30 +663,19 @@ void RasterizerOpenGL::DrawArrays() {
     SyncCullMode();
     SyncPrimitiveRestart();
     SyncScissorTest(state);
-    // Alpha Testing is synced on shaders.
     SyncTransformFeedback();
     SyncPointState();
-    CheckAlphaTests();
     SyncPolygonOffset();
-    // TODO(bunnei): Sync framebuffer_scale uniform here
-    // TODO(bunnei): Sync scissorbox uniform(s) here
+    SyncAlphaTest();
 
     // Draw the vertex batch
     const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
 
     std::size_t buffer_size = CalculateVertexArraysSize();
 
-    // Add space for index buffer (keeping in mind non-core primitives)
-    switch (regs.draw.topology) {
-    case Maxwell::PrimitiveTopology::Quads:
-        buffer_size = Common::AlignUp(buffer_size, 4) +
-                      primitive_assembler.CalculateQuadSize(regs.vertex_buffer.count);
-        break;
-    default:
-        if (is_indexed) {
-            buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
-        }
-        break;
+    // Add space for index buffer
+    if (is_indexed) {
+        buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
     }
 
     // Uniform space for the 5 shader stages
@@ -810,57 +776,55 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
     return true;
 }
 
-void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                                         const Shader& shader, GLuint program_handle,
-                                         BaseBindings base_bindings) {
+void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                                             const Shader& shader) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
-    const auto& gpu = system.GPU();
-    const auto& maxwell3d = gpu.Maxwell3D();
-    const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<std::size_t>(stage)];
+    const auto stage_index = static_cast<std::size_t>(stage);
+    const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index];
     const auto& entries = shader->GetShaderEntries().const_buffers;
 
     // Upload only the enabled buffers from the 16 constbuffers of each shader stage
     for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
-        const auto& used_buffer = entries[bindpoint];
-        const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()];
-
-        if (!buffer.enabled) {
-            // Set values to zero to unbind buffers
-            bind_ubo_pushbuffer.Push(0, 0, 0);
-            continue;
-        }
+        const auto& entry = entries[bindpoint];
+        SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry);
+    }
+}
 
-        std::size_t size = 0;
+void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer,
+                                        const GLShader::ConstBufferEntry& entry) {
+    if (!buffer.enabled) {
+        // Set values to zero to unbind buffers
+        bind_ubo_pushbuffer.Push(0, 0, 0);
+        return;
+    }
 
-        if (used_buffer.IsIndirect()) {
-            // Buffer is accessed indirectly, so upload the entire thing
-            size = buffer.size;
+    std::size_t size;
+    if (entry.IsIndirect()) {
+        // Buffer is accessed indirectly, so upload the entire thing
+        size = buffer.size;
 
-            if (size > MaxConstbufferSize) {
-                LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size,
-                            MaxConstbufferSize);
-                size = MaxConstbufferSize;
-            }
-        } else {
-            // Buffer is accessed directly, upload just what we use
-            size = used_buffer.GetSize();
+        if (size > MaxConstbufferSize) {
+            LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size,
+                        MaxConstbufferSize);
+            size = MaxConstbufferSize;
         }
+    } else {
+        // Buffer is accessed directly, upload just what we use
+        size = entry.GetSize();
+    }
 
-        // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
-        // UBO alignment requirements.
-        size = Common::AlignUp(size, sizeof(GLvec4));
-        ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
-
-        const GLintptr const_buffer_offset =
-            buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
+    // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
+    // UBO alignment requirements.
+    size = Common::AlignUp(size, sizeof(GLvec4));
+    ASSERT_MSG(size <= MaxConstbufferSize, "Constant buffer is too big");
 
-        bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size);
-    }
+    const std::size_t alignment = device.GetUniformBufferAlignment();
+    const GLintptr offset = buffer_cache.UploadMemory(buffer.address, size, alignment);
+    bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, size);
 }
 
 void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                                          const Shader& shader, GLenum primitive_mode,
-                                          BaseBindings base_bindings) {
+                                          const Shader& shader) {
     const auto& entries = shader->GetShaderEntries().global_memory_entries;
     for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
         const auto& entry{entries[bindpoint]};
@@ -874,7 +838,7 @@ void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::Shade
 }
 
 void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader,
-                                     GLuint program_handle, BaseBindings base_bindings) {
+                                     BaseBindings base_bindings) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& gpu = system.GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
@@ -1152,10 +1116,19 @@ void RasterizerOpenGL::SyncPolygonOffset() {
     state.polygon_offset.clamp = regs.polygon_offset_clamp;
 }
 
-void RasterizerOpenGL::CheckAlphaTests() {
+void RasterizerOpenGL::SyncAlphaTest() {
     const auto& regs = system.GPU().Maxwell3D().regs;
     UNIMPLEMENTED_IF_MSG(regs.alpha_test_enabled != 0 && regs.rt_control.count > 1,
                          "Alpha Testing is enabled with more than one rendertarget");
+
+    // Copy the guest's alpha test enable register into `state`.
+    state.alpha_test.enabled = regs.alpha_test_enabled;
+    if (!state.alpha_test.enabled) {
+        return;
+    }
+    // The comparison function and reference value are only synced while the test is enabled.
+    state.alpha_test.func = MaxwellToGL::ComparisonOp(regs.alpha_test_func);
+    state.alpha_test.ref = regs.alpha_test_ref;
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index d78094138..f7671ff5d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -17,17 +17,18 @@
 #include <glad/glad.h>
 
 #include "common/common_types.h"
+#include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_global_cache.h"
-#include "video_core/renderer_opengl/gl_primitive_assembler.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_sampler_cache.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
+#include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/gl_state.h"
 #include "video_core/renderer_opengl/utils.h"
@@ -106,17 +107,20 @@ private:
         bool preserve_contents = true, std::optional<std::size_t> single_color_target = {});
 
     /// Configures the current constbuffers to use for the draw command.
-    void SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader,
-                           GLuint program_handle, BaseBindings base_bindings);
+    void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                               const Shader& shader);
+
+    /// Uploads one constant buffer and queues its UBO binding (or an unbind when disabled).
+    void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer,
+                          const GLShader::ConstBufferEntry& entry);
 
     /// Configures the current global memory entries to use for the draw command.
     void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                            const Shader& shader, GLenum primitive_mode,
-                            BaseBindings base_bindings);
+                            const Shader& shader);
 
     /// Configures the current textures to use for the draw command.
     void SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader,
-                       GLuint program_handle, BaseBindings base_bindings);
+                       BaseBindings base_bindings);
 
     /// Syncs the viewport and depth range to match the guest state
     void SyncViewport(OpenGLState& current_state);
@@ -167,8 +171,8 @@ private:
     /// Syncs the polygon offsets
     void SyncPolygonOffset();
 
-    /// Check asserts for alpha testing.
-    void CheckAlphaTests();
+    /// Syncs the alpha test state to match the guest state
+    void SyncAlphaTest();
 
     /// Check for extension that are not strictly required
     /// but are needed for correct emulation
@@ -197,7 +201,6 @@ private:
 
     static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
     OGLBufferCache buffer_cache;
-    PrimitiveAssembler primitive_assembler{buffer_cache};
 
     BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
     BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index d66252224..ac8a9e6b7 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -35,8 +35,8 @@ struct UnspecializedShader {
 namespace {
 
 /// Gets the address for the specified shader stage program
-GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) {
-    const auto& gpu{Core::System::GetInstance().GPU().Maxwell3D()};
+GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program) {
+    const auto& gpu{system.GPU().Maxwell3D()};
     const auto& shader_config{gpu.regs.shader_config[static_cast<std::size_t>(program)]};
     return gpu.regs.code_address.CodeAddress() + shader_config.offset;
 }
@@ -350,7 +350,8 @@ ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode,
 
 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                      Core::Frontend::EmuWindow& emu_window, const Device& device)
-    : RasterizerCache{rasterizer}, emu_window{emu_window}, device{device}, disk_cache{system} {}
+    : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device},
+      disk_cache{system} {}
 
 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                       const VideoCore::DiskResourceLoadCallback& callback) {
@@ -546,42 +547,45 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
 }
 
 Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
-    if (!Core::System::GetInstance().GPU().Maxwell3D().dirty_flags.shaders) {
-        return last_shaders[static_cast<u32>(program)];
+    if (!system.GPU().Maxwell3D().dirty_flags.shaders) {
+        return last_shaders[static_cast<std::size_t>(program)];
     }
 
-    auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
-    const GPUVAddr program_addr{GetShaderAddress(program)};
+    auto& memory_manager{system.GPU().MemoryManager()};
+    const GPUVAddr program_addr{GetShaderAddress(system, program)};
 
     // Look up shader in the cache based on address
-    const auto& host_ptr{memory_manager.GetPointer(program_addr)};
+    const auto host_ptr{memory_manager.GetPointer(program_addr)};
     Shader shader{TryGet(host_ptr)};
+    if (shader) {
+        return last_shaders[static_cast<std::size_t>(program)] = shader;
+    }
 
-    if (!shader) {
-        // No shader found - create a new one
-        ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
-        ProgramCode program_code_b;
-        if (program == Maxwell::ShaderProgram::VertexA) {
-            const GPUVAddr program_addr_b{GetShaderAddress(Maxwell::ShaderProgram::VertexB)};
-            program_code_b = GetShaderCode(memory_manager, program_addr_b,
-                                           memory_manager.GetPointer(program_addr_b));
-        }
-        const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
-        const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
-        const auto found = precompiled_shaders.find(unique_identifier);
-        if (found != precompiled_shaders.end()) {
-            shader =
-                std::make_shared<CachedShader>(cpu_addr, unique_identifier, program, disk_cache,
-                                               precompiled_programs, found->second, host_ptr);
-        } else {
-            shader = std::make_shared<CachedShader>(
-                device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs,
-                std::move(program_code), std::move(program_code_b), host_ptr);
-        }
-        Register(shader);
+    // No shader found - create a new one
+    ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
+    ProgramCode program_code_b;
+    if (program == Maxwell::ShaderProgram::VertexA) {
+        const GPUVAddr program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)};
+        program_code_b = GetShaderCode(memory_manager, program_addr_b,
+                                       memory_manager.GetPointer(program_addr_b));
+    }
+
+    const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
+    const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
+    const auto found = precompiled_shaders.find(unique_identifier);
+    if (found != precompiled_shaders.end()) {
+        // Create a shader from the cache
+        shader = std::make_shared<CachedShader>(cpu_addr, unique_identifier, program, disk_cache,
+                                                precompiled_programs, found->second, host_ptr);
+    } else {
+        // Create a shader from guest memory
+        shader = std::make_shared<CachedShader>(
+            device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs,
+            std::move(program_code), std::move(program_code_b), host_ptr);
     }
+    Register(shader);
 
-    return last_shaders[static_cast<u32>(program)] = shader;
+    return last_shaders[static_cast<std::size_t>(program)] = shader;
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 64e5a5594..09bd0761d 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -137,6 +137,7 @@ private:
     CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump,
                                              const std::set<GLenum>& supported_formats);
 
+    Core::System& system;
     Core::Frontend::EmuWindow& emu_window;
     const Device& device;
     ShaderDiskCacheOpenGL disk_cache;
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index e9f8d40db..7dc2e0560 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -45,7 +45,6 @@ struct TextureAoffi {};
 using TextureArgument = std::pair<Type, Node>;
 using TextureIR = std::variant<TextureAoffi, TextureArgument>;
 
-enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 };
 constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
     static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
 
@@ -124,8 +123,8 @@ bool IsPrecise(Operation operand) {
     return false;
 }
 
-bool IsPrecise(Node node) {
-    if (const auto operation = std::get_if<OperationNode>(node)) {
+bool IsPrecise(const Node& node) {
+    if (const auto operation = std::get_if<OperationNode>(&*node)) {
         return IsPrecise(*operation);
     }
     return false;
@@ -144,6 +143,24 @@ u32 GetGenericAttributeIndex(Attribute::Index index) {
     return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
 }
 
+constexpr const char* GetFlowStackPrefix(MetaStackClass stack) {
+    switch (stack) {
+    case MetaStackClass::Ssy:
+        return "ssy";
+    case MetaStackClass::Pbk:
+        return "pbk";
+    }
+    return ""; // Unknown class: never null, the result is passed straight to fmt::format
+}
+
+std::string FlowStackName(MetaStackClass stack) {
+    return fmt::format("{}_flow_stack", GetFlowStackPrefix(stack));
+}
+
+std::string FlowStackTopName(MetaStackClass stack) {
+    return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
+}
+
 class GLSLDecompiler final {
 public:
     explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
@@ -174,8 +191,10 @@ public:
         // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
         // unlikely that shaders will use 20 nested SSYs and PBKs.
         constexpr u32 FLOW_STACK_SIZE = 20;
-        code.AddLine("uint flow_stack[{}];", FLOW_STACK_SIZE);
-        code.AddLine("uint flow_stack_top = 0u;");
+        for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
+            code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
+            code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
+        }
 
         code.AddLine("while (true) {{");
         ++code.scope;
@@ -247,6 +266,12 @@ private:
         code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_vertices);
         code.AddNewLine();
 
+        code.AddLine("in gl_PerVertex {{");
+        ++code.scope;
+        code.AddLine("vec4 gl_Position;");
+        --code.scope;
+        code.AddLine("}} gl_in[];");
+
         DeclareVertexRedeclarations();
     }
 
@@ -349,7 +374,7 @@ private:
     }
 
     void DeclareInputAttribute(Attribute::Index index, bool skip_unused) {
-        const u32 generic_index{GetGenericAttributeIndex(index)};
+        const u32 location{GetGenericAttributeIndex(index)};
 
         std::string name{GetInputAttribute(index)};
         if (stage == ShaderStage::Geometry) {
@@ -358,19 +383,13 @@ private:
 
         std::string suffix;
         if (stage == ShaderStage::Fragment) {
-            const auto input_mode{header.ps.GetAttributeUse(generic_index)};
+            const auto input_mode{header.ps.GetAttributeUse(location)};
             if (skip_unused && input_mode == AttributeUse::Unused) {
                 return;
             }
             suffix = GetInputFlags(input_mode);
         }
 
-        u32 location = generic_index;
-        if (stage != ShaderStage::Vertex) {
-            // If inputs are varyings, add an offset
-            location += GENERIC_VARYING_START_LOCATION;
-        }
-
         code.AddLine("layout (location = {}) {} in vec4 {};", location, suffix, name);
     }
 
@@ -395,7 +414,7 @@ private:
     }
 
     void DeclareOutputAttribute(Attribute::Index index) {
-        const u32 location{GetGenericAttributeIndex(index) + GENERIC_VARYING_START_LOCATION};
+        const u32 location{GetGenericAttributeIndex(index)};
         code.AddLine("layout (location = {}) out vec4 {};", location, GetOutputAttribute(index));
     }
 
@@ -498,15 +517,15 @@ private:
     }
 
     void VisitBlock(const NodeBlock& bb) {
-        for (const Node node : bb) {
+        for (const auto& node : bb) {
             if (const std::string expr = Visit(node); !expr.empty()) {
                 code.AddLine(expr);
             }
         }
     }
 
-    std::string Visit(Node node) {
-        if (const auto operation = std::get_if<OperationNode>(node)) {
+    std::string Visit(const Node& node) {
+        if (const auto operation = std::get_if<OperationNode>(&*node)) {
             const auto operation_index = static_cast<std::size_t>(operation->GetCode());
             if (operation_index >= operation_decompilers.size()) {
                 UNREACHABLE_MSG("Out of bounds operation: {}", operation_index);
@@ -520,7 +539,7 @@ private:
             return (this->*decompiler)(*operation);
         }
 
-        if (const auto gpr = std::get_if<GprNode>(node)) {
+        if (const auto gpr = std::get_if<GprNode>(&*node)) {
             const u32 index = gpr->GetIndex();
             if (index == Register::ZeroIndex) {
                 return "0";
@@ -528,7 +547,7 @@ private:
             return GetRegister(index);
         }
 
-        if (const auto immediate = std::get_if<ImmediateNode>(node)) {
+        if (const auto immediate = std::get_if<ImmediateNode>(&*node)) {
             const u32 value = immediate->GetValue();
             if (value < 10) {
                 // For eyecandy avoid using hex numbers on single digits
@@ -537,7 +556,7 @@ private:
             return fmt::format("utof(0x{:x}u)", immediate->GetValue());
         }
 
-        if (const auto predicate = std::get_if<PredicateNode>(node)) {
+        if (const auto predicate = std::get_if<PredicateNode>(&*node)) {
             const auto value = [&]() -> std::string {
                 switch (const auto index = predicate->GetIndex(); index) {
                 case Tegra::Shader::Pred::UnusedIndex:
@@ -554,7 +573,7 @@ private:
             return value;
         }
 
-        if (const auto abuf = std::get_if<AbufNode>(node)) {
+        if (const auto abuf = std::get_if<AbufNode>(&*node)) {
             UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ShaderStage::Geometry,
                                  "Physical attributes in geometry shaders are not implemented");
             if (abuf->IsPhysicalBuffer()) {
@@ -564,9 +583,9 @@ private:
             return ReadAttribute(abuf->GetIndex(), abuf->GetElement(), abuf->GetBuffer());
         }
 
-        if (const auto cbuf = std::get_if<CbufNode>(node)) {
+        if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
             const Node offset = cbuf->GetOffset();
-            if (const auto immediate = std::get_if<ImmediateNode>(offset)) {
+            if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
                 // Direct access
                 const u32 offset_imm = immediate->GetValue();
                 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
@@ -577,30 +596,47 @@ private:
             if (std::holds_alternative<OperationNode>(*offset)) {
                 // Indirect access
                 const std::string final_offset = code.GenerateTemporary();
-                code.AddLine("uint {} = (ftou({}) / 4);", final_offset, Visit(offset));
-                return fmt::format("{}[{} / 4][{} % 4]", GetConstBuffer(cbuf->GetIndex()),
-                                   final_offset, final_offset);
+                code.AddLine("uint {} = ftou({}) >> 2;", final_offset, Visit(offset));
+
+                if (!device.HasComponentIndexingBug()) {
+                    return fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
+                                       final_offset, final_offset);
+                }
+
+                // AMD's proprietary GLSL compiler emits incorrect code for variable component
+                // access. To work around this driver bug, generate 4 ifs, one per component.
+                const std::string pack = code.GenerateTemporary();
+                code.AddLine("vec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
+                             final_offset);
+
+                const std::string result = code.GenerateTemporary();
+                code.AddLine("float {};", result);
+                for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
+                    code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
+                                 pack, GetSwizzle(swizzle));
+                }
+                return result;
             }
 
             UNREACHABLE_MSG("Unmanaged offset node type");
         }
 
-        if (const auto gmem = std::get_if<GmemNode>(node)) {
+        if (const auto gmem = std::get_if<GmemNode>(&*node)) {
             const std::string real = Visit(gmem->GetRealAddress());
             const std::string base = Visit(gmem->GetBaseAddress());
             const std::string final_offset = fmt::format("(ftou({}) - ftou({})) / 4", real, base);
             return fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset);
         }
 
-        if (const auto lmem = std::get_if<LmemNode>(node)) {
+        if (const auto lmem = std::get_if<LmemNode>(&*node)) {
             return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
         }
 
-        if (const auto internal_flag = std::get_if<InternalFlagNode>(node)) {
+        if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) {
             return GetInternalFlag(internal_flag->GetFlag());
         }
 
-        if (const auto conditional = std::get_if<ConditionalNode>(node)) {
+        if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
             // It's invalid to call conditional on nested nodes, use an operation instead
             code.AddLine("if ({}) {{", Visit(conditional->GetCondition()));
             ++code.scope;
@@ -612,7 +648,7 @@ private:
             return {};
         }
 
-        if (const auto comment = std::get_if<CommentNode>(node)) {
+        if (const auto comment = std::get_if<CommentNode>(&*node)) {
             return "// " + comment->GetText();
         }
 
@@ -620,7 +656,7 @@ private:
         return {};
     }
 
-    std::string ReadAttribute(Attribute::Index attribute, u32 element, Node buffer = {}) {
+    std::string ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) {
         const auto GeometryPass = [&](std::string_view name) {
             if (stage == ShaderStage::Geometry && buffer) {
                 // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
@@ -633,10 +669,14 @@ private:
 
         switch (attribute) {
         case Attribute::Index::Position:
-            if (stage != ShaderStage::Fragment) {
-                return GeometryPass("position") + GetSwizzle(element);
-            } else {
+            switch (stage) {
+            case ShaderStage::Geometry:
+                return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer),
+                                   GetSwizzle(element));
+            case ShaderStage::Fragment:
                 return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element));
+            default:
+                UNREACHABLE();
             }
         case Attribute::Index::PointCoord:
             switch (element) {
@@ -852,7 +892,7 @@ private:
         std::string expr = ", ";
         switch (type) {
         case Type::Int:
-            if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
+            if (const auto immediate = std::get_if<ImmediateNode>(&*operand)) {
                 // Inline the string as an immediate integer in GLSL (some extra arguments are
                 // required to be constant)
                 expr += std::to_string(static_cast<s32>(immediate->GetValue()));
@@ -884,7 +924,7 @@ private:
 
         for (std::size_t index = 0; index < aoffi.size(); ++index) {
             const auto operand{aoffi.at(index)};
-            if (const auto immediate = std::get_if<ImmediateNode>(operand)) {
+            if (const auto immediate = std::get_if<ImmediateNode>(&*operand)) {
                 // Inline the string as an immediate integer in GLSL (AOFFI arguments are required
                 // to be constant by the standard).
                 expr += std::to_string(static_cast<s32>(immediate->GetValue()));
@@ -905,23 +945,23 @@ private:
     }
 
     std::string Assign(Operation operation) {
-        const Node dest = operation[0];
-        const Node src = operation[1];
+        const Node& dest = operation[0];
+        const Node& src = operation[1];
 
         std::string target;
-        if (const auto gpr = std::get_if<GprNode>(dest)) {
+        if (const auto gpr = std::get_if<GprNode>(&*dest)) {
             if (gpr->GetIndex() == Register::ZeroIndex) {
                 // Writing to Register::ZeroIndex is a no op
                 return {};
             }
             target = GetRegister(gpr->GetIndex());
-        } else if (const auto abuf = std::get_if<AbufNode>(dest)) {
+        } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
             UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer());
 
             target = [&]() -> std::string {
                 switch (const auto attribute = abuf->GetIndex(); abuf->GetIndex()) {
                 case Attribute::Index::Position:
-                    return "position"s + GetSwizzle(abuf->GetElement());
+                    return "gl_Position"s + GetSwizzle(abuf->GetElement());
                 case Attribute::Index::PointSize:
                     return "gl_PointSize";
                 case Attribute::Index::ClipDistances0123:
@@ -937,9 +977,9 @@ private:
                     return "0";
                 }
             }();
-        } else if (const auto lmem = std::get_if<LmemNode>(dest)) {
+        } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
             target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
-        } else if (const auto gmem = std::get_if<GmemNode>(dest)) {
+        } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
             const std::string real = Visit(gmem->GetRealAddress());
             const std::string base = Visit(gmem->GetBaseAddress());
             const std::string final_offset = fmt::format("(ftou({}) - ftou({})) / 4", real, base);
@@ -1216,12 +1256,12 @@ private:
     }
 
     std::string LogicalAssign(Operation operation) {
-        const Node dest = operation[0];
-        const Node src = operation[1];
+        const Node& dest = operation[0];
+        const Node& src = operation[1];
 
         std::string target;
 
-        if (const auto pred = std::get_if<PredicateNode>(dest)) {
+        if (const auto pred = std::get_if<PredicateNode>(&*dest)) {
             ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment");
 
             const auto index = pred->GetIndex();
@@ -1232,7 +1272,7 @@ private:
                 return {};
             }
             target = GetPredicate(index);
-        } else if (const auto flag = std::get_if<InternalFlagNode>(dest)) {
+        } else if (const auto flag = std::get_if<InternalFlagNode>(&*dest)) {
             target = GetInternalFlag(flag->GetFlag());
         }
 
@@ -1409,7 +1449,7 @@ private:
     }
 
     std::string Branch(Operation operation) {
-        const auto target = std::get_if<ImmediateNode>(operation[0]);
+        const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         UNIMPLEMENTED_IF(!target);
 
         code.AddLine("jmp_to = 0x{:x}u;", target->GetValue());
@@ -1418,15 +1458,18 @@ private:
     }
 
     std::string PushFlowStack(Operation operation) {
-        const auto target = std::get_if<ImmediateNode>(operation[0]);
+        const auto stack = std::get<MetaStackClass>(operation.GetMeta());
+        const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         UNIMPLEMENTED_IF(!target);
 
-        code.AddLine("flow_stack[flow_stack_top++] = 0x{:x}u;", target->GetValue());
+        code.AddLine("{}[{}++] = 0x{:x}u;", FlowStackName(stack), FlowStackTopName(stack),
+                     target->GetValue());
         return {};
     }
 
     std::string PopFlowStack(Operation operation) {
-        code.AddLine("jmp_to = flow_stack[--flow_stack_top];");
+        const auto stack = std::get<MetaStackClass>(operation.GetMeta());
+        code.AddLine("jmp_to = {}[--{}];", FlowStackName(stack), FlowStackTopName(stack));
         code.AddLine("break;");
         return {};
     }
@@ -1447,27 +1490,9 @@ private:
 
         UNIMPLEMENTED_IF_MSG(header.ps.omap.sample_mask != 0, "Sample mask write is unimplemented");
 
-        code.AddLine("if (alpha_test[0] != 0) {{");
-        ++code.scope;
-        // We start on the register containing the alpha value in the first RT.
-        u32 current_reg = 3;
-        for (u32 render_target = 0; render_target < Maxwell::NumRenderTargets; ++render_target) {
-            // TODO(Blinkhawk): verify the behavior of alpha testing on hardware when
-            // multiple render targets are used.
-            if (header.ps.IsColorComponentOutputEnabled(render_target, 0) ||
-                header.ps.IsColorComponentOutputEnabled(render_target, 1) ||
-                header.ps.IsColorComponentOutputEnabled(render_target, 2) ||
-                header.ps.IsColorComponentOutputEnabled(render_target, 3)) {
-                code.AddLine("if (!AlphaFunc({})) discard;", SafeGetRegister(current_reg));
-                current_reg += 4;
-            }
-        }
-        --code.scope;
-        code.AddLine("}}");
-
         // Write the color outputs using the data in the shader registers, disabled
         // rendertargets/components are skipped in the register assignment.
-        current_reg = 0;
+        u32 current_reg = 0;
         for (u32 render_target = 0; render_target < Maxwell::NumRenderTargets; ++render_target) {
             // TODO(Subv): Figure out how dual-source blending is configured in the Switch.
             for (u32 component = 0; component < 4; ++component) {
@@ -1506,9 +1531,7 @@ private:
 
         // If a geometry shader is attached, it will always flip (it's the last stage before
         // fragment). For more info about flipping, refer to gl_shader_gen.cpp.
-        code.AddLine("position.xy *= viewport_flip.xy;");
-        code.AddLine("gl_Position = position;");
-        code.AddLine("position.w = 1.0;");
+        code.AddLine("gl_Position.xy *= viewport_flip.xy;");
         code.AddLine("EmitVertex();");
         return {};
     }
@@ -1746,8 +1769,7 @@ private:
     }
 
     u32 GetNumPhysicalVaryings() const {
-        return std::min<u32>(device.GetMaxVaryings() - GENERIC_VARYING_START_LOCATION,
-                             Maxwell::NumVaryings);
+        return std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings);
     }
 
     const Device& device;
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index d2bb705a9..9148629ec 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -23,12 +23,9 @@ ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setu
     out += GetCommonDeclarations();
 
     out += R"(
-layout (location = 0) out vec4 position;
-
 layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
     vec4 viewport_flip;
     uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
-    uvec4 alpha_test;
 };
 
 )";
@@ -48,7 +45,6 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
 
     out += R"(
 void main() {
-    position = vec4(0.0, 0.0, 0.0, 0.0);
     execute_vertex();
 )";
 
@@ -59,19 +55,12 @@ void main() {
     out += R"(
 
     // Set Position Y direction
-    position.y *= utof(config_pack[2]);
+    gl_Position.y *= utof(config_pack[2]);
     // Check if the flip stage is VertexB
     // Config pack's second value is flip_stage
     if (config_pack[1] == 1) {
         // Viewport can be flipped, which is unsupported by glViewport
-        position.xy *= viewport_flip.xy;
-    }
-    gl_Position = position;
-
-    // TODO(bunnei): This is likely a hack, position.w should be interpolated as 1.0
-    // For now, this is here to bring order in lieu of proper emulation
-    if (config_pack[1] == 1) {
-        position.w = 1.0;
+        gl_Position.xy *= viewport_flip.xy;
     }
 })";
 
@@ -85,13 +74,9 @@ ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& se
     out += GetCommonDeclarations();
 
     out += R"(
-layout (location = 0) in vec4 gs_position[];
-layout (location = 0) out vec4 position;
-
 layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
     vec4 viewport_flip;
     uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
-    uvec4 alpha_test;
 };
 
 )";
@@ -124,38 +109,11 @@ layout (location = 5) out vec4 FragColor5;
 layout (location = 6) out vec4 FragColor6;
 layout (location = 7) out vec4 FragColor7;
 
-layout (location = 0) in noperspective vec4 position;
-
 layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {
     vec4 viewport_flip;
     uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
-    uvec4 alpha_test;
 };
 
-bool AlphaFunc(in float value) {
-    float ref = uintBitsToFloat(alpha_test[2]);
-    switch (alpha_test[1]) {
-        case 1:
-            return false;
-        case 2:
-            return value < ref;
-        case 3:
-            return value == ref;
-        case 4:
-            return value <= ref;
-        case 5:
-            return value > ref;
-        case 6:
-            return value != ref;
-        case 7:
-            return value >= ref;
-        case 8:
-            return true;
-        default:
-            return false;
-    }
-}
-
 )";
     const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
     ProgramResult program =
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 05ab01dcb..b05f90f20 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -48,17 +48,6 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D& maxwell, std::size_t shade
     viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0f : 1.0f;
     viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0f : 1.0f;
 
-    auto func{static_cast<u32>(regs.alpha_test_func)};
-    // Normalize the gl variants of opCompare to be the same as the normal variants
-    const u32 op_gl_variant_base = static_cast<u32>(Maxwell3D::Regs::ComparisonOp::Never);
-    if (func >= op_gl_variant_base) {
-        func = func - op_gl_variant_base + 1U;
-    }
-
-    alpha_test.enabled = regs.alpha_test_enabled;
-    alpha_test.func = func;
-    alpha_test.ref = regs.alpha_test_ref;
-
     instance_id = state.current_instance;
 
     // Assign in which stage the position has to be flipped
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index cec18a832..6961e702a 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -27,14 +27,8 @@ struct MaxwellUniformData {
         GLuint flip_stage;
         GLfloat y_direction;
     };
-    struct alignas(16) {
-        GLuint enabled;
-        GLuint func;
-        GLfloat ref;
-        GLuint padding;
-    } alpha_test;
 };
-static_assert(sizeof(MaxwellUniformData) == 48, "MaxwellUniformData structure size is incorrect");
+static_assert(sizeof(MaxwellUniformData) == 32, "MaxwellUniformData structure size is incorrect");
 static_assert(sizeof(MaxwellUniformData) < 16384,
               "MaxwellUniformData structure must be less than 16kb as per the OpenGL spec");
 
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 7425fbe5d..d86e137ac 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -156,6 +156,10 @@ OpenGLState::OpenGLState() {
     polygon_offset.factor = 0.0f;
     polygon_offset.units = 0.0f;
     polygon_offset.clamp = 0.0f;
+
+    alpha_test.enabled = false;
+    alpha_test.func = GL_ALWAYS;
+    alpha_test.ref = 0.0f;
 }
 
 void OpenGLState::ApplyDefaultState() {
@@ -461,6 +465,14 @@ void OpenGLState::ApplyPolygonOffset() const {
     }
 }
 
+void OpenGLState::ApplyAlphaTest() const {
+    Enable(GL_ALPHA_TEST, cur_state.alpha_test.enabled, alpha_test.enabled);
+    if (UpdateTie(std::tie(cur_state.alpha_test.func, cur_state.alpha_test.ref),
+                  std::tie(alpha_test.func, alpha_test.ref))) {
+        glAlphaFunc(alpha_test.func, alpha_test.ref);
+    }
+}
+
 void OpenGLState::ApplyTextures() const {
     bool has_delta{};
     std::size_t first{};
@@ -533,6 +545,7 @@ void OpenGLState::Apply() const {
     ApplyTextures();
     ApplySamplers();
     ApplyPolygonOffset();
+    ApplyAlphaTest();
 }
 
 void OpenGLState::EmulateViewportWithScissor() {
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 41418a7b8..b0140495d 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -172,6 +172,12 @@ public:
         GLfloat clamp;
     } polygon_offset;
 
+    struct {
+        bool enabled; // GL_ALPHA_TEST
+        GLenum func;  // GL_ALPHA_TEST_FUNC
+        GLfloat ref;  // GL_ALPHA_TEST_REF
+    } alpha_test;
+
     std::array<bool, 8> clip_distance; // GL_CLIP_DISTANCE
 
     OpenGLState();
@@ -215,6 +221,7 @@ public:
     void ApplySamplers() const;
     void ApplyDepthClamp() const;
     void ApplyPolygonOffset() const;
+    void ApplyAlphaTest() const;
 
     /// Set the initial OpenGL state
     static void ApplyDefaultState();
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index ed7b5cff0..ea77dd211 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -128,6 +128,8 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
         return GL_TRIANGLE_STRIP;
     case Maxwell::PrimitiveTopology::TriangleFan:
         return GL_TRIANGLE_FAN;
+    case Maxwell::PrimitiveTopology::Quads:
+        return GL_QUADS;
     default:
         LOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology));
         UNREACHABLE();
@@ -173,11 +175,8 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         return GL_CLAMP_TO_EDGE;
     case Tegra::Texture::WrapMode::Border:
         return GL_CLAMP_TO_BORDER;
-    case Tegra::Texture::WrapMode::ClampOGL:
-        // TODO(Subv): GL_CLAMP was removed as of OpenGL 3.1, to implement GL_CLAMP, we can use
-        // GL_CLAMP_TO_BORDER to get the border color of the texture, and then sample the edge to
-        // manually mix them. However the shader part of this is not yet implemented.
-        return GL_CLAMP_TO_BORDER;
+    case Tegra::Texture::WrapMode::Clamp:
+        return GL_CLAMP;
     case Tegra::Texture::WrapMode::MirrorOnceClampToEdge:
         return GL_MIRROR_CLAMP_TO_EDGE;
     case Tegra::Texture::WrapMode::MirrorOnceBorder:
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 3451d321d..aafd6f31b 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -18,7 +18,6 @@
 #include "core/perf_stats.h"
 #include "core/settings.h"
 #include "core/telemetry_session.h"
-#include "core/tracer/recorder.h"
 #include "video_core/morton.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 9fe1e3280..0bbbf6851 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -52,7 +52,7 @@ vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         return vk::SamplerAddressMode::eClampToEdge;
     case Tegra::Texture::WrapMode::Border:
         return vk::SamplerAddressMode::eClampToBorder;
-    case Tegra::Texture::WrapMode::ClampOGL:
+    case Tegra::Texture::WrapMode::Clamp:
         // TODO(Rodrigo): GL_CLAMP was removed as of OpenGL 3.1, to implement GL_CLAMP, we can use
         // eClampToBorder to get the border color of the texture, and then sample the edge to
         // manually mix them. However the shader part of this is not yet implemented.
diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp
index 00242ecbe..3b966ddc3 100644
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -18,6 +18,7 @@ constexpr std::array<vk::Format, 3> Depth24UnormS8Uint = {
     vk::Format::eD32SfloatS8Uint, vk::Format::eD16UnormS8Uint, {}};
 constexpr std::array<vk::Format, 3> Depth16UnormS8Uint = {
     vk::Format::eD24UnormS8Uint, vk::Format::eD32SfloatS8Uint, {}};
+constexpr std::array<vk::Format, 2> Astc = {vk::Format::eA8B8G8R8UnormPack32, {}};
 
 } // namespace Alternatives
 
@@ -51,15 +52,19 @@ VKDevice::VKDevice(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice phy
     : physical{physical}, format_properties{GetFormatProperties(dldi, physical)} {
     SetupFamilies(dldi, surface);
     SetupProperties(dldi);
+    SetupFeatures(dldi);
 }
 
 VKDevice::~VKDevice() = default;
 
 bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instance) {
-    const auto queue_cis = GetDeviceQueueCreateInfos();
-    vk::PhysicalDeviceFeatures device_features{};
+    vk::PhysicalDeviceFeatures device_features;
+    device_features.vertexPipelineStoresAndAtomics = true;
+    device_features.independentBlend = true;
+    device_features.textureCompressionASTC_LDR = is_optimal_astc_supported;
 
-    const std::vector<const char*> extensions = {VK_KHR_SWAPCHAIN_EXTENSION_NAME};
+    const auto queue_cis = GetDeviceQueueCreateInfos();
+    const std::vector<const char*> extensions = LoadExtensions(dldi);
     const vk::DeviceCreateInfo device_ci({}, static_cast<u32>(queue_cis.size()), queue_cis.data(),
                                          0, nullptr, static_cast<u32>(extensions.size()),
                                          extensions.data(), &device_features);
@@ -90,7 +95,7 @@ vk::Format VKDevice::GetSupportedFormat(vk::Format wanted_format,
         LOG_CRITICAL(Render_Vulkan,
                      "Format={} with usage={} and type={} has no defined alternatives and host "
                      "hardware does not support it",
-                     static_cast<u32>(wanted_format), static_cast<u32>(wanted_usage),
+                     vk::to_string(wanted_format), vk::to_string(wanted_usage),
                      static_cast<u32>(format_type));
         UNREACHABLE();
         return wanted_format;
@@ -118,6 +123,30 @@ vk::Format VKDevice::GetSupportedFormat(vk::Format wanted_format,
     return wanted_format;
 }
 
+bool VKDevice::IsOptimalAstcSupported(const vk::PhysicalDeviceFeatures& features,
+                                      const vk::DispatchLoaderDynamic& dldi) const {
+    if (!features.textureCompressionASTC_LDR) {
+        return false;
+    }
+    // Every usage bit listed here must be available with optimal tiling, not just one of them.
+    using Usage = vk::FormatFeatureFlagBits;
+    const auto required_usage{Usage::eSampledImage | Usage::eBlitSrc | Usage::eBlitDst |
+                              Usage::eTransferSrc | Usage::eTransferDst};
+    constexpr std::array<vk::Format, 9> astc_formats = {
+        vk::Format::eAstc4x4UnormBlock, vk::Format::eAstc4x4SrgbBlock,
+        vk::Format::eAstc8x8SrgbBlock,  vk::Format::eAstc8x6SrgbBlock,
+        vk::Format::eAstc5x4SrgbBlock,  vk::Format::eAstc5x5UnormBlock,
+        vk::Format::eAstc5x5SrgbBlock,  vk::Format::eAstc10x8UnormBlock,
+        vk::Format::eAstc10x8SrgbBlock};
+    for (const auto format : astc_formats) {
+        const auto supported{physical.getFormatProperties(format, dldi).optimalTilingFeatures};
+        if ((supported & required_usage) != required_usage) {
+            return false;
+        }
+    }
+    return true;
+}
+
 bool VKDevice::IsFormatSupported(vk::Format wanted_format, vk::FormatFeatureFlags wanted_usage,
                                  FormatType format_type) const {
     const auto it = format_properties.find(wanted_format);
@@ -132,11 +161,9 @@ bool VKDevice::IsFormatSupported(vk::Format wanted_format, vk::FormatFeatureFlag
 
 bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical,
                           vk::SurfaceKHR surface) {
-    const std::string swapchain_extension = VK_KHR_SWAPCHAIN_EXTENSION_NAME;
-
     bool has_swapchain{};
     for (const auto& prop : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) {
-        has_swapchain |= prop.extensionName == swapchain_extension;
+        has_swapchain |= prop.extensionName == std::string(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
     }
     if (!has_swapchain) {
         // The device doesn't support creating swapchains.
@@ -160,8 +187,14 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev
     }
 
     // TODO(Rodrigo): Check if the device matches all requeriments.
-    const vk::PhysicalDeviceProperties props = physical.getProperties(dldi);
-    if (props.limits.maxUniformBufferRange < 65536) {
+    const auto properties{physical.getProperties(dldi)};
+    const auto limits{properties.limits};
+    if (limits.maxUniformBufferRange < 65536) {
+        return false;
+    }
+
+    const vk::PhysicalDeviceFeatures features{physical.getFeatures(dldi)};
+    if (!features.vertexPipelineStoresAndAtomics || !features.independentBlend) {
         return false;
     }
 
@@ -169,6 +202,30 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev
     return true;
 }
 
+std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynamic& dldi) {
+    std::vector<const char*> enabled;
+    enabled.reserve(2);
+    enabled.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
+
+    // Appends `name` (and raises `status`, when given) if `candidate` is that extension.
+    const auto TryEnable = [&enabled](const vk::ExtensionProperties& candidate,
+                                      std::optional<std::reference_wrapper<bool>> status,
+                                      const char* name, [[maybe_unused]] u32 revision) {
+        if (candidate.extensionName == std::string(name)) {
+            enabled.push_back(name);
+            if (status) {
+                status->get() = true;
+            }
+        }
+    };
+
+    for (const auto& prop : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) {
+        TryEnable(prop, ext_scalar_block_layout, VK_EXT_SCALAR_BLOCK_LAYOUT_EXTENSION_NAME, 1);
+    }
+
+    return enabled;
+}
+
 void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceKHR surface) {
     std::optional<u32> graphics_family_, present_family_;
 
@@ -196,10 +253,16 @@ void VKDevice::SetupProperties(const vk::DispatchLoaderDynamic& dldi) {
     const vk::PhysicalDeviceProperties props = physical.getProperties(dldi);
     device_type = props.deviceType;
     uniform_buffer_alignment = static_cast<u64>(props.limits.minUniformBufferOffsetAlignment);
+    max_storage_buffer_range = static_cast<u64>(props.limits.maxStorageBufferRange);
+}
+
+void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) {
+    // Query the physical device features once and cache the ASTC support result.
+    is_optimal_astc_supported = IsOptimalAstcSupported(physical.getFeatures(dldi), dldi);
 }
 
 std::vector<vk::DeviceQueueCreateInfo> VKDevice::GetDeviceQueueCreateInfos() const {
-    static const float QUEUE_PRIORITY = 1.f;
+    static const float QUEUE_PRIORITY = 1.0f;
 
     std::set<u32> unique_queue_families = {graphics_family, present_family};
     std::vector<vk::DeviceQueueCreateInfo> queue_cis;
@@ -212,26 +275,43 @@ std::vector<vk::DeviceQueueCreateInfo> VKDevice::GetDeviceQueueCreateInfos() con
 
 std::map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperties(
     const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical) {
+    static constexpr std::array formats{vk::Format::eA8B8G8R8UnormPack32,
+                                        vk::Format::eB5G6R5UnormPack16,
+                                        vk::Format::eA2B10G10R10UnormPack32,
+                                        vk::Format::eR32G32B32A32Sfloat,
+                                        vk::Format::eR16G16Unorm,
+                                        vk::Format::eR16G16Snorm,
+                                        vk::Format::eR8G8B8A8Srgb,
+                                        vk::Format::eR8Unorm,
+                                        vk::Format::eB10G11R11UfloatPack32,
+                                        vk::Format::eR32Sfloat,
+                                        vk::Format::eR16Sfloat,
+                                        vk::Format::eR16G16B16A16Sfloat,
+                                        vk::Format::eD32Sfloat,
+                                        vk::Format::eD16Unorm,
+                                        vk::Format::eD16UnormS8Uint,
+                                        vk::Format::eD24UnormS8Uint,
+                                        vk::Format::eD32SfloatS8Uint,
+                                        vk::Format::eBc1RgbaUnormBlock,
+                                        vk::Format::eBc2UnormBlock,
+                                        vk::Format::eBc3UnormBlock,
+                                        vk::Format::eBc4UnormBlock,
+                                        vk::Format::eBc5UnormBlock,
+                                        vk::Format::eBc5SnormBlock,
+                                        vk::Format::eBc7UnormBlock,
+                                        vk::Format::eAstc4x4UnormBlock,
+                                        vk::Format::eAstc4x4SrgbBlock,
+                                        vk::Format::eAstc8x8SrgbBlock,
+                                        vk::Format::eAstc8x6SrgbBlock,
+                                        vk::Format::eAstc5x4SrgbBlock,
+                                        vk::Format::eAstc5x5UnormBlock,
+                                        vk::Format::eAstc5x5SrgbBlock,
+                                        vk::Format::eAstc10x8UnormBlock,
+                                        vk::Format::eAstc10x8SrgbBlock};
     std::map<vk::Format, vk::FormatProperties> format_properties;
-
-    const auto AddFormatQuery = [&format_properties, &dldi, physical](vk::Format format) {
+    for (const auto format : formats) {
         format_properties.emplace(format, physical.getFormatProperties(format, dldi));
-    };
-    AddFormatQuery(vk::Format::eA8B8G8R8UnormPack32);
-    AddFormatQuery(vk::Format::eB5G6R5UnormPack16);
-    AddFormatQuery(vk::Format::eA2B10G10R10UnormPack32);
-    AddFormatQuery(vk::Format::eR8G8B8A8Srgb);
-    AddFormatQuery(vk::Format::eR8Unorm);
-    AddFormatQuery(vk::Format::eD32Sfloat);
-    AddFormatQuery(vk::Format::eD16Unorm);
-    AddFormatQuery(vk::Format::eD16UnormS8Uint);
-    AddFormatQuery(vk::Format::eD24UnormS8Uint);
-    AddFormatQuery(vk::Format::eD32SfloatS8Uint);
-    AddFormatQuery(vk::Format::eBc1RgbaUnormBlock);
-    AddFormatQuery(vk::Format::eBc2UnormBlock);
-    AddFormatQuery(vk::Format::eBc3UnormBlock);
-    AddFormatQuery(vk::Format::eBc4UnormBlock);
-
+    }
     return format_properties;
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h
index e87c7a508..537825d8b 100644
--- a/src/video_core/renderer_vulkan/vk_device.h
+++ b/src/video_core/renderer_vulkan/vk_device.h
@@ -11,7 +11,7 @@
 
 namespace Vulkan {
 
-/// Format usage descriptor
+/// Format usage descriptor.
 enum class FormatType { Linear, Optimal, Buffer };
 
 /// Handles data specific to a physical device.
@@ -34,12 +34,12 @@ public:
     vk::Format GetSupportedFormat(vk::Format wanted_format, vk::FormatFeatureFlags wanted_usage,
                                   FormatType format_type) const;
 
-    /// Returns the dispatch loader with direct function pointers of the device
+    /// Returns the dispatch loader with direct function pointers of the device.
     const vk::DispatchLoaderDynamic& GetDispatchLoader() const {
         return dld;
     }
 
-    /// Returns the logical device
+    /// Returns the logical device.
     vk::Device GetLogical() const {
         return logical.get();
     }
@@ -69,30 +69,55 @@ public:
         return present_family;
     }
 
-    /// Returns if the device is integrated with the host CPU
+    /// Returns if the device is integrated with the host CPU.
     bool IsIntegrated() const {
         return device_type == vk::PhysicalDeviceType::eIntegratedGpu;
     }
 
-    /// Returns uniform buffer alignment requeriment
+    /// Returns uniform buffer alignment requirement.
     u64 GetUniformBufferAlignment() const {
         return uniform_buffer_alignment;
     }
 
+    /// Returns the maximum storage buffer range reported by the physical device limits.
+    u64 GetMaxStorageBufferRange() const {
+        return max_storage_buffer_range;
+    }
+
+    /// Returns true if ASTC is natively supported (value cached by SetupFeatures).
+    bool IsOptimalAstcSupported() const {
+        return is_optimal_astc_supported;
+    }
+
+    /// Returns true if LoadExtensions found and enabled VK_EXT_scalar_block_layout.
+    bool IsExtScalarBlockLayoutSupported() const {
+        return ext_scalar_block_layout;
+    }
+
     /// Checks if the physical device is suitable.
     static bool IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical,
                            vk::SurfaceKHR surface);
 
 private:
+    /// Loads extensions into a vector and stores available ones in this object.
+    std::vector<const char*> LoadExtensions(const vk::DispatchLoaderDynamic& dldi);
+
     /// Sets up queue families.
     void SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceKHR surface);
 
     /// Sets up device properties.
     void SetupProperties(const vk::DispatchLoaderDynamic& dldi);
 
+    /// Sets up device features.
+    void SetupFeatures(const vk::DispatchLoaderDynamic& dldi);
+
     /// Returns a list of queue initialization descriptors.
     std::vector<vk::DeviceQueueCreateInfo> GetDeviceQueueCreateInfos() const;
 
+    /// Returns true if ASTC textures are natively supported.
+    bool IsOptimalAstcSupported(const vk::PhysicalDeviceFeatures& features,
+                                const vk::DispatchLoaderDynamic& dldi) const;
+
     /// Returns true if a format is supported.
     bool IsFormatSupported(vk::Format wanted_format, vk::FormatFeatureFlags wanted_usage,
                            FormatType format_type) const;
@@ -101,16 +126,19 @@ private:
     static std::map<vk::Format, vk::FormatProperties> GetFormatProperties(
         const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical);
 
-    const vk::PhysicalDevice physical;  ///< Physical device
-    vk::DispatchLoaderDynamic dld;      ///< Device function pointers
-    UniqueDevice logical;               ///< Logical device
-    vk::Queue graphics_queue;           ///< Main graphics queue
-    vk::Queue present_queue;            ///< Main present queue
-    u32 graphics_family{};              ///< Main graphics queue family index
-    u32 present_family{};               ///< Main present queue family index
-    vk::PhysicalDeviceType device_type; ///< Physical device type
-    u64 uniform_buffer_alignment{};     ///< Uniform buffer alignment requeriment
-    std::map<vk::Format, vk::FormatProperties> format_properties; ///< Format properties dictionary
+    const vk::PhysicalDevice physical;  ///< Physical device.
+    vk::DispatchLoaderDynamic dld;      ///< Device function pointers.
+    UniqueDevice logical;               ///< Logical device.
+    vk::Queue graphics_queue;           ///< Main graphics queue.
+    vk::Queue present_queue;            ///< Main present queue.
+    u32 graphics_family{};              ///< Main graphics queue family index.
+    u32 present_family{};               ///< Main present queue family index.
+    vk::PhysicalDeviceType device_type; ///< Physical device type.
+    u64 uniform_buffer_alignment{};     ///< Uniform buffer alignment requirement.
+    u64 max_storage_buffer_range{};     ///< Max storage buffer size.
+    bool is_optimal_astc_supported{};   ///< Support for native ASTC.
+    bool ext_scalar_block_layout{};     ///< Support for VK_EXT_scalar_block_layout.
+    std::map<vk::Format, vk::FormatProperties> format_properties; ///< Format properties dictionary.
 };
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index a5b25aeff..33ad9764a 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -17,6 +17,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/engines/shader_header.h"
+#include "video_core/renderer_vulkan/vk_device.h"
 #include "video_core/renderer_vulkan/vk_shader_decompiler.h"
 #include "video_core/shader/shader_ir.h"
 
@@ -33,7 +34,8 @@ using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage;
 using Operation = const OperationNode&;
 
 // TODO(Rodrigo): Use rasterizer's value
-constexpr u32 MAX_CONSTBUFFER_ELEMENTS = 0x1000;
+constexpr u32 MAX_CONSTBUFFER_FLOATS = 0x4000;
+constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_FLOATS / 4;
 constexpr u32 STAGE_BINDING_STRIDE = 0x100;
 
 enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
@@ -87,8 +89,8 @@ bool IsPrecise(Operation operand) {
 
 class SPIRVDecompiler : public Sirit::Module {
 public:
-    explicit SPIRVDecompiler(const ShaderIR& ir, ShaderStage stage)
-        : Module(0x00010300), ir{ir}, stage{stage}, header{ir.GetHeader()} {
+    explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderStage stage)
+        : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()} {
         AddCapability(spv::Capability::Shader);
         AddExtension("SPV_KHR_storage_buffer_storage_class");
         AddExtension("SPV_KHR_variable_pointers");
@@ -130,20 +132,16 @@ public:
             branch_labels.push_back(label);
         }
 
-        // TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely
-        // that shaders will use 20 nested SSYs and PBKs.
-        constexpr u32 FLOW_STACK_SIZE = 20;
-        const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
         jmp_to = Emit(OpVariable(TypePointer(spv::StorageClass::Function, t_uint),
                                  spv::StorageClass::Function, Constant(t_uint, first_address)));
-        flow_stack = Emit(OpVariable(TypePointer(spv::StorageClass::Function, flow_stack_type),
-                                     spv::StorageClass::Function, ConstantNull(flow_stack_type)));
-        flow_stack_top =
-            Emit(OpVariable(t_func_uint, spv::StorageClass::Function, Constant(t_uint, 0)));
+        std::tie(ssy_flow_stack, ssy_flow_stack_top) = CreateFlowStack();
+        std::tie(pbk_flow_stack, pbk_flow_stack_top) = CreateFlowStack();
 
         Name(jmp_to, "jmp_to");
-        Name(flow_stack, "flow_stack");
-        Name(flow_stack_top, "flow_stack_top");
+        Name(ssy_flow_stack, "ssy_flow_stack");
+        Name(ssy_flow_stack_top, "ssy_flow_stack_top");
+        Name(pbk_flow_stack, "pbk_flow_stack");
+        Name(pbk_flow_stack_top, "pbk_flow_stack_top");
 
         Emit(OpBranch(loop_label));
         Emit(loop_label);
@@ -195,7 +193,9 @@ public:
             entries.samplers.emplace_back(sampler);
         }
         for (const auto& attribute : ir.GetInputAttributes()) {
-            entries.attributes.insert(GetGenericAttributeLocation(attribute));
+            if (IsGenericAttribute(attribute)) {
+                entries.attributes.insert(GetGenericAttributeLocation(attribute));
+            }
         }
         entries.clip_distances = ir.GetClipDistances();
         entries.shader_length = ir.GetLength();
@@ -210,7 +210,6 @@ private:
         std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;
 
     static constexpr auto INTERNAL_FLAGS_COUNT = static_cast<std::size_t>(InternalFlag::Amount);
-    static constexpr u32 CBUF_STRIDE = 16;
 
     void AllocateBindings() {
         const u32 binding_base = static_cast<u32>(stage) * STAGE_BINDING_STRIDE;
@@ -315,6 +314,7 @@ private:
         constexpr std::array<const char*, INTERNAL_FLAGS_COUNT> names = {"zero", "sign", "carry",
                                                                          "overflow"};
         for (std::size_t flag = 0; flag < INTERNAL_FLAGS_COUNT; ++flag) {
+            // One private-storage boolean variable per internal flag, named after the flag.
             const Id id = OpVariable(t_prv_bool, spv::StorageClass::Private, v_false);
             internal_flags[flag] = AddGlobalVariable(Name(id, names[flag]));
         }
@@ -374,7 +374,9 @@ private:
         u32 binding = const_buffers_base_binding;
         for (const auto& entry : ir.GetConstantBuffers()) {
             const auto [index, size] = entry;
-            const Id id = OpVariable(t_cbuf_ubo, spv::StorageClass::Uniform);
+            const Id type =
+                device.IsExtScalarBlockLayoutSupported() ? t_cbuf_scalar_ubo : t_cbuf_std140_ubo;
+            const Id id = OpVariable(type, spv::StorageClass::Uniform);
             AddGlobalVariable(Name(id, fmt::format("cbuf_{}", index)));
 
             Decorate(id, spv::Decoration::Binding, binding++);
@@ -475,13 +477,13 @@ private:
     }
 
     void VisitBasicBlock(const NodeBlock& bb) {
-        for (const Node node : bb) {
+        for (const auto& node : bb) {
             static_cast<void>(Visit(node));
         }
     }
 
-    Id Visit(Node node) {
-        if (const auto operation = std::get_if<OperationNode>(node)) {
+    Id Visit(const Node& node) {
+        if (const auto operation = std::get_if<OperationNode>(&*node)) {
             const auto operation_index = static_cast<std::size_t>(operation->GetCode());
             const auto decompiler = operation_decompilers[operation_index];
             if (decompiler == nullptr) {
@@ -489,17 +491,17 @@ private:
             }
             return (this->*decompiler)(*operation);
 
-        } else if (const auto gpr = std::get_if<GprNode>(node)) {
+        } else if (const auto gpr = std::get_if<GprNode>(&*node)) {
             const u32 index = gpr->GetIndex();
             if (index == Register::ZeroIndex) {
                 return Constant(t_float, 0.0f);
             }
             return Emit(OpLoad(t_float, registers.at(index)));
 
-        } else if (const auto immediate = std::get_if<ImmediateNode>(node)) {
+        } else if (const auto immediate = std::get_if<ImmediateNode>(&*node)) {
             return BitcastTo<Type::Float>(Constant(t_uint, immediate->GetValue()));
 
-        } else if (const auto predicate = std::get_if<PredicateNode>(node)) {
+        } else if (const auto predicate = std::get_if<PredicateNode>(&*node)) {
             const auto value = [&]() -> Id {
                 switch (const auto index = predicate->GetIndex(); index) {
                 case Tegra::Shader::Pred::UnusedIndex:
@@ -515,7 +517,7 @@ private:
             }
             return value;
 
-        } else if (const auto abuf = std::get_if<AbufNode>(node)) {
+        } else if (const auto abuf = std::get_if<AbufNode>(&*node)) {
             const auto attribute = abuf->GetIndex();
             const auto element = abuf->GetElement();
 
@@ -565,40 +567,42 @@ private:
             }
             UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute));
 
-        } else if (const auto cbuf = std::get_if<CbufNode>(node)) {
-            const Node offset = cbuf->GetOffset();
+        } else if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
+            const Node& offset = cbuf->GetOffset();
             const Id buffer_id = constant_buffers.at(cbuf->GetIndex());
 
-            Id buffer_index{};
-            Id buffer_element{};
-
-            if (const auto immediate = std::get_if<ImmediateNode>(offset)) {
-                // Direct access
-                const u32 offset_imm = immediate->GetValue();
-                ASSERT(offset_imm % 4 == 0);
-                buffer_index = Constant(t_uint, offset_imm / 16);
-                buffer_element = Constant(t_uint, (offset_imm / 4) % 4);
-
-            } else if (std::holds_alternative<OperationNode>(*offset)) {
-                // Indirect access
-                // TODO(Rodrigo): Use a uniform buffer stride of 4 and drop this slow math (which
-                // emits sub-optimal code on GLSL from my testing).
-                const Id offset_id = BitcastTo<Type::Uint>(Visit(offset));
-                const Id unsafe_offset = Emit(OpUDiv(t_uint, offset_id, Constant(t_uint, 4)));
-                const Id final_offset = Emit(
-                    OpUMod(t_uint, unsafe_offset, Constant(t_uint, MAX_CONSTBUFFER_ELEMENTS - 1)));
-                buffer_index = Emit(OpUDiv(t_uint, final_offset, Constant(t_uint, 4)));
-                buffer_element = Emit(OpUMod(t_uint, final_offset, Constant(t_uint, 4)));
-
+            Id pointer{};
+            if (device.IsExtScalarBlockLayoutSupported()) {
+                const Id buffer_offset = Emit(OpShiftRightLogical(
+                    t_uint, BitcastTo<Type::Uint>(Visit(offset)), Constant(t_uint, 2u)));
+                pointer = Emit(
+                    OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0u), buffer_offset));
             } else {
-                UNREACHABLE_MSG("Unmanaged offset node type");
+                Id buffer_index{};
+                Id buffer_element{};
+                if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
+                    // Direct access
+                    const u32 offset_imm = immediate->GetValue();
+                    ASSERT(offset_imm % 4 == 0);
+                    buffer_index = Constant(t_uint, offset_imm / 16);
+                    buffer_element = Constant(t_uint, (offset_imm / 4) % 4);
+                } else if (std::holds_alternative<OperationNode>(*offset)) {
+                    // Indirect access
+                    const Id offset_id = BitcastTo<Type::Uint>(Visit(offset));
+                    const Id unsafe_offset = Emit(OpUDiv(t_uint, offset_id, Constant(t_uint, 4)));
+                    const Id final_offset = Emit(OpUMod(
+                        t_uint, unsafe_offset, Constant(t_uint, MAX_CONSTBUFFER_ELEMENTS - 1)));
+                    buffer_index = Emit(OpUDiv(t_uint, final_offset, Constant(t_uint, 4)));
+                    buffer_element = Emit(OpUMod(t_uint, final_offset, Constant(t_uint, 4)));
+                } else {
+                    UNREACHABLE_MSG("Unmanaged offset node type");
+                }
+                pointer = Emit(OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0),
+                                             buffer_index, buffer_element));
             }
-
-            const Id pointer = Emit(OpAccessChain(t_cbuf_float, buffer_id, Constant(t_uint, 0),
-                                                  buffer_index, buffer_element));
             return Emit(OpLoad(t_float, pointer));
 
-        } else if (const auto gmem = std::get_if<GmemNode>(node)) {
+        } else if (const auto gmem = std::get_if<GmemNode>(&*node)) {
             const Id gmem_buffer = global_buffers.at(gmem->GetDescriptor());
             const Id real = BitcastTo<Type::Uint>(Visit(gmem->GetRealAddress()));
             const Id base = BitcastTo<Type::Uint>(Visit(gmem->GetBaseAddress()));
@@ -608,11 +612,13 @@ private:
             return Emit(OpLoad(t_float, Emit(OpAccessChain(t_gmem_float, gmem_buffer,
                                                            Constant(t_uint, 0u), offset))));
 
-        } else if (const auto conditional = std::get_if<ConditionalNode>(node)) {
+        } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
             // It's invalid to call conditional on nested nodes, use an operation instead
             const Id true_label = OpLabel();
             const Id skip_label = OpLabel();
-            Emit(OpBranchConditional(Visit(conditional->GetCondition()), true_label, skip_label));
+            const Id condition = Visit(conditional->GetCondition());
+            Emit(OpSelectionMerge(skip_label, spv::SelectionControlMask::MaskNone));
+            Emit(OpBranchConditional(condition, true_label, skip_label));
             Emit(true_label);
 
             VisitBasicBlock(conditional->GetCode());
@@ -621,7 +627,7 @@ private:
             Emit(skip_label);
             return {};
 
-        } else if (const auto comment = std::get_if<CommentNode>(node)) {
+        } else if (const auto comment = std::get_if<CommentNode>(&*node)) {
             Name(Emit(OpUndef(t_void)), comment->GetText());
             return {};
         }
@@ -689,18 +695,18 @@ private:
     }
 
     Id Assign(Operation operation) {
-        const Node dest = operation[0];
-        const Node src = operation[1];
+        const Node& dest = operation[0];
+        const Node& src = operation[1];
 
         Id target{};
-        if (const auto gpr = std::get_if<GprNode>(dest)) {
+        if (const auto gpr = std::get_if<GprNode>(&*dest)) {
             if (gpr->GetIndex() == Register::ZeroIndex) {
                 // Writing to Register::ZeroIndex is a no op
                 return {};
             }
             target = registers.at(gpr->GetIndex());
 
-        } else if (const auto abuf = std::get_if<AbufNode>(dest)) {
+        } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
             target = [&]() -> Id {
                 switch (const auto attribute = abuf->GetIndex(); attribute) {
                 case Attribute::Index::Position:
@@ -725,7 +731,7 @@ private:
                 }
             }();
 
-        } else if (const auto lmem = std::get_if<LmemNode>(dest)) {
+        } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
             Id address = BitcastTo<Type::Uint>(Visit(lmem->GetAddress()));
             address = Emit(OpUDiv(t_uint, address, Constant(t_uint, 4)));
             target = Emit(OpAccessChain(t_prv_float, local_memory, {address}));
@@ -771,11 +777,11 @@ private:
     }
 
     Id LogicalAssign(Operation operation) {
-        const Node dest = operation[0];
-        const Node src = operation[1];
+        const Node& dest = operation[0];
+        const Node& src = operation[1];
 
         Id target{};
-        if (const auto pred = std::get_if<PredicateNode>(dest)) {
+        if (const auto pred = std::get_if<PredicateNode>(&*dest)) {
             ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment");
 
             const auto index = pred->GetIndex();
@@ -787,7 +793,7 @@ private:
             }
             target = predicates.at(index);
 
-        } else if (const auto flag = std::get_if<InternalFlagNode>(dest)) {
+        } else if (const auto flag = std::get_if<InternalFlagNode>(&*dest)) {
             target = internal_flags.at(static_cast<u32>(flag->GetFlag()));
         }
 
@@ -873,7 +879,7 @@ private:
         } else {
             u32 component_value = 0;
             if (meta->component) {
-                const auto component = std::get_if<ImmediateNode>(meta->component);
+                const auto component = std::get_if<ImmediateNode>(&*meta->component);
                 ASSERT_MSG(component, "Component is not an immediate value");
                 component_value = component->GetValue();
             }
@@ -930,7 +936,7 @@ private:
     }
 
     Id Branch(Operation operation) {
-        const auto target = std::get_if<ImmediateNode>(operation[0]);
+        const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         UNIMPLEMENTED_IF(!target);
 
         Emit(OpStore(jmp_to, Constant(t_uint, target->GetValue())));
@@ -939,9 +945,10 @@ private:
     }
 
     Id PushFlowStack(Operation operation) {
-        const auto target = std::get_if<ImmediateNode>(operation[0]);
+        const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         ASSERT(target);
 
+        const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
         const Id current = Emit(OpLoad(t_uint, flow_stack_top));
         const Id next = Emit(OpIAdd(t_uint, current, Constant(t_uint, 1)));
         const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, current));
@@ -952,6 +959,7 @@ private:
     }
 
     Id PopFlowStack(Operation operation) {
+        const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
         const Id current = Emit(OpLoad(t_uint, flow_stack_top));
         const Id previous = Emit(OpISub(t_uint, current, Constant(t_uint, 1)));
         const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, previous));
@@ -968,11 +976,11 @@ private:
         case ShaderStage::Vertex: {
             // TODO(Rodrigo): We should use VK_EXT_depth_range_unrestricted instead, but it doesn't
             // seem to be working on Nvidia's drivers and Intel (mesa and blob) doesn't support it.
-            const Id position = AccessElement(t_float4, per_vertex, position_index);
-            Id depth = Emit(OpLoad(t_float, AccessElement(t_out_float, position, 2)));
+            const Id z_pointer = AccessElement(t_out_float, per_vertex, position_index, 2u);
+            Id depth = Emit(OpLoad(t_float, z_pointer));
             depth = Emit(OpFAdd(t_float, depth, Constant(t_float, 1.0f)));
             depth = Emit(OpFMul(t_float, depth, Constant(t_float, 0.5f)));
-            Emit(OpStore(AccessElement(t_out_float, position, 2), depth));
+            Emit(OpStore(z_pointer, depth));
             break;
         }
         case ShaderStage::Fragment: {
@@ -1162,6 +1170,31 @@ private:
         Emit(skip_label);
     }
 
+    /// Declares a function-scope flow stack; returns {stack variable, top-index variable}.
+    std::tuple<Id, Id> CreateFlowStack() {
+        // TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely
+        // that shaders will use 20 nested SSYs and PBKs.
+        constexpr u32 FLOW_STACK_SIZE = 20;
+        constexpr auto function_class = spv::StorageClass::Function;
+        const Id array_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
+        const Id array_pointer = TypePointer(function_class, array_type);
+        const Id stack = Emit(OpVariable(array_pointer, function_class, ConstantNull(array_type)));
+        const Id top = Emit(OpVariable(t_func_uint, function_class, Constant(t_uint, 0)));
+        return std::make_tuple(stack, top);
+    }
+
+    /// Picks the {stack, top} variables matching the MetaStackClass held in the operation's meta.
+    std::pair<Id, Id> GetFlowStack(Operation operation) {
+        switch (std::get<MetaStackClass>(operation.GetMeta())) {
+        case MetaStackClass::Ssy:
+            return {ssy_flow_stack, ssy_flow_stack_top};
+        case MetaStackClass::Pbk:
+            return {pbk_flow_stack, pbk_flow_stack_top};
+        }
+        UNREACHABLE();
+        return {};
+    }
+
     static constexpr OperationDecompilersArray operation_decompilers = {
         &SPIRVDecompiler::Assign,
 
@@ -1311,6 +1344,7 @@ private:
         &SPIRVDecompiler::WorkGroupId<2>,
     };
 
+    const VKDevice& device;
     const ShaderIR& ir;
     const ShaderStage stage;
     const Tegra::Shader::Header header;
@@ -1349,12 +1383,18 @@ private:
     const Id t_out_float4 = Name(TypePointer(spv::StorageClass::Output, t_float4), "out_float4");
 
     const Id t_cbuf_float = TypePointer(spv::StorageClass::Uniform, t_float);
-    const Id t_cbuf_array =
-        Decorate(Name(TypeArray(t_float4, Constant(t_uint, MAX_CONSTBUFFER_ELEMENTS)), "CbufArray"),
-                 spv::Decoration::ArrayStride, CBUF_STRIDE);
-    const Id t_cbuf_struct = MemberDecorate(
-        Decorate(TypeStruct(t_cbuf_array), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);
-    const Id t_cbuf_ubo = TypePointer(spv::StorageClass::Uniform, t_cbuf_struct);
+    const Id t_cbuf_std140 = Decorate(
+        Name(TypeArray(t_float4, Constant(t_uint, MAX_CONSTBUFFER_ELEMENTS)), "CbufStd140Array"),
+        spv::Decoration::ArrayStride, 16u);
+    const Id t_cbuf_scalar = Decorate(
+        Name(TypeArray(t_float, Constant(t_uint, MAX_CONSTBUFFER_FLOATS)), "CbufScalarArray"),
+        spv::Decoration::ArrayStride, 4u);
+    const Id t_cbuf_std140_struct = MemberDecorate(
+        Decorate(TypeStruct(t_cbuf_std140), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);
+    const Id t_cbuf_scalar_struct = MemberDecorate(
+        Decorate(TypeStruct(t_cbuf_scalar), spv::Decoration::Block), 0, spv::Decoration::Offset, 0);
+    const Id t_cbuf_std140_ubo = TypePointer(spv::StorageClass::Uniform, t_cbuf_std140_struct);
+    const Id t_cbuf_scalar_ubo = TypePointer(spv::StorageClass::Uniform, t_cbuf_scalar_struct);
 
     const Id t_gmem_float = TypePointer(spv::StorageClass::StorageBuffer, t_float);
     const Id t_gmem_array =
@@ -1397,14 +1437,17 @@ private:
 
     Id execute_function{};
     Id jmp_to{};
-    Id flow_stack_top{};
-    Id flow_stack{};
+    Id ssy_flow_stack_top{};
+    Id pbk_flow_stack_top{};
+    Id ssy_flow_stack{};
+    Id pbk_flow_stack{};
     Id continue_label{};
     std::map<u32, Id> labels;
 };
 
-DecompilerResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage) {
-    auto decompiler = std::make_unique<SPIRVDecompiler>(ir, stage);
+DecompilerResult Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir,
+                           Maxwell::ShaderStage stage) {
+    auto decompiler = std::make_unique<SPIRVDecompiler>(device, ir, stage);
     decompiler->Decompile();
     return {std::move(decompiler), decompiler->GetShaderEntries()};
 }
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
index 329d8fa38..f90541cc1 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -20,10 +20,13 @@ namespace VideoCommon::Shader {
 class ShaderIR;
 }
 
+namespace Vulkan {
+class VKDevice;
+}
+
 namespace Vulkan::VKShader {
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-
 using SamplerEntry = VideoCommon::Shader::Sampler;
 
 constexpr u32 DESCRIPTOR_SET = 0;
@@ -75,6 +78,7 @@ struct ShaderEntries {
 
 using DecompilerResult = std::pair<std::unique_ptr<Sirit::Module>, ShaderEntries>;
 
-DecompilerResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage);
+DecompilerResult Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir,
+                           Maxwell::ShaderStage stage);
 
 } // namespace Vulkan::VKShader
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index 2da595c0d..a0554c97e 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -11,6 +11,7 @@
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/engines/shader_header.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp
index b4859bc1e..87d8fecaa 100644
--- a/src/video_core/shader/decode/arithmetic.cpp
+++ b/src/video_core/shader/decode/arithmetic.cpp
@@ -6,6 +6,7 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp
index 3a29c4a46..b06cbe441 100644
--- a/src/video_core/shader/decode/arithmetic_half.cpp
+++ b/src/video_core/shader/decode/arithmetic_half.cpp
@@ -6,6 +6,7 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
index 5341e460f..7bcf38f23 100644
--- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
@@ -6,6 +6,7 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/arithmetic_immediate.cpp b/src/video_core/shader/decode/arithmetic_immediate.cpp
index 3095f2fd4..f1875967c 100644
--- a/src/video_core/shader/decode/arithmetic_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_immediate.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp
index 9fd4b273e..c8c1a7f40 100644
--- a/src/video_core/shader/decode/arithmetic_integer.cpp
+++ b/src/video_core/shader/decode/arithmetic_integer.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp
index 679ac0d4e..73880db0e 100644
--- a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/bfe.cpp b/src/video_core/shader/decode/bfe.cpp
index 1ae192c6a..e02bcd097 100644
--- a/src/video_core/shader/decode/bfe.cpp
+++ b/src/video_core/shader/decode/bfe.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/bfi.cpp b/src/video_core/shader/decode/bfi.cpp
index 0b12a0d08..8be1119df 100644
--- a/src/video_core/shader/decode/bfi.cpp
+++ b/src/video_core/shader/decode/bfi.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index b5ec9a6f5..4221f0c58 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/ffma.cpp b/src/video_core/shader/decode/ffma.cpp
index a1d04c6e5..29be25ca3 100644
--- a/src/video_core/shader/decode/ffma.cpp
+++ b/src/video_core/shader/decode/ffma.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/float_set.cpp b/src/video_core/shader/decode/float_set.cpp
index cc522f1de..f5013e44a 100644
--- a/src/video_core/shader/decode/float_set.cpp
+++ b/src/video_core/shader/decode/float_set.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/float_set_predicate.cpp b/src/video_core/shader/decode/float_set_predicate.cpp
index 9d2322a1d..2323052b0 100644
--- a/src/video_core/shader/decode/float_set_predicate.cpp
+++ b/src/video_core/shader/decode/float_set_predicate.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp
index 755f2ec44..48ca7a4af 100644
--- a/src/video_core/shader/decode/half_set.cpp
+++ b/src/video_core/shader/decode/half_set.cpp
@@ -8,6 +8,7 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp
index fba44d714..d59d15bd8 100644
--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp
index a425f9eb7..c3bcf1ae9 100644
--- a/src/video_core/shader/decode/hfma2.cpp
+++ b/src/video_core/shader/decode/hfma2.cpp
@@ -7,6 +7,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/integer_set.cpp b/src/video_core/shader/decode/integer_set.cpp
index a4cdaf74d..46e3d5905 100644
--- a/src/video_core/shader/decode/integer_set.cpp
+++ b/src/video_core/shader/decode/integer_set.cpp
@@ -4,6 +4,7 @@
 
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/integer_set_predicate.cpp b/src/video_core/shader/decode/integer_set_predicate.cpp
index a6a1fb632..dd20775d7 100644
--- a/src/video_core/shader/decode/integer_set_predicate.cpp
+++ b/src/video_core/shader/decode/integer_set_predicate.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index e6a010a7d..80fc0ccfc 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -10,6 +10,7 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
@@ -169,7 +170,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
             const Node it_offset = Immediate(i * 4);
             const Node real_address =
                 Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
-            const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
+            const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
 
             SetTemporal(bb, i, gmem);
         }
@@ -262,7 +263,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
             const Node it_offset = Immediate(i * 4);
             const Node real_address =
                 Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
-            const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
+            const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
 
             bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1)));
         }
@@ -298,9 +299,9 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeB
 
     const Node base_address{
         TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))};
-    const auto cbuf = std::get_if<CbufNode>(base_address);
+    const auto cbuf = std::get_if<CbufNode>(&*base_address);
     ASSERT(cbuf != nullptr);
-    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset());
+    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset());
     ASSERT(cbuf_offset_imm != nullptr);
     const auto cbuf_offset = cbuf_offset_imm->GetValue();
 
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index a6c123573..d46a8ab82 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -6,6 +6,7 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
@@ -108,22 +109,20 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                              "Constant buffer flow is not supported");
 
-        // The SSY opcode tells the GPU where to re-converge divergent execution paths, it sets the
-        // target of the jump that the SYNC instruction will make. The SSY opcode has a similar
-        // structure to the BRA opcode.
+        // The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC.
         const u32 target = pc + instr.bra.GetBranchTarget();
-        bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
+        bb.push_back(
+            Operation(OperationCode::PushFlowStack, MetaStackClass::Ssy, Immediate(target)));
         break;
     }
     case OpCode::Id::PBK: {
         UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                              "Constant buffer PBK is not supported");
 
-        // PBK pushes to a stack the address where BRK will jump to. This shares stack with SSY but
-        // using SYNC on a PBK address will kill the shader execution. We don't emulate this because
-        // it's very unlikely a driver will emit such invalid shader.
+        // PBK pushes the address that BRK will jump to onto the PBK flow stack.
         const u32 target = pc + instr.bra.GetBranchTarget();
-        bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
+        bb.push_back(
+            Operation(OperationCode::PushFlowStack, MetaStackClass::Pbk, Immediate(target)));
         break;
     }
     case OpCode::Id::SYNC: {
@@ -132,7 +131,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
                              static_cast<u32>(cc));
 
         // The SYNC opcode jumps to the address previously set by the SSY opcode
-        bb.push_back(Operation(OperationCode::PopFlowStack));
+        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy));
         break;
     }
     case OpCode::Id::BRK: {
@@ -141,7 +140,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
                              static_cast<u32>(cc));
 
         // The BRK opcode jumps to the address previously set by the PBK opcode
-        bb.push_back(Operation(OperationCode::PopFlowStack));
+        bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk));
         break;
     }
     case OpCode::Id::IPA: {
diff --git a/src/video_core/shader/decode/predicate_set_predicate.cpp b/src/video_core/shader/decode/predicate_set_predicate.cpp
index 71844c42b..9290d22eb 100644
--- a/src/video_core/shader/decode/predicate_set_predicate.cpp
+++ b/src/video_core/shader/decode/predicate_set_predicate.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/predicate_set_register.cpp b/src/video_core/shader/decode/predicate_set_register.cpp
index 387491bd3..febbfeb50 100644
--- a/src/video_core/shader/decode/predicate_set_register.cpp
+++ b/src/video_core/shader/decode/predicate_set_register.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/register_set_predicate.cpp b/src/video_core/shader/decode/register_set_predicate.cpp
index f8659e48e..e6c9d287e 100644
--- a/src/video_core/shader/decode/register_set_predicate.cpp
+++ b/src/video_core/shader/decode/register_set_predicate.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp
index 44ae87ece..2ac16eeb0 100644
--- a/src/video_core/shader/decode/shift.cpp
+++ b/src/video_core/shader/decode/shift.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 5b033126d..4a356dbd4 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -11,6 +11,7 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
@@ -291,8 +292,8 @@ const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg,
     const Node sampler_register = GetRegister(reg);
     const Node base_sampler =
         TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size()));
-    const auto cbuf = std::get_if<CbufNode>(base_sampler);
-    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset());
+    const auto cbuf = std::get_if<CbufNode>(&*base_sampler);
+    const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset());
     ASSERT(cbuf_offset_imm != nullptr);
     const auto cbuf_offset = cbuf_offset_imm->GetValue();
     const auto cbuf_index = cbuf->GetIndex();
@@ -388,8 +389,8 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
                                Node array, Node depth_compare, u32 bias_offset,
                                std::vector<Node> aoffi,
                                std::optional<Tegra::Shader::Register> bindless_reg) {
-    const bool is_array = array;
-    const bool is_shadow = depth_compare;
+    const auto is_array = static_cast<bool>(array);
+    const auto is_shadow = static_cast<bool>(depth_compare);
     const bool is_bindless = bindless_reg.has_value();
 
     UNIMPLEMENTED_IF_MSG((texture_type == TextureType::Texture3D && (is_array || is_shadow)) ||
diff --git a/src/video_core/shader/decode/video.cpp b/src/video_core/shader/decode/video.cpp
index cb9ab72b1..97fc6f9b1 100644
--- a/src/video_core/shader/decode/video.cpp
+++ b/src/video_core/shader/decode/video.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp
index 04a776398..93dee77d1 100644
--- a/src/video_core/shader/decode/xmad.cpp
+++ b/src/video_core/shader/decode/xmad.cpp
@@ -5,6 +5,7 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
new file mode 100644
index 000000000..3cfb911bb
--- /dev/null
+++ b/src/video_core/shader/node.h
@@ -0,0 +1,519 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
+
+namespace VideoCommon::Shader {
+
+enum class OperationCode {
+    Assign, /// (float& dest, float src) -> void
+
+    Select, /// (MetaArithmetic, bool pred, float a, float b) -> float
+
+    FAdd,          /// (MetaArithmetic, float a, float b) -> float
+    FMul,          /// (MetaArithmetic, float a, float b) -> float
+    FDiv,          /// (MetaArithmetic, float a, float b) -> float
+    FFma,          /// (MetaArithmetic, float a, float b, float c) -> float
+    FNegate,       /// (MetaArithmetic, float a) -> float
+    FAbsolute,     /// (MetaArithmetic, float a) -> float
+    FClamp,        /// (MetaArithmetic, float value, float min, float max) -> float
+    FMin,          /// (MetaArithmetic, float a, float b) -> float
+    FMax,          /// (MetaArithmetic, float a, float b) -> float
+    FCos,          /// (MetaArithmetic, float a) -> float
+    FSin,          /// (MetaArithmetic, float a) -> float
+    FExp2,         /// (MetaArithmetic, float a) -> float
+    FLog2,         /// (MetaArithmetic, float a) -> float
+    FInverseSqrt,  /// (MetaArithmetic, float a) -> float
+    FSqrt,         /// (MetaArithmetic, float a) -> float
+    FRoundEven,    /// (MetaArithmetic, float a) -> float
+    FFloor,        /// (MetaArithmetic, float a) -> float
+    FCeil,         /// (MetaArithmetic, float a) -> float
+    FTrunc,        /// (MetaArithmetic, float a) -> float
+    FCastInteger,  /// (MetaArithmetic, int a) -> float
+    FCastUInteger, /// (MetaArithmetic, uint a) -> float
+
+    IAdd,                  /// (MetaArithmetic, int a, int b) -> int
+    IMul,                  /// (MetaArithmetic, int a, int b) -> int
+    IDiv,                  /// (MetaArithmetic, int a, int b) -> int
+    INegate,               /// (MetaArithmetic, int a) -> int
+    IAbsolute,             /// (MetaArithmetic, int a) -> int
+    IMin,                  /// (MetaArithmetic, int a, int b) -> int
+    IMax,                  /// (MetaArithmetic, int a, int b) -> int
+    ICastFloat,            /// (MetaArithmetic, float a) -> int
+    ICastUnsigned,         /// (MetaArithmetic, uint a) -> int
+    ILogicalShiftLeft,     /// (MetaArithmetic, int a, uint b) -> int
+    ILogicalShiftRight,    /// (MetaArithmetic, int a, uint b) -> int
+    IArithmeticShiftRight, /// (MetaArithmetic, int a, uint b) -> int
+    IBitwiseAnd,           /// (MetaArithmetic, int a, int b) -> int
+    IBitwiseOr,            /// (MetaArithmetic, int a, int b) -> int
+    IBitwiseXor,           /// (MetaArithmetic, int a, int b) -> int
+    IBitwiseNot,           /// (MetaArithmetic, int a) -> int
+    IBitfieldInsert,       /// (MetaArithmetic, int base, int insert, int offset, int bits) -> int
+    IBitfieldExtract,      /// (MetaArithmetic, int value, int offset, int bits) -> int
+    IBitCount,             /// (MetaArithmetic, int) -> int
+
+    UAdd,                  /// (MetaArithmetic, uint a, uint b) -> uint
+    UMul,                  /// (MetaArithmetic, uint a, uint b) -> uint
+    UDiv,                  /// (MetaArithmetic, uint a, uint b) -> uint
+    UMin,                  /// (MetaArithmetic, uint a, uint b) -> uint
+    UMax,                  /// (MetaArithmetic, uint a, uint b) -> uint
+    UCastFloat,            /// (MetaArithmetic, float a) -> uint
+    UCastSigned,           /// (MetaArithmetic, int a) -> uint
+    ULogicalShiftLeft,     /// (MetaArithmetic, uint a, uint b) -> uint
+    ULogicalShiftRight,    /// (MetaArithmetic, uint a, uint b) -> uint
+    UArithmeticShiftRight, /// (MetaArithmetic, uint a, uint b) -> uint
+    UBitwiseAnd,           /// (MetaArithmetic, uint a, uint b) -> uint
+    UBitwiseOr,            /// (MetaArithmetic, uint a, uint b) -> uint
+    UBitwiseXor,           /// (MetaArithmetic, uint a, uint b) -> uint
+    UBitwiseNot,           /// (MetaArithmetic, uint a) -> uint
+    UBitfieldInsert,  /// (MetaArithmetic, uint base, uint insert, int offset, int bits) -> uint
+    UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int bits) -> uint
+    UBitCount,        /// (MetaArithmetic, uint) -> uint
+
+    HAdd,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
+    HMul,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
+    HFma,      /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
+    HAbsolute, /// (f16vec2 a) -> f16vec2
+    HNegate,   /// (f16vec2 a, bool first, bool second) -> f16vec2
+    HClamp,    /// (f16vec2 src, float min, float max) -> f16vec2
+    HUnpack,   /// (Tegra::Shader::HalfType, T value) -> f16vec2
+    HMergeF32, /// (f16vec2 src) -> float
+    HMergeH0,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
+    HMergeH1,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
+    HPack2,    /// (float a, float b) -> f16vec2
+
+    LogicalAssign, /// (bool& dst, bool src) -> void
+    LogicalAnd,    /// (bool a, bool b) -> bool
+    LogicalOr,     /// (bool a, bool b) -> bool
+    LogicalXor,    /// (bool a, bool b) -> bool
+    LogicalNegate, /// (bool a) -> bool
+    LogicalPick2,  /// (bool2 pair, uint index) -> bool
+    LogicalAll2,   /// (bool2 a) -> bool
+    LogicalAny2,   /// (bool2 a) -> bool
+
+    LogicalFLessThan,     /// (float a, float b) -> bool
+    LogicalFEqual,        /// (float a, float b) -> bool
+    LogicalFLessEqual,    /// (float a, float b) -> bool
+    LogicalFGreaterThan,  /// (float a, float b) -> bool
+    LogicalFNotEqual,     /// (float a, float b) -> bool
+    LogicalFGreaterEqual, /// (float a, float b) -> bool
+    LogicalFIsNan,        /// (float a) -> bool
+
+    LogicalILessThan,     /// (int a, int b) -> bool
+    LogicalIEqual,        /// (int a, int b) -> bool
+    LogicalILessEqual,    /// (int a, int b) -> bool
+    LogicalIGreaterThan,  /// (int a, int b) -> bool
+    LogicalINotEqual,     /// (int a, int b) -> bool
+    LogicalIGreaterEqual, /// (int a, int b) -> bool
+
+    LogicalULessThan,     /// (uint a, uint b) -> bool
+    LogicalUEqual,        /// (uint a, uint b) -> bool
+    LogicalULessEqual,    /// (uint a, uint b) -> bool
+    LogicalUGreaterThan,  /// (uint a, uint b) -> bool
+    LogicalUNotEqual,     /// (uint a, uint b) -> bool
+    LogicalUGreaterEqual, /// (uint a, uint b) -> bool
+
+    Logical2HLessThan,            /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HEqual,               /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HLessEqual,           /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HGreaterThan,         /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HNotEqual,            /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HGreaterEqual,        /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HLessThanWithNan,     /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HEqualWithNan,        /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HLessEqualWithNan,    /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HGreaterThanWithNan,  /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HNotEqualWithNan,     /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+    Logical2HGreaterEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
+
+    Texture,                /// (MetaTexture, float[N] coords) -> float4
+    TextureLod,             /// (MetaTexture, float[N] coords) -> float4
+    TextureGather,          /// (MetaTexture, float[N] coords) -> float4
+    TextureQueryDimensions, /// (MetaTexture, float a) -> float4
+    TextureQueryLod,        /// (MetaTexture, float[N] coords) -> float4
+    TexelFetch,             /// (MetaTexture, int[N], int) -> float4
+
+    Branch,        /// (uint branch_target) -> void
+    PushFlowStack, /// (uint branch_target) -> void
+    PopFlowStack,  /// () -> void
+    Exit,          /// () -> void
+    Discard,       /// () -> void
+
+    EmitVertex,   /// () -> void
+    EndPrimitive, /// () -> void
+
+    YNegate,            /// () -> float
+    LocalInvocationIdX, /// () -> uint
+    LocalInvocationIdY, /// () -> uint
+    LocalInvocationIdZ, /// () -> uint
+    WorkGroupIdX,       /// () -> uint
+    WorkGroupIdY,       /// () -> uint
+    WorkGroupIdZ,       /// () -> uint
+
+    Amount,
+};
+
+enum class InternalFlag {
+    Zero = 0,
+    Sign = 1,
+    Carry = 2,
+    Overflow = 3,
+    Amount = 4,
+};
+
+enum class MetaStackClass {
+    Ssy,
+    Pbk,
+};
+
+class OperationNode;
+class ConditionalNode;
+class GprNode;
+class ImmediateNode;
+class InternalFlagNode;
+class PredicateNode;
+class AbufNode;
+class CbufNode;
+class LmemNode;
+class GmemNode;
+class CommentNode;
+
+using NodeData =
+    std::variant<OperationNode, ConditionalNode, GprNode, ImmediateNode, InternalFlagNode,
+                 PredicateNode, AbufNode, CbufNode, LmemNode, GmemNode, CommentNode>;
+using Node = std::shared_ptr<NodeData>;
+using Node4 = std::array<Node, 4>;
+using NodeBlock = std::vector<Node>;
+
+class Sampler {
+public:
+    /// This constructor is for bound samplers
+    explicit Sampler(std::size_t offset, std::size_t index, Tegra::Shader::TextureType type,
+                     bool is_array, bool is_shadow)
+        : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow},
+          is_bindless{false} {}
+
+    /// This constructor is for bindless samplers
+    explicit Sampler(u32 cbuf_index, u32 cbuf_offset, std::size_t index,
+                     Tegra::Shader::TextureType type, bool is_array, bool is_shadow)
+        : offset{(static_cast<u64>(cbuf_index) << 32) | cbuf_offset}, index{index}, type{type},
+          is_array{is_array}, is_shadow{is_shadow}, is_bindless{true} {}
+
+    /// This constructor is for serialization/deserialization
+    explicit Sampler(std::size_t offset, std::size_t index, Tegra::Shader::TextureType type,
+                     bool is_array, bool is_shadow, bool is_bindless)
+        : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow},
+          is_bindless{is_bindless} {}
+
+    std::size_t GetOffset() const {
+        return offset;
+    }
+
+    std::size_t GetIndex() const {
+        return index;
+    }
+
+    Tegra::Shader::TextureType GetType() const {
+        return type;
+    }
+
+    bool IsArray() const {
+        return is_array;
+    }
+
+    bool IsShadow() const {
+        return is_shadow;
+    }
+
+    bool IsBindless() const {
+        return is_bindless;
+    }
+
+    std::pair<u32, u32> GetBindlessCBuf() const {
+        return {static_cast<u32>(offset >> 32), static_cast<u32>(offset)};
+    }
+
+    bool operator<(const Sampler& rhs) const {
+        return std::tie(index, offset, type, is_array, is_shadow, is_bindless) <
+               std::tie(rhs.index, rhs.offset, rhs.type, rhs.is_array, rhs.is_shadow,
+                        rhs.is_bindless);
+    }
+
+private:
+    /// Offset in TSC memory from which to read the sampler object, as specified by the sampling
+    /// instruction.
+    std::size_t offset{};
+    std::size_t index{}; ///< Value used to index into the generated GLSL sampler array.
+    Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc)
+    bool is_array{};    ///< Whether the texture is being sampled as an array texture or not.
+    bool is_shadow{};   ///< Whether the texture is being sampled as a depth texture or not.
+    bool is_bindless{}; ///< Whether this sampler belongs to a bindless texture or not.
+};
+
+struct GlobalMemoryBase {
+    u32 cbuf_index{};
+    u32 cbuf_offset{};
+
+    bool operator<(const GlobalMemoryBase& rhs) const {
+        return std::tie(cbuf_index, cbuf_offset) < std::tie(rhs.cbuf_index, rhs.cbuf_offset);
+    }
+};
+
+/// Parameters describing an arithmetic operation
+struct MetaArithmetic {
+    bool precise{}; ///< True when the operation must stay precise and cannot be optimized away
+};
+
+/// Parameters describing a texture sampler
+struct MetaTexture {
+    const Sampler& sampler;
+    Node array;
+    Node depth_compare;
+    std::vector<Node> aoffi;
+    Node bias;
+    Node lod;
+    Node component{};
+    u32 element{};
+};
+
+/// Parameters that modify an operation but are not part of any particular operand
+using Meta = std::variant<MetaArithmetic, MetaTexture, MetaStackClass, Tegra::Shader::HalfType>;
+
+/// Holds any kind of operation that can be done in the IR: an opcode, its metadata and operands
+class OperationNode final {
+public:
+    explicit OperationNode(OperationCode code) : OperationNode(code, Meta{}) {}
+
+    explicit OperationNode(OperationCode code, Meta meta)
+        : OperationNode(code, std::move(meta), std::vector<Node>{}) {}
+
+    explicit OperationNode(OperationCode code, std::vector<Node> operands)
+        : OperationNode(code, Meta{}, std::move(operands)) {}
+
+    explicit OperationNode(OperationCode code, Meta meta, std::vector<Node> operands)
+        : code{code}, meta{std::move(meta)}, operands{std::move(operands)} {}
+
+    template <typename... Args>
+    explicit OperationNode(OperationCode code, Meta meta, Args&&... operands)
+        : code{code}, meta{std::move(meta)}, operands{std::forward<Args>(operands)...} {}
+
+    OperationCode GetCode() const {
+        return code;
+    }
+
+    const Meta& GetMeta() const {
+        return meta;
+    }
+
+    std::size_t GetOperandsCount() const {
+        return operands.size();
+    }
+
+    const Node& operator[](std::size_t operand_index) const {
+        return operands.at(operand_index);
+    }
+
+private:
+    OperationCode code{};
+    Meta meta{};
+    std::vector<Node> operands;
+};
+
+/// Wraps a block of code that is executed only when its boolean condition node is satisfied
+class ConditionalNode final {
+public:
+    explicit ConditionalNode(Node condition, std::vector<Node>&& code)
+        : condition{std::move(condition)}, code{std::move(code)} {}
+
+    const Node& GetCondition() const {
+        return condition;
+    }
+
+    const std::vector<Node>& GetCode() const {
+        return code;
+    }
+
+private:
+    Node condition;         ///< Condition to be satisfied
+    std::vector<Node> code; ///< Code to execute
+};
+
+/// A general purpose register
+class GprNode final {
+public:
+    explicit constexpr GprNode(Tegra::Shader::Register index) : index{index} {}
+
+    u32 GetIndex() const {
+        return static_cast<u32>(index);
+    }
+
+private:
+    Tegra::Shader::Register index{};
+};
+
+/// A 32-bit value that represents an immediate
+class ImmediateNode final {
+public:
+    explicit constexpr ImmediateNode(u32 value) : value{value} {}
+
+    u32 GetValue() const {
+        return value;
+    }
+
+private:
+    u32 value{};
+};
+
+/// One of Maxwell's internal flags
+class InternalFlagNode final {
+public:
+    explicit constexpr InternalFlagNode(InternalFlag flag) : flag{flag} {}
+
+    InternalFlag GetFlag() const {
+        return flag;
+    }
+
+private:
+    InternalFlag flag{};
+};
+
+/// A predicate register, it can be negated without additional nodes
+class PredicateNode final {
+public:
+    explicit constexpr PredicateNode(Tegra::Shader::Pred index, bool negated)
+        : index{index}, negated{negated} {}
+
+    Tegra::Shader::Pred GetIndex() const {
+        return index;
+    }
+
+    bool IsNegated() const {
+        return negated;
+    }
+
+private:
+    Tegra::Shader::Pred index{};
+    bool negated{};
+};
+
+/// Attribute buffer memory (known as attributes or varyings in GLSL terms)
+class AbufNode final {
+public:
+    // Initialize for standard attributes (index is explicit).
+    explicit AbufNode(Tegra::Shader::Attribute::Index index, u32 element, Node buffer = {})
+        : buffer{std::move(buffer)}, index{index}, element{element} {}
+
+    // Initialize for physical attributes (index is a variable value).
+    explicit AbufNode(Node physical_address, Node buffer = {})
+        : physical_address{std::move(physical_address)}, buffer{std::move(buffer)} {}
+
+    Tegra::Shader::Attribute::Index GetIndex() const {
+        return index;
+    }
+
+    u32 GetElement() const {
+        return element;
+    }
+
+    const Node& GetBuffer() const {
+        return buffer;
+    }
+
+    bool IsPhysicalBuffer() const {
+        return static_cast<bool>(physical_address);
+    }
+
+    const Node& GetPhysicalAddress() const {
+        return physical_address;
+    }
+
+private:
+    Node physical_address;
+    Node buffer;
+    Tegra::Shader::Attribute::Index index{};
+    u32 element{};
+};
+
+/// Constant buffer node, usually mapped to uniform buffers in GLSL
+class CbufNode final {
+public:
+    explicit CbufNode(u32 index, Node offset) : index{index}, offset{std::move(offset)} {}
+
+    u32 GetIndex() const {
+        return index;
+    }
+
+    const Node& GetOffset() const {
+        return offset;
+    }
+
+private:
+    u32 index{};
+    Node offset;
+};
+
+/// Local memory node
+class LmemNode final {
+public:
+    explicit LmemNode(Node address) : address{std::move(address)} {}
+
+    const Node& GetAddress() const {
+        return address;
+    }
+
+private:
+    Node address;
+};
+
+/// Global memory node
+class GmemNode final {
+public:
+    explicit GmemNode(Node real_address, Node base_address, const GlobalMemoryBase& descriptor)
+        : real_address{std::move(real_address)}, base_address{std::move(base_address)},
+          descriptor{descriptor} {}
+
+    const Node& GetRealAddress() const {
+        return real_address;
+    }
+
+    const Node& GetBaseAddress() const {
+        return base_address;
+    }
+
+    const GlobalMemoryBase& GetDescriptor() const {
+        return descriptor;
+    }
+
+private:
+    Node real_address;
+    Node base_address;
+    GlobalMemoryBase descriptor;
+};
+
+/// Commentary, can be dropped
+class CommentNode final {
+public:
+    explicit CommentNode(std::string text) : text{std::move(text)} {}
+
+    const std::string& GetText() const {
+        return text;
+    }
+
+private:
+    std::string text;
+};
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp
new file mode 100644
index 000000000..6fccbbba3
--- /dev/null
+++ b/src/video_core/shader/node_helper.cpp
@@ -0,0 +1,99 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstring>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/shader/node_helper.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+Node Conditional(Node condition, std::vector<Node> code) {
+    return MakeNode<ConditionalNode>(std::move(condition), std::move(code));
+}
+
+Node Comment(std::string text) {
+    return MakeNode<CommentNode>(std::move(text));
+}
+
+Node Immediate(u32 value) {
+    return MakeNode<ImmediateNode>(value);
+}
+
+Node Immediate(s32 value) {
+    return Immediate(static_cast<u32>(value));
+}
+
+Node Immediate(f32 value) {
+    u32 integral;
+    std::memcpy(&integral, &value, sizeof(u32));
+    return Immediate(integral);
+}
+
+OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed) {
+    if (is_signed) {
+        return operation_code;
+    }
+    switch (operation_code) {
+    case OperationCode::FCastInteger:
+        return OperationCode::FCastUInteger;
+    case OperationCode::IAdd:
+        return OperationCode::UAdd;
+    case OperationCode::IMul:
+        return OperationCode::UMul;
+    case OperationCode::IDiv:
+        return OperationCode::UDiv;
+    case OperationCode::IMin:
+        return OperationCode::UMin;
+    case OperationCode::IMax:
+        return OperationCode::UMax;
+    case OperationCode::ICastFloat:
+        return OperationCode::UCastFloat;
+    case OperationCode::ICastUnsigned:
+        return OperationCode::UCastSigned;
+    case OperationCode::ILogicalShiftLeft:
+        return OperationCode::ULogicalShiftLeft;
+    case OperationCode::ILogicalShiftRight:
+        return OperationCode::ULogicalShiftRight;
+    case OperationCode::IArithmeticShiftRight:
+        return OperationCode::UArithmeticShiftRight;
+    case OperationCode::IBitwiseAnd:
+        return OperationCode::UBitwiseAnd;
+    case OperationCode::IBitwiseOr:
+        return OperationCode::UBitwiseOr;
+    case OperationCode::IBitwiseXor:
+        return OperationCode::UBitwiseXor;
+    case OperationCode::IBitwiseNot:
+        return OperationCode::UBitwiseNot;
+    case OperationCode::IBitfieldInsert:
+        return OperationCode::UBitfieldInsert;
+    case OperationCode::IBitCount:
+        return OperationCode::UBitCount;
+    case OperationCode::LogicalILessThan:
+        return OperationCode::LogicalULessThan;
+    case OperationCode::LogicalIEqual:
+        return OperationCode::LogicalUEqual;
+    case OperationCode::LogicalILessEqual:
+        return OperationCode::LogicalULessEqual;
+    case OperationCode::LogicalIGreaterThan:
+        return OperationCode::LogicalUGreaterThan;
+    case OperationCode::LogicalINotEqual:
+        return OperationCode::LogicalUNotEqual;
+    case OperationCode::LogicalIGreaterEqual:
+        return OperationCode::LogicalUGreaterEqual;
+    case OperationCode::INegate:
+        UNREACHABLE_MSG("Can't negate an unsigned integer");
+        return {};
+    case OperationCode::IAbsolute:
+        UNREACHABLE_MSG("Can't apply absolute to an unsigned integer");
+        return {};
+    default:
+        UNREACHABLE_MSG("Unknown signed operation with code={}", static_cast<u32>(operation_code));
+        return {};
+    }
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h
new file mode 100644
index 000000000..0c2aa749b
--- /dev/null
+++ b/src/video_core/shader/node_helper.h
@@ -0,0 +1,65 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/shader/node.h"
+
+namespace VideoCommon::Shader {
+
+/// This arithmetic operation must stay precise and cannot be optimized away
+inline constexpr MetaArithmetic PRECISE = {true};
+/// This arithmetic operation can be optimized away
+inline constexpr MetaArithmetic NO_PRECISE = {false};
+
+/// Creates a conditional node
+Node Conditional(Node condition, std::vector<Node> code);
+
+/// Creates a commentary node
+Node Comment(std::string text);
+
+/// Creates an u32 immediate
+Node Immediate(u32 value);
+
+/// Creates a s32 immediate
+Node Immediate(s32 value);
+
+/// Creates a f32 immediate
+Node Immediate(f32 value);
+
+/// Converts a signed operation code to its unsigned counterpart when is_signed is false
+OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed);
+
+template <typename T, typename... Args>
+Node MakeNode(Args&&... args) {
+    static_assert(std::is_convertible_v<T, NodeData>);
+    return std::make_shared<NodeData>(T(std::forward<Args>(args)...));
+}
+
+template <typename... Args>
+Node Operation(OperationCode code, Args&&... args) {
+    if constexpr (sizeof...(args) == 0) {
+        return MakeNode<OperationNode>(code);
+    } else if constexpr (std::is_convertible_v<std::tuple_element_t<0, std::tuple<Args...>>,
+                                               Meta>) {
+        return MakeNode<OperationNode>(code, std::forward<Args>(args)...);
+    } else {
+        return MakeNode<OperationNode>(code, Meta{}, std::forward<Args>(args)...);
+    }
+}
+
+template <typename... Args>
+Node SignedOperation(OperationCode code, bool is_signed, Args&&... args) {
+    return Operation(SignedToUnsignedCode(code, is_signed), std::forward<Args>(args)...);
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 8a6ee5cf5..11b545cca 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -9,6 +9,7 @@
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace VideoCommon::Shader {
@@ -28,30 +29,11 @@ ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset)
 
 ShaderIR::~ShaderIR() = default;
 
-Node ShaderIR::StoreNode(NodeData&& node_data) {
-    auto store = std::make_unique<NodeData>(node_data);
-    const Node node = store.get();
-    stored_nodes.push_back(std::move(store));
-    return node;
-}
-
-Node ShaderIR::Conditional(Node condition, std::vector<Node>&& code) {
-    return StoreNode(ConditionalNode(condition, std::move(code)));
-}
-
-Node ShaderIR::Comment(std::string text) {
-    return StoreNode(CommentNode(std::move(text)));
-}
-
-Node ShaderIR::Immediate(u32 value) {
-    return StoreNode(ImmediateNode(value));
-}
-
 Node ShaderIR::GetRegister(Register reg) {
     if (reg != Register::ZeroIndex) {
         used_registers.insert(static_cast<u32>(reg));
     }
-    return StoreNode(GprNode(reg));
+    return MakeNode<GprNode>(reg);
 }
 
 Node ShaderIR::GetImmediate19(Instruction instr) {
@@ -69,7 +51,7 @@ Node ShaderIR::GetConstBuffer(u64 index_, u64 offset_) {
     const auto [entry, is_new] = used_cbufs.try_emplace(index);
     entry->second.MarkAsUsed(offset);
 
-    return StoreNode(CbufNode(index, Immediate(offset)));
+    return MakeNode<CbufNode>(index, Immediate(offset));
 }
 
 Node ShaderIR::GetConstBufferIndirect(u64 index_, u64 offset_, Node node) {
@@ -80,7 +62,7 @@ Node ShaderIR::GetConstBufferIndirect(u64 index_, u64 offset_, Node node) {
     entry->second.MarkAsUsedIndirect();
 
     const Node final_offset = Operation(OperationCode::UAdd, NO_PRECISE, node, Immediate(offset));
-    return StoreNode(CbufNode(index, final_offset));
+    return MakeNode<CbufNode>(index, final_offset);
 }
 
 Node ShaderIR::GetPredicate(u64 pred_, bool negated) {
@@ -89,7 +71,7 @@ Node ShaderIR::GetPredicate(u64 pred_, bool negated) {
         used_predicates.insert(pred);
     }
 
-    return StoreNode(PredicateNode(pred, negated));
+    return MakeNode<PredicateNode>(pred, negated);
 }
 
 Node ShaderIR::GetPredicate(bool immediate) {
@@ -98,12 +80,12 @@ Node ShaderIR::GetPredicate(bool immediate) {
 
 Node ShaderIR::GetInputAttribute(Attribute::Index index, u64 element, Node buffer) {
     used_input_attributes.emplace(index);
-    return StoreNode(AbufNode(index, static_cast<u32>(element), buffer));
+    return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer));
 }
 
 Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_address, Node buffer) {
     uses_physical_attributes = true;
-    return StoreNode(AbufNode(GetRegister(physical_address), buffer));
+    return MakeNode<AbufNode>(GetRegister(physical_address), std::move(buffer));
 }
 
 Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buffer) {
@@ -115,11 +97,11 @@ Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buff
     }
     used_output_attributes.insert(index);
 
-    return StoreNode(AbufNode(index, static_cast<u32>(element), buffer));
+    return MakeNode<AbufNode>(index, static_cast<u32>(element), buffer);
 }
 
 Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) {
-    const Node node = StoreNode(InternalFlagNode(flag));
+    const Node node = MakeNode<InternalFlagNode>(flag);
     if (negated) {
         return Operation(OperationCode::LogicalNegate, node);
     }
@@ -127,7 +109,7 @@ Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) {
 }
 
 Node ShaderIR::GetLocalMemory(Node address) {
-    return StoreNode(LmemNode(address));
+    return MakeNode<LmemNode>(std::move(address));
 }
 
 Node ShaderIR::GetTemporal(u32 id) {
@@ -393,68 +375,4 @@ Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) {
                      Immediate(bits));
 }
 
-/*static*/ OperationCode ShaderIR::SignedToUnsignedCode(OperationCode operation_code,
-                                                        bool is_signed) {
-    if (is_signed) {
-        return operation_code;
-    }
-    switch (operation_code) {
-    case OperationCode::FCastInteger:
-        return OperationCode::FCastUInteger;
-    case OperationCode::IAdd:
-        return OperationCode::UAdd;
-    case OperationCode::IMul:
-        return OperationCode::UMul;
-    case OperationCode::IDiv:
-        return OperationCode::UDiv;
-    case OperationCode::IMin:
-        return OperationCode::UMin;
-    case OperationCode::IMax:
-        return OperationCode::UMax;
-    case OperationCode::ICastFloat:
-        return OperationCode::UCastFloat;
-    case OperationCode::ICastUnsigned:
-        return OperationCode::UCastSigned;
-    case OperationCode::ILogicalShiftLeft:
-        return OperationCode::ULogicalShiftLeft;
-    case OperationCode::ILogicalShiftRight:
-        return OperationCode::ULogicalShiftRight;
-    case OperationCode::IArithmeticShiftRight:
-        return OperationCode::UArithmeticShiftRight;
-    case OperationCode::IBitwiseAnd:
-        return OperationCode::UBitwiseAnd;
-    case OperationCode::IBitwiseOr:
-        return OperationCode::UBitwiseOr;
-    case OperationCode::IBitwiseXor:
-        return OperationCode::UBitwiseXor;
-    case OperationCode::IBitwiseNot:
-        return OperationCode::UBitwiseNot;
-    case OperationCode::IBitfieldInsert:
-        return OperationCode::UBitfieldInsert;
-    case OperationCode::IBitCount:
-        return OperationCode::UBitCount;
-    case OperationCode::LogicalILessThan:
-        return OperationCode::LogicalULessThan;
-    case OperationCode::LogicalIEqual:
-        return OperationCode::LogicalUEqual;
-    case OperationCode::LogicalILessEqual:
-        return OperationCode::LogicalULessEqual;
-    case OperationCode::LogicalIGreaterThan:
-        return OperationCode::LogicalUGreaterThan;
-    case OperationCode::LogicalINotEqual:
-        return OperationCode::LogicalUNotEqual;
-    case OperationCode::LogicalIGreaterEqual:
-        return OperationCode::LogicalUGreaterEqual;
-    case OperationCode::INegate:
-        UNREACHABLE_MSG("Can't negate an unsigned integer");
-        return {};
-    case OperationCode::IAbsolute:
-        UNREACHABLE_MSG("Can't apply absolute to an unsigned integer");
-        return {};
-    default:
-        UNREACHABLE_MSG("Unknown signed operation with code={}", static_cast<u32>(operation_code));
-        return {};
-    }
-}
-
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index ff7472e30..edcf2288e 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -18,188 +18,14 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/engines/shader_header.h"
+#include "video_core/shader/node.h"
 
 namespace VideoCommon::Shader {
 
-class OperationNode;
-class ConditionalNode;
-class GprNode;
-class ImmediateNode;
-class InternalFlagNode;
-class PredicateNode;
-class AbufNode; ///< Attribute buffer
-class CbufNode; ///< Constant buffer
-class LmemNode; ///< Local memory
-class GmemNode; ///< Global memory
-class CommentNode;
-
 using ProgramCode = std::vector<u64>;
 
-using NodeData =
-    std::variant<OperationNode, ConditionalNode, GprNode, ImmediateNode, InternalFlagNode,
-                 PredicateNode, AbufNode, CbufNode, LmemNode, GmemNode, CommentNode>;
-using Node = const NodeData*;
-using Node4 = std::array<Node, 4>;
-using NodeBlock = std::vector<Node>;
-
 constexpr u32 MAX_PROGRAM_LENGTH = 0x1000;
 
-enum class OperationCode {
-    Assign, /// (float& dest, float src) -> void
-
-    Select, /// (MetaArithmetic, bool pred, float a, float b) -> float
-
-    FAdd,          /// (MetaArithmetic, float a, float b) -> float
-    FMul,          /// (MetaArithmetic, float a, float b) -> float
-    FDiv,          /// (MetaArithmetic, float a, float b) -> float
-    FFma,          /// (MetaArithmetic, float a, float b, float c) -> float
-    FNegate,       /// (MetaArithmetic, float a) -> float
-    FAbsolute,     /// (MetaArithmetic, float a) -> float
-    FClamp,        /// (MetaArithmetic, float value, float min, float max) -> float
-    FMin,          /// (MetaArithmetic, float a, float b) -> float
-    FMax,          /// (MetaArithmetic, float a, float b) -> float
-    FCos,          /// (MetaArithmetic, float a) -> float
-    FSin,          /// (MetaArithmetic, float a) -> float
-    FExp2,         /// (MetaArithmetic, float a) -> float
-    FLog2,         /// (MetaArithmetic, float a) -> float
-    FInverseSqrt,  /// (MetaArithmetic, float a) -> float
-    FSqrt,         /// (MetaArithmetic, float a) -> float
-    FRoundEven,    /// (MetaArithmetic, float a) -> float
-    FFloor,        /// (MetaArithmetic, float a) -> float
-    FCeil,         /// (MetaArithmetic, float a) -> float
-    FTrunc,        /// (MetaArithmetic, float a) -> float
-    FCastInteger,  /// (MetaArithmetic, int a) -> float
-    FCastUInteger, /// (MetaArithmetic, uint a) -> float
-
-    IAdd,                  /// (MetaArithmetic, int a, int b) -> int
-    IMul,                  /// (MetaArithmetic, int a, int b) -> int
-    IDiv,                  /// (MetaArithmetic, int a, int b) -> int
-    INegate,               /// (MetaArithmetic, int a) -> int
-    IAbsolute,             /// (MetaArithmetic, int a) -> int
-    IMin,                  /// (MetaArithmetic, int a, int b) -> int
-    IMax,                  /// (MetaArithmetic, int a, int b) -> int
-    ICastFloat,            /// (MetaArithmetic, float a) -> int
-    ICastUnsigned,         /// (MetaArithmetic, uint a) -> int
-    ILogicalShiftLeft,     /// (MetaArithmetic, int a, uint b) -> int
-    ILogicalShiftRight,    /// (MetaArithmetic, int a, uint b) -> int
-    IArithmeticShiftRight, /// (MetaArithmetic, int a, uint b) -> int
-    IBitwiseAnd,           /// (MetaArithmetic, int a, int b) -> int
-    IBitwiseOr,            /// (MetaArithmetic, int a, int b) -> int
-    IBitwiseXor,           /// (MetaArithmetic, int a, int b) -> int
-    IBitwiseNot,           /// (MetaArithmetic, int a) -> int
-    IBitfieldInsert,       /// (MetaArithmetic, int base, int insert, int offset, int bits) -> int
-    IBitfieldExtract,      /// (MetaArithmetic, int value, int offset, int offset) -> int
-    IBitCount,             /// (MetaArithmetic, int) -> int
-
-    UAdd,                  /// (MetaArithmetic, uint a, uint b) -> uint
-    UMul,                  /// (MetaArithmetic, uint a, uint b) -> uint
-    UDiv,                  /// (MetaArithmetic, uint a, uint b) -> uint
-    UMin,                  /// (MetaArithmetic, uint a, uint b) -> uint
-    UMax,                  /// (MetaArithmetic, uint a, uint b) -> uint
-    UCastFloat,            /// (MetaArithmetic, float a) -> uint
-    UCastSigned,           /// (MetaArithmetic, int a) -> uint
-    ULogicalShiftLeft,     /// (MetaArithmetic, uint a, uint b) -> uint
-    ULogicalShiftRight,    /// (MetaArithmetic, uint a, uint b) -> uint
-    UArithmeticShiftRight, /// (MetaArithmetic, uint a, uint b) -> uint
-    UBitwiseAnd,           /// (MetaArithmetic, uint a, uint b) -> uint
-    UBitwiseOr,            /// (MetaArithmetic, uint a, uint b) -> uint
-    UBitwiseXor,           /// (MetaArithmetic, uint a, uint b) -> uint
-    UBitwiseNot,           /// (MetaArithmetic, uint a) -> uint
-    UBitfieldInsert,  /// (MetaArithmetic, uint base, uint insert, int offset, int bits) -> uint
-    UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint
-    UBitCount,        /// (MetaArithmetic, uint) -> uint
-
-    HAdd,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
-    HMul,      /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
-    HFma,      /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
-    HAbsolute, /// (f16vec2 a) -> f16vec2
-    HNegate,   /// (f16vec2 a, bool first, bool second) -> f16vec2
-    HClamp,    /// (f16vec2 src, float min, float max) -> f16vec2
-    HUnpack,   /// (Tegra::Shader::HalfType, T value) -> f16vec2
-    HMergeF32, /// (f16vec2 src) -> float
-    HMergeH0,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
-    HMergeH1,  /// (f16vec2 dest, f16vec2 src) -> f16vec2
-    HPack2,    /// (float a, float b) -> f16vec2
-
-    LogicalAssign, /// (bool& dst, bool src) -> void
-    LogicalAnd,    /// (bool a, bool b) -> bool
-    LogicalOr,     /// (bool a, bool b) -> bool
-    LogicalXor,    /// (bool a, bool b) -> bool
-    LogicalNegate, /// (bool a) -> bool
-    LogicalPick2,  /// (bool2 pair, uint index) -> bool
-    LogicalAll2,   /// (bool2 a) -> bool
-    LogicalAny2,   /// (bool2 a) -> bool
-
-    LogicalFLessThan,     /// (float a, float b) -> bool
-    LogicalFEqual,        /// (float a, float b) -> bool
-    LogicalFLessEqual,    /// (float a, float b) -> bool
-    LogicalFGreaterThan,  /// (float a, float b) -> bool
-    LogicalFNotEqual,     /// (float a, float b) -> bool
-    LogicalFGreaterEqual, /// (float a, float b) -> bool
-    LogicalFIsNan,        /// (float a) -> bool
-
-    LogicalILessThan,     /// (int a, int b) -> bool
-    LogicalIEqual,        /// (int a, int b) -> bool
-    LogicalILessEqual,    /// (int a, int b) -> bool
-    LogicalIGreaterThan,  /// (int a, int b) -> bool
-    LogicalINotEqual,     /// (int a, int b) -> bool
-    LogicalIGreaterEqual, /// (int a, int b) -> bool
-
-    LogicalULessThan,     /// (uint a, uint b) -> bool
-    LogicalUEqual,        /// (uint a, uint b) -> bool
-    LogicalULessEqual,    /// (uint a, uint b) -> bool
-    LogicalUGreaterThan,  /// (uint a, uint b) -> bool
-    LogicalUNotEqual,     /// (uint a, uint b) -> bool
-    LogicalUGreaterEqual, /// (uint a, uint b) -> bool
-
-    Logical2HLessThan,            /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HEqual,               /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HLessEqual,           /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HGreaterThan,         /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HNotEqual,            /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HGreaterEqual,        /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HLessThanWithNan,     /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HEqualWithNan,        /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HLessEqualWithNan,    /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HGreaterThanWithNan,  /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HNotEqualWithNan,     /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-    Logical2HGreaterEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
-
-    Texture,                /// (MetaTexture, float[N] coords) -> float4
-    TextureLod,             /// (MetaTexture, float[N] coords) -> float4
-    TextureGather,          /// (MetaTexture, float[N] coords) -> float4
-    TextureQueryDimensions, /// (MetaTexture, float a) -> float4
-    TextureQueryLod,        /// (MetaTexture, float[N] coords) -> float4
-    TexelFetch,             /// (MetaTexture, int[N], int) -> float4
-
-    Branch,        /// (uint branch_target) -> void
-    PushFlowStack, /// (uint branch_target) -> void
-    PopFlowStack,  /// () -> void
-    Exit,          /// () -> void
-    Discard,       /// () -> void
-
-    EmitVertex,   /// () -> void
-    EndPrimitive, /// () -> void
-
-    YNegate,            /// () -> float
-    LocalInvocationIdX, /// () -> uint
-    LocalInvocationIdY, /// () -> uint
-    LocalInvocationIdZ, /// () -> uint
-    WorkGroupIdX,       /// () -> uint
-    WorkGroupIdY,       /// () -> uint
-    WorkGroupIdZ,       /// () -> uint
-
-    Amount,
-};
-
-enum class InternalFlag {
-    Zero = 0,
-    Sign = 1,
-    Carry = 2,
-    Overflow = 3,
-    Amount = 4,
-};
-
 /// Describes the behaviour of code path of a given entry point and a return point.
 enum class ExitMethod {
     Undetermined, ///< Internal value. Only occur when analyzing JMP loop.
@@ -208,71 +34,6 @@ enum class ExitMethod {
     AlwaysEnd,    ///< All code paths reach a END instruction.
 };
 
-class Sampler {
-public:
-    // Use this constructor for bounded Samplers
-    explicit Sampler(std::size_t offset, std::size_t index, Tegra::Shader::TextureType type,
-                     bool is_array, bool is_shadow)
-        : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow},
-          is_bindless{false} {}
-
-    // Use this constructor for bindless Samplers
-    explicit Sampler(u32 cbuf_index, u32 cbuf_offset, std::size_t index,
-                     Tegra::Shader::TextureType type, bool is_array, bool is_shadow)
-        : offset{(static_cast<u64>(cbuf_index) << 32) | cbuf_offset}, index{index}, type{type},
-          is_array{is_array}, is_shadow{is_shadow}, is_bindless{true} {}
-
-    // Use this only for serialization/deserialization
-    explicit Sampler(std::size_t offset, std::size_t index, Tegra::Shader::TextureType type,
-                     bool is_array, bool is_shadow, bool is_bindless)
-        : offset{offset}, index{index}, type{type}, is_array{is_array}, is_shadow{is_shadow},
-          is_bindless{is_bindless} {}
-
-    std::size_t GetOffset() const {
-        return offset;
-    }
-
-    std::size_t GetIndex() const {
-        return index;
-    }
-
-    Tegra::Shader::TextureType GetType() const {
-        return type;
-    }
-
-    bool IsArray() const {
-        return is_array;
-    }
-
-    bool IsShadow() const {
-        return is_shadow;
-    }
-
-    bool IsBindless() const {
-        return is_bindless;
-    }
-
-    std::pair<u32, u32> GetBindlessCBuf() const {
-        return {static_cast<u32>(offset >> 32), static_cast<u32>(offset)};
-    }
-
-    bool operator<(const Sampler& rhs) const {
-        return std::tie(index, offset, type, is_array, is_shadow, is_bindless) <
-               std::tie(rhs.index, rhs.offset, rhs.type, rhs.is_array, rhs.is_shadow,
-                        rhs.is_bindless);
-    }
-
-private:
-    /// Offset in TSC memory from which to read the sampler object, as specified by the sampling
-    /// instruction.
-    std::size_t offset{};
-    std::size_t index{}; ///< Value used to index into the generated GLSL sampler array.
-    Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc)
-    bool is_array{};    ///< Whether the texture is being sampled as an array texture or not.
-    bool is_shadow{};   ///< Whether the texture is being sampled as a depth texture or not.
-    bool is_bindless{}; ///< Whether this sampler belongs to a bindless texture or not.
-};
-
 class ConstBuffer {
 public:
     explicit ConstBuffer(u32 max_offset, bool is_indirect)
@@ -305,268 +66,11 @@ private:
     bool is_indirect{};
 };
 
-struct GlobalMemoryBase {
-    u32 cbuf_index{};
-    u32 cbuf_offset{};
-
-    bool operator<(const GlobalMemoryBase& rhs) const {
-        return std::tie(cbuf_index, cbuf_offset) < std::tie(rhs.cbuf_index, rhs.cbuf_offset);
-    }
-};
-
 struct GlobalMemoryUsage {
     bool is_read{};
     bool is_written{};
 };
 
-struct MetaArithmetic {
-    bool precise{};
-};
-
-struct MetaTexture {
-    const Sampler& sampler;
-    Node array{};
-    Node depth_compare{};
-    std::vector<Node> aoffi;
-    Node bias{};
-    Node lod{};
-    Node component{};
-    u32 element{};
-};
-
-constexpr MetaArithmetic PRECISE = {true};
-constexpr MetaArithmetic NO_PRECISE = {false};
-
-using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>;
-
-/// Holds any kind of operation that can be done in the IR
-class OperationNode final {
-public:
-    explicit OperationNode(OperationCode code) : code{code} {}
-
-    explicit OperationNode(OperationCode code, Meta&& meta) : code{code}, meta{std::move(meta)} {}
-
-    template <typename... T>
-    explicit OperationNode(OperationCode code, const T*... operands)
-        : OperationNode(code, {}, operands...) {}
-
-    template <typename... T>
-    explicit OperationNode(OperationCode code, Meta&& meta, const T*... operands_)
-        : code{code}, meta{std::move(meta)}, operands{operands_...} {}
-
-    explicit OperationNode(OperationCode code, Meta&& meta, std::vector<Node>&& operands)
-        : code{code}, meta{meta}, operands{std::move(operands)} {}
-
-    explicit OperationNode(OperationCode code, std::vector<Node>&& operands)
-        : code{code}, operands{std::move(operands)} {}
-
-    OperationCode GetCode() const {
-        return code;
-    }
-
-    const Meta& GetMeta() const {
-        return meta;
-    }
-
-    std::size_t GetOperandsCount() const {
-        return operands.size();
-    }
-
-    Node operator[](std::size_t operand_index) const {
-        return operands.at(operand_index);
-    }
-
-private:
-    const OperationCode code;
-    const Meta meta;
-    std::vector<Node> operands;
-};
-
-/// Encloses inside any kind of node that returns a boolean conditionally-executed code
-class ConditionalNode final {
-public:
-    explicit ConditionalNode(Node condition, std::vector<Node>&& code)
-        : condition{condition}, code{std::move(code)} {}
-
-    Node GetCondition() const {
-        return condition;
-    }
-
-    const std::vector<Node>& GetCode() const {
-        return code;
-    }
-
-private:
-    const Node condition;   ///< Condition to be satisfied
-    std::vector<Node> code; ///< Code to execute
-};
-
-/// A general purpose register
-class GprNode final {
-public:
-    explicit constexpr GprNode(Tegra::Shader::Register index) : index{index} {}
-
-    u32 GetIndex() const {
-        return static_cast<u32>(index);
-    }
-
-private:
-    const Tegra::Shader::Register index;
-};
-
-/// A 32-bits value that represents an immediate value
-class ImmediateNode final {
-public:
-    explicit constexpr ImmediateNode(u32 value) : value{value} {}
-
-    u32 GetValue() const {
-        return value;
-    }
-
-private:
-    const u32 value;
-};
-
-/// One of Maxwell's internal flags
-class InternalFlagNode final {
-public:
-    explicit constexpr InternalFlagNode(InternalFlag flag) : flag{flag} {}
-
-    InternalFlag GetFlag() const {
-        return flag;
-    }
-
-private:
-    const InternalFlag flag;
-};
-
-/// A predicate register, it can be negated without additional nodes
-class PredicateNode final {
-public:
-    explicit constexpr PredicateNode(Tegra::Shader::Pred index, bool negated)
-        : index{index}, negated{negated} {}
-
-    Tegra::Shader::Pred GetIndex() const {
-        return index;
-    }
-
-    bool IsNegated() const {
-        return negated;
-    }
-
-private:
-    const Tegra::Shader::Pred index;
-    const bool negated;
-};
-
-/// Attribute buffer memory (known as attributes or varyings in GLSL terms)
-class AbufNode final {
-public:
-    // Initialize for standard attributes (index is explicit).
-    explicit constexpr AbufNode(Tegra::Shader::Attribute::Index index, u32 element,
-                                Node buffer = {})
-        : buffer{buffer}, index{index}, element{element} {}
-
-    // Initialize for physical attributes (index is a variable value).
-    explicit constexpr AbufNode(Node physical_address, Node buffer = {})
-        : physical_address{physical_address}, buffer{buffer} {}
-
-    Tegra::Shader::Attribute::Index GetIndex() const {
-        return index;
-    }
-
-    u32 GetElement() const {
-        return element;
-    }
-
-    Node GetBuffer() const {
-        return buffer;
-    }
-
-    bool IsPhysicalBuffer() const {
-        return physical_address != nullptr;
-    }
-
-    Node GetPhysicalAddress() const {
-        return physical_address;
-    }
-
-private:
-    Node physical_address{};
-    Node buffer{};
-    Tegra::Shader::Attribute::Index index{};
-    u32 element{};
-};
-
-/// Constant buffer node, usually mapped to uniform buffers in GLSL
-class CbufNode final {
-public:
-    explicit constexpr CbufNode(u32 index, Node offset) : index{index}, offset{offset} {}
-
-    u32 GetIndex() const {
-        return index;
-    }
-
-    Node GetOffset() const {
-        return offset;
-    }
-
-private:
-    const u32 index;
-    const Node offset;
-};
-
-/// Local memory node
-class LmemNode final {
-public:
-    explicit constexpr LmemNode(Node address) : address{address} {}
-
-    Node GetAddress() const {
-        return address;
-    }
-
-private:
-    const Node address;
-};
-
-/// Global memory node
-class GmemNode final {
-public:
-    explicit constexpr GmemNode(Node real_address, Node base_address,
-                                const GlobalMemoryBase& descriptor)
-        : real_address{real_address}, base_address{base_address}, descriptor{descriptor} {}
-
-    Node GetRealAddress() const {
-        return real_address;
-    }
-
-    Node GetBaseAddress() const {
-        return base_address;
-    }
-
-    const GlobalMemoryBase& GetDescriptor() const {
-        return descriptor;
-    }
-
-private:
-    const Node real_address;
-    const Node base_address;
-    const GlobalMemoryBase descriptor;
-};
-
-/// Commentary, can be dropped
-class CommentNode final {
-public:
-    explicit CommentNode(std::string text) : text{std::move(text)} {}
-
-    const std::string& GetText() const {
-        return text;
-    }
-
-private:
-    std::string text;
-};
-
 class ShaderIR final {
 public:
     explicit ShaderIR(const ProgramCode& program_code, u32 main_offset);
@@ -663,26 +167,6 @@ private:
     u32 DecodeXmad(NodeBlock& bb, u32 pc);
     u32 DecodeOther(NodeBlock& bb, u32 pc);
 
-    /// Internalizes node's data and returns a managed pointer to a clone of that node
-    Node StoreNode(NodeData&& node_data);
-
-    /// Creates a conditional node
-    Node Conditional(Node condition, std::vector<Node>&& code);
-    /// Creates a commentary
-    Node Comment(std::string text);
-    /// Creates an u32 immediate
-    Node Immediate(u32 value);
-    /// Creates a s32 immediate
-    Node Immediate(s32 value) {
-        return Immediate(static_cast<u32>(value));
-    }
-    /// Creates a f32 immediate
-    Node Immediate(f32 value) {
-        u32 integral;
-        std::memcpy(&integral, &value, sizeof(u32));
-        return Immediate(integral);
-    }
-
     /// Generates a node for a passed register.
     Node GetRegister(Tegra::Shader::Register reg);
     /// Generates a node representing a 19-bit immediate value
@@ -827,37 +311,6 @@ private:
     std::tuple<Node, Node, GlobalMemoryBase> TrackAndGetGlobalMemory(
         NodeBlock& bb, Tegra::Shader::Instruction instr, bool is_write);
 
-    template <typename... T>
-    Node Operation(OperationCode code, const T*... operands) {
-        return StoreNode(OperationNode(code, operands...));
-    }
-
-    template <typename... T>
-    Node Operation(OperationCode code, Meta&& meta, const T*... operands) {
-        return StoreNode(OperationNode(code, std::move(meta), operands...));
-    }
-
-    Node Operation(OperationCode code, std::vector<Node>&& operands) {
-        return StoreNode(OperationNode(code, std::move(operands)));
-    }
-
-    Node Operation(OperationCode code, Meta&& meta, std::vector<Node>&& operands) {
-        return StoreNode(OperationNode(code, std::move(meta), std::move(operands)));
-    }
-
-    template <typename... T>
-    Node SignedOperation(OperationCode code, bool is_signed, const T*... operands) {
-        return StoreNode(OperationNode(SignedToUnsignedCode(code, is_signed), operands...));
-    }
-
-    template <typename... T>
-    Node SignedOperation(OperationCode code, bool is_signed, Meta&& meta, const T*... operands) {
-        return StoreNode(
-            OperationNode(SignedToUnsignedCode(code, is_signed), std::move(meta), operands...));
-    }
-
-    static OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed);
-
     const ProgramCode& program_code;
     const u32 main_offset;
 
@@ -868,8 +321,6 @@ private:
     std::map<u32, NodeBlock> basic_blocks;
     NodeBlock global_code;
 
-    std::vector<std::unique_ptr<NodeData>> stored_nodes;
-
     std::set<u32> used_registers;
     std::set<Tegra::Shader::Pred> used_predicates;
     std::set<Tegra::Shader::Attribute::Index> used_input_attributes;
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index 19ede1eb9..fc957d980 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -16,12 +16,12 @@ std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
                                    OperationCode operation_code) {
     for (; cursor >= 0; --cursor) {
         const Node node = code.at(cursor);
-        if (const auto operation = std::get_if<OperationNode>(node)) {
+        if (const auto operation = std::get_if<OperationNode>(&*node)) {
             if (operation->GetCode() == operation_code) {
                 return {node, cursor};
             }
         }
-        if (const auto conditional = std::get_if<ConditionalNode>(node)) {
+        if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
             const auto& conditional_code = conditional->GetCode();
             const auto [found, internal_cursor] = FindOperation(
                 conditional_code, static_cast<s64>(conditional_code.size() - 1), operation_code);
@@ -35,11 +35,11 @@ std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
 } // namespace
 
 Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const {
-    if (const auto cbuf = std::get_if<CbufNode>(tracked)) {
+    if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
         // Cbuf found, but it has to be immediate
         return std::holds_alternative<ImmediateNode>(*cbuf->GetOffset()) ? tracked : nullptr;
     }
-    if (const auto gpr = std::get_if<GprNode>(tracked)) {
+    if (const auto gpr = std::get_if<GprNode>(&*tracked)) {
         if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) {
             return nullptr;
         }
@@ -51,7 +51,7 @@ Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const
         }
         return TrackCbuf(source, code, new_cursor);
     }
-    if (const auto operation = std::get_if<OperationNode>(tracked)) {
+    if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
         for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) {
             if (const auto found = TrackCbuf((*operation)[i], code, cursor)) {
                 // Cbuf found in operand
@@ -60,7 +60,7 @@ Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const
         }
         return nullptr;
     }
-    if (const auto conditional = std::get_if<ConditionalNode>(tracked)) {
+    if (const auto conditional = std::get_if<ConditionalNode>(&*tracked)) {
         const auto& conditional_code = conditional->GetCode();
         return TrackCbuf(tracked, conditional_code, static_cast<s64>(conditional_code.size()));
     }
@@ -75,7 +75,7 @@ std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code,
     if (!found) {
         return {};
     }
-    if (const auto immediate = std::get_if<ImmediateNode>(found)) {
+    if (const auto immediate = std::get_if<ImmediateNode>(&*found)) {
         return immediate->GetValue();
     }
     return {};
@@ -88,11 +88,11 @@ std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const NodeB
         if (!found_node) {
             return {};
         }
-        const auto operation = std::get_if<OperationNode>(found_node);
+        const auto operation = std::get_if<OperationNode>(&*found_node);
         ASSERT(operation);
 
         const auto& target = (*operation)[0];
-        if (const auto gpr_target = std::get_if<GprNode>(target)) {
+        if (const auto gpr_target = std::get_if<GprNode>(&*target)) {
             if (gpr_target->GetIndex() == tracked->GetIndex()) {
                 return {(*operation)[1], new_cursor};
             }
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index bea0d5bc2..219bfd559 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -251,7 +251,7 @@ enum class WrapMode : u32 {
     Mirror = 1,
     ClampToEdge = 2,
     Border = 3,
-    ClampOGL = 4,
+    Clamp = 4,
     MirrorOnceClampToEdge = 5,
     MirrorOnceBorder = 6,
     MirrorOnceClampOGL = 7,