21 files changed, 2917 insertions, 478 deletions
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
new file mode 100644
index 000000000..eb5158407
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
@@ -0,0 +1,2073 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <variant>
+
+#include <fmt/format.h>
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
+#include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/shader/registry.h"
+#include "video_core/shader/shader_ir.h"
+
+// Predicates in the decompiled code follow the convention that -1 means true and 0 means false.
+// GLASM lacks booleans, so they have to be implemented as integers.
+// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to
+// select between two values, because -1 will be evaluated as true and 0 as false.
+
+namespace OpenGL {
+
+namespace {
+
+using Tegra::Engines::ShaderType;
+using Tegra::Shader::Attribute;
+using Tegra::Shader::PixelImap;
+using Tegra::Shader::Register;
+using namespace VideoCommon::Shader;
+using Operation = const OperationNode&;
+
+constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"};
+
+char Swizzle(std::size_t component) { // Lane index 0..3 -> 'x', 'y', 'z', 'w'
+    ASSERT(component < 4);
+    return "xyzw"[component];
+}
+
+constexpr bool IsGenericAttribute(Attribute::Index index) { // True for Attribute_0..Attribute_31
+    return !(index < Attribute::Index::Attribute_0 || index > Attribute::Index::Attribute_31);
+}
+
+u32 GetGenericAttributeIndex(Attribute::Index index) { // Offset of index from Attribute_0
+    ASSERT(IsGenericAttribute(index)); // Only generic attributes have such an offset
+    return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
+}
+
+// Returns the opcode suffix requested by the operation's metadata: ".PREC" when it carries
+// arithmetic metadata flagged as precise, and an empty suffix otherwise.
+std::string_view Modifiers(Operation operation) {
+    const auto* const arithmetic = std::get_if<MetaArithmetic>(&operation.GetMeta());
+    const bool is_precise = arithmetic != nullptr && arithmetic->precise;
+    return is_precise ? ".PREC" : "";
+}
+
+std::string_view GetInputFlags(PixelImap attribute) { // Qualifier text for an input mapping
+    switch (attribute) {
+    case PixelImap::Perspective:
+        return ""; // No qualifier is emitted for this mode
+    case PixelImap::Constant:
+        return "FLAT "; // Returned qualifiers carry their own trailing space
+    case PixelImap::ScreenLinear:
+        return "NOPERSPECTIVE ";
+    case PixelImap::Unused:
+        break; // Handled by the unimplemented report below
+    }
+    UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute));
+    return {}; // Empty view after reporting
+}
+
+std::string_view ImageType(Tegra::Shader::ImageType image_type) { // Image target name
+    switch (image_type) {
+    case Tegra::Shader::ImageType::Texture1D:
+        return "1D";
+    case Tegra::Shader::ImageType::TextureBuffer:
+        return "BUFFER";
+    case Tegra::Shader::ImageType::Texture1DArray:
+        return "ARRAY1D"; // Array targets put ARRAY before the dimensionality
+    case Tegra::Shader::ImageType::Texture2D:
+        return "2D";
+    case Tegra::Shader::ImageType::Texture2DArray:
+        return "ARRAY2D";
+    case Tegra::Shader::ImageType::Texture3D:
+        return "3D";
+    }
+    UNREACHABLE(); // Every case listed above returns
+    return {}; // Empty view returned after the UNREACHABLE report
+}
+
+std::string_view StackName(MetaStackClass stack) { // "SSY" or "PBK" for the flow stack class
+    switch (stack) {
+    case MetaStackClass::Ssy:
+        return "SSY";
+    case MetaStackClass::Pbk:
+        return "PBK";
+    }
+    UNREACHABLE(); // Both enumerators handled above return
+    return "";
+}
+
+std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) {
+    switch (topology) { // Collapses each draw topology to its base primitive keyword
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points:
+        return "POINTS";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip:
+        return "LINES"; // Lists and strips share one keyword
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency:
+        return "LINES_ADJACENCY";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan:
+        return "TRIANGLES"; // Lists, strips and fans share one keyword
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
+        return "TRIANGLES_ADJACENCY";
+    default:
+        UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
+        return "POINTS"; // Unhandled topologies are reported and treated as points
+    }
+}
+
+std::string_view TopologyName(Tegra::Shader::OutputTopology topology) { // PRIMITIVE_OUT name
+    switch (topology) {
+    case Tegra::Shader::OutputTopology::PointList:
+        return "POINTS";
+    case Tegra::Shader::OutputTopology::LineStrip:
+        return "LINE_STRIP";
+    case Tegra::Shader::OutputTopology::TriangleStrip:
+        return "TRIANGLE_STRIP";
+    default:
+        UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology));
+        return "POINTS"; // Fall back to the same upper-case keyword the PointList case uses
+    }
+}
+
+std::string_view StageInputName(ShaderType stage) { // Per-stage prefix of built-in inputs
+    switch (stage) {
+    case ShaderType::Vertex:
+    case ShaderType::Geometry:
+        return "vertex"; // Vertex and geometry stages share this prefix
+    case ShaderType::Fragment:
+        return "fragment";
+    case ShaderType::Compute:
+        return "invocation";
+    default:
+        UNREACHABLE(); // Any other stage value is not expected here
+        return "";
+    }
+}
+
+// Builds the texture target string (e.g. "SHADOWARRAY2D" or "BUFFER") from sampler metadata.
+std::string TextureType(const MetaTexture& meta) {
+    const auto& sampler = meta.sampler;
+    if (sampler.is_buffer) {
+        return "BUFFER";
+    }
+    const auto dimension = [&sampler]() -> std::string_view {
+        switch (sampler.type) {
+        case Tegra::Shader::TextureType::Texture1D:
+            return "1D";
+        case Tegra::Shader::TextureType::Texture2D:
+            return "2D";
+        case Tegra::Shader::TextureType::Texture3D:
+            return "3D";
+        case Tegra::Shader::TextureType::TextureCube:
+            return "CUBE";
+        }
+        UNREACHABLE();
+        return "2D";
+    }();
+    std::string type = sampler.is_shadow ? "SHADOW" : "";
+    if (sampler.is_array) {
+        type += "ARRAY";
+    }
+    type += dimension;
+    return type;
+}
+
+std::string GlobalMemoryName(const GlobalMemoryBase& base) { // "gmem<cbuf_index>_<cbuf_offset>"
+    return fmt::format("gmem{}_{}", base.cbuf_index, base.cbuf_offset);
+}
+
+class ARBDecompiler final {
+public:
+    explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                           ShaderType stage, std::string_view identifier);
+
+    std::string Code() const { // Returns a copy of the assembled shader text
+        return shader_source;
+    }
+
+private:
+    void DeclareHeader();
+    void DeclareVertex();
+    void DeclareGeometry();
+    void DeclareFragment();
+    void DeclareCompute();
+    void DeclareInputAttributes();
+    void DeclareOutputAttributes();
+    void DeclareLocalMemory();
+    void DeclareGlobalMemory();
+    void DeclareConstantBuffers();
+    void DeclareRegisters();
+    void DeclareTemporaries();
+    void DeclarePredicates();
+    void DeclareInternalFlags();
+
+    void InitializeVariables();
+
+    void DecompileAST();
+    void DecompileBranchMode();
+
+    void VisitAST(const ASTNode& node);
+    std::string VisitExpression(const Expr& node);
+
+    void VisitBlock(const NodeBlock& bb);
+
+    std::string Visit(const Node& node);
+
+    std::pair<std::string, std::size_t> BuildCoords(Operation);
+    std::string BuildAoffi(Operation);
+    void Exit();
+
+    std::string Assign(Operation);
+    std::string Select(Operation);
+    std::string FClamp(Operation);
+    std::string FCastHalf0(Operation);
+    std::string FCastHalf1(Operation);
+    std::string FSqrt(Operation);
+    std::string FSwizzleAdd(Operation);
+    std::string HAdd2(Operation);
+    std::string HMul2(Operation);
+    std::string HFma2(Operation);
+    std::string HAbsolute(Operation);
+    std::string HNegate(Operation);
+    std::string HClamp(Operation);
+    std::string HCastFloat(Operation);
+    std::string HUnpack(Operation);
+    std::string HMergeF32(Operation);
+    std::string HMergeH0(Operation);
+    std::string HMergeH1(Operation);
+    std::string HPack2(Operation);
+    std::string LogicalAssign(Operation);
+    std::string LogicalPick2(Operation);
+    std::string LogicalAnd2(Operation);
+    std::string FloatOrdered(Operation);
+    std::string FloatUnordered(Operation);
+    std::string LogicalAddCarry(Operation);
+    std::string Texture(Operation);
+    std::string TextureGather(Operation);
+    std::string TextureQueryDimensions(Operation);
+    std::string TextureQueryLod(Operation);
+    std::string TexelFetch(Operation);
+    std::string TextureGradient(Operation);
+    std::string ImageLoad(Operation);
+    std::string ImageStore(Operation);
+    std::string Branch(Operation);
+    std::string BranchIndirect(Operation);
+    std::string PushFlowStack(Operation);
+    std::string PopFlowStack(Operation);
+    std::string Exit(Operation);
+    std::string Discard(Operation);
+    std::string EmitVertex(Operation);
+    std::string EndPrimitive(Operation);
+    std::string InvocationId(Operation);
+    std::string YNegate(Operation);
+    std::string ThreadId(Operation);
+    std::string ShuffleIndexed(Operation);
+    std::string Barrier(Operation);
+    std::string MemoryBarrierGroup(Operation);
+    std::string MemoryBarrierGlobal(Operation);
+
+    template <const std::string_view& op>
+    std::string Unary(Operation operation) { // Emits "<op><modifiers> dst, src;", returns dst
+        std::string temporary = AllocTemporary();
+        AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]));
+        return temporary;
+    }
+
+    template <const std::string_view& op>
+    std::string Binary(Operation operation) { // Emits "<op><modifiers> dst, a, b;", returns dst
+        std::string temporary = AllocTemporary();
+        // NOTE(review): both Visit calls are call arguments, so their relative order is unspecified
+        AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
+                Visit(operation[1]));
+        return temporary;
+    }
+
+    template <const std::string_view& op>
+    std::string Trinary(Operation operation) { // Emits "<op><modifiers> dst, a, b, c;"
+        std::string temporary = AllocTemporary();
+        // NOTE(review): the Visit calls are call arguments, so their relative order is unspecified
+        AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
+                Visit(operation[1]), Visit(operation[2]));
+        return temporary;
+    }
+
+    template <const std::string_view& op, bool unordered>
+    std::string FloatComparison(Operation operation) {
+        // Predicate result follows the file convention: -1 is true and 0 is false.
+        std::string result = AllocTemporary();
+        AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation));
+        AddLine("MOV.S {}, 0;", result);
+        AddLine("MOV.S {} (NE.x), -1;", result);
+
+        const std::string lhs = Visit(operation[0]);
+        const std::string rhs = Visit(operation[1]);
+        // Emits a self-inequality test on an operand and overwrites the result when it holds.
+        const auto override_if_self_unequal = [&](const std::string& operand, int value) {
+            AddLine("SNE.F RC.x, {}, {};", operand, operand);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), {};", result, value);
+        };
+        // Unordered comparisons force -1 for such operands; the ordered SNE case forces 0.
+        if constexpr (unordered) {
+            override_if_self_unequal(lhs, -1);
+            override_if_self_unequal(rhs, -1);
+        } else if (op == SNE_F) {
+            override_if_self_unequal(lhs, 0);
+            override_if_self_unequal(rhs, 0);
+        }
+        return result;
+    }
+
+    template <const std::string_view& op, bool is_nan>
+    std::string HalfComparison(Operation operation) { // Two-lane compare; result in tmp1.xy
+        std::string tmp1 = AllocVectorTemporary();
+        const std::string tmp2 = AllocVectorTemporary();
+        const std::string op_a = Visit(operation[0]);
+        const std::string op_b = Visit(operation[1]);
+        AddLine("UP2H.F {}, {};", tmp1, op_a);
+        AddLine("UP2H.F {}, {};", tmp2, op_b);
+        AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2); // Comparison result overwrites tmp1
+        AddLine("TRUNC.U.CC RC.xy, {};", tmp1);
+        AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1); // Start both lanes at 0 (false)
+        AddLine("MOV.S {}.x (NE.x), -1;", tmp1); // -1 (true) per lane under the NE test
+        AddLine("MOV.S {}.y (NE.y), -1;", tmp1);
+        if constexpr (is_nan) { // Extra lines that also write -1 under a NAN test
+            AddLine("MOVC.F RC.x, {};", op_a);
+            AddLine("MOV.S {}.x (NAN.x), -1;", tmp1);
+            AddLine("MOVC.F RC.x, {};", op_b);
+            AddLine("MOV.S {}.y (NAN.x), -1;", tmp1);
+        }
+        return tmp1;
+    }
+
+    template <const std::string_view& op, const std::string_view& type>
+    std::string AtomicImage(Operation operation) {
+        // Packs the operands into a coordinate vector and the metadata values into a value
+        // vector, then emits one ATOMIM line whose first operand is coord.x (also returned).
+        const auto& image_meta = std::get<MetaImage>(operation.GetMeta());
+        const u32 binding = device.GetBaseBindings(stage).image + image_meta.image.index;
+
+        const std::string coord = AllocVectorTemporary();
+        const std::string value = AllocVectorTemporary();
+        for (std::size_t lane = 0; lane < operation.GetOperandsCount(); ++lane) {
+            AddLine("MOV.S {}.{}, {};", coord, Swizzle(lane), Visit(operation[lane]));
+        }
+        for (std::size_t lane = 0; lane < image_meta.values.size(); ++lane) {
+            AddLine("MOV.F {}.{}, {};", value, Swizzle(lane), Visit(image_meta.values[lane]));
+        }
+
+        AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, coord, value, coord,
+                binding, ImageType(image_meta.image.type));
+        return fmt::format("{}.x", coord);
+    }
+
+    template <const std::string_view& op, const std::string_view& type>
+    std::string Atomic(Operation operation) { // Emits one atomic on global or shared memory
+        std::string temporary = AllocTemporary();
+        std::string address;
+        std::string_view opname;
+        if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) {
+            AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+                    Visit(gmem->GetBaseAddress())); // Operands: real address, then base address
+            address = fmt::format("{}[{}]", GlobalMemoryName(gmem->GetDescriptor()), temporary);
+            opname = "ATOMB";
+        } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) {
+            address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress()));
+            opname = "ATOMS";
+        } else {
+            UNREACHABLE(); // Operand 0 is expected to be a GmemNode or SmemNode
+            return "{0, 0, 0, 0}";
+        }
+        AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address);
+        return temporary; // Same temporary that the GmemNode path used in the address
+    }
+
+    template <char type>
+    std::string Negate(Operation operation) {
+        // The 'F' instantiation emits the MOV with an F32 suffix; every other instantiation
+        // uses its one-letter type as the suffix.
+        std::string result = AllocTemporary();
+        const std::string suffix = type == 'F' ? std::string{"F32"} : std::string(1, type);
+
+        AddLine("MOV.{} {}, -{};", suffix, result, Visit(operation[0]));
+        return result;
+    }
+
+    template <char type>
+    std::string Absolute(Operation operation) { // Emits "MOV.<type> dst, |src|;", returns dst
+        std::string temporary = AllocTemporary();
+        AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0]));
+        return temporary;
+    }
+
+    template <char type>
+    std::string BitfieldInsert(Operation operation) { // Emits BFI; result is temporary.x
+        const std::string temporary = AllocVectorTemporary();
+        AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[3])); // operand 3 -> .x
+        AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[2])); // operand 2 -> .y
+        AddLine("BFI.{} {}.x, {}, {}, {};", type, temporary, temporary, Visit(operation[1]),
+                Visit(operation[0])); // BFI sources: packed vector, operand 1, operand 0
+        return fmt::format("{}.x", temporary);
+    }
+
+    template <char type>
+    std::string BitfieldExtract(Operation operation) { // Emits BFE; result is temporary.x
+        const std::string temporary = AllocVectorTemporary();
+        AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[2])); // operand 2 -> .x
+        AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[1])); // operand 1 -> .y
+        AddLine("BFE.{} {}.x, {}, {};", type, temporary, temporary, Visit(operation[0]));
+        return fmt::format("{}.x", temporary);
+    }
+
+    template <char swizzle>
+    std::string LocalInvocationId(Operation) { // One lane of invocation.localid
+        return std::string{"invocation.localid."} + swizzle;
+    }
+
+    template <char swizzle>
+    std::string WorkGroupId(Operation) { // One lane of invocation.groupid
+        return std::string{"invocation.groupid."} + swizzle;
+    }
+
+    template <char c1, char c2>
+    std::string ThreadMask(Operation) { // "<stage input>.thread<c1><c2>mask", e.g. ...threadeqmask
+        return fmt::format("{}.thread{}{}mask", StageInputName(stage), c1, c2);
+    }
+
+    template <typename... Args>
+    void AddExpression(std::string_view text, Args&&... args) { // Appends formatted text as-is
+        shader_source.append(fmt::format(text, std::forward<Args>(args)...));
+    }
+
+    template <typename... Args>
+    void AddLine(std::string_view text, Args&&... args) { // Formatted text plus a newline
+        AddExpression(text, std::forward<Args>(args)...);
+        shader_source.push_back('\n');
+    }
+
+    // Allocates the next temporary and returns its .x lane as a scalar operand name.
+    std::string AllocTemporary() {
+        return AllocVectorTemporary() + ".x";
+    }
+
+    std::string AllocVectorTemporary() { // Hands out the next T<n> name and tracks the peak
+        max_temporaries = std::max(max_temporaries, ++num_temporaries);
+        return fmt::format("T{}", num_temporaries - 1);
+    }
+
+    void ResetTemporaries() noexcept { // Restarts numbering at T0; max_temporaries is kept
+        num_temporaries = 0;
+    }
+
+    const Device& device;
+    const ShaderIR& ir;
+    const Registry& registry;
+    const ShaderType stage;
+
+    std::size_t num_temporaries = 0;
+    std::size_t max_temporaries = 0;
+
+    std::string shader_source;
+
+    static constexpr std::string_view ADD_F32 = "ADD.F32";
+    static constexpr std::string_view ADD_S = "ADD.S";
+    static constexpr std::string_view ADD_U = "ADD.U";
+    static constexpr std::string_view MUL_F32 = "MUL.F32";
+    static constexpr std::string_view MUL_S = "MUL.S";
+    static constexpr std::string_view MUL_U = "MUL.U";
+    static constexpr std::string_view DIV_F32 = "DIV.F32";
+    static constexpr std::string_view DIV_S = "DIV.S";
+    static constexpr std::string_view DIV_U = "DIV.U";
+    static constexpr std::string_view MAD_F32 = "MAD.F32";
+    static constexpr std::string_view RSQ_F32 = "RSQ.F32";
+    static constexpr std::string_view COS_F32 = "COS.F32";
+    static constexpr std::string_view SIN_F32 = "SIN.F32";
+    static constexpr std::string_view EX2_F32 = "EX2.F32";
+    static constexpr std::string_view LG2_F32 = "LG2.F32";
+    static constexpr std::string_view SLT_F = "SLT.F32";
+    static constexpr std::string_view SLT_S = "SLT.S";
+    static constexpr std::string_view SLT_U = "SLT.U";
+    static constexpr std::string_view SEQ_F = "SEQ.F32";
+    static constexpr std::string_view SEQ_S = "SEQ.S";
+    static constexpr std::string_view SEQ_U = "SEQ.U";
+    static constexpr std::string_view SLE_F = "SLE.F32";
+    static constexpr std::string_view SLE_S = "SLE.S";
+    static constexpr std::string_view SLE_U = "SLE.U";
+    static constexpr std::string_view SGT_F = "SGT.F32";
+    static constexpr std::string_view SGT_S = "SGT.S";
+    static constexpr std::string_view SGT_U = "SGT.U";
+    static constexpr std::string_view SNE_F = "SNE.F32";
+    static constexpr std::string_view SNE_S = "SNE.S";
+    static constexpr std::string_view SNE_U = "SNE.U";
+    static constexpr std::string_view SGE_F = "SGE.F32";
+    static constexpr std::string_view SGE_S = "SGE.S";
+    static constexpr std::string_view SGE_U = "SGE.U";
+    static constexpr std::string_view AND_S = "AND.S";
+    static constexpr std::string_view AND_U = "AND.U";
+    static constexpr std::string_view TRUNC_F = "TRUNC.F";
+    static constexpr std::string_view TRUNC_S = "TRUNC.S";
+    static constexpr std::string_view TRUNC_U = "TRUNC.U";
+    static constexpr std::string_view SHL_S = "SHL.S";
+    static constexpr std::string_view SHL_U = "SHL.U";
+    static constexpr std::string_view SHR_S = "SHR.S";
+    static constexpr std::string_view SHR_U = "SHR.U";
+    static constexpr std::string_view OR_S = "OR.S";
+    static constexpr std::string_view OR_U = "OR.U";
+    static constexpr std::string_view XOR_S = "XOR.S";
+    static constexpr std::string_view XOR_U = "XOR.U";
+    static constexpr std::string_view NOT_S = "NOT.S";
+    static constexpr std::string_view NOT_U = "NOT.U";
+    static constexpr std::string_view BTC_S = "BTC.S";
+    static constexpr std::string_view BTC_U = "BTC.U";
+    static constexpr std::string_view BTFM_S = "BTFM.S";
+    static constexpr std::string_view BTFM_U = "BTFM.U";
+    static constexpr std::string_view ROUND_F = "ROUND.F";
+    static constexpr std::string_view CEIL_F = "CEIL.F";
+    static constexpr std::string_view FLR_F = "FLR.F";
+    static constexpr std::string_view I2F_S = "I2F.S";
+    static constexpr std::string_view I2F_U = "I2F.U";
+    static constexpr std::string_view MIN_F = "MIN.F";
+    static constexpr std::string_view MIN_S = "MIN.S";
+    static constexpr std::string_view MIN_U = "MIN.U";
+    static constexpr std::string_view MAX_F = "MAX.F";
+    static constexpr std::string_view MAX_S = "MAX.S";
+    static constexpr std::string_view MAX_U = "MAX.U";
+    static constexpr std::string_view MOV_U = "MOV.U";
+    static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U";
+    static constexpr std::string_view TGALL_U = "TGALL.U";
+    static constexpr std::string_view TGANY_U = "TGANY.U";
+    static constexpr std::string_view TGEQ_U = "TGEQ.U";
+    static constexpr std::string_view EXCH = "EXCH";
+    static constexpr std::string_view ADD = "ADD";
+    static constexpr std::string_view MIN = "MIN";
+    static constexpr std::string_view MAX = "MAX";
+    static constexpr std::string_view AND = "AND";
+    static constexpr std::string_view OR = "OR";
+    static constexpr std::string_view XOR = "XOR";
+    static constexpr std::string_view U32 = "U32";
+    static constexpr std::string_view S32 = "S32";
+
+    static constexpr std::size_t NUM_ENTRIES = static_cast<std::size_t>(OperationCode::Amount);
+    using DecompilerType = std::string (ARBDecompiler::*)(Operation);
+    static constexpr std::array<DecompilerType, NUM_ENTRIES> OPERATION_DECOMPILERS = {
+        &ARBDecompiler::Assign,
+
+        &ARBDecompiler::Select,
+
+        &ARBDecompiler::Binary<ADD_F32>,
+        &ARBDecompiler::Binary<MUL_F32>,
+        &ARBDecompiler::Binary<DIV_F32>,
+        &ARBDecompiler::Trinary<MAD_F32>,
+        &ARBDecompiler::Negate<'F'>,
+        &ARBDecompiler::Absolute<'F'>,
+        &ARBDecompiler::FClamp,
+        &ARBDecompiler::FCastHalf0,
+        &ARBDecompiler::FCastHalf1,
+        &ARBDecompiler::Binary<MIN_F>,
+        &ARBDecompiler::Binary<MAX_F>,
+        &ARBDecompiler::Unary<COS_F32>,
+        &ARBDecompiler::Unary<SIN_F32>,
+        &ARBDecompiler::Unary<EX2_F32>,
+        &ARBDecompiler::Unary<LG2_F32>,
+        &ARBDecompiler::Unary<RSQ_F32>,
+        &ARBDecompiler::FSqrt,
+        &ARBDecompiler::Unary<ROUND_F>,
+        &ARBDecompiler::Unary<FLR_F>,
+        &ARBDecompiler::Unary<CEIL_F>,
+        &ARBDecompiler::Unary<TRUNC_F>,
+        &ARBDecompiler::Unary<I2F_S>,
+        &ARBDecompiler::Unary<I2F_U>,
+        &ARBDecompiler::FSwizzleAdd,
+
+        &ARBDecompiler::Binary<ADD_S>,
+        &ARBDecompiler::Binary<MUL_S>,
+        &ARBDecompiler::Binary<DIV_S>,
+        &ARBDecompiler::Negate<'S'>,
+        &ARBDecompiler::Absolute<'S'>,
+        &ARBDecompiler::Binary<MIN_S>,
+        &ARBDecompiler::Binary<MAX_S>,
+
+        &ARBDecompiler::Unary<TRUNC_S>,
+        &ARBDecompiler::Unary<MOV_U>,
+        &ARBDecompiler::Binary<SHL_S>,
+        &ARBDecompiler::Binary<SHR_U>,
+        &ARBDecompiler::Binary<SHR_S>,
+        &ARBDecompiler::Binary<AND_S>,
+        &ARBDecompiler::Binary<OR_S>,
+        &ARBDecompiler::Binary<XOR_S>,
+        &ARBDecompiler::Unary<NOT_S>,
+        &ARBDecompiler::BitfieldInsert<'S'>,
+        &ARBDecompiler::BitfieldExtract<'S'>,
+        &ARBDecompiler::Unary<BTC_S>,
+        &ARBDecompiler::Unary<BTFM_S>,
+
+        &ARBDecompiler::Binary<ADD_U>,
+        &ARBDecompiler::Binary<MUL_U>,
+        &ARBDecompiler::Binary<DIV_U>,
+        &ARBDecompiler::Binary<MIN_U>,
+        &ARBDecompiler::Binary<MAX_U>,
+        &ARBDecompiler::Unary<TRUNC_U>,
+        &ARBDecompiler::Unary<MOV_U>,
+        &ARBDecompiler::Binary<SHL_U>,
+        &ARBDecompiler::Binary<SHR_U>,
+        &ARBDecompiler::Binary<SHR_U>,
+        &ARBDecompiler::Binary<AND_U>,
+        &ARBDecompiler::Binary<OR_U>,
+        &ARBDecompiler::Binary<XOR_U>,
+        &ARBDecompiler::Unary<NOT_U>,
+        &ARBDecompiler::BitfieldInsert<'U'>,
+        &ARBDecompiler::BitfieldExtract<'U'>,
+        &ARBDecompiler::Unary<BTC_U>,
+        &ARBDecompiler::Unary<BTFM_U>,
+
+        &ARBDecompiler::HAdd2,
+        &ARBDecompiler::HMul2,
+        &ARBDecompiler::HFma2,
+        &ARBDecompiler::HAbsolute,
+        &ARBDecompiler::HNegate,
+        &ARBDecompiler::HClamp,
+        &ARBDecompiler::HCastFloat,
+        &ARBDecompiler::HUnpack,
+        &ARBDecompiler::HMergeF32,
+        &ARBDecompiler::HMergeH0,
+        &ARBDecompiler::HMergeH1,
+        &ARBDecompiler::HPack2,
+
+        &ARBDecompiler::LogicalAssign,
+        &ARBDecompiler::Binary<AND_U>,
+        &ARBDecompiler::Binary<OR_U>,
+        &ARBDecompiler::Binary<XOR_U>,
+        &ARBDecompiler::Unary<NOT_U>,
+        &ARBDecompiler::LogicalPick2,
+        &ARBDecompiler::LogicalAnd2,
+
+        &ARBDecompiler::FloatComparison<SLT_F, false>,
+        &ARBDecompiler::FloatComparison<SEQ_F, false>,
+        &ARBDecompiler::FloatComparison<SLE_F, false>,
+        &ARBDecompiler::FloatComparison<SGT_F, false>,
+        &ARBDecompiler::FloatComparison<SNE_F, false>,
+        &ARBDecompiler::FloatComparison<SGE_F, false>,
+        &ARBDecompiler::FloatOrdered,
+        &ARBDecompiler::FloatUnordered,
+        &ARBDecompiler::FloatComparison<SLT_F, true>,
+        &ARBDecompiler::FloatComparison<SEQ_F, true>,
+        &ARBDecompiler::FloatComparison<SLE_F, true>,
+        &ARBDecompiler::FloatComparison<SGT_F, true>,
+        &ARBDecompiler::FloatComparison<SNE_F, true>,
+        &ARBDecompiler::FloatComparison<SGE_F, true>,
+
+        &ARBDecompiler::Binary<SLT_S>,
+        &ARBDecompiler::Binary<SEQ_S>,
+        &ARBDecompiler::Binary<SLE_S>,
+        &ARBDecompiler::Binary<SGT_S>,
+        &ARBDecompiler::Binary<SNE_S>,
+        &ARBDecompiler::Binary<SGE_S>,
+
+        &ARBDecompiler::Binary<SLT_U>,
+        &ARBDecompiler::Binary<SEQ_U>,
+        &ARBDecompiler::Binary<SLE_U>,
+        &ARBDecompiler::Binary<SGT_U>,
+        &ARBDecompiler::Binary<SNE_U>,
+        &ARBDecompiler::Binary<SGE_U>,
+
+        &ARBDecompiler::LogicalAddCarry,
+
+        &ARBDecompiler::HalfComparison<SLT_F, false>,
+        &ARBDecompiler::HalfComparison<SEQ_F, false>,
+        &ARBDecompiler::HalfComparison<SLE_F, false>,
+        &ARBDecompiler::HalfComparison<SGT_F, false>,
+        &ARBDecompiler::HalfComparison<SNE_F, false>,
+        &ARBDecompiler::HalfComparison<SGE_F, false>,
+        &ARBDecompiler::HalfComparison<SLT_F, true>,
+        &ARBDecompiler::HalfComparison<SEQ_F, true>,
+        &ARBDecompiler::HalfComparison<SLE_F, true>,
+        &ARBDecompiler::HalfComparison<SGT_F, true>,
+        &ARBDecompiler::HalfComparison<SNE_F, true>,
+        &ARBDecompiler::HalfComparison<SGE_F, true>,
+
+        &ARBDecompiler::Texture,
+        &ARBDecompiler::Texture,
+        &ARBDecompiler::TextureGather,
+        &ARBDecompiler::TextureQueryDimensions,
+        &ARBDecompiler::TextureQueryLod,
+        &ARBDecompiler::TexelFetch,
+        &ARBDecompiler::TextureGradient,
+
+        &ARBDecompiler::ImageLoad,
+        &ARBDecompiler::ImageStore,
+
+        &ARBDecompiler::AtomicImage<ADD, U32>,
+        &ARBDecompiler::AtomicImage<AND, U32>,
+        &ARBDecompiler::AtomicImage<OR, U32>,
+        &ARBDecompiler::AtomicImage<XOR, U32>,
+        &ARBDecompiler::AtomicImage<EXCH, U32>,
+
+        &ARBDecompiler::Atomic<EXCH, U32>,
+        &ARBDecompiler::Atomic<ADD, U32>,
+        &ARBDecompiler::Atomic<MIN, U32>,
+        &ARBDecompiler::Atomic<MAX, U32>,
+        &ARBDecompiler::Atomic<AND, U32>,
+        &ARBDecompiler::Atomic<OR, U32>,
+        &ARBDecompiler::Atomic<XOR, U32>,
+
+        &ARBDecompiler::Atomic<EXCH, S32>,
+        &ARBDecompiler::Atomic<ADD, S32>,
+        &ARBDecompiler::Atomic<MIN, S32>,
+        &ARBDecompiler::Atomic<MAX, S32>,
+        &ARBDecompiler::Atomic<AND, S32>,
+        &ARBDecompiler::Atomic<OR, S32>,
+        &ARBDecompiler::Atomic<XOR, S32>,
+
+        &ARBDecompiler::Atomic<ADD, U32>,
+        &ARBDecompiler::Atomic<MIN, U32>,
+        &ARBDecompiler::Atomic<MAX, U32>,
+        &ARBDecompiler::Atomic<AND, U32>,
+        &ARBDecompiler::Atomic<OR, U32>,
+        &ARBDecompiler::Atomic<XOR, U32>,
+
+        &ARBDecompiler::Atomic<ADD, S32>,
+        &ARBDecompiler::Atomic<MIN, S32>,
+        &ARBDecompiler::Atomic<MAX, S32>,
+        &ARBDecompiler::Atomic<AND, S32>,
+        &ARBDecompiler::Atomic<OR, S32>,
+        &ARBDecompiler::Atomic<XOR, S32>,
+
+        &ARBDecompiler::Branch,
+        &ARBDecompiler::BranchIndirect,
+        &ARBDecompiler::PushFlowStack,
+        &ARBDecompiler::PopFlowStack,
+        &ARBDecompiler::Exit,
+        &ARBDecompiler::Discard,
+
+        &ARBDecompiler::EmitVertex,
+        &ARBDecompiler::EndPrimitive,
+
+        &ARBDecompiler::InvocationId,
+        &ARBDecompiler::YNegate,
+        &ARBDecompiler::LocalInvocationId<'x'>,
+        &ARBDecompiler::LocalInvocationId<'y'>,
+        &ARBDecompiler::LocalInvocationId<'z'>,
+        &ARBDecompiler::WorkGroupId<'x'>,
+        &ARBDecompiler::WorkGroupId<'y'>,
+        &ARBDecompiler::WorkGroupId<'z'>,
+
+        &ARBDecompiler::Unary<TGBALLOT_U>,
+        &ARBDecompiler::Unary<TGALL_U>,
+        &ARBDecompiler::Unary<TGANY_U>,
+        &ARBDecompiler::Unary<TGEQ_U>,
+
+        &ARBDecompiler::ThreadId,
+        &ARBDecompiler::ThreadMask<'e', 'q'>,
+        &ARBDecompiler::ThreadMask<'g', 'e'>,
+        &ARBDecompiler::ThreadMask<'g', 't'>,
+        &ARBDecompiler::ThreadMask<'l', 'e'>,
+        &ARBDecompiler::ThreadMask<'l', 't'>,
+        &ARBDecompiler::ShuffleIndexed,
+
+        &ARBDecompiler::Barrier,
+        &ARBDecompiler::MemoryBarrierGroup,
+        &ARBDecompiler::MemoryBarrierGlobal,
+    };
+};
+
+ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                             ShaderType stage, std::string_view identifier)
+    : device{device}, ir{ir}, registry{registry}, stage{stage} {
+    AddLine("TEMP RC;");
+    AddLine("TEMP FSWZA[4];");
+    AddLine("TEMP FSWZB[4];");
+    if (ir.IsDecompiled()) {
+        DecompileAST();
+    } else {
+        DecompileBranchMode();
+    }
+    AddLine("END");
+    // Set the emitted body aside so the declarations below can be written ahead of it.
+    const std::string code = std::exchange(shader_source, std::string{});
+    DeclareHeader();
+    DeclareVertex();
+    DeclareGeometry();
+    DeclareFragment();
+    DeclareCompute();
+    DeclareInputAttributes();
+    DeclareOutputAttributes();
+    DeclareLocalMemory();
+    DeclareGlobalMemory();
+    DeclareConstantBuffers();
+    DeclareRegisters();
+    DeclareTemporaries();
+    DeclarePredicates();
+    DeclareInternalFlags();
+    // std::exchange left shader_source empty (a moved-from string is unspecified); append body.
+    shader_source += code;
+}
+
+std::string_view HeaderStageName(ShaderType stage) { // Two-letter stage tag for "!!NV<tag>5.0"
+    switch (stage) {
+    case ShaderType::Vertex:
+        return "vp";
+    case ShaderType::Geometry:
+        return "gp";
+    case ShaderType::Fragment:
+        return "fp";
+    case ShaderType::Compute:
+        return "cp";
+    default:
+        UNREACHABLE(); // Any other stage value is not expected here
+        return "";
+    }
+}
+
+void ARBDecompiler::DeclareHeader() {
+    // Program header line followed by OPTION lines; some depend on the stage or the device.
+    AddLine("!!NV{}5.0", HeaderStageName(stage));
+    // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D
+    AddLine("OPTION NV_internal;");
+    AddLine("OPTION NV_gpu_program_fp64;");
+    AddLine("OPTION NV_shader_storage_buffer;");
+    AddLine("OPTION NV_shader_thread_group;");
+    if (ir.UsesWarps() && device.HasWarpIntrinsics()) {
+        AddLine("OPTION NV_shader_thread_shuffle;");
+    }
+    // NV_viewport_array2 is only requested for vertex programs on devices that report it
+    if (stage == ShaderType::Vertex && device.HasNvViewportArray2()) {
+        AddLine("OPTION NV_viewport_array2;");
+    }
+    if (stage == ShaderType::Fragment) {
+        AddLine("OPTION ARB_draw_buffers;");
+    }
+    if (device.HasImageLoadFormatted()) {
+        AddLine("OPTION EXT_shader_image_load_formatted;");
+    }
+}
+
+// Declares the clip distance output array. Only vertex programs receive this declaration.
+void ARBDecompiler::DeclareVertex() {
+    const bool is_vertex = stage == ShaderType::Vertex;
+    if (is_vertex) {
+        AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};");
+    }
+}
+
+// Emits the geometry-only declarations: input and output primitive types, the maximum number of
+// emitted vertices and the per-vertex position input.
+void ARBDecompiler::DeclareGeometry() {
+    const bool is_geometry = stage == ShaderType::Geometry;
+    if (!is_geometry) {
+        return;
+    }
+
+    // The input topology comes from the registry, the output description from the shader header.
+    const auto& graphics_info = registry.GetGraphicsInfo();
+    const auto& shader_header = ir.GetHeader();
+
+    AddLine("PRIMITIVE_IN {};", PrimitiveDescription(graphics_info.primitive_topology));
+    AddLine("PRIMITIVE_OUT {};", TopologyName(shader_header.common3.output_topology));
+    AddLine("VERTICES_OUT {};", shader_header.common4.max_output_vertices.Value());
+    AddLine("ATTRIB vertex_position = vertex.position;");
+}
+
+// Declares one named output per color render target (result_color0 .. result_color7). Only
+// fragment programs receive these declarations; Exit() writes to them.
+void ARBDecompiler::DeclareFragment() {
+    const bool is_fragment = stage == ShaderType::Fragment;
+    if (is_fragment) {
+        AddLine("OUTPUT result_color7 = result.color[7];");
+        AddLine("OUTPUT result_color6 = result.color[6];");
+        AddLine("OUTPUT result_color5 = result.color[5];");
+        AddLine("OUTPUT result_color4 = result.color[4];");
+        AddLine("OUTPUT result_color3 = result.color[3];");
+        AddLine("OUTPUT result_color2 = result.color[2];");
+        AddLine("OUTPUT result_color1 = result.color[1];");
+        // Target zero is bound without an explicit index.
+        AddLine("OUTPUT result_color0 = result.color;");
+    }
+}
+
+// Emits the compute-only declarations: the workgroup dimensions and, when the shader uses any,
+// the shared memory block.
+void ARBDecompiler::DeclareCompute() {
+    if (stage != ShaderType::Compute) {
+        return;
+    }
+
+    const ComputeInfo& compute = registry.GetComputeInfo();
+    const auto& group = compute.workgroup_size;
+    AddLine("GROUP_SIZE {} {} {};", group[0], group[1], group[2]);
+
+    const bool uses_shared_memory = compute.shared_memory_size_in_words > 0;
+    if (!uses_shared_memory) {
+        return;
+    }
+    // The registry stores the size in words, SHARED_MEMORY is given the size in bytes.
+    const u32 shared_bytes = compute.shared_memory_size_in_words * 4;
+    AddLine("SHARED_MEMORY {};", shared_bytes);
+    AddLine("SHARED shared_mem[] = {{program.sharedmem}};");
+}
+
+// Declares an "in_attr<N>" binding for every generic input attribute the shader reads.
+// Compute programs have no attribute inputs and are skipped. For fragment programs the
+// declaration is prefixed with the flags returned by GetInputFlags for the attribute's pixel
+// input mode, and attributes whose mode is PixelImap::Unused are not declared.
+void ARBDecompiler::DeclareInputAttributes() {
+    if (stage == ShaderType::Compute) {
+        return;
+    }
+    const std::string_view stage_name = StageInputName(stage);
+    for (const auto attribute : ir.GetInputAttributes()) {
+        if (!IsGenericAttribute(attribute)) {
+            continue;
+        }
+        const u32 index = GetGenericAttributeIndex(attribute);
+
+        std::string_view suffix;
+        if (stage == ShaderType::Fragment) {
+            const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)};
+            if (input_mode == PixelImap::Unused) {
+                // Skip only this attribute. Returning here would silently drop the declarations
+                // of every attribute that follows it in the set.
+                continue;
+            }
+            suffix = GetInputFlags(input_mode);
+        }
+        AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index,
+                index);
+    }
+}
+
+// Declares an "out_attr<N>" binding for every generic output attribute the shader writes.
+// Nothing is declared for compute programs.
+void ARBDecompiler::DeclareOutputAttributes() {
+    if (stage != ShaderType::Compute) {
+        for (const auto output : ir.GetOutputAttributes()) {
+            if (IsGenericAttribute(output)) {
+                const u32 location = GetGenericAttributeIndex(output);
+                AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", location, location,
+                        location);
+            }
+        }
+    }
+}
+
+// Declares the "lmem" temporary array backing local memory, one element per 4 bytes.
+// The size comes from the compute info for compute programs and from the shader header for every
+// other stage. Nothing is declared when the shader uses no local memory.
+void ARBDecompiler::DeclareLocalMemory() {
+    const u64 size_in_bytes = [this]() -> u64 {
+        if (stage == ShaderType::Compute) {
+            return registry.GetComputeInfo().local_memory_size_in_words * 4ULL;
+        }
+        return ir.GetHeader().GetLocalMemorySize();
+    }();
+
+    if (size_in_bytes != 0) {
+        // Round up so a size that is not a multiple of four still gets its last element.
+        const u64 num_elements = Common::AlignUp(size_in_bytes, 4) / 4;
+        AddLine("TEMP lmem[{}];", num_elements);
+    }
+}
+
+// Declares one STORAGE buffer per global memory region used by the shader, bound to consecutive
+// storage slots in iteration order.
+void ARBDecompiler::DeclareGlobalMemory() {
+    // Slots currently start at zero rather than at
+    // device.GetBaseBindings(stage).shader_storage_buffer;
+    u32 slot = 0;
+    for (const auto& entry : ir.GetGlobalMemory()) {
+        AddLine("STORAGE {}[] = {{ program.storage[{}] }};", GlobalMemoryName(entry.first), slot);
+        slot += 1;
+    }
+}
+
+// Declares one CBUFFER per constant buffer used by the shader. The declared name carries the
+// buffer's own index while the binding slot is assigned consecutively in iteration order.
+void ARBDecompiler::DeclareConstantBuffers() {
+    u32 slot = 0;
+    for (const auto& entry : ir.GetConstantBuffers()) {
+        AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", entry.first, slot);
+        slot += 1;
+    }
+}
+
+// Declares a temporary named R<index> for every general purpose register the shader uses.
+void ARBDecompiler::DeclareRegisters() {
+    const auto& used_registers = ir.GetRegisters();
+    for (const u32 register_index : used_registers) {
+        AddLine("TEMP R{};", register_index);
+    }
+}
+
+// Declares the scratch temporaries T0 .. T<max_temporaries - 1>.
+void ARBDecompiler::DeclareTemporaries() {
+    std::size_t index = 0;
+    while (index < max_temporaries) {
+        AddLine("TEMP T{};", index);
+        ++index;
+    }
+}
+
+// Declares a temporary named P<index> for every predicate the shader uses.
+void ARBDecompiler::DeclarePredicates() {
+    const auto& used_predicates = ir.GetPredicates();
+    for (const Tegra::Shader::Pred predicate : used_predicates) {
+        const u64 predicate_index = static_cast<u64>(predicate);
+        AddLine("TEMP P{};", predicate_index);
+    }
+}
+
+// Declares one temporary per entry of INTERNAL_FLAG_NAMES.
+void ARBDecompiler::DeclareInternalFlags() {
+    for (std::size_t i = 0; i < INTERNAL_FLAG_NAMES.size(); ++i) {
+        const char* const flag_name = INTERNAL_FLAG_NAMES[i];
+        AddLine("TEMP {};", flag_name);
+    }
+}
+
+// Emits the instructions that give every piece of program state a defined starting value: the
+// FSWZA/FSWZB factor tables read by FSwizzleAdd, the position and generic outputs, the registers
+// and the predicates.
+void ARBDecompiler::InitializeVariables() {
+    // Multiplier table applied to the first FSwizzleAdd operand.
+    AddLine("MOV.F32 FSWZA[0], -1;");
+    AddLine("MOV.F32 FSWZA[1], 1;");
+    AddLine("MOV.F32 FSWZA[2], -1;");
+    AddLine("MOV.F32 FSWZA[3], 0;");
+    // Multiplier table applied to the second FSwizzleAdd operand.
+    AddLine("MOV.F32 FSWZB[0], -1;");
+    AddLine("MOV.F32 FSWZB[1], -1;");
+    AddLine("MOV.F32 FSWZB[2], 1;");
+    AddLine("MOV.F32 FSWZB[3], -1;");
+
+    const bool writes_position = stage == ShaderType::Vertex || stage == ShaderType::Geometry;
+    if (writes_position) {
+        AddLine("MOV.F result.position, {{0, 0, 0, 1}};");
+    }
+
+    // Generic outputs start as (0, 0, 0, 1); other output attributes are left alone.
+    for (const auto output : ir.GetOutputAttributes()) {
+        if (IsGenericAttribute(output)) {
+            const u32 location = GetGenericAttributeIndex(output);
+            AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", location);
+        }
+    }
+
+    // Registers and predicates start zeroed; a zero predicate means false.
+    for (const u32 register_index : ir.GetRegisters()) {
+        AddLine("MOV.F R{}, {{0, 0, 0, 0}};", register_index);
+    }
+    for (const Tegra::Shader::Pred predicate : ir.GetPredicates()) {
+        const u64 predicate_index = static_cast<u64>(predicate);
+        AddLine("MOV.U P{}, {{0, 0, 0, 0}};", predicate_index);
+    }
+}
+
+// Emits the shader body from the structured AST form of the IR.
+// The flow variables F<n> used by the AST are declared and zeroed first, then the common state
+// is initialized and the AST program is walked from its root.
+void ARBDecompiler::DecompileAST() {
+    const u32 flow_variable_count = ir.GetASTNumVariables();
+
+    // All declarations are emitted before the first initialization.
+    u32 variable = 0;
+    while (variable < flow_variable_count) {
+        AddLine("TEMP F{};", variable);
+        ++variable;
+    }
+    variable = 0;
+    while (variable < flow_variable_count) {
+        AddLine("MOV.U F{}, {{0, 0, 0, 0}};", variable);
+        ++variable;
+    }
+
+    InitializeVariables();
+
+    VisitAST(ir.GetASTProgram());
+}
+
+// Emits the shader body as a dispatch loop over basic blocks.
+//
+// PC.x holds the address of the block to execute. Each REP iteration compares PC.x against every
+// block address through a chain of nested IF/ELSE statements and runs the matching block. A block
+// that is not the last one and does not end in a Branch operation sets PC.x to the address of the
+// following block and restarts the loop. The innermost ELSE, reached when no address matches,
+// returns from the program.
+void ARBDecompiler::DecompileBranchMode() {
+    static constexpr u32 FLOW_STACK_SIZE = 20;
+
+    const bool uses_flow_stack = !ir.IsFlowStackDisabled();
+    if (uses_flow_stack) {
+        AddLine("TEMP SSY[{}];", FLOW_STACK_SIZE);
+        AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE);
+        AddLine("TEMP SSY_TOP;");
+        AddLine("TEMP PBK_TOP;");
+    }
+
+    AddLine("TEMP PC;");
+
+    if (uses_flow_stack) {
+        AddLine("MOV.U SSY_TOP.x, 0;");
+        AddLine("MOV.U PBK_TOP.x, 0;");
+    }
+
+    InitializeVariables();
+
+    const auto& blocks = ir.GetBasicBlocks();
+    const auto last = blocks.end();
+    auto current = blocks.begin();
+
+    // Execution starts at the first basic block.
+    const u32 entry_address = current->first;
+    AddLine("MOV.U PC.x, {};", entry_address);
+
+    AddLine("REP;");
+
+    // Every block opens one IF that is only closed after the whole chain has been emitted.
+    std::size_t open_conditionals = 0;
+    for (; current != last; ++open_conditionals) {
+        const auto& [address, block] = *current;
+
+        AddLine("SEQ.S.CC RC.x, PC.x, {};", address);
+        AddLine("IF NE.x;");
+
+        VisitBlock(block);
+
+        ++current;
+
+        if (current != last) {
+            const auto tail = std::get_if<OperationNode>(&*block.back());
+            const bool ends_in_branch = tail && tail->GetCode() == OperationCode::Branch;
+            if (!ends_in_branch) {
+                // Fall through to the block that follows this one.
+                const u32 following_address = current->first;
+                AddLine("MOV.U PC.x, {};", following_address);
+                AddLine("CONT;");
+            }
+        }
+
+        AddLine("ELSE;");
+    }
+    AddLine("RET;");
+    for (; open_conditionals > 0; --open_conditionals) {
+        AddLine("ENDIF;");
+    }
+
+    AddLine("ENDREP;");
+}
+
+// Emits the code for one AST node and, recursively, for the nodes nested inside it.
+// Conditions are evaluated with VisitExpression and moved into the condition code register
+// through MOVC, so "NE.x" tests for a true condition and "EQ.x" for a false one.
+void ARBDecompiler::VisitAST(const ASTNode& node) {
+    if (const auto ast = std::get_if<ASTProgram>(&*node->GetInnerData())) {
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+    } else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) {
+        const std::string condition = VisitExpression(ast->condition);
+        ResetTemporaries();
+
+        AddLine("MOVC.U RC.x, {};", condition);
+        AddLine("IF NE.x;");
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+        AddLine("ENDIF;");
+    } else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) {
+        AddLine("ELSE;");
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+    } else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) {
+        VisitBlock(ast->nodes);
+    } else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) {
+        AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition));
+        ResetTemporaries();
+    } else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) {
+        AddLine("REP;");
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+        // The condition has to be evaluated inside the loop, after the body: evaluating it before
+        // REP would compute it only once, into temporaries the body is free to overwrite.
+        AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
+        ResetTemporaries();
+        // A do-while keeps iterating while the condition holds, so leave when it is false.
+        AddLine("BRK (EQ.x);");
+        AddLine("ENDREP;");
+    } else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) {
+        // An always-true condition needs no surrounding IF.
+        const bool is_true = ExprIsTrue(ast->condition);
+        if (!is_true) {
+            AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
+            AddLine("IF NE.x;");
+            ResetTemporaries();
+        }
+        if (ast->kills) {
+            AddLine("KIL TR;");
+        } else {
+            Exit();
+        }
+        if (!is_true) {
+            AddLine("ENDIF;");
+        }
+    } else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) {
+        if (ExprIsTrue(ast->condition)) {
+            AddLine("BRK;");
+        } else {
+            AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
+            AddLine("BRK (NE.x);");
+            ResetTemporaries();
+        }
+    } else if (std::holds_alternative<ASTLabel>(*node->GetInnerData())) {
+        // Nothing to do
+    } else {
+        UNREACHABLE();
+    }
+}
+
+// Emits the code needed to evaluate a control flow expression and returns the operand that holds
+// its value. Following the file's convention, true is -1 (all bits set) and false is 0.
+// Compound expressions are computed into a freshly allocated temporary; predicates, flow
+// variables and boolean constants are returned directly without emitting code.
+std::string ARBDecompiler::VisitExpression(const Expr& node) {
+    if (const auto conjunction = std::get_if<ExprAnd>(&*node)) {
+        std::string dest = AllocTemporary();
+        AddLine("AND.U {}, {}, {};", dest, VisitExpression(conjunction->operand1),
+                VisitExpression(conjunction->operand2));
+        return dest;
+    } else if (const auto disjunction = std::get_if<ExprOr>(&*node)) {
+        std::string dest = AllocTemporary();
+        AddLine("OR.U {}, {}, {};", dest, VisitExpression(disjunction->operand1),
+                VisitExpression(disjunction->operand2));
+        return dest;
+    } else if (const auto negation = std::get_if<ExprNot>(&*node)) {
+        // CMP.S yields 0 for a negative (true) operand and -1 otherwise.
+        std::string dest = AllocTemporary();
+        AddLine("CMP.S {}, {}, 0, -1;", dest, VisitExpression(negation->operand1));
+        return dest;
+    } else if (const auto pred = std::get_if<ExprPredicate>(&*node)) {
+        return fmt::format("P{}.x", static_cast<u64>(pred->predicate));
+    } else if (const auto cond_code = std::get_if<ExprCondCode>(&*node)) {
+        return Visit(ir.GetConditionCode(cond_code->cc));
+    } else if (const auto flow_var = std::get_if<ExprVar>(&*node)) {
+        return fmt::format("F{}.x", flow_var->var_index);
+    } else if (const auto constant = std::get_if<ExprBoolean>(&*node)) {
+        if (constant->value) {
+            return "0xffffffff";
+        }
+        return "0";
+    } else if (const auto gpr_equal = std::get_if<ExprGprEqual>(&*node)) {
+        std::string dest = AllocTemporary();
+        AddLine("SEQ.U {}, R{}.x, {};", dest, gpr_equal->gpr, gpr_equal->value);
+        return dest;
+    } else {
+        UNREACHABLE();
+        return "0";
+    }
+}
+
+// Visits every node of a block in order. The operands returned by Visit are discarded.
+void ARBDecompiler::VisitBlock(const NodeBlock& bb) {
+    std::for_each(bb.begin(), bb.end(), [this](const auto& statement) { Visit(statement); });
+}
+
+// Emits the code needed to evaluate an IR node and returns the GLASM operand holding its value.
+//
+// Operations are dispatched through OPERATION_DECOMPILERS. Registers, custom variables and
+// internal flags are returned by name without emitting code; immediates, predicates and memory
+// reads are loaded into a freshly allocated temporary. Conditional and comment nodes produce no
+// value and return an empty string, as do operations that cannot be dispatched. Unsupported
+// attribute reads return a zero constant.
+std::string ARBDecompiler::Visit(const Node& node) {
+    if (const auto operation = std::get_if<OperationNode>(&*node)) {
+        // An amended node has to be emitted before the operation it is attached to.
+        if (const auto amend_index = operation->GetAmendIndex()) {
+            Visit(ir.GetAmendNode(*amend_index));
+        }
+        const std::size_t index = static_cast<std::size_t>(operation->GetCode());
+        if (index >= OPERATION_DECOMPILERS.size()) {
+            UNREACHABLE_MSG("Out of bounds operation: {}", index);
+            return {};
+        }
+        const auto decompiler = OPERATION_DECOMPILERS[index];
+        if (decompiler == nullptr) {
+            UNREACHABLE_MSG("Undefined operation: {}", index);
+            return {};
+        }
+        return (this->*decompiler)(*operation);
+    }
+
+    if (const auto gpr = std::get_if<GprNode>(&*node)) {
+        const u32 index = gpr->GetIndex();
+        if (index == Register::ZeroIndex) {
+            // The zero register always reads as zero and has no backing temporary.
+            return "{0, 0, 0, 0}.x";
+        }
+        return fmt::format("R{}.x", index);
+    }
+
+    if (const auto cv = std::get_if<CustomVarNode>(&*node)) {
+        return fmt::format("CV{}.x", cv->GetIndex());
+    }
+
+    if (const auto immediate = std::get_if<ImmediateNode>(&*node)) {
+        std::string temporary = AllocTemporary();
+        AddLine("MOV.U {}, {};", temporary, immediate->GetValue());
+        return temporary;
+    }
+
+    if (const auto predicate = std::get_if<PredicateNode>(&*node)) {
+        std::string temporary = AllocTemporary();
+        switch (const auto index = predicate->GetIndex(); index) {
+        case Tegra::Shader::Pred::UnusedIndex:
+            // Always true
+            AddLine("MOV.S {}, -1;", temporary);
+            break;
+        case Tegra::Shader::Pred::NeverExecute:
+            // Always false
+            AddLine("MOV.S {}, 0;", temporary);
+            break;
+        default:
+            AddLine("MOV.S {}, P{}.x;", temporary, static_cast<u64>(index));
+            break;
+        }
+        if (predicate->IsNegated()) {
+            // CMP.S yields 0 for a negative (true) operand and -1 otherwise.
+            AddLine("CMP.S {}, {}, 0, -1;", temporary, temporary);
+        }
+        return temporary;
+    }
+
+    if (const auto abuf = std::get_if<AbufNode>(&*node)) {
+        if (abuf->IsPhysicalBuffer()) {
+            UNIMPLEMENTED_MSG("Physical buffers are not implemented");
+            return "{0, 0, 0, 0}.x";
+        }
+
+        const Attribute::Index index = abuf->GetIndex();
+        const u32 element = abuf->GetElement();
+        const char swizzle = Swizzle(element);
+        switch (index) {
+        case Attribute::Index::Position: {
+            // Geometry programs read the position of a specific input vertex.
+            if (stage == ShaderType::Geometry) {
+                return fmt::format("{}_position[{}].{}", StageInputName(stage),
+                                   Visit(abuf->GetBuffer()), swizzle);
+            } else {
+                return fmt::format("{}.position.{}", StageInputName(stage), swizzle);
+            }
+        }
+        case Attribute::Index::TessCoordInstanceIDVertexID:
+            ASSERT(stage == ShaderType::Vertex);
+            switch (element) {
+            case 2:
+                return "vertex.instance";
+            case 3:
+                return "vertex.id";
+            }
+            UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
+            break;
+        case Attribute::Index::PointCoord:
+            switch (element) {
+            case 0:
+                return "fragment.pointcoord.x";
+            case 1:
+                return "fragment.pointcoord.y";
+            }
+            UNIMPLEMENTED();
+            break;
+        case Attribute::Index::FrontFacing: {
+            ASSERT(stage == ShaderType::Fragment);
+            ASSERT(element == 3);
+            // Convert fragment.facing into the -1 (true) / 0 (false) predicate convention.
+            const std::string temporary = AllocVectorTemporary();
+            AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};");
+            AddLine("MOV.U.CC RC.x, -RC;");
+            AddLine("MOV.S {}.x, 0;", temporary);
+            AddLine("MOV.S {}.x (NE.x), -1;", temporary);
+            return fmt::format("{}.x", temporary);
+        }
+        default:
+            if (IsGenericAttribute(index)) {
+                if (stage == ShaderType::Geometry) {
+                    return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(index),
+                                       Visit(abuf->GetBuffer()), swizzle);
+                } else {
+                    return fmt::format("{}.attrib[{}].{}", StageInputName(stage),
+                                       GetGenericAttributeIndex(index), swizzle);
+                }
+            }
+            UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index));
+            break;
+        }
+        return "{0, 0, 0, 0}.x";
+    }
+
+    if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
+        // Immediate offsets are written inline instead of going through a temporary.
+        std::string offset_string;
+        const auto& offset = cbuf->GetOffset();
+        if (const auto imm = std::get_if<ImmediateNode>(&*offset)) {
+            offset_string = std::to_string(imm->GetValue());
+        } else {
+            offset_string = Visit(offset);
+        }
+        std::string temporary = AllocTemporary();
+        AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string);
+        return temporary;
+    }
+
+    if (const auto gmem = std::get_if<GmemNode>(&*node)) {
+        // The buffer is indexed with the offset of the real address from the buffer base.
+        std::string temporary = AllocTemporary();
+        AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+                Visit(gmem->GetBaseAddress()));
+        AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()),
+                temporary);
+        return temporary;
+    }
+
+    if (const auto lmem = std::get_if<LmemNode>(&*node)) {
+        // Compute into a fresh temporary: the visited address may name a register or a constant,
+        // neither of which may be used as a destination here.
+        const std::string address = Visit(lmem->GetAddress());
+        std::string temporary = AllocTemporary();
+        // Byte address to element index, lmem holds one element per 4 bytes.
+        AddLine("SHR.U {}, {}, 2;", temporary, address);
+        AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary);
+        return temporary;
+    }
+
+    if (const auto smem = std::get_if<SmemNode>(&*node)) {
+        // Load into a fresh temporary for the same reason as the local memory read above.
+        const std::string address = Visit(smem->GetAddress());
+        std::string temporary = AllocTemporary();
+        AddLine("LDS.U32 {}, shared_mem[{}];", temporary, address);
+        return temporary;
+    }
+
+    if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) {
+        const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag());
+        return fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]);
+    }
+
+    if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
+        if (const auto amend_index = conditional->GetAmendIndex()) {
+            Visit(ir.GetAmendNode(*amend_index));
+        }
+        AddLine("MOVC.U RC.x, {};", Visit(conditional->GetCondition()));
+        AddLine("IF NE.x;");
+        VisitBlock(conditional->GetCode());
+        AddLine("ENDIF;");
+        return {};
+    }
+
+    if (std::holds_alternative<CommentNode>(*node)) {
+        // GLASM lacks comments, emitting the comment text would generate invalid code.
+        return {};
+    }
+
+    UNIMPLEMENTED();
+    return {};
+}
+
+// Packs the texture coordinates of an operation into one vector temporary.
+// The operation's operands fill the first components, followed by the array index (converted to
+// float) for array samplers and by the depth compare value for shadow samplers.
+// Returns the temporary together with the number of components written.
+std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    UNIMPLEMENTED_IF(meta.sampler.is_indexed);
+    UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array &&
+                     meta.sampler.type == Tegra::Shader::TextureType::TextureCube);
+
+    const std::size_t num_operands = operation.GetOperandsCount();
+    std::string coords = AllocVectorTemporary();
+
+    std::size_t used = 0;
+    while (used < num_operands) {
+        AddLine("MOV.F {}.{}, {};", coords, Swizzle(used), Visit(operation[used]));
+        ++used;
+    }
+    if (meta.sampler.is_array) {
+        AddLine("I2F.S {}.{}, {};", coords, Swizzle(used), Visit(meta.array));
+        ++used;
+    }
+    if (meta.sampler.is_shadow) {
+        AddLine("MOV.F {}.{}, {};", coords, Swizzle(used), Visit(meta.depth_compare));
+        ++used;
+    }
+    return std::make_pair(std::move(coords), used);
+}
+
+// Builds the ", offset(...)" suffix appended to texture instructions.
+// The offsets are moved into consecutive components of a vector temporary. An empty string is
+// returned when the operation carries no offsets.
+std::string ARBDecompiler::BuildAoffi(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const auto& offsets = meta.aoffi;
+    if (offsets.empty()) {
+        return {};
+    }
+
+    const std::string packed = AllocVectorTemporary();
+    std::size_t lane = 0;
+    for (const auto& offset : offsets) {
+        AddLine("MOV.S {}.{}, {};", packed, Swizzle(lane), Visit(offset));
+        ++lane;
+    }
+    return ", offset(" + packed + ")";
+}
+
+// Emits the program exit sequence.
+// Fragment programs first copy their results out of the registers: every enabled color component
+// takes the next register in order, and the depth output, when enabled, reads the register one
+// past the next free one. Every stage then ends with RET.
+void ARBDecompiler::Exit() {
+    if (stage == ShaderType::Fragment) {
+        // Registers the shader never touches are not declared, read them as zero instead.
+        const auto register_or_zero = [this](u32 reg) -> std::string {
+            // TODO(Rodrigo): Replace with contains once C++20 releases
+            const auto& declared = ir.GetRegisters();
+            if (declared.count(reg) == 0) {
+                return "{0, 0, 0, 0}.x";
+            }
+            return fmt::format("R{}.x", reg);
+        };
+
+        const auto& header = ir.GetHeader();
+        constexpr u32 num_components = 4;
+        u32 next_reg = 0;
+        for (u32 rt = 0; rt < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++rt) {
+            for (u32 component = 0; component < num_components; ++component) {
+                if (header.ps.IsColorComponentOutputEnabled(rt, component)) {
+                    AddLine("MOV.F result_color{}.{}, {};", rt, Swizzle(component),
+                            register_or_zero(next_reg));
+                    ++next_reg;
+                }
+            }
+        }
+        if (header.ps.omap.depth) {
+            AddLine("MOV.F result.depth.z, {};", register_or_zero(next_reg + 1));
+        }
+    }
+
+    AddLine("RET;");
+}
+
+// Emits the store of operation[1] into the destination described by operation[0].
+//
+// Registers, output attributes and local memory are written with a MOV to a destination operand
+// built here; shared and global memory use their dedicated store instructions. Writes to the zero
+// register and to unsupported destinations emit nothing. The scratch temporaries are released
+// once the store has been emitted. Always returns an empty string.
+std::string ARBDecompiler::Assign(Operation operation) {
+    const Node& dest = operation[0];
+    const Node& src = operation[1];
+
+    std::string dest_name;
+    if (const auto gpr = std::get_if<GprNode>(&*dest)) {
+        if (gpr->GetIndex() == Register::ZeroIndex) {
+            // Writing to Register::ZeroIndex is a no op
+            return {};
+        }
+        dest_name = fmt::format("R{}.x", gpr->GetIndex());
+    } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
+        const u32 element = abuf->GetElement();
+        const char swizzle = Swizzle(element);
+        switch (const Attribute::Index index = abuf->GetIndex()) {
+        case Attribute::Index::Position:
+            dest_name = fmt::format("result.position.{}", swizzle);
+            break;
+        case Attribute::Index::LayerViewportPointSize:
+            switch (element) {
+            case 0:
+                UNIMPLEMENTED();
+                return {};
+            case 1:
+            case 2:
+                // Layer and viewport writes are only available with NV_viewport_array2.
+                if (!device.HasNvViewportArray2()) {
+                    LOG_ERROR(
+                        Render_OpenGL,
+                        "NV_viewport_array2 is missing. Maxwell gen 2 or better is required.");
+                    return {};
+                }
+                dest_name = element == 1 ? "result.layer.x" : "result.viewport.x";
+                break;
+            case 3:
+                dest_name = "result.pointsize.x";
+                break;
+            }
+            break;
+        case Attribute::Index::ClipDistances0123:
+            dest_name = fmt::format("result.clip[{}].x", element);
+            break;
+        case Attribute::Index::ClipDistances4567:
+            dest_name = fmt::format("result.clip[{}].x", element + 4);
+            break;
+        default:
+            if (!IsGenericAttribute(index)) {
+                UNREACHABLE();
+                return {};
+            }
+            dest_name =
+                fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle);
+            break;
+        }
+    } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
+        // Compute the element index into a fresh temporary: the visited address may name a
+        // register or a constant, neither of which may be overwritten by the shift.
+        const std::string address = Visit(lmem->GetAddress());
+        const std::string element_index = AllocTemporary();
+        AddLine("SHR.U {}, {}, 2;", element_index, address);
+        dest_name = fmt::format("lmem[{}].x", element_index);
+    } else if (const auto smem = std::get_if<SmemNode>(&*dest)) {
+        AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress()));
+        ResetTemporaries();
+        return {};
+    } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
+        // The buffer is indexed with the offset of the real address from the buffer base.
+        const std::string temporary = AllocTemporary();
+        AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+                Visit(gmem->GetBaseAddress()));
+        AddLine("STB.U32 {}, {}[{}];", Visit(src), GlobalMemoryName(gmem->GetDescriptor()),
+                temporary);
+        ResetTemporaries();
+        return {};
+    } else {
+        UNREACHABLE();
+        ResetTemporaries();
+        return {};
+    }
+
+    AddLine("MOV.U {}, {};", dest_name, Visit(src));
+    ResetTemporaries();
+    return {};
+}
+
+// Selects between operation[1] and operation[2] based on the condition in operation[0].
+// CMP.S picks its second operand when the first one is negative, which matches the -1 (true)
+// / 0 (false) predicate convention used throughout this file.
+std::string ARBDecompiler::Select(Operation operation) {
+    std::string selected = AllocTemporary();
+    AddLine("CMP.S {}, {}, {}, {};", selected, Visit(operation[0]), Visit(operation[1]),
+            Visit(operation[2]));
+    return selected;
+}
+
+// Clamps operation[0] between operation[1] (low) and operation[2] (high).
+// A clamp between the immediates 0.0 and 1.0 is emitted as a single saturated move, any other
+// range as a MIN against the high bound followed by a MAX against the low bound.
+std::string ARBDecompiler::FClamp(Operation operation) {
+    // 1.0f in hex, replace with std::bit_cast on C++20
+    static constexpr u32 POSITIVE_ONE = 0x3f800000;
+
+    std::string result = AllocTemporary();
+    const Node& value = operation[0];
+    const Node& minimum = operation[1];
+    const Node& maximum = operation[2];
+
+    // True when the node is an immediate holding exactly the expected raw bits.
+    const auto is_immediate = [](const Node& node, u32 expected) {
+        const auto* const imm = std::get_if<ImmediateNode>(&*node);
+        return imm != nullptr && imm->GetValue() == expected;
+    };
+
+    const bool is_saturate = is_immediate(minimum, 0) && is_immediate(maximum, POSITIVE_ONE);
+    if (!is_saturate) {
+        AddLine("MIN.F {}, {}, {};", result, Visit(value), Visit(maximum));
+        AddLine("MAX.F {}, {}, {};", result, result, Visit(minimum));
+        return result;
+    }
+    AddLine("MOV.F32.SAT {}, {};", result, Visit(value));
+    return result;
+}
+
+// Unpacks operation[0] with UP2H into the x component of a vector temporary and returns that
+// component.
+std::string ARBDecompiler::FCastHalf0(Operation operation) {
+    const std::string unpacked = AllocVectorTemporary();
+    const std::string source = Visit(operation[0]);
+    AddLine("UP2H.F {}.x, {};", unpacked, source);
+    return unpacked + ".x";
+}
+
+// Unpacks operation[0] with UP2H into the y component of a vector temporary, copies it to x and
+// returns the x component.
+std::string ARBDecompiler::FCastHalf1(Operation operation) {
+    const std::string unpacked = AllocVectorTemporary();
+    const std::string source = Visit(operation[0]);
+    AddLine("UP2H.F {}.y, {};", unpacked, source);
+    AddLine("MOV {}.x, {}.y;", unpacked, unpacked);
+    return unpacked + ".x";
+}
+
+// Computes the square root of operation[0] as the reciprocal of its reciprocal square root.
+std::string ARBDecompiler::FSqrt(Operation operation) {
+    std::string result = AllocTemporary();
+    const std::string radicand = Visit(operation[0]);
+    AddLine("RSQ.F32 {}, {};", result, radicand);
+    AddLine("RCP.F32 {}, {};", result, result);
+    return result;
+}
+
+// Adds operation[0] and operation[1] after scaling each by a per-thread factor.
+// A 2-bit selector is extracted from operation[2] at bit position (threadid & 3) * 2 and used to
+// index the FSWZA and FSWZB tables filled in by InitializeVariables. Devices without warp
+// intrinsics log an error and fall back to a plain addition.
+std::string ARBDecompiler::FSwizzleAdd(Operation operation) {
+    const std::string scratch = AllocVectorTemporary();
+    const bool has_warp_intrinsics = device.HasWarpIntrinsics();
+    if (!has_warp_intrinsics) {
+        LOG_ERROR(Render_OpenGL,
+                  "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+        AddLine("ADD.F {}.x, {}, {};", scratch, Visit(operation[0]), Visit(operation[1]));
+        return scratch + ".x";
+    }
+
+    // scratch.z = (mask >> ((threadid & 3) << 1)) & 3
+    AddLine("AND.U {}.z, {}.threadid, 3;", scratch, StageInputName(stage));
+    AddLine("SHL.U {}.z, {}.z, 1;", scratch, scratch);
+    const std::string mask = Visit(operation[2]);
+    AddLine("SHR.U {}.z, {}, {}.z;", scratch, mask, scratch);
+    AddLine("AND.U {}.z, {}.z, 3;", scratch, scratch);
+
+    // Scale both operands by their selected factor and add the products.
+    const std::string lhs = Visit(operation[0]);
+    AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", scratch, lhs, scratch);
+    const std::string rhs = Visit(operation[1]);
+    AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", scratch, rhs, scratch);
+    AddLine("ADD.F32 {}.x, {}.x, {}.y;", scratch, scratch, scratch);
+    return scratch + ".x";
+}
+
+// Adds two packed half pairs: both operands are unpacked into xy, added and packed again.
+std::string ARBDecompiler::HAdd2(Operation operation) {
+    const std::string accumulator = AllocVectorTemporary();
+    const std::string addend = AllocVectorTemporary();
+
+    const std::string lhs = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", accumulator, lhs);
+    const std::string rhs = Visit(operation[1]);
+    AddLine("UP2H.F {}.xy, {};", addend, rhs);
+
+    AddLine("ADD.F16 {}, {}, {};", accumulator, accumulator, addend);
+    AddLine("PK2H.F {}.x, {};", accumulator, accumulator);
+    return accumulator + ".x";
+}
+
+// Multiplies two packed half pairs: both operands are unpacked into xy, multiplied and packed
+// again.
+std::string ARBDecompiler::HMul2(Operation operation) {
+    const std::string product = AllocVectorTemporary();
+    const std::string factor = AllocVectorTemporary();
+
+    const std::string lhs = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", product, lhs);
+    const std::string rhs = Visit(operation[1]);
+    AddLine("UP2H.F {}.xy, {};", factor, rhs);
+
+    AddLine("MUL.F16 {}, {}, {};", product, product, factor);
+    AddLine("PK2H.F {}.x, {};", product, product);
+    return product + ".x";
+}
+
+// Fused multiply-add on packed half pairs: the three operands are unpacked into xy, combined
+// with MAD as operation[0] * operation[1] + operation[2] and packed again.
+std::string ARBDecompiler::HFma2(Operation operation) {
+    const std::string accumulator = AllocVectorTemporary();
+    const std::string multiplier = AllocVectorTemporary();
+    const std::string addend = AllocVectorTemporary();
+
+    const std::string first = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", accumulator, first);
+    const std::string second = Visit(operation[1]);
+    AddLine("UP2H.F {}.xy, {};", multiplier, second);
+    const std::string third = Visit(operation[2]);
+    AddLine("UP2H.F {}.xy, {};", addend, third);
+
+    AddLine("MAD.F16 {}, {}, {}, {};", accumulator, accumulator, multiplier, addend);
+    AddLine("PK2H.F {}.x, {};", accumulator, accumulator);
+    return accumulator + ".x";
+}
+
+// Takes the absolute value of a packed half pair: the operand is unpacked into xy and packed
+// again through the |x| source modifier.
+std::string ARBDecompiler::HAbsolute(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string source = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, source);
+    AddLine("PK2H.F {}.x, |{}|;", halves, halves);
+    return halves + ".x";
+}
+
+// Conditionally negates each half of a packed pair. operation[1] controls the first half and
+// operation[2] the second one; a half is negated when its control value is non-zero.
+std::string ARBDecompiler::HNegate(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string source = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, source);
+
+    const std::string negate_first = Visit(operation[1]);
+    AddLine("MOVC.S RC.x, {};", negate_first);
+    AddLine("MOV.F {}.x (NE.x), -{}.x;", halves, halves);
+
+    const std::string negate_second = Visit(operation[2]);
+    AddLine("MOVC.S RC.x, {};", negate_second);
+    AddLine("MOV.F {}.y (NE.x), -{}.y;", halves, halves);
+
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return halves + ".x";
+}
+
+// Clamps both halves of a packed pair between operation[1] and operation[2].
+// Each bound is replicated into x and y of a second temporary before being applied: MAX with
+// the first bound, then MIN with the second one. The result is packed again.
+std::string ARBDecompiler::HClamp(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    const std::string bound = AllocVectorTemporary();
+
+    const std::string source = Visit(operation[0]);
+    AddLine("UP2H.F {}.xy, {};", halves, source);
+
+    const std::string lower = Visit(operation[1]);
+    AddLine("MOV.U {}.x, {};", bound, lower);
+    AddLine("MOV.U {}.y, {}.x;", bound, bound);
+    AddLine("MAX.F {}, {}, {};", halves, halves, bound);
+
+    const std::string upper = Visit(operation[2]);
+    AddLine("MOV.U {}.x, {};", bound, upper);
+    AddLine("MOV.U {}.y, {}.x;", bound, bound);
+    AddLine("MIN.F {}, {}, {};", halves, halves, bound);
+
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return halves + ".x";
+}
+
+// Converts a float into a packed half pair: the value goes to x, y is zeroed, and both are
+// packed into the x component that is returned.
+std::string ARBDecompiler::HCastFloat(Operation operation) {
+    const std::string halves = AllocVectorTemporary();
+    AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", halves);
+    const std::string source = Visit(operation[0]);
+    AddLine("MOV.F {}.x, {};", halves, source);
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return halves + ".x";
+}
+
+// Rearranges operation[0] into a packed half pair according to the HalfType in the metadata:
+//  - H0_H1: the operand is returned as is.
+//  - F32:   the operand is copied to both x and y, then packed.
+//  - H0_H0: the operand is unpacked and x is copied over y, then packed.
+//  - H1_H1: the operand is unpacked and y is copied over x, then packed.
+std::string ARBDecompiler::HUnpack(Operation operation) {
+    using Tegra::Shader::HalfType;
+
+    const std::string packed = Visit(operation[0]);
+    const HalfType type = std::get<HalfType>(operation.GetMeta());
+    if (type == HalfType::H0_H1) {
+        return packed;
+    }
+
+    const bool is_known =
+        type == HalfType::F32 || type == HalfType::H0_H0 || type == HalfType::H1_H1;
+    if (!is_known) {
+        UNREACHABLE();
+        return "{0, 0, 0, 0}.x";
+    }
+
+    const std::string halves = AllocVectorTemporary();
+    if (type == HalfType::F32) {
+        AddLine("MOV.U {}.x, {};", halves, packed);
+        AddLine("MOV.U {}.y, {}.x;", halves, halves);
+    } else if (type == HalfType::H0_H0) {
+        AddLine("UP2H.F {}.xy, {};", halves, packed);
+        AddLine("MOV.U {}.y, {}.x;", halves, halves);
+    } else {
+        AddLine("UP2H.F {}.xy, {};", halves, packed);
+        AddLine("MOV.U {}.x, {}.y;", halves, halves);
+    }
+    // Every remaining layout ends by packing xy back into x.
+    AddLine("PK2H.F {}.x, {};", halves, halves);
+    return halves + ".x";
+}
+
+std::string ARBDecompiler::HMergeF32(Operation operation) { // yields the .x half after UP2H
+    const std::string unpacked = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", unpacked, Visit(operation[0]));
+    return unpacked + ".x";
+}
+
+std::string ARBDecompiler::HMergeH0(Operation operation) { // .x half from op1, .y from op0
+    const std::string merged = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", merged, Visit(operation[0]));
+    AddLine("UP2H.F {}.zw, {};", merged, Visit(operation[1]));
+    AddLine("MOV.U {}.x, {}.z;", merged, merged); // overwrite .x with operand 1's .x half
+    AddLine("PK2H.F {}.x, {};", merged, merged);
+    return merged + ".x";
+}
+
+std::string ARBDecompiler::HMergeH1(Operation operation) { // .x half from op0, .y from op1
+    const std::string merged = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", merged, Visit(operation[0]));
+    AddLine("UP2H.F {}.zw, {};", merged, Visit(operation[1]));
+    AddLine("MOV.U {}.y, {}.w;", merged, merged); // overwrite .y with operand 1's .y half
+    AddLine("PK2H.F {}.x, {};", merged, merged);
+    return merged + ".x";
+}
+
+std::string ARBDecompiler::HPack2(Operation operation) { // pack two scalars as a half pair
+    const std::string pair = AllocVectorTemporary();
+    AddLine("MOV.U {}.x, {};", pair, Visit(operation[0]));
+    AddLine("MOV.U {}.y, {};", pair, Visit(operation[1]));
+    AddLine("PK2H.F {}.x, {};", pair, pair);
+    return pair + ".x";
+}
+
+std::string ARBDecompiler::LogicalAssign(Operation operation) {
+    // Stores the value of operand 1 into the predicate or internal flag named by operand 0.
+    const Node& destination = operation[0];
+    const Node& source = operation[1];
+    std::string target;
+
+    if (const auto predicate = std::get_if<PredicateNode>(&*destination)) {
+        ASSERT_MSG(!predicate->IsNegated(), "Negating logical assignment");
+
+        const Tegra::Shader::Pred index = predicate->GetIndex();
+        const bool is_never = index == Tegra::Shader::Pred::NeverExecute;
+        if (is_never || index == Tegra::Shader::Pred::UnusedIndex) {
+            // Writing to these predicates is a no-op
+            return {};
+        }
+        target = fmt::format("P{}.x", static_cast<u64>(index));
+    } else if (const auto flag = std::get_if<InternalFlagNode>(&*destination)) {
+        const std::size_t flag_index = static_cast<std::size_t>(flag->GetFlag());
+        target = fmt::format("{}.x", INTERNAL_FLAG_NAMES[flag_index]);
+    } else {
+        UNREACHABLE();
+        ResetTemporaries();
+        return {};
+    }
+
+    // Evaluate the source expression, then write it to the selected register.
+    AddLine("MOV.U {}, {};", target, Visit(source));
+    ResetTemporaries();
+    return {};
+}
+
+std::string ARBDecompiler::LogicalPick2(Operation operation) { // pick one component of op0
+    std::string picked = AllocTemporary();
+    const u32 component = std::get<ImmediateNode>(*operation[1]).GetValue();
+    AddLine("MOV.U {}, {}.{};", picked, Visit(operation[0]), Swizzle(component));
+    return picked;
+}
+
+std::string ARBDecompiler::LogicalAnd2(Operation operation) { // AND of op0's .x and .y
+    std::string conjunction = AllocTemporary();
+    const std::string pair = Visit(operation[0]);
+    AddLine("AND.U {}, {}.x, {}.y;", conjunction, pair, pair);
+    return conjunction;
+}
+
+std::string ARBDecompiler::FloatOrdered(Operation operation) { // -1 unless an operand is NaN
+    std::string ordered = AllocTemporary();
+    AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
+    AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
+    AddLine("MOV.S {}, -1;", ordered);        // start out true (-1)
+    AddLine("MOV.S {} (NAN.x), 0;", ordered); // cleared when the first operand is NaN
+    AddLine("MOV.S {} (NAN.y), 0;", ordered); // cleared when the second operand is NaN
+    return ordered;
+}
+
+std::string ARBDecompiler::FloatUnordered(Operation operation) { // -1 if an operand is NaN
+    std::string unordered = AllocTemporary();
+    AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
+    AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
+    AddLine("MOV.S {}, 0;", unordered);         // start out false (0)
+    AddLine("MOV.S {} (NAN.x), -1;", unordered); // set when the first operand is NaN
+    AddLine("MOV.S {} (NAN.y), -1;", unordered); // set when the second operand is NaN
+    return unordered;
+}
+
+std::string ARBDecompiler::LogicalAddCarry(Operation operation) { // -1 if op0 + op1 carries
+    std::string carry = AllocTemporary();
+    AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1]));
+    AddLine("MOV.S {}, 0;", carry); // default: no carry (false)
+    AddLine("IF CF.x;");
+    AddLine("MOV.S {}, -1;", carry); // inside the IF: carry flag was set (true)
+    AddLine("ENDIF;");
+    return carry;
+}
+
+std::string ARBDecompiler::Texture(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 unit = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const auto [coords, swz] = BuildCoords(operation);
+    // Default to TEX; a bias selects TXB and an explicit LOD selects TXL (mutually exclusive).
+    std::string_view opcode = "TEX";
+    std::string extra;
+    if (meta.bias) {
+        ASSERT(!meta.lod);
+        opcode = "TXB";
+        // With swz < 4 the bias travels in .w, otherwise as an additional operand.
+        if (swz < 4) {
+            AddLine("MOV.F {}.w, {};", coords, Visit(meta.bias));
+        } else {
+            const std::string bias_reg = AllocTemporary();
+            AddLine("MOV.F {}, {};", bias_reg, Visit(meta.bias));
+            extra = fmt::format(" {},", bias_reg);
+        }
+    }
+    if (meta.lod) {
+        ASSERT(!meta.bias);
+        opcode = "TXL";
+        // Same placement rule as for the bias.
+        if (swz < 4) {
+            AddLine("MOV.F {}.w, {};", coords, Visit(meta.lod));
+        } else {
+            const std::string lod_reg = AllocTemporary();
+            AddLine("MOV.F {}, {};", lod_reg, Visit(meta.lod));
+            extra = fmt::format(" {},", lod_reg);
+        }
+    }
+    // Sample into the coordinate register, then move the requested element into .x.
+    AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, coords, coords, extra, unit,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", coords, coords, Swizzle(meta.element));
+    return fmt::format("{}.x", coords);
+}
+
+std::string ARBDecompiler::TextureGather(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 unit = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const std::string coords = BuildCoords(operation).first;
+    // Non-shadow gathers append the selected component as a suffix on the texture operand.
+    std::string component;
+    if (!meta.sampler.is_shadow) {
+        const auto& selected = std::get<ImmediateNode>(*meta.component);
+        component = fmt::format(".{}", Swizzle(selected.GetValue()));
+    }
+    // Gather into the coordinate register, then move the requested element into .x.
+    AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", coords, coords, unit, component,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", coords, coords, Swizzle(meta.element));
+    return coords + ".x";
+}
+
+std::string ARBDecompiler::TextureQueryDimensions(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const std::string query = AllocVectorTemporary();
+    const u32 unit = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    // Array samplers are not handled here.
+    ASSERT(!meta.sampler.is_array);
+    // Query at the level given by the first operand, or at level 0 when there is none.
+    const std::string level = operation.GetOperandsCount() > 0 ? Visit(operation[0]) : "0";
+    AddLine("TXQ {}, {}, texture[{}], {};", query, level, unit, TextureType(meta));
+    AddLine("MOV.U {}.x, {}.{};", query, query, Swizzle(meta.element));
+    return query + ".x";
+}
+
+std::string ARBDecompiler::TextureQueryLod(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const std::string lod = AllocVectorTemporary();
+    const u32 unit = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    // Array samplers are not handled here.
+    ASSERT(!meta.sampler.is_array);
+    // Copy every operand into consecutive components of the query register.
+    const std::size_t num_coords = operation.GetOperandsCount();
+    for (std::size_t index = 0; index < num_coords; ++index) {
+        AddLine("MOV.F {}.{}, {};", lod, Swizzle(index), Visit(operation[index]));
+    }
+    AddLine("LOD.F {}, {}, texture[{}], {};", lod, lod, unit, TextureType(meta));
+    AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", lod, lod); // scale .xy by 256 ...
+    AddLine("TRUNC.S {}, {};", lod, lod);                     // ... and truncate to integers
+    AddLine("MOV.U {}.x, {}.{};", lod, lod, Swizzle(meta.element));
+    return lod + ".x";
+}
+
+std::string ARBDecompiler::TexelFetch(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 unit = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const auto [texel, swz] = BuildCoords(operation);
+    // Non-buffer fetches carry the LOD in the .w component of the coordinate register.
+    if (!meta.sampler.is_buffer) {
+        ASSERT(swz < 4);
+        AddLine("MOV.F {}.w, {};", texel, Visit(meta.lod));
+    }
+    AddLine("TXF.F {}, {}, texture[{}], {}{};", texel, texel, unit, TextureType(meta),
+            BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", texel, texel, Swizzle(meta.element));
+    return fmt::format("{}.x", texel);
+}
+
+std::string ARBDecompiler::TextureGradient(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 unit = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const std::string dx = AllocVectorTemporary();
+    const std::string dy = AllocVectorTemporary();
+    const std::string coords = BuildCoords(operation).first;
+    // meta.derivates is interleaved: even entries fill dx, odd entries fill dy.
+    const std::size_t pairs = meta.derivates.size() / 2;
+    for (std::size_t i = 0; i < pairs; ++i) {
+        const char component = Swizzle(i);
+        AddLine("MOV.F {}.{}, {};", dx, component, Visit(meta.derivates[i * 2]));
+        AddLine("MOV.F {}.{}, {};", dy, component, Visit(meta.derivates[i * 2 + 1]));
+    }
+    // The sample is written back over the coordinate register.
+    const std::string& dest = coords;
+    AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", dest, coords, dx, dy, unit,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.F {}.x, {}.{};", dest, dest, Swizzle(meta.element));
+    return dest + ".x";
+}
+
+std::string ARBDecompiler::ImageLoad(Operation operation) {
+    const auto& meta = std::get<MetaImage>(operation.GetMeta());
+    const u32 unit = device.GetBaseBindings(stage).image + meta.image.index;
+    const std::size_t num_coords = operation.GetOperandsCount();
+    const std::string_view target = ImageType(meta.image.type);
+    // The coordinate register doubles as the destination of the load.
+    const std::string texel = AllocVectorTemporary();
+    for (std::size_t index = 0; index < num_coords; ++index) {
+        AddLine("MOV.S {}.{}, {};", texel, Swizzle(index), Visit(operation[index]));
+    }
+    AddLine("LOADIM.F {}, {}, image[{}], {};", texel, texel, unit, target);
+    AddLine("MOV.F {}.x, {}.{};", texel, texel, Swizzle(meta.element));
+    return texel + ".x";
+}
+
+std::string ARBDecompiler::ImageStore(Operation operation) {
+    const auto& meta = std::get<MetaImage>(operation.GetMeta());
+    const u32 unit = device.GetBaseBindings(stage).image + meta.image.index;
+    const std::size_t coord_count = operation.GetOperandsCount();
+    const std::size_t value_count = meta.values.size();
+    const std::string_view target = ImageType(meta.image.type);
+    // Operands are the coordinates; the values to store come from meta.values.
+    const std::string address = AllocVectorTemporary();
+    const std::string texel = AllocVectorTemporary();
+    for (std::size_t index = 0; index < coord_count; ++index) {
+        AddLine("MOV.S {}.{}, {};", address, Swizzle(index), Visit(operation[index]));
+    }
+    for (std::size_t index = 0; index < value_count; ++index) {
+        AddLine("MOV.F {}.{}, {};", texel, Swizzle(index), Visit(meta.values[index]));
+    }
+    AddLine("STOREIM.F image[{}], {}, {}, {};", unit, texel, address, target);
+    return std::string{};
+}
+
+std::string ARBDecompiler::Branch(Operation operation) { // set PC to an immediate, then CONT
+    const ImmediateNode destination = std::get<ImmediateNode>(*operation[0]);
+    AddLine("MOV.U PC.x, {};", destination.GetValue());
+    AddLine("CONT;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::BranchIndirect(Operation operation) { // PC from a computed value
+    AddLine("MOV.U PC.x, {};", Visit(operation[0]));
+    AddLine("CONT;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::PushFlowStack(Operation operation) { // store target, bump the top
+    const auto stack_class = std::get<MetaStackClass>(operation.GetMeta());
+    const u32 address = std::get<ImmediateNode>(*operation[0]).GetValue();
+    const std::string_view name = StackName(stack_class);
+    AddLine("MOV.U {}[{}_TOP.x].x, {};", name, name, address);
+    AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", name, name);
+    return std::string{};
+}
+
+std::string ARBDecompiler::PopFlowStack(Operation operation) { // drop the top, jump to it
+    const auto stack_class = std::get<MetaStackClass>(operation.GetMeta());
+    const std::string_view name = StackName(stack_class);
+    AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", name, name);
+    AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", name, name);
+    AddLine("CONT;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::Exit(Operation) { // operation form of the no-argument Exit()
+    Exit();
+    return std::string{};
+}
+
+std::string ARBDecompiler::Discard(Operation) { // emits an unconditional kill
+    AddLine("KIL TR;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::EmitVertex(Operation) { // emits the EMIT instruction
+    AddLine("EMIT;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::EndPrimitive(Operation) { // emits the ENDPRIM instruction
+    AddLine("ENDPRIM;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::InvocationId(Operation) { // emits nothing; names a built-in
+    return std::string{"primitive.invocation"};
+}
+
+std::string ARBDecompiler::YNegate(Operation) { // stubbed: always yields the constant 1
+    LOG_WARNING(Render_OpenGL, "(STUBBED)");
+    const std::string one = AllocTemporary();
+    AddLine("MOV.F {}, 1;", one);
+    return one;
+}
+
+std::string ARBDecompiler::ThreadId(Operation) { // emits nothing; names a stage input
+    return fmt::format("{}.threadid", StageInputName(stage)); // "<stage input>.threadid"
+}
+
+std::string ARBDecompiler::ShuffleIndexed(Operation operation) {
+    if (!device.HasWarpIntrinsics()) { // fallback: log and pass the value through unshuffled
+        LOG_ERROR(Render_OpenGL,
+                  "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+        return Visit(operation[0]);
+    }
+    const std::string shuffled = AllocVectorTemporary();
+    AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", shuffled, Visit(operation[0]),
+            Visit(operation[1]));
+    AddLine("MOV.U {}.x, {}.y;", shuffled, shuffled); // the .y component is what is returned
+    return shuffled + ".x";
+}
+
+std::string ARBDecompiler::Barrier(Operation) { // BAR is only emitted for decompiled shaders
+    if (ir.IsDecompiled()) {
+        AddLine("BAR;");
+    } else {
+        LOG_ERROR(Render_OpenGL, "BAR used but shader is not decompiled");
+    }
+    return {};
+}
+
+std::string ARBDecompiler::MemoryBarrierGroup(Operation) { // emits MEMBAR.CTA
+    AddLine("MEMBAR.CTA;");
+    return std::string{};
+}
+
+std::string ARBDecompiler::MemoryBarrierGlobal(Operation) { // emits a plain MEMBAR
+    AddLine("MEMBAR;");
+    return std::string{};
+}
+
+} // Anonymous namespace
+
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                                    const VideoCommon::Shader::Registry& registry,
+                                    Tegra::Engines::ShaderType stage, std::string_view identifier) {
+    return ARBDecompiler(device, ir, registry, stage, identifier).Code(); // temporary decompiler
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h
new file mode 100644
index 000000000..6afc87220
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h
@@ -0,0 +1,29 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <string>
+#include <string_view>
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+enum class ShaderType : u32;
+}
+
+namespace VideoCommon::Shader {
+class ShaderIR;
+class Registry;
+} // namespace VideoCommon::Shader
+
+namespace OpenGL {
+
+class Device;
+/// Public entry point of the ARB decompiler; returns the text it generates for the given IR.
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                                    const VideoCommon::Shader::Registry& registry,
+                                    Tegra::Engines::ShaderType stage, std::string_view identifier);
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 9964ea894..e461e4c70 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -22,22 +22,53 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
 
-CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size)
+Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
     : VideoCommon::BufferBlock{cpu_addr, size} {
     gl_buffer.Create();
     glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+    if (device.HasVertexBufferUnifiedMemory()) { // otherwise gpu_address keeps its default
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+    }
+}
+
+Buffer::~Buffer() = default; // defaulted: cleanup is left to the members' own destructors
+
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) {
+    glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
+                         data); // writes size bytes from data into this buffer at offset
+}
+
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size);
+    const GLintptr gl_offset = static_cast<GLintptr>(offset);
+    if (read_buffer.handle == 0) { // first download: create a read-back buffer of equal size
+        read_buffer.Create();
+        glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr,
+                          GL_STREAM_READ);
+    }
+    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+    glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size);
+    glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data); // read the copy back
 }
 
-CachedBufferBlock::~CachedBufferBlock() = default;
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) { // copies size bytes from src into this buffer
+    glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
+                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+}
 
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
-                               const Device& device, std::size_t stream_size)
-    : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
+                               const Device& device_, std::size_t stream_size)
+    : GenericBufferCache{rasterizer, system,
+                         std::make_unique<OGLStreamBuffer>(device_, stream_size, true)},
+      device{device_} {
     if (!device.HasFastBufferSubData()) {
         return;
     }
 
-    static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+    static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
     glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
     for (const GLuint cbuf : cbufs) {
         glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
@@ -48,44 +79,21 @@ OGLBufferCache::~OGLBufferCache() {
     glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
 }
 
-Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<CachedBufferBlock>(cpu_addr, size);
+std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, cpu_addr, size); // device is forwarded to Buffer
 }
 
-GLuint OGLBufferCache::ToHandle(const Buffer& buffer) {
-    return buffer->GetHandle();
-}
-
-GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
-    return 0;
-}
-
-void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                     const u8* data) {
-    glNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
-                         static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                       u8* data) {
-    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
-    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
-    glGetNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
-                            static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                               std::size_t dst_offset, std::size_t size) {
-    glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast<GLintptr>(src_offset),
-                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
+    return {0, 0, 0}; // all-zero BufferInfo; the requested size is ignored
 }
 
 OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
                                                              std::size_t size) {
     DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
-    const GLuint& cbuf = cbufs[cbuf_cursor++];
+    const GLuint cbuf = cbufs[cbuf_cursor++]; // take the next buffer; Acquire() rewinds cursor
+
     glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
-    return {cbuf, 0};
+    return {cbuf, 0, 0}; // BufferInfo gained a third field in this change
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index a9e86cfc7..88fdc0536 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -10,7 +10,6 @@
 #include "common/common_types.h"
 #include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
@@ -24,57 +23,58 @@ class Device;
 class OGLStreamBuffer;
 class RasterizerOpenGL;
 
-class CachedBufferBlock;
+class Buffer : public VideoCommon::BufferBlock {
+public:
+    explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
+    ~Buffer();
 
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
+    void Upload(std::size_t offset, std::size_t size, const u8* data); // host -> buffer
 
-class CachedBufferBlock : public VideoCommon::BufferBlock {
-public:
-    explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size);
-    ~CachedBufferBlock();
+    void Download(std::size_t offset, std::size_t size, u8* data); // buffer -> host
+
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t size); // copies size bytes from src into this buffer
 
-    GLuint GetHandle() const {
+    GLuint Handle() const noexcept {
         return gl_buffer.handle;
     }
 
+    u64 Address() const noexcept { // GPU address; stays 0 unless the constructor queried one
+        return gpu_address;
+    }
+
 private:
     OGLBuffer gl_buffer;
+    OGLBuffer read_buffer; // created on the first Download() and used for read-backs
+    u64 gpu_address = 0;   // set in the constructor when unified memory is available
 };
 
+using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
 class OGLBufferCache final : public GenericBufferCache {
 public:
     explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                             const Device& device, std::size_t stream_size);
     ~OGLBufferCache();
 
-    GLuint GetEmptyBuffer(std::size_t) override;
+    BufferInfo GetEmptyBuffer(std::size_t) override;
 
     void Acquire() noexcept {
         cbuf_cursor = 0;
     }
 
 protected:
-    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
-
-    GLuint ToHandle(const Buffer& buffer) override;
-
-    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                         const u8* data) override;
-
-    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                           u8* data) override;
-
-    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                   std::size_t dst_offset, std::size_t size) override;
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
     BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
 
 private:
+    static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+                                             Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+
+    const Device& device; // passed to every Buffer created by CreateBlock
+
     std::size_t cbuf_cursor = 0;
-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
-                           Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
-        cbufs;
+    std::array<GLuint, NUM_CBUFS> cbufs{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 466a911db..c1f20f0ab 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -6,6 +6,7 @@
 #include <array>
 #include <cstddef>
 #include <cstring>
+#include <limits>
 #include <optional>
 #include <vector>
 
@@ -26,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;
 
 constexpr u32 NumStages = 5;
 
-constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
-                                  GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS,
-                                  GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
+constexpr std::array LimitUBOs = { // NOTE(review): six entries while NumStages is 5 - confirm
+    GL_MAX_VERTEX_UNIFORM_BLOCKS,          GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
+    GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
+    GL_MAX_FRAGMENT_UNIFORM_BLOCKS,        GL_MAX_COMPUTE_UNIFORM_BLOCKS};
 
 constexpr std::array LimitSSBOs = {
-    GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
+    GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS,          GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
     GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
-    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS};
+    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS,        GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};
 
-constexpr std::array LimitSamplers = {
-    GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TEXTURE_IMAGE_UNITS};
+constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TEXTURE_IMAGE_UNITS, // fragment-stage slot
+                                      GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};
 
-constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS,
-                                    GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS};
+constexpr std::array LimitImages = { // indexed by stage, e.g. LimitImages[4] for fragment
+    GL_MAX_VERTEX_IMAGE_UNIFORMS,          GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
+    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
+    GL_MAX_FRAGMENT_IMAGE_UNIFORMS,        GL_MAX_COMPUTE_IMAGE_UNIFORMS};
 
 template <typename T>
 T GetInteger(GLenum pname) {
@@ -85,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
     return std::exchange(base, base + amount);
 }
 
+std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
+    std::array<u32, Tegra::Engines::MaxShaderTypes> limits; // one queried value per LimitUBOs
+    std::transform(std::begin(LimitUBOs), std::end(LimitUBOs), std::begin(limits),
+                   [](GLenum limit_name) { return GetInteger<u32>(limit_name); });
+    return limits;
+}
+
 std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
     std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;
 
@@ -112,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
     u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
     u32 base_images = 0;
 
-    // Reserve more image bindings on fragment and vertex stages.
+    // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8.
+    // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the
+    // fragment stage, and at least 1 for the rest of the stages.
+    // So far games are observed to use 1 image binding on vertex and 4 on fragment stages.
+
+    // Reserve at least 4 image bindings on the fragment stage.
     bindings[4].image =
-        Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]);
-    bindings[0].image =
-        Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]);
+        Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]);
+
+    // This is guaranteed to be at least 1.
+    const u32 total_extracted_images = num_images / (NumStages - 1);
 
     // Reserve the other image bindings.
-    const u32 total_extracted_images = num_images / (NumStages - 2);
-    for (std::size_t i = 2; i < NumStages; ++i) {
+    for (std::size_t i = 0; i < NumStages; ++i) {
         const std::size_t stage = stage_swizzle[i];
+        if (stage == 4) {
+            continue;
+        }
         bindings[stage].image =
             Extract(base_images, num_images, total_extracted_images, LimitImages[stage]);
     }
@@ -133,6 +152,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
 }
 
 bool IsASTCSupported() {
+    static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY};
     static constexpr std::array formats = {
         GL_COMPRESSED_RGBA_ASTC_4x4_KHR,           GL_COMPRESSED_RGBA_ASTC_5x4_KHR,
         GL_COMPRESSED_RGBA_ASTC_5x5_KHR,           GL_COMPRESSED_RGBA_ASTC_6x5_KHR,
@@ -149,25 +169,44 @@ bool IsASTCSupported() {
         GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR,  GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR,
         GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR,
     };
-    return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) {
-               GLint supported;
-               glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1,
-                                     &supported);
-               return supported == GL_TRUE;
-           }) == formats.end();
+    static constexpr std::array required_support = {
+        GL_VERTEX_TEXTURE,   GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE,
+        GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE,     GL_COMPUTE_TEXTURE,
+    };
+    // ASTC is reported only when every target/format pair has full support in all listed stages
+    for (const GLenum target : targets) {
+        for (const GLenum format : formats) {
+            for (const GLenum support : required_support) {
+                GLint value = 0; // Not GL_FULL_SUPPORT; kept if the query fails to write it
+                glGetInternalformativ(target, format, support, 1, &value);
+                if (value != GL_FULL_SUPPORT) {
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
 }
 
 } // Anonymous namespace
 
-Device::Device() : base_bindings{BuildBaseBindings()} {
+Device::Device()
+    : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
     const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
-    const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+    const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+    const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
     const std::vector extensions = GetExtensions();
 
     const bool is_nvidia = vendor == "NVIDIA Corporation";
     const bool is_amd = vendor == "ATI Technologies Inc.";
-    const bool is_intel = vendor == "Intel";
-    const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr;
+
+    bool disable_fast_buffer_sub_data = false;
+    if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
+        LOG_WARNING(
+            Render_OpenGL,
+            "Beta driver 443.24 is known to have issues. There might be performance issues.");
+        disable_fast_buffer_sub_data = true;
+    }
 
     uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
     shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
@@ -178,36 +217,43 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
     has_shader_ballot = GLAD_GL_ARB_shader_ballot;
     has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
     has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted");
+    has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod");
     has_astc = IsASTCSupported();
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = is_amd;
     has_precise_bug = TestPreciseBug();
-    has_broken_compute = is_intel_proprietary;
-    has_fast_buffer_sub_data = is_nvidia;
-    use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
-                           GLAD_GL_NV_compute_program5;
+    has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
+    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
+
+    // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
+    // uniform buffers as "push constants"
+    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+
+    use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() &&
+                           GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 &&
+                           GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2;
 
     LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
     LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
     LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
 
-    if (Settings::values.use_assembly_shaders && !use_assembly_shaders) {
+    if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) {
         LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
     }
 }
 
 Device::Device(std::nullptr_t) {
-    uniform_buffer_alignment = 0;
+    max_uniform_buffers.fill(std::numeric_limits<u32>::max());
+    uniform_buffer_alignment = 4;
+    shader_storage_alignment = 4;
     max_vertex_attributes = 16;
     max_varyings = 15;
     has_warp_intrinsics = true;
     has_shader_ballot = true;
     has_vertex_viewport_layer = true;
     has_image_load_formatted = true;
+    has_texture_shadow_lod = true;
     has_variable_aoffi = true;
-    has_component_indexing_bug = false;
-    has_broken_compute = false;
-    has_precise_bug = false;
 }
 
 bool Device::TestVariableAoffi() {
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index e915dbd86..e1d811966 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -24,6 +24,10 @@ public:
     explicit Device();
     explicit Device(std::nullptr_t);
 
+    u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
+        return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
+    }
+
     const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
         return base_bindings[stage_index];
     }
@@ -64,6 +68,14 @@ public:
         return has_image_load_formatted;
     }
 
+    bool HasTextureShadowLod() const {
+        return has_texture_shadow_lod;
+    }
+
+    bool HasVertexBufferUnifiedMemory() const {
+        return has_vertex_buffer_unified_memory;
+    }
+
     bool HasASTC() const {
         return has_astc;
     }
@@ -80,14 +92,14 @@ public:
         return has_precise_bug;
     }
 
-    bool HasBrokenCompute() const {
-        return has_broken_compute;
-    }
-
     bool HasFastBufferSubData() const {
         return has_fast_buffer_sub_data;
     }
 
+    bool HasNvViewportArray2() const {
+        return has_nv_viewport_array2;
+    }
+
     bool UseAssemblyShaders() const {
         return use_assembly_shaders;
     }
@@ -96,7 +108,8 @@ private:
     static bool TestVariableAoffi();
     static bool TestPreciseBug();
 
-    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings;
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
+    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
     std::size_t uniform_buffer_alignment{};
     std::size_t shader_storage_alignment{};
     u32 max_vertex_attributes{};
@@ -105,12 +118,14 @@ private:
     bool has_shader_ballot{};
     bool has_vertex_viewport_layer{};
     bool has_image_load_formatted{};
+    bool has_texture_shadow_lod{};
+    bool has_vertex_buffer_unified_memory{};
     bool has_astc{};
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
     bool has_precise_bug{};
-    bool has_broken_compute{};
     bool has_fast_buffer_sub_data{};
+    bool has_nv_viewport_array2{};
     bool use_assembly_shaders{};
 };
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 716d43e65..e960a0ef1 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -30,6 +30,7 @@
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/maxwell_to_gl.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
+#include "video_core/shader_cache.h"
 
 namespace OpenGL {
 
@@ -54,15 +55,34 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255
 
 namespace {
 
-constexpr std::size_t NumSupportedVertexAttributes = 16;
+constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
+constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
+    NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
+constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
+    NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
+
+constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
 
 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
                                                ShaderType shader_type, std::size_t index = 0) {
+    if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+        if (entry.is_separated) {
+            const u32 buffer_1 = entry.buffer;
+            const u32 buffer_2 = entry.secondary_buffer;
+            const u32 offset_1 = entry.offset;
+            const u32 offset_2 = entry.secondary_offset;
+            const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1);
+            const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2);
+            return engine.GetTextureInfo(handle_1 | handle_2);
+        }
+    }
     if (entry.is_bindless) {
-        const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
-        return engine.GetTextureInfo(tex_handle);
+        const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
+        return engine.GetTextureInfo(handle);
     }
+
     const auto& gpu_profile = engine.AccessGuestDriverProfile();
     const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
     if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
@@ -87,6 +107,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
     return buffer.size;
 }
 
+/// Maps a hardware transform feedback location to the tokens describing its attribute
+/// @param location Hardware location; four consecutive locations share one attribute
+/// @return Pair of ARB_transform_feedback3 token stream first and third arguments
+/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt
+std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
+    const int attribute = location / 4;
+    switch (attribute) {
+    case 7:
+        return {GL_POSITION, 0};
+    case 40:
+        return {GL_PRIMARY_COLOR_NV, 0};
+    case 41:
+        return {GL_SECONDARY_COLOR_NV, 0};
+    case 42:
+        return {GL_BACK_PRIMARY_COLOR_NV, 0};
+    case 43:
+        return {GL_BACK_SECONDARY_COLOR_NV, 0};
+    }
+    if (attribute >= 8 && attribute <= 39) {
+        return {GL_GENERIC_ATTRIB_NV, attribute - 8};
+    }
+    if (attribute >= 48 && attribute <= 55) {
+        return {GL_TEXTURE_COORD_NV, attribute - 48};
+    }
+    UNIMPLEMENTED_MSG("index={}", attribute);
+    return {GL_POSITION, 0};
+}
+
 void oglEnable(GLenum cap, bool state) {
     (state ? glEnable : glDisable)(cap);
 }
@@ -104,6 +152,9 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind
       screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
     CheckExtensions();
 
+    unified_uniform_buffer.Create();
+    glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
+
     if (device.UseAssemblyShaders()) {
         glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
         for (const GLuint cbuf : staging_cbufs) {
@@ -143,7 +194,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
     // avoid OpenGL errors.
     // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
     // assume every shader uses them all.
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexFormat0 + index]) {
             continue;
         }
@@ -162,9 +213,10 @@ void RasterizerOpenGL::SetupVertexFormat() {
         if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt ||
             attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) {
             glVertexAttribIFormat(gl_index, attrib.ComponentCount(),
-                                  MaxwellToGL::VertexType(attrib), attrib.offset);
+                                  MaxwellToGL::VertexFormat(attrib), attrib.offset);
         } else {
-            glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
+            glVertexAttribFormat(gl_index, attrib.ComponentCount(),
+                                 MaxwellToGL::VertexFormat(attrib),
                                  attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
         }
         glVertexAttribBinding(gl_index, attrib.buffer);
@@ -181,9 +233,11 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
     MICROPROFILE_SCOPE(OpenGL_VB);
 
+    const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
+
     // Upload all guest vertex arrays sequentially to our buffer
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
         if (!flags[Dirty::VertexBuffer0 + index]) {
             continue;
         }
@@ -196,16 +250,25 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
         const GPUVAddr start = vertex_array.StartAddress();
         const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-
         ASSERT(end >= start);
+
+        const GLuint gl_index = static_cast<GLuint>(index);
         const u64 size = end - start;
         if (size == 0) {
-            glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride);
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            if (use_unified_memory) {
+                glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
+            }
             continue;
         }
-        const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
-        glBindVertexBuffer(static_cast<GLuint>(index), vertex_buffer, vertex_buffer_offset,
-                           vertex_array.stride);
+        const auto info = buffer_cache.UploadMemory(start, size);
+        if (use_unified_memory) {
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
+                                   info.address + info.offset, size);
+        } else {
+            glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
+        }
     }
 }
 
@@ -218,7 +281,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
     flags[Dirty::VertexInstances] = false;
 
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexInstance0 + index]) {
             continue;
         }
@@ -235,9 +298,9 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
     MICROPROFILE_SCOPE(OpenGL_Index);
     const auto& regs = system.GPU().Maxwell3D().regs;
     const std::size_t size = CalculateIndexBufferSize();
-    const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer);
-    return offset;
+    const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
+    return info.offset;
 }
 
 void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
@@ -273,7 +336,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
             continue;
         }
 
-        Shader shader{shader_cache.GetStageProgram(program)};
+        Shader* const shader = shader_cache.GetStageProgram(program);
 
         if (device.UseAssemblyShaders()) {
             // Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
@@ -567,7 +630,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
                    (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
 
     // Prepare the vertex array.
-    buffer_cache.Map(buffer_size);
+    const bool invalidated = buffer_cache.Map(buffer_size);
+
+    if (invalidated) {
+        // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty
+        auto& dirty = gpu.dirty.flags;
+        dirty[Dirty::VertexBuffers] = true;
+        for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
+            dirty[index] = true;
+        }
+    }
 
     // Prepare vertex array format.
     SetupVertexFormat();
@@ -584,9 +656,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     if (!device.UseAssemblyShaders()) {
         MaxwellUniformData ubo;
         ubo.SetFromRegs(gpu);
-        const auto [buffer, offset] =
+        const auto info =
             buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
-        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
+        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
                           static_cast<GLsizeiptr>(sizeof(ubo)));
     }
 
@@ -655,10 +727,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
 }
 
 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
-    if (device.HasBrokenCompute()) {
-        return;
-    }
-
     buffer_cache.Acquire();
     current_cbuf = 0;
 
@@ -837,7 +905,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
     return true;
 }
 
-void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
     static constexpr std::array PARAMETER_LUT = {
         GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
         GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
@@ -846,41 +914,62 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
     const auto& shader_stage = stages[stage_index];
+    const auto& entries = shader->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;
+    const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
 
-    u32 binding =
-        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer;
-    for (const auto& entry : shader->GetEntries().const_buffers) {
-        const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
-        SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry);
+    const auto base_bindings = device.GetBaseBindings(stage_index);
+    u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
+    for (const auto& entry : entries.const_buffers) {
+        const u32 index = entry.GetIndex();
+        const auto& buffer = shader_stage.const_buffers[index];
+        SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
+                         base_unified_offset + index * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
+                                           entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
+                          base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
     }
 }
 
-void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    const auto& entries = kernel->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;
 
     u32 binding = 0;
-    for (const auto& entry : kernel->GetEntries().const_buffers) {
+    for (const auto& entry : entries.const_buffers) {
         const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
         const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
         Tegra::Engines::ConstBufferInfo buffer;
         buffer.address = config.Address();
         buffer.size = config.size;
         buffer.enabled = mask[entry.GetIndex()];
-        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry);
+        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
+                         use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
+                          NUM_CONST_BUFFERS_BYTES_PER_STAGE);
     }
 }
 
 void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
                                         const Tegra::Engines::ConstBufferInfo& buffer,
-                                        const ConstBufferEntry& entry) {
+                                        const ConstBufferEntry& entry, bool use_unified,
+                                        std::size_t unified_offset) {
     if (!buffer.enabled) {
         // Set values to zero to unbind buffers
         if (device.UseAssemblyShaders()) {
             glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
         } else {
-            glBindBufferRange(GL_UNIFORM_BUFFER, binding,
-                              buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
+            glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
         }
         return;
     }
@@ -889,23 +978,33 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
     // UBO alignment requirements.
     const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
 
-    const auto alignment = device.GetUniformBufferAlignment();
-    auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
-                                                    device.HasFastBufferSubData());
-    if (!device.UseAssemblyShaders()) {
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+    const bool fast_upload = !use_unified && device.HasFastBufferSubData();
+
+    const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
+    const GPUVAddr gpu_addr = buffer.address;
+    auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+
+    if (device.UseAssemblyShaders()) {
+        UNIMPLEMENTED_IF(use_unified);
+        if (info.offset != 0) {
+            const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
+            glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
+            info.handle = staging_cbuf;
+            info.offset = 0;
+        }
+        glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
         return;
     }
-    if (offset != 0) {
-        const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
-        glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
-        cbuf = staging_cbuf;
-        offset = 0;
+
+    if (use_unified) {
+        glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
+                                 unified_offset, size);
+    } else {
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
     }
-    glBindBufferRangeNV(stage, binding, cbuf, offset, size);
 }
 
-void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
     auto& gpu{system.GPU()};
     auto& memory_manager{gpu.MemoryManager()};
     const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
@@ -920,7 +1019,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
     }
 }
 
-void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
     auto& gpu{system.GPU()};
     auto& memory_manager{gpu.MemoryManager()};
     const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
@@ -937,13 +1036,12 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
 void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
                                          GPUVAddr gpu_addr, std::size_t size) {
     const auto alignment{device.GetShaderStorageBufferAlignment()};
-    const auto [ssbo, buffer_offset] =
-        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
-    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, ssbo, buffer_offset,
+    const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
+    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
                       static_cast<GLsizeiptr>(size));
 }
 
-void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& maxwell3d = system.GPU().Maxwell3D();
     u32 binding = device.GetBaseBindings(stage_index).sampler;
@@ -956,7 +1054,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader&
     }
 }
 
-void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& compute = system.GPU().KeplerCompute();
     u32 binding = 0;
@@ -985,7 +1083,7 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
     }
 }
 
-void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) {
     const auto& maxwell3d = system.GPU().Maxwell3D();
     u32 binding = device.GetBaseBindings(stage_index).image;
     for (const auto& entry : shader->GetEntries().images) {
@@ -995,7 +1093,7 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh
     }
 }
 
-void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
+void RasterizerOpenGL::SetupComputeImages(Shader* shader) {
     const auto& compute = system.GPU().KeplerCompute();
     u32 binding = 0;
     for (const auto& entry : shader->GetEntries().images) {
@@ -1024,6 +1122,26 @@ void RasterizerOpenGL::SyncViewport() {
     const auto& regs = gpu.regs;
 
     const bool dirty_viewport = flags[Dirty::Viewports];
+    const bool dirty_clip_control = flags[Dirty::ClipControl];
+
+    if (dirty_clip_control || flags[Dirty::FrontFace]) {
+        flags[Dirty::FrontFace] = false;
+
+        GLenum mode = MaxwellToGL::FrontFace(regs.front_face);
+        if (regs.screen_y_control.triangle_rast_flip != 0 &&
+            regs.viewport_transform[0].scale_y < 0.0f) {
+            switch (mode) {
+            case GL_CW:
+                mode = GL_CCW;
+                break;
+            case GL_CCW:
+                mode = GL_CW;
+                break;
+            }
+        }
+        glFrontFace(mode);
+    }
+
     if (dirty_viewport || flags[Dirty::ClipControl]) {
         flags[Dirty::ClipControl] = false;
 
@@ -1121,11 +1239,6 @@ void RasterizerOpenGL::SyncCullMode() {
             glDisable(GL_CULL_FACE);
         }
     }
-
-    if (flags[Dirty::FrontFace]) {
-        flags[Dirty::FrontFace] = false;
-        glFrontFace(MaxwellToGL::FrontFace(regs.front_face));
-    }
 }
 
 void RasterizerOpenGL::SyncPrimitiveRestart() {
@@ -1496,12 +1609,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
     oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
 }
 
+void RasterizerOpenGL::SyncTransformFeedback() {
+    // Builds the attribute token stream for glTransformFeedbackStreamAttribsNV from the guest
+    // transform feedback layouts. Each token is an {attribute, component count, index} triplet.
+    // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. An unimplemented message will signal
+    // when this is required.
+    const auto& regs = system.GPU().Maxwell3D().regs;
+
+    static constexpr std::size_t STRIDE = 3;
+    std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs;
+    std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams;
+    GLint* cursor = attribs.data();
+    GLint* current_stream = streams.data();
+
+    for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) {
+        const auto& layout = regs.tfb_layouts[feedback];
+        if (layout.varying_count == 0) {
+            continue;
+        }
+        // Stride padding only matters for streams that are captured, so check after the skip
+        UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding");
+
+        *current_stream = static_cast<GLint>(feedback);
+        if (current_stream != streams.data()) {
+            // When stepping one stream, push the expected token
+            cursor[0] = GL_NEXT_BUFFER_NV;
+            cursor[1] = 0;
+            cursor[2] = 0;
+            cursor += STRIDE;
+        }
+        ++current_stream;
+
+        const auto& locations = regs.tfb_varying_locs[feedback];
+        std::optional<u8> current_index;
+        for (u32 offset = 0; offset < layout.varying_count; ++offset) {
+            const u8 location = locations[offset];
+            const u8 index = location / 4;
+            if (current_index == index) {
+                // Increase number of components of the previous attachment
+                ++cursor[-2];
+                continue;
+            }
+            current_index = index;
+            std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location);
+            cursor[1] = 1;
+            cursor += STRIDE;
+        }
+    }
+
+    const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE);
+    const GLsizei num_streams = static_cast<GLsizei>(current_stream - streams.data());
+    glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_streams, streams.data(),
+                                       GL_INTERLEAVED_ATTRIBS);
+}
+
 void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
     const auto& regs = system.GPU().Maxwell3D().regs;
     if (regs.tfb_enabled == 0) {
         return;
     }
 
+    if (device.UseAssemblyShaders()) {
+        SyncTransformFeedback();
+    }
+
     UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
                      regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
                      regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
@@ -1528,6 +1699,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
                           static_cast<GLsizeiptr>(size));
     }
 
+    // We may have to call BeginTransformFeedbackNV here since they seem to call different
+    // implementations on Nvidia's driver (the pointer is different) but we are using
+    // ARB_transform_feedback3 features with NV_transform_feedback interactions and the ARB
+    // extension doesn't define BeginTransformFeedback (without NV) interactions. It just works.
     glBeginTransformFeedback(GL_POINTS);
 }
 
@@ -1549,8 +1724,9 @@ void RasterizerOpenGL::EndTransformFeedback() {
         const GLuint handle = transform_feedback_buffers[index].handle;
         const GPUVAddr gpu_addr = binding.Address();
         const std::size_t size = binding.buffer_size;
-        const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
-        glCopyNamedBufferSubData(handle, dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+        const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+        glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
+                                 static_cast<GLsizeiptr>(size));
     }
 }
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 87f7fe159..4f082592f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -19,7 +19,6 @@
 #include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_accelerated.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
@@ -100,40 +99,41 @@ private:
     void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil);
 
     /// Configures the current constbuffers to use for the draw command.
-    void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader);
+    void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
 
     /// Configures the current constbuffers to use for the kernel invocation.
-    void SetupComputeConstBuffers(const Shader& kernel);
+    void SetupComputeConstBuffers(Shader* kernel);
 
     /// Configures a constant buffer.
     void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                          const ConstBufferEntry& entry);
+                          const ConstBufferEntry& entry, bool use_unified,
+                          std::size_t unified_offset);
 
     /// Configures the current global memory entries to use for the draw command.
-    void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
+    void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);
 
     /// Configures the current global memory entries to use for the kernel invocation.
-    void SetupComputeGlobalMemory(const Shader& kernel);
+    void SetupComputeGlobalMemory(Shader* kernel);
 
     /// Configures a constant buffer.
     void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
                            std::size_t size);
 
     /// Configures the current textures to use for the draw command.
-    void SetupDrawTextures(std::size_t stage_index, const Shader& shader);
+    void SetupDrawTextures(std::size_t stage_index, Shader* shader);
 
     /// Configures the textures used in a compute shader.
-    void SetupComputeTextures(const Shader& kernel);
+    void SetupComputeTextures(Shader* kernel);
 
     /// Configures a texture.
     void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
                       const SamplerEntry& entry);
 
     /// Configures images in a graphics shader.
-    void SetupDrawImages(std::size_t stage_index, const Shader& shader);
+    void SetupDrawImages(std::size_t stage_index, Shader* shader);
 
     /// Configures images in a compute shader.
-    void SetupComputeImages(const Shader& shader);
+    void SetupComputeImages(Shader* shader);
 
     /// Configures an image.
     void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
@@ -201,6 +201,10 @@ private:
     /// Syncs the framebuffer sRGB state to match the guest state
     void SyncFramebufferSRGB();
 
+    /// Syncs transform feedback state to match guest state
+    /// @note Only valid on assembly shaders
+    void SyncTransformFeedback();
+
     /// Begin a transform feedback
     void BeginTransformFeedback(GLenum primitive_mode);
 
@@ -253,6 +257,7 @@ private:
         Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
     std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
     std::size_t current_cbuf = 0;
+    OGLBuffer unified_uniform_buffer;
 
     /// Number of commands queued to the OpenGL driver. Reseted on flush.
     std::size_t num_queued_commands = 0;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 4cd0f36cf..c6a3bf3a1 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -20,6 +20,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
@@ -29,6 +30,7 @@
 #include "video_core/shader/memory_util.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
 
 namespace OpenGL {
 
@@ -147,7 +149,8 @@ ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 u
     auto program = std::make_shared<ProgramHandle>();
 
     if (device.UseAssemblyShaders()) {
-        const std::string arb = "Not implemented";
+        const std::string arb =
+            DecompileAssemblyShader(device, ir, registry, shader_type, shader_id);
 
         GLuint& arb_prog = program->assembly_program.handle;
 
@@ -194,12 +197,9 @@ std::unordered_set<GLenum> GetSupportedFormats() {
 
 } // Anonymous namespace
 
-CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
-                           std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                           ShaderEntries entries, ProgramSharedPtr program_)
-    : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
-      size_in_bytes{size_in_bytes}, program{std::move(program_)} {
-    // Assign either the assembly program or source program. We can't have both.
+Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_,
+               ProgramSharedPtr program_)
+    : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)} {
     handle = program->assembly_program.handle;
     if (handle == 0) {
         handle = program->source_program.handle;
@@ -207,16 +207,16 @@ CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
     ASSERT(handle != 0);
 }
 
-CachedShader::~CachedShader() = default;
+Shader::~Shader() = default;
 
-GLuint CachedShader::GetHandle() const {
+GLuint Shader::GetHandle() const {
     DEBUG_ASSERT(registry->IsConsistent());
     return handle;
 }
 
-Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
-                                           Maxwell::ShaderProgram program_type, ProgramCode code,
-                                           ProgramCode code_b) {
+std::unique_ptr<Shader> Shader::CreateStageFromMemory(const ShaderParameters& params,
+                                                      Maxwell::ShaderProgram program_type,
+                                                      ProgramCode code, ProgramCode code_b) {
     const auto shader_type = GetShaderType(program_type);
     const std::size_t size_in_bytes = code.size() * sizeof(u64);
 
@@ -241,11 +241,12 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::unique_ptr<Shader>(new Shader(
+        std::move(registry), MakeEntries(params.device, ir, shader_type), std::move(program)));
 }
 
-Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
+std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params,
+                                                       ProgramCode code) {
     const std::size_t size_in_bytes = code.size() * sizeof(u64);
 
     auto& engine = params.system.GPU().KeplerCompute();
@@ -265,22 +266,23 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
     entry.bindless_samplers = registry->GetBindlessSamplers();
     params.disk_cache.SaveEntry(std::move(entry));
 
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::unique_ptr<Shader>(new Shader(std::move(registry),
+                                              MakeEntries(params.device, ir, ShaderType::Compute),
+                                              std::move(program)));
 }
 
-Shader CachedShader::CreateFromCache(const ShaderParameters& params,
-                                     const PrecompiledShader& precompiled_shader,
-                                     std::size_t size_in_bytes) {
-    return std::shared_ptr<CachedShader>(
-        new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry,
-                         precompiled_shader.entries, precompiled_shader.program));
+std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params,
+                                                const PrecompiledShader& precompiled_shader) {
+    return std::unique_ptr<Shader>(new Shader(
+        precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program));
 }
 
 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                      Core::Frontend::EmuWindow& emu_window, const Device& device)
-    : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device},
-      disk_cache{system} {}
+    : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system},
+      emu_window{emu_window}, device{device}, disk_cache{system} {}
+
+ShaderCacheOpenGL::~ShaderCacheOpenGL() = default;
 
 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                       const VideoCore::DiskResourceLoadCallback& callback) {
@@ -348,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
             PrecompiledShader shader;
             shader.program = std::move(program);
             shader.registry = std::move(registry);
-            shader.entries = MakeEntries(ir);
+            shader.entries = MakeEntries(device, ir, entry.type);
 
             std::scoped_lock lock{mutex};
             if (callback) {
@@ -434,7 +436,7 @@ ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
     return program;
 }
 
-Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
+Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
     if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) {
         return last_shaders[static_cast<std::size_t>(program)];
     }
@@ -444,8 +446,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
 
     // Look up shader in the cache based on address
     const auto cpu_addr{memory_manager.GpuToCpuAddress(address)};
-    Shader shader{cpu_addr ? TryGet(*cpu_addr) : null_shader};
-    if (shader) {
+    if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) {
         return last_shaders[static_cast<std::size_t>(program)] = shader;
     }
 
@@ -459,62 +460,64 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
         const u8* host_ptr_b = memory_manager.GetPointer(address_b);
         code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false);
     }
+    const std::size_t code_size = code.size() * sizeof(u64);
 
-    const auto unique_identifier = GetUniqueIdentifier(
+    const u64 unique_identifier = GetUniqueIdentifier(
         GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);
 
     const ShaderParameters params{system,    disk_cache, device,
                                   *cpu_addr, host_ptr,   unique_identifier};
 
+    std::unique_ptr<Shader> shader;
     const auto found = runtime_cache.find(unique_identifier);
     if (found == runtime_cache.end()) {
-        shader = CachedShader::CreateStageFromMemory(params, program, std::move(code),
-                                                     std::move(code_b));
+        shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b));
     } else {
-        const std::size_t size_in_bytes = code.size() * sizeof(u64);
-        shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+        shader = Shader::CreateFromCache(params, found->second);
     }
 
+    Shader* const result = shader.get();
     if (cpu_addr) {
-        Register(shader);
+        Register(std::move(shader), *cpu_addr, code_size);
     } else {
-        null_shader = shader;
+        null_shader = std::move(shader);
     }
 
-    return last_shaders[static_cast<std::size_t>(program)] = shader;
+    return last_shaders[static_cast<std::size_t>(program)] = result;
 }
 
-Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
+Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
     auto& memory_manager{system.GPU().MemoryManager()};
     const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)};
 
-    auto kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel;
-    if (kernel) {
+    if (Shader* const kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get()) {
         return kernel;
     }
 
     const auto host_ptr{memory_manager.GetPointer(code_addr)};
     // No kernel found, create a new one
-    auto code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
-    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
+    ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
+    const std::size_t code_size{code.size() * sizeof(u64)};
+    const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
 
     const ShaderParameters params{system,    disk_cache, device,
                                   *cpu_addr, host_ptr,   unique_identifier};
 
+    std::unique_ptr<Shader> kernel;
     const auto found = runtime_cache.find(unique_identifier);
     if (found == runtime_cache.end()) {
-        kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
+        kernel = Shader::CreateKernelFromMemory(params, std::move(code));
     } else {
-        const std::size_t size_in_bytes = code.size() * sizeof(u64);
-        kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+        kernel = Shader::CreateFromCache(params, found->second);
     }
 
+    Shader* const result = kernel.get();
     if (cpu_addr) {
-        Register(kernel);
+        Register(std::move(kernel), *cpu_addr, code_size);
     } else {
-        null_kernel = kernel;
+        null_kernel = std::move(kernel);
     }
-    return kernel;
+    return result;
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index b2ae8d7f9..994aaeaf2 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -18,12 +18,12 @@
 
 #include "common/common_types.h"
 #include "video_core/engines/shader_type.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
 
 namespace Core {
 class System;
@@ -35,12 +35,9 @@ class EmuWindow;
 
 namespace OpenGL {
 
-class CachedShader;
 class Device;
 class RasterizerOpenGL;
-struct UnspecializedShader;
 
-using Shader = std::shared_ptr<CachedShader>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 struct ProgramHandle {
@@ -64,62 +61,53 @@ struct ShaderParameters {
     u64 unique_identifier;
 };
 
-class CachedShader final : public RasterizerCacheObject {
+class Shader final {
 public:
-    ~CachedShader();
+    ~Shader();
 
     /// Gets the GL program handle for the shader
     GLuint GetHandle() const;
 
-    /// Returns the size in bytes of the shader
-    std::size_t GetSizeInBytes() const override {
-        return size_in_bytes;
-    }
-
     /// Gets the shader entries for the shader
     const ShaderEntries& GetEntries() const {
         return entries;
     }
 
-    static Shader CreateStageFromMemory(const ShaderParameters& params,
-                                        Maxwell::ShaderProgram program_type,
-                                        ProgramCode program_code, ProgramCode program_code_b);
-    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);
+    static std::unique_ptr<Shader> CreateStageFromMemory(const ShaderParameters& params,
+                                                         Maxwell::ShaderProgram program_type,
+                                                         ProgramCode program_code,
+                                                         ProgramCode program_code_b);
+    static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params,
+                                                          ProgramCode code);
 
-    static Shader CreateFromCache(const ShaderParameters& params,
-                                  const PrecompiledShader& precompiled_shader,
-                                  std::size_t size_in_bytes);
+    static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params,
+                                                   const PrecompiledShader& precompiled_shader);
 
 private:
-    explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
-                          std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                          ShaderEntries entries, ProgramSharedPtr program);
+    explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries,
+                    ProgramSharedPtr program);
 
     std::shared_ptr<VideoCommon::Shader::Registry> registry;
     ShaderEntries entries;
-    std::size_t size_in_bytes = 0;
     ProgramSharedPtr program;
     GLuint handle = 0;
 };
 
-class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
+class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> {
 public:
     explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                Core::Frontend::EmuWindow& emu_window, const Device& device);
+    ~ShaderCacheOpenGL() override;
 
     /// Loads disk cache for the current game
     void LoadDiskCache(const std::atomic_bool& stop_loading,
                        const VideoCore::DiskResourceLoadCallback& callback);
 
     /// Gets the current specified shader stage program
-    Shader GetStageProgram(Maxwell::ShaderProgram program);
+    Shader* GetStageProgram(Maxwell::ShaderProgram program);
 
     /// Gets a compute kernel in the passed address
-    Shader GetComputeKernel(GPUVAddr code_addr);
-
-protected:
-    // We do not have to flush this cache as things in it are never modified by us.
-    void FlushObjectInner(const Shader& object) override {}
+    Shader* GetComputeKernel(GPUVAddr code_addr);
 
 private:
     ProgramSharedPtr GeneratePrecompiledProgram(
@@ -132,10 +120,10 @@ private:
     ShaderDiskCacheOpenGL disk_cache;
     std::unordered_map<u64, PrecompiledShader> runtime_cache;
 
-    Shader null_shader{};
-    Shader null_kernel{};
+    std::unique_ptr<Shader> null_shader;
+    std::unique_ptr<Shader> null_kernel;
 
-    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+    std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 9cb115959..2c49aeaac 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode;
 using Tegra::Shader::IpaSampleMode;
 using Tegra::Shader::PixelImap;
 using Tegra::Shader::Register;
+using Tegra::Shader::TextureType;
 using VideoCommon::Shader::BuildTransformFeedback;
 using VideoCommon::Shader::Registry;
 
@@ -61,8 +62,8 @@ struct TextureDerivates {};
 using TextureArgument = std::pair<Type, Node>;
 using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;
 
-constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
-    static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
+constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
+constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / 4; // 4 scalars per uvec4
 
 constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
 #define ftou floatBitsToUint
@@ -402,6 +403,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
     return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
 }
 
+bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
+    const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
+    // We waste one UBO for emulation; "needed + 1 > max" cannot wrap around when max is zero
+    const u32 max_ubos = device.GetMaxUniformBuffers(stage);
+    return num_ubos + 1 > max_ubos;
+}
+
 struct GenericVaryingDescription {
     std::string name;
     u8 first_element = 0;
@@ -412,8 +420,9 @@ class GLSLDecompiler final {
 public:
     explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
                             ShaderType stage, std::string_view identifier, std::string_view suffix)
-        : device{device}, ir{ir}, registry{registry}, stage{stage},
-          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
+        : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
+          suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
+                                                      UseUnifiedUniforms(device, ir, stage)} {
         if (stage != ShaderType::Compute) {
             transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
         }
@@ -518,6 +527,9 @@ private:
         if (device.HasImageLoadFormatted()) {
             code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");
         }
+        if (device.HasTextureShadowLod()) {
+            code.AddLine("#extension GL_EXT_texture_shadow_lod : require");
+        }
         if (device.HasWarpIntrinsics()) {
             code.AddLine("#extension GL_NV_gpu_shader5 : require");
             code.AddLine("#extension GL_NV_shader_thread_group : require");
@@ -618,7 +630,9 @@ private:
                 break;
             }
         }
-        if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) {
+
+        if (stage != ShaderType::Geometry &&
+            (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) {
             if (ir.UsesLayer()) {
                 code.AddLine("int gl_Layer;");
             }
@@ -647,6 +661,16 @@ private:
         --code.scope;
         code.AddLine("}};");
         code.AddNewLine();
+
+        if (stage == ShaderType::Geometry) {
+            if (ir.UsesLayer()) {
+                code.AddLine("out int gl_Layer;");
+            }
+            if (ir.UsesViewportIndex()) {
+                code.AddLine("out int gl_ViewportIndex;");
+            }
+        }
+        code.AddNewLine();
     }
 
     void DeclareRegisters() {
@@ -834,12 +858,24 @@ private:
     }
 
     void DeclareConstantBuffers() {
+        if (use_unified_uniforms) {
+            const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
+                                static_cast<u32>(ir.GetGlobalMemory().size());
+            code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
+                         binding);
+            code.AddLine("    uint cbufs[];");
+            code.AddLine("}};");
+            code.AddNewLine();
+            return;
+        }
+
         u32 binding = device.GetBaseBindings(stage).uniform_buffer;
-        for (const auto& buffers : ir.GetConstantBuffers()) {
-            const auto index = buffers.first;
+        for (const auto& [index, info] : ir.GetConstantBuffers()) {
+            const u32 num_elements = Common::AlignUp(info.GetSize(), 16) / 16; // bytes -> uvec4s
+            const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
             code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
                          GetConstBufferBlock(index));
-            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
+            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), size);
             code.AddLine("}};");
             code.AddNewLine();
         }
@@ -877,13 +913,13 @@ private:
                     return "samplerBuffer";
                 }
                 switch (sampler.type) {
-                case Tegra::Shader::TextureType::Texture1D:
+                case TextureType::Texture1D:
                     return "sampler1D";
-                case Tegra::Shader::TextureType::Texture2D:
+                case TextureType::Texture2D:
                     return "sampler2D";
-                case Tegra::Shader::TextureType::Texture3D:
+                case TextureType::Texture3D:
                     return "sampler3D";
-                case Tegra::Shader::TextureType::TextureCube:
+                case TextureType::TextureCube:
                     return "samplerCube";
                 default:
                     UNREACHABLE();
@@ -1038,42 +1074,51 @@ private:
 
         if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
             const Node offset = cbuf->GetOffset();
+            const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
+
             if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
                 // Direct access
                 const u32 offset_imm = immediate->GetValue();
                 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
-                return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
-                                    offset_imm / (4 * 4), (offset_imm / 4) % 4),
-                        Type::Uint};
+                if (use_unified_uniforms) {
+                    return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
+                            Type::Uint};
+                } else {
+                    return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
+                                        offset_imm / (4 * 4), (offset_imm / 4) % 4),
+                            Type::Uint};
+                }
             }
 
-            if (std::holds_alternative<OperationNode>(*offset)) {
-                // Indirect access
-                const std::string final_offset = code.GenerateTemporary();
-                code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
+            // Indirect access
+            if (use_unified_uniforms) {
+                return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
+                                    Visit(offset).AsUint()),
+                        Type::Uint};
+            }
 
-                if (!device.HasComponentIndexingBug()) {
-                    return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
-                                        final_offset, final_offset),
-                            Type::Uint};
-                }
+            const std::string final_offset = code.GenerateTemporary();
+            code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
 
-                // AMD's proprietary GLSL compiler emits ill code for variable component access.
-                // To bypass this driver bug generate 4 ifs, one per each component.
-                const std::string pack = code.GenerateTemporary();
-                code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
-                             final_offset);
-
-                const std::string result = code.GenerateTemporary();
-                code.AddLine("uint {};", result);
-                for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
-                    code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
-                                 pack, GetSwizzle(swizzle));
-                }
-                return {result, Type::Uint};
+            if (!device.HasComponentIndexingBug()) {
+                return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
+                                    final_offset, final_offset),
+                        Type::Uint};
             }
 
-            UNREACHABLE_MSG("Unmanaged offset node type");
+            // AMD's proprietary GLSL compiler emits ill code for variable component access.
+            // To bypass this driver bug generate 4 ifs, one per each component.
+            const std::string pack = code.GenerateTemporary();
+            code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
+                         final_offset);
+
+            const std::string result = code.GenerateTemporary();
+            code.AddLine("uint {};", result);
+            for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
+                code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
+                             GetSwizzle(swizzle));
+            }
+            return {result, Type::Uint};
         }
 
         if (const auto gmem = std::get_if<GmemNode>(&*node)) {
@@ -1339,8 +1384,19 @@ private:
         const std::size_t count = operation.GetOperandsCount();
         const bool has_array = meta->sampler.is_array;
         const bool has_shadow = meta->sampler.is_shadow;
+        const bool workaround_lod_array_shadow_as_grad =
+            !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+             meta->sampler.type == TextureType::TextureCube);
+
+        std::string expr = "texture";
+
+        if (workaround_lod_array_shadow_as_grad) {
+            expr += "Grad";
+        } else {
+            expr += function_suffix;
+        }
 
-        std::string expr = "texture" + function_suffix;
         if (!meta->aoffi.empty()) {
             expr += "Offset";
         } else if (!meta->ptp.empty()) {
@@ -1374,6 +1430,16 @@ private:
             expr += ')';
         }
 
+        if (workaround_lod_array_shadow_as_grad) {
+            switch (meta->sampler.type) {
+            case TextureType::Texture2D:
+                return expr + ", vec2(0.0), vec2(0.0))";
+            case TextureType::TextureCube:
+                return expr + ", vec3(0.0), vec3(0.0))";
+            }
+            UNREACHABLE();
+        }
+
         for (const auto& variant : extras) {
             if (const auto argument = std::get_if<TextureArgument>(&variant)) {
                 expr += GenerateTextureArgument(*argument);
@@ -2000,8 +2066,19 @@ private:
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
 
-        std::string expr = GenerateTexture(
-            operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        std::string expr{};
+
+        if (!device.HasTextureShadowLod() && meta->sampler.is_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+             meta->sampler.type == TextureType::TextureCube)) {
+            LOG_ERROR(Render_OpenGL,
+                      "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround");
+            expr = GenerateTexture(operation, "Lod", {});
+        } else {
+            expr = GenerateTexture(operation, "Lod",
+                                   {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        }
+
         if (meta->sampler.is_shadow) {
             expr = "vec4(" + expr + ')';
         }
@@ -2710,6 +2787,7 @@ private:
     const std::string_view identifier;
     const std::string_view suffix;
     const Header header;
+    const bool use_unified_uniforms;
     std::unordered_map<u8, VaryingTFB> transform_feedback;
 
     ShaderWriter code;
@@ -2905,7 +2983,7 @@ void GLSLDecompiler::DecompileAST() {
 
 } // Anonymous namespace
 
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
+ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
     ShaderEntries entries;
     for (const auto& cbuf : ir.GetConstantBuffers()) {
         entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2926,6 +3004,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
         entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
     }
     entries.shader_length = ir.GetLength();
+    entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
     return entries;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index e8a178764..451c9689a 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -53,11 +53,13 @@ struct ShaderEntries {
     std::vector<GlobalMemoryEntry> global_memory_entries;
     std::vector<SamplerEntry> samplers;
     std::vector<ImageEntry> images;
-    u32 clip_distances{};
     std::size_t shader_length{};
+    u32 clip_distances{};
+    bool use_unified_uniforms{};
 };
 
-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
+ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                          Tegra::Engines::ShaderType stage);
 
 std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
                             const VideoCommon::Shader::Registry& registry,
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 9e95a122b..2dcc2b0eb 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap;
 
 namespace {
 
+using VideoCommon::Shader::SeparateSamplerKey;
+
 using ShaderCacheVersionHash = std::array<u8, 64>;
 
 struct ConstBufferKey {
@@ -37,18 +39,26 @@ struct ConstBufferKey {
     u32 value = 0;
 };
 
-struct BoundSamplerKey {
+struct BoundSamplerEntry { // Flat record (de)serialized by ShaderDiskCacheEntry::Load/Save
     u32 offset = 0;
     Tegra::Engines::SamplerDescriptor sampler;
 };
 
-struct BindlessSamplerKey {
+struct SeparateSamplerEntry { // Flat record (de)serialized for each separate_samplers element
+    u32 cbuf1 = 0;   // SeparateSamplerKey::buffers.first
+    u32 cbuf2 = 0;   // SeparateSamplerKey::buffers.second
+    u32 offset1 = 0; // SeparateSamplerKey::offsets.first
+    u32 offset2 = 0; // SeparateSamplerKey::offsets.second
+    Tegra::Engines::SamplerDescriptor sampler;
+};
+
+struct BindlessSamplerEntry { // Flat record (de)serialized for each bindless_samplers element
     u32 cbuf = 0;
     u32 offset = 0;
     Tegra::Engines::SamplerDescriptor sampler;
 };
 
-constexpr u32 NativeVersion = 20;
+constexpr u32 NativeVersion = 21; // 21: entries gained the separate-sampler count and array
 
 ShaderCacheVersionHash GetShaderCacheVersionHash() {
     ShaderCacheVersionHash hash{};
@@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
     u32 texture_handler_size_value;
     u32 num_keys;
     u32 num_bound_samplers;
+    u32 num_separate_samplers;
     u32 num_bindless_samplers;
     if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||
         file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||
         file.ReadArray(&texture_handler_size_value, 1) != 1 ||
         file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||
         file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 ||
+        file.ReadArray(&num_separate_samplers, 1) != 1 ||
         file.ReadArray(&num_bindless_samplers, 1) != 1) {
         return false;
     }
@@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
     }
 
     std::vector<ConstBufferKey> flat_keys(num_keys);
-    std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers);
-    std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers);
+    std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers);
+    std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers);
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers);
     if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||
         file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=
             flat_bound_samplers.size() ||
+        file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) !=
+            flat_separate_samplers.size() ||
         file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=
             flat_bindless_samplers.size()) {
         return false;
     }
-    for (const auto& key : flat_keys) {
-        keys.insert({{key.cbuf, key.offset}, key.value});
+    for (const auto& entry : flat_keys) {
+        keys.insert({{entry.cbuf, entry.offset}, entry.value});
     }
-    for (const auto& key : flat_bound_samplers) {
-        bound_samplers.emplace(key.offset, key.sampler);
+    for (const auto& entry : flat_bound_samplers) {
+        bound_samplers.emplace(entry.offset, entry.sampler);
     }
-    for (const auto& key : flat_bindless_samplers) {
-        bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+    for (const auto& entry : flat_separate_samplers) {
+        SeparateSamplerKey key;
+        key.buffers = {entry.cbuf1, entry.cbuf2};
+        key.offsets = {entry.offset1, entry.offset2};
+        separate_samplers.emplace(key, entry.sampler);
+    }
+    for (const auto& entry : flat_bindless_samplers) {
+        bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler});
     }
 
     return true;
@@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
         file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||
         file.WriteObject(static_cast<u32>(keys.size())) != 1 ||
         file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 ||
         file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {
         return false;
     }
@@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
         flat_keys.push_back(ConstBufferKey{address.first, address.second, value});
     }
 
-    std::vector<BoundSamplerKey> flat_bound_samplers;
+    std::vector<BoundSamplerEntry> flat_bound_samplers;
     flat_bound_samplers.reserve(bound_samplers.size());
     for (const auto& [address, sampler] : bound_samplers) {
-        flat_bound_samplers.push_back(BoundSamplerKey{address, sampler});
+        flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler});
+    }
+
+    std::vector<SeparateSamplerEntry> flat_separate_samplers;
+    flat_separate_samplers.reserve(separate_samplers.size());
+    for (const auto& [key, sampler] : separate_samplers) {
+        SeparateSamplerEntry entry;
+        std::tie(entry.cbuf1, entry.cbuf2) = key.buffers;
+        std::tie(entry.offset1, entry.offset2) = key.offsets;
+        entry.sampler = sampler;
+        flat_separate_samplers.push_back(entry);
     }
 
-    std::vector<BindlessSamplerKey> flat_bindless_samplers;
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers;
     flat_bindless_samplers.reserve(bindless_samplers.size());
     for (const auto& [address, sampler] : bindless_samplers) {
         flat_bindless_samplers.push_back(
-            BindlessSamplerKey{address.first, address.second, sampler});
+            BindlessSamplerEntry{address.first, address.second, sampler});
     }
 
     return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&
            file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==
                flat_bound_samplers.size() &&
+           file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) ==
+               flat_separate_samplers.size() &&
            file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==
                flat_bindless_samplers.size();
 }
@@ -179,7 +213,7 @@ ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default;
 std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() {
     // Skip games without title id
     const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0;
-    if (!Settings::values.use_disk_shader_cache || !has_title_id) {
+    if (!Settings::values.use_disk_shader_cache.GetValue() || !has_title_id) {
         return {};
     }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index d5be52e40..a79cef0e9 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry {
     VideoCommon::Shader::ComputeInfo compute_info;
     VideoCommon::Shader::KeyMap keys;
     VideoCommon::Shader::BoundSamplerMap bound_samplers;
+    VideoCommon::Shader::SeparateSamplerMap separate_samplers;
     VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
 };
 
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 6ec328c53..3655ff629 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -2,11 +2,13 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <deque>
+#include <tuple>
 #include <vector>
+
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
 MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
@@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
 
 namespace OpenGL {
 
-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent,
-                                 bool use_persistent)
+OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage)
     : buffer_size(size) {
     gl_buffer.Create();
 
@@ -29,34 +30,22 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p
         allocate_size *= 2;
     }
 
-    if (use_persistent) {
-        persistent = true;
-        coherent = prefer_coherent;
-        const GLbitfield flags =
-            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
-        glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
-        mapped_ptr = static_cast<u8*>(glMapNamedBufferRange(
-            gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
-    } else {
-        glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW);
+    static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
+    glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
+    mapped_ptr = static_cast<u8*>(
+        glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
+
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
     }
 }
 
 OGLStreamBuffer::~OGLStreamBuffer() {
-    if (persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
-    }
+    glUnmapNamedBuffer(gl_buffer.handle); // The constructor always maps the buffer persistently
     gl_buffer.Release();
 }
 
-GLuint OGLStreamBuffer::GetHandle() const {
-    return gl_buffer.handle;
-}
-
-GLsizeiptr OGLStreamBuffer::GetSize() const {
-    return buffer_size;
-}
-
 std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
     ASSERT(size <= buffer_size);
     ASSERT(alignment <= buffer_size);
@@ -68,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
 
     bool invalidate = false;
     if (buffer_pos + size > buffer_size) {
+        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
+        glInvalidateBufferData(gl_buffer.handle);
+
         buffer_pos = 0;
         invalidate = true;
-
-        if (persistent) {
-            glUnmapNamedBuffer(gl_buffer.handle);
-        }
     }
 
-    if (invalidate || !persistent) {
-        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
-        GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
-                           (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
-                           (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
-        mapped_ptr = static_cast<u8*>(
-            glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags));
-        mapped_offset = buffer_pos;
-    }
-
-    return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
+    return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate);
 }
 
 void OGLStreamBuffer::Unmap(GLsizeiptr size) {
     ASSERT(size <= mapped_size);
 
-    if (!coherent && size > 0) {
-        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size);
-    }
-
-    if (!persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
+    if (size > 0) {
+        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
     }
 
     buffer_pos += size;
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index f8383cbd4..307a67113 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -11,15 +11,13 @@
 
 namespace OpenGL {
 
+class Device;
+
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false,
-                             bool use_persistent = true);
+    explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage);
     ~OGLStreamBuffer();
 
-    GLuint GetHandle() const;
-    GLsizeiptr GetSize() const;
-
     /*
      * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
      * and the optional alignment requirement.
@@ -32,15 +30,24 @@ public:
 
     void Unmap(GLsizeiptr size);
 
+    GLuint Handle() const noexcept { // OpenGL name of the underlying buffer object
+        return gl_buffer.handle;
+    }
+
+    u64 Address() const noexcept { // GL_BUFFER_GPU_ADDRESS_NV value, or 0 if never queried
+        return gpu_address;
+    }
+
+    GLsizeiptr Size() const noexcept { // Size in bytes as passed to the constructor
+        return buffer_size;
+    }
+
 private:
     OGLBuffer gl_buffer;
 
-    bool coherent = false;
-    bool persistent = false;
-
+    GLuint64EXT gpu_address = 0;
     GLintptr buffer_pos = 0;
     GLsizeiptr buffer_size = 0;
-    GLintptr mapped_offset = 0;
     GLsizeiptr mapped_size = 0;
     u8* mapped_ptr = nullptr;
 };
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 4faa8b90c..61505879b 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -263,9 +263,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param
     target = GetTextureTarget(params.target);
     texture = CreateTexture(params, target, internal_format, texture_buffer);
     DecorateSurfaceName();
-    main_view = CreateViewInner(
-        ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels),
-        true);
+
+    u32 num_layers = 1;
+    if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+        num_layers = params.depth;
+    }
+
+    main_view =
+        CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true);
 }
 
 CachedSurface::~CachedSurface() = default;
@@ -404,8 +409,7 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr
 
 CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params,
                                      bool is_proxy)
-    : VideoCommon::ViewBase(params), surface{surface},
-      format{GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format},
+    : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format},
       target{GetTextureTarget(params.target)}, is_proxy{is_proxy} {
     if (!is_proxy) {
         main_view = CreateTextureView();
@@ -414,20 +418,23 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p
 
 CachedSurfaceView::~CachedSurfaceView() = default;
 
-void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
+void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const {
     ASSERT(params.num_levels == 1);
 
+    if (params.target == SurfaceTarget::Texture3D) {
+        if (params.num_layers > 1) {
+            ASSERT(params.base_layer == 0);
+            glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level);
+        } else {
+            glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle,
+                                   params.base_level, params.base_layer);
+        }
+        return;
+    }
+
     if (params.num_layers > 1) {
-        // Layered framebuffer attachments
         UNIMPLEMENTED_IF(params.base_layer != 0);
-
-        switch (params.target) {
-        case SurfaceTarget::Texture2DArray:
-            glFramebufferTexture(target, attachment, GetTexture(), 0);
-            break;
-        default:
-            UNIMPLEMENTED();
-        }
+        glFramebufferTexture(fb_target, attachment, GetTexture(), 0);
         return;
     }
 
@@ -435,16 +442,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
     const GLuint texture = surface.GetTexture();
     switch (surface.GetSurfaceParams().target) {
     case SurfaceTarget::Texture1D:
-        glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
+        glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level);
         break;
     case SurfaceTarget::Texture2D:
-        glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level);
+        glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level);
         break;
     case SurfaceTarget::Texture1DArray:
     case SurfaceTarget::Texture2DArray:
     case SurfaceTarget::TextureCubemap:
     case SurfaceTarget::TextureCubeArray:
-        glFramebufferTextureLayer(target, attachment, texture, params.base_level,
+        glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level,
                                   params.base_layer);
         break;
     default:
@@ -501,8 +508,13 @@ OGLTextureView CachedSurfaceView::CreateTextureView() const {
     OGLTextureView texture_view;
     texture_view.Create();
 
-    glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level,
-                  params.num_levels, params.base_layer, params.num_layers);
+    if (target == GL_TEXTURE_3D) {
+        glTextureView(texture_view.handle, target, surface.texture.handle, format,
+                      params.base_level, params.num_levels, 0, 1);
+    } else {
+        glTextureView(texture_view.handle, target, surface.texture.handle, format,
+                      params.base_level, params.num_levels, params.base_layer, params.num_layers);
+    }
     ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle);
 
     return texture_view;
@@ -545,8 +557,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
                                    const Tegra::Engines::Fermi2D::Config& copy_config) {
     const auto& src_params{src_view->GetSurfaceParams()};
     const auto& dst_params{dst_view->GetSurfaceParams()};
-    UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D);
-    UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D);
+    UNIMPLEMENTED_IF(src_params.depth != 1);
+    UNIMPLEMENTED_IF(dst_params.depth != 1);
 
     state_tracker.NotifyScissor0();
     state_tracker.NotifyFramebuffer();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 8a2ac8603..bfc4ddf5d 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -80,8 +80,10 @@ public:
     explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy);
     ~CachedSurfaceView();
 
-    /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER
-    void Attach(GLenum attachment, GLenum target) const;
+    /// @brief Attaches this texture view to the framebuffer currently bound to fb_target
+    /// @param attachment   Framebuffer attachment point to attach the view to
+    /// @param fb_target    Framebuffer target to attach to (e.g. GL_DRAW_FRAMEBUFFER)
+    void Attach(GLenum attachment, GLenum fb_target) const;
 
     GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
                       Tegra::Texture::SwizzleSource y_source,
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 994ae98eb..fe9bd4b5a 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -24,10 +24,11 @@ namespace MaxwellToGL {
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
-inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
+inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) {
     switch (attrib.type) {
-    case Maxwell::VertexAttribute::Type::UnsignedInt:
     case Maxwell::VertexAttribute::Type::UnsignedNorm:
+    case Maxwell::VertexAttribute::Type::UnsignedScaled:
+    case Maxwell::VertexAttribute::Type::UnsignedInt:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -46,12 +47,11 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_UNSIGNED_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_UNSIGNED_INT_2_10_10_10_REV;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
-    case Maxwell::VertexAttribute::Type::SignedInt:
+        break;
     case Maxwell::VertexAttribute::Type::SignedNorm:
+    case Maxwell::VertexAttribute::Type::SignedScaled:
+    case Maxwell::VertexAttribute::Type::SignedInt:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
         case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -70,10 +70,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_INT_2_10_10_10_REV;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
+        break;
     case Maxwell::VertexAttribute::Type::Float:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_16:
@@ -86,46 +84,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         case Maxwell::VertexAttribute::Size::Size_32_32_32:
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return GL_FLOAT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
-        }
-    case Maxwell::VertexAttribute::Type::UnsignedScaled:
-        switch (attrib.size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return GL_UNSIGNED_BYTE;
-        case Maxwell::VertexAttribute::Size::Size_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return GL_UNSIGNED_SHORT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
-    case Maxwell::VertexAttribute::Type::SignedScaled:
-        switch (attrib.size) {
-        case Maxwell::VertexAttribute::Size::Size_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8:
-        case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
-            return GL_BYTE;
-        case Maxwell::VertexAttribute::Size::Size_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16:
-        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
-            return GL_SHORT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
-        }
-    default:
-        LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
-        return {};
+        break;
     }
+    UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(),
+                      attrib.SizeString());
+    return {};
 }
 
 inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
@@ -137,8 +101,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
     case Maxwell::IndexFormat::UnsignedInt:
         return GL_UNSIGNED_INT;
     }
-    LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format));
-    UNREACHABLE();
+    UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format));
     return {};
 }
 
@@ -180,33 +143,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
 }
 
 inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
-                                Tegra::Texture::TextureMipmapFilter mip_filter_mode) {
+                                Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) {
     switch (filter_mode) {
-    case Tegra::Texture::TextureFilter::Linear: {
-        switch (mip_filter_mode) {
+    case Tegra::Texture::TextureFilter::Nearest: // Selects among the GL_NEAREST* enums
+        switch (mipmap_filter_mode) {
         case Tegra::Texture::TextureMipmapFilter::None:
-            return GL_LINEAR;
+            return GL_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_LINEAR_MIPMAP_NEAREST;
+            return GL_NEAREST_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_LINEAR_MIPMAP_LINEAR;
+            return GL_NEAREST_MIPMAP_LINEAR;
         }
         break;
-    }
-    case Tegra::Texture::TextureFilter::Nearest: {
-        switch (mip_filter_mode) {
+    case Tegra::Texture::TextureFilter::Linear: // Selects among the GL_LINEAR* enums
+        switch (mipmap_filter_mode) {
         case Tegra::Texture::TextureMipmapFilter::None:
-            return GL_NEAREST;
+            return GL_LINEAR;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_NEAREST_MIPMAP_NEAREST;
+            return GL_LINEAR_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_NEAREST_MIPMAP_LINEAR;
+            return GL_LINEAR_MIPMAP_LINEAR;
         }
         break;
     }
-    }
-    LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode));
-    return GL_LINEAR;
+    UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}",
+                    static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode));
+    return GL_NEAREST; // Fallback value when neither switch returned a result
 }
 
 inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
@@ -229,10 +191,15 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         } else {
             return GL_MIRROR_CLAMP_TO_EDGE;
         }
-    default:
-        LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
-        return GL_REPEAT;
+    case Tegra::Texture::WrapMode::MirrorOnceClampOGL:
+        if (GL_EXT_texture_mirror_clamp) {
+            return GL_MIRROR_CLAMP_EXT;
+        } else {
+            return GL_MIRROR_CLAMP_TO_EDGE;
+        }
     }
+    UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
+    return GL_REPEAT;
 }
 
 inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
@@ -254,8 +221,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
     case Tegra::Texture::DepthCompareFunc::Always:
         return GL_ALWAYS;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}",
-              static_cast<u32>(func));
+    UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func));
     return GL_GREATER;
 }
 
@@ -277,7 +243,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
     case Maxwell::Blend::Equation::MaxGL:
         return GL_MAX;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation));
+    UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation));
     return GL_FUNC_ADD;
 }
 
@@ -341,7 +307,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
     case Maxwell::Blend::Factor::OneMinusConstantAlphaGL:
         return GL_ONE_MINUS_CONSTANT_ALPHA;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor));
+    UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor));
     return GL_ZERO;
 }
 
@@ -361,7 +327,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) {
     case Tegra::Texture::SwizzleSource::OneFloat:
         return GL_ONE;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source));
+    UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source));
     return GL_ZERO;
 }
 
@@ -392,7 +358,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) {
     case Maxwell::ComparisonOp::AlwaysOld:
         return GL_ALWAYS;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison));
+    UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison));
     return GL_ALWAYS;
 }
 
@@ -423,7 +389,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) {
     case Maxwell::StencilOp::DecrWrapOGL:
         return GL_DECR_WRAP;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil));
+    UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil));
     return GL_KEEP;
 }
 
@@ -434,7 +400,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) {
     case Maxwell::FrontFace::CounterClockWise:
         return GL_CCW;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face));
+    UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast<u32>(front_face));
     return GL_CCW;
 }
 
@@ -447,7 +413,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) {
     case Maxwell::CullFace::FrontAndBack:
         return GL_FRONT_AND_BACK;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face));
+    UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face));
     return GL_BACK;
 }
 
@@ -486,7 +452,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) {
     case Maxwell::LogicOperation::Set:
         return GL_SET;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation));
+    UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation));
     return GL_COPY;
 }
 
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 6b489e6db..e66cdc083 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -455,8 +455,8 @@ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color
 void RendererOpenGL::InitOpenGLObjects() {
     frame_mailbox = std::make_unique<FrameMailbox>();
 
-    glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
-                 0.0f);
+    glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(),
+                 Settings::values.bg_blue.GetValue(), 0.0f);
 
     // Create shader programs
     OGLShader vertex_shader;
@@ -488,6 +488,15 @@ void RendererOpenGL::InitOpenGLObjects() {
 
     // Clear screen to black
     LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture);
+
+    // Enable unified vertex attributes and query vertex buffer address when the driver supports it
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+
+        glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
+                                         &vertex_buffer_address);
+    }
 }
 
 void RendererOpenGL::AddTelemetryFields() {
@@ -552,8 +561,8 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
 void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
     if (renderer_settings.set_background_color) {
         // Update background color before drawing
-        glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
-                     0.0f);
+        glClearColor(Settings::values.bg_red.GetValue(), Settings::values.bg_green.GetValue(),
+                     Settings::values.bg_blue.GetValue(), 0.0f);
     }
 
     // Set projection matrix
@@ -656,7 +665,13 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
                          offsetof(ScreenRectVertex, tex_coord));
     glVertexAttribBinding(PositionLocation, 0);
     glVertexAttribBinding(TexCoordLocation, 0);
-    glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex));
+        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address,
+                               sizeof(vertices));
+    } else {
+        glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    }
 
     glBindTextureUnit(0, screen_info.display_texture);
     glBindSampler(0, 0);
@@ -751,8 +766,9 @@ void RendererOpenGL::RenderScreenshot() {
 }
 
 bool RendererOpenGL::Init() {
-    if (GLAD_GL_KHR_debug) {
+    if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
         glEnable(GL_DEBUG_OUTPUT);
+        glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
         glDebugMessageCallback(DebugHandler, nullptr);
     }
 
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 61bf507f4..8b18d32e6 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -107,6 +107,9 @@ private:
     OGLPipeline pipeline;
     OGLFramebuffer screenshot_framebuffer;
 
+    /// GPU address of the vertex buffer, queried when unified memory is supported
+    GLuint64EXT vertex_buffer_address = 0;
+
     /// Display information for Switch screen
     ScreenInfo screen_info;