29 files changed, 2545 insertions, 678 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index a710c4bc5..281810357 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -9,6 +9,7 @@ add_library(video_core STATIC
     engines/maxwell_3d.h
     engines/maxwell_compute.cpp
     engines/maxwell_compute.h
+    engines/shader_bytecode.h
     gpu.cpp
     gpu.h
     macro_interpreter.cpp
@@ -27,6 +28,8 @@ add_library(video_core STATIC
     renderer_opengl/gl_shader_decompiler.h
     renderer_opengl/gl_shader_gen.cpp
     renderer_opengl/gl_shader_gen.h
+    renderer_opengl/gl_shader_manager.cpp
+    renderer_opengl/gl_shader_manager.h
     renderer_opengl/gl_shader_util.cpp
     renderer_opengl/gl_shader_util.h
     renderer_opengl/gl_state.cpp
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 2d7c3152f..2a3ff234a 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -74,8 +74,6 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
 
     regs.reg_array[method] = value;
 
-#define MAXWELL3D_REG_INDEX(field_name) (offsetof(Regs, field_name) / sizeof(u32))
-
     switch (method) {
     case MAXWELL3D_REG_INDEX(code_address.code_address_high):
     case MAXWELL3D_REG_INDEX(code_address.code_address_low): {
@@ -136,7 +134,7 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
         break;
     }
 
-#undef MAXWELL3D_REG_INDEX
+    VideoCore::g_renderer->Rasterizer()->NotifyMaxwellRegisterChanged(method);
 
     if (debug_context) {
         debug_context->OnEvent(Tegra::DebugContext::Event::MaxwellCommandProcessed, nullptr);
@@ -165,6 +163,7 @@ void Maxwell3D::ProcessQueryGet() {
 void Maxwell3D::DrawArrays() {
     LOG_DEBUG(HW_GPU, "called, topology=%d, count=%d", regs.draw.topology.Value(),
               regs.vertex_buffer.count);
+    ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
 
     auto debug_context = Core::System::GetInstance().GetGPUDebugContext();
 
@@ -176,7 +175,8 @@ void Maxwell3D::DrawArrays() {
         debug_context->OnEvent(Tegra::DebugContext::Event::FinishedPrimitiveBatch, nullptr);
     }
 
-    VideoCore::g_renderer->Rasterizer()->AccelerateDrawBatch(false /*is_indexed*/);
+    const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count};
+    VideoCore::g_renderer->Rasterizer()->AccelerateDrawBatch(is_indexed);
 }
 
 void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
@@ -218,10 +218,12 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
     Texture::TICEntry tic_entry;
     Memory::ReadBlock(tic_address_cpu, &tic_entry, sizeof(Texture::TICEntry));
 
-    ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear,
-               "TIC versions other than BlockLinear are unimplemented");
+    ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear ||
+                   tic_entry.header_version == Texture::TICHeaderVersion::Pitch,
+               "TIC versions other than BlockLinear or Pitch are unimplemented");
 
-    ASSERT_MSG(tic_entry.texture_type == Texture::TextureType::Texture2D,
+    ASSERT_MSG((tic_entry.texture_type == Texture::TextureType::Texture2D) ||
+                   (tic_entry.texture_type == Texture::TextureType::Texture2DNoMipmap),
                "Texture types other than Texture2D are unimplemented");
 
     auto r_type = tic_entry.r_type.Value();
@@ -301,5 +303,26 @@ u32 Maxwell3D::GetRegisterValue(u32 method) const {
     return regs.reg_array[method];
 }
 
+bool Maxwell3D::IsShaderStageEnabled(Regs::ShaderStage stage) const {
+    // Reads the enable bit of the shader_config slot that backs the given program.
+    const auto slot_enabled = [this](Regs::ShaderProgram program) {
+        return regs.shader_config[static_cast<size_t>(program)].enable != 0;
+    };
+
+    switch (stage) {
+    case Regs::ShaderStage::Vertex:
+        return true; // The Vertex stage is always enabled.
+    case Regs::ShaderStage::TesselationControl:
+        return slot_enabled(Regs::ShaderProgram::TesselationControl);
+    case Regs::ShaderStage::TesselationEval:
+        return slot_enabled(Regs::ShaderProgram::TesselationEval);
+    case Regs::ShaderStage::Geometry:
+        return slot_enabled(Regs::ShaderProgram::Geometry);
+    case Regs::ShaderStage::Fragment:
+        return slot_enabled(Regs::ShaderProgram::Fragment);
+    }
+    UNREACHABLE();
+}
+
 } // namespace Engines
 } // namespace Tegra
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 98b39b2ff..d4fcedace 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -20,6 +20,9 @@
 namespace Tegra {
 namespace Engines {
 
+#define MAXWELL3D_REG_INDEX(field_name)                                                            \
+    (offsetof(Tegra::Engines::Maxwell3D::Regs, field_name) / sizeof(u32)) // field offset in u32s
+
 class Maxwell3D final {
 public:
     explicit Maxwell3D(MemoryManager& memory_manager);
@@ -248,6 +251,52 @@ public:
             Patches = 0xe,
         };
 
+        enum class IndexFormat : u32 {
+            UnsignedByte = 0x0,  // FormatSizeInBytes() reports 1 for this format
+            UnsignedShort = 0x1, // FormatSizeInBytes() reports 2 for this format
+            UnsignedInt = 0x2,   // FormatSizeInBytes() reports 4 for this format
+        };
+
+        struct Blend { // Layout of the `blend` register block: 7 consecutive u32-sized fields.
+            enum class Equation : u32 { // Named encodings start at 1; 0 has no name here.
+                Add = 1,
+                Subtract = 2,
+                ReverseSubtract = 3,
+                Min = 4,
+                Max = 5,
+            };
+
+            enum class Factor : u32 { // Sparse encodings: 0x1-0xb, 0x10-0x13 and 0x61-0x64.
+                Zero = 0x1,
+                One = 0x2,
+                SourceColor = 0x3,
+                OneMinusSourceColor = 0x4,
+                SourceAlpha = 0x5,
+                OneMinusSourceAlpha = 0x6,
+                DestAlpha = 0x7,
+                OneMinusDestAlpha = 0x8,
+                DestColor = 0x9,
+                OneMinusDestColor = 0xa,
+                SourceAlphaSaturate = 0xb,
+                Source1Color = 0x10,
+                OneMinusSource1Color = 0x11,
+                Source1Alpha = 0x12,
+                OneMinusSource1Alpha = 0x13,
+                ConstantColor = 0x61,
+                OneMinusConstantColor = 0x62,
+                ConstantAlpha = 0x63,
+                OneMinusConstantAlpha = 0x64,
+            };
+
+            u32 separate_alpha; // NOTE(review): presumably non-zero enables the *_a fields - confirm
+            Equation equation_rgb;
+            Factor factor_source_rgb;
+            Factor factor_dest_rgb;
+            Equation equation_a;
+            Factor factor_source_a;
+            Factor factor_dest_a;
+        }; // 7 words; with the 0x39 padding words after `blend` this fills the former 0x40 gap.
+
         union {
             struct {
                 INSERT_PADDING_WORDS(0x200);
@@ -270,7 +319,15 @@ public:
                     }
                 } rt[NumRenderTargets];
 
-                INSERT_PADDING_WORDS(0x80);
+                struct { // One entry is 8 words: 6 value words followed by 2 padding words.
+                    f32 scale_x;
+                    f32 scale_y;
+                    f32 scale_z;
+                    u32 translate_x; // NOTE(review): u32 while scale_* are f32 - confirm the type
+                    u32 translate_y;
+                    u32 translate_z;
+                    INSERT_PADDING_WORDS(2);
+                } viewport_transform[NumViewports]; // Replaces 0x80 padding words; asserted at 0x280.
 
                 struct {
                     union {
@@ -375,7 +432,42 @@ public:
                     };
                 } draw;
 
-                INSERT_PADDING_WORDS(0x139);
+                INSERT_PADDING_WORDS(0x6B);
+
+                struct {
+                    u32 start_addr_high;
+                    u32 start_addr_low;
+                    u32 end_addr_high;
+                    u32 end_addr_low;
+                    IndexFormat format;
+                    u32 first;
+                    u32 count;
+
+                    unsigned FormatSizeInBytes() const { // bytes taken by one index of `format`
+                        switch (format) {
+                        case IndexFormat::UnsignedByte:
+                            return 1;
+                        case IndexFormat::UnsignedShort:
+                            return 2;
+                        case IndexFormat::UnsignedInt:
+                            return 4;
+                        }
+                        UNREACHABLE();
+                    }
+
+                    GPUVAddr StartAddress() const { // start_addr_high:start_addr_low as one address
+                        const auto upper_bits = static_cast<GPUVAddr>(start_addr_high);
+                        return static_cast<GPUVAddr>((upper_bits << 32) | start_addr_low);
+                    }
+
+                    GPUVAddr EndAddress() const { // end_addr_high:end_addr_low as one address
+                        const auto upper_bits = static_cast<GPUVAddr>(end_addr_high);
+                        return static_cast<GPUVAddr>((upper_bits << 32) | end_addr_low);
+                    }
+                } index_array;
+
+                INSERT_PADDING_WORDS(0xC7);
+
                 struct {
                     u32 query_address_high;
                     u32 query_address_low;
@@ -410,7 +502,9 @@ public:
                     }
                 } vertex_array[NumVertexArrays];
 
-                INSERT_PADDING_WORDS(0x40);
+                Blend blend;
+
+                INSERT_PADDING_WORDS(0x39);
 
                 struct {
                     u32 limit_high;
@@ -427,14 +521,11 @@ public:
                         BitField<0, 1, u32> enable;
                         BitField<4, 4, ShaderProgram> program;
                     };
-                    u32 start_id;
-                    INSERT_PADDING_WORDS(1);
-                    u32 gpr_alloc;
-                    ShaderStage type;
-                    INSERT_PADDING_WORDS(9);
+                    u32 offset;
+                    INSERT_PADDING_WORDS(14);
                 } shader_config[MaxShaderProgram];
 
-                INSERT_PADDING_WORDS(0x8C);
+                INSERT_PADDING_WORDS(0x80);
 
                 struct {
                     u32 cb_size;
@@ -507,6 +598,7 @@ public:
     };
 
     State state{};
+    MemoryManager& memory_manager;
 
     /// Reads a register value located at the input method address
     u32 GetRegisterValue(u32 method) const;
@@ -520,9 +612,10 @@ public:
     /// Returns a list of enabled textures for the specified shader stage.
     std::vector<Texture::FullTextureInfo> GetStageTextures(Regs::ShaderStage stage) const;
 
-private:
-    MemoryManager& memory_manager;
+    /// Returns whether the specified shader stage is enabled or not.
+    bool IsShaderStageEnabled(Regs::ShaderStage stage) const;
 
+private:
     std::unordered_map<u32, std::vector<u32>> uploaded_macros;
 
     /// Macro method that is currently being executed / being fed parameters.
@@ -564,6 +657,7 @@ private:
                   "Field " #field_name " has invalid position")
 
 ASSERT_REG_POSITION(rt, 0x200);
+ASSERT_REG_POSITION(viewport_transform[0], 0x280);
 ASSERT_REG_POSITION(viewport, 0x300);
 ASSERT_REG_POSITION(vertex_buffer, 0x35D);
 ASSERT_REG_POSITION(zeta, 0x3F8);
@@ -573,8 +667,10 @@ ASSERT_REG_POSITION(tsc, 0x557);
 ASSERT_REG_POSITION(tic, 0x55D);
 ASSERT_REG_POSITION(code_address, 0x582);
 ASSERT_REG_POSITION(draw, 0x585);
+ASSERT_REG_POSITION(index_array, 0x5F2);
 ASSERT_REG_POSITION(query, 0x6C0);
 ASSERT_REG_POSITION(vertex_array[0], 0x700);
+ASSERT_REG_POSITION(blend, 0x780);
 ASSERT_REG_POSITION(vertex_array_limit[0], 0x7C0);
 ASSERT_REG_POSITION(shader_config[0], 0x800);
 ASSERT_REG_POSITION(const_buffer, 0x8E0);
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
new file mode 100644
index 000000000..5a006aee5
--- /dev/null
+++ b/src/video_core/engines/shader_bytecode.h
@@ -0,0 +1,439 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <bitset>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+
+#include <boost/optional.hpp>
+
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+namespace Shader {
+
+struct Register {
+    // Register index 255 is special-cased: it is always 0
+    static constexpr size_t ZeroIndex{255};
+
+    constexpr Register() = default;
+
+    constexpr Register(u64 value) : value{value} {} // non-explicit: converts from a raw u64
+
+    constexpr operator u64() const { // implicit conversion back to the raw value
+        return value;
+    }
+
+    template <typename Rhs>
+    constexpr u64 operator-(const Rhs& rhs) const {
+        return value - rhs;
+    }
+
+    template <typename Rhs>
+    constexpr u64 operator&(const Rhs& rhs) const {
+        return value & rhs;
+    }
+
+    constexpr u64 operator&(const Register& rhs) const {
+        return value & rhs.value;
+    }
+
+    constexpr u64 operator~() const {
+        return ~value;
+    }
+
+private:
+    u64 value{}; // raw value; zero when default-constructed
+};
+
+union Attribute { // Every member is a view of the same u64 (assuming BitField<position, bits, T>).
+    Attribute() = default;
+
+    constexpr explicit Attribute(u64 value) : value(value) {}
+
+    enum class Index : u64 {
+        Position = 7,
+        Attribute_0 = 8,
+    };
+
+    union { // "fmt20" field positions: element at bit 22, index at bit 24, size at bit 47
+        BitField<22, 2, u64> element;
+        BitField<24, 6, Index> index;
+        BitField<47, 3, u64> size;
+    } fmt20;
+
+    union { // "fmt28": element/index sit 8 bits higher than in fmt20; there is no size field
+        BitField<30, 2, u64> element;
+        BitField<32, 6, Index> index;
+    } fmt28;
+
+    BitField<39, 8, u64> reg; // same position and width as Instruction::gpr39
+    u64 value{};              // raw word; zero when default-constructed
+};
+
+union Sampler { // Both members view the same u64 (assuming BitField<position, bits, T>).
+    Sampler() = default;
+
+    constexpr explicit Sampler(u64 value) : value(value) {}
+
+    enum class Index : u64 {
+        Sampler_0 = 8, // only named value; NOTE(review): presumably the first sampler - confirm
+    };
+
+    BitField<36, 13, Index> index; // 13-bit field starting at bit 36
+    u64 value{};                   // raw word; zero when default-constructed
+};
+
+union Uniform { // Two adjacent fields of one word (assuming BitField<position, bits, T>).
+    BitField<20, 14, u64> offset; // bits 20..33
+    BitField<34, 5, u64> index;   // bits 34..38, directly above `offset`
+};
+
+} // namespace Shader
+} // namespace Tegra
+
+namespace std {
+
+// TODO(bunnei): Specializing std::make_unsigned is forbidden by the C++ standard, but works fine. See #330.
+template <>
+struct make_unsigned<Tegra::Shader::Attribute> {
+    using type = Tegra::Shader::Attribute; // maps the type onto itself
+};
+
+template <>
+struct make_unsigned<Tegra::Shader::Register> {
+    using type = Tegra::Shader::Register; // maps the type onto itself
+};
+
+} // namespace std
+
+namespace Tegra {
+namespace Shader {
+
+enum class Pred : u64 { // Value type of Instruction::pred.full_pred (4-bit field at bit 16).
+    UnusedIndex = 0x7,
+    NeverExecute = 0xF,
+};
+
+enum class PredCondition : u64 { // Value type of Instruction::fsetp.cond (4-bit field at bit 48).
+    LessThan = 1,
+    Equal = 2,
+    LessEqual = 3,
+    GreaterThan = 4,
+    NotEqual = 5,
+    GreaterEqual = 6,
+    // TODO(Subv): Other condition types
+};
+
+enum class PredOperation : u64 { // Value type of Instruction::fsetp.op (2-bit field at bit 45).
+    And = 0,
+    Or = 1,
+    Xor = 2,
+};
+
+enum class SubOp : u64 { // Value type of Instruction::sub_op (7-bit field at bit 20).
+    Cos = 0x0,
+    Sin = 0x1,
+    Ex2 = 0x2,
+    Lg2 = 0x3,
+    Rcp = 0x4,
+    Rsq = 0x5,
+    Min = 0x8, // 0x6 and 0x7 have no named value here
+};
+
+union Instruction { // One 8-byte instruction word; every member below is a view of `value`.
+    Instruction& operator=(const Instruction& instr) { // copies only the raw word
+        value = instr.value;
+        return *this;
+    }
+
+    constexpr Instruction(u64 value) : value{value} {}
+
+    BitField<0, 8, Register> gpr0;
+    BitField<8, 8, Register> gpr8;
+    union { // full_pred (bits 16..19) overlaps pred_index (16..18) and negate_pred (bit 19)
+        BitField<16, 4, Pred> full_pred;
+        BitField<16, 3, u64> pred_index;
+    } pred;
+    BitField<19, 1, u64> negate_pred;
+    BitField<20, 8, Register> gpr20;
+    BitField<20, 7, SubOp> sub_op; // shares its starting bit with gpr20
+    BitField<28, 8, Register> gpr28;
+    BitField<39, 8, Register> gpr39;
+    BitField<48, 16, u64> opcode; // top 16 bits; the value OpCode::Decode matches against
+
+    union {
+        BitField<20, 19, u64> imm20_19;
+        BitField<20, 32, u64> imm20_32;
+        BitField<45, 1, u64> negate_b;
+        BitField<46, 1, u64> abs_a;
+        BitField<48, 1, u64> negate_a;
+        BitField<49, 1, u64> abs_b;
+        BitField<50, 1, u64> abs_d;
+        BitField<56, 1, u64> negate_imm;
+
+        float GetImm20_19() const { // float whose bit pattern is built from imm20_19/negate_imm
+            float result{};
+            u32 imm{static_cast<u32>(imm20_19)};
+            imm <<= 12;                             // the 19 bits become bits 12..30
+            imm |= negate_imm ? 0x80000000 : 0;     // bit 31 is set when negate_imm is set
+            std::memcpy(&result, &imm, sizeof(imm)); // bit copy, not a numeric conversion
+            return result;
+        }
+
+        float GetImm20_32() const { // float whose bit pattern is the 32-bit imm20_32 field
+            float result{};
+            u32 imm{static_cast<u32>(imm20_32)};
+            std::memcpy(&result, &imm, sizeof(imm)); // bit copy, not a numeric conversion
+            return result;
+        }
+    } alu;
+
+    union {
+        BitField<48, 1, u64> negate_b;
+        BitField<49, 1, u64> negate_c;
+    } ffma;
+
+    union {
+        BitField<0, 3, u64> pred0;
+        BitField<3, 3, u64> pred3;
+        BitField<7, 1, u64> abs_a;
+        BitField<39, 3, u64> pred39;
+        BitField<42, 1, u64> neg_pred;
+        BitField<43, 1, u64> neg_a;
+        BitField<44, 1, u64> abs_b;
+        BitField<45, 2, PredOperation> op;
+        BitField<47, 1, u64> ftz;
+        BitField<48, 4, PredCondition> cond;
+        BitField<56, 1, u64> neg_b;
+    } fsetp;
+
+    BitField<61, 1, u64> is_b_imm;
+    BitField<60, 1, u64> is_b_gpr;
+    BitField<59, 1, u64> is_c_gpr;
+
+    Attribute attribute; // Attribute/Uniform/Sampler views of the same word
+    Uniform uniform;
+    Sampler sampler;
+
+    u64 value; // the raw instruction word
+};
+static_assert(sizeof(Instruction) == 0x8, "Incorrect structure size");
+static_assert(std::is_standard_layout<Instruction>::value,
+              "Structure does not have standard layout");
+
+class OpCode { // Maps the 16-bit opcode field of an Instruction onto a Matcher (id, type, name).
+public:
+    enum class Id {
+        KIL,
+        LD_A,
+        ST_A,
+        TEXQ, // Texture Query
+        TEXS, // Texture Fetch with scalar/non-vec4 source/destinations
+        TLDS, // Texture Load with scalar/non-vec4 source/destinations
+        EXIT,
+        IPA,
+        FFMA_IMM, // Fused Multiply and Add
+        FFMA_CR,
+        FFMA_RC,
+        FFMA_RR,
+        FADD_C,
+        FADD_R,
+        FADD_IMM,
+        FMUL_C,
+        FMUL_R,
+        FMUL_IMM,
+        FMUL32_IMM,
+        MUFU, // Multi-Function Operator
+        RRO,  // Range Reduction Operator
+        F2F_C,
+        F2F_R,
+        F2F_IMM,
+        F2I_C,
+        F2I_R,
+        F2I_IMM,
+        I2F_C,
+        I2F_R,
+        I2F_IMM,
+        LOP32I,
+        MOV_C,
+        MOV_R,
+        MOV_IMM,
+        MOV32I,
+        SHR_C,
+        SHR_R,
+        SHR_IMM,
+        FSETP_C, // Set Predicate
+        FSETP_R,
+        FSETP_IMM,
+        ISETP_C,
+        ISETP_IMM,
+        ISETP_R,
+    };
+
+    enum class Type {
+        Trivial,
+        Arithmetic,
+        Ffma,
+        Flow,
+        Memory,
+        FloatPredicate,
+        IntegerPredicate,
+        Unknown, // not assigned to any entry of the decode table below
+    };
+
+    class Matcher { // One decode-table entry: mask/expected bit pattern plus id, type and name.
+    public:
+        Matcher(const char* const name, u16 mask, u16 expected, OpCode::Id id, OpCode::Type type)
+            : name{name}, mask{mask}, expected{expected}, id{id}, type{type} {}
+
+        const char* GetName() const {
+            return name;
+        }
+
+        u16 GetMask() const {
+            return mask;
+        }
+
+        Id GetId() const {
+            return id;
+        }
+
+        Type GetType() const {
+            return type;
+        }
+
+        /**
+         * Tests to see if the given instruction is the instruction this matcher represents.
+         * @param instruction The 16-bit opcode value to test (Decode passes Instruction::opcode)
+         * @returns true if the bits selected by the mask equal the expected pattern.
+         */
+        bool Matches(u16 instruction) const {
+            return (instruction & mask) == expected;
+        }
+
+    private:
+        const char* name; // pointer is stored as-is, not copied
+        u16 mask;
+        u16 expected;
+        Id id;
+        Type type;
+    };
+
+    static boost::optional<const Matcher&> Decode(Instruction instr) { // first match in table wins
+        static const auto table{GetDecodeTable()}; // built once, on the first call
+
+        const auto matches_instruction = [instr](const auto& matcher) {
+            return matcher.Matches(static_cast<u16>(instr.opcode));
+        };
+
+        auto iter = std::find_if(table.begin(), table.end(), matches_instruction);
+        return iter != table.end() ? boost::optional<const Matcher&>(*iter) : boost::none;
+    }
+
+private:
+    struct Detail {
+    private:
+        static constexpr size_t opcode_bitsize = 16;
+
+        /**
+         * Generates the mask and the expected value after masking from a given bitstring, which
+         * must hold at least 16 characters (index 0 is bit 15). '0' requires a zero bit and '1'
+         * requires a one bit at that position; any other character (e.g. '-') is ignored.
+         */
+        static auto GetMaskAndExpect(const char* const bitstring) {
+            u16 mask = 0, expect = 0;
+            for (size_t i = 0; i < opcode_bitsize; i++) {
+                const size_t bit_position = opcode_bitsize - i - 1;
+                switch (bitstring[i]) {
+                case '0':
+                    mask |= 1 << bit_position;
+                    break;
+                case '1':
+                    expect |= 1 << bit_position;
+                    mask |= 1 << bit_position;
+                    break;
+                default:
+                    // Ignore
+                    break;
+                }
+            }
+            return std::make_tuple(mask, expect); // NOTE(review): <tuple> is not included directly
+        }
+
+    public:
+        /// Creates a matcher that can match and parse instructions based on bitstring.
+        static auto GetMatcher(const char* const bitstring, OpCode::Id op, OpCode::Type type,
+                               const char* const name) {
+            const auto mask_expect = GetMaskAndExpect(bitstring);
+            return Matcher(name, std::get<0>(mask_expect), std::get<1>(mask_expect), op, type);
+        }
+    };
+
+    static std::vector<Matcher> GetDecodeTable() { // builds the table, most specific mask first
+        std::vector<Matcher> table = {
+#define INST(bitstring, op, type, name) Detail::GetMatcher(bitstring, op, type, name)
+            INST("111000110011----", Id::KIL, Type::Flow, "KIL"),
+            INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
+            INST("1110111111110---", Id::ST_A, Type::Memory, "ST_A"),
+            INST("1101111101001---", Id::TEXQ, Type::Memory, "TEXQ"),
+            INST("1101100---------", Id::TEXS, Type::Memory, "TEXS"),
+            INST("1101101---------", Id::TLDS, Type::Memory, "TLDS"),
+            INST("111000110000----", Id::EXIT, Type::Trivial, "EXIT"),
+            INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),
+            INST("001100101-------", Id::FFMA_IMM, Type::Ffma, "FFMA_IMM"),
+            INST("010010011-------", Id::FFMA_CR, Type::Ffma, "FFMA_CR"),
+            INST("010100011-------", Id::FFMA_RC, Type::Ffma, "FFMA_RC"),
+            INST("010110011-------", Id::FFMA_RR, Type::Ffma, "FFMA_RR"),
+            INST("0100110001011---", Id::FADD_C, Type::Arithmetic, "FADD_C"),
+            INST("0101110001011---", Id::FADD_R, Type::Arithmetic, "FADD_R"),
+            INST("0011100-01011---", Id::FADD_IMM, Type::Arithmetic, "FADD_IMM"),
+            INST("0100110001101---", Id::FMUL_C, Type::Arithmetic, "FMUL_C"),
+            INST("0101110001101---", Id::FMUL_R, Type::Arithmetic, "FMUL_R"),
+            INST("0011100-01101---", Id::FMUL_IMM, Type::Arithmetic, "FMUL_IMM"),
+            INST("00011110--------", Id::FMUL32_IMM, Type::Arithmetic, "FMUL32_IMM"),
+            INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
+            INST("0101110010010---", Id::RRO, Type::Arithmetic, "RRO"),
+            INST("0100110010101---", Id::F2F_C, Type::Arithmetic, "F2F_C"),
+            INST("0101110010101---", Id::F2F_R, Type::Arithmetic, "F2F_R"),
+            INST("0011100-10101---", Id::F2F_IMM, Type::Arithmetic, "F2F_IMM"),
+            INST("0100110010110---", Id::F2I_C, Type::Arithmetic, "F2I_C"),
+            INST("0101110010110---", Id::F2I_R, Type::Arithmetic, "F2I_R"),
+            INST("0011100-10110---", Id::F2I_IMM, Type::Arithmetic, "F2I_IMM"),
+            INST("0100110010111---", Id::I2F_C, Type::Arithmetic, "I2F_C"),
+            INST("0101110010111---", Id::I2F_R, Type::Arithmetic, "I2F_R"),
+            INST("0011100-10111---", Id::I2F_IMM, Type::Arithmetic, "I2F_IMM"),
+            INST("000001----------", Id::LOP32I, Type::Arithmetic, "LOP32I"),
+            INST("0100110010011---", Id::MOV_C, Type::Arithmetic, "MOV_C"),
+            INST("0101110010011---", Id::MOV_R, Type::Arithmetic, "MOV_R"),
+            INST("0011100-10011---", Id::MOV_IMM, Type::Arithmetic, "MOV_IMM"),
+            INST("000000010000----", Id::MOV32I, Type::Arithmetic, "MOV32I"),
+            INST("0100110000101---", Id::SHR_C, Type::Arithmetic, "SHR_C"),
+            INST("0101110000101---", Id::SHR_R, Type::Arithmetic, "SHR_R"),
+            INST("0011100-00101---", Id::SHR_IMM, Type::Arithmetic, "SHR_IMM"),
+            INST("010010111011----", Id::FSETP_C, Type::FloatPredicate, "FSETP_C"),
+            INST("010110111011----", Id::FSETP_R, Type::FloatPredicate, "FSETP_R"),
+            INST("0011011-1011----", Id::FSETP_IMM, Type::FloatPredicate, "FSETP_IMM"),
+            INST("010010110110----", Id::ISETP_C, Type::IntegerPredicate, "ISETP_C"),
+            INST("010110110110----", Id::ISETP_R, Type::IntegerPredicate, "ISETP_R"),
+            INST("0011011-0110----", Id::ISETP_IMM, Type::IntegerPredicate, "ISETP_IMM"),
+        };
+#undef INST
+        std::stable_sort(table.begin(), table.end(), [](const auto& a, const auto& b) {
+            // More mask bits == more specific matcher, so it sorts first; ties keep table order.
+            // NOTE(review): std::stable_sort/std::find_if need <algorithm>, not included here.
+            return std::bitset<16>(a.GetMask()).count() > std::bitset<16>(b.GetMask()).count();
+        });
+
+        return table;
+    }
+};
+
+} // namespace Shader
+} // namespace Tegra
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 71a8661b4..2888daedc 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -15,7 +15,10 @@ namespace Tegra {
 
 enum class RenderTargetFormat : u32 {
     NONE = 0x0,
+    RGBA16_FLOAT = 0xCA,
+    RGB10_A2_UNORM = 0xD1,
     RGBA8_UNORM = 0xD5,
+    RGBA8_SRGB = 0xD6,
 };
 
 class DebugContext;
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 35d262189..36629dd11 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -19,7 +19,7 @@ public:
     virtual void DrawArrays() = 0;
 
     /// Notify rasterizer that the specified Maxwell register has been changed
-    virtual void NotifyMaxwellRegisterChanged(u32 id) = 0;
+    virtual void NotifyMaxwellRegisterChanged(u32 method) = 0;
 
     /// Notify rasterizer that all caches should be flushed to Switch memory
     virtual void FlushAll() = 0;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index f217a265b..2d4a0d6db 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <algorithm>
 #include <memory>
 #include <string>
 #include <tuple>
@@ -13,7 +14,6 @@
 #include "common/math_util.h"
 #include "common/microprofile.h"
 #include "common/scope_exit.h"
-#include "common/vector_math.h"
 #include "core/core.h"
 #include "core/hle/kernel/process.h"
 #include "core/settings.h"
@@ -34,33 +34,7 @@ MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
 MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
 
-enum class UniformBindings : GLuint { Common, VS, FS };
-
-static void SetShaderUniformBlockBinding(GLuint shader, const char* name, UniformBindings binding,
-                                         size_t expected_size) {
-    GLuint ub_index = glGetUniformBlockIndex(shader, name);
-    if (ub_index != GL_INVALID_INDEX) {
-        GLint ub_size = 0;
-        glGetActiveUniformBlockiv(shader, ub_index, GL_UNIFORM_BLOCK_DATA_SIZE, &ub_size);
-        ASSERT_MSG(ub_size == expected_size,
-                   "Uniform block size did not match! Got %d, expected %zu",
-                   static_cast<int>(ub_size), expected_size);
-        glUniformBlockBinding(shader, ub_index, static_cast<GLuint>(binding));
-    }
-}
-
-static void SetShaderUniformBlockBindings(GLuint shader) {
-    SetShaderUniformBlockBinding(shader, "shader_data", UniformBindings::Common,
-                                 sizeof(RasterizerOpenGL::UniformData));
-    SetShaderUniformBlockBinding(shader, "vs_config", UniformBindings::VS,
-                                 sizeof(RasterizerOpenGL::VSUniformData));
-    SetShaderUniformBlockBinding(shader, "fs_config", UniformBindings::FS,
-                                 sizeof(RasterizerOpenGL::FSUniformData));
-}
-
 RasterizerOpenGL::RasterizerOpenGL() {
-    shader_dirty = true;
-
     has_ARB_buffer_storage = false;
     has_ARB_direct_state_access = false;
     has_ARB_separate_shader_objects = false;
@@ -72,6 +46,14 @@ RasterizerOpenGL::RasterizerOpenGL() {
         state.texture_units[i].sampler = texture_samplers[i].sampler.handle;
     }
 
+    // Create SSBOs
+    for (size_t stage = 0; stage < ssbos.size(); ++stage) {
+        for (size_t buffer = 0; buffer < ssbos[stage].size(); ++buffer) {
+            ssbos[stage][buffer].Create();
+            state.draw.const_buffers[stage][buffer].ssbo = ssbos[stage][buffer].handle;
+        }
+    }
+
     GLint ext_num;
     glGetIntegerv(GL_NUM_EXTENSIONS, &ext_num);
     for (GLint i = 0; i < ext_num; i++) {
@@ -88,6 +70,8 @@ RasterizerOpenGL::RasterizerOpenGL() {
         }
     }
 
+    ASSERT_MSG(has_ARB_separate_shader_objects, "has_ARB_separate_shader_objects is unsupported");
+
     // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0
     state.clip_distance[0] = true;
 
@@ -102,36 +86,30 @@ RasterizerOpenGL::RasterizerOpenGL() {
     state.draw.uniform_buffer = uniform_buffer.handle;
     state.Apply();
 
-    glBufferData(GL_UNIFORM_BUFFER, sizeof(UniformData), nullptr, GL_STATIC_DRAW);
-    glBindBufferBase(GL_UNIFORM_BUFFER, 0, uniform_buffer.handle);
-
-    uniform_block_data.dirty = true;
-
     // Create render framebuffer
     framebuffer.Create();
 
-    if (has_ARB_separate_shader_objects) {
-        hw_vao.Create();
-        hw_vao_enabled_attributes.fill(false);
+    hw_vao.Create();
+    hw_vao_enabled_attributes.fill(false);
 
-        stream_buffer = OGLStreamBuffer::MakeBuffer(has_ARB_buffer_storage, GL_ARRAY_BUFFER);
-        stream_buffer->Create(STREAM_BUFFER_SIZE, STREAM_BUFFER_SIZE / 2);
-        state.draw.vertex_buffer = stream_buffer->GetHandle();
+    stream_buffer = OGLStreamBuffer::MakeBuffer(has_ARB_buffer_storage, GL_ARRAY_BUFFER);
+    stream_buffer->Create(STREAM_BUFFER_SIZE, STREAM_BUFFER_SIZE / 2);
+    state.draw.vertex_buffer = stream_buffer->GetHandle();
 
-        pipeline.Create();
-        state.draw.program_pipeline = pipeline.handle;
-        state.draw.shader_program = 0;
-        state.draw.vertex_array = hw_vao.handle;
-        state.Apply();
+    shader_program_manager = std::make_unique<GLShader::ProgramManager>();
+    state.draw.shader_program = 0;
+    state.draw.vertex_array = hw_vao.handle;
+    state.Apply();
 
-        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, stream_buffer->GetHandle());
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, stream_buffer->GetHandle());
 
-        vs_uniform_buffer.Create();
-        glBindBuffer(GL_UNIFORM_BUFFER, vs_uniform_buffer.handle);
-        glBufferData(GL_UNIFORM_BUFFER, sizeof(VSUniformData), nullptr, GL_STREAM_COPY);
-        glBindBufferBase(GL_UNIFORM_BUFFER, 1, vs_uniform_buffer.handle);
-    } else {
-        UNREACHABLE();
+    for (unsigned index = 0; index < uniform_buffers.size(); ++index) {
+        auto& buffer = uniform_buffers[index];
+        buffer.Create();
+        glBindBuffer(GL_UNIFORM_BUFFER, buffer.handle);
+        glBufferData(GL_UNIFORM_BUFFER, sizeof(GLShader::MaxwellUniformData), nullptr,
+                     GL_STREAM_COPY);
+        glBindBufferBase(GL_UNIFORM_BUFFER, index, buffer.handle);
     }
 
     accelerate_draw = AccelDraw::Disabled;
@@ -149,17 +127,6 @@ RasterizerOpenGL::~RasterizerOpenGL() {
     }
 }
 
-void RasterizerOpenGL::AnalyzeVertexArray(bool is_indexed) {
-    const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
-
-    if (is_indexed) {
-        UNREACHABLE();
-    }
-
-    // TODO(bunnei): Add support for 1+ vertex arrays
-    vs_input_size = regs.vertex_buffer.count * regs.vertex_array[0].stride;
-}
-
 void RasterizerOpenGL::SetupVertexArray(u8* array_ptr, GLintptr buffer_offset) {
     MICROPROFILE_SCOPE(OpenGL_VAO);
     const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
@@ -171,6 +138,7 @@ void RasterizerOpenGL::SetupVertexArray(u8* array_ptr, GLintptr buffer_offset) {
 
     // TODO(bunnei): Add support for 1+ vertex arrays
     const auto& vertex_array{regs.vertex_array[0]};
+    const auto& vertex_array_limit{regs.vertex_array_limit[0]};
     ASSERT_MSG(vertex_array.enable, "vertex array 0 is disabled?");
     ASSERT_MSG(!vertex_array.divisor, "vertex array 0 divisor is unimplemented!");
     for (unsigned index = 1; index < Maxwell::NumVertexArrays; ++index) {
@@ -183,6 +151,10 @@ void RasterizerOpenGL::SetupVertexArray(u8* array_ptr, GLintptr buffer_offset) {
     // to avoid OpenGL errors.
     for (unsigned index = 0; index < 16; ++index) {
         auto& attrib = regs.vertex_attrib_format[index];
+        NGLOG_DEBUG(HW_GPU, "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}",
+                    index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(),
+                    attrib.offset.Value(), attrib.IsNormalized());
+
         glVertexAttribPointer(index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
                               attrib.IsNormalized() ? GL_TRUE : GL_FALSE, vertex_array.stride,
                               reinterpret_cast<GLvoid*>(buffer_offset + attrib.offset));
@@ -191,7 +163,7 @@ void RasterizerOpenGL::SetupVertexArray(u8* array_ptr, GLintptr buffer_offset) {
     }
 
     // Copy vertex array data
-    const u32 data_size{vertex_array.stride * regs.vertex_buffer.count};
+    const u64 data_size{vertex_array_limit.LimitAddress() - vertex_array.StartAddress() + 1};
     const VAddr data_addr{memory_manager->PhysicalToVirtualAddress(vertex_array.StartAddress())};
     res_cache.FlushRegion(data_addr, data_size, nullptr);
     Memory::ReadBlock(data_addr, array_ptr, data_size);
@@ -200,26 +172,102 @@ void RasterizerOpenGL::SetupVertexArray(u8* array_ptr, GLintptr buffer_offset) {
     buffer_offset += data_size;
 }
 
-void RasterizerOpenGL::SetupVertexShader(VSUniformData* ub_ptr, GLintptr buffer_offset) {
-    MICROPROFILE_SCOPE(OpenGL_VS);
-    LOG_CRITICAL(Render_OpenGL, "Emulated shaders are not supported! Using a passthrough shader.");
-    glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, current_shader->shader.handle);
-}
+/**
+ * For every enabled Maxwell shader stage: writes and uploads its uniform block, passes its
+ * program code to the shader program manager (only VertexB and Fragment are implemented) and
+ * configures the const buffers it uses.
+ * @param buffer_ptr Base pointer that ptr_pos indexes into when writing the uniform blocks.
+ * @param buffer_offset Offset added to ptr_pos to locate the same bytes in the stream buffer.
+ * @param ptr_pos Position, relative to buffer_ptr, where the first stage's uniform block goes.
+ */
+void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset, size_t ptr_pos) {
+    // Helper function for uploading uniform data
+    const auto copy_buffer = [&](GLuint handle, GLintptr offset, GLsizeiptr size) {
+        if (has_ARB_direct_state_access) {
+            glCopyNamedBufferSubData(stream_buffer->GetHandle(), handle, offset, 0, size);
+        } else {
+            glBindBuffer(GL_COPY_WRITE_BUFFER, handle);
+            glCopyBufferSubData(GL_ARRAY_BUFFER, GL_COPY_WRITE_BUFFER, offset, 0, size);
+        }
+    };
 
-void RasterizerOpenGL::SetupFragmentShader(FSUniformData* ub_ptr, GLintptr buffer_offset) {
-    MICROPROFILE_SCOPE(OpenGL_FS);
-    UNREACHABLE();
-}
+    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    ASSERT_MSG(!gpu.regs.shader_config[0].enable, "VertexA is unsupported!");
 
-bool RasterizerOpenGL::AccelerateDrawBatch(bool is_indexed) {
-    if (!has_ARB_separate_shader_objects) {
-        UNREACHABLE();
-        return false;
+    // Next available bindpoint to use when uploading the const buffers to the GLSL shaders.
+    u32 current_constbuffer_bindpoint = 0;
+
+    for (unsigned index = 1; index < Maxwell::MaxShaderProgram; ++index) {
+        auto& shader_config = gpu.regs.shader_config[index];
+        const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)};
+
+        // The stage index is the program index minus one
+        const size_t stage = index - 1;
+        const auto stage_id = static_cast<Maxwell::ShaderStage>(stage);
+
+        // Skip stages that are not enabled
+        if (!gpu.IsShaderStageEnabled(stage_id)) {
+            continue;
+        }
+
+        // Every stage owns one fixed slot of the uniform space, with stage 0 placed at ptr_pos.
+        // Advancing ptr_pos before its first use would leave slot 0 unused and shift every block
+        // one slot further into the buffer.
+        const size_t ubo_pos = ptr_pos + stage * sizeof(GLShader::MaxwellUniformData);
+
+        // Write the uniform data before issuing the copy that reads it out of the stream buffer.
+        // NOTE(review): DrawArrays unmaps the stream buffer only after this function returns -
+        // confirm that copying out of it while it is still mapped is valid.
+        auto* ub_ptr = reinterpret_cast<GLShader::MaxwellUniformData*>(&buffer_ptr[ubo_pos]);
+        ub_ptr->SetFromRegs(gpu.state.shader_stages[stage]);
+
+        // Upload uniform data as one UBO per stage
+        const GLintptr ubo_offset = buffer_offset + static_cast<GLintptr>(ubo_pos);
+        copy_buffer(uniform_buffers[stage].handle, ubo_offset,
+                    sizeof(GLShader::MaxwellUniformData));
+
+        // Fetch program code from memory
+        GLShader::ProgramCode program_code;
+        const u64 gpu_address{gpu.regs.code_address.CodeAddress() + shader_config.offset};
+        const VAddr cpu_address{gpu.memory_manager.PhysicalToVirtualAddress(gpu_address)};
+        Memory::ReadBlock(cpu_address, program_code.data(), program_code.size() * sizeof(u64));
+        GLShader::ShaderSetup setup{std::move(program_code)};
+
+        GLShader::ShaderEntries shader_resources;
+
+        switch (program) {
+        case Maxwell::ShaderProgram::VertexB: {
+            GLShader::MaxwellVSConfig vs_config{setup};
+            shader_resources =
+                shader_program_manager->UseProgrammableVertexShader(vs_config, setup);
+            break;
+        }
+        case Maxwell::ShaderProgram::Fragment: {
+            GLShader::MaxwellFSConfig fs_config{setup};
+            shader_resources =
+                shader_program_manager->UseProgrammableFragmentShader(fs_config, setup);
+            break;
+        }
+        default:
+            LOG_CRITICAL(HW_GPU, "Unimplemented shader index=%d, enable=%d, offset=0x%08X", index,
+                         shader_config.enable.Value(), shader_config.offset);
+            UNREACHABLE();
+        }
+
+        const GLuint gl_stage_program = shader_program_manager->GetCurrentProgramStage(stage_id);
+
+        // Configure the const buffers for this shader stage.
+        current_constbuffer_bindpoint =
+            SetupConstBuffers(stage_id, gl_stage_program, current_constbuffer_bindpoint,
+                              shader_resources.const_buffer_entries);
     }
 
+    shader_program_manager->UseTrivialGeometryShader();
+}
+
+bool RasterizerOpenGL::AccelerateDrawBatch(bool is_indexed) {
     accelerate_draw = is_indexed ? AccelDraw::Indexed : AccelDraw::Arrays;
     DrawArrays();
-
     return true;
 }
 
@@ -255,18 +290,18 @@ void RasterizerOpenGL::DrawArrays() {
                               : (depth_surface == nullptr ? 1u : depth_surface->res_scale);
 
     MathUtil::Rectangle<u32> draw_rect{
-        static_cast<u32>(MathUtil::Clamp<s32>(static_cast<s32>(surfaces_rect.left) +
-                                                  viewport_rect.left * res_scale,
-                                              surfaces_rect.left, surfaces_rect.right)), // Left
-        static_cast<u32>(MathUtil::Clamp<s32>(static_cast<s32>(surfaces_rect.bottom) +
-                                                  viewport_rect.top * res_scale,
-                                              surfaces_rect.bottom, surfaces_rect.top)), // Top
-        static_cast<u32>(MathUtil::Clamp<s32>(static_cast<s32>(surfaces_rect.left) +
-                                                  viewport_rect.right * res_scale,
-                                              surfaces_rect.left, surfaces_rect.right)), // Right
-        static_cast<u32>(MathUtil::Clamp<s32>(static_cast<s32>(surfaces_rect.bottom) +
-                                                  viewport_rect.bottom * res_scale,
-                                              surfaces_rect.bottom, surfaces_rect.top))}; // Bottom
+        static_cast<u32>(
+            std::clamp<s32>(static_cast<s32>(surfaces_rect.left) + viewport_rect.left * res_scale,
+                            surfaces_rect.left, surfaces_rect.right)), // Left
+        static_cast<u32>(
+            std::clamp<s32>(static_cast<s32>(surfaces_rect.bottom) + viewport_rect.top * res_scale,
+                            surfaces_rect.bottom, surfaces_rect.top)), // Top
+        static_cast<u32>(
+            std::clamp<s32>(static_cast<s32>(surfaces_rect.left) + viewport_rect.right * res_scale,
+                            surfaces_rect.left, surfaces_rect.right)), // Right
+        static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.bottom) +
+                                             viewport_rect.bottom * res_scale,
+                                         surfaces_rect.bottom, surfaces_rect.top))}; // Bottom
 
     // Bind the framebuffer surfaces
     BindFramebufferSurfaces(color_surface, depth_surface, has_stencil);
@@ -280,18 +315,6 @@ void RasterizerOpenGL::DrawArrays() {
     // Sync and bind the texture surfaces
     BindTextures();
 
-    // Sync and bind the shader
-    if (shader_dirty) {
-        SetShader();
-        shader_dirty = false;
-    }
-
-    // Sync the uniform data
-    if (uniform_block_data.dirty) {
-        glBufferSubData(GL_UNIFORM_BUFFER, 0, sizeof(UniformData), &uniform_block_data.data);
-        uniform_block_data.dirty = false;
-    }
-
     // Viewport can have negative offsets or larger dimensions than our framebuffer sub-rect. Enable
     // scissor test to prevent drawing outside of the framebuffer region
     state.scissor.enabled = true;
@@ -303,15 +326,22 @@ void RasterizerOpenGL::DrawArrays() {
 
     // Draw the vertex batch
     const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
-    AnalyzeVertexArray(is_indexed);
+    const u64 index_buffer_size{regs.index_array.count * regs.index_array.FormatSizeInBytes()};
+    const unsigned vertex_num{is_indexed ? regs.index_array.count : regs.vertex_buffer.count};
+
+    // TODO(bunnei): Add support for 1+ vertex arrays
+    vs_input_size = vertex_num * regs.vertex_array[0].stride;
+
     state.draw.vertex_buffer = stream_buffer->GetHandle();
     state.Apply();
 
     size_t buffer_size = static_cast<size_t>(vs_input_size);
     if (is_indexed) {
-        UNREACHABLE();
+        buffer_size = Common::AlignUp(buffer_size, 4) + index_buffer_size;
     }
-    buffer_size += sizeof(VSUniformData);
+
+    // Uniform space for the 5 shader stages
+    buffer_size += sizeof(GLShader::MaxwellUniformData) * Maxwell::MaxShaderStage;
 
     size_t ptr_pos = 0;
     u8* buffer_ptr;
@@ -322,36 +352,37 @@ void RasterizerOpenGL::DrawArrays() {
     SetupVertexArray(buffer_ptr, buffer_offset);
     ptr_pos += vs_input_size;
 
+    // If indexed mode, copy the index buffer
     GLintptr index_buffer_offset = 0;
     if (is_indexed) {
-        UNREACHABLE();
-    }
+        ptr_pos = Common::AlignUp(ptr_pos, 4);
 
-    SetupVertexShader(reinterpret_cast<VSUniformData*>(&buffer_ptr[ptr_pos]),
-                      buffer_offset + static_cast<GLintptr>(ptr_pos));
-    const GLintptr vs_ubo_offset = buffer_offset + static_cast<GLintptr>(ptr_pos);
-    ptr_pos += sizeof(VSUniformData);
+        const auto& memory_manager = Core::System().GetInstance().GPU().memory_manager;
+        const VAddr index_data_addr{
+            memory_manager->PhysicalToVirtualAddress(regs.index_array.StartAddress())};
+        Memory::ReadBlock(index_data_addr, &buffer_ptr[ptr_pos], index_buffer_size);
 
-    stream_buffer->Unmap();
+        index_buffer_offset = buffer_offset + static_cast<GLintptr>(ptr_pos);
+        ptr_pos += index_buffer_size;
+    }
 
-    const auto copy_buffer = [&](GLuint handle, GLintptr offset, GLsizeiptr size) {
-        if (has_ARB_direct_state_access) {
-            glCopyNamedBufferSubData(stream_buffer->GetHandle(), handle, offset, 0, size);
-        } else {
-            glBindBuffer(GL_COPY_WRITE_BUFFER, handle);
-            glCopyBufferSubData(GL_ARRAY_BUFFER, GL_COPY_WRITE_BUFFER, offset, 0, size);
-        }
-    };
+    SetupShaders(buffer_ptr, buffer_offset, ptr_pos);
 
-    copy_buffer(vs_uniform_buffer.handle, vs_ubo_offset, sizeof(VSUniformData));
+    stream_buffer->Unmap();
 
-    glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, current_shader->shader.handle);
+    shader_program_manager->ApplyTo(state);
+    state.Apply();
 
+    const GLenum primitive_mode{MaxwellToGL::PrimitiveTopology(regs.draw.topology)};
     if (is_indexed) {
-        UNREACHABLE();
+        const GLint index_min{static_cast<GLint>(regs.index_array.first)};
+        const GLint index_max{static_cast<GLint>(regs.index_array.first + regs.index_array.count)};
+        glDrawRangeElementsBaseVertex(primitive_mode, index_min, index_max, regs.index_array.count,
+                                      MaxwellToGL::IndexFormat(regs.index_array.format),
+                                      reinterpret_cast<const void*>(index_buffer_offset),
+                                      -index_min);
     } else {
-        glDrawArrays(MaxwellToGL::PrimitiveTopology(regs.draw.topology), 0,
-                     regs.vertex_buffer.count);
+        glDrawArrays(primitive_mode, 0, regs.vertex_buffer.count);
     }
 
     // Disable scissor test
@@ -384,7 +415,7 @@ void RasterizerOpenGL::DrawArrays() {
 
 void RasterizerOpenGL::BindTextures() {
     using Regs = Tegra::Engines::Maxwell3D::Regs;
-    auto maxwell3d = Core::System::GetInstance().GPU().Get3DEngine();
+    auto& maxwell3d = Core::System::GetInstance().GPU().Get3DEngine();
 
     // Each Maxwell shader stage can have an arbitrary number of textures, but we're limited to a
     // certain number in OpenGL. We try to only use the minimum amount of host textures by not
@@ -415,7 +446,37 @@ void RasterizerOpenGL::BindTextures() {
     }
 }
 
-void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 id) {}
+/// Mirrors a write to one of the Maxwell blend registers into the cached OpenGL blend state.
+/// Writes to registers that are not listed below leave the state untouched.
+/// @param method Index of the register that was written (see MAXWELL3D_REG_INDEX).
+void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 method) {
+    // GetInstance() is called on the type, as BindTextures does, instead of on a temporary
+    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    switch (method) {
+    case MAXWELL3D_REG_INDEX(blend.separate_alpha):
+        // Not handled yet: any write to this register trips the assert
+        ASSERT_MSG(false, "unimplemented");
+        break;
+    case MAXWELL3D_REG_INDEX(blend.equation_rgb):
+        state.blend.rgb_equation = MaxwellToGL::BlendEquation(regs.blend.equation_rgb);
+        break;
+    case MAXWELL3D_REG_INDEX(blend.factor_source_rgb):
+        state.blend.src_rgb_func = MaxwellToGL::BlendFunc(regs.blend.factor_source_rgb);
+        break;
+    case MAXWELL3D_REG_INDEX(blend.factor_dest_rgb):
+        state.blend.dst_rgb_func = MaxwellToGL::BlendFunc(regs.blend.factor_dest_rgb);
+        break;
+    case MAXWELL3D_REG_INDEX(blend.equation_a):
+        state.blend.a_equation = MaxwellToGL::BlendEquation(regs.blend.equation_a);
+        break;
+    case MAXWELL3D_REG_INDEX(blend.factor_source_a):
+        state.blend.src_a_func = MaxwellToGL::BlendFunc(regs.blend.factor_source_a);
+        break;
+    case MAXWELL3D_REG_INDEX(blend.factor_dest_a):
+        state.blend.dst_a_func = MaxwellToGL::BlendFunc(regs.blend.factor_dest_a);
+        break;
+    }
+}
 
 void RasterizerOpenGL::FlushAll() {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
@@ -467,9 +523,12 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& framebu
     src_params.width = std::min(framebuffer.width, pixel_stride);
     src_params.height = framebuffer.height;
     src_params.stride = pixel_stride;
-    src_params.is_tiled = false;
+    src_params.is_tiled = true;
+    src_params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight;
     src_params.pixel_format =
         SurfaceParams::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format);
+    src_params.component_type =
+        SurfaceParams::ComponentTypeFromGPUPixelFormat(framebuffer.pixel_format);
     src_params.UpdateParams();
 
     MathUtil::Rectangle<u32> src_rect;
@@ -531,70 +590,58 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr
     }
 }
 
-void RasterizerOpenGL::SetShader() {
-    // TODO(bunnei): The below sets up a static test shader for passing untransformed vertices to
-    // OpenGL for rendering. This should be removed/replaced when we start emulating Maxwell
-    // shaders.
-
-    static constexpr char vertex_shader[] = R"(
-#version 150 core
-
-in vec2 vert_position;
-in vec2 vert_tex_coord;
-out vec2 frag_tex_coord;
-
-void main() {
-    // Multiply input position by the rotscale part of the matrix and then manually translate by
-    // the last column. This is equivalent to using a full 3x3 matrix and expanding the vector
-    // to `vec3(vert_position.xy, 1.0)`
-    gl_Position = vec4(mat2(mat3x2(0.0015625f, 0.0, 0.0, -0.0027778, -1.0, 1.0)) * vert_position + mat3x2(0.0015625f, 0.0, 0.0, -0.0027778, -1.0, 1.0)[2], 0.0, 1.0);
-    frag_tex_coord = vert_tex_coord;
-}
-)";
-
-    static constexpr char fragment_shader[] = R"(
-#version 150 core
-
-in vec2 frag_tex_coord;
-out vec4 color;
-
-uniform sampler2D tex[32];
+// Resets the const buffer draw state of the given stage, uploads every buffer listed in entries
+// to its SSBO and gives them consecutive bindpoints; returns the first bindpoint left unused.
+u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, GLuint program,
+                                        u32 current_bindpoint,
+                                        const std::vector<GLShader::ConstBufferEntry>& entries) {
+    auto& gpu = Core::System::GetInstance().GPU();
+    auto& maxwell3d = gpu.Get3DEngine();
 
-void main() {
-    color = texture(tex[0], frag_tex_coord);
-}
-)";
+    ASSERT_MSG(maxwell3d.IsShaderStageEnabled(stage),
+               "Attempted to upload constbuffer of disabled shader stage");
 
-    if (current_shader) {
-        return;
+    // Reset all buffer draw state for this stage.
+    for (auto& buffer : state.draw.const_buffers[static_cast<size_t>(stage)]) {
+        buffer.bindpoint = 0;
+        buffer.enabled = false;
     }
 
-    LOG_CRITICAL(Render_OpenGL, "Emulated shaders are not supported! Using a passthrough shader.");
-
-    current_shader = &test_shader;
-    if (has_ARB_separate_shader_objects) {
-        test_shader.shader.Create(vertex_shader, nullptr, fragment_shader, {}, true);
-        glActiveShaderProgram(pipeline.handle, test_shader.shader.handle);
-    } else {
-        UNREACHABLE();
+    // Upload only the enabled buffers from the 16 constbuffers of each shader stage
+    auto& shader_stage = maxwell3d.state.shader_stages[static_cast<size_t>(stage)];
+
+    for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
+        const auto& used_buffer = entries[bindpoint];
+        const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()];
+        auto& buffer_draw_state =
+            state.draw.const_buffers[static_cast<size_t>(stage)][used_buffer.GetIndex()];
+
+        ASSERT_MSG(buffer.enabled, "Attempted to upload disabled constbuffer");
+        buffer_draw_state.enabled = true;
+        buffer_draw_state.bindpoint = current_bindpoint + bindpoint;
+
+        // Stage the guest data in host memory, then hand it to the SSBO of this const buffer
+        VAddr addr = gpu.memory_manager->PhysicalToVirtualAddress(buffer.address);
+        std::vector<u8> data(used_buffer.GetSize() * sizeof(float));
+        Memory::ReadBlock(addr, data.data(), data.size());
+
+        glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer_draw_state.ssbo);
+        glBufferData(GL_SHADER_STORAGE_BUFFER, data.size(), data.data(), GL_DYNAMIC_DRAW);
+        glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+
+        // Now configure the bindpoint of the buffer inside the shader
+        std::string buffer_name = used_buffer.GetName();
+        GLuint index =
+            glGetProgramResourceIndex(program, GL_SHADER_STORAGE_BLOCK, buffer_name.c_str());
+        // The index is unsigned; a block that is not found is reported as GL_INVALID_INDEX
+        if (index != GL_INVALID_INDEX) {
+            glShaderStorageBlockBinding(program, index, buffer_draw_state.bindpoint);
+        }
     }
 
-    state.draw.shader_program = test_shader.shader.handle;
     state.Apply();
 
-    for (u32 texture = 0; texture < texture_samplers.size(); ++texture) {
-        // Set the texture samplers to correspond to different texture units
-        std::string uniform_name = "tex[" + std::to_string(texture) + "]";
-        GLint uniform_tex = glGetUniformLocation(test_shader.shader.handle, uniform_name.c_str());
-        if (uniform_tex != -1) {
-            glUniform1i(uniform_tex, TextureUnits::MaxwellTexture(texture).id);
-        }
-    }
-
-    if (has_ARB_separate_shader_objects) {
-        state.draw.shader_program = 0;
-        state.Apply();
-    }
+    return current_bindpoint + static_cast<u32>(entries.size());
 }
 
 void RasterizerOpenGL::BindFramebufferSurfaces(const Surface& color_surface,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index d868bf421..03e02b52a 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -6,19 +6,16 @@
 
 #include <array>
 #include <cstddef>
-#include <cstring>
 #include <memory>
-#include <unordered_map>
 #include <vector>
 #include <glad/glad.h>
-#include "common/bit_field.h"
 #include "common/common_types.h"
-#include "common/hash.h"
-#include "common/vector_math.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"
+#include "video_core/renderer_opengl/gl_shader_manager.h"
 #include "video_core/renderer_opengl/gl_state.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
@@ -30,7 +27,7 @@ public:
     ~RasterizerOpenGL() override;
 
     void DrawArrays() override;
-    void NotifyMaxwellRegisterChanged(u32 id) override;
+    void NotifyMaxwellRegisterChanged(u32 method) override;
     void FlushAll() override;
     void FlushRegion(VAddr addr, u64 size) override;
     void InvalidateRegion(VAddr addr, u64 size) override;
@@ -45,7 +42,7 @@ public:
     /// OpenGL shader generated for a given Maxwell register state
     struct MaxwellShader {
         /// OpenGL shader resource
-        OGLShader shader;
+        OGLProgram shader;
     };
 
     struct VertexShader {
@@ -56,34 +53,6 @@ public:
         OGLShader shader;
     };
 
-    /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
-    // NOTE: Always keep a vec4 at the end. The GL spec is not clear wether the alignment at
-    //       the end of a uniform block is included in UNIFORM_BLOCK_DATA_SIZE or not.
-    //       Not following that rule will cause problems on some AMD drivers.
-    struct UniformData {};
-
-    // static_assert(
-    //    sizeof(UniformData) == 0x460,
-    //    "The size of the UniformData structure has changed, update the structure in the shader");
-    static_assert(sizeof(UniformData) < 16384,
-                  "UniformData structure must be less than 16kb as per the OpenGL spec");
-
-    struct VSUniformData {};
-    // static_assert(
-    //    sizeof(VSUniformData) == 1856,
-    //    "The size of the VSUniformData structure has changed, update the structure in the
-    //    shader");
-    static_assert(sizeof(VSUniformData) < 16384,
-                  "VSUniformData structure must be less than 16kb as per the OpenGL spec");
-
-    struct FSUniformData {};
-    // static_assert(
-    //    sizeof(FSUniformData) == 1856,
-    //    "The size of the FSUniformData structure has changed, update the structure in the
-    //    shader");
-    static_assert(sizeof(FSUniformData) < 16384,
-                  "FSUniformData structure must be less than 16kb as per the OpenGL spec");
-
 private:
     class SamplerInfo {
     public:
@@ -113,6 +82,18 @@ private:
     /// Binds the required textures to OpenGL before drawing a batch.
     void BindTextures();
 
+    /**
+     * Configures the current constbuffers to use for the draw command.
+     * @param stage The shader stage to configure buffers for.
+     * @param program The OpenGL program object that contains the specified stage.
+     * @param current_bindpoint The offset at which to start counting new buffer bindpoints.
+     * @param entries Vector describing the buffers that are actually used in the guest shader.
+     * @returns The next available bindpoint for use in the next shader stage.
+     */
+    u32 SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, GLuint program,
+                          u32 current_bindpoint,
+                          const std::vector<GLShader::ConstBufferEntry>& entries);
+
     /// Syncs the viewport to match the guest state
     void SyncViewport(const MathUtil::Rectangle<u32>& surfaces_rect, u16 res_scale);
 
@@ -122,9 +103,6 @@ private:
     /// Syncs the clip coefficients to match the guest state
     void SyncClipCoef();
 
-    /// Sets the OpenGL shader in accordance with the current guest state
-    void SetShader();
-
     /// Syncs the cull mode to match the guest state
     void SyncCullMode();
 
@@ -152,23 +130,16 @@ private:
 
     RasterizerCacheOpenGL res_cache;
 
-    /// Shader used for test renderering - to be removed once we have emulated shaders
-    MaxwellShader test_shader{};
-
-    const MaxwellShader* current_shader{};
-    bool shader_dirty{};
-
-    struct {
-        UniformData data;
-        bool dirty;
-    } uniform_block_data = {};
-
-    OGLPipeline pipeline;
+    std::unique_ptr<GLShader::ProgramManager> shader_program_manager;
     OGLVertexArray sw_vao;
     OGLVertexArray hw_vao;
     std::array<bool, 16> hw_vao_enabled_attributes;
 
-    std::array<SamplerInfo, 32> texture_samplers;
+    std::array<SamplerInfo, GLShader::NumTextureSamplers> texture_samplers;
+    std::array<std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers>,
+               Tegra::Engines::Maxwell3D::Regs::MaxShaderStage>
+        ssbos;
+
     static constexpr size_t VERTEX_BUFFER_SIZE = 128 * 1024 * 1024;
     std::unique_ptr<OGLStreamBuffer> vertex_buffer;
     OGLBuffer uniform_buffer;
@@ -179,22 +150,11 @@ private:
 
     GLsizeiptr vs_input_size;
 
-    void AnalyzeVertexArray(bool is_indexed);
     void SetupVertexArray(u8* array_ptr, GLintptr buffer_offset);
 
-    OGLBuffer vs_uniform_buffer;
-    std::unordered_map<GLShader::MaxwellVSConfig, VertexShader*> vs_shader_map;
-    std::unordered_map<std::string, VertexShader> vs_shader_cache;
-    OGLShader vs_default_shader;
-
-    void SetupVertexShader(VSUniformData* ub_ptr, GLintptr buffer_offset);
-
-    OGLBuffer fs_uniform_buffer;
-    std::unordered_map<GLShader::MaxwellFSConfig, FragmentShader*> fs_shader_map;
-    std::unordered_map<std::string, FragmentShader> fs_shader_cache;
-    OGLShader fs_default_shader;
+    std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::MaxShaderStage> uniform_buffers;
 
-    void SetupFragmentShader(FSUniformData* ub_ptr, GLintptr buffer_offset);
+    void SetupShaders(u8* buffer_ptr, GLintptr buffer_offset, size_t ptr_pos);
 
     enum class AccelDraw { Disabled, Arrays, Indexed };
     AccelDraw accelerate_draw;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 5cbafa2e7..ced2b8247 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -7,7 +7,6 @@
 #include <cstring>
 #include <iterator>
 #include <memory>
-#include <unordered_set>
 #include <utility>
 #include <vector>
 #include <boost/optional.hpp>
@@ -20,7 +19,6 @@
 #include "common/math_util.h"
 #include "common/microprofile.h"
 #include "common/scope_exit.h"
-#include "common/vector_math.h"
 #include "core/core.h"
 #include "core/frontend/emu_window.h"
 #include "core/hle/kernel/process.h"
@@ -36,6 +34,7 @@
 
 using SurfaceType = SurfaceParams::SurfaceType;
 using PixelFormat = SurfaceParams::PixelFormat;
+using ComponentType = SurfaceParams::ComponentType;
 
 struct FormatTuple {
     GLint internal_format;
@@ -47,26 +46,24 @@ struct FormatTuple {
     u32 compression_factor;
 };
 
-static constexpr std::array<FormatTuple, 1> fb_format_tuples = {{
-    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, false, 1}, // RGBA8
+static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_format_tuples = {{
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, false, 1},                     // ABGR8
+    {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, false, 1},                        // B5G6R5
+    {GL_COMPRESSED_RGB_S3TC_DXT1_EXT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, true, 16},   // DXT1
+    {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, true, 16}, // DXT23
+    {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, true, 16}, // DXT45
 }};
 
-static constexpr std::array<FormatTuple, 2> tex_format_tuples = {{
-    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, false, 1},                       // RGBA8
-    {GL_COMPRESSED_RGB_S3TC_DXT1_EXT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, true, 16}, // DXT1
-}};
-
-static const FormatTuple& GetFormatTuple(PixelFormat pixel_format) {
+static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) {
     const SurfaceType type = SurfaceParams::GetFormatType(pixel_format);
-    if (type == SurfaceType::Color) {
-        ASSERT(static_cast<size_t>(pixel_format) < fb_format_tuples.size());
-        return fb_format_tuples[static_cast<unsigned int>(pixel_format)];
+    if (type == SurfaceType::ColorTexture) {
+        ASSERT(static_cast<size_t>(pixel_format) < tex_format_tuples.size());
+        // For now only UNORM components are supported
+        ASSERT(component_type == ComponentType::UNorm);
+        return tex_format_tuples[static_cast<unsigned int>(pixel_format)];
     } else if (type == SurfaceType::Depth || type == SurfaceType::DepthStencil) {
         // TODO(Subv): Implement depth formats
         ASSERT_MSG(false, "Unimplemented");
-    } else if (type == SurfaceType::Texture) {
-        ASSERT(static_cast<size_t>(pixel_format) < tex_format_tuples.size());
-        return tex_format_tuples[static_cast<unsigned int>(pixel_format)];
     }
 
     UNREACHABLE();
@@ -85,56 +82,42 @@ static u16 GetResolutionScaleFactor() {
 }
 
 template <bool morton_to_gl, PixelFormat format>
-static void MortonCopyTile(u32 stride, u8* tile_buffer, u8* gl_buffer) {
-    constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(format) / 8;
-    constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(format);
-    for (u32 y = 0; y < 8; ++y) {
-        for (u32 x = 0; x < 8; ++x) {
-            u8* tile_ptr = tile_buffer + VideoCore::MortonInterleave(x, y) * bytes_per_pixel;
-            u8* gl_ptr = gl_buffer + ((7 - y) * stride + x) * gl_bytes_per_pixel;
-            if (morton_to_gl) {
-                std::memcpy(gl_ptr, tile_ptr, bytes_per_pixel);
-            } else {
-                std::memcpy(tile_ptr, gl_ptr, bytes_per_pixel);
-            }
-        }
-    }
-}
-
-template <bool morton_to_gl, PixelFormat format>
-void MortonCopy(u32 stride, u32 height, u8* gl_buffer, VAddr base, VAddr start, VAddr end) {
+void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, VAddr base, VAddr start,
+                VAddr end) {
     constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(format) / 8;
     constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(format);
 
-    // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should check the
-    // configuration for this and perform more generic un/swizzle
-    LOG_WARNING(Render_OpenGL, "need to use correct swizzle/GOB parameters!");
-    VideoCore::MortonCopyPixels128(stride, height, bytes_per_pixel, gl_bytes_per_pixel,
-                                   Memory::GetPointer(base), gl_buffer, morton_to_gl);
-}
-
-template <>
-void MortonCopy<true, PixelFormat::DXT1>(u32 stride, u32 height, u8* gl_buffer, VAddr base,
-                                         VAddr start, VAddr end) {
-    constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(PixelFormat::DXT1) / 8;
-    constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(PixelFormat::DXT1);
-
-    // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should check the
-    // configuration for this and perform more generic un/swizzle
-    LOG_WARNING(Render_OpenGL, "need to use correct swizzle/GOB parameters!");
-    auto data =
-        Tegra::Texture::UnswizzleTexture(base, Tegra::Texture::TextureFormat::DXT1, stride, height);
-    std::memcpy(gl_buffer, data.data(), data.size());
+    if (morton_to_gl) {
+        auto data = Tegra::Texture::UnswizzleTexture(
+            base, SurfaceParams::TextureFormatFromPixelFormat(format), stride, height,
+            block_height);
+        std::memcpy(gl_buffer, data.data(), data.size());
+    } else {
+        // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should check
+        // the configuration for this and perform more generic un/swizzle
+        LOG_WARNING(Render_OpenGL, "need to use correct swizzle/GOB parameters!");
+        VideoCore::MortonCopyPixels128(stride, height, bytes_per_pixel, gl_bytes_per_pixel,
+                                       Memory::GetPointer(base), gl_buffer, morton_to_gl);
+    }
 }
 
-static constexpr std::array<void (*)(u32, u32, u8*, VAddr, VAddr, VAddr), 2> morton_to_gl_fns = {
-    MortonCopy<true, PixelFormat::RGBA8>,
-    MortonCopy<true, PixelFormat::DXT1>,
+static constexpr std::array<void (*)(u32, u32, u32, u8*, VAddr, VAddr, VAddr),
+                            SurfaceParams::MaxPixelFormat>
+    morton_to_gl_fns = {
+        MortonCopy<true, PixelFormat::ABGR8>, MortonCopy<true, PixelFormat::B5G6R5>,
+        MortonCopy<true, PixelFormat::DXT1>,  MortonCopy<true, PixelFormat::DXT23>,
+        MortonCopy<true, PixelFormat::DXT45>,
 };
 
-static constexpr std::array<void (*)(u32, u32, u8*, VAddr, VAddr, VAddr), 2> gl_to_morton_fns = {
-    MortonCopy<false, PixelFormat::RGBA8>,
-    MortonCopy<false, PixelFormat::DXT1>,
+static constexpr std::array<void (*)(u32, u32, u32, u8*, VAddr, VAddr, VAddr),
+                            SurfaceParams::MaxPixelFormat>
+    gl_to_morton_fns = {
+        MortonCopy<false, PixelFormat::ABGR8>,
+        MortonCopy<false, PixelFormat::B5G6R5>,
+        // TODO(Subv): Swizzling the DXT1/DXT23/DXT45 formats is not yet supported
+        nullptr,
+        nullptr,
+        nullptr,
 };
 
 // Allocate an uninitialized texture of appropriate size and format for the surface
@@ -183,7 +166,7 @@ static bool BlitTextures(GLuint src_tex, const MathUtil::Rectangle<u32>& src_rec
 
     u32 buffers = 0;
 
-    if (type == SurfaceType::Color || type == SurfaceType::Texture) {
+    if (type == SurfaceType::ColorTexture) {
         glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, src_tex,
                                0);
         glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
@@ -311,15 +294,18 @@ MathUtil::Rectangle<u32> SurfaceParams::GetScaledSubRect(const SurfaceParams& su
 
 bool SurfaceParams::ExactMatch(const SurfaceParams& other_surface) const {
     return std::tie(other_surface.addr, other_surface.width, other_surface.height,
-                    other_surface.stride, other_surface.pixel_format, other_surface.is_tiled) ==
-               std::tie(addr, width, height, stride, pixel_format, is_tiled) &&
+                    other_surface.stride, other_surface.block_height, other_surface.pixel_format,
+                    other_surface.component_type,
+                    other_surface.is_tiled) == std::tie(addr, width, height, stride, block_height,
+                                                        pixel_format, component_type, is_tiled) &&
            pixel_format != PixelFormat::Invalid;
 }
 
 bool SurfaceParams::CanSubRect(const SurfaceParams& sub_surface) const {
     return sub_surface.addr >= addr && sub_surface.end <= end &&
            sub_surface.pixel_format == pixel_format && pixel_format != PixelFormat::Invalid &&
-           sub_surface.is_tiled == is_tiled &&
+           sub_surface.is_tiled == is_tiled && sub_surface.block_height == block_height &&
+           sub_surface.component_type == component_type &&
            (sub_surface.addr - addr) % BytesInPixels(is_tiled ? 64 : 1) == 0 &&
            (sub_surface.stride == stride || sub_surface.height <= (is_tiled ? 8u : 1u)) &&
            GetSubRect(sub_surface).left + sub_surface.width <= stride;
@@ -328,7 +314,8 @@ bool SurfaceParams::CanSubRect(const SurfaceParams& sub_surface) const {
 bool SurfaceParams::CanExpand(const SurfaceParams& expanded_surface) const {
     return pixel_format != PixelFormat::Invalid && pixel_format == expanded_surface.pixel_format &&
            addr <= expanded_surface.end && expanded_surface.addr <= end &&
-           is_tiled == expanded_surface.is_tiled && stride == expanded_surface.stride &&
+           is_tiled == expanded_surface.is_tiled && block_height == expanded_surface.block_height &&
+           component_type == expanded_surface.component_type && stride == expanded_surface.stride &&
            (std::max(expanded_surface.addr, addr) - std::min(expanded_surface.addr, addr)) %
                    BytesInPixels(stride * (is_tiled ? 8 : 1)) ==
                0;
@@ -339,6 +326,10 @@ bool SurfaceParams::CanTexCopy(const SurfaceParams& texcopy_params) const {
         end < texcopy_params.end) {
         return false;
     }
+    if (texcopy_params.block_height != block_height ||
+        texcopy_params.component_type != component_type)
+        return false;
+
     if (texcopy_params.width != texcopy_params.stride) {
         const u32 tile_stride = static_cast<u32>(BytesInPixels(stride * (is_tiled ? 8 : 1)));
         return (texcopy_params.addr - addr) % BytesInPixels(is_tiled ? 64 : 1) == 0 &&
@@ -481,18 +472,13 @@ void CachedSurface::LoadGLBuffer(VAddr load_start, VAddr load_end) {
     const u64 start_offset = load_start - addr;
 
     if (!is_tiled) {
-        ASSERT(type == SurfaceType::Color);
         const u32 bytes_per_pixel{GetFormatBpp() >> 3};
 
-        // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should check
-        // the configuration for this and perform more generic un/swizzle
-        LOG_WARNING(Render_OpenGL, "need to use correct swizzle/GOB parameters!");
-        VideoCore::MortonCopyPixels128(width, height, bytes_per_pixel, 4,
-                                       texture_src_data + start_offset, &gl_buffer[start_offset],
-                                       true);
+        std::memcpy(&gl_buffer[start_offset], texture_src_data + start_offset,
+                    bytes_per_pixel * width * height);
     } else {
-        morton_to_gl_fns[static_cast<size_t>(pixel_format)](stride, height, &gl_buffer[0], addr,
-                                                            load_start, load_end);
+        morton_to_gl_fns[static_cast<size_t>(pixel_format)](
+            stride, block_height, height, &gl_buffer[0], addr, load_start, load_end);
     }
 }
 
@@ -533,11 +519,10 @@ void CachedSurface::FlushGLBuffer(VAddr flush_start, VAddr flush_end) {
         if (backup_bytes)
             std::memcpy(&dst_buffer[coarse_start_offset], &backup_data[0], backup_bytes);
     } else if (!is_tiled) {
-        ASSERT(type == SurfaceType::Color);
         std::memcpy(dst_buffer + start_offset, &gl_buffer[start_offset], flush_end - flush_start);
     } else {
-        gl_to_morton_fns[static_cast<size_t>(pixel_format)](stride, height, &gl_buffer[0], addr,
-                                                            flush_start, flush_end);
+        gl_to_morton_fns[static_cast<size_t>(pixel_format)](
+            stride, block_height, height, &gl_buffer[0], addr, flush_start, flush_end);
     }
 }
 
@@ -556,7 +541,7 @@ void CachedSurface::UploadGLTexture(const MathUtil::Rectangle<u32>& rect, GLuint
     GLint y0 = static_cast<GLint>(rect.bottom);
     size_t buffer_offset = (y0 * stride + x0) * GetGLBytesPerPixel(pixel_format);
 
-    const FormatTuple& tuple = GetFormatTuple(pixel_format);
+    const FormatTuple& tuple = GetFormatTuple(pixel_format, component_type);
     GLuint target_tex = texture.handle;
 
     // If not 1x scale, create 1x texture that we will blit from to replace texture subrect in
@@ -629,7 +614,7 @@ void CachedSurface::DownloadGLTexture(const MathUtil::Rectangle<u32>& rect, GLui
     OpenGLState prev_state = state;
     SCOPE_EXIT({ prev_state.Apply(); });
 
-    const FormatTuple& tuple = GetFormatTuple(pixel_format);
+    const FormatTuple& tuple = GetFormatTuple(pixel_format, component_type);
 
     // Ensure no bad interactions with GL_PACK_ALIGNMENT
     ASSERT(stride * GetGLBytesPerPixel(pixel_format) % 4 == 0);
@@ -662,7 +647,7 @@ void CachedSurface::DownloadGLTexture(const MathUtil::Rectangle<u32>& rect, GLui
         state.draw.read_framebuffer = read_fb_handle;
         state.Apply();
 
-        if (type == SurfaceType::Color || type == SurfaceType::Texture) {
+        if (type == SurfaceType::ColorTexture) {
             glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
                                    texture.handle, 0);
             glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
@@ -685,7 +670,8 @@ void CachedSurface::DownloadGLTexture(const MathUtil::Rectangle<u32>& rect, GLui
     glPixelStorei(GL_PACK_ROW_LENGTH, 0);
 }
 
-enum MatchFlags {
+enum class MatchFlags {
+    None = 0,
     Invalid = 1,      // Flag that can be applied to other match types, invalid matches require
                       // validation before they can be used
     Exact = 1 << 1,   // Surfaces perfectly match
@@ -699,6 +685,10 @@ constexpr MatchFlags operator|(MatchFlags lhs, MatchFlags rhs) {
     return static_cast<MatchFlags>(static_cast<int>(lhs) | static_cast<int>(rhs));
 }
 
+constexpr MatchFlags operator&(MatchFlags lhs, MatchFlags rhs) {
+    return static_cast<MatchFlags>(static_cast<int>(lhs) & static_cast<int>(rhs));
+}
+
 /// Get the best surface match (and its match type) for the given flags
 template <MatchFlags find_flags>
 Surface FindMatch(const SurfaceCache& surface_cache, const SurfaceParams& params,
@@ -716,15 +706,15 @@ Surface FindMatch(const SurfaceCache& surface_cache, const SurfaceParams& params
                                          : (params.res_scale <= surface->res_scale);
             // validity will be checked in GetCopyableInterval
             bool is_valid =
-                find_flags & MatchFlags::Copy
+                (find_flags & MatchFlags::Copy) != MatchFlags::None
                     ? true
                     : surface->IsRegionValid(validate_interval.value_or(params.GetInterval()));
 
-            if (!(find_flags & MatchFlags::Invalid) && !is_valid)
+            if ((find_flags & MatchFlags::Invalid) == MatchFlags::None && !is_valid)
                 continue;
 
             auto IsMatch_Helper = [&](auto check_type, auto match_fn) {
-                if (!(find_flags & check_type))
+                if ((find_flags & check_type) == MatchFlags::None)
                     return;
 
                 bool matched;
@@ -818,7 +808,7 @@ void main() {
     color = texelFetch(tbo, tbo_offset).rabg;
 }
 )";
-    d24s8_abgr_shader.Create(vs_source, nullptr, fs_source);
+    d24s8_abgr_shader.CreateFromSource(vs_source, nullptr, fs_source);
 
     OpenGLState state = OpenGLState::GetCurState();
     GLuint old_program = state.draw.shader_program;
@@ -1041,9 +1031,25 @@ Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextu
     params.height = config.tic.Height();
     params.is_tiled = config.tic.IsTiled();
     params.pixel_format = SurfaceParams::PixelFormatFromTextureFormat(config.tic.format);
+
+    // TODO(Subv): Different types per component are not supported.
+    ASSERT(config.tic.r_type.Value() == config.tic.g_type.Value() &&
+           config.tic.r_type.Value() == config.tic.b_type.Value() &&
+           config.tic.r_type.Value() == config.tic.a_type.Value());
+
+    params.component_type = SurfaceParams::ComponentTypeFromTexture(config.tic.r_type.Value());
+
+    if (config.tic.IsTiled()) {
+        params.block_height = config.tic.BlockHeight();
+    } else {
+        // Use the texture-provided stride value if the texture isn't tiled.
+        params.stride = params.PixelsInBytes(config.tic.Pitch());
+    }
+
     params.UpdateParams();
 
-    if (config.tic.Width() % 8 != 0 || config.tic.Height() % 8 != 0) {
+    if (config.tic.Width() % 8 != 0 || config.tic.Height() % 8 != 0 ||
+        params.stride != params.width) {
         Surface src_surface;
         MathUtil::Rectangle<u32> rect;
         std::tie(src_surface, rect) = GetSurfaceSubRect(params, ScaleMatch::Ignore, true);
@@ -1083,10 +1089,10 @@ SurfaceSurfaceRect_Tuple RasterizerCacheOpenGL::GetFramebufferSurfaces(
     }
 
     MathUtil::Rectangle<u32> viewport_clamped{
-        static_cast<u32>(MathUtil::Clamp(viewport.left, 0, static_cast<s32>(config.width))),
-        static_cast<u32>(MathUtil::Clamp(viewport.top, 0, static_cast<s32>(config.height))),
-        static_cast<u32>(MathUtil::Clamp(viewport.right, 0, static_cast<s32>(config.width))),
-        static_cast<u32>(MathUtil::Clamp(viewport.bottom, 0, static_cast<s32>(config.height)))};
+        static_cast<u32>(std::clamp(viewport.left, 0, static_cast<s32>(config.width))),
+        static_cast<u32>(std::clamp(viewport.top, 0, static_cast<s32>(config.height))),
+        static_cast<u32>(std::clamp(viewport.right, 0, static_cast<s32>(config.width))),
+        static_cast<u32>(std::clamp(viewport.bottom, 0, static_cast<s32>(config.height)))};
 
     // get color and depth surfaces
     SurfaceParams color_params;
@@ -1094,10 +1100,13 @@ SurfaceSurfaceRect_Tuple RasterizerCacheOpenGL::GetFramebufferSurfaces(
     color_params.res_scale = resolution_scale_factor;
     color_params.width = config.width;
     color_params.height = config.height;
+    // TODO(Subv): Can framebuffers use a different block height?
+    color_params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight;
     SurfaceParams depth_params = color_params;
 
     color_params.addr = memory_manager->PhysicalToVirtualAddress(config.Address());
     color_params.pixel_format = SurfaceParams::PixelFormatFromRenderTargetFormat(config.format);
+    color_params.component_type = SurfaceParams::ComponentTypeFromRenderTarget(config.format);
     color_params.UpdateParams();
 
     ASSERT_MSG(!using_depth_fb, "depth buffer is unimplemented");
@@ -1293,7 +1302,6 @@ void RasterizerCacheOpenGL::InvalidateRegion(VAddr addr, u64 size, const Surface
     const SurfaceInterval invalid_interval(addr, addr + size);
 
     if (region_owner != nullptr) {
-        ASSERT(region_owner->type != SurfaceType::Texture);
         ASSERT(addr >= region_owner->addr && addr + size <= region_owner->end);
         // Surfaces can't have a gap
         ASSERT(region_owner->width == region_owner->stride);
@@ -1355,7 +1363,8 @@ Surface RasterizerCacheOpenGL::CreateSurface(const SurfaceParams& params) {
 
     surface->gl_buffer_size = 0;
     surface->invalid_regions.insert(surface->GetInterval());
-    AllocateSurfaceTexture(surface->texture.handle, GetFormatTuple(surface->pixel_format),
+    AllocateSurfaceTexture(surface->texture.handle,
+                           GetFormatTuple(surface->pixel_format, surface->component_type),
                            surface->GetScaledWidth(), surface->GetScaledHeight());
 
     return surface;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 06524fc59..6861efe16 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -52,27 +52,45 @@ enum class ScaleMatch {
 
 struct SurfaceParams {
     enum class PixelFormat {
-        RGBA8 = 0,
-        DXT1 = 1,
+        ABGR8 = 0,
+        B5G6R5 = 1,
+        DXT1 = 2,
+        DXT23 = 3,
+        DXT45 = 4,
+
+        Max,
         Invalid = 255,
     };
 
+    static constexpr size_t MaxPixelFormat = static_cast<size_t>(PixelFormat::Max);
+
+    enum class ComponentType {
+        Invalid = 0,
+        SNorm = 1,
+        UNorm = 2,
+        SInt = 3,
+        UInt = 4,
+        Float = 5,
+    };
+
     enum class SurfaceType {
-        Color = 0,
-        Texture = 1,
-        Depth = 2,
-        DepthStencil = 3,
-        Fill = 4,
-        Invalid = 5
+        ColorTexture = 0,
+        Depth = 1,
+        DepthStencil = 2,
+        Fill = 3,
+        Invalid = 4,
     };
 
     static constexpr unsigned int GetFormatBpp(PixelFormat format) {
         if (format == PixelFormat::Invalid)
             return 0;
 
-        constexpr std::array<unsigned int, 2> bpp_table = {
-            32, // RGBA8
-            64, // DXT1
+        constexpr std::array<unsigned int, MaxPixelFormat> bpp_table = {
+            32,  // ABGR8
+            16,  // B5G6R5
+            64,  // DXT1
+            128, // DXT23
+            128, // DXT45
         };
 
         ASSERT(static_cast<size_t>(format) < bpp_table.size());
@@ -85,8 +103,9 @@ struct SurfaceParams {
     static PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) {
         switch (format) {
         case Tegra::RenderTargetFormat::RGBA8_UNORM:
-            return PixelFormat::RGBA8;
+            return PixelFormat::ABGR8;
         default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
             UNREACHABLE();
         }
     }
@@ -94,8 +113,9 @@ struct SurfaceParams {
     static PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format) {
         switch (format) {
         case Tegra::FramebufferConfig::PixelFormat::ABGR8:
-            return PixelFormat::RGBA8;
+            return PixelFormat::ABGR8;
         default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
             UNREACHABLE();
         }
     }
@@ -104,10 +124,69 @@ struct SurfaceParams {
         // TODO(Subv): Properly implement this
         switch (format) {
         case Tegra::Texture::TextureFormat::A8R8G8B8:
-            return PixelFormat::RGBA8;
+            return PixelFormat::ABGR8;
+        case Tegra::Texture::TextureFormat::B5G6R5:
+            return PixelFormat::B5G6R5;
         case Tegra::Texture::TextureFormat::DXT1:
             return PixelFormat::DXT1;
+        case Tegra::Texture::TextureFormat::DXT23:
+            return PixelFormat::DXT23;
+        case Tegra::Texture::TextureFormat::DXT45:
+            return PixelFormat::DXT45;
+        default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
+            UNREACHABLE();
+        }
+    }
+
+    static Tegra::Texture::TextureFormat TextureFormatFromPixelFormat(PixelFormat format) {
+        // TODO(Subv): Properly implement this
+        switch (format) {
+        case PixelFormat::ABGR8:
+            return Tegra::Texture::TextureFormat::A8R8G8B8;
+        case PixelFormat::B5G6R5:
+            return Tegra::Texture::TextureFormat::B5G6R5;
+        case PixelFormat::DXT1:
+            return Tegra::Texture::TextureFormat::DXT1;
+        case PixelFormat::DXT23:
+            return Tegra::Texture::TextureFormat::DXT23;
+        case PixelFormat::DXT45:
+            return Tegra::Texture::TextureFormat::DXT45;
+        default:
+            UNREACHABLE();
+        }
+    }
+
+    static ComponentType ComponentTypeFromTexture(Tegra::Texture::ComponentType type) {
+        // TODO(Subv): Implement more component types
+        switch (type) {
+        case Tegra::Texture::ComponentType::UNORM:
+            return ComponentType::UNorm;
         default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented component type={}", static_cast<u32>(type));
+            UNREACHABLE();
+        }
+    }
+
+    static ComponentType ComponentTypeFromRenderTarget(Tegra::RenderTargetFormat format) {
+        // TODO(Subv): Implement more render targets
+        switch (format) {
+        case Tegra::RenderTargetFormat::RGBA8_UNORM:
+        case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
+            return ComponentType::UNorm;
+        default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
+            UNREACHABLE();
+        }
+    }
+
+    static ComponentType ComponentTypeFromGPUPixelFormat(
+        Tegra::FramebufferConfig::PixelFormat format) {
+        switch (format) {
+        case Tegra::FramebufferConfig::PixelFormat::ABGR8:
+            return ComponentType::UNorm;
+        default:
+            NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
             UNREACHABLE();
         }
     }
@@ -116,8 +195,7 @@ struct SurfaceParams {
         SurfaceType a_type = GetFormatType(pixel_format_a);
         SurfaceType b_type = GetFormatType(pixel_format_b);
 
-        if ((a_type == SurfaceType::Color || a_type == SurfaceType::Texture) &&
-            (b_type == SurfaceType::Color || b_type == SurfaceType::Texture)) {
+        if (a_type == SurfaceType::ColorTexture && b_type == SurfaceType::ColorTexture) {
             return true;
         }
 
@@ -133,12 +211,8 @@ struct SurfaceParams {
     }
 
     static SurfaceType GetFormatType(PixelFormat pixel_format) {
-        if ((unsigned int)pixel_format <= static_cast<unsigned int>(PixelFormat::RGBA8)) {
-            return SurfaceType::Color;
-        }
-
-        if ((unsigned int)pixel_format <= static_cast<unsigned int>(PixelFormat::DXT1)) {
-            return SurfaceType::Texture;
+        if (static_cast<size_t>(pixel_format) < MaxPixelFormat) {
+            return SurfaceType::ColorTexture;
         }
 
         // TODO(Subv): Implement the other formats
@@ -210,11 +284,13 @@ struct SurfaceParams {
     u32 width = 0;
     u32 height = 0;
     u32 stride = 0;
+    u32 block_height = 0;
     u16 res_scale = 1;
 
     bool is_tiled = false;
     PixelFormat pixel_format = PixelFormat::Invalid;
     SurfaceType type = SurfaceType::Invalid;
+    ComponentType component_type = ComponentType::Invalid;
 };
 
 struct CachedSurface : SurfaceParams {
@@ -334,7 +410,7 @@ private:
     OGLVertexArray attributeless_vao;
     OGLBuffer d24s8_abgr_buffer;
     GLsizeiptr d24s8_abgr_buffer_size;
-    OGLShader d24s8_abgr_shader;
+    OGLProgram d24s8_abgr_shader;
     GLint d24s8_abgr_tbo_size_u_id;
     GLint d24s8_abgr_viewport_u_id;
 };
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index 7da5e74d1..93f9172e7 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -13,14 +13,16 @@
 class OGLTexture : private NonCopyable {
 public:
     OGLTexture() = default;
-    OGLTexture(OGLTexture&& o) {
-        std::swap(handle, o.handle);
-    }
+
+    OGLTexture(OGLTexture&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
     ~OGLTexture() {
         Release();
     }
-    OGLTexture& operator=(OGLTexture&& o) {
-        std::swap(handle, o.handle);
+
+    OGLTexture& operator=(OGLTexture&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
         return *this;
     }
 
@@ -46,14 +48,16 @@ public:
 class OGLSampler : private NonCopyable {
 public:
     OGLSampler() = default;
-    OGLSampler(OGLSampler&& o) {
-        std::swap(handle, o.handle);
-    }
+
+    OGLSampler(OGLSampler&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
     ~OGLSampler() {
         Release();
     }
-    OGLSampler& operator=(OGLSampler&& o) {
-        std::swap(handle, o.handle);
+
+    OGLSampler& operator=(OGLSampler&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
         return *this;
     }
 
@@ -79,25 +83,71 @@ public:
 class OGLShader : private NonCopyable {
 public:
     OGLShader() = default;
-    OGLShader(OGLShader&& o) {
-        std::swap(handle, o.handle);
-    }
+
+    OGLShader(OGLShader&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
     ~OGLShader() {
         Release();
     }
-    OGLShader& operator=(OGLShader&& o) {
-        std::swap(handle, o.handle);
+
+    OGLShader& operator=(OGLShader&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
         return *this;
     }
 
-    /// Creates a new internal OpenGL resource and stores the handle
-    void Create(const char* vert_shader, const char* geo_shader, const char* frag_shader,
-                const std::vector<const char*>& feedback_vars = {},
-                bool separable_program = false) {
+    void Create(const char* source, GLenum type) {
         if (handle != 0)
             return;
-        handle = GLShader::LoadProgram(vert_shader, geo_shader, frag_shader, feedback_vars,
-                                       separable_program);
+        if (source == nullptr)
+            return;
+        handle = GLShader::LoadShader(source, type);
+    }
+
+    void Release() {
+        if (handle == 0)
+            return;
+        glDeleteShader(handle);
+        handle = 0;
+    }
+
+    GLuint handle = 0;
+};
+
+class OGLProgram : private NonCopyable {
+public:
+    OGLProgram() = default;
+
+    OGLProgram(OGLProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
+    ~OGLProgram() {
+        Release();
+    }
+
+    OGLProgram& operator=(OGLProgram&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
+        return *this;
+    }
+
+    template <typename... T>
+    void Create(bool separable_program, T... shaders) {
+        if (handle != 0)
+            return;
+        handle = GLShader::LoadProgram(separable_program, shaders...);
+    }
+
+    /// Creates a new internal OpenGL resource and stores the handle
+    void CreateFromSource(const char* vert_shader, const char* geo_shader, const char* frag_shader,
+                          bool separable_program = false) {
+        OGLShader vert, geo, frag;
+        if (vert_shader)
+            vert.Create(vert_shader, GL_VERTEX_SHADER);
+        if (geo_shader)
+            geo.Create(geo_shader, GL_GEOMETRY_SHADER);
+        if (frag_shader)
+            frag.Create(frag_shader, GL_FRAGMENT_SHADER);
+        Create(separable_program, vert.handle, geo.handle, frag.handle);
     }
 
     /// Deletes the internal OpenGL resource
@@ -115,13 +165,12 @@ public:
 class OGLPipeline : private NonCopyable {
 public:
     OGLPipeline() = default;
-    OGLPipeline(OGLPipeline&& o) {
-        handle = std::exchange<GLuint>(o.handle, 0);
-    }
+    OGLPipeline(OGLPipeline&& o) noexcept : handle{std::exchange<GLuint>(o.handle, 0)} {}
+
     ~OGLPipeline() {
         Release();
     }
-    OGLPipeline& operator=(OGLPipeline&& o) {
+    OGLPipeline& operator=(OGLPipeline&& o) noexcept {
         handle = std::exchange<GLuint>(o.handle, 0);
         return *this;
     }
@@ -148,14 +197,16 @@ public:
 class OGLBuffer : private NonCopyable {
 public:
     OGLBuffer() = default;
-    OGLBuffer(OGLBuffer&& o) {
-        std::swap(handle, o.handle);
-    }
+
+    OGLBuffer(OGLBuffer&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
     ~OGLBuffer() {
         Release();
     }
-    OGLBuffer& operator=(OGLBuffer&& o) {
-        std::swap(handle, o.handle);
+
+    OGLBuffer& operator=(OGLBuffer&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
         return *this;
     }
 
@@ -182,12 +233,12 @@ class OGLSync : private NonCopyable {
 public:
     OGLSync() = default;
 
-    OGLSync(OGLSync&& o) : handle(std::exchange(o.handle, nullptr)) {}
+    OGLSync(OGLSync&& o) noexcept : handle(std::exchange(o.handle, nullptr)) {}
 
     ~OGLSync() {
         Release();
     }
-    OGLSync& operator=(OGLSync&& o) {
+    OGLSync& operator=(OGLSync&& o) noexcept {
         Release();
         handle = std::exchange(o.handle, nullptr);
         return *this;
@@ -214,14 +265,16 @@ public:
 class OGLVertexArray : private NonCopyable {
 public:
     OGLVertexArray() = default;
-    OGLVertexArray(OGLVertexArray&& o) {
-        std::swap(handle, o.handle);
-    }
+
+    OGLVertexArray(OGLVertexArray&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
     ~OGLVertexArray() {
         Release();
     }
-    OGLVertexArray& operator=(OGLVertexArray&& o) {
-        std::swap(handle, o.handle);
+
+    OGLVertexArray& operator=(OGLVertexArray&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
         return *this;
     }
 
@@ -247,14 +300,16 @@ public:
 class OGLFramebuffer : private NonCopyable {
 public:
     OGLFramebuffer() = default;
-    OGLFramebuffer(OGLFramebuffer&& o) {
-        std::swap(handle, o.handle);
-    }
+
+    OGLFramebuffer(OGLFramebuffer&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
     ~OGLFramebuffer() {
         Release();
     }
-    OGLFramebuffer& operator=(OGLFramebuffer&& o) {
-        std::swap(handle, o.handle);
+
+    OGLFramebuffer& operator=(OGLFramebuffer&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
         return *this;
     }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 564ea8f9e..086424395 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -2,57 +2,778 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <map>
+#include <set>
 #include <string>
-#include <queue>
+#include <string_view>
 #include "common/assert.h"
 #include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 
-namespace Maxwell3D {
-namespace Shader {
+namespace GLShader {
 namespace Decompiler {
 
+using Tegra::Shader::Attribute;
+using Tegra::Shader::Instruction;
+using Tegra::Shader::OpCode;
+using Tegra::Shader::Register;
+using Tegra::Shader::Sampler;
+using Tegra::Shader::SubOp;
+using Tegra::Shader::Uniform;
+
 constexpr u32 PROGRAM_END = MAX_PROGRAM_CODE_LENGTH;
 
-class Impl {
+class DecompileFail : public std::runtime_error {
+public:
+    using std::runtime_error::runtime_error;
+};
+
+/// Describes how the code path between a given entry point and a return point exits.
+enum class ExitMethod {
+    Undetermined, ///< Internal value. Only occurs while analyzing a JMP loop.
+    AlwaysReturn, ///< All code paths reach the return point.
+    Conditional,  ///< Code path reaches the return point or an END instruction conditionally.
+    AlwaysEnd,    ///< All code paths reach an END instruction.
+};
+
+/// A subroutine is a range of code referenced by a CALL, IF or LOOP instruction.
+struct Subroutine {
+    /// Generates a name suitable for GLSL source code.
+    std::string GetName() const {
+        return "sub_" + std::to_string(begin) + "_" + std::to_string(end);
+    }
+
+    u32 begin;              ///< Entry point of the subroutine.
+    u32 end;                ///< Return point of the subroutine.
+    ExitMethod exit_method; ///< Exit method of the subroutine.
+    std::set<u32> labels;   ///< Addresses referenced by JMP instructions.
+
+    bool operator<(const Subroutine& rhs) const {
+        return std::tie(begin, end) < std::tie(rhs.begin, rhs.end);
+    }
+};
+
+/// Analyzes shader code and produces a set of subroutines.
+class ControlFlowAnalyzer {
 public:
-    Impl(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>& program_code,
-         const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>& swizzle_data, u32 main_offset,
-         const std::function<std::string(u32)>& inputreg_getter,
-         const std::function<std::string(u32)>& outputreg_getter, bool sanitize_mul,
-         const std::string& emit_cb, const std::string& setemit_cb)
-        : program_code(program_code), swizzle_data(swizzle_data), main_offset(main_offset),
-          inputreg_getter(inputreg_getter), outputreg_getter(outputreg_getter),
-          sanitize_mul(sanitize_mul), emit_cb(emit_cb), setemit_cb(setemit_cb) {}
+    ControlFlowAnalyzer(const ProgramCode& program_code, u32 main_offset)
+        : program_code(program_code) {
+
+        // Recursively finds all subroutines.
+        const Subroutine& program_main = AddSubroutine(main_offset, PROGRAM_END);
+        if (program_main.exit_method != ExitMethod::AlwaysEnd)
+            throw DecompileFail("Program does not always end");
+    }
 
-    std::string Decompile() {
-        UNREACHABLE();
-        return {};
+    std::set<Subroutine> GetSubroutines() {
+        return std::move(subroutines);
     }
 
 private:
-    const std::array<u32, MAX_PROGRAM_CODE_LENGTH>& program_code;
-    const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>& swizzle_data;
-    u32 main_offset;
-    const std::function<std::string(u32)>& inputreg_getter;
-    const std::function<std::string(u32)>& outputreg_getter;
-    bool sanitize_mul;
-    const std::string& emit_cb;
-    const std::string& setemit_cb;
+    const ProgramCode& program_code;
+    std::set<Subroutine> subroutines;
+    std::map<std::pair<u32, u32>, ExitMethod> exit_method_map;
+
+    /// Adds and analyzes a new subroutine if it is not added yet.
+    const Subroutine& AddSubroutine(u32 begin, u32 end) {
+        auto iter = subroutines.find(Subroutine{begin, end});
+        if (iter != subroutines.end())
+            return *iter;
+
+        Subroutine subroutine{begin, end};
+        subroutine.exit_method = Scan(begin, end, subroutine.labels);
+        if (subroutine.exit_method == ExitMethod::Undetermined)
+            throw DecompileFail("Recursive function detected");
+        return *subroutines.insert(std::move(subroutine)).first;
+    }
+
+    /// Scans a range of code for labels and determines the exit method.
+    ExitMethod Scan(u32 begin, u32 end, std::set<u32>& labels) {
+        auto [iter, inserted] =
+            exit_method_map.emplace(std::make_pair(begin, end), ExitMethod::Undetermined);
+        ExitMethod& exit_method = iter->second;
+        if (!inserted)
+            return exit_method;
+
+        for (u32 offset = begin; offset != end && offset != PROGRAM_END; ++offset) {
+            if (const auto opcode = OpCode::Decode({program_code[offset]})) {
+                switch (opcode->GetId()) {
+                case OpCode::Id::EXIT: {
+                    return exit_method = ExitMethod::AlwaysEnd;
+                }
+                }
+            }
+        }
+        return exit_method = ExitMethod::AlwaysReturn;
+    }
+};
+
+class ShaderWriter {
+public:
+    void AddLine(std::string_view text) {
+        DEBUG_ASSERT(scope >= 0);
+        if (!text.empty()) {
+            AppendIndentation();
+        }
+        shader_source += text;
+        AddNewLine();
+    }
+
+    void AddLine(char character) {
+        DEBUG_ASSERT(scope >= 0);
+        AppendIndentation();
+        shader_source += character;
+        AddNewLine();
+    }
+
+    void AddNewLine() {
+        DEBUG_ASSERT(scope >= 0);
+        shader_source += '\n';
+    }
+
+    std::string GetResult() {
+        return std::move(shader_source);
+    }
+
+    int scope = 0;
+
+private:
+    void AppendIndentation() {
+        shader_source.append(static_cast<size_t>(scope) * 4, ' ');
+    }
+
+    std::string shader_source;
 };
 
-std::string DecompileProgram(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>& program_code,
-                             const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>& swizzle_data,
-                             u32 main_offset,
-                             const std::function<std::string(u32)>& inputreg_getter,
-                             const std::function<std::string(u32)>& outputreg_getter,
-                             bool sanitize_mul, const std::string& emit_cb,
-                             const std::string& setemit_cb) {
-    Impl impl(program_code, swizzle_data, main_offset, inputreg_getter, outputreg_getter,
-              sanitize_mul, emit_cb, setemit_cb);
-    return impl.Decompile();
+class GLSLGenerator {
+public:
+    GLSLGenerator(const std::set<Subroutine>& subroutines, const ProgramCode& program_code,
+                  u32 main_offset, Maxwell3D::Regs::ShaderStage stage)
+        : subroutines(subroutines), program_code(program_code), main_offset(main_offset),
+          stage(stage) {
+
+        Generate();
+    }
+
+    std::string GetShaderCode() {
+        return declarations.GetResult() + shader.GetResult();
+    }
+
+    /// Returns entries in the shader that are useful for external functions
+    ShaderEntries GetEntries() const {
+        return {GetConstBuffersDeclarations()};
+    }
+
+private:
+    /// Gets the Subroutine object corresponding to the specified address.
+    const Subroutine& GetSubroutine(u32 begin, u32 end) const {
+        auto iter = subroutines.find(Subroutine{begin, end});
+        ASSERT(iter != subroutines.end());
+        return *iter;
+    }
+
+    /// Generates code representing an input attribute register.
+    std::string GetInputAttribute(Attribute::Index attribute) {
+        switch (attribute) {
+        case Attribute::Index::Position:
+            return "position";
+        default:
+            const u32 index{static_cast<u32>(attribute) -
+                            static_cast<u32>(Attribute::Index::Attribute_0)};
+            if (attribute >= Attribute::Index::Attribute_0) {
+                declr_input_attribute.insert(attribute);
+                return "input_attribute_" + std::to_string(index);
+            }
+
+            NGLOG_CRITICAL(HW_GPU, "Unhandled input attribute: {}", index);
+            UNREACHABLE();
+        }
+    }
+
+    /// Generates code representing an output attribute register.
+    std::string GetOutputAttribute(Attribute::Index attribute) {
+        switch (attribute) {
+        case Attribute::Index::Position:
+            return "position";
+        default:
+            const u32 index{static_cast<u32>(attribute) -
+                            static_cast<u32>(Attribute::Index::Attribute_0)};
+            if (attribute >= Attribute::Index::Attribute_0) {
+                declr_output_attribute.insert(attribute);
+                return "output_attribute_" + std::to_string(index);
+            }
+
+            NGLOG_CRITICAL(HW_GPU, "Unhandled output attribute: {}", index);
+            UNREACHABLE();
+        }
+    }
+
+    /// Generates code representing a 19-bit immediate value
+    static std::string GetImmediate19(const Instruction& instr) {
+        return std::to_string(instr.alu.GetImm20_19());
+    }
+
+    /// Generates code representing a 32-bit immediate value
+    static std::string GetImmediate32(const Instruction& instr) {
+        return std::to_string(instr.alu.GetImm20_32());
+    }
+
+    /// Generates code representing a temporary (GPR) register.
+    std::string GetRegister(const Register& reg, unsigned elem = 0) {
+        if (reg == Register::ZeroIndex)
+            return "0";
+        if (stage == Maxwell3D::Regs::ShaderStage::Fragment && reg < 4) {
+            // GPRs 0-3 are output color for the fragment shader
+            return std::string{"color."} + "rgba"[(reg + elem) & 3];
+        }
+
+        return *declr_register.insert("register_" + std::to_string(reg + elem)).first;
+    }
+
+    /// Generates code representing a uniform (C buffer) register.
+    std::string GetUniform(const Uniform& reg) {
+        declr_const_buffers[reg.index].MarkAsUsed(static_cast<unsigned>(reg.index),
+                                                  static_cast<unsigned>(reg.offset), stage);
+        return 'c' + std::to_string(reg.index) + '[' + std::to_string(reg.offset) + ']';
+    }
+
+    /// Generates code representing a texture sampler.
+    std::string GetSampler(const Sampler& sampler) const {
+        // TODO(Subv): Support more than just texture sampler 0
+        ASSERT_MSG(sampler.index == Sampler::Index::Sampler_0, "unsupported");
+        const unsigned index{static_cast<unsigned>(sampler.index.Value()) -
+                             static_cast<unsigned>(Sampler::Index::Sampler_0)};
+        return "tex[" + std::to_string(index) + "]";
+    }
+
+    /**
+     * Adds code that calls a subroutine.
+     * @param subroutine the subroutine to call.
+     */
+    void CallSubroutine(const Subroutine& subroutine) {
+        if (subroutine.exit_method == ExitMethod::AlwaysEnd) {
+            shader.AddLine(subroutine.GetName() + "();");
+            shader.AddLine("return true;");
+        } else if (subroutine.exit_method == ExitMethod::Conditional) {
+            shader.AddLine("if (" + subroutine.GetName() + "()) { return true; }");
+        } else {
+            shader.AddLine(subroutine.GetName() + "();");
+        }
+    }
+
+    /**
+     * Writes code that does an assignment operation.
+     * @param reg the destination register code.
+     * @param value the code representing the value to assign.
+     */
+    void SetDest(u64 elem, const std::string& reg, const std::string& value,
+                 u64 dest_num_components, u64 value_num_components, bool is_abs = false) {
+        std::string swizzle = ".";
+        swizzle += "xyzw"[elem];
+
+        std::string dest = reg + (dest_num_components != 1 ? swizzle : "");
+        std::string src = "(" + value + ")" + (value_num_components != 1 ? swizzle : "");
+        src = is_abs ? "abs(" + src + ")" : src;
+
+        shader.AddLine(dest + " = " + src + ";");
+    }
+
+    /**
+     * Writes code that assigns a predicate boolean variable.
+     * @param pred The id of the predicate to write to.
+     * @param value The expression value to assign to the predicate.
+     */
+    void SetPredicate(u64 pred, const std::string& value) {
+        using Tegra::Shader::Pred;
+        // Can't assign to the constant predicate.
+        ASSERT(pred != static_cast<u64>(Pred::UnusedIndex));
+
+        std::string variable = 'p' + std::to_string(pred);
+        shader.AddLine(variable + " = " + value + ';');
+        declr_predicates.insert(std::move(variable));
+    }
+
+    /**
+     * Returns the condition to use in the 'if' for a predicated instruction.
+     * @param instr Instruction to generate the if condition for.
+     * @returns string containing the predicate condition.
+     */
+    std::string GetPredicateCondition(Instruction instr) const {
+        using Tegra::Shader::Pred;
+        ASSERT(instr.pred.pred_index != static_cast<u64>(Pred::UnusedIndex));
+
+        std::string variable =
+            'p' + std::to_string(static_cast<u64>(instr.pred.pred_index.Value()));
+
+        if (instr.negate_pred) {
+            return "!(" + variable + ')';
+        }
+
+        return variable;
+    }
+
+    /**
+     * Returns whether the instruction at the specified offset is a 'sched' instruction.
+     * Sched instructions always appear before a sequence of 3 instructions.
+     */
+    bool IsSchedInstruction(u32 offset) const {
+        // sched instructions appear once every 4 instructions.
+        static constexpr size_t SchedPeriod = 4;
+        u32 absolute_offset = offset - main_offset;
+
+        return (absolute_offset % SchedPeriod) == 0;
+    }
+
+    /**
+     * Compiles a single instruction from Tegra to GLSL.
+     * @param offset the offset of the Tegra shader instruction.
+     * @return the offset of the next instruction to execute. Usually it is the current offset
+     * + 1. If the current instruction always terminates the program, returns PROGRAM_END.
+     */
+    u32 CompileInstr(u32 offset) {
+        // Ignore sched instructions when generating code.
+        if (IsSchedInstruction(offset)) {
+            return offset + 1;
+        }
+
+        const Instruction instr = {program_code[offset]};
+        const auto opcode = OpCode::Decode(instr);
+
+        // Decoding failure
+        if (!opcode) {
+            NGLOG_CRITICAL(HW_GPU, "Unhandled instruction: {0:x}", instr.value);
+            UNREACHABLE();
+        }
+
+        shader.AddLine("// " + std::to_string(offset) + ": " + opcode->GetName());
+
+        using Tegra::Shader::Pred;
+        ASSERT_MSG(instr.pred.full_pred != Pred::NeverExecute,
+                   "NeverExecute predicate not implemented");
+
+        if (instr.pred.pred_index != static_cast<u64>(Pred::UnusedIndex)) {
+            shader.AddLine("if (" + GetPredicateCondition(instr) + ')');
+            shader.AddLine('{');
+            ++shader.scope;
+        }
+
+        switch (opcode->GetType()) {
+        case OpCode::Type::Arithmetic: {
+            std::string dest = GetRegister(instr.gpr0);
+            std::string op_a = instr.alu.negate_a ? "-" : "";
+            op_a += GetRegister(instr.gpr8);
+            if (instr.alu.abs_a) {
+                op_a = "abs(" + op_a + ")";
+            }
+
+            std::string op_b = instr.alu.negate_b ? "-" : "";
+
+            if (instr.is_b_imm) {
+                op_b += GetImmediate19(instr);
+            } else {
+                if (instr.is_b_gpr) {
+                    op_b += GetRegister(instr.gpr20);
+                } else {
+                    op_b += GetUniform(instr.uniform);
+                }
+            }
+
+            if (instr.alu.abs_b) {
+                op_b = "abs(" + op_b + ")";
+            }
+
+            switch (opcode->GetId()) {
+            case OpCode::Id::FMUL_C:
+            case OpCode::Id::FMUL_R:
+            case OpCode::Id::FMUL_IMM: {
+                SetDest(0, dest, op_a + " * " + op_b, 1, 1, instr.alu.abs_d);
+                break;
+            }
+            case OpCode::Id::FMUL32_IMM: {
+                // fmul32i doesn't have abs or neg bits.
+                SetDest(0, dest, GetRegister(instr.gpr8) + " * " + GetImmediate32(instr), 1, 1);
+                break;
+            }
+            case OpCode::Id::FADD_C:
+            case OpCode::Id::FADD_R:
+            case OpCode::Id::FADD_IMM: {
+                SetDest(0, dest, op_a + " + " + op_b, 1, 1, instr.alu.abs_d);
+                break;
+            }
+            case OpCode::Id::MUFU: {
+                switch (instr.sub_op) {
+                case SubOp::Cos:
+                    SetDest(0, dest, "cos(" + op_a + ")", 1, 1, instr.alu.abs_d);
+                    break;
+                case SubOp::Sin:
+                    SetDest(0, dest, "sin(" + op_a + ")", 1, 1, instr.alu.abs_d);
+                    break;
+                case SubOp::Ex2:
+                    SetDest(0, dest, "exp2(" + op_a + ")", 1, 1, instr.alu.abs_d);
+                    break;
+                case SubOp::Lg2:
+                    SetDest(0, dest, "log2(" + op_a + ")", 1, 1, instr.alu.abs_d);
+                    break;
+                case SubOp::Rcp:
+                    SetDest(0, dest, "1.0 / " + op_a, 1, 1, instr.alu.abs_d);
+                    break;
+                case SubOp::Rsq:
+                    SetDest(0, dest, "inversesqrt(" + op_a + ")", 1, 1, instr.alu.abs_d);
+                    break;
+                case SubOp::Min:
+                    SetDest(0, dest, "min(" + op_a + "," + op_b + ")", 1, 1, instr.alu.abs_d);
+                    break;
+                default:
+                    NGLOG_CRITICAL(HW_GPU, "Unhandled MUFU sub op: {0:x}",
+                                   static_cast<unsigned>(instr.sub_op.Value()));
+                    UNREACHABLE();
+                }
+                break;
+            }
+            case OpCode::Id::RRO: {
+                NGLOG_DEBUG(HW_GPU, "Skipping RRO instruction");
+                break;
+            }
+            default: {
+                NGLOG_CRITICAL(HW_GPU, "Unhandled arithmetic instruction: {}", opcode->GetName());
+                UNREACHABLE();
+            }
+            }
+            break;
+        }
+        case OpCode::Type::Ffma: {
+            std::string dest = GetRegister(instr.gpr0);
+            std::string op_a = GetRegister(instr.gpr8);
+            std::string op_b = instr.ffma.negate_b ? "-" : "";
+            std::string op_c = instr.ffma.negate_c ? "-" : "";
+
+            switch (opcode->GetId()) {
+            case OpCode::Id::FFMA_CR: {
+                op_b += GetUniform(instr.uniform);
+                op_c += GetRegister(instr.gpr39);
+                break;
+            }
+            case OpCode::Id::FFMA_RR: {
+                op_b += GetRegister(instr.gpr20);
+                op_c += GetRegister(instr.gpr39);
+                break;
+            }
+            case OpCode::Id::FFMA_RC: {
+                op_b += GetRegister(instr.gpr39);
+                op_c += GetUniform(instr.uniform);
+                break;
+            }
+            case OpCode::Id::FFMA_IMM: {
+                op_b += GetImmediate19(instr);
+                op_c += GetRegister(instr.gpr39);
+                break;
+            }
+            default: {
+                NGLOG_CRITICAL(HW_GPU, "Unhandled FFMA instruction: {}", opcode->GetName());
+                UNREACHABLE();
+            }
+            }
+
+            SetDest(0, dest, op_a + " * " + op_b + " + " + op_c, 1, 1);
+            break;
+        }
+        case OpCode::Type::Memory: {
+            std::string gpr0 = GetRegister(instr.gpr0);
+            const Attribute::Index attribute = instr.attribute.fmt20.index;
+
+            switch (opcode->GetId()) {
+            case OpCode::Id::LD_A: {
+                ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
+                SetDest(instr.attribute.fmt20.element, gpr0, GetInputAttribute(attribute), 1, 4);
+                break;
+            }
+            case OpCode::Id::ST_A: {
+                ASSERT_MSG(instr.attribute.fmt20.size == 0, "untested");
+                SetDest(instr.attribute.fmt20.element, GetOutputAttribute(attribute), gpr0, 4, 1);
+                break;
+            }
+            case OpCode::Id::TEXS: {
+                ASSERT_MSG(instr.attribute.fmt20.size == 4, "untested");
+                const std::string op_a = GetRegister(instr.gpr8);
+                const std::string op_b = GetRegister(instr.gpr20);
+                const std::string sampler = GetSampler(instr.sampler);
+                const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
+                // Add an extra scope and declare the texture coords inside to prevent overwriting
+                // them in case they are used as outputs of the texs instruction.
+                shader.AddLine("{");
+                ++shader.scope;
+                shader.AddLine(coord);
+                const std::string texture = "texture(" + sampler + ", coords)";
+                for (unsigned elem = 0; elem < instr.attribute.fmt20.size; ++elem) {
+                    SetDest(elem, GetRegister(instr.gpr0, elem), texture, 1, 4);
+                }
+                --shader.scope;
+                shader.AddLine("}");
+                break;
+            }
+            default: {
+                NGLOG_CRITICAL(HW_GPU, "Unhandled memory instruction: {}", opcode->GetName());
+                UNREACHABLE();
+            }
+            }
+            break;
+        }
+        case OpCode::Type::FloatPredicate: {
+            std::string op_a = instr.fsetp.neg_a ? "-" : "";
+            op_a += GetRegister(instr.gpr8);
+
+            if (instr.fsetp.abs_a) {
+                op_a = "abs(" + op_a + ')';
+            }
+
+            std::string op_b{};
+
+            if (instr.is_b_imm) {
+                if (instr.fsetp.neg_b) {
+                    // Only the immediate version of fsetp has a neg_b bit.
+                    op_b += '-';
+                }
+                op_b += '(' + GetImmediate19(instr) + ')';
+            } else {
+                if (instr.is_b_gpr) {
+                    op_b += GetRegister(instr.gpr20);
+                } else {
+                    op_b += GetUniform(instr.uniform);
+                }
+            }
+
+            if (instr.fsetp.abs_b) {
+                op_b = "abs(" + op_b + ')';
+            }
+
+            using Tegra::Shader::Pred;
+            ASSERT_MSG(instr.fsetp.pred0 == static_cast<u64>(Pred::UnusedIndex) &&
+                           instr.fsetp.pred39 == static_cast<u64>(Pred::UnusedIndex),
+                       "Compound predicates are not implemented");
+
+            // We can't use the constant predicate as destination.
+            ASSERT(instr.fsetp.pred3 != static_cast<u64>(Pred::UnusedIndex));
+
+            using Tegra::Shader::PredCondition;
+            switch (instr.fsetp.cond) {
+            case PredCondition::LessThan:
+                SetPredicate(instr.fsetp.pred3, '(' + op_a + ") < (" + op_b + ')');
+                break;
+            case PredCondition::Equal:
+                SetPredicate(instr.fsetp.pred3, '(' + op_a + ") == (" + op_b + ')');
+                break;
+            default:
+                NGLOG_CRITICAL(HW_GPU, "Unhandled predicate condition: {} (a: {}, b: {})",
+                               static_cast<unsigned>(instr.fsetp.cond.Value()), op_a, op_b);
+                UNREACHABLE();
+            }
+            break;
+        }
+        default: {
+            switch (opcode->GetId()) {
+            case OpCode::Id::EXIT: {
+                ASSERT_MSG(instr.pred.pred_index == static_cast<u64>(Pred::UnusedIndex),
+                           "Predicated exits not implemented");
+                shader.AddLine("return true;");
+                offset = PROGRAM_END - 1;
+                break;
+            }
+            case OpCode::Id::KIL: {
+                shader.AddLine("discard;");
+                break;
+            }
+            case OpCode::Id::IPA: {
+                const auto& attribute = instr.attribute.fmt28;
+                std::string dest = GetRegister(instr.gpr0);
+                SetDest(attribute.element, dest, GetInputAttribute(attribute.index), 1, 4);
+                break;
+            }
+            default: {
+                NGLOG_CRITICAL(HW_GPU, "Unhandled instruction: {}", opcode->GetName());
+                UNREACHABLE();
+            }
+            }
+
+            break;
+        }
+        }
+
+        // Close the predicate condition scope.
+        if (instr.pred.pred_index != static_cast<u64>(Pred::UnusedIndex)) {
+            --shader.scope;
+            shader.AddLine('}');
+        }
+
+        return offset + 1;
+    }
+
+    /**
+     * Compiles a range of instructions from Tegra to GLSL.
+     * @param begin the offset of the starting instruction.
+     * @param end the offset where the compilation should stop (exclusive).
+     * @return the offset of the next instruction to compile. PROGRAM_END if the program
+     * terminates.
+     */
+    u32 CompileRange(u32 begin, u32 end) {
+        u32 program_counter;
+        for (program_counter = begin; program_counter < (begin > end ? PROGRAM_END : end);) {
+            program_counter = CompileInstr(program_counter);
+        }
+        return program_counter;
+    }
+
+    void Generate() {
+        // Add declarations for all subroutines
+        for (const auto& subroutine : subroutines) {
+            shader.AddLine("bool " + subroutine.GetName() + "();");
+        }
+        shader.AddNewLine();
+
+        // Add the main entry point
+        shader.AddLine("bool exec_shader() {");
+        ++shader.scope;
+        CallSubroutine(GetSubroutine(main_offset, PROGRAM_END));
+        --shader.scope;
+        shader.AddLine("}\n");
+
+        // Add definitions for all subroutines
+        for (const auto& subroutine : subroutines) {
+            std::set<u32> labels = subroutine.labels;
+
+            shader.AddLine("bool " + subroutine.GetName() + "() {");
+            ++shader.scope;
+
+            if (labels.empty()) {
+                if (CompileRange(subroutine.begin, subroutine.end) != PROGRAM_END) {
+                    shader.AddLine("return false;");
+                }
+            } else {
+                labels.insert(subroutine.begin);
+                shader.AddLine("uint jmp_to = " + std::to_string(subroutine.begin) + "u;");
+                shader.AddLine("while (true) {");
+                ++shader.scope;
+
+                shader.AddLine("switch (jmp_to) {");
+
+                for (auto label : labels) {
+                    shader.AddLine("case " + std::to_string(label) + "u: {");
+                    ++shader.scope;
+
+                    auto next_it = labels.lower_bound(label + 1);
+                    u32 next_label = next_it == labels.end() ? subroutine.end : *next_it;
+
+                    u32 compile_end = CompileRange(label, next_label);
+                    if (compile_end > next_label && compile_end != PROGRAM_END) {
+                        // This happens only when there is a label inside an IF/LOOP block
+                        shader.AddLine("{ jmp_to = " + std::to_string(compile_end) + "u; break; }");
+                        labels.emplace(compile_end);
+                    }
+
+                    --shader.scope;
+                    shader.AddLine('}');
+                }
+
+                shader.AddLine("default: return false;");
+                shader.AddLine('}');
+
+                --shader.scope;
+                shader.AddLine('}');
+
+                shader.AddLine("return false;");
+            }
+
+            --shader.scope;
+            shader.AddLine("}\n");
+
+            DEBUG_ASSERT(shader.scope == 0);
+        }
+
+        GenerateDeclarations();
+    }
+
+    /// Returns the list of constant buffer entries that are in use (IsUsed() is true)
+    std::vector<ConstBufferEntry> GetConstBuffersDeclarations() const {
+        std::vector<ConstBufferEntry> result;
+        std::copy_if(declr_const_buffers.begin(), declr_const_buffers.end(),
+                     std::back_inserter(result), [](const auto& entry) { return entry.IsUsed(); });
+        return result;
+    }
+
+    /// Emits declarations for registers, in/out attributes, constant buffers and predicates
+    void GenerateDeclarations() {
+        for (const auto& reg : declr_register) {
+            declarations.AddLine("float " + reg + " = 0.0;");
+        }
+        declarations.AddNewLine();
+
+        for (const auto& index : declr_input_attribute) {
+            // TODO(bunnei): Use proper number of elements for these
+            declarations.AddLine("layout(location = " +
+                                 std::to_string(static_cast<u32>(index) -
+                                                static_cast<u32>(Attribute::Index::Attribute_0)) +
+                                 ") in vec4 " + GetInputAttribute(index) + ";");
+        }
+        declarations.AddNewLine();
+
+        for (const auto& index : declr_output_attribute) {
+            // TODO(bunnei): Use proper number of elements for these
+            declarations.AddLine("layout(location = " +
+                                 std::to_string(static_cast<u32>(index) -
+                                                static_cast<u32>(Attribute::Index::Attribute_0)) +
+                                 ") out vec4 " + GetOutputAttribute(index) + ";");
+        }
+        declarations.AddNewLine();
+
+        unsigned const_buffer_layout = 0;
+        for (const auto& entry : GetConstBuffersDeclarations()) {
+            declarations.AddLine("layout(std430) buffer " + entry.GetName());
+            declarations.AddLine('{');
+            declarations.AddLine("    float c" + std::to_string(entry.GetIndex()) + "[];");
+            declarations.AddLine("};");
+            declarations.AddNewLine();
+            ++const_buffer_layout;
+        }
+
+        declarations.AddNewLine();
+        for (const auto& pred : declr_predicates) {
+            declarations.AddLine("bool " + pred + " = false;");
+        }
+        declarations.AddNewLine();
+    }
+
+private:
+    const std::set<Subroutine>& subroutines;
+    const ProgramCode& program_code;
+    const u32 main_offset;
+    Maxwell3D::Regs::ShaderStage stage;
+
+    ShaderWriter shader;
+    ShaderWriter declarations;
+
+    // Declarations
+    std::set<std::string> declr_register;
+    std::set<std::string> declr_predicates;
+    std::set<Attribute::Index> declr_input_attribute;
+    std::set<Attribute::Index> declr_output_attribute;
+    std::array<ConstBufferEntry, Maxwell3D::Regs::MaxConstBuffers> declr_const_buffers;
+}; // class GLSLGenerator
+
+std::string GetCommonDeclarations() {
+    return "bool exec_shader();";
+}
+
+boost::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code, u32 main_offset,
+                                                Maxwell3D::Regs::ShaderStage stage) {
+    try {
+        auto subroutines = ControlFlowAnalyzer(program_code, main_offset).GetSubroutines();
+        GLSLGenerator generator(subroutines, program_code, main_offset, stage);
+        return ProgramResult{generator.GetShaderCode(), generator.GetEntries()};
+    } catch (const DecompileFail& exception) {
+        NGLOG_ERROR(HW_GPU, "Shader decompilation failed: {}", exception.what());
+    }
+    return boost::none;
 }
 
 } // namespace Decompiler
-} // namespace Shader
-} // namespace Maxwell3D
+} // namespace GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index 02ebfcbe8..382c76b7a 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -2,26 +2,25 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#pragma once
+
 #include <array>
 #include <functional>
 #include <string>
+#include <boost/optional.hpp>
 #include "common/common_types.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_opengl/gl_shader_gen.h"
 
-namespace Maxwell3D {
-namespace Shader {
+namespace GLShader {
 namespace Decompiler {
 
-constexpr size_t MAX_PROGRAM_CODE_LENGTH{0x100000};
-constexpr size_t MAX_SWIZZLE_DATA_LENGTH{0x100000};
+using Tegra::Engines::Maxwell3D;
+
+std::string GetCommonDeclarations();
 
-std::string DecompileProgram(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>& program_code,
-                             const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>& swizzle_data,
-                             u32 main_offset,
-                             const std::function<std::string(u32)>& inputreg_getter,
-                             const std::function<std::string(u32)>& outputreg_getter,
-                             bool sanitize_mul, const std::string& emit_cb = "",
-                             const std::string& setemit_cb = "");
+boost::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code, u32 main_offset,
+                                                Maxwell3D::Regs::ShaderStage stage);
 
 } // namespace Decompiler
-} // namespace Shader
-} // namespace Maxwell3D
+} // namespace GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 8f3c98800..254f6e2c3 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -3,18 +3,74 @@
 // Refer to the license.txt file included.
 
 #include "common/assert.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"
 
 namespace GLShader {
 
-std::string GenerateVertexShader(const MaxwellVSConfig& config) {
-    UNREACHABLE();
-    return {};
+using Tegra::Engines::Maxwell3D;
+
+static constexpr u32 PROGRAM_OFFSET{10};
+
+ProgramResult GenerateVertexShader(const ShaderSetup& setup, const MaxwellVSConfig& config) {
+    std::string out = "#version 430 core\n";
+    out += "#extension GL_ARB_separate_shader_objects : enable\n\n";
+    out += Decompiler::GetCommonDeclarations();
+
+    ProgramResult program = Decompiler::DecompileProgram(setup.program_code, PROGRAM_OFFSET,
+                                                         Maxwell3D::Regs::ShaderStage::Vertex)
+                                .get_value_or({});
+    out += R"(
+
+out gl_PerVertex {
+    vec4 gl_Position;
+};
+
+out vec4 position;
+
+layout (std140) uniform vs_config {
+    vec4 viewport_flip;
+};
+
+void main() {
+    exec_shader();
+
+    // Viewport can be flipped, which is unsupported by glViewport
+    position.xy *= viewport_flip.xy;
+    gl_Position = position;
+}
+)";
+    out += program.first;
+    return {out, program.second};
+}
+
+ProgramResult GenerateFragmentShader(const ShaderSetup& setup, const MaxwellFSConfig& config) {
+    std::string out = "#version 430 core\n";
+    out += "#extension GL_ARB_separate_shader_objects : enable\n\n";
+    out += Decompiler::GetCommonDeclarations();
+
+    ProgramResult program = Decompiler::DecompileProgram(setup.program_code, PROGRAM_OFFSET,
+                                                         Maxwell3D::Regs::ShaderStage::Fragment)
+                                .get_value_or({});
+    out += R"(
+
+in vec4 position;
+out vec4 color;
+
+layout (std140) uniform fs_config {
+    vec4 viewport_flip;
+};
+
+uniform sampler2D tex[32];
+
+void main() {
+    exec_shader();
 }
 
-std::string GenerateFragmentShader(const MaxwellFSConfig& config) {
-    UNREACHABLE();
-    return {};
+)";
+    out += program.first;
+    return {out, program.second};
 }
 
 } // namespace GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index 5101e7d30..458032b5c 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -4,46 +4,113 @@
 
 #pragma once
 
-#include <cstring>
+#include <array>
 #include <string>
 #include <type_traits>
+#include <utility>
+#include <vector>
+#include "common/common_types.h"
 #include "common/hash.h"
 
 namespace GLShader {
 
-enum Attributes {
-    ATTRIBUTE_POSITION,
-    ATTRIBUTE_COLOR,
-    ATTRIBUTE_TEXCOORD0,
-    ATTRIBUTE_TEXCOORD1,
-    ATTRIBUTE_TEXCOORD2,
-    ATTRIBUTE_TEXCOORD0_W,
-    ATTRIBUTE_NORMQUAT,
-    ATTRIBUTE_VIEW,
+constexpr size_t MAX_PROGRAM_CODE_LENGTH{0x1000};
+
+using ProgramCode = std::array<u64, MAX_PROGRAM_CODE_LENGTH>;
+
+class ConstBufferEntry {
+    using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+    // Records whether a const buffer was referenced and the largest offset seen for it.
+public:
+    void MarkAsUsed(unsigned index, unsigned offset, Maxwell::ShaderStage stage) {
+        is_used = true;
+        this->index = index;
+        this->stage = stage;
+        max_offset = std::max(max_offset, offset);
+    }
+    // True once MarkAsUsed() has been called at least once.
+    bool IsUsed() const {
+        return is_used;
+    }
+    // Index passed to the most recent MarkAsUsed() call.
+    unsigned GetIndex() const {
+        return index;
+    }
+    // Largest offset recorded so far plus one (1 for an entry that was never marked).
+    unsigned GetSize() const {
+        return max_offset + 1;
+    }
+    // Per-stage base name from BufferBaseNames followed by the decimal buffer index.
+    std::string GetName() const {
+        return BufferBaseNames[static_cast<size_t>(stage)] + std::to_string(index);
+    }
+
+private:
+    static constexpr std::array<const char*, Maxwell::MaxShaderStage> BufferBaseNames = {
+        "buffer_vs_c", "buffer_tessc_c", "buffer_tesse_c", "buffer_gs_c", "buffer_fs_c",
+    };
+
+    bool is_used{};
+    unsigned index{};
+    unsigned max_offset{};
+    Maxwell::ShaderStage stage{}; // value-initialized so GetName() never reads garbage
+};
 
-struct MaxwellShaderConfigCommon {
-    explicit MaxwellShaderConfigCommon(){};
+struct ShaderEntries {
+    std::vector<ConstBufferEntry> const_buffer_entries;
 };
 
-struct MaxwellVSConfig : MaxwellShaderConfigCommon {
-    explicit MaxwellVSConfig() : MaxwellShaderConfigCommon() {}
+using ProgramResult = std::pair<std::string, ShaderEntries>;
 
-    bool operator==(const MaxwellVSConfig& o) const {
-        return std::memcmp(this, &o, sizeof(MaxwellVSConfig)) == 0;
-    };
+struct ShaderSetup {
+    ShaderSetup(ProgramCode&& program_code) : program_code(std::move(program_code)) {}
+
+    ProgramCode program_code;
+    bool program_code_hash_dirty = true;
+
+    u64 GetProgramCodeHash() {
+        if (program_code_hash_dirty) {
+            program_code_hash = Common::ComputeHash64(&program_code, sizeof(program_code));
+            program_code_hash_dirty = false;
+        }
+        return program_code_hash;
+    }
+
+private:
+    u64 program_code_hash{};
 };
 
-struct MaxwellFSConfig : MaxwellShaderConfigCommon {
-    explicit MaxwellFSConfig() : MaxwellShaderConfigCommon() {}
+struct MaxwellShaderConfigCommon {
+    void Init(ShaderSetup& setup) {
+        program_hash = setup.GetProgramCodeHash();
+    }
 
-    bool operator==(const MaxwellFSConfig& o) const {
-        return std::memcmp(this, &o, sizeof(MaxwellFSConfig)) == 0;
-    };
+    u64 program_hash;
 };
 
-std::string GenerateVertexShader(const MaxwellVSConfig& config);
-std::string GenerateFragmentShader(const MaxwellFSConfig& config);
+struct MaxwellVSConfig : Common::HashableStruct<MaxwellShaderConfigCommon> {
+    explicit MaxwellVSConfig(ShaderSetup& setup) {
+        state.Init(setup);
+    }
+};
+
+struct MaxwellFSConfig : Common::HashableStruct<MaxwellShaderConfigCommon> {
+    explicit MaxwellFSConfig(ShaderSetup& setup) {
+        state.Init(setup);
+    }
+};
+
+/**
+ * Generates the GLSL vertex shader program source code for the given VS program
+ * @returns Pair of the GLSL source string and the ShaderEntries gathered for it
+ */
+ProgramResult GenerateVertexShader(const ShaderSetup& setup, const MaxwellVSConfig& config);
+
+/**
+ * Generates the GLSL fragment shader program source code for the given FS program
+ * @returns Pair of the GLSL source string and the ShaderEntries gathered for it
+ */
+ProgramResult GenerateFragmentShader(const ShaderSetup& setup, const MaxwellFSConfig& config);
 
 } // namespace GLShader
 
@@ -52,14 +119,14 @@ namespace std {
 template <>
 struct hash<GLShader::MaxwellVSConfig> {
     size_t operator()(const GLShader::MaxwellVSConfig& k) const {
-        return Common::ComputeHash64(&k, sizeof(GLShader::MaxwellVSConfig));
+        return k.Hash();
     }
 };
 
 template <>
 struct hash<GLShader::MaxwellFSConfig> {
     size_t operator()(const GLShader::MaxwellFSConfig& k) const {
-        return Common::ComputeHash64(&k, sizeof(GLShader::MaxwellFSConfig));
+        return k.Hash();
     }
 };
 
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
new file mode 100644
index 000000000..17b3925a0
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -0,0 +1,64 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/core.h"
+#include "core/hle/kernel/process.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_opengl/gl_shader_manager.h"
+
+namespace GLShader {
+
+namespace Impl {
+void SetShaderUniformBlockBinding(GLuint shader, const char* name,
+                                  Maxwell3D::Regs::ShaderStage binding, size_t expected_size) {
+    GLuint ub_index = glGetUniformBlockIndex(shader, name);
+    if (ub_index != GL_INVALID_INDEX) { // programs without a block of this name are untouched
+        GLint ub_size = 0;
+        glGetActiveUniformBlockiv(shader, ub_index, GL_UNIFORM_BLOCK_DATA_SIZE, &ub_size);
+        ASSERT_MSG(ub_size == expected_size,
+                   "Uniform block size did not match! Got %d, expected %zu",
+                   static_cast<int>(ub_size), expected_size);
+        glUniformBlockBinding(shader, ub_index, static_cast<GLuint>(binding)); // slot = stage
+    }
+}
+
+void SetShaderUniformBlockBindings(GLuint shader) {
+    SetShaderUniformBlockBinding(shader, "vs_config", Maxwell3D::Regs::ShaderStage::Vertex,
+                                 sizeof(MaxwellUniformData));
+    SetShaderUniformBlockBinding(shader, "gs_config", Maxwell3D::Regs::ShaderStage::Geometry,
+                                 sizeof(MaxwellUniformData));
+    SetShaderUniformBlockBinding(shader, "fs_config", Maxwell3D::Regs::ShaderStage::Fragment,
+                                 sizeof(MaxwellUniformData));
+}
+
+void SetShaderSamplerBindings(GLuint shader) {
+    OpenGLState cur_state = OpenGLState::GetCurState();
+    GLuint old_program = std::exchange(cur_state.draw.shader_program, shader);
+    cur_state.Apply();
+    // shader is selected in the applied state first because glUniform1i acts on the current program
+    // Set the texture samplers to correspond to different texture units
+    for (u32 texture = 0; texture < NumTextureSamplers; ++texture) {
+        // glGetUniformLocation yields -1 when "tex[i]" is not an active uniform; skip those
+        std::string uniform_name = "tex[" + std::to_string(texture) + "]";
+        GLint uniform_tex = glGetUniformLocation(shader, uniform_name.c_str());
+        if (uniform_tex != -1) {
+            glUniform1i(uniform_tex, TextureUnits::MaxwellTexture(texture).id);
+        }
+    }
+    // Put back the program that was recorded before this call
+    cur_state.draw.shader_program = old_program;
+    cur_state.Apply();
+}
+
+} // namespace Impl
+
+void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage) {
+    const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
+    // Only the sign of viewport 0's scale is stored; shader_stage is not read here.
+    // TODO(bunnei): Support more than one viewport
+    viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0f ? -1.0f : 1.0f;
+    viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0f ? -1.0f : 1.0f;
+}
+
+} // namespace GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
new file mode 100644
index 000000000..e963b4b7e
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -0,0 +1,175 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <tuple>
+#include <unordered_map>
+#include <boost/functional/hash.hpp>
+#include <glad/glad.h>
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/renderer_opengl/gl_shader_gen.h"
+#include "video_core/renderer_opengl/maxwell_to_gl.h"
+
+namespace GLShader {
+
+/// Number of OpenGL texture samplers that can be used in the fragment shader
+static constexpr size_t NumTextureSamplers = 32;
+
+using Tegra::Engines::Maxwell3D;
+
+namespace Impl {
+void SetShaderUniformBlockBindings(GLuint shader);
+void SetShaderSamplerBindings(GLuint shader);
+} // namespace Impl
+
+/// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
+// NOTE: Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
+//       the end of a uniform block is included in UNIFORM_BLOCK_DATA_SIZE or not.
+//       Not following that rule will cause problems on some AMD drivers.
+struct MaxwellUniformData {
+    void SetFromRegs(const Maxwell3D::State::ShaderStageInfo& shader_stage);
+    alignas(16) GLvec4 viewport_flip;
+};
+static_assert(sizeof(MaxwellUniformData) == 16, "MaxwellUniformData structure size is incorrect");
+static_assert(sizeof(MaxwellUniformData) < 16384,
+              "MaxwellUniformData structure must be less than 16kb as per the OpenGL spec");
+
+class OGLShaderStage {
+public:
+    OGLShaderStage() = default;
+    // Builds the program from program_result's source, applies bindings and stores its entries.
+    void Create(const ProgramResult& program_result, GLenum type) {
+        OGLShader shader;
+        shader.Create(program_result.first.c_str(), type);
+        program.Create(true, shader.handle);
+        Impl::SetShaderUniformBlockBindings(program.handle);
+        Impl::SetShaderSamplerBindings(program.handle);
+        entries = program_result.second;
+    }
+    GLuint GetHandle() const {
+        return program.handle;
+    }
+    // Returns a copy of the entries stored by Create().
+    ShaderEntries GetEntries() const {
+        return entries;
+    }
+
+private:
+    OGLProgram program;
+    ShaderEntries entries;
+};
+
+// TODO(wwylele): beautify this doc
+// This is a shader cache designed for translating Maxwell shader programs to GLSL shaders.
+// The double cache is needed because different KeyConfigType, which includes a hash of the code
+// region (including its leftover unused code) can generate the same GLSL code.
+template <typename KeyConfigType,
+          ProgramResult (*CodeGenerator)(const ShaderSetup&, const KeyConfigType&),
+          GLenum ShaderType>
+class ShaderCache {
+public:
+    ShaderCache() = default;
+
+    using Result = std::pair<GLuint, ShaderEntries>;
+    // Returns the program handle and entries for key, generating GLSL and compiling on a miss.
+    Result Get(const KeyConfigType& key, const ShaderSetup& setup) {
+        auto map_it = shader_map.find(key);
+        if (map_it == shader_map.end()) {
+            ProgramResult program = CodeGenerator(setup, key);
+            // Identical GLSL text produced for different keys shares one OGLShaderStage.
+            auto [iter, new_shader] = shader_cache.emplace(program.first, OGLShaderStage{});
+            OGLShaderStage& cached_shader = iter->second;
+            if (new_shader) {
+                cached_shader.Create(program, ShaderType);
+            }
+            shader_map[key] = &cached_shader;
+            return {cached_shader.GetHandle(), program.second};
+        } else {
+            return {map_it->second->GetHandle(), map_it->second->GetEntries()};
+        }
+    }
+
+private:
+    std::unordered_map<KeyConfigType, OGLShaderStage*> shader_map; // key -> shader_cache entry
+    std::unordered_map<std::string, OGLShaderStage> shader_cache;  // GLSL source -> stage
+};
+
+using VertexShaders = ShaderCache<MaxwellVSConfig, &GenerateVertexShader, GL_VERTEX_SHADER>;
+
+using FragmentShaders = ShaderCache<MaxwellFSConfig, &GenerateFragmentShader, GL_FRAGMENT_SHADER>;
+
+class ProgramManager {
+public:
+    ProgramManager() {
+        pipeline.Create();
+    }
+    // Fetches (or builds) the vertex stage for config, makes it current, returns its entries.
+    ShaderEntries UseProgrammableVertexShader(const MaxwellVSConfig& config,
+                                              const ShaderSetup& setup) {
+        ShaderEntries result;
+        std::tie(current.vs, result) = vertex_shaders.Get(config, setup);
+        return result;
+    }
+    // Fragment counterpart; setup is taken by reference since it embeds the whole ProgramCode.
+    ShaderEntries UseProgrammableFragmentShader(const MaxwellFSConfig& config,
+                                                const ShaderSetup& setup) {
+        ShaderEntries result;
+        std::tie(current.fs, result) = fragment_shaders.Get(config, setup);
+        return result;
+    }
+    // Returns the program currently selected for stage; only Vertex and Fragment are handled.
+    GLuint GetCurrentProgramStage(Maxwell3D::Regs::ShaderStage stage) {
+        switch (stage) {
+        case Maxwell3D::Regs::ShaderStage::Vertex:
+            return current.vs;
+        case Maxwell3D::Regs::ShaderStage::Fragment:
+            return current.fs;
+        }
+        UNREACHABLE();
+        return 0; // not expected to run; gives every control path a return value
+    }
+    // Selects program 0 for the geometry stage.
+    void UseTrivialGeometryShader() {
+        current.gs = 0;
+    }
+    // Attaches the current stage programs to the pipeline and selects the pipeline in state.
+    void ApplyTo(OpenGLState& state) {
+        // Workaround for AMD bug
+        glUseProgramStages(pipeline.handle,
+                           GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT | GL_FRAGMENT_SHADER_BIT,
+                           0);
+
+        glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, current.vs);
+        glUseProgramStages(pipeline.handle, GL_GEOMETRY_SHADER_BIT, current.gs);
+        glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, current.fs);
+        state.draw.shader_program = 0;
+        state.draw.program_pipeline = pipeline.handle;
+    }
+
+private:
+    struct ShaderTuple {
+        GLuint vs = 0, gs = 0, fs = 0;
+        bool operator==(const ShaderTuple& rhs) const {
+            return std::tie(vs, gs, fs) == std::tie(rhs.vs, rhs.gs, rhs.fs);
+        }
+        struct Hash {
+            std::size_t operator()(const ShaderTuple& tuple) const {
+                std::size_t hash = 0;
+                boost::hash_combine(hash, tuple.vs);
+                boost::hash_combine(hash, tuple.gs);
+                boost::hash_combine(hash, tuple.fs);
+                return hash;
+            }
+        };
+    };
+    ShaderTuple current;
+    VertexShaders vertex_shaders;
+    FragmentShaders fragment_shaders;
+
+    std::unordered_map<ShaderTuple, OGLProgram, ShaderTuple::Hash> program_cache;
+    OGLPipeline pipeline;
+};
+
+} // namespace GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp
index a6c6204d5..8568fface 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_util.cpp
@@ -10,156 +10,41 @@
 
 namespace GLShader {
 
-GLuint LoadProgram(const char* vertex_shader, const char* geometry_shader,
-                   const char* fragment_shader, const std::vector<const char*>& feedback_vars,
-                   bool separable_program) {
-    // Create the shaders
-    GLuint vertex_shader_id = vertex_shader ? glCreateShader(GL_VERTEX_SHADER) : 0;
-    GLuint geometry_shader_id = geometry_shader ? glCreateShader(GL_GEOMETRY_SHADER) : 0;
-    GLuint fragment_shader_id = fragment_shader ? glCreateShader(GL_FRAGMENT_SHADER) : 0;
+GLuint LoadShader(const char* source, GLenum type) {
+    const char* debug_type;
+    switch (type) {
+    case GL_VERTEX_SHADER:
+        debug_type = "vertex";
+        break;
+    case GL_GEOMETRY_SHADER:
+        debug_type = "geometry";
+        break;
+    case GL_FRAGMENT_SHADER:
+        debug_type = "fragment";
+        break;
+    default:
+        UNREACHABLE();
+    }
+    GLuint shader_id = glCreateShader(type);
+    glShaderSource(shader_id, 1, &source, nullptr);
+    NGLOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type);
+    glCompileShader(shader_id);
 
     GLint result = GL_FALSE;
-    int info_log_length;
-
-    if (vertex_shader) {
-        // Compile Vertex Shader
-        LOG_DEBUG(Render_OpenGL, "Compiling vertex shader...");
-
-        glShaderSource(vertex_shader_id, 1, &vertex_shader, nullptr);
-        glCompileShader(vertex_shader_id);
-
-        // Check Vertex Shader
-        glGetShaderiv(vertex_shader_id, GL_COMPILE_STATUS, &result);
-        glGetShaderiv(vertex_shader_id, GL_INFO_LOG_LENGTH, &info_log_length);
-
-        if (info_log_length > 1) {
-            std::vector<char> vertex_shader_error(info_log_length);
-            glGetShaderInfoLog(vertex_shader_id, info_log_length, nullptr, &vertex_shader_error[0]);
-            if (result == GL_TRUE) {
-                LOG_DEBUG(Render_OpenGL, "%s", &vertex_shader_error[0]);
-            } else {
-                LOG_CRITICAL(Render_OpenGL, "Error compiling vertex shader:\n%s",
-                             &vertex_shader_error[0]);
-            }
-        }
-    }
-
-    if (geometry_shader) {
-        // Compile Geometry Shader
-        LOG_DEBUG(Render_OpenGL, "Compiling geometry shader...");
-
-        glShaderSource(geometry_shader_id, 1, &geometry_shader, nullptr);
-        glCompileShader(geometry_shader_id);
-
-        // Check Geometry Shader
-        glGetShaderiv(geometry_shader_id, GL_COMPILE_STATUS, &result);
-        glGetShaderiv(geometry_shader_id, GL_INFO_LOG_LENGTH, &info_log_length);
-
-        if (info_log_length > 1) {
-            std::vector<char> geometry_shader_error(info_log_length);
-            glGetShaderInfoLog(geometry_shader_id, info_log_length, nullptr,
-                               &geometry_shader_error[0]);
-            if (result == GL_TRUE) {
-                LOG_DEBUG(Render_OpenGL, "%s", &geometry_shader_error[0]);
-            } else {
-                LOG_CRITICAL(Render_OpenGL, "Error compiling geometry shader:\n%s",
-                             &geometry_shader_error[0]);
-            }
-        }
-    }
-
-    if (fragment_shader) {
-        // Compile Fragment Shader
-        LOG_DEBUG(Render_OpenGL, "Compiling fragment shader...");
-
-        glShaderSource(fragment_shader_id, 1, &fragment_shader, nullptr);
-        glCompileShader(fragment_shader_id);
-
-        // Check Fragment Shader
-        glGetShaderiv(fragment_shader_id, GL_COMPILE_STATUS, &result);
-        glGetShaderiv(fragment_shader_id, GL_INFO_LOG_LENGTH, &info_log_length);
-
-        if (info_log_length > 1) {
-            std::vector<char> fragment_shader_error(info_log_length);
-            glGetShaderInfoLog(fragment_shader_id, info_log_length, nullptr,
-                               &fragment_shader_error[0]);
-            if (result == GL_TRUE) {
-                LOG_DEBUG(Render_OpenGL, "%s", &fragment_shader_error[0]);
-            } else {
-                LOG_CRITICAL(Render_OpenGL, "Error compiling fragment shader:\n%s",
-                             &fragment_shader_error[0]);
-            }
-        }
-    }
-
-    // Link the program
-    LOG_DEBUG(Render_OpenGL, "Linking program...");
-
-    GLuint program_id = glCreateProgram();
-    if (vertex_shader) {
-        glAttachShader(program_id, vertex_shader_id);
-    }
-    if (geometry_shader) {
-        glAttachShader(program_id, geometry_shader_id);
-    }
-    if (fragment_shader) {
-        glAttachShader(program_id, fragment_shader_id);
-    }
-
-    if (!feedback_vars.empty()) {
-        auto varyings = feedback_vars;
-        glTransformFeedbackVaryings(program_id, static_cast<GLsizei>(feedback_vars.size()),
-                                    &varyings[0], GL_INTERLEAVED_ATTRIBS);
-    }
-
-    if (separable_program) {
-        glProgramParameteri(program_id, GL_PROGRAM_SEPARABLE, GL_TRUE);
-    }
-
-    glLinkProgram(program_id);
-
-    // Check the program
-    glGetProgramiv(program_id, GL_LINK_STATUS, &result);
-    glGetProgramiv(program_id, GL_INFO_LOG_LENGTH, &info_log_length);
+    GLint info_log_length;
+    glGetShaderiv(shader_id, GL_COMPILE_STATUS, &result);
+    glGetShaderiv(shader_id, GL_INFO_LOG_LENGTH, &info_log_length);
 
     if (info_log_length > 1) {
-        std::vector<char> program_error(info_log_length);
-        glGetProgramInfoLog(program_id, info_log_length, nullptr, &program_error[0]);
+        std::string shader_error(info_log_length, ' ');
+        glGetShaderInfoLog(shader_id, info_log_length, nullptr, &shader_error[0]);
         if (result == GL_TRUE) {
-            LOG_DEBUG(Render_OpenGL, "%s", &program_error[0]);
+            NGLOG_DEBUG(Render_OpenGL, "{}", shader_error);
         } else {
-            LOG_CRITICAL(Render_OpenGL, "Error linking shader:\n%s", &program_error[0]);
+            NGLOG_ERROR(Render_OpenGL, "Error compiling {} shader:\n{}", debug_type, shader_error);
         }
     }
-
-    // If the program linking failed at least one of the shaders was probably bad
-    if (result == GL_FALSE) {
-        if (vertex_shader) {
-            LOG_CRITICAL(Render_OpenGL, "Vertex shader:\n%s", vertex_shader);
-        }
-        if (geometry_shader) {
-            LOG_CRITICAL(Render_OpenGL, "Geometry shader:\n%s", geometry_shader);
-        }
-        if (fragment_shader) {
-            LOG_CRITICAL(Render_OpenGL, "Fragment shader:\n%s", fragment_shader);
-        }
-    }
-    ASSERT_MSG(result == GL_TRUE, "Shader not linked");
-
-    if (vertex_shader) {
-        glDetachShader(program_id, vertex_shader_id);
-        glDeleteShader(vertex_shader_id);
-    }
-    if (geometry_shader) {
-        glDetachShader(program_id, geometry_shader_id);
-        glDeleteShader(geometry_shader_id);
-    }
-    if (fragment_shader) {
-        glDetachShader(program_id, fragment_shader_id);
-        glDeleteShader(fragment_shader_id);
-    }
-
-    return program_id;
+    return shader_id;
 }
 
 } // namespace GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h
index fc7b5e080..a1fa9e814 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.h
+++ b/src/video_core/renderer_opengl/gl_shader_util.h
@@ -6,18 +6,60 @@
 
 #include <vector>
 #include <glad/glad.h>
+#include "common/assert.h"
+#include "common/logging/log.h"
 
 namespace GLShader {
 
 /**
+ * Utility function to create and compile an OpenGL GLSL shader
+ * @param source String of the GLSL shader program
+ * @param type Type of the shader (GL_VERTEX_SHADER, GL_GEOMETRY_SHADER or GL_FRAGMENT_SHADER)
+ */
+GLuint LoadShader(const char* source, GLenum type);
+
+/**
  * Utility function to create and compile an OpenGL GLSL shader program (vertex + fragment shader)
- * @param vertex_shader String of the GLSL vertex shader program
- * @param geometry_shader String of the GLSL geometry shader program
- * @param fragment_shader String of the GLSL fragment shader program
- * @returns Handle of the newly created OpenGL shader object
+ * @param separable_program whether to create a separable program
+ * @param shaders ID of shaders to attach to the program
+ * @returns Handle of the newly created OpenGL program object
  */
-GLuint LoadProgram(const char* vertex_shader, const char* geometry_shader,
-                   const char* fragment_shader, const std::vector<const char*>& feedback_vars = {},
-                   bool separable_program = false);
+template <typename... T>
+GLuint LoadProgram(bool separable_program, T... shaders) {
+    // Link the program
+    NGLOG_DEBUG(Render_OpenGL, "Linking program...");
+
+    GLuint program_id = glCreateProgram();
+    // Attach every non-zero shader handle in the pack (zero handles are skipped)
+    ((shaders == 0 ? (void)0 : glAttachShader(program_id, shaders)), ...);
+
+    if (separable_program) {
+        glProgramParameteri(program_id, GL_PROGRAM_SEPARABLE, GL_TRUE);
+    }
+
+    glLinkProgram(program_id);
+
+    // Query the link status and the length of the info log
+    GLint result = GL_FALSE;
+    GLint info_log_length;
+    glGetProgramiv(program_id, GL_LINK_STATUS, &result);
+    glGetProgramiv(program_id, GL_INFO_LOG_LENGTH, &info_log_length);
+    // The reported length counts the terminating NUL, so 1 or less means an empty log
+    if (info_log_length > 1) {
+        std::string program_error(info_log_length, ' ');
+        glGetProgramInfoLog(program_id, info_log_length, nullptr, &program_error[0]);
+        if (result == GL_TRUE) {
+            NGLOG_DEBUG(Render_OpenGL, "{}", program_error);
+        } else {
+            NGLOG_ERROR(Render_OpenGL, "Error linking shader:\n{}", program_error);
+        }
+    }
+
+    ASSERT_MSG(result == GL_TRUE, "Shader not linked");
+    // Detach the shaders again; the shader objects themselves are not deleted here
+    ((shaders == 0 ? (void)0 : glDetachShader(program_id, shaders)), ...);
+
+    return program_id;
+}
 
 } // namespace GLShader
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 6da3a7781..f91dfe36a 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -2,8 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <iterator>
 #include <glad/glad.h>
-#include "common/common_funcs.h"
 #include "common/logging/log.h"
 #include "video_core/renderer_opengl/gl_state.h"
 
@@ -192,7 +192,7 @@ void OpenGLState::Apply() const {
     }
 
     // Textures
-    for (unsigned i = 0; i < ARRAY_SIZE(texture_units); ++i) {
+    for (size_t i = 0; i < std::size(texture_units); ++i) {
         if (texture_units[i].texture_2d != cur_state.texture_units[i].texture_2d) {
             glActiveTexture(TextureUnits::MaxwellTexture(i).Enum());
             glBindTexture(GL_TEXTURE_2D, texture_units[i].texture_2d);
@@ -202,6 +202,20 @@ void OpenGLState::Apply() const {
         }
     }
 
+    // Constbuffers
+    for (u32 stage = 0; stage < draw.const_buffers.size(); ++stage) {
+        for (u32 buffer_id = 0; buffer_id < draw.const_buffers[stage].size(); ++buffer_id) {
+            auto& current = cur_state.draw.const_buffers[stage][buffer_id];
+            auto& new_state = draw.const_buffers[stage][buffer_id];
+            if (current.enabled != new_state.enabled || current.bindpoint != new_state.bindpoint ||
+                current.ssbo != new_state.ssbo) {
+                if (new_state.enabled) {
+                    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, new_state.bindpoint, new_state.ssbo);
+                }
+            }
+        }
+    }
+
     // Lighting LUTs
     if (lighting_lut.texture_buffer != cur_state.lighting_lut.texture_buffer) {
         glActiveTexture(TextureUnits::LightingLUT.Enum());
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index b18af14bb..75c08e645 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -123,6 +123,12 @@ public:
         GLuint uniform_buffer;   // GL_UNIFORM_BUFFER_BINDING
         GLuint shader_program;   // GL_CURRENT_PROGRAM
         GLuint program_pipeline; // GL_PROGRAM_PIPELINE_BINDING
+        struct ConstBufferConfig {
+            bool enabled = false;
+            GLuint bindpoint;
+            GLuint ssbo;
+        };
+        std::array<std::array<ConstBufferConfig, 16>, 5> const_buffers{};
     } draw;
 
     struct {
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index 4bc2f52e0..e78dc5784 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#pragma once
+
 #include <memory>
 #include <glad/glad.h>
 #include "common/common_types.h"
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 48ee80125..a49265b38 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -10,6 +10,14 @@
 #include "common/logging/log.h"
 #include "video_core/engines/maxwell_3d.h"
 
+using GLvec2 = std::array<GLfloat, 2>;
+using GLvec3 = std::array<GLfloat, 3>;
+using GLvec4 = std::array<GLfloat, 4>;
+
+using GLuvec2 = std::array<GLuint, 2>;
+using GLuvec3 = std::array<GLuint, 3>;
+using GLuvec4 = std::array<GLuint, 4>;
+
 namespace MaxwellToGL {
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
@@ -23,7 +31,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_UNSIGNED_BYTE;
         }
 
-        LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size=%s", attrib.SizeString().c_str());
+        NGLOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
         UNREACHABLE();
         return {};
     }
@@ -32,17 +40,33 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         return GL_FLOAT;
     }
 
-    LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type=%s", attrib.TypeString().c_str());
+    NGLOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
+    UNREACHABLE();
+    return {};
+}
+
+inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { // Maxwell index format -> GL type
+    switch (index_format) {
+    case Maxwell::IndexFormat::UnsignedByte:
+        return GL_UNSIGNED_BYTE;
+    case Maxwell::IndexFormat::UnsignedShort:
+        return GL_UNSIGNED_SHORT;
+    case Maxwell::IndexFormat::UnsignedInt:
+        return GL_UNSIGNED_INT;
+    }
+    NGLOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format));
     UNREACHABLE();
     return {};
 }
 
 inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
     switch (topology) {
+    case Maxwell::PrimitiveTopology::Triangles:
+        return GL_TRIANGLES;
     case Maxwell::PrimitiveTopology::TriangleStrip:
         return GL_TRIANGLE_STRIP;
     }
-    LOG_CRITICAL(Render_OpenGL, "Unimplemented primitive topology=%d", topology);
+    NGLOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology));
     UNREACHABLE();
     return {};
 }
@@ -54,18 +78,90 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode) {
     case Tegra::Texture::TextureFilter::Nearest:
         return GL_NEAREST;
     }
-    LOG_CRITICAL(Render_OpenGL, "Unimplemented texture filter mode=%u",
-                 static_cast<u32>(filter_mode));
+    NGLOG_CRITICAL(Render_OpenGL, "Unimplemented texture filter mode={}",
+                   static_cast<u32>(filter_mode));
     UNREACHABLE();
     return {};
 }
 
 inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
     switch (wrap_mode) {
+    case Tegra::Texture::WrapMode::Wrap:
+        return GL_REPEAT;
     case Tegra::Texture::WrapMode::ClampToEdge:
         return GL_CLAMP_TO_EDGE;
+    case Tegra::Texture::WrapMode::ClampOGL:
+        // TODO(Subv): GL_CLAMP was removed as of OpenGL 3.1. To emulate it, we can use
+        // GL_CLAMP_TO_BORDER to get the border color of the texture and then sample the edge to
+        // manually mix them. The shader part of this is not yet implemented.
+        return GL_CLAMP_TO_BORDER;
+    }
+    NGLOG_CRITICAL(Render_OpenGL, "Unimplemented texture wrap mode={}",
+                   static_cast<u32>(wrap_mode));
+    UNREACHABLE();
+    return {}; // Value-initialized fallback for unhandled wrap modes
+}
+
+inline GLenum BlendEquation(Maxwell::Blend::Equation equation) { // Maxwell equation -> GL enum
+    switch (equation) {
+    case Maxwell::Blend::Equation::Add:
+        return GL_FUNC_ADD;
+    case Maxwell::Blend::Equation::Subtract:
+        return GL_FUNC_SUBTRACT;
+    case Maxwell::Blend::Equation::ReverseSubtract:
+        return GL_FUNC_REVERSE_SUBTRACT;
+    case Maxwell::Blend::Equation::Min:
+        return GL_MIN;
+    case Maxwell::Blend::Equation::Max:
+        return GL_MAX;
+    }
+    NGLOG_CRITICAL(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation));
+    UNREACHABLE();
+    return {}; // Value-initialized fallback for unhandled equations
+}
+
+inline GLenum BlendFunc(Maxwell::Blend::Factor factor) { // Maxwell blend factor -> GL enum
+    switch (factor) {
+    case Maxwell::Blend::Factor::Zero:
+        return GL_ZERO;
+    case Maxwell::Blend::Factor::One:
+        return GL_ONE;
+    case Maxwell::Blend::Factor::SourceColor:
+        return GL_SRC_COLOR;
+    case Maxwell::Blend::Factor::OneMinusSourceColor:
+        return GL_ONE_MINUS_SRC_COLOR;
+    case Maxwell::Blend::Factor::SourceAlpha:
+        return GL_SRC_ALPHA;
+    case Maxwell::Blend::Factor::OneMinusSourceAlpha:
+        return GL_ONE_MINUS_SRC_ALPHA;
+    case Maxwell::Blend::Factor::DestAlpha:
+        return GL_DST_ALPHA;
+    case Maxwell::Blend::Factor::OneMinusDestAlpha:
+        return GL_ONE_MINUS_DST_ALPHA;
+    case Maxwell::Blend::Factor::DestColor:
+        return GL_DST_COLOR;
+    case Maxwell::Blend::Factor::OneMinusDestColor:
+        return GL_ONE_MINUS_DST_COLOR;
+    case Maxwell::Blend::Factor::SourceAlphaSaturate:
+        return GL_SRC_ALPHA_SATURATE;
+    case Maxwell::Blend::Factor::Source1Color:
+        return GL_SRC1_COLOR;
+    case Maxwell::Blend::Factor::OneMinusSource1Color:
+        return GL_ONE_MINUS_SRC1_COLOR;
+    case Maxwell::Blend::Factor::Source1Alpha:
+        return GL_SRC1_ALPHA;
+    case Maxwell::Blend::Factor::OneMinusSource1Alpha:
+        return GL_ONE_MINUS_SRC1_ALPHA;
+    case Maxwell::Blend::Factor::ConstantColor:
+        return GL_CONSTANT_COLOR;
+    case Maxwell::Blend::Factor::OneMinusConstantColor:
+        return GL_ONE_MINUS_CONSTANT_COLOR;
+    case Maxwell::Blend::Factor::ConstantAlpha:
+        return GL_CONSTANT_ALPHA;
+    case Maxwell::Blend::Factor::OneMinusConstantAlpha:
+        return GL_ONE_MINUS_CONSTANT_ALPHA;
     }
-    LOG_CRITICAL(Render_OpenGL, "Unimplemented texture wrap mode=%u", static_cast<u32>(wrap_mode));
+    NGLOG_CRITICAL(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor));
     UNREACHABLE();
     return {};
 }
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 78b50b227..ab0acb20a 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -9,13 +9,10 @@
 #include <memory>
 #include <glad/glad.h>
 #include "common/assert.h"
-#include "common/bit_field.h"
 #include "common/logging/log.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/frontend/emu_window.h"
-#include "core/hw/hw.h"
-#include "core/hw/lcd.h"
 #include "core/memory.h"
 #include "core/settings.h"
 #include "core/tracer/recorder.h"
@@ -57,7 +54,7 @@ uniform sampler2D color_texture;
 void main() {
     // Swap RGBA -> ABGR so we don't have to do this on the CPU. This needs to change if we have to
     // support more framebuffer pixel formats.
-    color = texture(color_texture, frag_tex_coord).abgr;
+    color = texture(color_texture, frag_tex_coord);
 }
 )";
 
@@ -210,7 +207,7 @@ void RendererOpenGL::InitOpenGLObjects() {
                  0.0f);
 
     // Link shaders and get variable locations
-    shader.Create(vertex_shader, nullptr, fragment_shader);
+    shader.CreateFromSource(vertex_shader, nullptr, fragment_shader);
     state.draw.shader_program = shader.handle;
     state.Apply();
     uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix");
@@ -311,10 +308,10 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
         }
 
     std::array<ScreenRectVertex, 4> vertices = {{
-        ScreenRectVertex(x, y, texcoords.top, right),
-        ScreenRectVertex(x + w, y, texcoords.bottom, right),
-        ScreenRectVertex(x, y + h, texcoords.top, left),
-        ScreenRectVertex(x + w, y + h, texcoords.bottom, left),
+        ScreenRectVertex(x, y, texcoords.top, left),
+        ScreenRectVertex(x + w, y, texcoords.bottom, left),
+        ScreenRectVertex(x, y + h, texcoords.top, right),
+        ScreenRectVertex(x + w, y + h, texcoords.bottom, right),
     }};
 
     state.texture_units[0].texture_2d = screen_info.display_texture;
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index c52f40037..2cc6d9a00 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -72,7 +72,7 @@ private:
     // OpenGL object IDs
     OGLVertexArray vertex_array;
     OGLBuffer vertex_buffer;
-    OGLShader shader;
+    OGLProgram shader;
 
     /// Display information for Switch screen
     ScreenInfo screen_info;
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 2e87281eb..4df687786 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -48,31 +48,39 @@ u32 BytesPerPixel(TextureFormat format) {
     case TextureFormat::DXT1:
         // In this case a 'pixel' actually refers to a 4x4 tile.
         return 8;
+    case TextureFormat::DXT23:
+    case TextureFormat::DXT45:
+        // In this case a 'pixel' actually refers to a 4x4 tile.
+        return 16;
     case TextureFormat::A8R8G8B8:
         return 4;
+    case TextureFormat::B5G6R5:
+        return 2;
     default:
         UNIMPLEMENTED_MSG("Format not implemented");
         break;
     }
 }
 
-std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width, u32 height) {
+std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width, u32 height,
+                                 u32 block_height) {
     u8* data = Memory::GetPointer(address);
     u32 bytes_per_pixel = BytesPerPixel(format);
 
-    static constexpr u32 DefaultBlockHeight = 16;
-
     std::vector<u8> unswizzled_data(width * height * bytes_per_pixel);
 
     switch (format) {
     case TextureFormat::DXT1:
-        // In the DXT1 format, each 4x4 tile is swizzled instead of just individual pixel values.
+    case TextureFormat::DXT23:
+    case TextureFormat::DXT45:
+        // In the DXT formats, each 4x4 tile is swizzled instead of just individual pixel values.
         CopySwizzledData(width / 4, height / 4, bytes_per_pixel, bytes_per_pixel, data,
-                         unswizzled_data.data(), true, DefaultBlockHeight);
+                         unswizzled_data.data(), true, block_height);
         break;
     case TextureFormat::A8R8G8B8:
+    case TextureFormat::B5G6R5:
         CopySwizzledData(width, height, bytes_per_pixel, bytes_per_pixel, data,
-                         unswizzled_data.data(), true, DefaultBlockHeight);
+                         unswizzled_data.data(), true, block_height);
         break;
     default:
         UNIMPLEMENTED_MSG("Format not implemented");
@@ -89,7 +97,10 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat
     // TODO(Subv): Implement.
     switch (format) {
     case TextureFormat::DXT1:
+    case TextureFormat::DXT23:
+    case TextureFormat::DXT45:
     case TextureFormat::A8R8G8B8:
+    case TextureFormat::B5G6R5:
         // TODO(Subv): For the time being just forward the same data without any decoding.
         rgba_data = texture_data;
         break;
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index 0c21694ff..a700911cf 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -14,7 +14,8 @@ namespace Texture {
 /**
  * Unswizzles a swizzled texture without changing its format.
  */
-std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width, u32 height);
+std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width, u32 height,
+                                 u32 block_height = TICEntry::DefaultBlockHeight);
 
 /**
  * Decodes an unswizzled texture into a A8R8G8B8 texture.
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index c12ed6e1d..86e45aa88 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "common/assert.h"
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@@ -13,8 +14,11 @@ namespace Tegra {
 namespace Texture {
 
 enum class TextureFormat : u32 {
-    A8R8G8B8 = 8,
+    A8R8G8B8 = 0x8,
+    B5G6R5 = 0x15,
     DXT1 = 0x24,
+    DXT23 = 0x25,
+    DXT45 = 0x26,
 };
 
 enum class TextureType : u32 {
@@ -55,6 +59,8 @@ union TextureHandle {
 static_assert(sizeof(TextureHandle) == 4, "TextureHandle has wrong size");
 
 struct TICEntry {
+    static constexpr u32 DefaultBlockHeight = 16; // Default block_height for UnswizzleTexture()
+
     union {
         u32 raw;
         BitField<0, 7, TextureFormat> format;
@@ -68,7 +74,12 @@ struct TICEntry {
         BitField<0, 16, u32> address_high;
         BitField<21, 3, TICHeaderVersion> header_version;
     };
-    INSERT_PADDING_BYTES(4);
+    union { // Overlapping fields; Pitch() and BlockHeight() assert which header_version applies
+        BitField<3, 3, u32> block_height; // Stored as log2; decoded by BlockHeight()
+
+        // High 16 bits of the pitch value
+        BitField<0, 16, u32> pitch_high;
+    };
     union {
         BitField<0, 16, u32> width_minus_1;
         BitField<23, 4, TextureType> texture_type;
@@ -80,6 +91,13 @@ struct TICEntry {
         return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low);
     }
 
+    u32 Pitch() const { // Only valid for Pitch / PitchColorKey headers (asserted below)
+        ASSERT(header_version == TICHeaderVersion::Pitch ||
+               header_version == TICHeaderVersion::PitchColorKey);
+        // The pitch value is 21 bits, and is 32B aligned.
+        return pitch_high << 5; // 16 stored bits + 5 implicit zero low bits = 21 bits
+    }
+
     u32 Width() const {
         return width_minus_1 + 1;
     }
@@ -88,6 +106,13 @@ struct TICEntry {
         return height_minus_1 + 1;
     }
 
+    u32 BlockHeight() const { // Only valid for BlockLinear / BlockLinearColorKey headers
+        ASSERT(header_version == TICHeaderVersion::BlockLinear ||
+               header_version == TICHeaderVersion::BlockLinearColorKey);
+        // The block height is stored in log2 format.
+        return 1 << block_height; // block_height is a 3-bit field, so the result is 1..128
+    }
+
     bool IsTiled() const {
         return header_version == TICHeaderVersion::BlockLinear ||
                header_version == TICHeaderVersion::BlockLinearColorKey;
diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index be0f7e22b..e0a14d48f 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -151,7 +151,7 @@ static inline void MortonCopyPixels128(u32 width, u32 height, u32 bytes_per_pixe
             const u32 coarse_y = y & ~127;
             u32 morton_offset =
                 GetMortonOffset128(x, y, bytes_per_pixel) + coarse_y * width * bytes_per_pixel;
-            u32 gl_pixel_index = (x + (height - 1 - y) * width) * gl_bytes_per_pixel;
+            u32 gl_pixel_index = (x + y * width) * gl_bytes_per_pixel;
 
             data_ptrs[morton_to_gl] = morton_data + morton_offset;
             data_ptrs[!morton_to_gl] = &gl_data[gl_pixel_index];