Diffstat (limited to 'src/video_core')
-rw-r--r--  src/video_core/CMakeLists.txt                            |    4
-rw-r--r--  src/video_core/command_processor.cpp                     |   18
-rw-r--r--  src/video_core/debug_utils/debug_utils.h                 |    6
-rw-r--r--  src/video_core/engines/fermi_2d.cpp                      |    5
-rw-r--r--  src/video_core/engines/maxwell_3d.cpp                    |   21
-rw-r--r--  src/video_core/engines/maxwell_3d.h                      |  156
-rw-r--r--  src/video_core/engines/maxwell_dma.cpp                   |   73
-rw-r--r--  src/video_core/engines/maxwell_dma.h                     |  155
-rw-r--r--  src/video_core/engines/shader_bytecode.h                 |   91
-rw-r--r--  src/video_core/gpu.cpp                                   |    2
-rw-r--r--  src/video_core/gpu.h                                     |   13
-rw-r--r--  src/video_core/memory_manager.cpp                        |    4
-rw-r--r--  src/video_core/rasterizer_interface.h                    |    8
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp         |  308
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.h           |   16
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer_cache.cpp   | 1555
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer_cache.h     |  426
-rw-r--r--  src/video_core/renderer_opengl/gl_resource_manager.h     |    2
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.cpp  |  460
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_gen.cpp         |    4
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_manager.cpp     |    4
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_util.cpp        |    6
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_util.h          |    8
-rw-r--r--  src/video_core/renderer_opengl/gl_state.cpp              |   83
-rw-r--r--  src/video_core/renderer_opengl/gl_state.h                |   43
-rw-r--r--  src/video_core/renderer_opengl/maxwell_to_gl.h           |   89
-rw-r--r--  src/video_core/renderer_opengl/renderer_opengl.cpp       |   17
-rw-r--r--  src/video_core/renderer_opengl/renderer_opengl.h         |    2
-rw-r--r--  src/video_core/textures/astc.cpp                         | 1646
-rw-r--r--  src/video_core/textures/astc.h                           |   15
-rw-r--r--  src/video_core/textures/decoders.cpp                     |   45
-rw-r--r--  src/video_core/textures/decoders.h                       |    6
-rw-r--r--  src/video_core/video_core.cpp                            |    6
33 files changed, 3423 insertions, 1874 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 281810357..c6431e722 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -9,6 +9,8 @@ add_library(video_core STATIC
engines/maxwell_3d.h
engines/maxwell_compute.cpp
engines/maxwell_compute.h
+ engines/maxwell_dma.cpp
+ engines/maxwell_dma.h
engines/shader_bytecode.h
gpu.cpp
gpu.h
@@ -39,6 +41,8 @@ add_library(video_core STATIC
renderer_opengl/maxwell_to_gl.h
renderer_opengl/renderer_opengl.cpp
renderer_opengl/renderer_opengl.h
+ textures/astc.cpp
+ textures/astc.h
textures/decoders.cpp
textures/decoders.h
textures/texture.h
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index d72d6f760..31ea3adad 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -16,6 +16,7 @@
#include "video_core/engines/fermi_2d.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/maxwell_compute.h"
+#include "video_core/engines/maxwell_dma.h"
#include "video_core/gpu.h"
#include "video_core/renderer_base.h"
#include "video_core/video_core.h"
@@ -28,21 +29,21 @@ enum class BufferMethods {
};
void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params) {
- NGLOG_WARNING(HW_GPU,
- "Processing method {:08X} on subchannel {} value "
- "{:08X} remaining params {}",
- method, subchannel, value, remaining_params);
+ LOG_WARNING(HW_GPU,
+ "Processing method {:08X} on subchannel {} value "
+ "{:08X} remaining params {}",
+ method, subchannel, value, remaining_params);
if (method == static_cast<u32>(BufferMethods::BindObject)) {
// Bind the current subchannel to the desired engine id.
- NGLOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value);
+ LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", subchannel, value);
bound_engines[subchannel] = static_cast<EngineID>(value);
return;
}
if (method < static_cast<u32>(BufferMethods::CountBufferMethods)) {
// TODO(Subv): Research and implement these methods.
- NGLOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented");
+ LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented");
return;
}
@@ -60,8 +61,11 @@ void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params)
case EngineID::MAXWELL_COMPUTE_B:
maxwell_compute->WriteReg(method, value);
break;
+ case EngineID::MAXWELL_DMA_COPY_A:
+ maxwell_dma->WriteReg(method, value);
+ break;
default:
- UNIMPLEMENTED();
+ UNIMPLEMENTED_MSG("Unimplemented engine");
}
}
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index bbba8e380..9382a75e5 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -55,8 +55,10 @@ public:
virtual ~BreakPointObserver() {
auto context = context_weak.lock();
if (context) {
- std::unique_lock<std::mutex> lock(context->breakpoint_mutex);
- context->breakpoint_observers.remove(this);
+ {
+ std::unique_lock<std::mutex> lock(context->breakpoint_mutex);
+ context->breakpoint_observers.remove(this);
+ }
// If we are the last observer to be destroyed, tell the debugger context that
// it is free to continue. In particular, this is required for a proper yuzu
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 6b9382f06..34053e393 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -26,8 +26,8 @@ void Fermi2D::WriteReg(u32 method, u32 value) {
}
void Fermi2D::HandleSurfaceCopy() {
- NGLOG_WARNING(HW_GPU, "Requested a surface copy with operation {}",
- static_cast<u32>(regs.operation));
+ LOG_WARNING(HW_GPU, "Requested a surface copy with operation {}",
+ static_cast<u32>(regs.operation));
const GPUVAddr source = regs.src.Address();
const GPUVAddr dest = regs.dst.Address();
@@ -47,6 +47,7 @@ void Fermi2D::HandleSurfaceCopy() {
if (regs.src.linear == regs.dst.linear) {
// If the input layout and the output layout are the same, just perform a raw copy.
+ ASSERT(regs.src.BlockHeight() == regs.dst.BlockHeight());
Memory::CopyBlock(dest_cpu, source_cpu,
src_bytes_per_pixel * regs.dst.width * regs.dst.height);
return;
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 86e9dc998..3bca16364 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -126,6 +126,10 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
DrawArrays();
break;
}
+ case MAXWELL3D_REG_INDEX(clear_buffers): {
+ ProcessClearBuffers();
+ break;
+ }
case MAXWELL3D_REG_INDEX(query.query_get): {
ProcessQueryGet();
break;
@@ -207,8 +211,8 @@ void Maxwell3D::ProcessQueryGet() {
}
void Maxwell3D::DrawArrays() {
- NGLOG_DEBUG(HW_GPU, "called, topology={}, count={}",
- static_cast<u32>(regs.draw.topology.Value()), regs.vertex_buffer.count);
+ LOG_DEBUG(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()),
+ regs.vertex_buffer.count);
ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");
auto debug_context = Core::System::GetInstance().GetGPUDebugContext();
@@ -328,8 +332,9 @@ std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderSt
Texture::FullTextureInfo tex_info{};
// TODO(Subv): Use the shader to determine which textures are actually accessed.
- tex_info.index = (current_texture - tex_info_buffer.address - TextureInfoOffset) /
- sizeof(Texture::TextureHandle);
+ tex_info.index =
+ static_cast<u32>(current_texture - tex_info_buffer.address - TextureInfoOffset) /
+ sizeof(Texture::TextureHandle);
// Load the TIC data.
if (tex_handle.tic_id != 0) {
@@ -414,5 +419,13 @@ bool Maxwell3D::IsShaderStageEnabled(Regs::ShaderStage stage) const {
UNREACHABLE();
}
+void Maxwell3D::ProcessClearBuffers() {
+ ASSERT(regs.clear_buffers.R == regs.clear_buffers.G &&
+ regs.clear_buffers.R == regs.clear_buffers.B &&
+ regs.clear_buffers.R == regs.clear_buffers.A);
+
+ VideoCore::g_renderer->Rasterizer()->Clear();
+}
+
} // namespace Engines
} // namespace Tegra
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 2dc251205..5a7cf0107 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -280,6 +280,46 @@ public:
UnsignedInt = 0x2,
};
+ enum class ComparisonOp : u32 {
+ // These values are used by Nouveau and most games, they correspond to the OpenGL token
+ // values for these operations.
+ Never = 0x200,
+ Less = 0x201,
+ Equal = 0x202,
+ LessEqual = 0x203,
+ Greater = 0x204,
+ NotEqual = 0x205,
+ GreaterEqual = 0x206,
+ Always = 0x207,
+
+ // These values are used by some games, they seem to be NV04 values.
+ NeverOld = 1,
+ LessOld = 2,
+ EqualOld = 3,
+ LessEqualOld = 4,
+ GreaterOld = 5,
+ NotEqualOld = 6,
+ GreaterEqualOld = 7,
+ AlwaysOld = 8,
+ };
+
+ struct Cull {
+ enum class FrontFace : u32 {
+ ClockWise = 0x0900,
+ CounterClockWise = 0x0901,
+ };
+
+ enum class CullFace : u32 {
+ Front = 0x0404,
+ Back = 0x0405,
+ FrontAndBack = 0x0408,
+ };
+
+ u32 enabled;
+ FrontFace front_face;
+ CullFace cull_face;
+ };
+
struct Blend {
enum class Equation : u32 {
Add = 1,
@@ -321,6 +361,24 @@ public:
INSERT_PADDING_WORDS(1);
};
+ struct RenderTargetConfig {
+ u32 address_high;
+ u32 address_low;
+ u32 width;
+ u32 height;
+ Tegra::RenderTargetFormat format;
+ u32 block_dimensions;
+ u32 array_mode;
+ u32 layer_stride;
+ u32 base_layer;
+ INSERT_PADDING_WORDS(7);
+
+ GPUVAddr Address() const {
+ return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+ address_low);
+ }
+ };
+
union {
struct {
INSERT_PADDING_WORDS(0x45);
@@ -333,23 +391,7 @@ public:
INSERT_PADDING_WORDS(0x1B8);
- struct {
- u32 address_high;
- u32 address_low;
- u32 width;
- u32 height;
- Tegra::RenderTargetFormat format;
- u32 block_dimensions;
- u32 array_mode;
- u32 layer_stride;
- u32 base_layer;
- INSERT_PADDING_WORDS(7);
-
- GPUVAddr Address() const {
- return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
- address_low);
- }
- } rt[NumRenderTargets];
+ RenderTargetConfig rt[NumRenderTargets];
struct {
f32 scale_x;
@@ -406,12 +448,17 @@ public:
u32 count;
} vertex_buffer;
- INSERT_PADDING_WORDS(0x99);
+ INSERT_PADDING_WORDS(1);
+
+ float clear_color[4];
+ float clear_depth;
+
+ INSERT_PADDING_WORDS(0x93);
struct {
u32 address_high;
u32 address_low;
- u32 format;
+ Tegra::DepthFormat format;
u32 block_dimensions;
u32 layer_stride;
@@ -433,11 +480,23 @@ public:
};
} rt_control;
- INSERT_PADDING_WORDS(0x31);
+ INSERT_PADDING_WORDS(0x2B);
+
+ u32 depth_test_enable;
+
+ INSERT_PADDING_WORDS(0x5);
u32 independent_blend_enable;
- INSERT_PADDING_WORDS(0x15);
+ u32 depth_write_enabled;
+
+ INSERT_PADDING_WORDS(0x7);
+
+ u32 d3d_cull_mode;
+
+ ComparisonOp depth_test_func;
+
+ INSERT_PADDING_WORDS(0xB);
struct {
u32 separate_alpha;
@@ -453,7 +512,17 @@ public:
u32 enable[NumRenderTargets];
} blend;
- INSERT_PADDING_WORDS(0x77);
+ INSERT_PADDING_WORDS(0xB);
+
+ union {
+ BitField<4, 1, u32> triangle_rast_flip;
+ } screen_y_control;
+
+ INSERT_PADDING_WORDS(0x21);
+
+ u32 vb_element_base;
+
+ INSERT_PADDING_WORDS(0x49);
struct {
u32 tsc_address_high;
@@ -479,7 +548,12 @@ public:
}
} tic;
- INSERT_PADDING_WORDS(0x22);
+ INSERT_PADDING_WORDS(0x21);
+
+ union {
+ BitField<2, 1, u32> coord_origin;
+ BitField<3, 10, u32> enable;
+ } point_coord_replace;
struct {
u32 code_address_high;
@@ -534,7 +608,27 @@ public:
}
} index_array;
- INSERT_PADDING_WORDS(0xC7);
+ INSERT_PADDING_WORDS(0x7);
+
+ INSERT_PADDING_WORDS(0x46);
+
+ Cull cull;
+
+ INSERT_PADDING_WORDS(0x2B);
+
+ union {
+ u32 raw;
+ BitField<0, 1, u32> Z;
+ BitField<1, 1, u32> S;
+ BitField<2, 1, u32> R;
+ BitField<3, 1, u32> G;
+ BitField<4, 1, u32> B;
+ BitField<5, 1, u32> A;
+ BitField<6, 4, u32> RT;
+ BitField<10, 11, u32> layer;
+ } clear_buffers;
+
+ INSERT_PADDING_WORDS(0x4B);
struct {
u32 query_address_high;
@@ -716,6 +810,9 @@ private:
/// Handles writes to the macro uploading registers.
void ProcessMacroUpload(u32 data);
+ /// Handles a write to the CLEAR_BUFFERS register.
+ void ProcessClearBuffers();
+
/// Handles a write to the QUERY_GET register.
void ProcessQueryGet();
@@ -738,16 +835,27 @@ ASSERT_REG_POSITION(rt, 0x200);
ASSERT_REG_POSITION(viewport_transform[0], 0x280);
ASSERT_REG_POSITION(viewport, 0x300);
ASSERT_REG_POSITION(vertex_buffer, 0x35D);
+ASSERT_REG_POSITION(clear_color[0], 0x360);
+ASSERT_REG_POSITION(clear_depth, 0x364);
ASSERT_REG_POSITION(zeta, 0x3F8);
ASSERT_REG_POSITION(vertex_attrib_format[0], 0x458);
ASSERT_REG_POSITION(rt_control, 0x487);
+ASSERT_REG_POSITION(depth_test_enable, 0x4B3);
ASSERT_REG_POSITION(independent_blend_enable, 0x4B9);
+ASSERT_REG_POSITION(depth_write_enabled, 0x4BA);
+ASSERT_REG_POSITION(d3d_cull_mode, 0x4C2);
+ASSERT_REG_POSITION(depth_test_func, 0x4C3);
ASSERT_REG_POSITION(blend, 0x4CF);
+ASSERT_REG_POSITION(screen_y_control, 0x4EB);
+ASSERT_REG_POSITION(vb_element_base, 0x50D);
ASSERT_REG_POSITION(tsc, 0x557);
ASSERT_REG_POSITION(tic, 0x55D);
+ASSERT_REG_POSITION(point_coord_replace, 0x581);
ASSERT_REG_POSITION(code_address, 0x582);
ASSERT_REG_POSITION(draw, 0x585);
ASSERT_REG_POSITION(index_array, 0x5F2);
+ASSERT_REG_POSITION(cull, 0x646);
+ASSERT_REG_POSITION(clear_buffers, 0x674);
ASSERT_REG_POSITION(query, 0x6C0);
ASSERT_REG_POSITION(vertex_array[0], 0x700);
ASSERT_REG_POSITION(independent_blend, 0x780);
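
Note: the clear_buffers register added above packs every clear control into one 32-bit word (Z at bit 0, S at bit 1, R/G/B/A at bits 2-5, RT at bits 6-9, layer at bits 10-20); this is what Maxwell3D::ProcessClearBuffers and the rasterizer's Clear() later inspect. A minimal standalone sketch of that decoding, using plain shifts instead of the project's BitField type and a hypothetical raw value:

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical CLEAR_BUFFERS write: depth clear plus a full RGBA color clear on RT 0.
    const uint32_t raw = 0x0000003D;

    const bool clear_z = (raw >> 0) & 1;        // Z
    const bool clear_s = (raw >> 1) & 1;        // S
    const bool clear_r = (raw >> 2) & 1;        // R
    const bool clear_g = (raw >> 3) & 1;        // G
    const bool clear_b = (raw >> 4) & 1;        // B
    const bool clear_a = (raw >> 5) & 1;        // A
    const uint32_t rt = (raw >> 6) & 0xF;       // RT, 4 bits
    const uint32_t layer = (raw >> 10) & 0x7FF; // layer, 11 bits

    std::printf("Z=%d S=%d R=%d G=%d B=%d A=%d RT=%u layer=%u\n", clear_z, clear_s, clear_r,
                clear_g, clear_b, clear_a, rt, layer);
}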
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
new file mode 100644
index 000000000..6e740713f
--- /dev/null
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -0,0 +1,73 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/memory.h"
+#include "video_core/engines/maxwell_dma.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra {
+namespace Engines {
+
+MaxwellDMA::MaxwellDMA(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
+
+void MaxwellDMA::WriteReg(u32 method, u32 value) {
+ ASSERT_MSG(method < Regs::NUM_REGS,
+ "Invalid MaxwellDMA register, increase the size of the Regs structure");
+
+ regs.reg_array[method] = value;
+
+#define MAXWELLDMA_REG_INDEX(field_name) \
+ (offsetof(Tegra::Engines::MaxwellDMA::Regs, field_name) / sizeof(u32))
+
+ switch (method) {
+ case MAXWELLDMA_REG_INDEX(exec): {
+ HandleCopy();
+ break;
+ }
+ }
+
+#undef MAXWELLDMA_REG_INDEX
+}
+
+void MaxwellDMA::HandleCopy() {
+ LOG_WARNING(HW_GPU, "Requested a DMA copy");
+
+ const GPUVAddr source = regs.src_address.Address();
+ const GPUVAddr dest = regs.dst_address.Address();
+
+ const VAddr source_cpu = *memory_manager.GpuToCpuAddress(source);
+ const VAddr dest_cpu = *memory_manager.GpuToCpuAddress(dest);
+
+ // TODO(Subv): Perform more research and implement all features of this engine.
+ ASSERT(regs.exec.enable_swizzle == 0);
+ ASSERT(regs.exec.enable_2d == 1);
+ ASSERT(regs.exec.query_mode == Regs::QueryMode::None);
+ ASSERT(regs.exec.query_intr == Regs::QueryIntr::None);
+ ASSERT(regs.exec.copy_mode == Regs::CopyMode::Unk2);
+ ASSERT(regs.src_params.pos_x == 0);
+ ASSERT(regs.src_params.pos_y == 0);
+ ASSERT(regs.dst_params.pos_x == 0);
+ ASSERT(regs.dst_params.pos_y == 0);
+
+ if (regs.exec.is_dst_linear == regs.exec.is_src_linear) {
+ Memory::CopyBlock(dest_cpu, source_cpu, regs.x_count * regs.y_count);
+ return;
+ }
+
+ u8* src_buffer = Memory::GetPointer(source_cpu);
+ u8* dst_buffer = Memory::GetPointer(dest_cpu);
+
+ if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
+ // If the input is tiled and the output is linear, deswizzle the input and copy it over.
+ Texture::CopySwizzledData(regs.src_params.size_x, regs.src_params.size_y, 1, 1, src_buffer,
+ dst_buffer, true, regs.src_params.BlockHeight());
+ } else {
+ // If the input is linear and the output is tiled, swizzle the input and copy it over.
+ Texture::CopySwizzledData(regs.dst_params.size_x, regs.dst_params.size_y, 1, 1, dst_buffer,
+ src_buffer, false, regs.dst_params.BlockHeight());
+ }
+}
+
+} // namespace Engines
+} // namespace Tegra
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
new file mode 100644
index 000000000..905749bde
--- /dev/null
+++ b/src/video_core/engines/maxwell_dma.h
@@ -0,0 +1,155 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include "common/assert.h"
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+
+namespace Tegra {
+namespace Engines {
+
+class MaxwellDMA final {
+public:
+ explicit MaxwellDMA(MemoryManager& memory_manager);
+ ~MaxwellDMA() = default;
+
+ /// Write the value to the register identified by method.
+ void WriteReg(u32 method, u32 value);
+
+ struct Regs {
+ static constexpr size_t NUM_REGS = 0x1D6;
+
+ struct Parameters {
+ union {
+ BitField<0, 4, u32> block_depth;
+ BitField<4, 4, u32> block_height;
+ BitField<8, 4, u32> block_width;
+ };
+ u32 size_x;
+ u32 size_y;
+ u32 size_z;
+ u32 pos_z;
+ union {
+ BitField<0, 16, u32> pos_x;
+ BitField<16, 16, u32> pos_y;
+ };
+
+ u32 BlockHeight() const {
+ return 1 << block_height;
+ }
+ };
+
+ static_assert(sizeof(Parameters) == 24, "Parameters has wrong size");
+
+ enum class CopyMode : u32 {
+ None = 0,
+ Unk1 = 1,
+ Unk2 = 2,
+ };
+
+ enum class QueryMode : u32 {
+ None = 0,
+ Short = 1,
+ Long = 2,
+ };
+
+ enum class QueryIntr : u32 {
+ None = 0,
+ Block = 1,
+ NonBlock = 2,
+ };
+
+ union {
+ struct {
+ INSERT_PADDING_WORDS(0xC0);
+
+ struct {
+ union {
+ BitField<0, 2, CopyMode> copy_mode;
+ BitField<2, 1, u32> flush;
+
+ BitField<3, 2, QueryMode> query_mode;
+ BitField<5, 2, QueryIntr> query_intr;
+
+ BitField<7, 1, u32> is_src_linear;
+ BitField<8, 1, u32> is_dst_linear;
+
+ BitField<9, 1, u32> enable_2d;
+ BitField<10, 1, u32> enable_swizzle;
+ };
+ } exec;
+
+ INSERT_PADDING_WORDS(0x3F);
+
+ struct {
+ u32 address_high;
+ u32 address_low;
+
+ GPUVAddr Address() const {
+ return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+ address_low);
+ }
+ } src_address;
+
+ struct {
+ u32 address_high;
+ u32 address_low;
+
+ GPUVAddr Address() const {
+ return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+ address_low);
+ }
+ } dst_address;
+
+ u32 src_pitch;
+ u32 dst_pitch;
+ u32 x_count;
+ u32 y_count;
+
+ INSERT_PADDING_WORDS(0xBB);
+
+ Parameters dst_params;
+
+ INSERT_PADDING_WORDS(1);
+
+ Parameters src_params;
+
+ INSERT_PADDING_WORDS(0x13);
+ };
+ std::array<u32, NUM_REGS> reg_array;
+ };
+ } regs{};
+
+ MemoryManager& memory_manager;
+
+private:
+ /// Performs the copy from the source buffer to the destination buffer as configured in the
+ /// registers.
+ void HandleCopy();
+};
+
+#define ASSERT_REG_POSITION(field_name, position) \
+ static_assert(offsetof(MaxwellDMA::Regs, field_name) == position * 4, \
+ "Field " #field_name " has invalid position")
+
+ASSERT_REG_POSITION(exec, 0xC0);
+ASSERT_REG_POSITION(src_address, 0x100);
+ASSERT_REG_POSITION(dst_address, 0x102);
+ASSERT_REG_POSITION(src_pitch, 0x104);
+ASSERT_REG_POSITION(dst_pitch, 0x105);
+ASSERT_REG_POSITION(x_count, 0x106);
+ASSERT_REG_POSITION(y_count, 0x107);
+ASSERT_REG_POSITION(dst_params, 0x1C3);
+ASSERT_REG_POSITION(src_params, 0x1CA);
+
+#undef ASSERT_REG_POSITION
+
+} // namespace Engines
+} // namespace Tegra
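
Note: both src_address and dst_address in the new MaxwellDMA registers rebuild a 64-bit GPU virtual address from a high/low register pair, the same pattern used by Maxwell3D's RenderTargetConfig. A standalone sketch of the Address() helpers with hypothetical register values:

#include <cstdint>
#include <cstdio>

using GPUVAddr = uint64_t;

// The high word supplies bits 32 and up, the low word bits 0-31.
GPUVAddr MakeGpuAddress(uint32_t address_high, uint32_t address_low) {
    return (static_cast<GPUVAddr>(address_high) << 32) | address_low;
}

int main() {
    const uint32_t high = 0x00000001; // hypothetical src_address.address_high
    const uint32_t low = 0x00200000;  // hypothetical src_address.address_low
    std::printf("GPU address = 0x%016llx\n",
                static_cast<unsigned long long>(MakeGpuAddress(high, low))); // 0x0000000100200000
}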
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index ec8dbd370..2bc1782ad 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -142,6 +142,7 @@ enum class PredCondition : u64 {
GreaterThan = 4,
NotEqual = 5,
GreaterEqual = 6,
+ NotEqualWithNan = 13,
// TODO(Subv): Other condition types
};
@@ -165,7 +166,7 @@ enum class SubOp : u64 {
Lg2 = 0x3,
Rcp = 0x4,
Rsq = 0x5,
- Min = 0x8,
+ Sqrt = 0x8,
};
enum class F2iRoundingOp : u64 {
@@ -193,6 +194,13 @@ enum class UniformType : u64 {
Double = 5,
};
+enum class IMinMaxExchange : u64 {
+ None = 0,
+ XLo = 1,
+ XMed = 2,
+ XHi = 3,
+};
+
union Instruction {
Instruction& operator=(const Instruction& instr) {
value = instr.value;
@@ -209,20 +217,19 @@ union Instruction {
} pred;
BitField<19, 1, u64> negate_pred;
BitField<20, 8, Register> gpr20;
- BitField<20, 7, SubOp> sub_op;
+ BitField<20, 4, SubOp> sub_op;
BitField<28, 8, Register> gpr28;
BitField<39, 8, Register> gpr39;
BitField<48, 16, u64> opcode;
- BitField<50, 1, u64> saturate_a;
union {
BitField<20, 19, u64> imm20_19;
- BitField<20, 32, u64> imm20_32;
+ BitField<20, 32, s64> imm20_32;
BitField<45, 1, u64> negate_b;
BitField<46, 1, u64> abs_a;
BitField<48, 1, u64> negate_a;
BitField<49, 1, u64> abs_b;
- BitField<50, 1, u64> abs_d;
+ BitField<50, 1, u64> saturate_d;
BitField<56, 1, u64> negate_imm;
union {
@@ -231,10 +238,18 @@ union Instruction {
} fmnmx;
union {
+ BitField<39, 1, u64> invert_a;
+ BitField<40, 1, u64> invert_b;
+ BitField<41, 2, LogicOperation> operation;
+ BitField<44, 2, u64> unk44;
+ BitField<48, 3, Pred> pred48;
+ } lop;
+
+ union {
BitField<53, 2, LogicOperation> operation;
BitField<55, 1, u64> invert_a;
BitField<56, 1, u64> invert_b;
- } lop;
+ } lop32i;
float GetImm20_19() const {
float result{};
@@ -247,7 +262,7 @@ union Instruction {
float GetImm20_32() const {
float result{};
- u32 imm{static_cast<u32>(imm20_32)};
+ s32 imm{static_cast<s32>(imm20_32)};
std::memcpy(&result, &imm, sizeof(imm));
return result;
}
@@ -271,6 +286,18 @@ union Instruction {
} alu_integer;
union {
+ BitField<39, 3, u64> pred;
+ BitField<42, 1, u64> negate_pred;
+ BitField<43, 2, IMinMaxExchange> exchange;
+ BitField<48, 1, u64> is_signed;
+ } imnmx;
+
+ union {
+ BitField<54, 1, u64> saturate;
+ BitField<56, 1, u64> negate_a;
+ } iadd32i;
+
+ union {
BitField<20, 8, u64> shift_position;
BitField<28, 8, u64> shift_length;
BitField<48, 1, u64> negate_b;
@@ -316,6 +343,19 @@ union Instruction {
} isetp;
union {
+ BitField<0, 3, u64> pred0;
+ BitField<3, 3, u64> pred3;
+ BitField<12, 3, u64> pred12;
+ BitField<15, 1, u64> neg_pred12;
+ BitField<24, 2, PredOperation> cond;
+ BitField<29, 3, u64> pred29;
+ BitField<32, 1, u64> neg_pred29;
+ BitField<39, 3, u64> pred39;
+ BitField<42, 1, u64> neg_pred39;
+ BitField<45, 2, PredOperation> op;
+ } psetp;
+
+ union {
BitField<39, 3, u64> pred39;
BitField<42, 1, u64> neg_pred;
BitField<43, 1, u64> neg_a;
@@ -339,7 +379,8 @@ union Instruction {
} iset;
union {
- BitField<10, 2, Register::Size> size;
+ BitField<8, 2, Register::Size> dest_size;
+ BitField<10, 2, Register::Size> src_size;
BitField<12, 1, u64> is_output_signed;
BitField<13, 1, u64> is_input_signed;
BitField<41, 2, u64> selector;
@@ -359,7 +400,7 @@ union Instruction {
BitField<31, 4, u64> component_mask;
bool IsComponentEnabled(size_t component) const {
- return ((1 << component) & component_mask) != 0;
+ return ((1ull << component) & component_mask) != 0;
}
} tex;
@@ -378,7 +419,7 @@ union Instruction {
ASSERT(component_mask_selector < mask.size());
- return ((1 << component) & mask[component_mask_selector]) != 0;
+ return ((1ull << component) & mask[component_mask_selector]) != 0;
}
} texs;
@@ -424,6 +465,8 @@ public:
enum class Id {
KIL,
SSY,
+ SYNC,
+ DEPBAR,
BFE_C,
BFE_R,
BFE_IMM,
@@ -451,6 +494,7 @@ public:
IADD_C,
IADD_R,
IADD_IMM,
+ IADD32I,
ISCADD_C, // Scale and Add
ISCADD_R,
ISCADD_IMM,
@@ -470,6 +514,9 @@ public:
I2I_C,
I2I_R,
I2I_IMM,
+ LOP_C,
+ LOP_R,
+ LOP_IMM,
LOP32I,
MOV_C,
MOV_R,
@@ -509,12 +556,14 @@ public:
enum class Type {
Trivial,
Arithmetic,
+ ArithmeticImmediate,
ArithmeticInteger,
+ ArithmeticIntegerImmediate,
Bfe,
- Logic,
Shift,
Ffma,
Flow,
+ Synch,
Memory,
FloatSet,
FloatSetPredicate,
@@ -619,10 +668,12 @@ private:
INST("111000110011----", Id::KIL, Type::Flow, "KIL"),
INST("111000101001----", Id::SSY, Type::Flow, "SSY"),
INST("111000100100----", Id::BRA, Type::Flow, "BRA"),
+ INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
+ INST("1111000011111---", Id::SYNC, Type::Synch, "SYNC"),
INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
INST("1110111110010---", Id::LD_C, Type::Memory, "LD_C"),
INST("1110111111110---", Id::ST_A, Type::Memory, "ST_A"),
- INST("1100000000111---", Id::TEX, Type::Memory, "TEX"),
+ INST("110000----111---", Id::TEX, Type::Memory, "TEX"),
INST("1101111101001---", Id::TEXQ, Type::Memory, "TEXQ"),
INST("1101100---------", Id::TEXS, Type::Memory, "TEXS"),
INST("1101101---------", Id::TLDS, Type::Memory, "TLDS"),
@@ -638,10 +689,11 @@ private:
INST("0100110001101---", Id::FMUL_C, Type::Arithmetic, "FMUL_C"),
INST("0101110001101---", Id::FMUL_R, Type::Arithmetic, "FMUL_R"),
INST("0011100-01101---", Id::FMUL_IMM, Type::Arithmetic, "FMUL_IMM"),
- INST("00011110--------", Id::FMUL32_IMM, Type::Arithmetic, "FMUL32_IMM"),
+ INST("00011110--------", Id::FMUL32_IMM, Type::ArithmeticImmediate, "FMUL32_IMM"),
INST("0100110000010---", Id::IADD_C, Type::ArithmeticInteger, "IADD_C"),
INST("0101110000010---", Id::IADD_R, Type::ArithmeticInteger, "IADD_R"),
INST("0011100-00010---", Id::IADD_IMM, Type::ArithmeticInteger, "IADD_IMM"),
+ INST("0001110---------", Id::IADD32I, Type::ArithmeticIntegerImmediate, "IADD32I"),
INST("0100110000011---", Id::ISCADD_C, Type::ArithmeticInteger, "ISCADD_C"),
INST("0101110000011---", Id::ISCADD_R, Type::ArithmeticInteger, "ISCADD_R"),
INST("0011100-00011---", Id::ISCADD_IMM, Type::ArithmeticInteger, "ISCADD_IMM"),
@@ -658,17 +710,20 @@ private:
INST("0100110010011---", Id::MOV_C, Type::Arithmetic, "MOV_C"),
INST("0101110010011---", Id::MOV_R, Type::Arithmetic, "MOV_R"),
INST("0011100-10011---", Id::MOV_IMM, Type::Arithmetic, "MOV_IMM"),
- INST("000000010000----", Id::MOV32_IMM, Type::Arithmetic, "MOV32_IMM"),
+ INST("000000010000----", Id::MOV32_IMM, Type::ArithmeticImmediate, "MOV32_IMM"),
INST("0100110001100---", Id::FMNMX_C, Type::Arithmetic, "FMNMX_C"),
INST("0101110001100---", Id::FMNMX_R, Type::Arithmetic, "FMNMX_R"),
INST("0011100-01100---", Id::FMNMX_IMM, Type::Arithmetic, "FMNMX_IMM"),
- INST("0100110000100---", Id::IMNMX_C, Type::Arithmetic, "FMNMX_IMM"),
- INST("0101110000100---", Id::IMNMX_R, Type::Arithmetic, "FMNMX_IMM"),
- INST("0011100-00100---", Id::IMNMX_IMM, Type::Arithmetic, "FMNMX_IMM"),
+ INST("0100110000100---", Id::IMNMX_C, Type::ArithmeticInteger, "IMNMX_C"),
+ INST("0101110000100---", Id::IMNMX_R, Type::ArithmeticInteger, "IMNMX_R"),
+ INST("0011100-00100---", Id::IMNMX_IMM, Type::ArithmeticInteger, "IMNMX_IMM"),
INST("0100110000000---", Id::BFE_C, Type::Bfe, "BFE_C"),
INST("0101110000000---", Id::BFE_R, Type::Bfe, "BFE_R"),
INST("0011100-00000---", Id::BFE_IMM, Type::Bfe, "BFE_IMM"),
- INST("000001----------", Id::LOP32I, Type::Logic, "LOP32I"),
+ INST("0100110001000---", Id::LOP_C, Type::ArithmeticInteger, "LOP_C"),
+ INST("0101110001000---", Id::LOP_R, Type::ArithmeticInteger, "LOP_R"),
+ INST("0011100001000---", Id::LOP_IMM, Type::ArithmeticInteger, "LOP_IMM"),
+ INST("000001----------", Id::LOP32I, Type::ArithmeticIntegerImmediate, "LOP32I"),
INST("0100110001001---", Id::SHL_C, Type::Shift, "SHL_C"),
INST("0101110001001---", Id::SHL_R, Type::Shift, "SHL_R"),
INST("0011100-01001---", Id::SHL_IMM, Type::Shift, "SHL_IMM"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 66351fe6e..e36483145 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -5,6 +5,7 @@
#include "video_core/engines/fermi_2d.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/maxwell_compute.h"
+#include "video_core/engines/maxwell_dma.h"
#include "video_core/gpu.h"
namespace Tegra {
@@ -14,6 +15,7 @@ GPU::GPU() {
maxwell_3d = std::make_unique<Engines::Maxwell3D>(*memory_manager);
fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
maxwell_compute = std::make_unique<Engines::MaxwellCompute>();
+ maxwell_dma = std::make_unique<Engines::MaxwellDMA>(*memory_manager);
}
GPU::~GPU() = default;
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 5852b9619..cc5ca656e 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -16,6 +16,7 @@ namespace Tegra {
enum class RenderTargetFormat : u32 {
NONE = 0x0,
RGBA32_FLOAT = 0xC0,
+ RGBA32_UINT = 0xC2,
RGBA16_FLOAT = 0xCA,
RGB10_A2_UNORM = 0xD1,
RGBA8_UNORM = 0xD5,
@@ -23,6 +24,15 @@ enum class RenderTargetFormat : u32 {
R11G11B10_FLOAT = 0xE0,
};
+enum class DepthFormat : u32 {
+ Z32_FLOAT = 0xA,
+ Z16_UNORM = 0x13,
+ S8_Z24_UNORM = 0x14,
+ Z24_X8_UNORM = 0x15,
+ Z24_S8_UNORM = 0x16,
+ Z24_C8_UNORM = 0x18,
+};
+
/// Returns the number of bytes per pixel of each rendertarget format.
u32 RenderTargetBytesPerPixel(RenderTargetFormat format);
@@ -63,6 +73,7 @@ namespace Engines {
class Fermi2D;
class Maxwell3D;
class MaxwellCompute;
+class MaxwellDMA;
} // namespace Engines
enum class EngineID {
@@ -103,6 +114,8 @@ private:
std::unique_ptr<Engines::Fermi2D> fermi_2d;
/// Compute engine
std::unique_ptr<Engines::MaxwellCompute> maxwell_compute;
+ /// DMA engine
+ std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
};
} // namespace Tegra
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 5cefce9fc..2f814a184 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -100,9 +100,9 @@ boost::optional<GPUVAddr> MemoryManager::FindFreeBlock(u64 size, u64 align) {
boost::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) {
VAddr base_addr = PageSlot(gpu_addr);
- ASSERT(base_addr != static_cast<u64>(PageStatus::Unmapped));
- if (base_addr == static_cast<u64>(PageStatus::Allocated)) {
+ if (base_addr == static_cast<u64>(PageStatus::Allocated) ||
+ base_addr == static_cast<u64>(PageStatus::Unmapped)) {
return {};
}
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index f0e48a802..499e84b89 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -19,6 +19,9 @@ public:
/// Draw the current batch of vertex arrays
virtual void DrawArrays() = 0;
+ /// Clear the current framebuffer
+ virtual void Clear() = 0;
+
/// Notify rasterizer that the specified Maxwell register has been changed
virtual void NotifyMaxwellRegisterChanged(u32 method) = 0;
@@ -51,9 +54,8 @@ public:
}
/// Attempt to use a faster method to display the framebuffer to screen
- virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& framebuffer,
- VAddr framebuffer_addr, u32 pixel_stride,
- ScreenInfo& screen_info) {
+ virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
+ u32 pixel_stride, ScreenInfo& screen_info) {
return false;
}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 6f05f24a0..ea138d402 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -112,7 +112,7 @@ RasterizerOpenGL::RasterizerOpenGL() {
glEnable(GL_BLEND);
- NGLOG_CRITICAL(Render_OpenGL, "Sync fixed function OpenGL state here!");
+ LOG_CRITICAL(Render_OpenGL, "Sync fixed function OpenGL state here!");
}
RasterizerOpenGL::~RasterizerOpenGL() {
@@ -146,7 +146,6 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
u64 size = end - start + 1;
// Copy vertex array data
- res_cache.FlushRegion(start, size, nullptr);
Memory::ReadBlock(*memory_manager->GpuToCpuAddress(start), array_ptr, size);
// Bind the vertex array to the buffer at the current offset.
@@ -166,9 +165,9 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
// assume every shader uses them all.
for (unsigned index = 0; index < 16; ++index) {
auto& attrib = regs.vertex_attrib_format[index];
- NGLOG_DEBUG(HW_GPU, "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}",
- index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(),
- attrib.offset.Value(), attrib.IsNormalized());
+ LOG_DEBUG(HW_GPU, "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}",
+ index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(),
+ attrib.offset.Value(), attrib.IsNormalized());
auto& buffer = regs.vertex_array[attrib.buffer];
ASSERT(buffer.IsEnabled());
@@ -197,8 +196,8 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
ASSERT_MSG(!gpu.regs.shader_config[0].enable, "VertexA is unsupported!");
// Next available bindpoints to use when uploading the const buffers and textures to the GLSL
- // shaders.
- u32 current_constbuffer_bindpoint = 0;
+ // shaders. The constbuffer bindpoint starts after the shader stage configuration bind points.
+ u32 current_constbuffer_bindpoint = uniform_buffers.size();
u32 current_texture_bindpoint = 0;
for (unsigned index = 1; index < Maxwell::MaxShaderProgram; ++index) {
@@ -252,8 +251,8 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
break;
}
default:
- NGLOG_CRITICAL(HW_GPU, "Unimplemented shader index={}, enable={}, offset=0x{:08X}",
- index, shader_config.enable.Value(), shader_config.offset);
+ LOG_CRITICAL(HW_GPU, "Unimplemented shader index={}, enable={}, offset=0x{:08X}", index,
+ shader_config.enable.Value(), shader_config.offset);
UNREACHABLE();
}
@@ -298,17 +297,16 @@ bool RasterizerOpenGL::AccelerateDrawBatch(bool is_indexed) {
return true;
}
-void RasterizerOpenGL::DrawArrays() {
- if (accelerate_draw == AccelDraw::Disabled)
- return;
-
- MICROPROFILE_SCOPE(OpenGL_Drawing);
+std::pair<Surface, Surface> RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb,
+ bool using_depth_fb) {
const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
- // TODO(bunnei): Implement these
+ // Sync the depth test state before configuring the framebuffer surfaces.
+ SyncDepthTestState();
+
+ // TODO(bunnei): Implement this
const bool has_stencil = false;
- const bool using_color_fb = true;
- const bool using_depth_fb = false;
+
const MathUtil::Rectangle<s32> viewport_rect{regs.viewport_transform[0].GetRect()};
const bool write_color_fb =
@@ -325,35 +323,21 @@ void RasterizerOpenGL::DrawArrays() {
std::tie(color_surface, depth_surface, surfaces_rect) =
res_cache.GetFramebufferSurfaces(using_color_fb, using_depth_fb, viewport_rect);
- const u16 res_scale = color_surface != nullptr
- ? color_surface->res_scale
- : (depth_surface == nullptr ? 1u : depth_surface->res_scale);
-
MathUtil::Rectangle<u32> draw_rect{
+ static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.left) + viewport_rect.left,
+ surfaces_rect.left, surfaces_rect.right)), // Left
+ static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.bottom) + viewport_rect.top,
+ surfaces_rect.bottom, surfaces_rect.top)), // Top
+ static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.left) + viewport_rect.right,
+ surfaces_rect.left, surfaces_rect.right)), // Right
static_cast<u32>(
- std::clamp<s32>(static_cast<s32>(surfaces_rect.left) + viewport_rect.left * res_scale,
- surfaces_rect.left, surfaces_rect.right)), // Left
- static_cast<u32>(
- std::clamp<s32>(static_cast<s32>(surfaces_rect.bottom) + viewport_rect.top * res_scale,
- surfaces_rect.bottom, surfaces_rect.top)), // Top
- static_cast<u32>(
- std::clamp<s32>(static_cast<s32>(surfaces_rect.left) + viewport_rect.right * res_scale,
- surfaces_rect.left, surfaces_rect.right)), // Right
- static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.bottom) +
- viewport_rect.bottom * res_scale,
- surfaces_rect.bottom, surfaces_rect.top))}; // Bottom
+ std::clamp<s32>(static_cast<s32>(surfaces_rect.bottom) + viewport_rect.bottom,
+ surfaces_rect.bottom, surfaces_rect.top))}; // Bottom
// Bind the framebuffer surfaces
BindFramebufferSurfaces(color_surface, depth_surface, has_stencil);
- // Sync the viewport
- SyncViewport(surfaces_rect, res_scale);
-
- // Sync the blend state registers
- SyncBlendState();
-
- // TODO(bunnei): Sync framebuffer_scale uniform here
- // TODO(bunnei): Sync scissorbox uniform(s) here
+ SyncViewport(surfaces_rect);
// Viewport can have negative offsets or larger dimensions than our framebuffer sub-rect. Enable
// scissor test to prevent drawing outside of the framebuffer region
@@ -364,6 +348,66 @@ void RasterizerOpenGL::DrawArrays() {
state.scissor.height = draw_rect.GetHeight();
state.Apply();
+ // Only return the surface to be marked as dirty if writing to it is enabled.
+ return std::make_pair(write_color_fb ? color_surface : nullptr,
+ write_depth_fb ? depth_surface : nullptr);
+}
+
+void RasterizerOpenGL::Clear() {
+ const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
+
+ bool use_color_fb = false;
+ bool use_depth_fb = false;
+
+ GLbitfield clear_mask = 0;
+ if (regs.clear_buffers.R && regs.clear_buffers.G && regs.clear_buffers.B &&
+ regs.clear_buffers.A) {
+ clear_mask |= GL_COLOR_BUFFER_BIT;
+ use_color_fb = true;
+ }
+ if (regs.clear_buffers.Z) {
+ clear_mask |= GL_DEPTH_BUFFER_BIT;
+ use_depth_fb = true;
+ }
+
+ if (clear_mask == 0)
+ return;
+
+ auto [dirty_color_surface, dirty_depth_surface] =
+ ConfigureFramebuffers(use_color_fb, use_depth_fb);
+
+ // TODO(Subv): Support clearing only partial colors.
+ glClearColor(regs.clear_color[0], regs.clear_color[1], regs.clear_color[2],
+ regs.clear_color[3]);
+ glClearDepth(regs.clear_depth);
+
+ glClear(clear_mask);
+
+ // Mark framebuffer surfaces as dirty
+ if (dirty_color_surface != nullptr) {
+ res_cache.MarkSurfaceAsDirty(dirty_color_surface);
+ }
+ if (dirty_depth_surface != nullptr) {
+ res_cache.MarkSurfaceAsDirty(dirty_depth_surface);
+ }
+}
+
+void RasterizerOpenGL::DrawArrays() {
+ if (accelerate_draw == AccelDraw::Disabled)
+ return;
+
+ MICROPROFILE_SCOPE(OpenGL_Drawing);
+ const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
+
+ auto [dirty_color_surface, dirty_depth_surface] =
+ ConfigureFramebuffers(true, regs.zeta.Address() != 0);
+
+ SyncBlendState();
+ SyncCullMode();
+
+ // TODO(bunnei): Sync framebuffer_scale uniform here
+ // TODO(bunnei): Sync scissorbox uniform(s) here
+
// Draw the vertex batch
const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
const u64 index_buffer_size{regs.index_array.count * regs.index_array.FormatSizeInBytes()};
@@ -420,14 +464,16 @@ void RasterizerOpenGL::DrawArrays() {
const GLenum primitive_mode{MaxwellToGL::PrimitiveTopology(regs.draw.topology)};
if (is_indexed) {
- const GLint index_min{static_cast<GLint>(regs.index_array.first)};
- const GLint index_max{static_cast<GLint>(regs.index_array.first + regs.index_array.count)};
- glDrawRangeElementsBaseVertex(primitive_mode, index_min, index_max, regs.index_array.count,
- MaxwellToGL::IndexFormat(regs.index_array.format),
- reinterpret_cast<const void*>(index_buffer_offset),
- -index_min);
+ const GLint base_vertex{static_cast<GLint>(regs.vb_element_base)};
+
+ // Adjust the index buffer offset so it points to the first desired index.
+ index_buffer_offset += regs.index_array.first * regs.index_array.FormatSizeInBytes();
+
+ glDrawElementsBaseVertex(primitive_mode, regs.index_array.count,
+ MaxwellToGL::IndexFormat(regs.index_array.format),
+ reinterpret_cast<const void*>(index_buffer_offset), base_vertex);
} else {
- glDrawArrays(primitive_mode, 0, regs.vertex_buffer.count);
+ glDrawArrays(primitive_mode, regs.vertex_buffer.first, regs.vertex_buffer.count);
}
// Disable scissor test
@@ -437,24 +483,16 @@ void RasterizerOpenGL::DrawArrays() {
// Unbind textures for potential future use as framebuffer attachments
for (auto& texture_unit : state.texture_units) {
- texture_unit.texture_2d = 0;
+ texture_unit.Unbind();
}
state.Apply();
// Mark framebuffer surfaces as dirty
- MathUtil::Rectangle<u32> draw_rect_unscaled{
- draw_rect.left / res_scale, draw_rect.top / res_scale, draw_rect.right / res_scale,
- draw_rect.bottom / res_scale};
-
- if (color_surface != nullptr && write_color_fb) {
- auto interval = color_surface->GetSubRectInterval(draw_rect_unscaled);
- res_cache.InvalidateRegion(boost::icl::first(interval), boost::icl::length(interval),
- color_surface);
+ if (dirty_color_surface != nullptr) {
+ res_cache.MarkSurfaceAsDirty(dirty_color_surface);
}
- if (depth_surface != nullptr && write_depth_fb) {
- auto interval = depth_surface->GetSubRectInterval(draw_rect_unscaled);
- res_cache.InvalidateRegion(boost::icl::first(interval), boost::icl::length(interval),
- depth_surface);
+ if (dirty_depth_surface != nullptr) {
+ res_cache.MarkSurfaceAsDirty(dirty_depth_surface);
}
}
@@ -462,7 +500,7 @@ void RasterizerOpenGL::NotifyMaxwellRegisterChanged(u32 method) {}
void RasterizerOpenGL::FlushAll() {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
- res_cache.FlushAll();
+ res_cache.FlushRegion(0, Kernel::VMManager::MAX_ADDRESS);
}
void RasterizerOpenGL::FlushRegion(Tegra::GPUVAddr addr, u64 size) {
@@ -472,13 +510,13 @@ void RasterizerOpenGL::FlushRegion(Tegra::GPUVAddr addr, u64 size) {
void RasterizerOpenGL::InvalidateRegion(Tegra::GPUVAddr addr, u64 size) {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
- res_cache.InvalidateRegion(addr, size, nullptr);
+ res_cache.InvalidateRegion(addr, size);
}
void RasterizerOpenGL::FlushAndInvalidateRegion(Tegra::GPUVAddr addr, u64 size) {
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
res_cache.FlushRegion(addr, size);
- res_cache.InvalidateRegion(addr, size, nullptr);
+ res_cache.InvalidateRegion(addr, size);
}
bool RasterizerOpenGL::AccelerateDisplayTransfer(const void* config) {
@@ -497,45 +535,28 @@ bool RasterizerOpenGL::AccelerateFill(const void* config) {
return true;
}
-bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& framebuffer,
+bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
VAddr framebuffer_addr, u32 pixel_stride,
ScreenInfo& screen_info) {
- if (framebuffer_addr == 0) {
- return false;
+ if (!framebuffer_addr) {
+ return {};
}
+
MICROPROFILE_SCOPE(OpenGL_CacheManagement);
- SurfaceParams src_params;
- src_params.cpu_addr = framebuffer_addr;
- src_params.addr = res_cache.TryFindFramebufferGpuAddress(framebuffer_addr).get_value_or(0);
- src_params.width = std::min(framebuffer.width, pixel_stride);
- src_params.height = framebuffer.height;
- src_params.stride = pixel_stride;
- src_params.is_tiled = true;
- src_params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight;
- src_params.pixel_format =
- SurfaceParams::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format);
- src_params.component_type =
- SurfaceParams::ComponentTypeFromGPUPixelFormat(framebuffer.pixel_format);
- src_params.UpdateParams();
-
- MathUtil::Rectangle<u32> src_rect;
- Surface src_surface;
- std::tie(src_surface, src_rect) =
- res_cache.GetSurfaceSubRect(src_params, ScaleMatch::Ignore, true);
-
- if (src_surface == nullptr) {
- return false;
+ const auto& surface{res_cache.TryFindFramebufferSurface(framebuffer_addr)};
+ if (!surface) {
+ return {};
}
- u32 scaled_width = src_surface->GetScaledWidth();
- u32 scaled_height = src_surface->GetScaledHeight();
+ // Verify that the cached surface is the same size and format as the requested framebuffer
+ const auto& params{surface->GetSurfaceParams()};
+ const auto& pixel_format{SurfaceParams::PixelFormatFromGPUPixelFormat(config.pixel_format)};
+ ASSERT_MSG(params.width == config.width, "Framebuffer width is different");
+ ASSERT_MSG(params.height == config.height, "Framebuffer height is different");
+ ASSERT_MSG(params.pixel_format == pixel_format, "Framebuffer pixel_format is different");
- screen_info.display_texcoords = MathUtil::Rectangle<float>(
- (float)src_rect.bottom / (float)scaled_height, (float)src_rect.left / (float)scaled_width,
- (float)src_rect.top / (float)scaled_height, (float)src_rect.right / (float)scaled_width);
-
- screen_info.display_texture = src_surface->texture.handle;
+ screen_info.display_texture = surface->Texture().handle;
return true;
}
@@ -608,32 +629,44 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, GLuint progr
boost::optional<VAddr> addr = gpu.memory_manager->GpuToCpuAddress(buffer.address);
- std::vector<u8> data;
+ size_t size = 0;
+
if (used_buffer.IsIndirect()) {
// Buffer is accessed indirectly, so upload the entire thing
- data.resize(buffer.size * sizeof(float));
+ size = buffer.size * sizeof(float);
+
+ if (size > MaxConstbufferSize) {
+ LOG_ERROR(HW_GPU, "indirect constbuffer size {} exceeds maximum {}", size,
+ MaxConstbufferSize);
+ size = MaxConstbufferSize;
+ }
} else {
// Buffer is accessed directly, upload just what we use
- data.resize(used_buffer.GetSize() * sizeof(float));
+ size = used_buffer.GetSize() * sizeof(float);
}
+ // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
+ // UBO alignment requirements.
+ size = Common::AlignUp(size, sizeof(GLvec4));
+ ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
+
+ std::vector<u8> data(size);
Memory::ReadBlock(*addr, data.data(), data.size());
- glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer_draw_state.ssbo);
- glBufferData(GL_SHADER_STORAGE_BUFFER, data.size(), data.data(), GL_DYNAMIC_DRAW);
- glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
+ glBindBuffer(GL_UNIFORM_BUFFER, buffer_draw_state.ssbo);
+ glBufferData(GL_UNIFORM_BUFFER, data.size(), data.data(), GL_DYNAMIC_DRAW);
+ glBindBuffer(GL_UNIFORM_BUFFER, 0);
// Now configure the bindpoint of the buffer inside the shader
std::string buffer_name = used_buffer.GetName();
- GLuint index =
- glGetProgramResourceIndex(program, GL_SHADER_STORAGE_BLOCK, buffer_name.c_str());
+ GLuint index = glGetProgramResourceIndex(program, GL_UNIFORM_BLOCK, buffer_name.c_str());
if (index != -1)
- glShaderStorageBlockBinding(program, index, buffer_draw_state.bindpoint);
+ glUniformBlockBinding(program, index, buffer_draw_state.bindpoint);
}
state.Apply();
- return current_bindpoint + entries.size();
+ return current_bindpoint + static_cast<u32>(entries.size());
}
u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, GLuint program, u32 current_unit,
@@ -653,16 +686,23 @@ u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, GLuint program,
// Bind the uniform to the sampler.
GLint uniform = glGetUniformLocation(program, entry.GetName().c_str());
- ASSERT(uniform != -1);
+ if (uniform == -1) {
+ continue;
+ }
+
glProgramUniform1i(program, uniform, current_bindpoint);
const auto texture = maxwell3d.GetStageTexture(entry.GetStage(), entry.GetOffset());
- ASSERT(texture.enabled);
+
+ if (!texture.enabled) {
+ state.texture_units[current_bindpoint].texture_2d = 0;
+ continue;
+ }
texture_samplers[current_bindpoint].SyncWithConfig(texture.tsc);
Surface surface = res_cache.GetTextureSurface(texture);
if (surface != nullptr) {
- state.texture_units[current_bindpoint].texture_2d = surface->texture.handle;
+ state.texture_units[current_bindpoint].texture_2d = surface->Texture().handle;
state.texture_units[current_bindpoint].swizzle.r =
MaxwellToGL::SwizzleSource(texture.tic.x_source);
state.texture_units[current_bindpoint].swizzle.g =
@@ -679,7 +719,7 @@ u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, GLuint program,
state.Apply();
- return current_unit + entries.size();
+ return current_unit + static_cast<u32>(entries.size());
}
void RasterizerOpenGL::BindFramebufferSurfaces(const Surface& color_surface,
@@ -688,16 +728,16 @@ void RasterizerOpenGL::BindFramebufferSurfaces(const Surface& color_surface,
state.Apply();
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
- color_surface != nullptr ? color_surface->texture.handle : 0, 0);
+ color_surface != nullptr ? color_surface->Texture().handle : 0, 0);
if (depth_surface != nullptr) {
if (has_stencil) {
// attach both depth and stencil
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
- depth_surface->texture.handle, 0);
+ depth_surface->Texture().handle, 0);
} else {
// attach depth
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
- depth_surface->texture.handle, 0);
+ depth_surface->Texture().handle, 0);
// clear stencil attachment
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
}
@@ -708,14 +748,14 @@ void RasterizerOpenGL::BindFramebufferSurfaces(const Surface& color_surface,
}
}
-void RasterizerOpenGL::SyncViewport(const MathUtil::Rectangle<u32>& surfaces_rect, u16 res_scale) {
+void RasterizerOpenGL::SyncViewport(const MathUtil::Rectangle<u32>& surfaces_rect) {
const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
const MathUtil::Rectangle<s32> viewport_rect{regs.viewport_transform[0].GetRect()};
- state.viewport.x = static_cast<GLint>(surfaces_rect.left) + viewport_rect.left * res_scale;
- state.viewport.y = static_cast<GLint>(surfaces_rect.bottom) + viewport_rect.bottom * res_scale;
- state.viewport.width = static_cast<GLsizei>(viewport_rect.GetWidth() * res_scale);
- state.viewport.height = static_cast<GLsizei>(viewport_rect.GetHeight() * res_scale);
+ state.viewport.x = static_cast<GLint>(surfaces_rect.left) + viewport_rect.left;
+ state.viewport.y = static_cast<GLint>(surfaces_rect.bottom) + viewport_rect.bottom;
+ state.viewport.width = static_cast<GLsizei>(viewport_rect.GetWidth());
+ state.viewport.height = static_cast<GLsizei>(viewport_rect.GetHeight());
}
void RasterizerOpenGL::SyncClipEnabled() {
@@ -727,7 +767,27 @@ void RasterizerOpenGL::SyncClipCoef() {
}
void RasterizerOpenGL::SyncCullMode() {
- UNREACHABLE();
+ const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
+
+ state.cull.enabled = regs.cull.enabled != 0;
+
+ if (state.cull.enabled) {
+ state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face);
+ state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face);
+
+ const bool flip_triangles{regs.screen_y_control.triangle_rast_flip == 0 ||
+ regs.viewport_transform[0].scale_y < 0.0f};
+
+ // If the GPU is configured to flip the rasterized triangles, then we need to flip the
+ // notion of front and back. Note: We flip the triangles when the value of the register is 0
+ // because OpenGL already does it for us.
+ if (flip_triangles) {
+ if (state.cull.front_face == GL_CCW)
+ state.cull.front_face = GL_CW;
+ else if (state.cull.front_face == GL_CW)
+ state.cull.front_face = GL_CCW;
+ }
+ }
}
void RasterizerOpenGL::SyncDepthScale() {
@@ -738,9 +798,20 @@ void RasterizerOpenGL::SyncDepthOffset() {
UNREACHABLE();
}
+void RasterizerOpenGL::SyncDepthTestState() {
+ const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
+
+ state.depth.test_enabled = regs.depth_test_enable != 0;
+ state.depth.write_mask = regs.depth_write_enabled ? GL_TRUE : GL_FALSE;
+
+ if (!state.depth.test_enabled)
+ return;
+
+ state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func);
+}
+
void RasterizerOpenGL::SyncBlendState() {
const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
- ASSERT_MSG(regs.independent_blend_enable == 1, "Only independent blending is implemented");
// TODO(Subv): Support more than just render target 0.
state.blend.enabled = regs.blend.enable[0] != 0;
@@ -748,6 +819,7 @@ void RasterizerOpenGL::SyncBlendState() {
if (!state.blend.enabled)
return;
+ ASSERT_MSG(regs.independent_blend_enable == 1, "Only independent blending is implemented");
ASSERT_MSG(!regs.independent_blend[0].separate_alpha, "Unimplemented");
state.blend.rgb_equation = MaxwellToGL::BlendEquation(regs.independent_blend[0].equation_rgb);
state.blend.src_rgb_func = MaxwellToGL::BlendFunc(regs.independent_blend[0].factor_source_rgb);
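
Note: the reworked indexed draw above no longer biases indices by -index_min; instead it advances the index buffer offset past regs.index_array.first indices and lets glDrawElementsBaseVertex add regs.vb_element_base to every fetched index. A standalone sketch of that offset math with hypothetical register values:

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical register values for one indexed draw.
    const uint32_t first = 6;        // regs.index_array.first
    const uint32_t count = 36;       // regs.index_array.count
    const uint32_t index_size = 2;   // regs.index_array.FormatSizeInBytes(), e.g. 16-bit indices
    const int32_t base_vertex = 100; // regs.vb_element_base

    // Offset of this draw's indices inside the already-uploaded index stream.
    uint64_t index_buffer_offset = 0;
    index_buffer_offset += first * index_size;

    std::printf("read %u indices starting at byte %llu, adding %d to each index\n", count,
                static_cast<unsigned long long>(index_buffer_offset), base_vertex);
}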
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index b7c8cf843..c406142e4 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -7,6 +7,7 @@
#include <array>
#include <cstddef>
#include <memory>
+#include <utility>
#include <vector>
#include <glad/glad.h>
#include "common/common_types.h"
@@ -28,6 +29,7 @@ public:
~RasterizerOpenGL() override;
void DrawArrays() override;
+ void Clear() override;
void NotifyMaxwellRegisterChanged(u32 method) override;
void FlushAll() override;
void FlushRegion(Tegra::GPUVAddr addr, u64 size) override;
@@ -54,6 +56,11 @@ public:
OGLShader shader;
};
+ /// Maximum supported size that a constbuffer can have in bytes.
+ static constexpr size_t MaxConstbufferSize = 0x10000;
+ static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0,
+ "The maximum size of a constbuffer must be a multiple of the size of GLvec4");
+
private:
class SamplerInfo {
public:
@@ -76,6 +83,10 @@ private:
u32 border_color_a;
};
+ /// Configures the color and depth framebuffer states and returns the dirty <Color, Depth>
+ /// surfaces if writing was enabled.
+ std::pair<Surface, Surface> ConfigureFramebuffers(bool using_color_fb, bool using_depth_fb);
+
/// Binds the framebuffer color and depth surface
void BindFramebufferSurfaces(const Surface& color_surface, const Surface& depth_surface,
bool has_stencil);
@@ -104,7 +115,7 @@ private:
u32 current_unit, const std::vector<GLShader::SamplerEntry>& entries);
/// Syncs the viewport to match the guest state
- void SyncViewport(const MathUtil::Rectangle<u32>& surfaces_rect, u16 res_scale);
+ void SyncViewport(const MathUtil::Rectangle<u32>& surfaces_rect);
/// Syncs the clip enabled status to match the guest state
void SyncClipEnabled();
@@ -121,6 +132,9 @@ private:
/// Syncs the depth offset to match the guest state
void SyncDepthOffset();
+ /// Syncs the depth test state to match the guest state
+ void SyncDepthTestState();
+
/// Syncs the blend state to match the guest state
void SyncBlendState();
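
Note: SetupConstBuffers above now clamps indirect constbuffer uploads to MaxConstbufferSize and rounds the upload size up to a multiple of sizeof(GLvec4), i.e. 16 bytes, so the UBO upload meets std140 alignment. A minimal sketch of that size computation, with a local AlignUp standing in for Common::AlignUp and a hypothetical buffer size:

#include <cstddef>
#include <cstdio>

// Mirrors the constants used above; GLvec4 is assumed to be four floats (16 bytes).
constexpr std::size_t MaxConstbufferSize = 0x10000;
constexpr std::size_t Vec4Size = 4 * sizeof(float);

// Local stand-in for Common::AlignUp.
std::size_t AlignUp(std::size_t value, std::size_t alignment) {
    return (value + alignment - 1) / alignment * alignment;
}

int main() {
    std::size_t size = 7 * sizeof(float); // hypothetical: shader reads 7 floats directly
    if (size > MaxConstbufferSize)
        size = MaxConstbufferSize;        // the indirect-access path clamps to the maximum
    size = AlignUp(size, Vec4Size);       // 28 bytes -> 32 bytes
    std::printf("upload size = %zu bytes\n", size);
}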
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index ff48a2669..323ff7408 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -1,36 +1,23 @@
-// Copyright 2015 Citra Emulator Project
+// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
-#include <atomic>
-#include <cstring>
-#include <iterator>
-#include <memory>
-#include <utility>
-#include <vector>
-#include <boost/optional.hpp>
-#include <boost/range/iterator_range.hpp>
#include <glad/glad.h>
+
#include "common/alignment.h"
-#include "common/bit_field.h"
-#include "common/color.h"
-#include "common/logging/log.h"
-#include "common/math_util.h"
+#include "common/assert.h"
#include "common/microprofile.h"
#include "common/scope_exit.h"
#include "core/core.h"
-#include "core/frontend/emu_window.h"
#include "core/hle/kernel/process.h"
-#include "core/hle/kernel/vm_manager.h"
#include "core/memory.h"
#include "core/settings.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_rasterizer_cache.h"
-#include "video_core/renderer_opengl/gl_state.h"
+#include "video_core/textures/astc.h"
#include "video_core/textures/decoders.h"
#include "video_core/utils.h"
-#include "video_core/video_core.h"
using SurfaceType = SurfaceParams::SurfaceType;
using PixelFormat = SurfaceParams::PixelFormat;
@@ -40,89 +27,178 @@ struct FormatTuple {
GLint internal_format;
GLenum format;
GLenum type;
+ ComponentType component_type;
bool compressed;
};
+/*static*/ SurfaceParams SurfaceParams::CreateForTexture(
+ const Tegra::Texture::FullTextureInfo& config) {
+
+ SurfaceParams params{};
+ params.addr = config.tic.Address();
+ params.is_tiled = config.tic.IsTiled();
+ params.block_height = params.is_tiled ? config.tic.BlockHeight() : 0,
+ params.pixel_format = PixelFormatFromTextureFormat(config.tic.format);
+ params.component_type = ComponentTypeFromTexture(config.tic.r_type.Value());
+ params.type = GetFormatType(params.pixel_format);
+ params.width = Common::AlignUp(config.tic.Width(), GetCompressionFactor(params.pixel_format));
+ params.height = Common::AlignUp(config.tic.Height(), GetCompressionFactor(params.pixel_format));
+ params.unaligned_height = config.tic.Height();
+ params.size_in_bytes = params.SizeInBytes();
+ return params;
+}
+
+/*static*/ SurfaceParams SurfaceParams::CreateForFramebuffer(
+ const Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig& config) {
+
+ SurfaceParams params{};
+ params.addr = config.Address();
+ params.is_tiled = true;
+ params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight;
+ params.pixel_format = PixelFormatFromRenderTargetFormat(config.format);
+ params.component_type = ComponentTypeFromRenderTarget(config.format);
+ params.type = GetFormatType(params.pixel_format);
+ params.width = config.width;
+ params.height = config.height;
+ params.unaligned_height = config.height;
+ params.size_in_bytes = params.SizeInBytes();
+ return params;
+}
+
+/*static*/ SurfaceParams SurfaceParams::CreateForDepthBuffer(
+ const Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig& config, Tegra::GPUVAddr zeta_address,
+ Tegra::DepthFormat format) {
+
+ SurfaceParams params{};
+ params.addr = zeta_address;
+ params.is_tiled = true;
+ params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight;
+ params.pixel_format = PixelFormatFromDepthFormat(format);
+ params.component_type = ComponentTypeFromDepthFormat(format);
+ params.type = GetFormatType(params.pixel_format);
+ params.width = config.width;
+ params.height = config.height;
+ params.unaligned_height = config.height;
+ params.size_in_bytes = params.SizeInBytes();
+ return params;
+}
+
static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_format_tuples = {{
- {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, false}, // ABGR8
- {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5, false}, // B5G6R5
- {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, false}, // A2B10G10R10
- {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, false}, // A1B5G5R5
- {GL_R8, GL_RED, GL_UNSIGNED_BYTE, false}, // R8
- {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBA16F
- {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false}, // R11FG11FB10F
- {GL_COMPRESSED_RGB_S3TC_DXT1_EXT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, true}, // DXT1
- {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, true}, // DXT23
- {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, true}, // DXT45
- {GL_COMPRESSED_RED_RGTC1, GL_RED, GL_UNSIGNED_INT_8_8_8_8, true}, // DXN1
+ {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8
+ {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false}, // B5G6R5
+ {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, ComponentType::UNorm,
+ false}, // A2B10G10R10
+ {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, ComponentType::UNorm, false}, // A1B5G5R5
+ {GL_R8, GL_RED, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // R8
+ {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, ComponentType::Float, false}, // RGBA16F
+ {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, ComponentType::Float,
+ false}, // R11FG11FB10F
+ {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // RGBA32UI
+ {GL_COMPRESSED_RGB_S3TC_DXT1_EXT, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
+ true}, // DXT1
+ {GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
+ true}, // DXT23
+ {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
+ true}, // DXT45
+ {GL_COMPRESSED_RED_RGTC1, GL_RED, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, true}, // DXN1
+ {GL_COMPRESSED_RGBA_BPTC_UNORM_ARB, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
+ true}, // BC7U
+ {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_4X4
+
+ // DepthStencil formats
+ {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, ComponentType::UNorm,
+ false}, // Z24S8
+ {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, ComponentType::UNorm,
+ false}, // S8Z24
+ {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT, ComponentType::Float, false}, // Z32F
}};
static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType component_type) {
- const SurfaceType type = SurfaceParams::GetFormatType(pixel_format);
- if (type == SurfaceType::ColorTexture) {
- ASSERT(static_cast<size_t>(pixel_format) < tex_format_tuples.size());
- // For now only UNORM components are supported, or either R11FG11FB10F or RGBA16F which are
- // type FLOAT
- ASSERT(component_type == ComponentType::UNorm || pixel_format == PixelFormat::RGBA16F ||
- pixel_format == PixelFormat::R11FG11FB10F);
- return tex_format_tuples[static_cast<unsigned int>(pixel_format)];
- } else if (type == SurfaceType::Depth || type == SurfaceType::DepthStencil) {
- // TODO(Subv): Implement depth formats
- ASSERT_MSG(false, "Unimplemented");
- }
+ ASSERT(static_cast<size_t>(pixel_format) < tex_format_tuples.size());
+ auto& format = tex_format_tuples[static_cast<unsigned int>(pixel_format)];
+ ASSERT(component_type == format.component_type);
- UNREACHABLE();
- return {};
+ return format;
}
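A minimal usage sketch (illustrative only, using just the formats listed in the table above): the lookup is now a plain index plus a component-type check, with no per-surface-type branching.

    // Sketch: fetching the Z32F tuple added above. The ASSERT fires if the guest
    // requests a component type that does not match the table entry.
    const FormatTuple& z32f_tuple = GetFormatTuple(PixelFormat::Z32F, ComponentType::Float);
    // z32f_tuple.internal_format == GL_DEPTH_COMPONENT32F
    // z32f_tuple.format          == GL_DEPTH_COMPONENT
    // z32f_tuple.type            == GL_FLOAT
    // z32f_tuple.compressed      == false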
-template <typename Map, typename Interval>
-constexpr auto RangeFromInterval(Map& map, const Interval& interval) {
- return boost::make_iterator_range(map.equal_range(interval));
+VAddr SurfaceParams::GetCpuAddr() const {
+ const auto& gpu = Core::System::GetInstance().GPU();
+ return *gpu.memory_manager->GpuToCpuAddress(addr);
}
-static u16 GetResolutionScaleFactor() {
- return static_cast<u16>(!Settings::values.resolution_factor
- ? VideoCore::g_emu_window->GetFramebufferLayout().GetScalingRatio()
- : Settings::values.resolution_factor);
+static bool IsPixelFormatASTC(PixelFormat format) {
+ switch (format) {
+ case PixelFormat::ASTC_2D_4X4:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) {
+ switch (format) {
+ case PixelFormat::ASTC_2D_4X4:
+ return {4, 4};
+ default:
+ LOG_CRITICAL(HW_GPU, "Unhandled format: {}", static_cast<u32>(format));
+ UNREACHABLE();
+ }
+}
+
+MathUtil::Rectangle<u32> SurfaceParams::GetRect() const {
+ u32 actual_height{unaligned_height};
+ if (IsPixelFormatASTC(pixel_format)) {
+ // ASTC formats must stop at the ASTC block size boundary
+ actual_height = Common::AlignDown(actual_height, GetASTCBlockSize(pixel_format).second);
+ }
+ return {0, actual_height, width, 0};
}
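For instance, assuming an ASTC_2D_4X4 surface with unaligned_height = 30, the block size returned above is {4, 4}, so Common::AlignDown(30, 4) clamps the height to 28 rows and the returned rectangle is {0, 28, width, 0}.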
template <bool morton_to_gl, PixelFormat format>
-void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, Tegra::GPUVAddr base,
- Tegra::GPUVAddr start, Tegra::GPUVAddr end) {
+void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, Tegra::GPUVAddr addr) {
constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(format) / CHAR_BIT;
constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(format);
const auto& gpu = Core::System::GetInstance().GPU();
if (morton_to_gl) {
- auto data = Tegra::Texture::UnswizzleTexture(
- *gpu.memory_manager->GpuToCpuAddress(base),
- SurfaceParams::TextureFormatFromPixelFormat(format), stride, height, block_height);
- std::memcpy(gl_buffer, data.data(), data.size());
+ if (SurfaceParams::GetFormatType(format) == SurfaceType::ColorTexture) {
+ auto data = Tegra::Texture::UnswizzleTexture(
+ *gpu.memory_manager->GpuToCpuAddress(addr),
+ SurfaceParams::TextureFormatFromPixelFormat(format), stride, height, block_height);
+ std::memcpy(gl_buffer, data.data(), data.size());
+ } else {
+ auto data = Tegra::Texture::UnswizzleDepthTexture(
+ *gpu.memory_manager->GpuToCpuAddress(addr),
+ SurfaceParams::DepthFormatFromPixelFormat(format), stride, height, block_height);
+ std::memcpy(gl_buffer, data.data(), data.size());
+ }
} else {
- // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should check
- // the configuration for this and perform more generic un/swizzle
- NGLOG_WARNING(Render_OpenGL, "need to use correct swizzle/GOB parameters!");
+ // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should
+ // check the configuration for this and perform more generic un/swizzle
+ LOG_WARNING(Render_OpenGL, "need to use correct swizzle/GOB parameters!");
VideoCore::MortonCopyPixels128(
stride, height, bytes_per_pixel, gl_bytes_per_pixel,
- Memory::GetPointer(*gpu.memory_manager->GpuToCpuAddress(base)), gl_buffer,
+ Memory::GetPointer(*gpu.memory_manager->GpuToCpuAddress(addr)), gl_buffer,
morton_to_gl);
}
}
-static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra::GPUVAddr,
- Tegra::GPUVAddr),
+static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr),
SurfaceParams::MaxPixelFormat>
morton_to_gl_fns = {
MortonCopy<true, PixelFormat::ABGR8>, MortonCopy<true, PixelFormat::B5G6R5>,
MortonCopy<true, PixelFormat::A2B10G10R10>, MortonCopy<true, PixelFormat::A1B5G5R5>,
MortonCopy<true, PixelFormat::R8>, MortonCopy<true, PixelFormat::RGBA16F>,
- MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::DXT1>,
- MortonCopy<true, PixelFormat::DXT23>, MortonCopy<true, PixelFormat::DXT45>,
- MortonCopy<true, PixelFormat::DXN1>,
+ MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::RGBA32UI>,
+ MortonCopy<true, PixelFormat::DXT1>, MortonCopy<true, PixelFormat::DXT23>,
+ MortonCopy<true, PixelFormat::DXT45>, MortonCopy<true, PixelFormat::DXN1>,
+ MortonCopy<true, PixelFormat::BC7U>, MortonCopy<true, PixelFormat::ASTC_2D_4X4>,
+ MortonCopy<true, PixelFormat::Z24S8>, MortonCopy<true, PixelFormat::S8Z24>,
+ MortonCopy<true, PixelFormat::Z32F>,
};
-static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra::GPUVAddr,
- Tegra::GPUVAddr),
+static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr),
SurfaceParams::MaxPixelFormat>
gl_to_morton_fns = {
MortonCopy<false, PixelFormat::ABGR8>,
@@ -132,11 +208,17 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr, Tegra:
MortonCopy<false, PixelFormat::R8>,
MortonCopy<false, PixelFormat::RGBA16F>,
MortonCopy<false, PixelFormat::R11FG11FB10F>,
- // TODO(Subv): Swizzling the DXT1/DXT23/DXT45/DXN1 formats is not yet supported
+ MortonCopy<false, PixelFormat::RGBA32UI>,
+ // TODO(Subv): Swizzling the DXT1/DXT23/DXT45/DXN1/BC7U formats is not yet supported
+ nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
+ MortonCopy<false, PixelFormat::ABGR8>,
+ MortonCopy<false, PixelFormat::Z24S8>,
+ MortonCopy<false, PixelFormat::S8Z24>,
+ MortonCopy<false, PixelFormat::Z32F>,
};
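Both tables are indexed by the numeric value of PixelFormat, following the same enum order as tex_format_tuples; a nullptr entry marks a format whose GL-to-morton swizzle is not implemented yet. A minimal dispatch sketch under that assumption:

    // Sketch: guarded dispatch through the per-format table (fields as used later
    // in this file; compressed formats currently have no gl_to_morton entry).
    const auto swizzle_fn = gl_to_morton_fns[static_cast<size_t>(params.pixel_format)];
    if (swizzle_fn != nullptr) {
        swizzle_fn(params.width, params.block_height, params.height, gl_buffer.data(),
                   params.addr);
    }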
// Allocate an uninitialized texture of appropriate size and format for the surface
@@ -166,374 +248,144 @@ static void AllocateSurfaceTexture(GLuint texture, const FormatTuple& format_tup
cur_state.Apply();
}
-static bool BlitTextures(GLuint src_tex, const MathUtil::Rectangle<u32>& src_rect, GLuint dst_tex,
- const MathUtil::Rectangle<u32>& dst_rect, SurfaceType type,
- GLuint read_fb_handle, GLuint draw_fb_handle) {
-
- glCopyImageSubData(src_tex, GL_TEXTURE_2D, 0, src_rect.left, src_rect.bottom, 0, dst_tex,
- GL_TEXTURE_2D, 0, dst_rect.left, dst_rect.bottom, 0, src_rect.GetWidth(),
- src_rect.GetHeight(), 0);
- return true;
-}
-
-static bool FillSurface(const Surface& surface, const u8* fill_data,
- const MathUtil::Rectangle<u32>& fill_rect, GLuint draw_fb_handle) {
- UNREACHABLE();
- return {};
-}
-
-SurfaceParams SurfaceParams::FromInterval(SurfaceInterval interval) const {
- SurfaceParams params = *this;
- const u32 tiled_size = is_tiled ? 8 : 1;
- const u64 stride_tiled_bytes = BytesInPixels(stride * tiled_size);
- Tegra::GPUVAddr aligned_start =
- addr + Common::AlignDown(boost::icl::first(interval) - addr, stride_tiled_bytes);
- Tegra::GPUVAddr aligned_end =
- addr + Common::AlignUp(boost::icl::last_next(interval) - addr, stride_tiled_bytes);
-
- if (aligned_end - aligned_start > stride_tiled_bytes) {
- params.addr = aligned_start;
- params.height = static_cast<u32>((aligned_end - aligned_start) / BytesInPixels(stride));
- } else {
- // 1 row
- ASSERT(aligned_end - aligned_start == stride_tiled_bytes);
- const u64 tiled_alignment = BytesInPixels(is_tiled ? 8 * 8 : 1);
- aligned_start =
- addr + Common::AlignDown(boost::icl::first(interval) - addr, tiled_alignment);
- aligned_end =
- addr + Common::AlignUp(boost::icl::last_next(interval) - addr, tiled_alignment);
- params.addr = aligned_start;
- params.width = static_cast<u32>(PixelsInBytes(aligned_end - aligned_start) / tiled_size);
- params.stride = params.width;
- params.height = tiled_size;
- }
- params.UpdateParams();
-
- return params;
-}
-
-SurfaceInterval SurfaceParams::GetSubRectInterval(MathUtil::Rectangle<u32> unscaled_rect) const {
- if (unscaled_rect.GetHeight() == 0 || unscaled_rect.GetWidth() == 0) {
- return {};
- }
-
- if (is_tiled) {
- unscaled_rect.left = Common::AlignDown(unscaled_rect.left, 8) * 8;
- unscaled_rect.bottom = Common::AlignDown(unscaled_rect.bottom, 8) / 8;
- unscaled_rect.right = Common::AlignUp(unscaled_rect.right, 8) * 8;
- unscaled_rect.top = Common::AlignUp(unscaled_rect.top, 8) / 8;
- }
-
- const u32 stride_tiled = !is_tiled ? stride : stride * 8;
-
- const u32 pixel_offset =
- stride_tiled * (!is_tiled ? unscaled_rect.bottom : (height / 8) - unscaled_rect.top) +
- unscaled_rect.left;
-
- const u32 pixels = (unscaled_rect.GetHeight() - 1) * stride_tiled + unscaled_rect.GetWidth();
-
- return {addr + BytesInPixels(pixel_offset), addr + BytesInPixels(pixel_offset + pixels)};
-}
-
-MathUtil::Rectangle<u32> SurfaceParams::GetSubRect(const SurfaceParams& sub_surface) const {
- const u32 begin_pixel_index = static_cast<u32>(PixelsInBytes(sub_surface.addr - addr));
-
- if (is_tiled) {
- const int x0 = (begin_pixel_index % (stride * 8)) / 8;
- const int y0 = (begin_pixel_index / (stride * 8)) * 8;
- // Top to bottom
- return MathUtil::Rectangle<u32>(x0, height - y0, x0 + sub_surface.width,
- height - (y0 + sub_surface.height));
- }
-
- const int x0 = begin_pixel_index % stride;
- const int y0 = begin_pixel_index / stride;
- // Bottom to top
- return MathUtil::Rectangle<u32>(x0, y0 + sub_surface.height, x0 + sub_surface.width, y0);
-}
-
-MathUtil::Rectangle<u32> SurfaceParams::GetScaledSubRect(const SurfaceParams& sub_surface) const {
- auto rect = GetSubRect(sub_surface);
- rect.left = rect.left * res_scale;
- rect.right = rect.right * res_scale;
- rect.top = rect.top * res_scale;
- rect.bottom = rect.bottom * res_scale;
- return rect;
-}
-
-bool SurfaceParams::ExactMatch(const SurfaceParams& other_surface) const {
- return std::tie(other_surface.addr, other_surface.width, other_surface.height,
- other_surface.stride, other_surface.block_height, other_surface.pixel_format,
- other_surface.component_type,
- other_surface.is_tiled) == std::tie(addr, width, height, stride, block_height,
- pixel_format, component_type, is_tiled) &&
- pixel_format != PixelFormat::Invalid;
-}
-
-bool SurfaceParams::CanSubRect(const SurfaceParams& sub_surface) const {
- return sub_surface.addr >= addr && sub_surface.end <= end &&
- sub_surface.pixel_format == pixel_format && pixel_format != PixelFormat::Invalid &&
- sub_surface.is_tiled == is_tiled && sub_surface.block_height == block_height &&
- sub_surface.component_type == component_type &&
- (sub_surface.addr - addr) % BytesInPixels(is_tiled ? 64 : 1) == 0 &&
- (sub_surface.stride == stride || sub_surface.height <= (is_tiled ? 8u : 1u)) &&
- GetSubRect(sub_surface).left + sub_surface.width <= stride;
-}
-
-bool SurfaceParams::CanExpand(const SurfaceParams& expanded_surface) const {
- return pixel_format != PixelFormat::Invalid && pixel_format == expanded_surface.pixel_format &&
- addr <= expanded_surface.end && expanded_surface.addr <= end &&
- is_tiled == expanded_surface.is_tiled && block_height == expanded_surface.block_height &&
- component_type == expanded_surface.component_type && stride == expanded_surface.stride &&
- (std::max(expanded_surface.addr, addr) - std::min(expanded_surface.addr, addr)) %
- BytesInPixels(stride * (is_tiled ? 8 : 1)) ==
- 0;
-}
-
-bool SurfaceParams::CanTexCopy(const SurfaceParams& texcopy_params) const {
- if (pixel_format == PixelFormat::Invalid || addr > texcopy_params.addr ||
- end < texcopy_params.end) {
- return false;
- }
- if (texcopy_params.block_height != block_height ||
- texcopy_params.component_type != component_type)
- return false;
-
- if (texcopy_params.width != texcopy_params.stride) {
- const u32 tile_stride = static_cast<u32>(BytesInPixels(stride * (is_tiled ? 8 : 1)));
- return (texcopy_params.addr - addr) % BytesInPixels(is_tiled ? 64 : 1) == 0 &&
- texcopy_params.width % BytesInPixels(is_tiled ? 64 : 1) == 0 &&
- (texcopy_params.height == 1 || texcopy_params.stride == tile_stride) &&
- ((texcopy_params.addr - addr) % tile_stride) + texcopy_params.width <= tile_stride;
- }
- return FromInterval(texcopy_params.GetInterval()).GetInterval() == texcopy_params.GetInterval();
-}
-
-VAddr SurfaceParams::GetCpuAddr() const {
- // When this function is used, only cpu_addr or (GPU) addr should be set, not both
- ASSERT(!(cpu_addr && addr));
- const auto& gpu = Core::System::GetInstance().GPU();
- return cpu_addr.get_value_or(*gpu.memory_manager->GpuToCpuAddress(addr));
-}
-
-bool CachedSurface::CanFill(const SurfaceParams& dest_surface,
- SurfaceInterval fill_interval) const {
- if (type == SurfaceType::Fill && IsRegionValid(fill_interval) &&
- boost::icl::first(fill_interval) >= addr &&
- boost::icl::last_next(fill_interval) <= end && // dest_surface is within our fill range
- dest_surface.FromInterval(fill_interval).GetInterval() ==
- fill_interval) { // make sure interval is a rectangle in dest surface
- if (fill_size * CHAR_BIT != dest_surface.GetFormatBpp()) {
- // Check if bits repeat for our fill_size
- const u32 dest_bytes_per_pixel = std::max(dest_surface.GetFormatBpp() / CHAR_BIT, 1u);
- std::vector<u8> fill_test(fill_size * dest_bytes_per_pixel);
-
- for (u32 i = 0; i < dest_bytes_per_pixel; ++i)
- std::memcpy(&fill_test[i * fill_size], &fill_data[0], fill_size);
-
- for (u32 i = 0; i < fill_size; ++i)
- if (std::memcmp(&fill_test[dest_bytes_per_pixel * i], &fill_test[0],
- dest_bytes_per_pixel) != 0)
- return false;
-
- if (dest_surface.GetFormatBpp() == 4 && (fill_test[0] & 0xF) != (fill_test[0] >> 4))
- return false;
+CachedSurface::CachedSurface(const SurfaceParams& params) : params(params) {
+ texture.Create();
+ const auto& rect{params.GetRect()};
+ AllocateSurfaceTexture(texture.handle,
+ GetFormatTuple(params.pixel_format, params.component_type),
+ rect.GetWidth(), rect.GetHeight());
+}
+
+static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) {
+ union S8Z24 {
+ BitField<0, 24, u32> z24;
+ BitField<24, 8, u32> s8;
+ };
+ static_assert(sizeof(S8Z24) == 4, "S8Z24 is incorrect size");
+
+ union Z24S8 {
+ BitField<0, 8, u32> s8;
+ BitField<8, 24, u32> z24;
+ };
+ static_assert(sizeof(Z24S8) == 4, "Z24S8 is incorrect size");
+
+ S8Z24 input_pixel{};
+ Z24S8 output_pixel{};
+ for (size_t y = 0; y < height; ++y) {
+ for (size_t x = 0; x < width; ++x) {
+ const size_t offset{(y * width + x) * sizeof(S8Z24)};
+ std::memcpy(&input_pixel, &data[offset], sizeof(S8Z24));
+ output_pixel.s8.Assign(input_pixel.s8);
+ output_pixel.z24.Assign(input_pixel.z24);
+ std::memcpy(&data[offset], &output_pixel, sizeof(Z24S8));
}
- return true;
}
- return false;
-}
-
-bool CachedSurface::CanCopy(const SurfaceParams& dest_surface,
- SurfaceInterval copy_interval) const {
- SurfaceParams subrect_params = dest_surface.FromInterval(copy_interval);
- ASSERT(subrect_params.GetInterval() == copy_interval);
- if (CanSubRect(subrect_params))
- return true;
-
- if (CanFill(dest_surface, copy_interval))
- return true;
-
- return false;
}
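A minimal sketch of the same per-pixel swizzle written with plain shifts (the helper name and example values here are illustrative only): the source word stores stencil in bits 24-31 and depth in bits 0-23, and the output stores depth in bits 8-31 and stencil in bits 0-7.

    #include <cstdint>

    // Illustrative equivalent of one ConvertS8Z24ToZ24S8 iteration.
    static std::uint32_t SwizzleS8Z24Pixel(std::uint32_t s8z24) {
        const std::uint32_t depth = s8z24 & 0x00FFFFFFu; // z24 field, bits 0-23
        const std::uint32_t stencil = s8z24 >> 24;       // s8 field, bits 24-31
        return (depth << 8) | stencil;                   // Z24S8 layout
    }
    // Example: SwizzleS8Z24Pixel(0xAB123456u) == 0x123456ABu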
-
-SurfaceInterval SurfaceParams::GetCopyableInterval(const Surface& src_surface) const {
- SurfaceInterval result{};
- const auto valid_regions =
- SurfaceRegions(GetInterval() & src_surface->GetInterval()) - src_surface->invalid_regions;
- for (auto& valid_interval : valid_regions) {
- const SurfaceInterval aligned_interval{
- addr + Common::AlignUp(boost::icl::first(valid_interval) - addr,
- BytesInPixels(is_tiled ? 8 * 8 : 1)),
- addr + Common::AlignDown(boost::icl::last_next(valid_interval) - addr,
- BytesInPixels(is_tiled ? 8 * 8 : 1))};
-
- if (BytesInPixels(is_tiled ? 8 * 8 : 1) > boost::icl::length(valid_interval) ||
- boost::icl::length(aligned_interval) == 0) {
- continue;
- }
-
- // Get the rectangle within aligned_interval
- const u32 stride_bytes = static_cast<u32>(BytesInPixels(stride)) * (is_tiled ? 8 : 1);
- SurfaceInterval rect_interval{
- addr + Common::AlignUp(boost::icl::first(aligned_interval) - addr, stride_bytes),
- addr + Common::AlignDown(boost::icl::last_next(aligned_interval) - addr, stride_bytes),
- };
- if (boost::icl::first(rect_interval) > boost::icl::last_next(rect_interval)) {
- // 1 row
- rect_interval = aligned_interval;
- } else if (boost::icl::length(rect_interval) == 0) {
- // 2 rows that do not make a rectangle, return the larger one
- const SurfaceInterval row1{boost::icl::first(aligned_interval),
- boost::icl::first(rect_interval)};
- const SurfaceInterval row2{boost::icl::first(rect_interval),
- boost::icl::last_next(aligned_interval)};
- rect_interval = (boost::icl::length(row1) > boost::icl::length(row2)) ? row1 : row2;
- }
-
- if (boost::icl::length(rect_interval) > boost::icl::length(result)) {
- result = rect_interval;
- }
+/**
+ * Helper function to perform software conversion (as needed) when loading a buffer from Switch
+ * memory. This is for Maxwell pixel formats that cannot be represented as-is in OpenGL or with
+ * typical desktop GPUs.
+ */
+static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelFormat pixel_format,
+ u32 width, u32 height) {
+ switch (pixel_format) {
+ case PixelFormat::ASTC_2D_4X4: {
+ // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC.
+ u32 block_width{};
+ u32 block_height{};
+ std::tie(block_width, block_height) = GetASTCBlockSize(pixel_format);
+ data = Tegra::Texture::ASTC::Decompress(data, width, height, block_width, block_height);
+ break;
+ }
+ case PixelFormat::S8Z24:
+ // Convert the S8Z24 depth format to Z24S8, as OpenGL does not support S8Z24.
+ ConvertS8Z24ToZ24S8(data, width, height);
+ break;
+ }
+}
+
+/**
+ * Helper function to perform software conversion (as needed) when flushing a buffer to Switch
+ * memory. This is for Maxwell pixel formats that cannot be represented as-is in OpenGL or with
+ * typical desktop GPUs.
+ */
+static void ConvertFormatAsNeeded_FlushGLBuffer(std::vector<u8>& /*data*/, PixelFormat pixel_format,
+ u32 /*width*/, u32 /*height*/) {
+ switch (pixel_format) {
+ case PixelFormat::ASTC_2D_4X4:
+ case PixelFormat::S8Z24:
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented pixel_format={}",
+ static_cast<u32>(pixel_format));
+ UNREACHABLE();
+ break;
}
- return result;
-}
-
-void RasterizerCacheOpenGL::CopySurface(const Surface& src_surface, const Surface& dst_surface,
- SurfaceInterval copy_interval) {
- SurfaceParams subrect_params = dst_surface->FromInterval(copy_interval);
- ASSERT(subrect_params.GetInterval() == copy_interval);
-
- ASSERT(src_surface != dst_surface);
-
- // This is only called when CanCopy is true, no need to run checks here
- if (src_surface->type == SurfaceType::Fill) {
- // FillSurface needs a 4 bytes buffer
- const u64 fill_offset =
- (boost::icl::first(copy_interval) - src_surface->addr) % src_surface->fill_size;
- std::array<u8, 4> fill_buffer;
-
- u64 fill_buff_pos = fill_offset;
- for (int i : {0, 1, 2, 3})
- fill_buffer[i] = src_surface->fill_data[fill_buff_pos++ % src_surface->fill_size];
-
- FillSurface(dst_surface, &fill_buffer[0], dst_surface->GetScaledSubRect(subrect_params),
- draw_framebuffer.handle);
- return;
- }
- if (src_surface->CanSubRect(subrect_params)) {
- BlitTextures(src_surface->texture.handle, src_surface->GetScaledSubRect(subrect_params),
- dst_surface->texture.handle, dst_surface->GetScaledSubRect(subrect_params),
- src_surface->type, read_framebuffer.handle, draw_framebuffer.handle);
- return;
- }
- UNREACHABLE();
}
MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 64, 192));
-void CachedSurface::LoadGLBuffer(Tegra::GPUVAddr load_start, Tegra::GPUVAddr load_end) {
- ASSERT(type != SurfaceType::Fill);
+void CachedSurface::LoadGLBuffer() {
+ ASSERT(params.type != SurfaceType::Fill);
- u8* const texture_src_data = Memory::GetPointer(GetCpuAddr());
- if (texture_src_data == nullptr)
- return;
+ u8* const texture_src_data = Memory::GetPointer(params.GetCpuAddr());
- if (gl_buffer == nullptr) {
- gl_buffer_size = GetActualWidth() * GetActualHeight() * GetGLBytesPerPixel(pixel_format);
- gl_buffer.reset(new u8[gl_buffer_size]);
- }
+ ASSERT(texture_src_data);
- MICROPROFILE_SCOPE(OpenGL_SurfaceLoad);
+ gl_buffer.resize(params.width * params.height * GetGLBytesPerPixel(params.pixel_format));
- ASSERT(load_start >= addr && load_end <= end);
- const u64 start_offset = load_start - addr;
+ MICROPROFILE_SCOPE(OpenGL_SurfaceLoad);
- if (!is_tiled) {
- const u32 bytes_per_pixel{GetFormatBpp() >> 3};
+ if (!params.is_tiled) {
+ const u32 bytes_per_pixel{params.GetFormatBpp() >> 3};
- std::memcpy(&gl_buffer[start_offset], texture_src_data + start_offset,
- bytes_per_pixel * width * height);
+ std::memcpy(gl_buffer.data(), texture_src_data,
+ bytes_per_pixel * params.width * params.height);
} else {
- morton_to_gl_fns[static_cast<size_t>(pixel_format)](GetActualWidth(), block_height,
- GetActualHeight(), &gl_buffer[0], addr,
- load_start, load_end);
+ morton_to_gl_fns[static_cast<size_t>(params.pixel_format)](
+ params.width, params.block_height, params.height, gl_buffer.data(), params.addr);
}
+
+ ConvertFormatAsNeeded_LoadGLBuffer(gl_buffer, params.pixel_format, params.width, params.height);
}
MICROPROFILE_DEFINE(OpenGL_SurfaceFlush, "OpenGL", "Surface Flush", MP_RGB(128, 192, 64));
-void CachedSurface::FlushGLBuffer(Tegra::GPUVAddr flush_start, Tegra::GPUVAddr flush_end) {
- u8* const dst_buffer = Memory::GetPointer(GetCpuAddr());
- if (dst_buffer == nullptr)
- return;
-
- ASSERT(gl_buffer_size == width * height * GetGLBytesPerPixel(pixel_format));
+void CachedSurface::FlushGLBuffer() {
+ u8* const dst_buffer = Memory::GetPointer(params.GetCpuAddr());
- // TODO: Should probably be done in ::Memory:: and check for other regions too
- // same as loadglbuffer()
- if (flush_start < Memory::VRAM_VADDR_END && flush_end > Memory::VRAM_VADDR_END)
- flush_end = Memory::VRAM_VADDR_END;
-
- if (flush_start < Memory::VRAM_VADDR && flush_end > Memory::VRAM_VADDR)
- flush_start = Memory::VRAM_VADDR;
+ ASSERT(dst_buffer);
+ ASSERT(gl_buffer.size() ==
+ params.width * params.height * GetGLBytesPerPixel(params.pixel_format));
MICROPROFILE_SCOPE(OpenGL_SurfaceFlush);
- ASSERT(flush_start >= addr && flush_end <= end);
- const u64 start_offset = flush_start - addr;
- const u64 end_offset = flush_end - addr;
-
- if (type == SurfaceType::Fill) {
- const u64 coarse_start_offset = start_offset - (start_offset % fill_size);
- const u64 backup_bytes = start_offset % fill_size;
- std::array<u8, 4> backup_data;
- if (backup_bytes)
- std::memcpy(&backup_data[0], &dst_buffer[coarse_start_offset], backup_bytes);
-
- for (u64 offset = coarse_start_offset; offset < end_offset; offset += fill_size) {
- std::memcpy(&dst_buffer[offset], &fill_data[0],
- std::min(fill_size, end_offset - offset));
- }
+ ConvertFormatAsNeeded_FlushGLBuffer(gl_buffer, params.pixel_format, params.width,
+ params.height);
- if (backup_bytes)
- std::memcpy(&dst_buffer[coarse_start_offset], &backup_data[0], backup_bytes);
- } else if (!is_tiled) {
- std::memcpy(dst_buffer + start_offset, &gl_buffer[start_offset], flush_end - flush_start);
+ if (!params.is_tiled) {
+ std::memcpy(dst_buffer, gl_buffer.data(), params.size_in_bytes);
} else {
- gl_to_morton_fns[static_cast<size_t>(pixel_format)](
- stride, block_height, height, &gl_buffer[0], addr, flush_start, flush_end);
+ gl_to_morton_fns[static_cast<size_t>(params.pixel_format)](
+ params.width, params.block_height, params.height, gl_buffer.data(), params.addr);
}
}
MICROPROFILE_DEFINE(OpenGL_TextureUL, "OpenGL", "Texture Upload", MP_RGB(128, 64, 192));
-void CachedSurface::UploadGLTexture(const MathUtil::Rectangle<u32>& rect, GLuint read_fb_handle,
- GLuint draw_fb_handle) {
- if (type == SurfaceType::Fill)
+void CachedSurface::UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle) {
+ if (params.type == SurfaceType::Fill)
return;
MICROPROFILE_SCOPE(OpenGL_TextureUL);
- ASSERT(gl_buffer_size ==
- GetActualWidth() * GetActualHeight() * GetGLBytesPerPixel(pixel_format));
+ ASSERT(gl_buffer.size() ==
+ params.width * params.height * GetGLBytesPerPixel(params.pixel_format));
+
+ const auto& rect{params.GetRect()};
// Load data from memory to the surface
GLint x0 = static_cast<GLint>(rect.left);
GLint y0 = static_cast<GLint>(rect.bottom);
- size_t buffer_offset = (y0 * stride + x0) * GetGLBytesPerPixel(pixel_format);
+ size_t buffer_offset = (y0 * params.width + x0) * GetGLBytesPerPixel(params.pixel_format);
- const FormatTuple& tuple = GetFormatTuple(pixel_format, component_type);
+ const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type);
GLuint target_tex = texture.handle;
-
- // If not 1x scale, create 1x texture that we will blit from to replace texture subrect in
- // surface
- OGLTexture unscaled_tex;
- if (res_scale != 1) {
- x0 = 0;
- y0 = 0;
-
- unscaled_tex.Create();
- AllocateSurfaceTexture(unscaled_tex.handle, tuple, rect.GetWidth(), rect.GetHeight());
- target_tex = unscaled_tex.handle;
- }
-
OpenGLState cur_state = OpenGLState::GetCurState();
GLuint old_tex = cur_state.texture_units[0].texture_2d;
@@ -541,15 +393,15 @@ void CachedSurface::UploadGLTexture(const MathUtil::Rectangle<u32>& rect, GLuint
cur_state.Apply();
// Ensure no bad interactions with GL_UNPACK_ALIGNMENT
- ASSERT(stride * GetGLBytesPerPixel(pixel_format) % 4 == 0);
- glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(stride));
+ ASSERT(params.width * GetGLBytesPerPixel(params.pixel_format) % 4 == 0);
+ glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.width));
glActiveTexture(GL_TEXTURE0);
if (tuple.compressed) {
- glCompressedTexImage2D(GL_TEXTURE_2D, 0, tuple.internal_format,
- static_cast<GLsizei>(rect.GetWidth() * GetCompresssionFactor()),
- static_cast<GLsizei>(rect.GetHeight() * GetCompresssionFactor()), 0,
- size, &gl_buffer[buffer_offset]);
+ glCompressedTexImage2D(
+ GL_TEXTURE_2D, 0, tuple.internal_format, static_cast<GLsizei>(params.width),
+ static_cast<GLsizei>(params.height), 0, static_cast<GLsizei>(params.size_in_bytes),
+ &gl_buffer[buffer_offset]);
} else {
glTexSubImage2D(GL_TEXTURE_2D, 0, x0, y0, static_cast<GLsizei>(rect.GetWidth()),
static_cast<GLsizei>(rect.GetHeight()), tuple.format, tuple.type,
@@ -560,845 +412,250 @@ void CachedSurface::UploadGLTexture(const MathUtil::Rectangle<u32>& rect, GLuint
cur_state.texture_units[0].texture_2d = old_tex;
cur_state.Apply();
-
- if (res_scale != 1) {
- auto scaled_rect = rect;
- scaled_rect.left *= res_scale;
- scaled_rect.top *= res_scale;
- scaled_rect.right *= res_scale;
- scaled_rect.bottom *= res_scale;
-
- BlitTextures(unscaled_tex.handle, {0, rect.GetHeight(), rect.GetWidth(), 0}, texture.handle,
- scaled_rect, type, read_fb_handle, draw_fb_handle);
- }
}
MICROPROFILE_DEFINE(OpenGL_TextureDL, "OpenGL", "Texture Download", MP_RGB(128, 192, 64));
-void CachedSurface::DownloadGLTexture(const MathUtil::Rectangle<u32>& rect, GLuint read_fb_handle,
- GLuint draw_fb_handle) {
- if (type == SurfaceType::Fill)
+void CachedSurface::DownloadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle) {
+ if (params.type == SurfaceType::Fill)
return;
MICROPROFILE_SCOPE(OpenGL_TextureDL);
- if (gl_buffer == nullptr) {
- gl_buffer_size = width * height * GetGLBytesPerPixel(pixel_format);
- gl_buffer.reset(new u8[gl_buffer_size]);
- }
+ gl_buffer.resize(params.width * params.height * GetGLBytesPerPixel(params.pixel_format));
OpenGLState state = OpenGLState::GetCurState();
OpenGLState prev_state = state;
SCOPE_EXIT({ prev_state.Apply(); });
- const FormatTuple& tuple = GetFormatTuple(pixel_format, component_type);
+ const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type);
// Ensure no bad interactions with GL_PACK_ALIGNMENT
- ASSERT(stride * GetGLBytesPerPixel(pixel_format) % 4 == 0);
- glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(stride));
- size_t buffer_offset = (rect.bottom * stride + rect.left) * GetGLBytesPerPixel(pixel_format);
-
- // If not 1x scale, blit scaled texture to a new 1x texture and use that to flush
- if (res_scale != 1) {
- auto scaled_rect = rect;
- scaled_rect.left *= res_scale;
- scaled_rect.top *= res_scale;
- scaled_rect.right *= res_scale;
- scaled_rect.bottom *= res_scale;
-
- OGLTexture unscaled_tex;
- unscaled_tex.Create();
-
- MathUtil::Rectangle<u32> unscaled_tex_rect{0, rect.GetHeight(), rect.GetWidth(), 0};
- AllocateSurfaceTexture(unscaled_tex.handle, tuple, rect.GetWidth(), rect.GetHeight());
- BlitTextures(texture.handle, scaled_rect, unscaled_tex.handle, unscaled_tex_rect, type,
- read_fb_handle, draw_fb_handle);
-
- state.texture_units[0].texture_2d = unscaled_tex.handle;
- state.Apply();
-
- glActiveTexture(GL_TEXTURE0);
- glGetTexImage(GL_TEXTURE_2D, 0, tuple.format, tuple.type, &gl_buffer[buffer_offset]);
- } else {
- state.ResetTexture(texture.handle);
- state.draw.read_framebuffer = read_fb_handle;
- state.Apply();
-
- if (type == SurfaceType::ColorTexture) {
- glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
- texture.handle, 0);
- glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
- 0, 0);
- } else if (type == SurfaceType::Depth) {
- glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
- glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
- texture.handle, 0);
- glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
- } else {
- glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
- glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
- texture.handle, 0);
- }
- glReadPixels(static_cast<GLint>(rect.left), static_cast<GLint>(rect.bottom),
- static_cast<GLsizei>(rect.GetWidth()), static_cast<GLsizei>(rect.GetHeight()),
- tuple.format, tuple.type, &gl_buffer[buffer_offset]);
- }
+ ASSERT(params.width * GetGLBytesPerPixel(params.pixel_format) % 4 == 0);
+ glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.width));
- glPixelStorei(GL_PACK_ROW_LENGTH, 0);
-}
-
-enum class MatchFlags {
- None = 0,
- Invalid = 1, // Flag that can be applied to other match types, invalid matches require
- // validation before they can be used
- Exact = 1 << 1, // Surfaces perfectly match
- SubRect = 1 << 2, // Surface encompasses params
- Copy = 1 << 3, // Surface we can copy from
- Expand = 1 << 4, // Surface that can expand params
- TexCopy = 1 << 5 // Surface that will match a display transfer "texture copy" parameters
-};
-
-constexpr MatchFlags operator|(MatchFlags lhs, MatchFlags rhs) {
- return static_cast<MatchFlags>(static_cast<int>(lhs) | static_cast<int>(rhs));
-}
+ const auto& rect{params.GetRect()};
+ size_t buffer_offset =
+ (rect.bottom * params.width + rect.left) * GetGLBytesPerPixel(params.pixel_format);
-constexpr MatchFlags operator&(MatchFlags lhs, MatchFlags rhs) {
- return static_cast<MatchFlags>(static_cast<int>(lhs) & static_cast<int>(rhs));
-}
+ state.UnbindTexture(texture.handle);
+ state.draw.read_framebuffer = read_fb_handle;
+ state.Apply();
-/// Get the best surface match (and its match type) for the given flags
-template <MatchFlags find_flags>
-Surface FindMatch(const SurfaceCache& surface_cache, const SurfaceParams& params,
- ScaleMatch match_scale_type,
- boost::optional<SurfaceInterval> validate_interval = boost::none) {
- Surface match_surface = nullptr;
- bool match_valid = false;
- u32 match_scale = 0;
- SurfaceInterval match_interval{};
-
- for (auto& pair : RangeFromInterval(surface_cache, params.GetInterval())) {
- for (auto& surface : pair.second) {
- bool res_scale_matched = match_scale_type == ScaleMatch::Exact
- ? (params.res_scale == surface->res_scale)
- : (params.res_scale <= surface->res_scale);
- // validity will be checked in GetCopyableInterval
- bool is_valid =
- (find_flags & MatchFlags::Copy) != MatchFlags::None
- ? true
- : surface->IsRegionValid(validate_interval.value_or(params.GetInterval()));
-
- if ((find_flags & MatchFlags::Invalid) == MatchFlags::None && !is_valid)
- continue;
-
- auto IsMatch_Helper = [&](auto check_type, auto match_fn) {
- if ((find_flags & check_type) == MatchFlags::None)
- return;
-
- bool matched;
- SurfaceInterval surface_interval;
- std::tie(matched, surface_interval) = match_fn();
- if (!matched)
- return;
-
- if (!res_scale_matched && match_scale_type != ScaleMatch::Ignore &&
- surface->type != SurfaceType::Fill)
- return;
-
- // Found a match, update only if this is better than the previous one
- auto UpdateMatch = [&] {
- match_surface = surface;
- match_valid = is_valid;
- match_scale = surface->res_scale;
- match_interval = surface_interval;
- };
-
- if (surface->res_scale > match_scale) {
- UpdateMatch();
- return;
- } else if (surface->res_scale < match_scale) {
- return;
- }
-
- if (is_valid && !match_valid) {
- UpdateMatch();
- return;
- } else if (is_valid != match_valid) {
- return;
- }
-
- if (boost::icl::length(surface_interval) > boost::icl::length(match_interval)) {
- UpdateMatch();
- }
- };
- IsMatch_Helper(std::integral_constant<MatchFlags, MatchFlags::Exact>{}, [&] {
- return std::make_pair(surface->ExactMatch(params), surface->GetInterval());
- });
- IsMatch_Helper(std::integral_constant<MatchFlags, MatchFlags::SubRect>{}, [&] {
- return std::make_pair(surface->CanSubRect(params), surface->GetInterval());
- });
- IsMatch_Helper(std::integral_constant<MatchFlags, MatchFlags::Copy>{}, [&] {
- auto copy_interval =
- params.FromInterval(*validate_interval).GetCopyableInterval(surface);
- bool matched = boost::icl::length(copy_interval & *validate_interval) != 0 &&
- surface->CanCopy(params, copy_interval);
- return std::make_pair(matched, copy_interval);
- });
- IsMatch_Helper(std::integral_constant<MatchFlags, MatchFlags::Expand>{}, [&] {
- return std::make_pair(surface->CanExpand(params), surface->GetInterval());
- });
- IsMatch_Helper(std::integral_constant<MatchFlags, MatchFlags::TexCopy>{}, [&] {
- return std::make_pair(surface->CanTexCopy(params), surface->GetInterval());
- });
- }
+ if (params.type == SurfaceType::ColorTexture) {
+ glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
+ texture.handle, 0);
+ glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
+ 0);
+ } else if (params.type == SurfaceType::Depth) {
+ glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
+ glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
+ texture.handle, 0);
+ glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
+ } else {
+ glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
+ glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
+ texture.handle, 0);
}
- return match_surface;
+ glReadPixels(static_cast<GLint>(rect.left), static_cast<GLint>(rect.bottom),
+ static_cast<GLsizei>(rect.GetWidth()), static_cast<GLsizei>(rect.GetHeight()),
+ tuple.format, tuple.type, &gl_buffer[buffer_offset]);
+
+ glPixelStorei(GL_PACK_ROW_LENGTH, 0);
}
RasterizerCacheOpenGL::RasterizerCacheOpenGL() {
read_framebuffer.Create();
draw_framebuffer.Create();
-
- attributeless_vao.Create();
-
- d24s8_abgr_buffer.Create();
- d24s8_abgr_buffer_size = 0;
-
- const char* vs_source = R"(
-#version 330 core
-const vec2 vertices[4] = vec2[4](vec2(-1.0, -1.0), vec2(1.0, -1.0), vec2(-1.0, 1.0), vec2(1.0, 1.0));
-void main() {
- gl_Position = vec4(vertices[gl_VertexID], 0.0, 1.0);
-}
-)";
- const char* fs_source = R"(
-#version 330 core
-
-uniform samplerBuffer tbo;
-uniform vec2 tbo_size;
-uniform vec4 viewport;
-
-out vec4 color;
-
-void main() {
- vec2 tbo_coord = (gl_FragCoord.xy - viewport.xy) * tbo_size / viewport.zw;
- int tbo_offset = int(tbo_coord.y) * int(tbo_size.x) + int(tbo_coord.x);
- color = texelFetch(tbo, tbo_offset).rabg;
-}
-)";
- d24s8_abgr_shader.CreateFromSource(vs_source, nullptr, fs_source);
-
- OpenGLState state = OpenGLState::GetCurState();
- GLuint old_program = state.draw.shader_program;
- state.draw.shader_program = d24s8_abgr_shader.handle;
- state.Apply();
-
- GLint tbo_u_id = glGetUniformLocation(d24s8_abgr_shader.handle, "tbo");
- ASSERT(tbo_u_id != -1);
- glUniform1i(tbo_u_id, 0);
-
- state.draw.shader_program = old_program;
- state.Apply();
-
- d24s8_abgr_tbo_size_u_id = glGetUniformLocation(d24s8_abgr_shader.handle, "tbo_size");
- ASSERT(d24s8_abgr_tbo_size_u_id != -1);
- d24s8_abgr_viewport_u_id = glGetUniformLocation(d24s8_abgr_shader.handle, "viewport");
- ASSERT(d24s8_abgr_viewport_u_id != -1);
}
RasterizerCacheOpenGL::~RasterizerCacheOpenGL() {
- FlushAll();
- while (!surface_cache.empty())
- UnregisterSurface(*surface_cache.begin()->second.begin());
-}
-
-bool RasterizerCacheOpenGL::BlitSurfaces(const Surface& src_surface,
- const MathUtil::Rectangle<u32>& src_rect,
- const Surface& dst_surface,
- const MathUtil::Rectangle<u32>& dst_rect) {
- if (!SurfaceParams::CheckFormatsBlittable(src_surface->pixel_format, dst_surface->pixel_format))
- return false;
-
- return BlitTextures(src_surface->texture.handle, src_rect, dst_surface->texture.handle,
- dst_rect, src_surface->type, read_framebuffer.handle,
- draw_framebuffer.handle);
-}
-
-void RasterizerCacheOpenGL::ConvertD24S8toABGR(GLuint src_tex,
- const MathUtil::Rectangle<u32>& src_rect,
- GLuint dst_tex,
- const MathUtil::Rectangle<u32>& dst_rect) {
- OpenGLState prev_state = OpenGLState::GetCurState();
- SCOPE_EXIT({ prev_state.Apply(); });
-
- OpenGLState state;
- state.draw.read_framebuffer = read_framebuffer.handle;
- state.draw.draw_framebuffer = draw_framebuffer.handle;
- state.Apply();
-
- glBindBuffer(GL_PIXEL_PACK_BUFFER, d24s8_abgr_buffer.handle);
-
- GLsizeiptr target_pbo_size = src_rect.GetWidth() * src_rect.GetHeight() * 4;
- if (target_pbo_size > d24s8_abgr_buffer_size) {
- d24s8_abgr_buffer_size = target_pbo_size * 2;
- glBufferData(GL_PIXEL_PACK_BUFFER, d24s8_abgr_buffer_size, nullptr, GL_STREAM_COPY);
- }
-
- glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
- glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, src_tex,
- 0);
- glReadPixels(static_cast<GLint>(src_rect.left), static_cast<GLint>(src_rect.bottom),
- static_cast<GLsizei>(src_rect.GetWidth()),
- static_cast<GLsizei>(src_rect.GetHeight()), GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8,
- 0);
-
- glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
-
- // PBO now contains src_tex in RABG format
- state.draw.shader_program = d24s8_abgr_shader.handle;
- state.draw.vertex_array = attributeless_vao.handle;
- state.viewport.x = static_cast<GLint>(dst_rect.left);
- state.viewport.y = static_cast<GLint>(dst_rect.bottom);
- state.viewport.width = static_cast<GLsizei>(dst_rect.GetWidth());
- state.viewport.height = static_cast<GLsizei>(dst_rect.GetHeight());
- state.Apply();
-
- OGLTexture tbo;
- tbo.Create();
- glActiveTexture(GL_TEXTURE0);
- glBindTexture(GL_TEXTURE_BUFFER, tbo.handle);
- glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA8, d24s8_abgr_buffer.handle);
-
- glUniform2f(d24s8_abgr_tbo_size_u_id, static_cast<GLfloat>(src_rect.GetWidth()),
- static_cast<GLfloat>(src_rect.GetHeight()));
- glUniform4f(d24s8_abgr_viewport_u_id, static_cast<GLfloat>(state.viewport.x),
- static_cast<GLfloat>(state.viewport.y), static_cast<GLfloat>(state.viewport.width),
- static_cast<GLfloat>(state.viewport.height));
-
- glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, dst_tex, 0);
- glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
- glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
-
- glBindTexture(GL_TEXTURE_BUFFER, 0);
-}
-
-Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, ScaleMatch match_res_scale,
- bool load_if_create) {
- if (params.addr == 0 || params.height * params.width == 0) {
- return nullptr;
- }
- // Use GetSurfaceSubRect instead
- ASSERT(params.width == params.stride);
-
- ASSERT(!params.is_tiled ||
- (params.GetActualWidth() % 8 == 0 && params.GetActualHeight() % 8 == 0));
-
- // Check for an exact match in existing surfaces
- Surface surface =
- FindMatch<MatchFlags::Exact | MatchFlags::Invalid>(surface_cache, params, match_res_scale);
-
- if (surface == nullptr) {
- u16 target_res_scale = params.res_scale;
- if (match_res_scale != ScaleMatch::Exact) {
- // This surface may have a subrect of another surface with a higher res_scale, find it
- // to adjust our params
- SurfaceParams find_params = params;
- Surface expandable = FindMatch<MatchFlags::Expand | MatchFlags::Invalid>(
- surface_cache, find_params, match_res_scale);
- if (expandable != nullptr && expandable->res_scale > target_res_scale) {
- target_res_scale = expandable->res_scale;
- }
- }
- SurfaceParams new_params = params;
- new_params.res_scale = target_res_scale;
- surface = CreateSurface(new_params);
- RegisterSurface(surface);
- }
-
- if (load_if_create) {
- ValidateSurface(surface, params.addr, params.size);
- }
-
- return surface;
-}
-
-boost::optional<Tegra::GPUVAddr> RasterizerCacheOpenGL::TryFindFramebufferGpuAddress(
- VAddr cpu_addr) const {
- // Tries to find the GPU address of a framebuffer based on the CPU address. This is because
- // final output framebuffers are specified by CPU address, but internally our GPU cache uses GPU
- // addresses. We iterate through all cached framebuffers, and compare their starting CPU address
- // to the one provided. This is obviously not great, and won't work if the framebuffer overlaps
- // surfaces.
-
- std::vector<Tegra::GPUVAddr> gpu_addresses;
- for (const auto& pair : surface_cache) {
- for (const auto& surface : pair.second) {
- const VAddr surface_cpu_addr = surface->GetCpuAddr();
- if (cpu_addr >= surface_cpu_addr && cpu_addr < (surface_cpu_addr + surface->size)) {
- ASSERT_MSG(cpu_addr == surface_cpu_addr, "overlapping surfaces are unsupported");
- gpu_addresses.push_back(surface->addr);
- }
- }
+ while (!surface_cache.empty()) {
+ UnregisterSurface(surface_cache.begin()->second);
}
-
- if (gpu_addresses.empty()) {
- return {};
- }
-
- ASSERT_MSG(gpu_addresses.size() == 1, ">1 surface is unsupported");
- return gpu_addresses[0];
-}
-
-SurfaceRect_Tuple RasterizerCacheOpenGL::GetSurfaceSubRect(const SurfaceParams& params,
- ScaleMatch match_res_scale,
- bool load_if_create) {
- if (params.addr == 0 || params.height * params.width == 0) {
- return std::make_tuple(nullptr, MathUtil::Rectangle<u32>{});
- }
-
- // Attempt to find encompassing surface
- Surface surface = FindMatch<MatchFlags::SubRect | MatchFlags::Invalid>(surface_cache, params,
- match_res_scale);
-
- // Check if FindMatch failed because of res scaling
- // If that's the case create a new surface with
- // the dimensions of the lower res_scale surface
- // to suggest it should not be used again
- if (surface == nullptr && match_res_scale != ScaleMatch::Ignore) {
- surface = FindMatch<MatchFlags::SubRect | MatchFlags::Invalid>(surface_cache, params,
- ScaleMatch::Ignore);
- if (surface != nullptr) {
- ASSERT(surface->res_scale < params.res_scale);
- SurfaceParams new_params = *surface;
- new_params.res_scale = params.res_scale;
-
- surface = CreateSurface(new_params);
- RegisterSurface(surface);
- }
- }
-
- SurfaceParams aligned_params = params;
- if (params.is_tiled) {
- aligned_params.height = Common::AlignUp(params.height, 8);
- aligned_params.width = Common::AlignUp(params.width, 8);
- aligned_params.stride = Common::AlignUp(params.stride, 8);
- aligned_params.UpdateParams();
- }
-
- // Check for a surface we can expand before creating a new one
- if (surface == nullptr) {
- surface = FindMatch<MatchFlags::Expand | MatchFlags::Invalid>(surface_cache, aligned_params,
- match_res_scale);
- if (surface != nullptr) {
- aligned_params.width = aligned_params.stride;
- aligned_params.UpdateParams();
-
- SurfaceParams new_params = *surface;
- new_params.addr = std::min(aligned_params.addr, surface->addr);
- new_params.end = std::max(aligned_params.end, surface->end);
- new_params.size = new_params.end - new_params.addr;
- new_params.height = static_cast<u32>(
- new_params.size / aligned_params.BytesInPixels(aligned_params.stride));
- ASSERT(new_params.size % aligned_params.BytesInPixels(aligned_params.stride) == 0);
-
- Surface new_surface = CreateSurface(new_params);
- DuplicateSurface(surface, new_surface);
-
- // Delete the expanded surface, this can't be done safely yet
- // because it may still be in use
- remove_surfaces.emplace(surface);
-
- surface = new_surface;
- RegisterSurface(new_surface);
- }
- }
-
- // No subrect found - create and return a new surface
- if (surface == nullptr) {
- SurfaceParams new_params = aligned_params;
- // Can't have gaps in a surface
- new_params.width = aligned_params.stride;
- new_params.UpdateParams();
- // GetSurface will create the new surface and possibly adjust res_scale if necessary
- surface = GetSurface(new_params, match_res_scale, load_if_create);
- } else if (load_if_create) {
- ValidateSurface(surface, aligned_params.addr, aligned_params.size);
- }
-
- return std::make_tuple(surface, surface->GetScaledSubRect(params));
}
Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextureInfo& config) {
- auto& gpu = Core::System::GetInstance().GPU();
-
- SurfaceParams params;
- params.addr = config.tic.Address();
- params.is_tiled = config.tic.IsTiled();
- params.pixel_format = SurfaceParams::PixelFormatFromTextureFormat(config.tic.format);
-
- params.width = Common::AlignUp(config.tic.Width(), params.GetCompresssionFactor()) /
- params.GetCompresssionFactor();
- params.height = Common::AlignUp(config.tic.Height(), params.GetCompresssionFactor()) /
- params.GetCompresssionFactor();
-
- // TODO(Subv): Different types per component are not supported.
- ASSERT(config.tic.r_type.Value() == config.tic.g_type.Value() &&
- config.tic.r_type.Value() == config.tic.b_type.Value() &&
- config.tic.r_type.Value() == config.tic.a_type.Value());
-
- params.component_type = SurfaceParams::ComponentTypeFromTexture(config.tic.r_type.Value());
-
- if (config.tic.IsTiled()) {
- params.block_height = config.tic.BlockHeight();
- params.width = Common::AlignUp(params.width, params.block_height);
- params.height = Common::AlignUp(params.height, params.block_height);
- } else {
- // Use the texture-provided stride value if the texture isn't tiled.
- params.stride = static_cast<u32>(params.PixelsInBytes(config.tic.Pitch()));
- }
-
- params.UpdateParams();
-
- if (params.GetActualWidth() % 8 != 0 || params.GetActualHeight() % 8 != 0 ||
- params.stride != params.width) {
- Surface src_surface;
- MathUtil::Rectangle<u32> rect;
- std::tie(src_surface, rect) = GetSurfaceSubRect(params, ScaleMatch::Ignore, true);
-
- rect = rect.Scale(params.GetCompresssionFactor());
-
- params.res_scale = src_surface->res_scale;
- Surface tmp_surface = CreateSurface(params);
-
- auto dst_rect = tmp_surface->GetScaledRect().Scale(params.GetCompresssionFactor());
- BlitTextures(src_surface->texture.handle, rect, tmp_surface->texture.handle, dst_rect,
- SurfaceParams::GetFormatType(params.pixel_format), read_framebuffer.handle,
- draw_framebuffer.handle);
-
- remove_surfaces.emplace(tmp_surface);
- return tmp_surface;
- }
-
- return GetSurface(params, ScaleMatch::Ignore, true);
+ return GetSurface(SurfaceParams::CreateForTexture(config));
}
SurfaceSurfaceRect_Tuple RasterizerCacheOpenGL::GetFramebufferSurfaces(
bool using_color_fb, bool using_depth_fb, const MathUtil::Rectangle<s32>& viewport) {
 const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
- const auto& config = regs.rt[0];
 // TODO(bunnei): This is hard-coded to use just the first render buffer
- NGLOG_WARNING(Render_OpenGL, "hard-coded for render target 0!");
-
- // update resolution_scale_factor and reset cache if changed
- // TODO (bunnei): This code was ported as-is from Citra, and is technically not thread-safe. We
- // need to fix this before making the renderer multi-threaded.
- static u16 resolution_scale_factor = GetResolutionScaleFactor();
- if (resolution_scale_factor != GetResolutionScaleFactor()) {
- resolution_scale_factor = GetResolutionScaleFactor();
- FlushAll();
- while (!surface_cache.empty())
- UnregisterSurface(*surface_cache.begin()->second.begin());
- }
-
- MathUtil::Rectangle<u32> viewport_clamped{
- static_cast<u32>(std::clamp(viewport.left, 0, static_cast<s32>(config.width))),
- static_cast<u32>(std::clamp(viewport.top, 0, static_cast<s32>(config.height))),
- static_cast<u32>(std::clamp(viewport.right, 0, static_cast<s32>(config.width))),
- static_cast<u32>(std::clamp(viewport.bottom, 0, static_cast<s32>(config.height)))};
+ LOG_WARNING(Render_OpenGL, "hard-coded for render target 0!");
// get color and depth surfaces
- SurfaceParams color_params;
- color_params.is_tiled = true;
- color_params.res_scale = resolution_scale_factor;
- color_params.width = config.width;
- color_params.height = config.height;
- // TODO(Subv): Can framebuffers use a different block height?
- color_params.block_height = Tegra::Texture::TICEntry::DefaultBlockHeight;
- SurfaceParams depth_params = color_params;
-
- color_params.addr = config.Address();
- color_params.pixel_format = SurfaceParams::PixelFormatFromRenderTargetFormat(config.format);
- color_params.component_type = SurfaceParams::ComponentTypeFromRenderTarget(config.format);
- color_params.UpdateParams();
-
- ASSERT_MSG(!using_depth_fb, "depth buffer is unimplemented");
- // depth_params.addr = config.GetDepthBufferPhysicalAddress();
- // depth_params.pixel_format = SurfaceParams::PixelFormatFromDepthFormat(config.depth_format);
- // depth_params.UpdateParams();
-
- auto color_vp_interval = color_params.GetSubRectInterval(viewport_clamped);
- auto depth_vp_interval = depth_params.GetSubRectInterval(viewport_clamped);
-
- // Make sure that framebuffers don't overlap if both color and depth are being used
- if (using_color_fb && using_depth_fb &&
- boost::icl::length(color_vp_interval & depth_vp_interval)) {
- NGLOG_CRITICAL(Render_OpenGL, "Color and depth framebuffer memory regions overlap; "
- "overlapping framebuffers not supported!");
- using_depth_fb = false;
+ SurfaceParams color_params{};
+ SurfaceParams depth_params{};
+
+ if (using_color_fb) {
+ color_params = SurfaceParams::CreateForFramebuffer(regs.rt[0]);
+ }
+
+ if (using_depth_fb) {
+ depth_params =
+ SurfaceParams::CreateForDepthBuffer(regs.rt[0], regs.zeta.Address(), regs.zeta.format);
}
MathUtil::Rectangle<u32> color_rect{};
- Surface color_surface = nullptr;
- if (using_color_fb)
- std::tie(color_surface, color_rect) =
- GetSurfaceSubRect(color_params, ScaleMatch::Exact, false);
+ Surface color_surface;
+ if (using_color_fb) {
+ color_surface = GetSurface(color_params);
+ if (color_surface) {
+ color_rect = color_surface->GetSurfaceParams().GetRect();
+ }
+ }
MathUtil::Rectangle<u32> depth_rect{};
- Surface depth_surface = nullptr;
- if (using_depth_fb)
- std::tie(depth_surface, depth_rect) =
- GetSurfaceSubRect(depth_params, ScaleMatch::Exact, false);
+ Surface depth_surface;
+ if (using_depth_fb) {
+ depth_surface = GetSurface(depth_params);
+ if (depth_surface) {
+ depth_rect = depth_surface->GetSurfaceParams().GetRect();
+ }
+ }
MathUtil::Rectangle<u32> fb_rect{};
- if (color_surface != nullptr && depth_surface != nullptr) {
+ if (color_surface && depth_surface) {
fb_rect = color_rect;
// Color and Depth surfaces must have the same dimensions and offsets
if (color_rect.bottom != depth_rect.bottom || color_rect.top != depth_rect.top ||
color_rect.left != depth_rect.left || color_rect.right != depth_rect.right) {
- color_surface = GetSurface(color_params, ScaleMatch::Exact, false);
- depth_surface = GetSurface(depth_params, ScaleMatch::Exact, false);
- fb_rect = color_surface->GetScaledRect();
+ color_surface = GetSurface(color_params);
+ depth_surface = GetSurface(depth_params);
+ fb_rect = color_surface->GetSurfaceParams().GetRect();
}
- } else if (color_surface != nullptr) {
+ } else if (color_surface) {
fb_rect = color_rect;
- } else if (depth_surface != nullptr) {
+ } else if (depth_surface) {
fb_rect = depth_rect;
}
- if (color_surface != nullptr) {
- ValidateSurface(color_surface, boost::icl::first(color_vp_interval),
- boost::icl::length(color_vp_interval));
- }
- if (depth_surface != nullptr) {
- ValidateSurface(depth_surface, boost::icl::first(depth_vp_interval),
- boost::icl::length(depth_vp_interval));
- }
-
return std::make_tuple(color_surface, depth_surface, fb_rect);
}
-Surface RasterizerCacheOpenGL::GetFillSurface(const void* config) {
- UNREACHABLE();
- return {};
+void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) {
+ surface->LoadGLBuffer();
+ surface->UploadGLTexture(read_framebuffer.handle, draw_framebuffer.handle);
}
-SurfaceRect_Tuple RasterizerCacheOpenGL::GetTexCopySurface(const SurfaceParams& params) {
- MathUtil::Rectangle<u32> rect{};
+void RasterizerCacheOpenGL::MarkSurfaceAsDirty(const Surface& surface) {
+ if (Settings::values.use_accurate_framebuffers) {
+ // If enabled, always flush dirty surfaces
+ surface->DownloadGLTexture(read_framebuffer.handle, draw_framebuffer.handle);
+ surface->FlushGLBuffer();
+ } else {
+ // Otherwise, don't mark surfaces that we write to as cached, because the resulting loads
+ // and flushes are very slow and do not seem to improve accuracy
+ const auto& params{surface->GetSurfaceParams()};
+ Memory::RasterizerMarkRegionCached(params.addr, params.size_in_bytes, false);
+ }
+}
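
MarkSurfaceAsDirty is the write side of the new cache policy: with use_accurate_framebuffers the surface is downloaded and flushed immediately, otherwise its pages are simply un-cached so later CPU accesses bypass the cache. A minimal sketch of a call site, assuming the rasterizer hands back the surfaces it just rendered to (the helper below is illustrative, not part of this patch):

    #include "video_core/renderer_opengl/gl_rasterizer_cache.h"

    // Hypothetical helper: after a draw call, the bound render targets no longer match
    // Switch memory, so tell the cache about it.
    void NotifyRenderTargetsWritten(RasterizerCacheOpenGL& res_cache, const Surface& color,
                                    const Surface& depth) {
        if (color) {
            res_cache.MarkSurfaceAsDirty(color);
        }
        if (depth) {
            res_cache.MarkSurfaceAsDirty(depth);
        }
    }
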
- Surface match_surface = FindMatch<MatchFlags::TexCopy | MatchFlags::Invalid>(
- surface_cache, params, ScaleMatch::Ignore);
+Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params) {
+ if (params.addr == 0 || params.height * params.width == 0) {
+ return {};
+ }
- if (match_surface != nullptr) {
- ValidateSurface(match_surface, params.addr, params.size);
+ const auto& gpu = Core::System::GetInstance().GPU();
+ // Don't try to create any entries in the cache if the address of the texture is invalid.
+ if (gpu.memory_manager->GpuToCpuAddress(params.addr) == boost::none)
+ return {};
- SurfaceParams match_subrect;
- if (params.width != params.stride) {
- const u32 tiled_size = match_surface->is_tiled ? 8 : 1;
- match_subrect = params;
- match_subrect.width =
- static_cast<u32>(match_surface->PixelsInBytes(params.width) / tiled_size);
- match_subrect.stride =
- static_cast<u32>(match_surface->PixelsInBytes(params.stride) / tiled_size);
- match_subrect.height *= tiled_size;
- } else {
- match_subrect = match_surface->FromInterval(params.GetInterval());
- ASSERT(match_subrect.GetInterval() == params.GetInterval());
+ // Check for an exact match in existing surfaces
+ const auto& surface_key{SurfaceKey::Create(params)};
+ const auto& search{surface_cache.find(surface_key)};
+ Surface surface;
+ if (search != surface_cache.end()) {
+ surface = search->second;
+ if (Settings::values.use_accurate_framebuffers) {
+ // Reload the surface from Switch memory
+ LoadSurface(surface);
}
-
- rect = match_surface->GetScaledSubRect(match_subrect);
+ } else {
+ surface = std::make_shared<CachedSurface>(params);
+ RegisterSurface(surface);
+ LoadSurface(surface);
}
- return std::make_tuple(match_surface, rect);
+ return surface;
}
-void RasterizerCacheOpenGL::DuplicateSurface(const Surface& src_surface,
- const Surface& dest_surface) {
- ASSERT(dest_surface->addr <= src_surface->addr && dest_surface->end >= src_surface->end);
-
- BlitSurfaces(src_surface, src_surface->GetScaledRect(), dest_surface,
- dest_surface->GetScaledSubRect(*src_surface));
-
- dest_surface->invalid_regions -= src_surface->GetInterval();
- dest_surface->invalid_regions += src_surface->invalid_regions;
-
- SurfaceRegions regions;
- for (auto& pair : RangeFromInterval(dirty_regions, src_surface->GetInterval())) {
- if (pair.second == src_surface) {
- regions += pair.first;
+Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(VAddr cpu_addr) const {
+    // Tries to find the cached surface that backs a framebuffer, given the framebuffer's CPU
+    // address. This is needed because final output framebuffers are specified by CPU address, but
+    // internally our GPU cache uses GPU addresses. We iterate through all cached framebuffers and
+    // compare their starting CPU address to the one provided. This is obviously not great, and
+    // won't work if the framebuffer overlaps surfaces.
+
+ std::vector<Surface> surfaces;
+ for (const auto& surface : surface_cache) {
+ const auto& params = surface.second->GetSurfaceParams();
+ const VAddr surface_cpu_addr = params.GetCpuAddr();
+ if (cpu_addr >= surface_cpu_addr && cpu_addr < (surface_cpu_addr + params.size_in_bytes)) {
+ ASSERT_MSG(cpu_addr == surface_cpu_addr, "overlapping surfaces are unsupported");
+ surfaces.push_back(surface.second);
}
}
- for (auto& interval : regions) {
- dirty_regions.set({interval, dest_surface});
- }
-}
-void RasterizerCacheOpenGL::ValidateSurface(const Surface& surface, Tegra::GPUVAddr addr,
- u64 size) {
- if (size == 0)
- return;
-
- const SurfaceInterval validate_interval(addr, addr + size);
-
- if (surface->type == SurfaceType::Fill) {
- // Sanity check, fill surfaces will always be valid when used
- ASSERT(surface->IsRegionValid(validate_interval));
- return;
+ if (surfaces.empty()) {
+ return {};
}
- while (true) {
- const auto it = surface->invalid_regions.find(validate_interval);
- if (it == surface->invalid_regions.end())
- break;
-
- const auto interval = *it & validate_interval;
- // Look for a valid surface to copy from
- SurfaceParams params = surface->FromInterval(interval);
-
- Surface copy_surface =
- FindMatch<MatchFlags::Copy>(surface_cache, params, ScaleMatch::Ignore, interval);
- if (copy_surface != nullptr) {
- SurfaceInterval copy_interval = params.GetCopyableInterval(copy_surface);
- CopySurface(copy_surface, surface, copy_interval);
- surface->invalid_regions.erase(copy_interval);
- continue;
- }
+ ASSERT_MSG(surfaces.size() == 1, ">1 surface is unsupported");
- // Load data from Switch memory
- FlushRegion(params.addr, params.size);
- surface->LoadGLBuffer(params.addr, params.end);
- surface->UploadGLTexture(surface->GetSubRect(params), read_framebuffer.handle,
- draw_framebuffer.handle);
- surface->invalid_regions.erase(params.GetInterval());
- }
+ return surfaces[0];
}
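
The containment test above, with made-up numbers (std::uint64_t stands in for VAddr to keep the snippet self-contained):

    #include <cstdint>

    // A cached 1280x720 ABGR8 surface starting at CPU address 0x20000000 spans
    // 1280 * 720 * 4 = 0x384000 bytes; a framebuffer reported at that same address is a hit.
    constexpr std::uint64_t surface_cpu_addr = 0x20000000;
    constexpr std::uint64_t size_in_bytes = 0x384000;
    constexpr std::uint64_t framebuffer_addr = 0x20000000;

    static_assert(framebuffer_addr >= surface_cpu_addr &&
                      framebuffer_addr < surface_cpu_addr + size_in_bytes,
                  "the framebuffer address falls inside the cached surface");
    // The ASSERT_MSG above goes further and requires the framebuffer to start exactly at the
    // surface base; partially overlapping surfaces are deliberately unsupported for now.
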
-void RasterizerCacheOpenGL::FlushRegion(Tegra::GPUVAddr addr, u64 size, Surface flush_surface) {
- if (size == 0)
- return;
-
- const SurfaceInterval flush_interval(addr, addr + size);
- SurfaceRegions flushed_intervals;
-
- for (auto& pair : RangeFromInterval(dirty_regions, flush_interval)) {
- // small sizes imply that this most likely comes from the cpu, flush the entire region
- // the point is to avoid thousands of small writes every frame if the cpu decides to access
- // that region, anything higher than 8 you're guaranteed it comes from a service
- const auto interval = size <= 8 ? pair.first : pair.first & flush_interval;
- auto& surface = pair.second;
-
- if (flush_surface != nullptr && surface != flush_surface)
- continue;
+void RasterizerCacheOpenGL::FlushRegion(Tegra::GPUVAddr /*addr*/, size_t /*size*/) {
+ // TODO(bunnei): This is unused in the current implementation of the rasterizer cache. We should
+    // probably implement this in the future, but for now, the `use_accurate_framebuffers` setting
+ // can be used to always flush.
+}
- // Sanity check, this surface is the last one that marked this region dirty
- ASSERT(surface->IsRegionValid(interval));
+void RasterizerCacheOpenGL::InvalidateRegion(Tegra::GPUVAddr addr, size_t size) {
+    // Collect the overlapping surfaces first: unregistering while iterating would invalidate the
+    // range-for iterator over surface_cache.
+    std::vector<Surface> overlapping_surfaces;
+    for (const auto& pair : surface_cache) {
+        const auto& surface{pair.second};
+        const auto& params{surface->GetSurfaceParams()};
-        if (surface->type != SurfaceType::Fill) {
-            SurfaceParams params = surface->FromInterval(interval);
-            surface->DownloadGLTexture(surface->GetSubRect(params), read_framebuffer.handle,
-                                       draw_framebuffer.handle);
+        if (params.IsOverlappingRegion(addr, size)) {
+            overlapping_surfaces.push_back(surface);
+        }
-        surface->FlushGLBuffer(boost::icl::first(interval), boost::icl::last_next(interval));
-        flushed_intervals += interval;
+    }
-    // Reset dirty regions
-    dirty_regions -= flushed_intervals;
+    for (const auto& overlapping : overlapping_surfaces) {
+        UnregisterSurface(overlapping);
+    }
+}
-void RasterizerCacheOpenGL::FlushAll() {
- FlushRegion(0, Kernel::VMManager::MAX_ADDRESS);
-}
+void RasterizerCacheOpenGL::RegisterSurface(const Surface& surface) {
+ const auto& params{surface->GetSurfaceParams()};
+ const auto& surface_key{SurfaceKey::Create(params)};
+ const auto& search{surface_cache.find(surface_key)};
-void RasterizerCacheOpenGL::InvalidateRegion(Tegra::GPUVAddr addr, u64 size,
- const Surface& region_owner) {
- if (size == 0)
+ if (search != surface_cache.end()) {
+ // Registered already
return;
-
- const SurfaceInterval invalid_interval(addr, addr + size);
-
- if (region_owner != nullptr) {
- ASSERT(addr >= region_owner->addr && addr + size <= region_owner->end);
- // Surfaces can't have a gap
- ASSERT(region_owner->width == region_owner->stride);
- region_owner->invalid_regions.erase(invalid_interval);
- }
-
- for (auto& pair : RangeFromInterval(surface_cache, invalid_interval)) {
- for (auto& cached_surface : pair.second) {
- if (cached_surface == region_owner)
- continue;
-
- // If cpu is invalidating this region we want to remove it
- // to (likely) mark the memory pages as uncached
- if (region_owner == nullptr && size <= 8) {
- FlushRegion(cached_surface->addr, cached_surface->size, cached_surface);
- remove_surfaces.emplace(cached_surface);
- continue;
- }
-
- const auto interval = cached_surface->GetInterval() & invalid_interval;
- cached_surface->invalid_regions.insert(interval);
-
- // Remove only "empty" fill surfaces to avoid destroying and recreating OGL textures
- if (cached_surface->type == SurfaceType::Fill &&
- cached_surface->IsSurfaceFullyInvalid()) {
- remove_surfaces.emplace(cached_surface);
- }
- }
}
- if (region_owner != nullptr)
- dirty_regions.set({invalid_interval, region_owner});
- else
- dirty_regions.erase(invalid_interval);
-
- for (auto& remove_surface : remove_surfaces) {
- if (remove_surface == region_owner) {
- Surface expanded_surface = FindMatch<MatchFlags::SubRect | MatchFlags::Invalid>(
- surface_cache, *region_owner, ScaleMatch::Ignore);
- ASSERT(expanded_surface);
-
- if ((region_owner->invalid_regions - expanded_surface->invalid_regions).empty()) {
- DuplicateSurface(region_owner, expanded_surface);
- } else {
- continue;
- }
- }
- UnregisterSurface(remove_surface);
- }
-
- remove_surfaces.clear();
+ surface_cache[surface_key] = surface;
+ UpdatePagesCachedCount(params.addr, params.size_in_bytes, 1);
}
-Surface RasterizerCacheOpenGL::CreateSurface(const SurfaceParams& params) {
- Surface surface = std::make_shared<CachedSurface>();
- static_cast<SurfaceParams&>(*surface) = params;
-
- surface->texture.Create();
-
- surface->gl_buffer_size = 0;
- surface->invalid_regions.insert(surface->GetInterval());
- AllocateSurfaceTexture(surface->texture.handle,
- GetFormatTuple(surface->pixel_format, surface->component_type),
- surface->GetScaledWidth(), surface->GetScaledHeight());
-
- return surface;
-}
+void RasterizerCacheOpenGL::UnregisterSurface(const Surface& surface) {
+ const auto& params{surface->GetSurfaceParams()};
+ const auto& surface_key{SurfaceKey::Create(params)};
+ const auto& search{surface_cache.find(surface_key)};
-void RasterizerCacheOpenGL::RegisterSurface(const Surface& surface) {
- if (surface->registered) {
+ if (search == surface_cache.end()) {
+ // Unregistered already
return;
}
- surface->registered = true;
- surface_cache.add({surface->GetInterval(), SurfaceSet{surface}});
- UpdatePagesCachedCount(surface->addr, surface->size, 1);
+
+ UpdatePagesCachedCount(params.addr, params.size_in_bytes, -1);
+ surface_cache.erase(search);
}
-void RasterizerCacheOpenGL::UnregisterSurface(const Surface& surface) {
- if (!surface->registered) {
- return;
- }
- surface->registered = false;
- UpdatePagesCachedCount(surface->addr, surface->size, -1);
- surface_cache.subtract({surface->GetInterval(), SurfaceSet{surface}});
+template <typename Map, typename Interval>
+constexpr auto RangeFromInterval(Map& map, const Interval& interval) {
+ return boost::make_iterator_range(map.equal_range(interval));
}
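
RangeFromInterval is retained as a thin wrapper over boost::icl's equal_range so interval queries can be used directly in range-for loops. A sketch of the pattern it enables, presumably what UpdatePagesCachedCount below does with the cached_pages map (the wrapper function is illustrative):

    // Visit every reference-count entry of the PageMap that intersects [addr, addr + size).
    template <typename Callback>
    void ForEachCachedPageRange(PageMap& cached_pages, u64 addr, u64 size, Callback&& callback) {
        const auto interval = PageMap::interval_type::right_open(addr, addr + size);
        for (const auto& pair : RangeFromInterval(cached_pages, interval)) {
            callback(pair.first, pair.second); // sub-interval and its surface count
        }
    }
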
void RasterizerCacheOpenGL::UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 0f43e863d..1bedae992 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -1,57 +1,26 @@
-// Copyright 2015 Citra Emulator Project
+// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
+#include <map>
#include <memory>
+#include <unordered_map>
-#include <set>
-#include <tuple>
-#ifdef __GNUC__
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
-#endif
+#include <vector>
#include <boost/icl/interval_map.hpp>
-#include <boost/icl/interval_set.hpp>
-#ifdef __GNUC__
-#pragma GCC diagnostic pop
-#endif
-#include <boost/optional.hpp>
-#include <glad/glad.h>
-#include "common/assert.h"
-#include "common/common_funcs.h"
#include "common/common_types.h"
+#include "common/hash.h"
#include "common/math_util.h"
-#include "video_core/gpu.h"
-#include "video_core/memory_manager.h"
+#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/textures/texture.h"
-struct CachedSurface;
+class CachedSurface;
using Surface = std::shared_ptr<CachedSurface>;
-using SurfaceSet = std::set<Surface>;
-
-using SurfaceRegions = boost::icl::interval_set<Tegra::GPUVAddr>;
-using SurfaceMap = boost::icl::interval_map<Tegra::GPUVAddr, Surface>;
-using SurfaceCache = boost::icl::interval_map<Tegra::GPUVAddr, SurfaceSet>;
-
-using SurfaceInterval = SurfaceCache::interval_type;
-static_assert(std::is_same<SurfaceRegions::interval_type, SurfaceCache::interval_type>() &&
- std::is_same<SurfaceMap::interval_type, SurfaceCache::interval_type>(),
- "incorrect interval types");
-
-using SurfaceRect_Tuple = std::tuple<Surface, MathUtil::Rectangle<u32>>;
using SurfaceSurfaceRect_Tuple = std::tuple<Surface, Surface, MathUtil::Rectangle<u32>>;
-
using PageMap = boost::icl::interval_map<u64, int>;
-enum class ScaleMatch {
- Exact, // only accept same res scale
- Upscale, // only allow higher scale than params
- Ignore // accept every scaled res
-};
-
struct SurfaceParams {
enum class PixelFormat {
ABGR8 = 0,
@@ -61,12 +30,24 @@ struct SurfaceParams {
R8 = 4,
RGBA16F = 5,
R11FG11FB10F = 6,
- DXT1 = 7,
- DXT23 = 8,
- DXT45 = 9,
- DXN1 = 10, // This is also known as BC4
+ RGBA32UI = 7,
+ DXT1 = 8,
+ DXT23 = 9,
+ DXT45 = 10,
+ DXN1 = 11, // This is also known as BC4
+ BC7U = 12,
+ ASTC_2D_4X4 = 13,
- Max,
+ MaxColorFormat,
+
+ // DepthStencil formats
+ Z24S8 = 14,
+ S8Z24 = 15,
+ Z32F = 16,
+
+ MaxDepthStencilFormat,
+
+ Max = MaxDepthStencilFormat,
Invalid = 255,
};
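
The reordered enum now encodes the surface type in its value ranges: everything below MaxColorFormat is a color format, and everything from there up to (but excluding) MaxDepthStencilFormat is a depth/stencil format, so MaxColorFormat deliberately shares its value (14) with Z24S8. GetFormatType further down in this header relies on exactly this ordering. A few compile-time checks spelling that out (illustrative; they assume this header is included):

    static_assert(static_cast<int>(SurfaceParams::PixelFormat::ASTC_2D_4X4) <
                      static_cast<int>(SurfaceParams::PixelFormat::MaxColorFormat),
                  "ASTC_2D_4X4 is classified as a color format");
    static_assert(static_cast<int>(SurfaceParams::PixelFormat::Z24S8) ==
                      static_cast<int>(SurfaceParams::PixelFormat::MaxColorFormat),
                  "the first depth/stencil format starts at the MaxColorFormat sentinel");
    static_assert(static_cast<int>(SurfaceParams::PixelFormat::Z32F) <
                      static_cast<int>(SurfaceParams::PixelFormat::MaxDepthStencilFormat),
                  "Z32F is classified as a depth/stencil format");
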
@@ -92,10 +73,10 @@ struct SurfaceParams {
/**
* Gets the compression factor for the specified PixelFormat. This applies to just the
* "compressed width" and "compressed height", not the overall compression factor of a
- * compressed image. This is used for maintaining proper surface sizes for compressed texture
- * formats.
+ * compressed image. This is used for maintaining proper surface sizes for compressed
+ * texture formats.
*/
- static constexpr u32 GetCompresssionFactor(PixelFormat format) {
+ static constexpr u32 GetCompressionFactor(PixelFormat format) {
if (format == PixelFormat::Invalid)
return 0;
@@ -107,18 +88,21 @@ struct SurfaceParams {
1, // R8
1, // RGBA16F
1, // R11FG11FB10F
+ 1, // RGBA32UI
4, // DXT1
4, // DXT23
4, // DXT45
4, // DXN1
+ 4, // BC7U
+ 4, // ASTC_2D_4X4
+ 1, // Z24S8
+ 1, // S8Z24
+ 1, // Z32F
}};
ASSERT(static_cast<size_t>(format) < compression_factor_table.size());
return compression_factor_table[static_cast<size_t>(format)];
}
- u32 GetCompresssionFactor() const {
- return GetCompresssionFactor(pixel_format);
- }
static constexpr u32 GetFormatBpp(PixelFormat format) {
if (format == PixelFormat::Invalid)
@@ -132,10 +116,16 @@ struct SurfaceParams {
8, // R8
64, // RGBA16F
32, // R11FG11FB10F
+ 128, // RGBA32UI
64, // DXT1
128, // DXT23
128, // DXT45
64, // DXN1
+ 128, // BC7U
+ 32, // ASTC_2D_4X4
+ 32, // Z24S8
+ 32, // S8Z24
+ 32, // Z32F
}};
ASSERT(static_cast<size_t>(format) < bpp_table.size());
@@ -145,6 +135,20 @@ struct SurfaceParams {
return GetFormatBpp(pixel_format);
}
+ static PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format) {
+ switch (format) {
+ case Tegra::DepthFormat::S8_Z24_UNORM:
+ return PixelFormat::S8Z24;
+ case Tegra::DepthFormat::Z24_S8_UNORM:
+ return PixelFormat::Z24S8;
+ case Tegra::DepthFormat::Z32_FLOAT:
+ return PixelFormat::Z32F;
+ default:
+ LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
+ UNREACHABLE();
+ }
+ }
+
static PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) {
switch (format) {
case Tegra::RenderTargetFormat::RGBA8_UNORM:
@@ -156,18 +160,10 @@ struct SurfaceParams {
return PixelFormat::RGBA16F;
case Tegra::RenderTargetFormat::R11G11B10_FLOAT:
return PixelFormat::R11FG11FB10F;
+ case Tegra::RenderTargetFormat::RGBA32_UINT:
+ return PixelFormat::RGBA32UI;
default:
- NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
- UNREACHABLE();
- }
- }
-
- static PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format) {
- switch (format) {
- case Tegra::FramebufferConfig::PixelFormat::ABGR8:
- return PixelFormat::ABGR8;
- default:
- NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
+ LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
UNREACHABLE();
}
}
@@ -189,6 +185,8 @@ struct SurfaceParams {
return PixelFormat::RGBA16F;
case Tegra::Texture::TextureFormat::BF10GF11RF11:
return PixelFormat::R11FG11FB10F;
+ case Tegra::Texture::TextureFormat::R32_G32_B32_A32:
+ return PixelFormat::RGBA32UI;
case Tegra::Texture::TextureFormat::DXT1:
return PixelFormat::DXT1;
case Tegra::Texture::TextureFormat::DXT23:
@@ -197,8 +195,12 @@ struct SurfaceParams {
return PixelFormat::DXT45;
case Tegra::Texture::TextureFormat::DXN1:
return PixelFormat::DXN1;
+ case Tegra::Texture::TextureFormat::BC7U:
+ return PixelFormat::BC7U;
+ case Tegra::Texture::TextureFormat::ASTC_2D_4X4:
+ return PixelFormat::ASTC_2D_4X4;
default:
- NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
+ LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
UNREACHABLE();
}
}
@@ -220,6 +222,8 @@ struct SurfaceParams {
return Tegra::Texture::TextureFormat::R16_G16_B16_A16;
case PixelFormat::R11FG11FB10F:
return Tegra::Texture::TextureFormat::BF10GF11RF11;
+ case PixelFormat::RGBA32UI:
+ return Tegra::Texture::TextureFormat::R32_G32_B32_A32;
case PixelFormat::DXT1:
return Tegra::Texture::TextureFormat::DXT1;
case PixelFormat::DXT23:
@@ -228,6 +232,23 @@ struct SurfaceParams {
return Tegra::Texture::TextureFormat::DXT45;
case PixelFormat::DXN1:
return Tegra::Texture::TextureFormat::DXN1;
+ case PixelFormat::BC7U:
+ return Tegra::Texture::TextureFormat::BC7U;
+ case PixelFormat::ASTC_2D_4X4:
+ return Tegra::Texture::TextureFormat::ASTC_2D_4X4;
+ default:
+ UNREACHABLE();
+ }
+ }
+
+ static Tegra::DepthFormat DepthFormatFromPixelFormat(PixelFormat format) {
+ switch (format) {
+ case PixelFormat::S8Z24:
+ return Tegra::DepthFormat::S8_Z24_UNORM;
+ case PixelFormat::Z24S8:
+ return Tegra::DepthFormat::Z24_S8_UNORM;
+ case PixelFormat::Z32F:
+ return Tegra::DepthFormat::Z32_FLOAT;
default:
UNREACHABLE();
}
@@ -239,7 +260,7 @@ struct SurfaceParams {
case Tegra::Texture::ComponentType::UNORM:
return ComponentType::UNorm;
default:
- NGLOG_CRITICAL(HW_GPU, "Unimplemented component type={}", static_cast<u32>(type));
+ LOG_CRITICAL(HW_GPU, "Unimplemented component type={}", static_cast<u32>(type));
UNREACHABLE();
}
}
@@ -254,215 +275,153 @@ struct SurfaceParams {
case Tegra::RenderTargetFormat::RGBA16_FLOAT:
case Tegra::RenderTargetFormat::R11G11B10_FLOAT:
return ComponentType::Float;
+ case Tegra::RenderTargetFormat::RGBA32_UINT:
+ return ComponentType::UInt;
default:
- NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
+ LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
UNREACHABLE();
}
}
- static ComponentType ComponentTypeFromGPUPixelFormat(
- Tegra::FramebufferConfig::PixelFormat format) {
+ static PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format) {
switch (format) {
case Tegra::FramebufferConfig::PixelFormat::ABGR8:
- return ComponentType::UNorm;
+ return PixelFormat::ABGR8;
default:
- NGLOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
+ LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
UNREACHABLE();
}
}
- static bool CheckFormatsBlittable(PixelFormat pixel_format_a, PixelFormat pixel_format_b) {
- SurfaceType a_type = GetFormatType(pixel_format_a);
- SurfaceType b_type = GetFormatType(pixel_format_b);
-
- if (a_type == SurfaceType::ColorTexture && b_type == SurfaceType::ColorTexture) {
- return true;
- }
-
- if (a_type == SurfaceType::Depth && b_type == SurfaceType::Depth) {
- return true;
- }
-
- if (a_type == SurfaceType::DepthStencil && b_type == SurfaceType::DepthStencil) {
- return true;
+ static ComponentType ComponentTypeFromDepthFormat(Tegra::DepthFormat format) {
+ switch (format) {
+ case Tegra::DepthFormat::S8_Z24_UNORM:
+ case Tegra::DepthFormat::Z24_S8_UNORM:
+ return ComponentType::UNorm;
+ case Tegra::DepthFormat::Z32_FLOAT:
+ return ComponentType::Float;
+ default:
+ LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
+ UNREACHABLE();
}
-
- return false;
}
static SurfaceType GetFormatType(PixelFormat pixel_format) {
- if (static_cast<size_t>(pixel_format) < MaxPixelFormat) {
+ if (static_cast<size_t>(pixel_format) < static_cast<size_t>(PixelFormat::MaxColorFormat)) {
return SurfaceType::ColorTexture;
}
+ if (static_cast<size_t>(pixel_format) <
+ static_cast<size_t>(PixelFormat::MaxDepthStencilFormat)) {
+ return SurfaceType::DepthStencil;
+ }
+
// TODO(Subv): Implement the other formats
ASSERT(false);
return SurfaceType::Invalid;
}
- /// Update the params "size", "end" and "type" from the already set "addr", "width", "height"
- /// and "pixel_format"
- void UpdateParams() {
- if (stride == 0) {
- stride = width;
- }
- type = GetFormatType(pixel_format);
- size = !is_tiled ? BytesInPixels(stride * (height - 1) + width)
- : BytesInPixels(stride * 8 * (height / 8 - 1) + width * 8);
- end = addr + size;
- }
-
- SurfaceInterval GetInterval() const {
- return SurfaceInterval::right_open(addr, end);
- }
-
- // Returns the outer rectangle containing "interval"
- SurfaceParams FromInterval(SurfaceInterval interval) const;
-
- SurfaceInterval GetSubRectInterval(MathUtil::Rectangle<u32> unscaled_rect) const;
-
- // Returns the region of the biggest valid rectange within interval
- SurfaceInterval GetCopyableInterval(const Surface& src_surface) const;
-
- /**
- * Gets the actual width (in pixels) of the surface. This is provided because `width` is used
- * for tracking the surface region in memory, which may be compressed for certain formats. In
- * this scenario, `width` is actually the compressed width.
- */
- u32 GetActualWidth() const {
- return width * GetCompresssionFactor();
- }
-
- /**
- * Gets the actual height (in pixels) of the surface. This is provided because `height` is used
- * for tracking the surface region in memory, which may be compressed for certain formats. In
- * this scenario, `height` is actually the compressed height.
- */
- u32 GetActualHeight() const {
- return height * GetCompresssionFactor();
- }
+ /// Returns the rectangle corresponding to this surface
+ MathUtil::Rectangle<u32> GetRect() const;
- u32 GetScaledWidth() const {
- return width * res_scale;
+ /// Returns the size of this surface in bytes, adjusted for compression
+ size_t SizeInBytes() const {
+ const u32 compression_factor{GetCompressionFactor(pixel_format)};
+ ASSERT(width % compression_factor == 0);
+ ASSERT(height % compression_factor == 0);
+ return (width / compression_factor) * (height / compression_factor) *
+ GetFormatBpp(pixel_format) / CHAR_BIT;
}
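
A worked example with the tables earlier in this header: a 64x64 DXT1 surface has a compression factor of 4 and 64 bits per compressed block, so SizeInBytes() evaluates to (64 / 4) * (64 / 4) * 64 / 8 = 2048 bytes. The same arithmetic as a compile-time check:

    #include <climits>

    constexpr unsigned width = 64, height = 64;
    constexpr unsigned compression_factor = 4; // GetCompressionFactor(PixelFormat::DXT1)
    constexpr unsigned bits_per_block = 64;    // GetFormatBpp(PixelFormat::DXT1)
    static_assert((width / compression_factor) * (height / compression_factor) *
                          bits_per_block / CHAR_BIT ==
                      2048,
                  "a 64x64 DXT1 surface occupies 2 KiB");
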
- u32 GetScaledHeight() const {
- return height * res_scale;
- }
+ /// Returns the CPU virtual address for this surface
+ VAddr GetCpuAddr() const;
- MathUtil::Rectangle<u32> GetRect() const {
- return {0, height, width, 0};
+ /// Returns true if the specified region overlaps with this surface's region in Switch memory
+ bool IsOverlappingRegion(Tegra::GPUVAddr region_addr, size_t region_size) const {
+ return addr <= (region_addr + region_size) && region_addr <= (addr + size_in_bytes);
}
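
This is the usual interval-intersection test: two ranges overlap when each one starts no later than the other ends. Because both comparisons use <=, regions that merely touch end-to-start also count as overlapping, which errs on the side of invalidating more surfaces than strictly necessary. With made-up numbers:

    // Surface at [0x1000, 0x1800); invalidating 0x100 bytes at 0x1700 lands inside it.
    constexpr Tegra::GPUVAddr addr = 0x1000;
    constexpr size_t size_in_bytes = 0x800;
    constexpr Tegra::GPUVAddr region_addr = 0x1700;
    constexpr size_t region_size = 0x100;
    static_assert(addr <= (region_addr + region_size) && region_addr <= (addr + size_in_bytes),
                  "the invalidated region overlaps this surface");
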
- MathUtil::Rectangle<u32> GetScaledRect() const {
- return {0, GetScaledHeight(), GetScaledWidth(), 0};
- }
+ /// Creates SurfaceParams from a texture configuration
+ static SurfaceParams CreateForTexture(const Tegra::Texture::FullTextureInfo& config);
+
+ /// Creates SurfaceParams from a framebuffer configuration
+ static SurfaceParams CreateForFramebuffer(
+ const Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig& config);
+
+ /// Creates SurfaceParams for a depth buffer configuration
+ static SurfaceParams CreateForDepthBuffer(
+ const Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig& config,
+ Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format);
+
+ Tegra::GPUVAddr addr;
+ bool is_tiled;
+ u32 block_height;
+ PixelFormat pixel_format;
+ ComponentType component_type;
+ SurfaceType type;
+ u32 width;
+ u32 height;
+ u32 unaligned_height;
+ size_t size_in_bytes;
+};
- u64 PixelsInBytes(u64 size) const {
- return size * CHAR_BIT / GetFormatBpp(pixel_format);
+/// Hashable variation of SurfaceParams, used as the key in the surface cache
+struct SurfaceKey : Common::HashableStruct<SurfaceParams> {
+ static SurfaceKey Create(const SurfaceParams& params) {
+ SurfaceKey res;
+ res.state = params;
+ return res;
}
+};
- u64 BytesInPixels(u64 pixels) const {
- return pixels * GetFormatBpp(pixel_format) / CHAR_BIT;
+namespace std {
+template <>
+struct hash<SurfaceKey> {
+ size_t operator()(const SurfaceKey& k) const {
+ return k.Hash();
}
-
- VAddr GetCpuAddr() const;
-
- bool ExactMatch(const SurfaceParams& other_surface) const;
- bool CanSubRect(const SurfaceParams& sub_surface) const;
- bool CanExpand(const SurfaceParams& expanded_surface) const;
- bool CanTexCopy(const SurfaceParams& texcopy_params) const;
-
- MathUtil::Rectangle<u32> GetSubRect(const SurfaceParams& sub_surface) const;
- MathUtil::Rectangle<u32> GetScaledSubRect(const SurfaceParams& sub_surface) const;
-
- Tegra::GPUVAddr addr = 0;
- Tegra::GPUVAddr end = 0;
- boost::optional<VAddr> cpu_addr;
- u64 size = 0;
-
- u32 width = 0;
- u32 height = 0;
- u32 stride = 0;
- u32 block_height = 0;
- u16 res_scale = 1;
-
- bool is_tiled = false;
- PixelFormat pixel_format = PixelFormat::Invalid;
- SurfaceType type = SurfaceType::Invalid;
- ComponentType component_type = ComponentType::Invalid;
};
+} // namespace std
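
SurfaceKey wraps the full parameter set in Common::HashableStruct so an entire surface description can act as a single hash-map key, and the std::hash specialization above is what lets std::unordered_map pick it up (HashableStruct must also provide the equality comparison the map needs; that it hashes over the raw bytes of the state is an assumption here). A minimal sketch of the resulting lookup:

    #include <unordered_map>

    // Average-case O(1) exact-parameters lookup, as used by the surface cache further below.
    Surface FindExactMatch(const std::unordered_map<SurfaceKey, Surface>& cache,
                           const SurfaceParams& params) {
        const auto it = cache.find(SurfaceKey::Create(params)); // hashes via SurfaceKey::Hash()
        return it != cache.end() ? it->second : Surface{};
    }
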
-struct CachedSurface : SurfaceParams {
- bool CanFill(const SurfaceParams& dest_surface, SurfaceInterval fill_interval) const;
- bool CanCopy(const SurfaceParams& dest_surface, SurfaceInterval copy_interval) const;
-
- bool IsRegionValid(SurfaceInterval interval) const {
- return (invalid_regions.find(interval) == invalid_regions.end());
- }
+class CachedSurface final {
+public:
+ CachedSurface(const SurfaceParams& params);
- bool IsSurfaceFullyInvalid() const {
- return (invalid_regions & GetInterval()) == SurfaceRegions(GetInterval());
+ const OGLTexture& Texture() const {
+ return texture;
}
- bool registered = false;
- SurfaceRegions invalid_regions;
-
- u64 fill_size = 0; /// Number of bytes to read from fill_data
- std::array<u8, 4> fill_data;
-
- OGLTexture texture;
-
- static constexpr unsigned int GetGLBytesPerPixel(PixelFormat format) {
- if (format == PixelFormat::Invalid)
+ static constexpr unsigned int GetGLBytesPerPixel(SurfaceParams::PixelFormat format) {
+ if (format == SurfaceParams::PixelFormat::Invalid)
return 0;
return SurfaceParams::GetFormatBpp(format) / CHAR_BIT;
}
- std::unique_ptr<u8[]> gl_buffer;
- size_t gl_buffer_size = 0;
+ const SurfaceParams& GetSurfaceParams() const {
+ return params;
+ }
// Read/Write data in Switch memory to/from gl_buffer
- void LoadGLBuffer(Tegra::GPUVAddr load_start, Tegra::GPUVAddr load_end);
- void FlushGLBuffer(Tegra::GPUVAddr flush_start, Tegra::GPUVAddr flush_end);
+ void LoadGLBuffer();
+ void FlushGLBuffer();
    // Upload/Download data in gl_buffer to/from this surface's texture
- void UploadGLTexture(const MathUtil::Rectangle<u32>& rect, GLuint read_fb_handle,
- GLuint draw_fb_handle);
- void DownloadGLTexture(const MathUtil::Rectangle<u32>& rect, GLuint read_fb_handle,
- GLuint draw_fb_handle);
+ void UploadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle);
+ void DownloadGLTexture(GLuint read_fb_handle, GLuint draw_fb_handle);
+
+private:
+ OGLTexture texture;
+ std::vector<u8> gl_buffer;
+ SurfaceParams params;
};
-class RasterizerCacheOpenGL : NonCopyable {
+class RasterizerCacheOpenGL final : NonCopyable {
public:
RasterizerCacheOpenGL();
~RasterizerCacheOpenGL();
- /// Blit one surface's texture to another
- bool BlitSurfaces(const Surface& src_surface, const MathUtil::Rectangle<u32>& src_rect,
- const Surface& dst_surface, const MathUtil::Rectangle<u32>& dst_rect);
-
- void ConvertD24S8toABGR(GLuint src_tex, const MathUtil::Rectangle<u32>& src_rect,
- GLuint dst_tex, const MathUtil::Rectangle<u32>& dst_rect);
-
- /// Copy one surface's region to another
- void CopySurface(const Surface& src_surface, const Surface& dst_surface,
- SurfaceInterval copy_interval);
-
- /// Load a texture from Switch memory to OpenGL and cache it (if not already cached)
- Surface GetSurface(const SurfaceParams& params, ScaleMatch match_res_scale,
- bool load_if_create);
-
- /// Tries to find a framebuffer GPU address based on the provided CPU address
- boost::optional<Tegra::GPUVAddr> TryFindFramebufferGpuAddress(VAddr cpu_addr) const;
-
- /// Attempt to find a subrect (resolution scaled) of a surface, otherwise loads a texture from
- /// Switch memory to OpenGL and caches it (if not already cached)
- SurfaceRect_Tuple GetSurfaceSubRect(const SurfaceParams& params, ScaleMatch match_res_scale,
- bool load_if_create);
-
/// Get a surface based on the texture configuration
Surface GetTextureSurface(const Tegra::Texture::FullTextureInfo& config);
@@ -470,29 +429,21 @@ public:
SurfaceSurfaceRect_Tuple GetFramebufferSurfaces(bool using_color_fb, bool using_depth_fb,
const MathUtil::Rectangle<s32>& viewport);
- /// Get a surface that matches the fill config
- Surface GetFillSurface(const void* config);
+ /// Marks the specified surface as "dirty", in that it is out of sync with Switch memory
+ void MarkSurfaceAsDirty(const Surface& surface);
- /// Get a surface that matches a "texture copy" display transfer config
- SurfaceRect_Tuple GetTexCopySurface(const SurfaceParams& params);
+    /// Tries to find a cached framebuffer surface based on the provided CPU address
+ Surface TryFindFramebufferSurface(VAddr cpu_addr) const;
/// Write any cached resources overlapping the region back to memory (if dirty)
- void FlushRegion(Tegra::GPUVAddr addr, u64 size, Surface flush_surface = nullptr);
-
- /// Mark region as being invalidated by region_owner (nullptr if Switch memory)
- void InvalidateRegion(Tegra::GPUVAddr addr, u64 size, const Surface& region_owner);
+ void FlushRegion(Tegra::GPUVAddr addr, size_t size);
- /// Flush all cached resources tracked by this cache manager
- void FlushAll();
+ /// Mark the specified region as being invalidated
+ void InvalidateRegion(Tegra::GPUVAddr addr, size_t size);
private:
- void DuplicateSurface(const Surface& src_surface, const Surface& dest_surface);
-
- /// Update surface's texture for given region when necessary
- void ValidateSurface(const Surface& surface, Tegra::GPUVAddr addr, u64 size);
-
- /// Create a new surface
- Surface CreateSurface(const SurfaceParams& params);
+ void LoadSurface(const Surface& surface);
+ Surface GetSurface(const SurfaceParams& params);
/// Register surface into the cache
void RegisterSurface(const Surface& surface);
@@ -503,18 +454,9 @@ private:
    /// Increase/decrease the number of surfaces in pages touching the specified region
void UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta);
- SurfaceCache surface_cache;
+ std::unordered_map<SurfaceKey, Surface> surface_cache;
PageMap cached_pages;
- SurfaceMap dirty_regions;
- SurfaceSet remove_surfaces;
OGLFramebuffer read_framebuffer;
OGLFramebuffer draw_framebuffer;
-
- OGLVertexArray attributeless_vao;
- OGLBuffer d24s8_abgr_buffer;
- GLsizeiptr d24s8_abgr_buffer_size;
- OGLProgram d24s8_abgr_shader;
- GLint d24s8_abgr_tbo_size_u_id;
- GLint d24s8_abgr_viewport_u_id;
};
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index 93f9172e7..0fed93ca5 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -38,7 +38,7 @@ public:
if (handle == 0)
return;
glDeleteTextures(1, &handle);
- OpenGLState::GetCurState().ResetTexture(handle).Apply();
+ OpenGLState::GetCurState().UnbindTexture(handle).Apply();
handle = 0;
}
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 67726e7c6..5914077e8 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -9,6 +9,7 @@
#include "common/assert.h"
#include "common/common_types.h"
#include "video_core/engines/shader_bytecode.h"
+#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
namespace GLShader {
@@ -16,6 +17,7 @@ namespace Decompiler {
using Tegra::Shader::Attribute;
using Tegra::Shader::Instruction;
+using Tegra::Shader::LogicOperation;
using Tegra::Shader::OpCode;
using Tegra::Shader::Register;
using Tegra::Shader::Sampler;
@@ -266,6 +268,27 @@ public:
}
/**
+ * Returns code that does an integer size conversion for the specified size.
+ * @param value Value to perform integer size conversion on.
+ * @param size Register size to use for conversion instructions.
+ * @returns GLSL string corresponding to the value converted to the specified size.
+ */
+ static std::string ConvertIntegerSize(const std::string& value, Register::Size size) {
+ switch (size) {
+ case Register::Size::Byte:
+ return "((" + value + " << 24) >> 24)";
+ case Register::Size::Short:
+ return "((" + value + " << 16) >> 16)";
+ case Register::Size::Word:
+ // Default - do nothing
+ return value;
+ default:
+ LOG_CRITICAL(HW_GPU, "Unimplemented conversion size {}", static_cast<u32>(size));
+ UNREACHABLE();
+ }
+ }
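
The shift pairs emulate a narrowing integer conversion in GLSL: pushing a byte- or short-sized value up to the top of the 32-bit register and arithmetically shifting it back down sign-extends it, so 0xFF treated as a signed byte becomes -1 while 0x7F stays 127. A C++ analogue of the byte case (using an explicit cast rather than shifts, purely for illustration):

    #include <cstdint>

    // Equivalent of the generated "((x << 24) >> 24)" for Register::Size::Byte: keep the low
    // 8 bits and sign-extend them into a 32-bit integer.
    constexpr std::int32_t SignExtendByte(std::uint32_t value) {
        return static_cast<std::int8_t>(value & 0xFF);
    }

    static_assert(SignExtendByte(0xFF) == -1, "0xFF reinterpreted as a signed byte");
    static_assert(SignExtendByte(0x7F) == 127, "positive bytes are unchanged");
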
+
+ /**
     * Gets a register as a float.
* @param reg The register to get.
* @param elem The element to use for the operation.
@@ -281,15 +304,18 @@ public:
* @param reg The register to get.
* @param elem The element to use for the operation.
* @param is_signed Whether to get the register as a signed (or unsigned) integer.
+ * @param size Register size to use for conversion instructions.
* @returns GLSL string corresponding to the register as an integer.
*/
- std::string GetRegisterAsInteger(const Register& reg, unsigned elem = 0,
- bool is_signed = true) {
+ std::string GetRegisterAsInteger(const Register& reg, unsigned elem = 0, bool is_signed = true,
+ Register::Size size = Register::Size::Word) {
const std::string func = GetGLSLConversionFunc(
GLSLRegister::Type::Float,
is_signed ? GLSLRegister::Type::Integer : GLSLRegister::Type::UnsignedInteger);
- return func + '(' + GetRegister(reg, elem) + ')';
+ std::string value = func + '(' + GetRegister(reg, elem) + ')';
+
+ return ConvertIntegerSize(value, size);
}
/**
@@ -299,13 +325,15 @@ public:
* @param value The code representing the value to assign.
* @param dest_num_components Number of components in the destination.
* @param value_num_components Number of components in the value.
- * @param is_abs Optional, when True, applies absolute value to output.
+ * @param is_saturated Optional, when True, saturates the provided value.
* @param dest_elem Optional, the destination element to use for the operation.
*/
void SetRegisterToFloat(const Register& reg, u64 elem, const std::string& value,
- u64 dest_num_components, u64 value_num_components, bool is_abs = false,
- u64 dest_elem = 0) {
- SetRegister(reg, elem, value, dest_num_components, value_num_components, is_abs, dest_elem);
+ u64 dest_num_components, u64 value_num_components,
+ bool is_saturated = false, u64 dest_elem = 0) {
+
+ SetRegister(reg, elem, is_saturated ? "clamp(" + value + ", 0.0, 1.0)" : value,
+ dest_num_components, value_num_components, dest_elem);
}
/**
@@ -315,18 +343,22 @@ public:
* @param value The code representing the value to assign.
* @param dest_num_components Number of components in the destination.
* @param value_num_components Number of components in the value.
- * @param is_abs Optional, when True, applies absolute value to output.
+ * @param is_saturated Optional, when True, saturates the provided value.
* @param dest_elem Optional, the destination element to use for the operation.
+ * @param size Register size to use for conversion instructions.
*/
void SetRegisterToInteger(const Register& reg, bool is_signed, u64 elem,
const std::string& value, u64 dest_num_components,
- u64 value_num_components, bool is_abs = false, u64 dest_elem = 0) {
+ u64 value_num_components, bool is_saturated = false,
+ u64 dest_elem = 0, Register::Size size = Register::Size::Word) {
+ ASSERT_MSG(!is_saturated, "Unimplemented");
+
const std::string func = GetGLSLConversionFunc(
is_signed ? GLSLRegister::Type::Integer : GLSLRegister::Type::UnsignedInteger,
GLSLRegister::Type::Float);
- SetRegister(reg, elem, func + '(' + value + ')', dest_num_components, value_num_components,
- is_abs, dest_elem);
+ SetRegister(reg, elem, func + '(' + ConvertIntegerSize(value, size) + ')',
+ dest_num_components, value_num_components, dest_elem);
}
/**
@@ -366,7 +398,8 @@ public:
/// Generates code representing a uniform (C buffer) register, interpreted as the input type.
std::string GetUniform(u64 index, u64 offset, GLSLRegister::Type type) {
declr_const_buffers[index].MarkAsUsed(index, offset, stage);
- std::string value = 'c' + std::to_string(index) + '[' + std::to_string(offset) + ']';
+ std::string value = 'c' + std::to_string(index) + '[' + std::to_string(offset / 4) + "][" +
+ std::to_string(offset % 4) + ']';
if (type == GLSLRegister::Type::Float) {
return value;
@@ -380,8 +413,12 @@ public:
std::string GetUniformIndirect(u64 index, s64 offset, const Register& index_reg,
GLSLRegister::Type type) {
declr_const_buffers[index].MarkAsUsedIndirect(index, stage);
- std::string value = 'c' + std::to_string(index) + "[(floatBitsToInt(" +
- GetRegister(index_reg, 0) + ") + " + std::to_string(offset) + ") / 4]";
+
+ std::string final_offset = "((floatBitsToInt(" + GetRegister(index_reg, 0) + ") + " +
+ std::to_string(offset) + ") / 4)";
+
+ std::string value =
+ 'c' + std::to_string(index) + '[' + final_offset + " / 4][" + final_offset + " % 4]";
if (type == GLSLRegister::Type::Float) {
return value;
@@ -423,9 +460,10 @@ public:
unsigned const_buffer_layout = 0;
for (const auto& entry : GetConstBuffersDeclarations()) {
- declarations.AddLine("layout(std430) buffer " + entry.GetName());
+ declarations.AddLine("layout(std140) uniform " + entry.GetName());
declarations.AddLine('{');
- declarations.AddLine(" float c" + std::to_string(entry.GetIndex()) + "[];");
+ declarations.AddLine(" vec4 c" + std::to_string(entry.GetIndex()) +
+ "[MAX_CONSTBUFFER_ELEMENTS];");
declarations.AddLine("};");
declarations.AddNewLine();
++const_buffer_layout;
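
Constant buffers are now exposed to the shaders as std140 uniform blocks containing an array of vec4 (MAX_CONSTBUFFER_ELEMENTS presumably comes from the newly included gl_rasterizer.h), so a flat scalar offset has to be split into an element index and a component index: element = offset / 4, component = offset % 4, e.g. offset 10 in buffer 3 becomes c3[2][2]. A small sketch of that mapping (the helper name is made up):

    #include <string>

    // Mirrors the indexing generated by GetUniform above: flat float offset -> vec4 element
    // and component inside the "vec4 cN[MAX_CONSTBUFFER_ELEMENTS]" array.
    std::string ConstBufferElement(unsigned buffer_index, unsigned offset) {
        return 'c' + std::to_string(buffer_index) + '[' + std::to_string(offset / 4) + "][" +
               std::to_string(offset % 4) + ']';
    }

    // ConstBufferElement(3, 10) == "c3[2][2]"
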
@@ -500,13 +538,11 @@ private:
* @param value The code representing the value to assign.
* @param dest_num_components Number of components in the destination.
* @param value_num_components Number of components in the value.
- * @param is_abs Optional, when True, applies absolute value to output.
* @param dest_elem Optional, the destination element to use for the operation.
*/
void SetRegister(const Register& reg, u64 elem, const std::string& value,
- u64 dest_num_components, u64 value_num_components, bool is_abs,
- u64 dest_elem) {
- std::string dest = GetRegister(reg, dest_elem);
+ u64 dest_num_components, u64 value_num_components, u64 dest_elem) {
+ std::string dest = GetRegister(reg, static_cast<u32>(dest_elem));
if (dest_num_components > 1) {
dest += GetSwizzle(elem);
}
@@ -516,8 +552,6 @@ private:
src += GetSwizzle(elem);
}
- src = is_abs ? "abs(" + src + ')' : src;
-
shader.AddLine(dest + " = " + src + ';');
}
@@ -547,7 +581,7 @@ private:
return "input_attribute_" + std::to_string(index);
}
- NGLOG_CRITICAL(HW_GPU, "Unhandled input attribute: {}", index);
+ LOG_CRITICAL(HW_GPU, "Unhandled input attribute: {}", index);
UNREACHABLE();
}
}
@@ -565,7 +599,7 @@ private:
return "output_attribute_" + std::to_string(index);
}
- NGLOG_CRITICAL(HW_GPU, "Unhandled output attribute: {}", index);
+ LOG_CRITICAL(HW_GPU, "Unhandled output attribute: {}", index);
UNREACHABLE();
}
}
@@ -685,21 +719,31 @@ private:
/**
* Returns the comparison string to use to compare two values in the 'set' family of
* instructions.
- * @params condition The condition used in the 'set'-family instruction.
+ * @param condition The condition used in the 'set'-family instruction.
+ * @param op_a First operand to use for the comparison.
+ * @param op_b Second operand to use for the comparison.
* @returns String corresponding to the GLSL operator that matches the desired comparison.
*/
- std::string GetPredicateComparison(Tegra::Shader::PredCondition condition) const {
+ std::string GetPredicateComparison(Tegra::Shader::PredCondition condition,
+ const std::string& op_a, const std::string& op_b) const {
using Tegra::Shader::PredCondition;
static const std::unordered_map<PredCondition, const char*> PredicateComparisonStrings = {
- {PredCondition::LessThan, "<"}, {PredCondition::Equal, "=="},
- {PredCondition::LessEqual, "<="}, {PredCondition::GreaterThan, ">"},
- {PredCondition::NotEqual, "!="}, {PredCondition::GreaterEqual, ">="},
+ {PredCondition::LessThan, "<"}, {PredCondition::Equal, "=="},
+ {PredCondition::LessEqual, "<="}, {PredCondition::GreaterThan, ">"},
+ {PredCondition::NotEqual, "!="}, {PredCondition::GreaterEqual, ">="},
+ {PredCondition::NotEqualWithNan, "!="},
};
- auto comparison = PredicateComparisonStrings.find(condition);
+ const auto& comparison{PredicateComparisonStrings.find(condition)};
ASSERT_MSG(comparison != PredicateComparisonStrings.end(),
"Unknown predicate comparison operation");
- return comparison->second;
+
+ std::string predicate{'(' + op_a + ") " + comparison->second + " (" + op_b + ')'};
+ if (condition == PredCondition::NotEqualWithNan) {
+ predicate += " || isnan(" + op_a + ") || isnan(" + op_b + ')';
+ }
+
+ return predicate;
}
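
NotEqualWithNan still maps to "!=" in the comparison table; the appended isnan() terms force the predicate to evaluate to true whenever either operand is NaN, instead of relying on how a particular GLSL implementation treats NaN in comparisons. For two hypothetical operands the generated predicate looks like this (a small reproduction of the string building above):

    #include <cassert>
    #include <string>

    int main() {
        const std::string op_a = "r0";        // hypothetical operand expressions
        const std::string op_b = "c5[0][1]";
        std::string predicate = '(' + op_a + ") != (" + op_b + ')';
        predicate += " || isnan(" + op_a + ") || isnan(" + op_b + ')';
        assert(predicate == "(r0) != (c5[0][1]) || isnan(r0) || isnan(c5[0][1])");
    }
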
/**
@@ -733,6 +777,31 @@ private:
return (absolute_offset % SchedPeriod) == 0;
}
+ void WriteLogicOperation(Register dest, LogicOperation logic_op, const std::string& op_a,
+ const std::string& op_b) {
+ switch (logic_op) {
+ case LogicOperation::And: {
+ regs.SetRegisterToInteger(dest, true, 0, '(' + op_a + " & " + op_b + ')', 1, 1);
+ break;
+ }
+ case LogicOperation::Or: {
+ regs.SetRegisterToInteger(dest, true, 0, '(' + op_a + " | " + op_b + ')', 1, 1);
+ break;
+ }
+ case LogicOperation::Xor: {
+ regs.SetRegisterToInteger(dest, true, 0, '(' + op_a + " ^ " + op_b + ')', 1, 1);
+ break;
+ }
+ case LogicOperation::PassB: {
+ regs.SetRegisterToInteger(dest, true, 0, op_b, 1, 1);
+ break;
+ }
+ default:
+ LOG_CRITICAL(HW_GPU, "Unimplemented logic operation: {}", static_cast<u32>(logic_op));
+ UNREACHABLE();
+ }
+ }
+
/**
* Compiles a single instruction from Tegra to GLSL.
* @param offset the offset of the Tegra shader instruction.
@@ -750,8 +819,9 @@ private:
// Decoding failure
if (!opcode) {
- NGLOG_CRITICAL(HW_GPU, "Unhandled instruction: {0:x}", instr.value);
+ LOG_CRITICAL(HW_GPU, "Unhandled instruction: {0:x}", instr.value);
UNREACHABLE();
+ return offset + 1;
}
shader.AddLine("// " + std::to_string(offset) + ": " + opcode->GetName());
@@ -770,22 +840,25 @@ private:
switch (opcode->GetType()) {
case OpCode::Type::Arithmetic: {
- std::string op_a = instr.alu.negate_a ? "-" : "";
- op_a += regs.GetRegisterAsFloat(instr.gpr8);
+ std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
if (instr.alu.abs_a) {
op_a = "abs(" + op_a + ')';
}
- std::string op_b = instr.alu.negate_b ? "-" : "";
+ if (instr.alu.negate_a) {
+ op_a = "-(" + op_a + ')';
+ }
+
+ std::string op_b;
if (instr.is_b_imm) {
- op_b += GetImmediate19(instr);
+ op_b = GetImmediate19(instr);
} else {
if (instr.is_b_gpr) {
- op_b += regs.GetRegisterAsFloat(instr.gpr20);
+ op_b = regs.GetRegisterAsFloat(instr.gpr20);
} else {
- op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
- GLSLRegister::Type::Float);
+ op_b = regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+ GLSLRegister::Type::Float);
}
}
@@ -793,6 +866,10 @@ private:
op_b = "abs(" + op_b + ')';
}
+ if (instr.alu.negate_b) {
+ op_b = "-(" + op_b + ')';
+ }
+
switch (opcode->GetId()) {
case OpCode::Id::MOV_C:
case OpCode::Id::MOV_R: {
@@ -800,68 +877,53 @@ private:
break;
}
- case OpCode::Id::MOV32_IMM: {
- // mov32i doesn't have abs or neg bits.
- regs.SetRegisterToFloat(instr.gpr0, 0, GetImmediate32(instr), 1, 1);
- break;
- }
case OpCode::Id::FMUL_C:
case OpCode::Id::FMUL_R:
case OpCode::Id::FMUL_IMM: {
- ASSERT_MSG(!instr.saturate_a, "Unimplemented");
-
- regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1, instr.alu.abs_d);
- break;
- }
- case OpCode::Id::FMUL32_IMM: {
- // fmul32i doesn't have abs or neg bits.
- regs.SetRegisterToFloat(
- instr.gpr0, 0,
- regs.GetRegisterAsFloat(instr.gpr8) + " * " + GetImmediate32(instr), 1, 1);
+ regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b, 1, 1,
+ instr.alu.saturate_d);
break;
}
case OpCode::Id::FADD_C:
case OpCode::Id::FADD_R:
case OpCode::Id::FADD_IMM: {
- ASSERT_MSG(!instr.saturate_a, "Unimplemented");
-
- regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1, instr.alu.abs_d);
+ regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " + " + op_b, 1, 1,
+ instr.alu.saturate_d);
break;
}
case OpCode::Id::MUFU: {
- ASSERT_MSG(!instr.saturate_a, "Unimplemented");
-
switch (instr.sub_op) {
case SubOp::Cos:
regs.SetRegisterToFloat(instr.gpr0, 0, "cos(" + op_a + ')', 1, 1,
- instr.alu.abs_d);
+ instr.alu.saturate_d);
break;
case SubOp::Sin:
regs.SetRegisterToFloat(instr.gpr0, 0, "sin(" + op_a + ')', 1, 1,
- instr.alu.abs_d);
+ instr.alu.saturate_d);
break;
case SubOp::Ex2:
regs.SetRegisterToFloat(instr.gpr0, 0, "exp2(" + op_a + ')', 1, 1,
- instr.alu.abs_d);
+ instr.alu.saturate_d);
break;
case SubOp::Lg2:
regs.SetRegisterToFloat(instr.gpr0, 0, "log2(" + op_a + ')', 1, 1,
- instr.alu.abs_d);
+ instr.alu.saturate_d);
break;
case SubOp::Rcp:
- regs.SetRegisterToFloat(instr.gpr0, 0, "1.0 / " + op_a, 1, 1, instr.alu.abs_d);
+ regs.SetRegisterToFloat(instr.gpr0, 0, "1.0 / " + op_a, 1, 1,
+ instr.alu.saturate_d);
break;
case SubOp::Rsq:
regs.SetRegisterToFloat(instr.gpr0, 0, "inversesqrt(" + op_a + ')', 1, 1,
- instr.alu.abs_d);
+ instr.alu.saturate_d);
break;
- case SubOp::Min:
- regs.SetRegisterToFloat(instr.gpr0, 0, "min(" + op_a + "," + op_b + ')', 1, 1,
- instr.alu.abs_d);
+ case SubOp::Sqrt:
+ regs.SetRegisterToFloat(instr.gpr0, 0, "sqrt(" + op_a + ')', 1, 1,
+ instr.alu.saturate_d);
break;
default:
- NGLOG_CRITICAL(HW_GPU, "Unhandled MUFU sub op: {0:x}",
- static_cast<unsigned>(instr.sub_op.Value()));
+ LOG_CRITICAL(HW_GPU, "Unhandled MUFU sub op: {0:x}",
+ static_cast<unsigned>(instr.sub_op.Value()));
UNREACHABLE();
}
break;
@@ -884,16 +946,31 @@ private:
// Currently RRO is only implemented as a register move.
// Usage of `abs_b` and `negate_b` here should also be correct.
regs.SetRegisterToFloat(instr.gpr0, 0, op_b, 1, 1);
- NGLOG_WARNING(HW_GPU, "RRO instruction is incomplete");
+ LOG_WARNING(HW_GPU, "RRO instruction is incomplete");
break;
}
default: {
- NGLOG_CRITICAL(HW_GPU, "Unhandled arithmetic instruction: {}", opcode->GetName());
+ LOG_CRITICAL(HW_GPU, "Unhandled arithmetic instruction: {}", opcode->GetName());
UNREACHABLE();
}
}
break;
}
+ case OpCode::Type::ArithmeticImmediate: {
+ switch (opcode->GetId()) {
+ case OpCode::Id::MOV32_IMM: {
+ regs.SetRegisterToFloat(instr.gpr0, 0, GetImmediate32(instr), 1, 1);
+ break;
+ }
+ case OpCode::Id::FMUL32_IMM: {
+ regs.SetRegisterToFloat(
+ instr.gpr0, 0,
+ regs.GetRegisterAsFloat(instr.gpr8) + " * " + GetImmediate32(instr), 1, 1);
+ break;
+ }
+ }
+ break;
+ }
case OpCode::Type::Bfe: {
ASSERT_MSG(!instr.bfe.negate_b, "Unimplemented");
@@ -912,56 +989,13 @@ private:
break;
}
default: {
- NGLOG_CRITICAL(HW_GPU, "Unhandled BFE instruction: {}", opcode->GetName());
+ LOG_CRITICAL(HW_GPU, "Unhandled BFE instruction: {}", opcode->GetName());
UNREACHABLE();
}
}
break;
}
- case OpCode::Type::Logic: {
- std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, true);
-
- if (instr.alu.lop.invert_a)
- op_a = "~(" + op_a + ')';
-
- switch (opcode->GetId()) {
- case OpCode::Id::LOP32I: {
- u32 imm = static_cast<u32>(instr.alu.imm20_32.Value());
-
- if (instr.alu.lop.invert_b)
- imm = ~imm;
-
- switch (instr.alu.lop.operation) {
- case Tegra::Shader::LogicOperation::And: {
- regs.SetRegisterToInteger(instr.gpr0, true, 0,
- '(' + op_a + " & " + std::to_string(imm) + ')', 1, 1);
- break;
- }
- case Tegra::Shader::LogicOperation::Or: {
- regs.SetRegisterToInteger(instr.gpr0, true, 0,
- '(' + op_a + " | " + std::to_string(imm) + ')', 1, 1);
- break;
- }
- case Tegra::Shader::LogicOperation::Xor: {
- regs.SetRegisterToInteger(instr.gpr0, true, 0,
- '(' + op_a + " ^ " + std::to_string(imm) + ')', 1, 1);
- break;
- }
- default:
- NGLOG_CRITICAL(HW_GPU, "Unimplemented lop32i operation: {}",
- static_cast<u32>(instr.alu.lop.operation.Value()));
- UNREACHABLE();
- }
- break;
- }
- default: {
- NGLOG_CRITICAL(HW_GPU, "Unhandled logic instruction: {}", opcode->GetName());
- UNREACHABLE();
- }
- }
- break;
- }
case OpCode::Type::Shift: {
std::string op_a = regs.GetRegisterAsInteger(instr.gpr8, 0, true);
@@ -998,21 +1032,46 @@ private:
regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " << " + op_b, 1, 1);
break;
default: {
- NGLOG_CRITICAL(HW_GPU, "Unhandled shift instruction: {}", opcode->GetName());
+ LOG_CRITICAL(HW_GPU, "Unhandled shift instruction: {}", opcode->GetName());
UNREACHABLE();
}
}
break;
}
- case OpCode::Type::ArithmeticInteger: {
+ case OpCode::Type::ArithmeticIntegerImmediate: {
std::string op_a = regs.GetRegisterAsInteger(instr.gpr8);
+ std::string op_b = std::to_string(instr.alu.imm20_32.Value());
- if (instr.alu_integer.negate_a)
- op_a = '-' + op_a;
+ switch (opcode->GetId()) {
+ case OpCode::Id::IADD32I:
+ if (instr.iadd32i.negate_a)
+ op_a = "-(" + op_a + ')';
- std::string op_b = instr.alu_integer.negate_b ? "-" : "";
+ regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1,
+ instr.iadd32i.saturate != 0);
+ break;
+ case OpCode::Id::LOP32I: {
+ if (instr.alu.lop32i.invert_a)
+ op_a = "~(" + op_a + ')';
+ if (instr.alu.lop32i.invert_b)
+ op_b = "~(" + op_b + ')';
+
+ WriteLogicOperation(instr.gpr0, instr.alu.lop32i.operation, op_a, op_b);
+ break;
+ }
+ default: {
+ LOG_CRITICAL(HW_GPU, "Unhandled ArithmeticIntegerImmediate instruction: {}",
+ opcode->GetName());
+ UNREACHABLE();
+ }
+ }
+ break;
+ }
+ case OpCode::Type::ArithmeticInteger: {
+ std::string op_a = regs.GetRegisterAsInteger(instr.gpr8);
+ std::string op_b;
if (instr.is_b_imm) {
op_b += '(' + std::to_string(instr.alu.GetSignedImm20_20()) + ')';
} else {
@@ -1028,22 +1087,63 @@ private:
case OpCode::Id::IADD_C:
case OpCode::Id::IADD_R:
case OpCode::Id::IADD_IMM: {
- ASSERT_MSG(!instr.saturate_a, "Unimplemented");
- regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1);
+ if (instr.alu_integer.negate_a)
+ op_a = "-(" + op_a + ')';
+
+ if (instr.alu_integer.negate_b)
+ op_b = "-(" + op_b + ')';
+
+ regs.SetRegisterToInteger(instr.gpr0, true, 0, op_a + " + " + op_b, 1, 1,
+ instr.alu.saturate_d);
break;
}
case OpCode::Id::ISCADD_C:
case OpCode::Id::ISCADD_R:
case OpCode::Id::ISCADD_IMM: {
+ if (instr.alu_integer.negate_a)
+ op_a = "-(" + op_a + ')';
+
+ if (instr.alu_integer.negate_b)
+ op_b = "-(" + op_b + ')';
+
std::string shift = std::to_string(instr.alu_integer.shift_amount.Value());
regs.SetRegisterToInteger(instr.gpr0, true, 0,
"((" + op_a + " << " + shift + ") + " + op_b + ')', 1, 1);
break;
}
+ case OpCode::Id::LOP_C:
+ case OpCode::Id::LOP_R:
+ case OpCode::Id::LOP_IMM: {
+ ASSERT_MSG(!instr.alu.lop.unk44, "Unimplemented");
+ ASSERT_MSG(instr.alu.lop.pred48 == Pred::UnusedIndex, "Unimplemented");
+
+ if (instr.alu.lop.invert_a)
+ op_a = "~(" + op_a + ')';
+
+ if (instr.alu.lop.invert_b)
+ op_b = "~(" + op_b + ')';
+
+ WriteLogicOperation(instr.gpr0, instr.alu.lop.operation, op_a, op_b);
+ break;
+ }
+ case OpCode::Id::IMNMX_C:
+ case OpCode::Id::IMNMX_R:
+ case OpCode::Id::IMNMX_IMM: {
+ ASSERT_MSG(instr.imnmx.exchange == Tegra::Shader::IMinMaxExchange::None,
+ "Unimplemented");
+ std::string condition =
+ GetPredicateCondition(instr.imnmx.pred, instr.imnmx.negate_pred != 0);
+ std::string parameters = op_a + ',' + op_b;
+ regs.SetRegisterToInteger(instr.gpr0, instr.imnmx.is_signed, 0,
+ '(' + condition + ") ? min(" + parameters + ") : max(" +
+ parameters + ')',
+ 1, 1);
+ break;
+ }
default: {
- NGLOG_CRITICAL(HW_GPU, "Unhandled ArithmeticInteger instruction: {}",
- opcode->GetName());
+ LOG_CRITICAL(HW_GPU, "Unhandled ArithmeticInteger instruction: {}",
+ opcode->GetName());
UNREACHABLE();
}
}
@@ -1051,8 +1151,6 @@ private:
break;
}
case OpCode::Type::Ffma: {
- ASSERT_MSG(!instr.saturate_a, "Unimplemented");
-
std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
std::string op_b = instr.ffma.negate_b ? "-" : "";
std::string op_c = instr.ffma.negate_c ? "-" : "";
@@ -1081,38 +1179,38 @@ private:
break;
}
default: {
- NGLOG_CRITICAL(HW_GPU, "Unhandled FFMA instruction: {}", opcode->GetName());
+ LOG_CRITICAL(HW_GPU, "Unhandled FFMA instruction: {}", opcode->GetName());
UNREACHABLE();
}
}
- regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b + " + " + op_c, 1, 1);
+ regs.SetRegisterToFloat(instr.gpr0, 0, op_a + " * " + op_b + " + " + op_c, 1, 1,
+ instr.alu.saturate_d);
break;
}
case OpCode::Type::Conversion: {
- ASSERT_MSG(instr.conversion.size == Register::Size::Word, "Unimplemented");
ASSERT_MSG(!instr.conversion.negate_a, "Unimplemented");
- ASSERT_MSG(!instr.saturate_a, "Unimplemented");
switch (opcode->GetId()) {
case OpCode::Id::I2I_R: {
ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
- std::string op_a =
- regs.GetRegisterAsInteger(instr.gpr20, 0, instr.conversion.is_input_signed);
+ std::string op_a = regs.GetRegisterAsInteger(
+ instr.gpr20, 0, instr.conversion.is_input_signed, instr.conversion.src_size);
if (instr.conversion.abs_a) {
op_a = "abs(" + op_a + ')';
}
regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1,
- 1);
+ 1, instr.alu.saturate_d, 0, instr.conversion.dest_size);
break;
}
case OpCode::Id::I2F_R: {
+ ASSERT_MSG(instr.conversion.dest_size == Register::Size::Word, "Unimplemented");
ASSERT_MSG(!instr.conversion.selector, "Unimplemented");
- std::string op_a =
- regs.GetRegisterAsInteger(instr.gpr20, 0, instr.conversion.is_input_signed);
+ std::string op_a = regs.GetRegisterAsInteger(
+ instr.gpr20, 0, instr.conversion.is_input_signed, instr.conversion.src_size);
if (instr.conversion.abs_a) {
op_a = "abs(" + op_a + ')';
@@ -1122,13 +1220,16 @@ private:
break;
}
case OpCode::Id::F2F_R: {
- ASSERT_MSG(!instr.saturate_a, "Unimplemented");
-
+ ASSERT_MSG(instr.conversion.dest_size == Register::Size::Word, "Unimplemented");
+ ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented");
std::string op_a = regs.GetRegisterAsFloat(instr.gpr20);
switch (instr.conversion.f2f.rounding) {
case Tegra::Shader::F2fRoundingOp::None:
break;
+ case Tegra::Shader::F2fRoundingOp::Round:
+ op_a = "roundEven(" + op_a + ')';
+ break;
case Tegra::Shader::F2fRoundingOp::Floor:
op_a = "floor(" + op_a + ')';
break;
@@ -1139,8 +1240,8 @@ private:
op_a = "trunc(" + op_a + ')';
break;
default:
- NGLOG_CRITICAL(HW_GPU, "Unimplemented f2f rounding mode {}",
- static_cast<u32>(instr.conversion.f2f.rounding.Value()));
+ LOG_CRITICAL(HW_GPU, "Unimplemented f2f rounding mode {}",
+ static_cast<u32>(instr.conversion.f2f.rounding.Value()));
UNREACHABLE();
break;
}
@@ -1149,10 +1250,11 @@ private:
op_a = "abs(" + op_a + ')';
}
- regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1);
+ regs.SetRegisterToFloat(instr.gpr0, 0, op_a, 1, 1, instr.alu.saturate_d);
break;
}
case OpCode::Id::F2I_R: {
+ ASSERT_MSG(instr.conversion.src_size == Register::Size::Word, "Unimplemented");
std::string op_a = regs.GetRegisterAsFloat(instr.gpr20);
if (instr.conversion.abs_a) {
@@ -1172,8 +1274,8 @@ private:
op_a = "trunc(" + op_a + ')';
break;
default:
- NGLOG_CRITICAL(HW_GPU, "Unimplemented f2i rounding mode {}",
- static_cast<u32>(instr.conversion.f2i.rounding.Value()));
+ LOG_CRITICAL(HW_GPU, "Unimplemented f2i rounding mode {}",
+ static_cast<u32>(instr.conversion.f2i.rounding.Value()));
UNREACHABLE();
break;
}
@@ -1185,11 +1287,11 @@ private:
}
regs.SetRegisterToInteger(instr.gpr0, instr.conversion.is_output_signed, 0, op_a, 1,
- 1);
+ 1, false, 0, instr.conversion.dest_size);
break;
}
default: {
- NGLOG_CRITICAL(HW_GPU, "Unhandled conversion instruction: {}", opcode->GetName());
+ LOG_CRITICAL(HW_GPU, "Unhandled conversion instruction: {}", opcode->GetName());
UNREACHABLE();
}
}
@@ -1224,8 +1326,8 @@ private:
break;
default:
- NGLOG_CRITICAL(HW_GPU, "Unhandled type: {}",
- static_cast<unsigned>(instr.ld_c.type.Value()));
+ LOG_CRITICAL(HW_GPU, "Unhandled type: {}",
+ static_cast<unsigned>(instr.ld_c.type.Value()));
UNREACHABLE();
}
break;
@@ -1298,7 +1400,7 @@ private:
break;
}
default: {
- NGLOG_CRITICAL(HW_GPU, "Unhandled memory instruction: {}", opcode->GetName());
+ LOG_CRITICAL(HW_GPU, "Unhandled memory instruction: {}", opcode->GetName());
UNREACHABLE();
}
}
@@ -1340,10 +1442,9 @@ private:
std::string second_pred =
GetPredicateCondition(instr.fsetp.pred39, instr.fsetp.neg_pred != 0);
- std::string comparator = GetPredicateComparison(instr.fsetp.cond);
std::string combiner = GetPredicateCombiner(instr.fsetp.op);
- std::string predicate = '(' + op_a + ") " + comparator + " (" + op_b + ')';
+ std::string predicate = GetPredicateComparison(instr.fsetp.cond, op_a, op_b);
// Set the primary predicate to the result of Predicate OP SecondPredicate
SetPredicate(instr.fsetp.pred3,
'(' + predicate + ") " + combiner + " (" + second_pred + ')');
@@ -1378,10 +1479,9 @@ private:
std::string second_pred =
GetPredicateCondition(instr.isetp.pred39, instr.isetp.neg_pred != 0);
- std::string comparator = GetPredicateComparison(instr.isetp.cond);
std::string combiner = GetPredicateCombiner(instr.isetp.op);
- std::string predicate = '(' + op_a + ") " + comparator + " (" + op_b + ')';
+ std::string predicate = GetPredicateComparison(instr.isetp.cond, op_a, op_b);
// Set the primary predicate to the result of Predicate OP SecondPredicate
SetPredicate(instr.isetp.pred3,
'(' + predicate + ") " + combiner + " (" + second_pred + ')');
@@ -1394,6 +1494,36 @@ private:
}
break;
}
+ case OpCode::Type::PredicateSetPredicate: {
+ std::string op_a =
+ GetPredicateCondition(instr.psetp.pred12, instr.psetp.neg_pred12 != 0);
+ std::string op_b =
+ GetPredicateCondition(instr.psetp.pred29, instr.psetp.neg_pred29 != 0);
+
+ using Tegra::Shader::Pred;
+ // We can't use the constant predicate as destination.
+ ASSERT(instr.psetp.pred3 != static_cast<u64>(Pred::UnusedIndex));
+
+ std::string second_pred =
+ GetPredicateCondition(instr.psetp.pred39, instr.psetp.neg_pred39 != 0);
+
+ std::string combiner = GetPredicateCombiner(instr.psetp.op);
+
+ std::string predicate =
+ '(' + op_a + ") " + GetPredicateCombiner(instr.psetp.cond) + " (" + op_b + ')';
+
+ // Set the primary predicate to the result of Predicate OP SecondPredicate
+ SetPredicate(instr.psetp.pred3,
+ '(' + predicate + ") " + combiner + " (" + second_pred + ')');
+
+ if (instr.psetp.pred0 != static_cast<u64>(Pred::UnusedIndex)) {
+ // Set the secondary predicate to the result of !Predicate OP SecondPredicate,
+ // if enabled
+ SetPredicate(instr.psetp.pred0,
+ "!(" + predicate + ") " + combiner + " (" + second_pred + ')');
+ }
+ break;
+ }
case OpCode::Type::FloatSet: {
std::string op_a = instr.fset.neg_a ? "-" : "";
op_a += regs.GetRegisterAsFloat(instr.gpr8);
@@ -1428,11 +1558,10 @@ private:
std::string second_pred =
GetPredicateCondition(instr.fset.pred39, instr.fset.neg_pred != 0);
- std::string comparator = GetPredicateComparison(instr.fset.cond);
std::string combiner = GetPredicateCombiner(instr.fset.op);
- std::string predicate = "(((" + op_a + ") " + comparator + " (" + op_b + ")) " +
- combiner + " (" + second_pred + "))";
+ std::string predicate = "((" + GetPredicateComparison(instr.fset.cond, op_a, op_b) +
+ ") " + combiner + " (" + second_pred + "))";
if (instr.fset.bf) {
regs.SetRegisterToFloat(instr.gpr0, 0, predicate + " ? 1.0 : 0.0", 1, 1);
@@ -1463,11 +1592,10 @@ private:
std::string second_pred =
GetPredicateCondition(instr.iset.pred39, instr.iset.neg_pred != 0);
- std::string comparator = GetPredicateComparison(instr.iset.cond);
std::string combiner = GetPredicateCombiner(instr.iset.op);
- std::string predicate = "(((" + op_a + ") " + comparator + " (" + op_b + ")) " +
- combiner + " (" + second_pred + "))";
+ std::string predicate = "((" + GetPredicateComparison(instr.iset.cond, op_a, op_b) +
+ ") " + combiner + " (" + second_pred + "))";
if (instr.iset.bf) {
regs.SetRegisterToFloat(instr.gpr0, 0, predicate + " ? 1.0 : 0.0", 1, 1);
@@ -1518,8 +1646,15 @@ private:
// can ignore this when generating GLSL code.
break;
}
+ case OpCode::Id::DEPBAR:
+ case OpCode::Id::SYNC: {
+ // TODO(Subv): Find out if we actually have to care about these instructions or if
+ // the GLSL compiler takes care of that for us.
+ LOG_WARNING(HW_GPU, "DEPBAR/SYNC instruction is stubbed");
+ break;
+ }
default: {
- NGLOG_CRITICAL(HW_GPU, "Unhandled instruction: {}", opcode->GetName());
+ LOG_CRITICAL(HW_GPU, "Unhandled instruction: {}", opcode->GetName());
UNREACHABLE();
}
}
@@ -1646,7 +1781,10 @@ private:
}; // namespace Decompiler
std::string GetCommonDeclarations() {
- return "bool exec_shader();";
+ std::string declarations = "bool exec_shader();\n";
+ declarations += "#define MAX_CONSTBUFFER_ELEMENTS " +
+ std::to_string(RasterizerOpenGL::MaxConstbufferSize / (sizeof(GLvec4)));
+ return declarations;
}
boost::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code, u32 main_offset,
@@ -1656,7 +1794,7 @@ boost::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code,
GLSLGenerator generator(subroutines, program_code, main_offset, stage);
return ProgramResult{generator.GetShaderCode(), generator.GetEntries()};
} catch (const DecompileFail& exception) {
- NGLOG_ERROR(HW_GPU, "Shader decompilation failed: {}", exception.what());
+ LOG_ERROR(HW_GPU, "Shader decompilation failed: {}", exception.what());
}
return boost::none;
}
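For reference, a sketch of the prelude string the updated GetCommonDeclarations() produces, assuming, purely for illustration, a MaxConstbufferSize of 0x10000 bytes and a 16-byte GLvec4 (the real values come from RasterizerOpenGL and the GL vector type, not from this sketch):

    #include <cstddef>
    #include <iostream>
    #include <string>

    // Stand-in constants for illustration only.
    constexpr std::size_t kMaxConstbufferSize = 0x10000; // hypothetical value
    constexpr std::size_t kGLvec4Size = 16;              // assumed sizeof(GLvec4)

    int main() {
        std::string declarations = "bool exec_shader();\n";
        declarations += "#define MAX_CONSTBUFFER_ELEMENTS " +
                        std::to_string(kMaxConstbufferSize / kGLvec4Size);
        // Prints:
        //   bool exec_shader();
        //   #define MAX_CONSTBUFFER_ELEMENTS 4096
        std::cout << declarations << '\n';
    }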
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index b88d592b7..c1e6fac9f 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -39,6 +39,10 @@ void main() {
// Viewport can be flipped, which is unsupported by glViewport
position.xy *= viewport_flip.xy;
gl_Position = position;
+
+ // TODO(bunnei): This is likely a hack, position.w should be interpolated as 1.0
+ // For now, this is here to bring order in lieu of proper emulation
+ position.w = 1.0;
}
)";
out += program.first;
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 7c00beb33..d7167b298 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -38,8 +38,8 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D::State::ShaderStageInfo& sh
const auto& regs = Core::System().GetInstance().GPU().Maxwell3D().regs;
// TODO(bunnei): Support more than one viewport
- viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0 : 1.0;
- viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0 : 1.0;
+ viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0f : 1.0f;
+ viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0f : 1.0f;
}
} // namespace GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp
index 8568fface..3c087d638 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_util.cpp
@@ -27,7 +27,7 @@ GLuint LoadShader(const char* source, GLenum type) {
}
GLuint shader_id = glCreateShader(type);
glShaderSource(shader_id, 1, &source, nullptr);
- NGLOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type);
+ LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type);
glCompileShader(shader_id);
GLint result = GL_FALSE;
@@ -39,9 +39,9 @@ GLuint LoadShader(const char* source, GLenum type) {
std::string shader_error(info_log_length, ' ');
glGetShaderInfoLog(shader_id, info_log_length, nullptr, &shader_error[0]);
if (result == GL_TRUE) {
- NGLOG_DEBUG(Render_OpenGL, "{}", shader_error);
+ LOG_DEBUG(Render_OpenGL, "{}", shader_error);
} else {
- NGLOG_ERROR(Render_OpenGL, "Error compiling {} shader:\n{}", debug_type, shader_error);
+ LOG_ERROR(Render_OpenGL, "Error compiling {} shader:\n{}", debug_type, shader_error);
}
}
return shader_id;
diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h
index 2036a06a9..0e4d782e2 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.h
+++ b/src/video_core/renderer_opengl/gl_shader_util.h
@@ -29,7 +29,7 @@ void LogShaderSource(T... shaders) {
std::string source(source_length, ' ');
glGetShaderSource(shader, source_length, nullptr, &source[0]);
- NGLOG_INFO(Render_OpenGL, "Shader source {}", source);
+ LOG_INFO(Render_OpenGL, "Shader source {}", source);
}
}
@@ -49,7 +49,7 @@ GLuint LoadShader(const char* source, GLenum type);
template <typename... T>
GLuint LoadProgram(bool separable_program, T... shaders) {
// Link the program
- NGLOG_DEBUG(Render_OpenGL, "Linking program...");
+ LOG_DEBUG(Render_OpenGL, "Linking program...");
GLuint program_id = glCreateProgram();
@@ -71,9 +71,9 @@ GLuint LoadProgram(bool separable_program, T... shaders) {
std::string program_error(info_log_length, ' ');
glGetProgramInfoLog(program_id, info_log_length, nullptr, &program_error[0]);
if (result == GL_TRUE) {
- NGLOG_DEBUG(Render_OpenGL, "{}", program_error);
+ LOG_DEBUG(Render_OpenGL, "{}", program_error);
} else {
- NGLOG_ERROR(Render_OpenGL, "Error linking shader:\n{}", program_error);
+ LOG_ERROR(Render_OpenGL, "Error linking shader:\n{}", program_error);
}
}
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 44f0c8a01..2e8a422a8 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -48,24 +48,9 @@ OpenGLState::OpenGLState() {
logic_op = GL_COPY;
for (auto& texture_unit : texture_units) {
- texture_unit.texture_2d = 0;
- texture_unit.sampler = 0;
- texture_unit.swizzle.r = GL_RED;
- texture_unit.swizzle.g = GL_GREEN;
- texture_unit.swizzle.b = GL_BLUE;
- texture_unit.swizzle.a = GL_ALPHA;
+ texture_unit.Reset();
}
- lighting_lut.texture_buffer = 0;
-
- fog_lut.texture_buffer = 0;
-
- proctex_lut.texture_buffer = 0;
- proctex_diff_lut.texture_buffer = 0;
- proctex_color_map.texture_buffer = 0;
- proctex_alpha_map.texture_buffer = 0;
- proctex_noise_lut.texture_buffer = 0;
-
draw.read_framebuffer = 0;
draw.draw_framebuffer = 0;
draw.vertex_array = 0;
@@ -196,13 +181,13 @@ void OpenGLState::Apply() const {
}
// Textures
- for (size_t i = 0; i < std::size(texture_units); ++i) {
+ for (int i = 0; i < std::size(texture_units); ++i) {
if (texture_units[i].texture_2d != cur_state.texture_units[i].texture_2d) {
glActiveTexture(TextureUnits::MaxwellTexture(i).Enum());
glBindTexture(GL_TEXTURE_2D, texture_units[i].texture_2d);
}
if (texture_units[i].sampler != cur_state.texture_units[i].sampler) {
- glBindSampler(i, texture_units[i].sampler);
+ glBindSampler(static_cast<GLuint>(i), texture_units[i].sampler);
}
// Update the texture swizzle
if (texture_units[i].swizzle.r != cur_state.texture_units[i].swizzle.r ||
@@ -223,54 +208,12 @@ void OpenGLState::Apply() const {
if (current.enabled != new_state.enabled || current.bindpoint != new_state.bindpoint ||
current.ssbo != new_state.ssbo) {
if (new_state.enabled) {
- glBindBufferBase(GL_SHADER_STORAGE_BUFFER, new_state.bindpoint, new_state.ssbo);
+ glBindBufferBase(GL_UNIFORM_BUFFER, new_state.bindpoint, new_state.ssbo);
}
}
}
}
- // Lighting LUTs
- if (lighting_lut.texture_buffer != cur_state.lighting_lut.texture_buffer) {
- glActiveTexture(TextureUnits::LightingLUT.Enum());
- glBindTexture(GL_TEXTURE_BUFFER, lighting_lut.texture_buffer);
- }
-
- // Fog LUT
- if (fog_lut.texture_buffer != cur_state.fog_lut.texture_buffer) {
- glActiveTexture(TextureUnits::FogLUT.Enum());
- glBindTexture(GL_TEXTURE_BUFFER, fog_lut.texture_buffer);
- }
-
- // ProcTex Noise LUT
- if (proctex_noise_lut.texture_buffer != cur_state.proctex_noise_lut.texture_buffer) {
- glActiveTexture(TextureUnits::ProcTexNoiseLUT.Enum());
- glBindTexture(GL_TEXTURE_BUFFER, proctex_noise_lut.texture_buffer);
- }
-
- // ProcTex Color Map
- if (proctex_color_map.texture_buffer != cur_state.proctex_color_map.texture_buffer) {
- glActiveTexture(TextureUnits::ProcTexColorMap.Enum());
- glBindTexture(GL_TEXTURE_BUFFER, proctex_color_map.texture_buffer);
- }
-
- // ProcTex Alpha Map
- if (proctex_alpha_map.texture_buffer != cur_state.proctex_alpha_map.texture_buffer) {
- glActiveTexture(TextureUnits::ProcTexAlphaMap.Enum());
- glBindTexture(GL_TEXTURE_BUFFER, proctex_alpha_map.texture_buffer);
- }
-
- // ProcTex LUT
- if (proctex_lut.texture_buffer != cur_state.proctex_lut.texture_buffer) {
- glActiveTexture(TextureUnits::ProcTexLUT.Enum());
- glBindTexture(GL_TEXTURE_BUFFER, proctex_lut.texture_buffer);
- }
-
- // ProcTex Diff LUT
- if (proctex_diff_lut.texture_buffer != cur_state.proctex_diff_lut.texture_buffer) {
- glActiveTexture(TextureUnits::ProcTexDiffLUT.Enum());
- glBindTexture(GL_TEXTURE_BUFFER, proctex_diff_lut.texture_buffer);
- }
-
// Framebuffer
if (draw.read_framebuffer != cur_state.draw.read_framebuffer) {
glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer);
@@ -338,26 +281,12 @@ void OpenGLState::Apply() const {
cur_state = *this;
}
-OpenGLState& OpenGLState::ResetTexture(GLuint handle) {
+OpenGLState& OpenGLState::UnbindTexture(GLuint handle) {
for (auto& unit : texture_units) {
if (unit.texture_2d == handle) {
- unit.texture_2d = 0;
+ unit.Unbind();
}
}
- if (lighting_lut.texture_buffer == handle)
- lighting_lut.texture_buffer = 0;
- if (fog_lut.texture_buffer == handle)
- fog_lut.texture_buffer = 0;
- if (proctex_noise_lut.texture_buffer == handle)
- proctex_noise_lut.texture_buffer = 0;
- if (proctex_color_map.texture_buffer == handle)
- proctex_color_map.texture_buffer = 0;
- if (proctex_alpha_map.texture_buffer == handle)
- proctex_alpha_map.texture_buffer = 0;
- if (proctex_lut.texture_buffer == handle)
- proctex_lut.texture_buffer = 0;
- if (proctex_diff_lut.texture_buffer == handle)
- proctex_diff_lut.texture_buffer = 0;
return *this;
}
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 839e50e93..3398d7c04 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -91,35 +91,20 @@ public:
GLint b; // GL_TEXTURE_SWIZZLE_B
GLint a; // GL_TEXTURE_SWIZZLE_A
} swizzle;
- } texture_units[32];
-
- struct {
- GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
- } lighting_lut;
-
- struct {
- GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
- } fog_lut;
-
- struct {
- GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
- } proctex_noise_lut;
- struct {
- GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
- } proctex_color_map;
-
- struct {
- GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
- } proctex_alpha_map;
-
- struct {
- GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
- } proctex_lut;
-
- struct {
- GLuint texture_buffer; // GL_TEXTURE_BINDING_BUFFER
- } proctex_diff_lut;
+ void Unbind() {
+ texture_2d = 0;
+ swizzle.r = GL_RED;
+ swizzle.g = GL_GREEN;
+ swizzle.b = GL_BLUE;
+ swizzle.a = GL_ALPHA;
+ }
+
+ void Reset() {
+ Unbind();
+ sampler = 0;
+ }
+ } texture_units[32];
struct {
GLuint read_framebuffer; // GL_READ_FRAMEBUFFER_BINDING
@@ -165,7 +150,7 @@ public:
void Apply() const;
/// Resets any references to the given resource
- OpenGLState& ResetTexture(GLuint handle);
+ OpenGLState& UnbindTexture(GLuint handle);
OpenGLState& ResetSampler(GLuint handle);
OpenGLState& ResetProgram(GLuint handle);
OpenGLState& ResetPipeline(GLuint handle);
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 2155fb019..e19c3b280 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -29,9 +29,13 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
switch (attrib.size) {
case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
return GL_UNSIGNED_BYTE;
+ case Maxwell::VertexAttribute::Size::Size_16_16:
+ return GL_UNSIGNED_SHORT;
+ case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+ return GL_UNSIGNED_INT_2_10_10_10_REV;
}
- NGLOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
UNREACHABLE();
return {};
}
@@ -41,9 +45,13 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
switch (attrib.size) {
case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
return GL_BYTE;
+ case Maxwell::VertexAttribute::Size::Size_16_16:
+ return GL_SHORT;
+ case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+ return GL_INT_2_10_10_10_REV;
}
- NGLOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
UNREACHABLE();
return {};
}
@@ -52,7 +60,7 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
return GL_FLOAT;
}
- NGLOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
UNREACHABLE();
return {};
}
@@ -66,7 +74,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
case Maxwell::IndexFormat::UnsignedInt:
return GL_UNSIGNED_INT;
}
- NGLOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format));
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format));
UNREACHABLE();
return {};
}
@@ -78,7 +86,7 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
case Maxwell::PrimitiveTopology::TriangleStrip:
return GL_TRIANGLE_STRIP;
}
- NGLOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology));
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology));
UNREACHABLE();
return {};
}
@@ -90,8 +98,8 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode) {
case Tegra::Texture::TextureFilter::Nearest:
return GL_NEAREST;
}
- NGLOG_CRITICAL(Render_OpenGL, "Unimplemented texture filter mode={}",
- static_cast<u32>(filter_mode));
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented texture filter mode={}",
+ static_cast<u32>(filter_mode));
UNREACHABLE();
return {};
}
@@ -110,8 +118,7 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
// manually mix them. However the shader part of this is not yet implemented.
return GL_CLAMP_TO_BORDER;
}
- NGLOG_CRITICAL(Render_OpenGL, "Unimplemented texture wrap mode={}",
- static_cast<u32>(wrap_mode));
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
UNREACHABLE();
return {};
}
@@ -129,7 +136,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
case Maxwell::Blend::Equation::Max:
return GL_MAX;
}
- NGLOG_CRITICAL(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation));
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation));
UNREACHABLE();
return {};
}
@@ -175,7 +182,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
case Maxwell::Blend::Factor::OneMinusConstantAlpha:
return GL_ONE_MINUS_CONSTANT_ALPHA;
}
- NGLOG_CRITICAL(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor));
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor));
UNREACHABLE();
return {};
}
@@ -196,7 +203,65 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) {
case Tegra::Texture::SwizzleSource::OneFloat:
return GL_ONE;
}
- NGLOG_CRITICAL(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source));
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source));
+ UNREACHABLE();
+ return {};
+}
+
+inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) {
+ switch (comparison) {
+ case Maxwell::ComparisonOp::Never:
+ case Maxwell::ComparisonOp::NeverOld:
+ return GL_NEVER;
+ case Maxwell::ComparisonOp::Less:
+ case Maxwell::ComparisonOp::LessOld:
+ return GL_LESS;
+ case Maxwell::ComparisonOp::Equal:
+ case Maxwell::ComparisonOp::EqualOld:
+ return GL_EQUAL;
+ case Maxwell::ComparisonOp::LessEqual:
+ case Maxwell::ComparisonOp::LessEqualOld:
+ return GL_LEQUAL;
+ case Maxwell::ComparisonOp::Greater:
+ case Maxwell::ComparisonOp::GreaterOld:
+ return GL_GREATER;
+ case Maxwell::ComparisonOp::NotEqual:
+ case Maxwell::ComparisonOp::NotEqualOld:
+ return GL_NOTEQUAL;
+ case Maxwell::ComparisonOp::GreaterEqual:
+ case Maxwell::ComparisonOp::GreaterEqualOld:
+ return GL_GEQUAL;
+ case Maxwell::ComparisonOp::Always:
+ case Maxwell::ComparisonOp::AlwaysOld:
+ return GL_ALWAYS;
+ }
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison));
+ UNREACHABLE();
+ return {};
+}
+
+inline GLenum FrontFace(Maxwell::Cull::FrontFace front_face) {
+ switch (front_face) {
+ case Maxwell::Cull::FrontFace::ClockWise:
+ return GL_CW;
+ case Maxwell::Cull::FrontFace::CounterClockWise:
+ return GL_CCW;
+ }
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face));
+ UNREACHABLE();
+ return {};
+}
+
+inline GLenum CullFace(Maxwell::Cull::CullFace cull_face) {
+ switch (cull_face) {
+ case Maxwell::Cull::CullFace::Front:
+ return GL_FRONT;
+ case Maxwell::Cull::CullFace::Back:
+ return GL_BACK;
+ case Maxwell::Cull::CullFace::FrontAndBack:
+ return GL_FRONT_AND_BACK;
+ }
+ LOG_CRITICAL(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face));
UNREACHABLE();
return {};
}
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index f33766bfd..00841e937 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -150,7 +150,6 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
screen_info)) {
// Reset the screen info's display texture to its own permanent texture
screen_info.display_texture = screen_info.texture.resource.handle;
- screen_info.display_texcoords = MathUtil::Rectangle<float>(0.f, 0.f, 1.f, 1.f);
Memory::RasterizerFlushVirtualRegion(framebuffer_addr, size_in_bytes,
Memory::FlushMode::Flush);
@@ -302,8 +301,8 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
right = texcoords.left;
} else {
// Other transformations are unsupported
- NGLOG_CRITICAL(Render_OpenGL, "Unsupported framebuffer_transform_flags={}",
- static_cast<u32>(framebuffer_transform_flags));
+ LOG_CRITICAL(Render_OpenGL, "Unsupported framebuffer_transform_flags={}",
+ static_cast<u32>(framebuffer_transform_flags));
UNIMPLEMENTED();
}
}
@@ -405,14 +404,14 @@ static void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum
switch (severity) {
case GL_DEBUG_SEVERITY_HIGH:
- NGLOG_ERROR(Render_OpenGL, format, str_source, str_type, id, message);
+ LOG_ERROR(Render_OpenGL, format, str_source, str_type, id, message);
break;
case GL_DEBUG_SEVERITY_MEDIUM:
- NGLOG_WARNING(Render_OpenGL, format, str_source, str_type, id, message);
+ LOG_WARNING(Render_OpenGL, format, str_source, str_type, id, message);
break;
case GL_DEBUG_SEVERITY_NOTIFICATION:
case GL_DEBUG_SEVERITY_LOW:
- NGLOG_DEBUG(Render_OpenGL, format, str_source, str_type, id, message);
+ LOG_DEBUG(Render_OpenGL, format, str_source, str_type, id, message);
break;
}
}
@@ -430,9 +429,9 @@ bool RendererOpenGL::Init() {
const char* gpu_vendor{reinterpret_cast<char const*>(glGetString(GL_VENDOR))};
const char* gpu_model{reinterpret_cast<char const*>(glGetString(GL_RENDERER))};
- NGLOG_INFO(Render_OpenGL, "GL_VERSION: {}", gl_version);
- NGLOG_INFO(Render_OpenGL, "GL_VENDOR: {}", gpu_vendor);
- NGLOG_INFO(Render_OpenGL, "GL_RENDERER: {}", gpu_model);
+ LOG_INFO(Render_OpenGL, "GL_VERSION: {}", gl_version);
+ LOG_INFO(Render_OpenGL, "GL_VENDOR: {}", gpu_vendor);
+ LOG_INFO(Render_OpenGL, "GL_RENDERER: {}", gpu_model);
Core::Telemetry().AddField(Telemetry::FieldType::UserSystem, "GPU_Vendor", gpu_vendor);
Core::Telemetry().AddField(Telemetry::FieldType::UserSystem, "GPU_Model", gpu_model);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 2cc6d9a00..21f0d298c 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -27,7 +27,7 @@ struct TextureInfo {
/// Structure used for storing information about the display target for the Switch screen
struct ScreenInfo {
GLuint display_texture;
- MathUtil::Rectangle<float> display_texcoords;
+ const MathUtil::Rectangle<float> display_texcoords{0.0f, 0.0f, 1.0f, 1.0f};
TextureInfo texture;
};
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
new file mode 100644
index 000000000..3c4ad1c9d
--- /dev/null
+++ b/src/video_core/textures/astc.cpp
@@ -0,0 +1,1646 @@
+// Copyright 2016 The University of North Carolina at Chapel Hill
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Please send all BUG REPORTS to <pavel@cs.unc.edu>.
+// <http://gamma.cs.unc.edu/FasTC/>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "video_core/textures/astc.h"
+
+class BitStream {
+public:
+ BitStream(unsigned char* ptr, int nBits = 0, int start_offset = 0)
+ : m_BitsWritten(0), m_BitsRead(0), m_NumBits(nBits), m_CurByte(ptr),
+ m_NextBit(start_offset % 8), done(false) {}
+
+ int GetBitsWritten() const {
+ return m_BitsWritten;
+ }
+
+ ~BitStream() {}
+
+ void WriteBitsR(unsigned int val, unsigned int nBits) {
+ for (unsigned int i = 0; i < nBits; i++) {
+ WriteBit((val >> (nBits - i - 1)) & 1);
+ }
+ }
+
+ void WriteBits(unsigned int val, unsigned int nBits) {
+ for (unsigned int i = 0; i < nBits; i++) {
+ WriteBit((val >> i) & 1);
+ }
+ }
+
+ int GetBitsRead() const {
+ return m_BitsRead;
+ }
+
+ int ReadBit() {
+
+ int bit = *m_CurByte >> m_NextBit++;
+ while (m_NextBit >= 8) {
+ m_NextBit -= 8;
+ m_CurByte++;
+ }
+
+ m_BitsRead++;
+ return bit & 1;
+ }
+
+ unsigned int ReadBits(unsigned int nBits) {
+ unsigned int ret = 0;
+ for (unsigned int i = 0; i < nBits; i++) {
+ ret |= (ReadBit() & 1) << i;
+ }
+ return ret;
+ }
+
+private:
+ void WriteBit(int b) {
+
+ if (done)
+ return;
+
+ const unsigned int mask = 1 << m_NextBit++;
+
+ // clear the bit
+ *m_CurByte &= ~mask;
+
+ // Write the bit, if necessary
+ if (b)
+ *m_CurByte |= mask;
+
+ // Next byte?
+ if (m_NextBit >= 8) {
+ m_CurByte += 1;
+ m_NextBit = 0;
+ }
+
+ done = done || ++m_BitsWritten >= m_NumBits;
+ }
+
+ int m_BitsWritten;
+ const int m_NumBits;
+ unsigned char* m_CurByte;
+ int m_NextBit;
+ int m_BitsRead;
+
+ bool done;
+};
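The BitStream above consumes bits LSB-first from each byte, and ReadBits() places the first bit read in the least significant position of the result. A minimal standalone sketch of that read order (the byte values are arbitrary examples, not ASTC data):

    #include <cstdint>
    #include <iostream>

    int main() {
        const uint8_t data[2] = {0xB5, 0x01}; // 0xB5 == 0b1011'0101
        uint32_t bit_pos = 0;
        auto read_bit = [&]() -> uint32_t {
            const uint32_t bit = (data[bit_pos / 8] >> (bit_pos % 8)) & 1;
            ++bit_pos;
            return bit;
        };
        auto read_bits = [&](uint32_t n) {
            uint32_t value = 0;
            for (uint32_t i = 0; i < n; ++i)
                value |= read_bit() << i;
            return value;
        };
        std::cout << read_bits(3) << '\n'; // 5  (bits 1,0,1 of 0xB5, LSB first)
        std::cout << read_bits(5) << '\n'; // 22 (remaining bits 0,1,1,0,1 -> 0b10110)
    }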
+
+template <typename IntType>
+class Bits {
+private:
+ const IntType& m_Bits;
+
+ // Don't copy
+ Bits() {}
+ Bits(const Bits&) {}
+ Bits& operator=(const Bits&) {}
+
+public:
+ explicit Bits(IntType& v) : m_Bits(v) {}
+
+ uint8_t operator[](uint32_t bitPos) {
+ return static_cast<uint8_t>((m_Bits >> bitPos) & 1);
+ }
+
+ IntType operator()(uint32_t start, uint32_t end) {
+ if (start == end) {
+ return (*this)[start];
+ } else if (start > end) {
+ uint32_t t = start;
+ start = end;
+ end = t;
+ }
+
+ uint64_t mask = (1 << (end - start + 1)) - 1;
+ return (m_Bits >> start) & mask;
+ }
+};
+
+enum EIntegerEncoding { eIntegerEncoding_JustBits, eIntegerEncoding_Quint, eIntegerEncoding_Trit };
+
+class IntegerEncodedValue {
+private:
+ const EIntegerEncoding m_Encoding;
+ const uint32_t m_NumBits;
+ uint32_t m_BitValue;
+ union {
+ uint32_t m_QuintValue;
+ uint32_t m_TritValue;
+ };
+
+public:
+ // Jank, but we're not doing any heavy lifting in this class, so it's
+ // probably OK. It allows us to use these in std::vectors...
+ IntegerEncodedValue& operator=(const IntegerEncodedValue& other) {
+ new (this) IntegerEncodedValue(other);
+ return *this;
+ }
+
+ IntegerEncodedValue(EIntegerEncoding encoding, uint32_t numBits)
+ : m_Encoding(encoding), m_NumBits(numBits) {}
+
+ EIntegerEncoding GetEncoding() const {
+ return m_Encoding;
+ }
+ uint32_t BaseBitLength() const {
+ return m_NumBits;
+ }
+
+ uint32_t GetBitValue() const {
+ return m_BitValue;
+ }
+ void SetBitValue(uint32_t val) {
+ m_BitValue = val;
+ }
+
+ uint32_t GetTritValue() const {
+ return m_TritValue;
+ }
+ void SetTritValue(uint32_t val) {
+ m_TritValue = val;
+ }
+
+ uint32_t GetQuintValue() const {
+ return m_QuintValue;
+ }
+ void SetQuintValue(uint32_t val) {
+ m_QuintValue = val;
+ }
+
+ bool MatchesEncoding(const IntegerEncodedValue& other) {
+ return m_Encoding == other.m_Encoding && m_NumBits == other.m_NumBits;
+ }
+
+ // Returns the number of bits required to encode nVals values.
+ uint32_t GetBitLength(uint32_t nVals) {
+ uint32_t totalBits = m_NumBits * nVals;
+ if (m_Encoding == eIntegerEncoding_Trit) {
+ totalBits += (nVals * 8 + 4) / 5;
+ } else if (m_Encoding == eIntegerEncoding_Quint) {
+ totalBits += (nVals * 7 + 2) / 3;
+ }
+ return totalBits;
+ }
+
+ // Count the number of bits set in a number.
+ static inline uint32_t Popcnt(uint32_t n) {
+ uint32_t c;
+ for (c = 0; n; c++) {
+ n &= n - 1;
+ }
+ return c;
+ }
+
+ // Returns a new instance of this struct that corresponds to the
+ // can take no more than maxval values
+ static IntegerEncodedValue CreateEncoding(uint32_t maxVal) {
+ while (maxVal > 0) {
+ uint32_t check = maxVal + 1;
+
+ // Is maxVal a power of two?
+ if (!(check & (check - 1))) {
+ return IntegerEncodedValue(eIntegerEncoding_JustBits, Popcnt(maxVal));
+ }
+
+ // Is maxVal of the type 3*2^n - 1?
+ if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
+ return IntegerEncodedValue(eIntegerEncoding_Trit, Popcnt(check / 3 - 1));
+ }
+
+ // Is maxVal of the type 5*2^n - 1?
+ if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
+ return IntegerEncodedValue(eIntegerEncoding_Quint, Popcnt(check / 5 - 1));
+ }
+
+ // Apparently it can't be represented with a bounded integer sequence...
+ // just iterate.
+ maxVal--;
+ }
+ return IntegerEncodedValue(eIntegerEncoding_JustBits, 0);
+ }
+
+ // Fills result with the values that are encoded in the given
+ // bitstream. We must know beforehand what the maximum possible
+ // value is, and how many values we're decoding.
+ static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, BitStream& bits,
+ uint32_t maxRange, uint32_t nValues) {
+ // Determine encoding parameters
+ IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(maxRange);
+
+ // Start decoding
+ uint32_t nValsDecoded = 0;
+ while (nValsDecoded < nValues) {
+ switch (val.GetEncoding()) {
+ case eIntegerEncoding_Quint:
+ DecodeQuintBlock(bits, result, val.BaseBitLength());
+ nValsDecoded += 3;
+ break;
+
+ case eIntegerEncoding_Trit:
+ DecodeTritBlock(bits, result, val.BaseBitLength());
+ nValsDecoded += 5;
+ break;
+
+ case eIntegerEncoding_JustBits:
+ val.SetBitValue(bits.ReadBits(val.BaseBitLength()));
+ result.push_back(val);
+ nValsDecoded++;
+ break;
+ }
+ }
+ }
+
+private:
+ static void DecodeTritBlock(BitStream& bits, std::vector<IntegerEncodedValue>& result,
+ uint32_t nBitsPerValue) {
+ // Implement the algorithm in section C.2.12
+ uint32_t m[5];
+ uint32_t t[5];
+ uint32_t T;
+
+ // Read the trit encoded block according to
+ // table C.2.14
+ m[0] = bits.ReadBits(nBitsPerValue);
+ T = bits.ReadBits(2);
+ m[1] = bits.ReadBits(nBitsPerValue);
+ T |= bits.ReadBits(2) << 2;
+ m[2] = bits.ReadBits(nBitsPerValue);
+ T |= bits.ReadBit() << 4;
+ m[3] = bits.ReadBits(nBitsPerValue);
+ T |= bits.ReadBits(2) << 5;
+ m[4] = bits.ReadBits(nBitsPerValue);
+ T |= bits.ReadBit() << 7;
+
+ uint32_t C = 0;
+
+ Bits<uint32_t> Tb(T);
+ if (Tb(2, 4) == 7) {
+ C = (Tb(5, 7) << 2) | Tb(0, 1);
+ t[4] = t[3] = 2;
+ } else {
+ C = Tb(0, 4);
+ if (Tb(5, 6) == 3) {
+ t[4] = 2;
+ t[3] = Tb[7];
+ } else {
+ t[4] = Tb[7];
+ t[3] = Tb(5, 6);
+ }
+ }
+
+ Bits<uint32_t> Cb(C);
+ if (Cb(0, 1) == 3) {
+ t[2] = 2;
+ t[1] = Cb[4];
+ t[0] = (Cb[3] << 1) | (Cb[2] & ~Cb[3]);
+ } else if (Cb(2, 3) == 3) {
+ t[2] = 2;
+ t[1] = 2;
+ t[0] = Cb(0, 1);
+ } else {
+ t[2] = Cb[4];
+ t[1] = Cb(2, 3);
+ t[0] = (Cb[1] << 1) | (Cb[0] & ~Cb[1]);
+ }
+
+ for (uint32_t i = 0; i < 5; i++) {
+ IntegerEncodedValue val(eIntegerEncoding_Trit, nBitsPerValue);
+ val.SetBitValue(m[i]);
+ val.SetTritValue(t[i]);
+ result.push_back(val);
+ }
+ }
+
+ static void DecodeQuintBlock(BitStream& bits, std::vector<IntegerEncodedValue>& result,
+ uint32_t nBitsPerValue) {
+ // Implement the algorithm in section C.2.12
+ uint32_t m[3];
+ uint32_t q[3];
+ uint32_t Q;
+
+        // Read the quint encoded block according to
+        // table C.2.15
+ m[0] = bits.ReadBits(nBitsPerValue);
+ Q = bits.ReadBits(3);
+ m[1] = bits.ReadBits(nBitsPerValue);
+ Q |= bits.ReadBits(2) << 3;
+ m[2] = bits.ReadBits(nBitsPerValue);
+ Q |= bits.ReadBits(2) << 5;
+
+ Bits<uint32_t> Qb(Q);
+ if (Qb(1, 2) == 3 && Qb(5, 6) == 0) {
+ q[0] = q[1] = 4;
+ q[2] = (Qb[0] << 2) | ((Qb[4] & ~Qb[0]) << 1) | (Qb[3] & ~Qb[0]);
+ } else {
+ uint32_t C = 0;
+ if (Qb(1, 2) == 3) {
+ q[2] = 4;
+ C = (Qb(3, 4) << 3) | ((~Qb(5, 6) & 3) << 1) | Qb[0];
+ } else {
+ q[2] = Qb(5, 6);
+ C = Qb(0, 4);
+ }
+
+ Bits<uint32_t> Cb(C);
+ if (Cb(0, 2) == 5) {
+ q[1] = 4;
+ q[0] = Cb(3, 4);
+ } else {
+ q[1] = Cb(3, 4);
+ q[0] = Cb(0, 2);
+ }
+ }
+
+ for (uint32_t i = 0; i < 3; i++) {
+ IntegerEncodedValue val(eIntegerEncoding_Quint, nBitsPerValue);
+ val.m_BitValue = m[i];
+ val.m_QuintValue = q[i];
+ result.push_back(val);
+ }
+ }
+};
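As a quick cross-check of CreateEncoding's range selection (and of the bit cost reported by GetBitLength), here is a small standalone sketch; EncodingFor and IsPow2 are illustrative helper names, not part of the decoder:

    #include <cstdint>
    #include <iostream>

    // maxVal + 1 a power of two  -> plain bits
    // maxVal + 1 == 3 * 2^n      -> trit + n bits
    // maxVal + 1 == 5 * 2^n      -> quint + n bits
    static bool IsPow2(uint32_t v) {
        return v != 0 && (v & (v - 1)) == 0;
    }

    static const char* EncodingFor(uint32_t maxVal) {
        const uint32_t check = maxVal + 1;
        if (IsPow2(check))
            return "just bits";
        if (check % 3 == 0 && IsPow2(check / 3))
            return "trit + bits";
        if (check % 5 == 0 && IsPow2(check / 5))
            return "quint + bits";
        return "no exact match (CreateEncoding retries with maxVal - 1)";
    }

    int main() {
        std::cout << "7   -> " << EncodingFor(7) << '\n';   // just bits (3 bits)
        std::cout << "11  -> " << EncodingFor(11) << '\n';  // trit + 2 bits
        std::cout << "9   -> " << EncodingFor(9) << '\n';   // quint + 1 bit
        std::cout << "255 -> " << EncodingFor(255) << '\n'; // just bits (8 bits)
        // GetBitLength example: trit encoding with 2 extra bits over 10 values
        // costs 2 * 10 + (10 * 8 + 4) / 5 = 36 bits in total.
    }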
+
+namespace ASTCC {
+
+struct TexelWeightParams {
+ uint32_t m_Width;
+ uint32_t m_Height;
+ bool m_bDualPlane;
+ uint32_t m_MaxWeight;
+ bool m_bError;
+ bool m_bVoidExtentLDR;
+ bool m_bVoidExtentHDR;
+
+ TexelWeightParams() {
+ memset(this, 0, sizeof(*this));
+ }
+
+ uint32_t GetPackedBitSize() {
+ // How many indices do we have?
+ uint32_t nIdxs = m_Height * m_Width;
+ if (m_bDualPlane) {
+ nIdxs *= 2;
+ }
+
+ return IntegerEncodedValue::CreateEncoding(m_MaxWeight).GetBitLength(nIdxs);
+ }
+
+ uint32_t GetNumWeightValues() const {
+ uint32_t ret = m_Width * m_Height;
+ if (m_bDualPlane) {
+ ret *= 2;
+ }
+ return ret;
+ }
+};
+
+TexelWeightParams DecodeBlockInfo(BitStream& strm) {
+ TexelWeightParams params;
+
+ // Read the entire block mode all at once
+ uint16_t modeBits = strm.ReadBits(11);
+
+ // Does this match the void extent block mode?
+ if ((modeBits & 0x01FF) == 0x1FC) {
+ if (modeBits & 0x200) {
+ params.m_bVoidExtentHDR = true;
+ } else {
+ params.m_bVoidExtentLDR = true;
+ }
+
+ // Next two bits must be one.
+ if (!(modeBits & 0x400) || !strm.ReadBit()) {
+ params.m_bError = true;
+ }
+
+ return params;
+ }
+
+    // First check whether the low four bits are all zero (reserved block mode)
+ if ((modeBits & 0xF) == 0) {
+ params.m_bError = true;
+ return params;
+ }
+
+ // If the last two bits are zero, then if bits
+ // [6-8] are all ones, this is also reserved.
+ if ((modeBits & 0x3) == 0 && (modeBits & 0x1C0) == 0x1C0) {
+ params.m_bError = true;
+ return params;
+ }
+
+ // Otherwise, there is no error... Figure out the layout
+ // of the block mode. Layout is determined by a number
+ // between 0 and 9 corresponding to table C.2.8 of the
+ // ASTC spec.
+ uint32_t layout = 0;
+
+ if ((modeBits & 0x1) || (modeBits & 0x2)) {
+ // layout is in [0-4]
+ if (modeBits & 0x8) {
+ // layout is in [2-4]
+ if (modeBits & 0x4) {
+ // layout is in [3-4]
+ if (modeBits & 0x100) {
+ layout = 4;
+ } else {
+ layout = 3;
+ }
+ } else {
+ layout = 2;
+ }
+ } else {
+ // layout is in [0-1]
+ if (modeBits & 0x4) {
+ layout = 1;
+ } else {
+ layout = 0;
+ }
+ }
+ } else {
+ // layout is in [5-9]
+ if (modeBits & 0x100) {
+ // layout is in [7-9]
+ if (modeBits & 0x80) {
+ // layout is in [7-8]
+ assert((modeBits & 0x40) == 0U);
+ if (modeBits & 0x20) {
+ layout = 8;
+ } else {
+ layout = 7;
+ }
+ } else {
+ layout = 9;
+ }
+ } else {
+ // layout is in [5-6]
+ if (modeBits & 0x80) {
+ layout = 6;
+ } else {
+ layout = 5;
+ }
+ }
+ }
+
+ assert(layout < 10);
+
+ // Determine R
+ uint32_t R = !!(modeBits & 0x10);
+ if (layout < 5) {
+ R |= (modeBits & 0x3) << 1;
+ } else {
+ R |= (modeBits & 0xC) >> 1;
+ }
+ assert(2 <= R && R <= 7);
+
+ // Determine width & height
+ switch (layout) {
+ case 0: {
+ uint32_t A = (modeBits >> 5) & 0x3;
+ uint32_t B = (modeBits >> 7) & 0x3;
+ params.m_Width = B + 4;
+ params.m_Height = A + 2;
+ break;
+ }
+
+ case 1: {
+ uint32_t A = (modeBits >> 5) & 0x3;
+ uint32_t B = (modeBits >> 7) & 0x3;
+ params.m_Width = B + 8;
+ params.m_Height = A + 2;
+ break;
+ }
+
+ case 2: {
+ uint32_t A = (modeBits >> 5) & 0x3;
+ uint32_t B = (modeBits >> 7) & 0x3;
+ params.m_Width = A + 2;
+ params.m_Height = B + 8;
+ break;
+ }
+
+ case 3: {
+ uint32_t A = (modeBits >> 5) & 0x3;
+ uint32_t B = (modeBits >> 7) & 0x1;
+ params.m_Width = A + 2;
+ params.m_Height = B + 6;
+ break;
+ }
+
+ case 4: {
+ uint32_t A = (modeBits >> 5) & 0x3;
+ uint32_t B = (modeBits >> 7) & 0x1;
+ params.m_Width = B + 2;
+ params.m_Height = A + 2;
+ break;
+ }
+
+ case 5: {
+ uint32_t A = (modeBits >> 5) & 0x3;
+ params.m_Width = 12;
+ params.m_Height = A + 2;
+ break;
+ }
+
+ case 6: {
+ uint32_t A = (modeBits >> 5) & 0x3;
+ params.m_Width = A + 2;
+ params.m_Height = 12;
+ break;
+ }
+
+ case 7: {
+ params.m_Width = 6;
+ params.m_Height = 10;
+ break;
+ }
+
+ case 8: {
+ params.m_Width = 10;
+ params.m_Height = 6;
+ break;
+ }
+
+ case 9: {
+ uint32_t A = (modeBits >> 5) & 0x3;
+ uint32_t B = (modeBits >> 9) & 0x3;
+ params.m_Width = A + 6;
+ params.m_Height = B + 6;
+ break;
+ }
+
+ default:
+ assert(!"Don't know this layout...");
+ params.m_bError = true;
+ break;
+ }
+
+ // Determine whether or not we're using dual planes
+ // and/or high precision layouts.
+ bool D = (layout != 9) && (modeBits & 0x400);
+ bool H = (layout != 9) && (modeBits & 0x200);
+
+ if (H) {
+ const uint32_t maxWeights[6] = {9, 11, 15, 19, 23, 31};
+ params.m_MaxWeight = maxWeights[R - 2];
+ } else {
+ const uint32_t maxWeights[6] = {1, 2, 3, 4, 5, 7};
+ params.m_MaxWeight = maxWeights[R - 2];
+ }
+
+ params.m_bDualPlane = D;
+
+ return params;
+}
+
+void FillVoidExtentLDR(BitStream& strm, uint32_t* const outBuf, uint32_t blockWidth,
+ uint32_t blockHeight) {
+ // Don't actually care about the void extent, just read the bits...
+ for (int i = 0; i < 4; ++i) {
+ strm.ReadBits(13);
+ }
+
+ // Decode the RGBA components and renormalize them to the range [0, 255]
+ uint16_t r = strm.ReadBits(16);
+ uint16_t g = strm.ReadBits(16);
+ uint16_t b = strm.ReadBits(16);
+ uint16_t a = strm.ReadBits(16);
+
+ uint32_t rgba = (r >> 8) | (g & 0xFF00) | (static_cast<uint32_t>(b) & 0xFF00) << 8 |
+ (static_cast<uint32_t>(a) & 0xFF00) << 16;
+
+ for (uint32_t j = 0; j < blockHeight; j++)
+ for (uint32_t i = 0; i < blockWidth; i++) {
+ outBuf[j * blockWidth + i] = rgba;
+ }
+}
+
+void FillError(uint32_t* outBuf, uint32_t blockWidth, uint32_t blockHeight) {
+ for (uint32_t j = 0; j < blockHeight; j++)
+ for (uint32_t i = 0; i < blockWidth; i++) {
+ outBuf[j * blockWidth + i] = 0xFFFF00FF;
+ }
+}
+
+// Replicates the low numBits of val such that bits [(toBit - 1):(toBit - numBits)]
+// are the same as [(numBits - 1):0], repeating the pattern all the way down to bit 0.
+template <typename IntType>
+IntType Replicate(const IntType& val, uint32_t numBits, uint32_t toBit) {
+ if (numBits == 0)
+ return 0;
+ if (toBit == 0)
+ return 0;
+ IntType v = val & ((1 << numBits) - 1);
+ IntType res = v;
+ uint32_t reslen = numBits;
+ while (reslen < toBit) {
+ uint32_t comp = 0;
+ if (numBits > toBit - reslen) {
+ uint32_t newshift = toBit - reslen;
+ comp = numBits - newshift;
+ numBits = newshift;
+ }
+ res <<= numBits;
+ res |= v >> comp;
+ reslen += numBits;
+ }
+ return res;
+}
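To make the helper above concrete, a self-contained restatement with two checked results; the inputs are illustrative:

    #include <cstdint>
    #include <iostream>

    // Repeats the low numBits of val downward until toBit bits are filled.
    static uint32_t Replicate(uint32_t val, uint32_t numBits, uint32_t toBit) {
        if (numBits == 0 || toBit == 0)
            return 0;
        const uint32_t v = val & ((1u << numBits) - 1);
        uint32_t res = v;
        uint32_t reslen = numBits;
        while (reslen < toBit) {
            uint32_t comp = 0;
            if (numBits > toBit - reslen) {
                const uint32_t newshift = toBit - reslen;
                comp = numBits - newshift;
                numBits = newshift;
            }
            res <<= numBits;
            res |= v >> comp;
            reslen += numBits;
        }
        return res;
    }

    int main() {
        // 0b101 widened to 8 bits -> 0b1011'0110 == 182, i.e. how narrow
        // color components are expanded to a full byte.
        std::cout << Replicate(0b101, 3, 8) << '\n'; // 182
        std::cout << Replicate(0x1, 1, 9) << '\n';   // 511 (the "A" term in C.2.13)
    }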
+
+class Pixel {
+protected:
+ typedef int16_t ChannelType;
+ uint8_t m_BitDepth[4];
+ int16_t color[4];
+
+public:
+ Pixel() {
+ for (int i = 0; i < 4; i++) {
+ m_BitDepth[i] = 8;
+ color[i] = 0;
+ }
+ }
+
+ Pixel(ChannelType a, ChannelType r, ChannelType g, ChannelType b, unsigned bitDepth = 8) {
+ for (int i = 0; i < 4; i++)
+ m_BitDepth[i] = bitDepth;
+
+ color[0] = a;
+ color[1] = r;
+ color[2] = g;
+ color[3] = b;
+ }
+
+ // Changes the depth of each pixel. This scales the values to
+ // the appropriate bit depth by either truncating the least
+ // significant bits when going from larger to smaller bit depth
+ // or by repeating the most significant bits when going from
+ // smaller to larger bit depths.
+ void ChangeBitDepth(const uint8_t (&depth)[4]) {
+ for (uint32_t i = 0; i < 4; i++) {
+ Component(i) = ChangeBitDepth(Component(i), m_BitDepth[i], depth[i]);
+ m_BitDepth[i] = depth[i];
+ }
+ }
+
+ template <typename IntType>
+ static float ConvertChannelToFloat(IntType channel, uint8_t bitDepth) {
+ float denominator = static_cast<float>((1 << bitDepth) - 1);
+ return static_cast<float>(channel) / denominator;
+ }
+
+ // Changes the bit depth of a single component. See the comment
+ // above for how we do this.
+ static ChannelType ChangeBitDepth(Pixel::ChannelType val, uint8_t oldDepth, uint8_t newDepth) {
+ assert(newDepth <= 8);
+ assert(oldDepth <= 8);
+
+ if (oldDepth == newDepth) {
+ // Do nothing
+ return val;
+ } else if (oldDepth == 0 && newDepth != 0) {
+ return (1 << newDepth) - 1;
+ } else if (newDepth > oldDepth) {
+ return Replicate(val, oldDepth, newDepth);
+ } else {
+ // oldDepth > newDepth
+ if (newDepth == 0) {
+ return 0xFF;
+ } else {
+ uint8_t bitsWasted = oldDepth - newDepth;
+ uint16_t v = static_cast<uint16_t>(val);
+ v = (v + (1 << (bitsWasted - 1))) >> bitsWasted;
+ v = ::std::min<uint16_t>(::std::max<uint16_t>(0, v), (1 << newDepth) - 1);
+ return static_cast<uint8_t>(v);
+ }
+ }
+
+ assert(!"We shouldn't get here.");
+ return 0;
+ }
+
+ const ChannelType& A() const {
+ return color[0];
+ }
+ ChannelType& A() {
+ return color[0];
+ }
+ const ChannelType& R() const {
+ return color[1];
+ }
+ ChannelType& R() {
+ return color[1];
+ }
+ const ChannelType& G() const {
+ return color[2];
+ }
+ ChannelType& G() {
+ return color[2];
+ }
+ const ChannelType& B() const {
+ return color[3];
+ }
+ ChannelType& B() {
+ return color[3];
+ }
+ const ChannelType& Component(uint32_t idx) const {
+ return color[idx];
+ }
+ ChannelType& Component(uint32_t idx) {
+ return color[idx];
+ }
+
+ void GetBitDepth(uint8_t (&outDepth)[4]) const {
+ for (int i = 0; i < 4; i++) {
+ outDepth[i] = m_BitDepth[i];
+ }
+ }
+
+ // Take all of the components, transform them to their 8-bit variants,
+ // and then pack each channel into an R8G8B8A8 32-bit integer. We assume
+ // that the architecture is little-endian, so the alpha channel will end
+ // up in the most-significant byte.
+ uint32_t Pack() const {
+ Pixel eightBit(*this);
+ const uint8_t eightBitDepth[4] = {8, 8, 8, 8};
+ eightBit.ChangeBitDepth(eightBitDepth);
+
+ uint32_t r = 0;
+ r |= eightBit.A();
+ r <<= 8;
+ r |= eightBit.B();
+ r <<= 8;
+ r |= eightBit.G();
+ r <<= 8;
+ r |= eightBit.R();
+ return r;
+ }
+
+ // Clamps the pixel to the range [0,255]
+ void ClampByte() {
+ for (uint32_t i = 0; i < 4; i++) {
+ color[i] = (color[i] < 0) ? 0 : ((color[i] > 255) ? 255 : color[i]);
+ }
+ }
+
+ void MakeOpaque() {
+ A() = 255;
+ }
+};
+
+void DecodeColorValues(uint32_t* out, uint8_t* data, uint32_t* modes, const uint32_t nPartitions,
+ const uint32_t nBitsForColorData) {
+ // First figure out how many color values we have
+ uint32_t nValues = 0;
+ for (uint32_t i = 0; i < nPartitions; i++) {
+ nValues += ((modes[i] >> 2) + 1) << 1;
+ }
+
+ // Then based on the number of values and the remaining number of bits,
+ // figure out the max value for each of them...
+ uint32_t range = 256;
+ while (--range > 0) {
+ IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(range);
+ uint32_t bitLength = val.GetBitLength(nValues);
+ if (bitLength <= nBitsForColorData) {
+ // Find the smallest possible range that matches the given encoding
+ while (--range > 0) {
+ IntegerEncodedValue newval = IntegerEncodedValue::CreateEncoding(range);
+ if (!newval.MatchesEncoding(val)) {
+ break;
+ }
+ }
+
+ // Return to last matching range.
+ range++;
+ break;
+ }
+ }
+
+ // We now have enough to decode our integer sequence.
+ std::vector<IntegerEncodedValue> decodedColorValues;
+ BitStream colorStream(data);
+ IntegerEncodedValue::DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
+
+ // Once we have the decoded values, we need to dequantize them to the 0-255 range
+ // This procedure is outlined in ASTC spec C.2.13
+ uint32_t outIdx = 0;
+ std::vector<IntegerEncodedValue>::const_iterator itr;
+ for (itr = decodedColorValues.begin(); itr != decodedColorValues.end(); itr++) {
+ // Have we already decoded all that we need?
+ if (outIdx >= nValues) {
+ break;
+ }
+
+ const IntegerEncodedValue& val = *itr;
+ uint32_t bitlen = val.BaseBitLength();
+ uint32_t bitval = val.GetBitValue();
+
+ assert(bitlen >= 1);
+
+ uint32_t A = 0, B = 0, C = 0, D = 0;
+ // A is just the lsb replicated 9 times.
+ A = Replicate(bitval & 1, 1, 9);
+
+ switch (val.GetEncoding()) {
+ // Replicate bits
+ case eIntegerEncoding_JustBits:
+ out[outIdx++] = Replicate(bitval, bitlen, 8);
+ break;
+
+ // Use algorithm in C.2.13
+ case eIntegerEncoding_Trit: {
+
+ D = val.GetTritValue();
+
+ switch (bitlen) {
+ case 1: {
+ C = 204;
+ } break;
+
+ case 2: {
+ C = 93;
+ // B = b000b0bb0
+ uint32_t b = (bitval >> 1) & 1;
+ B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
+ } break;
+
+ case 3: {
+ C = 44;
+ // B = cb000cbcb
+ uint32_t cb = (bitval >> 1) & 3;
+ B = (cb << 7) | (cb << 2) | cb;
+ } break;
+
+ case 4: {
+ C = 22;
+ // B = dcb000dcb
+ uint32_t dcb = (bitval >> 1) & 7;
+ B = (dcb << 6) | dcb;
+ } break;
+
+ case 5: {
+ C = 11;
+ // B = edcb000ed
+ uint32_t edcb = (bitval >> 1) & 0xF;
+ B = (edcb << 5) | (edcb >> 2);
+ } break;
+
+ case 6: {
+ C = 5;
+ // B = fedcb000f
+ uint32_t fedcb = (bitval >> 1) & 0x1F;
+ B = (fedcb << 4) | (fedcb >> 4);
+ } break;
+
+ default:
+ assert(!"Unsupported trit encoding for color values!");
+ break;
+ } // switch(bitlen)
+ } // case eIntegerEncoding_Trit
+ break;
+
+ case eIntegerEncoding_Quint: {
+
+ D = val.GetQuintValue();
+
+ switch (bitlen) {
+ case 1: {
+ C = 113;
+ } break;
+
+ case 2: {
+ C = 54;
+ // B = b0000bb00
+ uint32_t b = (bitval >> 1) & 1;
+ B = (b << 8) | (b << 3) | (b << 2);
+ } break;
+
+ case 3: {
+ C = 26;
+ // B = cb0000cbc
+ uint32_t cb = (bitval >> 1) & 3;
+ B = (cb << 7) | (cb << 1) | (cb >> 1);
+ } break;
+
+ case 4: {
+ C = 13;
+ // B = dcb0000dc
+ uint32_t dcb = (bitval >> 1) & 7;
+ B = (dcb << 6) | (dcb >> 1);
+ } break;
+
+ case 5: {
+ C = 6;
+ // B = edcb0000e
+ uint32_t edcb = (bitval >> 1) & 0xF;
+ B = (edcb << 5) | (edcb >> 3);
+ } break;
+
+ default:
+ assert(!"Unsupported quint encoding for color values!");
+ break;
+ } // switch(bitlen)
+ } // case eIntegerEncoding_Quint
+ break;
+ } // switch(val.GetEncoding())
+
+ if (val.GetEncoding() != eIntegerEncoding_JustBits) {
+ uint32_t T = D * C + B;
+ T ^= A;
+ T = (A & 0x80) | (T >> 2);
+ out[outIdx++] = T;
+ }
+ }
+
+ // Make sure that each of our values is in the proper range...
+ for (uint32_t i = 0; i < nValues; i++) {
+ assert(out[i] <= 255);
+ }
+}
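The trit/quint branch above applies the dequantization constants from section C.2.13. A standalone worked example for a trit-encoded color value with two extra bits (the inputs bitval and D are chosen arbitrarily for illustration):

    #include <cstdint>
    #include <iostream>

    int main() {
        const uint32_t bitval = 0b01;                // the two raw bits from the stream
        const uint32_t D = 2;                        // the decoded trit
        const uint32_t A = (bitval & 1) ? 0x1FF : 0; // lsb replicated 9 times
        const uint32_t b = (bitval >> 1) & 1;
        const uint32_t B = (b << 8) | (b << 4) | (b << 2) | (b << 1); // 0 here
        const uint32_t C = 93;                       // constant for 2-bit trit encodings
        uint32_t T = D * C + B;                      // 186
        T ^= A;                                      // 325
        T = (A & 0x80) | (T >> 2);                   // 128 | 81
        std::cout << T << '\n';                      // 209 -- always lands in [0, 255]
    }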
+
+uint32_t UnquantizeTexelWeight(const IntegerEncodedValue& val) {
+ uint32_t bitval = val.GetBitValue();
+ uint32_t bitlen = val.BaseBitLength();
+
+ uint32_t A = Replicate(bitval & 1, 1, 7);
+ uint32_t B = 0, C = 0, D = 0;
+
+ uint32_t result = 0;
+ switch (val.GetEncoding()) {
+ case eIntegerEncoding_JustBits:
+ result = Replicate(bitval, bitlen, 6);
+ break;
+
+ case eIntegerEncoding_Trit: {
+ D = val.GetTritValue();
+ assert(D < 3);
+
+ switch (bitlen) {
+ case 0: {
+ uint32_t results[3] = {0, 32, 63};
+ result = results[D];
+ } break;
+
+ case 1: {
+ C = 50;
+ } break;
+
+ case 2: {
+ C = 23;
+ uint32_t b = (bitval >> 1) & 1;
+ B = (b << 6) | (b << 2) | b;
+ } break;
+
+ case 3: {
+ C = 11;
+ uint32_t cb = (bitval >> 1) & 3;
+ B = (cb << 5) | cb;
+ } break;
+
+ default:
+ assert(!"Invalid trit encoding for texel weight");
+ break;
+ }
+ } break;
+
+ case eIntegerEncoding_Quint: {
+ D = val.GetQuintValue();
+ assert(D < 5);
+
+ switch (bitlen) {
+ case 0: {
+ uint32_t results[5] = {0, 16, 32, 47, 63};
+ result = results[D];
+ } break;
+
+ case 1: {
+ C = 28;
+ } break;
+
+ case 2: {
+ C = 13;
+ uint32_t b = (bitval >> 1) & 1;
+ B = (b << 6) | (b << 1);
+ } break;
+
+ default:
+ assert(!"Invalid quint encoding for texel weight");
+ break;
+ }
+ } break;
+ }
+
+ if (val.GetEncoding() != eIntegerEncoding_JustBits && bitlen > 0) {
+ // Decode the value...
+ result = D * C + B;
+ result ^= A;
+ result = (A & 0x20) | (result >> 2);
+ }
+
+ assert(result < 64);
+
+ // Change from [0,63] to [0,64]
+ if (result > 32) {
+ result += 1;
+ }
+
+ return result;
+}
+
+void UnquantizeTexelWeights(uint32_t out[2][144], std::vector<IntegerEncodedValue>& weights,
+ const TexelWeightParams& params, const uint32_t blockWidth,
+ const uint32_t blockHeight) {
+ uint32_t weightIdx = 0;
+ uint32_t unquantized[2][144];
+ std::vector<IntegerEncodedValue>::const_iterator itr;
+ for (itr = weights.begin(); itr != weights.end(); itr++) {
+ unquantized[0][weightIdx] = UnquantizeTexelWeight(*itr);
+
+ if (params.m_bDualPlane) {
+ itr++;
+ unquantized[1][weightIdx] = UnquantizeTexelWeight(*itr);
+ if (itr == weights.end()) {
+ break;
+ }
+ }
+
+ if (++weightIdx >= (params.m_Width * params.m_Height))
+ break;
+ }
+
+ // Do infill if necessary (Section C.2.18) ...
+ uint32_t Ds = (1024 + (blockWidth / 2)) / (blockWidth - 1);
+ uint32_t Dt = (1024 + (blockHeight / 2)) / (blockHeight - 1);
+
+ const uint32_t kPlaneScale = params.m_bDualPlane ? 2U : 1U;
+ for (uint32_t plane = 0; plane < kPlaneScale; plane++)
+ for (uint32_t t = 0; t < blockHeight; t++)
+ for (uint32_t s = 0; s < blockWidth; s++) {
+ uint32_t cs = Ds * s;
+ uint32_t ct = Dt * t;
+
+ uint32_t gs = (cs * (params.m_Width - 1) + 32) >> 6;
+ uint32_t gt = (ct * (params.m_Height - 1) + 32) >> 6;
+
+ uint32_t js = gs >> 4;
+ uint32_t fs = gs & 0xF;
+
+ uint32_t jt = gt >> 4;
+ uint32_t ft = gt & 0x0F;
+
+ uint32_t w11 = (fs * ft + 8) >> 4;
+ uint32_t w10 = ft - w11;
+ uint32_t w01 = fs - w11;
+ uint32_t w00 = 16 - fs - ft + w11;
+
+ uint32_t v0 = js + jt * params.m_Width;
+
+#define FIND_TEXEL(tidx, bidx) \
+ uint32_t p##bidx = 0; \
+ do { \
+ if ((tidx) < (params.m_Width * params.m_Height)) { \
+ p##bidx = unquantized[plane][(tidx)]; \
+ } \
+ } while (0)
+
+ FIND_TEXEL(v0, 00);
+ FIND_TEXEL(v0 + 1, 01);
+ FIND_TEXEL(v0 + params.m_Width, 10);
+ FIND_TEXEL(v0 + params.m_Width + 1, 11);
+
+#undef FIND_TEXEL
+
+ out[plane][t * blockWidth + s] =
+ (p00 * w00 + p01 * w01 + p10 * w10 + p11 * w11 + 8) >> 4;
+ }
+}
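The infill step above follows the bilinear weighting of section C.2.18. A standalone arithmetic check for one texel of a hypothetical 8x8 block with a 4x4 weight grid (block and grid sizes are illustrative; the four factors must sum to 16):

    #include <cstdint>
    #include <iostream>

    int main() {
        const uint32_t blockWidth = 8, blockHeight = 8;
        const uint32_t gridWidth = 4, gridHeight = 4;
        const uint32_t s = 3, t = 2;

        const uint32_t Ds = (1024 + blockWidth / 2) / (blockWidth - 1);   // 146
        const uint32_t Dt = (1024 + blockHeight / 2) / (blockHeight - 1); // 146

        const uint32_t gs = (Ds * s * (gridWidth - 1) + 32) >> 6;  // 21
        const uint32_t gt = (Dt * t * (gridHeight - 1) + 32) >> 6; // 14
        const uint32_t fs = gs & 0xF, ft = gt & 0xF;               // 5, 14

        const uint32_t w11 = (fs * ft + 8) >> 4; // 4
        const uint32_t w10 = ft - w11;           // 10
        const uint32_t w01 = fs - w11;           // 1
        const uint32_t w00 = 16 - fs - ft + w11; // 1
        std::cout << w00 + w01 + w10 + w11 << '\n'; // 16
    }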
+
+// Transfers a bit as described in C.2.14
+static inline void BitTransferSigned(int32_t& a, int32_t& b) {
+ b >>= 1;
+ b |= a & 0x80;
+ a >>= 1;
+ a &= 0x3F;
+ if (a & 0x20)
+ a -= 0x40;
+}
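A worked example of the transfer, with arbitrary illustrative inputs: the top bit of `a` moves into `b`, and `a` collapses to a signed 6-bit offset.

    #include <cstdint>
    #include <iostream>

    static void BitTransferSigned(int32_t& a, int32_t& b) {
        b >>= 1;
        b |= a & 0x80;
        a >>= 1;
        a &= 0x3F;
        if (a & 0x20)
            a -= 0x40;
    }

    int main() {
        int32_t a = 0xE6, b = 0x10;
        BitTransferSigned(a, b);
        std::cout << a << ' ' << b << '\n'; // -13 136
    }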
+
+// Adds more precision to the blue channel as described
+// in C.2.14
+static inline Pixel BlueContract(int32_t a, int32_t r, int32_t g, int32_t b) {
+ return Pixel(static_cast<int16_t>(a), static_cast<int16_t>((r + b) >> 1),
+ static_cast<int16_t>((g + b) >> 1), static_cast<int16_t>(b));
+}
+
+// Partition selection functions as specified in
+// C.2.21
+static inline uint32_t hash52(uint32_t p) {
+ p ^= p >> 15;
+ p -= p << 17;
+ p += p << 7;
+ p += p << 4;
+ p ^= p >> 5;
+ p += p << 16;
+ p ^= p >> 7;
+ p ^= p >> 3;
+ p ^= p << 6;
+ p ^= p >> 17;
+ return p;
+}
+
+static uint32_t SelectPartition(int32_t seed, int32_t x, int32_t y, int32_t z,
+ int32_t partitionCount, int32_t smallBlock) {
+ if (1 == partitionCount)
+ return 0;
+
+ if (smallBlock) {
+ x <<= 1;
+ y <<= 1;
+ z <<= 1;
+ }
+
+ seed += (partitionCount - 1) * 1024;
+
+ uint32_t rnum = hash52(static_cast<uint32_t>(seed));
+ uint8_t seed1 = static_cast<uint8_t>(rnum & 0xF);
+ uint8_t seed2 = static_cast<uint8_t>((rnum >> 4) & 0xF);
+ uint8_t seed3 = static_cast<uint8_t>((rnum >> 8) & 0xF);
+ uint8_t seed4 = static_cast<uint8_t>((rnum >> 12) & 0xF);
+ uint8_t seed5 = static_cast<uint8_t>((rnum >> 16) & 0xF);
+ uint8_t seed6 = static_cast<uint8_t>((rnum >> 20) & 0xF);
+ uint8_t seed7 = static_cast<uint8_t>((rnum >> 24) & 0xF);
+ uint8_t seed8 = static_cast<uint8_t>((rnum >> 28) & 0xF);
+ uint8_t seed9 = static_cast<uint8_t>((rnum >> 18) & 0xF);
+ uint8_t seed10 = static_cast<uint8_t>((rnum >> 22) & 0xF);
+ uint8_t seed11 = static_cast<uint8_t>((rnum >> 26) & 0xF);
+ uint8_t seed12 = static_cast<uint8_t>(((rnum >> 30) | (rnum << 2)) & 0xF);
+
+ seed1 *= seed1;
+ seed2 *= seed2;
+ seed3 *= seed3;
+ seed4 *= seed4;
+ seed5 *= seed5;
+ seed6 *= seed6;
+ seed7 *= seed7;
+ seed8 *= seed8;
+ seed9 *= seed9;
+ seed10 *= seed10;
+ seed11 *= seed11;
+ seed12 *= seed12;
+
+ int32_t sh1, sh2, sh3;
+ if (seed & 1) {
+ sh1 = (seed & 2) ? 4 : 5;
+ sh2 = (partitionCount == 3) ? 6 : 5;
+ } else {
+ sh1 = (partitionCount == 3) ? 6 : 5;
+ sh2 = (seed & 2) ? 4 : 5;
+ }
+ sh3 = (seed & 0x10) ? sh1 : sh2;
+
+ seed1 >>= sh1;
+ seed2 >>= sh2;
+ seed3 >>= sh1;
+ seed4 >>= sh2;
+ seed5 >>= sh1;
+ seed6 >>= sh2;
+ seed7 >>= sh1;
+ seed8 >>= sh2;
+ seed9 >>= sh3;
+ seed10 >>= sh3;
+ seed11 >>= sh3;
+ seed12 >>= sh3;
+
+ int32_t a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
+ int32_t b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
+ int32_t c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
+ int32_t d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
+
+ a &= 0x3F;
+ b &= 0x3F;
+ c &= 0x3F;
+ d &= 0x3F;
+
+ if (partitionCount < 4)
+ d = 0;
+ if (partitionCount < 3)
+ c = 0;
+
+ if (a >= b && a >= c && a >= d)
+ return 0;
+ else if (b >= c && b >= d)
+ return 1;
+ else if (c >= d)
+ return 2;
+ return 3;
+}
+
+static inline uint32_t Select2DPartition(int32_t seed, int32_t x, int32_t y, int32_t partitionCount,
+ int32_t smallBlock) {
+ return SelectPartition(seed, x, y, 0, partitionCount, smallBlock);
+}
+
+// Section C.2.14
+void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValues,
+ uint32_t colorEndpointMode) {
+#define READ_UINT_VALUES(N) \
+ uint32_t v[N]; \
+ for (uint32_t i = 0; i < N; i++) { \
+ v[i] = *(colorValues++); \
+ }
+
+#define READ_INT_VALUES(N) \
+ int32_t v[N]; \
+ for (uint32_t i = 0; i < N; i++) { \
+ v[i] = static_cast<int32_t>(*(colorValues++)); \
+ }
+
+ switch (colorEndpointMode) {
+ case 0: {
+ READ_UINT_VALUES(2)
+ ep1 = Pixel(0xFF, v[0], v[0], v[0]);
+ ep2 = Pixel(0xFF, v[1], v[1], v[1]);
+ } break;
+
+ case 1: {
+ READ_UINT_VALUES(2)
+ uint32_t L0 = (v[0] >> 2) | (v[1] & 0xC0);
+ uint32_t L1 = std::max(L0 + (v[1] & 0x3F), 0xFFU);
+ ep1 = Pixel(0xFF, L0, L0, L0);
+ ep2 = Pixel(0xFF, L1, L1, L1);
+ } break;
+
+ case 4: {
+ READ_UINT_VALUES(4)
+ ep1 = Pixel(v[2], v[0], v[0], v[0]);
+ ep2 = Pixel(v[3], v[1], v[1], v[1]);
+ } break;
+
+ case 5: {
+ READ_INT_VALUES(4)
+ BitTransferSigned(v[1], v[0]);
+ BitTransferSigned(v[3], v[2]);
+ ep1 = Pixel(v[2], v[0], v[0], v[0]);
+ ep2 = Pixel(v[2] + v[3], v[0] + v[1], v[0] + v[1], v[0] + v[1]);
+ ep1.ClampByte();
+ ep2.ClampByte();
+ } break;
+
+ case 6: {
+ READ_UINT_VALUES(4)
+ ep1 = Pixel(0xFF, v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+ ep2 = Pixel(0xFF, v[0], v[1], v[2]);
+ } break;
+
+ case 8: {
+ READ_UINT_VALUES(6)
+ if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+ ep1 = Pixel(0xFF, v[0], v[2], v[4]);
+ ep2 = Pixel(0xFF, v[1], v[3], v[5]);
+ } else {
+ ep1 = BlueContract(0xFF, v[1], v[3], v[5]);
+ ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
+ }
+ } break;
+
+ case 9: {
+ READ_INT_VALUES(6)
+ BitTransferSigned(v[1], v[0]);
+ BitTransferSigned(v[3], v[2]);
+ BitTransferSigned(v[5], v[4]);
+ if (v[1] + v[3] + v[5] >= 0) {
+ ep1 = Pixel(0xFF, v[0], v[2], v[4]);
+ ep2 = Pixel(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+ } else {
+ ep1 = BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+ ep2 = BlueContract(0xFF, v[0], v[2], v[4]);
+ }
+ ep1.ClampByte();
+ ep2.ClampByte();
+ } break;
+
+ case 10: {
+ READ_UINT_VALUES(6)
+ ep1 = Pixel(v[4], v[0] * v[3] >> 8, v[1] * v[3] >> 8, v[2] * v[3] >> 8);
+ ep2 = Pixel(v[5], v[0], v[1], v[2]);
+ } break;
+
+ case 12: {
+ READ_UINT_VALUES(8)
+ if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) {
+ ep1 = Pixel(v[6], v[0], v[2], v[4]);
+ ep2 = Pixel(v[7], v[1], v[3], v[5]);
+ } else {
+ ep1 = BlueContract(v[7], v[1], v[3], v[5]);
+ ep2 = BlueContract(v[6], v[0], v[2], v[4]);
+ }
+ } break;
+
+ case 13: {
+ READ_INT_VALUES(8)
+ BitTransferSigned(v[1], v[0]);
+ BitTransferSigned(v[3], v[2]);
+ BitTransferSigned(v[5], v[4]);
+ BitTransferSigned(v[7], v[6]);
+ if (v[1] + v[3] + v[5] >= 0) {
+ ep1 = Pixel(v[6], v[0], v[2], v[4]);
+ ep2 = Pixel(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+ } else {
+ ep1 = BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]);
+ ep2 = BlueContract(v[6], v[0], v[2], v[4]);
+ }
+ ep1.ClampByte();
+ ep2.ClampByte();
+ } break;
+
+ default:
+ assert(!"Unsupported color endpoint mode (is it HDR?)");
+ break;
+ }
+
+#undef READ_UINT_VALUES
+#undef READ_INT_VALUES
+}
+
+void DecompressBlock(uint8_t inBuf[16], const uint32_t blockWidth, const uint32_t blockHeight,
+ uint32_t* outBuf) {
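+ // Decodes a single 128-bit ASTC block into blockWidth x blockHeight RGBA8 texels: read the
+ // block mode and partitioning, decode the color endpoints and texel weights, then interpolate
+ // every texel between its partition's endpoints.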
+ BitStream strm(inBuf);
+ TexelWeightParams weightParams = DecodeBlockInfo(strm);
+
+ // Was there an error?
+ if (weightParams.m_bError) {
+ assert(!"Invalid block mode");
+ FillError(outBuf, blockWidth, blockHeight);
+ return;
+ }
+
+ if (weightParams.m_bVoidExtentLDR) {
+ FillVoidExtentLDR(strm, outBuf, blockWidth, blockHeight);
+ return;
+ }
+
+ if (weightParams.m_bVoidExtentHDR) {
+ assert(!"HDR void extent blocks are unsupported!");
+ FillError(outBuf, blockWidth, blockHeight);
+ return;
+ }
+
+ if (weightParams.m_Width > blockWidth) {
+ assert(!"Texel weight grid width should be smaller than block width");
+ FillError(outBuf, blockWidth, blockHeight);
+ return;
+ }
+
+ if (weightParams.m_Height > blockHeight) {
+ assert(!"Texel weight grid height should be smaller than block height");
+ FillError(outBuf, blockWidth, blockHeight);
+ return;
+ }
+
+ // Read num partitions
+ uint32_t nPartitions = strm.ReadBits(2) + 1;
+ assert(nPartitions <= 4);
+
+ if (nPartitions == 4 && weightParams.m_bDualPlane) {
+ assert(!"Dual plane mode is incompatible with four partition blocks");
+ FillError(outBuf, blockWidth, blockHeight);
+ return;
+ }
+
+ // Based on the number of partitions, read the color endpoint mode for
+ // each partition.
+
+ // Determine partitions, partition index, and color endpoint modes
+ int32_t planeIdx = -1;
+ uint32_t partitionIndex;
+ uint32_t colorEndpointMode[4] = {0, 0, 0, 0};
+
+ // Collect the raw color endpoint bits into a separate stream; they are decoded later, once
+ // the total number of color data bits is known.
+ uint8_t colorEndpointData[16];
+ memset(colorEndpointData, 0, sizeof(colorEndpointData));
+ BitStream colorEndpointStream(colorEndpointData, 16 * 8, 0);
+
+ // Read extra config data...
+ uint32_t baseCEM = 0;
+ if (nPartitions == 1) {
+ colorEndpointMode[0] = strm.ReadBits(4);
+ partitionIndex = 0;
+ } else {
+ partitionIndex = strm.ReadBits(10);
+ baseCEM = strm.ReadBits(6);
+ }
+ uint32_t baseMode = (baseCEM & 3);
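+ // For multi-partition blocks, a base mode of zero means every partition shares the same
+ // endpoint mode; otherwise each partition's mode is rebuilt below from its class-offset (C)
+ // bit and 2-bit mode field (M).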
+
+ // Remaining bits are color endpoint data...
+ uint32_t nWeightBits = weightParams.GetPackedBitSize();
+ int32_t remainingBits = 128 - nWeightBits - strm.GetBitsRead();
+
+ // Consider extra bits prior to texel data...
+ uint32_t extraCEMbits = 0;
+ if (baseMode) {
+ switch (nPartitions) {
+ case 2:
+ extraCEMbits += 2;
+ break;
+ case 3:
+ extraCEMbits += 5;
+ break;
+ case 4:
+ extraCEMbits += 8;
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ }
+ remainingBits -= extraCEMbits;
+
+ // Do we have a dual plane situation?
+ uint32_t planeSelectorBits = 0;
+ if (weightParams.m_bDualPlane) {
+ planeSelectorBits = 2;
+ }
+ remainingBits -= planeSelectorBits;
+
+ // Read color data...
+ uint32_t colorDataBits = remainingBits;
+ while (remainingBits > 0) {
+ uint32_t nb = std::min(remainingBits, 8);
+ uint32_t b = strm.ReadBits(nb);
+ colorEndpointStream.WriteBits(b, nb);
+ remainingBits -= 8;
+ }
+
+ // Read the plane selection bits
+ planeIdx = strm.ReadBits(planeSelectorBits);
+
+ // Read the rest of the CEM
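+ // The extra CEM bits are the last bits read before the texel weight data; together with the
+ // base CEM field they provide a class-offset bit (C) and a 2-bit mode field (M) per partition.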
+ if (baseMode) {
+ uint32_t extraCEM = strm.ReadBits(extraCEMbits);
+ uint32_t CEM = (extraCEM << 6) | baseCEM;
+ CEM >>= 2;
+
+ bool C[4] = {0};
+ for (uint32_t i = 0; i < nPartitions; i++) {
+ C[i] = CEM & 1;
+ CEM >>= 1;
+ }
+
+ uint8_t M[4] = {0};
+ for (uint32_t i = 0; i < nPartitions; i++) {
+ M[i] = CEM & 3;
+ CEM >>= 2;
+ assert(M[i] <= 3);
+ }
+
+ for (uint32_t i = 0; i < nPartitions; i++) {
+ colorEndpointMode[i] = baseMode;
+ if (!(C[i]))
+ colorEndpointMode[i] -= 1;
+ colorEndpointMode[i] <<= 2;
+ colorEndpointMode[i] |= M[i];
+ }
+ } else if (nPartitions > 1) {
+ uint32_t CEM = baseCEM >> 2;
+ for (uint32_t i = 0; i < nPartitions; i++) {
+ colorEndpointMode[i] = CEM;
+ }
+ }
+
+ // Make sure everything up till here is sane.
+ for (uint32_t i = 0; i < nPartitions; i++) {
+ assert(colorEndpointMode[i] < 16);
+ }
+ assert(strm.GetBitsRead() + weightParams.GetPackedBitSize() == 128);
+
+ // Decode both color data and texel weight data
+ uint32_t colorValues[32]; // Four values per endpoint, two endpoints, at most four partitions
+ DecodeColorValues(colorValues, colorEndpointData, colorEndpointMode, nPartitions,
+ colorDataBits);
+
+ Pixel endpoints[4][2];
+ const uint32_t* colorValuesPtr = colorValues;
+ for (uint32_t i = 0; i < nPartitions; i++) {
+ ComputeEndpoints(endpoints[i][0], endpoints[i][1], colorValuesPtr, colorEndpointMode[i]);
+ }
+
+ // Read the texel weight data..
+ uint8_t texelWeightData[16];
+ memcpy(texelWeightData, inBuf, sizeof(texelWeightData));
+
+ // Reverse everything
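+ // The texel weights are stored in reverse bit order starting from the top of the block, so
+ // bit-reverse each byte and mirror the array before handing it to a forward-reading BitStream.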
+ for (uint32_t i = 0; i < 8; i++) {
+// Taken from http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
+#define REVERSE_BYTE(b) (((b)*0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32
+ unsigned char a = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[i]));
+ unsigned char b = static_cast<unsigned char>(REVERSE_BYTE(texelWeightData[15 - i]));
+#undef REVERSE_BYTE
+
+ texelWeightData[i] = b;
+ texelWeightData[15 - i] = a;
+ }
+
+ // Make sure that higher non-texel bits are set to zero
+ const uint32_t clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1;
+ texelWeightData[clearByteStart - 1] &= (1 << (weightParams.GetPackedBitSize() % 8)) - 1;
+ memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
+
+ std::vector<IntegerEncodedValue> texelWeightValues;
+ BitStream weightStream(texelWeightData);
+
+ IntegerEncodedValue::DecodeIntegerSequence(texelWeightValues, weightStream,
+ weightParams.m_MaxWeight,
+ weightParams.GetNumWeightValues());
+
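+ // Unquantize the weights to the 0..64 range used for interpolation and infill the stored
+ // weight grid so that there is one weight per texel (per plane).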
+ // Blocks can be at most 12x12, so we can have as many as 144 weights
+ uint32_t weights[2][144];
+ UnquantizeTexelWeights(weights, texelWeightValues, weightParams, blockWidth, blockHeight);
+
+ // Now that we have endpoints and weights, we can interpolate and generate
+ // the proper decoding...
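+ // Each endpoint component is expanded from 8 to 16 bits, blended using the 0..64 weight, and
+ // the result is scaled back down to 8 bits.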
+ for (uint32_t j = 0; j < blockHeight; j++)
+ for (uint32_t i = 0; i < blockWidth; i++) {
+ uint32_t partition = Select2DPartition(partitionIndex, i, j, nPartitions,
+ (blockHeight * blockWidth) < 32);
+ assert(partition < nPartitions);
+
+ Pixel p;
+ for (uint32_t c = 0; c < 4; c++) {
+ uint32_t C0 = endpoints[partition][0].Component(c);
+ C0 = Replicate(C0, 8, 16);
+ uint32_t C1 = endpoints[partition][1].Component(c);
+ C1 = Replicate(C1, 8, 16);
+
+ uint32_t plane = 0;
+ if (weightParams.m_bDualPlane && (((planeIdx + 1) & 3) == c)) {
+ plane = 1;
+ }
+
+ uint32_t weight = weights[plane][j * blockWidth + i];
+ uint32_t C = (C0 * (64 - weight) + C1 * weight + 32) / 64;
+ if (C == 65535) {
+ p.Component(c) = 255;
+ } else {
+ double Cf = static_cast<double>(C);
+ p.Component(c) = static_cast<uint16_t>(255.0 * (Cf / 65536.0) + 0.5);
+ }
+ }
+
+ outBuf[j * blockWidth + i] = p.Pack();
+ }
+}
+
+} // namespace ASTCC
+
+namespace Tegra::Texture::ASTC {
+
+std::vector<uint8_t> Decompress(std::vector<uint8_t>& data, uint32_t width, uint32_t height,
+ uint32_t block_width, uint32_t block_height) {
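+ // Each 16-byte compressed block decodes to a block_width x block_height tile of RGBA8
+ // texels; tiles that straddle the right or bottom edge are clipped to the image size when
+ // copied into the output.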
+ uint32_t blockIdx = 0;
+ std::vector<uint8_t> outData;
+ outData.resize(height * width * 4);
+ for (uint32_t j = 0; j < height; j += block_height) {
+ for (uint32_t i = 0; i < width; i += block_width) {
+
+ uint8_t* blockPtr = data.data() + blockIdx * 16;
+
+ // Blocks can be at most 12x12
+ uint32_t uncompData[144];
+ ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData);
+
+ uint32_t decompWidth = std::min(block_width, width - i);
+ uint32_t decompHeight = std::min(block_height, height - j);
+
+ uint8_t* outRow = outData.data() + (j * width + i) * 4;
+ for (uint32_t jj = 0; jj < decompHeight; jj++) {
+ memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4);
+ }
+
+ blockIdx++;
+ }
+ }
+
+ return outData;
+}
+
+} // namespace Tegra::Texture::ASTC
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h
new file mode 100644
index 000000000..f0d7c0e56
--- /dev/null
+++ b/src/video_core/textures/astc.h
@@ -0,0 +1,15 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+namespace Tegra::Texture::ASTC {
+
+std::vector<uint8_t> Decompress(std::vector<uint8_t>& data, uint32_t width, uint32_t height,
+ uint32_t block_width, uint32_t block_height);
+
+} // namespace Tegra::Texture::ASTC
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 7bf9c4c4b..b3937b2fe 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -5,6 +5,7 @@
#include <cstring>
#include "common/assert.h"
#include "core/memory.h"
+#include "video_core/gpu.h"
#include "video_core/textures/decoders.h"
#include "video_core/textures/texture.h"
@@ -51,8 +52,10 @@ u32 BytesPerPixel(TextureFormat format) {
return 8;
case TextureFormat::DXT23:
case TextureFormat::DXT45:
+ case TextureFormat::BC7U:
// In this case a 'pixel' actually refers to a 4x4 tile.
return 16;
+ case TextureFormat::ASTC_2D_4X4:
case TextureFormat::A8R8G8B8:
case TextureFormat::A2B10G10R10:
case TextureFormat::BF10GF11RF11:
@@ -64,6 +67,20 @@ u32 BytesPerPixel(TextureFormat format) {
return 1;
case TextureFormat::R16_G16_B16_A16:
return 8;
+ case TextureFormat::R32_G32_B32_A32:
+ return 16;
+ default:
+ UNIMPLEMENTED_MSG("Format not implemented");
+ break;
+ }
+}
+
+static u32 DepthBytesPerPixel(DepthFormat format) {
+ switch (format) {
+ case DepthFormat::S8_Z24_UNORM:
+ case DepthFormat::Z24_S8_UNORM:
+ case DepthFormat::Z32_FLOAT:
+ return 4;
default:
UNIMPLEMENTED_MSG("Format not implemented");
break;
@@ -82,6 +99,7 @@ std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width,
case TextureFormat::DXT23:
case TextureFormat::DXT45:
case TextureFormat::DXN1:
+ case TextureFormat::BC7U:
// In the DXT and DXN formats, each 4x4 tile is swizzled instead of just individual pixel
// values.
CopySwizzledData(width / 4, height / 4, bytes_per_pixel, bytes_per_pixel, data,
@@ -93,7 +111,31 @@ std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width,
case TextureFormat::B5G6R5:
case TextureFormat::R8:
case TextureFormat::R16_G16_B16_A16:
+ case TextureFormat::R32_G32_B32_A32:
case TextureFormat::BF10GF11RF11:
+ case TextureFormat::ASTC_2D_4X4:
+ CopySwizzledData(width, height, bytes_per_pixel, bytes_per_pixel, data,
+ unswizzled_data.data(), true, block_height);
+ break;
+ default:
+ UNIMPLEMENTED_MSG("Format not implemented");
+ break;
+ }
+
+ return unswizzled_data;
+}
+
+std::vector<u8> UnswizzleDepthTexture(VAddr address, DepthFormat format, u32 width, u32 height,
+ u32 block_height) {
+ u8* data = Memory::GetPointer(address);
+ u32 bytes_per_pixel = DepthBytesPerPixel(format);
+
+ std::vector<u8> unswizzled_data(width * height * bytes_per_pixel);
+
+ switch (format) {
+ case DepthFormat::S8_Z24_UNORM:
+ case DepthFormat::Z24_S8_UNORM:
+ case DepthFormat::Z32_FLOAT:
CopySwizzledData(width, height, bytes_per_pixel, bytes_per_pixel, data,
unswizzled_data.data(), true, block_height);
break;
@@ -115,12 +157,15 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat
case TextureFormat::DXT23:
case TextureFormat::DXT45:
case TextureFormat::DXN1:
+ case TextureFormat::BC7U:
+ case TextureFormat::ASTC_2D_4X4:
case TextureFormat::A8R8G8B8:
case TextureFormat::A2B10G10R10:
case TextureFormat::A1B5G5R5:
case TextureFormat::B5G6R5:
case TextureFormat::R8:
case TextureFormat::BF10GF11RF11:
+ case TextureFormat::R32_G32_B32_A32:
// TODO(Subv): For the time being just forward the same data without any decoding.
rgba_data = texture_data;
break;
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index 2562c4b06..2b088c077 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -17,6 +17,12 @@ namespace Texture {
std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width, u32 height,
u32 block_height = TICEntry::DefaultBlockHeight);
+/**
+ * Unswizzles a swizzled depth texture without changing its format.
+ */
+std::vector<u8> UnswizzleDepthTexture(VAddr address, DepthFormat format, u32 width, u32 height,
+ u32 block_height = TICEntry::DefaultBlockHeight);
+
/// Copies texture data from a buffer and performs swizzling/unswizzling as necessary.
void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 block_height);
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index 89dc8ed1e..289140f31 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -24,9 +24,9 @@ bool Init(EmuWindow* emu_window) {
g_renderer = std::make_unique<RendererOpenGL>();
g_renderer->SetWindow(g_emu_window);
if (g_renderer->Init()) {
- NGLOG_DEBUG(Render, "initialized OK");
+ LOG_DEBUG(Render, "initialized OK");
} else {
- NGLOG_CRITICAL(Render, "initialization failed !");
+ LOG_CRITICAL(Render, "initialization failed !");
return false;
}
return true;
@@ -36,7 +36,7 @@ bool Init(EmuWindow* emu_window) {
void Shutdown() {
g_renderer.reset();
- NGLOG_DEBUG(Render, "shutdown OK");
+ LOG_DEBUG(Render, "shutdown OK");
}
} // namespace VideoCore