18 files changed, 619 insertions, 155 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index f5ae57039..09ecc5bad 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -27,6 +27,8 @@ add_library(video_core STATIC
     renderer_base.h
     renderer_opengl/gl_buffer_cache.cpp
     renderer_opengl/gl_buffer_cache.h
+    renderer_opengl/gl_primitive_assembler.cpp
+    renderer_opengl/gl_primitive_assembler.h
     renderer_opengl/gl_rasterizer.cpp
     renderer_opengl/gl_rasterizer.h
     renderer_opengl/gl_rasterizer_cache.cpp
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index ea1555c5d..912e785b9 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -4,11 +4,13 @@
 
 #include "core/memory.h"
 #include "video_core/engines/fermi_2d.h"
+#include "video_core/rasterizer_interface.h"
 #include "video_core/textures/decoders.h"
 
 namespace Tegra::Engines {
 
-Fermi2D::Fermi2D(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
+Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager)
+    : memory_manager{memory_manager}, rasterizer{rasterizer} {}
 
 void Fermi2D::WriteReg(u32 method, u32 value) {
     ASSERT_MSG(method < Regs::NUM_REGS,
@@ -44,27 +46,31 @@ void Fermi2D::HandleSurfaceCopy() {
     u32 src_bytes_per_pixel = RenderTargetBytesPerPixel(regs.src.format);
     u32 dst_bytes_per_pixel = RenderTargetBytesPerPixel(regs.dst.format);
 
-    if (regs.src.linear == regs.dst.linear) {
-        // If the input layout and the output layout are the same, just perform a raw copy.
-        ASSERT(regs.src.BlockHeight() == regs.dst.BlockHeight());
-        Memory::CopyBlock(dest_cpu, source_cpu,
-                          src_bytes_per_pixel * regs.dst.width * regs.dst.height);
-        return;
-    }
-
-    u8* src_buffer = Memory::GetPointer(source_cpu);
-    u8* dst_buffer = Memory::GetPointer(dest_cpu);
-
-    if (!regs.src.linear && regs.dst.linear) {
-        // If the input is tiled and the output is linear, deswizzle the input and copy it over.
-        Texture::CopySwizzledData(regs.src.width, regs.src.height, src_bytes_per_pixel,
-                                  dst_bytes_per_pixel, src_buffer, dst_buffer, true,
-                                  regs.src.BlockHeight());
-    } else {
-        // If the input is linear and the output is tiled, swizzle the input and copy it over.
-        Texture::CopySwizzledData(regs.src.width, regs.src.height, src_bytes_per_pixel,
-                                  dst_bytes_per_pixel, dst_buffer, src_buffer, false,
-                                  regs.dst.BlockHeight());
+    if (!rasterizer.AccelerateSurfaceCopy(regs.src, regs.dst)) {
+        // TODO(bunnei): The below implementation currently will not get hit, as
+        // AccelerateSurfaceCopy tries to always copy and will always return success. This should be
+        // changed once we properly support flushing.
+
+        if (regs.src.linear == regs.dst.linear) {
+            // If the input layout and the output layout are the same, just perform a raw copy.
+            ASSERT(regs.src.BlockHeight() == regs.dst.BlockHeight());
+            Memory::CopyBlock(dest_cpu, source_cpu,
+                              src_bytes_per_pixel * regs.dst.width * regs.dst.height);
+            return;
+        }
+        u8* src_buffer = Memory::GetPointer(source_cpu);
+        u8* dst_buffer = Memory::GetPointer(dest_cpu);
+        if (!regs.src.linear && regs.dst.linear) {
+            // If the input is tiled and the output is linear, deswizzle the input and copy it over.
+            Texture::CopySwizzledData(regs.src.width, regs.src.height, src_bytes_per_pixel,
+                                      dst_bytes_per_pixel, src_buffer, dst_buffer, true,
+                                      regs.src.BlockHeight());
+        } else {
+            // If the input is linear and the output is tiled, swizzle the input and copy it over.
+            Texture::CopySwizzledData(regs.src.width, regs.src.height, src_bytes_per_pixel,
+                                      dst_bytes_per_pixel, dst_buffer, src_buffer, false,
+                                      regs.dst.BlockHeight());
+        }
     }
 }
 
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 021b83eaa..81d15c62a 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -12,6 +12,10 @@
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 
+namespace VideoCore {
+class RasterizerInterface;
+}
+
 namespace Tegra::Engines {
 
 #define FERMI2D_REG_INDEX(field_name)                                                              \
@@ -19,7 +23,7 @@ namespace Tegra::Engines {
 
 class Fermi2D final {
 public:
-    explicit Fermi2D(MemoryManager& memory_manager);
+    explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager);
     ~Fermi2D() = default;
 
     /// Write the value to the register identified by method.
@@ -94,6 +98,8 @@ public:
     MemoryManager& memory_manager;
 
 private:
+    VideoCore::RasterizerInterface& rasterizer;
+
     /// Performs the copy from the source surface to the destination surface as configured in the
     /// registers.
     void HandleSurfaceCopy();
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 9f5581045..4290da33f 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -744,6 +744,12 @@ public:
                         return static_cast<GPUVAddr>((static_cast<GPUVAddr>(end_addr_high) << 32) |
                                                      end_addr_low);
                     }
+
+                    /// Address of the first index to read: start plus `first` index elements.
+                    GPUVAddr IndexStart() const {
+                        const auto stride = static_cast<size_t>(FormatSizeInBytes());
+                        return StartAddress() + static_cast<size_t>(first) * stride;
+                    }
                 } index_array;
 
                 INSERT_PADDING_WORDS(0x7);
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index baa8b63b7..9ba7e3533 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -25,7 +25,7 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
 GPU::GPU(VideoCore::RasterizerInterface& rasterizer) {
     memory_manager = std::make_unique<Tegra::MemoryManager>();
     maxwell_3d = std::make_unique<Engines::Maxwell3D>(rasterizer, *memory_manager);
-    fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
+    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
     maxwell_compute = std::make_unique<Engines::MaxwellCompute>();
     maxwell_dma = std::make_unique<Engines::MaxwellDMA>(*memory_manager);
     kepler_memory = std::make_unique<Engines::KeplerMemory>(*memory_manager);
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index cd819d69f..06fc59dbe 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -5,6 +5,7 @@
 #pragma once
 
 #include "common/common_types.h"
+#include "video_core/engines/fermi_2d.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 
@@ -33,13 +34,9 @@ public:
     /// and invalidated
     virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
 
-    /// Attempt to use a faster method to perform a display transfer with is_texture_copy = 0
-    virtual bool AccelerateDisplayTransfer(const void* config) {
-        return false;
-    }
-
-    /// Attempt to use a faster method to perform a display transfer with is_texture_copy = 1
-    virtual bool AccelerateTextureCopy(const void* config) {
+    /// Attempt to use a faster method to perform a surface copy. Returns false if not handled
+    virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
+                                       const Tegra::Engines::Fermi2D::Regs::Surface& dst) {
         return false;
     }
 
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 578aca789..c142095c5 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -34,7 +34,7 @@ GLintptr OGLBufferCache::UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size
     }
 
     AlignBuffer(alignment);
-    GLintptr uploaded_offset = buffer_offset;
+    const GLintptr uploaded_offset = buffer_offset;
 
     Memory::ReadBlock(*cpu_addr, buffer_ptr, size);
 
@@ -57,13 +57,23 @@ GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t s
                                           std::size_t alignment) {
     AlignBuffer(alignment);
     std::memcpy(buffer_ptr, raw_pointer, size);
-    GLintptr uploaded_offset = buffer_offset;
+    const GLintptr uploaded_offset = buffer_offset;
 
     buffer_ptr += size;
     buffer_offset += size;
     return uploaded_offset;
 }
 
+std::tuple<u8*, GLintptr> OGLBufferCache::ReserveMemory(std::size_t size, std::size_t alignment) {
+    AlignBuffer(alignment);
+    // Remember where the aligned reservation begins before moving the cursor past it
+    const std::tuple<u8*, GLintptr> reserved(buffer_ptr, buffer_offset);
+
+    buffer_ptr += size;
+    buffer_offset += size;
+    return reserved;
+}
+
 void OGLBufferCache::Map(std::size_t max_size) {
     bool invalidate;
     std::tie(buffer_ptr, buffer_offset_base, invalidate) =
@@ -74,6 +84,7 @@ void OGLBufferCache::Map(std::size_t max_size) {
         InvalidateAll();
     }
 }
+
 void OGLBufferCache::Unmap() {
     stream_buffer.Unmap(buffer_offset - buffer_offset_base);
 }
@@ -84,7 +95,7 @@ GLuint OGLBufferCache::GetHandle() const {
 
 void OGLBufferCache::AlignBuffer(std::size_t alignment) {
     // Align the offset, not the mapped pointer
-    GLintptr offset_aligned =
+    const GLintptr offset_aligned =
         static_cast<GLintptr>(Common::AlignUp(static_cast<std::size_t>(buffer_offset), alignment));
     buffer_ptr += offset_aligned - buffer_offset;
     buffer_offset = offset_aligned;
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 6c18461f4..965976334 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -6,6 +6,7 @@
 
 #include <cstddef>
 #include <memory>
+#include <tuple>
 
 #include "common/common_types.h"
 #include "video_core/rasterizer_cache.h"
@@ -33,11 +34,17 @@ class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBuffer
 public:
     explicit OGLBufferCache(std::size_t size);
 
+    /// Uploads data from a guest GPU address. Returns host's buffer offset where it's been
+    /// allocated.
     GLintptr UploadMemory(Tegra::GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
                           bool cache = true);
 
+    /// Uploads from a host memory. Returns host's buffer offset where it's been allocated.
     GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4);
 
+    /// Reserves memory to be used by host's CPU. Returns mapped address and offset.
+    std::tuple<u8*, GLintptr> ReserveMemory(std::size_t size, std::size_t alignment = 4);
+
     void Map(std::size_t max_size);
     void Unmap();
 
diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
new file mode 100644
index 000000000..ee1d9601b
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
@@ -0,0 +1,75 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "core/memory.h"
+#include "video_core/renderer_opengl/gl_buffer_cache.h"
+#include "video_core/renderer_opengl/gl_primitive_assembler.h"
+
+namespace OpenGL {
+
+/// Number of indices emitted per quad: two triangles of three vertices each.
+constexpr u32 INDICES_PER_QUAD = 6;
+/// Quad corner used for each emitted index, forming the triangles (0, 1, 2) and (0, 2, 3).
+constexpr std::array<u32, INDICES_PER_QUAD> QUAD_MAP = {0, 1, 2, 0, 2, 3};
+
+PrimitiveAssembler::PrimitiveAssembler(OGLBufferCache& buffer_cache) : buffer_cache(buffer_cache) {}
+
+PrimitiveAssembler::~PrimitiveAssembler() = default;
+
+/// Returns the bytes to reserve for the triangle indices generated from `count` quad vertices.
+/// The result always assumes 32-bit (GLuint) indices.
+std::size_t PrimitiveAssembler::CalculateQuadSize(u32 count) const {
+    ASSERT_MSG(count % 4 == 0, "Quad count is expected to be a multiple of 4");
+    return (count / 4) * INDICES_PER_QUAD * sizeof(GLuint);
+}
+
+/// Generates u32 indices that draw `count` consecutive quad vertices, starting at vertex `first`,
+/// as triangles. Returns the offset handed out by the buffer cache for the generated indices.
+GLintptr PrimitiveAssembler::MakeQuadArray(u32 first, u32 count) {
+    const std::size_t size{CalculateQuadSize(count)};
+    auto [dst_pointer, index_offset] = buffer_cache.ReserveMemory(size);
+
+    for (u32 primitive = 0; primitive < count / 4; ++primitive) {
+        for (u32 i = 0; i < INDICES_PER_QUAD; ++i) {
+            const u32 index = first + primitive * 4 + QUAD_MAP[i];
+            std::memcpy(dst_pointer, &index, sizeof(index));
+            dst_pointer += sizeof(index);
+        }
+    }
+
+    return index_offset;
+}
+
+/// Copies `count` quad indices of `index_size` bytes each from guest memory at `gpu_addr`,
+/// reordered so they form triangles. Returns the offset handed out by the buffer cache.
+GLintptr PrimitiveAssembler::MakeQuadIndexed(Tegra::GPUVAddr gpu_addr, std::size_t index_size,
+                                             u32 count) {
+    const std::size_t map_size{CalculateQuadSize(count)};
+    auto [dst_pointer, index_offset] = buffer_cache.ReserveMemory(map_size);
+
+    auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
+    const boost::optional<VAddr> cpu_addr{memory_manager.GpuToCpuAddress(gpu_addr)};
+    // An empty optional must not be dereferenced, so report unmapped index buffers explicitly.
+    ASSERT_MSG(cpu_addr, "Quad index buffer GPU address is not mapped");
+    const u8* source{Memory::GetPointer(*cpu_addr)};
+
+    for (u32 primitive = 0; primitive < count / 4; ++primitive) {
+        for (std::size_t i = 0; i < INDICES_PER_QUAD; ++i) {
+            const u32 index = primitive * 4 + QUAD_MAP[i];
+            const u8* src_offset = source + (index * index_size);
+
+            std::memcpy(dst_pointer, src_offset, index_size);
+            dst_pointer += index_size;
+        }
+    }
+
+    return index_offset;
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_primitive_assembler.h b/src/video_core/renderer_opengl/gl_primitive_assembler.h
new file mode 100644
index 000000000..a8cb88eb5
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_primitive_assembler.h
@@ -0,0 +1,39 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <vector>
+#include <glad/glad.h>
+
+#include "common/common_types.h"
+#include "video_core/memory_manager.h"
+
+namespace OpenGL {
+
+class OGLBufferCache;
+
+/// Builds triangle index lists for quad draws in memory reserved from an OGLBufferCache.
+class PrimitiveAssembler {
+public:
+    explicit PrimitiveAssembler(OGLBufferCache& buffer_cache);
+    ~PrimitiveAssembler();
+
+    /// Calculates the size required by MakeQuadArray and MakeQuadIndexed.
+    std::size_t CalculateQuadSize(u32 count) const;
+
+    /// Generates triangle indices for `count` non-indexed quad vertices starting at `first`.
+    /// Returns the offset of the generated indices as reported by the buffer cache.
+    GLintptr MakeQuadArray(u32 first, u32 count);
+
+    /// Reorders `count` quad indices of `index_size` bytes each, read from `gpu_addr`, into
+    /// triangles. Returns the offset of the reordered indices as reported by the buffer cache.
+    GLintptr MakeQuadIndexed(Tegra::GPUVAddr gpu_addr, std::size_t index_size, u32 count);
+
+private:
+    OGLBufferCache& buffer_cache;
+};
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 587d9dffb..209bdf181 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -42,6 +42,41 @@ MICROPROFILE_DEFINE(OpenGL_Framebuffer, "OpenGL", "Framebuffer Setup", MP_RGB(12
 MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
+MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255, 100, 100));
+
+struct DrawParameters {
+    GLenum primitive_mode;
+    GLsizei count;
+    GLint current_instance;
+    bool use_indexed;
+
+    GLint vertex_first;           // Read by array draws only
+    GLenum index_format;          // Read by indexed draws only
+    GLint base_vertex;            // Read by indexed draws only
+    GLintptr index_buffer_offset; // Read by indexed draws only
+
+    /// Issues the GL draw call; base-instance variants are used when current_instance > 0.
+    void DispatchDraw() const {
+        if (!use_indexed) {
+            if (current_instance > 0) {
+                glDrawArraysInstancedBaseInstance(primitive_mode, vertex_first, count, 1,
+                                                  current_instance);
+            } else {
+                glDrawArrays(primitive_mode, vertex_first, count);
+            }
+            return;
+        }
+        const auto index_buffer_ptr = reinterpret_cast<const void*>(index_buffer_offset);
+        if (current_instance > 0) {
+            glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, count, index_format,
+                                                          index_buffer_ptr, 1, base_vertex,
+                                                          current_instance);
+        } else {
+            glDrawElementsBaseVertex(primitive_mode, count, index_format, index_buffer_ptr,
+                                     base_vertex);
+        }
+    }
+};
 
 RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo& info)
     : emu_window{window}, screen_info{info}, buffer_cache(STREAM_BUFFER_SIZE) {
@@ -172,6 +207,54 @@ void RasterizerOpenGL::SetupVertexArrays() {
     }
 }
 
+/// Prepares index data for the current draw and gathers the host draw call arguments.
+DrawParameters RasterizerOpenGL::SetupDraw() {
+    const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    const auto& regs = gpu.regs;
+    const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
+
+    DrawParameters params{};
+    params.current_instance = gpu.state.current_instance;
+
+    if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
+        MICROPROFILE_SCOPE(OpenGL_PrimitiveAssembly);
+        // Quads are drawn as an indexed triangle list: 6 indices for every 4 quad vertices
+        params.use_indexed = true;
+        params.primitive_mode = GL_TRIANGLES;
+
+        if (is_indexed) {
+            params.index_format = MaxwellToGL::IndexFormat(regs.index_array.format);
+            params.count = (regs.index_array.count / 4) * 6;
+            params.index_buffer_offset = primitive_assembler.MakeQuadIndexed(
+                regs.index_array.IndexStart(), regs.index_array.FormatSizeInBytes(),
+                regs.index_array.count);
+            params.base_vertex = static_cast<GLint>(regs.vb_element_base);
+        } else {
+            // MakeQuadArray always generates u32 indexes and expects the quad vertex count
+            params.index_format = GL_UNSIGNED_INT;
+            params.count = (regs.vertex_buffer.count / 4) * 6;
+            params.index_buffer_offset = primitive_assembler.MakeQuadArray(
+                regs.vertex_buffer.first, regs.vertex_buffer.count);
+        }
+        return params;
+    }
+
+    params.use_indexed = is_indexed;
+    params.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology);
+    if (is_indexed) {
+        MICROPROFILE_SCOPE(OpenGL_Index);
+        params.index_format = MaxwellToGL::IndexFormat(regs.index_array.format);
+        params.count = regs.index_array.count;
+        params.index_buffer_offset =
+            buffer_cache.UploadMemory(regs.index_array.IndexStart(), CalculateIndexBufferSize());
+        params.base_vertex = static_cast<GLint>(regs.vb_element_base);
+    } else {
+        params.count = regs.vertex_buffer.count;
+        params.vertex_first = regs.vertex_buffer.first;
+    }
+    return params;
+}
+
 void RasterizerOpenGL::SetupShaders() {
     MICROPROFILE_SCOPE(OpenGL_Shader);
     const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
@@ -256,6 +339,13 @@ std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
     return size;
 }
 
+std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const {
+    const auto& index_array = Core::System::GetInstance().GPU().Maxwell3D().regs.index_array;
+    const auto index_width = static_cast<std::size_t>(index_array.FormatSizeInBytes());
+    // Bytes occupied by the indices of the current draw: index count times index width
+    return static_cast<std::size_t>(index_array.count) * index_width;
+}
+
 bool RasterizerOpenGL::AccelerateDrawBatch(bool is_indexed) {
     accelerate_draw = is_indexed ? AccelDraw::Indexed : AccelDraw::Arrays;
     DrawArrays();
@@ -459,16 +549,23 @@ void RasterizerOpenGL::DrawArrays() {
 
     // Draw the vertex batch
     const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
-    const u64 index_buffer_size{static_cast<u64>(regs.index_array.count) *
-                                static_cast<u64>(regs.index_array.FormatSizeInBytes())};
 
     state.draw.vertex_buffer = buffer_cache.GetHandle();
     state.Apply();
 
     std::size_t buffer_size = CalculateVertexArraysSize();
 
-    if (is_indexed) {
-        buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + index_buffer_size;
+    // Add space for index buffer (keeping in mind non-core primitives)
+    if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
+        // Quads are rebuilt from the index array when indexed, otherwise from the vertex range,
+        // so reserve room for whichever source SetupDraw is going to expand.
+        const u32 quad_vertices =
+            is_indexed ? regs.index_array.count : regs.vertex_buffer.count;
+        buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
+                      primitive_assembler.CalculateQuadSize(quad_vertices);
+    } else if (is_indexed) {
+        // Regular indexed draws upload the guest index buffer unchanged
+        buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + CalculateIndexBufferSize();
+    }
 
     // Uniform space for the 5 shader stages
@@ -482,20 +579,7 @@ void RasterizerOpenGL::DrawArrays() {
     buffer_cache.Map(buffer_size);
 
     SetupVertexArrays();
-
-    // If indexed mode, copy the index buffer
-    GLintptr index_buffer_offset = 0;
-    if (is_indexed) {
-        MICROPROFILE_SCOPE(OpenGL_Index);
-
-        // Adjust the index buffer offset so it points to the first desired index.
-        auto index_start = regs.index_array.StartAddress();
-        index_start += static_cast<size_t>(regs.index_array.first) *
-                       static_cast<size_t>(regs.index_array.FormatSizeInBytes());
-
-        index_buffer_offset = buffer_cache.UploadMemory(index_start, index_buffer_size);
-    }
-
+    DrawParameters params = SetupDraw();
     SetupShaders();
 
     buffer_cache.Unmap();
@@ -503,31 +587,8 @@ void RasterizerOpenGL::DrawArrays() {
     shader_program_manager->ApplyTo(state);
     state.Apply();
 
-    const GLenum primitive_mode{MaxwellToGL::PrimitiveTopology(regs.draw.topology)};
-    if (is_indexed) {
-        const GLint base_vertex{static_cast<GLint>(regs.vb_element_base)};
-
-        if (gpu.state.current_instance > 0) {
-            glDrawElementsInstancedBaseVertexBaseInstance(
-                primitive_mode, regs.index_array.count,
-                MaxwellToGL::IndexFormat(regs.index_array.format),
-                reinterpret_cast<const void*>(index_buffer_offset), 1, base_vertex,
-                gpu.state.current_instance);
-        } else {
-            glDrawElementsBaseVertex(primitive_mode, regs.index_array.count,
-                                     MaxwellToGL::IndexFormat(regs.index_array.format),
-                                     reinterpret_cast<const void*>(index_buffer_offset),
-                                     base_vertex);
-        }
-    } else {
-        if (gpu.state.current_instance > 0) {
-            glDrawArraysInstancedBaseInstance(primitive_mode, regs.vertex_buffer.first,
-                                              regs.vertex_buffer.count, 1,
-                                              gpu.state.current_instance);
-        } else {
-            glDrawArrays(primitive_mode, regs.vertex_buffer.first, regs.vertex_buffer.count);
-        }
-    }
+    // Execute draw call
+    params.DispatchDraw();
 
     // Disable scissor test
     state.scissor.enabled = false;
@@ -556,14 +617,10 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
     InvalidateRegion(addr, size);
 }
 
-bool RasterizerOpenGL::AccelerateDisplayTransfer(const void* config) {
+bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
+                                             const Tegra::Engines::Fermi2D::Regs::Surface& dst) {
     MICROPROFILE_SCOPE(OpenGL_Blits);
-    UNREACHABLE();
-    return true;
-}
-
-bool RasterizerOpenGL::AccelerateTextureCopy(const void* config) {
-    UNREACHABLE();
+    res_cache.FermiCopySurface(src, dst); // Unconditional: the copy is always reported as handled
     return true;
 }
 
@@ -601,10 +658,13 @@ void RasterizerOpenGL::SamplerInfo::Create() {
     sampler.Create();
     mag_filter = min_filter = Tegra::Texture::TextureFilter::Linear;
     wrap_u = wrap_v = wrap_p = Tegra::Texture::WrapMode::Wrap;
+    uses_depth_compare = false;
+    depth_compare_func = Tegra::Texture::DepthCompareFunc::Never;
 
     // default is GL_LINEAR_MIPMAP_LINEAR
     glSamplerParameteri(sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
     // Other attributes have correct defaults
+    glSamplerParameteri(sampler.handle, GL_TEXTURE_COMPARE_FUNC, GL_NEVER);
 }
 
 void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntry& config) {
@@ -632,6 +692,21 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr
         glSamplerParameteri(s, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(wrap_p));
     }
 
+    if (uses_depth_compare != (config.depth_compare_enabled == 1)) {
+        uses_depth_compare = (config.depth_compare_enabled == 1);
+        if (uses_depth_compare) {
+            glSamplerParameteri(s, GL_TEXTURE_COMPARE_MODE, GL_COMPARE_REF_TO_TEXTURE);
+        } else {
+            glSamplerParameteri(s, GL_TEXTURE_COMPARE_MODE, GL_NONE);
+        }
+    }
+
+    if (depth_compare_func != config.depth_compare_func) {
+        depth_compare_func = config.depth_compare_func;
+        glSamplerParameteri(s, GL_TEXTURE_COMPARE_FUNC,
+                            MaxwellToGL::DepthCompareFunc(depth_compare_func));
+    }
+
     if (wrap_u == Tegra::Texture::WrapMode::Border || wrap_v == Tegra::Texture::WrapMode::Border ||
         wrap_p == Tegra::Texture::WrapMode::Border) {
         const GLvec4 new_border_color = {{config.border_color_r, config.border_color_g,
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 4c8ecbd1c..0dab2018b 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -23,6 +23,7 @@
 #include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
+#include "video_core/renderer_opengl/gl_primitive_assembler.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
@@ -38,6 +39,7 @@ class EmuWindow;
 namespace OpenGL {
 
 struct ScreenInfo;
+struct DrawParameters;
 
 class RasterizerOpenGL : public VideoCore::RasterizerInterface {
 public:
@@ -50,8 +52,8 @@ public:
     void FlushRegion(VAddr addr, u64 size) override;
     void InvalidateRegion(VAddr addr, u64 size) override;
     void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
-    bool AccelerateDisplayTransfer(const void* config) override;
-    bool AccelerateTextureCopy(const void* config) override;
+    bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
+                               const Tegra::Engines::Fermi2D::Regs::Surface& dst) override;
     bool AccelerateFill(const void* config) override;
     bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
                            u32 pixel_stride) override;
@@ -94,6 +96,8 @@ private:
         Tegra::Texture::WrapMode wrap_u;
         Tegra::Texture::WrapMode wrap_v;
         Tegra::Texture::WrapMode wrap_p;
+        bool uses_depth_compare;
+        Tegra::Texture::DepthCompareFunc depth_compare_func;
         GLvec4 border_color;
     };
 
@@ -192,12 +196,17 @@ private:
     static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
     OGLBufferCache buffer_cache;
     OGLFramebuffer framebuffer;
+    PrimitiveAssembler primitive_assembler{buffer_cache};
     GLint uniform_buffer_alignment;
 
     std::size_t CalculateVertexArraysSize() const;
 
+    std::size_t CalculateIndexBufferSize() const;
+
     void SetupVertexArrays();
 
+    DrawParameters SetupDraw();
+
     void SetupShaders();
 
     enum class AccelDraw { Disabled, Arrays, Indexed };
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index ce967c4d6..56ff83eff 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -143,6 +143,28 @@ static VAddr TryGetCpuAddr(Tegra::GPUVAddr gpu_addr) {
     return params;
 }
 
+/// Builds the SurfaceParams describing one endpoint (source or destination) of a Fermi2D copy.
+/*static*/ SurfaceParams SurfaceParams::CreateForFermiCopySurface(
+    const Tegra::Engines::Fermi2D::Regs::Surface& config) {
+    SurfaceParams params{};
+    params.addr = TryGetCpuAddr(config.Address());
+    params.is_tiled = !config.linear;
+    params.block_height = params.is_tiled ? config.BlockHeight() : 0;
+    params.pixel_format = PixelFormatFromRenderTargetFormat(config.format);
+    params.component_type = ComponentTypeFromRenderTarget(config.format);
+    params.type = GetFormatType(params.pixel_format);
+    params.width = config.width;
+    params.height = config.height;
+    params.unaligned_height = config.height;
+    params.target = SurfaceTarget::Texture2D;
+    params.depth = 1;
+    params.size_in_bytes_total = params.SizeInBytesTotal();
+    params.size_in_bytes_2d = params.SizeInBytes2D();
+    params.max_mip_level = 0;
+    params.rt = {};
+    return params;
+}
+
 static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_format_tuples = {{
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U
     {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false},                     // ABGR8S
@@ -559,6 +581,18 @@ static bool BlitSurface(const Surface& src_surface, const Surface& dst_surface,
     return true;
 }
 
+static void FastCopySurface(const Surface& src_surface, const Surface& dst_surface) {
+    const auto& source{src_surface->GetSurfaceParams()};
+    const auto& dest{dst_surface->GetSurfaceParams()};
+    // Copy only the area both surfaces have in common, from level 0 at the origin.
+    const u32 copy_width{std::min(source.width, dest.width)};
+    const u32 copy_height{std::min(source.height, dest.height)};
+
+    glCopyImageSubData(src_surface->Texture().handle, SurfaceTargetToGL(source.target), 0, 0, 0, 0,
+                       dst_surface->Texture().handle, SurfaceTargetToGL(dest.target), 0, 0, 0, 0,
+                       copy_width, copy_height, 1);
+}
+
 static void CopySurface(const Surface& src_surface, const Surface& dst_surface,
                         GLuint copy_pbo_handle, GLenum src_attachment = 0,
                         GLenum dst_attachment = 0, std::size_t cubemap_face = 0) {
@@ -1033,6 +1067,26 @@ Surface RasterizerCacheOpenGL::GetUncachedSurface(const SurfaceParams& params) {
     return surface;
 }
 
+void RasterizerCacheOpenGL::FermiCopySurface(
+    const Tegra::Engines::Fermi2D::Regs::Surface& src_config,
+    const Tegra::Engines::Fermi2D::Regs::Surface& dst_config) {
+    // Describe both copy endpoints in terms of surface cache parameters
+    const auto& src_params = SurfaceParams::CreateForFermiCopySurface(src_config);
+    const auto& dst_params = SurfaceParams::CreateForFermiCopySurface(dst_config);
+    // Only copies between identically configured surfaces are expected on this path
+    ASSERT(src_params.width == dst_params.width);
+    ASSERT(src_params.height == dst_params.height);
+    ASSERT(src_params.pixel_format == dst_params.pixel_format);
+    ASSERT(src_params.block_height == dst_params.block_height);
+    ASSERT(src_params.is_tiled == dst_params.is_tiled);
+    ASSERT(src_params.depth == dst_params.depth);
+    ASSERT(src_params.depth == 1); // Currently, FastCopySurface only works with 2D surfaces
+    ASSERT(src_params.target == dst_params.target);
+    ASSERT(src_params.rt.index == dst_params.rt.index);
+
+    FastCopySurface(GetSurface(src_params, true), GetSurface(dst_params, false));
+}
+
 Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface,
                                                const SurfaceParams& new_params) {
     // Verify surface is compatible for blitting
@@ -1041,6 +1095,15 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface,
     // Get a new surface with the new parameters, and blit the previous surface to it
     Surface new_surface{GetUncachedSurface(new_params)};
 
+    // For compatible surfaces, we can just do fast glCopyImageSubData based copy
+    if (old_params.target == new_params.target && old_params.type == new_params.type &&
+        old_params.depth == new_params.depth && old_params.depth == 1 &&
+        SurfaceParams::GetFormatBpp(old_params.pixel_format) ==
+            SurfaceParams::GetFormatBpp(new_params.pixel_format)) {
+        FastCopySurface(old_surface, new_surface);
+        return new_surface;
+    }
+
     // If the format is the same, just do a framebuffer blit. This is significantly faster than
     // using PBOs. The is also likely less accurate, as textures will be converted rather than
     // reinterpreted. When use_accurate_framebuffers setting is enabled, perform a more accurate
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 49025a3fe..0b4940b3c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -13,6 +13,7 @@
 #include "common/common_types.h"
 #include "common/hash.h"
 #include "common/math_util.h"
+#include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
@@ -719,6 +720,10 @@ struct SurfaceParams {
                                               Tegra::GPUVAddr zeta_address,
                                               Tegra::DepthFormat format);
 
+    /// Creates SurfaceParams for one side (source or destination) of a Fermi2D surface copy
+    static SurfaceParams CreateForFermiCopySurface(
+        const Tegra::Engines::Fermi2D::Regs::Surface& config);
+
     /// Checks if surfaces are compatible for caching
     bool IsCompatibleSurface(const SurfaceParams& other) const {
         return std::tie(pixel_format, type, width, height, target, depth) ==
@@ -837,6 +842,10 @@ public:
     /// Tries to find a framebuffer using on the provided CPU address
     Surface TryFindFramebufferSurface(VAddr addr) const;
 
+    /// Fermi2D copy from src to dst; both surfaces must match in size, format and layout
+    void FermiCopySurface(const Tegra::Engines::Fermi2D::Regs::Surface& src_config,
+                          const Tegra::Engines::Fermi2D::Regs::Surface& dst_config);
+
 private:
     void LoadSurface(const Surface& surface);
     Surface GetSurface(const SurfaceParams& params, bool preserve_contents = true);
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 579a78702..7e57de78a 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -508,7 +508,7 @@ public:
     /// Returns the GLSL sampler used for the input shader sampler, and creates a new one if
     /// necessary.
     std::string AccessSampler(const Sampler& sampler, Tegra::Shader::TextureType type,
-                              bool is_array) {
+                              bool is_array, bool is_shadow) {
         const std::size_t offset = static_cast<std::size_t>(sampler.index.Value());
 
         // If this sampler has already been used, return the existing mapping.
@@ -517,13 +517,14 @@ public:
                          [&](const SamplerEntry& entry) { return entry.GetOffset() == offset; });
 
         if (itr != used_samplers.end()) {
-            ASSERT(itr->GetType() == type && itr->IsArray() == is_array);
+            ASSERT(itr->GetType() == type && itr->IsArray() == is_array &&
+                   itr->IsShadow() == is_shadow);
             return itr->GetName();
         }
 
         // Otherwise create a new mapping for this sampler
         const std::size_t next_index = used_samplers.size();
-        const SamplerEntry entry{stage, offset, next_index, type, is_array};
+        const SamplerEntry entry{stage, offset, next_index, type, is_array, is_shadow};
         used_samplers.emplace_back(entry);
         return entry.GetName();
     }
@@ -747,8 +748,9 @@ private:
     }
 
     /// Generates code representing a texture sampler.
-    std::string GetSampler(const Sampler& sampler, Tegra::Shader::TextureType type, bool is_array) {
-        return regs.AccessSampler(sampler, type, is_array);
+    std::string GetSampler(const Sampler& sampler, Tegra::Shader::TextureType type, bool is_array,
+                           bool is_shadow) {
+        return regs.AccessSampler(sampler, type, is_array, is_shadow);
     }
 
     /**
@@ -1002,6 +1004,24 @@ private:
         shader.AddLine('}');
     }
 
+    /// Returns how many coordinate components a texture of the given type is sampled with:
+    /// 1 for Texture1D, 2 for Texture2D, 3 for TextureCube. Any other type is logged as
+    /// critical, hits UNREACHABLE() and produces 0.
+    static u32 TextureCoordinates(Tegra::Shader::TextureType texture_type) {
+        switch (texture_type) {
+        case Tegra::Shader::TextureType::Texture1D:
+            return 1;
+        case Tegra::Shader::TextureType::Texture2D:
+            return 2;
+        case Tegra::Shader::TextureType::TextureCube:
+            return 3;
+        default:
+            LOG_CRITICAL(HW_GPU, "Unhandled texture type {}", static_cast<u32>(texture_type));
+            UNREACHABLE();
+            return 0;
+        }
+    }
+
     /*
      * Emits code to push the input target address to the SSY address stack, incrementing the stack
      * top.
@@ -1896,24 +1916,35 @@ private:
                            "NODEP is not implemented");
                 ASSERT_MSG(!instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI),
                            "AOFFI is not implemented");
-                ASSERT_MSG(!instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC),
-                           "DC is not implemented");
 
-                switch (texture_type) {
-                case Tegra::Shader::TextureType::Texture1D: {
+                const bool depth_compare =
+                    instr.tex.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC);
+                u32 num_coordinates = TextureCoordinates(texture_type);
+                if (depth_compare)
+                    num_coordinates += 1;
+
+                switch (num_coordinates) {
+                case 1: {
                     const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
                     coord = "float coords = " + x + ';';
                     break;
                 }
-                case Tegra::Shader::TextureType::Texture2D: {
+                case 2: {
                     const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
                     const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
                     coord = "vec2 coords = vec2(" + x + ", " + y + ");";
                     break;
                 }
+                case 3: {
+                    const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    const std::string z = regs.GetRegisterAsFloat(instr.gpr20);
+                    coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");";
+                    break;
+                }
                 default:
-                    LOG_CRITICAL(HW_GPU, "Unhandled texture type {}",
-                                 static_cast<u32>(texture_type));
+                    LOG_CRITICAL(HW_GPU, "Unhandled coordinates number {}",
+                                 static_cast<u32>(num_coordinates));
                     UNREACHABLE();
 
                     // Fallback to interpreting as a 2D texture for now
@@ -1924,9 +1955,10 @@ private:
                 }
                 // TODO: make sure coordinates are always indexed to gpr8 and gpr20 is always bias
                 // or lod.
-                const std::string op_c = regs.GetRegisterAsFloat(instr.gpr20);
+                std::string op_c;
 
-                const std::string sampler = GetSampler(instr.sampler, texture_type, false);
+                const std::string sampler =
+                    GetSampler(instr.sampler, texture_type, false, depth_compare);
                 // Add an extra scope and declare the texture coords inside to prevent
                 // overwriting them in case they are used as outputs of the texs instruction.
 
@@ -1935,7 +1967,7 @@ private:
                 shader.AddLine(coord);
                 std::string texture;
 
-                switch (instr.tex.process_mode) {
+                switch (instr.tex.GetTextureProcessMode()) {
                 case Tegra::Shader::TextureProcessMode::None: {
                     texture = "texture(" + sampler + ", coords)";
                     break;
@@ -1946,12 +1978,22 @@ private:
                 }
                 case Tegra::Shader::TextureProcessMode::LB:
                 case Tegra::Shader::TextureProcessMode::LBA: {
+                    if (num_coordinates <= 2) {
+                        op_c = regs.GetRegisterAsFloat(instr.gpr20);
+                    } else {
+                        op_c = regs.GetRegisterAsFloat(instr.gpr20.Value() + 1);
+                    }
                     // TODO: Figure if A suffix changes the equation at all.
                     texture = "texture(" + sampler + ", coords, " + op_c + ')';
                     break;
                 }
                 case Tegra::Shader::TextureProcessMode::LL:
                 case Tegra::Shader::TextureProcessMode::LLA: {
+                    if (num_coordinates <= 2) {
+                        op_c = regs.GetRegisterAsFloat(instr.gpr20);
+                    } else {
+                        op_c = regs.GetRegisterAsFloat(instr.gpr20.Value() + 1);
+                    }
                     // TODO: Figure if A suffix changes the equation at all.
                     texture = "textureLod(" + sampler + ", coords, " + op_c + ')';
                     break;
@@ -1959,18 +2001,22 @@ private:
                 default: {
                     texture = "texture(" + sampler + ", coords)";
                     LOG_CRITICAL(HW_GPU, "Unhandled texture process mode {}",
-                                 static_cast<u32>(instr.tex.process_mode.Value()));
+                                 static_cast<u32>(instr.tex.GetTextureProcessMode()));
                     UNREACHABLE();
                 }
                 }
-                std::size_t dest_elem{};
-                for (std::size_t elem = 0; elem < 4; ++elem) {
-                    if (!instr.tex.IsComponentEnabled(elem)) {
-                        // Skip disabled components
-                        continue;
+                if (!depth_compare) {
+                    std::size_t dest_elem{};
+                    for (std::size_t elem = 0; elem < 4; ++elem) {
+                        if (!instr.tex.IsComponentEnabled(elem)) {
+                            // Skip disabled components
+                            continue;
+                        }
+                        regs.SetRegisterToFloat(instr.gpr0, elem, texture, 1, 4, false, dest_elem);
+                        ++dest_elem;
                     }
-                    regs.SetRegisterToFloat(instr.gpr0, elem, texture, 1, 4, false, dest_elem);
-                    ++dest_elem;
+                } else {
+                    regs.SetRegisterToFloat(instr.gpr0, 0, texture, 1, 1, false);
                 }
                 --shader.scope;
                 shader.AddLine("}");
@@ -1983,11 +2029,15 @@ private:
 
                 ASSERT_MSG(!instr.texs.UsesMiscMode(Tegra::Shader::TextureMiscMode::NODEP),
                            "NODEP is not implemented");
-                ASSERT_MSG(!instr.texs.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC),
-                           "DC is not implemented");
 
-                switch (texture_type) {
-                case Tegra::Shader::TextureType::Texture2D: {
+                const bool depth_compare =
+                    instr.texs.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC);
+                u32 num_coordinates = TextureCoordinates(texture_type);
+                if (depth_compare)
+                    num_coordinates += 1;
+
+                switch (num_coordinates) {
+                case 2: {
                     if (is_array) {
                         const std::string index = regs.GetRegisterAsInteger(instr.gpr8);
                         const std::string x = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
@@ -2000,17 +2050,25 @@ private:
                     }
                     break;
                 }
-                case Tegra::Shader::TextureType::TextureCube: {
-                    ASSERT_MSG(!is_array, "Unimplemented");
-                    std::string x = regs.GetRegisterAsFloat(instr.gpr8);
-                    std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
-                    std::string z = regs.GetRegisterAsFloat(instr.gpr20);
-                    coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");";
+                case 3: {
+                    if (is_array) {
+                        UNIMPLEMENTED_MSG("3-coordinate arrays not fully implemented");
+                        const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                        const std::string y = regs.GetRegisterAsFloat(instr.gpr20);
+                        coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                        texture_type = Tegra::Shader::TextureType::Texture2D;
+                        is_array = false;
+                    } else {
+                        const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                        const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                        const std::string z = regs.GetRegisterAsFloat(instr.gpr20);
+                        coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");";
+                    }
                     break;
                 }
                 default:
-                    LOG_CRITICAL(HW_GPU, "Unhandled texture type {}",
-                                 static_cast<u32>(texture_type));
+                    LOG_CRITICAL(HW_GPU, "Unhandled coordinates number {}",
+                                 static_cast<u32>(num_coordinates));
                     UNREACHABLE();
 
                     // Fallback to interpreting as a 2D texture for now
@@ -2020,9 +2078,35 @@ private:
                     texture_type = Tegra::Shader::TextureType::Texture2D;
                     is_array = false;
                 }
-                const std::string sampler = GetSampler(instr.sampler, texture_type, is_array);
-                const std::string texture = "texture(" + sampler + ", coords)";
-                WriteTexsInstruction(instr, coord, texture);
+                const std::string sampler =
+                    GetSampler(instr.sampler, texture_type, is_array, depth_compare);
+                std::string texture;
+                switch (instr.texs.GetTextureProcessMode()) {
+                case Tegra::Shader::TextureProcessMode::None: {
+                    texture = "texture(" + sampler + ", coords)";
+                    break;
+                }
+                case Tegra::Shader::TextureProcessMode::LZ: {
+                    texture = "textureLod(" + sampler + ", coords, 0.0)";
+                    break;
+                }
+                case Tegra::Shader::TextureProcessMode::LL: {
+                    const std::string op_c = regs.GetRegisterAsFloat(instr.gpr20.Value() + 1);
+                    texture = "textureLod(" + sampler + ", coords, " + op_c + ')';
+                    break;
+                }
+                default: {
+                    texture = "texture(" + sampler + ", coords)";
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture process mode {}",
+                                 static_cast<u32>(instr.texs.GetTextureProcessMode()));
+                    UNREACHABLE();
+                }
+                }
+                if (!depth_compare) {
+                    WriteTexsInstruction(instr, coord, texture);
+                } else {
+                    WriteTexsInstruction(instr, coord, "vec4(" + texture + ')');
+                }
                 break;
             }
             case OpCode::Id::TLDS: {
@@ -2062,9 +2146,26 @@ private:
                                  static_cast<u32>(texture_type));
                     UNREACHABLE();
                 }
-
-                const std::string sampler = GetSampler(instr.sampler, texture_type, is_array);
-                const std::string texture = "texelFetch(" + sampler + ", coords, 0)";
+                const std::string sampler =
+                    GetSampler(instr.sampler, texture_type, is_array, false);
+                std::string texture = "texelFetch(" + sampler + ", coords, 0)";
+                const std::string op_c = regs.GetRegisterAsInteger(instr.gpr20.Value() + 1);
+                switch (instr.tlds.GetTextureProcessMode()) {
+                case Tegra::Shader::TextureProcessMode::LZ: {
+                    texture = "texelFetch(" + sampler + ", coords, 0)";
+                    break;
+                }
+                case Tegra::Shader::TextureProcessMode::LL: {
+                    texture = "texelFetch(" + sampler + ", coords, " + op_c + ')';
+                    break;
+                }
+                default: {
+                    texture = "texelFetch(" + sampler + ", coords, 0)";
+                    LOG_CRITICAL(HW_GPU, "Unhandled texture process mode {}",
+                                 static_cast<u32>(instr.tlds.GetTextureProcessMode()));
+                    UNREACHABLE();
+                }
+                }
                 WriteTexsInstruction(instr, coord, texture);
                 break;
             }
@@ -2077,28 +2178,43 @@ private:
                            "NODEP is not implemented");
                 ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI),
                            "AOFFI is not implemented");
-                ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC),
-                           "DC is not implemented");
                 ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::NDV),
                            "NDV is not implemented");
                 ASSERT_MSG(!instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::PTP),
                            "PTP is not implemented");
-
-                switch (instr.tld4.texture_type) {
-                case Tegra::Shader::TextureType::Texture2D: {
+                const bool depth_compare =
+                    instr.tld4.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC);
+                auto texture_type = instr.tld4.texture_type.Value();
+                u32 num_coordinates = TextureCoordinates(texture_type);
+                if (depth_compare)
+                    num_coordinates += 1;
+
+                switch (num_coordinates) {
+                case 2: {
                     const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
                     const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
                     coord = "vec2 coords = vec2(" + x + ", " + y + ");";
                     break;
                 }
+                case 3: {
+                    const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    const std::string z = regs.GetRegisterAsFloat(instr.gpr8.Value() + 2);
+                    coord = "vec3 coords = vec3(" + x + ", " + y + ", " + z + ");";
+                    break;
+                }
                 default:
-                    LOG_CRITICAL(HW_GPU, "Unhandled texture type {}",
-                                 static_cast<u32>(instr.tld4.texture_type.Value()));
+                    LOG_CRITICAL(HW_GPU, "Unhandled coordinates number {}",
+                                 static_cast<u32>(num_coordinates));
                     UNREACHABLE();
+                    const std::string x = regs.GetRegisterAsFloat(instr.gpr8);
+                    const std::string y = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    coord = "vec2 coords = vec2(" + x + ", " + y + ");";
+                    texture_type = Tegra::Shader::TextureType::Texture2D;
                 }
 
                 const std::string sampler =
-                    GetSampler(instr.sampler, instr.tld4.texture_type, false);
+                    GetSampler(instr.sampler, texture_type, false, depth_compare);
                 // Add an extra scope and declare the texture coords inside to prevent
                 // overwriting them in case they are used as outputs of the texs instruction.
                 shader.AddLine("{");
@@ -2106,15 +2222,18 @@ private:
                 shader.AddLine(coord);
                 const std::string texture = "textureGather(" + sampler + ", coords, " +
                                             std::to_string(instr.tld4.component) + ')';
-
-                std::size_t dest_elem{};
-                for (std::size_t elem = 0; elem < 4; ++elem) {
-                    if (!instr.tex.IsComponentEnabled(elem)) {
-                        // Skip disabled components
-                        continue;
+                if (!depth_compare) {
+                    std::size_t dest_elem{};
+                    for (std::size_t elem = 0; elem < 4; ++elem) {
+                        if (!instr.tex.IsComponentEnabled(elem)) {
+                            // Skip disabled components
+                            continue;
+                        }
+                        regs.SetRegisterToFloat(instr.gpr0, elem, texture, 1, 4, false, dest_elem);
+                        ++dest_elem;
                     }
-                    regs.SetRegisterToFloat(instr.gpr0, elem, texture, 1, 4, false, dest_elem);
-                    ++dest_elem;
+                } else {
+                    regs.SetRegisterToFloat(instr.gpr0, 0, texture, 1, 1, false);
                 }
                 --shader.scope;
                 shader.AddLine("}");
@@ -2125,18 +2244,30 @@ private:
                            "NODEP is not implemented");
                 ASSERT_MSG(!instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::AOFFI),
                            "AOFFI is not implemented");
-                ASSERT_MSG(!instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC),
-                           "DC is not implemented");
 
+                const bool depth_compare =
+                    instr.tld4s.UsesMiscMode(Tegra::Shader::TextureMiscMode::DC);
                 const std::string op_a = regs.GetRegisterAsFloat(instr.gpr8);
                 const std::string op_b = regs.GetRegisterAsFloat(instr.gpr20);
                 // TODO(Subv): Figure out how the sampler type is encoded in the TLD4S instruction.
-                const std::string sampler =
-                    GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false);
-                const std::string coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
+                const std::string sampler = GetSampler(
+                    instr.sampler, Tegra::Shader::TextureType::Texture2D, false, depth_compare);
+                std::string coord;
+                if (!depth_compare) {
+                    coord = "vec2 coords = vec2(" + op_a + ", " + op_b + ");";
+                } else {
+                    // Note: TLD4S coordinate encoding works just like TEXS's
+                    const std::string op_c = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
+                    coord = "vec3 coords = vec3(" + op_a + ", " + op_c + ", " + op_b + ");";
+                }
                 const std::string texture = "textureGather(" + sampler + ", coords, " +
                                             std::to_string(instr.tld4s.component) + ')';
-                WriteTexsInstruction(instr, coord, texture);
+
+                if (!depth_compare) {
+                    WriteTexsInstruction(instr, coord, texture);
+                } else {
+                    WriteTexsInstruction(instr, coord, "vec4(" + texture + ')');
+                }
                 break;
             }
             case OpCode::Id::TXQ: {
@@ -2147,7 +2278,7 @@ private:
                 // Sadly, not all texture instructions specify the type of texture their sampler
                 // uses. This must be fixed at a later instance.
                 const std::string sampler =
-                    GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false);
+                    GetSampler(instr.sampler, Tegra::Shader::TextureType::Texture2D, false, false);
                 switch (instr.txq.query_type) {
                 case Tegra::Shader::TextureQueryType::Dimension: {
                     const std::string texture = "textureQueryLevels(" + sampler + ')';
@@ -2172,7 +2303,8 @@ private:
                 const std::string op_b = regs.GetRegisterAsFloat(instr.gpr8.Value() + 1);
                 const bool is_array = instr.tmml.array != 0;
                 auto texture_type = instr.tmml.texture_type.Value();
-                const std::string sampler = GetSampler(instr.sampler, texture_type, is_array);
+                const std::string sampler =
+                    GetSampler(instr.sampler, texture_type, is_array, false);
 
                 // TODO: add coordinates for different samplers once other texture types are
                 // implemented.
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index d53b93ad5..e56f39e78 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -75,8 +75,9 @@ class SamplerEntry {
 
 public:
     SamplerEntry(Maxwell::ShaderStage stage, std::size_t offset, std::size_t index,
-                 Tegra::Shader::TextureType type, bool is_array)
-        : offset(offset), stage(stage), sampler_index(index), type(type), is_array(is_array) {}
+                 Tegra::Shader::TextureType type, bool is_array, bool is_shadow)
+        : offset(offset), stage(stage), sampler_index(index), type(type), is_array(is_array),
+          is_shadow(is_shadow) {}
 
     std::size_t GetOffset() const {
         return offset;
@@ -117,6 +118,8 @@ public:
         }
         if (is_array)
             glsl_type += "Array";
+        if (is_shadow)
+            glsl_type += "Shadow";
         return glsl_type;
     }
 
@@ -128,6 +131,10 @@ public:
         return is_array;
     }
 
+    bool IsShadow() const {
+        return is_shadow;
+    }
+
     u32 GetHash() const {
         return (static_cast<u32>(stage) << 16) | static_cast<u32>(sampler_index);
     }
@@ -147,7 +154,8 @@ private:
     Maxwell::ShaderStage stage;      ///< Shader stage where this sampler was used.
     std::size_t sampler_index;       ///< Value used to index into the generated GLSL sampler array.
     Tegra::Shader::TextureType type; ///< The type used to sample this texture (Texture2D, etc)
-    bool is_array; ///< Whether the texture is being sampled as an array texture or not.
+    bool is_array;  ///< Whether the texture is being sampled as an array texture or not.
+    bool is_shadow; ///< Whether the texture is being sampled as a depth texture or not.
 };
 
 struct ShaderEntries {
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 67273e164..3c3bcaae4 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -159,6 +159,31 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
     return {};
 }
 
+inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
+    switch (func) { // Cases are listed in the enum's declaration order
+    case Tegra::Texture::DepthCompareFunc::Never:
+        return GL_NEVER;
+    case Tegra::Texture::DepthCompareFunc::Less:
+        return GL_LESS;
+    case Tegra::Texture::DepthCompareFunc::Equal:
+        return GL_EQUAL;
+    case Tegra::Texture::DepthCompareFunc::LessEqual:
+        return GL_LEQUAL;
+    case Tegra::Texture::DepthCompareFunc::Greater:
+        return GL_GREATER;
+    case Tegra::Texture::DepthCompareFunc::NotEqual:
+        return GL_NOTEQUAL;
+    case Tegra::Texture::DepthCompareFunc::GreaterEqual:
+        return GL_GEQUAL;
+    case Tegra::Texture::DepthCompareFunc::Always:
+        return GL_ALWAYS;
+    }
+    LOG_CRITICAL(Render_OpenGL, "Unimplemented texture depth compare function ={}",
+                 static_cast<u32>(func));
+    UNREACHABLE();
+    return {};
+}
+
 inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
     switch (equation) {
     case Maxwell::Blend::Equation::Add:
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index 14aea4838..8f31d825a 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -227,6 +227,17 @@ enum class WrapMode : u32 {
     MirrorOnceClampOGL = 7,
 };
 
+enum class DepthCompareFunc : u32 { // Stored in the 3-bit TSCEntry::depth_compare_func field
+    Never = 0,
+    Less = 1,
+    Equal = 2,
+    LessEqual = 3,
+    Greater = 4,
+    NotEqual = 5,
+    GreaterEqual = 6,
+    Always = 7,
+};
+
 enum class TextureFilter : u32 {
     Nearest = 1,
     Linear = 2,
@@ -244,7 +255,7 @@ struct TSCEntry {
         BitField<3, 3, WrapMode> wrap_v;
         BitField<6, 3, WrapMode> wrap_p;
         BitField<9, 1, u32> depth_compare_enabled;
-        BitField<10, 3, u32> depth_compare_func;
+        BitField<10, 3, DepthCompareFunc> depth_compare_func;
     };
     union {
         BitField<0, 2, TextureFilter> mag_filter;