14 files changed, 310 insertions, 218 deletions
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
index 1e96b0310..eb5158407 100644
--- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
@@ -281,14 +281,14 @@ private:
 
     template <const std::string_view& op>
     std::string Unary(Operation operation) {
-        const std::string temporary = AllocTemporary();
+        std::string temporary = AllocTemporary();
         AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]));
         return temporary;
     }
 
     template <const std::string_view& op>
     std::string Binary(Operation operation) {
-        const std::string temporary = AllocTemporary();
+        std::string temporary = AllocTemporary();
         AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
                 Visit(operation[1]));
         return temporary;
@@ -296,7 +296,7 @@ private:
 
     template <const std::string_view& op>
     std::string Trinary(Operation operation) {
-        const std::string temporary = AllocTemporary();
+        std::string temporary = AllocTemporary();
         AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
                 Visit(operation[1]), Visit(operation[2]));
         return temporary;
@@ -304,7 +304,7 @@ private:
 
     template <const std::string_view& op, bool unordered>
     std::string FloatComparison(Operation operation) {
-        const std::string temporary = AllocTemporary();
+        std::string temporary = AllocTemporary();
         AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation));
         AddLine("MOV.S {}, 0;", temporary);
         AddLine("MOV.S {} (NE.x), -1;", temporary);
@@ -331,7 +331,7 @@ private:
 
     template <const std::string_view& op, bool is_nan>
     std::string HalfComparison(Operation operation) {
-        const std::string tmp1 = AllocVectorTemporary();
+        std::string tmp1 = AllocVectorTemporary();
         const std::string tmp2 = AllocVectorTemporary();
         const std::string op_a = Visit(operation[0]);
         const std::string op_b = Visit(operation[1]);
@@ -367,15 +367,14 @@ private:
             AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i]));
         }
 
-        const std::string result = coord;
-        AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, result, value, coord,
+        AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, coord, value, coord,
                 image_id, ImageType(meta.image.type));
-        return fmt::format("{}.x", result);
+        return fmt::format("{}.x", coord);
     }
 
     template <const std::string_view& op, const std::string_view& type>
     std::string Atomic(Operation operation) {
-        const std::string temporary = AllocTemporary();
+        std::string temporary = AllocTemporary();
         std::string address;
         std::string_view opname;
         if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) {
@@ -396,7 +395,7 @@ private:
 
     template <char type>
     std::string Negate(Operation operation) {
-        const std::string temporary = AllocTemporary();
+        std::string temporary = AllocTemporary();
         if constexpr (type == 'F') {
             AddLine("MOV.F32 {}, -{};", temporary, Visit(operation[0]));
         } else {
@@ -407,7 +406,7 @@ private:
 
     template <char type>
     std::string Absolute(Operation operation) {
-        const std::string temporary = AllocTemporary();
+        std::string temporary = AllocTemporary();
         AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0]));
         return temporary;
     }
@@ -1156,20 +1155,20 @@ void ARBDecompiler::VisitAST(const ASTNode& node) {
 }
 
 std::string ARBDecompiler::VisitExpression(const Expr& node) {
-    const std::string result = AllocTemporary();
     if (const auto expr = std::get_if<ExprAnd>(&*node)) {
+        std::string result = AllocTemporary();
         AddLine("AND.U {}, {}, {};", result, VisitExpression(expr->operand1),
                 VisitExpression(expr->operand2));
         return result;
     }
     if (const auto expr = std::get_if<ExprOr>(&*node)) {
-        const std::string result = AllocTemporary();
+        std::string result = AllocTemporary();
         AddLine("OR.U {}, {}, {};", result, VisitExpression(expr->operand1),
                 VisitExpression(expr->operand2));
         return result;
     }
     if (const auto expr = std::get_if<ExprNot>(&*node)) {
-        const std::string result = AllocTemporary();
+        std::string result = AllocTemporary();
         AddLine("CMP.S {}, {}, 0, -1;", result, VisitExpression(expr->operand1));
         return result;
     }
@@ -1186,7 +1185,7 @@ std::string ARBDecompiler::VisitExpression(const Expr& node) {
         return expr->value ? "0xffffffff" : "0";
     }
     if (const auto expr = std::get_if<ExprGprEqual>(&*node)) {
-        const std::string result = AllocTemporary();
+        std::string result = AllocTemporary();
         AddLine("SEQ.U {}, R{}.x, {};", result, expr->gpr, expr->value);
         return result;
     }
@@ -1231,13 +1230,13 @@ std::string ARBDecompiler::Visit(const Node& node) {
     }
 
     if (const auto immediate = std::get_if<ImmediateNode>(&*node)) {
-        const std::string temporary = AllocTemporary();
+        std::string temporary = AllocTemporary();
         AddLine("MOV.U {}, {};", temporary, immediate->GetValue());
         return temporary;
     }
 
     if (const auto predicate = std::get_if<PredicateNode>(&*node)) {
-        const std::string temporary = AllocTemporary();
+        std::string temporary = AllocTemporary();
         switch (const auto index = predicate->GetIndex(); index) {
         case Tegra::Shader::Pred::UnusedIndex:
             AddLine("MOV.S {}, -1;", temporary);
@@ -1333,13 +1332,13 @@ std::string ARBDecompiler::Visit(const Node& node) {
         } else {
             offset_string = Visit(offset);
         }
-        const std::string temporary = AllocTemporary();
+        std::string temporary = AllocTemporary();
         AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string);
         return temporary;
     }
 
     if (const auto gmem = std::get_if<GmemNode>(&*node)) {
-        const std::string temporary = AllocTemporary();
+        std::string temporary = AllocTemporary();
         AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
                 Visit(gmem->GetBaseAddress()));
         AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()),
@@ -1348,14 +1347,14 @@ std::string ARBDecompiler::Visit(const Node& node) {
     }
 
     if (const auto lmem = std::get_if<LmemNode>(&*node)) {
-        const std::string temporary = Visit(lmem->GetAddress());
+        std::string temporary = Visit(lmem->GetAddress());
         AddLine("SHR.U {}, {}, 2;", temporary, temporary);
         AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary);
         return temporary;
     }
 
     if (const auto smem = std::get_if<SmemNode>(&*node)) {
-        const std::string temporary = Visit(smem->GetAddress());
+        std::string temporary = Visit(smem->GetAddress());
         AddLine("LDS.U32 {}, shared_mem[{}];", temporary, temporary);
         return temporary;
     }
@@ -1535,7 +1534,7 @@ std::string ARBDecompiler::Assign(Operation operation) {
 }
 
 std::string ARBDecompiler::Select(Operation operation) {
-    const std::string temporary = AllocTemporary();
+    std::string temporary = AllocTemporary();
     AddLine("CMP.S {}, {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]),
             Visit(operation[2]));
     return temporary;
@@ -1545,12 +1544,12 @@ std::string ARBDecompiler::FClamp(Operation operation) {
     // 1.0f in hex, replace with std::bit_cast on C++20
     static constexpr u32 POSITIVE_ONE = 0x3f800000;
 
-    const std::string temporary = AllocTemporary();
+    std::string temporary = AllocTemporary();
     const Node& value = operation[0];
     const Node& low = operation[1];
     const Node& high = operation[2];
-    const auto imm_low = std::get_if<ImmediateNode>(&*low);
-    const auto imm_high = std::get_if<ImmediateNode>(&*high);
+    const auto* const imm_low = std::get_if<ImmediateNode>(&*low);
+    const auto* const imm_high = std::get_if<ImmediateNode>(&*high);
     if (imm_low && imm_high && imm_low->GetValue() == 0 && imm_high->GetValue() == POSITIVE_ONE) {
         AddLine("MOV.F32.SAT {}, {};", temporary, Visit(value));
     } else {
@@ -1574,7 +1573,7 @@ std::string ARBDecompiler::FCastHalf1(Operation operation) {
 }
 
 std::string ARBDecompiler::FSqrt(Operation operation) {
-    const std::string temporary = AllocTemporary();
+    std::string temporary = AllocTemporary();
     AddLine("RSQ.F32 {}, {};", temporary, Visit(operation[0]));
     AddLine("RCP.F32 {}, {};", temporary, temporary);
     return temporary;
@@ -1588,7 +1587,7 @@ std::string ARBDecompiler::FSwizzleAdd(Operation operation) {
         AddLine("ADD.F {}.x, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]));
         return fmt::format("{}.x", temporary);
     }
-    const std::string lut = AllocVectorTemporary();
+
     AddLine("AND.U {}.z, {}.threadid, 3;", temporary, StageInputName(stage));
     AddLine("SHL.U {}.z, {}.z, 1;", temporary, temporary);
     AddLine("SHR.U {}.z, {}, {}.z;", temporary, Visit(operation[2]), temporary);
@@ -1766,21 +1765,21 @@ std::string ARBDecompiler::LogicalAssign(Operation operation) {
 }
 
 std::string ARBDecompiler::LogicalPick2(Operation operation) {
-    const std::string temporary = AllocTemporary();
+    std::string temporary = AllocTemporary();
     const u32 index = std::get<ImmediateNode>(*operation[1]).GetValue();
     AddLine("MOV.U {}, {}.{};", temporary, Visit(operation[0]), Swizzle(index));
     return temporary;
 }
 
 std::string ARBDecompiler::LogicalAnd2(Operation operation) {
-    const std::string temporary = AllocTemporary();
+    std::string temporary = AllocTemporary();
     const std::string op = Visit(operation[0]);
     AddLine("AND.U {}, {}.x, {}.y;", temporary, op, op);
     return temporary;
 }
 
 std::string ARBDecompiler::FloatOrdered(Operation operation) {
-    const std::string temporary = AllocTemporary();
+    std::string temporary = AllocTemporary();
     AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
     AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
     AddLine("MOV.S {}, -1;", temporary);
@@ -1790,7 +1789,7 @@ std::string ARBDecompiler::FloatOrdered(Operation operation) {
 }
 
 std::string ARBDecompiler::FloatUnordered(Operation operation) {
-    const std::string temporary = AllocTemporary();
+    std::string temporary = AllocTemporary();
     AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
     AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
     AddLine("MOV.S {}, 0;", temporary);
@@ -1800,7 +1799,7 @@ std::string ARBDecompiler::FloatUnordered(Operation operation) {
 }
 
 std::string ARBDecompiler::LogicalAddCarry(Operation operation) {
-    const std::string temporary = AllocTemporary();
+    std::string temporary = AllocTemporary();
     AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1]));
     AddLine("MOV.S {}, 0;", temporary);
     AddLine("IF CF.x;");
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index ad0577a4f..d9f7b4cc6 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -22,21 +22,46 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
 
-Buffer::Buffer(VAddr cpu_addr, const std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} {
+Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
+    : VideoCommon::BufferBlock{cpu_addr, size} {
     gl_buffer.Create();
     glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+    }
 }
 
 Buffer::~Buffer() = default;
 
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
+    glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
+                         data);
+}
+
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+    glGetNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
+                            data);
+}
+
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) const {
+    glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
+                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+}
+
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
-                               const Device& device, std::size_t stream_size)
-    : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
+                               const Device& device_, std::size_t stream_size)
+    : GenericBufferCache{rasterizer, system,
+                         std::make_unique<OGLStreamBuffer>(device_, stream_size, true)},
+      device{device_} {
     if (!device.HasFastBufferSubData()) {
         return;
     }
 
-    static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+    static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
     glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
     for (const GLuint cbuf : cbufs) {
         glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
@@ -48,39 +73,20 @@ OGLBufferCache::~OGLBufferCache() {
 }
 
 std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<Buffer>(cpu_addr, size);
+    return std::make_shared<Buffer>(device, cpu_addr, size);
 }
 
-GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
-    return 0;
-}
-
-void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                     const u8* data) {
-    glNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
-                         static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                       u8* data) {
-    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
-    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
-    glGetNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
-                            static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                               std::size_t dst_offset, std::size_t size) {
-    glCopyNamedBufferSubData(src.Handle(), dst.Handle(), static_cast<GLintptr>(src_offset),
-                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
+    return {0, 0, 0};
 }
 
 OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
                                                              std::size_t size) {
     DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
     const GLuint cbuf = cbufs[cbuf_cursor++];
+
     glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
-    return {cbuf, 0};
+    return {cbuf, 0, 0};
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index a49aaf9c4..59d95adbc 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -25,15 +25,27 @@ class RasterizerOpenGL;
 
 class Buffer : public VideoCommon::BufferBlock {
 public:
-    explicit Buffer(VAddr cpu_addr, const std::size_t size);
+    explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
     ~Buffer();
 
-    GLuint Handle() const {
+    void Upload(std::size_t offset, std::size_t size, const u8* data) const;
+
+    void Download(std::size_t offset, std::size_t size, u8* data) const;
+
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t size) const;
+
+    GLuint Handle() const noexcept {
         return gl_buffer.handle;
     }
 
+    u64 Address() const noexcept {
+        return gpu_address;
+    }
+
 private:
     OGLBuffer gl_buffer;
+    u64 gpu_address = 0;
 };
 
 using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
@@ -43,7 +55,7 @@ public:
                             const Device& device, std::size_t stream_size);
     ~OGLBufferCache();
 
-    GLuint GetEmptyBuffer(std::size_t) override;
+    BufferInfo GetEmptyBuffer(std::size_t) override;
 
     void Acquire() noexcept {
         cbuf_cursor = 0;
@@ -52,22 +64,16 @@ public:
 protected:
     std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
-    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                         const u8* data) override;
-
-    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                           u8* data) override;
-
-    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                   std::size_t dst_offset, std::size_t size) override;
-
     BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
 
 private:
+    static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+                                             Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+
+    const Device& device;
+
     std::size_t cbuf_cursor = 0;
-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
-                           Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
-        cbufs;
+    std::array<GLuint, NUM_CBUFS> cbufs{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index e245e27ec..b6b6659c1 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -123,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
     u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
     u32 base_images = 0;
 
-    // Reserve more image bindings on fragment and vertex stages.
+    // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8.
+    // Because that limit can be this low, reserve at least 4 image bindings on the
+    // fragment stage, and at least 1 for each of the remaining stages.
+    // So far, games have been observed to use 1 image binding on vertex and 4 on fragment stages.
+
+    // Reserve at least 4 image bindings on the fragment stage.
     bindings[4].image =
-        Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]);
-    bindings[0].image =
-        Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]);
+        Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]);
+
+    // The per-stage share for the remaining stages is guaranteed to be at least 1.
+    const u32 total_extracted_images = num_images / (NumStages - 1);
 
     // Reserve the other image bindings.
-    const u32 total_extracted_images = num_images / (NumStages - 2);
-    for (std::size_t i = 2; i < NumStages; ++i) {
+    for (std::size_t i = 0; i < NumStages; ++i) {
         const std::size_t stage = stage_swizzle[i];
+        if (stage == 4) {
+            continue;
+        }
         bindings[stage].image =
             Extract(base_images, num_images, total_extracted_images, LimitImages[stage]);
     }
@@ -170,7 +178,7 @@ bool IsASTCSupported() {
         for (const GLenum format : formats) {
             for (const GLenum support : required_support) {
                 GLint value;
-                glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value);
+                glGetInternalformativ(target, format, support, 1, &value);
                 if (value != GL_FULL_SUPPORT) {
                     return false;
                 }
@@ -180,16 +188,32 @@ bool IsASTCSupported() {
     return true;
 }
 
+/// @brief Returns true when the GL_RENDERER string contains the name of a known Turing GPU
+/// @param renderer GL_RENDERER string
+bool IsTuring(std::string_view renderer) {
+    static constexpr std::array<std::string_view, 12> TURING_GPUS = {
+        "GTX 1650",        "GTX 1660",        "RTX 2060",        "RTX 2070",
+        "RTX 2080",        "TITAN RTX",       "Quadro RTX 3000", "Quadro RTX 4000",
+        "Quadro RTX 5000", "Quadro RTX 6000", "Quadro RTX 8000", "Tesla T4",
+    };
+    return std::any_of(TURING_GPUS.begin(), TURING_GPUS.end(),
+                       [renderer](std::string_view candidate) {
+                           return renderer.find(candidate) != std::string_view::npos;
+                       });
+}
+
 } // Anonymous namespace
 
 Device::Device()
     : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
     const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+    const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
     const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
     const std::vector extensions = GetExtensions();
 
     const bool is_nvidia = vendor == "NVIDIA Corporation";
     const bool is_amd = vendor == "ATI Technologies Inc.";
+    const bool is_turing = is_nvidia && IsTuring(renderer);
 
     bool disable_fast_buffer_sub_data = false;
     if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
@@ -208,12 +232,21 @@ Device::Device()
     has_shader_ballot = GLAD_GL_ARB_shader_ballot;
     has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
     has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted");
+    has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod");
     has_astc = IsASTCSupported();
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = is_amd;
     has_precise_bug = TestPreciseBug();
-    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
     has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
+
+    // At the time of writing, only Nvidia's driver optimizes BufferSubData on exclusive
+    // uniform buffers as "push constants".
+    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+
+    // Nvidia's driver on Turing GPUs randomly crashes when the buffer is made resident, or on
+    // DeleteBuffers. Disable unified memory on these devices.
+    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory && !is_turing;
+
     use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
                            GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
                            GLAD_GL_NV_transform_feedback2;
@@ -237,6 +270,7 @@ Device::Device(std::nullptr_t) {
     has_shader_ballot = true;
     has_vertex_viewport_layer = true;
     has_image_load_formatted = true;
+    has_texture_shadow_lod = true;
     has_variable_aoffi = true;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 145347943..e1d811966 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -68,6 +68,14 @@ public:
         return has_image_load_formatted;
     }
 
+    bool HasTextureShadowLod() const {
+        return has_texture_shadow_lod;
+    }
+
+    bool HasVertexBufferUnifiedMemory() const {
+        return has_vertex_buffer_unified_memory;
+    }
+
     bool HasASTC() const {
         return has_astc;
     }
@@ -110,6 +118,8 @@ private:
     bool has_shader_ballot{};
     bool has_vertex_viewport_layer{};
     bool has_image_load_formatted{};
+    bool has_texture_shadow_lod{};
+    bool has_vertex_buffer_unified_memory{};
     bool has_astc{};
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 2d6c11320..362457ffe 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -61,7 +61,8 @@ constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
 constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
     NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
 
-constexpr std::size_t NumSupportedVertexAttributes = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
 
 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
@@ -193,7 +194,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
     // avoid OpenGL errors.
     // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
     // assume every shader uses them all.
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexFormat0 + index]) {
             continue;
         }
@@ -231,9 +232,11 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
     MICROPROFILE_SCOPE(OpenGL_VB);
 
+    const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
+
     // Upload all guest vertex arrays sequentially to our buffer
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
         if (!flags[Dirty::VertexBuffer0 + index]) {
             continue;
         }
@@ -246,16 +249,25 @@ void RasterizerOpenGL::SetupVertexBuffer() {
 
         const GPUVAddr start = vertex_array.StartAddress();
         const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-
         ASSERT(end >= start);
+
+        const GLuint gl_index = static_cast<GLuint>(index);
         const u64 size = end - start;
         if (size == 0) {
-            glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride);
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            if (use_unified_memory) {
+                glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
+            }
             continue;
         }
-        const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
-        glBindVertexBuffer(static_cast<GLuint>(index), vertex_buffer, vertex_buffer_offset,
-                           vertex_array.stride);
+        const auto info = buffer_cache.UploadMemory(start, size);
+        if (use_unified_memory) {
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
+                                   info.address + info.offset, size);
+        } else {
+            glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
+        }
     }
 }
 
@@ -268,7 +280,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
     flags[Dirty::VertexInstances] = false;
 
     const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
         if (!flags[Dirty::VertexInstance0 + index]) {
             continue;
         }
@@ -285,9 +297,9 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
     MICROPROFILE_SCOPE(OpenGL_Index);
     const auto& regs = system.GPU().Maxwell3D().regs;
     const std::size_t size = CalculateIndexBufferSize();
-    const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer);
-    return offset;
+    const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
+    return info.offset;
 }
 
 void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
@@ -643,9 +655,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     if (!device.UseAssemblyShaders()) {
         MaxwellUniformData ubo;
         ubo.SetFromRegs(gpu);
-        const auto [buffer, offset] =
+        const auto info =
             buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
-        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
+        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
                           static_cast<GLsizeiptr>(sizeof(ubo)));
     }
 
@@ -956,8 +968,7 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
         if (device.UseAssemblyShaders()) {
             glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
         } else {
-            glBindBufferRange(GL_UNIFORM_BUFFER, binding,
-                              buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
+            glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
         }
         return;
     }
@@ -970,24 +981,25 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
 
     const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
     const GPUVAddr gpu_addr = buffer.address;
-    auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+    auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
 
     if (device.UseAssemblyShaders()) {
         UNIMPLEMENTED_IF(use_unified);
-        if (offset != 0) {
+        if (info.offset != 0) {
             const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
-            glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
-            cbuf = staging_cbuf;
-            offset = 0;
+            glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
+            info.handle = staging_cbuf;
+            info.offset = 0;
         }
-        glBindBufferRangeNV(stage, binding, cbuf, offset, size);
+        glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
         return;
     }
 
     if (use_unified) {
-        glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
+        glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
+                                 unified_offset, size);
     } else {
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
     }
 }
 
@@ -1023,9 +1035,8 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
 void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
                                          GPUVAddr gpu_addr, std::size_t size) {
     const auto alignment{device.GetShaderStorageBufferAlignment()};
-    const auto [ssbo, buffer_offset] =
-        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
-    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, ssbo, buffer_offset,
+    const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
+    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
                       static_cast<GLsizeiptr>(size));
 }
 
@@ -1712,8 +1723,9 @@ void RasterizerOpenGL::EndTransformFeedback() {
         const GLuint handle = transform_feedback_buffers[index].handle;
         const GPUVAddr gpu_addr = binding.Address();
         const std::size_t size = binding.buffer_size;
-        const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
-        glCopyNamedBufferSubData(handle, dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+        const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+        glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
+                                 static_cast<GLsizeiptr>(size));
     }
 }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 46e780a06..c6a3bf3a1 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -460,8 +460,9 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
         const u8* host_ptr_b = memory_manager.GetPointer(address_b);
         code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false);
     }
+    const std::size_t code_size = code.size() * sizeof(u64);
 
-    const auto unique_identifier = GetUniqueIdentifier(
+    const u64 unique_identifier = GetUniqueIdentifier(
         GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);
 
     const ShaderParameters params{system,    disk_cache, device,
@@ -477,7 +478,7 @@ Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
 
     Shader* const result = shader.get();
     if (cpu_addr) {
-        Register(std::move(shader), *cpu_addr, code.size() * sizeof(u64));
+        Register(std::move(shader), *cpu_addr, code_size);
     } else {
         null_shader = std::move(shader);
     }
@@ -495,8 +496,9 @@ Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
 
     const auto host_ptr{memory_manager.GetPointer(code_addr)};
     // No kernel found, create a new one
-    auto code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
-    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
+    ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
+    const std::size_t code_size{code.size() * sizeof(u64)};
+    const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
 
     const ShaderParameters params{system,    disk_cache, device,
                                   *cpu_addr, host_ptr,   unique_identifier};
@@ -511,7 +513,7 @@ Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
 
     Shader* const result = kernel.get();
     if (cpu_addr) {
-        Register(std::move(kernel), *cpu_addr, code.size() * sizeof(u64));
+        Register(std::move(kernel), *cpu_addr, code_size);
     } else {
         null_kernel = std::move(kernel);
     }
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 6848f1388..994aaeaf2 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -37,7 +37,6 @@ namespace OpenGL {
 
 class Device;
 class RasterizerOpenGL;
-struct UnspecializedShader;
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index d6e30b321..2c49aeaac 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode;
 using Tegra::Shader::IpaSampleMode;
 using Tegra::Shader::PixelImap;
 using Tegra::Shader::Register;
+using Tegra::Shader::TextureType;
 using VideoCommon::Shader::BuildTransformFeedback;
 using VideoCommon::Shader::Registry;
 
@@ -526,6 +527,9 @@ private:
         if (device.HasImageLoadFormatted()) {
             code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");
         }
+        if (device.HasTextureShadowLod()) {
+            code.AddLine("#extension GL_EXT_texture_shadow_lod : require");
+        }
         if (device.HasWarpIntrinsics()) {
             code.AddLine("#extension GL_NV_gpu_shader5 : require");
             code.AddLine("#extension GL_NV_shader_thread_group : require");
@@ -909,13 +913,13 @@ private:
                     return "samplerBuffer";
                 }
                 switch (sampler.type) {
-                case Tegra::Shader::TextureType::Texture1D:
+                case TextureType::Texture1D:
                     return "sampler1D";
-                case Tegra::Shader::TextureType::Texture2D:
+                case TextureType::Texture2D:
                     return "sampler2D";
-                case Tegra::Shader::TextureType::Texture3D:
+                case TextureType::Texture3D:
                     return "sampler3D";
-                case Tegra::Shader::TextureType::TextureCube:
+                case TextureType::TextureCube:
                     return "samplerCube";
                 default:
                     UNREACHABLE();
@@ -1380,8 +1384,19 @@ private:
         const std::size_t count = operation.GetOperandsCount();
         const bool has_array = meta->sampler.is_array;
         const bool has_shadow = meta->sampler.is_shadow;
+        // Without GL_EXT_texture_shadow_lod, Lod on 2D array/cube shadow samplers uses textureGrad
+        const bool workaround_lod_array_shadow_as_grad =
+            !device.HasTextureShadowLod() && function_suffix == "Lod" && has_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && has_array) ||
+             meta->sampler.type == TextureType::TextureCube);
+        std::string expr = "texture";
+
+        if (workaround_lod_array_shadow_as_grad) {
+            expr += "Grad";
+        } else {
+            expr += function_suffix;
+        }
 
-        std::string expr = "texture" + function_suffix;
         if (!meta->aoffi.empty()) {
             expr += "Offset";
         } else if (!meta->ptp.empty()) {
@@ -1415,6 +1430,16 @@ private:
             expr += ')';
         }
 
+        if (workaround_lod_array_shadow_as_grad) {
+            switch (meta->sampler.type) {
+            case TextureType::Texture2D:
+                return expr + ", vec2(0.0), vec2(0.0))";
+            case TextureType::TextureCube:
+                return expr + ", vec3(0.0), vec3(0.0))";
+            }
+            UNREACHABLE();
+        }
+
         for (const auto& variant : extras) {
             if (const auto argument = std::get_if<TextureArgument>(&variant)) {
                 expr += GenerateTextureArgument(*argument);
@@ -2041,8 +2066,19 @@ private:
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
 
-        std::string expr = GenerateTexture(
-            operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        std::string expr{};
+
+        if (!device.HasTextureShadowLod() && meta->sampler.is_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+             meta->sampler.type == TextureType::TextureCube)) {
+            LOG_ERROR(Render_OpenGL,
+                      "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround");
+            expr = GenerateTexture(operation, "Lod", {});
+        } else {
+            expr = GenerateTexture(operation, "Lod",
+                                   {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        }
+
         if (meta->sampler.is_shadow) {
             expr = "vec4(" + expr + ')';
         }
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 932a2f69e..3655ff629 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -2,11 +2,13 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <deque>
+#include <tuple>
 #include <vector>
+
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
 MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
@@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
 
 namespace OpenGL {
 
-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent,
-                                 bool use_persistent)
+OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage)
     : buffer_size(size) {
     gl_buffer.Create();
 
@@ -29,23 +30,19 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p
         allocate_size *= 2;
     }
 
-    if (use_persistent) {
-        persistent = true;
-        coherent = prefer_coherent;
-        const GLbitfield flags =
-            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
-        glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
-        mapped_ptr = static_cast<u8*>(glMapNamedBufferRange(
-            gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
-    } else {
-        glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW);
+    static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
+    glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
+    mapped_ptr = static_cast<u8*>(
+        glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
+
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
     }
 }
 
 OGLStreamBuffer::~OGLStreamBuffer() {
-    if (persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
-    }
+    glUnmapNamedBuffer(gl_buffer.handle);
     gl_buffer.Release();
 }
 
@@ -60,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
 
     bool invalidate = false;
     if (buffer_pos + size > buffer_size) {
+        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
+        glInvalidateBufferData(gl_buffer.handle);
+
         buffer_pos = 0;
         invalidate = true;
-
-        if (persistent) {
-            glUnmapNamedBuffer(gl_buffer.handle);
-        }
     }
 
-    if (invalidate || !persistent) {
-        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
-        GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
-                           (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
-                           (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
-        mapped_ptr = static_cast<u8*>(
-            glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags));
-        mapped_offset = buffer_pos;
-    }
-
-    return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
+    return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate);
 }
 
 void OGLStreamBuffer::Unmap(GLsizeiptr size) {
     ASSERT(size <= mapped_size);
 
-    if (!coherent && size > 0) {
-        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size);
-    }
-
-    if (!persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
+    if (size > 0) {
+        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
     }
 
     buffer_pos += size;
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index 866da3594..307a67113 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -11,10 +11,11 @@
 
 namespace OpenGL {
 
+class Device;
+
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false,
-                             bool use_persistent = true);
+    explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage);
     ~OGLStreamBuffer();
 
     /*
@@ -33,19 +34,21 @@ public:
         return gl_buffer.handle;
     }
 
-    GLsizeiptr Size() const {
+    /// GPU address of the buffer; remains zero unless unified vertex buffer memory is used
+    u64 Address() const noexcept {
+        return gpu_address;
+    }
+
+    GLsizeiptr Size() const noexcept {
         return buffer_size;
     }
 
 private:
     OGLBuffer gl_buffer;
 
-    bool coherent = false;
-    bool persistent = false;
-
+    GLuint64EXT gpu_address = 0;
     GLintptr buffer_pos = 0;
     GLsizeiptr buffer_size = 0;
-    GLintptr mapped_offset = 0;
     GLsizeiptr mapped_size = 0;
     u8* mapped_ptr = nullptr;
 };
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 994ae98eb..35e329240 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -46,10 +46,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_UNSIGNED_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_UNSIGNED_INT_2_10_10_10_REV;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
+        break;
     case Maxwell::VertexAttribute::Type::SignedInt:
     case Maxwell::VertexAttribute::Type::SignedNorm:
         switch (attrib.size) {
@@ -70,10 +68,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_INT_2_10_10_10_REV;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
+        break;
     case Maxwell::VertexAttribute::Type::Float:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_16:
@@ -86,10 +82,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         case Maxwell::VertexAttribute::Size::Size_32_32_32:
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return GL_FLOAT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
+        break;
     case Maxwell::VertexAttribute::Type::UnsignedScaled:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
@@ -102,10 +96,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
             return GL_UNSIGNED_SHORT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
+        break;
     case Maxwell::VertexAttribute::Type::SignedScaled:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
@@ -118,14 +110,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
             return GL_SHORT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
-    default:
-        LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
-        return {};
+        break;
     }
+    UNIMPLEMENTED_MSG("Unimplemented vertex type={} and size={}", attrib.TypeString(),
+                      attrib.SizeString());
+    return {};
 }
 
 inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
@@ -137,8 +127,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
     case Maxwell::IndexFormat::UnsignedInt:
         return GL_UNSIGNED_INT;
     }
-    LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format));
-    UNREACHABLE();
+    UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format));
     return {};
 }
 
@@ -180,33 +169,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
 }
 
 inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
-                                Tegra::Texture::TextureMipmapFilter mip_filter_mode) {
+                                Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) {
     switch (filter_mode) {
-    case Tegra::Texture::TextureFilter::Linear: {
-        switch (mip_filter_mode) {
+    case Tegra::Texture::TextureFilter::Nearest:
+        switch (mipmap_filter_mode) {
         case Tegra::Texture::TextureMipmapFilter::None:
-            return GL_LINEAR;
+            return GL_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_LINEAR_MIPMAP_NEAREST;
+            return GL_NEAREST_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_LINEAR_MIPMAP_LINEAR;
+            return GL_NEAREST_MIPMAP_LINEAR;
         }
         break;
-    }
-    case Tegra::Texture::TextureFilter::Nearest: {
-        switch (mip_filter_mode) {
+    case Tegra::Texture::TextureFilter::Linear:
+        switch (mipmap_filter_mode) {
         case Tegra::Texture::TextureMipmapFilter::None:
-            return GL_NEAREST;
+            return GL_LINEAR;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_NEAREST_MIPMAP_NEAREST;
+            return GL_LINEAR_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_NEAREST_MIPMAP_LINEAR;
+            return GL_LINEAR_MIPMAP_LINEAR;
         }
         break;
     }
-    }
-    LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode));
-    return GL_LINEAR;
+    UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}",
+                    static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode));
+    return GL_NEAREST;
 }
 
 inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
@@ -229,10 +217,9 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         } else {
             return GL_MIRROR_CLAMP_TO_EDGE;
         }
-    default:
-        LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
-        return GL_REPEAT;
     }
+    UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
+    return GL_REPEAT;
 }
 
 inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
@@ -254,8 +241,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
     case Tegra::Texture::DepthCompareFunc::Always:
         return GL_ALWAYS;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}",
-              static_cast<u32>(func));
+    UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func));
     return GL_GREATER;
 }
 
@@ -277,7 +263,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
     case Maxwell::Blend::Equation::MaxGL:
         return GL_MAX;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation));
+    UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation));
     return GL_FUNC_ADD;
 }
 
@@ -341,7 +327,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
     case Maxwell::Blend::Factor::OneMinusConstantAlphaGL:
         return GL_ONE_MINUS_CONSTANT_ALPHA;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor));
+    UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor));
     return GL_ZERO;
 }
 
@@ -361,7 +347,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) {
     case Tegra::Texture::SwizzleSource::OneFloat:
         return GL_ONE;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source));
+    UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source));
     return GL_ZERO;
 }
 
@@ -392,7 +378,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) {
     case Maxwell::ComparisonOp::AlwaysOld:
         return GL_ALWAYS;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison));
+    UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison));
     return GL_ALWAYS;
 }
 
@@ -423,7 +409,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) {
     case Maxwell::StencilOp::DecrWrapOGL:
         return GL_DECR_WRAP;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil));
+    UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil));
     return GL_KEEP;
 }
 
@@ -434,7 +420,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) {
     case Maxwell::FrontFace::CounterClockWise:
         return GL_CCW;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face));
+    UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast<u32>(front_face));
     return GL_CCW;
 }
 
@@ -447,7 +433,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) {
     case Maxwell::CullFace::FrontAndBack:
         return GL_FRONT_AND_BACK;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face));
+    UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face));
     return GL_BACK;
 }
 
@@ -486,7 +472,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) {
     case Maxwell::LogicOperation::Set:
         return GL_SET;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation));
+    UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation));
     return GL_COPY;
 }
 
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 6214fcbc3..c40adb6e7 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -488,6 +488,15 @@ void RendererOpenGL::InitOpenGLObjects() {
 
     // Clear screen to black
     LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture);
+
+    // Enable unified vertex attributes and query vertex buffer address when the driver supports it
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+
+        glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
+                                         &vertex_buffer_address);
+    }
 }
 
 void RendererOpenGL::AddTelemetryFields() {
@@ -656,7 +665,13 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
                          offsetof(ScreenRectVertex, tex_coord));
     glVertexAttribBinding(PositionLocation, 0);
     glVertexAttribBinding(TexCoordLocation, 0);
-    glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex));
+        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address,
+                               sizeof(vertices));
+    } else {
+        glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+    }
 
     glBindTextureUnit(0, screen_info.display_texture);
     glBindSampler(0, 0);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 61bf507f4..8b18d32e6 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -107,6 +107,9 @@ private:
     OGLPipeline pipeline;
     OGLFramebuffer screenshot_framebuffer;
 
+    /// GPU address of vertex_buffer, queried when vertex buffer unified memory is supported
+    GLuint64EXT vertex_buffer_address = 0;
+
     /// Display information for Switch screen
     ScreenInfo screen_info;