47 files changed, 1479 insertions, 621 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 6839abe71..7c18c27b3 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_library(video_core STATIC
+    buffer_cache.h
     dma_pusher.cpp
     dma_pusher.h
     debug_utils/debug_utils.cpp
@@ -43,8 +44,6 @@ add_library(video_core STATIC
     renderer_opengl/gl_device.h
     renderer_opengl/gl_framebuffer_cache.cpp
     renderer_opengl/gl_framebuffer_cache.h
-    renderer_opengl/gl_global_cache.cpp
-    renderer_opengl/gl_global_cache.h
     renderer_opengl/gl_rasterizer.cpp
     renderer_opengl/gl_rasterizer.h
     renderer_opengl/gl_resource_manager.cpp
@@ -103,6 +102,8 @@ add_library(video_core STATIC
     shader/decode/video.cpp
     shader/decode/xmad.cpp
     shader/decode/other.cpp
+    shader/control_flow.cpp
+    shader/control_flow.h
     shader/decode.cpp
     shader/node_helper.cpp
     shader/node_helper.h
diff --git a/src/video_core/buffer_cache.h b/src/video_core/buffer_cache.h
new file mode 100644
index 000000000..6f868b8b4
--- /dev/null
+++ b/src/video_core/buffer_cache.h
@@ -0,0 +1,299 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "common/alignment.h"
+#include "common/common_types.h"
+#include "core/core.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_cache.h"
+
+namespace VideoCore {
+class RasterizerInterface;
+}
+
+namespace VideoCommon {
+
+template <typename BufferStorageType>
+class CachedBuffer final : public RasterizerCacheObject {
+public:
+    explicit CachedBuffer(VAddr cpu_addr, u8* host_ptr)
+        : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr} {}
+    ~CachedBuffer() override = default;
+
+    VAddr GetCpuAddr() const override {
+        return cpu_addr;
+    }
+
+    std::size_t GetSizeInBytes() const override {
+        return size;
+    }
+
+    u8* GetWritableHostPtr() const {
+        return host_ptr;
+    }
+
+    std::size_t GetSize() const {
+        return size;
+    }
+
+    std::size_t GetCapacity() const {
+        return capacity;
+    }
+
+    bool IsInternalized() const {
+        return is_internal;
+    }
+
+    const BufferStorageType& GetBuffer() const {
+        return buffer;
+    }
+
+    void SetSize(std::size_t new_size) {
+        size = new_size;
+    }
+
+    void SetInternalState(bool is_internal_) {
+        is_internal = is_internal_;
+    }
+
+    BufferStorageType ExchangeBuffer(BufferStorageType buffer_, std::size_t new_capacity) {
+        capacity = new_capacity;
+        std::swap(buffer, buffer_);
+        return buffer_;
+    }
+
+private:
+    u8* host_ptr{};
+    VAddr cpu_addr{};
+    std::size_t size{};
+    std::size_t capacity{};
+    bool is_internal{};
+    BufferStorageType buffer;
+};
+
+template <typename BufferStorageType, typename BufferType, typename StreamBuffer>
+class BufferCache : public RasterizerCache<std::shared_ptr<CachedBuffer<BufferStorageType>>> {
+public:
+    using Buffer = std::shared_ptr<CachedBuffer<BufferStorageType>>;
+    using BufferInfo = std::pair<const BufferType*, u64>;
+
+    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
+                         std::unique_ptr<StreamBuffer> stream_buffer)
+        : RasterizerCache<Buffer>{rasterizer}, system{system},
+          stream_buffer{std::move(stream_buffer)}, stream_buffer_handle{
+                                                       this->stream_buffer->GetHandle()} {}
+    ~BufferCache() = default;
+
+    void Unregister(const Buffer& entry) override {
+        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
+        if (entry->IsInternalized()) {
+            internalized_entries.erase(entry->GetCacheAddr());
+        }
+        ReserveBuffer(entry);
+        RasterizerCache<Buffer>::Unregister(entry);
+    }
+
+    void TickFrame() {
+        marked_for_destruction_index =
+            (marked_for_destruction_index + 1) % marked_for_destruction_ring_buffer.size();
+        MarkedForDestruction().clear();
+    }
+
+    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
+                            bool internalize = false, bool is_written = false) {
+        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
+
+        auto& memory_manager = system.GPU().MemoryManager();
+        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
+        if (!host_ptr) {
+            return {GetEmptyBuffer(size), 0};
+        }
+        const auto cache_addr = ToCacheAddr(host_ptr);
+
+        // Cache management is a big overhead, so only cache entries with a given size.
+        // TODO: Figure out which size is the best for given games.
+        constexpr std::size_t max_stream_size = 0x800;
+        if (!internalize && size < max_stream_size &&
+            internalized_entries.find(cache_addr) == internalized_entries.end()) {
+            return StreamBufferUpload(host_ptr, size, alignment);
+        }
+
+        auto entry = RasterizerCache<Buffer>::TryGet(cache_addr);
+        if (!entry) {
+            return FixedBufferUpload(gpu_addr, host_ptr, size, internalize, is_written);
+        }
+
+        if (entry->GetSize() < size) {
+            IncreaseBufferSize(entry, size);
+        }
+        if (is_written) {
+            entry->MarkAsModified(true, *this);
+        }
+        return {ToHandle(entry->GetBuffer()), 0};
+    }
+
+    /// Uploads from host memory. Returns the stream buffer handle and the upload's offset.
+    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
+                                std::size_t alignment = 4) {
+        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
+        return StreamBufferUpload(raw_pointer, size, alignment);
+    }
+
+    void Map(std::size_t max_size) {
+        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
+        buffer_offset = buffer_offset_base;
+    }
+
+    /// Finishes the upload stream, returns true on bindings invalidation.
+    bool Unmap() {
+        stream_buffer->Unmap(buffer_offset - buffer_offset_base);
+        return std::exchange(invalidated, false);
+    }
+
+    virtual const BufferType* GetEmptyBuffer(std::size_t size) = 0;
+
+protected:
+    void FlushObjectInner(const Buffer& entry) override {
+        DownloadBufferData(entry->GetBuffer(), 0, entry->GetSize(), entry->GetWritableHostPtr());
+    }
+
+    virtual BufferStorageType CreateBuffer(std::size_t size) = 0;
+
+    virtual const BufferType* ToHandle(const BufferStorageType& storage) = 0;
+
+    virtual void UploadBufferData(const BufferStorageType& buffer, std::size_t offset,
+                                  std::size_t size, const u8* data) = 0;
+
+    virtual void DownloadBufferData(const BufferStorageType& buffer, std::size_t offset,
+                                    std::size_t size, u8* data) = 0;
+
+    virtual void CopyBufferData(const BufferStorageType& src, const BufferStorageType& dst,
+                                std::size_t src_offset, std::size_t dst_offset,
+                                std::size_t size) = 0;
+
+private:
+    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
+                                  std::size_t alignment) {
+        AlignBuffer(alignment);
+        const std::size_t uploaded_offset = buffer_offset;
+        std::memcpy(buffer_ptr, raw_pointer, size);
+
+        buffer_ptr += size;
+        buffer_offset += size;
+        return {&stream_buffer_handle, uploaded_offset};
+    }
+
+    BufferInfo FixedBufferUpload(GPUVAddr gpu_addr, u8* host_ptr, std::size_t size,
+                                 bool internalize, bool is_written) {
+        // Resolve through the injected system (as UploadMemory does), not the global instance.
+        const auto cpu_addr = system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
+        ASSERT(cpu_addr);
+        // Registers a cache entry for this range, reusing a reserved one when available.
+        auto entry = GetUncachedBuffer(*cpu_addr, host_ptr);
+        entry->SetSize(size);
+        entry->SetInternalState(internalize);
+        RasterizerCache<Buffer>::Register(entry);
+
+        if (internalize) {
+            internalized_entries.emplace(ToCacheAddr(host_ptr));
+        }
+        if (is_written) {
+            entry->MarkAsModified(true, *this);
+        }
+        // Storage that is too small is swapped out and kept until TickFrame() recycles its slot.
+        if (entry->GetCapacity() < size) {
+            MarkedForDestruction().push_back(entry->ExchangeBuffer(CreateBuffer(size), size));
+        }
+        // Fill the buffer from guest memory; the entry is always bound at offset 0.
+        UploadBufferData(entry->GetBuffer(), 0, size, host_ptr);
+        return {ToHandle(entry->GetBuffer()), 0};
+    }
+
+    void IncreaseBufferSize(Buffer& entry, std::size_t new_size) {
+        // Grows the entry to new_size, keeping its current contents and uploading the new tail.
+        const std::size_t old_size = entry->GetSize();
+        if (entry->GetCapacity() < new_size) {
+            const auto& old_buffer = entry->GetBuffer();
+            auto new_buffer = CreateBuffer(new_size);
+
+            // Copy bits from the old buffer to the new buffer.
+            CopyBufferData(old_buffer, new_buffer, 0, 0, old_size);
+            MarkedForDestruction().push_back(
+                entry->ExchangeBuffer(std::move(new_buffer), new_size));
+            // The replaced buffer could have been used already; report it through Unmap().
+            invalidated = true;
+        }
+        // Upload the new bits.
+        const std::size_t size_diff = new_size - old_size;
+        UploadBufferData(entry->GetBuffer(), old_size, size_diff, entry->GetHostPtr() + old_size);
+
+        // Re-register with the new size. Call the base Unregister: this class' override would
+        // also reserve the entry and drop its internalized address while it stays in use.
+        RasterizerCache<Buffer>::Unregister(entry);
+        entry->SetSize(new_size);
+        RasterizerCache<Buffer>::Register(entry);
+    }
+
+    Buffer GetUncachedBuffer(VAddr cpu_addr, u8* host_ptr) {
+        if (auto entry = TryGetReservedBuffer(host_ptr)) {
+            return entry;
+        }
+        return std::make_shared<CachedBuffer<BufferStorageType>>(cpu_addr, host_ptr);
+    }
+
+    Buffer TryGetReservedBuffer(u8* host_ptr) {
+        const auto it = buffer_reserve.find(ToCacheAddr(host_ptr));
+        // A vector emptied by pop_back stays in the map, so a hit may still hold no entry.
+        if (it == buffer_reserve.end() || it->second.empty()) {
+            return {};
+        }
+        auto entry = std::move(it->second.back());
+        it->second.pop_back();
+        return entry;
+    }
+
+    void ReserveBuffer(Buffer entry) {
+        buffer_reserve[entry->GetCacheAddr()].push_back(std::move(entry));
+    }
+
+    void AlignBuffer(std::size_t alignment) {
+        // Align the offset, not the mapped pointer
+        const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
+        buffer_ptr += offset_aligned - buffer_offset;
+        buffer_offset = offset_aligned;
+    }
+
+    std::vector<BufferStorageType>& MarkedForDestruction() {
+        return marked_for_destruction_ring_buffer[marked_for_destruction_index];
+    }
+
+    Core::System& system;
+
+    std::unique_ptr<StreamBuffer> stream_buffer;
+    BufferType stream_buffer_handle{};
+
+    bool invalidated = false;
+
+    u8* buffer_ptr = nullptr;
+    u64 buffer_offset = 0;
+    u64 buffer_offset_base = 0;
+
+    std::size_t marked_for_destruction_index = 0;
+    std::array<std::vector<BufferStorageType>, 4> marked_for_destruction_ring_buffer;
+
+    std::unordered_set<CacheAddr> internalized_entries;
+    std::unordered_map<CacheAddr, std::vector<Buffer>> buffer_reserve;
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 13e314944..8d15c8a48 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -67,6 +67,7 @@ public:
         static constexpr std::size_t MaxShaderStage = 5;
         // Maximum number of const buffers per shader stage.
         static constexpr std::size_t MaxConstBuffers = 18;
+        static constexpr std::size_t MaxConstBufferSize = 0x10000;
 
         enum class QueryMode : u32 {
             Write = 0,
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index b92921d0f..79d469b88 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -1278,6 +1278,7 @@ union Instruction {
     union {
         BitField<49, 1, u64> nodep_flag;
         BitField<53, 4, u64> texture_info;
+        BitField<59, 1, u64> fp32_flag;
 
         TextureType GetTextureType() const {
             // The TLDS instruction has a weird encoding for the texture type.
@@ -1368,6 +1369,20 @@ union Instruction {
     } bra;
 
     union {
+        BitField<20, 24, u64> target;
+        BitField<5, 1, u64> constant_buffer;
+
+        s32 GetBranchExtend() const {
+            // Sign extend the 24-bit branch target offset.
+            constexpr u32 mask = 1U << (24 - 1);
+            const s32 offset = static_cast<s32>((static_cast<u32>(target) ^ mask) - mask);
+            // The offset is stored in bytes relative to the next instruction: divide it by the
+            // instruction size (as signed, so a negative offset is not wrapped) and add 1.
+            return offset / static_cast<s32>(sizeof(Instruction)) + 1;
+        }
+    } brx;
+
+    union {
         BitField<39, 1, u64> emit; // EmitVertex
         BitField<40, 1, u64> cut;  // EndPrimitive
     } out;
@@ -1464,6 +1479,7 @@ public:
         BFE_IMM,
         BFI_IMM_R,
         BRA,
+        BRX,
         PBK,
         LD_A,
         LD_L,
@@ -1738,6 +1754,7 @@ private:
             INST("111000101001----", Id::SSY, Type::Flow, "SSY"),
             INST("111000101010----", Id::PBK, Type::Flow, "PBK"),
             INST("111000100100----", Id::BRA, Type::Flow, "BRA"),
+            INST("111000100101----", Id::BRX, Type::Flow, "BRX"),
             INST("1111000011111---", Id::SYNC, Type::Flow, "SYNC"),
             INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
             INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
@@ -1760,7 +1777,7 @@ private:
             INST("1101111101010---", Id::TXQ_B, Type::Texture, "TXQ_B"),
             INST("1101-00---------", Id::TEXS, Type::Texture, "TEXS"),
             INST("11011100--11----", Id::TLD, Type::Texture, "TLD"),
-            INST("1101101---------", Id::TLDS, Type::Texture, "TLDS"),
+            INST("1101-01---------", Id::TLDS, Type::Texture, "TLDS"),
             INST("110010----111---", Id::TLD4, Type::Texture, "TLD4"),
             INST("1101111100------", Id::TLD4S, Type::Texture, "TLD4S"),
             INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 52706505b..1b4975498 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -31,7 +31,7 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
 
 GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
     auto& rasterizer{renderer.Rasterizer()};
-    memory_manager = std::make_unique<Tegra::MemoryManager>(rasterizer);
+    memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
     dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
     maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
     fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp
index c766ed692..9f59a2dc1 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro_interpreter.cpp
@@ -4,14 +4,18 @@
 
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/macro_interpreter.h"
 
+MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
+
 namespace Tegra {
 
 MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
 
 void MacroInterpreter::Execute(u32 offset, std::vector<u32> parameters) {
+    MICROPROFILE_SCOPE(MacroInterp);
     Reset();
     registers[1] = parameters[0];
     this->parameters = std::move(parameters);
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 322453116..bffae940c 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -5,13 +5,17 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "core/core.h"
+#include "core/hle/kernel/process.h"
+#include "core/hle/kernel/vm_manager.h"
 #include "core/memory.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 
 namespace Tegra {
 
-MemoryManager::MemoryManager(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {
+MemoryManager::MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer)
+    : rasterizer{rasterizer}, system{system} {
     std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr);
     std::fill(page_table.attributes.begin(), page_table.attributes.end(),
               Common::PageType::Unmapped);
@@ -49,6 +53,11 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) {
     const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)};
 
     MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr);
+    ASSERT(system.CurrentProcess()
+               ->VMManager()
+               .SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped,
+                                   Kernel::MemoryAttribute::DeviceMapped)
+               .IsSuccess());
 
     return gpu_addr;
 }
@@ -59,7 +68,11 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size)
     const u64 aligned_size{Common::AlignUp(size, page_size)};
 
     MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr);
-
+    ASSERT(system.CurrentProcess()
+               ->VMManager()
+               .SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped,
+                                   Kernel::MemoryAttribute::DeviceMapped)
+               .IsSuccess());
     return gpu_addr;
 }
 
@@ -68,9 +81,16 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {
 
     const u64 aligned_size{Common::AlignUp(size, page_size)};
     const CacheAddr cache_addr{ToCacheAddr(GetPointer(gpu_addr))};
+    const auto cpu_addr = GpuToCpuAddress(gpu_addr);
+    ASSERT(cpu_addr);
 
     rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size);
     UnmapRange(gpu_addr, aligned_size);
+    ASSERT(system.CurrentProcess()
+               ->VMManager()
+               .SetMemoryAttribute(cpu_addr.value(), size, Kernel::MemoryAttribute::DeviceMapped,
+                                   Kernel::MemoryAttribute::None)
+               .IsSuccess());
 
     return gpu_addr;
 }
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 43a84bd52..aea010087 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -14,6 +14,10 @@ namespace VideoCore {
 class RasterizerInterface;
 }
 
+namespace Core {
+class System;
+}
+
 namespace Tegra {
 
 /**
@@ -47,7 +51,7 @@ struct VirtualMemoryArea {
 
 class MemoryManager final {
 public:
-    explicit MemoryManager(VideoCore::RasterizerInterface& rasterizer);
+    explicit MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer);
     ~MemoryManager();
 
     GPUVAddr AllocateSpace(u64 size, u64 align);
@@ -173,6 +177,8 @@ private:
     Common::PageTable page_table{page_bits};
     VMAMap vma_map;
     VideoCore::RasterizerInterface& rasterizer;
+
+    Core::System& system;
 };
 
 } // namespace Tegra
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 5ee4f8e8e..2b7367568 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -47,6 +47,9 @@ public:
     /// and invalidated
     virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
 
+    /// Notify rasterizer that a frame is about to finish
+    virtual void TickFrame() = 0;
+
     /// Attempt to use a faster method to perform a surface copy
     virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                                        const Tegra::Engines::Fermi2D::Regs::Surface& dst,
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 2b9bd142e..2a9b523f5 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -2,103 +2,57 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <cstring>
 #include <memory>
 
-#include "common/alignment.h"
-#include "core/core.h"
-#include "video_core/memory_manager.h"
+#include <glad/glad.h>
+
+#include "common/assert.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
 
 namespace OpenGL {
 
-CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
-                                     std::size_t alignment, u8* host_ptr)
-    : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset},
-      alignment{alignment} {}
-
-OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size)
-    : RasterizerCache{rasterizer}, stream_buffer(size, true) {}
-
-GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment,
-                                      bool cache) {
-    std::lock_guard lock{mutex};
-    auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
-
-    // Cache management is a big overhead, so only cache entries with a given size.
-    // TODO: Figure out which size is the best for given games.
-    cache &= size >= 2048;
-
-    const auto& host_ptr{memory_manager.GetPointer(gpu_addr)};
-    if (cache) {
-        auto entry = TryGet(host_ptr);
-        if (entry) {
-            if (entry->GetSize() >= size && entry->GetAlignment() == alignment) {
-                return entry->GetOffset();
-            }
-            Unregister(entry);
-        }
-    }
+OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
+                               std::size_t stream_size)
+    : VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer>{
+          rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}
 
-    AlignBuffer(alignment);
-    const GLintptr uploaded_offset = buffer_offset;
+OGLBufferCache::~OGLBufferCache() = default;
 
-    if (!host_ptr) {
-        return uploaded_offset;
-    }
-
-    std::memcpy(buffer_ptr, host_ptr, size);
-    buffer_ptr += size;
-    buffer_offset += size;
-
-    if (cache) {
-        auto entry = std::make_shared<CachedBufferEntry>(
-            *memory_manager.GpuToCpuAddress(gpu_addr), size, uploaded_offset, alignment, host_ptr);
-        Register(entry);
-    }
-
-    return uploaded_offset;
+OGLBuffer OGLBufferCache::CreateBuffer(std::size_t size) {
+    OGLBuffer buffer;
+    buffer.Create();
+    glNamedBufferData(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+    return buffer;
 }
 
-GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t size,
-                                          std::size_t alignment) {
-    std::lock_guard lock{mutex};
-    AlignBuffer(alignment);
-    std::memcpy(buffer_ptr, raw_pointer, size);
-    const GLintptr uploaded_offset = buffer_offset;
-
-    buffer_ptr += size;
-    buffer_offset += size;
-    return uploaded_offset;
+const GLuint* OGLBufferCache::ToHandle(const OGLBuffer& buffer) {
+    return &buffer.handle;
 }
 
-bool OGLBufferCache::Map(std::size_t max_size) {
-    bool invalidate;
-    std::tie(buffer_ptr, buffer_offset_base, invalidate) =
-        stream_buffer.Map(static_cast<GLsizeiptr>(max_size), 4);
-    buffer_offset = buffer_offset_base;
-
-    if (invalidate) {
-        InvalidateAll();
-    }
-    return invalidate;
+const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
+    static const GLuint null_buffer = 0;
+    return &null_buffer;
 }
 
-void OGLBufferCache::Unmap() {
-    stream_buffer.Unmap(buffer_offset - buffer_offset_base);
+void OGLBufferCache::UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
+                                      const u8* data) {
+    glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+                         static_cast<GLsizeiptr>(size), data);
 }
 
-GLuint OGLBufferCache::GetHandle() const {
-    return stream_buffer.GetHandle();
+void OGLBufferCache::DownloadBufferData(const OGLBuffer& buffer, std::size_t offset,
+                                        std::size_t size, u8* data) {
+    glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+                            static_cast<GLsizeiptr>(size), data);
 }
 
-void OGLBufferCache::AlignBuffer(std::size_t alignment) {
-    // Align the offset, not the mapped pointer
-    const GLintptr offset_aligned =
-        static_cast<GLintptr>(Common::AlignUp(static_cast<std::size_t>(buffer_offset), alignment));
-    buffer_ptr += offset_aligned - buffer_offset;
-    buffer_offset = offset_aligned;
+void OGLBufferCache::CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst,
+                                    std::size_t src_offset, std::size_t dst_offset,
+                                    std::size_t size) {
+    glCopyNamedBufferSubData(src.handle, dst.handle, static_cast<GLintptr>(src_offset),
+                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index f2347581b..8c8ac4038 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -4,80 +4,44 @@
 
 #pragma once
 
-#include <cstddef>
 #include <memory>
-#include <tuple>
 
 #include "common/common_types.h"
+#include "video_core/buffer_cache.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
+namespace Core {
+class System;
+}
+
 namespace OpenGL {
 
+class OGLStreamBuffer;
 class RasterizerOpenGL;
 
-class CachedBufferEntry final : public RasterizerCacheObject {
-public:
-    explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
-                               std::size_t alignment, u8* host_ptr);
-
-    VAddr GetCpuAddr() const override {
-        return cpu_addr;
-    }
-
-    std::size_t GetSizeInBytes() const override {
-        return size;
-    }
-
-    std::size_t GetSize() const {
-        return size;
-    }
-
-    GLintptr GetOffset() const {
-        return offset;
-    }
-
-    std::size_t GetAlignment() const {
-        return alignment;
-    }
-
-private:
-    VAddr cpu_addr{};
-    std::size_t size{};
-    GLintptr offset{};
-    std::size_t alignment{};
-};
-
-class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {
+class OGLBufferCache final : public VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer> {
 public:
-    explicit OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size);
-
-    /// Uploads data from a guest GPU address. Returns host's buffer offset where it's been
-    /// allocated.
-    GLintptr UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
-                          bool cache = true);
+    explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
+                            std::size_t stream_size);
+    ~OGLBufferCache();
 
-    /// Uploads from a host memory. Returns host's buffer offset where it's been allocated.
-    GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4);
-
-    bool Map(std::size_t max_size);
-    void Unmap();
-
-    GLuint GetHandle() const;
+    const GLuint* GetEmptyBuffer(std::size_t) override;
 
 protected:
-    void AlignBuffer(std::size_t alignment);
+    OGLBuffer CreateBuffer(std::size_t size) override;
+
+    const GLuint* ToHandle(const OGLBuffer& buffer) override;
 
-    // We do not have to flush this cache as things in it are never modified by us.
-    void FlushObjectInner(const std::shared_ptr<CachedBufferEntry>& object) override {}
+    void UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
+                          const u8* data) override;
 
-private:
-    OGLStreamBuffer stream_buffer;
+    void DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
+                            u8* data) override;
 
-    u8* buffer_ptr = nullptr;
-    GLintptr buffer_offset = 0;
-    GLintptr buffer_offset_base = 0;
+    void CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, std::size_t src_offset,
+                        std::size_t dst_offset, std::size_t size) override;
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 4b1f22f7a..85424a4c9 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -24,6 +24,7 @@ T GetInteger(GLenum pname) {
 
 Device::Device() {
     uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
+    shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
     max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
     max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
     has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 7a3a0675b..dc883722d 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -18,6 +18,10 @@ public:
         return uniform_buffer_alignment;
     }
 
+    std::size_t GetShaderStorageBufferAlignment() const {
+        return shader_storage_alignment;
+    }
+
     u32 GetMaxVertexAttributes() const {
         return max_vertex_attributes;
     }
@@ -43,6 +47,7 @@ private:
     static bool TestComponentIndexingBug();
 
     std::size_t uniform_buffer_alignment{};
+    std::size_t shader_storage_alignment{};
     u32 max_vertex_attributes{};
     u32 max_varyings{};
     bool has_vertex_viewport_layer{};
diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp
deleted file mode 100644
index d5e385151..000000000
--- a/src/video_core/renderer_opengl/gl_global_cache.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <glad/glad.h>
-
-#include "common/logging/log.h"
-#include "core/core.h"
-#include "video_core/memory_manager.h"
-#include "video_core/renderer_opengl/gl_global_cache.h"
-#include "video_core/renderer_opengl/gl_rasterizer.h"
-#include "video_core/renderer_opengl/gl_shader_decompiler.h"
-#include "video_core/renderer_opengl/utils.h"
-
-namespace OpenGL {
-
-CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size)
-    : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size},
-      max_size{max_size} {
-    buffer.Create();
-    LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory");
-}
-
-CachedGlobalRegion::~CachedGlobalRegion() = default;
-
-void CachedGlobalRegion::Reload(u32 size_) {
-    size = size_;
-    if (size > max_size) {
-        size = max_size;
-        LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_,
-                     max_size);
-    }
-    glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW);
-}
-
-void CachedGlobalRegion::Flush() {
-    LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:16}", size, cpu_addr);
-    glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr);
-}
-
-GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const {
-    const auto search{reserve.find(addr)};
-    if (search == reserve.end()) {
-        return {};
-    }
-    return search->second;
-}
-
-GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr,
-                                                              u32 size) {
-    GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)};
-    if (!region) {
-        // No reserved surface available, create a new one and reserve it
-        auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
-        const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)};
-        ASSERT(cpu_addr);
-
-        region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size);
-        ReserveGlobalRegion(region);
-    }
-    region->Reload(size);
-    return region;
-}
-
-void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) {
-    reserve.insert_or_assign(region->GetCacheAddr(), std::move(region));
-}
-
-GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer)
-    : RasterizerCache{rasterizer} {
-    GLint max_ssbo_size_;
-    glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_);
-    max_ssbo_size = static_cast<u32>(max_ssbo_size_);
-}
-
-GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
-    const GLShader::GlobalMemoryEntry& global_region,
-    Tegra::Engines::Maxwell3D::Regs::ShaderStage stage) {
-    std::lock_guard lock{mutex};
-
-    auto& gpu{Core::System::GetInstance().GPU()};
-    auto& memory_manager{gpu.MemoryManager()};
-    const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
-    const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address +
-                    global_region.GetCbufOffset()};
-    const auto actual_addr{memory_manager.Read<u64>(addr)};
-    const auto size{memory_manager.Read<u32>(addr + 8)};
-
-    // Look up global region in the cache based on address
-    const auto& host_ptr{memory_manager.GetPointer(actual_addr)};
-    GlobalRegion region{TryGet(host_ptr)};
-
-    if (!region) {
-        // No global region found - create a new one
-        region = GetUncachedGlobalRegion(actual_addr, host_ptr, size);
-        Register(region);
-    }
-
-    return region;
-}
-
-} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h
deleted file mode 100644
index 2d467a240..000000000
--- a/src/video_core/renderer_opengl/gl_global_cache.h
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <memory>
-#include <unordered_map>
-
-#include <glad/glad.h>
-
-#include "common/assert.h"
-#include "common/common_types.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_cache.h"
-#include "video_core/renderer_opengl/gl_resource_manager.h"
-
-namespace OpenGL {
-
-namespace GLShader {
-class GlobalMemoryEntry;
-}
-
-class RasterizerOpenGL;
-class CachedGlobalRegion;
-using GlobalRegion = std::shared_ptr<CachedGlobalRegion>;
-
-class CachedGlobalRegion final : public RasterizerCacheObject {
-public:
-    explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size);
-    ~CachedGlobalRegion();
-
-    VAddr GetCpuAddr() const override {
-        return cpu_addr;
-    }
-
-    std::size_t GetSizeInBytes() const override {
-        return size;
-    }
-
-    /// Gets the GL program handle for the buffer
-    GLuint GetBufferHandle() const {
-        return buffer.handle;
-    }
-
-    /// Reloads the global region from guest memory
-    void Reload(u32 size_);
-
-    void Flush();
-
-private:
-    VAddr cpu_addr{};
-    u8* host_ptr{};
-    u32 size{};
-    u32 max_size{};
-
-    OGLBuffer buffer;
-};
-
-class GlobalRegionCacheOpenGL final : public RasterizerCache<GlobalRegion> {
-public:
-    explicit GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer);
-
-    /// Gets the current specified shader stage program
-    GlobalRegion GetGlobalRegion(const GLShader::GlobalMemoryEntry& descriptor,
-                                 Tegra::Engines::Maxwell3D::Regs::ShaderStage stage);
-
-protected:
-    void FlushObjectInner(const GlobalRegion& object) override {
-        object->Flush();
-    }
-
-private:
-    GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const;
-    GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size);
-    void ReserveGlobalRegion(GlobalRegion region);
-
-    std::unordered_map<CacheAddr, GlobalRegion> reserve;
-    u32 max_ssbo_size{};
-};
-
-} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index f45a3c5ef..0bb5c068c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -20,6 +20,7 @@
 #include "core/hle/kernel/process.h"
 #include "core/settings.h"
 #include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"
@@ -80,11 +81,25 @@ struct DrawParameters {
     }
 };
 
+static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
+                                      const GLShader::ConstBufferEntry& entry) {
+    if (!entry.IsIndirect()) {
+        return entry.GetSize();
+    }
+
+    if (buffer.size > Maxwell::MaxConstBufferSize) {
+        LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size,
+                    Maxwell::MaxConstBufferSize);
+        return Maxwell::MaxConstBufferSize;
+    }
+
+    return buffer.size;
+}
+
 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
                                    ScreenInfo& info)
     : texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device},
-      global_cache{*this}, system{system}, screen_info{info},
-      buffer_cache(*this, STREAM_BUFFER_SIZE) {
+      system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} {
     OpenGLState::ApplyDefaultState();
 
     shader_program_manager = std::make_unique<GLShader::ProgramManager>();
@@ -129,8 +144,6 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
         state.draw.vertex_array = vao;
         state.ApplyVertexArrayState();
 
-        glVertexArrayElementBuffer(vao, buffer_cache.GetHandle());
-
         // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL.
         // Enables the first 16 vertex attributes always, as we don't know which ones are actually
         // used until shader time. Note, Tegra technically supports 32, but we're capping this to 16
@@ -197,11 +210,11 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
 
         ASSERT(end > start);
         const u64 size = end - start + 1;
-        const GLintptr vertex_buffer_offset = buffer_cache.UploadMemory(start, size);
+        const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
 
         // Bind the vertex array to the buffer at the current offset.
-        glVertexArrayVertexBuffer(vao, index, buffer_cache.GetHandle(), vertex_buffer_offset,
-                                  vertex_array.stride);
+        vertex_array_pushbuffer.SetVertexBuffer(index, vertex_buffer, vertex_buffer_offset,
+                                                vertex_array.stride);
 
         if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) {
             // Enable vertex buffer instancing with the specified divisor.
@@ -215,7 +228,19 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
     gpu.dirty_flags.vertex_array.reset();
 }
 
-DrawParameters RasterizerOpenGL::SetupDraw() {
+GLintptr RasterizerOpenGL::SetupIndexBuffer() {
+    if (accelerate_draw != AccelDraw::Indexed) {
+        return 0;
+    }
+    MICROPROFILE_SCOPE(OpenGL_Index);
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    const std::size_t size = CalculateIndexBufferSize();
+    const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
+    vertex_array_pushbuffer.SetIndexBuffer(buffer);
+    return offset;
+}
+
+DrawParameters RasterizerOpenGL::SetupDraw(GLintptr index_buffer_offset) {
     const auto& gpu = system.GPU().Maxwell3D();
     const auto& regs = gpu.regs;
     const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
@@ -227,11 +252,9 @@ DrawParameters RasterizerOpenGL::SetupDraw() {
     params.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology);
 
     if (is_indexed) {
-        MICROPROFILE_SCOPE(OpenGL_Index);
         params.index_format = MaxwellToGL::IndexFormat(regs.index_array.format);
         params.count = regs.index_array.count;
-        params.index_buffer_offset =
-            buffer_cache.UploadMemory(regs.index_array.IndexStart(), CalculateIndexBufferSize());
+        params.index_buffer_offset = index_buffer_offset;
         params.base_vertex = static_cast<GLint>(regs.vb_element_base);
     } else {
         params.count = regs.vertex_buffer.count;
@@ -247,10 +270,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
     BaseBindings base_bindings;
     std::array<bool, Maxwell::NumClipDistances> clip_distances{};
 
-    // Prepare packed bindings
-    bind_ubo_pushbuffer.Setup(base_bindings.cbuf);
-    bind_ssbo_pushbuffer.Setup(base_bindings.gmem);
-
     for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
         const auto& shader_config = gpu.regs.shader_config[index];
         const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)};
@@ -271,12 +290,11 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
 
         GLShader::MaxwellUniformData ubo{};
         ubo.SetFromRegs(gpu, stage);
-        const GLintptr offset =
+        const auto [buffer, offset] =
             buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
 
         // Bind the emulation info buffer
-        bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset,
-                                 static_cast<GLsizeiptr>(sizeof(ubo)));
+        bind_ubo_pushbuffer.Push(buffer, offset, static_cast<GLsizeiptr>(sizeof(ubo)));
 
         Shader shader{shader_cache.GetStageProgram(program)};
 
@@ -321,9 +339,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
         base_bindings = next_bindings;
     }
 
-    bind_ubo_pushbuffer.Bind();
-    bind_ssbo_pushbuffer.Bind();
-
     SyncClipEnabled(clip_distances);
 
     gpu.dirty_flags.shaders = false;
@@ -634,26 +649,46 @@ void RasterizerOpenGL::DrawArrays() {
                       Maxwell::MaxShaderStage;
 
     // Add space for at least 18 constant buffers
-    buffer_size +=
-        Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment());
+    buffer_size += Maxwell::MaxConstBuffers *
+                   (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
 
-    const bool invalidate = buffer_cache.Map(buffer_size);
-    if (invalidate) {
-        // As all cached buffers are invalidated, we need to recheck their state.
-        gpu.dirty_flags.vertex_array.set();
-    }
+    // Map the buffer cache before any vertex, index or uniform data is uploaded.
+    buffer_cache.Map(buffer_size);
 
+    // Prepare vertex array format.
     const GLuint vao = SetupVertexFormat();
+    vertex_array_pushbuffer.Setup(vao);
+
+    // Upload vertex and index data.
     SetupVertexBuffer(vao);
+    const GLintptr index_buffer_offset = SetupIndexBuffer();
 
-    DrawParameters params = SetupDraw();
+    // Setup draw parameters. It will automatically choose what glDraw* method to use.
+    const DrawParameters params = SetupDraw(index_buffer_offset);
+
+    // Prepare packed bindings.
+    bind_ubo_pushbuffer.Setup(0);
+    bind_ssbo_pushbuffer.Setup(0);
+
+    // Setup shaders and their used resources.
     texture_cache.GuardSamplers(true);
     SetupShaders(params.primitive_mode);
     texture_cache.GuardSamplers(false);
 
     ConfigureFramebuffers(state);
 
-    buffer_cache.Unmap();
+    // Signal the buffer cache that we are not going to upload more things.
+    const bool invalidate = buffer_cache.Unmap();
+
+    // Now that we are no longer uploading data, we can safely bind the buffers to OpenGL.
+    vertex_array_pushbuffer.Bind();
+    bind_ubo_pushbuffer.Bind();
+    bind_ssbo_pushbuffer.Bind();
+
+    if (invalidate) {
+        // As all cached buffers are invalidated, we need to recheck their state.
+        gpu.dirty_flags.vertex_array.set();
+    }
 
     shader_program_manager->ApplyTo(state);
     state.Apply();
@@ -675,7 +710,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
         return;
     }
     texture_cache.FlushRegion(addr, size);
-    global_cache.FlushRegion(addr, size);
+    buffer_cache.FlushRegion(addr, size);
 }
 
 void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
@@ -685,7 +720,6 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
     }
     texture_cache.InvalidateRegion(addr, size);
     shader_cache.InvalidateRegion(addr, size);
-    global_cache.InvalidateRegion(addr, size);
     buffer_cache.InvalidateRegion(addr, size);
 }
 
@@ -696,6 +730,10 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
     InvalidateRegion(addr, size);
 }
 
+void RasterizerOpenGL::TickFrame() {
+    buffer_cache.TickFrame();
+}
+
 bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                                              const Tegra::Engines::Fermi2D::Regs::Surface& dst,
                                              const Tegra::Engines::Fermi2D::Config& copy_config) {
@@ -739,11 +777,9 @@ void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::Sh
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto stage_index = static_cast<std::size_t>(stage);
     const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index];
-    const auto& entries = shader->GetShaderEntries().const_buffers;
 
     // Upload only the enabled buffers from the 16 constbuffers of each shader stage
-    for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
-        const auto& entry = entries[bindpoint];
+    for (const auto& entry : shader->GetShaderEntries().const_buffers) {
         SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry);
     }
 }
@@ -752,46 +788,34 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b
                                         const GLShader::ConstBufferEntry& entry) {
     if (!buffer.enabled) {
         // Set values to zero to unbind buffers
-        bind_ubo_pushbuffer.Push(0, 0, 0);
+        bind_ubo_pushbuffer.Push(buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
         return;
     }
 
-    std::size_t size;
-    if (entry.IsIndirect()) {
-        // Buffer is accessed indirectly, so upload the entire thing
-        size = buffer.size;
-
-        if (size > MaxConstbufferSize) {
-            LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size,
-                        MaxConstbufferSize);
-            size = MaxConstbufferSize;
-        }
-    } else {
-        // Buffer is accessed directly, upload just what we use
-        size = entry.GetSize();
-    }
-
     // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
     // UBO alignment requirements.
-    size = Common::AlignUp(size, sizeof(GLvec4));
-    ASSERT_MSG(size <= MaxConstbufferSize, "Constant buffer is too big");
+    const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
 
-    const std::size_t alignment = device.GetUniformBufferAlignment();
-    const GLintptr offset = buffer_cache.UploadMemory(buffer.address, size, alignment);
-    bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, size);
+    const auto alignment = device.GetUniformBufferAlignment();
+    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment);
+    bind_ubo_pushbuffer.Push(cbuf, offset, size);
 }
 
 void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
                                           const Shader& shader) {
-    const auto& entries = shader->GetShaderEntries().global_memory_entries;
-    for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
-        const auto& entry{entries[bindpoint]};
-        const auto& region{global_cache.GetGlobalRegion(entry, stage)};
-        if (entry.IsWritten()) {
-            region->MarkAsModified(true, global_cache);
-        }
-        bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0,
-                                  static_cast<GLsizeiptr>(region->GetSizeInBytes()));
+    auto& gpu{system.GPU()};
+    auto& memory_manager{gpu.MemoryManager()};
+    const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
+    const auto alignment{device.GetShaderStorageBufferAlignment()};
+
+    for (const auto& entry : shader->GetShaderEntries().global_memory_entries) {
+        const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()};
+        const auto actual_addr{memory_manager.Read<u64>(addr)};
+        const auto size{memory_manager.Read<u32>(addr + 8)};
+
+        const auto [ssbo, buffer_offset] =
+            buffer_cache.UploadMemory(actual_addr, size, alignment, true, entry.IsWritten());
+        bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
     }
 }
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index bf67e3a70..40b571d58 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -24,7 +24,6 @@
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_framebuffer_cache.h"
-#include "video_core/renderer_opengl/gl_global_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_sampler_cache.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
@@ -63,6 +62,7 @@ public:
     void FlushRegion(CacheAddr addr, u64 size) override;
     void InvalidateRegion(CacheAddr addr, u64 size) override;
     void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
+    void TickFrame() override;
     bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                                const Tegra::Engines::Fermi2D::Regs::Surface& dst,
                                const Tegra::Engines::Fermi2D::Config& copy_config) override;
@@ -73,11 +73,6 @@ public:
     void LoadDiskResources(const std::atomic_bool& stop_loading,
                            const VideoCore::DiskResourceLoadCallback& callback) override;
 
-    /// Maximum supported size that a constbuffer can have in bytes.
-    static constexpr std::size_t MaxConstbufferSize = 0x10000;
-    static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0,
-                  "The maximum size of a constbuffer must be a multiple of the size of GLvec4");
-
 private:
     struct FramebufferConfigState {
         bool using_color_fb{};
@@ -98,17 +93,19 @@ private:
 
     /**
      * Configures the color and depth framebuffer states.
-     * @param must_reconfigure If true, tells the framebuffer to skip the cache and reconfigure
-     * again. Used by the texture cache to solve texception conflicts
-     * @param use_color_fb If true, configure color framebuffers.
-     * @param using_depth_fb If true, configure the depth/stencil framebuffer.
-     * @param preserve_contents If true, tries to preserve data from a previously used framebuffer.
+     *
+     * @param current_state       The current OpenGL state.
+     * @param using_color_fb      If true, configure color framebuffers.
+     * @param using_depth_fb      If true, configure the depth/stencil framebuffer.
+     * @param preserve_contents   If true, tries to preserve data from a previously used
+     *                            framebuffer.
      * @param single_color_target Specifies if a single color buffer target should be used.
+     *
      * @returns If depth (first) or stencil (second) are being stored in the bound zeta texture
-     * (requires using_depth_fb to be true)
+     *          (requires using_depth_fb to be true)
      */
     std::pair<bool, bool> ConfigureFramebuffers(
-        OpenGLState& current_state, bool use_color_fb = true, bool using_depth_fb = true,
+        OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true,
         bool preserve_contents = true, std::optional<std::size_t> single_color_target = {});
 
     /// Configures the current constbuffers to use for the draw command.
@@ -189,7 +186,6 @@ private:
 
     TextureCacheOpenGL texture_cache;
     ShaderCacheOpenGL shader_cache;
-    GlobalRegionCacheOpenGL global_cache;
     SamplerCacheOpenGL sampler_cache;
     FramebufferCacheOpenGL framebuffer_cache;
 
@@ -208,6 +204,7 @@ private:
     static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
     OGLBufferCache buffer_cache;
 
+    VertexArrayPushBuffer vertex_array_pushbuffer;
     BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
     BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
 
@@ -220,7 +217,9 @@ private:
 
     void SetupVertexBuffer(GLuint vao);
 
-    DrawParameters SetupDraw();
+    GLintptr SetupIndexBuffer();
+
+    DrawParameters SetupDraw(GLintptr index_buffer_offset);
 
     void SetupShaders(GLenum primitive_mode);
 
diff --git a/src/video_core/renderer_opengl/gl_sampler_cache.h b/src/video_core/renderer_opengl/gl_sampler_cache.h
index defbc2d81..34ee37f00 100644
--- a/src/video_core/renderer_opengl/gl_sampler_cache.h
+++ b/src/video_core/renderer_opengl/gl_sampler_cache.h
@@ -17,9 +17,9 @@ public:
     ~SamplerCacheOpenGL();
 
 protected:
-    OGLSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const;
+    OGLSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const override;
 
-    GLuint ToSamplerType(const OGLSampler& sampler) const;
+    GLuint ToSamplerType(const OGLSampler& sampler) const override;
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 7fea92713..32dd9eae7 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -129,9 +129,11 @@ std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
 
 /// Hashes one (or two) program streams
 u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& code,
-                        const ProgramCode& code_b) {
-    u64 unique_identifier =
-        Common::CityHash64(reinterpret_cast<const char*>(code.data()), CalculateProgramSize(code));
+                        const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) {
+    if (size_a == 0) {
+        size_a = CalculateProgramSize(code);
+    }
+    u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a);
     if (program_type != Maxwell::ShaderProgram::VertexA) {
         return unique_identifier;
     }
@@ -140,8 +142,11 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
     std::size_t seed = 0;
     boost::hash_combine(seed, unique_identifier);
 
-    const u64 identifier_b = Common::CityHash64(reinterpret_cast<const char*>(code_b.data()),
-                                                CalculateProgramSize(code_b));
+    if (size_b == 0) {
+        size_b = CalculateProgramSize(code_b);
+    }
+    const u64 identifier_b =
+        Common::CityHash64(reinterpret_cast<const char*>(code_b.data()), size_b);
     boost::hash_combine(seed, identifier_b);
     return static_cast<u64>(seed);
 }
@@ -150,14 +155,17 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
 GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type,
                                       ProgramCode program_code, ProgramCode program_code_b) {
     GLShader::ShaderSetup setup(program_code);
+    setup.program.size_a = CalculateProgramSize(program_code);
+    setup.program.size_b = 0;
     if (program_type == Maxwell::ShaderProgram::VertexA) {
         // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders.
         // Conventional HW does not support this, so we combine VertexA and VertexB into one
         // stage here.
         setup.SetProgramB(program_code_b);
+        setup.program.size_b = CalculateProgramSize(program_code_b);
     }
-    setup.program.unique_identifier =
-        GetUniqueIdentifier(program_type, program_code, program_code_b);
+    setup.program.unique_identifier = GetUniqueIdentifier(
+        program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b);
 
     switch (program_type) {
     case Maxwell::ShaderProgram::VertexA:
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 3408e6c41..119073776 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -47,7 +47,7 @@ using TextureArgument = std::pair<Type, Node>;
 using TextureIR = std::variant<TextureAoffi, TextureArgument>;
 
 constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
-    static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
+    static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
 
 class ShaderWriter {
 public:
@@ -192,10 +192,12 @@ public:
 
         // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
         // unlikely that shaders will use 20 nested SSYs and PBKs.
-        constexpr u32 FLOW_STACK_SIZE = 20;
-        for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
-            code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
-            code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
+        if (!ir.IsFlowStackDisabled()) {
+            constexpr u32 FLOW_STACK_SIZE = 20;
+            for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
+                code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
+                code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
+            }
         }
 
         code.AddLine("while (true) {{");
@@ -1602,6 +1604,14 @@ private:
         return {};
     }
 
+    std::string BranchIndirect(Operation operation) {
+        const std::string op_a = VisitOperand(operation, 0, Type::Uint);
+
+        code.AddLine("jmp_to = {};", op_a);
+        code.AddLine("break;");
+        return {};
+    }
+
     std::string PushFlowStack(Operation operation) {
         const auto stack = std::get<MetaStackClass>(operation.GetMeta());
         const auto target = std::get_if<ImmediateNode>(&*operation[0]);
@@ -1836,6 +1846,7 @@ private:
         &GLSLDecompiler::ImageStore,
 
         &GLSLDecompiler::Branch,
+        &GLSLDecompiler::BranchIndirect,
         &GLSLDecompiler::PushFlowStack,
         &GLSLDecompiler::PopFlowStack,
         &GLSLDecompiler::Exit,
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 9148629ec..f9ee8429e 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -29,14 +29,14 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
 };
 
 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
     ProgramResult program =
         Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");
 
     out += program.first;
 
     if (setup.IsDualProgram()) {
-        const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET);
+        const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b);
         ProgramResult program_b =
             Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");
 
@@ -80,7 +80,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
 };
 
 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
     ProgramResult program =
         Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
     out += program.first;
@@ -115,7 +115,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {
 };
 
 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
     ProgramResult program =
         Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");
 
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index 0536c8a03..7cbc590f8 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -27,6 +27,8 @@ struct ShaderSetup {
         ProgramCode code;
         ProgramCode code_b; // Used for dual vertex shaders
         u64 unique_identifier;
+        std::size_t size_a;
+        std::size_t size_b;
     } program;
 
     /// Used in scenarios where we have a dual vertex shaders
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index d86e137ac..0eae98afe 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -6,8 +6,11 @@
 #include <glad/glad.h>
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
 #include "video_core/renderer_opengl/gl_state.h"
 
+MICROPROFILE_DEFINE(OpenGL_State, "OpenGL", "State Change", MP_RGB(192, 128, 128));
+
 namespace OpenGL {
 
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
@@ -524,6 +527,7 @@ void OpenGLState::ApplySamplers() const {
 }
 
 void OpenGLState::Apply() const {
+    MICROPROFILE_SCOPE(OpenGL_State);
     ApplyFramebufferState();
     ApplyVertexArrayState();
     ApplyShaderProgram();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 08ae1a429..b1f6bc7c2 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -31,6 +31,8 @@ using VideoCore::Surface::SurfaceType;
 
 MICROPROFILE_DEFINE(OpenGL_Texture_Upload, "OpenGL", "Texture Upload", MP_RGB(128, 192, 128));
 MICROPROFILE_DEFINE(OpenGL_Texture_Download, "OpenGL", "Texture Download", MP_RGB(128, 192, 128));
+MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy",
+                    MP_RGB(128, 192, 128));
 
 namespace {
 
@@ -535,6 +537,7 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
 }
 
 void TextureCacheOpenGL::BufferCopy(Surface& src_surface, Surface& dst_surface) {
+    MICROPROFILE_SCOPE(OpenGL_Texture_Buffer_Copy);
     const auto& src_params = src_surface->GetSurfaceParams();
     const auto& dst_params = dst_surface->GetSurfaceParams();
     UNIMPLEMENTED_IF(src_params.num_levels > 1 || dst_params.num_levels > 1);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index b142521ec..9ecdddb0d 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -101,7 +101,6 @@ RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::Syst
 
 RendererOpenGL::~RendererOpenGL() = default;
 
-/// Swap buffers (render frame)
 void RendererOpenGL::SwapBuffers(
     std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
 
@@ -130,6 +129,8 @@ void RendererOpenGL::SwapBuffers(
 
         DrawScreen(render_window.GetFramebufferLayout());
 
+        rasterizer->TickFrame();
+
         render_window.SwapBuffers();
     }
 
@@ -262,7 +263,6 @@ void RendererOpenGL::CreateRasterizer() {
     if (rasterizer) {
         return;
     }
-    // Initialize sRGB Usage
     OpenGLState::ClearsRGBUsed();
     rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info);
 }
diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp
index 68c36988d..c504a2c1a 100644
--- a/src/video_core/renderer_opengl/utils.cpp
+++ b/src/video_core/renderer_opengl/utils.cpp
@@ -13,29 +13,67 @@
 
 namespace OpenGL {
 
+VertexArrayPushBuffer::VertexArrayPushBuffer() = default; // Members have in-class initializers
+
+VertexArrayPushBuffer::~VertexArrayPushBuffer() = default;
+
+void VertexArrayPushBuffer::Setup(GLuint vao_) { // Starts a new batch of bindings for vao_
+    vao = vao_; // VAO that Bind() will attach the queued buffers to
+    index_buffer = nullptr; // Forget the index buffer queued by a previous batch
+    vertex_buffers.clear(); // Forget the vertex buffers queued by a previous batch
+}
+
+void VertexArrayPushBuffer::SetIndexBuffer(const GLuint* buffer) {
+    index_buffer = buffer; // Only the pointer is stored; the handle is read later, in Bind()
+}
+
+void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* buffer,
+                                            GLintptr offset, GLsizei stride) {
+    vertex_buffers.push_back(Entry{binding_index, buffer, offset, stride}); // Queued until Bind()
+}
+
+void VertexArrayPushBuffer::Bind() { // Applies everything queued since the last Setup()
+    if (index_buffer) { // Skipped when SetIndexBuffer() was not called for this batch
+        glVertexArrayElementBuffer(vao, *index_buffer);
+    }
+
+    // TODO(Rodrigo): Find a way to ARB_multi_bind this
+    for (const auto& entry : vertex_buffers) { // Handles are read through the stored pointers here
+        glVertexArrayVertexBuffer(vao, entry.binding_index, *entry.buffer, entry.offset,
+                                  entry.stride);
+    }
+}
+
 BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {}
 
 BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default;
 
 void BindBuffersRangePushBuffer::Setup(GLuint first_) {
     first = first_;
-    buffers.clear();
+    buffer_pointers.clear();
     offsets.clear();
     sizes.clear();
 }
 
-void BindBuffersRangePushBuffer::Push(GLuint buffer, GLintptr offset, GLsizeiptr size) {
-    buffers.push_back(buffer);
+void BindBuffersRangePushBuffer::Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size) {
+    buffer_pointers.push_back(buffer);
     offsets.push_back(offset);
     sizes.push_back(size);
 }
 
-void BindBuffersRangePushBuffer::Bind() const {
-    const std::size_t count{buffers.size()};
+void BindBuffersRangePushBuffer::Bind() {
+    // Ensure sizes are valid.
+    const std::size_t count{buffer_pointers.size()};
     DEBUG_ASSERT(count == offsets.size() && count == sizes.size());
     if (count == 0) {
         return;
     }
+
+    // Dereference buffers.
+    buffers.resize(count);
+    std::transform(buffer_pointers.begin(), buffer_pointers.end(), buffers.begin(),
+                   [](const GLuint* pointer) { return *pointer; });
+
     glBindBuffersRange(target, first, static_cast<GLsizei>(count), buffers.data(), offsets.data(),
                        sizes.data());
 }
diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h
index 4a752f3b4..6c2b45546 100644
--- a/src/video_core/renderer_opengl/utils.h
+++ b/src/video_core/renderer_opengl/utils.h
@@ -11,20 +11,49 @@
 
 namespace OpenGL {
 
-class BindBuffersRangePushBuffer {
+class VertexArrayPushBuffer final {
 public:
-    BindBuffersRangePushBuffer(GLenum target);
+    explicit VertexArrayPushBuffer();
+    ~VertexArrayPushBuffer();
+
+    void Setup(GLuint vao_);
+
+    void SetIndexBuffer(const GLuint* buffer);
+
+    void SetVertexBuffer(GLuint binding_index, const GLuint* buffer, GLintptr offset,
+                         GLsizei stride);
+
+    void Bind();
+
+private:
+    struct Entry {
+        GLuint binding_index{};
+        const GLuint* buffer{};
+        GLintptr offset{};
+        GLsizei stride{};
+    };
+
+    GLuint vao{};
+    const GLuint* index_buffer{};
+    std::vector<Entry> vertex_buffers;
+};
+
+class BindBuffersRangePushBuffer final {
+public:
+    explicit BindBuffersRangePushBuffer(GLenum target);
     ~BindBuffersRangePushBuffer();
 
     void Setup(GLuint first_);
 
-    void Push(GLuint buffer, GLintptr offset, GLsizeiptr size);
+    void Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size);
 
-    void Bind() const;
+    void Bind();
 
 private:
-    GLenum target;
-    GLuint first;
+    GLenum target{};
+    GLuint first{};
+    std::vector<const GLuint*> buffer_pointers;
+
     std::vector<GLuint> buffers;
     std::vector<GLintptr> offsets;
     std::vector<GLsizeiptr> sizes;
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 02a9f5ecb..d2e9f4031 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -109,8 +109,8 @@ void VKBufferCache::Reserve(std::size_t max_size) {
     }
 }
 
-VKExecutionContext VKBufferCache::Send(VKExecutionContext exctx) {
-    return stream_buffer->Send(exctx, buffer_offset - buffer_offset_base);
+void VKBufferCache::Send() {
+    stream_buffer->Send(buffer_offset - buffer_offset_base);
 }
 
 void VKBufferCache::AlignBuffer(std::size_t alignment) {
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 3edf460df..49f13bcdc 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -77,7 +77,7 @@ public:
     void Reserve(std::size_t max_size);
 
     /// Ensures that the set data is sent to the device.
-    [[nodiscard]] VKExecutionContext Send(VKExecutionContext exctx);
+    void Send();
 
     /// Returns the buffer cache handle.
     vk::Buffer GetBuffer() const {
diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.h b/src/video_core/renderer_vulkan/vk_sampler_cache.h
index 771b05c73..1f73b716b 100644
--- a/src/video_core/renderer_vulkan/vk_sampler_cache.h
+++ b/src/video_core/renderer_vulkan/vk_sampler_cache.h
@@ -4,9 +4,6 @@
 
 #pragma once
 
-#include <unordered_map>
-
-#include "common/common_types.h"
 #include "video_core/renderer_vulkan/declarations.h"
 #include "video_core/sampler_cache.h"
 #include "video_core/textures/texture.h"
@@ -21,9 +18,9 @@ public:
     ~VKSamplerCache();
 
 protected:
-    UniqueSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const;
+    UniqueSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const override;
 
-    vk::Sampler ToSamplerType(const UniqueSampler& sampler) const;
+    vk::Sampler ToSamplerType(const UniqueSampler& sampler) const override;
 
 private:
     const VKDevice& device;
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index f1fea1871..0f8116458 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -19,23 +19,19 @@ VKScheduler::VKScheduler(const VKDevice& device, VKResourceManager& resource_man
 
 VKScheduler::~VKScheduler() = default;
 
-VKExecutionContext VKScheduler::GetExecutionContext() const {
-    return VKExecutionContext(current_fence, current_cmdbuf);
-}
-
-VKExecutionContext VKScheduler::Flush(vk::Semaphore semaphore) {
+void VKScheduler::Flush(bool release_fence, vk::Semaphore semaphore) {
     SubmitExecution(semaphore);
-    current_fence->Release();
+    if (release_fence)
+        current_fence->Release();
     AllocateNewContext();
-    return GetExecutionContext();
 }
 
-VKExecutionContext VKScheduler::Finish(vk::Semaphore semaphore) {
+void VKScheduler::Finish(bool release_fence, vk::Semaphore semaphore) {
     SubmitExecution(semaphore);
     current_fence->Wait();
-    current_fence->Release();
+    if (release_fence)
+        current_fence->Release();
     AllocateNewContext();
-    return GetExecutionContext();
 }
 
 void VKScheduler::SubmitExecution(vk::Semaphore semaphore) {
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index cfaf5376f..0e5b49c7f 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -10,10 +10,43 @@
 namespace Vulkan {
 
 class VKDevice;
-class VKExecutionContext;
 class VKFence;
 class VKResourceManager;
 
+class VKFenceView { // Non-owning view over a VKFence* variable that lives elsewhere
+public:
+    VKFenceView() = default; // NOTE(review): defined as deleted; the reference member has no initializer
+    VKFenceView(VKFence* const& fence) : fence{fence} {} // Binds to the caller's pointer variable
+
+    VKFence* operator->() const noexcept {
+        return fence; // Reads the viewed pointer at call time
+    }
+
+    operator VKFence&() const noexcept {
+        return *fence; // Assumes the viewed pointer is non-null
+    }
+
+private:
+    VKFence* const& fence; // Alias, not a copy; the viewed variable must outlive this view
+};
+
+class VKCommandBufferView { // Non-owning view over a vk::CommandBuffer that lives elsewhere
+public:
+    VKCommandBufferView() = default; // NOTE(review): defined as deleted; reference member below
+    VKCommandBufferView(const vk::CommandBuffer& cmdbuf) : cmdbuf{cmdbuf} {} // Binds, does not copy
+
+    const vk::CommandBuffer* operator->() const noexcept {
+        return &cmdbuf; // Address of the viewed object
+    }
+
+    operator vk::CommandBuffer() const noexcept {
+        return cmdbuf; // Returns a copy of the viewed handle as it is at call time
+    }
+
+private:
+    const vk::CommandBuffer& cmdbuf; // Alias; the viewed object must outlive this view
+};
+
 /// The scheduler abstracts command buffer and fence management with an interface that's able to do
 /// OpenGL-like operations on Vulkan command buffers.
 class VKScheduler {
@@ -21,16 +54,21 @@ public:
     explicit VKScheduler(const VKDevice& device, VKResourceManager& resource_manager);
     ~VKScheduler();
 
-    /// Gets the current execution context.
-    [[nodiscard]] VKExecutionContext GetExecutionContext() const;
+    /// Gets a view that aliases the scheduler's current fence pointer (not a snapshot of it).
+    VKFenceView GetFence() const {
+        return current_fence;
+    }
+
+    /// Gets a view that aliases the scheduler's current command buffer (not a snapshot of it).
+    VKCommandBufferView GetCommandBuffer() const {
+        return current_cmdbuf;
+    }
 
-    /// Sends the current execution context to the GPU. It invalidates the current execution context
-    /// and returns a new one.
-    VKExecutionContext Flush(vk::Semaphore semaphore = nullptr);
+    /// Sends the current execution context to the GPU.
+    void Flush(bool release_fence = true, vk::Semaphore semaphore = nullptr);
 
-    /// Sends the current execution context to the GPU and waits for it to complete. It invalidates
-    /// the current execution context and returns a new one.
-    VKExecutionContext Finish(vk::Semaphore semaphore = nullptr);
+    /// Sends the current execution context to the GPU and waits for it to complete.
+    void Finish(bool release_fence = true, vk::Semaphore semaphore = nullptr);
 
 private:
     void SubmitExecution(vk::Semaphore semaphore);
@@ -44,26 +82,4 @@ private:
     VKFence* next_fence = nullptr;
 };
 
-class VKExecutionContext {
-    friend class VKScheduler;
-
-public:
-    VKExecutionContext() = default;
-
-    VKFence& GetFence() const {
-        return *fence;
-    }
-
-    vk::CommandBuffer GetCommandBuffer() const {
-        return cmdbuf;
-    }
-
-private:
-    explicit VKExecutionContext(VKFence* fence, vk::CommandBuffer cmdbuf)
-        : fence{fence}, cmdbuf{cmdbuf} {}
-
-    VKFence* fence{};
-    vk::CommandBuffer cmdbuf;
-};
-
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 6071c6d99..9b2d8e987 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -947,6 +947,14 @@ private:
         return {};
     }
 
+    Id BranchIndirect(Operation operation) { // Branch whose target comes from operand 0
+        const Id op_a = VisitOperand<Type::Uint>(operation, 0); // Target, visited as unsigned
+
+        Emit(OpStore(jmp_to, op_a)); // Write the target into jmp_to
+        BranchingOp([&]() { Emit(OpBranch(continue_label)); }); // Then jump to continue_label
+        return {}; // No result id is produced
+    }
+
     Id PushFlowStack(Operation operation) {
         const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         ASSERT(target);
@@ -1332,6 +1340,7 @@ private:
         &SPIRVDecompiler::ImageStore,
 
         &SPIRVDecompiler::Branch,
+        &SPIRVDecompiler::BranchIndirect,
         &SPIRVDecompiler::PushFlowStack,
         &SPIRVDecompiler::PopFlowStack,
         &SPIRVDecompiler::Exit,
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp
index 58ffa42f2..62f1427f5 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp
@@ -46,12 +46,12 @@ std::tuple<u8*, u64, bool> VKStreamBuffer::Reserve(u64 size) {
     return {mapped_pointer + offset, offset, invalidation_mark.has_value()};
 }
 
-VKExecutionContext VKStreamBuffer::Send(VKExecutionContext exctx, u64 size) {
+void VKStreamBuffer::Send(u64 size) {
     ASSERT_MSG(size <= mapped_size, "Reserved size is too small");
 
     if (invalidation_mark) {
         // TODO(Rodrigo): Find a better way to invalidate than waiting for all watches to finish.
-        exctx = scheduler.Flush();
+        scheduler.Flush();
         std::for_each(watches.begin(), watches.begin() + *invalidation_mark,
                       [&](auto& resource) { resource->Wait(); });
         invalidation_mark = std::nullopt;
@@ -62,11 +62,9 @@ VKExecutionContext VKStreamBuffer::Send(VKExecutionContext exctx, u64 size) {
         ReserveWatches(WATCHES_RESERVE_CHUNK);
     }
     // Add a watch for this allocation.
-    watches[used_watches++]->Watch(exctx.GetFence());
+    watches[used_watches++]->Watch(scheduler.GetFence());
 
     offset += size;
-
-    return exctx;
 }
 
 void VKStreamBuffer::CreateBuffers(VKMemoryManager& memory_manager, vk::BufferUsageFlags usage) {
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h
index 69d036ccd..842e54162 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.h
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -37,7 +37,7 @@ public:
     std::tuple<u8*, u64, bool> Reserve(u64 size);
 
     /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
-    [[nodiscard]] VKExecutionContext Send(VKExecutionContext exctx, u64 size);
+    void Send(u64 size);
 
     vk::Buffer GetBuffer() const {
         return *buffer;
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
new file mode 100644
index 000000000..fdcc970ff
--- /dev/null
+++ b/src/video_core/shader/control_flow.cpp
@@ -0,0 +1,476 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <list>
+#include <map>
+#include <stack>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/shader/control_flow.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::Instruction;
+using Tegra::Shader::OpCode;
+
+constexpr s32 unassigned_branch = -2; // SYNC/BRK target placeholder, resolved later in TryQuery
+
+struct Query { // One pending block visit for TryQuery
+    u32 address{}; // Looked up in CFGRebuildState::registered to find the block
+    std::stack<u32> ssy_stack{}; // SSY targets gathered along the path that led here
+    std::stack<u32> pbk_stack{}; // PBK targets gathered along the path that led here
+};
+
+struct BlockStack { // Stacks recorded when a block is first visited (CFGRebuildState::stacks)
+    BlockStack() = default;
+    BlockStack(const BlockStack& b) = default;
+    BlockStack(const Query& q) : ssy_stack{q.ssy_stack}, pbk_stack{q.pbk_stack} {}
+    std::stack<u32> ssy_stack{}; // Copied from the query that first reached the block
+    std::stack<u32> pbk_stack{}; // Copied from the query that first reached the block
+};
+
+struct BlockBranchInfo { // How a block ends
+    Condition condition{}; // Predicate and condition code read from the ending instruction
+    s32 address{exit_branch}; // Target offset, or exit_branch / unassigned_branch
+    bool kill{}; // Set by ParseCode when the block ends in KIL
+    bool is_sync{}; // Set by ParseCode when the block ends in SYNC
+    bool is_brk{}; // Set by ParseCode when the block ends in BRK
+    bool ignore{}; // Set when the block only ends because another block starts right after it
+};
+
+struct BlockInfo { // A contiguous run of instructions plus the branch that ends it
+    u32 start{}; // Offset of the first instruction
+    u32 end{}; // Offset of the last instruction (inclusive, see IsInside)
+    bool visited{}; // Set by TryQuery the first time the block is reached
+    BlockBranchInfo branch{};
+
+    bool IsInside(const u32 address) const { // Both bounds are inclusive
+        return start <= address && address <= end;
+    }
+};
+
+struct CFGRebuildState { // Working state shared by the CFG reconstruction passes in this file
+    explicit CFGRebuildState(const ProgramCode& program_code, const std::size_t program_size,
+                             const u32 start)
+        : program_code{program_code}, program_size{program_size}, start{start} {}
+
+    u32 start{}; // Entry offset. NOTE(review): declared first but listed last in the init list
+    std::vector<BlockInfo> block_info{}; // Blocks in creation order (ScanFlow sorts them later)
+    std::list<u32> inspect_queries{}; // Addresses still to be handled by TryInspectAddress
+    std::list<Query> queries{}; // Pending visits for TryQuery
+    std::unordered_map<u32, u32> registered{}; // Block start address -> index into block_info
+    std::unordered_set<u32> labels{}; // Entry point plus every BRA/SSY/PBK target seen
+    std::map<u32, u32> ssy_labels{}; // Offset of an SSY instruction -> its target
+    std::map<u32, u32> pbk_labels{}; // Offset of a PBK instruction -> its target
+    std::unordered_map<u32, BlockStack> stacks{}; // Block start address -> stacks on first visit
+    const ProgramCode& program_code; // Not owned; must outlive this state
+    const std::size_t program_size; // In bytes; ParseCode divides it by sizeof(Instruction)
+};
+
+enum class BlockCollision : u32 { None, Found, Inside }; // Outcome of TryGetBlock
+
+std::pair<BlockCollision, u32> TryGetBlock(CFGRebuildState& state, u32 address) {
+    const auto& blocks = state.block_info;
+    for (u32 index = 0; index < blocks.size(); index++) { // Linear scan over every known block
+        if (blocks[index].start == address) { // A block begins exactly at this address
+            return {BlockCollision::Found, index};
+        }
+        if (blocks[index].IsInside(address)) { // Address lands after the start of this block
+            return {BlockCollision::Inside, index};
+        }
+    }
+    return {BlockCollision::None, -1}; // -1 wraps to the maximum u32; do not use it as an index
+}
+
+struct ParseInfo { // What ParseCode found for one block
+    BlockBranchInfo branch_info{}; // How the block ends
+    u32 end_address{}; // Offset of the last instruction that belongs to the block
+};
+
+BlockInfo& CreateBlockInfo(CFGRebuildState& state, u32 start, u32 end) {
+    auto& it = state.block_info.emplace_back(); // Reference to the new element, not an iterator
+    it.start = start;
+    it.end = end;
+    const u32 index = static_cast<u32>(state.block_info.size() - 1); // Index of the new element
+    state.registered.insert({start, index}); // insert() leaves an already existing mapping as is
+    return it; // Stays valid only until block_info reallocates
+}
+
+Pred GetPredicate(u32 index, bool negated) { // Builds a Pred from an index and a negate flag
+    return static_cast<Pred>(index + (negated ? 8 : 0)); // NOTE(review): +8 assumes Pred's layout
+}
+
+/**
+ * Returns whether 'offset' holds a 'sched' instruction, i.e. whether its distance from
+ * 'main_offset' is a multiple of 4 (one sched instruction precedes every 3 instructions).
+ */
+constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
+    constexpr u32 SchedPeriod = 4; // One sched slot followed by three regular instructions
+    u32 absolute_offset = offset - main_offset; // Unsigned; wraps if offset < main_offset
+
+    return (absolute_offset % SchedPeriod) == 0;
+}
+
+enum class ParseResult : u32 { // Outcome of ParseCode
+    ControlCaught, // Stopped at EXIT/BRA/SYNC/BRK/KIL, which ends the block
+    BlockEnd, // Ran into an already registered block or past the end of the program
+    AbnormalFlow, // BRX or a constant-buffer BRA; the caller aborts the reconstruction
+};
+
+std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address) {
+    u32 offset = static_cast<u32>(address); // Cursor; indexes program_code
+    const u32 end_address = static_cast<u32>(state.program_size / sizeof(Instruction));
+    ParseInfo parse_info{};
+
+    const auto insert_label = [](CFGRebuildState& state, u32 address) { // Shadows outer names
+        const auto pair = state.labels.emplace(address);
+        if (pair.second) { // Only a label seen for the first time is queued for inspection
+            state.inspect_queries.push_back(address);
+        }
+    };
+
+    while (true) {
+        if (offset >= end_address) { // Walked past the program without finding a terminator
+            // ASSERT_OR_EXECUTE can't be used, as it ignores the break
+            ASSERT_MSG(false, "Shader passed the current limit!");
+            parse_info.branch_info.address = exit_branch;
+            parse_info.branch_info.ignore = false;
+            break;
+        }
+        if (state.registered.count(offset) != 0) { // Another block already starts here
+            parse_info.branch_info.address = offset;
+            parse_info.branch_info.ignore = true;
+            break;
+        }
+        if (IsSchedInstruction(offset, state.start)) { // Sched slots are skipped, not decoded
+            offset++;
+            continue;
+        }
+        const Instruction instr = {state.program_code[offset]};
+        const auto opcode = OpCode::Decode(instr);
+        if (!opcode || opcode->get().GetType() != OpCode::Type::Flow) { // Not a flow instruction
+            offset++;
+            continue;
+        }
+
+        switch (opcode->get().GetId()) {
+        case OpCode::Id::EXIT: {
+            const auto pred_index = static_cast<u32>(instr.pred.pred_index);
+            parse_info.branch_info.condition.predicate =
+                GetPredicate(pred_index, instr.negate_pred != 0);
+            if (parse_info.branch_info.condition.predicate == Pred::NeverExecute) {
+                offset++; // Never executes: skipped, the block continues
+                continue;
+            }
+            const ConditionCode cc = instr.flow_condition_code;
+            parse_info.branch_info.condition.cc = cc;
+            if (cc == ConditionCode::F) {
+                offset++; // Condition code F: skipped, the block continues
+                continue;
+            }
+            parse_info.branch_info.address = exit_branch;
+            parse_info.branch_info.kill = false;
+            parse_info.branch_info.is_sync = false;
+            parse_info.branch_info.is_brk = false;
+            parse_info.branch_info.ignore = false;
+            parse_info.end_address = offset;
+
+            return {ParseResult::ControlCaught, parse_info};
+        }
+        case OpCode::Id::BRA: {
+            if (instr.bra.constant_buffer != 0) { // Constant-buffer branches are not handled here
+                return {ParseResult::AbnormalFlow, parse_info};
+            }
+            const auto pred_index = static_cast<u32>(instr.pred.pred_index);
+            parse_info.branch_info.condition.predicate =
+                GetPredicate(pred_index, instr.negate_pred != 0);
+            if (parse_info.branch_info.condition.predicate == Pred::NeverExecute) {
+                offset++;
+                continue;
+            }
+            const ConditionCode cc = instr.flow_condition_code;
+            parse_info.branch_info.condition.cc = cc;
+            if (cc == ConditionCode::F) {
+                offset++;
+                continue;
+            }
+            const u32 branch_offset = offset + instr.bra.GetBranchTarget();
+            if (branch_offset == 0) { // A resolved target of 0 is recorded as an exit
+                parse_info.branch_info.address = exit_branch;
+            } else {
+                parse_info.branch_info.address = branch_offset;
+            }
+            insert_label(state, branch_offset); // NOTE(review): also queued when the target is 0
+            parse_info.branch_info.kill = false;
+            parse_info.branch_info.is_sync = false;
+            parse_info.branch_info.is_brk = false;
+            parse_info.branch_info.ignore = false;
+            parse_info.end_address = offset;
+
+            return {ParseResult::ControlCaught, parse_info};
+        }
+        case OpCode::Id::SYNC: {
+            const auto pred_index = static_cast<u32>(instr.pred.pred_index);
+            parse_info.branch_info.condition.predicate =
+                GetPredicate(pred_index, instr.negate_pred != 0);
+            if (parse_info.branch_info.condition.predicate == Pred::NeverExecute) {
+                offset++;
+                continue;
+            }
+            const ConditionCode cc = instr.flow_condition_code;
+            parse_info.branch_info.condition.cc = cc;
+            if (cc == ConditionCode::F) {
+                offset++;
+                continue;
+            }
+            parse_info.branch_info.address = unassigned_branch; // Filled in by TryQuery
+            parse_info.branch_info.kill = false;
+            parse_info.branch_info.is_sync = true;
+            parse_info.branch_info.is_brk = false;
+            parse_info.branch_info.ignore = false;
+            parse_info.end_address = offset;
+
+            return {ParseResult::ControlCaught, parse_info};
+        }
+        case OpCode::Id::BRK: {
+            const auto pred_index = static_cast<u32>(instr.pred.pred_index);
+            parse_info.branch_info.condition.predicate =
+                GetPredicate(pred_index, instr.negate_pred != 0);
+            if (parse_info.branch_info.condition.predicate == Pred::NeverExecute) {
+                offset++;
+                continue;
+            }
+            const ConditionCode cc = instr.flow_condition_code;
+            parse_info.branch_info.condition.cc = cc;
+            if (cc == ConditionCode::F) {
+                offset++;
+                continue;
+            }
+            parse_info.branch_info.address = unassigned_branch; // Filled in by TryQuery
+            parse_info.branch_info.kill = false;
+            parse_info.branch_info.is_sync = false;
+            parse_info.branch_info.is_brk = true;
+            parse_info.branch_info.ignore = false;
+            parse_info.end_address = offset;
+
+            return {ParseResult::ControlCaught, parse_info};
+        }
+        case OpCode::Id::KIL: {
+            const auto pred_index = static_cast<u32>(instr.pred.pred_index);
+            parse_info.branch_info.condition.predicate =
+                GetPredicate(pred_index, instr.negate_pred != 0);
+            if (parse_info.branch_info.condition.predicate == Pred::NeverExecute) {
+                offset++;
+                continue;
+            }
+            const ConditionCode cc = instr.flow_condition_code;
+            parse_info.branch_info.condition.cc = cc;
+            if (cc == ConditionCode::F) {
+                offset++;
+                continue;
+            }
+            parse_info.branch_info.address = exit_branch; // Same target as EXIT, with kill set
+            parse_info.branch_info.kill = true;
+            parse_info.branch_info.is_sync = false;
+            parse_info.branch_info.is_brk = false;
+            parse_info.branch_info.ignore = false;
+            parse_info.end_address = offset;
+
+            return {ParseResult::ControlCaught, parse_info};
+        }
+        case OpCode::Id::SSY: { // Does not end the block; only records the target
+            const u32 target = offset + instr.bra.GetBranchTarget();
+            insert_label(state, target);
+            state.ssy_labels.emplace(offset, target);
+            break;
+        }
+        case OpCode::Id::PBK: { // Does not end the block; only records the target
+            const u32 target = offset + instr.bra.GetBranchTarget();
+            insert_label(state, target);
+            state.pbk_labels.emplace(offset, target);
+            break;
+        }
+        case OpCode::Id::BRX: { // Indirect branch: not reconstructed here
+            return {ParseResult::AbnormalFlow, parse_info};
+        }
+        default:
+            break;
+        }
+
+        offset++;
+    }
+    parse_info.branch_info.kill = false; // NOTE(review): condition may hold a skipped instr's values
+    parse_info.branch_info.is_sync = false;
+    parse_info.branch_info.is_brk = false;
+    parse_info.end_address = offset - 1; // offset is the first address outside the block
+    return {ParseResult::BlockEnd, parse_info};
+}
+
+bool TryInspectAddress(CFGRebuildState& state) { // Handles one entry of inspect_queries
+    if (state.inspect_queries.empty()) {
+        return false;
+    }
+
+    const u32 address = state.inspect_queries.front();
+    state.inspect_queries.pop_front();
+    const auto [result, block_index] = TryGetBlock(state, address);
+    switch (result) {
+    case BlockCollision::Found: { // A block already starts here; nothing to do
+        return true;
+    }
+    case BlockCollision::Inside: {
+        // This case is the tricky one:
+        // We need to split the block into 2 separate blocks
+        const u32 end = state.block_info[block_index].end;
+        BlockInfo& new_block = CreateBlockInfo(state, address, end); // Tail half: [address, end]
+        BlockInfo& current_block = state.block_info[block_index]; // Taken after the emplace_back
+        current_block.end = address - 1; // Head half now stops right before the new block
+        new_block.branch = current_block.branch; // The original terminator moves to the tail
+        BlockBranchInfo forward_branch{};
+        forward_branch.address = address;
+        forward_branch.ignore = true;
+        current_block.branch = forward_branch; // Head half just runs into the tail
+        return true;
+    }
+    default:
+        break;
+    }
+    const auto [parse_result, parse_info] = ParseCode(state, address); // No block here yet
+    if (parse_result == ParseResult::AbnormalFlow) {
+        // if it's AbnormalFlow, we end it as false, ending the CFG reconstruction
+        return false;
+    }
+
+    BlockInfo& block_info = CreateBlockInfo(state, address, parse_info.end_address);
+    block_info.branch = parse_info.branch_info;
+    if (parse_info.branch_info.condition.IsUnconditional()) { // No fallthrough path to follow
+        return true;
+    }
+
+    const u32 fallthrough_address = parse_info.end_address + 1;
+    state.inspect_queries.push_front(fallthrough_address); // Inspected next, before other labels
+    return true;
+}
+
+bool TryQuery(CFGRebuildState& state) { // Handles one entry of queries
+    const auto gather_labels = [](std::stack<u32>& cc, std::map<u32, u32>& labels,
+                                  BlockInfo& block) {
+        auto gather_start = labels.lower_bound(block.start);
+        const auto gather_end = labels.upper_bound(block.end);
+        while (gather_start != gather_end) { // Entries keyed inside [start, end], ascending
+            cc.push(gather_start->second);
+            gather_start++;
+        }
+    };
+    if (state.queries.empty()) {
+        return false;
+    }
+    Query& q = state.queries.front();
+    const u32 block_index = state.registered[q.address]; // NOTE(review): inserts 0 if unknown
+    BlockInfo& block = state.block_info[block_index];
+    // If the block is visited, check if the stacks match, else gather the ssy/pbk
+    // labels into the current stack and look if the branch at the end of the block
+    // consumes a label. Schedule new queries accordingly
+    if (block.visited) {
+        BlockStack& stack = state.stacks[q.address];
+        const bool all_okay = (stack.ssy_stack.size() == 0 || q.ssy_stack == stack.ssy_stack) &&
+                              (stack.pbk_stack.size() == 0 || q.pbk_stack == stack.pbk_stack);
+        state.queries.pop_front();
+        return all_okay; // An empty recorded stack is accepted against any incoming stack
+    }
+    block.visited = true;
+    state.stacks[q.address] = BlockStack{q}; // Record the stacks as they were on entry
+    Query q2(q); // Copied because the pop_front() below destroys q
+    state.queries.pop_front();
+    gather_labels(q2.ssy_stack, state.ssy_labels, block);
+    gather_labels(q2.pbk_stack, state.pbk_labels, block);
+    if (!block.branch.condition.IsUnconditional()) { // Conditional end: also follow fallthrough
+        q2.address = block.end + 1;
+        state.queries.push_back(q2);
+    }
+    Query conditional_query{q2};
+    if (block.branch.is_sync) {
+        if (block.branch.address == unassigned_branch) {
+            block.branch.address = conditional_query.ssy_stack.top(); // NOTE(review): no empty check
+        }
+        conditional_query.ssy_stack.pop(); // NOTE(review): undefined if the stack is empty
+    }
+    if (block.branch.is_brk) {
+        if (block.branch.address == unassigned_branch) {
+            block.branch.address = conditional_query.pbk_stack.top(); // NOTE(review): no empty check
+        }
+        conditional_query.pbk_stack.pop(); // NOTE(review): undefined if the stack is empty
+    }
+    conditional_query.address = block.branch.address; // s32 -> u32; exit_branch wraps around
+    state.queries.push_back(conditional_query);
+    return true;
+}
+
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
+                                              u32 start_address) {
+    CFGRebuildState state{program_code, program_size, start_address};
+    // Inspect Code and generate blocks
+    state.labels.clear(); // The set was just constructed, so it is already empty here
+    state.labels.emplace(start_address);
+    state.inspect_queries.push_back(state.start);
+    while (!state.inspect_queries.empty()) {
+        if (!TryInspectAddress(state)) { // Queue is non-empty, so false means abnormal flow
+            return {}; // Empty optional: no characteristics could be produced
+        }
+    }
+    // Decompile Stacks
+    Query start_query{};
+    start_query.address = state.start;
+    state.queries.push_back(start_query);
+    bool decompiled = true;
+    while (!state.queries.empty()) {
+        if (!TryQuery(state)) { // A stack mismatch leaves the blocks but clears 'decompilable'
+            decompiled = false;
+            break;
+        }
+    }
+    // Sort and organize results. NOTE(review): std::sort/std::max need <algorithm> (not included)
+    std::sort(state.block_info.begin(), state.block_info.end(),
+              [](const BlockInfo& a, const BlockInfo& b) -> bool { return a.start < b.start; });
+    ShaderCharacteristics result_out{};
+    result_out.decompilable = decompiled;
+    result_out.start = start_address;
+    result_out.end = start_address; // Raised to the largest block end in the loop below
+    for (auto& block : state.block_info) {
+        ShaderBlock new_block{};
+        new_block.start = block.start;
+        new_block.end = block.end;
+        new_block.ignore_branch = block.branch.ignore;
+        if (!new_block.ignore_branch) { // Otherwise 'branch' keeps its default values
+            new_block.branch.cond = block.branch.condition;
+            new_block.branch.kills = block.branch.kill;
+            new_block.branch.address = block.branch.address;
+        }
+        result_out.end = std::max(result_out.end, block.end);
+        result_out.blocks.push_back(new_block);
+    }
+    if (result_out.decompilable) {
+        result_out.labels = std::move(state.labels); // Labels are only reported in this case
+        return {result_out};
+    }
+    // If it's not decompilable, merge the unlabelled blocks together
+    auto back = result_out.blocks.begin(); // At least the entry block exists at this point
+    auto next = std::next(back);
+    while (next != result_out.blocks.end()) {
+        if (state.labels.count(next->start) == 0 && next->start == back->end + 1) {
+            back->end = next->end; // Only the range grows; 'back' keeps its own branch data
+            next = result_out.blocks.erase(next);
+            continue;
+        }
+        back = next;
+        next++;
+    }
+    return {result_out};
+}
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h
new file mode 100644
index 000000000..5e8ea3271
--- /dev/null
+++ b/src/video_core/shader/control_flow.h
@@ -0,0 +1,63 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstring>
+#include <list>
+#include <optional>
+#include <unordered_set>
+
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::ConditionCode;
+using Tegra::Shader::Pred;
+
+constexpr s32 exit_branch = -1;
+
+struct Condition {
+    Pred predicate{Pred::UnusedIndex}; ///< UnusedIndex means no predicate is tested
+    ConditionCode cc{ConditionCode::T}; ///< T means no condition code is tested
+
+    bool IsUnconditional() const {
+        return predicate == Pred::UnusedIndex && cc == ConditionCode::T;
+    }
+    bool operator==(const Condition& other) const {
+        return std::tie(predicate, cc) == std::tie(other.predicate, other.cc);
+    }
+};
+
+struct ShaderBlock {
+    u32 start{};          ///< First instruction of the block
+    u32 end{};            ///< Last instruction of the block (inclusive)
+    bool ignore_branch{}; ///< If set, branch stays defaulted and no control flow is emitted
+    struct Branch {
+        Condition cond{}; ///< Condition guarding the branch
+        bool kills{};     ///< For a negative address: emit Discard instead of Exit
+        s32 address{};    ///< Target address; a negative value leaves the shader
+        bool operator==(const Branch& b) const {
+            return std::tie(cond, kills, address) == std::tie(b.cond, b.kills, b.address);
+        }
+    } branch{};
+    bool operator==(const ShaderBlock& sb) const {
+        return std::tie(start, end, ignore_branch, branch) ==
+               std::tie(sb.start, sb.end, sb.ignore_branch, sb.branch);
+    }
+};
+
+struct ShaderCharacteristics {
+    std::list<ShaderBlock> blocks{};  ///< Blocks in ascending start order, as built by ScanFlow
+    bool decompilable{};              ///< Set by ScanFlow only if every flow query succeeded
+    u32 start{};                      ///< The start_address handed to ScanFlow
+    u32 end{};                        ///< Highest block end found (never below start)
+    std::unordered_set<u32> labels{}; ///< Filled in by ScanFlow only when decompilable
+};
+
+std::optional<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 program_size,
+                                              u32 start_address);
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index 2c9ff28f2..29c8895c5 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -11,6 +11,7 @@
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/engines/shader_header.h"
+#include "video_core/shader/control_flow.h"
 #include "video_core/shader/node_helper.h"
 #include "video_core/shader/shader_ir.h"
 
@@ -21,20 +22,6 @@ using Tegra::Shader::OpCode;
 
 namespace {
 
-/// Merges exit method of two parallel branches.
-constexpr ExitMethod ParallelExit(ExitMethod a, ExitMethod b) {
-    if (a == ExitMethod::Undetermined) {
-        return b;
-    }
-    if (b == ExitMethod::Undetermined) {
-        return a;
-    }
-    if (a == b) {
-        return a;
-    }
-    return ExitMethod::Conditional;
-}
-
 /**
  * Returns whether the instruction at the specified offset is a 'sched' instruction.
  * Sched instructions always appear before a sequence of 3 instructions.
@@ -51,85 +38,104 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
 void ShaderIR::Decode() {
     std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header));
 
-    std::set<u32> labels;
-    const ExitMethod exit_method = Scan(main_offset, MAX_PROGRAM_LENGTH, labels);
-    if (exit_method != ExitMethod::AlwaysEnd) {
-        UNREACHABLE_MSG("Program does not always end");
-    }
-
-    if (labels.empty()) {
-        basic_blocks.insert({main_offset, DecodeRange(main_offset, MAX_PROGRAM_LENGTH)});
+    disable_flow_stack = false;
+    const auto info = ScanFlow(program_code, program_size, main_offset);
+    if (info) {
+        const auto& shader_info = *info;
+        coverage_begin = shader_info.start;
+        coverage_end = shader_info.end;
+        if (shader_info.decompilable) {
+            disable_flow_stack = true;
+            const auto insert_block = ([this](NodeBlock& nodes, u32 label) {
+                if (label == exit_branch) {
+                    return;
+                }
+                basic_blocks.insert({label, nodes});
+            });
+            const auto& blocks = shader_info.blocks;
+            NodeBlock current_block;
+            u32 current_label = exit_branch;
+            for (auto& block : blocks) {
+                if (shader_info.labels.count(block.start) != 0) {
+                    insert_block(current_block, current_label);
+                    current_block.clear();
+                    current_label = block.start;
+                }
+                if (!block.ignore_branch) {
+                    DecodeRangeInner(current_block, block.start, block.end);
+                    InsertControlFlow(current_block, block);
+                } else {
+                    DecodeRangeInner(current_block, block.start, block.end + 1);
+                }
+            }
+            insert_block(current_block, current_label);
+            return;
+        }
+        LOG_WARNING(HW_GPU, "Flow Stack Removing Failed! Falling back to old method");
+        // we can't decompile it, fallback to standard method
+        for (const auto& block : shader_info.blocks) {
+            basic_blocks.insert({block.start, DecodeRange(block.start, block.end + 1)});
+        }
         return;
     }
+    LOG_WARNING(HW_GPU, "Flow Analysis Failed! Falling back to brute force compiling");
+
+    // Now we need to deal with an undecompilable shader. We need to brute force
+    // a shader that captures every position.
+    coverage_begin = main_offset;
+    const u32 shader_end = static_cast<u32>(program_size / sizeof(u64));
+    coverage_end = shader_end;
+    for (u32 label = main_offset; label < shader_end; label++) {
+        basic_blocks.insert({label, DecodeRange(label, label + 1)});
+    }
+}
 
-    labels.insert(main_offset);
-
-    for (const u32 label : labels) {
-        const auto next_it = labels.lower_bound(label + 1);
-        const u32 next_label = next_it == labels.end() ? MAX_PROGRAM_LENGTH : *next_it;
+NodeBlock ShaderIR::DecodeRange(u32 begin, u32 end) {
+    NodeBlock basic_block;
+    DecodeRangeInner(basic_block, begin, end);
+    return basic_block;
+}
 
-        basic_blocks.insert({label, DecodeRange(label, next_label)});
+void ShaderIR::DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end) {
+    for (u32 pc = begin; pc < (begin > end ? MAX_PROGRAM_LENGTH : end);) {
+        pc = DecodeInstr(bb, pc);
     }
 }
 
-ExitMethod ShaderIR::Scan(u32 begin, u32 end, std::set<u32>& labels) {
-    const auto [iter, inserted] =
-        exit_method_map.emplace(std::make_pair(begin, end), ExitMethod::Undetermined);
-    ExitMethod& exit_method = iter->second;
-    if (!inserted)
-        return exit_method;
-
-    for (u32 offset = begin; offset != end && offset != MAX_PROGRAM_LENGTH; ++offset) {
-        coverage_begin = std::min(coverage_begin, offset);
-        coverage_end = std::max(coverage_end, offset + 1);
-
-        const Instruction instr = {program_code[offset]};
-        const auto opcode = OpCode::Decode(instr);
-        if (!opcode)
-            continue;
-        switch (opcode->get().GetId()) {
-        case OpCode::Id::EXIT: {
-            // The EXIT instruction can be predicated, which means that the shader can conditionally
-            // end on this instruction. We have to consider the case where the condition is not met
-            // and check the exit method of that other basic block.
-            using Tegra::Shader::Pred;
-            if (instr.pred.pred_index == static_cast<u64>(Pred::UnusedIndex)) {
-                return exit_method = ExitMethod::AlwaysEnd;
-            } else {
-                const ExitMethod not_met = Scan(offset + 1, end, labels);
-                return exit_method = ParallelExit(ExitMethod::AlwaysEnd, not_met);
-            }
+void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) {
+    const auto apply_conditions = ([&](const Condition& cond, Node n) -> Node {
+        Node result = n;
+        if (cond.cc != ConditionCode::T) {
+            result = Conditional(GetConditionCode(cond.cc), {result});
         }
-        case OpCode::Id::BRA: {
-            const u32 target = offset + instr.bra.GetBranchTarget();
-            labels.insert(target);
-            const ExitMethod no_jmp = Scan(offset + 1, end, labels);
-            const ExitMethod jmp = Scan(target, end, labels);
-            return exit_method = ParallelExit(no_jmp, jmp);
-        }
-        case OpCode::Id::SSY:
-        case OpCode::Id::PBK: {
-            // The SSY and PBK use a similar encoding as the BRA instruction.
-            UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
-                                 "Constant buffer branching is not supported");
-            const u32 target = offset + instr.bra.GetBranchTarget();
-            labels.insert(target);
-            // Continue scanning for an exit method.
-            break;
+        if (cond.predicate != Pred::UnusedIndex) {
+            u32 pred = static_cast<u32>(cond.predicate);
+            const bool is_neg = pred > 7;
+            if (is_neg) {
+                pred -= 8;
+            }
+            result = Conditional(GetPredicate(pred, is_neg), {result});
         }
-        default:
-            break;
+        return result;
+    });
+    if (block.branch.address < 0) {
+        if (block.branch.kills) {
+            Node n = Operation(OperationCode::Discard);
+            n = apply_conditions(block.branch.cond, n);
+            bb.push_back(n);
+            global_code.push_back(n);
+            return;
         }
+        Node n = Operation(OperationCode::Exit);
+        n = apply_conditions(block.branch.cond, n);
+        bb.push_back(n);
+        global_code.push_back(n);
+        return;
     }
-    return exit_method = ExitMethod::AlwaysReturn;
-}
-
-NodeBlock ShaderIR::DecodeRange(u32 begin, u32 end) {
-    NodeBlock basic_block;
-    for (u32 pc = begin; pc < (begin > end ? MAX_PROGRAM_LENGTH : end);) {
-        pc = DecodeInstr(basic_block, pc);
-    }
-    return basic_block;
+    Node n = Operation(OperationCode::Branch, Immediate(block.branch.address));
+    n = apply_conditions(block.branch.cond, n);
+    bb.push_back(n);
+    global_code.push_back(n);
 }
 
 u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
@@ -140,15 +146,18 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
 
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
+    const u32 nv_address = ConvertAddressToNvidiaSpace(pc);
 
     // Decoding failure
     if (!opcode) {
         UNIMPLEMENTED_MSG("Unhandled instruction: {0:x}", instr.value);
+        bb.push_back(Comment(fmt::format("{:05x} Unimplemented Shader instruction (0x{:016x})",
+                                         nv_address, instr.value)));
         return pc + 1;
     }
 
-    bb.push_back(
-        Comment(fmt::format("{}: {} (0x{:016x})", pc, opcode->get().GetName(), instr.value)));
+    bb.push_back(Comment(
+        fmt::format("{:05x} {} (0x{:016x})", nv_address, opcode->get().GetName(), instr.value)));
 
     using Tegra::Shader::Pred;
     UNIMPLEMENTED_IF_MSG(instr.pred.full_pred == Pred::NeverExecute,
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index d46a8ab82..42e3de02f 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -91,11 +91,46 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         break;
     }
     case OpCode::Id::BRA: {
-        UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
-                             "BRA with constant buffers are not implemented");
+        Node branch;
+        if (instr.bra.constant_buffer == 0) {
+            const u32 target = pc + instr.bra.GetBranchTarget();
+            branch = Operation(OperationCode::Branch, Immediate(target));
+        } else { // Indirect target: (cbuf value >> 3) + pc + 1, as in the BRX case below
+            const u32 target = pc + 1;
+            const Node op_a = GetConstBuffer(instr.cbuf36.index, instr.cbuf36.GetOffset());
+            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
+                                                 PRECISE, op_a, Immediate(3));
+            const Node operand =
+                Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
+            branch = Operation(OperationCode::BranchIndirect, operand);
+        }
 
-        const u32 target = pc + instr.bra.GetBranchTarget();
-        const Node branch = Operation(OperationCode::Branch, Immediate(target));
+        const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
+        if (cc != Tegra::Shader::ConditionCode::T) {
+            bb.push_back(Conditional(GetConditionCode(cc), {branch}));
+        } else {
+            bb.push_back(branch);
+        }
+        break;
+    }
+    case OpCode::Id::BRX: {
+        Node operand;
+        if (instr.brx.constant_buffer != 0) {
+            const s32 target = pc + 1;
+            const Node index = GetRegister(instr.gpr8);
+            const Node op_a =
+                GetConstBufferIndirect(instr.cbuf36.index, instr.cbuf36.GetOffset() + 0, index);
+            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
+                                                 PRECISE, op_a, Immediate(3));
+            operand = Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
+        } else {
+            const s32 target = pc + instr.brx.GetBranchExtend();
+            const Node op_a = GetRegister(instr.gpr8);
+            const Node convert = SignedOperation(OperationCode::IArithmeticShiftRight, true,
+                                                 PRECISE, op_a, Immediate(3));
+            operand = Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
+        }
+        const Node branch = Operation(OperationCode::BranchIndirect, operand);
 
         const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
         if (cc != Tegra::Shader::ConditionCode::T) {
@@ -109,6 +144,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                              "Constant buffer flow is not supported");
 
+        if (disable_flow_stack) {
+            break;
+        }
+
         // The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC.
         const u32 target = pc + instr.bra.GetBranchTarget();
         bb.push_back(
@@ -119,6 +158,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
                              "Constant buffer PBK is not supported");
 
+        if (disable_flow_stack) {
+            break;
+        }
+
         // PBK pushes to a stack the address where BRK will jump to.
         const u32 target = pc + instr.bra.GetBranchTarget();
         bb.push_back(
@@ -130,6 +173,10 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "SYNC condition code used: {}",
                              static_cast<u32>(cc));
 
+        if (disable_flow_stack) {
+            break;
+        }
+
         // The SYNC opcode jumps to the address previously set by the SSY opcode
         bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy));
         break;
@@ -138,6 +185,9 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
         const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;
         UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "BRK condition code used: {}",
                              static_cast<u32>(cc));
+        if (disable_flow_stack) {
+            break;
+        }
 
         // The BRK opcode jumps to the address previously set by the PBK opcode
         bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk));
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index cb480be9b..323be3f14 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -269,7 +269,13 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
             LOG_WARNING(HW_GPU, "TLDS.NODEP implementation is incomplete");
         }
 
-        WriteTexsInstructionFloat(bb, instr, GetTldsCode(instr, texture_type, is_array));
+        const Node4 components = GetTldsCode(instr, texture_type, is_array);
+
+        if (instr.tlds.fp32_flag) {
+            WriteTexsInstructionFloat(bb, instr, components);
+        } else {
+            WriteTexsInstructionHalfFloat(bb, instr, components);
+        }
         break;
     }
     default:
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 0ac83fcf0..7427ed896 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -148,11 +148,12 @@ enum class OperationCode {
 
     ImageStore, /// (MetaImage, float[N] coords) -> void
 
-    Branch,        /// (uint branch_target) -> void
-    PushFlowStack, /// (uint branch_target) -> void
-    PopFlowStack,  /// () -> void
-    Exit,          /// () -> void
-    Discard,       /// () -> void
+    Branch,         /// (uint branch_target) -> void
+    BranchIndirect, /// (uint branch_target) -> void
+    PushFlowStack,  /// (uint branch_target) -> void
+    PopFlowStack,   /// () -> void
+    Exit,           /// () -> void
+    Discard,        /// () -> void
 
     EmitVertex,   /// () -> void
     EndPrimitive, /// () -> void
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index fd80dd116..caa409788 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -22,8 +22,8 @@ using Tegra::Shader::PredCondition;
 using Tegra::Shader::PredOperation;
 using Tegra::Shader::Register;
 
-ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset)
-    : program_code{program_code}, main_offset{main_offset} {
+ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, const std::size_t size)
+    : program_code{program_code}, main_offset{main_offset}, program_size{size} {
     Decode();
 }
 
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index a656761ea..03c888def 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -22,18 +22,12 @@
 
 namespace VideoCommon::Shader {
 
+struct ShaderBlock;
+
 using ProgramCode = std::vector<u64>;
 
 constexpr u32 MAX_PROGRAM_LENGTH = 0x1000;
 
-/// Describes the behaviour of code path of a given entry point and a return point.
-enum class ExitMethod {
-    Undetermined, ///< Internal value. Only occur when analyzing JMP loop.
-    AlwaysReturn, ///< All code paths reach the return point.
-    Conditional,  ///< Code path reaches the return point or an END instruction conditionally.
-    AlwaysEnd,    ///< All code paths reach a END instruction.
-};
-
 class ConstBuffer {
 public:
     explicit ConstBuffer(u32 max_offset, bool is_indirect)
@@ -73,7 +67,7 @@ struct GlobalMemoryUsage {
 
 class ShaderIR final {
 public:
-    explicit ShaderIR(const ProgramCode& program_code, u32 main_offset);
+    explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, std::size_t size);
     ~ShaderIR();
 
     const std::map<u32, NodeBlock>& GetBasicBlocks() const {
@@ -141,12 +135,20 @@ public:
         return header;
     }
 
+    bool IsFlowStackDisabled() const {
+        return disable_flow_stack;
+    }
+
+    u32 ConvertAddressToNvidiaSpace(const u32 address) const { // Byte offset past main_offset
+        return (address - main_offset) * static_cast<u32>(sizeof(Tegra::Shader::Instruction));
+    }
+
 private:
     void Decode();
 
-    ExitMethod Scan(u32 begin, u32 end, std::set<u32>& labels);
-
     NodeBlock DecodeRange(u32 begin, u32 end);
+    void DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end);
+    void InsertControlFlow(NodeBlock& bb, const ShaderBlock& block);
 
     /**
      * Decodes a single instruction from Tegra to IR.
@@ -338,10 +340,11 @@ private:
 
     const ProgramCode& program_code;
     const u32 main_offset;
+    const std::size_t program_size;
+    bool disable_flow_stack{};
 
     u32 coverage_begin{};
     u32 coverage_end{};
-    std::map<std::pair<u32, u32>, ExitMethod> exit_method_map;
 
     std::map<u32, NodeBlock> basic_blocks;
     NodeBlock global_code;
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 7a0fdb19b..6af9044ca 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -75,9 +75,12 @@ MatchStructureResult SurfaceBaseImpl::MatchesStructure(const SurfaceParams& rhs)
 
     // Linear Surface check
     if (!params.is_tiled) {
-        if (std::tie(params.width, params.height, params.pitch) ==
-            std::tie(rhs.width, rhs.height, rhs.pitch)) {
-            return MatchStructureResult::FullMatch;
+        if (std::tie(params.height, params.pitch) == std::tie(rhs.height, rhs.pitch)) {
+            if (params.width == rhs.width) {
+                return MatchStructureResult::FullMatch;
+            } else {
+                return MatchStructureResult::SemiMatch;
+            }
         }
         return MatchStructureResult::None;
     }
diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h
index 8ba386a8a..bcce8d863 100644
--- a/src/video_core/texture_cache/surface_base.h
+++ b/src/video_core/texture_cache/surface_base.h
@@ -200,8 +200,9 @@ public:
         modification_tick = tick;
     }
 
-    void MarkAsRenderTarget(const bool is_target) {
+    void MarkAsRenderTarget(const bool is_target, const u32 index) {
         this->is_target = is_target;
+        this->index = index;
     }
 
     void MarkAsPicked(const bool is_picked) {
@@ -221,6 +222,10 @@ public:
         return is_target;
     }
 
+    u32 GetRenderTarget() const {
+        return index;
+    }
+
     bool IsRegistered() const {
         return is_registered;
     }
@@ -307,10 +312,13 @@ private:
         return view;
     }
 
+    static constexpr u32 NO_RT = 0xFFFFFFFF;
+
     bool is_modified{};
     bool is_target{};
     bool is_registered{};
     bool is_picked{};
+    u32 index{NO_RT};
     u64 modification_tick{};
 };
 
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index 9c56e2b4f..fd5472451 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -290,12 +290,19 @@ std::size_t SurfaceParams::GetLayerSize(bool as_host_size, bool uncompressed) co
 
 std::size_t SurfaceParams::GetInnerMipmapMemorySize(u32 level, bool as_host_size,
                                                     bool uncompressed) const {
-    const bool tiled{as_host_size ? false : is_tiled};
     const u32 width{GetMipmapSize(uncompressed, GetMipWidth(level), GetDefaultBlockWidth())};
     const u32 height{GetMipmapSize(uncompressed, GetMipHeight(level), GetDefaultBlockHeight())};
     const u32 depth{is_layered ? 1U : GetMipDepth(level)};
-    return Tegra::Texture::CalculateSize(tiled, GetBytesPerPixel(), width, height, depth,
-                                         GetMipBlockHeight(level), GetMipBlockDepth(level));
+    if (is_tiled) {
+        return Tegra::Texture::CalculateSize(!as_host_size, GetBytesPerPixel(), width, height,
+                                             depth, GetMipBlockHeight(level),
+                                             GetMipBlockDepth(level));
+    } else if (as_host_size || IsBuffer()) {
+        return GetBytesPerPixel() * width * height * depth;
+    } else {
+        // Linear Texture Case
+        return pitch * height * depth;
+    }
 }
 
 bool SurfaceParams::operator==(const SurfaceParams& rhs) const {
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index c9e72531a..7f9623c62 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -133,11 +133,11 @@ public:
             regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)};
         auto surface_view = GetSurface(gpu_addr, depth_params, preserve_contents, true);
         if (depth_buffer.target)
-            depth_buffer.target->MarkAsRenderTarget(false);
+            depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
         depth_buffer.target = surface_view.first;
         depth_buffer.view = surface_view.second;
         if (depth_buffer.target)
-            depth_buffer.target->MarkAsRenderTarget(true);
+            depth_buffer.target->MarkAsRenderTarget(true, DEPTH_RT);
         return surface_view.second;
     }
 
@@ -167,11 +167,11 @@ public:
         auto surface_view = GetSurface(gpu_addr, SurfaceParams::CreateForFramebuffer(system, index),
                                        preserve_contents, true);
         if (render_targets[index].target)
-            render_targets[index].target->MarkAsRenderTarget(false);
+            render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
         render_targets[index].target = surface_view.first;
         render_targets[index].view = surface_view.second;
         if (render_targets[index].target)
-            render_targets[index].target->MarkAsRenderTarget(true);
+            render_targets[index].target->MarkAsRenderTarget(true, static_cast<u32>(index));
         return surface_view.second;
     }
 
@@ -191,7 +191,7 @@ public:
         if (depth_buffer.target == nullptr) {
             return;
         }
-        depth_buffer.target->MarkAsRenderTarget(false);
+        depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
         depth_buffer.target = nullptr;
         depth_buffer.view = nullptr;
     }
@@ -200,7 +200,7 @@ public:
         if (render_targets[index].target == nullptr) {
             return;
         }
-        render_targets[index].target->MarkAsRenderTarget(false);
+        render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
         render_targets[index].target = nullptr;
         render_targets[index].view = nullptr;
     }
@@ -270,6 +270,16 @@ protected:
     // and reading it from a sepparate buffer.
     virtual void BufferCopy(TSurface& src_surface, TSurface& dst_surface) = 0;
 
+    void ManageRenderTargetUnregister(TSurface& surface) {
+        auto& dirty_flags = system.GPU().Maxwell3D().dirty_flags;
+        const u32 rt_index = surface->GetRenderTarget();
+        if (rt_index != DEPTH_RT) {
+            dirty_flags.color_buffer.set(rt_index, true);
+            return;
+        }
+        dirty_flags.zeta_buffer = true; // DEPTH_RT marks the depth buffer slot
+    }
+
     void Register(TSurface surface) {
         const GPUVAddr gpu_addr = surface->GetGpuAddr();
         const CacheAddr cache_ptr = ToCacheAddr(system.GPU().MemoryManager().GetPointer(gpu_addr));
@@ -294,6 +304,9 @@ protected:
         if (guard_render_targets && surface->IsProtected()) {
             return;
         }
+        if (!guard_render_targets && surface->IsRenderTarget()) {
+            ManageRenderTargetUnregister(surface);
+        }
         const GPUVAddr gpu_addr = surface->GetGpuAddr();
         const CacheAddr cache_ptr = surface->GetCacheAddr();
         const std::size_t size = surface->GetSizeInBytes();
@@ -649,15 +662,6 @@ private:
                 }
                 return {current_surface, *view};
             }
-            // The next case is unsafe, so if we r in accurate GPU, just skip it
-            if (Settings::values.use_accurate_gpu_emulation) {
-                return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
-                                      MatchTopologyResult::FullMatch);
-            }
-            // This is the case the texture is a part of the parent.
-            if (current_surface->MatchesSubTexture(params, gpu_addr)) {
-                return RebuildSurface(current_surface, params, is_render);
-            }
         } else {
             // If there are many overlaps, odds are they are subtextures of the candidate
             // surface. We try to construct a new surface based on the candidate parameters,
@@ -793,6 +797,9 @@ private:
     static constexpr u64 registry_page_size{1 << registry_page_bits};
     std::unordered_map<CacheAddr, std::vector<TSurface>> registry;
 
+    static constexpr u32 DEPTH_RT = 8;
+    static constexpr u32 NO_RT = 0xFFFFFFFF;
+
     // The L1 Cache is used for fast texture lookup before checking the overlaps
     // This avoids calculating size and other stuffs.
     std::unordered_map<CacheAddr, TSurface> l1_cache;