16 files changed, 768 insertions, 227 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 509ca117a..6113e17ff 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -87,6 +87,7 @@ add_library(video_core STATIC
     shader/decode.cpp
     shader/shader_ir.cpp
     shader/shader_ir.h
+    shader/track.cpp
     surface.cpp
     surface.h
     textures/astc.cpp
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index cdef97bc6..9989825f8 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -208,6 +208,8 @@ enum class UniformType : u64 {
     SignedShort = 3,
     Single = 4,
     Double = 5,
+    Quad = 6,
+    UnsignedQuad = 7,
 };
 
 enum class StoreType : u64 {
@@ -785,6 +787,12 @@ union Instruction {
     } st_l;
 
     union {
+        BitField<48, 3, UniformType> type;
+        BitField<46, 2, u64> cache_mode;
+        BitField<20, 24, s64> immediate_offset;
+    } ldg;
+
+    union {
         BitField<0, 3, u64> pred0;
         BitField<3, 3, u64> pred3;
         BitField<7, 1, u64> abs_a;
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 08cf6268f..d3d32a359 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -3,6 +3,8 @@
 // Refer to the license.txt file included.
 
 #include "common/assert.h"
+#include "core/core_timing.h"
+#include "core/memory.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/kepler_memory.h"
 #include "video_core/engines/maxwell_3d.h"
@@ -124,9 +126,36 @@ u32 DepthFormatBytesPerPixel(DepthFormat format) {
     }
 }
 
+// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
+// their numbers are written down multiplied by 4 in the docs. Here we do not multiply by 4,
+// so the values you see in the docs might be these values multiplied by 4.
 enum class BufferMethods {
-    BindObject = 0,
-    CountBufferMethods = 0x40,
+    BindObject = 0x0,
+    Nop = 0x2,
+    SemaphoreAddressHigh = 0x4,
+    SemaphoreAddressLow = 0x5,
+    SemaphoreSequence = 0x6,
+    SemaphoreTrigger = 0x7,
+    NotifyIntr = 0x8,
+    WrcacheFlush = 0x9,
+    Unk28 = 0xA,
+    Unk2c = 0xB,
+    RefCnt = 0x14,
+    SemaphoreAcquire = 0x1A,
+    SemaphoreRelease = 0x1B,
+    Unk70 = 0x1C,
+    Unk74 = 0x1D,
+    Unk78 = 0x1E,
+    Unk7c = 0x1F,
+    Yield = 0x20,
+    NonPullerMethods = 0x40,
+};
+
+enum class GpuSemaphoreOperation {
+    AcquireEqual = 0x1,
+    WriteLong = 0x2,
+    AcquireGequal = 0x4,
+    AcquireMask = 0x8,
 };
 
 void GPU::CallMethod(const MethodCall& method_call) {
@@ -135,20 +164,78 @@ void GPU::CallMethod(const MethodCall& method_call) {
 
     ASSERT(method_call.subchannel < bound_engines.size());
 
-    if (method_call.method == static_cast<u32>(BufferMethods::BindObject)) {
-        // Bind the current subchannel to the desired engine id.
-        LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel,
-                  method_call.argument);
-        bound_engines[method_call.subchannel] = static_cast<EngineID>(method_call.argument);
-        return;
+    if (ExecuteMethodOnEngine(method_call)) {
+        CallEngineMethod(method_call);
+    } else {
+        CallPullerMethod(method_call);
     }
+}
+
+bool GPU::ExecuteMethodOnEngine(const MethodCall& method_call) {
+    const auto method = static_cast<BufferMethods>(method_call.method);
+    return method >= BufferMethods::NonPullerMethods;
+}
 
-    if (method_call.method < static_cast<u32>(BufferMethods::CountBufferMethods)) {
-        // TODO(Subv): Research and implement these methods.
-        LOG_ERROR(HW_GPU, "Special buffer methods other than Bind are not implemented");
-        return;
+void GPU::CallPullerMethod(const MethodCall& method_call) {
+    regs.reg_array[method_call.method] = method_call.argument;
+    const auto method = static_cast<BufferMethods>(method_call.method);
+
+    switch (method) {
+    case BufferMethods::BindObject: {
+        ProcessBindMethod(method_call);
+        break;
+    }
+    case BufferMethods::Nop:
+    case BufferMethods::SemaphoreAddressHigh:
+    case BufferMethods::SemaphoreAddressLow:
+    case BufferMethods::SemaphoreSequence:
+    case BufferMethods::RefCnt:
+        break;
+    case BufferMethods::SemaphoreTrigger: {
+        ProcessSemaphoreTriggerMethod();
+        break;
+    }
+    case BufferMethods::NotifyIntr: {
+        // TODO(Kmather73): Research and implement this method.
+        LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented");
+        break;
+    }
+    case BufferMethods::WrcacheFlush: {
+        // TODO(Kmather73): Research and implement this method.
+        LOG_ERROR(HW_GPU, "Special puller engine method WrcacheFlush not implemented");
+        break;
+    }
+    case BufferMethods::Unk28: {
+        // TODO(Kmather73): Research and implement this method.
+        LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented");
+        break;
+    }
+    case BufferMethods::Unk2c: {
+        // TODO(Kmather73): Research and implement this method.
+        LOG_ERROR(HW_GPU, "Special puller engine method Unk2c not implemented");
+        break;
+    }
+    case BufferMethods::SemaphoreAcquire: {
+        ProcessSemaphoreAcquire();
+        break;
     }
+    case BufferMethods::SemaphoreRelease: {
+        ProcessSemaphoreRelease();
+        break;
+    }
+    case BufferMethods::Yield: {
+        // TODO(Kmather73): Research and implement this method.
+        LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented");
+        break;
+    }
+    default:
+        LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented",
+                  static_cast<u32>(method));
+        break;
+    }
+}
 
+void GPU::CallEngineMethod(const MethodCall& method_call) {
     const EngineID engine = bound_engines[method_call.subchannel];
 
     switch (engine) {
@@ -172,4 +259,76 @@ void GPU::CallMethod(const MethodCall& method_call) {
     }
 }
 
+void GPU::ProcessBindMethod(const MethodCall& method_call) {
+    // Bind the current subchannel to the desired engine id.
+    LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel,
+              method_call.argument);
+    bound_engines[method_call.subchannel] = static_cast<EngineID>(method_call.argument);
+}
+
+void GPU::ProcessSemaphoreTriggerMethod() {
+    const auto semaphoreOperationMask = 0xF;
+    const auto op =
+        static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask);
+    if (op == GpuSemaphoreOperation::WriteLong) {
+        auto address = memory_manager->GpuToCpuAddress(regs.smaphore_address.SmaphoreAddress());
+        struct Block {
+            u32 sequence;
+            u32 zeros = 0;
+            u64 timestamp;
+        };
+
+        Block block{};
+        block.sequence = regs.semaphore_sequence;
+        // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of
+        // CoreTiming
+        block.timestamp = CoreTiming::GetTicks();
+        Memory::WriteBlock(*address, &block, sizeof(block));
+    } else {
+        const auto address =
+            memory_manager->GpuToCpuAddress(regs.smaphore_address.SmaphoreAddress());
+        const u32 word = Memory::Read32(*address);
+        if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) ||
+            (op == GpuSemaphoreOperation::AcquireGequal &&
+             static_cast<s32>(word - regs.semaphore_sequence) > 0) ||
+            (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) {
+            // Nothing to do in this case
+        } else {
+            regs.acquire_source = true;
+            regs.acquire_value = regs.semaphore_sequence;
+            if (op == GpuSemaphoreOperation::AcquireEqual) {
+                regs.acquire_active = true;
+                regs.acquire_mode = false;
+            } else if (op == GpuSemaphoreOperation::AcquireGequal) {
+                regs.acquire_active = true;
+                regs.acquire_mode = true;
+            } else if (op == GpuSemaphoreOperation::AcquireMask) {
+                // TODO(Kmather73): The acquire mask operation waits for a value that, ANDed with
+                // semaphore_sequence, gives a non-zero result
+                LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented");
+            } else {
+                LOG_ERROR(HW_GPU, "Invalid semaphore operation");
+            }
+        }
+    }
+}
+
+void GPU::ProcessSemaphoreRelease() {
+    const auto address = memory_manager->GpuToCpuAddress(regs.smaphore_address.SmaphoreAddress());
+    Memory::Write32(*address, regs.semaphore_release);
+}
+
+void GPU::ProcessSemaphoreAcquire() {
+    const auto address = memory_manager->GpuToCpuAddress(regs.smaphore_address.SmaphoreAddress());
+    const u32 word = Memory::Read32(*address);
+    const auto value = regs.semaphore_acquire;
+    if (word != value) {
+        regs.acquire_active = true;
+        regs.acquire_value = value;
+        // TODO(Kmather73): Figure out how to do the acquire_timeout
+        regs.acquire_mode = false;
+        regs.acquire_source = false;
+    }
+}
+
 } // namespace Tegra
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index af5ccd1e9..fb8975811 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -156,6 +156,46 @@ public:
     /// Returns a const reference to the GPU DMA pusher.
     const Tegra::DmaPusher& DmaPusher() const;
 
+    struct Regs {
+        static constexpr size_t NUM_REGS = 0x100;
+
+        union {
+            struct {
+                INSERT_PADDING_WORDS(0x4);
+                struct {
+                    u32 address_high;
+                    u32 address_low;
+
+                    GPUVAddr SmaphoreAddress() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
+                    }
+                } smaphore_address;
+
+                u32 semaphore_sequence;
+                u32 semaphore_trigger;
+                INSERT_PADDING_WORDS(0xC);
+
+                // The pusher and the puller share the reference counter; the pusher only has read
+                // access
+                u32 reference_count;
+                INSERT_PADDING_WORDS(0x5);
+
+                u32 semaphore_acquire;
+                u32 semaphore_release;
+                INSERT_PADDING_WORDS(0xE4);
+
+                // Puller state
+                u32 acquire_mode;
+                u32 acquire_source;
+                u32 acquire_active;
+                u32 acquire_timeout;
+                u32 acquire_value;
+            };
+            std::array<u32, NUM_REGS> reg_array;
+        };
+    } regs{};
+
 private:
     std::unique_ptr<Tegra::DmaPusher> dma_pusher;
     std::unique_ptr<Tegra::MemoryManager> memory_manager;
@@ -173,6 +213,37 @@ private:
     std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
     /// Inline memory engine
     std::unique_ptr<Engines::KeplerMemory> kepler_memory;
+
+    void ProcessBindMethod(const MethodCall& method_call);
+    void ProcessSemaphoreTriggerMethod();
+    void ProcessSemaphoreRelease();
+    void ProcessSemaphoreAcquire();
+
+    // Calls a GPU puller method.
+    void CallPullerMethod(const MethodCall& method_call);
+    // Calls a GPU engine method.
+    void CallEngineMethod(const MethodCall& method_call);
+    // Determines where the method should be executed.
+    bool ExecuteMethodOnEngine(const MethodCall& method_call);
 };
 
+#define ASSERT_REG_POSITION(field_name, position)                                                  \
+    static_assert(offsetof(GPU::Regs, field_name) == position * 4,                                 \
+                  "Field " #field_name " has invalid position")
+
+ASSERT_REG_POSITION(smaphore_address, 0x4);
+ASSERT_REG_POSITION(semaphore_sequence, 0x6);
+ASSERT_REG_POSITION(semaphore_trigger, 0x7);
+ASSERT_REG_POSITION(reference_count, 0x14);
+ASSERT_REG_POSITION(semaphore_acquire, 0x1A);
+ASSERT_REG_POSITION(semaphore_release, 0x1B);
+
+ASSERT_REG_POSITION(acquire_mode, 0x100);
+ASSERT_REG_POSITION(acquire_source, 0x101);
+ASSERT_REG_POSITION(acquire_active, 0x102);
+ASSERT_REG_POSITION(acquire_timeout, 0x103);
+ASSERT_REG_POSITION(acquire_value, 0x104);
+
+#undef ASSERT_REG_POSITION
+
 } // namespace Tegra
diff --git a/src/video_core/renderer_opengl/gl_global_cache.cpp b/src/video_core/renderer_opengl/gl_global_cache.cpp
index 7992b82c4..c7f32feaa 100644
--- a/src/video_core/renderer_opengl/gl_global_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_global_cache.cpp
@@ -4,8 +4,13 @@
 
 #include <glad/glad.h>
 
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "core/memory.h"
 #include "video_core/renderer_opengl/gl_global_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
+#include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/utils.h"
 
 namespace OpenGL {
@@ -18,7 +23,72 @@ CachedGlobalRegion::CachedGlobalRegion(VAddr addr, u32 size) : addr{addr}, size{
     LabelGLObject(GL_BUFFER, buffer.handle, addr, "GlobalMemory");
 }
 
+void CachedGlobalRegion::Reload(u32 size_) {
+    constexpr auto max_size = static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize);
+
+    size = size_;
+    if (size > max_size) {
+        size = max_size;
+        LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the expected size {}!", size_,
+                     max_size);
+    }
+
+    // TODO(Rodrigo): Get rid of Memory::GetPointer with a staging buffer
+    glBindBuffer(GL_SHADER_STORAGE_BUFFER, buffer.handle);
+    glBufferData(GL_SHADER_STORAGE_BUFFER, size, Memory::GetPointer(addr), GL_DYNAMIC_DRAW);
+}
+
+GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(VAddr addr, u32 size) const {
+    const auto search{reserve.find(addr)};
+    if (search == reserve.end()) {
+        return {};
+    }
+    return search->second;
+}
+
+GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(VAddr addr, u32 size) {
+    GlobalRegion region{TryGetReservedGlobalRegion(addr, size)};
+    if (!region) {
+        // No reserved global region available, create a new one and reserve it
+        region = std::make_shared<CachedGlobalRegion>(addr, size);
+        ReserveGlobalRegion(region);
+    }
+    region->Reload(size);
+    return region;
+}
+
+void GlobalRegionCacheOpenGL::ReserveGlobalRegion(const GlobalRegion& region) {
+    reserve[region->GetAddr()] = region;
+}
+
 GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer)
     : RasterizerCache{rasterizer} {}
 
+GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
+    const GLShader::GlobalMemoryEntry& global_region,
+    Tegra::Engines::Maxwell3D::Regs::ShaderStage stage) {
+
+    auto& gpu{Core::System::GetInstance().GPU()};
+    const auto cbufs = gpu.Maxwell3D().state.shader_stages[static_cast<u64>(stage)];
+    const auto cbuf_addr = gpu.MemoryManager().GpuToCpuAddress(
+        cbufs.const_buffers[global_region.GetCbufIndex()].address + global_region.GetCbufOffset());
+    ASSERT(cbuf_addr);
+
+    const auto actual_addr_gpu = Memory::Read64(*cbuf_addr);
+    const auto size = Memory::Read32(*cbuf_addr + 8);
+    const auto actual_addr = gpu.MemoryManager().GpuToCpuAddress(actual_addr_gpu);
+    ASSERT(actual_addr);
+
+    // Look up global region in the cache based on address
+    GlobalRegion region = TryGet(*actual_addr);
+
+    if (!region) {
+        // No global region found - create a new one
+        region = GetUncachedGlobalRegion(*actual_addr, size);
+        Register(region);
+    }
+
+    return region;
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_global_cache.h b/src/video_core/renderer_opengl/gl_global_cache.h
index 406a735bc..37830bb7c 100644
--- a/src/video_core/renderer_opengl/gl_global_cache.h
+++ b/src/video_core/renderer_opengl/gl_global_cache.h
@@ -5,9 +5,13 @@
 #pragma once
 
 #include <memory>
+#include <unordered_map>
+
 #include <glad/glad.h>
 
+#include "common/assert.h"
 #include "common/common_types.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 
@@ -40,6 +44,9 @@ public:
         return buffer.handle;
     }
 
+    /// Reloads the global region from guest memory
+    void Reload(u32 size_);
+
     // TODO(Rodrigo): When global memory is written (STG), implement flushing
     void Flush() override {
         UNIMPLEMENTED();
@@ -55,6 +62,17 @@ private:
 class GlobalRegionCacheOpenGL final : public RasterizerCache<GlobalRegion> {
 public:
     explicit GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer);
+
+    /// Gets the global memory region referenced by the descriptor for the given shader stage
+    GlobalRegion GetGlobalRegion(const GLShader::GlobalMemoryEntry& descriptor,
+                                 Tegra::Engines::Maxwell3D::Regs::ShaderStage stage);
+
+private:
+    GlobalRegion TryGetReservedGlobalRegion(VAddr addr, u32 size) const;
+    GlobalRegion GetUncachedGlobalRegion(VAddr addr, u32 size);
+    void ReserveGlobalRegion(const GlobalRegion& region);
+
+    std::unordered_map<VAddr, GlobalRegion> reserve;
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 71829fee0..ee313cb2f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -297,10 +297,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
     MICROPROFILE_SCOPE(OpenGL_Shader);
     auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
 
-    // Next available bindpoints to use when uploading the const buffers and textures to the GLSL
-    // shaders. The constbuffer bindpoint starts after the shader stage configuration bind points.
-    u32 current_constbuffer_bindpoint = Tegra::Engines::Maxwell3D::Regs::MaxShaderStage;
-    u32 current_texture_bindpoint = 0;
+    BaseBindings base_bindings;
     std::array<bool, Maxwell::NumClipDistances> clip_distances{};
 
     for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@@ -324,43 +321,35 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
         const GLintptr offset = buffer_cache.UploadHostMemory(
             &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment));
 
-        // Bind the buffer
-        glBindBufferRange(GL_UNIFORM_BUFFER, static_cast<GLuint>(stage), buffer_cache.GetHandle(),
-                          offset, static_cast<GLsizeiptr>(sizeof(ubo)));
+        // Bind the emulation info buffer
+        glBindBufferRange(GL_UNIFORM_BUFFER, base_bindings.cbuf, buffer_cache.GetHandle(), offset,
+                          static_cast<GLsizeiptr>(sizeof(ubo)));
 
         Shader shader{shader_cache.GetStageProgram(program)};
+        const auto [program_handle, next_bindings] =
+            shader->GetProgramHandle(primitive_mode, base_bindings);
 
         switch (program) {
         case Maxwell::ShaderProgram::VertexA:
-        case Maxwell::ShaderProgram::VertexB: {
-            shader_program_manager->UseProgrammableVertexShader(
-                shader->GetProgramHandle(primitive_mode));
+        case Maxwell::ShaderProgram::VertexB:
+            shader_program_manager->UseProgrammableVertexShader(program_handle);
             break;
-        }
-        case Maxwell::ShaderProgram::Geometry: {
-            shader_program_manager->UseProgrammableGeometryShader(
-                shader->GetProgramHandle(primitive_mode));
+        case Maxwell::ShaderProgram::Geometry:
+            shader_program_manager->UseProgrammableGeometryShader(program_handle);
             break;
-        }
-        case Maxwell::ShaderProgram::Fragment: {
-            shader_program_manager->UseProgrammableFragmentShader(
-                shader->GetProgramHandle(primitive_mode));
+        case Maxwell::ShaderProgram::Fragment:
+            shader_program_manager->UseProgrammableFragmentShader(program_handle);
             break;
-        }
         default:
             LOG_CRITICAL(HW_GPU, "Unimplemented shader index={}, enable={}, offset=0x{:08X}", index,
                          shader_config.enable.Value(), shader_config.offset);
             UNREACHABLE();
         }
 
-        // Configure the const buffers for this shader stage.
-        current_constbuffer_bindpoint =
-            SetupConstBuffers(static_cast<Maxwell::ShaderStage>(stage), shader, primitive_mode,
-                              current_constbuffer_bindpoint);
-
-        // Configure the textures for this shader stage.
-        current_texture_bindpoint = SetupTextures(static_cast<Maxwell::ShaderStage>(stage), shader,
-                                                  primitive_mode, current_texture_bindpoint);
+        const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage);
+        SetupConstBuffers(stage_enum, shader, program_handle, base_bindings);
+        SetupGlobalRegions(stage_enum, shader, program_handle, base_bindings);
+        SetupTextures(stage_enum, shader, program_handle, base_bindings);
 
         // Workaround for Intel drivers.
         // When a clip distance is enabled but not set in the shader it crops parts of the screen
@@ -375,6 +364,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
             // VertexB was combined with VertexA, so we skip the VertexB iteration
             index++;
         }
+
+        base_bindings = next_bindings;
     }
 
     SyncClipEnabled(clip_distances);
@@ -924,8 +915,9 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr
     }
 }
 
-u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, Shader& shader,
-                                        GLenum primitive_mode, u32 current_bindpoint) {
+void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                                         const Shader& shader, GLuint program_handle,
+                                         BaseBindings base_bindings) {
     MICROPROFILE_SCOPE(OpenGL_UBO);
     const auto& gpu = Core::System::GetInstance().GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
@@ -973,75 +965,73 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, Shader& shad
         size = Common::AlignUp(size, sizeof(GLvec4));
         ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
 
-        GLintptr const_buffer_offset = buffer_cache.UploadMemory(
+        const GLintptr const_buffer_offset = buffer_cache.UploadMemory(
             buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment));
 
-        // Now configure the bindpoint of the buffer inside the shader
-        glUniformBlockBinding(shader->GetProgramHandle(primitive_mode),
-                              shader->GetProgramResourceIndex(used_buffer),
-                              current_bindpoint + bindpoint);
-
         // Prepare values for multibind
         bind_buffers[bindpoint] = buffer_cache.GetHandle();
         bind_offsets[bindpoint] = const_buffer_offset;
         bind_sizes[bindpoint] = size;
     }
 
-    glBindBuffersRange(GL_UNIFORM_BUFFER, current_bindpoint, static_cast<GLsizei>(entries.size()),
+    // The first binding is reserved for emulation values
+    const GLuint ubo_base_binding = base_bindings.cbuf + 1;
+    glBindBuffersRange(GL_UNIFORM_BUFFER, ubo_base_binding, static_cast<GLsizei>(entries.size()),
                        bind_buffers.data(), bind_offsets.data(), bind_sizes.data());
+}
+
+void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                                          const Shader& shader, GLenum primitive_mode,
+                                          BaseBindings base_bindings) {
+    // TODO(Rodrigo): Use ARB_multi_bind here
+    const auto& entries = shader->GetShaderEntries().global_memory_entries;
 
-    return current_bindpoint + static_cast<u32>(entries.size());
+    for (u32 bindpoint = 0; bindpoint < static_cast<u32>(entries.size()); ++bindpoint) {
+        const auto& entry = entries[bindpoint];
+        const u32 current_bindpoint = base_bindings.gmem + bindpoint;
+        const auto& region = global_cache.GetGlobalRegion(entry, stage);
+
+        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, current_bindpoint, region->GetBufferHandle());
+    }
 }
 
-u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, Shader& shader,
-                                    GLenum primitive_mode, u32 current_unit) {
+void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader,
+                                     GLuint program_handle, BaseBindings base_bindings) {
     MICROPROFILE_SCOPE(OpenGL_Texture);
     const auto& gpu = Core::System::GetInstance().GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
     const auto& entries = shader->GetShaderEntries().samplers;
 
-    ASSERT_MSG(current_unit + entries.size() <= std::size(state.texture_units),
+    ASSERT_MSG(base_bindings.sampler + entries.size() <= std::size(state.texture_units),
                "Exceeded the number of active textures.");
 
     for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
         const auto& entry = entries[bindpoint];
-        const u32 current_bindpoint = current_unit + bindpoint;
-
-        // Bind the uniform to the sampler.
-
-        glProgramUniform1i(shader->GetProgramHandle(primitive_mode),
-                           shader->GetUniformLocation(entry), current_bindpoint);
+        const u32 current_bindpoint = base_bindings.sampler + bindpoint;
+        auto& unit = state.texture_units[current_bindpoint];
 
         const auto texture = maxwell3d.GetStageTexture(entry.GetStage(), entry.GetOffset());
-
         if (!texture.enabled) {
-            state.texture_units[current_bindpoint].texture = 0;
+            unit.texture = 0;
             continue;
         }
 
         texture_samplers[current_bindpoint].SyncWithConfig(texture.tsc);
+
         Surface surface = res_cache.GetTextureSurface(texture, entry);
         if (surface != nullptr) {
-            const GLuint handle =
+            unit.texture =
                 entry.IsArray() ? surface->TextureLayer().handle : surface->Texture().handle;
-            const GLenum target = entry.IsArray() ? surface->TargetLayer() : surface->Target();
-            state.texture_units[current_bindpoint].texture = handle;
-            state.texture_units[current_bindpoint].target = target;
-            state.texture_units[current_bindpoint].swizzle.r =
-                MaxwellToGL::SwizzleSource(texture.tic.x_source);
-            state.texture_units[current_bindpoint].swizzle.g =
-                MaxwellToGL::SwizzleSource(texture.tic.y_source);
-            state.texture_units[current_bindpoint].swizzle.b =
-                MaxwellToGL::SwizzleSource(texture.tic.z_source);
-            state.texture_units[current_bindpoint].swizzle.a =
-                MaxwellToGL::SwizzleSource(texture.tic.w_source);
+            unit.target = entry.IsArray() ? surface->TargetLayer() : surface->Target();
+            unit.swizzle.r = MaxwellToGL::SwizzleSource(texture.tic.x_source);
+            unit.swizzle.g = MaxwellToGL::SwizzleSource(texture.tic.y_source);
+            unit.swizzle.b = MaxwellToGL::SwizzleSource(texture.tic.z_source);
+            unit.swizzle.a = MaxwellToGL::SwizzleSource(texture.tic.w_source);
         } else {
             // Can occur when texture addr is null or its memory is unmapped/invalid
-            state.texture_units[current_bindpoint].texture = 0;
+            unit.texture = 0;
         }
     }
-
-    return current_unit + static_cast<u32>(entries.size());
 }
 
 void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 21c51f874..a103692f9 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -127,25 +127,18 @@ private:
                                bool using_depth_fb = true, bool preserve_contents = true,
                                std::optional<std::size_t> single_color_target = {});
 
-    /**
-     * Configures the current constbuffers to use for the draw command.
-     * @param stage The shader stage to configure buffers for.
-     * @param shader The shader object that contains the specified stage.
-     * @param current_bindpoint The offset at which to start counting new buffer bindpoints.
-     * @returns The next available bindpoint for use in the next shader stage.
-     */
-    u32 SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, Shader& shader,
-                          GLenum primitive_mode, u32 current_bindpoint);
-
-    /**
-     * Configures the current textures to use for the draw command.
-     * @param stage The shader stage to configure textures for.
-     * @param shader The shader object that contains the specified stage.
-     * @param current_unit The offset at which to start counting unused texture units.
-     * @returns The next available bindpoint for use in the next shader stage.
-     */
-    u32 SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, Shader& shader,
-                      GLenum primitive_mode, u32 current_unit);
+    /// Configures the current constbuffers to use for the draw command.
+    void SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader,
+                           GLuint program_handle, BaseBindings base_bindings);
+
+    /// Configures the current global memory entries to use for the draw command.
+    void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                            const Shader& shader, GLenum primitive_mode,
+                            BaseBindings base_bindings);
+
+    /// Configures the current textures to use for the draw command.
+    void SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader,
+                       GLuint program_handle, BaseBindings base_bindings);
 
     /// Syncs the viewport and depth range to match the guest state
     void SyncViewport(OpenGLState& current_state);
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index b3aca39af..90eda7814 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -34,36 +34,25 @@ static ProgramCode GetShaderCode(VAddr addr) {
     return program_code;
 }
 
-/// Helper function to set shader uniform block bindings for a single shader stage
-static void SetShaderUniformBlockBinding(GLuint shader, const char* name,
-                                         Maxwell::ShaderStage binding, std::size_t expected_size) {
-    const GLuint ub_index = glGetUniformBlockIndex(shader, name);
-    if (ub_index == GL_INVALID_INDEX) {
-        return;
+/// Gets the shader type from a Maxwell program type
+constexpr GLenum GetShaderType(Maxwell::ShaderProgram program_type) {
+    switch (program_type) {
+    case Maxwell::ShaderProgram::VertexA:
+    case Maxwell::ShaderProgram::VertexB:
+        return GL_VERTEX_SHADER;
+    case Maxwell::ShaderProgram::Geometry:
+        return GL_GEOMETRY_SHADER;
+    case Maxwell::ShaderProgram::Fragment:
+        return GL_FRAGMENT_SHADER;
+    default:
+        return GL_NONE;
     }
-
-    GLint ub_size = 0;
-    glGetActiveUniformBlockiv(shader, ub_index, GL_UNIFORM_BLOCK_DATA_SIZE, &ub_size);
-    ASSERT_MSG(static_cast<std::size_t>(ub_size) == expected_size,
-               "Uniform block size did not match! Got {}, expected {}", ub_size, expected_size);
-    glUniformBlockBinding(shader, ub_index, static_cast<GLuint>(binding));
-}
-
-/// Sets shader uniform block bindings for an entire shader program
-static void SetShaderUniformBlockBindings(GLuint shader) {
-    SetShaderUniformBlockBinding(shader, "vs_config", Maxwell::ShaderStage::Vertex,
-                                 sizeof(GLShader::MaxwellUniformData));
-    SetShaderUniformBlockBinding(shader, "gs_config", Maxwell::ShaderStage::Geometry,
-                                 sizeof(GLShader::MaxwellUniformData));
-    SetShaderUniformBlockBinding(shader, "fs_config", Maxwell::ShaderStage::Fragment,
-                                 sizeof(GLShader::MaxwellUniformData));
 }
 
 CachedShader::CachedShader(VAddr addr, Maxwell::ShaderProgram program_type)
     : addr{addr}, program_type{program_type}, setup{GetShaderCode(addr)} {
 
     GLShader::ProgramResult program_result;
-    GLenum gl_type{};
 
     switch (program_type) {
     case Maxwell::ShaderProgram::VertexA:
@@ -74,17 +63,14 @@ CachedShader::CachedShader(VAddr addr, Maxwell::ShaderProgram program_type)
     case Maxwell::ShaderProgram::VertexB:
         CalculateProperties();
         program_result = GLShader::GenerateVertexShader(setup);
-        gl_type = GL_VERTEX_SHADER;
         break;
     case Maxwell::ShaderProgram::Geometry:
         CalculateProperties();
         program_result = GLShader::GenerateGeometryShader(setup);
-        gl_type = GL_GEOMETRY_SHADER;
         break;
     case Maxwell::ShaderProgram::Fragment:
         CalculateProperties();
         program_result = GLShader::GenerateFragmentShader(setup);
-        gl_type = GL_FRAGMENT_SHADER;
         break;
     default:
         LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type));
@@ -92,59 +78,105 @@ CachedShader::CachedShader(VAddr addr, Maxwell::ShaderProgram program_type)
         return;
     }
 
+    code = program_result.first;
     entries = program_result.second;
     shader_length = entries.shader_length;
+}
 
-    if (program_type != Maxwell::ShaderProgram::Geometry) {
-        OGLShader shader;
-        shader.Create(program_result.first.c_str(), gl_type);
-        program.Create(true, shader.handle);
-        SetShaderUniformBlockBindings(program.handle);
-        LabelGLObject(GL_PROGRAM, program.handle, addr);
+std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(GLenum primitive_mode,
+                                                                BaseBindings base_bindings) {
+    GLuint handle{};
+    if (program_type == Maxwell::ShaderProgram::Geometry) {
+        handle = GetGeometryShader(primitive_mode, base_bindings);
     } else {
-        // Store shader's code to lazily build it on draw
-        geometry_programs.code = program_result.first;
+        const auto [entry, is_cache_miss] = programs.try_emplace(base_bindings);
+        auto& program = entry->second;
+        if (is_cache_miss) {
+            std::string source = AllocateBindings(base_bindings);
+            source += code;
+
+            OGLShader shader;
+            shader.Create(source.c_str(), GetShaderType(program_type));
+            program.Create(true, shader.handle);
+            LabelGLObject(GL_PROGRAM, program.handle, addr);
+        }
+
+        handle = program.handle;
     }
+
+    // Advance past the const buffer, global memory and sampler bindings reserved by this shader.
+    // One extra UBO binding is reserved for emulation values.
+    base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()) + 1;
+    base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size());
+    base_bindings.sampler += static_cast<u32>(entries.samplers.size());
+
+    return {handle, base_bindings};
 }
 
-GLuint CachedShader::GetProgramResourceIndex(const GLShader::ConstBufferEntry& buffer) {
-    const auto search{resource_cache.find(buffer.GetHash())};
-    if (search == resource_cache.end()) {
-        const GLuint index{
-            glGetProgramResourceIndex(program.handle, GL_UNIFORM_BLOCK, buffer.GetName().c_str())};
-        resource_cache[buffer.GetHash()] = index;
-        return index;
+std::string CachedShader::AllocateBindings(BaseBindings base_bindings) {
+    std::string code = "#version 430 core\n";
+    code += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
+
+    for (const auto& cbuf : entries.const_buffers) {
+        code += fmt::format("#define CBUF_BINDING_{} {}\n", cbuf.GetIndex(), base_bindings.cbuf++);
     }
 
-    return search->second;
-}
+    for (const auto& gmem : entries.global_memory_entries) {
+        code += fmt::format("#define GMEM_BINDING_{}_{} {}\n", gmem.GetCbufIndex(),
+                            gmem.GetCbufOffset(), base_bindings.gmem++);
+    }
 
-GLint CachedShader::GetUniformLocation(const GLShader::SamplerEntry& sampler) {
-    const auto search{uniform_cache.find(sampler.GetHash())};
-    if (search == uniform_cache.end()) {
-        const GLint index{glGetUniformLocation(program.handle, sampler.GetName().c_str())};
-        uniform_cache[sampler.GetHash()] = index;
-        return index;
+    for (const auto& sampler : entries.samplers) {
+        code += fmt::format("#define SAMPLER_BINDING_{} {}\n", sampler.GetIndex(),
+                            base_bindings.sampler++);
     }
 
-    return search->second;
+    return code;
+}
+
+GLuint CachedShader::GetGeometryShader(GLenum primitive_mode, BaseBindings base_bindings) {
+    const auto [entry, is_cache_miss] = geometry_programs.try_emplace(base_bindings);
+    auto& programs = entry->second;
+
+    switch (primitive_mode) {
+    case GL_POINTS:
+        return LazyGeometryProgram(programs.points, base_bindings, "points", 1, "ShaderPoints");
+    case GL_LINES:
+    case GL_LINE_STRIP:
+        return LazyGeometryProgram(programs.lines, base_bindings, "lines", 2, "ShaderLines");
+    case GL_LINES_ADJACENCY:
+    case GL_LINE_STRIP_ADJACENCY:
+        return LazyGeometryProgram(programs.lines_adjacency, base_bindings, "lines_adjacency", 4,
+                                   "ShaderLinesAdjacency");
+    case GL_TRIANGLES:
+    case GL_TRIANGLE_STRIP:
+    case GL_TRIANGLE_FAN:
+        return LazyGeometryProgram(programs.triangles, base_bindings, "triangles", 3,
+                                   "ShaderTriangles");
+    case GL_TRIANGLES_ADJACENCY:
+    case GL_TRIANGLE_STRIP_ADJACENCY:
+        return LazyGeometryProgram(programs.triangles_adjacency, base_bindings,
+                                   "triangles_adjacency", 6, "ShaderTrianglesAdjacency");
+    default:
+        UNREACHABLE_MSG("Unknown primitive mode.");
+        return LazyGeometryProgram(programs.points, base_bindings, "points", 1, "ShaderPoints");
+    }
 }
 
-GLuint CachedShader::LazyGeometryProgram(OGLProgram& target_program,
+GLuint CachedShader::LazyGeometryProgram(OGLProgram& target_program, BaseBindings base_bindings,
                                          const std::string& glsl_topology, u32 max_vertices,
                                          const std::string& debug_name) {
     if (target_program.handle != 0) {
         return target_program.handle;
     }
-    std::string source = "#version 430 core\n";
+    std::string source = AllocateBindings(base_bindings);
     source += "layout (" + glsl_topology + ") in;\n";
     source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
-    source += geometry_programs.code;
+    source += code;
 
     OGLShader shader;
     shader.Create(source.c_str(), GL_GEOMETRY_SHADER);
     target_program.Create(true, shader.handle);
-    SetShaderUniformBlockBindings(target_program.handle);
     LabelGLObject(GL_PROGRAM, target_program.handle, addr, debug_name);
     return target_program.handle;
 };
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index e0887dd7b..904d15dd0 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -7,6 +7,9 @@
 #include <array>
 #include <map>
 #include <memory>
+#include <tuple>
+
+#include <glad/glad.h>
 
 #include "common/assert.h"
 #include "common/common_types.h"
@@ -23,6 +26,16 @@ class RasterizerOpenGL;
 using Shader = std::shared_ptr<CachedShader>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
+struct BaseBindings {
+    u32 cbuf{};
+    u32 gmem{};
+    u32 sampler{};
+
+    bool operator<(const BaseBindings& rhs) const {
+        return std::tie(cbuf, gmem, sampler) < std::tie(rhs.cbuf, rhs.gmem, rhs.sampler);
+    }
+};
+
 class CachedShader final : public RasterizerCacheObject {
 public:
     CachedShader(VAddr addr, Maxwell::ShaderProgram program_type);
@@ -44,70 +57,45 @@ public:
     }
 
     /// Gets the GL program handle for the shader
-    GLuint GetProgramHandle(GLenum primitive_mode) {
-        if (program_type != Maxwell::ShaderProgram::Geometry) {
-            return program.handle;
-        }
-        switch (primitive_mode) {
-        case GL_POINTS:
-            return LazyGeometryProgram(geometry_programs.points, "points", 1, "ShaderPoints");
-        case GL_LINES:
-        case GL_LINE_STRIP:
-            return LazyGeometryProgram(geometry_programs.lines, "lines", 2, "ShaderLines");
-        case GL_LINES_ADJACENCY:
-        case GL_LINE_STRIP_ADJACENCY:
-            return LazyGeometryProgram(geometry_programs.lines_adjacency, "lines_adjacency", 4,
-                                       "ShaderLinesAdjacency");
-        case GL_TRIANGLES:
-        case GL_TRIANGLE_STRIP:
-        case GL_TRIANGLE_FAN:
-            return LazyGeometryProgram(geometry_programs.triangles, "triangles", 3,
-                                       "ShaderTriangles");
-        case GL_TRIANGLES_ADJACENCY:
-        case GL_TRIANGLE_STRIP_ADJACENCY:
-            return LazyGeometryProgram(geometry_programs.triangles_adjacency, "triangles_adjacency",
-                                       6, "ShaderTrianglesAdjacency");
-        default:
-            UNREACHABLE_MSG("Unknown primitive mode.");
-            return LazyGeometryProgram(geometry_programs.points, "points", 1, "ShaderPoints");
-        }
-    }
+    std::tuple<GLuint, BaseBindings> GetProgramHandle(GLenum primitive_mode,
+                                                      BaseBindings base_bindings);
 
-    /// Gets the GL program resource location for the specified resource, caching as needed
-    GLuint GetProgramResourceIndex(const GLShader::ConstBufferEntry& buffer);
+private:
+    // Geometry programs. These are needed because GLSL needs an input topology but it's not
+    // declared by the hardware. Work around this issue by generating a different shader per input
+    // topology class.
+    struct GeometryPrograms {
+        OGLProgram points;
+        OGLProgram lines;
+        OGLProgram lines_adjacency;
+        OGLProgram triangles;
+        OGLProgram triangles_adjacency;
+    };
 
-    /// Gets the GL uniform location for the specified resource, caching as needed
-    GLint GetUniformLocation(const GLShader::SamplerEntry& sampler);
+    std::string AllocateBindings(BaseBindings base_bindings);
+
+    GLuint GetGeometryShader(GLenum primitive_mode, BaseBindings base_bindings);
 
-private:
     /// Generates a geometry shader or returns one that already exists.
-    GLuint LazyGeometryProgram(OGLProgram& target_program, const std::string& glsl_topology,
-                               u32 max_vertices, const std::string& debug_name);
+    GLuint LazyGeometryProgram(OGLProgram& target_program, BaseBindings base_bindings,
+                               const std::string& glsl_topology, u32 max_vertices,
+                               const std::string& debug_name);
 
     void CalculateProperties();
 
-    VAddr addr;
-    std::size_t shader_length;
-    Maxwell::ShaderProgram program_type;
+    VAddr addr{};
+    std::size_t shader_length{};
+    Maxwell::ShaderProgram program_type{};
     GLShader::ShaderSetup setup;
     GLShader::ShaderEntries entries;
 
-    // Non-geometry program.
-    OGLProgram program;
+    std::string code;
 
-    // Geometry programs. These are needed because GLSL needs an input topology but it's not
-    // declared by the hardware. Workaround this issue by generating a different shader per input
-    // topology class.
-    struct {
-        std::string code;
-        OGLProgram points;
-        OGLProgram lines;
-        OGLProgram lines_adjacency;
-        OGLProgram triangles;
-        OGLProgram triangles_adjacency;
-    } geometry_programs;
+    std::map<BaseBindings, OGLProgram> programs;
+    std::map<BaseBindings, GeometryPrograms> geometry_programs;
 
-    std::map<u32, GLuint> resource_cache;
+    std::map<u32, GLuint> cbuf_resource_cache;
+    std::map<u32, GLuint> gmem_resource_cache;
     std::map<u32, GLint> uniform_cache;
 };
 
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 3411cf9e6..004245431 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -34,6 +34,8 @@ using Operation = const OperationNode&;
 enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 };
 constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
     static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
+constexpr u32 MAX_GLOBALMEMORY_ELEMENTS =
+    static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize) / sizeof(float);
 
 enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
 
@@ -143,6 +145,7 @@ public:
         DeclareInputAttributes();
         DeclareOutputAttributes();
         DeclareConstantBuffers();
+        DeclareGlobalMemory();
         DeclareSamplers();
 
         code.AddLine("void execute_" + suffix + "() {");
@@ -190,12 +193,15 @@ public:
     ShaderEntries GetShaderEntries() const {
         ShaderEntries entries;
         for (const auto& cbuf : ir.GetConstantBuffers()) {
-            ConstBufferEntry desc(cbuf.second, stage, GetConstBufferBlock(cbuf.first), cbuf.first);
-            entries.const_buffers.push_back(desc);
+            entries.const_buffers.emplace_back(cbuf.second, stage, GetConstBufferBlock(cbuf.first),
+                                               cbuf.first);
         }
         for (const auto& sampler : ir.GetSamplers()) {
-            SamplerEntry desc(sampler, stage, GetSampler(sampler));
-            entries.samplers.push_back(desc);
+            entries.samplers.emplace_back(sampler, stage, GetSampler(sampler));
+        }
+        for (const auto& gmem : ir.GetGlobalMemoryBases()) {
+            entries.global_memory_entries.emplace_back(gmem.cbuf_index, gmem.cbuf_offset, stage,
+                                                       GetGlobalMemoryBlock(gmem));
         }
         entries.clip_distances = ir.GetClipDistances();
         entries.shader_length = ir.GetLength();
@@ -368,13 +374,26 @@ private:
     void DeclareConstantBuffers() {
         for (const auto& entry : ir.GetConstantBuffers()) {
             const auto [index, size] = entry;
-            code.AddLine("layout (std140) uniform " + GetConstBufferBlock(index) + " {");
+            code.AddLine("layout (std140, binding = CBUF_BINDING_" + std::to_string(index) +
+                         ") uniform " + GetConstBufferBlock(index) + " {");
             code.AddLine("    vec4 " + GetConstBuffer(index) + "[MAX_CONSTBUFFER_ELEMENTS];");
             code.AddLine("};");
             code.AddNewLine();
         }
     }
 
+    void DeclareGlobalMemory() {
+        for (const auto& entry : ir.GetGlobalMemoryBases()) {
+            const std::string binding =
+                fmt::format("GMEM_BINDING_{}_{}", entry.cbuf_index, entry.cbuf_offset);
+            code.AddLine("layout (std430, binding = " + binding + ") buffer " +
+                         GetGlobalMemoryBlock(entry) + " {");
+            code.AddLine("    float " + GetGlobalMemory(entry) + "[MAX_GLOBALMEMORY_ELEMENTS];");
+            code.AddLine("};");
+            code.AddNewLine();
+        }
+    }
+
     void DeclareSamplers() {
         const auto& samplers = ir.GetSamplers();
         for (const auto& sampler : samplers) {
@@ -398,7 +417,8 @@ private:
             if (sampler.IsShadow())
                 sampler_type += "Shadow";
 
-            code.AddLine("uniform " + sampler_type + ' ' + GetSampler(sampler) + ';');
+            code.AddLine("layout (binding = SAMPLER_BINDING_" + std::to_string(sampler.GetIndex()) +
+                         ") uniform " + sampler_type + ' ' + GetSampler(sampler) + ';');
         }
         if (!samplers.empty())
             code.AddNewLine();
@@ -538,6 +558,12 @@ private:
                 UNREACHABLE_MSG("Unmanaged offset node type");
             }
 
+        } else if (const auto gmem = std::get_if<GmemNode>(node)) {
+            const std::string real = Visit(gmem->GetRealAddress());
+            const std::string base = Visit(gmem->GetBaseAddress());
+            const std::string final_offset = "(ftou(" + real + ") - ftou(" + base + ")) / 4";
+            return fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset);
+
         } else if (const auto lmem = std::get_if<LmemNode>(node)) {
             return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
 
@@ -1471,6 +1497,15 @@ private:
         return GetDeclarationWithSuffix(index, "cbuf");
     }
 
+    std::string GetGlobalMemory(const GlobalMemoryBase& descriptor) const {
+        return fmt::format("gmem_{}_{}_{}", descriptor.cbuf_index, descriptor.cbuf_offset, suffix);
+    }
+
+    std::string GetGlobalMemoryBlock(const GlobalMemoryBase& descriptor) const {
+        return fmt::format("gmem_block_{}_{}_{}", descriptor.cbuf_index, descriptor.cbuf_offset,
+                           suffix);
+    }
+
     std::string GetConstBufferBlock(u32 index) const {
         return GetDeclarationWithSuffix(index, "cbuf_block");
     }
@@ -1505,8 +1540,10 @@ private:
 };
 
 std::string GetCommonDeclarations() {
-    return "#define MAX_CONSTBUFFER_ELEMENTS " + std::to_string(MAX_CONSTBUFFER_ELEMENTS) +
-           "\n"
+    const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS);
+    const auto gmem = std::to_string(MAX_GLOBALMEMORY_ELEMENTS);
+    return "#define MAX_CONSTBUFFER_ELEMENTS " + cbuf + "\n" +
+           "#define MAX_GLOBALMEMORY_ELEMENTS " + gmem + "\n" +
            "#define ftoi floatBitsToInt\n"
            "#define ftou floatBitsToUint\n"
            "#define itof intBitsToFloat\n"
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index 396a560d8..0856a1361 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -38,10 +38,6 @@ public:
         return index;
     }
 
-    u32 GetHash() const {
-        return (static_cast<u32>(stage) << 16) | index;
-    }
-
 private:
     std::string name;
     Maxwell::ShaderStage stage{};
@@ -62,18 +58,44 @@ public:
         return stage;
     }
 
-    u32 GetHash() const {
-        return (static_cast<u32>(stage) << 16) | static_cast<u32>(GetIndex());
+private:
+    std::string name;
+    Maxwell::ShaderStage stage{};
+};
+
+class GlobalMemoryEntry {
+public:
+    explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, Maxwell::ShaderStage stage,
+                               std::string name)
+        : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, stage{stage}, name{std::move(name)} {}
+
+    u32 GetCbufIndex() const {
+        return cbuf_index;
+    }
+
+    u32 GetCbufOffset() const {
+        return cbuf_offset;
+    }
+
+    const std::string& GetName() const {
+        return name;
+    }
+
+    Maxwell::ShaderStage GetStage() const {
+        return stage;
     }
 
 private:
-    std::string name;
+    u32 cbuf_index{};
+    u32 cbuf_offset{};
     Maxwell::ShaderStage stage{};
+    std::string name;
 };
 
 struct ShaderEntries {
     std::vector<ConstBufferEntry> const_buffers;
     std::vector<SamplerEntry> samplers;
+    std::vector<GlobalMemoryEntry> global_memory_entries;
     std::array<bool, Maxwell::NumClipDistances> clip_distances{};
     std::size_t shader_length{};
 };
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 446d1a93f..04e1db911 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -20,15 +20,14 @@ static constexpr u32 PROGRAM_OFFSET{10};
 ProgramResult GenerateVertexShader(const ShaderSetup& setup) {
     const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
 
-    std::string out = "#version 430 core\n";
-    out += "#extension GL_ARB_separate_shader_objects : enable\n\n";
+    std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
     out += "// Shader Unique Id: VS" + id + "\n\n";
     out += GetCommonDeclarations();
 
     out += R"(
 layout (location = 0) out vec4 position;
 
-layout(std140) uniform vs_config {
+layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
     vec4 viewport_flip;
     uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
     uvec4 alpha_test;
@@ -78,7 +77,6 @@ void main() {
 }
 
 ProgramResult GenerateGeometryShader(const ShaderSetup& setup) {
-    // Version is intentionally skipped in shader generation, it's added by the lazy compilation.
     const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
 
     std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
@@ -89,7 +87,7 @@ ProgramResult GenerateGeometryShader(const ShaderSetup& setup) {
 layout (location = 0) in vec4 gs_position[];
 layout (location = 0) out vec4 position;
 
-layout (std140) uniform gs_config {
+layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
     vec4 viewport_flip;
     uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
     uvec4 alpha_test;
@@ -112,8 +110,7 @@ void main() {
 ProgramResult GenerateFragmentShader(const ShaderSetup& setup) {
     const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
 
-    std::string out = "#version 430 core\n";
-    out += "#extension GL_ARB_separate_shader_objects : enable\n\n";
+    std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
     out += "// Shader Unique Id: FS" + id + "\n\n";
     out += GetCommonDeclarations();
 
@@ -129,7 +126,7 @@ layout (location = 7) out vec4 FragColor7;
 
 layout (location = 0) in vec4 position;
 
-layout (std140) uniform fs_config {
+layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {
     vec4 viewport_flip;
     uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
     uvec4 alpha_test;
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index ae71672d6..04cb386b7 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -4,6 +4,7 @@
 
 #include <algorithm>
 #include <vector>
+#include <fmt/format.h>
 
 #include "common/assert.h"
 #include "common/common_types.h"
@@ -119,6 +120,54 @@ u32 ShaderIR::DecodeMemory(BasicBlock& bb, const BasicBlock& code, u32 pc) {
         }
         break;
     }
+    case OpCode::Id::LDG: {
+        const u32 count = [&]() {
+            switch (instr.ldg.type) {
+            case Tegra::Shader::UniformType::Single:
+                return 1;
+            case Tegra::Shader::UniformType::Double:
+                return 2;
+            case Tegra::Shader::UniformType::Quad:
+            case Tegra::Shader::UniformType::UnsignedQuad:
+                return 4;
+            default:
+                UNIMPLEMENTED_MSG("Unimplemented LDG size!");
+                return 1;
+            }
+        }();
+
+        const Node addr_register = GetRegister(instr.gpr8);
+        const Node base_address = TrackCbuf(addr_register, code, static_cast<s64>(code.size()));
+        const auto cbuf = std::get_if<CbufNode>(base_address);
+        ASSERT(cbuf != nullptr);
+        const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset());
+        ASSERT(cbuf_offset_imm != nullptr);
+        const auto cbuf_offset = cbuf_offset_imm->GetValue() * 4;
+
+        bb.push_back(Comment(
+            fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset)));
+
+        const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset};
+        used_global_memory_bases.insert(descriptor);
+
+        const Node immediate_offset =
+            Immediate(static_cast<u32>(instr.ldg.immediate_offset.Value()));
+        const Node base_real_address =
+            Operation(OperationCode::UAdd, NO_PRECISE, immediate_offset, addr_register);
+
+        for (u32 i = 0; i < count; ++i) {
+            const Node it_offset = Immediate(i * 4);
+            const Node real_address =
+                Operation(OperationCode::UAdd, NO_PRECISE, base_real_address, it_offset);
+            const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
+
+            SetTemporal(bb, i, gmem);
+        }
+        for (u32 i = 0; i < count; ++i) {
+            SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
+        }
+        break;
+    }
     case OpCode::Id::ST_A: {
         UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex,
                              "Indirect attribute loads are not supported");
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index ef8f94480..c4ecb2e3c 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -257,6 +257,15 @@ private:
     bool is_indirect{};
 };
 
+struct GlobalMemoryBase {
+    u32 cbuf_index{};
+    u32 cbuf_offset{};
+
+    bool operator<(const GlobalMemoryBase& rhs) const {
+        return std::tie(cbuf_index, cbuf_offset) < std::tie(rhs.cbuf_index, rhs.cbuf_offset);
+    }
+};
+
 struct MetaArithmetic {
     bool precise{};
 };
@@ -478,14 +487,26 @@ private:
 /// Global memory node
 class GmemNode final {
 public:
-    explicit constexpr GmemNode(Node address) : address{address} {}
+    explicit constexpr GmemNode(Node real_address, Node base_address,
+                                const GlobalMemoryBase& descriptor)
+        : real_address{real_address}, base_address{base_address}, descriptor{descriptor} {}
 
-    Node GetAddress() const {
-        return address;
+    Node GetRealAddress() const {
+        return real_address;
+    }
+
+    Node GetBaseAddress() const {
+        return base_address;
+    }
+
+    const GlobalMemoryBase& GetDescriptor() const {
+        return descriptor;
     }
 
 private:
-    const Node address;
+    const Node real_address;
+    const Node base_address;
+    const GlobalMemoryBase descriptor;
 };
 
 /// Commentary, can be dropped
@@ -543,6 +564,10 @@ public:
         return used_clip_distances;
     }
 
+    const std::set<GlobalMemoryBase>& GetGlobalMemoryBases() const {
+        return used_global_memory_bases;
+    }
+
     std::size_t GetLength() const {
         return static_cast<std::size_t>(coverage_end * sizeof(u64));
     }
@@ -734,6 +759,10 @@ private:
     void WriteLop3Instruction(BasicBlock& bb, Tegra::Shader::Register dest, Node op_a, Node op_b,
                               Node op_c, Node imm_lut, bool sets_cc);
 
+    Node TrackCbuf(Node tracked, const BasicBlock& code, s64 cursor);
+
+    std::pair<Node, s64> TrackRegister(const GprNode* tracked, const BasicBlock& code, s64 cursor);
+
     template <typename... T>
     Node Operation(OperationCode code, const T*... operands) {
         return StoreNode(OperationNode(code, operands...));
@@ -786,6 +815,7 @@ private:
     std::map<u32, ConstBuffer> used_cbufs;
     std::set<Sampler> used_samplers;
     std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
+    std::set<GlobalMemoryBase> used_global_memory_bases;
 
     Tegra::Shader::Header header;
 };
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
new file mode 100644
index 000000000..d6d29ee9f
--- /dev/null
+++ b/src/video_core/shader/track.cpp
@@ -0,0 +1,76 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <utility>
+#include <variant>
+
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+namespace {
+std::pair<Node, s64> FindOperation(const BasicBlock& code, s64 cursor,
+                                   OperationCode operation_code) {
+    for (; cursor >= 0; --cursor) {
+        const Node node = code[cursor];
+        if (const auto operation = std::get_if<OperationNode>(node)) {
+            if (operation->GetCode() == operation_code)
+                return {node, cursor};
+        }
+    }
+    return {};
+}
+} // namespace
+
+Node ShaderIR::TrackCbuf(Node tracked, const BasicBlock& code, s64 cursor) {
+    if (const auto cbuf = std::get_if<CbufNode>(tracked)) {
+        // Cbuf found, but its offset has to be an immediate
+        return std::holds_alternative<ImmediateNode>(*cbuf->GetOffset()) ? tracked : nullptr;
+    }
+    if (const auto gpr = std::get_if<GprNode>(tracked)) {
+        if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) {
+            return nullptr;
+        }
+        // Decrement the cursor by one to avoid infinite loops when the instruction sets the same
+        // register that it uses as an operand
+        const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1);
+        if (!source) {
+            return nullptr;
+        }
+        return TrackCbuf(source, code, new_cursor);
+    }
+    if (const auto operation = std::get_if<OperationNode>(tracked)) {
+        for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) {
+            if (const auto found = TrackCbuf((*operation)[i], code, cursor)) {
+                // Cbuf found in operand
+                return found;
+            }
+        }
+        return nullptr;
+    }
+    return nullptr;
+}
+
+std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const BasicBlock& code,
+                                             s64 cursor) {
+    for (; cursor >= 0; --cursor) {
+        const auto [found_node, new_cursor] = FindOperation(code, cursor, OperationCode::Assign);
+        if (!found_node) {
+            return {};
+        }
+        const auto operation = std::get_if<OperationNode>(found_node);
+        ASSERT(operation);
+
+        const auto& target = (*operation)[0];
+        if (const auto gpr_target = std::get_if<GprNode>(target)) {
+            if (gpr_target->GetIndex() == tracked->GetIndex()) {
+                return {(*operation)[1], new_cursor};
+            }
+        }
+    }
+    return {};
+}
+
+} // namespace VideoCommon::Shader