17 files changed, 329 insertions, 45 deletions
diff --git a/src/audio_core/CMakeLists.txt b/src/audio_core/CMakeLists.txt
index 13b5e400e..eba0a5697 100644
--- a/src/audio_core/CMakeLists.txt
+++ b/src/audio_core/CMakeLists.txt
@@ -7,6 +7,7 @@ set(SRCS
             hle/source.cpp
             interpolate.cpp
             sink_details.cpp
+            time_stretch.cpp
             )
 
 set(HEADERS
@@ -21,6 +22,7 @@ set(HEADERS
             null_sink.h
             sink.h
             sink_details.h
+            time_stretch.h
             )
 
 include_directories(../../externals/soundtouch/include)
diff --git a/src/audio_core/hle/dsp.cpp b/src/audio_core/hle/dsp.cpp
index 0cdbdb06a..5113ad8ca 100644
--- a/src/audio_core/hle/dsp.cpp
+++ b/src/audio_core/hle/dsp.cpp
@@ -9,6 +9,7 @@
 #include "audio_core/hle/pipe.h"
 #include "audio_core/hle/source.h"
 #include "audio_core/sink.h"
+#include "audio_core/time_stretch.h"
 
 namespace DSP {
 namespace HLE {
@@ -48,15 +49,29 @@ static std::array<Source, num_sources> sources = {
 };
 
 static std::unique_ptr<AudioCore::Sink> sink;
+static AudioCore::TimeStretcher time_stretcher;
 
 void Init() {
     DSP::HLE::ResetPipes();
+
     for (auto& source : sources) {
         source.Reset();
     }
+
+    time_stretcher.Reset();
+    if (sink) {
+        time_stretcher.SetOutputSampleRate(sink->GetNativeSampleRate());
+    }
 }
 
 void Shutdown() {
+    time_stretcher.Flush();
+    while (true) {
+        std::vector<s16> residual_audio = time_stretcher.Process(sink->SamplesInQueue());
+        if (residual_audio.empty())
+            break;
+        sink->EnqueueSamples(residual_audio);
+    }
 }
 
 bool Tick() {
@@ -77,6 +92,7 @@ bool Tick() {
 
 void SetSink(std::unique_ptr<AudioCore::Sink> sink_) {
     sink = std::move(sink_);
+    time_stretcher.SetOutputSampleRate(sink->GetNativeSampleRate());
 }
 
 } // namespace HLE
diff --git a/src/audio_core/time_stretch.cpp b/src/audio_core/time_stretch.cpp
new file mode 100644
index 000000000..ea38f40d0
--- /dev/null
+++ b/src/audio_core/time_stretch.cpp
@@ -0,0 +1,144 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <chrono>
+#include <cmath>
+#include <vector>
+
+#include <SoundTouch.h>
+
+#include "audio_core/audio_core.h"
+#include "audio_core/time_stretch.h"
+
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "common/math_util.h"
+
+using steady_clock = std::chrono::steady_clock;
+
+namespace AudioCore {
+
+constexpr double MIN_RATIO = 0.1;
+constexpr double MAX_RATIO = 100.0;
+
+static double ClampRatio(double ratio) {
+    return MathUtil::Clamp(ratio, MIN_RATIO, MAX_RATIO);
+}
+
+constexpr double MIN_DELAY_TIME = 0.05; // Units: seconds
+constexpr double MAX_DELAY_TIME = 0.25; // Units: seconds
+constexpr size_t DROP_FRAMES_SAMPLE_DELAY = 16000; // Units: samples
+
+constexpr double SMOOTHING_FACTOR = 0.007;
+
+struct TimeStretcher::Impl {
+    soundtouch::SoundTouch soundtouch;
+
+    steady_clock::time_point frame_timer = steady_clock::now();
+    size_t samples_queued = 0;
+
+    double smoothed_ratio = 1.0;
+
+    double sample_rate = static_cast<double>(native_sample_rate);
+};
+
+std::vector<s16> TimeStretcher::Process(size_t samples_in_queue) {
+    // This is a very simple algorithm without any fancy control theory. It works and is stable.
+
+    double ratio = CalculateCurrentRatio();
+    ratio = CorrectForUnderAndOverflow(ratio, samples_in_queue);
+    impl->smoothed_ratio = (1.0 - SMOOTHING_FACTOR) * impl->smoothed_ratio + SMOOTHING_FACTOR * ratio;
+    impl->smoothed_ratio = ClampRatio(impl->smoothed_ratio);
+
+    // SoundTouch's tempo definition the inverse of our ratio definition.
+    impl->soundtouch.setTempo(1.0 / impl->smoothed_ratio);
+
+    std::vector<s16> samples = GetSamples();
+    if (samples_in_queue >= DROP_FRAMES_SAMPLE_DELAY) {
+        samples.clear();
+        LOG_DEBUG(Audio, "Dropping frames!");
+    }
+    return samples;
+}
+
+TimeStretcher::TimeStretcher() : impl(std::make_unique<Impl>()) {
+    impl->soundtouch.setPitch(1.0);
+    impl->soundtouch.setChannels(2);
+    impl->soundtouch.setSampleRate(native_sample_rate);
+    Reset();
+}
+
+TimeStretcher::~TimeStretcher() {
+    impl->soundtouch.clear();
+}
+
+void TimeStretcher::SetOutputSampleRate(unsigned int sample_rate) {
+    impl->sample_rate = static_cast<double>(sample_rate);
+    impl->soundtouch.setRate(static_cast<double>(native_sample_rate) / impl->sample_rate);
+}
+
+void TimeStretcher::AddSamples(const s16* buffer, size_t num_samples) {
+    impl->soundtouch.putSamples(buffer, static_cast<uint>(num_samples));
+    impl->samples_queued += num_samples;
+}
+
+void TimeStretcher::Flush() {
+    impl->soundtouch.flush();
+}
+
+void TimeStretcher::Reset() {
+    impl->soundtouch.setTempo(1.0);
+    impl->soundtouch.clear();
+    impl->smoothed_ratio = 1.0;
+    impl->frame_timer = steady_clock::now();
+    impl->samples_queued = 0;
+    SetOutputSampleRate(native_sample_rate);
+}
+
+double TimeStretcher::CalculateCurrentRatio() {
+    const steady_clock::time_point now = steady_clock::now();
+    const std::chrono::duration<double> duration = now - impl->frame_timer;
+
+    const double expected_time = static_cast<double>(impl->samples_queued) / static_cast<double>(native_sample_rate);
+    const double actual_time = duration.count();
+
+    double ratio;
+    if (expected_time != 0) {
+        ratio = ClampRatio(actual_time / expected_time);
+    } else {
+        ratio = impl->smoothed_ratio;
+    }
+
+    impl->frame_timer = now;
+    impl->samples_queued = 0;
+
+    return ratio;
+}
+
+double TimeStretcher::CorrectForUnderAndOverflow(double ratio, size_t sample_delay) const {
+    const size_t min_sample_delay = static_cast<size_t>(MIN_DELAY_TIME * impl->sample_rate);
+    const size_t max_sample_delay = static_cast<size_t>(MAX_DELAY_TIME * impl->sample_rate);
+
+    if (sample_delay < min_sample_delay) {
+        // Make the ratio bigger.
+        ratio = ratio > 1.0 ? ratio * ratio : sqrt(ratio);
+    } else if (sample_delay > max_sample_delay) {
+        // Make the ratio smaller.
+        ratio = ratio > 1.0 ? sqrt(ratio) : ratio * ratio;
+    }
+
+    return ClampRatio(ratio);
+}
+
+std::vector<s16> TimeStretcher::GetSamples() {
+    uint available = impl->soundtouch.numSamples();
+
+    std::vector<s16> output(static_cast<size_t>(available) * 2);
+
+    impl->soundtouch.receiveSamples(output.data(), available);
+
+    return output;
+}
+
+} // namespace AudioCore
diff --git a/src/audio_core/time_stretch.h b/src/audio_core/time_stretch.h
new file mode 100644
index 000000000..1fde3f72a
--- /dev/null
+++ b/src/audio_core/time_stretch.h
@@ -0,0 +1,57 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace AudioCore {
+
+class TimeStretcher final {
+public:
+    TimeStretcher();
+    ~TimeStretcher();
+
+    /**
+     * Set sample rate for the samples that Process returns.
+     * @param sample_rate The sample rate.
+     */
+    void SetOutputSampleRate(unsigned int sample_rate);
+
+    /**
+     * Add samples to be processed.
+     * @param sample_buffer Buffer of samples in interleaved stereo PCM16 format.
+     * @param num_sample Number of samples.
+     */
+    void AddSamples(const s16* sample_buffer, size_t num_samples);
+
+    /// Flush audio remaining in internal buffers.
+    void Flush();
+
+    /// Resets internal state and clears buffers.
+    void Reset();
+
+    /**
+     * Does audio stretching and produces the time-stretched samples.
+     * Timer calculations use sample_delay to determine how much of a margin we have.
+     * @param sample_delay How many samples are buffered downstream of this module and haven't been played yet.
+     * @return Samples to play in interleaved stereo PCM16 format.
+     */
+    std::vector<s16> Process(size_t sample_delay);
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> impl;
+
+    /// INTERNAL: ratio = wallclock time / emulated time
+    double CalculateCurrentRatio();
+    /// INTERNAL: If we have too many or too few samples downstream, nudge ratio in the appropriate direction.
+    double CorrectForUnderAndOverflow(double ratio, size_t sample_delay) const;
+    /// INTERNAL: Gets the time-stretched samples from SoundTouch.
+    std::vector<s16> GetSamples();
+};
+
+} // namespace AudioCore
diff --git a/src/core/hle/kernel/memory.cpp b/src/core/hle/kernel/memory.cpp
index 4be20db22..17ae87aef 100644
--- a/src/core/hle/kernel/memory.cpp
+++ b/src/core/hle/kernel/memory.cpp
@@ -55,6 +55,9 @@ void MemoryInit(u32 mem_type) {
         memory_regions[i].size = memory_region_sizes[mem_type][i];
         memory_regions[i].used = 0;
         memory_regions[i].linear_heap_memory = std::make_shared<std::vector<u8>>();
+        // Reserve enough space for this region of FCRAM.
+        // We do not want this block of memory to be relocated when allocating from it.
+        memory_regions[i].linear_heap_memory->reserve(memory_regions[i].size);
 
         base += memory_regions[i].size;
     }
diff --git a/src/core/hle/service/dsp_dsp.cpp b/src/core/hle/service/dsp_dsp.cpp
index 274fc751a..10730d7ac 100644
--- a/src/core/hle/service/dsp_dsp.cpp
+++ b/src/core/hle/service/dsp_dsp.cpp
@@ -440,9 +440,9 @@ static void GetHeadphoneStatus(Service::Interface* self) {
 
     cmd_buff[0] = IPC::MakeHeader(0x1F, 2, 0);
     cmd_buff[1] = RESULT_SUCCESS.raw; // No error
-    cmd_buff[2] = 0; // Not using headphones?
+    cmd_buff[2] = 0; // Not using headphones
 
-    LOG_WARNING(Service_DSP, "(STUBBED) called");
+    LOG_DEBUG(Service_DSP, "called");
 }
 
 /**
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index ed2e2f3ae..bcd1ae78d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -104,7 +104,6 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
 
     // Sync fixed function OpenGL state
     SyncCullMode();
-    SyncDepthModifiers();
     SyncBlendEnabled();
     SyncBlendFuncs();
     SyncBlendColor();
@@ -259,8 +258,10 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
 
     // Depth modifiers
     case PICA_REG_INDEX(viewport_depth_range):
+        SyncDepthScale();
+        break;
     case PICA_REG_INDEX(viewport_depth_near_plane):
-        SyncDepthModifiers();
+        SyncDepthOffset();
         break;
 
     // Depth buffering
@@ -880,6 +881,8 @@ void RasterizerOpenGL::SetShader() {
         glUniformBlockBinding(current_shader->shader.handle, block_index, 0);
 
         // Update uniforms
+        SyncDepthScale();
+        SyncDepthOffset();
         SyncAlphaTest();
         SyncCombinerColor();
         auto& tev_stages = Pica::g_state.regs.GetTevStages();
@@ -922,13 +925,20 @@ void RasterizerOpenGL::SyncCullMode() {
     }
 }
 
-void RasterizerOpenGL::SyncDepthModifiers() {
+void RasterizerOpenGL::SyncDepthScale() {
     float depth_scale = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_range).ToFloat32();
-    float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_near_plane).ToFloat32();
+    if (depth_scale != uniform_block_data.data.depth_scale) {
+        uniform_block_data.data.depth_scale = depth_scale;
+        uniform_block_data.dirty = true;
+    }
+}
 
-    uniform_block_data.data.depth_scale = depth_scale;
-    uniform_block_data.data.depth_offset = depth_offset;
-    uniform_block_data.dirty = true;
+void RasterizerOpenGL::SyncDepthOffset() {
+    float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_near_plane).ToFloat32();
+    if (depth_offset != uniform_block_data.data.depth_offset) {
+        uniform_block_data.data.depth_offset = depth_offset;
+        uniform_block_data.dirty = true;
+    }
 }
 
 void RasterizerOpenGL::SyncBlendEnabled() {
@@ -937,6 +947,8 @@ void RasterizerOpenGL::SyncBlendEnabled() {
 
 void RasterizerOpenGL::SyncBlendFuncs() {
     const auto& regs = Pica::g_state.regs;
+    state.blend.rgb_equation = PicaToGL::BlendEquation(regs.output_merger.alpha_blending.blend_equation_rgb);
+    state.blend.a_equation = PicaToGL::BlendEquation(regs.output_merger.alpha_blending.blend_equation_a);
     state.blend.src_rgb_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_source_rgb);
     state.blend.dst_rgb_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_dest_rgb);
     state.blend.src_a_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_source_a);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index eed00011a..d70369400 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -339,8 +339,11 @@ private:
     /// Syncs the cull mode to match the PICA register
     void SyncCullMode();
 
-    /// Syncs the depth scale and offset to match the PICA registers
-    void SyncDepthModifiers();
+    /// Syncs the depth scale to match the PICA register
+    void SyncDepthScale();
+
+    /// Syncs the depth offset to match the PICA register
+    void SyncDepthOffset();
 
     /// Syncs the blend enabled status to match the PICA register
     void SyncBlendEnabled();
@@ -413,7 +416,7 @@ private:
         UniformData data;
         bool lut_dirty[6];
         bool dirty;
-    } uniform_block_data;
+    } uniform_block_data = {};
 
     std::array<SamplerInfo, 3> texture_samplers;
     OGLVertexArray vertex_array;
@@ -422,5 +425,5 @@ private:
     OGLFramebuffer framebuffer;
 
     std::array<OGLTexture, 6> lighting_luts;
-    std::array<std::array<GLvec4, 256>, 6> lighting_lut_data;
+    std::array<std::array<GLvec4, 256>, 6> lighting_lut_data{};
 };
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 02cd9f417..fa141fc9a 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -36,6 +36,8 @@ OpenGLState::OpenGLState() {
     stencil.action_stencil_fail = GL_KEEP;
 
     blend.enabled = false;
+    blend.rgb_equation = GL_FUNC_ADD;
+    blend.a_equation = GL_FUNC_ADD;
     blend.src_rgb_func = GL_ONE;
     blend.dst_rgb_func = GL_ZERO;
     blend.src_a_func = GL_ONE;
@@ -165,6 +167,11 @@ void OpenGLState::Apply() const {
                             blend.src_a_func, blend.dst_a_func);
     }
 
+    if (blend.rgb_equation != cur_state.blend.rgb_equation ||
+            blend.a_equation != cur_state.blend.a_equation) {
+        glBlendEquationSeparate(blend.rgb_equation, blend.a_equation);
+    }
+
     if (logic_op != cur_state.logic_op) {
         glLogicOp(logic_op);
     }
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 24f20e47c..228727054 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -40,6 +40,8 @@ public:
 
     struct {
         bool enabled; // GL_BLEND
+        GLenum rgb_equation; // GL_BLEND_EQUATION_RGB
+        GLenum a_equation; // GL_BLEND_EQUATION_ALPHA
         GLenum src_rgb_func; // GL_BLEND_SRC_RGB
         GLenum dst_rgb_func; // GL_BLEND_DST_RGB
         GLenum src_a_func; // GL_BLEND_SRC_ALPHA
diff --git a/src/video_core/renderer_opengl/pica_to_gl.h b/src/video_core/renderer_opengl/pica_to_gl.h
index 976d1f364..6dc2758c5 100644
--- a/src/video_core/renderer_opengl/pica_to_gl.h
+++ b/src/video_core/renderer_opengl/pica_to_gl.h
@@ -78,6 +78,26 @@ inline GLenum WrapMode(Pica::Regs::TextureConfig::WrapMode mode) {
     return gl_mode;
 }
 
+inline GLenum BlendEquation(Pica::Regs::BlendEquation equation) {
+    static const GLenum blend_equation_table[] = {
+        GL_FUNC_ADD,              // BlendEquation::Add
+        GL_FUNC_SUBTRACT,         // BlendEquation::Subtract
+        GL_FUNC_REVERSE_SUBTRACT, // BlendEquation::ReverseSubtract
+        GL_MIN,                   // BlendEquation::Min
+        GL_MAX,                   // BlendEquation::Max
+    };
+
+    // Range check table for input
+    if (static_cast<size_t>(equation) >= ARRAY_SIZE(blend_equation_table)) {
+        LOG_CRITICAL(Render_OpenGL, "Unknown blend equation %d", equation);
+        UNREACHABLE();
+
+        return GL_FUNC_ADD;
+    }
+
+    return blend_equation_table[(unsigned)equation];
+}
+
 inline GLenum BlendFunc(Pica::Regs::BlendFactor factor) {
     static const GLenum blend_func_table[] = {
         GL_ZERO,                     // BlendFactor::Zero
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index e93a9d92a..161097610 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -64,6 +64,7 @@ MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
 
 OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
     auto& config = g_state.regs.vs;
+    auto& setup = g_state.vs;
 
     MICROPROFILE_SCOPE(GPU_Shader);
 
@@ -81,11 +82,11 @@ OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input,
 
 #ifdef ARCHITECTURE_x86_64
     if (VideoCore::g_shader_jit_enabled)
-        jit_shader->Run(&state.registers, g_state.regs.vs.main_offset);
+        jit_shader->Run(setup, state, config.main_offset);
     else
-        RunInterpreter(state);
+        RunInterpreter(setup, state, config.main_offset);
 #else
-    RunInterpreter(state);
+    RunInterpreter(setup, state, config.main_offset);
 #endif // ARCHITECTURE_x86_64
 
     // Setup output data
@@ -156,7 +157,7 @@ DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_
     state.conditional_code[0] = false;
     state.conditional_code[1] = false;
 
-    RunInterpreter(state);
+    RunInterpreter(setup, state, config.main_offset);
     return state.debug;
 }
 
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 983e4a967..84898f21c 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -283,10 +283,10 @@ struct UnitState {
     static size_t InputOffset(const SourceRegister& reg) {
         switch (reg.GetRegisterType()) {
         case RegisterType::Input:
-            return offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, registers.input) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         case RegisterType::Temporary:
-            return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         default:
             UNREACHABLE();
@@ -297,10 +297,10 @@ struct UnitState {
     static size_t OutputOffset(const DestRegister& reg) {
         switch (reg.GetRegisterType()) {
         case RegisterType::Output:
-            return offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, registers.output) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         case RegisterType::Temporary:
-            return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         default:
             UNREACHABLE();
@@ -323,6 +323,23 @@ struct ShaderSetup {
         std::array<Math::Vec4<u8>, 4> i;
     } uniforms;
 
+    static size_t UniformOffset(RegisterType type, unsigned index) {
+        switch (type) {
+        case RegisterType::FloatUniform:
+            return offsetof(ShaderSetup, uniforms.f) + index*sizeof(Math::Vec4<float24>);
+
+        case RegisterType::BoolUniform:
+            return offsetof(ShaderSetup, uniforms.b) + index*sizeof(bool);
+
+        case RegisterType::IntUniform:
+            return offsetof(ShaderSetup, uniforms.i) + index*sizeof(Math::Vec4<u8>);
+
+        default:
+            UNREACHABLE();
+            return 0;
+        }
+    }
+
     std::array<u32, 1024> program_code;
     std::array<u32, 1024> swizzle_data;
 
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index 3a827d11f..714e8bfd5 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -41,11 +41,11 @@ struct CallStackElement {
 };
 
 template<bool Debug>
-void RunInterpreter(UnitState<Debug>& state) {
+void RunInterpreter(const ShaderSetup& setup, UnitState<Debug>& state, unsigned offset) {
     // TODO: Is there a maximal size for this?
     boost::container::static_vector<CallStackElement, 16> call_stack;
 
-    u32 program_counter = g_state.regs.vs.main_offset;
+    u32 program_counter = offset;
 
     const auto& uniforms = g_state.vs.uniforms;
     const auto& swizzle_data = g_state.vs.swizzle_data;
@@ -647,8 +647,8 @@ void RunInterpreter(UnitState<Debug>& state) {
 }
 
 // Explicit instantiation
-template void RunInterpreter(UnitState<false>& state);
-template void RunInterpreter(UnitState<true>& state);
+template void RunInterpreter(const ShaderSetup& setup, UnitState<false>& state, unsigned offset);
+template void RunInterpreter(const ShaderSetup& setup, UnitState<true>& state, unsigned offset);
 
 } // namespace
 
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
index 6048cdf3a..bb3ce1c6e 100644
--- a/src/video_core/shader/shader_interpreter.h
+++ b/src/video_core/shader/shader_interpreter.h
@@ -11,7 +11,7 @@ namespace Shader {
 template <bool Debug> struct UnitState;
 
 template<bool Debug>
-void RunInterpreter(UnitState<Debug>& state);
+void RunInterpreter(const ShaderSetup& setup, UnitState<Debug>& state, unsigned offset);
 
 } // namespace
 
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 99f6c51eb..43e7e6b4c 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -102,7 +102,7 @@ const JitFunction instr_table[64] = {
 // purposes, as documented below:
 
 /// Pointer to the uniform memory
-static const X64Reg UNIFORMS = R9;
+static const X64Reg SETUP = R9;
 /// The two 32-bit VS address offset registers set by the MOVA instruction
 static const X64Reg ADDROFFS_REG_0 = R10;
 static const X64Reg ADDROFFS_REG_1 = R11;
@@ -117,7 +117,7 @@ static const X64Reg COND0 = R13;
 /// Result of the previous CMP instruction for the Y-component comparison
 static const X64Reg COND1 = R14;
 /// Pointer to the UnitState instance for the current VS unit
-static const X64Reg REGISTERS = R15;
+static const X64Reg STATE = R15;
 /// SIMD scratch register
 static const X64Reg SCRATCH = XMM0;
 /// Loaded with the first swizzled source register, otherwise can be used as a scratch register
@@ -136,7 +136,7 @@ static const X64Reg NEGBIT = XMM15;
 // State registers that must not be modified by external functions calls
 // Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed
 static const BitSet32 persistent_regs = {
-    UNIFORMS, REGISTERS, // Pointers to register blocks
+    SETUP, STATE, // Pointers to register blocks
     ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, // Cached registers
     ONE+16, NEGBIT+16, // Constants
 };
@@ -177,10 +177,10 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe
     size_t src_offset;
 
     if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
-        src_ptr = UNIFORMS;
-        src_offset = src_reg.GetIndex() * sizeof(float24) * 4;
+        src_ptr = SETUP;
+        src_offset = ShaderSetup::UniformOffset(RegisterType::FloatUniform, src_reg.GetIndex());
     } else {
-        src_ptr = REGISTERS;
+        src_ptr = STATE;
         src_offset = UnitState<false>::InputOffset(src_reg);
     }
 
@@ -264,11 +264,11 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) {
     // If all components are enabled, write the result to the destination register
     if (swiz.dest_mask == NO_DEST_REG_MASK) {
         // Store dest back to memory
-        MOVAPS(MDisp(REGISTERS, dest_offset_disp), src);
+        MOVAPS(MDisp(STATE, dest_offset_disp), src);
 
     } else {
         // Not all components are enabled, so mask the result when storing to the destination register...
-        MOVAPS(SCRATCH, MDisp(REGISTERS, dest_offset_disp));
+        MOVAPS(SCRATCH, MDisp(STATE, dest_offset_disp));
 
         if (Common::GetCPUCaps().sse4_1) {
             u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
@@ -287,7 +287,7 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) {
         }
 
         // Store dest back to memory
-        MOVAPS(MDisp(REGISTERS, dest_offset_disp), SCRATCH);
+        MOVAPS(MDisp(STATE, dest_offset_disp), SCRATCH);
     }
 }
 
@@ -336,8 +336,8 @@ void JitShader::Compile_EvaluateCondition(Instruction instr) {
 }
 
 void JitShader::Compile_UniformCondition(Instruction instr) {
-    int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool));
-    CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
+    int offset = ShaderSetup::UniformOffset(RegisterType::BoolUniform, instr.flow_control.bool_uniform_id);
+    CMP(sizeof(bool) * 8, MDisp(SETUP, offset), Imm8(0));
 }
 
 BitSet32 JitShader::PersistentCallerSavedRegs() {
@@ -714,8 +714,8 @@ void JitShader::Compile_LOOP(Instruction instr) {
 
     looping = true;
 
-    int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>));
-    MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset));
+    int offset = ShaderSetup::UniformOffset(RegisterType::IntUniform, instr.flow_control.int_uniform_id);
+    MOV(32, R(LOOPCOUNT), MDisp(SETUP, offset));
     MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT));
     SHR(32, R(LOOPCOUNT_REG), Imm8(8));
     AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start
@@ -826,8 +826,8 @@ void JitShader::Compile() {
     // The stack pointer is 8 modulo 16 at the entry of a procedure
     ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
 
-    MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1));
-    MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms));
+    MOV(PTRBITS, R(SETUP), R(ABI_PARAM1));
+    MOV(PTRBITS, R(STATE), R(ABI_PARAM2));
 
     // Zero address/loop  registers
     XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0));
@@ -845,7 +845,7 @@ void JitShader::Compile() {
     MOVAPS(NEGBIT, MatR(RAX));
 
     // Jump to start of the shader program
-    JMPptr(R(ABI_PARAM2));
+    JMPptr(R(ABI_PARAM3));
 
     // Compile entire program
     Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size()));
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index 30aa7ff30..5468459d4 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -36,8 +36,8 @@ class JitShader : public Gen::XCodeBlock {
 public:
     JitShader();
 
-    void Run(void* registers, unsigned offset) const {
-        program(registers, code_ptr[offset]);
+    void Run(const ShaderSetup& setup, UnitState<false>& state, unsigned offset) const {
+        program(&setup, &state, code_ptr[offset]);
     }
 
     void Compile();
@@ -117,7 +117,7 @@ private:
     /// Branches that need to be fixed up once the entire shader program is compiled
     std::vector<std::pair<Gen::FixupBranch, unsigned>> fixup_branches;
 
-    using CompiledShader = void(void* registers, const u8* start_addr);
+    using CompiledShader = void(const void* setup, void* state, const u8* start_addr);
     CompiledShader* program = nullptr;
 };