75 files changed, 1961 insertions, 642 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index de4fe716a..1e1245160 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -5,6 +5,7 @@ add_subdirectory(common)
 add_subdirectory(core)
 add_subdirectory(video_core)
 add_subdirectory(audio_core)
+add_subdirectory(tests)
 if (ENABLE_SDL2)
     add_subdirectory(citra)
 endif()
diff --git a/src/audio_core/CMakeLists.txt b/src/audio_core/CMakeLists.txt
index 13b5e400e..a72a907ef 100644
--- a/src/audio_core/CMakeLists.txt
+++ b/src/audio_core/CMakeLists.txt
@@ -3,10 +3,12 @@ set(SRCS
             codec.cpp
             hle/dsp.cpp
             hle/filter.cpp
+            hle/mixers.cpp
             hle/pipe.cpp
             hle/source.cpp
             interpolate.cpp
             sink_details.cpp
+            time_stretch.cpp
             )
 
 set(HEADERS
@@ -15,12 +17,14 @@ set(HEADERS
             hle/common.h
             hle/dsp.h
             hle/filter.h
+            hle/mixers.h
             hle/pipe.h
             hle/source.h
             interpolate.h
             null_sink.h
             sink.h
             sink_details.h
+            time_stretch.h
             )
 
 include_directories(../../externals/soundtouch/include)
diff --git a/src/audio_core/hle/dsp.cpp b/src/audio_core/hle/dsp.cpp
index 0cdbdb06a..0640e1eff 100644
--- a/src/audio_core/hle/dsp.cpp
+++ b/src/audio_core/hle/dsp.cpp
@@ -6,13 +6,17 @@
 #include <memory>
 
 #include "audio_core/hle/dsp.h"
+#include "audio_core/hle/mixers.h"
 #include "audio_core/hle/pipe.h"
 #include "audio_core/hle/source.h"
 #include "audio_core/sink.h"
+#include "audio_core/time_stretch.h"
 
 namespace DSP {
 namespace HLE {
 
+// Region management
+
 std::array<SharedMemory, 2> g_regions;
 
 static size_t CurrentRegionIndex() {
@@ -40,43 +44,96 @@ static SharedMemory& WriteRegion() {
     return g_regions[1 - CurrentRegionIndex()];
 }
 
+// Audio processing and mixing
+
 static std::array<Source, num_sources> sources = {
     Source(0), Source(1), Source(2), Source(3), Source(4), Source(5),
     Source(6), Source(7), Source(8), Source(9), Source(10), Source(11),
     Source(12), Source(13), Source(14), Source(15), Source(16), Source(17),
     Source(18), Source(19), Source(20), Source(21), Source(22), Source(23)
 };
+static Mixers mixers;
+
+static StereoFrame16 GenerateCurrentFrame() {
+    SharedMemory& read = ReadRegion();
+    SharedMemory& write = WriteRegion();
+    // Produces one frame: inputs come from the read region, statuses and samples go to the write region.
+    std::array<QuadFrame32, 3> intermediate_mixes = {};
+
+    // Generate intermediate mixes: tick each source, then let it add itself into all three mixes
+    for (size_t i = 0; i < num_sources; i++) {
+        write.source_statuses.status[i] = sources[i].Tick(read.source_configurations.config[i], read.adpcm_coefficients.coeff[i]);
+        for (size_t mix = 0; mix < 3; mix++) {
+            sources[i].MixInto(intermediate_mixes[mix], mix);
+        }
+    }
+
+    // Generate final mix (also exchanges the intermediate mix samples between the two regions)
+    write.dsp_status = mixers.Tick(read.dsp_configuration, read.intermediate_mix_samples, write.intermediate_mix_samples, intermediate_mixes);
+
+    StereoFrame16 output_frame = mixers.GetOutput();
+
+    // Write current output frame to the shared memory region, one s16_le per sample and channel
+    for (size_t samplei = 0; samplei < output_frame.size(); samplei++) {
+        for (size_t channeli = 0; channeli < output_frame[0].size(); channeli++) {
+            write.final_samples.pcm16[samplei][channeli] = s16_le(output_frame[samplei][channeli]);
+        }
+    }
+
+    return output_frame;
+}
+
+// Audio output
 
 static std::unique_ptr<AudioCore::Sink> sink;
+static AudioCore::TimeStretcher time_stretcher;
+
+static void OutputCurrentFrame(const StereoFrame16& frame) { // Sends one frame through the time stretcher to the sink.
+    time_stretcher.AddSamples(&frame[0][0], frame.size()); // Pointer to the first s16 of the frame; count is frame.size().
+    sink->EnqueueSamples(time_stretcher.Process(sink->SamplesInQueue())); // NOTE(review): sink is dereferenced unchecked; assumes SetSink() ran first — confirm
+}
+
+// Public Interface
 
 void Init() {
     DSP::HLE::ResetPipes();
+
     for (auto& source : sources) {
         source.Reset();
     }
+
+    mixers.Reset();
+    // TimeStretcher::Reset() reverts the output rate to native_sample_rate, so re-apply the sink's rate (if any).
+    time_stretcher.Reset();
+    if (sink) {
+        time_stretcher.SetOutputSampleRate(sink->GetNativeSampleRate());
+    }
 }
 
 void Shutdown() {
+    time_stretcher.Flush();
+    // Drain what the stretcher still holds. Init() treats a missing sink as valid, so guard the dereference here too.
+    while (sink) {
+        std::vector<s16> residual_audio = time_stretcher.Process(sink->SamplesInQueue());
+        if (residual_audio.empty()) break;
+        sink->EnqueueSamples(residual_audio);
+    }
 }
 
 bool Tick() {
-    SharedMemory& read = ReadRegion();
-    SharedMemory& write = WriteRegion();
+    StereoFrame16 current_frame = {};
 
-    std::array<QuadFrame32, 3> intermediate_mixes = {};
+    // TODO: Check dsp::DSP semaphore (which indicates emulated application has finished writing to shared memory region)
+    current_frame = GenerateCurrentFrame();
 
-    for (size_t i = 0; i < num_sources; i++) {
-        write.source_statuses.status[i] = sources[i].Tick(read.source_configurations.config[i], read.adpcm_coefficients.coeff[i]);
-        for (size_t mix = 0; mix < 3; mix++) {
-            sources[i].MixInto(intermediate_mixes[mix], mix);
-        }
-    }
+    OutputCurrentFrame(current_frame);
 
     return true;
 }
 
 void SetSink(std::unique_ptr<AudioCore::Sink> sink_) {
     sink = std::move(sink_);
+    time_stretcher.SetOutputSampleRate(sink->GetNativeSampleRate()); // NOTE(review): dereferences the new sink unchecked; assumes callers never pass null — confirm
 }
 
 } // namespace HLE
diff --git a/src/audio_core/hle/dsp.h b/src/audio_core/hle/dsp.h
index f6e53f68f..9275cd7de 100644
--- a/src/audio_core/hle/dsp.h
+++ b/src/audio_core/hle/dsp.h
@@ -428,7 +428,7 @@ ASSERT_DSP_STRUCT(DspStatus, 32);
 /// Final mixed output in PCM16 stereo format, what you hear out of the speakers.
 /// When the application writes to this region it has no effect.
 struct FinalMixSamples {
-    s16_le pcm16[2 * samples_per_frame];
+    s16_le pcm16[samples_per_frame][2];
 };
 ASSERT_DSP_STRUCT(FinalMixSamples, 640);
 
diff --git a/src/audio_core/hle/mixers.cpp b/src/audio_core/hle/mixers.cpp
new file mode 100644
index 000000000..18335f7f0
--- /dev/null
+++ b/src/audio_core/hle/mixers.cpp
@@ -0,0 +1,202 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstddef>
+
+#include "audio_core/hle/common.h"
+#include "audio_core/hle/dsp.h"
+#include "audio_core/hle/mixers.h"
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/math_util.h"
+
+namespace DSP {
+namespace HLE {
+
+void Mixers::Reset() { // Clears the output frame and restores every state field to its in-class default.
+    current_frame.fill({});
+    state = {};
+}
+
+DspStatus Mixers::Tick(DspConfiguration& config,
+        const IntermediateMixSamples& read_samples,
+        IntermediateMixSamples& write_samples,
+        const std::array<QuadFrame32, 3>& input)
+{
+    ParseConfig(config); // Applies dirty settings to state and clears the dirty flags in config.
+    // Aux samples are read back from read_samples before this frame's are written to write_samples.
+    AuxReturn(read_samples);
+    AuxSend(write_samples, input);
+    // Rebuild current_frame from the three intermediate mix buffers.
+    MixCurrentFrame();
+
+    return GetCurrentStatus();
+}
+
+void Mixers::ParseConfig(DspConfiguration& config) { // Copies settings flagged dirty in config into state.
+    if (!config.dirty_raw) {
+        return;
+    }
+    // Each handled setting clears its own dirty bit, copies the value into state, and traces it.
+    if (config.mixer1_enabled_dirty) {
+        config.mixer1_enabled_dirty.Assign(0);
+        state.mixer1_enabled = config.mixer1_enabled != 0;
+        LOG_TRACE(Audio_DSP, "mixers mixer1_enabled = %hu", config.mixer1_enabled);
+    }
+
+    if (config.mixer2_enabled_dirty) {
+        config.mixer2_enabled_dirty.Assign(0);
+        state.mixer2_enabled = config.mixer2_enabled != 0;
+        LOG_TRACE(Audio_DSP, "mixers mixer2_enabled = %hu", config.mixer2_enabled);
+    }
+
+    if (config.volume_0_dirty) {
+        config.volume_0_dirty.Assign(0);
+        state.intermediate_mixer_volume[0] = config.volume[0];
+        LOG_TRACE(Audio_DSP, "mixers volume[0] = %f", config.volume[0]);
+    }
+
+    if (config.volume_1_dirty) {
+        config.volume_1_dirty.Assign(0);
+        state.intermediate_mixer_volume[1] = config.volume[1];
+        LOG_TRACE(Audio_DSP, "mixers volume[1] = %f", config.volume[1]);
+    }
+
+    if (config.volume_2_dirty) {
+        config.volume_2_dirty.Assign(0);
+        state.intermediate_mixer_volume[2] = config.volume[2];
+        LOG_TRACE(Audio_DSP, "mixers volume[2] = %f", config.volume[2]);
+    }
+
+    if (config.output_format_dirty) {
+        config.output_format_dirty.Assign(0);
+        state.output_format = config.output_format;
+        LOG_TRACE(Audio_DSP, "mixers output_format = %zu", static_cast<size_t>(config.output_format));
+    }
+
+    if (config.headphones_connected_dirty) {
+        config.headphones_connected_dirty.Assign(0);
+        // Do nothing.
+        // (Note: Whether headphones are connected does affect coefficients used for surround sound.)
+        LOG_TRACE(Audio_DSP, "mixers headphones_connected=%hu", config.headphones_connected);
+    }
+    // Whatever is still set at this point is a dirty bit this function does not handle.
+    if (config.dirty_raw) {
+        LOG_DEBUG(Audio_DSP, "mixers remaining_dirty=%x", config.dirty_raw);
+    }
+    // Clear everything, handled or not, so the same flags are not reported again on the next call.
+    config.dirty_raw = 0;
+}
+
+static s16 ClampToS16(s32 value) { // Limits a 32-bit sample to [-32768, 32767] before narrowing to s16.
+    return static_cast<s16>(MathUtil::Clamp(value, -32768, 32767));
+}
+
+static std::array<s16, 2> AddAndClampToS16(const std::array<s16, 2>& a, const std::array<s16, 2>& b) { // Per-channel saturating add.
+    return {
+        ClampToS16(static_cast<s32>(a[0]) + static_cast<s32>(b[0])), // Sum is formed in s32 so it cannot wrap before clamping.
+        ClampToS16(static_cast<s32>(a[1]) + static_cast<s32>(b[1]))
+    };
+}
+
+void Mixers::DownmixAndMixIntoCurrentFrame(float gain, const QuadFrame32& samples) {
+    // TODO(merry): Limiter. (Currently we're performing final mixing assuming a disabled limiter.)
+    // Each case folds the four-channel input to two channels and adds it, with clamping, into current_frame.
+    switch (state.output_format) {
+    case OutputFormat::Mono:
+        std::transform(current_frame.begin(), current_frame.end(), samples.begin(), current_frame.begin(),
+            [gain](const std::array<s16, 2>& accumulator, const std::array<s32, 4>& sample) -> std::array<s16, 2> {
+                // Downmix to mono: gain-scaled sum of all four channels, halved
+                s16 mono = ClampToS16(static_cast<s32>((gain * sample[0] + gain * sample[1] + gain * sample[2] + gain * sample[3]) / 2));
+                // Mix into current frame (same value on both output channels)
+                return AddAndClampToS16(accumulator, { mono, mono });
+            });
+        return;
+
+    case OutputFormat::Surround:
+        // TODO(merry): Implement surround sound.
+        // fallthrough
+
+    case OutputFormat::Stereo:
+        std::transform(current_frame.begin(), current_frame.end(), samples.begin(), current_frame.begin(),
+            [gain](const std::array<s16, 2>& accumulator, const std::array<s32, 4>& sample) -> std::array<s16, 2> {
+                // Downmix to stereo: channels 0 and 2 form the left output, channels 1 and 3 the right
+                s16 left = ClampToS16(static_cast<s32>(gain * sample[0] + gain * sample[2]));
+                s16 right = ClampToS16(static_cast<s32>(gain * sample[1] + gain * sample[3]));
+                // Mix into current frame
+                return AddAndClampToS16(accumulator, { left, right });
+            });
+        return;
+    }
+    // Reached only if none of the cases above matched.
+    UNREACHABLE_MSG("Invalid output_format %zu", static_cast<size_t>(state.output_format));
+}
+
+void Mixers::AuxReturn(const IntermediateMixSamples& read_samples) {
+    // NOTE: read_samples.mix{1,2}.pcm32 annoyingly have their dimensions in reverse order to QuadFrame32.
+    // For each enabled aux mixer, copy [channel][sample] input into the [sample][channel] mix buffer.
+    if (state.mixer1_enabled) {
+        for (size_t sample = 0; sample < samples_per_frame; sample++) {
+            for (size_t channel = 0; channel < 4; channel++) {
+                state.intermediate_mix_buffer[1][sample][channel] = read_samples.mix1.pcm32[channel][sample];
+            }
+        }
+    }
+
+    if (state.mixer2_enabled) {
+        for (size_t sample = 0; sample < samples_per_frame; sample++) {
+            for (size_t channel = 0; channel < 4; channel++) {
+                state.intermediate_mix_buffer[2][sample][channel] = read_samples.mix2.pcm32[channel][sample];
+            }
+        }
+    }
+}
+
+void Mixers::AuxSend(IntermediateMixSamples& write_samples, const std::array<QuadFrame32, 3>& input) {
+    // NOTE: write_samples.mix{1,2}.pcm32 annoyingly have their dimensions in reverse order to QuadFrame32.
+
+    state.intermediate_mix_buffer[0] = input[0]; // Mix 0 is always taken directly from the input.
+    // Mixes 1 and 2: when the aux mixer is enabled the input is only written out; otherwise it is used as-is.
+    if (state.mixer1_enabled) {
+        for (size_t sample = 0; sample < samples_per_frame; sample++) {
+            for (size_t channel = 0; channel < 4; channel++) {
+                write_samples.mix1.pcm32[channel][sample] = input[1][sample][channel];
+            }
+        }
+    } else {
+        state.intermediate_mix_buffer[1] = input[1];
+    }
+
+    if (state.mixer2_enabled) {
+        for (size_t sample = 0; sample < samples_per_frame; sample++) {
+            for (size_t channel = 0; channel < 4; channel++) {
+                write_samples.mix2.pcm32[channel][sample] = input[2][sample][channel];
+            }
+        }
+    } else {
+        state.intermediate_mix_buffer[2] = input[2];
+    }
+}
+
+void Mixers::MixCurrentFrame() {
+    current_frame.fill({}); // Start from an all-zero frame; the mixes below accumulate into it.
+
+    for (size_t mix = 0; mix < 3; mix++) {
+        DownmixAndMixIntoCurrentFrame(state.intermediate_mixer_volume[mix], state.intermediate_mix_buffer[mix]);
+    }
+
+    // TODO(merry): Compressor. (We currently assume a disabled compressor.)
+}
+
+DspStatus Mixers::GetCurrentStatus() const {
+    DspStatus status = {}; // Value-initialize so members not assigned below are zero rather than indeterminate.
+    status.unknown = 0;
+    status.dropped_frames = 0;
+    return status;
+}
+
+} // namespace HLE
+} // namespace DSP
diff --git a/src/audio_core/hle/mixers.h b/src/audio_core/hle/mixers.h
new file mode 100644
index 000000000..b52952eb5
--- /dev/null
+++ b/src/audio_core/hle/mixers.h
@@ -0,0 +1,63 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+
+#include "audio_core/hle/common.h"
+#include "audio_core/hle/dsp.h"
+
+namespace DSP {
+namespace HLE {
+
+class Mixers final {
+public:
+    Mixers() {
+        Reset();
+    }
+    /// Clears the output frame and restores all mixer state to its defaults.
+    void Reset();
+    /// Runs one frame: applies dirty config, exchanges aux samples, mixes, and returns the status.
+    DspStatus Tick(DspConfiguration& config,
+                   const IntermediateMixSamples& read_samples,
+                   IntermediateMixSamples& write_samples,
+                   const std::array<QuadFrame32, 3>& input);
+    /// Returns a copy of current_frame (the frame mixed by the last Tick(), or all zeros after Reset()).
+    StereoFrame16 GetOutput() const {
+        return current_frame;
+    }
+
+private:
+    StereoFrame16 current_frame = {};
+
+    using OutputFormat = DspConfiguration::OutputFormat;
+
+    struct {
+        std::array<float, 3> intermediate_mixer_volume = {};
+
+        bool mixer1_enabled = false;
+        bool mixer2_enabled = false;
+        std::array<QuadFrame32, 3> intermediate_mix_buffer = {};
+
+        OutputFormat output_format = OutputFormat::Stereo;
+
+    } state;
+
+    /// INTERNAL: Update our internal state based on the current config.
+    void ParseConfig(DspConfiguration& config);
+    /// INTERNAL: Read samples from shared memory that have been modified by the ARM11.
+    void AuxReturn(const IntermediateMixSamples& read_samples);
+    /// INTERNAL: Write samples to shared memory for the ARM11 to modify.
+    void AuxSend(IntermediateMixSamples& write_samples, const std::array<QuadFrame32, 3>& input);
+    /// INTERNAL: Mix current_frame.
+    void MixCurrentFrame();
+    /// INTERNAL: Downmix from quadraphonic to stereo based on state.output_format and accumulate into current_frame.
+    void DownmixAndMixIntoCurrentFrame(float gain, const QuadFrame32& samples);
+    /// INTERNAL: Generate DspStatus based on internal state.
+    DspStatus GetCurrentStatus() const;
+};
+
+} // namespace HLE
+} // namespace DSP
diff --git a/src/audio_core/hle/source.cpp b/src/audio_core/hle/source.cpp
index daaf6e3f3..30552fe26 100644
--- a/src/audio_core/hle/source.cpp
+++ b/src/audio_core/hle/source.cpp
@@ -126,13 +126,13 @@ void Source::ParseConfig(SourceConfiguration::Configuration& config, const s16_l
     if (config.simple_filter_dirty) {
         config.simple_filter_dirty.Assign(0);
         state.filters.Configure(config.simple_filter);
-        LOG_TRACE(Audio_DSP, "source_id=%zu simple filter update");
+        LOG_TRACE(Audio_DSP, "source_id=%zu simple filter update", source_id);
     }
 
     if (config.biquad_filter_dirty) {
         config.biquad_filter_dirty.Assign(0);
         state.filters.Configure(config.biquad_filter);
-        LOG_TRACE(Audio_DSP, "source_id=%zu biquad filter update");
+        LOG_TRACE(Audio_DSP, "source_id=%zu biquad filter update", source_id);
     }
 
     if (config.interpolation_dirty) {
diff --git a/src/audio_core/time_stretch.cpp b/src/audio_core/time_stretch.cpp
new file mode 100644
index 000000000..ea38f40d0
--- /dev/null
+++ b/src/audio_core/time_stretch.cpp
@@ -0,0 +1,144 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <chrono>
+#include <cmath>
+#include <vector>
+
+#include <SoundTouch.h>
+
+#include "audio_core/audio_core.h"
+#include "audio_core/time_stretch.h"
+
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "common/math_util.h"
+
+using steady_clock = std::chrono::steady_clock;
+
+namespace AudioCore {
+
+constexpr double MIN_RATIO = 0.1;
+constexpr double MAX_RATIO = 100.0;
+
+static double ClampRatio(double ratio) { // Keeps ratio within [MIN_RATIO, MAX_RATIO].
+    return MathUtil::Clamp(ratio, MIN_RATIO, MAX_RATIO);
+}
+
+constexpr double MIN_DELAY_TIME = 0.05; // Units: seconds
+constexpr double MAX_DELAY_TIME = 0.25; // Units: seconds
+constexpr size_t DROP_FRAMES_SAMPLE_DELAY = 16000; // Units: samples
+
+constexpr double SMOOTHING_FACTOR = 0.007;
+
+struct TimeStretcher::Impl {
+    soundtouch::SoundTouch soundtouch;
+    // Time of the last ratio calculation, and the samples added since then (see CalculateCurrentRatio).
+    steady_clock::time_point frame_timer = steady_clock::now();
+    size_t samples_queued = 0;
+    // Smoothed wallclock/emulated time ratio; its inverse is passed to SoundTouch as the tempo.
+    double smoothed_ratio = 1.0;
+    // Output sample rate as last set through SetOutputSampleRate.
+    double sample_rate = static_cast<double>(native_sample_rate);
+};
+
+std::vector<s16> TimeStretcher::Process(size_t samples_in_queue) {
+    // This is a very simple algorithm without any fancy control theory. It works and is stable.
+    // Measure the current ratio, nudge it by how full the downstream queue is, then smooth and clamp it.
+    double ratio = CalculateCurrentRatio();
+    ratio = CorrectForUnderAndOverflow(ratio, samples_in_queue);
+    impl->smoothed_ratio = (1.0 - SMOOTHING_FACTOR) * impl->smoothed_ratio + SMOOTHING_FACTOR * ratio;
+    impl->smoothed_ratio = ClampRatio(impl->smoothed_ratio);
+
+    // SoundTouch's tempo definition is the inverse of our ratio definition.
+    impl->soundtouch.setTempo(1.0 / impl->smoothed_ratio);
+
+    std::vector<s16> samples = GetSamples();
+    if (samples_in_queue >= DROP_FRAMES_SAMPLE_DELAY) {
+        samples.clear(); // Already pulled out of SoundTouch above, so these samples are discarded.
+        LOG_DEBUG(Audio, "Dropping frames!");
+    }
+    return samples;
+}
+
+TimeStretcher::TimeStretcher() : impl(std::make_unique<Impl>()) {
+    impl->soundtouch.setPitch(1.0);
+    impl->soundtouch.setChannels(2); // Two channels; GetSamples sizes its buffer to match.
+    impl->soundtouch.setSampleRate(native_sample_rate);
+    Reset(); // Tempo 1.0, empty buffers, fresh timer, native output rate.
+}
+
+TimeStretcher::~TimeStretcher() { // Defined in this file, where Impl is a complete type.
+    impl->soundtouch.clear();
+}
+
+void TimeStretcher::SetOutputSampleRate(unsigned int sample_rate) {
+    impl->sample_rate = static_cast<double>(sample_rate); // Also used to size the delay thresholds in CorrectForUnderAndOverflow.
+    impl->soundtouch.setRate(static_cast<double>(native_sample_rate) / impl->sample_rate); // Rate is given relative to native_sample_rate.
+}
+
+void TimeStretcher::AddSamples(const s16* buffer, size_t num_samples) {
+    impl->soundtouch.putSamples(buffer, static_cast<uint>(num_samples));
+    impl->samples_queued += num_samples; // Consumed and zeroed by the next CalculateCurrentRatio().
+}
+
+void TimeStretcher::Flush() { // Forwards to SoundTouch's flush(); a later Process() call retrieves the samples.
+    impl->soundtouch.flush();
+}
+
+void TimeStretcher::Reset() {
+    impl->soundtouch.setTempo(1.0);
+    impl->soundtouch.clear();
+    impl->smoothed_ratio = 1.0;
+    impl->frame_timer = steady_clock::now(); // Restart the measurement window used by CalculateCurrentRatio.
+    impl->samples_queued = 0;
+    SetOutputSampleRate(native_sample_rate); // Callers wanting another output rate must set it again after Reset().
+}
+
+double TimeStretcher::CalculateCurrentRatio() {
+    const steady_clock::time_point now = steady_clock::now();
+    const std::chrono::duration<double> duration = now - impl->frame_timer;
+    // Expected time is what the queued samples represent at the native rate; actual time is wallclock, in seconds.
+    const double expected_time = static_cast<double>(impl->samples_queued) / static_cast<double>(native_sample_rate);
+    const double actual_time = duration.count();
+
+    double ratio;
+    if (expected_time != 0) {
+        ratio = ClampRatio(actual_time / expected_time);
+    } else {
+        ratio = impl->smoothed_ratio; // No samples were added since the last call; reuse the smoothed ratio.
+    }
+    // Start a new measurement window.
+    impl->frame_timer = now;
+    impl->samples_queued = 0;
+
+    return ratio;
+}
+
+double TimeStretcher::CorrectForUnderAndOverflow(double ratio, size_t sample_delay) const {
+    const size_t min_sample_delay = static_cast<size_t>(MIN_DELAY_TIME * impl->sample_rate);
+    const size_t max_sample_delay = static_cast<size_t>(MAX_DELAY_TIME * impl->sample_rate);
+    // Squaring moves a ratio away from 1.0 and the square root moves it towards 1.0; pick by which side it is on.
+    if (sample_delay < min_sample_delay) {
+        // Make the ratio bigger.
+        ratio = ratio > 1.0 ? ratio * ratio : std::sqrt(ratio);
+    } else if (sample_delay > max_sample_delay) {
+        // Make the ratio smaller.
+        ratio = ratio > 1.0 ? std::sqrt(ratio) : ratio * ratio;
+    }
+
+    return ClampRatio(ratio);
+}
+
+std::vector<s16> TimeStretcher::GetSamples() {
+    uint available = impl->soundtouch.numSamples();
+    // Two s16 values per available sample, matching setChannels(2) in the constructor.
+    std::vector<s16> output(static_cast<size_t>(available) * 2);
+    // NOTE(review): the count returned by receiveSamples is ignored; assumes it always equals `available` — confirm
+    impl->soundtouch.receiveSamples(output.data(), available);
+
+    return output;
+}
+
+} // namespace AudioCore
diff --git a/src/audio_core/time_stretch.h b/src/audio_core/time_stretch.h
new file mode 100644
index 000000000..1fde3f72a
--- /dev/null
+++ b/src/audio_core/time_stretch.h
@@ -0,0 +1,60 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace AudioCore {
+
+/// Time-stretches interleaved stereo PCM16 audio based on the ratio of wallclock time to emulated time.
+class TimeStretcher final {
+public:
+    TimeStretcher();
+    ~TimeStretcher();
+
+    /**
+     * Set sample rate for the samples that Process returns.
+     * @param sample_rate The sample rate.
+     */
+    void SetOutputSampleRate(unsigned int sample_rate);
+
+    /**
+     * Add samples to be processed.
+     * @param sample_buffer Buffer of samples in interleaved stereo PCM16 format.
+     * @param num_samples Number of samples.
+     */
+    void AddSamples(const s16* sample_buffer, size_t num_samples);
+
+    /// Flush audio remaining in internal buffers.
+    void Flush();
+
+    /// Resets internal state and clears buffers.
+    void Reset();
+
+    /**
+     * Does audio stretching and produces the time-stretched samples.
+     * Timer calculations use sample_delay to determine how much of a margin we have.
+     * @param sample_delay How many samples are buffered downstream of this module and haven't been played yet.
+     * @return Samples to play in interleaved stereo PCM16 format.
+     */
+    std::vector<s16> Process(size_t sample_delay);
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> impl;
+
+    /// INTERNAL: ratio = wallclock time / emulated time
+    double CalculateCurrentRatio();
+    /// INTERNAL: If we have too many or too few samples downstream, nudge ratio in the appropriate direction.
+    double CorrectForUnderAndOverflow(double ratio, size_t sample_delay) const;
+    /// INTERNAL: Gets the time-stretched samples from SoundTouch.
+    std::vector<s16> GetSamples();
+};
+
+} // namespace AudioCore
diff --git a/src/citra_qt/CMakeLists.txt b/src/citra_qt/CMakeLists.txt
index 3f0099200..0a5d4624b 100644
--- a/src/citra_qt/CMakeLists.txt
+++ b/src/citra_qt/CMakeLists.txt
@@ -20,6 +20,7 @@ set(SRCS
             util/spinbox.cpp
             util/util.cpp
             bootmanager.cpp
+            configure_audio.cpp
             configure_debug.cpp
             configure_dialog.cpp
             configure_general.cpp
@@ -51,6 +52,7 @@ set(HEADERS
             util/spinbox.h
             util/util.h
             bootmanager.h
+            configure_audio.h
             configure_debug.h
             configure_dialog.h
             configure_general.h
@@ -69,6 +71,7 @@ set(UIS
             debugger/profiler.ui
             debugger/registers.ui
             configure.ui
+            configure_audio.ui
             configure_debug.ui
             configure_general.ui
             hotkeys.ui
diff --git a/src/citra_qt/configure.ui b/src/citra_qt/configure.ui
index 6ae056ff9..e1624bbef 100644
--- a/src/citra_qt/configure.ui
+++ b/src/citra_qt/configure.ui
@@ -29,6 +29,11 @@
        <string>Input</string>
       </attribute>
      </widget>
+      <widget class="ConfigureAudio" name="audioTab">
+        <attribute name="title">
+          <string>Audio</string>
+        </attribute>
+      </widget>
      <widget class="ConfigureDebug" name="debugTab">
       <attribute name="title">
        <string>Debug</string>
@@ -53,6 +58,12 @@
    <container>1</container>
   </customwidget>
   <customwidget>
+   <class>ConfigureAudio</class>
+   <extends>QWidget</extends>
+   <header>configure_audio.h</header>
+   <container>1</container>
+  </customwidget>
+  <customwidget>
    <class>ConfigureDebug</class>
    <extends>QWidget</extends>
    <header>configure_debug.h</header>
diff --git a/src/citra_qt/configure_audio.cpp b/src/citra_qt/configure_audio.cpp
new file mode 100644
index 000000000..cedfa2f2a
--- /dev/null
+++ b/src/citra_qt/configure_audio.cpp
@@ -0,0 +1,44 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "audio_core/sink_details.h"
+
+#include "citra_qt/configure_audio.h"
+#include "ui_configure_audio.h"
+
+#include "core/settings.h"
+
+ConfigureAudio::ConfigureAudio(QWidget* parent) :
+        QWidget(parent),
+        ui(std::make_unique<Ui::ConfigureAudio>())
+{
+    ui->setupUi(this);
+    // Populate the sink list: "auto" first, then one entry per id in AudioCore::g_sink_details.
+    ui->output_sink_combo_box->clear();
+    ui->output_sink_combo_box->addItem("auto");
+    for (const auto& sink_detail : AudioCore::g_sink_details) {
+        ui->output_sink_combo_box->addItem(sink_detail.id);
+    }
+    // Select the entry that matches the stored setting.
+    this->setConfiguration();
+}
+
+ConfigureAudio::~ConfigureAudio() { // Defined here, where ui_configure_audio.h is included; the header only forward-declares Ui::ConfigureAudio.
+}
+
+void ConfigureAudio::setConfiguration() {
+    int new_sink_index = 0; // Falls back to the first entry ("auto") when the stored sink id is not in the list.
+    for (int index = 0; index < ui->output_sink_combo_box->count(); index++) {
+        if (ui->output_sink_combo_box->itemText(index).toStdString() == Settings::values.sink_id) {
+            new_sink_index = index;
+            break;
+        }
+    }
+    ui->output_sink_combo_box->setCurrentIndex(new_sink_index);
+}
+
+void ConfigureAudio::applyConfiguration() { // Stores the selected entry's text as the sink id, then applies the settings.
+    Settings::values.sink_id = ui->output_sink_combo_box->itemText(ui->output_sink_combo_box->currentIndex()).toStdString();
+    Settings::Apply();
+}
diff --git a/src/citra_qt/configure_audio.h b/src/citra_qt/configure_audio.h
new file mode 100644
index 000000000..51df2e27b
--- /dev/null
+++ b/src/citra_qt/configure_audio.h
@@ -0,0 +1,27 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <QWidget>
+
+namespace Ui {
+class ConfigureAudio;
+}
+
+class ConfigureAudio : public QWidget {
+    Q_OBJECT
+
+public:
+    explicit ConfigureAudio(QWidget* parent = nullptr);
+    ~ConfigureAudio();
+    /// Writes the selected output sink id to Settings and applies the settings.
+    void applyConfiguration();
+
+private:
+    void setConfiguration(); // Selects the combo box entry matching the stored sink id.
+
+    std::unique_ptr<Ui::ConfigureAudio> ui;
+};
diff --git a/src/citra_qt/configure_audio.ui b/src/citra_qt/configure_audio.ui
new file mode 100644
index 000000000..d7f6946ca
--- /dev/null
+++ b/src/citra_qt/configure_audio.ui
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="utf-8"?>
+
+<ui version="4.0">
+ <class>ConfigureAudio</class>
+ <widget class="QWidget" name="ConfigureAudio">
+  <layout class="QVBoxLayout">
+   <item>
+    <widget class="QGroupBox">
+     <property name="title">
+      <string>Audio</string>
+     </property>
+     <layout class="QVBoxLayout">
+      <item>
+       <layout class="QHBoxLayout">
+        <item>
+         <widget class="QLabel">
+          <property name="text">
+           <string>Output Engine:</string>
+          </property>
+         </widget>
+        </item>
+        <item>
+         <widget class="QComboBox" name="output_sink_combo_box">
+         </widget>
+        </item>
+       </layout>
+      </item>
+     </layout>
+    </widget>
+   </item>
+   <item>
+    <spacer>
+     <property name="orientation">
+      <enum>Qt::Vertical</enum>
+     </property>
+     <property name="sizeHint" stdset="0">
+      <size>
+       <width>20</width>
+       <height>40</height>
+      </size>
+     </property>
+    </spacer>
+   </item>
+  </layout>
+ </widget>
+ <resources />
+ <connections />
+</ui>
diff --git a/src/citra_qt/configure_dialog.cpp b/src/citra_qt/configure_dialog.cpp
index 87c26c715..2f0317fe0 100644
--- a/src/citra_qt/configure_dialog.cpp
+++ b/src/citra_qt/configure_dialog.cpp
@@ -25,5 +25,6 @@ void ConfigureDialog::setConfiguration() {
 
 void ConfigureDialog::applyConfiguration() {
     ui->generalTab->applyConfiguration();
+    ui->audioTab->applyConfiguration();
     ui->debugTab->applyConfiguration();
 }
diff --git a/src/citra_qt/debugger/graphics_tracing.cpp b/src/citra_qt/debugger/graphics_tracing.cpp
index 1402f8e79..9c80f7ec9 100644
--- a/src/citra_qt/debugger/graphics_tracing.cpp
+++ b/src/citra_qt/debugger/graphics_tracing.cpp
@@ -74,7 +74,7 @@ void GraphicsTracingWidget::StartRecording() {
     std::array<u32, 4 * 16> default_attributes;
     for (unsigned i = 0; i < 16; ++i) {
         for (unsigned comp = 0; comp < 3; ++comp) {
-            default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs.default_attributes[i][comp].ToFloat32());
+            default_attributes[4 * i + comp] = nihstro::to_float24(Pica::g_state.vs_default_attributes[i][comp].ToFloat32());
         }
     }
 
diff --git a/src/citra_qt/debugger/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics_vertex_shader.cpp
index 854f6ff16..391666d35 100644
--- a/src/citra_qt/debugger/graphics_vertex_shader.cpp
+++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp
@@ -501,7 +501,7 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d
     info.labels.insert({ entry_point, "main" });
 
     // Generate debug information
-    debug_data = Pica::Shader::ProduceDebugInfo(input_vertex, num_attributes, shader_config, shader_setup);
+    debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config, shader_setup);
 
     // Reload widget state
     for (int attr = 0; attr < num_attributes; ++attr) {
diff --git a/src/citra_qt/debugger/profiler.cpp b/src/citra_qt/debugger/profiler.cpp
index 7bb010f77..585ac049a 100644
--- a/src/citra_qt/debugger/profiler.cpp
+++ b/src/citra_qt/debugger/profiler.cpp
@@ -151,6 +151,8 @@ private:
     /// This timer is used to redraw the widget's contents continuously. To save resources, it only
     /// runs while the widget is visible.
     QTimer update_timer;
+    /// Scale the coordinate system appropriately when physical DPI != logical DPI.
+    qreal x_scale, y_scale;
 };
 
 #endif
@@ -220,11 +222,17 @@ MicroProfileWidget::MicroProfileWidget(QWidget* parent) : QWidget(parent) {
     MicroProfileInitUI();
 
     connect(&update_timer, SIGNAL(timeout()), SLOT(update()));
+
+    QPainter painter(this);
+    x_scale = qreal(painter.device()->physicalDpiX()) / qreal(painter.device()->logicalDpiX());
+    y_scale = qreal(painter.device()->physicalDpiY()) / qreal(painter.device()->logicalDpiY());
 }
 
 void MicroProfileWidget::paintEvent(QPaintEvent* ev) {
     QPainter painter(this);
 
+    painter.scale(x_scale, y_scale);
+
     painter.setBackground(Qt::black);
     painter.eraseRect(rect());
 
@@ -248,24 +256,24 @@ void MicroProfileWidget::hideEvent(QHideEvent* ev) {
 }
 
 void MicroProfileWidget::mouseMoveEvent(QMouseEvent* ev) {
-    MicroProfileMousePosition(ev->x(), ev->y(), 0);
+    MicroProfileMousePosition(ev->x() / x_scale, ev->y() / y_scale, 0);
     ev->accept();
 }
 
 void MicroProfileWidget::mousePressEvent(QMouseEvent* ev) {
-    MicroProfileMousePosition(ev->x(), ev->y(), 0);
+    MicroProfileMousePosition(ev->x() / x_scale, ev->y() / y_scale, 0);
     MicroProfileMouseButton(ev->buttons() & Qt::LeftButton, ev->buttons() & Qt::RightButton);
     ev->accept();
 }
 
 void MicroProfileWidget::mouseReleaseEvent(QMouseEvent* ev) {
-    MicroProfileMousePosition(ev->x(), ev->y(), 0);
+    MicroProfileMousePosition(ev->x() / x_scale, ev->y() / y_scale, 0);
     MicroProfileMouseButton(ev->buttons() & Qt::LeftButton, ev->buttons() & Qt::RightButton);
     ev->accept();
 }
 
 void MicroProfileWidget::wheelEvent(QWheelEvent* ev) {
-    MicroProfileMousePosition(ev->x(), ev->y(), ev->delta() / 120);
+    MicroProfileMousePosition(ev->x() / x_scale, ev->y() / y_scale, ev->delta() / 120);
     ev->accept();
 }
 
diff --git a/src/common/swap.h b/src/common/swap.h
index a7c37bc44..1749bd7a4 100644
--- a/src/common/swap.h
+++ b/src/common/swap.h
@@ -25,6 +25,8 @@
     #include <sys/endian.h>
 #endif
 
+#include <cstring>
+
 #include "common/common_types.h"
 
 // GCC 4.6+
@@ -58,9 +60,6 @@
 
 namespace Common {
 
-inline u8 swap8(u8 _data) {return _data;}
-inline u32 swap24(const u8* _data) {return (_data[0] << 16) | (_data[1] << 8) | _data[2];}
-
 #ifdef _MSC_VER
 inline u16 swap16(u16 _data) {return _byteswap_ushort(_data);}
 inline u32 swap32(u32 _data) {return _byteswap_ulong (_data);}
@@ -92,52 +91,29 @@ inline u64 swap64(u64 data) {return ((u64)swap32(data) << 32) | swap32(data >> 3
 #endif
 
 inline float swapf(float f) {
-    union {
-        float f;
-        unsigned int u32;
-    } dat1, dat2;
-
-    dat1.f = f;
-    dat2.u32 = swap32(dat1.u32);
+    static_assert(sizeof(u32) == sizeof(float),
+                  "float must be the same size as uint32_t.");
 
-    return dat2.f;
-}
-
-inline double swapd(double f) {
-    union  {
-        double f;
-        unsigned long long u64;
-    } dat1, dat2;
+    u32 value;
+    std::memcpy(&value, &f, sizeof(u32));
 
-    dat1.f = f;
-    dat2.u64 = swap64(dat1.u64);
+    value = swap32(value);
+    std::memcpy(&f, &value, sizeof(u32));
 
-    return dat2.f;
+    return f;
 }
 
-inline u16 swap16(const u8* _pData) {return swap16(*(const u16*)_pData);}
-inline u32 swap32(const u8* _pData) {return swap32(*(const u32*)_pData);}
-inline u64 swap64(const u8* _pData) {return swap64(*(const u64*)_pData);}
-
-template <int count>
-void swap(u8*);
+inline double swapd(double f) {
+    static_assert(sizeof(u64) == sizeof(double),
+                  "double must be the same size as uint64_t.");
 
-template <>
-inline void swap<1>(u8* data) { }
+    u64 value;
+    std::memcpy(&value, &f, sizeof(u64));
 
-template <>
-inline void swap<2>(u8* data) {
-    *reinterpret_cast<u16*>(data) = swap16(data);
-}
-
-template <>
-inline void swap<4>(u8* data) {
-    *reinterpret_cast<u32*>(data) = swap32(data);
-}
+    value = swap64(value);
+    std::memcpy(&f, &value, sizeof(u64));
 
-template <>
-inline void swap<8>(u8* data) {
-    *reinterpret_cast<u64*>(data) = swap64(data);
+    return f;
 }
 
 }  // Namespace Common
@@ -534,35 +510,35 @@ bool operator==(const S &p, const swap_struct_t<T, F> v) {
 template <typename T>
 struct swap_64_t {
     static T swap(T x) {
-        return (T)Common::swap64(*(u64 *)&x);
+        return static_cast<T>(Common::swap64(x));
     }
 };
 
 template <typename T>
 struct swap_32_t {
     static T swap(T x) {
-        return (T)Common::swap32(*(u32 *)&x);
+        return static_cast<T>(Common::swap32(x));
     }
 };
 
 template <typename T>
 struct swap_16_t {
     static T swap(T x) {
-        return (T)Common::swap16(*(u16 *)&x);
+        return static_cast<T>(Common::swap16(x));
     }
 };
 
 template <typename T>
 struct swap_float_t {
     static T swap(T x) {
-        return (T)Common::swapf(*(float *)&x);
+        return static_cast<T>(Common::swapf(x));
     }
 };
 
 template <typename T>
 struct swap_double_t {
     static T swap(T x) {
-        return (T)Common::swapd(*(double *)&x);
+        return static_cast<T>(Common::swapd(x));
     }
 };
 
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index a8d891689..12080a802 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -42,6 +42,7 @@ set(SRCS
             hle/kernel/timer.cpp
             hle/kernel/vm_manager.cpp
             hle/service/ac_u.cpp
+            hle/service/act_a.cpp
             hle/service/act_u.cpp
             hle/service/am/am.cpp
             hle/service/am/am_app.cpp
@@ -52,6 +53,7 @@ set(SRCS
             hle/service/apt/apt_a.cpp
             hle/service/apt/apt_s.cpp
             hle/service/apt/apt_u.cpp
+            hle/service/apt/bcfnt/bcfnt.cpp
             hle/service/boss/boss.cpp
             hle/service/boss/boss_p.cpp
             hle/service/boss/boss_u.cpp
@@ -175,6 +177,7 @@ set(HEADERS
             hle/kernel/vm_manager.h
             hle/result.h
             hle/service/ac_u.h
+            hle/service/act_a.h
             hle/service/act_u.h
             hle/service/am/am.h
             hle/service/am/am_app.h
@@ -185,6 +188,7 @@ set(HEADERS
             hle/service/apt/apt_a.h
             hle/service/apt/apt_s.h
             hle/service/apt/apt_u.h
+            hle/service/apt/bcfnt/bcfnt.h
             hle/service/boss/boss.h
             hle/service/boss/boss_p.h
             hle/service/boss/boss_u.h
diff --git a/src/core/arm/arm_interface.h b/src/core/arm/arm_interface.h
index 533067d4f..d8abe5aeb 100644
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@@ -6,6 +6,7 @@
 
 #include "common/common_types.h"
 #include "core/arm/skyeye_common/arm_regformat.h"
+#include "core/arm/skyeye_common/vfp/asm_vfp.h"
 
 namespace Core {
     struct ThreadContext;
diff --git a/src/core/arm/dyncom/arm_dyncom.cpp b/src/core/arm/dyncom/arm_dyncom.cpp
index a3581132c..13492a08b 100644
--- a/src/core/arm/dyncom/arm_dyncom.cpp
+++ b/src/core/arm/dyncom/arm_dyncom.cpp
@@ -93,7 +93,7 @@ void ARM_DynCom::ResetContext(Core::ThreadContext& context, u32 stack_top, u32 e
     context.cpu_registers[0] = arg;
     context.pc = entry_point;
     context.sp = stack_top;
-    context.cpsr = 0x1F | ((entry_point & 1) << 5); // Usermode and THUMB mode
+    context.cpsr = USER32MODE | ((entry_point & 1) << 5); // Usermode and THUMB mode
 }
 
 void ARM_DynCom::SaveContext(Core::ThreadContext& ctx) {
diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
index 8d4b26815..cfc67287f 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
@@ -5527,28 +5527,32 @@ unsigned InterpreterMainLoop(ARMul_State* cpu) {
 
             // SMUAD and SMLAD
             if (BIT(op2, 1) == 0) {
-                RD = (product1 + product2);
+                u32 rd_val = (product1 + product2);
 
                 if (inst_cream->Ra != 15) {
-                    RD += cpu->Reg[inst_cream->Ra];
+                    rd_val += cpu->Reg[inst_cream->Ra];
 
                     if (ARMul_AddOverflowQ(product1 + product2, cpu->Reg[inst_cream->Ra]))
                         cpu->Cpsr |= (1 << 27);
                 }
 
+                RD = rd_val;
+
                 if (ARMul_AddOverflowQ(product1, product2))
                     cpu->Cpsr |= (1 << 27);
             }
             // SMUSD and SMLSD
             else {
-                RD = (product1 - product2);
+                u32 rd_val = (product1 - product2);
 
                 if (inst_cream->Ra != 15) {
-                    RD += cpu->Reg[inst_cream->Ra];
+                    rd_val += cpu->Reg[inst_cream->Ra];
 
                     if (ARMul_AddOverflowQ(product1 - product2, cpu->Reg[inst_cream->Ra]))
                         cpu->Cpsr |= (1 << 27);
                 }
+
+                RD = rd_val;
             }
         }
 
diff --git a/src/core/gdbstub/gdbstub.cpp b/src/core/gdbstub/gdbstub.cpp
index 1360ee845..820b19e1a 100644
--- a/src/core/gdbstub/gdbstub.cpp
+++ b/src/core/gdbstub/gdbstub.cpp
@@ -437,7 +437,7 @@ static void HandleSetThread() {
  *
  * @param signal Signal to be sent to client.
  */
-void SendSignal(u32 signal) {
+static void SendSignal(u32 signal) {
     if (gdbserver_socket == -1) {
         return;
     }
@@ -713,7 +713,7 @@ static void Continue() {
  * @param addr Address of breakpoint.
  * @param len Length of breakpoint.
  */
-bool CommitBreakpoint(BreakpointType type, PAddr addr, u32 len) {
+static bool CommitBreakpoint(BreakpointType type, PAddr addr, u32 len) {
     std::map<u32, Breakpoint>& p = GetBreakpointList(type);
 
     Breakpoint breakpoint;
@@ -907,7 +907,7 @@ void ToggleServer(bool status) {
     }
 }
 
-void Init(u16 port) {
+static void Init(u16 port) {
     if (!g_server_enabled) {
         // Set the halt loop to false in case the user enabled the gdbstub mid-execution.
         // This way the CPU can still execute normally.
diff --git a/src/core/hle/applets/applet.h b/src/core/hle/applets/applet.h
index af442f81d..754c6f7db 100644
--- a/src/core/hle/applets/applet.h
+++ b/src/core/hle/applets/applet.h
@@ -65,6 +65,7 @@ protected:
     virtual ResultCode StartImpl(const Service::APT::AppletStartupParameter& parameter) = 0;
 
     Service::APT::AppletId id; ///< Id of this Applet
+    std::shared_ptr<std::vector<u8>> heap_memory; ///< Heap memory for this Applet
 };
 
 /// Returns whether a library applet is currently running
diff --git a/src/core/hle/applets/mii_selector.cpp b/src/core/hle/applets/mii_selector.cpp
index b4456ca90..bf39eca22 100644
--- a/src/core/hle/applets/mii_selector.cpp
+++ b/src/core/hle/applets/mii_selector.cpp
@@ -35,9 +35,14 @@ ResultCode MiiSelector::ReceiveParameter(const Service::APT::MessageParameter& p
     ASSERT(sizeof(capture_info) == parameter.buffer_size);
 
     memcpy(&capture_info, parameter.data, sizeof(capture_info));
+
     using Kernel::MemoryPermission;
-    framebuffer_memory = Kernel::SharedMemory::Create(capture_info.size, MemoryPermission::ReadWrite,
-                                                      MemoryPermission::ReadWrite, "MiiSelector Memory");
+    // Allocate a heap block of the required size for this applet.
+    heap_memory = std::make_shared<std::vector<u8>>(capture_info.size);
+    // Create a SharedMemory that directly points to this heap block.
+    framebuffer_memory = Kernel::SharedMemory::CreateForApplet(heap_memory, 0, heap_memory->size(),
+                                                               MemoryPermission::ReadWrite, MemoryPermission::ReadWrite,
+                                                               "MiiSelector Memory");
 
     // Send the response message with the newly created SharedMemory
     Service::APT::MessageParameter result;
diff --git a/src/core/hle/applets/swkbd.cpp b/src/core/hle/applets/swkbd.cpp
index 87238aa1c..90c6adc65 100644
--- a/src/core/hle/applets/swkbd.cpp
+++ b/src/core/hle/applets/swkbd.cpp
@@ -40,8 +40,12 @@ ResultCode SoftwareKeyboard::ReceiveParameter(Service::APT::MessageParameter con
     memcpy(&capture_info, parameter.data, sizeof(capture_info));
 
     using Kernel::MemoryPermission;
-    framebuffer_memory = Kernel::SharedMemory::Create(capture_info.size, MemoryPermission::ReadWrite,
-                                                      MemoryPermission::ReadWrite, "SoftwareKeyboard Memory");
+    // Allocate a heap block of the required size for this applet.
+    heap_memory = std::make_shared<std::vector<u8>>(capture_info.size);
+    // Create a SharedMemory that directly points to this heap block.
+    framebuffer_memory = Kernel::SharedMemory::CreateForApplet(heap_memory, 0, heap_memory->size(),
+                                                               MemoryPermission::ReadWrite, MemoryPermission::ReadWrite,
+                                                               "SoftwareKeyboard Memory");
 
     // Send the response message with the newly created SharedMemory
     Service::APT::MessageParameter result;
diff --git a/src/core/hle/function_wrappers.h b/src/core/hle/function_wrappers.h
index 4d718b681..bf7f875b6 100644
--- a/src/core/hle/function_wrappers.h
+++ b/src/core/hle/function_wrappers.h
@@ -170,7 +170,8 @@ template<ResultCode func(s64*, u32, s32)> void Wrap() {
 
 template<ResultCode func(u32*, u32, u32, u32, u32)> void Wrap() {
     u32 param_1 = 0;
-    u32 retval = func(&param_1, PARAM(1), PARAM(2), PARAM(3), PARAM(4)).raw;
+    // The last parameter is passed in R0 instead of R4
+    u32 retval = func(&param_1, PARAM(1), PARAM(2), PARAM(3), PARAM(0)).raw;
     Core::g_app_core->SetReg(1, param_1);
     FuncReturn(retval);
 }
diff --git a/src/core/hle/kernel/memory.cpp b/src/core/hle/kernel/memory.cpp
index 862643448..17ae87aef 100644
--- a/src/core/hle/kernel/memory.cpp
+++ b/src/core/hle/kernel/memory.cpp
@@ -55,6 +55,9 @@ void MemoryInit(u32 mem_type) {
         memory_regions[i].size = memory_region_sizes[mem_type][i];
         memory_regions[i].used = 0;
         memory_regions[i].linear_heap_memory = std::make_shared<std::vector<u8>>();
+        // Reserve enough space for this region of FCRAM.
+        // We do not want this block of memory to be relocated when allocating from it.
+        memory_regions[i].linear_heap_memory->reserve(memory_regions[i].size);
 
         base += memory_regions[i].size;
     }
@@ -107,9 +110,7 @@ struct MemoryArea {
 
 // We don't declare the IO regions in here since its handled by other means.
 static MemoryArea memory_areas[] = {
-    {SHARED_MEMORY_VADDR, SHARED_MEMORY_SIZE,     "Shared Memory"}, // Shared memory
     {VRAM_VADDR,          VRAM_SIZE,              "VRAM"},          // Video memory (VRAM)
-    {TLS_AREA_VADDR,      TLS_AREA_SIZE,          "TLS Area"},      // TLS memory
 };
 
 }
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index 0546f6e16..69302cc82 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -209,7 +209,7 @@ ResultVal<VAddr> Process::LinearAllocate(VAddr target, u32 size, VMAPermission p
         return ERR_INVALID_ADDRESS;
     }
 
-    // Expansion of the linear heap is only allowed if you do an allocation immediatelly at its
+    // Expansion of the linear heap is only allowed if you do an allocation immediately at its
     // end. It's possible to free gaps in the middle of the heap and then reallocate them later,
     // but expansions are only allowed at the end.
     if (target == heap_end) {
diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h
index a06afef2b..d781ef32c 100644
--- a/src/core/hle/kernel/process.h
+++ b/src/core/hle/kernel/process.h
@@ -142,8 +142,11 @@ public:
 
     MemoryRegionInfo* memory_region = nullptr;
 
-    /// Bitmask of the used TLS slots
-    std::bitset<300> used_tls_slots;
+    /// The Thread Local Storage area is allocated as processes create threads,
+    /// each TLS area is 0x200 bytes, so one page (0x1000) is split up in 8 parts, and each part
+    /// holds the TLS for a specific thread. This vector contains which parts are in use for each page as a bitmask.
+    /// This vector will grow as more pages are allocated for new threads.
+    std::vector<std::bitset<8>> tls_slots;
 
     VAddr GetLinearHeapAreaAddress() const;
     VAddr GetLinearHeapBase() const;
diff --git a/src/core/hle/kernel/shared_memory.cpp b/src/core/hle/kernel/shared_memory.cpp
index d90f0f00f..6a22c8986 100644
--- a/src/core/hle/kernel/shared_memory.cpp
+++ b/src/core/hle/kernel/shared_memory.cpp
@@ -7,6 +7,7 @@
 #include "common/logging/log.h"
 
 #include "core/memory.h"
+#include "core/hle/kernel/memory.h"
 #include "core/hle/kernel/shared_memory.h"
 
 namespace Kernel {
@@ -14,93 +15,157 @@ namespace Kernel {
 SharedMemory::SharedMemory() {}
 SharedMemory::~SharedMemory() {}
 
-SharedPtr<SharedMemory> SharedMemory::Create(u32 size, MemoryPermission permissions,
-        MemoryPermission other_permissions, std::string name) {
+SharedPtr<SharedMemory> SharedMemory::Create(SharedPtr<Process> owner_process, u32 size, MemoryPermission permissions,
+        MemoryPermission other_permissions, VAddr address, MemoryRegion region, std::string name) {
     SharedPtr<SharedMemory> shared_memory(new SharedMemory);
 
+    shared_memory->owner_process = owner_process;
     shared_memory->name = std::move(name);
-    shared_memory->base_address = 0x0;
-    shared_memory->fixed_address = 0x0;
     shared_memory->size = size;
     shared_memory->permissions = permissions;
     shared_memory->other_permissions = other_permissions;
 
+    if (address == 0) {
+        // We need to allocate a block from the Linear Heap ourselves.
+        // We'll manually allocate some memory from the linear heap in the specified region.
+        MemoryRegionInfo* memory_region = GetMemoryRegion(region);
+        auto& linheap_memory = memory_region->linear_heap_memory;
+
+        ASSERT_MSG(linheap_memory->size() + size <= memory_region->size, "Not enough space in region to allocate shared memory!");
+
+        shared_memory->backing_block = linheap_memory;
+        shared_memory->backing_block_offset = linheap_memory->size();
+        // Allocate some memory from the end of the linear heap for this region.
+        linheap_memory->insert(linheap_memory->end(), size, 0);
+        memory_region->used += size;
+
+        shared_memory->linear_heap_phys_address = Memory::FCRAM_PADDR + memory_region->base + shared_memory->backing_block_offset;
+
+        // Increase the amount of used linear heap memory for the owner process.
+        if (shared_memory->owner_process != nullptr) {
+            shared_memory->owner_process->linear_heap_used += size;
+        }
+
+        // Refresh the address mappings for the current process.
+        if (Kernel::g_current_process != nullptr) {
+            Kernel::g_current_process->vm_manager.RefreshMemoryBlockMappings(linheap_memory.get());
+        }
+    } else {
+        // TODO(Subv): What happens if an application tries to create multiple memory blocks pointing to the same address?
+        auto& vm_manager = shared_memory->owner_process->vm_manager;
+        // The memory is already available and mapped in the owner process.
+        auto vma = vm_manager.FindVMA(address)->second;
+        // Copy it over to our own storage
+        shared_memory->backing_block = std::make_shared<std::vector<u8>>(vma.backing_block->data() + vma.offset,
+                                                                         vma.backing_block->data() + vma.offset + size);
+        shared_memory->backing_block_offset = 0;
+        // Unmap the existing pages
+        vm_manager.UnmapRange(address, size);
+        // Map our own block into the address space
+        vm_manager.MapMemoryBlock(address, shared_memory->backing_block, 0, size, MemoryState::Shared);
+        // Reprotect the block with the new permissions
+        vm_manager.ReprotectRange(address, size, ConvertPermissions(permissions));
+    }
+
+    shared_memory->base_address = address;
     return shared_memory;
 }
 
-ResultCode SharedMemory::Map(VAddr address, MemoryPermission permissions,
-        MemoryPermission other_permissions) {
+SharedPtr<SharedMemory> SharedMemory::CreateForApplet(std::shared_ptr<std::vector<u8>> heap_block, u32 offset, u32 size,
+                                                      MemoryPermission permissions, MemoryPermission other_permissions, std::string name) {
+    SharedPtr<SharedMemory> shared_memory(new SharedMemory);
 
-    if (base_address != 0) {
-        LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s: already mapped at 0x%08X!",
-            GetObjectId(), address, name.c_str(), base_address);
-        // TODO: Verify error code with hardware
-        return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel,
-            ErrorSummary::InvalidArgument, ErrorLevel::Permanent);
-    }
+    shared_memory->owner_process = nullptr;
+    shared_memory->name = std::move(name);
+    shared_memory->size = size;
+    shared_memory->permissions = permissions;
+    shared_memory->other_permissions = other_permissions;
+    shared_memory->backing_block = heap_block;
+    shared_memory->backing_block_offset = offset;
+    shared_memory->base_address = Memory::HEAP_VADDR + offset;
 
-    // TODO(Subv): Return E0E01BEE when permissions and other_permissions don't
-    // match what was specified when the memory block was created.
+    return shared_memory;
+}
 
-    // TODO(Subv): Return E0E01BEE when address should be 0.
-    // Note: Find out when that's the case.
+ResultCode SharedMemory::Map(Process* target_process, VAddr address, MemoryPermission permissions,
+        MemoryPermission other_permissions) {
 
-    if (fixed_address != 0) {
-         if (address != 0 && address != fixed_address) {
-            LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s: fixed_addres is 0x%08X!",
-                    GetObjectId(), address, name.c_str(), fixed_address);
-            // TODO: Verify error code with hardware
-            return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel,
-                ErrorSummary::InvalidArgument, ErrorLevel::Permanent);
-        }
+    MemoryPermission own_other_permissions = target_process == owner_process ? this->permissions : this->other_permissions;
 
-        // HACK(yuriks): This is only here to support the APT shared font mapping right now.
-        // Later, this should actually map the memory block onto the address space.
-        return RESULT_SUCCESS;
+    // Automatically allocated memory blocks can only be mapped with other_permissions = DontCare
+    if (base_address == 0 && other_permissions != MemoryPermission::DontCare) {
+        return ResultCode(ErrorDescription::InvalidCombination, ErrorModule::OS, ErrorSummary::InvalidArgument, ErrorLevel::Usage);
     }
 
-    if (address < Memory::SHARED_MEMORY_VADDR || address + size >= Memory::SHARED_MEMORY_VADDR_END) {
-        LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s outside of shared mem bounds!",
-                GetObjectId(), address, name.c_str());
-        // TODO: Verify error code with hardware
-        return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::Kernel,
-                ErrorSummary::InvalidArgument, ErrorLevel::Permanent);
+    // Error out if the requested permissions don't match what the creator process allows.
+    if (static_cast<u32>(permissions) & ~static_cast<u32>(own_other_permissions)) {
+        LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s, permissions don't match",
+                  GetObjectId(), address, name.c_str());
+        return ResultCode(ErrorDescription::InvalidCombination, ErrorModule::OS, ErrorSummary::InvalidArgument, ErrorLevel::Usage);
     }
 
-    // TODO: Test permissions
+    // Heap-backed memory blocks can not be mapped with other_permissions = DontCare
+    if (base_address != 0 && other_permissions == MemoryPermission::DontCare) {
+        LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s, permissions don't match",
+                  GetObjectId(), address, name.c_str());
+        return ResultCode(ErrorDescription::InvalidCombination, ErrorModule::OS, ErrorSummary::InvalidArgument, ErrorLevel::Usage);
+    }
 
-    // HACK: Since there's no way to write to the memory block without mapping it onto the game
-    // process yet, at least initialize memory the first time it's mapped.
-    if (address != this->base_address) {
-        std::memset(Memory::GetPointer(address), 0, size);
+    // Error out if the provided permissions are not compatible with what the creator process needs.
+    if (other_permissions != MemoryPermission::DontCare &&
+        static_cast<u32>(this->permissions) & ~static_cast<u32>(other_permissions)) {
+        LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s, permissions don't match",
+                  GetObjectId(), address, name.c_str());
+        return ResultCode(ErrorDescription::WrongPermission, ErrorModule::OS, ErrorSummary::WrongArgument, ErrorLevel::Permanent);
     }
 
-    this->base_address = address;
+    // TODO(Subv): Check for the Shared Device Mem flag in the creator process.
+    /*if (was_created_with_shared_device_mem && address != 0) {
+        return ResultCode(ErrorDescription::InvalidCombination, ErrorModule::OS, ErrorSummary::InvalidArgument, ErrorLevel::Usage);
+    }*/
 
-    return RESULT_SUCCESS;
-}
+    // TODO(Subv): The same process that created a SharedMemory object
+    // can not map it in its own address space unless it was created with addr=0, result 0xD900182C.
 
-ResultCode SharedMemory::Unmap(VAddr address) {
-    if (base_address == 0) {
-        // TODO(Subv): Verify what actually happens when you want to unmap a memory block that
-        // was originally mapped with address = 0
-        return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::OS, ErrorSummary::InvalidArgument, ErrorLevel::Usage);
+    if (address != 0) {
+        if (address < Memory::HEAP_VADDR || address + size >= Memory::SHARED_MEMORY_VADDR_END) {
+            LOG_ERROR(Kernel, "cannot map id=%u, address=0x%08X name=%s, invalid address",
+                      GetObjectId(), address, name.c_str());
+            return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::OS,
+                              ErrorSummary::InvalidArgument, ErrorLevel::Usage);
+        }
     }
 
-    if (base_address != address)
-        return ResultCode(ErrorDescription::WrongAddress, ErrorModule::OS, ErrorSummary::InvalidState, ErrorLevel::Usage);
+    VAddr target_address = address;
 
-    base_address = 0;
+    if (base_address == 0 && target_address == 0) {
+        // Calculate the address at which to map the memory block.
+        target_address = Memory::PhysicalToVirtualAddress(linear_heap_phys_address);
+    }
+
+    // Map the memory block into the target process
+    auto result = target_process->vm_manager.MapMemoryBlock(target_address, backing_block, backing_block_offset, size, MemoryState::Shared);
+    if (result.Failed()) {
+        LOG_ERROR(Kernel, "cannot map id=%u, target_address=0x%08X name=%s, error mapping to virtual memory",
+                  GetObjectId(), target_address, name.c_str());
+        return result.Code();
+    }
 
-    return RESULT_SUCCESS;
+    return target_process->vm_manager.ReprotectRange(target_address, size, ConvertPermissions(permissions));
 }
 
-u8* SharedMemory::GetPointer(u32 offset) {
-    if (base_address != 0)
-        return Memory::GetPointer(base_address + offset);
+ResultCode SharedMemory::Unmap(Process* target_process, VAddr address) {
+    // TODO(Subv): Verify what happens if the application tries to unmap an address that is not mapped to a SharedMemory.
+    return target_process->vm_manager.UnmapRange(address, size);
+}
+
+VMAPermission SharedMemory::ConvertPermissions(MemoryPermission permission) {
+    // Keep only the bits covered by MemoryPermission::ReadWriteExecute and hand them back as a VMAPermission.
+    return static_cast<VMAPermission>(static_cast<u32>(permission) & static_cast<u32>(MemoryPermission::ReadWriteExecute));
+}
 
-    LOG_ERROR(Kernel_SVC, "memory block id=%u not mapped!", GetObjectId());
-    return nullptr;
+u8* SharedMemory::GetPointer(u32 offset) {
+    return backing_block->data() + backing_block_offset + offset;
 }
 
 } // namespace
diff --git a/src/core/hle/kernel/shared_memory.h b/src/core/hle/kernel/shared_memory.h
index b51049ad0..0c404a9f8 100644
--- a/src/core/hle/kernel/shared_memory.h
+++ b/src/core/hle/kernel/shared_memory.h
@@ -9,6 +9,7 @@
 #include "common/common_types.h"
 
 #include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/process.h"
 #include "core/hle/result.h"
 
 namespace Kernel {
@@ -29,14 +30,29 @@ enum class MemoryPermission : u32 {
 class SharedMemory final : public Object {
 public:
     /**
-     * Creates a shared memory object
+     * Creates a shared memory object.
+     * @param owner_process Process that created this shared memory object.
      * @param size Size of the memory block. Must be page-aligned.
      * @param permissions Permission restrictions applied to the process which created the block.
      * @param other_permissions Permission restrictions applied to other processes mapping the block.
+     * @param address The address from which to map the Shared Memory.
+     * @param region If the address is 0, the shared memory will be allocated in this region of the linear heap.
      * @param name Optional object name, used for debugging purposes.
      */
-    static SharedPtr<SharedMemory> Create(u32 size, MemoryPermission permissions,
-            MemoryPermission other_permissions, std::string name = "Unknown");
+    static SharedPtr<SharedMemory> Create(SharedPtr<Process> owner_process, u32 size, MemoryPermission permissions,
+            MemoryPermission other_permissions, VAddr address = 0, MemoryRegion region = MemoryRegion::BASE, std::string name = "Unknown");
+
+    /**
+     * Creates a shared memory object from a block of memory managed by an HLE applet.
+     * @param heap_block Heap block of the HLE applet.
+     * @param offset The offset into the heap block that the SharedMemory will map.
+     * @param size Size of the memory block. Must be page-aligned.
+     * @param permissions Permission restrictions applied to the process which created the block.
+     * @param other_permissions Permission restrictions applied to other processes mapping the block.
+     * @param name Optional object name, used for debugging purposes.
+     */
+    static SharedPtr<SharedMemory> CreateForApplet(std::shared_ptr<std::vector<u8>> heap_block, u32 offset, u32 size,
+                                                   MemoryPermission permissions, MemoryPermission other_permissions, std::string name = "Unknown Applet");
 
     std::string GetTypeName() const override { return "SharedMemory"; }
     std::string GetName() const override { return name; }
@@ -45,19 +61,27 @@ public:
     HandleType GetHandleType() const override { return HANDLE_TYPE; }
 
     /**
-     * Maps a shared memory block to an address in system memory
+     * Converts the specified MemoryPermission into the equivalent VMAPermission.
+     * @param permission The MemoryPermission to convert.
+     */
+    static VMAPermission ConvertPermissions(MemoryPermission permission);
+
+    /**
+     * Maps a shared memory block to an address in the target process' address space
+     * @param target_process Process on which to map the memory block.
      * @param address Address in system memory to map shared memory block to
      * @param permissions Memory block map permissions (specified by SVC field)
      * @param other_permissions Memory block map other permissions (specified by SVC field)
      */
-    ResultCode Map(VAddr address, MemoryPermission permissions, MemoryPermission other_permissions);
+    ResultCode Map(Process* target_process, VAddr address, MemoryPermission permissions, MemoryPermission other_permissions);
 
     /**
      * Unmaps a shared memory block from the specified address in system memory
+     * @param target_process Process from which to unmap the memory block.
      * @param address Address in system memory where the shared memory block is mapped
      * @return Result code of the unmap operation
      */
-    ResultCode Unmap(VAddr address);
+    ResultCode Unmap(Process* target_process, VAddr address);
 
     /**
     * Gets a pointer to the shared memory block
@@ -66,10 +90,16 @@ public:
     */
     u8* GetPointer(u32 offset = 0);
 
-    /// Address of shared memory block in the process.
+    /// Process that created this shared memory block.
+    SharedPtr<Process> owner_process;
+    /// Address of shared memory block in the owner process if specified.
     VAddr base_address;
-    /// Fixed address to allow mapping to. Used for blocks created from the linear heap.
-    VAddr fixed_address;
+    /// Physical address of the shared memory block in the linear heap if no address was specified during creation.
+    PAddr linear_heap_phys_address;
+    /// Backing memory for this shared memory block.
+    std::shared_ptr<std::vector<u8>> backing_block;
+    /// Offset into the backing block for this shared memory.
+    u32 backing_block_offset;
     /// Size of the memory block. Page-aligned.
     u32 size;
     /// Permission restrictions applied to the process which created the block.
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index 6dc95d0f1..43def6146 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -117,9 +117,10 @@ void Thread::Stop() {
     }
     wait_objects.clear();
 
-    Kernel::g_current_process->used_tls_slots[tls_index] = false;
-    g_current_process->misc_memory_used -= Memory::TLS_ENTRY_SIZE;
-    g_current_process->memory_region->used -= Memory::TLS_ENTRY_SIZE;
+    // Mark the TLS slot in the thread's page as free.
+    u32 tls_page = (tls_address - Memory::TLS_AREA_VADDR) / Memory::PAGE_SIZE;
+    u32 tls_slot = ((tls_address - Memory::TLS_AREA_VADDR) % Memory::PAGE_SIZE) / Memory::TLS_ENTRY_SIZE;
+    Kernel::g_current_process->tls_slots[tls_page].reset(tls_slot);
 
     HLE::Reschedule(__func__);
 }
@@ -366,6 +367,31 @@ static void DebugThreadQueue() {
     }
 }
 
+/**
+ * Finds a free location for the TLS section of a thread.
+ * @param tls_slots The TLS page array of the thread's owner process.
+ * Returns a tuple of (page, slot, alloc_needed) where:
+ * page: The index of the first allocated TLS page that has free slots.
+ * slot: The index of the first free slot in the indicated page.
+ * alloc_needed: Whether there's a need to allocate a new TLS page (All pages are full).
+ */
+std::tuple<u32, u32, bool> GetFreeThreadLocalSlot(std::vector<std::bitset<8>>& tls_slots) {
+    // Iterate over all the allocated pages, and try to find one where not all slots are used.
+    for (unsigned page = 0; page < tls_slots.size(); ++page) {
+        const auto& page_tls_slots = tls_slots[page];
+        if (!page_tls_slots.all()) {
+            // We found a page with at least one free slot, find which slot it is
+            for (unsigned slot = 0; slot < page_tls_slots.size(); ++slot) {
+                if (!page_tls_slots.test(slot)) {
+                    return std::make_tuple(page, slot, false);
+                }
+            }
+        }
+    }
+
+    return std::make_tuple(0, 0, true);
+}
+
 ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point, s32 priority,
         u32 arg, s32 processor_id, VAddr stack_top) {
     if (priority < THREADPRIO_HIGHEST || priority > THREADPRIO_LOWEST) {
@@ -403,22 +429,50 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
     thread->name = std::move(name);
     thread->callback_handle = wakeup_callback_handle_table.Create(thread).MoveFrom();
     thread->owner_process = g_current_process;
-    thread->tls_index = -1;
     thread->waitsynch_waited = false;
 
     // Find the next available TLS index, and mark it as used
-    auto& used_tls_slots = Kernel::g_current_process->used_tls_slots;
-    for (unsigned int i = 0; i < used_tls_slots.size(); ++i) {
-        if (used_tls_slots[i] == false) {
-            thread->tls_index = i;
-            used_tls_slots[i] = true;
-            break;
+    auto& tls_slots = Kernel::g_current_process->tls_slots;
+    bool needs_allocation = true;
+    u32 available_page; // Which allocated page has free space
+    u32 available_slot; // Which slot within the page is free
+
+    std::tie(available_page, available_slot, needs_allocation) = GetFreeThreadLocalSlot(tls_slots);
+
+    if (needs_allocation) {
+        // There are no already-allocated pages with free slots, let's allocate a new one.
+        // TLS pages are allocated from the BASE region in the linear heap.
+        MemoryRegionInfo* memory_region = GetMemoryRegion(MemoryRegion::BASE);
+        auto& linheap_memory = memory_region->linear_heap_memory;
+
+        if (linheap_memory->size() + Memory::PAGE_SIZE > memory_region->size) {
+            LOG_ERROR(Kernel_SVC, "Not enough space in region to allocate a new TLS page for thread");
+            return ResultCode(ErrorDescription::OutOfMemory, ErrorModule::Kernel, ErrorSummary::OutOfResource, ErrorLevel::Permanent);
         }
+
+        u32 offset = linheap_memory->size();
+
+        // Allocate some memory from the end of the linear heap for this region.
+        linheap_memory->insert(linheap_memory->end(), Memory::PAGE_SIZE, 0);
+        memory_region->used += Memory::PAGE_SIZE;
+        Kernel::g_current_process->linear_heap_used += Memory::PAGE_SIZE;
+
+        tls_slots.emplace_back(0); // The page is completely available at the start
+        available_page = tls_slots.size() - 1;
+        available_slot = 0; // Use the first slot in the new page
+
+        auto& vm_manager = Kernel::g_current_process->vm_manager;
+        vm_manager.RefreshMemoryBlockMappings(linheap_memory.get());
+
+        // Map the page to the current process' address space.
+        // TODO(Subv): Find the correct MemoryState for this region.
+        vm_manager.MapMemoryBlock(Memory::TLS_AREA_VADDR + available_page * Memory::PAGE_SIZE,
+                                  linheap_memory, offset, Memory::PAGE_SIZE, MemoryState::Private);
     }
 
-    ASSERT_MSG(thread->tls_index != -1, "Out of TLS space");
-    g_current_process->misc_memory_used += Memory::TLS_ENTRY_SIZE;
-    g_current_process->memory_region->used += Memory::TLS_ENTRY_SIZE;
+    // Mark the slot as used
+    tls_slots[available_page].set(available_slot);
+    thread->tls_address = Memory::TLS_AREA_VADDR + available_page * Memory::PAGE_SIZE + available_slot * Memory::TLS_ENTRY_SIZE;
 
     // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used
     // to initialize the context
@@ -472,6 +526,8 @@ SharedPtr<Thread> SetupMainThread(u32 entry_point, s32 priority) {
 
     SharedPtr<Thread> thread = thread_res.MoveFrom();
 
+    thread->context.fpscr = FPSCR_DEFAULT_NAN | FPSCR_FLUSH_TO_ZERO | FPSCR_ROUND_TOZERO | FPSCR_IXC; // 0x03C00010
+
     // Run new "main" thread
     SwitchContext(thread.get());
 
@@ -509,10 +565,6 @@ void Thread::SetWaitSynchronizationOutput(s32 output) {
     context.cpu_registers[1] = output;
 }
 
-VAddr Thread::GetTLSAddress() const {
-    return Memory::TLS_AREA_VADDR + tls_index * Memory::TLS_ENTRY_SIZE;
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 void ThreadingInit() {
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index 97ba57fc5..deab5d5a6 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -127,7 +127,7 @@ public:
      * Returns the Thread Local Storage address of the current thread
      * @returns VAddr of the thread's TLS
      */
-    VAddr GetTLSAddress() const;
+    VAddr GetTLSAddress() const { return tls_address; }
 
     Core::ThreadContext context;
 
@@ -144,7 +144,7 @@ public:
 
     s32 processor_id;
 
-    s32 tls_index; ///< Index of the Thread Local Storage of the thread
+    VAddr tls_address; ///< Virtual address of the Thread Local Storage of the thread
 
     bool waitsynch_waited; ///< Set to true if the last svcWaitSynch call caused the thread to wait
 
diff --git a/src/core/hle/result.h b/src/core/hle/result.h
index 3fc1ab4ee..bfb3327ce 100644
--- a/src/core/hle/result.h
+++ b/src/core/hle/result.h
@@ -17,6 +17,7 @@
 /// Detailed description of the error. This listing is likely incomplete.
 enum class ErrorDescription : u32 {
     Success = 0,
+    WrongPermission = 46,
     OS_InvalidBufferDescriptor = 48,
     WrongAddress = 53,
     FS_NotFound = 120,
diff --git a/src/core/hle/service/act_a.cpp b/src/core/hle/service/act_a.cpp
new file mode 100644
index 000000000..3a775fa90
--- /dev/null
+++ b/src/core/hle/service/act_a.cpp
@@ -0,0 +1,26 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/hle/service/act_a.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Namespace ACT_A
+
+namespace ACT_A {
+
+const Interface::FunctionInfo FunctionTable[] = {
+    {0x041300C2, nullptr, "UpdateMiiImage"},
+    {0x041B0142, nullptr, "AgreeEula"},
+    {0x04210042, nullptr, "UploadMii"},
+    {0x04230082, nullptr, "ValidateMailAddress"},
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Interface class
+
+Interface::Interface() {
+    Register(FunctionTable);
+}
+
+} // namespace
diff --git a/src/core/hle/service/act_a.h b/src/core/hle/service/act_a.h
new file mode 100644
index 000000000..765cae644
--- /dev/null
+++ b/src/core/hle/service/act_a.h
@@ -0,0 +1,23 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "core/hle/service/service.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Namespace ACT_A
+
+namespace ACT_A {
+
+class Interface : public Service::Interface {
+public:
+    Interface();
+
+    std::string GetPortName() const override {
+        return "act:a";
+    }
+};
+
+} // namespace
diff --git a/src/core/hle/service/act_u.cpp b/src/core/hle/service/act_u.cpp
index b23d17fba..05de4d002 100644
--- a/src/core/hle/service/act_u.cpp
+++ b/src/core/hle/service/act_u.cpp
@@ -10,7 +10,10 @@
 namespace ACT_U {
 
 const Interface::FunctionInfo FunctionTable[] = {
+    {0x00010084, nullptr, "Initialize"},
+    {0x00020040, nullptr, "GetErrorCode"},
     {0x000600C2, nullptr, "GetAccountDataBlock"},
+    {0x000D0040, nullptr, "GenerateUuid"},
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/hle/service/apt/apt.cpp b/src/core/hle/service/apt/apt.cpp
index 6d72e8188..73fce6079 100644
--- a/src/core/hle/service/apt/apt.cpp
+++ b/src/core/hle/service/apt/apt.cpp
@@ -12,6 +12,7 @@
 #include "core/hle/service/apt/apt_a.h"
 #include "core/hle/service/apt/apt_s.h"
 #include "core/hle/service/apt/apt_u.h"
+#include "core/hle/service/apt/bcfnt/bcfnt.h"
 #include "core/hle/service/fs/archive.h"
 
 #include "core/hle/kernel/event.h"
@@ -22,23 +23,14 @@
 namespace Service {
 namespace APT {
 
-// Address used for shared font (as observed on HW)
-// TODO(bunnei): This is the hard-coded address where we currently dump the shared font from via
-// https://github.com/citra-emu/3dsutils. This is technically a hack, and will not work at any
-// address other than 0x18000000 due to internal pointers in the shared font dump that would need to
-// be relocated. This might be fixed by dumping the shared font @ address 0x00000000 and then
-// correctly mapping it in Citra, however we still do not understand how the mapping is determined.
-static const VAddr SHARED_FONT_VADDR = 0x18000000;
-
 /// Handle to shared memory region designated to for shared system font
 static Kernel::SharedPtr<Kernel::SharedMemory> shared_font_mem;
+static bool shared_font_relocated = false;
 
 static Kernel::SharedPtr<Kernel::Mutex> lock;
 static Kernel::SharedPtr<Kernel::Event> notification_event; ///< APT notification event
 static Kernel::SharedPtr<Kernel::Event> parameter_event; ///< APT parameter event
 
-static std::shared_ptr<std::vector<u8>> shared_font;
-
 static u32 cpu_percent; ///< CPU time available to the running application
 
 /// Parameter data to be returned in the next call to Glance/ReceiveParameter
@@ -74,23 +66,34 @@ void Initialize(Service::Interface* self) {
 void GetSharedFont(Service::Interface* self) {
     u32* cmd_buff = Kernel::GetCommandBuffer();
 
-    if (shared_font != nullptr) {
-        // TODO(yuriks): This is a hack to keep this working right now even with our completely
-        // broken shared memory system.
-        shared_font_mem->fixed_address = SHARED_FONT_VADDR;
-        Kernel::g_current_process->vm_manager.MapMemoryBlock(shared_font_mem->fixed_address,
-                shared_font, 0, shared_font_mem->size, Kernel::MemoryState::Shared);
-
-        cmd_buff[0] = IPC::MakeHeader(0x44, 2, 2);
-        cmd_buff[1] = RESULT_SUCCESS.raw; // No error
-        cmd_buff[2] = SHARED_FONT_VADDR;
-        cmd_buff[3] = IPC::MoveHandleDesc();
-        cmd_buff[4] = Kernel::g_handle_table.Create(shared_font_mem).MoveFrom();
-    } else {
+    // Init() leaves shared_font_mem null when the shared font file could not be opened,
+    // so report an error to the caller instead of dereferencing a null pointer below.
+    if (shared_font_mem == nullptr) {
         cmd_buff[0] = IPC::MakeHeader(0x44, 1, 0);
         cmd_buff[1] = -1; // Generic error (not really possible to verify this on hardware)
         LOG_ERROR(Kernel_SVC, "called, but %s has not been loaded!", SHARED_FONT);
+        return;
     }
+
+    // The shared font has to be relocated to the new address before being passed to the application.
+    VAddr target_address = Memory::PhysicalToVirtualAddress(shared_font_mem->linear_heap_phys_address);
+    // The shared font dumped by 3dsutils (https://github.com/citra-emu/3dsutils) uses this address as base,
+    // so we relocate it from there to our real address.
+    // TODO(Subv): This address is wrong if the shared font is dumped from a n3DS,
+    // we need a way to automatically calculate the original address of the font from the file.
+    static const VAddr SHARED_FONT_VADDR = 0x18000000;
+    if (!shared_font_relocated) {
+        BCFNT::RelocateSharedFont(shared_font_mem, SHARED_FONT_VADDR, target_address);
+        shared_font_relocated = true;
+    }
+    cmd_buff[0] = IPC::MakeHeader(0x44, 2, 2);
+    cmd_buff[1] = RESULT_SUCCESS.raw; // No error
+    // Since the SharedMemory interface doesn't provide the address at which the memory was allocated,
+    // the real APT service calculates this address by scanning the entire address space (using svcQueryMemory)
+    // and searches for an allocation of the same size as the Shared Font.
+    cmd_buff[2] = target_address;
+    cmd_buff[3] = IPC::MoveHandleDesc();
+    cmd_buff[4] = Kernel::g_handle_table.Create(shared_font_mem).MoveFrom();
 }
 
 void NotifyToWait(Service::Interface* self) {
@@ -433,14 +427,12 @@ void Init() {
     FileUtil::IOFile file(filepath, "rb");
 
     if (file.IsOpen()) {
-        // Read shared font data
-        shared_font = std::make_shared<std::vector<u8>>((size_t)file.GetSize());
-        file.ReadBytes(shared_font->data(), shared_font->size());
-
         // Create shared font memory object
         using Kernel::MemoryPermission;
-        shared_font_mem = Kernel::SharedMemory::Create(3 * 1024 * 1024, // 3MB
-                MemoryPermission::ReadWrite, MemoryPermission::Read, "APT_U:shared_font_mem");
+        shared_font_mem = Kernel::SharedMemory::Create(nullptr, 0x332000, // 3272 KB
+                MemoryPermission::ReadWrite, MemoryPermission::Read, 0, Kernel::MemoryRegion::SYSTEM, "APT:SharedFont");
+        // Read shared font data
+        file.ReadBytes(shared_font_mem->GetPointer(), file.GetSize());
     } else {
         LOG_WARNING(Service_APT, "Unable to load shared font: %s", filepath.c_str());
         shared_font_mem = nullptr;
@@ -459,8 +451,8 @@ void Init() {
 }
 
 void Shutdown() {
-    shared_font = nullptr;
     shared_font_mem = nullptr;
+    shared_font_relocated = false;
     lock = nullptr;
     notification_event = nullptr;
     parameter_event = nullptr;
diff --git a/src/core/hle/service/apt/bcfnt/bcfnt.cpp b/src/core/hle/service/apt/bcfnt/bcfnt.cpp
new file mode 100644
index 000000000..b0d39d4a5
--- /dev/null
+++ b/src/core/hle/service/apt/bcfnt/bcfnt.cpp
@@ -0,0 +1,71 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/hle/service/apt/bcfnt/bcfnt.h"
+#include "core/hle/service/service.h"
+
+namespace Service {
+namespace APT {
+namespace BCFNT {
+
+void RelocateSharedFont(Kernel::SharedPtr<Kernel::SharedMemory> shared_font, VAddr previous_address, VAddr new_address) {
+    static const u32 SharedFontStartOffset = 0x80;
+    u8* data = shared_font->GetPointer(SharedFontStartOffset);
+
+    CFNT cfnt;
+    memcpy(&cfnt, data, sizeof(cfnt));
+
+    // Advance past the header
+    data = shared_font->GetPointer(SharedFontStartOffset + cfnt.header_size);
+
+    for (unsigned block = 0; block < cfnt.num_blocks; ++block) {
+
+        u32 section_size = 0;
+        if (memcmp(data, "FINF", 4) == 0) {
+            BCFNT::FINF finf;
+            memcpy(&finf, data, sizeof(finf));
+            section_size = finf.section_size;
+
+            // Relocate the offsets in the FINF section
+            finf.cmap_offset += new_address - previous_address;
+            finf.cwdh_offset += new_address - previous_address;
+            finf.tglp_offset += new_address - previous_address;
+
+            memcpy(data, &finf, sizeof(finf));
+        } else if (memcmp(data, "CMAP", 4) == 0) {
+            BCFNT::CMAP cmap;
+            memcpy(&cmap, data, sizeof(cmap));
+            section_size = cmap.section_size;
+
+            // Relocate the offsets in the CMAP section
+            cmap.next_cmap_offset += new_address - previous_address;
+
+            memcpy(data, &cmap, sizeof(cmap));
+        } else if (memcmp(data, "CWDH", 4) == 0) {
+            BCFNT::CWDH cwdh;
+            memcpy(&cwdh, data, sizeof(cwdh));
+            section_size = cwdh.section_size;
+
+            // Relocate the offsets in the CWDH section
+            cwdh.next_cwdh_offset += new_address - previous_address;
+
+            memcpy(data, &cwdh, sizeof(cwdh));
+        } else if (memcmp(data, "TGLP", 4) == 0) {
+            BCFNT::TGLP tglp;
+            memcpy(&tglp, data, sizeof(tglp));
+            section_size = tglp.section_size;
+
+            // Relocate the offsets in the TGLP section
+            tglp.sheet_data_offset += new_address - previous_address;
+
+            memcpy(data, &tglp, sizeof(tglp));
+        }
+
+        data += section_size;
+    }
+}
+
+} // namespace BCFNT
+} // namespace APT
+} // namespace Service
+\ No newline at end of file
diff --git a/src/core/hle/service/apt/bcfnt/bcfnt.h b/src/core/hle/service/apt/bcfnt/bcfnt.h
new file mode 100644
index 000000000..388c6bea0
--- /dev/null
+++ b/src/core/hle/service/apt/bcfnt/bcfnt.h
@@ -0,0 +1,87 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/swap.h"
+
+#include "core/hle/kernel/shared_memory.h"
+#include "core/hle/service/service.h"
+
+namespace Service {
+namespace APT {
+namespace BCFNT { ///< BCFNT Shared Font file structures
+
+struct CFNT {
+    u8 magic[4];
+    u16_le endianness;
+    u16_le header_size;
+    u32_le version;
+    u32_le file_size;
+    u32_le num_blocks;
+};
+
+struct FINF {
+    u8 magic[4];
+    u32_le section_size;
+    u8 font_type;
+    u8 line_feed;
+    u16_le alter_char_index;
+    u8 default_width[3];
+    u8 encoding;
+    u32_le tglp_offset;
+    u32_le cwdh_offset;
+    u32_le cmap_offset;
+    u8 height;
+    u8 width;
+    u8 ascent;
+    u8 reserved;
+};
+
+struct TGLP {
+    u8 magic[4];
+    u32_le section_size;
+    u8 cell_width;
+    u8 cell_height;
+    u8 baseline_position;
+    u8 max_character_width;
+    u32_le sheet_size;
+    u16_le num_sheets;
+    u16_le sheet_image_format;
+    u16_le num_columns;
+    u16_le num_rows;
+    u16_le sheet_width;
+    u16_le sheet_height;
+    u32_le sheet_data_offset;
+};
+
+struct CMAP {
+    u8 magic[4];
+    u32_le section_size;
+    u16_le code_begin;
+    u16_le code_end;
+    u16_le mapping_method;
+    u16_le reserved;
+    u32_le next_cmap_offset;
+};
+
+struct CWDH {
+    u8 magic[4];
+    u32_le section_size;
+    u16_le start_index;
+    u16_le end_index;
+    u32_le next_cwdh_offset;
+};
+
+/**
+ * Relocates the internal addresses of the BCFNT Shared Font to the new base.
+ * @param shared_font SharedMemory object that contains the Shared Font
+ * @param previous_address Previous address at which the offsets in the structure were based.
+ * @param new_address New base for the offsets in the structure.
+ */
+void RelocateSharedFont(Kernel::SharedPtr<Kernel::SharedMemory> shared_font, VAddr previous_address, VAddr new_address);
+
+} // namespace BCFNT
+} // namespace APT
+} // namespace Service
diff --git a/src/core/hle/service/csnd_snd.cpp b/src/core/hle/service/csnd_snd.cpp
index 6318bf2a7..d2bb8941c 100644
--- a/src/core/hle/service/csnd_snd.cpp
+++ b/src/core/hle/service/csnd_snd.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include <cstring>
+#include "common/alignment.h"
 #include "core/hle/hle.h"
 #include "core/hle/kernel/mutex.h"
 #include "core/hle/kernel/shared_memory.h"
@@ -41,14 +42,16 @@ static Kernel::SharedPtr<Kernel::Mutex> mutex = nullptr;
 void Initialize(Service::Interface* self) {
     u32* cmd_buff = Kernel::GetCommandBuffer();
 
-    shared_memory = Kernel::SharedMemory::Create(cmd_buff[1],
-            Kernel::MemoryPermission::ReadWrite,
-            Kernel::MemoryPermission::ReadWrite, "CSNDSharedMem");
+    u32 size = Common::AlignUp(cmd_buff[1], Memory::PAGE_SIZE);
+    using Kernel::MemoryPermission;
+    shared_memory = Kernel::SharedMemory::Create(nullptr, size,
+                                                 MemoryPermission::ReadWrite, MemoryPermission::ReadWrite,
+                                                 0, Kernel::MemoryRegion::BASE, "CSND:SharedMemory");
 
     mutex = Kernel::Mutex::Create(false);
 
-    cmd_buff[1] = 0;
-    cmd_buff[2] = 0x4000000;
+    cmd_buff[1] = RESULT_SUCCESS.raw;
+    cmd_buff[2] = IPC::MoveHandleDesc(2);
     cmd_buff[3] = Kernel::g_handle_table.Create(mutex).MoveFrom();
     cmd_buff[4] = Kernel::g_handle_table.Create(shared_memory).MoveFrom();
 }
diff --git a/src/core/hle/service/dsp_dsp.cpp b/src/core/hle/service/dsp_dsp.cpp
index 274fc751a..10730d7ac 100644
--- a/src/core/hle/service/dsp_dsp.cpp
+++ b/src/core/hle/service/dsp_dsp.cpp
@@ -440,9 +440,9 @@ static void GetHeadphoneStatus(Service::Interface* self) {
 
     cmd_buff[0] = IPC::MakeHeader(0x1F, 2, 0);
     cmd_buff[1] = RESULT_SUCCESS.raw; // No error
-    cmd_buff[2] = 0; // Not using headphones?
+    cmd_buff[2] = 0; // Not using headphones
 
-    LOG_WARNING(Service_DSP, "(STUBBED) called");
+    LOG_DEBUG(Service_DSP, "called");
 }
 
 /**
diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp
index b4c146e08..8ded9b09b 100644
--- a/src/core/hle/service/gsp_gpu.cpp
+++ b/src/core/hle/service/gsp_gpu.cpp
@@ -335,8 +335,9 @@ static void RegisterInterruptRelayQueue(Service::Interface* self) {
     g_interrupt_event->name = "GSP_GPU::interrupt_event";
 
     using Kernel::MemoryPermission;
-    g_shared_memory = Kernel::SharedMemory::Create(0x1000, MemoryPermission::ReadWrite,
-        MemoryPermission::ReadWrite, "GSPSharedMem");
+    g_shared_memory = Kernel::SharedMemory::Create(nullptr, 0x1000,
+                                                   MemoryPermission::ReadWrite, MemoryPermission::ReadWrite,
+                                                   0, Kernel::MemoryRegion::BASE, "GSP:SharedMemory");
 
     Handle shmem_handle = Kernel::g_handle_table.Create(g_shared_memory).MoveFrom();
 
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index 1053d0f40..d216cecb4 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -280,8 +280,9 @@ void Init() {
     AddService(new HID_SPVR_Interface);
 
     using Kernel::MemoryPermission;
-    shared_mem = SharedMemory::Create(0x1000, MemoryPermission::ReadWrite,
-            MemoryPermission::Read, "HID:SharedMem");
+    shared_mem = SharedMemory::Create(nullptr, 0x1000,
+                                      MemoryPermission::ReadWrite, MemoryPermission::Read,
+                                      0, Kernel::MemoryRegion::BASE, "HID:SharedMemory");
 
     next_pad_index = 0;
     next_touch_index = 0;
diff --git a/src/core/hle/service/ir/ir.cpp b/src/core/hle/service/ir/ir.cpp
index 505c441c6..079a87e48 100644
--- a/src/core/hle/service/ir/ir.cpp
+++ b/src/core/hle/service/ir/ir.cpp
@@ -94,8 +94,9 @@ void Init() {
     AddService(new IR_User_Interface);
 
     using Kernel::MemoryPermission;
-    shared_memory = SharedMemory::Create(0x1000, Kernel::MemoryPermission::ReadWrite,
-                                         Kernel::MemoryPermission::ReadWrite, "IR:SharedMemory");
+    shared_memory = SharedMemory::Create(nullptr, 0x1000,
+                                         Kernel::MemoryPermission::ReadWrite, Kernel::MemoryPermission::ReadWrite,
+                                         0, Kernel::MemoryRegion::BASE, "IR:SharedMemory");
     transfer_shared_memory = nullptr;
 
     // Create event handle(s)
diff --git a/src/core/hle/service/service.cpp b/src/core/hle/service/service.cpp
index 0fe3a4d7a..d7e7d4fe3 100644
--- a/src/core/hle/service/service.cpp
+++ b/src/core/hle/service/service.cpp
@@ -7,6 +7,7 @@
 
 #include "core/hle/service/service.h"
 #include "core/hle/service/ac_u.h"
+#include "core/hle/service/act_a.h"
 #include "core/hle/service/act_u.h"
 #include "core/hle/service/csnd_snd.h"
 #include "core/hle/service/dlp_srvr.h"
@@ -119,6 +120,7 @@ void Init() {
     Service::PTM::Init();
 
     AddService(new AC_U::Interface);
+    AddService(new ACT_A::Interface);
     AddService(new ACT_U::Interface);
     AddService(new CSND_SND::Interface);
     AddService(new DLP_SRVR::Interface);
diff --git a/src/core/hle/svc.cpp b/src/core/hle/svc.cpp
index 60c8747f3..0ce72de87 100644
--- a/src/core/hle/svc.cpp
+++ b/src/core/hle/svc.cpp
@@ -6,6 +6,7 @@
 
 #include "common/logging/log.h"
 #include "common/microprofile.h"
+#include "common/scope_exit.h"
 #include "common/string_util.h"
 #include "common/symbols.h"
 
@@ -99,6 +100,7 @@ static ResultCode ControlMemory(u32* out_addr, u32 operation, u32 addr0, u32 add
     switch (operation & MEMOP_OPERATION_MASK) {
     case MEMOP_FREE:
     {
+        // TODO(Subv): What happens if an application tries to FREE a block of memory that has a SharedMemory pointing to it?
         if (addr0 >= Memory::HEAP_VADDR && addr0 < Memory::HEAP_VADDR_END) {
             ResultCode result = process.HeapFree(addr0, size);
             if (result.IsError()) return result;
@@ -160,8 +162,6 @@ static ResultCode MapMemoryBlock(Handle handle, u32 addr, u32 permissions, u32 o
     LOG_TRACE(Kernel_SVC, "called memblock=0x%08X, addr=0x%08X, mypermissions=0x%08X, otherpermission=%d",
         handle, addr, permissions, other_permissions);
 
-    // TODO(Subv): The same process that created a SharedMemory object can not map it in its own address space
-
     SharedPtr<SharedMemory> shared_memory = Kernel::g_handle_table.Get<SharedMemory>(handle);
     if (shared_memory == nullptr)
         return ERR_INVALID_HANDLE;
@@ -176,7 +176,7 @@ static ResultCode MapMemoryBlock(Handle handle, u32 addr, u32 permissions, u32 o
     case MemoryPermission::WriteExecute:
     case MemoryPermission::ReadWriteExecute:
     case MemoryPermission::DontCare:
-        return shared_memory->Map(addr, permissions_type,
+        return shared_memory->Map(Kernel::g_current_process.get(), addr, permissions_type,
                 static_cast<MemoryPermission>(other_permissions));
     default:
         LOG_ERROR(Kernel_SVC, "unknown permissions=0x%08X", permissions);
@@ -196,7 +196,7 @@ static ResultCode UnmapMemoryBlock(Handle handle, u32 addr) {
     if (shared_memory == nullptr)
         return ERR_INVALID_HANDLE;
 
-    return shared_memory->Unmap(addr);
+    return shared_memory->Unmap(Kernel::g_current_process.get(), addr);
 }
 
 /// Connect to an OS service given the port name, returns the handle to the port to out
@@ -327,9 +327,9 @@ static ResultCode WaitSynchronizationN(s32* out, Handle* handles, s32 handle_cou
         }
     }
 
-    HLE::Reschedule(__func__);
+    SCOPE_EXIT({HLE::Reschedule("WaitSynchronizationN");}); // Reschedule after putting the threads to sleep.
 
-    // If thread should wait, then set its state to waiting and then reschedule...
+    // If thread should wait, then set its state to waiting
     if (wait_thread) {
 
         // Actually wait the current thread on each object if we decided to wait...
@@ -503,6 +503,9 @@ static ResultCode CreateThread(Handle* out_handle, s32 priority, u32 entry_point
 
     CASCADE_RESULT(SharedPtr<Thread> thread, Kernel::Thread::Create(
             name, entry_point, priority, arg, processor_id, stack_top));
+
+    thread->context.fpscr = FPSCR_DEFAULT_NAN | FPSCR_FLUSH_TO_ZERO | FPSCR_ROUND_TOZERO; // 0x03C00000
+
     CASCADE_RESULT(*out_handle, Kernel::g_handle_table.Create(std::move(thread)));
 
     LOG_TRACE(Kernel_SVC, "called entrypoint=0x%08X (%s), arg=0x%08X, stacktop=0x%08X, "
@@ -790,18 +793,44 @@ static ResultCode CreateMemoryBlock(Handle* out_handle, u32 addr, u32 size, u32
     if (size % Memory::PAGE_SIZE != 0)
         return ResultCode(ErrorDescription::MisalignedSize, ErrorModule::OS, ErrorSummary::InvalidArgument, ErrorLevel::Usage);
 
-    // TODO(Subv): Return E0A01BF5 if the address is not in the application's heap
-
-    // TODO(Subv): Implement this function properly
+    SharedPtr<SharedMemory> shared_memory = nullptr;
 
     using Kernel::MemoryPermission;
-    SharedPtr<SharedMemory> shared_memory = SharedMemory::Create(size,
-            (MemoryPermission)my_permission, (MemoryPermission)other_permission);
-    // Map the SharedMemory to the specified address
-    shared_memory->base_address = addr;
+    auto VerifyPermissions = [](MemoryPermission permission) {
+        // SharedMemory blocks can not be created with Execute permissions
+        switch (permission) {
+        case MemoryPermission::None:
+        case MemoryPermission::Read:
+        case MemoryPermission::Write:
+        case MemoryPermission::ReadWrite:
+        case MemoryPermission::DontCare:
+            return true;
+        default:
+            return false;
+        }
+    };
+
+    if (!VerifyPermissions(static_cast<MemoryPermission>(my_permission)) ||
+        !VerifyPermissions(static_cast<MemoryPermission>(other_permission)))
+        return ResultCode(ErrorDescription::InvalidCombination, ErrorModule::OS,
+                          ErrorSummary::InvalidArgument, ErrorLevel::Usage);
+
+    // addr == 0 is a valid request (its handling follows below), so only range-check explicit addresses.
+    if (addr != 0 && (addr < Memory::PROCESS_IMAGE_VADDR || addr + size > Memory::SHARED_MEMORY_VADDR_END))
+        return ResultCode(ErrorDescription::InvalidAddress, ErrorModule::OS, ErrorSummary::InvalidArgument, ErrorLevel::Usage);
+
+    // When trying to create a memory block with address = 0,
+    // if the process has the Shared Device Memory flag in the exheader,
+    // then we have to allocate from the same region as the caller process instead of the BASE region.
+    Kernel::MemoryRegion region = Kernel::MemoryRegion::BASE;
+    if (addr == 0 && Kernel::g_current_process->flags.shared_device_mem)
+        region = Kernel::g_current_process->flags.memory_region;
+
+    shared_memory = SharedMemory::Create(Kernel::g_current_process, size,
+                                static_cast<MemoryPermission>(my_permission), static_cast<MemoryPermission>(other_permission), addr, region);
     CASCADE_RESULT(*out_handle, Kernel::g_handle_table.Create(std::move(shared_memory)));
 
-    LOG_WARNING(Kernel_SVC, "(STUBBED) called addr=0x%08X", addr);
+    LOG_WARNING(Kernel_SVC, "called addr=0x%08X", addr);
     return RESULT_SUCCESS;
 }
 
diff --git a/src/core/memory.h b/src/core/memory.h
index 9caa3c3f5..126d60471 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -100,15 +100,9 @@ enum : VAddr {
     SHARED_PAGE_SIZE      = 0x00001000,
     SHARED_PAGE_VADDR_END = SHARED_PAGE_VADDR + SHARED_PAGE_SIZE,
 
-    // TODO(yuriks): The size of this area is dynamic, the kernel grows
-    // it as more and more threads are created. For now we'll just use a
-    // hardcoded value.
     /// Area where TLS (Thread-Local Storage) buffers are allocated.
     TLS_AREA_VADDR     = 0x1FF82000,
     TLS_ENTRY_SIZE     = 0x200,
-    TLS_AREA_SIZE      = 300 * TLS_ENTRY_SIZE + 0x800, // Space for up to 300 threads + round to page size
-    TLS_AREA_VADDR_END = TLS_AREA_VADDR + TLS_AREA_SIZE,
-
 
     /// Equivalent to LINEAR_HEAP_VADDR, but expanded to cover the extra memory in the New 3DS.
     NEW_LINEAR_HEAP_VADDR     = 0x30000000,
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
new file mode 100644
index 000000000..457c55571
--- /dev/null
+++ b/src/tests/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(SRCS
+            tests.cpp
+            )
+
+set(HEADERS
+            )
+
+create_directory_groups(${SRCS} ${HEADERS})
+
+include_directories(../../externals/catch/single_include/)
+
+add_executable(tests ${SRCS} ${HEADERS})
+target_link_libraries(tests core video_core audio_core common)
+target_link_libraries(tests ${PLATFORM_LIBRARIES})
+
+add_test(NAME tests COMMAND $<TARGET_FILE:tests>)
diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp
new file mode 100644
index 000000000..73978676f
--- /dev/null
+++ b/src/tests/tests.cpp
@@ -0,0 +1,9 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+// Catch provides the main function since we've given it the
+// CATCH_CONFIG_MAIN preprocessor directive.
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 2bc747102..db99ce666 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -75,8 +75,6 @@ static void InitScreenCoordinates(OutputVertex& vtx)
     viewport.halfsize_y = float24::FromRaw(regs.viewport_size_y);
     viewport.offset_x   = float24::FromFloat32(static_cast<float>(regs.viewport_corner.x));
     viewport.offset_y   = float24::FromFloat32(static_cast<float>(regs.viewport_corner.y));
-    viewport.zscale     = float24::FromRaw(regs.viewport_depth_range);
-    viewport.offset_z   = float24::FromRaw(regs.viewport_depth_far_plane);
 
     float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w;
     vtx.color *= inv_w;
@@ -89,7 +87,7 @@ static void InitScreenCoordinates(OutputVertex& vtx)
 
     vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
     vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
-    vtx.screenpos[2] = viewport.offset_z + vtx.pos.z * inv_w * viewport.zscale;
+    vtx.screenpos[2] = vtx.pos.z * inv_w;
 }
 
 void ProcessTriangle(const OutputVertex &v0, const OutputVertex &v1, const OutputVertex &v2) {
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 941c5af9f..bf4664f9e 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -128,7 +128,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                 // TODO: Verify that this actually modifies the register!
                 if (setup.index < 15) {
-                    g_state.vs.default_attributes[setup.index] = attribute;
+                    g_state.vs_default_attributes[setup.index] = attribute;
                     setup.index++;
                 } else {
                     // Put each attribute into an immediate input buffer.
@@ -144,12 +144,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                         immediate_attribute_id = 0;
 
                         Shader::UnitState<false> shader_unit;
-                        Shader::Setup();
+                        g_state.vs.Setup();
 
                         // Send to vertex shader
                         if (g_debug_context)
                             g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast<void*>(&immediate_input));
-                        Shader::OutputVertex output = Shader::Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1);
+                        Shader::OutputVertex output = g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1);
 
                         // Send to renderer
                         using Pica::Shader::OutputVertex;
@@ -236,7 +236,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
             vertex_cache_ids.fill(-1);
 
             Shader::UnitState<false> shader_unit;
-            Shader::Setup();
+            g_state.vs.Setup();
 
             for (unsigned int index = 0; index < regs.num_vertices; ++index)
             {
@@ -273,7 +273,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                     // Send to vertex shader
                     if (g_debug_context)
                         g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input);
-                    output = Shader::Run(shader_unit, input, loader.GetNumTotalAttributes());
+                    output = g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes());
 
                     if (is_indexed) {
                         vertex_cache[vertex_cache_pos] = output;
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 2f645b441..871368323 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -696,106 +696,125 @@ finalise:
 #endif
 }
 
-void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages)
-{
+static std::string ReplacePattern(const std::string& input, const std::string& pattern, const std::string& replacement) {
+    // Substitutes only the first occurrence of `pattern`; `input` is returned unchanged when there is none.
+    std::string result = input;
+    const size_t match_pos = result.find(pattern);
+    if (match_pos != std::string::npos)
+        result.replace(match_pos, pattern.length(), replacement);
+
+    return result;
+}
+
+static std::string GetTevStageConfigSourceString(const Pica::Regs::TevStageConfig::Source& source) {
     using Source = Pica::Regs::TevStageConfig::Source;
+    static const std::map<Source, std::string> source_names = {
+        { Source::PrimaryColor,           "PrimaryColor" },
+        { Source::PrimaryFragmentColor,   "PrimaryFragmentColor" },
+        { Source::SecondaryFragmentColor, "SecondaryFragmentColor" },
+        { Source::Texture0,               "Texture0" },
+        { Source::Texture1,               "Texture1" },
+        { Source::Texture2,               "Texture2" },
+        { Source::Texture3,               "Texture3" },
+        { Source::PreviousBuffer,         "PreviousBuffer" },
+        { Source::Constant,               "Constant" },
+        { Source::Previous,               "Previous" },
+    };
+
+    // Sources missing from the table are reported as "Unknown".
+    const auto entry = source_names.find(source);
+    if (entry != source_names.end())
+        return entry->second;
+    return "Unknown";
+}
+
+static std::string GetTevStageConfigColorSourceString(const Pica::Regs::TevStageConfig::Source& source, const Pica::Regs::TevStageConfig::ColorModifier modifier) {
     using ColorModifier = Pica::Regs::TevStageConfig::ColorModifier;
+    static const std::map<ColorModifier, std::string> color_swizzles = {
+        { ColorModifier::SourceColor,         "%source.rgb" },
+        { ColorModifier::OneMinusSourceColor, "(1.0 - %source.rgb)" },
+        { ColorModifier::SourceAlpha,         "%source.aaa" },
+        { ColorModifier::OneMinusSourceAlpha, "(1.0 - %source.aaa)" },
+        { ColorModifier::SourceRed,           "%source.rrr" },
+        { ColorModifier::OneMinusSourceRed,   "(1.0 - %source.rrr)" },
+        { ColorModifier::SourceGreen,         "%source.ggg" },
+        { ColorModifier::OneMinusSourceGreen, "(1.0 - %source.ggg)" },
+        { ColorModifier::SourceBlue,          "%source.bbb" },
+        { ColorModifier::OneMinusSourceBlue,  "(1.0 - %source.bbb)" },
+    };
+
+    // Pick the template for this modifier, falling back to a placeholder for modifiers not in the table.
+    const auto entry = color_swizzles.find(modifier);
+    const std::string modifier_template = (entry != color_swizzles.end()) ? entry->second : "%source.????";
+
+    // Substitute the source's name for the "%source" placeholder.
+    const std::string source_name = GetTevStageConfigSourceString(source);
+    return ReplacePattern(modifier_template, "%source", source_name);
+}
+
+static std::string GetTevStageConfigAlphaSourceString(const Pica::Regs::TevStageConfig::Source& source, const Pica::Regs::TevStageConfig::AlphaModifier modifier) {
     using AlphaModifier = Pica::Regs::TevStageConfig::AlphaModifier;
+    static const std::map<AlphaModifier, std::string> alpha_swizzles = {
+        { AlphaModifier::SourceAlpha,         "%source.a" },
+        { AlphaModifier::OneMinusSourceAlpha, "(1.0 - %source.a)" },
+        { AlphaModifier::SourceRed,           "%source.r" },
+        { AlphaModifier::OneMinusSourceRed,   "(1.0 - %source.r)" },
+        { AlphaModifier::SourceGreen,         "%source.g" },
+        { AlphaModifier::OneMinusSourceGreen, "(1.0 - %source.g)" },
+        { AlphaModifier::SourceBlue,          "%source.b" },
+        { AlphaModifier::OneMinusSourceBlue,  "(1.0 - %source.b)" },
+    };
+
+    // Pick the template for this modifier, falling back to a placeholder for modifiers not in the table.
+    const auto entry = alpha_swizzles.find(modifier);
+    const std::string modifier_template = (entry != alpha_swizzles.end()) ? entry->second : "%source.????";
+
+    // Substitute the source's name for the "%source" placeholder.
+    const std::string source_name = GetTevStageConfigSourceString(source);
+    return ReplacePattern(modifier_template, "%source", source_name);
+}
+
+static std::string GetTevStageConfigOperationString(const Pica::Regs::TevStageConfig::Operation& operation) {
     using Operation = Pica::Regs::TevStageConfig::Operation;
+    static const std::map<Operation, std::string> combiner_map = {
+        { Operation::Replace,         "%source1" },
+        { Operation::Modulate,        "(%source1 * %source2)" },
+        { Operation::Add,             "(%source1 + %source2)" },
+        { Operation::AddSigned,       "(%source1 + %source2) - 0.5" },
+        { Operation::Lerp,            "lerp(%source1, %source2, %source3)" },
+        { Operation::Subtract,        "(%source1 - %source2)" },
+        { Operation::Dot3_RGB,        "dot(%source1, %source2)" },
+        { Operation::MultiplyThenAdd, "((%source1 * %source2) + %source3)" },
+        { Operation::AddThenMultiply, "((%source1 + %source2) * %source3)" },
+    };
 
-    std::string stage_info = "Tev setup:\n";
-    for (size_t index = 0; index < stages.size(); ++index) {
-        const auto& tev_stage = stages[index];
+    const auto op_it = combiner_map.find(operation);
+    if (op_it == combiner_map.end())
+        return "Unknown op (%source1, %source2, %source3)";
 
-        static const std::map<Source, std::string> source_map = {
-            { Source::PrimaryColor, "PrimaryColor" },
-            { Source::Texture0, "Texture0" },
-            { Source::Texture1, "Texture1" },
-            { Source::Texture2, "Texture2" },
-            { Source::Constant, "Constant" },
-            { Source::Previous, "Previous" },
-        };
+    return op_it->second;
+}
 
-        static const std::map<ColorModifier, std::string> color_modifier_map = {
-            { ColorModifier::SourceColor, { "%source.rgb" } },
-            { ColorModifier::SourceAlpha, { "%source.aaa" } },
-        };
-        static const std::map<AlphaModifier, std::string> alpha_modifier_map = {
-            { AlphaModifier::SourceAlpha, "%source.a" },
-            { AlphaModifier::OneMinusSourceAlpha, "(255 - %source.a)" },
-        };
+std::string GetTevStageConfigColorCombinerString(const Pica::Regs::TevStageConfig& tev_stage) {
+    // Start from the operation template, then fill in its three operand placeholders in order.
+    const std::string with_source1 = ReplacePattern(GetTevStageConfigOperationString(tev_stage.color_op), "%source1", GetTevStageConfigColorSourceString(tev_stage.color_source1, tev_stage.color_modifier1));
+    const std::string with_source2 = ReplacePattern(with_source1, "%source2", GetTevStageConfigColorSourceString(tev_stage.color_source2, tev_stage.color_modifier2));
+    return ReplacePattern(with_source2, "%source3", GetTevStageConfigColorSourceString(tev_stage.color_source3, tev_stage.color_modifier3));
+}
 
-        static const std::map<Operation, std::string> combiner_map = {
-            { Operation::Replace, "%source1" },
-            { Operation::Modulate, "(%source1 * %source2) / 255" },
-            { Operation::Add, "(%source1 + %source2)" },
-            { Operation::Lerp, "lerp(%source1, %source2, %source3)" },
-        };
+std::string GetTevStageConfigAlphaCombinerString(const Pica::Regs::TevStageConfig& tev_stage) {
+    // Start from the operation template, then fill in its three operand placeholders in order.
+    const std::string with_source1 = ReplacePattern(GetTevStageConfigOperationString(tev_stage.alpha_op), "%source1", GetTevStageConfigAlphaSourceString(tev_stage.alpha_source1, tev_stage.alpha_modifier1));
+    const std::string with_source2 = ReplacePattern(with_source1, "%source2", GetTevStageConfigAlphaSourceString(tev_stage.alpha_source2, tev_stage.alpha_modifier2));
+    return ReplacePattern(with_source2, "%source3", GetTevStageConfigAlphaSourceString(tev_stage.alpha_source3, tev_stage.alpha_modifier3));
+}
 
-        static auto ReplacePattern =
-                [](const std::string& input, const std::string& pattern, const std::string& replacement) -> std::string {
-                    size_t start = input.find(pattern);
-                    if (start == std::string::npos)
-                        return input;
-
-                    std::string ret = input;
-                    ret.replace(start, pattern.length(), replacement);
-                    return ret;
-                };
-        static auto GetColorSourceStr =
-                [](const Source& src, const ColorModifier& modifier) {
-                    auto src_it = source_map.find(src);
-                    std::string src_str = "Unknown";
-                    if (src_it != source_map.end())
-                        src_str = src_it->second;
-
-                    auto modifier_it = color_modifier_map.find(modifier);
-                    std::string modifier_str = "%source.????";
-                    if (modifier_it != color_modifier_map.end())
-                        modifier_str = modifier_it->second;
-
-                    return ReplacePattern(modifier_str, "%source", src_str);
-                };
-        static auto GetColorCombinerStr =
-                [](const Regs::TevStageConfig& tev_stage) {
-                    auto op_it = combiner_map.find(tev_stage.color_op);
-                    std::string op_str = "Unknown op (%source1, %source2, %source3)";
-                    if (op_it != combiner_map.end())
-                        op_str = op_it->second;
-
-                    op_str = ReplacePattern(op_str, "%source1", GetColorSourceStr(tev_stage.color_source1, tev_stage.color_modifier1));
-                    op_str = ReplacePattern(op_str, "%source2", GetColorSourceStr(tev_stage.color_source2, tev_stage.color_modifier2));
-                    return   ReplacePattern(op_str, "%source3", GetColorSourceStr(tev_stage.color_source3, tev_stage.color_modifier3));
-                };
-        static auto GetAlphaSourceStr =
-                [](const Source& src, const AlphaModifier& modifier) {
-                    auto src_it = source_map.find(src);
-                    std::string src_str = "Unknown";
-                    if (src_it != source_map.end())
-                        src_str = src_it->second;
-
-                    auto modifier_it = alpha_modifier_map.find(modifier);
-                    std::string modifier_str = "%source.????";
-                    if (modifier_it != alpha_modifier_map.end())
-                        modifier_str = modifier_it->second;
-
-                    return ReplacePattern(modifier_str, "%source", src_str);
-                };
-        static auto GetAlphaCombinerStr =
-                [](const Regs::TevStageConfig& tev_stage) {
-                    auto op_it = combiner_map.find(tev_stage.alpha_op);
-                    std::string op_str = "Unknown op (%source1, %source2, %source3)";
-                    if (op_it != combiner_map.end())
-                        op_str = op_it->second;
-
-                    op_str = ReplacePattern(op_str, "%source1", GetAlphaSourceStr(tev_stage.alpha_source1, tev_stage.alpha_modifier1));
-                    op_str = ReplacePattern(op_str, "%source2", GetAlphaSourceStr(tev_stage.alpha_source2, tev_stage.alpha_modifier2));
-                    return   ReplacePattern(op_str, "%source3", GetAlphaSourceStr(tev_stage.alpha_source3, tev_stage.alpha_modifier3));
-                };
-
-        stage_info += "Stage " + std::to_string(index) + ": " + GetColorCombinerStr(tev_stage) + "   " + GetAlphaCombinerStr(tev_stage) + "\n";
+void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig, 6>& stages) {
+    std::string stage_info = "Tev setup:\n";
+    for (size_t index = 0; index < stages.size(); ++index) {
+        const auto& tev_stage = stages[index];
+        stage_info += "Stage " + std::to_string(index) + ": " + GetTevStageConfigColorCombinerString(tev_stage) + "   " + GetTevStageConfigAlphaCombinerString(tev_stage) + "\n";
     }
-
     LOG_TRACE(HW_GPU, "%s", stage_info.c_str());
 }
 
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index f628292a4..92e9734ae 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -224,7 +224,11 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int s, int t, const Texture
 
 void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data);
 
-void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages);
+std::string GetTevStageConfigColorCombinerString(const Pica::Regs::TevStageConfig& tev_stage);
+std::string GetTevStageConfigAlphaCombinerString(const Pica::Regs::TevStageConfig& tev_stage);
+
+/// Dumps the Tev stage config to log at trace level
+void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig, 6>& stages);
 
 /**
  * Used in the vertex loader to merge access records. TODO: Investigate if actually useful.
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index be82cf4b5..ec78f9593 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -500,7 +500,7 @@ void Init() {
 }
 
 void Shutdown() {
-    Shader::Shutdown();
+    Shader::ClearCache();
 }
 
 template <typename T>
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 5891fb72a..86c0a0096 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -70,7 +70,7 @@ struct Regs {
     INSERT_PADDING_WORDS(0x9);
 
     BitField<0, 24, u32> viewport_depth_range; // float24
-    BitField<0, 24, u32> viewport_depth_far_plane; // float24
+    BitField<0, 24, u32> viewport_depth_near_plane; // float24
 
     BitField<0, 3, u32> vs_output_total;
 
@@ -122,9 +122,31 @@ struct Regs {
         BitField<16, 10, s32> y;
     } viewport_corner;
 
-    INSERT_PADDING_WORDS(0x17);
+    INSERT_PADDING_WORDS(0x1);
+
+    //TODO: early depth
+    INSERT_PADDING_WORDS(0x1);
+
+    INSERT_PADDING_WORDS(0x2);
+
+    enum DepthBuffering : u32 {
+        WBuffering  = 0,
+        ZBuffering  = 1,
+    };
+    BitField< 0, 1, DepthBuffering> depthmap_enable;
+
+    INSERT_PADDING_WORDS(0x12);
 
     struct TextureConfig {
+        enum TextureType : u32 {
+            Texture2D    = 0,
+            TextureCube  = 1,
+            Shadow2D     = 2,
+            Projection2D = 3,
+            ShadowCube   = 4,
+            Disabled     = 5,
+        };
+
         enum WrapMode : u32 {
             ClampToEdge    = 0,
             ClampToBorder  = 1,
@@ -155,6 +177,7 @@ struct Regs {
             BitField< 2, 1, TextureFilter> min_filter;
             BitField< 8, 2, WrapMode> wrap_t;
             BitField<12, 2, WrapMode> wrap_s;
+            BitField<28, 3, TextureType> type; ///< 3 bits wide so ShadowCube (4) and Disabled (5) fit. @note Only valid for texture 0 according to 3DBrew.
         };
 
         INSERT_PADDING_WORDS(0x1);
@@ -1279,10 +1302,11 @@ ASSERT_REG_POSITION(cull_mode, 0x40);
 ASSERT_REG_POSITION(viewport_size_x, 0x41);
 ASSERT_REG_POSITION(viewport_size_y, 0x43);
 ASSERT_REG_POSITION(viewport_depth_range, 0x4d);
-ASSERT_REG_POSITION(viewport_depth_far_plane, 0x4e);
+ASSERT_REG_POSITION(viewport_depth_near_plane, 0x4e);
 ASSERT_REG_POSITION(vs_output_attributes[0], 0x50);
 ASSERT_REG_POSITION(vs_output_attributes[1], 0x51);
 ASSERT_REG_POSITION(viewport_corner, 0x68);
+ASSERT_REG_POSITION(depthmap_enable, 0x6D);
 ASSERT_REG_POSITION(texture0_enable, 0x80);
 ASSERT_REG_POSITION(texture0, 0x81);
 ASSERT_REG_POSITION(texture0_format, 0x8e);
diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index 1059c6ae4..495174c25 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -25,6 +25,8 @@ struct State {
     Shader::ShaderSetup vs;
     Shader::ShaderSetup gs;
 
+    std::array<Math::Vec4<float24>, 16> vs_default_attributes;
+
     struct {
         union LutEntry {
             // Used for raw access
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index df67b9081..65168f05a 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -442,8 +442,33 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
 
                 DEBUG_ASSERT(0 != texture.config.address);
 
-                int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
-                int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
+                float24 u = uv[i].u();
+                float24 v = uv[i].v();
+
+                // Only unit 0 respects the texturing type (according to 3DBrew)
+                // TODO: Refactor so cubemaps and shadowmaps can be handled
+                if (i == 0) {
+                    switch(texture.config.type) {
+                    case Regs::TextureConfig::Texture2D:
+                        break;
+                    case Regs::TextureConfig::Projection2D: {
+                        auto tc0_w = GetInterpolatedAttribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
+                        u /= tc0_w;
+                        v /= tc0_w;
+                        break;
+                    }
+                    default:
+                        // TODO: Change to LOG_ERROR when more types are handled.
+                        LOG_DEBUG(HW_GPU, "Unhandled texture type %x", (int)texture.config.type);
+                        UNIMPLEMENTED();
+                        break;
+                    }
+                }
+
+                int s = (int)(u * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
+                int t = (int)(v * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
+
+
                 static auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
                     switch (mode) {
                         case Regs::TextureConfig::ClampToEdge:
@@ -862,10 +887,30 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
                 }
             }
 
+            // interpolated_z = z / w
+            float interpolated_z_over_w = (v0.screenpos[2].ToFloat32() * w0 +
+                                           v1.screenpos[2].ToFloat32() * w1 +
+                                           v2.screenpos[2].ToFloat32() * w2) / wsum;
+
+            // Not fully accurate: about 3 bits of precision are missing.
+            // Z-Buffer (z / w * scale + offset)
+            float depth_scale = float24::FromRaw(regs.viewport_depth_range).ToFloat32();
+            float depth_offset = float24::FromRaw(regs.viewport_depth_near_plane).ToFloat32();
+            float depth = interpolated_z_over_w * depth_scale + depth_offset;
+
+            // Potentially switch to W-Buffer
+            if (regs.depthmap_enable == Pica::Regs::DepthBuffering::WBuffering) {
+
+                // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
+                depth *= interpolated_w_inverse.ToFloat32() * wsum;
+            }
+
+            // Clamp the result
+            depth = MathUtil::Clamp(depth, 0.0f, 1.0f);
+
+            // Convert float to integer
             unsigned num_bits = Regs::DepthBitsPerPixel(regs.framebuffer.depth_format);
-            u32 z = (u32)((v0.screenpos[2].ToFloat32() * w0 +
-                           v1.screenpos[2].ToFloat32() * w1 +
-                           v2.screenpos[2].ToFloat32() * w2) * ((1 << num_bits) - 1) / wsum);
+            u32 z = (u32)(depth * ((1 << num_bits) - 1));
 
             if (output_merger.depth_test_enable) {
                 u32 ref_z = GetDepth(x >> 4, y >> 4);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 0b471dfd2..bcd1ae78d 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -76,6 +76,9 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
     glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD1);
     glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD2);
 
+    glVertexAttribPointer(GLShader::ATTRIBUTE_TEXCOORD0_W, 1, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, tex_coord0_w));
+    glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD0_W);
+
     glVertexAttribPointer(GLShader::ATTRIBUTE_NORMQUAT, 4, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, normquat));
     glEnableVertexAttribArray(GLShader::ATTRIBUTE_NORMQUAT);
 
@@ -101,7 +104,6 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
 
     // Sync fixed function OpenGL state
     SyncCullMode();
-    SyncDepthModifiers();
     SyncBlendEnabled();
     SyncBlendFuncs();
     SyncBlendColor();
@@ -256,8 +258,15 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
 
     // Depth modifiers
     case PICA_REG_INDEX(viewport_depth_range):
-    case PICA_REG_INDEX(viewport_depth_far_plane):
-        SyncDepthModifiers();
+        SyncDepthScale();
+        break;
+    case PICA_REG_INDEX(viewport_depth_near_plane):
+        SyncDepthOffset();
+        break;
+
+    // Depth buffering
+    case PICA_REG_INDEX(depthmap_enable):
+        shader_dirty = true;
         break;
 
     // Blending
@@ -314,6 +323,11 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
         SyncLogicOp();
         break;
 
+    // Texture 0 type
+    case PICA_REG_INDEX(texture0.type):
+        shader_dirty = true;
+        break;
+
     // TEV stages
     case PICA_REG_INDEX(tev_stage0.color_source1):
     case PICA_REG_INDEX(tev_stage0.color_modifier1):
@@ -867,6 +881,8 @@ void RasterizerOpenGL::SetShader() {
         glUniformBlockBinding(current_shader->shader.handle, block_index, 0);
 
         // Update uniforms
+        SyncDepthScale();
+        SyncDepthOffset();
         SyncAlphaTest();
         SyncCombinerColor();
         auto& tev_stages = Pica::g_state.regs.GetTevStages();
@@ -909,13 +925,20 @@ void RasterizerOpenGL::SyncCullMode() {
     }
 }
 
-void RasterizerOpenGL::SyncDepthModifiers() {
-    float depth_scale = -Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_range).ToFloat32();
-    float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_far_plane).ToFloat32() / 2.0f;
+void RasterizerOpenGL::SyncDepthScale() {
+    const float new_scale = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_range).ToFloat32();
+    if (uniform_block_data.data.depth_scale == new_scale)
+        return; // Value unchanged: leave the dirty flag alone
+    uniform_block_data.data.depth_scale = new_scale;
+    uniform_block_data.dirty = true;
+}
 
-    // TODO: Implement scale modifier
-    uniform_block_data.data.depth_offset = depth_offset;
-    uniform_block_data.dirty = true;
+void RasterizerOpenGL::SyncDepthOffset() {
+    const float new_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_near_plane).ToFloat32();
+    if (uniform_block_data.data.depth_offset == new_offset)
+        return; // Value unchanged: leave the dirty flag alone
+    uniform_block_data.data.depth_offset = new_offset;
+    uniform_block_data.dirty = true;
 }
 
 void RasterizerOpenGL::SyncBlendEnabled() {
@@ -924,6 +947,8 @@ void RasterizerOpenGL::SyncBlendEnabled() {
 
 void RasterizerOpenGL::SyncBlendFuncs() {
     const auto& regs = Pica::g_state.regs;
+    state.blend.rgb_equation = PicaToGL::BlendEquation(regs.output_merger.alpha_blending.blend_equation_rgb);
+    state.blend.a_equation = PicaToGL::BlendEquation(regs.output_merger.alpha_blending.blend_equation_a);
     state.blend.src_rgb_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_source_rgb);
     state.blend.dst_rgb_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_dest_rgb);
     state.blend.src_a_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_source_a);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 82fa61742..d70369400 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -39,140 +39,185 @@ struct ScreenInfo;
  * directly accessing Pica registers. This should reduce the risk of bugs in shader generation where
  * Pica state is not being captured in the shader cache key, thereby resulting in (what should be)
  * two separate shaders sharing the same key.
+ *
+ * We use a union because the C++ standard states that the "implicitly-defined copy/move constructor for a union X
+ * copies the object representation of X" and that the "implicitly-defined copy assignment operator for a union X
+ * copies the object representation (3.9) of X", i.e. a union is copied bytewise instead of memberwise.
+ * This is important because the padding bytes are included in the hash and in the comparison between objects.
  */
-struct PicaShaderConfig {
+union PicaShaderConfig {
+
     /// Construct a PicaShaderConfig with the current Pica register configuration.
     static PicaShaderConfig CurrentConfig() {
         PicaShaderConfig res;
+
+        auto& state = res.state;
+        std::memset(&state, 0, sizeof(PicaShaderConfig::State));
+
         const auto& regs = Pica::g_state.regs;
 
-        res.alpha_test_func = regs.output_merger.alpha_test.enable ?
+        state.depthmap_enable = regs.depthmap_enable;
+
+        state.alpha_test_func = regs.output_merger.alpha_test.enable ?
             regs.output_merger.alpha_test.func.Value() : Pica::Regs::CompareFunc::Always;
 
+        state.texture0_type = regs.texture0.type;
+
         // Copy relevant tev stages fields.
         // We don't sync const_color here because of the high variance, it is a
         // shader uniform instead.
         const auto& tev_stages = regs.GetTevStages();
-        DEBUG_ASSERT(res.tev_stages.size() == tev_stages.size());
+        DEBUG_ASSERT(state.tev_stages.size() == tev_stages.size());
         for (size_t i = 0; i < tev_stages.size(); i++) {
             const auto& tev_stage = tev_stages[i];
-            res.tev_stages[i].sources_raw = tev_stage.sources_raw;
-            res.tev_stages[i].modifiers_raw = tev_stage.modifiers_raw;
-            res.tev_stages[i].ops_raw = tev_stage.ops_raw;
-            res.tev_stages[i].scales_raw = tev_stage.scales_raw;
+            state.tev_stages[i].sources_raw = tev_stage.sources_raw;
+            state.tev_stages[i].modifiers_raw = tev_stage.modifiers_raw;
+            state.tev_stages[i].ops_raw = tev_stage.ops_raw;
+            state.tev_stages[i].scales_raw = tev_stage.scales_raw;
         }
 
-        res.combiner_buffer_input =
+        state.combiner_buffer_input =
             regs.tev_combiner_buffer_input.update_mask_rgb.Value() |
             regs.tev_combiner_buffer_input.update_mask_a.Value() << 4;
 
         // Fragment lighting
 
-        res.lighting.enable = !regs.lighting.disable;
-        res.lighting.src_num = regs.lighting.num_lights + 1;
+        state.lighting.enable = !regs.lighting.disable;
+        state.lighting.src_num = regs.lighting.num_lights + 1;
 
-        for (unsigned light_index = 0; light_index < res.lighting.src_num; ++light_index) {
+        for (unsigned light_index = 0; light_index < state.lighting.src_num; ++light_index) {
             unsigned num = regs.lighting.light_enable.GetNum(light_index);
             const auto& light = regs.lighting.light[num];
-            res.lighting.light[light_index].num = num;
-            res.lighting.light[light_index].directional = light.directional != 0;
-            res.lighting.light[light_index].two_sided_diffuse = light.two_sided_diffuse != 0;
-            res.lighting.light[light_index].dist_atten_enable = !regs.lighting.IsDistAttenDisabled(num);
-            res.lighting.light[light_index].dist_atten_bias = Pica::float20::FromRaw(light.dist_atten_bias).ToFloat32();
-            res.lighting.light[light_index].dist_atten_scale = Pica::float20::FromRaw(light.dist_atten_scale).ToFloat32();
+            state.lighting.light[light_index].num = num;
+            state.lighting.light[light_index].directional = light.directional != 0;
+            state.lighting.light[light_index].two_sided_diffuse = light.two_sided_diffuse != 0;
+            state.lighting.light[light_index].dist_atten_enable = !regs.lighting.IsDistAttenDisabled(num);
+            state.lighting.light[light_index].dist_atten_bias = Pica::float20::FromRaw(light.dist_atten_bias).ToFloat32();
+            state.lighting.light[light_index].dist_atten_scale = Pica::float20::FromRaw(light.dist_atten_scale).ToFloat32();
         }
 
-        res.lighting.lut_d0.enable = regs.lighting.disable_lut_d0 == 0;
-        res.lighting.lut_d0.abs_input = regs.lighting.abs_lut_input.disable_d0 == 0;
-        res.lighting.lut_d0.type = regs.lighting.lut_input.d0.Value();
-        res.lighting.lut_d0.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d0);
-
-        res.lighting.lut_d1.enable = regs.lighting.disable_lut_d1 == 0;
-        res.lighting.lut_d1.abs_input = regs.lighting.abs_lut_input.disable_d1 == 0;
-        res.lighting.lut_d1.type = regs.lighting.lut_input.d1.Value();
-        res.lighting.lut_d1.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d1);
-
-        res.lighting.lut_fr.enable = regs.lighting.disable_lut_fr == 0;
-        res.lighting.lut_fr.abs_input = regs.lighting.abs_lut_input.disable_fr == 0;
-        res.lighting.lut_fr.type = regs.lighting.lut_input.fr.Value();
-        res.lighting.lut_fr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.fr);
-
-        res.lighting.lut_rr.enable = regs.lighting.disable_lut_rr == 0;
-        res.lighting.lut_rr.abs_input = regs.lighting.abs_lut_input.disable_rr == 0;
-        res.lighting.lut_rr.type = regs.lighting.lut_input.rr.Value();
-        res.lighting.lut_rr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rr);
-
-        res.lighting.lut_rg.enable = regs.lighting.disable_lut_rg == 0;
-        res.lighting.lut_rg.abs_input = regs.lighting.abs_lut_input.disable_rg == 0;
-        res.lighting.lut_rg.type = regs.lighting.lut_input.rg.Value();
-        res.lighting.lut_rg.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rg);
-
-        res.lighting.lut_rb.enable = regs.lighting.disable_lut_rb == 0;
-        res.lighting.lut_rb.abs_input = regs.lighting.abs_lut_input.disable_rb == 0;
-        res.lighting.lut_rb.type = regs.lighting.lut_input.rb.Value();
-        res.lighting.lut_rb.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rb);
-
-        res.lighting.config = regs.lighting.config;
-        res.lighting.fresnel_selector = regs.lighting.fresnel_selector;
-        res.lighting.bump_mode = regs.lighting.bump_mode;
-        res.lighting.bump_selector = regs.lighting.bump_selector;
-        res.lighting.bump_renorm = regs.lighting.disable_bump_renorm == 0;
-        res.lighting.clamp_highlights = regs.lighting.clamp_highlights != 0;
+        state.lighting.lut_d0.enable = regs.lighting.disable_lut_d0 == 0;
+        state.lighting.lut_d0.abs_input = regs.lighting.abs_lut_input.disable_d0 == 0;
+        state.lighting.lut_d0.type = regs.lighting.lut_input.d0.Value();
+        state.lighting.lut_d0.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d0);
+
+        state.lighting.lut_d1.enable = regs.lighting.disable_lut_d1 == 0;
+        state.lighting.lut_d1.abs_input = regs.lighting.abs_lut_input.disable_d1 == 0;
+        state.lighting.lut_d1.type = regs.lighting.lut_input.d1.Value();
+        state.lighting.lut_d1.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d1);
+
+        state.lighting.lut_fr.enable = regs.lighting.disable_lut_fr == 0;
+        state.lighting.lut_fr.abs_input = regs.lighting.abs_lut_input.disable_fr == 0;
+        state.lighting.lut_fr.type = regs.lighting.lut_input.fr.Value();
+        state.lighting.lut_fr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.fr);
+
+        state.lighting.lut_rr.enable = regs.lighting.disable_lut_rr == 0;
+        state.lighting.lut_rr.abs_input = regs.lighting.abs_lut_input.disable_rr == 0;
+        state.lighting.lut_rr.type = regs.lighting.lut_input.rr.Value();
+        state.lighting.lut_rr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rr);
+
+        state.lighting.lut_rg.enable = regs.lighting.disable_lut_rg == 0;
+        state.lighting.lut_rg.abs_input = regs.lighting.abs_lut_input.disable_rg == 0;
+        state.lighting.lut_rg.type = regs.lighting.lut_input.rg.Value();
+        state.lighting.lut_rg.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rg);
+
+        state.lighting.lut_rb.enable = regs.lighting.disable_lut_rb == 0;
+        state.lighting.lut_rb.abs_input = regs.lighting.abs_lut_input.disable_rb == 0;
+        state.lighting.lut_rb.type = regs.lighting.lut_input.rb.Value();
+        state.lighting.lut_rb.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rb);
+
+        state.lighting.config = regs.lighting.config;
+        state.lighting.fresnel_selector = regs.lighting.fresnel_selector;
+        state.lighting.bump_mode = regs.lighting.bump_mode;
+        state.lighting.bump_selector = regs.lighting.bump_selector;
+        state.lighting.bump_renorm = regs.lighting.disable_bump_renorm == 0;
+        state.lighting.clamp_highlights = regs.lighting.clamp_highlights != 0;
 
         return res;
     }
 
     bool TevStageUpdatesCombinerBufferColor(unsigned stage_index) const {
-        return (stage_index < 4) && (combiner_buffer_input & (1 << stage_index));
+        return (stage_index < 4) && (state.combiner_buffer_input & (1 << stage_index));
     }
 
     bool TevStageUpdatesCombinerBufferAlpha(unsigned stage_index) const {
-        return (stage_index < 4) && ((combiner_buffer_input >> 4) & (1 << stage_index));
+        return (stage_index < 4) && ((state.combiner_buffer_input >> 4) & (1 << stage_index));
     }
 
     bool operator ==(const PicaShaderConfig& o) const {
-        return std::memcmp(this, &o, sizeof(PicaShaderConfig)) == 0;
+        return std::memcmp(&state, &o.state, sizeof(PicaShaderConfig::State)) == 0;
+    };
+
+    // NOTE: MSVC15 (Update 2) doesn't consider types with `delete`'d constructors and operators
+    //       to be trivially copyable (TC). This makes BitField not TC when used in a union or
+    //       struct, so we have to resort to this ugly hack.
+    //       Once that bug is fixed we can use Pica::Regs::TevStageConfig here.
+    //       Doesn't include const_color because we don't sync it; see the comment in CurrentConfig().
+    struct TevStageConfigRaw {
+        u32 sources_raw;
+        u32 modifiers_raw;
+        u32 ops_raw;
+        u32 scales_raw;
+        explicit operator Pica::Regs::TevStageConfig() const noexcept {
+            Pica::Regs::TevStageConfig stage;
+            stage.sources_raw = sources_raw;
+            stage.modifiers_raw = modifiers_raw;
+            stage.ops_raw = ops_raw;
+            stage.const_color = 0;
+            stage.scales_raw = scales_raw;
+            return stage;
+        }
     };
 
-    Pica::Regs::CompareFunc alpha_test_func = Pica::Regs::CompareFunc::Never;
-    std::array<Pica::Regs::TevStageConfig, 6> tev_stages = {};
-    u8 combiner_buffer_input = 0;
+    struct State {
 
-    struct {
-        struct {
-            unsigned num = 0;
-            bool directional = false;
-            bool two_sided_diffuse = false;
-            bool dist_atten_enable = false;
-            GLfloat dist_atten_scale = 0.0f;
-            GLfloat dist_atten_bias = 0.0f;
-        } light[8];
-
-        bool enable = false;
-        unsigned src_num = 0;
-        Pica::Regs::LightingBumpMode bump_mode = Pica::Regs::LightingBumpMode::None;
-        unsigned bump_selector = 0;
-        bool bump_renorm = false;
-        bool clamp_highlights = false;
-
-        Pica::Regs::LightingConfig config = Pica::Regs::LightingConfig::Config0;
-        Pica::Regs::LightingFresnelSelector fresnel_selector = Pica::Regs::LightingFresnelSelector::None;
+        Pica::Regs::CompareFunc alpha_test_func;
+        Pica::Regs::TextureConfig::TextureType texture0_type;
+        std::array<TevStageConfigRaw, 6> tev_stages;
+        u8 combiner_buffer_input;
+
+        Pica::Regs::DepthBuffering depthmap_enable;
 
         struct {
-            bool enable = false;
-            bool abs_input = false;
-            Pica::Regs::LightingLutInput type = Pica::Regs::LightingLutInput::NH;
-            float scale = 1.0f;
-        } lut_d0, lut_d1, lut_fr, lut_rr, lut_rg, lut_rb;
-    } lighting;
+            struct {
+                unsigned num;
+                bool directional;
+                bool two_sided_diffuse;
+                bool dist_atten_enable;
+                GLfloat dist_atten_scale;
+                GLfloat dist_atten_bias;
+            } light[8];
+
+            bool enable;
+            unsigned src_num;
+            Pica::Regs::LightingBumpMode bump_mode;
+            unsigned bump_selector;
+            bool bump_renorm;
+            bool clamp_highlights;
+
+            Pica::Regs::LightingConfig config;
+            Pica::Regs::LightingFresnelSelector fresnel_selector;
+
+            struct {
+                bool enable;
+                bool abs_input;
+                Pica::Regs::LightingLutInput type;
+                float scale;
+            } lut_d0, lut_d1, lut_fr, lut_rr, lut_rg, lut_rb;
+        } lighting;
+
+    } state;
 };
+#if (__GNUC__ >= 5) || defined(__clang__) || defined(_MSC_VER)
+static_assert(std::is_trivially_copyable<PicaShaderConfig::State>::value, "PicaShaderConfig::State must be trivially copyable");
+#endif
 
 namespace std {
 
 template <>
 struct hash<PicaShaderConfig> {
     size_t operator()(const PicaShaderConfig& k) const {
-        return Common::ComputeHash64(&k, sizeof(PicaShaderConfig));
+        return Common::ComputeHash64(&k.state, sizeof(PicaShaderConfig::State));
     }
 };
 
@@ -239,6 +284,7 @@ private:
             tex_coord1[1] = v.tc1.y.ToFloat32();
             tex_coord2[0] = v.tc2.x.ToFloat32();
             tex_coord2[1] = v.tc2.y.ToFloat32();
+            tex_coord0_w = v.tc0_w.ToFloat32();
             normquat[0] = v.quat.x.ToFloat32();
             normquat[1] = v.quat.y.ToFloat32();
             normquat[2] = v.quat.z.ToFloat32();
@@ -259,6 +305,7 @@ private:
         GLfloat tex_coord0[2];
         GLfloat tex_coord1[2];
         GLfloat tex_coord2[2];
+        GLfloat tex_coord0_w;
         GLfloat normquat[4];
         GLfloat view[3];
     };
@@ -277,6 +324,7 @@ private:
         GLvec4 const_color[6];
         GLvec4 tev_combiner_buffer_color;
         GLint alphatest_ref;
+        GLfloat depth_scale;
         GLfloat depth_offset;
         alignas(16) GLvec3 lighting_global_ambient;
         LightSrc light_src[8];
@@ -291,8 +339,11 @@ private:
     /// Syncs the cull mode to match the PICA register
     void SyncCullMode();
 
-    /// Syncs the depth scale and offset to match the PICA registers
-    void SyncDepthModifiers();
+    /// Syncs the depth scale to match the PICA register
+    void SyncDepthScale();
+
+    /// Syncs the depth offset to match the PICA register
+    void SyncDepthOffset();
 
     /// Syncs the blend enabled status to match the PICA register
     void SyncBlendEnabled();
@@ -365,7 +416,7 @@ private:
         UniformData data;
         bool lut_dirty[6];
         bool dirty;
-    } uniform_block_data;
+    } uniform_block_data = {};
 
     std::array<SamplerInfo, 3> texture_samplers;
     OGLVertexArray vertex_array;
@@ -374,5 +425,5 @@ private:
     OGLFramebuffer framebuffer;
 
     std::array<OGLTexture, 6> lighting_luts;
-    std::array<std::array<GLvec4, 256>, 6> lighting_lut_data;
+    std::array<std::array<GLvec4, 256>, 6> lighting_lut_data{};
 };
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 9011caa39..71d60e69c 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -32,8 +32,9 @@ static bool IsPassThroughTevStage(const TevStageConfig& stage) {
 }
 
 /// Writes the specified TEV stage source component(s)
-static void AppendSource(std::string& out, TevStageConfig::Source source,
+static void AppendSource(std::string& out, const PicaShaderConfig& config, TevStageConfig::Source source,
         const std::string& index_name) {
+    const auto& state = config.state;
     using Source = TevStageConfig::Source;
     switch (source) {
     case Source::PrimaryColor:
@@ -46,7 +47,20 @@ static void AppendSource(std::string& out, TevStageConfig::Source source,
         out += "secondary_fragment_color";
         break;
     case Source::Texture0:
-        out += "texture(tex[0], texcoord[0])";
+        // Only unit 0 respects the texturing type (according to 3DBrew)
+        switch(state.texture0_type) {
+        case Pica::Regs::TextureConfig::Texture2D:
+            out += "texture(tex[0], texcoord[0])";
+            break;
+        case Pica::Regs::TextureConfig::Projection2D:
+            out += "textureProj(tex[0], vec3(texcoord[0], texcoord0_w))";
+            break;
+        default:
+            out += "texture(tex[0], texcoord[0])";
+            LOG_CRITICAL(HW_GPU, "Unhandled texture type %x", static_cast<int>(state.texture0_type));
+            UNIMPLEMENTED();
+            break;
+        }
         break;
     case Source::Texture1:
         out += "texture(tex[1], texcoord[1])";
@@ -71,53 +85,53 @@ static void AppendSource(std::string& out, TevStageConfig::Source source,
 }
 
 /// Writes the color components to use for the specified TEV stage color modifier
-static void AppendColorModifier(std::string& out, TevStageConfig::ColorModifier modifier,
+static void AppendColorModifier(std::string& out, const PicaShaderConfig& config, TevStageConfig::ColorModifier modifier,
         TevStageConfig::Source source, const std::string& index_name) {
     using ColorModifier = TevStageConfig::ColorModifier;
     switch (modifier) {
     case ColorModifier::SourceColor:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rgb";
         break;
     case ColorModifier::OneMinusSourceColor:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rgb";
         break;
     case ColorModifier::SourceAlpha:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".aaa";
         break;
     case ColorModifier::OneMinusSourceAlpha:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".aaa";
         break;
     case ColorModifier::SourceRed:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rrr";
         break;
     case ColorModifier::OneMinusSourceRed:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rrr";
         break;
     case ColorModifier::SourceGreen:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".ggg";
         break;
     case ColorModifier::OneMinusSourceGreen:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".ggg";
         break;
     case ColorModifier::SourceBlue:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".bbb";
         break;
     case ColorModifier::OneMinusSourceBlue:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".bbb";
         break;
     default:
@@ -128,44 +142,44 @@ static void AppendColorModifier(std::string& out, TevStageConfig::ColorModifier
 }
 
 /// Writes the alpha component to use for the specified TEV stage alpha modifier
-static void AppendAlphaModifier(std::string& out, TevStageConfig::AlphaModifier modifier,
+static void AppendAlphaModifier(std::string& out, const PicaShaderConfig& config, TevStageConfig::AlphaModifier modifier,
         TevStageConfig::Source source, const std::string& index_name) {
     using AlphaModifier = TevStageConfig::AlphaModifier;
     switch (modifier) {
     case AlphaModifier::SourceAlpha:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".a";
         break;
     case AlphaModifier::OneMinusSourceAlpha:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".a";
         break;
     case AlphaModifier::SourceRed:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".r";
         break;
     case AlphaModifier::OneMinusSourceRed:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".r";
         break;
     case AlphaModifier::SourceGreen:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".g";
         break;
     case AlphaModifier::OneMinusSourceGreen:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".g";
         break;
     case AlphaModifier::SourceBlue:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".b";
         break;
     case AlphaModifier::OneMinusSourceBlue:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".b";
         break;
     default:
@@ -287,16 +301,16 @@ static void AppendAlphaTestCondition(std::string& out, Regs::CompareFunc func) {
 
 /// Writes the code to emulate the specified TEV stage
 static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsigned index) {
-    auto& stage = config.tev_stages[index];
+    const auto stage = static_cast<const Pica::Regs::TevStageConfig>(config.state.tev_stages[index]);
     if (!IsPassThroughTevStage(stage)) {
         std::string index_name = std::to_string(index);
 
         out += "vec3 color_results_" + index_name + "[3] = vec3[3](";
-        AppendColorModifier(out, stage.color_modifier1, stage.color_source1, index_name);
+        AppendColorModifier(out, config, stage.color_modifier1, stage.color_source1, index_name);
         out += ", ";
-        AppendColorModifier(out, stage.color_modifier2, stage.color_source2, index_name);
+        AppendColorModifier(out, config, stage.color_modifier2, stage.color_source2, index_name);
         out += ", ";
-        AppendColorModifier(out, stage.color_modifier3, stage.color_source3, index_name);
+        AppendColorModifier(out, config, stage.color_modifier3, stage.color_source3, index_name);
         out += ");\n";
 
         out += "vec3 color_output_" + index_name + " = ";
@@ -304,11 +318,11 @@ static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsi
         out += ";\n";
 
         out += "float alpha_results_" + index_name + "[3] = float[3](";
-        AppendAlphaModifier(out, stage.alpha_modifier1, stage.alpha_source1, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier1, stage.alpha_source1, index_name);
         out += ", ";
-        AppendAlphaModifier(out, stage.alpha_modifier2, stage.alpha_source2, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier2, stage.alpha_source2, index_name);
         out += ", ";
-        AppendAlphaModifier(out, stage.alpha_modifier3, stage.alpha_source3, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier3, stage.alpha_source3, index_name);
         out += ");\n";
 
         out += "float alpha_output_" + index_name + " = ";
@@ -331,6 +345,8 @@ static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsi
 
 /// Writes the code to emulate fragment lighting
 static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
+    const auto& lighting = config.state.lighting;
+
     // Define lighting globals
     out += "vec4 diffuse_sum = vec4(0.0, 0.0, 0.0, 1.0);\n"
            "vec4 specular_sum = vec4(0.0, 0.0, 0.0, 1.0);\n"
@@ -338,17 +354,17 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
            "vec3 refl_value = vec3(0.0);\n";
 
     // Compute fragment normals
-    if (config.lighting.bump_mode == Pica::Regs::LightingBumpMode::NormalMap) {
+    if (lighting.bump_mode == Pica::Regs::LightingBumpMode::NormalMap) {
         // Bump mapping is enabled using a normal map, read perturbation vector from the selected texture
-        std::string bump_selector = std::to_string(config.lighting.bump_selector);
+        std::string bump_selector = std::to_string(lighting.bump_selector);
         out += "vec3 surface_normal = 2.0 * texture(tex[" + bump_selector + "], texcoord[" + bump_selector + "]).rgb - 1.0;\n";
 
         // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher precision result
-        if (config.lighting.bump_renorm) {
+        if (lighting.bump_renorm) {
             std::string val = "(1.0 - (surface_normal.x*surface_normal.x + surface_normal.y*surface_normal.y))";
             out += "surface_normal.z = sqrt(max(" + val + ", 0.0));\n";
         }
-    } else if (config.lighting.bump_mode == Pica::Regs::LightingBumpMode::TangentMap) {
+    } else if (lighting.bump_mode == Pica::Regs::LightingBumpMode::TangentMap) {
         // Bump mapping is enabled using a tangent map
         LOG_CRITICAL(HW_GPU, "unimplemented bump mapping mode (tangent mapping)");
         UNIMPLEMENTED();
@@ -361,7 +377,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
     out += "vec3 normal = normalize(quaternion_rotate(normquat, surface_normal));\n";
 
     // Gets the index into the specified lookup table for specular lighting
-    auto GetLutIndex = [config](unsigned light_num, Regs::LightingLutInput input, bool abs) {
+    auto GetLutIndex = [&lighting](unsigned light_num, Regs::LightingLutInput input, bool abs) {
         const std::string half_angle = "normalize(normalize(view) + light_vector)";
         std::string index;
         switch (input) {
@@ -389,7 +405,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
 
         if (abs) {
             // LUT index is in the range of (0.0, 1.0)
-            index = config.lighting.light[light_num].two_sided_diffuse ? "abs(" + index + ")" : "max(" + index + ", 0.f)";
+            index = lighting.light[light_num].two_sided_diffuse ? "abs(" + index + ")" : "max(" + index + ", 0.f)";
             return "(FLOAT_255 * clamp(" + index + ", 0.0, 1.0))";
         } else {
             // LUT index is in the range of (-1.0, 1.0)
@@ -407,8 +423,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
     };
 
     // Write the code to emulate each enabled light
-    for (unsigned light_index = 0; light_index < config.lighting.src_num; ++light_index) {
-        const auto& light_config = config.lighting.light[light_index];
+    for (unsigned light_index = 0; light_index < lighting.src_num; ++light_index) {
+        const auto& light_config = lighting.light[light_index];
         std::string light_src = "light_src[" + std::to_string(light_config.num) + "]";
 
         // Compute light vector (directional or positional)
@@ -432,39 +448,39 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
         }
 
         // If enabled, clamp specular component if lighting result is negative
-        std::string clamp_highlights = config.lighting.clamp_highlights ? "(dot(light_vector, normal) <= 0.0 ? 0.0 : 1.0)" : "1.0";
+        std::string clamp_highlights = lighting.clamp_highlights ? "(dot(light_vector, normal) <= 0.0 ? 0.0 : 1.0)" : "1.0";
 
         // Specular 0 component
         std::string d0_lut_value = "1.0";
-        if (config.lighting.lut_d0.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Distribution0)) {
+        if (lighting.lut_d0.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Distribution0)) {
             // Lookup specular "distribution 0" LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_d0.type, config.lighting.lut_d0.abs_input);
-            d0_lut_value = "(" + std::to_string(config.lighting.lut_d0.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution0, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_d0.type, lighting.lut_d0.abs_input);
+            d0_lut_value = "(" + std::to_string(lighting.lut_d0.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution0, index) + ")";
         }
         std::string specular_0 = "(" + d0_lut_value + " * " + light_src + ".specular_0)";
 
         // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
-        if (config.lighting.lut_rr.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectRed)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rr.type, config.lighting.lut_rr.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rr.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectRed, index) + ")";
+        if (lighting.lut_rr.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectRed)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rr.type, lighting.lut_rr.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rr.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectRed, index) + ")";
             out += "refl_value.r = " + value + ";\n";
         } else {
             out += "refl_value.r = 1.0;\n";
         }
 
         // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
-        if (config.lighting.lut_rg.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectGreen)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rg.type, config.lighting.lut_rg.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rg.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectGreen, index) + ")";
+        if (lighting.lut_rg.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectGreen)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rg.type, lighting.lut_rg.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rg.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectGreen, index) + ")";
             out += "refl_value.g = " + value + ";\n";
         } else {
             out += "refl_value.g = refl_value.r;\n";
         }
 
         // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
-        if (config.lighting.lut_rb.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectBlue)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rb.type, config.lighting.lut_rb.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rb.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectBlue, index) + ")";
+        if (lighting.lut_rb.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectBlue)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rb.type, lighting.lut_rb.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rb.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectBlue, index) + ")";
             out += "refl_value.b = " + value + ";\n";
         } else {
             out += "refl_value.b = refl_value.r;\n";
@@ -472,27 +488,27 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
 
         // Specular 1 component
         std::string d1_lut_value = "1.0";
-        if (config.lighting.lut_d1.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Distribution1)) {
+        if (lighting.lut_d1.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Distribution1)) {
             // Lookup specular "distribution 1" LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_d1.type, config.lighting.lut_d1.abs_input);
-            d1_lut_value = "(" + std::to_string(config.lighting.lut_d1.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution1, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_d1.type, lighting.lut_d1.abs_input);
+            d1_lut_value = "(" + std::to_string(lighting.lut_d1.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution1, index) + ")";
         }
         std::string specular_1 = "(" + d1_lut_value + " * refl_value * " + light_src + ".specular_1)";
 
         // Fresnel
-        if (config.lighting.lut_fr.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Fresnel)) {
+        if (lighting.lut_fr.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Fresnel)) {
             // Lookup fresnel LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_fr.type, config.lighting.lut_fr.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_fr.scale) + " * " + GetLutValue(Regs::LightingSampler::Fresnel, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_fr.type, lighting.lut_fr.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_fr.scale) + " * " + GetLutValue(Regs::LightingSampler::Fresnel, index) + ")";
 
-            // Enabled for difffuse lighting alpha component
+            // Enabled for diffuse lighting alpha component
-            if (config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::PrimaryAlpha ||
-                config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
+            if (lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::PrimaryAlpha ||
+                lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
                 out += "diffuse_sum.a  *= " + value + ";\n";
 
             // Enabled for the specular lighting alpha component
-            if (config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::SecondaryAlpha ||
-                config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
+            if (lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::SecondaryAlpha ||
+                lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
                 out += "specular_sum.a *= " + value + ";\n";
         }
 
@@ -510,6 +526,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
 }
 
 std::string GenerateFragmentShader(const PicaShaderConfig& config) {
+    const auto& state = config.state;
+
     std::string out = R"(
 #version 330 core
 #define NUM_TEV_STAGES 6
@@ -519,6 +537,7 @@ std::string GenerateFragmentShader(const PicaShaderConfig& config) {
 
 in vec4 primary_color;
 in vec2 texcoord[3];
+in float texcoord0_w;
 in vec4 normquat;
 in vec3 view;
 
@@ -536,6 +555,7 @@ layout (std140) uniform shader_data {
     vec4 const_color[NUM_TEV_STAGES];
     vec4 tev_combiner_buffer_color;
     int alphatest_ref;
+    float depth_scale;
     float depth_offset;
     vec3 lighting_global_ambient;
     LightSrc light_src[NUM_LIGHTS];
@@ -555,29 +575,37 @@ vec4 secondary_fragment_color = vec4(0.0);
 )";
 
     // Do not do any sort of processing if it's obvious we're not going to pass the alpha test
-    if (config.alpha_test_func == Regs::CompareFunc::Never) {
+    if (state.alpha_test_func == Regs::CompareFunc::Never) {
         out += "discard; }";
         return out;
     }
 
-    if (config.lighting.enable)
+    if (state.lighting.enable)
         WriteLighting(out, config);
 
     out += "vec4 combiner_buffer = vec4(0.0);\n";
     out += "vec4 next_combiner_buffer = tev_combiner_buffer_color;\n";
     out += "vec4 last_tex_env_out = vec4(0.0);\n";
 
-    for (size_t index = 0; index < config.tev_stages.size(); ++index)
+    for (size_t index = 0; index < state.tev_stages.size(); ++index)
         WriteTevStage(out, config, (unsigned)index);
 
-    if (config.alpha_test_func != Regs::CompareFunc::Always) {
+    if (state.alpha_test_func != Regs::CompareFunc::Always) {
         out += "if (";
-        AppendAlphaTestCondition(out, config.alpha_test_func);
+        AppendAlphaTestCondition(out, state.alpha_test_func);
         out += ") discard;\n";
     }
 
     out += "color = last_tex_env_out;\n";
-    out += "gl_FragDepth = gl_FragCoord.z + depth_offset;\n}";
+
+    out += "float z_over_w = 1.0 - gl_FragCoord.z * 2.0;\n";
+    out += "float depth = z_over_w * depth_scale + depth_offset;\n";
+    if (state.depthmap_enable == Pica::Regs::DepthBuffering::WBuffering) {
+        out += "depth /= gl_FragCoord.w;\n";
+    }
+    out += "gl_FragDepth = depth;\n";
+
+    out += "}";
 
     return out;
 }
@@ -585,17 +613,19 @@ vec4 secondary_fragment_color = vec4(0.0);
 std::string GenerateVertexShader() {
     std::string out = "#version 330 core\n";
 
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_POSITION)  + ") in vec4 vert_position;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_COLOR)     + ") in vec4 vert_color;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0) + ") in vec2 vert_texcoord0;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD1) + ") in vec2 vert_texcoord1;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD2) + ") in vec2 vert_texcoord2;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_NORMQUAT)  + ") in vec4 vert_normquat;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_VIEW)      + ") in vec3 vert_view;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_POSITION)    + ") in vec4 vert_position;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_COLOR)       + ") in vec4 vert_color;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0)   + ") in vec2 vert_texcoord0;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD1)   + ") in vec2 vert_texcoord1;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD2)   + ") in vec2 vert_texcoord2;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0_W) + ") in float vert_texcoord0_w;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_NORMQUAT)    + ") in vec4 vert_normquat;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_VIEW)        + ") in vec3 vert_view;\n";
 
     out += R"(
 out vec4 primary_color;
 out vec2 texcoord[3];
+out float texcoord0_w;
 out vec4 normquat;
 out vec3 view;
 
@@ -604,6 +634,7 @@ void main() {
     texcoord[0] = vert_texcoord0;
     texcoord[1] = vert_texcoord1;
     texcoord[2] = vert_texcoord2;
+    texcoord0_w = vert_texcoord0_w;
     normquat = vert_normquat;
     view = vert_view;
     gl_Position = vec4(vert_position.x, vert_position.y, -vert_position.z, vert_position.w);
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index 3eb07d57a..bef3249cf 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -6,7 +6,7 @@
 
 #include <string>
 
-struct PicaShaderConfig;
+union PicaShaderConfig;
 
 namespace GLShader {
 
diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h
index 097242f6f..f59912f79 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.h
+++ b/src/video_core/renderer_opengl/gl_shader_util.h
@@ -14,6 +14,7 @@ enum Attributes {
     ATTRIBUTE_TEXCOORD0,
     ATTRIBUTE_TEXCOORD1,
     ATTRIBUTE_TEXCOORD2,
+    ATTRIBUTE_TEXCOORD0_W,
     ATTRIBUTE_NORMQUAT,
     ATTRIBUTE_VIEW,
 };
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 02cd9f417..fa141fc9a 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -36,6 +36,8 @@ OpenGLState::OpenGLState() {
     stencil.action_stencil_fail = GL_KEEP;
 
     blend.enabled = false;
+    blend.rgb_equation = GL_FUNC_ADD;
+    blend.a_equation = GL_FUNC_ADD;
     blend.src_rgb_func = GL_ONE;
     blend.dst_rgb_func = GL_ZERO;
     blend.src_a_func = GL_ONE;
@@ -165,6 +167,11 @@ void OpenGLState::Apply() const {
                             blend.src_a_func, blend.dst_a_func);
     }
 
+    if (blend.rgb_equation != cur_state.blend.rgb_equation ||
+            blend.a_equation != cur_state.blend.a_equation) {
+        glBlendEquationSeparate(blend.rgb_equation, blend.a_equation);
+    }
+
     if (logic_op != cur_state.logic_op) {
         glLogicOp(logic_op);
     }
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 24f20e47c..228727054 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -40,6 +40,8 @@ public:
 
     struct {
         bool enabled; // GL_BLEND
+        GLenum rgb_equation; // GL_BLEND_EQUATION_RGB
+        GLenum a_equation; // GL_BLEND_EQUATION_ALPHA
         GLenum src_rgb_func; // GL_BLEND_SRC_RGB
         GLenum dst_rgb_func; // GL_BLEND_DST_RGB
         GLenum src_a_func; // GL_BLEND_SRC_ALPHA
diff --git a/src/video_core/renderer_opengl/pica_to_gl.h b/src/video_core/renderer_opengl/pica_to_gl.h
index 976d1f364..6dc2758c5 100644
--- a/src/video_core/renderer_opengl/pica_to_gl.h
+++ b/src/video_core/renderer_opengl/pica_to_gl.h
@@ -78,6 +78,26 @@ inline GLenum WrapMode(Pica::Regs::TextureConfig::WrapMode mode) {
     return gl_mode;
 }
 
+inline GLenum BlendEquation(Pica::Regs::BlendEquation equation) {
+    static const GLenum blend_equation_table[] = {
+        GL_FUNC_ADD,              // BlendEquation::Add
+        GL_FUNC_SUBTRACT,         // BlendEquation::Subtract
+        GL_FUNC_REVERSE_SUBTRACT, // BlendEquation::ReverseSubtract
+        GL_MIN,                   // BlendEquation::Min
+        GL_MAX,                   // BlendEquation::Max
+    };
+
+    // Range check table for input
+    if (static_cast<size_t>(equation) >= ARRAY_SIZE(blend_equation_table)) {
+        LOG_CRITICAL(Render_OpenGL, "Unknown blend equation %d", equation);
+        UNREACHABLE();
+
+        return GL_FUNC_ADD;
+    }
+
+    return blend_equation_table[(unsigned)equation];
+}
+
 inline GLenum BlendFunc(Pica::Regs::BlendFactor factor) {
     static const GLenum blend_func_table[] = {
         GL_ZERO,                     // BlendFactor::Zero
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 65dcc9156..161097610 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -35,7 +35,13 @@ static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map;
 static const JitShader* jit_shader;
 #endif // ARCHITECTURE_x86_64
 
-void Setup() {
+void ClearCache() {
+#ifdef ARCHITECTURE_x86_64
+    shader_map.clear();
+#endif // ARCHITECTURE_x86_64
+}
+
+void ShaderSetup::Setup() {
 #ifdef ARCHITECTURE_x86_64
     if (VideoCore::g_shader_jit_enabled) {
         u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
@@ -54,20 +60,14 @@ void Setup() {
 #endif // ARCHITECTURE_x86_64
 }
 
-void Shutdown() {
-#ifdef ARCHITECTURE_x86_64
-    shader_map.clear();
-#endif // ARCHITECTURE_x86_64
-}
-
-MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240));
+MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
 
-OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
+OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
     auto& config = g_state.regs.vs;
+    auto& setup = g_state.vs;
 
-    MICROPROFILE_SCOPE(GPU_VertexShader);
+    MICROPROFILE_SCOPE(GPU_Shader);
 
-    state.program_counter = config.main_offset;
     state.debug.max_offset = 0;
     state.debug.max_opdesc_id = 0;
 
@@ -82,11 +82,11 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr
 
 #ifdef ARCHITECTURE_x86_64
     if (VideoCore::g_shader_jit_enabled)
-        jit_shader->Run(&state.registers, g_state.regs.vs.main_offset);
+        jit_shader->Run(setup, state, config.main_offset);
     else
-        RunInterpreter(state);
+        RunInterpreter(setup, state, config.main_offset);
 #else
-    RunInterpreter(state);
+    RunInterpreter(setup, state, config.main_offset);
 #endif // ARCHITECTURE_x86_64
 
     // Setup output data
@@ -140,10 +140,9 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr
     return ret;
 }
 
-DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) {
+DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) {
     UnitState<true> state;
 
-    state.program_counter = config.main_offset;
     state.debug.max_offset = 0;
     state.debug.max_opdesc_id = 0;
 
@@ -158,7 +157,7 @@ DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, c
     state.conditional_code[0] = false;
     state.conditional_code[1] = false;
 
-    RunInterpreter(state);
+    RunInterpreter(setup, state, config.main_offset);
     return state.debug;
 }
 
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 56b83bfeb..84898f21c 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -43,7 +43,8 @@ struct OutputVertex {
     Math::Vec4<float24> color;
     Math::Vec2<float24> tc0;
     Math::Vec2<float24> tc1;
-    INSERT_PADDING_WORDS(2);
+    float24 tc0_w;
+    INSERT_PADDING_WORDS(1);
     Math::Vec3<float24> view;
     INSERT_PADDING_WORDS(1);
     Math::Vec2<float24> tc2;
@@ -83,23 +84,6 @@ struct OutputVertex {
 static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
 static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
 
-/// Vertex shader memory
-struct ShaderSetup {
-    struct {
-        // The float uniforms are accessed by the shader JIT using SSE instructions, and are
-        // therefore required to be 16-byte aligned.
-        alignas(16) Math::Vec4<float24> f[96];
-
-        std::array<bool, 16> b;
-        std::array<Math::Vec4<u8>, 4> i;
-    } uniforms;
-
-    Math::Vec4<float24> default_attributes[16];
-
-    std::array<u32, 1024> program_code;
-    std::array<u32, 1024> swizzle_data;
-};
-
 // Helper structure used to keep track of data useful for inspection of shader emulation
 template<bool full_debugging>
 struct DebugData;
@@ -288,38 +272,21 @@ struct UnitState {
     } registers;
     static_assert(std::is_pod<Registers>::value, "Structure is not POD");
 
-    u32 program_counter;
     bool conditional_code[2];
 
     // Two Address registers and one loop counter
     // TODO: How many bits do these actually have?
     s32 address_registers[3];
 
-    enum {
-        INVALID_ADDRESS = 0xFFFFFFFF
-    };
-
-    struct CallStackElement {
-        u32 final_address;  // Address upon which we jump to return_address
-        u32 return_address; // Where to jump when leaving scope
-        u8 repeat_counter;  // How often to repeat until this call stack element is removed
-        u8 loop_increment;  // Which value to add to the loop counter after an iteration
-                            // TODO: Should this be a signed value? Does it even matter?
-        u32 loop_address;   // The address where we'll return to after each loop iteration
-    };
-
-    // TODO: Is there a maximal size for this?
-    boost::container::static_vector<CallStackElement, 16> call_stack;
-
     DebugData<Debug> debug;
 
     static size_t InputOffset(const SourceRegister& reg) {
         switch (reg.GetRegisterType()) {
         case RegisterType::Input:
-            return offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, registers.input) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         case RegisterType::Temporary:
-            return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         default:
             UNREACHABLE();
@@ -330,10 +297,10 @@ struct UnitState {
     static size_t OutputOffset(const DestRegister& reg) {
         switch (reg.GetRegisterType()) {
         case RegisterType::Output:
-            return offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, registers.output) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         case RegisterType::Temporary:
-            return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         default:
             UNREACHABLE();
@@ -342,33 +309,66 @@ struct UnitState {
     }
 };
 
-/**
- * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
- * vertex, which would happen within the `Run` function).
- */
-void Setup();
+/// Clears the shader cache
+void ClearCache();
 
-/// Performs any cleanup when the emulator is shutdown
-void Shutdown();
+struct ShaderSetup {
 
-/**
- * Runs the currently setup shader
- * @param state Shader unit state, must be setup per shader and per shader unit
- * @param input Input vertex into the shader
- * @param num_attributes The number of vertex shader attributes
- * @return The output vertex, after having been processed by the vertex shader
- */
-OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
+    struct {
+        // The float uniforms are accessed by the shader JIT using SSE instructions, and are
+        // therefore required to be 16-byte aligned.
+        alignas(16) Math::Vec4<float24> f[96];
 
-/**
- * Produce debug information based on the given shader and input vertex
- * @param input Input vertex into the shader
- * @param num_attributes The number of vertex shader attributes
- * @param config Configuration object for the shader pipeline
- * @param setup Setup object for the shader pipeline
- * @return Debug information for this shader with regards to the given vertex
- */
-DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup);
+        std::array<bool, 16> b;
+        std::array<Math::Vec4<u8>, 4> i;
+    } uniforms;
+
+    static size_t UniformOffset(RegisterType type, unsigned index) {
+        switch (type) {
+        case RegisterType::FloatUniform:
+            return offsetof(ShaderSetup, uniforms.f) + index*sizeof(Math::Vec4<float24>);
+
+        case RegisterType::BoolUniform:
+            return offsetof(ShaderSetup, uniforms.b) + index*sizeof(bool);
+
+        case RegisterType::IntUniform:
+            return offsetof(ShaderSetup, uniforms.i) + index*sizeof(Math::Vec4<u8>);
+
+        default:
+            UNREACHABLE();
+            return 0;
+        }
+    }
+
+    std::array<u32, 1024> program_code;
+    std::array<u32, 1024> swizzle_data;
+
+    /**
+     * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
+     * vertex, which would happen within the `Run` function).
+     */
+    void Setup();
+
+    /**
+     * Runs the currently setup shader
+     * @param state Shader unit state, must be setup per shader and per shader unit
+     * @param input Input vertex into the shader
+     * @param num_attributes The number of vertex shader attributes
+     * @return The output vertex, after having been processed by the vertex shader
+     */
+    OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
+
+    /**
+     * Produce debug information based on the given shader and input vertex
+     * @param input Input vertex into the shader
+     * @param num_attributes The number of vertex shader attributes
+     * @param config Configuration object for the shader pipeline
+     * @param setup Setup object for the shader pipeline
+     * @return Debug information for this shader with regards to the given vertex
+     */
+    DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup);
+
+};
 
 } // namespace Shader
 
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index 7710f7fbc..714e8bfd5 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -29,8 +29,24 @@ namespace Pica {
 
 namespace Shader {
 
+constexpr u32 INVALID_ADDRESS = 0xFFFFFFFF;
+
+struct CallStackElement {
+    u32 final_address;  // Address upon which we jump to return_address
+    u32 return_address; // Where to jump when leaving scope
+    u8 repeat_counter;  // How often to repeat until this call stack element is removed
+    u8 loop_increment;  // Which value to add to the loop counter after an iteration
+                        // TODO: Should this be a signed value? Does it even matter?
+    u32 loop_address;   // The address where we'll return to after each loop iteration
+};
+
 template<bool Debug>
-void RunInterpreter(UnitState<Debug>& state) {
+void RunInterpreter(const ShaderSetup& setup, UnitState<Debug>& state, unsigned offset) {
+    // TODO: Is there a maximal size for this?
+    boost::container::static_vector<CallStackElement, 16> call_stack;
+
+    u32 program_counter = offset;
+
     const auto& uniforms = g_state.vs.uniforms;
     const auto& swizzle_data = g_state.vs.swizzle_data;
     const auto& program_code = g_state.vs.program_code;
@@ -41,16 +57,16 @@ void RunInterpreter(UnitState<Debug>& state) {
     unsigned iteration = 0;
     bool exit_loop = false;
     while (!exit_loop) {
-        if (!state.call_stack.empty()) {
-            auto& top = state.call_stack.back();
-            if (state.program_counter == top.final_address) {
+        if (!call_stack.empty()) {
+            auto& top = call_stack.back();
+            if (program_counter == top.final_address) {
                 state.address_registers[2] += top.loop_increment;
 
                 if (top.repeat_counter-- == 0) {
-                    state.program_counter = top.return_address;
-                    state.call_stack.pop_back();
+                    program_counter = top.return_address;
+                    call_stack.pop_back();
                 } else {
-                    state.program_counter = top.loop_address;
+                    program_counter = top.loop_address;
                 }
 
                 // TODO: Is "trying again" accurate to hardware?
@@ -58,20 +74,22 @@ void RunInterpreter(UnitState<Debug>& state) {
             }
         }
 
-        const Instruction instr = { program_code[state.program_counter] };
+        const Instruction instr = { program_code[program_counter] };
         const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] };
 
-        static auto call = [](UnitState<Debug>& state, u32 offset, u32 num_instructions,
+        // Deliberately not static: the closure holds references to this invocation's
+        // program_counter and call_stack, which are gone once RunInterpreter returns.
+        auto call = [&program_counter, &call_stack](UnitState<Debug>& state, u32 offset, u32 num_instructions,
                               u32 return_offset, u8 repeat_count, u8 loop_increment) {
-            state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
-            ASSERT(state.call_stack.size() < state.call_stack.capacity());
-            state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
+            program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
+            ASSERT(call_stack.size() < call_stack.capacity());
+            call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
         };
-        Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, state.program_counter);
+        Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, program_counter);
         if (iteration > 0)
-            Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, state.program_counter);
+            Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, program_counter);
 
-        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter);
+        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + program_counter);
 
         auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
             switch (source_reg.GetRegisterType()) {
@@ -519,7 +535,7 @@ void RunInterpreter(UnitState<Debug>& state) {
             case OpCode::Id::JMPC:
                 Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
                 if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
-                    state.program_counter = instr.flow_control.dest_offset - 1;
+                    program_counter = instr.flow_control.dest_offset - 1;
                 }
                 break;
 
@@ -527,7 +543,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                 Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
 
                 if (uniforms.b[instr.flow_control.bool_uniform_id] == !(instr.flow_control.num_instructions & 1)) {
-                    state.program_counter = instr.flow_control.dest_offset - 1;
+                    program_counter = instr.flow_control.dest_offset - 1;
                 }
                 break;
 
@@ -535,7 +551,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                 call(state,
                      instr.flow_control.dest_offset,
                      instr.flow_control.num_instructions,
-                     state.program_counter + 1, 0, 0);
+                     program_counter + 1, 0, 0);
                 break;
 
             case OpCode::Id::CALLU:
@@ -544,7 +560,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
-                        state.program_counter + 1, 0, 0);
+                        program_counter + 1, 0, 0);
                 }
                 break;
 
@@ -554,7 +570,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
-                        state.program_counter + 1, 0, 0);
+                        program_counter + 1, 0, 0);
                 }
                 break;
 
@@ -565,8 +581,8 @@ void RunInterpreter(UnitState<Debug>& state) {
                 Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
                 if (uniforms.b[instr.flow_control.bool_uniform_id]) {
                     call(state,
-                         state.program_counter + 1,
-                         instr.flow_control.dest_offset - state.program_counter - 1,
+                         program_counter + 1,
+                         instr.flow_control.dest_offset - program_counter - 1,
                          instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
@@ -584,8 +600,8 @@ void RunInterpreter(UnitState<Debug>& state) {
                 Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
                 if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
                     call(state,
-                         state.program_counter + 1,
-                         instr.flow_control.dest_offset - state.program_counter - 1,
+                         program_counter + 1,
+                         instr.flow_control.dest_offset - program_counter - 1,
                          instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
@@ -607,8 +623,8 @@ void RunInterpreter(UnitState<Debug>& state) {
 
                 Record<DebugDataRecord::LOOP_INT_IN>(state.debug, iteration, loop_param);
                 call(state,
-                     state.program_counter + 1,
-                     instr.flow_control.dest_offset - state.program_counter + 1,
+                     program_counter + 1,
+                     instr.flow_control.dest_offset - program_counter + 1,
                      instr.flow_control.dest_offset + 1,
                      loop_param.x,
                      loop_param.z);
@@ -625,14 +641,14 @@ void RunInterpreter(UnitState<Debug>& state) {
         }
         }
 
-        ++state.program_counter;
+        ++program_counter;
         ++iteration;
     }
 }
 
 // Explicit instantiation
-template void RunInterpreter(UnitState<false>& state);
-template void RunInterpreter(UnitState<true>& state);
+template void RunInterpreter(const ShaderSetup& setup, UnitState<false>& state, unsigned offset);
+template void RunInterpreter(const ShaderSetup& setup, UnitState<true>& state, unsigned offset);
 
 } // namespace
 
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
index 6048cdf3a..bb3ce1c6e 100644
--- a/src/video_core/shader/shader_interpreter.h
+++ b/src/video_core/shader/shader_interpreter.h
@@ -11,7 +11,7 @@ namespace Shader {
 template <bool Debug> struct UnitState;
 
 template<bool Debug>
-void RunInterpreter(UnitState<Debug>& state);
+void RunInterpreter(const ShaderSetup& setup, UnitState<Debug>& state, unsigned offset);
 
 } // namespace
 
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 99f6c51eb..43e7e6b4c 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -102,7 +102,7 @@ const JitFunction instr_table[64] = {
 // purposes, as documented below:
 
 /// Pointer to the uniform memory
-static const X64Reg UNIFORMS = R9;
+static const X64Reg SETUP = R9;
 /// The two 32-bit VS address offset registers set by the MOVA instruction
 static const X64Reg ADDROFFS_REG_0 = R10;
 static const X64Reg ADDROFFS_REG_1 = R11;
@@ -117,7 +117,7 @@ static const X64Reg COND0 = R13;
 /// Result of the previous CMP instruction for the Y-component comparison
 static const X64Reg COND1 = R14;
 /// Pointer to the UnitState instance for the current VS unit
-static const X64Reg REGISTERS = R15;
+static const X64Reg STATE = R15;
 /// SIMD scratch register
 static const X64Reg SCRATCH = XMM0;
 /// Loaded with the first swizzled source register, otherwise can be used as a scratch register
@@ -136,7 +136,7 @@ static const X64Reg NEGBIT = XMM15;
 // State registers that must not be modified by external functions calls
 // Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed
 static const BitSet32 persistent_regs = {
-    UNIFORMS, REGISTERS, // Pointers to register blocks
+    SETUP, STATE, // Pointers to register blocks
     ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, // Cached registers
     ONE+16, NEGBIT+16, // Constants
 };
@@ -177,10 +177,10 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe
     size_t src_offset;
 
     if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
-        src_ptr = UNIFORMS;
-        src_offset = src_reg.GetIndex() * sizeof(float24) * 4;
+        src_ptr = SETUP;
+        src_offset = ShaderSetup::UniformOffset(RegisterType::FloatUniform, src_reg.GetIndex());
     } else {
-        src_ptr = REGISTERS;
+        src_ptr = STATE;
         src_offset = UnitState<false>::InputOffset(src_reg);
     }
 
@@ -264,11 +264,11 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) {
     // If all components are enabled, write the result to the destination register
     if (swiz.dest_mask == NO_DEST_REG_MASK) {
         // Store dest back to memory
-        MOVAPS(MDisp(REGISTERS, dest_offset_disp), src);
+        MOVAPS(MDisp(STATE, dest_offset_disp), src);
 
     } else {
         // Not all components are enabled, so mask the result when storing to the destination register...
-        MOVAPS(SCRATCH, MDisp(REGISTERS, dest_offset_disp));
+        MOVAPS(SCRATCH, MDisp(STATE, dest_offset_disp));
 
         if (Common::GetCPUCaps().sse4_1) {
             u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
@@ -287,7 +287,7 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) {
         }
 
         // Store dest back to memory
-        MOVAPS(MDisp(REGISTERS, dest_offset_disp), SCRATCH);
+        MOVAPS(MDisp(STATE, dest_offset_disp), SCRATCH);
     }
 }
 
@@ -336,8 +336,8 @@ void JitShader::Compile_EvaluateCondition(Instruction instr) {
 }
 
 void JitShader::Compile_UniformCondition(Instruction instr) {
-    int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool));
-    CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
+    int offset = ShaderSetup::UniformOffset(RegisterType::BoolUniform, instr.flow_control.bool_uniform_id);
+    CMP(sizeof(bool) * 8, MDisp(SETUP, offset), Imm8(0));
 }
 
 BitSet32 JitShader::PersistentCallerSavedRegs() {
@@ -714,8 +714,8 @@ void JitShader::Compile_LOOP(Instruction instr) {
 
     looping = true;
 
-    int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>));
-    MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset));
+    int offset = ShaderSetup::UniformOffset(RegisterType::IntUniform, instr.flow_control.int_uniform_id);
+    MOV(32, R(LOOPCOUNT), MDisp(SETUP, offset));
     MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT));
     SHR(32, R(LOOPCOUNT_REG), Imm8(8));
     AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start
@@ -826,8 +826,8 @@ void JitShader::Compile() {
     // The stack pointer is 8 modulo 16 at the entry of a procedure
     ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
 
-    MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1));
-    MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms));
+    MOV(PTRBITS, R(SETUP), R(ABI_PARAM1));
+    MOV(PTRBITS, R(STATE), R(ABI_PARAM2));
 
     // Zero address/loop  registers
     XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0));
@@ -845,7 +845,7 @@ void JitShader::Compile() {
     MOVAPS(NEGBIT, MatR(RAX));
 
     // Jump to start of the shader program
-    JMPptr(R(ABI_PARAM2));
+    JMPptr(R(ABI_PARAM3));
 
     // Compile entire program
     Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size()));
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index 30aa7ff30..5468459d4 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -36,8 +36,8 @@ class JitShader : public Gen::XCodeBlock {
 public:
     JitShader();
 
-    void Run(void* registers, unsigned offset) const {
-        program(registers, code_ptr[offset]);
+    void Run(const ShaderSetup& setup, UnitState<false>& state, unsigned offset) const {
+        program(&setup, &state, code_ptr[offset]);
     }
 
     void Compile();
@@ -117,7 +117,7 @@ private:
     /// Branches that need to be fixed up once the entire shader program is compiled
     std::vector<std::pair<Gen::FixupBranch, unsigned>> fixup_branches;
 
-    using CompiledShader = void(void* registers, const u8* start_addr);
+    using CompiledShader = void(const void* setup, void* state, const u8* start_addr);
     CompiledShader* program = nullptr;
 };
 
diff --git a/src/video_core/vertex_loader.cpp b/src/video_core/vertex_loader.cpp
index 18a7cf144..e40f0f1ee 100644
--- a/src/video_core/vertex_loader.cpp
+++ b/src/video_core/vertex_loader.cpp
@@ -130,7 +130,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::I
                 input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
         } else if (vertex_attribute_is_default[i]) {
             // Load the default attribute if we're configured to do so
-            input.attr[i] = g_state.vs.default_attributes[i];
+            input.attr[i] = g_state.vs_default_attributes[i];
             LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
                 i, vertex, index,
                 input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),