20 files changed, 314 insertions, 153 deletions
diff --git a/src/audio_core/CMakeLists.txt b/src/audio_core/CMakeLists.txt
index 82e4850f7..c381dbe1d 100644
--- a/src/audio_core/CMakeLists.txt
+++ b/src/audio_core/CMakeLists.txt
@@ -17,6 +17,8 @@ add_library(audio_core STATIC
     sink_stream.h
     stream.cpp
     stream.h
+    time_stretch.cpp
+    time_stretch.h
 
     $<$<BOOL:${ENABLE_CUBEB}>:cubeb_sink.cpp cubeb_sink.h>
 )
@@ -24,6 +26,7 @@ add_library(audio_core STATIC
 create_target_directory_groups(audio_core)
 
 target_link_libraries(audio_core PUBLIC common core)
+target_link_libraries(audio_core PRIVATE SoundTouch)
 
 if(ENABLE_CUBEB)
     target_link_libraries(audio_core PRIVATE cubeb)
diff --git a/src/audio_core/algorithm/filter.cpp b/src/audio_core/algorithm/filter.cpp
index 9fcd0614d..f65bf64f7 100644
--- a/src/audio_core/algorithm/filter.cpp
+++ b/src/audio_core/algorithm/filter.cpp
@@ -35,12 +35,12 @@ Filter::Filter(double a0, double a1, double a2, double b0, double b1, double b2)
     : a1(a1 / a0), a2(a2 / a0), b0(b0 / a0), b1(b1 / a0), b2(b2 / a0) {}
 
 void Filter::Process(std::vector<s16>& signal) {
-    const size_t num_frames = signal.size() / 2;
-    for (size_t i = 0; i < num_frames; i++) {
+    const std::size_t num_frames = signal.size() / 2;
+    for (std::size_t i = 0; i < num_frames; i++) {
         std::rotate(in.begin(), in.end() - 1, in.end());
         std::rotate(out.begin(), out.end() - 1, out.end());
 
-        for (size_t ch = 0; ch < channel_count; ch++) {
+        for (std::size_t ch = 0; ch < channel_count; ch++) {
             in[0][ch] = signal[i * channel_count + ch];
 
             out[0][ch] = b0 * in[0][ch] + b1 * in[1][ch] + b2 * in[2][ch] - a1 * out[1][ch] -
@@ -54,14 +54,14 @@ void Filter::Process(std::vector<s16>& signal) {
 /// Calculates the appropriate Q for each biquad in a cascading filter.
 /// @param total_count The total number of biquads to be cascaded.
 /// @param index 0-index of the biquad to calculate the Q value for.
-static double CascadingBiquadQ(size_t total_count, size_t index) {
+static double CascadingBiquadQ(std::size_t total_count, std::size_t index) {
     const double pole = M_PI * (2 * index + 1) / (4.0 * total_count);
     return 1.0 / (2.0 * std::cos(pole));
 }
 
-CascadingFilter CascadingFilter::LowPass(double cutoff, size_t cascade_size) {
+CascadingFilter CascadingFilter::LowPass(double cutoff, std::size_t cascade_size) {
     std::vector<Filter> cascade(cascade_size);
-    for (size_t i = 0; i < cascade_size; i++) {
+    for (std::size_t i = 0; i < cascade_size; i++) {
         cascade[i] = Filter::LowPass(cutoff, CascadingBiquadQ(cascade_size, i));
     }
     return CascadingFilter{std::move(cascade)};
diff --git a/src/audio_core/algorithm/filter.h b/src/audio_core/algorithm/filter.h
index a41beef98..3546d149b 100644
--- a/src/audio_core/algorithm/filter.h
+++ b/src/audio_core/algorithm/filter.h
@@ -30,7 +30,7 @@ public:
     void Process(std::vector<s16>& signal);
 
 private:
-    static constexpr size_t channel_count = 2;
+    static constexpr std::size_t channel_count = 2;
 
     /// Coefficients are in normalized form (a0 = 1.0).
     double a1, a2, b0, b1, b2;
@@ -46,7 +46,7 @@ public:
     /// Creates a cascading low-pass filter.
     /// @param cutoff Determines the cutoff frequency. A value from 0.0 to 1.0.
     /// @param cascade_size Number of biquads in cascade.
-    static CascadingFilter LowPass(double cutoff, size_t cascade_size);
+    static CascadingFilter LowPass(double cutoff, std::size_t cascade_size);
 
     /// Passthrough.
     CascadingFilter();
diff --git a/src/audio_core/algorithm/interpolate.cpp b/src/audio_core/algorithm/interpolate.cpp
index 11459821f..3aea9b0f2 100644
--- a/src/audio_core/algorithm/interpolate.cpp
+++ b/src/audio_core/algorithm/interpolate.cpp
@@ -14,7 +14,7 @@
 namespace AudioCore {
 
 /// The Lanczos kernel
-static double Lanczos(size_t a, double x) {
+static double Lanczos(std::size_t a, double x) {
     if (x == 0.0)
         return 1.0;
     const double px = M_PI * x;
@@ -37,15 +37,15 @@ std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input,
     }
     state.nyquist.Process(input);
 
-    constexpr size_t taps = InterpolationState::lanczos_taps;
-    const size_t num_frames = input.size() / 2;
+    constexpr std::size_t taps = InterpolationState::lanczos_taps;
+    const std::size_t num_frames = input.size() / 2;
 
     std::vector<s16> output;
-    output.reserve(static_cast<size_t>(input.size() / ratio + 4));
+    output.reserve(static_cast<std::size_t>(input.size() / ratio + 4));
 
     double& pos = state.position;
     auto& h = state.history;
-    for (size_t i = 0; i < num_frames; ++i) {
+    for (std::size_t i = 0; i < num_frames; ++i) {
         std::rotate(h.begin(), h.end() - 1, h.end());
         h[0][0] = input[i * 2 + 0];
         h[0][1] = input[i * 2 + 1];
@@ -53,7 +53,7 @@ std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input,
         while (pos <= 1.0) {
             double l = 0.0;
             double r = 0.0;
-            for (size_t j = 0; j < h.size(); j++) {
+            for (std::size_t j = 0; j < h.size(); j++) {
                 l += Lanczos(taps, pos + j - taps + 1) * h[j][0];
                 r += Lanczos(taps, pos + j - taps + 1) * h[j][1];
             }
diff --git a/src/audio_core/algorithm/interpolate.h b/src/audio_core/algorithm/interpolate.h
index c79c2eef4..edbd6460f 100644
--- a/src/audio_core/algorithm/interpolate.h
+++ b/src/audio_core/algorithm/interpolate.h
@@ -12,8 +12,8 @@
 namespace AudioCore {
 
 struct InterpolationState {
-    static constexpr size_t lanczos_taps = 4;
-    static constexpr size_t history_size = lanczos_taps * 2 - 1;
+    static constexpr std::size_t lanczos_taps = 4;
+    static constexpr std::size_t history_size = lanczos_taps * 2 - 1;
 
     double current_ratio = 0.0;
     CascadingFilter nyquist;
diff --git a/src/audio_core/audio_out.cpp b/src/audio_core/audio_out.cpp
index 12632a95c..0c8f5b18e 100644
--- a/src/audio_core/audio_out.cpp
+++ b/src/audio_core/audio_out.cpp
@@ -39,7 +39,8 @@ StreamPtr AudioOut::OpenStream(u32 sample_rate, u32 num_channels, std::string&&
         sink->AcquireSinkStream(sample_rate, num_channels, name), std::move(name));
 }
 
-std::vector<Buffer::Tag> AudioOut::GetTagsAndReleaseBuffers(StreamPtr stream, size_t max_count) {
+std::vector<Buffer::Tag> AudioOut::GetTagsAndReleaseBuffers(StreamPtr stream,
+                                                            std::size_t max_count) {
     return stream->GetTagsAndReleaseBuffers(max_count);
 }
 
diff --git a/src/audio_core/audio_out.h b/src/audio_core/audio_out.h
index 39b7e656b..df9607ac7 100644
--- a/src/audio_core/audio_out.h
+++ b/src/audio_core/audio_out.h
@@ -25,7 +25,7 @@ public:
                          Stream::ReleaseCallback&& release_callback);
 
     /// Returns a vector of recently released buffers specified by tag for the specified stream
-    std::vector<Buffer::Tag> GetTagsAndReleaseBuffers(StreamPtr stream, size_t max_count);
+    std::vector<Buffer::Tag> GetTagsAndReleaseBuffers(StreamPtr stream, std::size_t max_count);
 
     /// Starts an audio stream for playback
     void StartStream(StreamPtr stream);
diff --git a/src/audio_core/audio_renderer.cpp b/src/audio_core/audio_renderer.cpp
index 397b107f5..83b75e61f 100644
--- a/src/audio_core/audio_renderer.cpp
+++ b/src/audio_core/audio_renderer.cpp
@@ -3,9 +3,12 @@
 // Refer to the license.txt file included.
 
 #include "audio_core/algorithm/interpolate.h"
+#include "audio_core/audio_out.h"
 #include "audio_core/audio_renderer.h"
+#include "audio_core/codec.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "core/hle/kernel/event.h"
 #include "core/memory.h"
 
 namespace AudioCore {
@@ -13,20 +16,57 @@ namespace AudioCore {
 constexpr u32 STREAM_SAMPLE_RATE{48000};
 constexpr u32 STREAM_NUM_CHANNELS{2};
 
+class AudioRenderer::VoiceState { // Defined here; audio_renderer.h only forward-declares it
+public:
+    bool IsPlaying() const { // True only for an in-use voice whose play state is Started
+        return is_in_use && info.play_state == PlayState::Started;
+    }
+
+    const VoiceOutStatus& GetOutStatus() const { // Read-only view, copied into the response
+        return out_status;
+    }
+
+    const VoiceInfo& GetInfo() const {
+        return info;
+    }
+
+    VoiceInfo& Info() { // Mutable access; UpdateAudioRenderer memcpy's input data into it
+        return info;
+    }
+
+    void SetWaveIndex(std::size_t index); // Stores index & 3 and marks a refresh as pending
+    std::vector<s16> DequeueSamples(std::size_t sample_count);
+    void UpdateState();
+    void RefreshBuffer();
+
+private:
+    bool is_in_use{};
+    bool is_refresh_pending{}; // Set by SetWaveIndex
+    std::size_t wave_index{};  // SetWaveIndex masks the stored value with 3
+    std::size_t offset{};      // Start position in `samples` for DequeueSamples
+    Codec::ADPCMState adpcm_state{};
+    InterpolationState interp_state{};
+    std::vector<s16> samples; // RefreshBuffer duplicates 1-channel data into 2 channels here
+    VoiceOutStatus out_status{};
+    VoiceInfo info{};
+};
+
 AudioRenderer::AudioRenderer(AudioRendererParameter params,
                              Kernel::SharedPtr<Kernel::Event> buffer_event)
     : worker_params{params}, buffer_event{buffer_event}, voices(params.voice_count) {
 
-    audio_core = std::make_unique<AudioCore::AudioOut>();
-    stream = audio_core->OpenStream(STREAM_SAMPLE_RATE, STREAM_NUM_CHANNELS, "AudioRenderer",
-                                    [=]() { buffer_event->Signal(); });
-    audio_core->StartStream(stream);
+    audio_out = std::make_unique<AudioCore::AudioOut>();
+    stream = audio_out->OpenStream(STREAM_SAMPLE_RATE, STREAM_NUM_CHANNELS, "AudioRenderer",
+                                   [=]() { buffer_event->Signal(); });
+    audio_out->StartStream(stream);
 
     QueueMixedBuffer(0);
     QueueMixedBuffer(1);
     QueueMixedBuffer(2);
 }
 
+AudioRenderer::~AudioRenderer() = default;
+
 u32 AudioRenderer::GetSampleRate() const {
     return worker_params.sample_rate;
 }
@@ -52,8 +92,8 @@ std::vector<u8> AudioRenderer::UpdateAudioRenderer(const std::vector<u8>& input_
                 memory_pool_count * sizeof(MemoryPoolInfo));
 
     // Copy VoiceInfo structs
-    size_t offset{sizeof(UpdateDataHeader) + config.behavior_size + config.memory_pools_size +
-                  config.voice_resource_size};
+    std::size_t offset{sizeof(UpdateDataHeader) + config.behavior_size + config.memory_pools_size +
+                       config.voice_resource_size};
     for (auto& voice : voices) {
         std::memcpy(&voice.Info(), input_params.data() + offset, sizeof(VoiceInfo));
         offset += sizeof(VoiceInfo);
@@ -72,7 +112,7 @@ std::vector<u8> AudioRenderer::UpdateAudioRenderer(const std::vector<u8>& input_
 
     // Update memory pool state
     std::vector<MemoryPoolEntry> memory_pool(memory_pool_count);
-    for (size_t index = 0; index < memory_pool.size(); ++index) {
+    for (std::size_t index = 0; index < memory_pool.size(); ++index) {
         if (mem_pool_info[index].pool_state == MemoryPoolStates::RequestAttach) {
             memory_pool[index].state = MemoryPoolStates::Attached;
         } else if (mem_pool_info[index].pool_state == MemoryPoolStates::RequestDetach) {
@@ -93,7 +133,7 @@ std::vector<u8> AudioRenderer::UpdateAudioRenderer(const std::vector<u8>& input_
                 response_data.memory_pools_size);
 
     // Copy output voice status
-    size_t voice_out_status_offset{sizeof(UpdateDataHeader) + response_data.memory_pools_size};
+    std::size_t voice_out_status_offset{sizeof(UpdateDataHeader) + response_data.memory_pools_size};
     for (const auto& voice : voices) {
         std::memcpy(output_params.data() + voice_out_status_offset, &voice.GetOutStatus(),
                     sizeof(VoiceOutStatus));
@@ -103,12 +143,12 @@ std::vector<u8> AudioRenderer::UpdateAudioRenderer(const std::vector<u8>& input_
     return output_params;
 }
 
-void AudioRenderer::VoiceState::SetWaveIndex(size_t index) {
+void AudioRenderer::VoiceState::SetWaveIndex(std::size_t index) {
     wave_index = index & 3;
     is_refresh_pending = true;
 }
 
-std::vector<s16> AudioRenderer::VoiceState::DequeueSamples(size_t sample_count) {
+std::vector<s16> AudioRenderer::VoiceState::DequeueSamples(std::size_t sample_count) {
     if (!IsPlaying()) {
         return {};
     }
@@ -117,9 +157,9 @@ std::vector<s16> AudioRenderer::VoiceState::DequeueSamples(size_t sample_count)
         RefreshBuffer();
     }
 
-    const size_t max_size{samples.size() - offset};
-    const size_t dequeue_offset{offset};
-    size_t size{sample_count * STREAM_NUM_CHANNELS};
+    const std::size_t max_size{samples.size() - offset};
+    const std::size_t dequeue_offset{offset};
+    std::size_t size{sample_count * STREAM_NUM_CHANNELS};
     if (size > max_size) {
         size = max_size;
     }
@@ -184,7 +224,7 @@ void AudioRenderer::VoiceState::RefreshBuffer() {
     case 1:
         // 1 channel is upsampled to 2 channel
         samples.resize(new_samples.size() * 2);
-        for (size_t index = 0; index < new_samples.size(); ++index) {
+        for (std::size_t index = 0; index < new_samples.size(); ++index) {
             samples[index * 2] = new_samples[index];
             samples[index * 2 + 1] = new_samples[index];
         }
@@ -210,7 +250,7 @@ static constexpr s16 ClampToS16(s32 value) {
 }
 
 void AudioRenderer::QueueMixedBuffer(Buffer::Tag tag) {
-    constexpr size_t BUFFER_SIZE{512};
+    constexpr std::size_t BUFFER_SIZE{512};
     std::vector<s16> buffer(BUFFER_SIZE * stream->GetNumChannels());
 
     for (auto& voice : voices) {
@@ -218,7 +258,7 @@ void AudioRenderer::QueueMixedBuffer(Buffer::Tag tag) {
             continue;
         }
 
-        size_t offset{};
+        std::size_t offset{};
         s64 samples_remaining{BUFFER_SIZE};
         while (samples_remaining > 0) {
             const std::vector<s16> samples{voice.DequeueSamples(samples_remaining)};
@@ -236,11 +276,11 @@ void AudioRenderer::QueueMixedBuffer(Buffer::Tag tag) {
             }
         }
     }
-    audio_core->QueueBuffer(stream, tag, std::move(buffer));
+    audio_out->QueueBuffer(stream, tag, std::move(buffer));
 }
 
 void AudioRenderer::ReleaseAndQueueBuffers() {
-    const auto released_buffers{audio_core->GetTagsAndReleaseBuffers(stream, 2)};
+    const auto released_buffers{audio_out->GetTagsAndReleaseBuffers(stream, 2)};
     for (const auto& tag : released_buffers) {
         QueueMixedBuffer(tag);
     }
diff --git a/src/audio_core/audio_renderer.h b/src/audio_core/audio_renderer.h
index eba67f28e..2c4f5ab75 100644
--- a/src/audio_core/audio_renderer.h
+++ b/src/audio_core/audio_renderer.h
@@ -8,16 +8,20 @@
 #include <memory>
 #include <vector>
 
-#include "audio_core/algorithm/interpolate.h"
-#include "audio_core/audio_out.h"
-#include "audio_core/codec.h"
 #include "audio_core/stream.h"
+#include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/swap.h"
-#include "core/hle/kernel/event.h"
+#include "core/hle/kernel/object.h"
+
+namespace Kernel {
+class Event;
+}
 
 namespace AudioCore {
 
+class AudioOut;
+
 enum class PlayState : u8 {
     Started = 0,
     Stopped = 1,
@@ -158,6 +162,8 @@ static_assert(sizeof(UpdateDataHeader) == 0x40, "UpdateDataHeader has wrong size
 class AudioRenderer {
 public:
     AudioRenderer(AudioRendererParameter params, Kernel::SharedPtr<Kernel::Event> buffer_event);
+    ~AudioRenderer();
+
     std::vector<u8> UpdateAudioRenderer(const std::vector<u8>& input_params);
     void QueueMixedBuffer(Buffer::Tag tag);
     void ReleaseAndQueueBuffers();
@@ -166,45 +172,12 @@ public:
     u32 GetMixBufferCount() const;
 
 private:
-    class VoiceState {
-    public:
-        bool IsPlaying() const {
-            return is_in_use && info.play_state == PlayState::Started;
-        }
-
-        const VoiceOutStatus& GetOutStatus() const {
-            return out_status;
-        }
-
-        const VoiceInfo& GetInfo() const {
-            return info;
-        }
-
-        VoiceInfo& Info() {
-            return info;
-        }
-
-        void SetWaveIndex(size_t index);
-        std::vector<s16> DequeueSamples(size_t sample_count);
-        void UpdateState();
-        void RefreshBuffer();
-
-    private:
-        bool is_in_use{};
-        bool is_refresh_pending{};
-        size_t wave_index{};
-        size_t offset{};
-        Codec::ADPCMState adpcm_state{};
-        InterpolationState interp_state{};
-        std::vector<s16> samples;
-        VoiceOutStatus out_status{};
-        VoiceInfo info{};
-    };
+    class VoiceState;
 
     AudioRendererParameter worker_params;
     Kernel::SharedPtr<Kernel::Event> buffer_event;
     std::vector<VoiceState> voices;
-    std::unique_ptr<AudioCore::AudioOut> audio_core;
+    std::unique_ptr<AudioOut> audio_out;
     AudioCore::StreamPtr stream;
 };
 
diff --git a/src/audio_core/codec.cpp b/src/audio_core/codec.cpp
index c3021403f..454de798b 100644
--- a/src/audio_core/codec.cpp
+++ b/src/audio_core/codec.cpp
@@ -8,27 +8,27 @@
 
 namespace AudioCore::Codec {
 
-std::vector<s16> DecodeADPCM(const u8* const data, size_t size, const ADPCM_Coeff& coeff,
+std::vector<s16> DecodeADPCM(const u8* const data, std::size_t size, const ADPCM_Coeff& coeff,
                              ADPCMState& state) {
     // GC-ADPCM with scale factor and variable coefficients.
     // Frames are 8 bytes long containing 14 samples each.
     // Samples are 4 bits (one nibble) long.
 
-    constexpr size_t FRAME_LEN = 8;
-    constexpr size_t SAMPLES_PER_FRAME = 14;
+    constexpr std::size_t FRAME_LEN = 8;
+    constexpr std::size_t SAMPLES_PER_FRAME = 14;
     constexpr std::array<int, 16> SIGNED_NIBBLES = {
         {0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1}};
 
-    const size_t sample_count = (size / FRAME_LEN) * SAMPLES_PER_FRAME;
-    const size_t ret_size =
+    const std::size_t sample_count = (size / FRAME_LEN) * SAMPLES_PER_FRAME;
+    const std::size_t ret_size =
         sample_count % 2 == 0 ? sample_count : sample_count + 1; // Ensure multiple of two.
     std::vector<s16> ret(ret_size);
 
     int yn1 = state.yn1, yn2 = state.yn2;
 
-    const size_t NUM_FRAMES =
+    const std::size_t NUM_FRAMES =
         (sample_count + (SAMPLES_PER_FRAME - 1)) / SAMPLES_PER_FRAME; // Round up.
-    for (size_t framei = 0; framei < NUM_FRAMES; framei++) {
+    for (std::size_t framei = 0; framei < NUM_FRAMES; framei++) {
         const int frame_header = data[framei * FRAME_LEN];
         const int scale = 1 << (frame_header & 0xF);
         const int idx = (frame_header >> 4) & 0x7;
@@ -53,9 +53,9 @@ std::vector<s16> DecodeADPCM(const u8* const data, size_t size, const ADPCM_Coef
             return static_cast<s16>(val);
         };
 
-        size_t outputi = framei * SAMPLES_PER_FRAME;
-        size_t datai = framei * FRAME_LEN + 1;
-        for (size_t i = 0; i < SAMPLES_PER_FRAME && outputi < sample_count; i += 2) {
+        std::size_t outputi = framei * SAMPLES_PER_FRAME;
+        std::size_t datai = framei * FRAME_LEN + 1;
+        for (std::size_t i = 0; i < SAMPLES_PER_FRAME && outputi < sample_count; i += 2) {
             const s16 sample1 = decode_sample(SIGNED_NIBBLES[data[datai] >> 4]);
             ret[outputi] = sample1;
             outputi++;
diff --git a/src/audio_core/codec.h b/src/audio_core/codec.h
index 3f845c42c..ef2ce01a8 100644
--- a/src/audio_core/codec.h
+++ b/src/audio_core/codec.h
@@ -38,7 +38,7 @@ using ADPCM_Coeff = std::array<s16, 16>;
  * @param state ADPCM state, this is updated with new state
  * @return Decoded stereo signed PCM16 data, sample_count in length
  */
-std::vector<s16> DecodeADPCM(const u8* const data, size_t size, const ADPCM_Coeff& coeff,
+std::vector<s16> DecodeADPCM(const u8* const data, std::size_t size, const ADPCM_Coeff& coeff,
                              ADPCMState& state);
 
 }; // namespace AudioCore::Codec
diff --git a/src/audio_core/cubeb_sink.cpp b/src/audio_core/cubeb_sink.cpp
index 5a1177d0c..392039688 100644
--- a/src/audio_core/cubeb_sink.cpp
+++ b/src/audio_core/cubeb_sink.cpp
@@ -3,27 +3,23 @@
 // Refer to the license.txt file included.
 
 #include <algorithm>
+#include <atomic>
 #include <cstring>
-#include <mutex>
-
 #include "audio_core/cubeb_sink.h"
 #include "audio_core/stream.h"
+#include "audio_core/time_stretch.h"
 #include "common/logging/log.h"
+#include "common/ring_buffer.h"
+#include "core/settings.h"
 
 namespace AudioCore {
 
-class SinkStreamImpl final : public SinkStream {
+class CubebSinkStream final : public SinkStream {
 public:
-    SinkStreamImpl(cubeb* ctx, u32 sample_rate, u32 num_channels_, cubeb_devid output_device,
-                   const std::string& name)
-        : ctx{ctx}, num_channels{num_channels_} {
-
-        if (num_channels == 6) {
-            // 6-channel audio does not seem to work with cubeb + SDL, so we downsample this to 2
-            // channel for now
-            is_6_channel = true;
-            num_channels = 2;
-        }
+    CubebSinkStream(cubeb* ctx, u32 sample_rate, u32 num_channels_, cubeb_devid output_device,
+                    const std::string& name)
+        : ctx{ctx}, num_channels{std::min(num_channels_, 2u)}, time_stretch{sample_rate,
+                                                                            num_channels} {
 
         cubeb_stream_params params{};
         params.rate = sample_rate;
@@ -38,7 +34,7 @@ public:
 
         if (cubeb_stream_init(ctx, &stream_backend, name.c_str(), nullptr, nullptr, output_device,
                               &params, std::max(512u, minimum_latency),
-                              &SinkStreamImpl::DataCallback, &SinkStreamImpl::StateCallback,
+                              &CubebSinkStream::DataCallback, &CubebSinkStream::StateCallback,
                               this) != CUBEB_OK) {
             LOG_CRITICAL(Audio_Sink, "Error initializing cubeb stream");
             return;
@@ -50,7 +46,7 @@ public:
         }
     }
 
-    ~SinkStreamImpl() {
+    ~CubebSinkStream() {
         if (!ctx) {
             return;
         }
@@ -62,27 +58,32 @@ public:
         cubeb_stream_destroy(stream_backend);
     }
 
-    void EnqueueSamples(u32 num_channels, const std::vector<s16>& samples) override {
-        if (!ctx) {
+    void EnqueueSamples(u32 source_num_channels, const std::vector<s16>& samples) override {
+        if (source_num_channels > num_channels) {
+            // Downmix, e.g. 6 -> 2: keep only the first num_channels samples of each frame
+            std::vector<s16> buf;
+            buf.reserve(samples.size() * num_channels / source_num_channels);
+            for (std::size_t i = 0; i < samples.size(); i += source_num_channels) {
+                for (std::size_t ch = 0; ch < num_channels; ch++) {
+                    buf.push_back(samples[i + ch]);
+                }
+            }
+            queue.Push(buf);
             return;
         }
 
-        std::lock_guard lock{queue_mutex};
+        queue.Push(samples);
+    }
 
-        queue.reserve(queue.size() + samples.size() * GetNumChannels());
+    std::size_t SamplesInQueue(u32 num_channels) const override {
+        if (!ctx)
+            return 0;
 
-        if (is_6_channel) {
-            // Downsample 6 channels to 2
-            const size_t sample_count_copy_size = samples.size() * 2;
-            queue.reserve(sample_count_copy_size);
-            for (size_t i = 0; i < samples.size(); i += num_channels) {
-                queue.push_back(samples[i]);
-                queue.push_back(samples[i + 1]);
-            }
-        } else {
-            // Copy as-is
-            std::copy(samples.begin(), samples.end(), std::back_inserter(queue));
-        }
+        return queue.Size() / num_channels;
+    }
+
+    void Flush() override {
+        should_flush = true;
     }
 
     u32 GetNumChannels() const {
@@ -95,10 +96,11 @@ private:
     cubeb* ctx{};
     cubeb_stream* stream_backend{};
     u32 num_channels{};
-    bool is_6_channel{};
 
-    std::mutex queue_mutex;
-    std::vector<s16> queue;
+    Common::RingBuffer<s16, 0x10000> queue;
+    std::array<s16, 2> last_frame{}; // Zeroed so padding before any output is silence
+    std::atomic<bool> should_flush{};
+    TimeStretcher time_stretch;
 
     static long DataCallback(cubeb_stream* stream, void* user_data, const void* input_buffer,
                              void* output_buffer, long num_frames);
@@ -117,10 +119,10 @@ CubebSink::CubebSink(std::string target_device_name) {
             LOG_WARNING(Audio_Sink, "Audio output device enumeration not supported");
         } else {
             const auto collection_end{collection.device + collection.count};
-            const auto device{std::find_if(collection.device, collection_end,
-                                           [&](const cubeb_device_info& device) {
-                                               return target_device_name == device.friendly_name;
-                                           })};
+            const auto device{
+                std::find_if(collection.device, collection_end, [&](const cubeb_device_info& info) {
+                    return target_device_name == info.friendly_name;
+                })};
             if (device != collection_end) {
                 output_device = device->devid;
             }
@@ -144,44 +146,59 @@ CubebSink::~CubebSink() {
 SinkStream& CubebSink::AcquireSinkStream(u32 sample_rate, u32 num_channels,
                                          const std::string& name) {
     sink_streams.push_back(
-        std::make_unique<SinkStreamImpl>(ctx, sample_rate, num_channels, output_device, name));
+        std::make_unique<CubebSinkStream>(ctx, sample_rate, num_channels, output_device, name));
     return *sink_streams.back();
 }
 
-long SinkStreamImpl::DataCallback(cubeb_stream* stream, void* user_data, const void* input_buffer,
-                                  void* output_buffer, long num_frames) {
-    SinkStreamImpl* impl = static_cast<SinkStreamImpl*>(user_data);
+long CubebSinkStream::DataCallback(cubeb_stream* stream, void* user_data, const void* input_buffer,
+                                   void* output_buffer, long num_frames) {
+    CubebSinkStream* impl = static_cast<CubebSinkStream*>(user_data);
     u8* buffer = reinterpret_cast<u8*>(output_buffer);
 
     if (!impl) {
         return {};
     }
 
-    std::lock_guard lock{impl->queue_mutex};
-
-    const size_t frames_to_write{
-        std::min(impl->queue.size() / impl->GetNumChannels(), static_cast<size_t>(num_frames))};
+    const std::size_t num_channels = impl->GetNumChannels();
+    const std::size_t samples_to_write = num_channels * num_frames;
+    std::size_t samples_written;
+
+    if (Settings::values.enable_audio_stretching) {
+        const std::vector<s16> in{impl->queue.Pop()};
+        const std::size_t num_in{in.size() / num_channels};
+        s16* const out{reinterpret_cast<s16*>(buffer)};
+        const std::size_t out_frames =
+            impl->time_stretch.Process(in.data(), num_in, out, num_frames);
+        samples_written = out_frames * num_channels;
+
+        if (impl->should_flush) {
+            impl->time_stretch.Flush();
+            impl->should_flush = false;
+        }
+    } else {
+        samples_written = impl->queue.Pop(buffer, samples_to_write);
+    }
 
-    memcpy(buffer, impl->queue.data(), frames_to_write * sizeof(s16) * impl->GetNumChannels());
-    impl->queue.erase(impl->queue.begin(),
-                      impl->queue.begin() + frames_to_write * impl->GetNumChannels());
+    if (samples_written >= num_channels) { // Remember the final frame that was written
+        std::memcpy(&impl->last_frame[0], buffer + (samples_written - num_channels) * sizeof(s16),
+                    num_channels * sizeof(s16));
+    }
 
-    if (frames_to_write < num_frames) {
-        // Fill the rest of the frames with silence
-        memset(buffer + frames_to_write * sizeof(s16) * impl->GetNumChannels(), 0,
-               (num_frames - frames_to_write) * sizeof(s16) * impl->GetNumChannels());
+    // Pad any shortfall by repeating last_frame so the callback always delivers num_frames
+    for (std::size_t i = samples_written; i < samples_to_write; i += num_channels) {
+        std::memcpy(buffer + i * sizeof(s16), &impl->last_frame[0], num_channels * sizeof(s16));
     }
 
     return num_frames;
 }
 
-void SinkStreamImpl::StateCallback(cubeb_stream* stream, void* user_data, cubeb_state state) {}
+void CubebSinkStream::StateCallback(cubeb_stream* stream, void* user_data, cubeb_state state) {}
 
 std::vector<std::string> ListCubebSinkDevices() {
     std::vector<std::string> device_list;
     cubeb* ctx;
 
-    if (cubeb_init(&ctx, "Citra Device Enumerator", nullptr) != CUBEB_OK) {
+    if (cubeb_init(&ctx, "yuzu Device Enumerator", nullptr) != CUBEB_OK) {
         LOG_CRITICAL(Audio_Sink, "cubeb_init failed");
         return {};
     }
@@ -190,7 +207,7 @@ std::vector<std::string> ListCubebSinkDevices() {
     if (cubeb_enumerate_devices(ctx, CUBEB_DEVICE_TYPE_OUTPUT, &collection) != CUBEB_OK) {
         LOG_WARNING(Audio_Sink, "Audio output device enumeration not supported");
     } else {
-        for (size_t i = 0; i < collection.count; i++) {
+        for (std::size_t i = 0; i < collection.count; i++) {
             const cubeb_device_info& device = collection.device[i];
             if (device.friendly_name) {
                 device_list.emplace_back(device.friendly_name);
diff --git a/src/audio_core/null_sink.h b/src/audio_core/null_sink.h
index f235d93e5..a78d78893 100644
--- a/src/audio_core/null_sink.h
+++ b/src/audio_core/null_sink.h
@@ -21,6 +21,12 @@ public:
 private:
     struct NullSinkStreamImpl final : SinkStream {
         void EnqueueSamples(u32 /*num_channels*/, const std::vector<s16>& /*samples*/) override {}
+
+        std::size_t SamplesInQueue(u32 /*num_channels*/) const override {
+            return 0;
+        }
+
+        void Flush() override {}
     } null_sink_stream;
 };
 
diff --git a/src/audio_core/sink_details.cpp b/src/audio_core/sink_details.cpp
index 955ba20fb..67cf1f3b2 100644
--- a/src/audio_core/sink_details.cpp
+++ b/src/audio_core/sink_details.cpp
@@ -24,7 +24,7 @@ const std::vector<SinkDetails> g_sink_details = {
                 [] { return std::vector<std::string>{"null"}; }},
 };
 
-const SinkDetails& GetSinkDetails(std::string sink_id) {
+const SinkDetails& GetSinkDetails(std::string_view sink_id) {
     auto iter =
         std::find_if(g_sink_details.begin(), g_sink_details.end(),
                      [sink_id](const auto& sink_detail) { return sink_detail.id == sink_id; });
diff --git a/src/audio_core/sink_details.h b/src/audio_core/sink_details.h
index ea666c554..03534b187 100644
--- a/src/audio_core/sink_details.h
+++ b/src/audio_core/sink_details.h
@@ -6,6 +6,8 @@
 
 #include <functional>
 #include <memory>
+#include <string>
+#include <string_view>
 #include <utility>
 #include <vector>
 
@@ -30,6 +32,6 @@ struct SinkDetails {
 
 extern const std::vector<SinkDetails> g_sink_details;
 
-const SinkDetails& GetSinkDetails(std::string sink_id);
+const SinkDetails& GetSinkDetails(std::string_view sink_id);
 
 } // namespace AudioCore
diff --git a/src/audio_core/sink_stream.h b/src/audio_core/sink_stream.h
index 41b6736d8..4309ad094 100644
--- a/src/audio_core/sink_stream.h
+++ b/src/audio_core/sink_stream.h
@@ -25,6 +25,10 @@ public:
      * @param samples Samples in interleaved stereo PCM16 format.
      */
     virtual void EnqueueSamples(u32 num_channels, const std::vector<s16>& samples) = 0;
+
+    virtual std::size_t SamplesInQueue(u32 num_channels) const = 0;
+
+    virtual void Flush() = 0;
 };
 
 using SinkStreamPtr = std::unique_ptr<SinkStream>;
diff --git a/src/audio_core/stream.cpp b/src/audio_core/stream.cpp
index ad9e2915c..449db2416 100644
--- a/src/audio_core/stream.cpp
+++ b/src/audio_core/stream.cpp
@@ -7,16 +7,18 @@
 
 #include "audio_core/sink.h"
 #include "audio_core/sink_details.h"
+#include "audio_core/sink_stream.h"
 #include "audio_core/stream.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
 #include "core/settings.h"
 
 namespace AudioCore {
 
-constexpr size_t MaxAudioBufferCount{32};
+constexpr std::size_t MaxAudioBufferCount{32};
 
 u32 Stream::GetNumChannels() const {
     switch (format) {
@@ -51,7 +53,7 @@ void Stream::Stop() {
 }
 
 s64 Stream::GetBufferReleaseCycles(const Buffer& buffer) const {
-    const size_t num_samples{buffer.GetSamples().size() / GetNumChannels()};
+    const std::size_t num_samples{buffer.GetSamples().size() / GetNumChannels()};
     return CoreTiming::usToCycles((static_cast<u64>(num_samples) * 1000000) / sample_rate);
 }
 
@@ -72,6 +74,7 @@ static void VolumeAdjustSamples(std::vector<s16>& samples) {
 void Stream::PlayNextBuffer() {
     if (!IsPlaying()) {
         // Ensure we are in playing state before playing the next buffer
+        sink_stream.Flush();
         return;
     }
 
@@ -82,6 +85,7 @@ void Stream::PlayNextBuffer() {
 
     if (queued_buffers.empty()) {
         // No queued buffers - we are effectively paused
+        sink_stream.Flush();
         return;
     }
 
@@ -89,12 +93,16 @@ void Stream::PlayNextBuffer() {
     queued_buffers.pop();
 
     VolumeAdjustSamples(active_buffer->Samples());
+
     sink_stream.EnqueueSamples(GetNumChannels(), active_buffer->GetSamples());
 
     CoreTiming::ScheduleEventThreadsafe(GetBufferReleaseCycles(*active_buffer), release_event, {});
 }
 
+MICROPROFILE_DEFINE(AudioOutput, "Audio", "ReleaseActiveBuffer", MP_RGB(100, 100, 255));
+
 void Stream::ReleaseActiveBuffer() {
+    MICROPROFILE_SCOPE(AudioOutput);
     ASSERT(active_buffer);
     released_buffers.push(std::move(active_buffer));
     release_callback();
@@ -115,9 +123,9 @@ bool Stream::ContainsBuffer(Buffer::Tag tag) const {
     return {};
 }
 
-std::vector<Buffer::Tag> Stream::GetTagsAndReleaseBuffers(size_t max_count) {
+std::vector<Buffer::Tag> Stream::GetTagsAndReleaseBuffers(std::size_t max_count) {
     std::vector<Buffer::Tag> tags;
-    for (size_t count = 0; count < max_count && !released_buffers.empty(); ++count) {
+    for (std::size_t count = 0; count < max_count && !released_buffers.empty(); ++count) {
         tags.push_back(released_buffers.front()->GetTag());
         released_buffers.pop();
     }
diff --git a/src/audio_core/stream.h b/src/audio_core/stream.h
index 049b92ca9..27db1112f 100644
--- a/src/audio_core/stream.h
+++ b/src/audio_core/stream.h
@@ -11,13 +11,16 @@
 #include <queue>
 
 #include "audio_core/buffer.h"
-#include "audio_core/sink_stream.h"
-#include "common/assert.h"
 #include "common/common_types.h"
-#include "core/core_timing.h"
+
+namespace CoreTiming {
+struct EventType;
+}
 
 namespace AudioCore {
 
+class SinkStream;
+
 /**
  * Represents an audio stream, which is a sequence of queued buffers, to be outputed by AudioOut
  */
@@ -49,7 +52,7 @@ public:
     bool ContainsBuffer(Buffer::Tag tag) const;
 
     /// Returns a vector of recently released buffers specified by tag
-    std::vector<Buffer::Tag> GetTagsAndReleaseBuffers(size_t max_count);
+    std::vector<Buffer::Tag> GetTagsAndReleaseBuffers(std::size_t max_count);
 
     /// Returns true if the stream is currently playing
     bool IsPlaying() const {
@@ -57,7 +60,7 @@ public:
     }
 
     /// Returns the number of queued buffers
-    size_t GetQueueSize() const {
+    std::size_t GetQueueSize() const {
         return queued_buffers.size();
     }
 
diff --git a/src/audio_core/time_stretch.cpp b/src/audio_core/time_stretch.cpp
new file mode 100644
index 000000000..fc14151da
--- /dev/null
+++ b/src/audio_core/time_stretch.cpp
@@ -0,0 +1,81 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include "audio_core/time_stretch.h"
+#include "common/logging/log.h"
+
+namespace AudioCore {
+
+/// Stores the sample rate and channel count and applies them to SoundTouch, with pitch and tempo
+/// both initialised to 1.0.
+TimeStretcher::TimeStretcher(u32 sample_rate, u32 channel_count)
+    : m_sample_rate(sample_rate), m_channel_count(channel_count) {
+    m_sound_touch.setChannels(channel_count);
+    m_sound_touch.setSampleRate(sample_rate);
+    m_sound_touch.setPitch(1.0);
+    m_sound_touch.setTempo(1.0);
+}
+
+/// Forwards to SoundTouch::clear().
+void TimeStretcher::Clear() {
+    m_sound_touch.clear();
+}
+
+/// Forwards to SoundTouch::flush().
+void TimeStretcher::Flush() {
+    m_sound_touch.flush();
+}
+
+/// Adjusts the SoundTouch tempo from the in/out frame counts and the current backlog, then
+/// queues `in` and reads up to num_out frames into `out`.
+/// @returns The frame count reported by SoundTouch::receiveSamples().
+std::size_t TimeStretcher::Process(const s16* in, std::size_t num_in, s16* out,
+                                   std::size_t num_out) {
+    const double time_delta = static_cast<double>(num_out) / m_sample_rate; // seconds
+
+    // We were given num_in frames, and num_out frames were requested from us.
+    // With num_out == 0 the division would produce inf/NaN, and since lpf_gain is 0 in that case
+    // the filter below would store NaN (0 * inf) for good. Keep the previous ratio instead.
+    double current_ratio = m_stretch_ratio;
+    if (num_out != 0) {
+        current_ratio = static_cast<double>(num_in) / static_cast<double>(num_out);
+    }
+
+    constexpr double max_latency = 1.0; // seconds
+    const double max_backlog = m_sample_rate * max_latency;
+    const double backlog_fullness = m_sound_touch.numSamples() / max_backlog;
+    if (backlog_fullness > 5.0) {
+        // Too many samples in backlog: Don't push anymore on
+        num_in = 0;
+    }
+
+    // We ideally want the backlog to be about 50% full.
+    // This gives some headroom both ways to prevent underflow and overflow.
+    // We tweak current_ratio to encourage this.
+    constexpr double tweak_time_scale = 0.05; // seconds
+    const double tweak_correction = (backlog_fullness - 0.5) * (time_delta / tweak_time_scale);
+    current_ratio *= std::pow(1.0 + 2.0 * tweak_correction, tweak_correction < 0 ? 3.0 : 1.0);
+
+    // This low-pass filter smoothes out variance in the calculated stretch ratio.
+    // The time-scale determines how responsive this filter is.
+    constexpr double lpf_time_scale = 2.0; // seconds
+    const double lpf_gain = 1.0 - std::exp(-time_delta / lpf_time_scale);
+    m_stretch_ratio += lpf_gain * (current_ratio - m_stretch_ratio);
+
+    // Place a lower limit of 5% speed.  When a game boots up, there will be
+    // many silence samples.  These do not need to be timestretched.
+    m_stretch_ratio = std::max(m_stretch_ratio, 0.05);
+    m_sound_touch.setTempo(m_stretch_ratio);
+
+    LOG_DEBUG(Audio, "{:5}/{:5} ratio:{:0.6f} backlog:{:0.6f}", num_in, num_out, m_stretch_ratio,
+              backlog_fullness);
+
+    m_sound_touch.putSamples(in, static_cast<u32>(num_in));
+    return m_sound_touch.receiveSamples(out, static_cast<u32>(num_out));
+}
+
+} // namespace AudioCore
diff --git a/src/audio_core/time_stretch.h b/src/audio_core/time_stretch.h
new file mode 100644
index 000000000..decd760f1
--- /dev/null
+++ b/src/audio_core/time_stretch.h
@@ -0,0 +1,41 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <SoundTouch.h>
+#include "common/common_types.h"
+
+namespace AudioCore {
+
+/// Wraps a soundtouch::SoundTouch instance whose tempo is re-tuned on every Process() call.
+class TimeStretcher {
+public:
+    /// @param sample_rate    Sample rate handed to SoundTouch
+    /// @param channel_count  Channel count handed to SoundTouch
+    TimeStretcher(u32 sample_rate, u32 channel_count);
+
+    /// Pushes input frames into SoundTouch and pulls stretched frames back out.
+    /// @param in       Input sample buffer
+    /// @param num_in   Number of input frames in `in`
+    /// @param out      Output sample buffer
+    /// @param num_out  Desired number of output frames in `out`
+    /// @returns Actual number of frames written to `out`
+    std::size_t Process(const s16* in, std::size_t num_in, s16* out, std::size_t num_out);
+
+    /// Forwards to SoundTouch::clear() on the wrapped instance.
+    void Clear();
+
+    /// Forwards to SoundTouch::flush() on the wrapped instance.
+    void Flush();
+
+private:
+    u32 m_sample_rate;
+    u32 m_channel_count;
+    soundtouch::SoundTouch m_sound_touch;
+    double m_stretch_ratio{1.0}; ///< Smoothed tempo currently applied to SoundTouch
+};
+
+} // namespace AudioCore