32 files changed, 316 insertions, 174 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 158360830..f1c60d1f3 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -20,6 +20,7 @@
 #include "common/lru_cache.h"
 #include "common/microprofile.h"
 #include "common/polyfill_ranges.h"
+#include "common/scratch_buffer.h"
 #include "common/settings.h"
 #include "core/memory.h"
 #include "video_core/buffer_cache/buffer_base.h"
@@ -422,8 +423,7 @@ private:
     IntervalSet common_ranges;
     std::deque<IntervalSet> committed_ranges;
 
-    size_t immediate_buffer_capacity = 0;
-    std::unique_ptr<u8[]> immediate_buffer_alloc;
+    Common::ScratchBuffer<u8> immediate_buffer_alloc;
 
     struct LRUItemParams {
         using ObjectType = BufferId;
@@ -1927,11 +1927,8 @@ std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size
 
 template <class P>
 std::span<u8> BufferCache<P>::ImmediateBuffer(size_t wanted_capacity) {
-    if (wanted_capacity > immediate_buffer_capacity) {
-        immediate_buffer_capacity = wanted_capacity;
-        immediate_buffer_alloc = std::make_unique<u8[]>(wanted_capacity);
-    }
-    return std::span<u8>(immediate_buffer_alloc.get(), wanted_capacity);
+    immediate_buffer_alloc.resize_destructive(wanted_capacity);
+    return std::span<u8>(immediate_buffer_alloc.data(), wanted_capacity);
 }
 
 template <class P>
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 9835e3ac1..322de2606 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -56,7 +56,7 @@ bool DmaPusher::Step() {
 
     if (command_list.prefetch_command_list.size()) {
         // Prefetched command list from nvdrv, used for things like synchronization
-        command_headers = std::move(command_list.prefetch_command_list);
+        ProcessCommands(command_list.prefetch_command_list);
         dma_pushbuffer.pop();
     } else {
         const CommandListHeader command_list_header{
@@ -74,7 +74,7 @@ bool DmaPusher::Step() {
         }
 
         // Push buffer non-empty, read a word
-        command_headers.resize(command_list_header.size);
+        command_headers.resize_destructive(command_list_header.size);
         if (Settings::IsGPULevelHigh()) {
             memory_manager.ReadBlock(dma_get, command_headers.data(),
                                      command_list_header.size * sizeof(u32));
@@ -82,16 +82,21 @@ bool DmaPusher::Step() {
             memory_manager.ReadBlockUnsafe(dma_get, command_headers.data(),
                                            command_list_header.size * sizeof(u32));
         }
+        ProcessCommands(command_headers);
     }
-    for (std::size_t index = 0; index < command_headers.size();) {
-        const CommandHeader& command_header = command_headers[index];
+
+    return true;
+}
+
+void DmaPusher::ProcessCommands(std::span<const CommandHeader> commands) {
+    for (std::size_t index = 0; index < commands.size();) {
+        const CommandHeader& command_header = commands[index];
 
         if (dma_state.method_count) {
             // Data word of methods command
             if (dma_state.non_incrementing) {
                 const u32 max_write = static_cast<u32>(
-                    std::min<std::size_t>(index + dma_state.method_count, command_headers.size()) -
-                    index);
+                    std::min<std::size_t>(index + dma_state.method_count, commands.size()) - index);
                 CallMultiMethod(&command_header.argument, max_write);
                 dma_state.method_count -= max_write;
                 dma_state.is_last_call = true;
@@ -142,8 +147,6 @@ bool DmaPusher::Step() {
         }
         index++;
     }
-
-    return true;
 }
 
 void DmaPusher::SetState(const CommandHeader& command_header) {
diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h
index 938f0f11c..6f00de937 100644
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -4,11 +4,13 @@
 #pragma once
 
 #include <array>
+#include <span>
 #include <vector>
 #include <queue>
 
 #include "common/bit_field.h"
 #include "common/common_types.h"
+#include "common/scratch_buffer.h"
 #include "video_core/engines/engine_interface.h"
 #include "video_core/engines/puller.h"
 
@@ -136,13 +138,15 @@ private:
     static constexpr u32 non_puller_methods = 0x40;
     static constexpr u32 max_subchannels = 8;
     bool Step();
+    void ProcessCommands(std::span<const CommandHeader> commands);
 
     void SetState(const CommandHeader& command_header);
 
     void CallMethod(u32 argument) const;
     void CallMultiMethod(const u32* base_start, u32 num_methods) const;
 
-    std::vector<CommandHeader> command_headers; ///< Buffer for list of commands fetched at once
+    Common::ScratchBuffer<CommandHeader>
+        command_headers; ///< Buffer for list of commands fetched at once
 
     std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed
     std::size_t dma_pushbuffer_subindex{};  ///< Index within a command list within the pushbuffer
@@ -159,7 +163,7 @@ private:
     DmaState dma_state{};
     bool dma_increment_once{};
 
-    bool ib_enable{true}; ///< IB mode enabled
+    const bool ib_enable{true}; ///< IB mode enabled
 
     std::array<Engines::EngineInterface*, max_subchannels> subchannels{};
 
diff --git a/src/video_core/engines/draw_manager.cpp b/src/video_core/engines/draw_manager.cpp
index b213c374f..3a78421f6 100644
--- a/src/video_core/engines/draw_manager.cpp
+++ b/src/video_core/engines/draw_manager.cpp
@@ -46,21 +46,26 @@ void DrawManager::ProcessMethodCall(u32 method, u32 argument) {
         SetInlineIndexBuffer(regs.inline_index_4x8.index2);
         SetInlineIndexBuffer(regs.inline_index_4x8.index3);
         break;
-    case MAXWELL3D_REG_INDEX(topology_override):
-        use_topology_override = true;
+    case MAXWELL3D_REG_INDEX(vertex_array_instance_first):
+    case MAXWELL3D_REG_INDEX(vertex_array_instance_subsequent): {
+        LOG_WARNING(HW_GPU, "(STUBBED) called");
         break;
+    }
     default:
         break;
     }
 }
 
 void DrawManager::Clear(u32 layer_count) {
-    maxwell3d->rasterizer->Clear(layer_count);
+    if (maxwell3d->ShouldExecute()) {
+        maxwell3d->rasterizer->Clear(layer_count);
+    }
 }
 
 void DrawManager::DrawDeferred() {
-    if (draw_state.draw_mode != DrawMode::Instance || draw_state.instance_count == 0)
+    if (draw_state.draw_mode != DrawMode::Instance || draw_state.instance_count == 0) {
         return;
+    }
     DrawEnd(draw_state.instance_count + 1, true);
     draw_state.instance_count = 0;
 }
@@ -115,8 +120,9 @@ void DrawManager::DrawEnd(u32 instance_count, bool force_draw) {
     const auto& regs{maxwell3d->regs};
     switch (draw_state.draw_mode) {
     case DrawMode::Instance:
-        if (!force_draw)
+        if (!force_draw) {
             break;
+        }
         [[fallthrough]];
     case DrawMode::General:
         draw_state.base_instance = regs.global_base_instance_index;
@@ -156,25 +162,28 @@ void DrawManager::DrawIndexSmall(u32 argument) {
     ProcessDraw(true, 1);
 }
 
-void DrawManager::ProcessTopologyOverride() {
-    if (!use_topology_override)
-        return;
-
+void DrawManager::UpdateTopology() {
     const auto& regs{maxwell3d->regs};
-    switch (regs.topology_override) {
-    case PrimitiveTopologyOverride::None:
-        break;
-    case PrimitiveTopologyOverride::Points:
-        draw_state.topology = PrimitiveTopology::Points;
-        break;
-    case PrimitiveTopologyOverride::Lines:
-        draw_state.topology = PrimitiveTopology::Lines;
-        break;
-    case PrimitiveTopologyOverride::LineStrip:
-        draw_state.topology = PrimitiveTopology::LineStrip;
+    switch (regs.primitive_topology_control) {
+    case PrimitiveTopologyControl::UseInBeginMethods:
         break;
-    default:
-        draw_state.topology = static_cast<PrimitiveTopology>(regs.topology_override);
+    case PrimitiveTopologyControl::UseSeparateState:
+        switch (regs.topology_override) {
+        case PrimitiveTopologyOverride::None:
+            break;
+        case PrimitiveTopologyOverride::Points:
+            draw_state.topology = PrimitiveTopology::Points;
+            break;
+        case PrimitiveTopologyOverride::Lines:
+            draw_state.topology = PrimitiveTopology::Lines;
+            break;
+        case PrimitiveTopologyOverride::LineStrip:
+            draw_state.topology = PrimitiveTopology::LineStrip;
+            break;
+        default:
+            draw_state.topology = static_cast<PrimitiveTopology>(regs.topology_override);
+            break;
+        }
         break;
     }
 }
@@ -183,9 +192,10 @@ void DrawManager::ProcessDraw(bool draw_indexed, u32 instance_count) {
     LOG_TRACE(HW_GPU, "called, topology={}, count={}", draw_state.topology,
               draw_indexed ? draw_state.index_buffer.count : draw_state.vertex_buffer.count);
 
-    ProcessTopologyOverride();
+    UpdateTopology();
 
-    if (maxwell3d->ShouldExecute())
+    if (maxwell3d->ShouldExecute()) {
         maxwell3d->rasterizer->Draw(draw_indexed, instance_count);
+    }
 }
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/draw_manager.h b/src/video_core/engines/draw_manager.h
index 4f67027ca..0e6930a9c 100644
--- a/src/video_core/engines/draw_manager.h
+++ b/src/video_core/engines/draw_manager.h
@@ -10,6 +10,7 @@ class RasterizerInterface;
 }
 
 namespace Tegra::Engines {
+using PrimitiveTopologyControl = Maxwell3D::Regs::PrimitiveTopologyControl;
 using PrimitiveTopology = Maxwell3D::Regs::PrimitiveTopology;
 using PrimitiveTopologyOverride = Maxwell3D::Regs::PrimitiveTopologyOverride;
 using IndexBuffer = Maxwell3D::Regs::IndexBuffer;
@@ -58,12 +59,11 @@ private:
 
     void DrawIndexSmall(u32 argument);
 
-    void ProcessTopologyOverride();
+    void UpdateTopology();
 
     void ProcessDraw(bool draw_indexed, u32 instance_count);
 
     Maxwell3D* maxwell3d{};
     State draw_state{};
-    bool use_topology_override{};
 };
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
index e4f8331ab..cea1dd8b0 100644
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -24,7 +24,7 @@ void State::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
 void State::ProcessExec(const bool is_linear_) {
     write_offset = 0;
     copy_size = regs.line_length_in * regs.line_count;
-    inner_buffer.resize(copy_size);
+    inner_buffer.resize_destructive(copy_size);
     is_linear = is_linear_;
 }
 
@@ -70,7 +70,7 @@ void State::ProcessData(std::span<const u8> read_buffer) {
         const std::size_t dst_size = Tegra::Texture::CalculateSize(
             true, bytes_per_pixel, width, regs.dest.height, regs.dest.depth,
             regs.dest.BlockHeight(), regs.dest.BlockDepth());
-        tmp_buffer.resize(dst_size);
+        tmp_buffer.resize_destructive(dst_size);
         memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
         Tegra::Texture::SwizzleSubrect(tmp_buffer, read_buffer, bytes_per_pixel, width,
                                        regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h
index 94fafd9dc..7242d2529 100644
--- a/src/video_core/engines/engine_upload.h
+++ b/src/video_core/engines/engine_upload.h
@@ -4,9 +4,10 @@
 #pragma once
 
 #include <span>
-#include <vector>
+
 #include "common/bit_field.h"
 #include "common/common_types.h"
+#include "common/scratch_buffer.h"
 
 namespace Tegra {
 class MemoryManager;
@@ -73,8 +74,8 @@ private:
 
     u32 write_offset = 0;
     u32 copy_size = 0;
-    std::vector<u8> inner_buffer;
-    std::vector<u8> tmp_buffer;
+    Common::ScratchBuffer<u8> inner_buffer;
+    Common::ScratchBuffer<u8> tmp_buffer;
     bool is_linear = false;
     Registers& regs;
     MemoryManager& memory_manager;
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index a189e60ae..f73d7bf0f 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -184,12 +184,8 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
     const size_t src_size =
         CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
 
-    if (read_buffer.size() < src_size) {
-        read_buffer.resize(src_size);
-    }
-    if (write_buffer.size() < dst_size) {
-        write_buffer.resize(dst_size);
-    }
+    read_buffer.resize_destructive(src_size);
+    write_buffer.resize_destructive(dst_size);
 
     memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
     memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
@@ -235,12 +231,8 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
         CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
     const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count;
 
-    if (read_buffer.size() < src_size) {
-        read_buffer.resize(src_size);
-    }
-    if (write_buffer.size() < dst_size) {
-        write_buffer.resize(dst_size);
-    }
+    read_buffer.resize_destructive(src_size);
+    write_buffer.resize_destructive(dst_size);
 
     memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
     if (Settings::IsGPULevelExtreme()) {
@@ -269,12 +261,8 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
     pos_x = pos_x % x_in_gob;
     pos_y = pos_y % 8;
 
-    if (read_buffer.size() < src_size) {
-        read_buffer.resize(src_size);
-    }
-    if (write_buffer.size() < dst_size) {
-        write_buffer.resize(dst_size);
-    }
+    read_buffer.resize_destructive(src_size);
+    write_buffer.resize_destructive(dst_size);
 
     if (Settings::IsGPULevelExtreme()) {
         memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), src_size);
@@ -333,14 +321,10 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
     const u32 pitch = x_elements * bytes_per_pixel;
     const size_t mid_buffer_size = pitch * regs.line_count;
 
-    if (read_buffer.size() < src_size) {
-        read_buffer.resize(src_size);
-    }
-    if (write_buffer.size() < dst_size) {
-        write_buffer.resize(dst_size);
-    }
+    read_buffer.resize_destructive(src_size);
+    write_buffer.resize_destructive(dst_size);
 
-    intermediate_buffer.resize(mid_buffer_size);
+    intermediate_buffer.resize_destructive(mid_buffer_size);
 
     memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
     memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index d40d3d302..c88191a61 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -6,8 +6,10 @@
 #include <array>
 #include <cstddef>
 #include <vector>
+
 #include "common/bit_field.h"
 #include "common/common_types.h"
+#include "common/scratch_buffer.h"
 #include "video_core/engines/engine_interface.h"
 
 namespace Core {
@@ -234,9 +236,9 @@ private:
     MemoryManager& memory_manager;
     VideoCore::RasterizerInterface* rasterizer = nullptr;
 
-    std::vector<u8> read_buffer;
-    std::vector<u8> write_buffer;
-    std::vector<u8> intermediate_buffer;
+    Common::ScratchBuffer<u8> read_buffer;
+    Common::ScratchBuffer<u8> write_buffer;
+    Common::ScratchBuffer<u8> intermediate_buffer;
 
     static constexpr std::size_t NUM_REGS = 0x800;
     struct Regs {
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 28b38273e..c6d54be63 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -223,8 +223,6 @@ struct GPU::Impl {
     /// core timing events.
     void Start() {
         gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler);
-        cpu_context = renderer->GetRenderWindow().CreateSharedContext();
-        cpu_context->MakeCurrent();
     }
 
     void NotifyShutdown() {
@@ -235,6 +233,9 @@ struct GPU::Impl {
 
     /// Obtain the CPU Context
     void ObtainContext() {
+        if (!cpu_context) {
+            cpu_context = renderer->GetRenderWindow().CreateSharedContext();
+        }
         cpu_context->MakeCurrent();
     }
 
diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp
index ac0b7d20e..36a04e4e0 100644
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@@ -155,7 +155,7 @@ void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) {
         // swizzle pitch linear to block linear
         const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
         const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
-        luma_buffer.resize(size);
+        luma_buffer.resize_destructive(size);
         std::span<const u8> frame_buff(converted_frame_buf_addr, 4 * width * height);
         Texture::SwizzleSubrect(luma_buffer, frame_buff, 4, width, height, 1, 0, 0, width, height,
                                 block_height, 0, width * 4);
@@ -181,8 +181,8 @@ void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) {
 
     const auto stride = static_cast<size_t>(frame->linesize[0]);
 
-    luma_buffer.resize(aligned_width * surface_height);
-    chroma_buffer.resize(aligned_width * surface_height / 2);
+    luma_buffer.resize_destructive(aligned_width * surface_height);
+    chroma_buffer.resize_destructive(aligned_width * surface_height / 2);
 
     // Populate luma buffer
     const u8* luma_src = frame->data[0];
diff --git a/src/video_core/host1x/vic.h b/src/video_core/host1x/vic.h
index 2b78786e8..3d9753047 100644
--- a/src/video_core/host1x/vic.h
+++ b/src/video_core/host1x/vic.h
@@ -4,8 +4,9 @@
 #pragma once
 
 #include <memory>
-#include <vector>
+
 #include "common/common_types.h"
+#include "common/scratch_buffer.h"
 
 struct SwsContext;
 
@@ -49,8 +50,8 @@ private:
     /// size does not change during a stream
     using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
     AVMallocPtr converted_frame_buffer;
-    std::vector<u8> luma_buffer;
-    std::vector<u8> chroma_buffer;
+    Common::ScratchBuffer<u8> luma_buffer;
+    Common::ScratchBuffer<u8> chroma_buffer;
 
     GPUVAddr config_struct_address{};
     GPUVAddr output_surface_luma_address{};
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index e2e3dac34..cee5c3247 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -112,7 +112,7 @@ bool IsASTCSupported() {
 }
 } // Anonymous namespace
 
-Device::Device() {
+Device::Device(Core::Frontend::EmuWindow& emu_window) {
     if (!GLAD_GL_VERSION_4_6) {
         LOG_ERROR(Render_OpenGL, "OpenGL 4.6 is not available");
         throw std::runtime_error{"Insufficient version"};
@@ -126,9 +126,9 @@ Device::Device() {
     const bool is_intel = vendor_name == "Intel";
 
 #ifdef __unix__
-    const bool is_linux = true;
+    constexpr bool is_linux = true;
 #else
-    const bool is_linux = false;
+    constexpr bool is_linux = false;
 #endif
 
     bool disable_fast_buffer_sub_data = false;
@@ -193,9 +193,11 @@ Device::Device() {
         }
     }
 
+    strict_context_required = emu_window.StrictContextRequired();
     // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation.
+    // Blocks async shaders when the window requires a strict context (e.g. EGL on Wayland).
     use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue() &&
-                               !(is_amd || (is_intel && !is_linux));
+                               !(is_amd || (is_intel && !is_linux)) && !strict_context_required;
     use_driver_cache = is_nvidia;
 
     LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 5ef51ebcf..2a72d84be 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -5,6 +5,7 @@
 
 #include <cstddef>
 #include "common/common_types.h"
+#include "core/frontend/emu_window.h"
 #include "shader_recompiler/stage.h"
 
 namespace Settings {
@@ -15,7 +16,7 @@ namespace OpenGL {
 
 class Device {
 public:
-    explicit Device();
+    explicit Device(Core::Frontend::EmuWindow& emu_window);
 
     [[nodiscard]] std::string GetVendorName() const;
 
@@ -173,6 +174,10 @@ public:
         return can_report_memory;
     }
 
+    bool StrictContextRequired() const {
+        return strict_context_required;
+    }
+
 private:
     static bool TestVariableAoffi();
     static bool TestPreciseBug();
@@ -216,6 +221,7 @@ private:
     bool has_cbuf_ftou_bug{};
     bool has_bool_ref_bug{};
     bool can_report_memory{};
+    bool strict_context_required{};
 
     std::string vendor_name;
 };
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 64ed6f628..a44b8c454 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -138,9 +138,6 @@ void RasterizerOpenGL::LoadDiskResources(u64 title_id, std::stop_token stop_load
 
 void RasterizerOpenGL::Clear(u32 layer_count) {
     MICROPROFILE_SCOPE(OpenGL_Clears);
-    if (!maxwell3d->ShouldExecute()) {
-        return;
-    }
 
     const auto& regs = maxwell3d->regs;
     bool use_color{};
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index a59d0d24e..f8868a012 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -174,6 +174,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
       texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, program_manager{program_manager_},
       state_tracker{state_tracker_}, shader_notify{shader_notify_},
       use_asynchronous_shaders{device.UseAsynchronousShaders()},
+      strict_context_required{device.StrictContextRequired()},
       profile{
           .supported_spirv = 0x00010000,
 
@@ -203,6 +204,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
           .support_int64_atomics = false,
           .support_derivative_control = device.HasDerivativeControl(),
           .support_geometry_shader_passthrough = device.HasGeometryShaderPassthrough(),
+          .support_native_ndc = true,
           .support_gl_nv_gpu_shader_5 = device.HasNvGpuShader5(),
           .support_gl_amd_gpu_shader_half_float = device.HasAmdShaderHalfFloat(),
           .support_gl_texture_shadow_lod = device.HasTextureShadowLod(),
@@ -255,9 +257,14 @@ void ShaderCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading,
     }
     shader_cache_filename = base_dir / "opengl.bin";
 
-    if (!workers) {
+    if (!workers && !strict_context_required) {
         workers = CreateWorkers();
     }
+    std::optional<Context> strict_context;
+    if (strict_context_required) {
+        strict_context.emplace(emu_window);
+    }
+
     struct {
         std::mutex mutex;
         size_t total{};
@@ -265,44 +272,49 @@ void ShaderCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading,
         bool has_loaded{};
     } state;
 
+    const auto queue_work{[&](Common::UniqueFunction<void, Context*>&& work) {
+        if (strict_context_required) {
+            work(&strict_context.value());
+        } else {
+            workers->QueueWork(std::move(work));
+        }
+    }};
     const auto load_compute{[&](std::ifstream& file, FileEnvironment env) {
         ComputePipelineKey key;
         file.read(reinterpret_cast<char*>(&key), sizeof(key));
-        workers->QueueWork(
-            [this, key, env = std::move(env), &state, &callback](Context* ctx) mutable {
-                ctx->pools.ReleaseContents();
-                auto pipeline{CreateComputePipeline(ctx->pools, key, env)};
-                std::scoped_lock lock{state.mutex};
-                if (pipeline) {
-                    compute_cache.emplace(key, std::move(pipeline));
-                }
-                ++state.built;
-                if (state.has_loaded) {
-                    callback(VideoCore::LoadCallbackStage::Build, state.built, state.total);
-                }
-            });
+        queue_work([this, key, env = std::move(env), &state, &callback](Context* ctx) mutable {
+            ctx->pools.ReleaseContents();
+            auto pipeline{CreateComputePipeline(ctx->pools, key, env)};
+            std::scoped_lock lock{state.mutex};
+            if (pipeline) {
+                compute_cache.emplace(key, std::move(pipeline));
+            }
+            ++state.built;
+            if (state.has_loaded) {
+                callback(VideoCore::LoadCallbackStage::Build, state.built, state.total);
+            }
+        });
         ++state.total;
     }};
     const auto load_graphics{[&](std::ifstream& file, std::vector<FileEnvironment> envs) {
         GraphicsPipelineKey key;
         file.read(reinterpret_cast<char*>(&key), sizeof(key));
-        workers->QueueWork(
-            [this, key, envs = std::move(envs), &state, &callback](Context* ctx) mutable {
-                boost::container::static_vector<Shader::Environment*, 5> env_ptrs;
-                for (auto& env : envs) {
-                    env_ptrs.push_back(&env);
-                }
-                ctx->pools.ReleaseContents();
-                auto pipeline{CreateGraphicsPipeline(ctx->pools, key, MakeSpan(env_ptrs), false)};
-                std::scoped_lock lock{state.mutex};
-                if (pipeline) {
-                    graphics_cache.emplace(key, std::move(pipeline));
-                }
-                ++state.built;
-                if (state.has_loaded) {
-                    callback(VideoCore::LoadCallbackStage::Build, state.built, state.total);
-                }
-            });
+        queue_work([this, key, envs = std::move(envs), &state, &callback](Context* ctx) mutable {
+            boost::container::static_vector<Shader::Environment*, 5> env_ptrs;
+            for (auto& env : envs) {
+                env_ptrs.push_back(&env);
+            }
+            ctx->pools.ReleaseContents();
+            auto pipeline{CreateGraphicsPipeline(ctx->pools, key, MakeSpan(env_ptrs), false)};
+            std::scoped_lock lock{state.mutex};
+            if (pipeline) {
+                graphics_cache.emplace(key, std::move(pipeline));
+            }
+            ++state.built;
+            if (state.has_loaded) {
+                callback(VideoCore::LoadCallbackStage::Build, state.built, state.total);
+            }
+        });
         ++state.total;
     }};
     LoadPipelines(stop_loading, shader_cache_filename, CACHE_VERSION, load_compute, load_graphics);
@@ -314,6 +326,9 @@ void ShaderCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading,
     state.has_loaded = true;
     lock.unlock();
 
+    if (strict_context_required) {
+        return;
+    }
     workers->WaitForRequests(stop_loading);
     if (!use_asynchronous_shaders) {
         workers.reset();
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 53ffea904..f82420592 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -69,6 +69,7 @@ private:
     StateTracker& state_tracker;
     VideoCore::ShaderNotify& shader_notify;
     const bool use_asynchronous_shaders;
+    const bool strict_context_required;
 
     GraphicsPipelineKey graphics_key{};
     GraphicsPipeline* current_pipeline{};
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 5b5e178ad..bc75680f0 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -140,8 +140,8 @@ RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_,
                                Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
                                std::unique_ptr<Core::Frontend::GraphicsContext> context_)
     : RendererBase{emu_window_, std::move(context_)}, telemetry_session{telemetry_session_},
-      emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, state_tracker{},
-      program_manager{device},
+      emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, device{emu_window_},
+      state_tracker{}, program_manager{device},
       rasterizer(emu_window, gpu, cpu_memory, device, screen_info, program_manager, state_tracker) {
     if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
         glEnable(GL_DEBUG_OUTPUT);
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index 18be54729..f502a7d09 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -139,23 +139,25 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
     RenderScreenshot(*framebuffer, use_accelerated);
 
     bool has_been_recreated = false;
-    const auto recreate_swapchain = [&] {
+    const auto recreate_swapchain = [&](u32 width, u32 height) {
         if (!has_been_recreated) {
             has_been_recreated = true;
             scheduler.Finish();
         }
-        const Layout::FramebufferLayout layout = render_window.GetFramebufferLayout();
-        swapchain.Create(layout.width, layout.height, is_srgb);
+        swapchain.Create(width, height, is_srgb);
     };
-    if (swapchain.NeedsRecreation(is_srgb)) {
-        recreate_swapchain();
+
+    const Layout::FramebufferLayout layout = render_window.GetFramebufferLayout();
+    if (swapchain.NeedsRecreation(is_srgb) || swapchain.GetWidth() != layout.width ||
+        swapchain.GetHeight() != layout.height) {
+        recreate_swapchain(layout.width, layout.height);
     }
     bool is_outdated;
     do {
         swapchain.AcquireNextImage();
         is_outdated = swapchain.IsOutDated();
         if (is_outdated) {
-            recreate_swapchain();
+            recreate_swapchain(layout.width, layout.height);
         }
     } while (is_outdated);
     if (has_been_recreated) {
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 54a12b35f..6b54d7111 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -461,6 +461,9 @@ void BufferCacheRuntime::BindQuadIndexBuffer(PrimitiveTopology topology, u32 fir
 
 void BufferCacheRuntime::BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size,
                                           u32 stride) {
+    if (index >= device.GetMaxVertexInputBindings()) {
+        return;
+    }
     if (device.IsExtExtendedDynamicStateSupported()) {
         scheduler.Record([index, buffer, offset, size, stride](vk::CommandBuffer cmdbuf) {
             const VkDeviceSize vk_offset = buffer != VK_NULL_HANDLE ? offset : 0;
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 006128638..515d8d869 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -529,7 +529,9 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
     static_vector<VkVertexInputBindingDivisorDescriptionEXT, 32> vertex_binding_divisors;
     static_vector<VkVertexInputAttributeDescription, 32> vertex_attributes;
     if (key.state.dynamic_vertex_input) {
-        for (size_t index = 0; index < key.state.attributes.size(); ++index) {
+        const size_t num_vertex_arrays = std::min(
+            key.state.attributes.size(), static_cast<size_t>(device.GetMaxVertexInputBindings()));
+        for (size_t index = 0; index < num_vertex_arrays; ++index) {
             const u32 type = key.state.DynamicAttributeType(index);
             if (!stage_infos[0].loads.Generic(index) || type == 0) {
                 continue;
@@ -551,7 +553,9 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
             });
         }
     } else {
-        for (size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+        const size_t num_vertex_arrays = std::min(
+            Maxwell::NumVertexArrays, static_cast<size_t>(device.GetMaxVertexInputBindings()));
+        for (size_t index = 0; index < num_vertex_arrays; ++index) {
             const bool instanced = key.state.binding_divisors[index] != 0;
             const auto rate =
                 instanced ? VK_VERTEX_INPUT_RATE_INSTANCE : VK_VERTEX_INPUT_RATE_VERTEX;
@@ -580,6 +584,8 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
             });
         }
     }
+    ASSERT(vertex_attributes.size() <= device.GetMaxVertexInputAttributes());
+
     VkPipelineVertexInputStateCreateInfo vertex_input_ci{
         .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
         .pNext = nullptr,
@@ -634,23 +640,33 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
     };
     std::array<VkViewportSwizzleNV, Maxwell::NumViewports> swizzles;
     std::ranges::transform(key.state.viewport_swizzles, swizzles.begin(), UnpackViewportSwizzle);
-    const VkPipelineViewportSwizzleStateCreateInfoNV swizzle_ci{
+    VkPipelineViewportSwizzleStateCreateInfoNV swizzle_ci{
         .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_SWIZZLE_STATE_CREATE_INFO_NV,
         .pNext = nullptr,
         .flags = 0,
         .viewportCount = Maxwell::NumViewports,
         .pViewportSwizzles = swizzles.data(),
     };
-    const VkPipelineViewportStateCreateInfo viewport_ci{
+    VkPipelineViewportDepthClipControlCreateInfoEXT ndc_info{
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT,
+        .pNext = nullptr,
+        .negativeOneToOne = key.state.ndc_minus_one_to_one.Value() != 0 ? VK_TRUE : VK_FALSE,
+    };
+    VkPipelineViewportStateCreateInfo viewport_ci{
         .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
-        .pNext = device.IsNvViewportSwizzleSupported() ? &swizzle_ci : nullptr,
+        .pNext = nullptr,
         .flags = 0,
         .viewportCount = Maxwell::NumViewports,
         .pViewports = nullptr,
         .scissorCount = Maxwell::NumViewports,
         .pScissors = nullptr,
     };
-
+    if (device.IsNvViewportSwizzleSupported()) {
+        swizzle_ci.pNext = std::exchange(viewport_ci.pNext, &swizzle_ci);
+    }
+    if (device.IsExtDepthClipControlSupported()) {
+        ndc_info.pNext = std::exchange(viewport_ci.pNext, &ndc_info);
+    }
     VkPipelineRasterizationStateCreateInfo rasterization_ci{
         .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
         .pNext = nullptr,
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 81f5f3e11..e7262420c 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -321,6 +321,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device
         .support_int64_atomics = device.IsExtShaderAtomicInt64Supported(),
         .support_derivative_control = true,
         .support_geometry_shader_passthrough = device.IsNvGeometryShaderPassthroughSupported(),
+        .support_native_ndc = device.IsExtDepthClipControlSupported(),
 
         .warp_size_potentially_larger_than_guest = device.IsWarpSizePotentiallyBiggerThanGuest(),
 
@@ -341,6 +342,15 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device
         .support_snorm_render_buffer = true,
         .support_viewport_index_layer = device.IsExtShaderViewportIndexLayerSupported(),
     };
+
+    if (device.GetMaxVertexInputAttributes() < Maxwell::NumVertexAttributes) {
+        LOG_WARNING(Render_Vulkan, "maxVertexInputAttributes is too low: {} < {}",
+                    device.GetMaxVertexInputAttributes(), Maxwell::NumVertexAttributes);
+    }
+    if (device.GetMaxVertexInputBindings() < Maxwell::NumVertexArrays) {
+        LOG_WARNING(Render_Vulkan, "maxVertexInputBindings is too low: {} < {}",
+                    device.GetMaxVertexInputBindings(), Maxwell::NumVertexArrays);
+    }
 }
 
 PipelineCache::~PipelineCache() = default;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 3774f303a..ac1eb9895 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -220,9 +220,6 @@ void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
 void RasterizerVulkan::Clear(u32 layer_count) {
     MICROPROFILE_SCOPE(Vulkan_Clearing);
 
-    if (!maxwell3d->ShouldExecute()) {
-        return;
-    }
     FlushWork();
 
     query_cache.UpdateCounters();
@@ -665,8 +662,7 @@ void RasterizerVulkan::BeginTransformFeedback() {
         return;
     }
     UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) ||
-                     regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation) ||
-                     regs.IsShaderConfigEnabled(Maxwell::ShaderType::Geometry));
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation));
     scheduler.Record(
         [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); });
 }
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp
index d7be417f5..b6810eef9 100644
--- a/src/video_core/renderer_vulkan/vk_swapchain.cpp
+++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp
@@ -67,17 +67,19 @@ VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 wi
 
 } // Anonymous namespace
 
-Swapchain::Swapchain(VkSurfaceKHR surface_, const Device& device_, Scheduler& scheduler_, u32 width,
-                     u32 height, bool srgb)
+Swapchain::Swapchain(VkSurfaceKHR surface_, const Device& device_, Scheduler& scheduler_,
+                     u32 width_, u32 height_, bool srgb)
     : surface{surface_}, device{device_}, scheduler{scheduler_} {
-    Create(width, height, srgb);
+    Create(width_, height_, srgb);
 }
 
 Swapchain::~Swapchain() = default;
 
-void Swapchain::Create(u32 width, u32 height, bool srgb) {
+void Swapchain::Create(u32 width_, u32 height_, bool srgb) {
     is_outdated = false;
     is_suboptimal = false;
+    width = width_;
+    height = height_;
 
     const auto physical_device = device.GetPhysical();
     const auto capabilities{physical_device.GetSurfaceCapabilitiesKHR(surface)};
@@ -88,7 +90,7 @@ void Swapchain::Create(u32 width, u32 height, bool srgb) {
     device.GetLogical().WaitIdle();
     Destroy();
 
-    CreateSwapchain(capabilities, width, height, srgb);
+    CreateSwapchain(capabilities, srgb);
     CreateSemaphores();
     CreateImageViews();
 
@@ -148,8 +150,7 @@ void Swapchain::Present(VkSemaphore render_semaphore) {
     }
 }
 
-void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, u32 width, u32 height,
-                                bool srgb) {
+void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bool srgb) {
     const auto physical_device{device.GetPhysical()};
     const auto formats{physical_device.GetSurfaceFormatsKHR(surface)};
     const auto present_modes{physical_device.GetSurfacePresentModesKHR(surface)};
diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h
index 111b3902d..caf1ff32b 100644
--- a/src/video_core/renderer_vulkan/vk_swapchain.h
+++ b/src/video_core/renderer_vulkan/vk_swapchain.h
@@ -80,9 +80,16 @@ public:
         return *present_semaphores[frame_index];
     }
 
+    u32 GetWidth() const {
+        return width;
+    }
+
+    u32 GetHeight() const {
+        return height;
+    }
+
 private:
-    void CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, u32 width, u32 height,
-                         bool srgb);
+    void CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bool srgb);
     void CreateSemaphores();
     void CreateImageViews();
 
@@ -105,6 +112,9 @@ private:
     std::vector<u64> resource_ticks;
     std::vector<vk::Semaphore> present_semaphores;
 
+    u32 width{};  ///< Width passed to the most recent Create() call.
+    u32 height{}; ///< Height passed to the most recent Create() call.
+
     u32 image_index{};
     u32 frame_index{};
 
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 8e68a2e53..27c82cd20 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -39,6 +39,12 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface&
     sampler_descriptor.mipmap_filter.Assign(Tegra::Texture::TextureMipmapFilter::Linear);
     sampler_descriptor.cubemap_anisotropy.Assign(1);
 
+    // These values were chosen based on typical peak swizzle data sizes seen in some titles
+    static constexpr size_t SWIZZLE_DATA_BUFFER_INITIAL_CAPACITY = 8_MiB;
+    static constexpr size_t UNSWIZZLE_DATA_BUFFER_INITIAL_CAPACITY = 1_MiB;
+    swizzle_data_buffer.resize_destructive(SWIZZLE_DATA_BUFFER_INITIAL_CAPACITY);
+    unswizzle_data_buffer.resize_destructive(UNSWIZZLE_DATA_BUFFER_INITIAL_CAPACITY);
+
     // Make sure the first index is reserved for the null resources
     // This way the null resource becomes a compile time constant
     void(slot_images.insert(NullImageParams{}));
@@ -90,7 +96,8 @@ void TextureCache<P>::RunGarbageCollector() {
             const auto copies = FullDownloadCopies(image.info);
             image.DownloadMemory(map, copies);
             runtime.Finish();
-            SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span);
+            SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span,
+                         swizzle_data_buffer);
         }
         if (True(image.flags & ImageFlagBits::Tracked)) {
             UntrackImage(image, image_id);
@@ -461,7 +468,8 @@ void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
         const auto copies = FullDownloadCopies(image.info);
         image.DownloadMemory(map, copies);
         runtime.Finish();
-        SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span);
+        SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span,
+                     swizzle_data_buffer);
     }
 }
 
@@ -672,7 +680,8 @@ void TextureCache<P>::PopAsyncFlushes() {
     for (const ImageId image_id : download_ids) {
         const ImageBase& image = slot_images[image_id];
         const auto copies = FullDownloadCopies(image.info);
-        SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span);
+        SwizzleImage(*gpu_memory, image.gpu_addr, image.info, copies, download_span,
+                     swizzle_data_buffer);
         download_map.offset += image.unswizzled_size_bytes;
         download_span = download_span.subspan(image.unswizzled_size_bytes);
     }
@@ -734,13 +743,21 @@ void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging)
         gpu_memory->ReadBlockUnsafe(gpu_addr, mapped_span.data(), mapped_span.size_bytes());
         const auto uploads = FullUploadSwizzles(image.info);
         runtime.AccelerateImageUpload(image, staging, uploads);
-    } else if (True(image.flags & ImageFlagBits::Converted)) {
-        std::vector<u8> unswizzled_data(image.unswizzled_size_bytes);
-        auto copies = UnswizzleImage(*gpu_memory, gpu_addr, image.info, unswizzled_data);
-        ConvertImage(unswizzled_data, image.info, mapped_span, copies);
+        return;
+    }
+    const size_t guest_size_bytes = image.guest_size_bytes;
+    swizzle_data_buffer.resize_destructive(guest_size_bytes);
+    gpu_memory->ReadBlockUnsafe(gpu_addr, swizzle_data_buffer.data(), guest_size_bytes);
+
+    if (True(image.flags & ImageFlagBits::Converted)) {
+        unswizzle_data_buffer.resize_destructive(image.unswizzled_size_bytes);
+        auto copies = UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data_buffer,
+                                     unswizzle_data_buffer);
+        ConvertImage(unswizzle_data_buffer, image.info, mapped_span, copies);
         image.UploadMemory(staging, copies);
     } else {
-        const auto copies = UnswizzleImage(*gpu_memory, gpu_addr, image.info, mapped_span);
+        const auto copies =
+            UnswizzleImage(*gpu_memory, gpu_addr, image.info, swizzle_data_buffer, mapped_span);
         image.UploadMemory(staging, copies);
     }
 }
@@ -910,7 +927,7 @@ void TextureCache<P>::InvalidateScale(Image& image) {
 }
 
 template <class P>
-u64 TextureCache<P>::GetScaledImageSizeBytes(ImageBase& image) {
+u64 TextureCache<P>::GetScaledImageSizeBytes(const ImageBase& image) {
     const u64 scale_up = static_cast<u64>(Settings::values.resolution_info.up_scale *
                                           Settings::values.resolution_info.up_scale);
     const u64 down_shift = static_cast<u64>(Settings::values.resolution_info.down_shift +
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index 587339a31..4fd677a80 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -17,6 +17,7 @@
 #include "common/literals.h"
 #include "common/lru_cache.h"
 #include "common/polyfill_ranges.h"
+#include "common/scratch_buffer.h"
 #include "video_core/compatible_formats.h"
 #include "video_core/control/channel_state_cache.h"
 #include "video_core/delayed_destruction_ring.h"
@@ -368,7 +369,7 @@ private:
     void InvalidateScale(Image& image);
     bool ScaleUp(Image& image);
     bool ScaleDown(Image& image);
-    u64 GetScaledImageSizeBytes(ImageBase& image);
+    u64 GetScaledImageSizeBytes(const ImageBase& image);
 
     Runtime& runtime;
 
@@ -417,6 +418,9 @@ private:
 
     std::unordered_map<GPUVAddr, ImageAllocId> image_allocs_table;
 
+    Common::ScratchBuffer<u8> swizzle_data_buffer;
+    Common::ScratchBuffer<u8> unswizzle_data_buffer;
+
     u64 modification_tick = 0;
     u64 frame_tick = 0;
 };
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index e8c908b42..03acc68d9 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -505,7 +505,7 @@ void SwizzlePitchLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr
 
 void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr,
                              const ImageInfo& info, const BufferImageCopy& copy,
-                             std::span<const u8> input) {
+                             std::span<const u8> input, Common::ScratchBuffer<u8>& tmp_buffer) {
     const Extent3D size = info.size;
     const LevelInfo level_info = MakeLevelInfo(info);
     const Extent2D tile_size = DefaultBlockSize(info.format);
@@ -534,8 +534,8 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr
                        tile_size.height, info.tile_width_spacing);
     const size_t subresource_size = sizes[level];
 
-    const auto dst_data = std::make_unique<u8[]>(subresource_size);
-    const std::span<u8> dst(dst_data.get(), subresource_size);
+    tmp_buffer.resize_destructive(subresource_size);
+    const std::span<u8> dst(tmp_buffer);
 
     for (s32 layer = 0; layer < info.resources.layers; ++layer) {
         const std::span<const u8> src = input.subspan(host_offset);
@@ -765,8 +765,9 @@ bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config
 }
 
 std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr,
-                                            const ImageInfo& info, std::span<u8> output) {
-    const size_t guest_size_bytes = CalculateGuestSizeInBytes(info);
+                                            const ImageInfo& info, std::span<const u8> input,
+                                            std::span<u8> output) {
+    const size_t guest_size_bytes = input.size_bytes();
     const u32 bpp_log2 = BytesPerBlockLog2(info.format);
     const Extent3D size = info.size;
 
@@ -789,10 +790,6 @@ std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, GP
             .image_extent = size,
         }};
     }
-    const auto input_data = std::make_unique<u8[]>(guest_size_bytes);
-    gpu_memory.ReadBlockUnsafe(gpu_addr, input_data.get(), guest_size_bytes);
-    const std::span<const u8> input(input_data.get(), guest_size_bytes);
-
     const LevelInfo level_info = MakeLevelInfo(info);
     const s32 num_layers = info.resources.layers;
     const s32 num_levels = info.resources.levels;
@@ -980,13 +977,14 @@ std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info) {
 }
 
 void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info,
-                  std::span<const BufferImageCopy> copies, std::span<const u8> memory) {
+                  std::span<const BufferImageCopy> copies, std::span<const u8> memory,
+                  Common::ScratchBuffer<u8>& tmp_buffer) {
     const bool is_pitch_linear = info.type == ImageType::Linear;
     for (const BufferImageCopy& copy : copies) {
         if (is_pitch_linear) {
             SwizzlePitchLinearImage(gpu_memory, gpu_addr, info, copy, memory);
         } else {
-            SwizzleBlockLinearImage(gpu_memory, gpu_addr, info, copy, memory);
+            SwizzleBlockLinearImage(gpu_memory, gpu_addr, info, copy, memory, tmp_buffer);
         }
     }
 }
diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h
index 5e28f4ab3..d103db8ae 100644
--- a/src/video_core/texture_cache/util.h
+++ b/src/video_core/texture_cache/util.h
@@ -7,6 +7,7 @@
 #include <span>
 
 #include "common/common_types.h"
+#include "common/scratch_buffer.h"
 
 #include "video_core/surface.h"
 #include "video_core/texture_cache/image_base.h"
@@ -59,6 +60,7 @@ struct OverlapResult {
 
 [[nodiscard]] std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory,
                                                           GPUVAddr gpu_addr, const ImageInfo& info,
+                                                          std::span<const u8> input,
                                                           std::span<u8> output);
 
 [[nodiscard]] BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr,
@@ -76,7 +78,8 @@ void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8
 [[nodiscard]] std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info);
 
 void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info,
-                  std::span<const BufferImageCopy> copies, std::span<const u8> memory);
+                  std::span<const BufferImageCopy> copies, std::span<const u8> memory,
+                  Common::ScratchBuffer<u8>& tmp_buffer);
 
 [[nodiscard]] bool IsBlockLinearSizeCompatible(const ImageInfo& new_info,
                                                const ImageInfo& overlap_info, u32 new_level,
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 6a2ad4b1d..c4d31681a 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -421,7 +421,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
     VkPhysicalDevice8BitStorageFeatures bit8_storage{
         .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES,
         .pNext = nullptr,
-        .storageBuffer8BitAccess = false,
+        .storageBuffer8BitAccess = true,
         .uniformAndStorageBuffer8BitAccess = true,
         .storagePushConstant8 = false,
     };
@@ -660,6 +660,16 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
         LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted");
     }
 
+    VkPhysicalDeviceDepthClipControlFeaturesEXT depth_clip_control_features;
+    if (ext_depth_clip_control) {
+        depth_clip_control_features = {
+            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_CONTROL_FEATURES_EXT,
+            .pNext = nullptr,
+            .depthClipControl = VK_TRUE,
+        };
+        SetNext(next, depth_clip_control_features);
+    }
+
     VkDeviceDiagnosticsConfigCreateInfoNV diagnostics_nv;
     if (Settings::values.enable_nsight_aftermath && nv_device_diagnostics_config) {
         nsight_aftermath_tracker = std::make_unique<NsightAftermathTracker>();
@@ -1044,6 +1054,7 @@ void Device::CheckSuitability(bool requires_swapchain) const {
         std::make_pair(bit16_storage.storageBuffer16BitAccess, "storageBuffer16BitAccess"),
         std::make_pair(bit16_storage.uniformAndStorageBuffer16BitAccess,
                        "uniformAndStorageBuffer16BitAccess"),
+        std::make_pair(bit8_storage.storageBuffer8BitAccess, "storageBuffer8BitAccess"),
         std::make_pair(bit8_storage.uniformAndStorageBuffer8BitAccess,
                        "uniformAndStorageBuffer8BitAccess"),
         std::make_pair(host_query_reset.hostQueryReset, "hostQueryReset"),
@@ -1083,6 +1094,7 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
     bool has_ext_vertex_input_dynamic_state{};
     bool has_ext_line_rasterization{};
     bool has_ext_primitive_topology_list_restart{};
+    bool has_ext_depth_clip_control{};
     for (const std::string& extension : supported_extensions) {
         const auto test = [&](std::optional<std::reference_wrapper<bool>> status, const char* name,
                               bool push) {
@@ -1116,6 +1128,7 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
         test(ext_shader_stencil_export, VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME, true);
         test(ext_conservative_rasterization, VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME,
              true);
+        test(has_ext_depth_clip_control, VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME, false);
         test(has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, false);
         test(has_ext_custom_border_color, VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME, false);
         test(has_ext_extended_dynamic_state, VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, false);
@@ -1279,6 +1292,19 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
             ext_line_rasterization = true;
         }
     }
+    if (has_ext_depth_clip_control) {
+        VkPhysicalDeviceDepthClipControlFeaturesEXT depth_clip_control_features;
+        depth_clip_control_features.sType =
+            VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEPTH_CLIP_CONTROL_FEATURES_EXT;
+        depth_clip_control_features.pNext = nullptr;
+        features.pNext = &depth_clip_control_features;
+        physical.GetFeatures2(features);
+
+        if (depth_clip_control_features.depthClipControl) {
+            extensions.push_back(VK_EXT_DEPTH_CLIP_CONTROL_EXTENSION_NAME);
+            ext_depth_clip_control = true;
+        }
+    }
     if (has_khr_workgroup_memory_explicit_layout) {
         VkPhysicalDeviceWorkgroupMemoryExplicitLayoutFeaturesKHR layout;
         layout.sType =
@@ -1380,6 +1406,10 @@ void Device::SetupFeatures() {
     is_shader_storage_image_multisample = features.shaderStorageImageMultisample;
     is_blit_depth_stencil_supported = TestDepthStencilBlits();
     is_optimal_astc_supported = IsOptimalAstcSupported(features);
+
+    const VkPhysicalDeviceLimits& limits{properties.limits};
+    max_vertex_input_attributes = limits.maxVertexInputAttributes;
+    max_vertex_input_bindings = limits.maxVertexInputBindings;
 }
 
 void Device::SetupProperties() {
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index db802437c..6a26c4e6e 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -256,6 +256,11 @@ public:
         return ext_depth_range_unrestricted;
     }
 
+    /// Returns true if the device supports VK_EXT_depth_clip_control.
+    bool IsExtDepthClipControlSupported() const {
+        return ext_depth_clip_control;
+    }
+
     /// Returns true if the device supports VK_EXT_shader_viewport_index_layer.
     bool IsExtShaderViewportIndexLayerSupported() const {
         return ext_shader_viewport_index_layer;
@@ -368,6 +373,14 @@ public:
         return must_emulate_bgr565;
     }
 
+    u32 GetMaxVertexInputAttributes() const {
+        return max_vertex_input_attributes;
+    }
+
+    u32 GetMaxVertexInputBindings() const {
+        return max_vertex_input_bindings;
+    }
+
 private:
     /// Checks if the physical device is suitable.
     void CheckSuitability(bool requires_swapchain) const;
@@ -446,6 +459,7 @@ private:
     bool khr_swapchain_mutable_format{};         ///< Support for VK_KHR_swapchain_mutable_format.
     bool ext_index_type_uint8{};                 ///< Support for VK_EXT_index_type_uint8.
     bool ext_sampler_filter_minmax{};            ///< Support for VK_EXT_sampler_filter_minmax.
+    bool ext_depth_clip_control{};               ///< Support for VK_EXT_depth_clip_control.
     bool ext_depth_range_unrestricted{};         ///< Support for VK_EXT_depth_range_unrestricted.
     bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer.
     bool ext_tooling_info{};                ///< Support for VK_EXT_tooling_info.
@@ -467,6 +481,8 @@ private:
     bool supports_d24_depth{};              ///< Supports D24 depth buffers.
     bool cant_blit_msaa{};                  ///< Does not support MSAA<->MSAA blitting.
     bool must_emulate_bgr565{};             ///< Emulates BGR565 by swizzling RGB565 format.
+    u32 max_vertex_input_attributes{};      ///< Max vertex input attributes in pipeline.
+    u32 max_vertex_input_bindings{};        ///< Max vertex input bindings in pipeline.
 
     // Telemetry parameters
     std::string vendor_name;                       ///< Device's driver name.
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp
index 483b534a0..7dca7341c 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.cpp
+++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp
@@ -314,6 +314,18 @@ const char* ToString(VkResult result) noexcept {
         return "VK_ERROR_VALIDATION_FAILED_EXT";
     case VkResult::VK_ERROR_INVALID_SHADER_NV:
         return "VK_ERROR_INVALID_SHADER_NV";
+    case VkResult::VK_ERROR_IMAGE_USAGE_NOT_SUPPORTED_KHR:
+        return "VK_ERROR_IMAGE_USAGE_NOT_SUPPORTED_KHR";
+    case VkResult::VK_ERROR_VIDEO_PICTURE_LAYOUT_NOT_SUPPORTED_KHR:
+        return "VK_ERROR_VIDEO_PICTURE_LAYOUT_NOT_SUPPORTED_KHR";
+    case VkResult::VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR:
+        return "VK_ERROR_VIDEO_PROFILE_OPERATION_NOT_SUPPORTED_KHR";
+    case VkResult::VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR:
+        return "VK_ERROR_VIDEO_PROFILE_FORMAT_NOT_SUPPORTED_KHR";
+    case VkResult::VK_ERROR_VIDEO_PROFILE_CODEC_NOT_SUPPORTED_KHR:
+        return "VK_ERROR_VIDEO_PROFILE_CODEC_NOT_SUPPORTED_KHR";
+    case VkResult::VK_ERROR_VIDEO_STD_VERSION_NOT_SUPPORTED_KHR:
+        return "VK_ERROR_VIDEO_STD_VERSION_NOT_SUPPORTED_KHR";
     case VkResult::VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT:
         return "VK_ERROR_INVALID_DRM_FORMAT_MODIFIER_PLANE_LAYOUT_EXT";
     case VkResult::VK_ERROR_FRAGMENTATION_EXT: