12 files changed, 65 insertions, 70 deletions
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index c47b7d866..92d77eef2 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -430,7 +430,7 @@ private:
         if (query_begin >= SizeBytes() || size < 0) {
             return;
         }
-        [[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>();
+        u64* const untracked_words = Array<Type::Untracked>();
         u64* const state_words = Array<type>();
         const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
         u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
@@ -483,7 +483,7 @@ private:
                 NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
             }
             // Exclude CPU modified pages when visiting GPU pages
-            const u64 word = current_word;
+            const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
             u64 page = page_begin;
             page_begin = 0;
 
@@ -531,7 +531,7 @@ private:
     [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
         static_assert(type != Type::Untracked);
 
-        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
+        const u64* const untracked_words = Array<Type::Untracked>();
         const u64* const state_words = Array<type>();
         const u64 num_query_words = size / BYTES_PER_WORD + 1;
         const u64 word_begin = offset / BYTES_PER_WORD;
@@ -539,7 +539,8 @@ private:
         const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
         u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
         for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
-            const u64 word = state_words[word_index];
+            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
+            const u64 word = state_words[word_index] & ~off_word;
             if (word == 0) {
                 continue;
             }
@@ -563,7 +564,7 @@ private:
     [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
         static_assert(type != Type::Untracked);
 
-        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
+        const u64* const untracked_words = Array<Type::Untracked>();
         const u64* const state_words = Array<type>();
         const u64 num_query_words = size / BYTES_PER_WORD + 1;
         const u64 word_begin = offset / BYTES_PER_WORD;
@@ -573,7 +574,8 @@ private:
         u64 begin = std::numeric_limits<u64>::max();
         u64 end = 0;
         for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
-            const u64 word = state_words[word_index];
+            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
+            const u64 word = state_words[word_index] & ~off_word;
             if (word == 0) {
                 continue;
             }
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index c6d54be63..7024a19cf 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -99,7 +99,7 @@ struct GPU::Impl {
 
     /// Signal the ending of command list.
     void OnCommandListEnd() {
-        gpu_thread.OnCommandListEnd();
+        rasterizer->ReleaseFences();
     }
 
     /// Request a host GPU memory flush from the CPU.
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 164a5252a..9c103c0d4 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -40,8 +40,6 @@ static void RunThread(std::stop_token stop_token, Core::System& system,
             scheduler.Push(submit_list->channel, std::move(submit_list->entries));
         } else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) {
             renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
-        } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {
-            rasterizer->ReleaseFences();
         } else if (std::holds_alternative<GPUTickCommand>(next.data)) {
             system.GPU().TickWork();
         } else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) {
@@ -110,10 +108,6 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
     rasterizer->OnCPUWrite(addr, size);
 }
 
-void ThreadManager::OnCommandListEnd() {
-    PushCommand(OnCommandListEndCommand());
-}
-
 u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) {
     if (!is_async) {
         // In synchronous GPU mode, block the caller until the command has executed
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index c71a419c7..90bcb5958 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -77,16 +77,12 @@ struct FlushAndInvalidateRegionCommand final {
     u64 size;
 };
 
-/// Command called within the gpu, to schedule actions after a command list end
-struct OnCommandListEndCommand final {};
-
 /// Command to make the gpu look into pending requests
 struct GPUTickCommand final {};
 
 using CommandData =
     std::variant<std::monostate, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
-                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand,
-                 GPUTickCommand>;
+                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand, GPUTickCommand>;
 
 struct CommandDataContainer {
     CommandDataContainer() = default;
@@ -134,8 +130,6 @@ public:
     /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
     void FlushAndInvalidateRegion(VAddr addr, u64 size);
 
-    void OnCommandListEnd();
-
     void TickGPU();
 
 private:
diff --git a/src/video_core/host1x/vic.cpp b/src/video_core/host1x/vic.cpp
index 36a04e4e0..10d7ef884 100644
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@@ -189,9 +189,7 @@ void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) {
     for (std::size_t y = 0; y < frame_height; ++y) {
         const std::size_t src = y * stride;
         const std::size_t dst = y * aligned_width;
-        for (std::size_t x = 0; x < frame_width; ++x) {
-            luma_buffer[dst + x] = luma_src[src + x];
-        }
+        std::memcpy(luma_buffer.data() + dst, luma_src + src, frame_width);
     }
     host1x.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
                                       luma_buffer.size());
@@ -205,15 +203,15 @@ void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) {
         // Frame from FFmpeg software
         // Populate chroma buffer from both channels with interleaving.
         const std::size_t half_width = frame_width / 2;
+        u8* chroma_buffer_data = chroma_buffer.data();
         const u8* chroma_b_src = frame->data[1];
         const u8* chroma_r_src = frame->data[2];
         for (std::size_t y = 0; y < half_height; ++y) {
             const std::size_t src = y * half_stride;
             const std::size_t dst = y * aligned_width;
-
             for (std::size_t x = 0; x < half_width; ++x) {
-                chroma_buffer[dst + x * 2] = chroma_b_src[src + x];
-                chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x];
+                chroma_buffer_data[dst + x * 2] = chroma_b_src[src + x];
+                chroma_buffer_data[dst + x * 2 + 1] = chroma_r_src[src + x];
             }
         }
         break;
@@ -225,9 +223,7 @@ void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) {
         for (std::size_t y = 0; y < half_height; ++y) {
             const std::size_t src = y * stride;
             const std::size_t dst = y * aligned_width;
-            for (std::size_t x = 0; x < frame_width; ++x) {
-                chroma_buffer[dst + x] = chroma_src[src + x];
-            }
+            std::memcpy(chroma_buffer.data() + dst, chroma_src + src, frame_width);
         }
         break;
     }
diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
index 26d066004..1a0cea9b7 100644
--- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
+++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
@@ -30,7 +30,7 @@ bool ComputePipelineKey::operator==(const ComputePipelineKey& rhs) const noexcep
 ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cache_,
                                  BufferCache& buffer_cache_, ProgramManager& program_manager_,
                                  const Shader::Info& info_, std::string code,
-                                 std::vector<u32> code_v)
+                                 std::vector<u32> code_v, bool force_context_flush)
     : texture_cache{texture_cache_}, buffer_cache{buffer_cache_},
       program_manager{program_manager_}, info{info_} {
     switch (device.GetShaderBackend()) {
@@ -63,6 +63,15 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac
     writes_global_memory = !use_storage_buffers &&
                            std::ranges::any_of(info.storage_buffers_descriptors,
                                                [](const auto& desc) { return desc.is_written; });
+    if (force_context_flush) {
+        std::scoped_lock lock{built_mutex};
+        built_fence.Create();
+        // Flush this context to ensure compilation commands and fence are in the GPU pipe.
+        glFlush();
+        built_condvar.notify_one();
+    } else {
+        is_built = true;
+    }
 }
 
 void ComputePipeline::Configure() {
@@ -142,6 +151,9 @@ void ComputePipeline::Configure() {
     }
     texture_cache.FillComputeImageViews(std::span(views.data(), views.size()));
 
+    if (!is_built) {
+        WaitForBuild();
+    }
     if (assembly_program.handle != 0) {
         program_manager.BindComputeAssemblyProgram(assembly_program.handle);
     } else {
@@ -223,4 +235,13 @@ void ComputePipeline::Configure() {
     }
 }
 
+void ComputePipeline::WaitForBuild() {
+    if (built_fence.handle == 0) {
+        std::unique_lock lock{built_mutex};
+        built_condvar.wait(lock, [this] { return built_fence.handle != 0; });
+    }
+    ASSERT(glClientWaitSync(built_fence.handle, 0, GL_TIMEOUT_IGNORED) != GL_WAIT_FAILED);
+    is_built = true;
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.h b/src/video_core/renderer_opengl/gl_compute_pipeline.h
index 6534dec32..9bcc72b59 100644
--- a/src/video_core/renderer_opengl/gl_compute_pipeline.h
+++ b/src/video_core/renderer_opengl/gl_compute_pipeline.h
@@ -50,7 +50,8 @@ class ComputePipeline {
 public:
     explicit ComputePipeline(const Device& device, TextureCache& texture_cache_,
                              BufferCache& buffer_cache_, ProgramManager& program_manager_,
-                             const Shader::Info& info_, std::string code, std::vector<u32> code_v);
+                             const Shader::Info& info_, std::string code, std::vector<u32> code_v,
+                             bool force_context_flush = false);
 
     void Configure();
 
@@ -65,6 +66,8 @@ public:
     }
 
 private:
+    void WaitForBuild();
+
     TextureCache& texture_cache;
     BufferCache& buffer_cache;
     Tegra::MemoryManager* gpu_memory;
@@ -81,6 +84,11 @@ private:
 
     bool use_storage_buffers{};
     bool writes_global_memory{};
+
+    std::mutex built_mutex;
+    std::condition_variable built_condvar;
+    OGLSync built_fence{};
+    bool is_built{false};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
index c115dabe1..29491e762 100644
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
@@ -176,7 +176,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c
                                    std::array<std::string, 5> sources,
                                    std::array<std::vector<u32>, 5> sources_spirv,
                                    const std::array<const Shader::Info*, 5>& infos,
-                                   const GraphicsPipelineKey& key_)
+                                   const GraphicsPipelineKey& key_, bool force_context_flush)
     : texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, program_manager{program_manager_},
       state_tracker{state_tracker_}, key{key_} {
     if (shader_notify) {
@@ -231,7 +231,8 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c
     const bool in_parallel = thread_worker != nullptr;
     const auto backend = device.GetShaderBackend();
     auto func{[this, sources = std::move(sources), sources_spirv = std::move(sources_spirv),
-               shader_notify, backend, in_parallel](ShaderContext::Context*) mutable {
+               shader_notify, backend, in_parallel,
+               force_context_flush](ShaderContext::Context*) mutable {
         for (size_t stage = 0; stage < 5; ++stage) {
             switch (backend) {
             case Settings::ShaderBackend::GLSL:
@@ -251,7 +252,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c
                 break;
             }
         }
-        if (in_parallel) {
+        if (force_context_flush || in_parallel) {
             std::scoped_lock lock{built_mutex};
             built_fence.Create();
             // Flush this context to ensure compilation commands and fence are in the GPU pipe.
diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.h b/src/video_core/renderer_opengl/gl_graphics_pipeline.h
index 1c06b3655..7bab3be0a 100644
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h
@@ -78,7 +78,7 @@ public:
                               std::array<std::string, 5> sources,
                               std::array<std::vector<u32>, 5> sources_spirv,
                               const std::array<const Shader::Info*, 5>& infos,
-                              const GraphicsPipelineKey& key_);
+                              const GraphicsPipelineKey& key_, bool force_context_flush = false);
 
     void Configure(bool is_indexed) {
         configure_func(this, is_indexed);
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 7dd854e0f..626ea7dcb 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -286,7 +286,7 @@ void ShaderCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading,
         file.read(reinterpret_cast<char*>(&key), sizeof(key));
         queue_work([this, key, env = std::move(env), &state, &callback](Context* ctx) mutable {
             ctx->pools.ReleaseContents();
-            auto pipeline{CreateComputePipeline(ctx->pools, key, env)};
+            auto pipeline{CreateComputePipeline(ctx->pools, key, env, true)};
             std::scoped_lock lock{state.mutex};
             if (pipeline) {
                 compute_cache.emplace(key, std::move(pipeline));
@@ -307,7 +307,7 @@ void ShaderCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading,
                 env_ptrs.push_back(&env);
             }
             ctx->pools.ReleaseContents();
-            auto pipeline{CreateGraphicsPipeline(ctx->pools, key, MakeSpan(env_ptrs), false)};
+            auto pipeline{CreateGraphicsPipeline(ctx->pools, key, MakeSpan(env_ptrs), false, true)};
             std::scoped_lock lock{state.mutex};
             if (pipeline) {
                 graphics_cache.emplace(key, std::move(pipeline));
@@ -439,7 +439,8 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline() {
 
 std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
     ShaderContext::ShaderPools& pools, const GraphicsPipelineKey& key,
-    std::span<Shader::Environment* const> envs, bool build_in_parallel) try {
+    std::span<Shader::Environment* const> envs, bool use_shader_workers,
+    bool force_context_flush) try {
     LOG_INFO(Render_OpenGL, "0x{:016x}", key.Hash());
     size_t env_index{};
     u32 total_storage_buffers{};
@@ -531,10 +532,10 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
         }
         previous_program = &program;
     }
-    auto* const thread_worker{build_in_parallel ? workers.get() : nullptr};
+    auto* const thread_worker{use_shader_workers ? workers.get() : nullptr};
     return std::make_unique<GraphicsPipeline>(device, texture_cache, buffer_cache, program_manager,
                                               state_tracker, thread_worker, &shader_notify, sources,
-                                              sources_spirv, infos, key);
+                                              sources_spirv, infos, key, force_context_flush);
 
 } catch (Shader::Exception& exception) {
     LOG_ERROR(Render_OpenGL, "{}", exception.what());
@@ -559,8 +560,8 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline(
 }
 
 std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline(
-    ShaderContext::ShaderPools& pools, const ComputePipelineKey& key,
-    Shader::Environment& env) try {
+    ShaderContext::ShaderPools& pools, const ComputePipelineKey& key, Shader::Environment& env,
+    bool force_context_flush) try {
     LOG_INFO(Render_OpenGL, "0x{:016x}", key.Hash());
 
     Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()};
@@ -589,7 +590,7 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline(
     }
 
     return std::make_unique<ComputePipeline>(device, texture_cache, buffer_cache, program_manager,
-                                             program.info, code, code_spirv);
+                                             program.info, code, code_spirv, force_context_flush);
 } catch (Shader::Exception& exception) {
     LOG_ERROR(Render_OpenGL, "{}", exception.what());
     return nullptr;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index f82420592..6b9732fca 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -50,14 +50,16 @@ private:
 
     std::unique_ptr<GraphicsPipeline> CreateGraphicsPipeline(
         ShaderContext::ShaderPools& pools, const GraphicsPipelineKey& key,
-        std::span<Shader::Environment* const> envs, bool build_in_parallel);
+        std::span<Shader::Environment* const> envs, bool use_shader_workers,
+        bool force_context_flush = false);
 
     std::unique_ptr<ComputePipeline> CreateComputePipeline(const ComputePipelineKey& key,
                                                            const VideoCommon::ShaderInfo* shader);
 
     std::unique_ptr<ComputePipeline> CreateComputePipeline(ShaderContext::ShaderPools& pools,
                                                            const ComputePipelineKey& key,
-                                                           Shader::Environment& env);
+                                                           Shader::Environment& env,
+                                                           bool force_context_flush = false);
 
     std::unique_ptr<ShaderWorker> CreateWorkers() const;
 
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index f91bb5a1d..baedc4424 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -548,31 +548,7 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
     static_vector<VkVertexInputBindingDescription, 32> vertex_bindings;
     static_vector<VkVertexInputBindingDivisorDescriptionEXT, 32> vertex_binding_divisors;
     static_vector<VkVertexInputAttributeDescription, 32> vertex_attributes;
-    if (key.state.dynamic_vertex_input) {
-        const size_t num_vertex_arrays = std::min(
-            key.state.attributes.size(), static_cast<size_t>(device.GetMaxVertexInputBindings()));
-        for (size_t index = 0; index < num_vertex_arrays; ++index) {
-            const u32 type = key.state.DynamicAttributeType(index);
-            if (!stage_infos[0].loads.Generic(index) || type == 0) {
-                continue;
-            }
-            vertex_attributes.push_back({
-                .location = static_cast<u32>(index),
-                .binding = 0,
-                .format = type == 1   ? VK_FORMAT_R32_SFLOAT
-                          : type == 2 ? VK_FORMAT_R32_SINT
-                                      : VK_FORMAT_R32_UINT,
-                .offset = 0,
-            });
-        }
-        if (!vertex_attributes.empty()) {
-            vertex_bindings.push_back({
-                .binding = 0,
-                .stride = 4,
-                .inputRate = VK_VERTEX_INPUT_RATE_VERTEX,
-            });
-        }
-    } else {
+    if (!key.state.dynamic_vertex_input) {
         const size_t num_vertex_arrays = std::min(
             Maxwell::NumVertexArrays, static_cast<size_t>(device.GetMaxVertexInputBindings()));
         for (size_t index = 0; index < num_vertex_arrays; ++index) {