9 files changed, 189 insertions, 76 deletions
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index a126c359c..02e161270 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -77,6 +77,14 @@ void Fermi2D::Blit() {
     const auto bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
     const bool delegate_to_gpu = src.width > 512 && src.height > 512 && bytes_per_pixel <= 8 &&
                                  src.format != regs.dst.format;
+
+    auto srcX = args.src_x0;
+    auto srcY = args.src_y0;
+    if (args.sample_mode.origin == Origin::Corner) {
+        srcX -= (args.du_dx >> 33) << 32;
+        srcY -= (args.dv_dy >> 33) << 32;
+    }
+
     Config config{
         .operation = regs.operation,
         .filter = args.sample_mode.filter,
@@ -86,10 +94,10 @@ void Fermi2D::Blit() {
         .dst_y0 = args.dst_y0,
         .dst_x1 = args.dst_x0 + args.dst_width,
         .dst_y1 = args.dst_y0 + args.dst_height,
-        .src_x0 = static_cast<s32>(args.src_x0 >> 32),
-        .src_y0 = static_cast<s32>(args.src_y0 >> 32),
-        .src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32),
-        .src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32),
+        .src_x0 = static_cast<s32>(srcX >> 32),
+        .src_y0 = static_cast<s32>(srcY >> 32),
+        .src_x1 = static_cast<s32>((srcX + args.du_dx * args.dst_width) >> 32),
+        .src_y1 = static_cast<s32>((srcY + args.dv_dy * args.dst_height) >> 32),
     };
 
     const auto need_align_to_pitch =
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 614d61db4..0932fadc2 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -4,6 +4,7 @@
 #include <cstring>
 #include <optional>
 #include "common/assert.h"
+#include "common/bit_util.h"
 #include "common/scope_exit.h"
 #include "common/settings.h"
 #include "core/core.h"
@@ -259,12 +260,13 @@ u32 Maxwell3D::GetMaxCurrentVertices() {
 size_t Maxwell3D::EstimateIndexBufferSize() {
     GPUVAddr start_address = regs.index_buffer.StartAddress();
     GPUVAddr end_address = regs.index_buffer.EndAddress();
-    static constexpr std::array<size_t, 4> max_sizes = {
-        std::numeric_limits<u8>::max(), std::numeric_limits<u16>::max(),
-        std::numeric_limits<u32>::max(), std::numeric_limits<u32>::max()};
+    static constexpr std::array<size_t, 3> max_sizes = {std::numeric_limits<u8>::max(),
+                                                        std::numeric_limits<u16>::max(),
+                                                        std::numeric_limits<u32>::max()};
     const size_t byte_size = regs.index_buffer.FormatSizeInBytes();
+    const size_t log2_byte_size = Common::Log2Ceil64(byte_size); // picks the max_sizes slot
     return std::min<size_t>(
-        memory_manager.GetMemoryLayoutSize(start_address, byte_size * max_sizes[byte_size]) /
+        memory_manager.GetMemoryLayoutSize(start_address, byte_size * max_sizes[log2_byte_size]) /
             byte_size,
         static_cast<size_t>(end_address - start_address));
 }
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 3ff8cad83..cc0b95f1a 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -176,6 +176,10 @@ public:
         return vendor_name == "ATI Technologies Inc.";
     }
 
+    bool IsIntel() const {
+        return vendor_name == "Intel"; // exact match; confirm for all Intel drivers
+    }
+
     bool CanReportMemoryUsage() const {
         return can_report_memory;
     }
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 479bb8ba3..6ecda2984 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -218,6 +218,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
           .lower_left_origin_mode = true,
           .need_declared_frag_colors = true,
           .need_fastmath_off = device.NeedsFastmathOff(),
+          .need_gather_subpixel_offset = device.IsAmd() || device.IsIntel(),
 
           .has_broken_spirv_clamp = true,
           .has_broken_unsigned_image_offsets = true,
diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp
index 8aa07ef9d..47c74e4d8 100644
--- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp
+++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp
@@ -10,7 +10,14 @@
 
 namespace Vulkan {
 
-MasterSemaphore::MasterSemaphore(const Device& device) {
+MasterSemaphore::MasterSemaphore(const Device& device_) : device(device_) {
+    if (!device.HasTimelineSemaphore()) {
+        static constexpr VkFenceCreateInfo fence_ci{
+            .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, .pNext = nullptr, .flags = 0};
+        fence = device.GetLogical().CreateFence(fence_ci);
+        return;
+    }
+
     static constexpr VkSemaphoreTypeCreateInfo semaphore_type_ci{
         .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
         .pNext = nullptr,
@@ -42,4 +49,134 @@ MasterSemaphore::MasterSemaphore(const Device& device) {
 
 MasterSemaphore::~MasterSemaphore() = default;
 
+void MasterSemaphore::Refresh() {
+    // Without a timeline semaphore there is no GPU-side counter to poll
+    if (!semaphore) {
+        return;
+    }
+    // Publish the semaphore counter into gpu_tick; retry while the CAS fails, and stop early
+    // once the counter reads lower than the tick that is already stored
+    for (;;) {
+        u64 known_tick = gpu_tick.load(std::memory_order_acquire);
+        const u64 gpu_counter = semaphore.GetCounter();
+        if (gpu_counter < known_tick ||
+            gpu_tick.compare_exchange_weak(known_tick, gpu_counter, std::memory_order_release,
+                                           std::memory_order_relaxed)) {
+            return;
+        }
+    }
+}
+
+// Blocks the calling thread until the known GPU tick has reached `tick`.
+void MasterSemaphore::Wait(u64 tick) {
+    if (!semaphore) {
+        // Fence fallback: gpu_tick is stored and notified by SubmitQueueFence, so sleep on
+        // the atomic until it reaches the tick. The loads use acquire, like Refresh(), so
+        // whatever was published before that store is visible once this returns.
+        u64 known_tick = gpu_tick.load(std::memory_order_acquire);
+        while (known_tick < tick) {
+            gpu_tick.wait(known_tick);
+            known_tick = gpu_tick.load(std::memory_order_acquire);
+        }
+        return;
+    }
+
+    // No need to wait if the GPU is ahead of the tick
+    if (IsFree(tick)) {
+        return;
+    }
+
+    // Update the GPU tick and try again
+    Refresh();
+
+    if (IsFree(tick)) {
+        return;
+    }
+
+    // If none of the above is hit, fallback to a regular wait
+    while (!semaphore.Wait(tick)) {
+    }
+
+    Refresh();
+}
+
+VkResult MasterSemaphore::SubmitQueue(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
+                                      VkSemaphore wait_semaphore, u64 host_tick) {
+    // No timeline semaphore was created (fence fallback): submit with the fence instead
+    if (!semaphore) {
+        return SubmitQueueFence(cmdbuf, signal_semaphore, wait_semaphore, host_tick);
+    }
+    return SubmitQueueTimeline(cmdbuf, signal_semaphore, wait_semaphore, host_tick);
+}
+
+static constexpr std::array<VkPipelineStageFlags, 2> wait_stage_masks{
+    VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,            // [0]: first wait semaphore (both paths)
+    VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, // [1]: wait_semaphore, timeline path only
+};
+
+VkResult MasterSemaphore::SubmitQueueTimeline(vk::CommandBuffer& cmdbuf,
+                                              VkSemaphore signal_semaphore,
+                                              VkSemaphore wait_semaphore, u64 host_tick) {
+    // Slot 0 is always the timeline semaphore; slot 1 is the optional caller semaphore
+    const VkSemaphore timeline = *semaphore;
+    const std::array<VkSemaphore, 2> waits{timeline, wait_semaphore};
+    const std::array<VkSemaphore, 2> signals{timeline, signal_semaphore};
+    // Wait for the previous tick on the timeline, then signal this submission's tick
+    const std::array<u64, 2> wait_ticks{host_tick - 1, u64(1)};
+    const std::array<u64, 2> signal_ticks{host_tick, u64(0)};
+    const u32 wait_count = wait_semaphore ? 2U : 1U;
+    const u32 signal_count = signal_semaphore ? 2U : 1U;
+
+    const VkTimelineSemaphoreSubmitInfo timeline_info{
+        .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
+        .pNext = nullptr,
+        .waitSemaphoreValueCount = wait_count,
+        .pWaitSemaphoreValues = wait_ticks.data(),
+        .signalSemaphoreValueCount = signal_count,
+        .pSignalSemaphoreValues = signal_ticks.data(),
+    };
+    const VkSubmitInfo submit{
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+        .pNext = &timeline_info,
+        .waitSemaphoreCount = wait_count,
+        .pWaitSemaphores = waits.data(),
+        .pWaitDstStageMask = wait_stage_masks.data(),
+        .commandBufferCount = 1,
+        .pCommandBuffers = cmdbuf.address(),
+        .signalSemaphoreCount = signal_count,
+        .pSignalSemaphores = signals.data(),
+    };
+
+    return device.GetGraphicsQueue().Submit(submit);
+}
+
+VkResult MasterSemaphore::SubmitQueueFence(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
+                                           VkSemaphore wait_semaphore, u64 host_tick) {
+    // A null semaphore handle is excluded by giving it a count of zero
+    const u32 wait_count = wait_semaphore ? 1U : 0U;
+    const u32 signal_count = signal_semaphore ? 1U : 0U;
+    const VkSubmitInfo submit{
+        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+        .pNext = nullptr,
+        .waitSemaphoreCount = wait_count,
+        .pWaitSemaphores = &wait_semaphore,
+        .pWaitDstStageMask = wait_stage_masks.data(),
+        .commandBufferCount = 1,
+        .pCommandBuffers = cmdbuf.address(),
+        .signalSemaphoreCount = signal_count,
+        .pSignalSemaphores = &signal_semaphore,
+    };
+
+    const VkResult result = device.GetGraphicsQueue().Submit(submit, *fence);
+    if (result != VK_SUCCESS) {
+        return result;
+    }
+    // Wait on the submission fence, reset it for reuse, then publish the tick to waiters
+    fence.Wait();
+    fence.Reset();
+    gpu_tick.store(host_tick);
+    gpu_tick.notify_all();
+    return result;
+}
+
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h
index 689f02ea5..f2f61f781 100644
--- a/src/video_core/renderer_vulkan/vk_master_semaphore.h
+++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h
@@ -4,6 +4,8 @@
 #pragma once
 
 #include <atomic>
+#include <condition_variable>
+#include <mutex>
 #include <thread>
 
 #include "common/common_types.h"
@@ -29,11 +31,6 @@ public:
         return gpu_tick.load(std::memory_order_acquire);
     }
 
-    /// Returns the timeline semaphore handle.
-    [[nodiscard]] VkSemaphore Handle() const noexcept {
-        return *semaphore;
-    }
-
     /// Returns true when a tick has been hit by the GPU.
     [[nodiscard]] bool IsFree(u64 tick) const noexcept {
         return KnownGpuTick() >= tick;
@@ -45,37 +42,24 @@ public:
     }
 
     /// Refresh the known GPU tick
-    void Refresh() {
-        u64 this_tick{};
-        u64 counter{};
-        do {
-            this_tick = gpu_tick.load(std::memory_order_acquire);
-            counter = semaphore.GetCounter();
-            if (counter < this_tick) {
-                return;
-            }
-        } while (!gpu_tick.compare_exchange_weak(this_tick, counter, std::memory_order_release,
-                                                 std::memory_order_relaxed));
-    }
+    void Refresh();
 
     /// Waits for a tick to be hit on the GPU
-    void Wait(u64 tick) {
-        // No need to wait if the GPU is ahead of the tick
-        if (IsFree(tick)) {
-            return;
-        }
-        // Update the GPU tick and try again
-        Refresh();
-        if (IsFree(tick)) {
-            return;
-        }
-        // If none of the above is hit, fallback to a regular wait
-        while (!semaphore.Wait(tick)) {
-        }
-        Refresh();
-    }
+    void Wait(u64 tick);
+
+    /// Submits the device graphics queue, updating the tick as necessary
+    VkResult SubmitQueue(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
+                         VkSemaphore wait_semaphore, u64 host_tick);
+
+private:
+    VkResult SubmitQueueTimeline(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
+                                 VkSemaphore wait_semaphore, u64 host_tick);
+    VkResult SubmitQueueFence(vk::CommandBuffer& cmdbuf, VkSemaphore signal_semaphore,
+                              VkSemaphore wait_semaphore, u64 host_tick);
 
 private:
+    const Device& device;             ///< Device.
+    vk::Fence fence;                  ///< Fence.
     vk::Semaphore semaphore;          ///< Timeline semaphore.
     std::atomic<u64> gpu_tick{0};     ///< Current known GPU tick.
     std::atomic<u64> current_tick{1}; ///< Current logical tick.
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 0684cceed..985cc3203 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -329,6 +329,11 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device
 
         .lower_left_origin_mode = false,
         .need_declared_frag_colors = false,
+        .need_gather_subpixel_offset = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY ||
+                                       driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE ||
+                                       driver_id == VK_DRIVER_ID_MESA_RADV ||
+                                       driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS ||
+                                       driver_id == VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA,
 
         .has_broken_spirv_clamp = driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS,
         .has_broken_spirv_position_input = driver_id == VK_DRIVER_ID_QUALCOMM_PROPRIETARY,
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index b264e6ada..057e16967 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -212,45 +212,13 @@ void Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_s
     const u64 signal_value = master_semaphore->NextTick();
     Record([signal_semaphore, wait_semaphore, signal_value, this](vk::CommandBuffer cmdbuf) {
         cmdbuf.End();
-        const VkSemaphore timeline_semaphore = master_semaphore->Handle();
-
-        const u32 num_signal_semaphores = signal_semaphore ? 2U : 1U;
-        const std::array signal_values{signal_value, u64(0)};
-        const std::array signal_semaphores{timeline_semaphore, signal_semaphore};
-
-        const u32 num_wait_semaphores = wait_semaphore ? 2U : 1U;
-        const std::array wait_values{signal_value - 1, u64(1)};
-        const std::array wait_semaphores{timeline_semaphore, wait_semaphore};
-        static constexpr std::array<VkPipelineStageFlags, 2> wait_stage_masks{
-            VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
-            VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
-        };
-
-        const VkTimelineSemaphoreSubmitInfo timeline_si{
-            .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
-            .pNext = nullptr,
-            .waitSemaphoreValueCount = num_wait_semaphores,
-            .pWaitSemaphoreValues = wait_values.data(),
-            .signalSemaphoreValueCount = num_signal_semaphores,
-            .pSignalSemaphoreValues = signal_values.data(),
-        };
-        const VkSubmitInfo submit_info{
-            .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
-            .pNext = &timeline_si,
-            .waitSemaphoreCount = num_wait_semaphores,
-            .pWaitSemaphores = wait_semaphores.data(),
-            .pWaitDstStageMask = wait_stage_masks.data(),
-            .commandBufferCount = 1,
-            .pCommandBuffers = cmdbuf.address(),
-            .signalSemaphoreCount = num_signal_semaphores,
-            .pSignalSemaphores = signal_semaphores.data(),
-        };
 
         if (on_submit) {
             on_submit();
         }
 
-        switch (const VkResult result = device.GetGraphicsQueue().Submit(submit_info)) {
+        switch (const VkResult result = master_semaphore->SubmitQueue(
+                    cmdbuf, signal_semaphore, wait_semaphore, signal_value)) {
         case VK_SUCCESS:
             break;
         case VK_ERROR_DEVICE_LOST:
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index 41b5da18a..7d5018151 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -145,7 +145,6 @@
     FEATURE_NAME(robustness2, robustImageAccess2)                                                  \
     FEATURE_NAME(shader_demote_to_helper_invocation, shaderDemoteToHelperInvocation)               \
     FEATURE_NAME(shader_draw_parameters, shaderDrawParameters)                                     \
-    FEATURE_NAME(timeline_semaphore, timelineSemaphore)                                            \
     FEATURE_NAME(variable_pointer, variablePointers)                                               \
     FEATURE_NAME(variable_pointer, variablePointersStorageBuffer)
 
@@ -158,6 +157,7 @@
     FEATURE_NAME(provoking_vertex, provokingVertexLast)                                            \
     FEATURE_NAME(shader_float16_int8, shaderFloat16)                                               \
     FEATURE_NAME(shader_float16_int8, shaderInt8)                                                  \
+    FEATURE_NAME(timeline_semaphore, timelineSemaphore)                                            \
     FEATURE_NAME(transform_feedback, transformFeedback)                                            \
     FEATURE_NAME(uniform_buffer_standard_layout, uniformBufferStandardLayout)                      \
     FEATURE_NAME(vertex_input_dynamic_state, vertexInputDynamicState)
@@ -493,6 +493,10 @@ public:
         return extensions.shader_atomic_int64;
     }
 
+    bool HasTimelineSemaphore() const {
+        return features.timeline_semaphore.timelineSemaphore; // stored feature flag
+    }
+
     /// Returns the minimum supported version of SPIR-V.
     u32 SupportedSpirvVersion() const {
         if (instance_version >= VK_API_VERSION_1_3) {