11 files changed, 119 insertions, 45 deletions
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index cfcda4f53..3dfba8197 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1259,7 +1259,8 @@ public:
 
                     GPUVAddr LimitAddress() const {
                         return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_high) << 32) |
-                                                     limit_low);
+                                                     limit_low) +
+                               1; // raw limit + 1, so size == LimitAddress() - StartAddress()
                     }
                 } vertex_array_limit[NumVertexArrays];
 
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 7231597d4..cde3a26b9 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -655,6 +655,7 @@ union Instruction {
     }
 
     constexpr Instruction(u64 value) : value{value} {}
+    constexpr Instruction(const Instruction& instr) : value{instr.value} {}
 
     BitField<0, 8, Register> gpr0;
     BitField<8, 8, Register> gpr8;
@@ -817,11 +818,9 @@ union Instruction {
         BitField<32, 1, u64> saturate;
         BitField<49, 2, HalfMerge> merge;
 
-        BitField<43, 1, u64> negate_a;
         BitField<44, 1, u64> abs_a;
         BitField<47, 2, HalfType> type_a;
 
-        BitField<31, 1, u64> negate_b;
         BitField<30, 1, u64> abs_b;
         BitField<28, 2, HalfType> type_b;
 
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index fd49bc2a9..dbee9f634 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -51,11 +51,8 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) {
     const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)};
 
     MapBackingMemory(gpu_addr, system.Memory().GetPointer(cpu_addr), aligned_size, cpu_addr);
-    ASSERT(system.CurrentProcess()
-               ->PageTable()
-               .SetMemoryAttribute(cpu_addr, size, Kernel::Memory::MemoryAttribute::DeviceShared,
-                                   Kernel::Memory::MemoryAttribute::DeviceShared)
-               .IsSuccess());
+    ASSERT(
+        system.CurrentProcess()->PageTable().LockForDeviceAddressSpace(cpu_addr, size).IsSuccess());
 
     return gpu_addr;
 }
@@ -66,11 +63,8 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size)
     const u64 aligned_size{Common::AlignUp(size, page_size)};
 
     MapBackingMemory(gpu_addr, system.Memory().GetPointer(cpu_addr), aligned_size, cpu_addr);
-    ASSERT(system.CurrentProcess()
-               ->PageTable()
-               .SetMemoryAttribute(cpu_addr, size, Kernel::Memory::MemoryAttribute::DeviceShared,
-                                   Kernel::Memory::MemoryAttribute::DeviceShared)
-               .IsSuccess());
+    ASSERT(
+        system.CurrentProcess()->PageTable().LockForDeviceAddressSpace(cpu_addr, size).IsSuccess());
     return gpu_addr;
 }
 
@@ -87,9 +81,7 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {
     UnmapRange(gpu_addr, aligned_size);
     ASSERT(system.CurrentProcess()
                ->PageTable()
-               .SetMemoryAttribute(cpu_addr.value(), size,
-                                   Kernel::Memory::MemoryAttribute::DeviceShared,
-                                   Kernel::Memory::MemoryAttribute::None)
+               .UnlockForDeviceAddressSpace(cpu_addr.value(), size)
                .IsSuccess());
 
     return gpu_addr;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 4c16c89d2..6fe155bcc 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -186,8 +186,12 @@ void RasterizerOpenGL::SetupVertexBuffer() {
         const GPUVAddr start = vertex_array.StartAddress();
         const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
 
-        ASSERT(end > start);
-        const u64 size = end - start + 1;
+        ASSERT(end >= start);
+        const u64 size = end - start;
+        if (size == 0) {
+            glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride);
+            continue;
+        }
         const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
         glBindVertexBuffer(static_cast<GLuint>(index), vertex_buffer, vertex_buffer_offset,
                            vertex_array.stride);
@@ -311,8 +315,8 @@ std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
         const GPUVAddr start = regs.vertex_array[index].StartAddress();
         const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
 
-        ASSERT(end > start);
-        size += end - start + 1;
+        size += end - start;
+        ASSERT(end >= start);
     }
 
     return size;
diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.h b/src/video_core/renderer_vulkan/fixed_pipeline_state.h
index 9fe6bdbf9..9a950f4de 100644
--- a/src/video_core/renderer_vulkan/fixed_pipeline_state.h
+++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.h
@@ -129,7 +129,7 @@ struct alignas(32) FixedPipelineState {
             auto& binding = bindings[index];
             binding.raw = 0;
             binding.enabled.Assign(enabled ? 1 : 0);
-            binding.stride.Assign(stride);
+            binding.stride.Assign(static_cast<u16>(stride));
             binding_divisors[index] = divisor;
         }
 
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 8a1f57891..68464e637 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -877,8 +877,12 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex
         const GPUVAddr start{vertex_array.StartAddress()};
         const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
 
-        ASSERT(end > start);
-        const std::size_t size{end - start + 1};
+        ASSERT(end >= start);
+        const std::size_t size{end - start};
+        if (size == 0) {
+            buffer_bindings.AddVertexBinding(DefaultBuffer(), 0);
+            continue;
+        }
         const auto [buffer, offset] = buffer_cache.UploadMemory(start, size);
         buffer_bindings.AddVertexBinding(buffer, offset);
     }
@@ -1033,8 +1037,7 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,
                                         const Tegra::Engines::ConstBufferInfo& buffer) {
     if (!buffer.enabled) {
         // Set values to zero to unbind buffers
-        update_descriptor_queue.AddBuffer(buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
-                                          sizeof(float));
+        update_descriptor_queue.AddBuffer(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE);
         return;
     }
 
@@ -1057,7 +1060,9 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
     if (size == 0) {
         // Sometimes global memory pointers don't have a proper size. Upload a dummy entry
         // because Vulkan doesn't like empty buffers.
-        constexpr std::size_t dummy_size = 4;
+        // Note: Do *not* use DefaultBuffer() here; storage buffers can be written to, which would
+        // corrupt the default buffer.
+        static constexpr std::size_t dummy_size = 4;
         const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size);
         update_descriptor_queue.AddBuffer(buffer, 0, dummy_size);
         return;
@@ -1222,7 +1227,7 @@ std::size_t RasterizerVulkan::CalculateVertexArraysSize() const {
         const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
         DEBUG_ASSERT(end >= start);
 
-        size += (end - start + 1) * regs.vertex_array[index].enable;
+        size += (end - start) * regs.vertex_array[index].enable;
     }
     return size;
 }
@@ -1269,4 +1274,29 @@ RenderPassParams RasterizerVulkan::GetRenderPassParams(Texceptions texceptions)
     return renderpass_params;
 }
 
+VkBuffer RasterizerVulkan::DefaultBuffer() {
+    // Create the placeholder buffer on first request; later calls reuse the stored handle.
+    if (!default_buffer) {
+        VkBufferCreateInfo create_info;
+        create_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+        create_info.pNext = nullptr;
+        create_info.flags = 0;
+        create_info.size = DEFAULT_BUFFER_SIZE;
+        create_info.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
+                            VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+        create_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+        create_info.queueFamilyIndexCount = 0;
+        create_info.pQueueFamilyIndices = nullptr;
+        default_buffer = device.GetLogical().CreateBuffer(create_info);
+        default_buffer_commit = memory_manager.Commit(default_buffer, false);
+
+        // Record a fill of the whole buffer with 0 after requesting a non-render-pass context.
+        scheduler.RequestOutsideRenderPassOperationContext();
+        scheduler.Record([handle = *default_buffer](vk::CommandBuffer cmdbuf) {
+            cmdbuf.FillBuffer(handle, 0, DEFAULT_BUFFER_SIZE, 0);
+        });
+    }
+    return *default_buffer;
+}
+
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 2fa46b0cc..d41a7929e 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -155,6 +155,7 @@ private:
     using Texceptions = std::bitset<Maxwell::NumRenderTargets + 1>;
 
     static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8;
+    static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float);
 
     void FlushWork();
 
@@ -247,6 +248,8 @@ private:
 
     RenderPassParams GetRenderPassParams(Texceptions texceptions) const;
 
+    VkBuffer DefaultBuffer();
+
     Core::System& system;
     Core::Frontend::EmuWindow& render_window;
     VKScreenInfo& screen_info;
@@ -271,6 +274,9 @@ private:
     VKFenceManager fence_manager;
     VKQueryCache query_cache;
 
+    vk::Buffer default_buffer;
+    VKMemoryCommit default_buffer_commit;
+
     std::array<View, Maxwell::NumRenderTargets> color_attachments;
     View zeta_attachment;
 
diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
index 94d954d7a..c76ab5c2d 100644
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
@@ -81,7 +81,7 @@ VKBuffer& VKStagingBufferPool::CreateStagingBuffer(std::size_t size, bool host_v
     ci.size = 1ULL << log2;
     ci.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
                VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
-               VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
+               VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
     ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
     ci.queueFamilyIndexCount = 0;
     ci.pQueueFamilyIndices = nullptr;
diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp
index 539f3c974..7f5bc1404 100644
--- a/src/video_core/renderer_vulkan/wrapper.cpp
+++ b/src/video_core/renderer_vulkan/wrapper.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <algorithm>
 #include <exception>
 #include <memory>
 #include <optional>
@@ -16,6 +17,23 @@ namespace Vulkan::vk {
 
 namespace {
 
+void SortPhysicalDevices(std::vector<VkPhysicalDevice>& devices, const InstanceDispatch& dld) {
+    // Lower rank sorts first: discrete GPUs, then Nvidia > AMD > Intel > any other vendor.
+    const auto rank = [&dld](VkPhysicalDevice device) {
+        // This will call Vulkan more than needed, but these calls are cheap.
+        const auto props = vk::PhysicalDevice(device, dld).GetProperties();
+        const auto id = props.vendorID;
+        const int vendor_rank = id == 0x10DE ? 0 : id == 0x1002 ? 1 : id == 0x8086 ? 2 : 3;
+        const int type_rank = props.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU ? 0 : 1;
+        return type_rank * 4 + vendor_rank;
+    };
+    // std::stable_sort needs a strict weak ordering: equivalent devices must compare false both
+    // ways (they then keep enumeration order), and preferred devices must compare as "less".
+    std::stable_sort(devices.begin(), devices.end(), [&rank](auto lhs, auto rhs) {
+        return rank(lhs) < rank(rhs);
+    });
+}
+
 template <typename T>
 bool Proc(T& result, const InstanceDispatch& dld, const char* proc_name,
           VkInstance instance = nullptr) noexcept {
@@ -389,7 +407,8 @@ std::optional<std::vector<VkPhysicalDevice>> Instance::EnumeratePhysicalDevices(
     if (dld->vkEnumeratePhysicalDevices(handle, &num, physical_devices.data()) != VK_SUCCESS) {
         return std::nullopt;
     }
-    return physical_devices;
+    SortPhysicalDevices(physical_devices, *dld);
+    return std::make_optional(std::move(physical_devices));
 }
 
 DebugCallback Instance::TryCreateDebugCallback(
diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp
index ee7d9a29d..a276aee44 100644
--- a/src/video_core/shader/decode/arithmetic_half.cpp
+++ b/src/video_core/shader/decode/arithmetic_half.cpp
@@ -19,22 +19,46 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
 
-    if (opcode->get().GetId() == OpCode::Id::HADD2_C ||
-        opcode->get().GetId() == OpCode::Id::HADD2_R) {
+    bool negate_a = false;
+    bool negate_b = false;
+    bool absolute_a = false;
+    bool absolute_b = false;
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::HADD2_R:
         if (instr.alu_half.ftz == 0) {
             LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
         }
+        negate_a = ((instr.value >> 43) & 1) != 0;
+        negate_b = ((instr.value >> 31) & 1) != 0;
+        absolute_a = ((instr.value >> 44) & 1) != 0;
+        absolute_b = ((instr.value >> 30) & 1) != 0;
+        break;
+    case OpCode::Id::HADD2_C:
+        if (instr.alu_half.ftz == 0) {
+            LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+        }
+        negate_a = ((instr.value >> 43) & 1) != 0;
+        negate_b = ((instr.value >> 56) & 1) != 0;
+        absolute_a = ((instr.value >> 44) & 1) != 0;
+        absolute_b = ((instr.value >> 54) & 1) != 0;
+        break;
+    case OpCode::Id::HMUL2_R:
+        negate_a = ((instr.value >> 43) & 1) != 0;
+        absolute_a = ((instr.value >> 44) & 1) != 0;
+        absolute_b = ((instr.value >> 30) & 1) != 0;
+        break;
+    case OpCode::Id::HMUL2_C:
+        negate_b = ((instr.value >> 31) & 1) != 0;
+        absolute_a = ((instr.value >> 44) & 1) != 0;
+        absolute_b = ((instr.value >> 54) & 1) != 0;
+        break;
     }
 
-    const bool negate_a =
-        opcode->get().GetId() != OpCode::Id::HMUL2_R && instr.alu_half.negate_a != 0;
-    const bool negate_b =
-        opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0;
-
     Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a);
-    op_a = GetOperandAbsNegHalf(op_a, instr.alu_half.abs_a, negate_a);
+    op_a = GetOperandAbsNegHalf(op_a, absolute_a, negate_a);
 
-    auto [type_b, op_b] = [&]() -> std::tuple<HalfType, Node> {
+    auto [type_b, op_b] = [this, instr, opcode]() -> std::pair<HalfType, Node> {
         switch (opcode->get().GetId()) {
         case OpCode::Id::HADD2_C:
         case OpCode::Id::HMUL2_C:
@@ -48,17 +72,16 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
         }
     }();
     op_b = UnpackHalfFloat(op_b, type_b);
-    // redeclaration to avoid a bug in clang with reusing local bindings in lambdas
-    Node op_b_alt = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b);
+    op_b = GetOperandAbsNegHalf(op_b, absolute_b, negate_b);
 
-    Node value = [&]() {
+    Node value = [this, opcode, op_a, op_b = op_b] {
         switch (opcode->get().GetId()) {
         case OpCode::Id::HADD2_C:
         case OpCode::Id::HADD2_R:
-            return Operation(OperationCode::HAdd, PRECISE, op_a, op_b_alt);
+            return Operation(OperationCode::HAdd, PRECISE, op_a, op_b);
         case OpCode::Id::HMUL2_C:
         case OpCode::Id::HMUL2_R:
-            return Operation(OperationCode::HMul, PRECISE, op_a, op_b_alt);
+            return Operation(OperationCode::HMul, PRECISE, op_a, op_b);
         default:
             UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", opcode->get().GetName());
             return Immediate(0);
diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp
index 0f4c3103a..9af8c606d 100644
--- a/src/video_core/shader/decode/arithmetic_integer.cpp
+++ b/src/video_core/shader/decode/arithmetic_integer.cpp
@@ -249,8 +249,8 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) {
             }
             case OpCode::Id::LEA_IMM: {
                 const bool neg = instr.lea.imm.neg != 0;
-                return {Immediate(static_cast<u32>(instr.lea.imm.entry_a)),
-                        GetOperandAbsNegInteger(GetRegister(instr.gpr8), false, neg, true),
+                return {GetOperandAbsNegInteger(GetRegister(instr.gpr8), false, neg, true),
+                        Immediate(static_cast<u32>(instr.lea.imm.entry_a)),
                         Immediate(static_cast<u32>(instr.lea.imm.entry_b))};
             }
             case OpCode::Id::LEA_RZ: {