25 files changed, 1175 insertions, 161 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index db9332d00..4b0c6346f 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -37,6 +37,7 @@ add_library(video_core STATIC
     memory_manager.h
     morton.cpp
     morton.h
+    query_cache.h
     rasterizer_accelerated.cpp
     rasterizer_accelerated.h
     rasterizer_cache.cpp
@@ -74,6 +75,8 @@ add_library(video_core STATIC
     renderer_opengl/gl_stream_buffer.h
     renderer_opengl/gl_texture_cache.cpp
     renderer_opengl/gl_texture_cache.h
+    renderer_opengl/gl_query_cache.cpp
+    renderer_opengl/gl_query_cache.h
     renderer_opengl/maxwell_to_gl.h
     renderer_opengl/renderer_opengl.cpp
     renderer_opengl/renderer_opengl.h
@@ -177,6 +180,8 @@ if (ENABLE_VULKAN)
         renderer_vulkan/vk_memory_manager.h
         renderer_vulkan/vk_pipeline_cache.cpp
         renderer_vulkan/vk_pipeline_cache.h
+        renderer_vulkan/vk_query_cache.cpp
+        renderer_vulkan/vk_query_cache.h
         renderer_vulkan/vk_rasterizer.cpp
         renderer_vulkan/vk_rasterizer.h
         renderer_vulkan/vk_renderpass_cache.cpp
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 0b3e8749b..b28de1092 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -4,6 +4,7 @@
 
 #include <cinttypes>
 #include <cstring>
+#include <optional>
 #include "common/assert.h"
 #include "core/core.h"
 #include "core/core_timing.h"
@@ -16,6 +17,8 @@
 
 namespace Tegra::Engines {
 
+using VideoCore::QueryType;
+
 /// First register id that is actually a Macro call.
 constexpr u32 MacroRegistersStart = 0xE00;
 
@@ -400,6 +403,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
         ProcessQueryCondition();
         break;
     }
+    case MAXWELL3D_REG_INDEX(counter_reset): {
+        ProcessCounterReset();
+        break;
+    }
     case MAXWELL3D_REG_INDEX(sync_info): {
         ProcessSyncPoint();
         break;
@@ -482,7 +489,7 @@ void Maxwell3D::FlushMMEInlineDraw() {
 
     const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed;
     if (ShouldExecute()) {
-        rasterizer.DrawMultiBatch(is_indexed);
+        rasterizer.Draw(is_indexed, true);
     }
 
     // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if
@@ -544,40 +551,28 @@ void Maxwell3D::ProcessQueryGet() {
                "Units other than CROP are unimplemented");
 
     switch (regs.query.query_get.operation) {
-    case Regs::QueryOperation::Release: {
-        const u64 result = regs.query.query_sequence;
-        StampQueryResult(result, regs.query.query_get.short_query == 0);
+    case Regs::QueryOperation::Release:
+        StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0);
         break;
-    }
-    case Regs::QueryOperation::Acquire: {
-        // Todo(Blinkhawk): Under this operation, the GPU waits for the CPU
-        // to write a value that matches the current payload.
+    case Regs::QueryOperation::Acquire:
+        // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that
+        // matches the current payload.
         UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");
         break;
-    }
-    case Regs::QueryOperation::Counter: {
-        u64 result{};
-        switch (regs.query.query_get.select) {
-        case Regs::QuerySelect::Zero:
-            result = 0;
-            break;
-        default:
-            result = 1;
-            UNIMPLEMENTED_MSG("Unimplemented query select type {}",
-                              static_cast<u32>(regs.query.query_get.select.Value()));
+    case Regs::QueryOperation::Counter:
+        if (const std::optional<u64> result = GetQueryResult()) {
+            // If the query returns an empty optional it means it's cached and deferred.
+            // In this case we have a non-empty result, so we stamp it immediately.
+            StampQueryResult(*result, regs.query.query_get.short_query == 0);
         }
-        StampQueryResult(result, regs.query.query_get.short_query == 0);
         break;
-    }
-    case Regs::QueryOperation::Trap: {
+    case Regs::QueryOperation::Trap:
         UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");
         break;
-    }
-    default: {
+    default:
         UNIMPLEMENTED_MSG("Unknown query operation");
         break;
     }
-    }
 }
 
 void Maxwell3D::ProcessQueryCondition() {
@@ -593,20 +588,20 @@ void Maxwell3D::ProcessQueryCondition() {
     }
     case Regs::ConditionMode::ResNonZero: {
         Regs::QueryCompare cmp;
-        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
         execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
         break;
     }
     case Regs::ConditionMode::Equal: {
         Regs::QueryCompare cmp;
-        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
         execute_on =
             cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
         break;
     }
     case Regs::ConditionMode::NotEqual: {
         Regs::QueryCompare cmp;
-        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
         execute_on =
             cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
         break;
@@ -619,6 +614,18 @@ void Maxwell3D::ProcessQueryCondition() {
     }
 }
 
+void Maxwell3D::ProcessCounterReset() {
+    switch (regs.counter_reset) {
+    case Regs::CounterReset::SampleCnt:
+        rasterizer.ResetCounter(QueryType::SamplesPassed);
+        break;
+    default:
+        LOG_WARNING(HW_GPU, "Unimplemented counter reset={}",
+                    static_cast<int>(regs.counter_reset));
+        break;
+    }
+}
+
 void Maxwell3D::ProcessSyncPoint() {
     const u32 sync_point = regs.sync_info.sync_point.Value();
     const u32 increment = regs.sync_info.increment.Value();
@@ -647,7 +654,7 @@ void Maxwell3D::DrawArrays() {
 
     const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count};
     if (ShouldExecute()) {
-        rasterizer.DrawBatch(is_indexed);
+        rasterizer.Draw(is_indexed, false);
     }
 
     // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if
@@ -661,6 +668,22 @@ void Maxwell3D::DrawArrays() {
     }
 }
 
+std::optional<u64> Maxwell3D::GetQueryResult() {
+    switch (regs.query.query_get.select) {
+    case Regs::QuerySelect::Zero:
+        return 0;
+    case Regs::QuerySelect::SamplesPassed:
+        // Deferred: the query is handed to the rasterizer and nothing is returned to stamp here.
+        rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed,
+                         system.GPU().GetTicks());
+        return {};
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented query select type {}",
+                          static_cast<u32>(regs.query.query_get.select.Value()));
+        return 1;
+    }
+}
+
 void Maxwell3D::ProcessCBBind(std::size_t stage_index) {
     // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
     auto& shader = state.shader_stages[stage_index];
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 0a2af54e5..26939be3f 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <bitset>
+#include <optional>
 #include <type_traits>
 #include <unordered_map>
 #include <vector>
@@ -409,6 +410,27 @@ public:
             Linear = 1,
         };
 
+        enum class CounterReset : u32 {
+            SampleCnt = 0x01,
+            Unk02 = 0x02,
+            Unk03 = 0x03,
+            Unk04 = 0x04,
+            EmittedPrimitives = 0x10, // Not tested
+            Unk11 = 0x11,
+            Unk12 = 0x12,
+            Unk13 = 0x13,
+            Unk15 = 0x15,
+            Unk16 = 0x16,
+            Unk17 = 0x17,
+            Unk18 = 0x18,
+            Unk1A = 0x1A,
+            Unk1B = 0x1B,
+            Unk1C = 0x1C,
+            Unk1D = 0x1D,
+            Unk1E = 0x1E,
+            GeneratedPrimitives = 0x1F,
+        };
+
         struct Cull {
             enum class FrontFace : u32 {
                 ClockWise = 0x0900,
@@ -857,7 +879,7 @@ public:
                     BitField<7, 1, u32> c7;
                 } clip_distance_enabled;
 
-                INSERT_UNION_PADDING_WORDS(0x1);
+                u32 samplecnt_enable;
 
                 float point_size;
 
@@ -865,7 +887,11 @@ public:
 
                 u32 point_sprite_enable;
 
-                INSERT_UNION_PADDING_WORDS(0x5);
+                INSERT_UNION_PADDING_WORDS(0x3);
+
+                CounterReset counter_reset;
+
+                INSERT_UNION_PADDING_WORDS(0x1);
 
                 u32 zeta_enable;
 
@@ -1412,12 +1438,15 @@ private:
     /// Handles a write to the QUERY_GET register.
     void ProcessQueryGet();
 
-    // Writes the query result accordingly
+    /// Writes the query result accordingly.
     void StampQueryResult(u64 payload, bool long_query);
 
-    // Handles Conditional Rendering
+    /// Handles conditional rendering.
     void ProcessQueryCondition();
 
+    /// Handles counter resets.
+    void ProcessCounterReset();
+
     /// Handles writes to syncing register.
     void ProcessSyncPoint();
 
@@ -1434,6 +1463,9 @@ private:
 
     // Handles a instance drawcall from MME
     void StepInstance(MMEDrawMode expected_mode, u32 count);
+
+    /// Returns a query's value or an empty object if the value will be deferred through a cache.
+    std::optional<u64> GetQueryResult();
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
@@ -1499,8 +1531,10 @@ ASSERT_REG_POSITION(screen_y_control, 0x4EB);
 ASSERT_REG_POSITION(vb_element_base, 0x50D);
 ASSERT_REG_POSITION(vb_base_instance, 0x50E);
 ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
+ASSERT_REG_POSITION(samplecnt_enable, 0x545);
 ASSERT_REG_POSITION(point_size, 0x546);
 ASSERT_REG_POSITION(point_sprite_enable, 0x548);
+ASSERT_REG_POSITION(counter_reset, 0x54C);
 ASSERT_REG_POSITION(zeta_enable, 0x54E);
 ASSERT_REG_POSITION(multisample_control, 0x54F);
 ASSERT_REG_POSITION(condition, 0x554);
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 11848fbce..f5d33f27a 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -9,6 +9,7 @@
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/vm_manager.h"
 #include "core/memory.h"
+#include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 
@@ -84,7 +85,9 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {
     const auto cpu_addr = GpuToCpuAddress(gpu_addr);
     ASSERT(cpu_addr);
 
-    rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size);
+    // Flush and invalidate through the GPU interface, to be asynchronous if possible.
+    system.GPU().FlushAndInvalidateRegion(cache_addr, aligned_size);
+
     UnmapRange(gpu_addr, aligned_size);
     ASSERT(system.CurrentProcess()
                ->VMManager()
@@ -242,6 +245,8 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s
         switch (page_table.attributes[page_index]) {
         case Common::PageType::Memory: {
             const u8* src_ptr{page_table.pointers[page_index] + page_offset};
+            // Flush must happen on the rasterizer interface, such that memory is always synchronous
+            // when it is read (even when in asynchronous GPU mode). Fixes Dead Cells title menu.
             rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
             std::memcpy(dest_buffer, src_ptr, copy_amount);
             break;
@@ -292,6 +297,8 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const
         switch (page_table.attributes[page_index]) {
         case Common::PageType::Memory: {
             u8* dest_ptr{page_table.pointers[page_index] + page_offset};
+            // Invalidate must happen on the rasterizer interface, such that memory is always
+            // synchronous when it is written (even when in asynchronous GPU mode).
             rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount);
             std::memcpy(dest_ptr, src_buffer, copy_amount);
             break;
@@ -339,6 +346,8 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::
 
         switch (page_table.attributes[page_index]) {
         case Common::PageType::Memory: {
+            // Flush must happen on the rasterizer interface, such that memory is always synchronous
+            // when it is copied (even when in asynchronous GPU mode).
             const u8* src_ptr{page_table.pointers[page_index] + page_offset};
             rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
             WriteBlock(dest_addr, src_ptr, copy_amount);
diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h
new file mode 100644
index 000000000..e66054ed0
--- /dev/null
+++ b/src/video_core/query_cache.h
@@ -0,0 +1,359 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <iterator>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <unordered_map>
+#include <vector>
+
+#include "common/assert.h"
+#include "core/core.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+template <class QueryCache, class HostCounter>
+class CounterStreamBase {
+public:
+    explicit CounterStreamBase(QueryCache& cache, VideoCore::QueryType type)
+        : cache{cache}, type{type} {}
+
+    /// Updates the state of the stream, enabling or disabling as needed.
+    void Update(bool enabled) {
+        if (enabled) {
+            Enable();
+        } else {
+            Disable();
+        }
+    }
+
+    /// Resets the stream to zero. It doesn't disable the query after resetting.
+    void Reset() {
+        if (current) {
+            current->EndQuery();
+
+            // Immediately start a new query to avoid disabling its state.
+            current = cache.Counter(nullptr, type);
+        }
+        last = nullptr;
+    }
+
+    /// Returns the current counter slicing as needed.
+    std::shared_ptr<HostCounter> Current() {
+        if (!current) {
+            return nullptr;
+        }
+        current->EndQuery();
+        last = std::move(current);
+        current = cache.Counter(last, type);
+        return last;
+    }
+
+    /// Returns true when the counter stream is enabled.
+    bool IsEnabled() const {
+        return current != nullptr;
+    }
+
+private:
+    /// Enables the stream.
+    void Enable() {
+        if (current) {
+            return;
+        }
+        current = cache.Counter(last, type);
+    }
+
+    /// Disables the stream, ending the active query and keeping it as the last counter.
+    void Disable() {
+        if (current) {
+            current->EndQuery();
+        }
+        last = std::exchange(current, nullptr);
+    }
+
+    QueryCache& cache;
+    const VideoCore::QueryType type;
+
+    std::shared_ptr<HostCounter> current;
+    std::shared_ptr<HostCounter> last;
+};
+
+template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter,
+          class QueryPool>
+class QueryCacheBase {
+public:
+    explicit QueryCacheBase(Core::System& system, VideoCore::RasterizerInterface& rasterizer)
+        : system{system}, rasterizer{rasterizer}, streams{{CounterStream{
+                                                      static_cast<QueryCache&>(*this),
+                                                      VideoCore::QueryType::SamplesPassed}}} {}
+
+    void InvalidateRegion(CacheAddr addr, std::size_t size) {
+        std::unique_lock lock{mutex};
+        FlushAndRemoveRegion(addr, size);
+    }
+
+    void FlushRegion(CacheAddr addr, std::size_t size) {
+        std::unique_lock lock{mutex};
+        FlushAndRemoveRegion(addr, size);
+    }
+
+    /**
+     * Records a query in GPU mapped memory, potentially marked with a timestamp.
+     * @param gpu_addr  GPU address to flush to when the mapped memory is read.
+     * @param type      Query type, e.g. SamplesPassed.
+     * @param timestamp Timestamp, when empty the flushed query is assumed to be short.
+     */
+    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) {
+        std::unique_lock lock{mutex};
+        auto& memory_manager = system.GPU().MemoryManager();
+        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
+
+        CachedQuery* query = TryGet(ToCacheAddr(host_ptr));
+        if (!query) {
+            const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
+            ASSERT_OR_EXECUTE(cpu_addr, return;);
+
+            query = Register(type, *cpu_addr, host_ptr, timestamp.has_value());
+        }
+
+        query->BindCounter(Stream(type).Current(), timestamp);
+    }
+
+    /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch.
+    void UpdateCounters() {
+        std::unique_lock lock{mutex};
+        const auto& regs = system.GPU().Maxwell3D().regs;
+        Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable);
+    }
+
+    /// Resets a counter to zero. It doesn't disable the query after resetting.
+    void ResetCounter(VideoCore::QueryType type) {
+        std::unique_lock lock{mutex};
+        Stream(type).Reset();
+    }
+
+    /// Disable all active streams. Expected to be called at the end of a command buffer.
+    void DisableStreams() {
+        std::unique_lock lock{mutex};
+        for (auto& stream : streams) {
+            stream.Update(false);
+        }
+    }
+
+    /// Returns a new host counter.
+    std::shared_ptr<HostCounter> Counter(std::shared_ptr<HostCounter> dependency,
+                                         VideoCore::QueryType type) {
+        return std::make_shared<HostCounter>(static_cast<QueryCache&>(*this), std::move(dependency),
+                                             type);
+    }
+
+    /// Returns the counter stream of the specified type.
+    CounterStream& Stream(VideoCore::QueryType type) {
+        return streams[static_cast<std::size_t>(type)];
+    }
+
+    /// Returns the counter stream of the specified type.
+    const CounterStream& Stream(VideoCore::QueryType type) const {
+        return streams[static_cast<std::size_t>(type)];
+    }
+
+protected:
+    std::array<QueryPool, VideoCore::NumQueryTypes> query_pools;
+
+private:
+    /// Flushes a memory range to guest memory and removes it from the cache.
+    void FlushAndRemoveRegion(CacheAddr addr, std::size_t size) {
+        const u64 addr_begin = static_cast<u64>(addr);
+        const u64 addr_end = addr_begin + static_cast<u64>(size);
+        const auto in_range = [addr_begin, addr_end](CachedQuery& query) {
+            const u64 cache_begin = query.GetCacheAddr();
+            const u64 cache_end = cache_begin + query.SizeInBytes();
+            return cache_begin < addr_end && addr_begin < cache_end;
+        };
+
+        const u64 page_end = addr_end >> PAGE_SHIFT;
+        for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) {
+            const auto& it = cached_queries.find(page);
+            if (it == std::end(cached_queries)) {
+                continue;
+            }
+            auto& contents = it->second;
+            for (auto& query : contents) {
+                if (!in_range(query)) {
+                    continue;
+                }
+                rasterizer.UpdatePagesCachedCount(query.CpuAddr(), query.SizeInBytes(), -1);
+                query.Flush();
+            }
+            contents.erase(std::remove_if(std::begin(contents), std::end(contents), in_range),
+                           std::end(contents));
+        }
+    }
+
+    /// Registers the passed parameters as cached and returns a pointer to the stored cached query.
+    CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) {
+        rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1);
+        const u64 page = static_cast<u64>(ToCacheAddr(host_ptr)) >> PAGE_SHIFT;
+        return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr,
+                                                  host_ptr);
+    }
+
+    /// Tries to get a cached query. Returns nullptr on failure.
+    CachedQuery* TryGet(CacheAddr addr) {
+        const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT;
+        const auto it = cached_queries.find(page);
+        if (it == std::end(cached_queries)) {
+            return nullptr;
+        }
+        auto& contents = it->second;
+        const auto found =
+            std::find_if(std::begin(contents), std::end(contents),
+                         [addr](auto& query) { return query.GetCacheAddr() == addr; });
+        return found != std::end(contents) ? &*found : nullptr;
+    }
+
+    static constexpr std::uintptr_t PAGE_SIZE = 4096;
+    static constexpr unsigned PAGE_SHIFT = 12;
+
+    Core::System& system;
+    VideoCore::RasterizerInterface& rasterizer;
+
+    std::recursive_mutex mutex;
+
+    std::unordered_map<u64, std::vector<CachedQuery>> cached_queries;
+
+    std::array<CounterStream, VideoCore::NumQueryTypes> streams;
+};
+
+template <class QueryCache, class HostCounter>
+class HostCounterBase {
+public:
+    explicit HostCounterBase(std::shared_ptr<HostCounter> dependency_)
+        : dependency{std::move(dependency_)}, depth{dependency ? (dependency->Depth() + 1) : 0} {
+        // Avoid nesting too many dependencies to avoid a stack overflow when these are deleted.
+        constexpr u64 depth_threshold = 96;
+        if (depth > depth_threshold) {
+            depth = 0;
+            base_result = dependency->Query();
+            dependency = nullptr;
+        }
+    }
+    virtual ~HostCounterBase() = default;
+
+    /// Returns the current value of the query.
+    u64 Query() {
+        if (result) {
+            return *result;
+        }
+
+        u64 value = BlockingQuery() + base_result;
+        if (dependency) {
+            value += dependency->Query();
+            dependency = nullptr;
+        }
+
+        result = value;
+        return *result;
+    }
+
+    /// Returns true when flushing this query will potentially wait (no result is cached yet).
+    bool WaitPending() const noexcept {
+        return !result.has_value();
+    }
+
+    u64 Depth() const noexcept {
+        return depth;
+    }
+
+protected:
+    /// Returns the value of query from the backend API blocking as needed.
+    virtual u64 BlockingQuery() const = 0;
+
+private:
+    std::shared_ptr<HostCounter> dependency; ///< Counter to add to this value.
+    std::optional<u64> result;               ///< Filled with the already returned value.
+    u64 depth;                               ///< Number of nested dependencies.
+    u64 base_result = 0;                     ///< Equivalent to nested dependencies value.
+};
+
+template <class HostCounter>
+class CachedQueryBase {
+public:
+    explicit CachedQueryBase(VAddr cpu_addr, u8* host_ptr)
+        : cpu_addr{cpu_addr}, host_ptr{host_ptr} {}
+    virtual ~CachedQueryBase() = default;
+
+    CachedQueryBase(CachedQueryBase&&) noexcept = default;
+    CachedQueryBase(const CachedQueryBase&) = delete;
+
+    CachedQueryBase& operator=(CachedQueryBase&&) noexcept = default;
+    CachedQueryBase& operator=(const CachedQueryBase&) = delete;
+
+    /// Flushes the query to guest memory.
+    virtual void Flush() {
+        // When counter is nullptr no host counter is bound to this query. We are supposed to
+        // write a zero in these cases.
+        const u64 value = counter ? counter->Query() : 0;
+        std::memcpy(host_ptr, &value, sizeof(u64));
+
+        if (timestamp) {
+            std::memcpy(host_ptr + TIMESTAMP_OFFSET, &*timestamp, sizeof(u64));
+        }
+    }
+
+    /// Binds a counter to this query.
+    void BindCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) {
+        if (counter) {
+            // If there's an old counter set it means the query is being rewritten by the game.
+            // To avoid losing the data forever, flush here.
+            Flush();
+        }
+        counter = std::move(counter_);
+        timestamp = timestamp_;
+    }
+
+    VAddr CpuAddr() const noexcept {
+        return cpu_addr;
+    }
+
+    CacheAddr GetCacheAddr() const noexcept {
+        return ToCacheAddr(host_ptr);
+    }
+
+    u64 SizeInBytes() const noexcept {
+        return SizeInBytes(timestamp.has_value());
+    }
+
+    static constexpr u64 SizeInBytes(bool with_timestamp) noexcept {
+        return with_timestamp ? LARGE_QUERY_SIZE : SMALL_QUERY_SIZE;
+    }
+
+protected:
+    /// Returns true when querying the counter may potentially block.
+    bool WaitPending() const noexcept {
+        return counter && counter->WaitPending();
+    }
+
+private:
+    static constexpr std::size_t SMALL_QUERY_SIZE = 8;   // Query size without timestamp.
+    static constexpr std::size_t LARGE_QUERY_SIZE = 16;  // Query size with timestamp.
+    static constexpr std::intptr_t TIMESTAMP_OFFSET = 8; // Timestamp offset in a large query.
+
+    VAddr cpu_addr;                       ///< Guest CPU address.
+    u8* host_ptr;                         ///< Writable host pointer.
+    std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree.
+    std::optional<u64> timestamp;         ///< Timestamp to flush to guest memory.
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index c586cd6fe..f18eaf4bc 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -6,6 +6,7 @@
 
 #include <atomic>
 #include <functional>
+#include <optional>
 #include "common/common_types.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/gpu.h"
@@ -17,6 +18,11 @@ class MemoryManager;
 
 namespace VideoCore {
 
+enum class QueryType {
+    SamplesPassed,
+};
+constexpr std::size_t NumQueryTypes = 1;
+
 enum class LoadCallbackStage {
     Prepare,
     Decompile,
@@ -29,11 +35,8 @@ class RasterizerInterface {
 public:
     virtual ~RasterizerInterface() {}
 
-    /// Draw the current batch of vertex arrays
-    virtual bool DrawBatch(bool is_indexed) = 0;
-
-    /// Draw the current batch of multiple instances of vertex arrays
-    virtual bool DrawMultiBatch(bool is_indexed) = 0;
+    /// Dispatches a draw invocation
+    virtual void Draw(bool is_indexed, bool is_instanced) = 0;
 
     /// Clear the current framebuffer
     virtual void Clear() = 0;
@@ -41,6 +44,12 @@ public:
     /// Dispatches a compute shader invocation
     virtual void DispatchCompute(GPUVAddr code_addr) = 0;
 
+    /// Resets the counter of a query
+    virtual void ResetCounter(QueryType type) = 0;
+
+    /// Records a GPU query and caches it
+    virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0;
+
     /// Notify rasterizer that all caches should be flushed to Switch memory
     virtual void FlushAll() = 0;
 
diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp
new file mode 100644
index 000000000..f12e9f55f
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_query_cache.cpp
@@ -0,0 +1,120 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <glad/glad.h>
+
+#include "common/assert.h"
+#include "core/core.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_query_cache.h"
+#include "video_core/renderer_opengl/gl_rasterizer.h"
+
+namespace OpenGL {
+
+namespace {
+
+constexpr std::array<GLenum, VideoCore::NumQueryTypes> QueryTargets = {GL_SAMPLES_PASSED};
+
+constexpr GLenum GetTarget(VideoCore::QueryType type) {
+    return QueryTargets[static_cast<std::size_t>(type)];
+}
+
+} // Anonymous namespace
+
+QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer)
+    : VideoCommon::QueryCacheBase<
+          QueryCache, CachedQuery, CounterStream, HostCounter,
+          std::vector<OGLQuery>>{system,
+                                 static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)},
+      gl_rasterizer{gl_rasterizer} {}
+
+QueryCache::~QueryCache() = default;
+
+OGLQuery QueryCache::AllocateQuery(VideoCore::QueryType type) {
+    auto& reserve = query_pools[static_cast<std::size_t>(type)];
+    OGLQuery query;
+    if (reserve.empty()) {
+        query.Create(GetTarget(type));
+        return query;
+    }
+
+    query = std::move(reserve.back());
+    reserve.pop_back();
+    return query;
+}
+
+void QueryCache::Reserve(VideoCore::QueryType type, OGLQuery&& query) {
+    query_pools[static_cast<std::size_t>(type)].push_back(std::move(query));
+}
+
+bool QueryCache::AnyCommandQueued() const noexcept {
+    return gl_rasterizer.AnyCommandQueued();
+}
+
+HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type)
+    : VideoCommon::HostCounterBase<QueryCache, HostCounter>{std::move(dependency)}, cache{cache},
+      type{type}, query{cache.AllocateQuery(type)} {
+    glBeginQuery(GetTarget(type), query.handle);
+}
+
+HostCounter::~HostCounter() {
+    cache.Reserve(type, std::move(query));
+}
+
+void HostCounter::EndQuery() {
+    if (!cache.AnyCommandQueued()) {
+        // A query may be waited on without any command (glDraw, glClear, glDispatch) having been
+        // queued, which causes a lock. glFlush is considered a command, so insert one into the
+        // OpenGL command stream before ending the query to make waiting on it safe.
+        glFlush();
+    }
+    glEndQuery(GetTarget(type));
+}
+
+u64 HostCounter::BlockingQuery() const {
+    GLint64 value;
+    glGetQueryObjecti64v(query.handle, GL_QUERY_RESULT, &value);
+    return static_cast<u64>(value);
+}
+
+CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr)
+    : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {}
+
+CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept
+    : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {}
+
+CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept {
+    VideoCommon::CachedQueryBase<HostCounter>::operator=(std::move(rhs));
+    cache = rhs.cache;
+    type = rhs.type;
+    return *this;
+}
+
+void CachedQuery::Flush() {
+    // Waiting for a query while another query of the same target is enabled locks Nvidia's driver.
+    // To avoid this disable and re-enable keeping the dependency stream.
+    // But we only have to do this if we have pending waits to be done.
+    auto& stream = cache->Stream(type);
+    const bool slice_counter = WaitPending() && stream.IsEnabled();
+    if (slice_counter) {
+        stream.Update(false);
+    }
+
+    VideoCommon::CachedQueryBase<HostCounter>::Flush();
+
+    if (slice_counter) {
+        stream.Update(true);
+    }
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h
new file mode 100644
index 000000000..d8e7052a1
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_query_cache.h
@@ -0,0 +1,78 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <memory>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/query_cache.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+
+namespace Core {
+class System;
+}
+
+namespace OpenGL {
+
+class CachedQuery;
+class HostCounter;
+class QueryCache;
+class RasterizerOpenGL;
+
+using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>;
+
+class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream,
+                                                            HostCounter, std::vector<OGLQuery>> {
+public:
+    explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer);
+    ~QueryCache();
+    /// Provides an OpenGL query object for the given query type.
+    OGLQuery AllocateQuery(VideoCore::QueryType type);
+    /// Takes a query object handed back to the cache (presumably by HostCounter's destructor).
+    void Reserve(VideoCore::QueryType type, OGLQuery&& query);
+    /// Consulted by HostCounter::EndQuery to decide whether a glFlush has to be issued first.
+    bool AnyCommandQueued() const noexcept;
+
+private:
+    RasterizerOpenGL& gl_rasterizer;
+};
+
+class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> {
+public:
+    explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type);
+    ~HostCounter();
+    /// Ends the query of this counter's type; issues a glFlush first when no command is queued.
+    void EndQuery();
+
+private:
+    u64 BlockingQuery() const override; ///< Reads GL_QUERY_RESULT of the owned query object.
+
+    QueryCache& cache;               ///< Cache asked whether any command has been queued.
+    const VideoCore::QueryType type; ///< Selects the OpenGL query target.
+    OGLQuery query;                  ///< Query object whose result is read.
+};
+
+class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> {
+public:
+    explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr,
+                         u8* host_ptr);
+    CachedQuery(CachedQuery&& rhs) noexcept;
+    CachedQuery(const CachedQuery&) = delete;
+
+    CachedQuery& operator=(CachedQuery&& rhs) noexcept;
+    CachedQuery& operator=(const CachedQuery&) = delete;
+    /// Flushes via the base class, pausing the type's counter stream while waits are pending.
+    void Flush() override;
+
+private:
+    QueryCache* cache;         ///< Provides the counter stream; reassigned on move.
+    VideoCore::QueryType type; ///< Query type whose stream Flush operates on.
+};
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index b0eb14c8b..e1965fb21 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -25,6 +25,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_query_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"
@@ -92,8 +93,8 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
                                    ScreenInfo& info)
     : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device},
-      shader_cache{*this, system, emu_window, device}, system{system}, screen_info{info},
-      buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
+      shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system},
+      screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
     shader_program_manager = std::make_unique<GLShader::ProgramManager>();
     state.draw.shader_program = 0;
     state.Apply();
@@ -541,11 +542,16 @@ void RasterizerOpenGL::Clear() {
     } else if (use_stencil) {
         glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
     }
+
+    ++num_queued_commands;
 }
 
 void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     MICROPROFILE_SCOPE(OpenGL_Drawing);
     auto& gpu = system.GPU().Maxwell3D();
+    const auto& regs = gpu.regs;
+
+    query_cache.UpdateCounters();
 
     SyncRasterizeEnable(state);
     SyncColorMask();
@@ -611,7 +617,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
 
     // Setup shaders and their used resources.
     texture_cache.GuardSamplers(true);
-    const auto primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology);
+    const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology);
     SetupShaders(primitive_mode);
     texture_cache.GuardSamplers(false);
 
@@ -638,35 +644,47 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
         glTextureBarrier();
     }
 
+    ++num_queued_commands;
+
     const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance);
     const GLsizei num_instances =
         static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1);
     if (is_indexed) {
-        const GLenum index_format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format);
         const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base);
         const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count);
-        glDrawElementsInstancedBaseVertexBaseInstance(
-            primitive_mode, num_vertices, index_format,
-            reinterpret_cast<const void*>(index_buffer_offset), num_instances, base_vertex,
-            base_instance);
+        const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset);
+        const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format);
+        if (num_instances == 1 && base_instance == 0 && base_vertex == 0) {
+            glDrawElements(primitive_mode, num_vertices, format, offset);
+        } else if (num_instances == 1 && base_instance == 0) {
+            glDrawElementsBaseVertex(primitive_mode, num_vertices, format, offset, base_vertex);
+        } else if (base_vertex == 0 && base_instance == 0) {
+            glDrawElementsInstanced(primitive_mode, num_vertices, format, offset, num_instances);
+        } else if (base_vertex == 0) {
+            glDrawElementsInstancedBaseInstance(primitive_mode, num_vertices, format, offset,
+                                                num_instances, base_instance);
+        } else if (base_instance == 0) {
+            glDrawElementsInstancedBaseVertex(primitive_mode, num_vertices, format, offset,
+                                              num_instances, base_vertex);
+        } else {
+            glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, num_vertices, format,
+                                                          offset, num_instances, base_vertex,
+                                                          base_instance);
+        }
     } else {
         const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first);
         const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count);
-        glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices, num_instances,
-                                          base_instance);
+        if (num_instances == 1 && base_instance == 0) {
+            glDrawArrays(primitive_mode, base_vertex, num_vertices);
+        } else if (base_instance == 0) {
+            glDrawArraysInstanced(primitive_mode, base_vertex, num_vertices, num_instances);
+        } else {
+            glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices,
+                                              num_instances, base_instance);
+        }
     }
 }
 
-bool RasterizerOpenGL::DrawBatch(bool is_indexed) {
-    Draw(is_indexed, false);
-    return true;
-}
-
-bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) {
-    Draw(is_indexed, true);
-    return true;
-}
-
 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
     if (device.HasBrokenCompute()) {
         return;
@@ -707,6 +725,16 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
     state.ApplyProgramPipeline();
 
     glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
+    ++num_queued_commands;
+}
+
+void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) {
+    query_cache.ResetCounter(type);
+}
+
+void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
+                             std::optional<u64> timestamp) {
+    query_cache.Query(gpu_addr, type, timestamp);
 }
 
 void RasterizerOpenGL::FlushAll() {}
@@ -718,6 +746,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
     }
     texture_cache.FlushRegion(addr, size);
     buffer_cache.FlushRegion(addr, size);
+    query_cache.FlushRegion(addr, size);
 }
 
 void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
@@ -728,6 +757,7 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
     texture_cache.InvalidateRegion(addr, size);
     shader_cache.InvalidateRegion(addr, size);
     buffer_cache.InvalidateRegion(addr, size);
+    query_cache.InvalidateRegion(addr, size);
 }
 
 void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
@@ -738,10 +768,18 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
 }
 
 void RasterizerOpenGL::FlushCommands() {
+    // Only flush when we have commands queued to OpenGL.
+    if (num_queued_commands == 0) {
+        return;
+    }
+    num_queued_commands = 0;
     glFlush();
 }
 
 void RasterizerOpenGL::TickFrame() {
+    // Ticking a frame means that buffers will be swapped, calling glFlush implicitly.
+    num_queued_commands = 0;
+
     buffer_cache.TickFrame();
 }
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 0501f3828..68abe9a21 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -24,6 +24,7 @@
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_framebuffer_cache.h"
+#include "video_core/renderer_opengl/gl_query_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_sampler_cache.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
@@ -57,10 +58,11 @@ public:
                               ScreenInfo& info);
     ~RasterizerOpenGL() override;
 
-    bool DrawBatch(bool is_indexed) override;
-    bool DrawMultiBatch(bool is_indexed) override;
+    void Draw(bool is_indexed, bool is_instanced) override;
     void Clear() override;
     void DispatchCompute(GPUVAddr code_addr) override;
+    void ResetCounter(VideoCore::QueryType type) override;
+    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
     void FlushAll() override;
     void FlushRegion(CacheAddr addr, u64 size) override;
     void InvalidateRegion(CacheAddr addr, u64 size) override;
@@ -75,6 +77,11 @@ public:
     void LoadDiskResources(const std::atomic_bool& stop_loading,
                            const VideoCore::DiskResourceLoadCallback& callback) override;
 
+    /// Tells whether any command has been queued to the OpenGL server since the last flush.
+    bool AnyCommandQueued() const {
+        return num_queued_commands != 0;
+    }
+
 private:
     /// Configures the color and depth framebuffer states.
     void ConfigureFramebuffers();
@@ -102,9 +109,6 @@ private:
     void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
                            std::size_t size);
 
-    /// Syncs all the state, shaders, render targets and textures setting before a draw call.
-    void Draw(bool is_indexed, bool is_instanced);
-
     /// Configures the current textures to use for the draw command.
     void SetupDrawTextures(std::size_t stage_index, const Shader& shader);
 
@@ -180,10 +184,23 @@ private:
     /// Syncs the alpha test state to match the guest state
     void SyncAlphaTest();
 
-    /// Check for extension that are not strictly required
-    /// but are needed for correct emulation
+    /// Check for extension that are not strictly required but are needed for correct emulation
     void CheckExtensions();
 
+    std::size_t CalculateVertexArraysSize() const;
+
+    std::size_t CalculateIndexBufferSize() const;
+
+    /// Updates and returns a vertex array object representing current vertex format
+    GLuint SetupVertexFormat();
+
+    void SetupVertexBuffer(GLuint vao);
+    void SetupVertexInstances(GLuint vao);
+
+    GLintptr SetupIndexBuffer();
+
+    void SetupShaders(GLenum primitive_mode);
+
     const Device device;
     OpenGLState state;
 
@@ -191,6 +208,7 @@ private:
     ShaderCacheOpenGL shader_cache;
     SamplerCacheOpenGL sampler_cache;
     FramebufferCacheOpenGL framebuffer_cache;
+    QueryCache query_cache;
 
     Core::System& system;
     ScreenInfo& screen_info;
@@ -208,19 +226,8 @@ private:
     BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
     BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
 
-    std::size_t CalculateVertexArraysSize() const;
-
-    std::size_t CalculateIndexBufferSize() const;
-
-    /// Updates and returns a vertex array object representing current vertex format
-    GLuint SetupVertexFormat();
-
-    void SetupVertexBuffer(GLuint vao);
-    void SetupVertexInstances(GLuint vao);
-
-    GLintptr SetupIndexBuffer();
-
-    void SetupShaders(GLenum primitive_mode);
+    /// Number of commands queued to the OpenGL driver. Reset on flush.
+    std::size_t num_queued_commands = 0;
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
index 5c96c1d46..f0ddfb276 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -207,4 +207,21 @@ void OGLFramebuffer::Release() {
     handle = 0;
 }
 
+void OGLQuery::Create(GLenum target) {
+    // An already owned query object is kept as is.
+    if (handle == 0) {
+        MICROPROFILE_SCOPE(OpenGL_ResourceCreation);
+        glCreateQueries(target, 1, &handle);
+    }
+}
+
+void OGLQuery::Release() {
+    // Only an owned query object has to be deleted; the handle is zeroed to mark it as gone.
+    if (handle != 0) {
+        MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
+        glDeleteQueries(1, &handle);
+        handle = 0;
+    }
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index 3a85a1d4c..514d1d165 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -266,4 +266,29 @@ public:
     GLuint handle = 0;
 };
 
+class OGLQuery : private NonCopyable {
+public:
+    OGLQuery() = default;
+    /// Takes over the handle of another query, which is left without one.
+    OGLQuery(OGLQuery&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+    /// Deletes the owned OpenGL query, if any.
+    ~OGLQuery() {
+        Release();
+    }
+    /// Releases the currently owned query and takes over the handle of the other one.
+    OGLQuery& operator=(OGLQuery&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
+        return *this;
+    }
+
+    /// Creates an OpenGL query for the target and stores its handle; no-op if one is owned.
+    void Create(GLenum target);
+
+    /// Deletes the internal OpenGL resource, if any, and zeroes the handle.
+    void Release();
+
+    GLuint handle = 0; ///< OpenGL name of the query; zero while no query is owned.
+};
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 331808113..5403c3ab7 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -164,7 +164,7 @@ struct FormatTuple {
     {vk::Format::eUndefined, {}},                                // ASTC_2D_5X4
     {vk::Format::eUndefined, {}},                                // BGRA8_SRGB
     {vk::Format::eBc1RgbaSrgbBlock, {}},                         // DXT1_SRGB
-    {vk::Format::eUndefined, {}},                                // DXT23_SRGB
+    {vk::Format::eBc2SrgbBlock, {}},                             // DXT23_SRGB
     {vk::Format::eBc3SrgbBlock, {}},                             // DXT45_SRGB
     {vk::Format::eBc7SrgbBlock, {}},                             // BC7U_SRGB
     {vk::Format::eR4G4B4A4UnormPack16, Attachable},              // R4G4B4A4U
@@ -363,6 +363,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr
             return vk::Format::eR8G8B8A8Uint;
         case Maxwell::VertexAttribute::Size::Size_32:
             return vk::Format::eR32Uint;
+        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
+            return vk::Format::eR32G32B32A32Uint;
         default:
             break;
         }
diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp
index 9840f26e5..d1da4f9d3 100644
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -104,8 +104,11 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
     features.depthBiasClamp = true;
     features.geometryShader = true;
     features.tessellationShader = true;
+    features.occlusionQueryPrecise = true;
     features.fragmentStoresAndAtomics = true;
     features.shaderImageGatherExtended = true;
+    features.shaderStorageImageReadWithoutFormat =
+        is_shader_storage_img_read_without_format_supported;
     features.shaderStorageImageWriteWithoutFormat = true;
     features.textureCompressionASTC_LDR = is_optimal_astc_supported;
 
@@ -117,6 +120,10 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
     bit8_storage.uniformAndStorageBuffer8BitAccess = true;
     SetNext(next, bit8_storage);
 
+    vk::PhysicalDeviceHostQueryResetFeaturesEXT host_query_reset;
+    host_query_reset.hostQueryReset = true;
+    SetNext(next, host_query_reset);
+
     vk::PhysicalDeviceFloat16Int8FeaturesKHR float16_int8;
     if (is_float16_supported) {
         float16_int8.shaderFloat16 = true;
@@ -273,6 +280,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev
         VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME,
         VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME,
         VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME,
+        VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME,
     };
     std::bitset<required_extensions.size()> available_extensions{};
 
@@ -340,6 +348,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev
         std::make_pair(features.depthBiasClamp, "depthBiasClamp"),
         std::make_pair(features.geometryShader, "geometryShader"),
         std::make_pair(features.tessellationShader, "tessellationShader"),
+        std::make_pair(features.occlusionQueryPrecise, "occlusionQueryPrecise"),
         std::make_pair(features.fragmentStoresAndAtomics, "fragmentStoresAndAtomics"),
         std::make_pair(features.shaderImageGatherExtended, "shaderImageGatherExtended"),
         std::make_pair(features.shaderStorageImageWriteWithoutFormat,
@@ -376,7 +385,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
         }
     };
 
-    extensions.reserve(13);
+    extensions.reserve(14);
     extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
     extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME);
     extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME);
@@ -384,6 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
     extensions.push_back(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME);
     extensions.push_back(VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME);
     extensions.push_back(VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME);
+    extensions.push_back(VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME);
 
     [[maybe_unused]] const bool nsight =
         std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
@@ -457,6 +467,8 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK
 
 void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) {
     const auto supported_features{physical.getFeatures(dldi)};
+    is_shader_storage_img_read_without_format_supported =
+        supported_features.shaderStorageImageReadWithoutFormat;
     is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi);
 }
 
@@ -530,6 +542,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti
                                         vk::Format::eBc6HUfloatBlock,
                                         vk::Format::eBc6HSfloatBlock,
                                         vk::Format::eBc1RgbaSrgbBlock,
+                                        vk::Format::eBc2SrgbBlock,
                                         vk::Format::eBc3SrgbBlock,
                                         vk::Format::eBc7SrgbBlock,
                                         vk::Format::eAstc4x4SrgbBlock,
diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h
index 72603f9f6..2c27ad730 100644
--- a/src/video_core/renderer_vulkan/vk_device.h
+++ b/src/video_core/renderer_vulkan/vk_device.h
@@ -122,6 +122,11 @@ public:
         return properties.limits.maxPushConstantsSize;
     }
 
+    /// Returns true if shader storage image read without format is supported.
+    bool IsShaderStorageImageReadWithoutFormatSupported() const {
+        return is_shader_storage_img_read_without_format_supported;
+    }
+
     /// Returns true if ASTC is natively supported.
     bool IsOptimalAstcSupported() const {
         return is_optimal_astc_supported;
@@ -227,6 +232,8 @@ private:
     bool ext_depth_range_unrestricted{};       ///< Support for VK_EXT_depth_range_unrestricted.
     bool ext_shader_viewport_index_layer{};    ///< Support for VK_EXT_shader_viewport_index_layer.
     bool nv_device_diagnostic_checkpoints{};   ///< Support for VK_NV_device_diagnostic_checkpoints.
+    bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage
+                                                                ///< image read without format
 
     // Telemetry parameters
     std::string vendor_name;                      ///< Device's driver name.
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp
new file mode 100644
index 000000000..ffbf60dda
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp
@@ -0,0 +1,122 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "video_core/renderer_vulkan/declarations.h"
+#include "video_core/renderer_vulkan/vk_device.h"
+#include "video_core/renderer_vulkan/vk_query_cache.h"
+#include "video_core/renderer_vulkan/vk_resource_manager.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+
+namespace Vulkan {
+
+namespace {
+
+constexpr std::array QUERY_TARGETS = {vk::QueryType::eOcclusion};
+
+constexpr vk::QueryType GetTarget(VideoCore::QueryType type) {
+    return QUERY_TARGETS[static_cast<std::size_t>(type)];
+}
+
+} // Anonymous namespace
+
+QueryPool::QueryPool() : VKFencedPool{GROW_STEP} {}
+
+QueryPool::~QueryPool() = default;
+
+void QueryPool::Initialize(const VKDevice& device_, VideoCore::QueryType type_) {
+    device = &device_;
+    type = type_;
+}
+
+std::pair<vk::QueryPool, std::uint32_t> QueryPool::Commit(VKFence& fence) {
+    // Keep committing resources until one lands on a slot that is not marked as used.
+    std::size_t slot = CommitResource(fence);
+    while (usage[slot]) {
+        slot = CommitResource(fence);
+    }
+    usage[slot] = true;
+    return {*pools[slot / GROW_STEP], static_cast<std::uint32_t>(slot % GROW_STEP)};
+}
+
+void QueryPool::Allocate(std::size_t begin, std::size_t end) {
+    // Grow the usage flags to cover the new range, then back it with one more Vulkan query pool.
+    usage.resize(end);
+    const auto query_count = static_cast<u32>(end - begin);
+    const vk::QueryPoolCreateInfo create_info({}, GetTarget(type), query_count, {});
+    const auto& dld = device->GetDispatchLoader();
+    pools.push_back(device->GetLogical().createQueryPoolUnique(create_info, nullptr, dld));
+}
+
+void QueryPool::Reserve(std::pair<vk::QueryPool, std::uint32_t> query) {
+    // Locate the pool owning the query and mark its slot as no longer in use.
+    const vk::QueryPool handle = query.first;
+    const auto owner = std::find_if(pools.begin(), pools.end(),
+                                    [handle](auto& pool) { return handle == *pool; });
+    ASSERT(owner != pools.end());
+    const std::ptrdiff_t pool_index = std::distance(pools.begin(), owner);
+    usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false;
+}
+
+VKQueryCache::VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                           const VKDevice& device, VKScheduler& scheduler)
+    : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter,
+                                  QueryPool>{system, rasterizer},
+      device{device}, scheduler{scheduler} {
+    for (std::size_t i = 0; i < static_cast<std::size_t>(VideoCore::NumQueryTypes); ++i) {
+        query_pools[i].Initialize(device, static_cast<VideoCore::QueryType>(i));
+    }
+}
+
+VKQueryCache::~VKQueryCache() = default;
+
+std::pair<vk::QueryPool, std::uint32_t> VKQueryCache::AllocateQuery(VideoCore::QueryType type) {
+    return query_pools[static_cast<std::size_t>(type)].Commit(scheduler.GetFence());
+}
+
+void VKQueryCache::Reserve(VideoCore::QueryType type,
+                           std::pair<vk::QueryPool, std::uint32_t> query) {
+    query_pools[static_cast<std::size_t>(type)].Reserve(query);
+}
+
+HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type)
+    : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache},
+      type{type}, query{cache.AllocateQuery(type)}, ticks{cache.Scheduler().Ticks()} {
+    const auto dev = cache.Device().GetLogical();
+    cache.Scheduler().Record([dev, query = query](vk::CommandBuffer cmdbuf, auto& dld) {
+        dev.resetQueryPoolEXT(query.first, query.second, 1, dld);
+        cmdbuf.beginQuery(query.first, query.second, vk::QueryControlFlagBits::ePrecise, dld);
+    });
+}
+
+HostCounter::~HostCounter() {
+    cache.Reserve(type, query);
+}
+
+void HostCounter::EndQuery() {
+    cache.Scheduler().Record([query = query](auto cmdbuf, auto& dld) {
+        cmdbuf.endQuery(query.first, query.second, dld);
+    });
+}
+
+u64 HostCounter::BlockingQuery() const {
+    if (ticks >= cache.Scheduler().Ticks()) {
+        cache.Scheduler().Flush(); // Presumably nothing was submitted since construction.
+    }
+    // Wait for the 64-bit result. Start from zero so a failed call cannot return garbage.
+    const auto dev = cache.Device().GetLogical();
+    const auto& dld = cache.Device().GetDispatchLoader();
+    u64 value = 0;
+    dev.getQueryPoolResults(query.first, query.second, 1, sizeof(value), &value, sizeof(value),
+                            vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait, dld);
+    return value;
+}
+
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h
new file mode 100644
index 000000000..c3092ee96
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_query_cache.h
@@ -0,0 +1,104 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/query_cache.h"
+#include "video_core/renderer_vulkan/declarations.h"
+#include "video_core/renderer_vulkan/vk_resource_manager.h"
+
+namespace VideoCore {
+class RasterizerInterface;
+}
+
+namespace Vulkan {
+
+class CachedQuery;
+class HostCounter;
+class VKDevice;
+class VKQueryCache;
+class VKScheduler;
+
+using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>;
+
+class QueryPool final : public VKFencedPool {
+public:
+    explicit QueryPool();
+    ~QueryPool() override;
+    /// Stores the device and query type that Allocate uses to create Vulkan query pools.
+    void Initialize(const VKDevice& device, VideoCore::QueryType type);
+    /// Commits resources until an unused slot comes up, marks it as used and returns it.
+    std::pair<vk::QueryPool, std::uint32_t> Commit(VKFence& fence);
+    /// Marks the slot of the passed query as unused again.
+    void Reserve(std::pair<vk::QueryPool, std::uint32_t> query);
+
+protected:
+    void Allocate(std::size_t begin, std::size_t end) override;
+
+private:
+    static constexpr std::size_t GROW_STEP = 512;
+
+    const VKDevice* device = nullptr;
+    VideoCore::QueryType type = {};
+
+    std::vector<UniqueQueryPool> pools; ///< One Vulkan query pool per Allocate call.
+    std::vector<bool> usage;            ///< Per-slot flag, true while a query occupies the slot.
+};
+
+class VKQueryCache final
+    : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter,
+                                         QueryPool> {
+public:
+    explicit VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                          const VKDevice& device, VKScheduler& scheduler);
+    ~VKQueryCache();
+    /// Commits a query of the given type using the fence obtained from the scheduler.
+    std::pair<vk::QueryPool, std::uint32_t> AllocateQuery(VideoCore::QueryType type);
+    /// Hands a query back to the pool of the given type.
+    void Reserve(VideoCore::QueryType type, std::pair<vk::QueryPool, std::uint32_t> query);
+    /// Returns the device this cache was constructed with.
+    const VKDevice& Device() const noexcept {
+        return device;
+    }
+    /// Returns the scheduler this cache was constructed with.
+    VKScheduler& Scheduler() const noexcept {
+        return scheduler;
+    }
+
+private:
+    const VKDevice& device;
+    VKScheduler& scheduler;
+};
+
+class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> {
+public:
+    explicit HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type);
+    ~HostCounter();
+    /// Records on the scheduler the command that ends this counter's query.
+    void EndQuery();
+
+private:
+    u64 BlockingQuery() const override; ///< Waits for the query and returns its 64-bit result.
+
+    VKQueryCache& cache;
+    const VideoCore::QueryType type;
+    const std::pair<vk::QueryPool, std::uint32_t> query; ///< Query pool and index within it.
+    const u64 ticks; ///< Scheduler ticks sampled when the counter was constructed.
+};
+
+class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> {
+public:
+    explicit CachedQuery(VKQueryCache&, VideoCore::QueryType, VAddr cpu_addr, u8* host_ptr)
+        : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr} {}
+};
+
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index aada38702..31c078f6a 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -289,25 +289,19 @@ RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWind
                     staging_pool),
       pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue),
       buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool),
-      sampler_cache(device) {}
-
-RasterizerVulkan::~RasterizerVulkan() = default;
-
-bool RasterizerVulkan::DrawBatch(bool is_indexed) {
-    Draw(is_indexed, false);
-    return true;
+      sampler_cache(device), query_cache(system, *this, device, scheduler) {
+    scheduler.SetQueryCache(query_cache);
 }
 
-bool RasterizerVulkan::DrawMultiBatch(bool is_indexed) {
-    Draw(is_indexed, true);
-    return true;
-}
+RasterizerVulkan::~RasterizerVulkan() = default;
 
 void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
     MICROPROFILE_SCOPE(Vulkan_Drawing);
 
     FlushWork();
 
+    query_cache.UpdateCounters();
+
     const auto& gpu = system.GPU().Maxwell3D();
     GraphicsPipelineCacheKey key{GetFixedPipelineState(gpu.regs)};
 
@@ -362,6 +356,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
 void RasterizerVulkan::Clear() {
     MICROPROFILE_SCOPE(Vulkan_Clearing);
 
+    query_cache.UpdateCounters();
+
     const auto& gpu = system.GPU().Maxwell3D();
     if (!system.GPU().Maxwell3D().ShouldExecute()) {
         return;
@@ -429,6 +425,8 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
     sampled_views.clear();
     image_views.clear();
 
+    query_cache.UpdateCounters();
+
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
     const ComputePipelineCacheKey key{
         code_addr,
@@ -471,17 +469,28 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
     });
 }
 
+void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) {
+    query_cache.ResetCounter(type);
+}
+
+void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
+                             std::optional<u64> timestamp) {
+    query_cache.Query(gpu_addr, type, timestamp);
+}
+
 void RasterizerVulkan::FlushAll() {}
 
 void RasterizerVulkan::FlushRegion(CacheAddr addr, u64 size) {
     texture_cache.FlushRegion(addr, size);
     buffer_cache.FlushRegion(addr, size);
+    query_cache.FlushRegion(addr, size);
 }
 
 void RasterizerVulkan::InvalidateRegion(CacheAddr addr, u64 size) {
     texture_cache.InvalidateRegion(addr, size);
     pipeline_cache.InvalidateRegion(addr, size);
     buffer_cache.InvalidateRegion(addr, size);
+    query_cache.InvalidateRegion(addr, size);
 }
 
 void RasterizerVulkan::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 7be71e734..138903d60 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -24,6 +24,7 @@
 #include "video_core/renderer_vulkan/vk_descriptor_pool.h"
 #include "video_core/renderer_vulkan/vk_memory_manager.h"
 #include "video_core/renderer_vulkan/vk_pipeline_cache.h"
+#include "video_core/renderer_vulkan/vk_query_cache.h"
 #include "video_core/renderer_vulkan/vk_renderpass_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
 #include "video_core/renderer_vulkan/vk_sampler_cache.h"
@@ -96,7 +97,7 @@ struct ImageView {
     vk::ImageLayout* layout = nullptr;
 };
 
-class RasterizerVulkan : public VideoCore::RasterizerAccelerated {
+class RasterizerVulkan final : public VideoCore::RasterizerAccelerated {
 public:
     explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window,
                               VKScreenInfo& screen_info, const VKDevice& device,
@@ -104,10 +105,11 @@ public:
                               VKScheduler& scheduler);
     ~RasterizerVulkan() override;
 
-    bool DrawBatch(bool is_indexed) override;
-    bool DrawMultiBatch(bool is_indexed) override;
+    void Draw(bool is_indexed, bool is_instanced) override;
     void Clear() override;
     void DispatchCompute(GPUVAddr code_addr) override;
+    void ResetCounter(VideoCore::QueryType type) override;
+    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
     void FlushAll() override;
     void FlushRegion(CacheAddr addr, u64 size) override;
     void InvalidateRegion(CacheAddr addr, u64 size) override;
@@ -140,8 +142,6 @@ private:
 
     static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8;
 
-    void Draw(bool is_indexed, bool is_instanced);
-
     void FlushWork();
 
     Texceptions UpdateAttachments();
@@ -247,6 +247,7 @@ private:
     VKPipelineCache pipeline_cache;
     VKBufferCache buffer_cache;
     VKSamplerCache sampler_cache;
+    VKQueryCache query_cache;
 
     std::array<View, Maxwell::NumRenderTargets> color_attachments;
     View zeta_attachment;
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index d66133ad1..92bd6c344 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -6,6 +6,7 @@
 #include "common/microprofile.h"
 #include "video_core/renderer_vulkan/declarations.h"
 #include "video_core/renderer_vulkan/vk_device.h"
+#include "video_core/renderer_vulkan/vk_query_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
 
@@ -139,6 +140,8 @@ void VKScheduler::SubmitExecution(vk::Semaphore semaphore) {
 }
 
 void VKScheduler::AllocateNewContext() {
+    ++ticks;
+
     std::unique_lock lock{mutex};
     current_fence = next_fence;
     next_fence = &resource_manager.CommitFence();
@@ -146,6 +149,10 @@ void VKScheduler::AllocateNewContext() {
     current_cmdbuf = resource_manager.CommitCommandBuffer(*current_fence);
     current_cmdbuf.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit},
                          device.GetDispatchLoader());
+    // Enable counters once again. These are disabled when a command buffer is finished.
+    if (query_cache) {
+        query_cache->UpdateCounters();
+    }
 }
 
 void VKScheduler::InvalidateState() {
@@ -159,6 +166,10 @@ void VKScheduler::InvalidateState() {
 }
 
 void VKScheduler::EndPendingOperations() {
+    // The query cache is only available once SetQueryCache has been called.
+    if (query_cache) {
+        query_cache->DisableStreams();
+    }
     EndRenderPass();
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index bcdffbba0..62fd7858b 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <atomic>
 #include <condition_variable>
 #include <memory>
 #include <optional>
@@ -18,6 +19,7 @@ namespace Vulkan {
 
 class VKDevice;
 class VKFence;
+class VKQueryCache;
 class VKResourceManager;
 
 class VKFenceView {
@@ -67,6 +69,11 @@ public:
     /// Binds a pipeline to the current execution context.
     void BindGraphicsPipeline(vk::Pipeline pipeline);
 
+    /// Assigns the query cache.
+    void SetQueryCache(VKQueryCache& query_cache_) {
+        query_cache = &query_cache_;
+    }
+
     /// Returns true when viewports have been set in the current command buffer.
     bool TouchViewports() {
         return std::exchange(state.viewports, true);
@@ -112,6 +119,11 @@ public:
         return current_fence;
     }
 
+    /// Returns the current command buffer tick.
+    u64 Ticks() const {
+        return ticks;
+    }
+
 private:
     class Command {
     public:
@@ -205,6 +217,8 @@ private:
 
     const VKDevice& device;
     VKResourceManager& resource_manager;
+    VKQueryCache* query_cache = nullptr;
+
     vk::CommandBuffer current_cmdbuf;
     VKFence* current_fence = nullptr;
     VKFence* next_fence = nullptr;
@@ -227,6 +241,7 @@ private:
     Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve;
     std::mutex mutex;
     std::condition_variable cv;
+    std::atomic<u64> ticks = 0;
     bool quit = false;
 };
 
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 24a658dce..6d0bf6aa1 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -275,12 +275,14 @@ public:
         AddCapability(spv::Capability::ImageGatherExtended);
         AddCapability(spv::Capability::SampledBuffer);
         AddCapability(spv::Capability::StorageImageWriteWithoutFormat);
+        AddCapability(spv::Capability::DrawParameters);
         AddCapability(spv::Capability::SubgroupBallotKHR);
         AddCapability(spv::Capability::SubgroupVoteKHR);
         AddExtension("SPV_KHR_shader_ballot");
         AddExtension("SPV_KHR_subgroup_vote");
         AddExtension("SPV_KHR_storage_buffer_storage_class");
         AddExtension("SPV_KHR_variable_pointers");
+        AddExtension("SPV_KHR_shader_draw_parameters");
 
         if (ir.UsesViewportIndex()) {
             AddCapability(spv::Capability::MultiViewport);
@@ -290,6 +292,10 @@ public:
             }
         }
 
+        if (device.IsShaderStorageImageReadWithoutFormatSupported()) {
+            AddCapability(spv::Capability::StorageImageReadWithoutFormat);
+        }
+
         if (device.IsFloat16Supported()) {
             AddCapability(spv::Capability::Float16);
         }
@@ -492,9 +498,11 @@ private:
         interfaces.push_back(AddGlobalVariable(Name(out_vertex, "out_vertex")));
 
         // Declare input attributes
-        vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_uint, "vertex_index");
+        vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_int, "vertex_index");
         instance_index =
-            DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_uint, "instance_index");
+            DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_int, "instance_index");
+        base_vertex = DeclareInputBuiltIn(spv::BuiltIn::BaseVertex, t_in_int, "base_vertex");
+        base_instance = DeclareInputBuiltIn(spv::BuiltIn::BaseInstance, t_in_int, "base_instance");
     }
 
     void DeclareTessControl() {
@@ -1068,9 +1076,12 @@ private:
                     return {OpLoad(t_float, AccessElement(t_in_float, tess_coord, element)),
                             Type::Float};
                 case 2:
-                    return {OpLoad(t_uint, instance_index), Type::Uint};
+                    return {
+                        OpISub(t_int, OpLoad(t_int, instance_index), OpLoad(t_int, base_instance)),
+                        Type::Int};
                 case 3:
-                    return {OpLoad(t_uint, vertex_index), Type::Uint};
+                    return {OpISub(t_int, OpLoad(t_int, vertex_index), OpLoad(t_int, base_vertex)),
+                            Type::Int};
                 }
                 UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
                 return {Constant(t_uint, 0U), Type::Uint};
@@ -1748,8 +1759,16 @@ private:
     }
 
     Expression ImageLoad(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
+        if (!device.IsShaderStorageImageReadWithoutFormatSupported()) {
+            return {v_float_zero, Type::Float};
+        }
+
+        const auto& meta{std::get<MetaImage>(operation.GetMeta())};
+
+        const Id coords = GetCoordinates(operation, Type::Int);
+        const Id texel = OpImageRead(t_uint4, GetImage(operation), coords);
+
+        return {OpCompositeExtract(t_uint, texel, meta.element), Type::Uint};
     }
 
     Expression ImageStore(Operation operation) {
@@ -2542,6 +2561,8 @@ private:
 
     Id instance_index{};
     Id vertex_index{};
+    Id base_instance{};
+    Id base_vertex{};
     std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
     Id frag_depth{};
     Id frag_coord{};
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index 0eeb75559..6ead42070 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -83,14 +83,14 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
 
         const bool input_signed = instr.conversion.is_input_signed;
 
-        if (instr.conversion.src_size == Register::Size::Byte) {
-            const u32 offset = static_cast<u32>(instr.conversion.int_src.selector) * 8;
-            if (offset > 0) {
-                value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed,
-                                        std::move(value), Immediate(offset));
+        if (const u32 offset = static_cast<u32>(instr.conversion.int_src.selector); offset > 0) {
+            ASSERT(instr.conversion.src_size == Register::Size::Byte ||
+                   instr.conversion.src_size == Register::Size::Short);
+            if (instr.conversion.src_size == Register::Size::Short) {
+                ASSERT(offset == 0 || offset == 2);
             }
-        } else {
-            UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
+            value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed,
+                                    std::move(value), Immediate(offset * 8));
         }
 
         value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed);
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 351c8c2f1..542636430 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -522,68 +522,53 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
                                Node array, Node depth_compare, u32 bias_offset,
                                std::vector<Node> aoffi,
                                std::optional<Tegra::Shader::Register> bindless_reg) {
-    const auto is_array = static_cast<bool>(array);
-    const auto is_shadow = static_cast<bool>(depth_compare);
+    const bool is_array = array != nullptr;
+    const bool is_shadow = depth_compare != nullptr;
     const bool is_bindless = bindless_reg.has_value();
 
-    UNIMPLEMENTED_IF_MSG((texture_type == TextureType::Texture3D && (is_array || is_shadow)) ||
-                             (texture_type == TextureType::TextureCube && is_array && is_shadow),
-                         "This method is not supported.");
+    UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow);
+    ASSERT_MSG(texture_type != TextureType::Texture3D || (!is_array && !is_shadow),
+               "Illegal texture type"); // 3D textures can be neither arrayed nor shadowed.
 
     const SamplerInfo info{texture_type, is_array, is_shadow, false};
-    Node index_var{};
+    Node index_var;
     const Sampler* sampler = is_bindless ? GetBindlessSampler(*bindless_reg, index_var, info)
                                          : GetSampler(instr.sampler, info);
-    Node4 values;
-    if (sampler == nullptr) {
-        for (u32 element = 0; element < values.size(); ++element) {
-            values[element] = Immediate(0);
-        }
-        return values;
+    if (!sampler) {
+        return {Immediate(0), Immediate(0), Immediate(0), Immediate(0)};
     }
 
     const bool lod_needed = process_mode == TextureProcessMode::LZ ||
                             process_mode == TextureProcessMode::LL ||
                             process_mode == TextureProcessMode::LLA;
-
-    // LOD selection (either via bias or explicit textureLod) not supported in GL for
-    // sampler2DArrayShadow and samplerCubeArrayShadow.
-    const bool gl_lod_supported =
-        !((texture_type == Tegra::Shader::TextureType::Texture2D && is_array && is_shadow) ||
-          (texture_type == Tegra::Shader::TextureType::TextureCube && is_array && is_shadow));
-
-    const OperationCode read_method =
-        (lod_needed && gl_lod_supported) ? OperationCode::TextureLod : OperationCode::Texture;
-
-    UNIMPLEMENTED_IF(process_mode != TextureProcessMode::None && !gl_lod_supported);
+    const OperationCode opcode = lod_needed ? OperationCode::TextureLod : OperationCode::Texture;
 
     Node bias;
     Node lod;
-    if (process_mode != TextureProcessMode::None && gl_lod_supported) {
-        switch (process_mode) {
-        case TextureProcessMode::LZ:
-            lod = Immediate(0.0f);
-            break;
-        case TextureProcessMode::LB:
-            // If present, lod or bias are always stored in the register
-            // indexed by the gpr20 field with an offset depending on the
-            // usage of the other registers
-            bias = GetRegister(instr.gpr20.Value() + bias_offset);
-            break;
-        case TextureProcessMode::LL:
-            lod = GetRegister(instr.gpr20.Value() + bias_offset);
-            break;
-        default:
-            UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode));
-            break;
-        }
+    switch (process_mode) {
+    case TextureProcessMode::None:
+        break;
+    case TextureProcessMode::LZ:
+        lod = Immediate(0.0f);
+        break;
+    case TextureProcessMode::LB:
+        // If present, lod or bias are always stored in the register indexed by the gpr20 field with
+        // an offset depending on the usage of the other registers.
+        bias = GetRegister(instr.gpr20.Value() + bias_offset);
+        break;
+    case TextureProcessMode::LL:
+        lod = GetRegister(instr.gpr20.Value() + bias_offset);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode));
+        break;
     }
 
+    Node4 values;
     for (u32 element = 0; element < values.size(); ++element) {
-        auto copy_coords = coords;
         MetaTexture meta{*sampler, array, depth_compare, aoffi,    {}, {}, bias,
                          lod,      {},    element,       index_var};
-        values[element] = Operation(read_method, meta, std::move(copy_coords));
+        values[element] = Operation(opcode, meta, coords);
     }
 
     return values;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index f4c015635..0d105d386 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -721,7 +721,6 @@ private:
     std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const CacheAddr cache_addr,
                                           const SurfaceParams& params, bool preserve_contents,
                                           bool is_render) {
-
         // Step 1
         // Check Level 1 Cache for a fast structural match. If candidate surface
         // matches at certain level we are pretty much done.
@@ -733,14 +732,18 @@ private:
                 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
                                       topological_result);
             }
+
             const auto struct_result = current_surface->MatchesStructure(params);
-            if (struct_result != MatchStructureResult::None &&
-                (params.target != SurfaceTarget::Texture3D ||
-                 current_surface->MatchTarget(params.target))) {
-                if (struct_result == MatchStructureResult::FullMatch) {
-                    return ManageStructuralMatch(current_surface, params, is_render);
-                } else {
-                    return RebuildSurface(current_surface, params, is_render);
+            if (struct_result != MatchStructureResult::None) {
+                const auto& old_params = current_surface->GetSurfaceParams();
+                const bool not_3d = params.target != SurfaceTarget::Texture3D &&
+                                    old_params.target != SurfaceTarget::Texture3D;
+                if (not_3d || current_surface->MatchTarget(params.target)) {
+                    if (struct_result == MatchStructureResult::FullMatch) {
+                        return ManageStructuralMatch(current_surface, params, is_render);
+                    } else {
+                        return RebuildSurface(current_surface, params, is_render);
+                    }
                 }
             }
         }