53 files changed, 1862 insertions, 415 deletions
diff --git a/src/audio_core/algorithm/interpolate.cpp b/src/audio_core/algorithm/interpolate.cpp
index 5005ba519..a58f24169 100644
--- a/src/audio_core/algorithm/interpolate.cpp
+++ b/src/audio_core/algorithm/interpolate.cpp
@@ -5,6 +5,7 @@
 #define _USE_MATH_DEFINES
 
 #include <algorithm>
+#include <climits>
 #include <cmath>
 #include <vector>
 #include "audio_core/algorithm/interpolate.h"
@@ -13,13 +14,131 @@
 
 namespace AudioCore {
 
-/// The Lanczos kernel
-static double Lanczos(std::size_t a, double x) {
-    if (x == 0.0)
-        return 1.0;
-    const double px = M_PI * x;
-    return a * std::sin(px) * std::sin(px / a) / (px * px);
-}
+constexpr std::array<s16, 512> curve_lut0 = {
+    6600,  19426, 6722,  3,     6479,  19424, 6845,  9,     6359,  19419, 6968,  15,    6239,
+    19412, 7093,  22,    6121,  19403, 7219,  28,    6004,  19391, 7345,  34,    5888,  19377,
+    7472,  41,    5773,  19361, 7600,  48,    5659,  19342, 7728,  55,    5546,  19321, 7857,
+    62,    5434,  19298, 7987,  69,    5323,  19273, 8118,  77,    5213,  19245, 8249,  84,
+    5104,  19215, 8381,  92,    4997,  19183, 8513,  101,   4890,  19148, 8646,  109,   4785,
+    19112, 8780,  118,   4681,  19073, 8914,  127,   4579,  19031, 9048,  137,   4477,  18988,
+    9183,  147,   4377,  18942, 9318,  157,   4277,  18895, 9454,  168,   4179,  18845, 9590,
+    179,   4083,  18793, 9726,  190,   3987,  18738, 9863,  202,   3893,  18682, 10000, 215,
+    3800,  18624, 10137, 228,   3709,  18563, 10274, 241,   3618,  18500, 10411, 255,   3529,
+    18436, 10549, 270,   3441,  18369, 10687, 285,   3355,  18300, 10824, 300,   3269,  18230,
+    10962, 317,   3186,  18157, 11100, 334,   3103,  18082, 11238, 351,   3022,  18006, 11375,
+    369,   2942,  17927, 11513, 388,   2863,  17847, 11650, 408,   2785,  17765, 11788, 428,
+    2709,  17681, 11925, 449,   2635,  17595, 12062, 471,   2561,  17507, 12198, 494,   2489,
+    17418, 12334, 517,   2418,  17327, 12470, 541,   2348,  17234, 12606, 566,   2280,  17140,
+    12741, 592,   2213,  17044, 12876, 619,   2147,  16946, 13010, 647,   2083,  16846, 13144,
+    675,   2020,  16745, 13277, 704,   1958,  16643, 13409, 735,   1897,  16539, 13541, 766,
+    1838,  16434, 13673, 798,   1780,  16327, 13803, 832,   1723,  16218, 13933, 866,   1667,
+    16109, 14062, 901,   1613,  15998, 14191, 937,   1560,  15885, 14318, 975,   1508,  15772,
+    14445, 1013,  1457,  15657, 14571, 1052,  1407,  15540, 14695, 1093,  1359,  15423, 14819,
+    1134,  1312,  15304, 14942, 1177,  1266,  15185, 15064, 1221,  1221,  15064, 15185, 1266,
+    1177,  14942, 15304, 1312,  1134,  14819, 15423, 1359,  1093,  14695, 15540, 1407,  1052,
+    14571, 15657, 1457,  1013,  14445, 15772, 1508,  975,   14318, 15885, 1560,  937,   14191,
+    15998, 1613,  901,   14062, 16109, 1667,  866,   13933, 16218, 1723,  832,   13803, 16327,
+    1780,  798,   13673, 16434, 1838,  766,   13541, 16539, 1897,  735,   13409, 16643, 1958,
+    704,   13277, 16745, 2020,  675,   13144, 16846, 2083,  647,   13010, 16946, 2147,  619,
+    12876, 17044, 2213,  592,   12741, 17140, 2280,  566,   12606, 17234, 2348,  541,   12470,
+    17327, 2418,  517,   12334, 17418, 2489,  494,   12198, 17507, 2561,  471,   12062, 17595,
+    2635,  449,   11925, 17681, 2709,  428,   11788, 17765, 2785,  408,   11650, 17847, 2863,
+    388,   11513, 17927, 2942,  369,   11375, 18006, 3022,  351,   11238, 18082, 3103,  334,
+    11100, 18157, 3186,  317,   10962, 18230, 3269,  300,   10824, 18300, 3355,  285,   10687,
+    18369, 3441,  270,   10549, 18436, 3529,  255,   10411, 18500, 3618,  241,   10274, 18563,
+    3709,  228,   10137, 18624, 3800,  215,   10000, 18682, 3893,  202,   9863,  18738, 3987,
+    190,   9726,  18793, 4083,  179,   9590,  18845, 4179,  168,   9454,  18895, 4277,  157,
+    9318,  18942, 4377,  147,   9183,  18988, 4477,  137,   9048,  19031, 4579,  127,   8914,
+    19073, 4681,  118,   8780,  19112, 4785,  109,   8646,  19148, 4890,  101,   8513,  19183,
+    4997,  92,    8381,  19215, 5104,  84,    8249,  19245, 5213,  77,    8118,  19273, 5323,
+    69,    7987,  19298, 5434,  62,    7857,  19321, 5546,  55,    7728,  19342, 5659,  48,
+    7600,  19361, 5773,  41,    7472,  19377, 5888,  34,    7345,  19391, 6004,  28,    7219,
+    19403, 6121,  22,    7093,  19412, 6239,  15,    6968,  19419, 6359,  9,     6845,  19424,
+    6479,  3,     6722,  19426, 6600};
+
+constexpr std::array<s16, 512> curve_lut1 = {
+    -68,   32639, 69,    -5,    -200,  32630, 212,   -15,   -328,  32613, 359,   -26,   -450,
+    32586, 512,   -36,   -568,  32551, 669,   -47,   -680,  32507, 832,   -58,   -788,  32454,
+    1000,  -69,   -891,  32393, 1174,  -80,   -990,  32323, 1352,  -92,   -1084, 32244, 1536,
+    -103,  -1173, 32157, 1724,  -115,  -1258, 32061, 1919,  -128,  -1338, 31956, 2118,  -140,
+    -1414, 31844, 2322,  -153,  -1486, 31723, 2532,  -167,  -1554, 31593, 2747,  -180,  -1617,
+    31456, 2967,  -194,  -1676, 31310, 3192,  -209,  -1732, 31157, 3422,  -224,  -1783, 30995,
+    3657,  -240,  -1830, 30826, 3897,  -256,  -1874, 30649, 4143,  -272,  -1914, 30464, 4393,
+    -289,  -1951, 30272, 4648,  -307,  -1984, 30072, 4908,  -325,  -2014, 29866, 5172,  -343,
+    -2040, 29652, 5442,  -362,  -2063, 29431, 5716,  -382,  -2083, 29203, 5994,  -403,  -2100,
+    28968, 6277,  -424,  -2114, 28727, 6565,  -445,  -2125, 28480, 6857,  -468,  -2133, 28226,
+    7153,  -490,  -2139, 27966, 7453,  -514,  -2142, 27700, 7758,  -538,  -2142, 27428, 8066,
+    -563,  -2141, 27151, 8378,  -588,  -2136, 26867, 8694,  -614,  -2130, 26579, 9013,  -641,
+    -2121, 26285, 9336,  -668,  -2111, 25987, 9663,  -696,  -2098, 25683, 9993,  -724,  -2084,
+    25375, 10326, -753,  -2067, 25063, 10662, -783,  -2049, 24746, 11000, -813,  -2030, 24425,
+    11342, -844,  -2009, 24100, 11686, -875,  -1986, 23771, 12033, -907,  -1962, 23438, 12382,
+    -939,  -1937, 23103, 12733, -972,  -1911, 22764, 13086, -1005, -1883, 22422, 13441, -1039,
+    -1855, 22077, 13798, -1072, -1825, 21729, 14156, -1107, -1795, 21380, 14516, -1141, -1764,
+    21027, 14877, -1176, -1732, 20673, 15239, -1211, -1700, 20317, 15602, -1246, -1667, 19959,
+    15965, -1282, -1633, 19600, 16329, -1317, -1599, 19239, 16694, -1353, -1564, 18878, 17058,
+    -1388, -1530, 18515, 17423, -1424, -1495, 18151, 17787, -1459, -1459, 17787, 18151, -1495,
+    -1424, 17423, 18515, -1530, -1388, 17058, 18878, -1564, -1353, 16694, 19239, -1599, -1317,
+    16329, 19600, -1633, -1282, 15965, 19959, -1667, -1246, 15602, 20317, -1700, -1211, 15239,
+    20673, -1732, -1176, 14877, 21027, -1764, -1141, 14516, 21380, -1795, -1107, 14156, 21729,
+    -1825, -1072, 13798, 22077, -1855, -1039, 13441, 22422, -1883, -1005, 13086, 22764, -1911,
+    -972,  12733, 23103, -1937, -939,  12382, 23438, -1962, -907,  12033, 23771, -1986, -875,
+    11686, 24100, -2009, -844,  11342, 24425, -2030, -813,  11000, 24746, -2049, -783,  10662,
+    25063, -2067, -753,  10326, 25375, -2084, -724,  9993,  25683, -2098, -696,  9663,  25987,
+    -2111, -668,  9336,  26285, -2121, -641,  9013,  26579, -2130, -614,  8694,  26867, -2136,
+    -588,  8378,  27151, -2141, -563,  8066,  27428, -2142, -538,  7758,  27700, -2142, -514,
+    7453,  27966, -2139, -490,  7153,  28226, -2133, -468,  6857,  28480, -2125, -445,  6565,
+    28727, -2114, -424,  6277,  28968, -2100, -403,  5994,  29203, -2083, -382,  5716,  29431,
+    -2063, -362,  5442,  29652, -2040, -343,  5172,  29866, -2014, -325,  4908,  30072, -1984,
+    -307,  4648,  30272, -1951, -289,  4393,  30464, -1914, -272,  4143,  30649, -1874, -256,
+    3897,  30826, -1830, -240,  3657,  30995, -1783, -224,  3422,  31157, -1732, -209,  3192,
+    31310, -1676, -194,  2967,  31456, -1617, -180,  2747,  31593, -1554, -167,  2532,  31723,
+    -1486, -153,  2322,  31844, -1414, -140,  2118,  31956, -1338, -128,  1919,  32061, -1258,
+    -115,  1724,  32157, -1173, -103,  1536,  32244, -1084, -92,   1352,  32323, -990,  -80,
+    1174,  32393, -891,  -69,   1000,  32454, -788,  -58,   832,   32507, -680,  -47,   669,
+    32551, -568,  -36,   512,   32586, -450,  -26,   359,   32613, -328,  -15,   212,   32630,
+    -200,  -5,    69,    32639, -68};
+
+constexpr std::array<s16, 512> curve_lut2 = {
+    3195,  26287, 3329,  -32,   3064,  26281, 3467,  -34,   2936,  26270, 3608,  -38,   2811,
+    26253, 3751,  -42,   2688,  26230, 3897,  -46,   2568,  26202, 4046,  -50,   2451,  26169,
+    4199,  -54,   2338,  26130, 4354,  -58,   2227,  26085, 4512,  -63,   2120,  26035, 4673,
+    -67,   2015,  25980, 4837,  -72,   1912,  25919, 5004,  -76,   1813,  25852, 5174,  -81,
+    1716,  25780, 5347,  -87,   1622,  25704, 5522,  -92,   1531,  25621, 5701,  -98,   1442,
+    25533, 5882,  -103,  1357,  25440, 6066,  -109,  1274,  25342, 6253,  -115,  1193,  25239,
+    6442,  -121,  1115,  25131, 6635,  -127,  1040,  25018, 6830,  -133,  967,   24899, 7027,
+    -140,  897,   24776, 7227,  -146,  829,   24648, 7430,  -153,  764,   24516, 7635,  -159,
+    701,   24379, 7842,  -166,  641,   24237, 8052,  -174,  583,   24091, 8264,  -181,  526,
+    23940, 8478,  -187,  472,   23785, 8695,  -194,  420,   23626, 8914,  -202,  371,   23462,
+    9135,  -209,  324,   23295, 9358,  -215,  279,   23123, 9583,  -222,  236,   22948, 9809,
+    -230,  194,   22769, 10038, -237,  154,   22586, 10269, -243,  117,   22399, 10501, -250,
+    81,    22208, 10735, -258,  47,    22015, 10970, -265,  15,    21818, 11206, -271,  -16,
+    21618, 11444, -277,  -44,   21415, 11684, -283,  -71,   21208, 11924, -290,  -97,   20999,
+    12166, -296,  -121,  20786, 12409, -302,  -143,  20571, 12653, -306,  -163,  20354, 12898,
+    -311,  -183,  20134, 13143, -316,  -201,  19911, 13389, -321,  -218,  19686, 13635, -325,
+    -234,  19459, 13882, -328,  -248,  19230, 14130, -332,  -261,  18998, 14377, -335,  -273,
+    18765, 14625, -337,  -284,  18531, 14873, -339,  -294,  18295, 15121, -341,  -302,  18057,
+    15369, -341,  -310,  17817, 15617, -341,  -317,  17577, 15864, -340,  -323,  17335, 16111,
+    -340,  -328,  17092, 16357, -338,  -332,  16848, 16603, -336,  -336,  16603, 16848, -332,
+    -338,  16357, 17092, -328,  -340,  16111, 17335, -323,  -340,  15864, 17577, -317,  -341,
+    15617, 17817, -310,  -341,  15369, 18057, -302,  -341,  15121, 18295, -294,  -339,  14873,
+    18531, -284,  -337,  14625, 18765, -273,  -335,  14377, 18998, -261,  -332,  14130, 19230,
+    -248,  -328,  13882, 19459, -234,  -325,  13635, 19686, -218,  -321,  13389, 19911, -201,
+    -316,  13143, 20134, -183,  -311,  12898, 20354, -163,  -306,  12653, 20571, -143,  -302,
+    12409, 20786, -121,  -296,  12166, 20999, -97,   -290,  11924, 21208, -71,   -283,  11684,
+    21415, -44,   -277,  11444, 21618, -16,   -271,  11206, 21818, 15,    -265,  10970, 22015,
+    47,    -258,  10735, 22208, 81,    -250,  10501, 22399, 117,   -243,  10269, 22586, 154,
+    -237,  10038, 22769, 194,   -230,  9809,  22948, 236,   -222,  9583,  23123, 279,   -215,
+    9358,  23295, 324,   -209,  9135,  23462, 371,   -202,  8914,  23626, 420,   -194,  8695,
+    23785, 472,   -187,  8478,  23940, 526,   -181,  8264,  24091, 583,   -174,  8052,  24237,
+    641,   -166,  7842,  24379, 701,   -159,  7635,  24516, 764,   -153,  7430,  24648, 829,
+    -146,  7227,  24776, 897,   -140,  7027,  24899, 967,   -133,  6830,  25018, 1040,  -127,
+    6635,  25131, 1115,  -121,  6442,  25239, 1193,  -115,  6253,  25342, 1274,  -109,  6066,
+    25440, 1357,  -103,  5882,  25533, 1442,  -98,   5701,  25621, 1531,  -92,   5522,  25704,
+    1622,  -87,   5347,  25780, 1716,  -81,   5174,  25852, 1813,  -76,   5004,  25919, 1912,
+    -72,   4837,  25980, 2015,  -67,   4673,  26035, 2120,  -63,   4512,  26085, 2227,  -58,
+    4354,  26130, 2338,  -54,   4199,  26169, 2451,  -50,   4046,  26202, 2568,  -46,   3897,
+    26230, 2688,  -42,   3751,  26253, 2811,  -38,   3608,  26270, 2936,  -34,   3467,  26281,
+    3064,  -32,   3329,  26287, 3195};
 
 std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input, double ratio) {
     if (input.size() < 2)
@@ -30,40 +149,39 @@ std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input,
         ratio = 1.0;
     }
 
-    if (ratio != state.current_ratio) {
-        const double cutoff_frequency = std::min(0.5 / ratio, 0.5 * ratio);
-        state.nyquist = CascadingFilter::LowPass(std::clamp(cutoff_frequency, 0.0, 0.4), 3);
-        state.current_ratio = ratio;
-    }
-    state.nyquist.Process(input);
-
-    constexpr std::size_t taps = InterpolationState::lanczos_taps;
-    const std::size_t num_frames = input.size() / 2;
-
-    std::vector<s16> output;
-    output.reserve(static_cast<std::size_t>(input.size() / ratio + 4));
-
-    double& pos = state.position;
-    auto& h = state.history;
-    for (std::size_t i = 0; i < num_frames; ++i) {
-        std::rotate(h.begin(), h.end() - 1, h.end());
-        h[0][0] = input[i * 2 + 0];
-        h[0][1] = input[i * 2 + 1];
-
-        while (pos <= 1.0) {
-            double l = 0.0;
-            double r = 0.0;
-            for (std::size_t j = 0; j < h.size(); j++) {
-                const double lanczos_calc = Lanczos(taps, pos + j - taps + 1);
-                l += lanczos_calc * h[j][0];
-                r += lanczos_calc * h[j][1];
-            }
-            output.emplace_back(static_cast<s16>(std::clamp(l, -32768.0, 32767.0)));
-            output.emplace_back(static_cast<s16>(std::clamp(r, -32768.0, 32767.0)));
-
-            pos += ratio;
+    const int step = static_cast<int>(ratio * 0x8000);
+    const std::array<s16, 512>& lut = [step] {
+        if (step > 0xaaaa) {
+            return curve_lut0;
+        }
+        if (step <= 0x8000) {
+            return curve_lut1;
         }
-        pos -= 1.0;
+        return curve_lut2;
+    }();
+
+    std::vector<s16> output(static_cast<std::size_t>(input.size() / ratio));
+    int in_offset = 0;
+    for (std::size_t out_offset = 0; out_offset < output.size(); out_offset += 2) {
+        const int lut_index = (state.fraction >> 8) * 4;
+
+        const int l = input[(in_offset + 0) * 2 + 0] * lut[lut_index + 0] +
+                      input[(in_offset + 1) * 2 + 0] * lut[lut_index + 1] +
+                      input[(in_offset + 2) * 2 + 0] * lut[lut_index + 2] +
+                      input[(in_offset + 3) * 2 + 0] * lut[lut_index + 3];
+
+        const int r = input[(in_offset + 0) * 2 + 1] * lut[lut_index + 0] +
+                      input[(in_offset + 1) * 2 + 1] * lut[lut_index + 1] +
+                      input[(in_offset + 2) * 2 + 1] * lut[lut_index + 2] +
+                      input[(in_offset + 3) * 2 + 1] * lut[lut_index + 3];
+
+        const int new_offset = state.fraction + step;
+
+        in_offset += new_offset >> 15;
+        state.fraction = new_offset & 0x7fff;
+
+        output[out_offset + 0] = static_cast<s16>(std::clamp(l >> 15, SHRT_MIN, SHRT_MAX));
+        output[out_offset + 1] = static_cast<s16>(std::clamp(r >> 15, SHRT_MIN, SHRT_MAX));
     }
 
     return output;
diff --git a/src/audio_core/algorithm/interpolate.h b/src/audio_core/algorithm/interpolate.h
index edbd6460f..1b9831a75 100644
--- a/src/audio_core/algorithm/interpolate.h
+++ b/src/audio_core/algorithm/interpolate.h
@@ -6,19 +6,12 @@
 
 #include <array>
 #include <vector>
-#include "audio_core/algorithm/filter.h"
 #include "common/common_types.h"
 
 namespace AudioCore {
 
 struct InterpolationState {
-    static constexpr std::size_t lanczos_taps = 4;
-    static constexpr std::size_t history_size = lanczos_taps * 2 - 1;
-
-    double current_ratio = 0.0;
-    CascadingFilter nyquist;
-    std::array<std::array<s16, 2>, history_size> history = {};
-    double position = 0;
+    int fraction = 0;
 };
 
 /// Interpolates input signal to produce output signal.
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 26612e692..88c06b2ce 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -187,6 +187,8 @@ add_library(core STATIC
     hle/kernel/synchronization.h
     hle/kernel/thread.cpp
     hle/kernel/thread.h
+    hle/kernel/time_manager.cpp
+    hle/kernel/time_manager.h
     hle/kernel/transfer_memory.cpp
     hle/kernel/transfer_memory.h
     hle/kernel/vm_manager.cpp
diff --git a/src/core/core.cpp b/src/core/core.cpp
index 0eb0c0dca..86e314c94 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -707,4 +707,12 @@ const Service::SM::ServiceManager& System::ServiceManager() const {
     return *impl->service_manager;
 }
 
+void System::RegisterCoreThread(std::size_t id) {
+    impl->kernel.RegisterCoreThread(id);
+}
+
+void System::RegisterHostThread() {
+    impl->kernel.RegisterHostThread();
+}
+
 } // namespace Core
diff --git a/src/core/core.h b/src/core/core.h
index e69d68fcf..8d862a8e6 100644
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -360,6 +360,12 @@ public:
 
     const CurrentBuildProcessID& GetCurrentProcessBuildID() const;
 
+    /// Register a host thread as an emulated CPU Core.
+    void RegisterCoreThread(std::size_t id);
+
+    /// Register a host thread as an auxiliary thread.
+    void RegisterHostThread();
+
 private:
     System();
 
diff --git a/src/core/hardware_properties.h b/src/core/hardware_properties.h
index 213461b6a..b04e046ed 100644
--- a/src/core/hardware_properties.h
+++ b/src/core/hardware_properties.h
@@ -20,6 +20,8 @@ constexpr u32 NUM_CPU_CORES = 4;            // Number of CPU Cores
 
 } // namespace Hardware
 
+constexpr u32 INVALID_HOST_THREAD_ID = 0xFFFFFFFF;
+
 struct EmuThreadHandle {
     u32 host_handle;
     u32 guest_handle;
diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp
index 4eb1d8703..9232f4d7e 100644
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -3,9 +3,12 @@
 // Refer to the license.txt file included.
 
 #include <atomic>
+#include <bitset>
 #include <functional>
 #include <memory>
 #include <mutex>
+#include <thread>
+#include <unordered_map>
 #include <utility>
 
 #include "common/assert.h"
@@ -15,6 +18,7 @@
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
+#include "core/hardware_properties.h"
 #include "core/hle/kernel/client_port.h"
 #include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/handle_table.h"
@@ -25,6 +29,7 @@
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/synchronization.h"
 #include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
 #include "core/hle/lock.h"
 #include "core/hle/result.h"
 #include "core/memory.h"
@@ -44,7 +49,7 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_
     std::lock_guard lock{HLE::g_hle_lock};
 
     std::shared_ptr<Thread> thread =
-        system.Kernel().RetrieveThreadFromWakeupCallbackHandleTable(proper_handle);
+        system.Kernel().RetrieveThreadFromGlobalHandleTable(proper_handle);
     if (thread == nullptr) {
         LOG_CRITICAL(Kernel, "Callback fired for invalid thread {:08X}", proper_handle);
         return;
@@ -97,8 +102,8 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_
 }
 
 struct KernelCore::Impl {
-    explicit Impl(Core::System& system)
-        : system{system}, global_scheduler{system}, synchronization{system} {}
+    explicit Impl(Core::System& system, KernelCore& kernel)
+        : system{system}, global_scheduler{kernel}, synchronization{system}, time_manager{system} {}
 
     void Initialize(KernelCore& kernel) {
         Shutdown();
@@ -120,7 +125,7 @@ struct KernelCore::Impl {
 
         system_resource_limit = nullptr;
 
-        thread_wakeup_callback_handle_table.Clear();
+        global_handle_table.Clear();
         thread_wakeup_event_type = nullptr;
         preemption_event = nullptr;
 
@@ -138,8 +143,8 @@ struct KernelCore::Impl {
 
     void InitializePhysicalCores() {
         exclusive_monitor =
-            Core::MakeExclusiveMonitor(system.Memory(), global_scheduler.CpuCoresCount());
-        for (std::size_t i = 0; i < global_scheduler.CpuCoresCount(); i++) {
+            Core::MakeExclusiveMonitor(system.Memory(), Core::Hardware::NUM_CPU_CORES);
+        for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
             cores.emplace_back(system, i, *exclusive_monitor);
         }
     }
@@ -184,6 +189,50 @@ struct KernelCore::Impl {
         system.Memory().SetCurrentPageTable(*process);
     }
 
+    void RegisterCoreThread(std::size_t core_id) {
+        std::unique_lock lock{register_thread_mutex};
+        const std::thread::id this_id = std::this_thread::get_id();
+        const auto it = host_thread_ids.find(this_id);
+        ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
+        ASSERT(it == host_thread_ids.end());
+        ASSERT(!registered_core_threads[core_id]);
+        host_thread_ids[this_id] = static_cast<u32>(core_id);
+        registered_core_threads.set(core_id);
+    }
+
+    void RegisterHostThread() {
+        std::unique_lock lock{register_thread_mutex};
+        const std::thread::id this_id = std::this_thread::get_id();
+        const auto it = host_thread_ids.find(this_id);
+        ASSERT(it == host_thread_ids.end());
+        host_thread_ids[this_id] = registered_thread_ids++;
+    }
+
+    u32 GetCurrentHostThreadID() const {
+        const std::thread::id this_id = std::this_thread::get_id();
+        const auto it = host_thread_ids.find(this_id);
+        if (it == host_thread_ids.end()) {
+            return Core::INVALID_HOST_THREAD_ID;
+        }
+        return it->second;
+    }
+
+    Core::EmuThreadHandle GetCurrentEmuThreadID() const {
+        Core::EmuThreadHandle result = Core::EmuThreadHandle::InvalidHandle();
+        result.host_handle = GetCurrentHostThreadID();
+        if (result.host_handle >= Core::Hardware::NUM_CPU_CORES) {
+            return result;
+        }
+        const Kernel::Scheduler& sched = cores[result.host_handle].Scheduler();
+        const Kernel::Thread* current = sched.GetCurrentThread();
+        if (current != nullptr) {
+            result.guest_handle = current->GetGlobalHandle();
+        } else {
+            result.guest_handle = InvalidHandle;
+        }
+        return result;
+    }
+
     std::atomic<u32> next_object_id{0};
     std::atomic<u64> next_kernel_process_id{Process::InitialKIPIDMin};
     std::atomic<u64> next_user_process_id{Process::ProcessIDMin};
@@ -194,15 +243,16 @@ struct KernelCore::Impl {
     Process* current_process = nullptr;
     Kernel::GlobalScheduler global_scheduler;
     Kernel::Synchronization synchronization;
+    Kernel::TimeManager time_manager;
 
     std::shared_ptr<ResourceLimit> system_resource_limit;
 
     std::shared_ptr<Core::Timing::EventType> thread_wakeup_event_type;
     std::shared_ptr<Core::Timing::EventType> preemption_event;
 
-    // TODO(yuriks): This can be removed if Thread objects are explicitly pooled in the future,
-    // allowing us to simply use a pool index or similar.
-    Kernel::HandleTable thread_wakeup_callback_handle_table;
+    // This is the kernel's handle table or supervisor handle table which
+    // stores all the objects in place.
+    Kernel::HandleTable global_handle_table;
 
     /// Map of named ports managed by the kernel, which can be retrieved using
     /// the ConnectToPort SVC.
@@ -211,11 +261,17 @@ struct KernelCore::Impl {
     std::unique_ptr<Core::ExclusiveMonitor> exclusive_monitor;
     std::vector<Kernel::PhysicalCore> cores;
 
+    // 0-3 IDs represent core threads, >3 represent others
+    std::unordered_map<std::thread::id, u32> host_thread_ids;
+    u32 registered_thread_ids{Core::Hardware::NUM_CPU_CORES};
+    std::bitset<Core::Hardware::NUM_CPU_CORES> registered_core_threads;
+    std::mutex register_thread_mutex;
+
     // System context
     Core::System& system;
 };
 
-KernelCore::KernelCore(Core::System& system) : impl{std::make_unique<Impl>(system)} {}
+KernelCore::KernelCore(Core::System& system) : impl{std::make_unique<Impl>(system, *this)} {}
 KernelCore::~KernelCore() {
     Shutdown();
 }
@@ -232,9 +288,8 @@ std::shared_ptr<ResourceLimit> KernelCore::GetSystemResourceLimit() const {
     return impl->system_resource_limit;
 }
 
-std::shared_ptr<Thread> KernelCore::RetrieveThreadFromWakeupCallbackHandleTable(
-    Handle handle) const {
-    return impl->thread_wakeup_callback_handle_table.Get<Thread>(handle);
+std::shared_ptr<Thread> KernelCore::RetrieveThreadFromGlobalHandleTable(Handle handle) const {
+    return impl->global_handle_table.Get<Thread>(handle);
 }
 
 void KernelCore::AppendNewProcess(std::shared_ptr<Process> process) {
@@ -265,6 +320,14 @@ const Kernel::GlobalScheduler& KernelCore::GlobalScheduler() const {
     return impl->global_scheduler;
 }
 
+Kernel::Scheduler& KernelCore::Scheduler(std::size_t id) {
+    return impl->cores[id].Scheduler();
+}
+
+const Kernel::Scheduler& KernelCore::Scheduler(std::size_t id) const {
+    return impl->cores[id].Scheduler();
+}
+
 Kernel::PhysicalCore& KernelCore::PhysicalCore(std::size_t id) {
     return impl->cores[id];
 }
@@ -281,6 +344,14 @@ const Kernel::Synchronization& KernelCore::Synchronization() const {
     return impl->synchronization;
 }
 
+Kernel::TimeManager& KernelCore::TimeManager() {
+    return impl->time_manager;
+}
+
+const Kernel::TimeManager& KernelCore::TimeManager() const {
+    return impl->time_manager;
+}
+
 Core::ExclusiveMonitor& KernelCore::GetExclusiveMonitor() {
     return *impl->exclusive_monitor;
 }
@@ -338,12 +409,28 @@ const std::shared_ptr<Core::Timing::EventType>& KernelCore::ThreadWakeupCallback
     return impl->thread_wakeup_event_type;
 }
 
-Kernel::HandleTable& KernelCore::ThreadWakeupCallbackHandleTable() {
-    return impl->thread_wakeup_callback_handle_table;
+Kernel::HandleTable& KernelCore::GlobalHandleTable() {
+    return impl->global_handle_table;
+}
+
+const Kernel::HandleTable& KernelCore::GlobalHandleTable() const {
+    return impl->global_handle_table;
+}
+
+void KernelCore::RegisterCoreThread(std::size_t core_id) {
+    impl->RegisterCoreThread(core_id);
+}
+
+void KernelCore::RegisterHostThread() {
+    impl->RegisterHostThread();
+}
+
+u32 KernelCore::GetCurrentHostThreadID() const {
+    return impl->GetCurrentHostThreadID();
 }
 
-const Kernel::HandleTable& KernelCore::ThreadWakeupCallbackHandleTable() const {
-    return impl->thread_wakeup_callback_handle_table;
+Core::EmuThreadHandle KernelCore::GetCurrentEmuThreadID() const {
+    return impl->GetCurrentEmuThreadID();
 }
 
 } // namespace Kernel
diff --git a/src/core/hle/kernel/kernel.h b/src/core/hle/kernel/kernel.h
index 1eede3063..c4f78ab71 100644
--- a/src/core/hle/kernel/kernel.h
+++ b/src/core/hle/kernel/kernel.h
@@ -11,6 +11,7 @@
 #include "core/hle/kernel/object.h"
 
 namespace Core {
+struct EmuThreadHandle;
 class ExclusiveMonitor;
 class System;
 } // namespace Core
@@ -29,8 +30,10 @@ class HandleTable;
 class PhysicalCore;
 class Process;
 class ResourceLimit;
+class Scheduler;
 class Synchronization;
 class Thread;
+class TimeManager;
 
 /// Represents a single instance of the kernel.
 class KernelCore {
@@ -64,7 +67,7 @@ public:
     std::shared_ptr<ResourceLimit> GetSystemResourceLimit() const;
 
     /// Retrieves a shared pointer to a Thread instance within the thread wakeup handle table.
-    std::shared_ptr<Thread> RetrieveThreadFromWakeupCallbackHandleTable(Handle handle) const;
+    std::shared_ptr<Thread> RetrieveThreadFromGlobalHandleTable(Handle handle) const;
 
     /// Adds the given shared pointer to an internal list of active processes.
     void AppendNewProcess(std::shared_ptr<Process> process);
@@ -87,6 +90,12 @@ public:
     /// Gets the sole instance of the global scheduler
     const Kernel::GlobalScheduler& GlobalScheduler() const;
 
+    /// Gets the sole instance of the Scheduler assoviated with cpu core 'id'
+    Kernel::Scheduler& Scheduler(std::size_t id);
+
+    /// Gets the sole instance of the Scheduler assoviated with cpu core 'id'
+    const Kernel::Scheduler& Scheduler(std::size_t id) const;
+
     /// Gets the an instance of the respective physical CPU core.
     Kernel::PhysicalCore& PhysicalCore(std::size_t id);
 
@@ -99,6 +108,12 @@ public:
     /// Gets the an instance of the Synchronization Interface.
     const Kernel::Synchronization& Synchronization() const;
 
+    /// Gets the an instance of the TimeManager Interface.
+    Kernel::TimeManager& TimeManager();
+
+    /// Gets the an instance of the TimeManager Interface.
+    const Kernel::TimeManager& TimeManager() const;
+
     /// Stops execution of 'id' core, in order to reschedule a new thread.
     void PrepareReschedule(std::size_t id);
 
@@ -120,6 +135,18 @@ public:
     /// Determines whether or not the given port is a valid named port.
     bool IsValidNamedPort(NamedPortTable::const_iterator port) const;
 
+    /// Gets the current host_thread/guest_thread handle.
+    Core::EmuThreadHandle GetCurrentEmuThreadID() const;
+
+    /// Gets the current host_thread handle.
+    u32 GetCurrentHostThreadID() const;
+
+    /// Register the current thread as a CPU Core Thread.
+    void RegisterCoreThread(std::size_t core_id);
+
+    /// Register the current thread as a non CPU core thread.
+    void RegisterHostThread();
+
 private:
     friend class Object;
     friend class Process;
@@ -140,11 +167,11 @@ private:
     /// Retrieves the event type used for thread wakeup callbacks.
     const std::shared_ptr<Core::Timing::EventType>& ThreadWakeupCallbackEventType() const;
 
-    /// Provides a reference to the thread wakeup callback handle table.
-    Kernel::HandleTable& ThreadWakeupCallbackHandleTable();
+    /// Provides a reference to the global handle table.
+    Kernel::HandleTable& GlobalHandleTable();
 
-    /// Provides a const reference to the thread wakeup callback handle table.
-    const Kernel::HandleTable& ThreadWakeupCallbackHandleTable() const;
+    /// Provides a const reference to the global handle table.
+    const Kernel::HandleTable& GlobalHandleTable() const;
 
     struct Impl;
     std::unique_ptr<Impl> impl;
diff --git a/src/core/hle/kernel/scheduler.cpp b/src/core/hle/kernel/scheduler.cpp
index 86f1421bf..c65f82fb7 100644
--- a/src/core/hle/kernel/scheduler.cpp
+++ b/src/core/hle/kernel/scheduler.cpp
@@ -18,10 +18,11 @@
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/scheduler.h"
+#include "core/hle/kernel/time_manager.h"
 
 namespace Kernel {
 
-GlobalScheduler::GlobalScheduler(Core::System& system) : system{system} {}
+GlobalScheduler::GlobalScheduler(KernelCore& kernel) : kernel{kernel} {}
 
 GlobalScheduler::~GlobalScheduler() = default;
 
@@ -35,7 +36,7 @@ void GlobalScheduler::RemoveThread(std::shared_ptr<Thread> thread) {
 }
 
 void GlobalScheduler::UnloadThread(std::size_t core) {
-    Scheduler& sched = system.Scheduler(core);
+    Scheduler& sched = kernel.Scheduler(core);
     sched.UnloadThread();
 }
 
@@ -50,7 +51,7 @@ void GlobalScheduler::SelectThread(std::size_t core) {
         sched.is_context_switch_pending = sched.selected_thread != sched.current_thread;
         std::atomic_thread_fence(std::memory_order_seq_cst);
     };
-    Scheduler& sched = system.Scheduler(core);
+    Scheduler& sched = kernel.Scheduler(core);
     Thread* current_thread = nullptr;
     // Step 1: Get top thread in schedule queue.
     current_thread = scheduled_queue[core].empty() ? nullptr : scheduled_queue[core].front();
@@ -356,6 +357,32 @@ void GlobalScheduler::Shutdown() {
     thread_list.clear();
 }
 
+void GlobalScheduler::Lock() {
+    Core::EmuThreadHandle current_thread = kernel.GetCurrentEmuThreadID();
+    if (current_thread == current_owner) {
+        ++scope_lock;
+    } else {
+        inner_lock.lock();
+        current_owner = current_thread;
+        ASSERT(current_owner != Core::EmuThreadHandle::InvalidHandle());
+        scope_lock = 1;
+    }
+}
+
+void GlobalScheduler::Unlock() {
+    if (--scope_lock != 0) {
+        ASSERT(scope_lock > 0);
+        return;
+    }
+    for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
+        SelectThread(i);
+    }
+    current_owner = Core::EmuThreadHandle::InvalidHandle();
+    scope_lock = 1;
+    inner_lock.unlock();
+    // TODO(Blinkhawk): Setup the interrupts and change context on current core.
+}
+
 Scheduler::Scheduler(Core::System& system, Core::ARM_Interface& cpu_core, std::size_t core_id)
     : system(system), cpu_core(cpu_core), core_id(core_id) {}
 
@@ -485,4 +512,27 @@ void Scheduler::Shutdown() {
     selected_thread = nullptr;
 }
 
+SchedulerLock::SchedulerLock(KernelCore& kernel) : kernel{kernel} {
+    kernel.GlobalScheduler().Lock();
+}
+
+SchedulerLock::~SchedulerLock() {
+    kernel.GlobalScheduler().Unlock();
+}
+
+SchedulerLockAndSleep::SchedulerLockAndSleep(KernelCore& kernel, Handle& event_handle,
+                                             Thread* time_task, s64 nanoseconds)
+    : SchedulerLock{kernel}, event_handle{event_handle}, time_task{time_task}, nanoseconds{
+                                                                                   nanoseconds} {
+    event_handle = InvalidHandle;
+}
+
+SchedulerLockAndSleep::~SchedulerLockAndSleep() {
+    if (sleep_cancelled) {
+        return;
+    }
+    auto& time_manager = kernel.TimeManager();
+    time_manager.ScheduleTimeEvent(event_handle, time_task, nanoseconds);
+}
+
 } // namespace Kernel
diff --git a/src/core/hle/kernel/scheduler.h b/src/core/hle/kernel/scheduler.h
index 96db049cb..1c93a838c 100644
--- a/src/core/hle/kernel/scheduler.h
+++ b/src/core/hle/kernel/scheduler.h
@@ -6,6 +6,7 @@
 
 #include <atomic>
 #include <memory>
+#include <mutex>
 #include <vector>
 
 #include "common/common_types.h"
@@ -20,11 +21,13 @@ class System;
 
 namespace Kernel {
 
+class KernelCore;
 class Process;
+class SchedulerLock;
 
 class GlobalScheduler final {
 public:
-    explicit GlobalScheduler(Core::System& system);
+    explicit GlobalScheduler(KernelCore& kernel);
     ~GlobalScheduler();
 
     /// Adds a new thread to the scheduler
@@ -138,6 +141,14 @@ public:
     void Shutdown();
 
 private:
+    friend class SchedulerLock;
+
+    /// Lock the scheduler to the current thread.
+    void Lock();
+
+    /// Unlocks the scheduler, reselects threads, interrupts cores for rescheduling
+    /// and reschedules current core if needed.
+    void Unlock();
     /**
      * Transfers a thread into an specific core. If the destination_core is -1
      * it will be unscheduled from its source code and added into its suggested
@@ -158,9 +169,14 @@ private:
     // ordered from Core 0 to Core 3.
     std::array<u32, Core::Hardware::NUM_CPU_CORES> preemption_priorities = {59, 59, 59, 62};
 
+    /// Scheduler lock mechanisms.
+    std::mutex inner_lock{}; // TODO(Blinkhawk): Replace for a SpinLock
+    std::atomic<s64> scope_lock{};
+    Core::EmuThreadHandle current_owner{Core::EmuThreadHandle::InvalidHandle()};
+
     /// Lists all thread ids that aren't deleted/etc.
     std::vector<std::shared_ptr<Thread>> thread_list;
-    Core::System& system;
+    KernelCore& kernel;
 };
 
 class Scheduler final {
@@ -227,4 +243,30 @@ private:
     bool is_context_switch_pending = false;
 };
 
+class SchedulerLock {
+public:
+    explicit SchedulerLock(KernelCore& kernel);
+    ~SchedulerLock();
+
+protected:
+    KernelCore& kernel;
+};
+
+class SchedulerLockAndSleep : public SchedulerLock {
+public:
+    explicit SchedulerLockAndSleep(KernelCore& kernel, Handle& event_handle, Thread* time_task,
+                                   s64 nanoseconds);
+    ~SchedulerLockAndSleep();
+
+    void CancelSleep() {
+        sleep_cancelled = true;
+    }
+
+private:
+    Handle& event_handle;
+    Thread* time_task;
+    s64 nanoseconds;
+    bool sleep_cancelled{};
+};
+
 } // namespace Kernel
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index ae5f2c8bd..bf850e0b2 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -46,9 +46,9 @@ Thread::~Thread() = default;
 void Thread::Stop() {
     // Cancel any outstanding wakeup events for this thread
     Core::System::GetInstance().CoreTiming().UnscheduleEvent(kernel.ThreadWakeupCallbackEventType(),
-                                                             callback_handle);
-    kernel.ThreadWakeupCallbackHandleTable().Close(callback_handle);
-    callback_handle = 0;
+                                                             global_handle);
+    kernel.GlobalHandleTable().Close(global_handle);
+    global_handle = 0;
     SetStatus(ThreadStatus::Dead);
     Signal();
 
@@ -73,12 +73,12 @@ void Thread::WakeAfterDelay(s64 nanoseconds) {
     // thread-safe version of ScheduleEvent.
     const s64 cycles = Core::Timing::nsToCycles(std::chrono::nanoseconds{nanoseconds});
     Core::System::GetInstance().CoreTiming().ScheduleEvent(
-        cycles, kernel.ThreadWakeupCallbackEventType(), callback_handle);
+        cycles, kernel.ThreadWakeupCallbackEventType(), global_handle);
 }
 
 void Thread::CancelWakeupTimer() {
     Core::System::GetInstance().CoreTiming().UnscheduleEvent(kernel.ThreadWakeupCallbackEventType(),
-                                                             callback_handle);
+                                                             global_handle);
 }
 
 void Thread::ResumeFromWait() {
@@ -190,7 +190,7 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::strin
     thread->condvar_wait_address = 0;
     thread->wait_handle = 0;
     thread->name = std::move(name);
-    thread->callback_handle = kernel.ThreadWakeupCallbackHandleTable().Create(thread).Unwrap();
+    thread->global_handle = kernel.GlobalHandleTable().Create(thread).Unwrap();
     thread->owner_process = &owner_process;
     auto& scheduler = kernel.GlobalScheduler();
     scheduler.AddThread(thread);
diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h
index 7a4916318..129e7858a 100644
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -453,6 +453,10 @@ public:
         is_sync_cancelled = value;
     }
 
+    Handle GetGlobalHandle() const {
+        return global_handle;
+    }
+
 private:
     void SetSchedulingStatus(ThreadSchedStatus new_status);
     void SetCurrentPriority(u32 new_priority);
@@ -514,7 +518,7 @@ private:
     VAddr arb_wait_address{0};
 
     /// Handle used as userdata to reference this object when inserting into the CoreTiming queue.
-    Handle callback_handle = 0;
+    Handle global_handle = 0;
 
     /// Callback that will be invoked when the thread is resumed from a waiting state. If the thread
     /// was waiting via WaitSynchronization then the object will be the last object that became
diff --git a/src/core/hle/kernel/time_manager.cpp b/src/core/hle/kernel/time_manager.cpp
new file mode 100644
index 000000000..21b290468
--- /dev/null
+++ b/src/core/hle/kernel/time_manager.cpp
@@ -0,0 +1,44 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/core_timing_util.h"
+#include "core/hle/kernel/handle_table.h"
+#include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/time_manager.h"
+
+namespace Kernel {
+
+TimeManager::TimeManager(Core::System& system) : system{system} {
+    time_manager_event_type = Core::Timing::CreateEvent(
+        "Kernel::TimeManagerCallback", [this](u64 thread_handle, [[maybe_unused]] s64 cycles_late) {
+            Handle proper_handle = static_cast<Handle>(thread_handle);
+            std::shared_ptr<Thread> thread =
+                this->system.Kernel().RetrieveThreadFromGlobalHandleTable(proper_handle);
+            thread->ResumeFromWait();
+        });
+}
+
+void TimeManager::ScheduleTimeEvent(Handle& event_handle, Thread* timetask, s64 nanoseconds) {
+    if (nanoseconds > 0) {
+        ASSERT(timetask);
+        event_handle = timetask->GetGlobalHandle();
+        const s64 cycles = Core::Timing::nsToCycles(std::chrono::nanoseconds{nanoseconds});
+        system.CoreTiming().ScheduleEvent(cycles, time_manager_event_type, event_handle);
+    } else {
+        event_handle = InvalidHandle;
+    }
+}
+
+void TimeManager::UnscheduleTimeEvent(Handle event_handle) {
+    if (event_handle == InvalidHandle) {
+        return;
+    }
+    system.CoreTiming().UnscheduleEvent(time_manager_event_type, event_handle);
+}
+
+} // namespace Kernel
diff --git a/src/core/hle/kernel/time_manager.h b/src/core/hle/kernel/time_manager.h
new file mode 100644
index 000000000..eaec486d1
--- /dev/null
+++ b/src/core/hle/kernel/time_manager.h
@@ -0,0 +1,43 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+
+#include "core/hle/kernel/object.h"
+
+namespace Core {
+class System;
+} // namespace Core
+
+namespace Core::Timing {
+struct EventType;
+} // namespace Core::Timing
+
+namespace Kernel {
+
+class Thread;
+
+/**
+ * The `TimeManager` takes care of scheduling time events on threads and executes their TimeUp
+ * method when the event is triggered.
+ */
+class TimeManager {
+public:
+    explicit TimeManager(Core::System& system);
+
+    /// Schedule a time event on `timetask` thread that will expire in 'nanoseconds'
+    /// returns a non-invalid handle in `event_handle` if correctly scheduled
+    void ScheduleTimeEvent(Handle& event_handle, Thread* timetask, s64 nanoseconds);
+
+    /// Unschedule an existing time event
+    void UnscheduleTimeEvent(Handle event_handle);
+
+private:
+    Core::System& system;
+    std::shared_ptr<Core::Timing::EventType> time_manager_event_type;
+};
+
+} // namespace Kernel
diff --git a/src/core/hle/service/bcat/backend/boxcat.cpp b/src/core/hle/service/bcat/backend/boxcat.cpp
index 67e39a5c4..f589864ee 100644
--- a/src/core/hle/service/bcat/backend/boxcat.cpp
+++ b/src/core/hle/service/bcat/backend/boxcat.cpp
@@ -200,7 +200,8 @@ private:
     DownloadResult DownloadInternal(const std::string& resolved_path, u32 timeout_seconds,
                                     const std::string& content_type_name) {
         if (client == nullptr) {
-            client = std::make_unique<httplib::SSLClient>(BOXCAT_HOSTNAME, PORT, timeout_seconds);
+            client = std::make_unique<httplib::SSLClient>(BOXCAT_HOSTNAME, PORT);
+            client->set_timeout_sec(timeout_seconds);
         }
 
         httplib::Headers headers{
@@ -448,8 +449,8 @@ std::optional<std::vector<u8>> Boxcat::GetLaunchParameter(TitleIDVersion title)
 
 Boxcat::StatusResult Boxcat::GetStatus(std::optional<std::string>& global,
                                        std::map<std::string, EventStatus>& games) {
-    httplib::SSLClient client{BOXCAT_HOSTNAME, static_cast<int>(PORT),
-                              static_cast<int>(TIMEOUT_SECONDS)};
+    httplib::SSLClient client{BOXCAT_HOSTNAME, static_cast<int>(PORT)};
+    client.set_timeout_sec(static_cast<int>(TIMEOUT_SECONDS));
 
     httplib::Headers headers{
         {std::string("Game-Assets-API-Version"), std::string(BOXCAT_API_VERSION)},
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index db9332d00..4b0c6346f 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -37,6 +37,7 @@ add_library(video_core STATIC
     memory_manager.h
     morton.cpp
     morton.h
+    query_cache.h
     rasterizer_accelerated.cpp
     rasterizer_accelerated.h
     rasterizer_cache.cpp
@@ -74,6 +75,8 @@ add_library(video_core STATIC
     renderer_opengl/gl_stream_buffer.h
     renderer_opengl/gl_texture_cache.cpp
     renderer_opengl/gl_texture_cache.h
+    renderer_opengl/gl_query_cache.cpp
+    renderer_opengl/gl_query_cache.h
     renderer_opengl/maxwell_to_gl.h
     renderer_opengl/renderer_opengl.cpp
     renderer_opengl/renderer_opengl.h
@@ -177,6 +180,8 @@ if (ENABLE_VULKAN)
         renderer_vulkan/vk_memory_manager.h
         renderer_vulkan/vk_pipeline_cache.cpp
         renderer_vulkan/vk_pipeline_cache.h
+        renderer_vulkan/vk_query_cache.cpp
+        renderer_vulkan/vk_query_cache.h
         renderer_vulkan/vk_rasterizer.cpp
         renderer_vulkan/vk_rasterizer.h
         renderer_vulkan/vk_renderpass_cache.cpp
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 0b3e8749b..b28de1092 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -4,6 +4,7 @@
 
 #include <cinttypes>
 #include <cstring>
+#include <optional>
 #include "common/assert.h"
 #include "core/core.h"
 #include "core/core_timing.h"
@@ -16,6 +17,8 @@
 
 namespace Tegra::Engines {
 
+using VideoCore::QueryType;
+
 /// First register id that is actually a Macro call.
 constexpr u32 MacroRegistersStart = 0xE00;
 
@@ -400,6 +403,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
         ProcessQueryCondition();
         break;
     }
+    case MAXWELL3D_REG_INDEX(counter_reset): {
+        ProcessCounterReset();
+        break;
+    }
     case MAXWELL3D_REG_INDEX(sync_info): {
         ProcessSyncPoint();
         break;
@@ -482,7 +489,7 @@ void Maxwell3D::FlushMMEInlineDraw() {
 
     const bool is_indexed = mme_draw.current_mode == MMEDrawMode::Indexed;
     if (ShouldExecute()) {
-        rasterizer.DrawMultiBatch(is_indexed);
+        rasterizer.Draw(is_indexed, true);
     }
 
     // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if
@@ -544,40 +551,28 @@ void Maxwell3D::ProcessQueryGet() {
                "Units other than CROP are unimplemented");
 
     switch (regs.query.query_get.operation) {
-    case Regs::QueryOperation::Release: {
-        const u64 result = regs.query.query_sequence;
-        StampQueryResult(result, regs.query.query_get.short_query == 0);
+    case Regs::QueryOperation::Release:
+        StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0);
         break;
-    }
-    case Regs::QueryOperation::Acquire: {
-        // Todo(Blinkhawk): Under this operation, the GPU waits for the CPU
-        // to write a value that matches the current payload.
+    case Regs::QueryOperation::Acquire:
+        // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that
+        // matches the current payload.
         UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");
         break;
-    }
-    case Regs::QueryOperation::Counter: {
-        u64 result{};
-        switch (regs.query.query_get.select) {
-        case Regs::QuerySelect::Zero:
-            result = 0;
-            break;
-        default:
-            result = 1;
-            UNIMPLEMENTED_MSG("Unimplemented query select type {}",
-                              static_cast<u32>(regs.query.query_get.select.Value()));
+    case Regs::QueryOperation::Counter:
+        if (const std::optional<u64> result = GetQueryResult()) {
+            // If the query returns an empty optional it means it's cached and deferred.
+            // In this case we have a non-empty result, so we stamp it immediately.
+            StampQueryResult(*result, regs.query.query_get.short_query == 0);
         }
-        StampQueryResult(result, regs.query.query_get.short_query == 0);
         break;
-    }
-    case Regs::QueryOperation::Trap: {
+    case Regs::QueryOperation::Trap:
         UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");
         break;
-    }
-    default: {
+    default:
         UNIMPLEMENTED_MSG("Unknown query operation");
         break;
     }
-    }
 }
 
 void Maxwell3D::ProcessQueryCondition() {
@@ -593,20 +588,20 @@ void Maxwell3D::ProcessQueryCondition() {
     }
     case Regs::ConditionMode::ResNonZero: {
         Regs::QueryCompare cmp;
-        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
         execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
         break;
     }
     case Regs::ConditionMode::Equal: {
         Regs::QueryCompare cmp;
-        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
         execute_on =
             cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
         break;
     }
     case Regs::ConditionMode::NotEqual: {
         Regs::QueryCompare cmp;
-        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
+        memory_manager.ReadBlock(condition_address, &cmp, sizeof(cmp));
         execute_on =
             cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
         break;
@@ -619,6 +614,18 @@ void Maxwell3D::ProcessQueryCondition() {
     }
 }
 
+void Maxwell3D::ProcessCounterReset() {
+    switch (regs.counter_reset) {
+    case Regs::CounterReset::SampleCnt:
+        rasterizer.ResetCounter(QueryType::SamplesPassed);
+        break;
+    default:
+        LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}",
+                    static_cast<int>(regs.counter_reset));
+        break;
+    }
+}
+
 void Maxwell3D::ProcessSyncPoint() {
     const u32 sync_point = regs.sync_info.sync_point.Value();
     const u32 increment = regs.sync_info.increment.Value();
@@ -647,7 +654,7 @@ void Maxwell3D::DrawArrays() {
 
     const bool is_indexed{regs.index_array.count && !regs.vertex_buffer.count};
     if (ShouldExecute()) {
-        rasterizer.DrawBatch(is_indexed);
+        rasterizer.Draw(is_indexed, false);
     }
 
     // TODO(bunnei): Below, we reset vertex count so that we can use these registers to determine if
@@ -661,6 +668,22 @@ void Maxwell3D::DrawArrays() {
     }
 }
 
+std::optional<u64> Maxwell3D::GetQueryResult() {
+    switch (regs.query.query_get.select) {
+    case Regs::QuerySelect::Zero:
+        return 0;
+    case Regs::QuerySelect::SamplesPassed:
+        // Deferred.
+        rasterizer.Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed,
+                         system.GPU().GetTicks());
+        return {};
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented query select type {}",
+                          static_cast<u32>(regs.query.query_get.select.Value()));
+        return 1;
+    }
+}
+
 void Maxwell3D::ProcessCBBind(std::size_t stage_index) {
     // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
     auto& shader = state.shader_stages[stage_index];
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 0a2af54e5..6ea7cc6a5 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <bitset>
+#include <optional>
 #include <type_traits>
 #include <unordered_map>
 #include <vector>
@@ -409,6 +410,27 @@ public:
             Linear = 1,
         };
 
+        enum class CounterReset : u32 {
+            SampleCnt = 0x01,
+            Unk02 = 0x02,
+            Unk03 = 0x03,
+            Unk04 = 0x04,
+            EmittedPrimitives = 0x10, // Not tested
+            Unk11 = 0x11,
+            Unk12 = 0x12,
+            Unk13 = 0x13,
+            Unk15 = 0x15,
+            Unk16 = 0x16,
+            Unk17 = 0x17,
+            Unk18 = 0x18,
+            Unk1A = 0x1A,
+            Unk1B = 0x1B,
+            Unk1C = 0x1C,
+            Unk1D = 0x1D,
+            Unk1E = 0x1E,
+            GeneratedPrimitives = 0x1F,
+        };
+
         struct Cull {
             enum class FrontFace : u32 {
                 ClockWise = 0x0900,
@@ -520,7 +542,7 @@ public:
                 BitField<12, 1, InvMemoryLayout> type;
             } memory_layout;
             union {
-                BitField<0, 16, u32> array_mode;
+                BitField<0, 16, u32> layers;
                 BitField<16, 1, u32> volume;
             };
             u32 layer_stride;
@@ -778,8 +800,12 @@ public:
 
                 u32 zeta_width;
                 u32 zeta_height;
+                union {
+                    BitField<0, 16, u32> zeta_layers;
+                    BitField<16, 1, u32> zeta_volume;
+                };
 
-                INSERT_UNION_PADDING_WORDS(0x27);
+                INSERT_UNION_PADDING_WORDS(0x26);
 
                 u32 depth_test_enable;
 
@@ -857,7 +883,7 @@ public:
                     BitField<7, 1, u32> c7;
                 } clip_distance_enabled;
 
-                INSERT_UNION_PADDING_WORDS(0x1);
+                u32 samplecnt_enable;
 
                 float point_size;
 
@@ -865,7 +891,11 @@ public:
 
                 u32 point_sprite_enable;
 
-                INSERT_UNION_PADDING_WORDS(0x5);
+                INSERT_UNION_PADDING_WORDS(0x3);
+
+                CounterReset counter_reset;
+
+                INSERT_UNION_PADDING_WORDS(0x1);
 
                 u32 zeta_enable;
 
@@ -1412,12 +1442,15 @@ private:
     /// Handles a write to the QUERY_GET register.
     void ProcessQueryGet();
 
-    // Writes the query result accordingly
+    /// Writes the query result accordingly.
     void StampQueryResult(u64 payload, bool long_query);
 
-    // Handles Conditional Rendering
+    /// Handles conditional rendering.
     void ProcessQueryCondition();
 
+    /// Handles counter resets.
+    void ProcessCounterReset();
+
     /// Handles writes to syncing register.
     void ProcessSyncPoint();
 
@@ -1434,6 +1467,9 @@ private:
 
     // Handles a instance drawcall from MME
     void StepInstance(MMEDrawMode expected_mode, u32 count);
+
+    /// Returns a query's value or an empty object if the value will be deferred through a cache.
+    std::optional<u64> GetQueryResult();
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
@@ -1475,6 +1511,7 @@ ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
 ASSERT_REG_POSITION(rt_control, 0x487);
 ASSERT_REG_POSITION(zeta_width, 0x48a);
 ASSERT_REG_POSITION(zeta_height, 0x48b);
+ASSERT_REG_POSITION(zeta_layers, 0x48c);
 ASSERT_REG_POSITION(depth_test_enable, 0x4B3);
 ASSERT_REG_POSITION(independent_blend_enable, 0x4B9);
 ASSERT_REG_POSITION(depth_write_enabled, 0x4BA);
@@ -1499,8 +1536,10 @@ ASSERT_REG_POSITION(screen_y_control, 0x4EB);
 ASSERT_REG_POSITION(vb_element_base, 0x50D);
 ASSERT_REG_POSITION(vb_base_instance, 0x50E);
 ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
+ASSERT_REG_POSITION(samplecnt_enable, 0x545);
 ASSERT_REG_POSITION(point_size, 0x546);
 ASSERT_REG_POSITION(point_sprite_enable, 0x548);
+ASSERT_REG_POSITION(counter_reset, 0x54C);
 ASSERT_REG_POSITION(zeta_enable, 0x54E);
 ASSERT_REG_POSITION(multisample_control, 0x54F);
 ASSERT_REG_POSITION(condition, 0x554);
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 7d7137109..e8f763ce9 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -140,71 +140,6 @@ void GPU::FlushCommands() {
     renderer.Rasterizer().FlushCommands();
 }
 
-u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
-    ASSERT(format != RenderTargetFormat::NONE);
-
-    switch (format) {
-    case RenderTargetFormat::RGBA32_FLOAT:
-    case RenderTargetFormat::RGBA32_UINT:
-        return 16;
-    case RenderTargetFormat::RGBA16_UINT:
-    case RenderTargetFormat::RGBA16_UNORM:
-    case RenderTargetFormat::RGBA16_FLOAT:
-    case RenderTargetFormat::RGBX16_FLOAT:
-    case RenderTargetFormat::RG32_FLOAT:
-    case RenderTargetFormat::RG32_UINT:
-        return 8;
-    case RenderTargetFormat::RGBA8_UNORM:
-    case RenderTargetFormat::RGBA8_SNORM:
-    case RenderTargetFormat::RGBA8_SRGB:
-    case RenderTargetFormat::RGBA8_UINT:
-    case RenderTargetFormat::RGB10_A2_UNORM:
-    case RenderTargetFormat::BGRA8_UNORM:
-    case RenderTargetFormat::BGRA8_SRGB:
-    case RenderTargetFormat::RG16_UNORM:
-    case RenderTargetFormat::RG16_SNORM:
-    case RenderTargetFormat::RG16_UINT:
-    case RenderTargetFormat::RG16_SINT:
-    case RenderTargetFormat::RG16_FLOAT:
-    case RenderTargetFormat::R32_FLOAT:
-    case RenderTargetFormat::R11G11B10_FLOAT:
-    case RenderTargetFormat::R32_UINT:
-        return 4;
-    case RenderTargetFormat::R16_UNORM:
-    case RenderTargetFormat::R16_SNORM:
-    case RenderTargetFormat::R16_UINT:
-    case RenderTargetFormat::R16_SINT:
-    case RenderTargetFormat::R16_FLOAT:
-    case RenderTargetFormat::RG8_UNORM:
-    case RenderTargetFormat::RG8_SNORM:
-        return 2;
-    case RenderTargetFormat::R8_UNORM:
-    case RenderTargetFormat::R8_UINT:
-        return 1;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented render target format {}", static_cast<u32>(format));
-        return 1;
-    }
-}
-
-u32 DepthFormatBytesPerPixel(DepthFormat format) {
-    switch (format) {
-    case DepthFormat::Z32_S8_X24_FLOAT:
-        return 8;
-    case DepthFormat::Z32_FLOAT:
-    case DepthFormat::S8_Z24_UNORM:
-    case DepthFormat::Z24_X8_UNORM:
-    case DepthFormat::Z24_S8_UNORM:
-    case DepthFormat::Z24_C8_UNORM:
-        return 4;
-    case DepthFormat::Z16_UNORM:
-        return 2;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented Depth format {}", static_cast<u32>(format));
-        return 1;
-    }
-}
-
 // Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
 // their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4.
 // So the values you see in docs might be multiplied by 4.
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 07727210c..ba8c9d665 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -57,6 +57,7 @@ enum class RenderTargetFormat : u32 {
     RG16_UINT = 0xDD,
     RG16_FLOAT = 0xDE,
     R11G11B10_FLOAT = 0xE0,
+    R32_SINT = 0xE3,
     R32_UINT = 0xE4,
     R32_FLOAT = 0xE5,
     B5G6R5_UNORM = 0xE8,
@@ -82,12 +83,6 @@ enum class DepthFormat : u32 {
     Z32_S8_X24_FLOAT = 0x19,
 };
 
-/// Returns the number of bytes per pixel of each rendertarget format.
-u32 RenderTargetBytesPerPixel(RenderTargetFormat format);
-
-/// Returns the number of bytes per pixel of each depth format.
-u32 DepthFormatBytesPerPixel(DepthFormat format);
-
 struct CommandListHeader;
 class DebugContext;
 
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 11848fbce..f5d33f27a 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -9,6 +9,7 @@
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/vm_manager.h"
 #include "core/memory.h"
+#include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 
@@ -84,7 +85,9 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {
     const auto cpu_addr = GpuToCpuAddress(gpu_addr);
     ASSERT(cpu_addr);
 
-    rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size);
+    // Flush and invalidate through the GPU interface, to be asynchronous if possible.
+    system.GPU().FlushAndInvalidateRegion(cache_addr, aligned_size);
+
     UnmapRange(gpu_addr, aligned_size);
     ASSERT(system.CurrentProcess()
                ->VMManager()
@@ -242,6 +245,8 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s
         switch (page_table.attributes[page_index]) {
         case Common::PageType::Memory: {
             const u8* src_ptr{page_table.pointers[page_index] + page_offset};
+            // Flush must happen on the rasterizer interface, such that memory is always synchronous
+            // when it is read (even when in asynchronous GPU mode). Fixes Dead Cells title menu.
             rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
             std::memcpy(dest_buffer, src_ptr, copy_amount);
             break;
@@ -292,6 +297,8 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const
         switch (page_table.attributes[page_index]) {
         case Common::PageType::Memory: {
             u8* dest_ptr{page_table.pointers[page_index] + page_offset};
+            // Invalidate must happen on the rasterizer interface, such that memory is always
+            // synchronous when it is written (even when in asynchronous GPU mode).
             rasterizer.InvalidateRegion(ToCacheAddr(dest_ptr), copy_amount);
             std::memcpy(dest_ptr, src_buffer, copy_amount);
             break;
@@ -339,6 +346,8 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::
 
         switch (page_table.attributes[page_index]) {
         case Common::PageType::Memory: {
+            // Flush must happen on the rasterizer interface, such that memory is always synchronous
+            // when it is copied (even when in asynchronous GPU mode).
             const u8* src_ptr{page_table.pointers[page_index] + page_offset};
             rasterizer.FlushRegion(ToCacheAddr(src_ptr), copy_amount);
             WriteBlock(dest_addr, src_ptr, copy_amount);
diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp
index 2f2fe6859..f2c83266e 100644
--- a/src/video_core/morton.cpp
+++ b/src/video_core/morton.cpp
@@ -85,6 +85,7 @@ static constexpr ConversionArray morton_to_linear_fns = {
     MortonCopy<true, PixelFormat::RG32UI>,
     MortonCopy<true, PixelFormat::RGBX16F>,
     MortonCopy<true, PixelFormat::R32UI>,
+    MortonCopy<true, PixelFormat::R32I>,
     MortonCopy<true, PixelFormat::ASTC_2D_8X8>,
     MortonCopy<true, PixelFormat::ASTC_2D_8X5>,
     MortonCopy<true, PixelFormat::ASTC_2D_5X4>,
@@ -166,6 +167,7 @@ static constexpr ConversionArray linear_to_morton_fns = {
     MortonCopy<false, PixelFormat::RG32UI>,
     MortonCopy<false, PixelFormat::RGBX16F>,
     MortonCopy<false, PixelFormat::R32UI>,
+    MortonCopy<false, PixelFormat::R32I>,
     nullptr,
     nullptr,
     nullptr,
diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h
new file mode 100644
index 000000000..e66054ed0
--- /dev/null
+++ b/src/video_core/query_cache.h
@@ -0,0 +1,359 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <iterator>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <unordered_map>
+#include <vector>
+
+#include "common/assert.h"
+#include "core/core.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/gpu.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+template <class QueryCache, class HostCounter>
+class CounterStreamBase {
+public:
+    explicit CounterStreamBase(QueryCache& cache, VideoCore::QueryType type)
+        : cache{cache}, type{type} {}
+
+    /// Updates the state of the stream, enabling or disabling as needed.
+    void Update(bool enabled) {
+        if (enabled) {
+            Enable();
+        } else {
+            Disable();
+        }
+    }
+
+    /// Resets the stream to zero. It doesn't disable the query after resetting.
+    void Reset() {
+        if (current) {
+            current->EndQuery();
+
+            // Immediately start a new query to avoid disabling its state.
+            current = cache.Counter(nullptr, type);
+        }
+        last = nullptr;
+    }
+
+    /// Returns the current counter slicing as needed.
+    std::shared_ptr<HostCounter> Current() {
+        if (!current) {
+            return nullptr;
+        }
+        current->EndQuery();
+        last = std::move(current);
+        current = cache.Counter(last, type);
+        return last;
+    }
+
+    /// Returns true when the counter stream is enabled.
+    bool IsEnabled() const {
+        return current != nullptr;
+    }
+
+private:
+    /// Enables the stream.
+    void Enable() {
+        if (current) {
+            return;
+        }
+        current = cache.Counter(last, type);
+    }
+
+    // Disables the stream.
+    void Disable() {
+        if (current) {
+            current->EndQuery();
+        }
+        last = std::exchange(current, nullptr);
+    }
+
+    QueryCache& cache;
+    const VideoCore::QueryType type;
+
+    std::shared_ptr<HostCounter> current;
+    std::shared_ptr<HostCounter> last;
+};
+
+template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter,
+          class QueryPool>
+class QueryCacheBase {
+public:
+    explicit QueryCacheBase(Core::System& system, VideoCore::RasterizerInterface& rasterizer)
+        : system{system}, rasterizer{rasterizer}, streams{{CounterStream{
+                                                      static_cast<QueryCache&>(*this),
+                                                      VideoCore::QueryType::SamplesPassed}}} {}
+
+    void InvalidateRegion(CacheAddr addr, std::size_t size) {
+        std::unique_lock lock{mutex};
+        FlushAndRemoveRegion(addr, size);
+    }
+
+    void FlushRegion(CacheAddr addr, std::size_t size) {
+        std::unique_lock lock{mutex};
+        FlushAndRemoveRegion(addr, size);
+    }
+
+    /**
+     * Records a query in GPU mapped memory, potentially marked with a timestamp.
+     * @param gpu_addr  GPU address to flush to when the mapped memory is read.
+     * @param type      Query type, e.g. SamplesPassed.
+     * @param timestamp Timestamp, when empty the flushed query is assumed to be short.
+     */
+    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) {
+        std::unique_lock lock{mutex};
+        auto& memory_manager = system.GPU().MemoryManager();
+        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
+
+        CachedQuery* query = TryGet(ToCacheAddr(host_ptr));
+        if (!query) {
+            const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
+            ASSERT_OR_EXECUTE(cpu_addr, return;);
+
+            query = Register(type, *cpu_addr, host_ptr, timestamp.has_value());
+        }
+
+        query->BindCounter(Stream(type).Current(), timestamp);
+    }
+
+    /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch.
+    void UpdateCounters() {
+        std::unique_lock lock{mutex};
+        const auto& regs = system.GPU().Maxwell3D().regs;
+        Stream(VideoCore::QueryType::SamplesPassed).Update(regs.samplecnt_enable);
+    }
+
+    /// Resets a counter to zero. It doesn't disable the query after resetting.
+    void ResetCounter(VideoCore::QueryType type) {
+        std::unique_lock lock{mutex};
+        Stream(type).Reset();
+    }
+
+    /// Disable all active streams. Expected to be called at the end of a command buffer.
+    void DisableStreams() {
+        std::unique_lock lock{mutex};
+        for (auto& stream : streams) {
+            stream.Update(false);
+        }
+    }
+
+    /// Returns a new host counter.
+    std::shared_ptr<HostCounter> Counter(std::shared_ptr<HostCounter> dependency,
+                                         VideoCore::QueryType type) {
+        return std::make_shared<HostCounter>(static_cast<QueryCache&>(*this), std::move(dependency),
+                                             type);
+    }
+
+    /// Returns the counter stream of the specified type.
+    CounterStream& Stream(VideoCore::QueryType type) {
+        return streams[static_cast<std::size_t>(type)];
+    }
+
+    /// Returns the counter stream of the specified type.
+    const CounterStream& Stream(VideoCore::QueryType type) const {
+        return streams[static_cast<std::size_t>(type)];
+    }
+
+protected:
+    std::array<QueryPool, VideoCore::NumQueryTypes> query_pools;
+
+private:
+    /// Flushes a memory range to guest memory and removes it from the cache.
+    void FlushAndRemoveRegion(CacheAddr addr, std::size_t size) {
+        const u64 addr_begin = static_cast<u64>(addr);
+        const u64 addr_end = addr_begin + static_cast<u64>(size);
+        const auto in_range = [addr_begin, addr_end](CachedQuery& query) {
+            const u64 cache_begin = query.GetCacheAddr();
+            const u64 cache_end = cache_begin + query.SizeInBytes();
+            return cache_begin < addr_end && addr_begin < cache_end;
+        };
+
+        const u64 page_end = addr_end >> PAGE_SHIFT;
+        for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) {
+            const auto& it = cached_queries.find(page);
+            if (it == std::end(cached_queries)) {
+                continue;
+            }
+            auto& contents = it->second;
+            for (auto& query : contents) {
+                if (!in_range(query)) {
+                    continue;
+                }
+                rasterizer.UpdatePagesCachedCount(query.CpuAddr(), query.SizeInBytes(), -1);
+                query.Flush();
+            }
+            contents.erase(std::remove_if(std::begin(contents), std::end(contents), in_range),
+                           std::end(contents));
+        }
+    }
+
+    /// Registers the passed parameters as cached and returns a pointer to the stored cached query.
+    CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) {
+        rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1);
+        const u64 page = static_cast<u64>(ToCacheAddr(host_ptr)) >> PAGE_SHIFT;
+        return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr,
+                                                  host_ptr);
+    }
+
+    /// Tries to a get a cached query. Returns nullptr on failure.
+    CachedQuery* TryGet(CacheAddr addr) {
+        const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT;
+        const auto it = cached_queries.find(page);
+        if (it == std::end(cached_queries)) {
+            return nullptr;
+        }
+        auto& contents = it->second;
+        const auto found =
+            std::find_if(std::begin(contents), std::end(contents),
+                         [addr](auto& query) { return query.GetCacheAddr() == addr; });
+        return found != std::end(contents) ? &*found : nullptr;
+    }
+
+    static constexpr std::uintptr_t PAGE_SIZE = 4096;
+    static constexpr unsigned PAGE_SHIFT = 12;
+
+    Core::System& system;
+    VideoCore::RasterizerInterface& rasterizer;
+
+    std::recursive_mutex mutex;
+
+    std::unordered_map<u64, std::vector<CachedQuery>> cached_queries;
+
+    std::array<CounterStream, VideoCore::NumQueryTypes> streams;
+};
+
+template <class QueryCache, class HostCounter>
+class HostCounterBase {
+public:
+    explicit HostCounterBase(std::shared_ptr<HostCounter> dependency_)
+        : dependency{std::move(dependency_)}, depth{dependency ? (dependency->Depth() + 1) : 0} {
+        // Avoid nesting too many dependencies to avoid a stack overflow when these are deleted.
+        constexpr u64 depth_threshold = 96;
+        if (depth > depth_threshold) {
+            depth = 0;
+            base_result = dependency->Query();
+            dependency = nullptr;
+        }
+    }
+    virtual ~HostCounterBase() = default;
+
+    /// Returns the current value of the query.
+    u64 Query() {
+        if (result) {
+            return *result;
+        }
+
+        u64 value = BlockingQuery() + base_result;
+        if (dependency) {
+            value += dependency->Query();
+            dependency = nullptr;
+        }
+
+        result = value;
+        return *result;
+    }
+
+    /// Returns true when flushing this query will potentially wait.
+    bool WaitPending() const noexcept {
+        return result.has_value();
+    }
+
+    u64 Depth() const noexcept {
+        return depth;
+    }
+
+protected:
+    /// Returns the value of query from the backend API blocking as needed.
+    virtual u64 BlockingQuery() const = 0;
+
+private:
+    std::shared_ptr<HostCounter> dependency; ///< Counter to add to this value.
+    std::optional<u64> result;               ///< Filled with the already returned value.
+    u64 depth;                               ///< Number of nested dependencies.
+    u64 base_result = 0;                     ///< Equivalent to nested dependencies value.
+};
+
+template <class HostCounter>
+class CachedQueryBase {
+public:
+    explicit CachedQueryBase(VAddr cpu_addr, u8* host_ptr)
+        : cpu_addr{cpu_addr}, host_ptr{host_ptr} {}
+    virtual ~CachedQueryBase() = default;
+
+    CachedQueryBase(CachedQueryBase&&) noexcept = default;
+    CachedQueryBase(const CachedQueryBase&) = delete;
+
+    CachedQueryBase& operator=(CachedQueryBase&&) noexcept = default;
+    CachedQueryBase& operator=(const CachedQueryBase&) = delete;
+
+    /// Flushes the query to guest memory.
+    virtual void Flush() {
+        // When counter is nullptr it means that it's just been reseted. We are supposed to write a
+        // zero in these cases.
+        const u64 value = counter ? counter->Query() : 0;
+        std::memcpy(host_ptr, &value, sizeof(u64));
+
+        if (timestamp) {
+            std::memcpy(host_ptr + TIMESTAMP_OFFSET, &*timestamp, sizeof(u64));
+        }
+    }
+
+    /// Binds a counter to this query.
+    void BindCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) {
+        if (counter) {
+            // If there's an old counter set it means the query is being rewritten by the game.
+            // To avoid losing the data forever, flush here.
+            Flush();
+        }
+        counter = std::move(counter_);
+        timestamp = timestamp_;
+    }
+
+    VAddr CpuAddr() const noexcept {
+        return cpu_addr;
+    }
+
+    CacheAddr GetCacheAddr() const noexcept {
+        return ToCacheAddr(host_ptr);
+    }
+
+    u64 SizeInBytes() const noexcept {
+        return SizeInBytes(timestamp.has_value());
+    }
+
+    static constexpr u64 SizeInBytes(bool with_timestamp) noexcept {
+        return with_timestamp ? LARGE_QUERY_SIZE : SMALL_QUERY_SIZE;
+    }
+
+protected:
+    /// Returns true when querying the counter may potentially block.
+    bool WaitPending() const noexcept {
+        return counter && counter->WaitPending();
+    }
+
+private:
+    static constexpr std::size_t SMALL_QUERY_SIZE = 8;   // Query size without timestamp.
+    static constexpr std::size_t LARGE_QUERY_SIZE = 16;  // Query size with timestamp.
+    static constexpr std::intptr_t TIMESTAMP_OFFSET = 8; // Timestamp offset in a large query.
+
+    VAddr cpu_addr;                       ///< Guest CPU address.
+    u8* host_ptr;                         ///< Writable host pointer.
+    std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree.
+    std::optional<u64> timestamp;         ///< Timestamp to flush to guest memory.
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index c586cd6fe..f18eaf4bc 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -6,6 +6,7 @@
 
 #include <atomic>
 #include <functional>
+#include <optional>
 #include "common/common_types.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/gpu.h"
@@ -17,6 +18,11 @@ class MemoryManager;
 
 namespace VideoCore {
 
+enum class QueryType {
+    SamplesPassed,
+};
+constexpr std::size_t NumQueryTypes = 1;
+
 enum class LoadCallbackStage {
     Prepare,
     Decompile,
@@ -29,11 +35,8 @@ class RasterizerInterface {
 public:
     virtual ~RasterizerInterface() {}
 
-    /// Draw the current batch of vertex arrays
-    virtual bool DrawBatch(bool is_indexed) = 0;
-
-    /// Draw the current batch of multiple instances of vertex arrays
-    virtual bool DrawMultiBatch(bool is_indexed) = 0;
+    /// Dispatches a draw invocation
+    virtual void Draw(bool is_indexed, bool is_instanced) = 0;
 
     /// Clear the current framebuffer
     virtual void Clear() = 0;
@@ -41,6 +44,12 @@ public:
     /// Dispatches a compute shader invocation
     virtual void DispatchCompute(GPUVAddr code_addr) = 0;
 
+    /// Resets the counter of a query
+    virtual void ResetCounter(QueryType type) = 0;
+
+    /// Records a GPU query and caches it
+    virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0;
+
     /// Notify rasterizer that all caches should be flushed to Switch memory
     virtual void FlushAll() = 0;
 
diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp
new file mode 100644
index 000000000..f12e9f55f
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_query_cache.cpp
@@ -0,0 +1,120 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include <glad/glad.h>
+
+#include "common/assert.h"
+#include "core/core.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_query_cache.h"
+#include "video_core/renderer_opengl/gl_rasterizer.h"
+
+namespace OpenGL {
+
+namespace {
+
+constexpr std::array<GLenum, VideoCore::NumQueryTypes> QueryTargets = {GL_SAMPLES_PASSED};
+
+constexpr GLenum GetTarget(VideoCore::QueryType type) {
+    return QueryTargets[static_cast<std::size_t>(type)];
+}
+
+} // Anonymous namespace
+
+QueryCache::QueryCache(Core::System& system, RasterizerOpenGL& gl_rasterizer)
+    : VideoCommon::QueryCacheBase<
+          QueryCache, CachedQuery, CounterStream, HostCounter,
+          std::vector<OGLQuery>>{system,
+                                 static_cast<VideoCore::RasterizerInterface&>(gl_rasterizer)},
+      gl_rasterizer{gl_rasterizer} {}
+
+QueryCache::~QueryCache() = default;
+
+OGLQuery QueryCache::AllocateQuery(VideoCore::QueryType type) {
+    auto& reserve = query_pools[static_cast<std::size_t>(type)];
+    OGLQuery query;
+    if (reserve.empty()) {
+        query.Create(GetTarget(type));
+        return query;
+    }
+
+    query = std::move(reserve.back());
+    reserve.pop_back();
+    return query;
+}
+
+void QueryCache::Reserve(VideoCore::QueryType type, OGLQuery&& query) {
+    query_pools[static_cast<std::size_t>(type)].push_back(std::move(query));
+}
+
+bool QueryCache::AnyCommandQueued() const noexcept {
+    return gl_rasterizer.AnyCommandQueued();
+}
+
+HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type)
+    : VideoCommon::HostCounterBase<QueryCache, HostCounter>{std::move(dependency)}, cache{cache},
+      type{type}, query{cache.AllocateQuery(type)} {
+    glBeginQuery(GetTarget(type), query.handle);
+}
+
+HostCounter::~HostCounter() {
+    cache.Reserve(type, std::move(query));
+}
+
+void HostCounter::EndQuery() {
+    if (!cache.AnyCommandQueued()) {
+        // There are chances a query waited on without commands (glDraw, glClear, glDispatch). Not
+        // having any of these causes a lock. glFlush is considered a command, so we can safely wait
+        // for this. Insert to the OpenGL command stream a flush.
+        glFlush();
+    }
+    glEndQuery(GetTarget(type));
+}
+
+u64 HostCounter::BlockingQuery() const {
+    GLint64 value;
+    glGetQueryObjecti64v(query.handle, GL_QUERY_RESULT, &value);
+    return static_cast<u64>(value);
+}
+
+CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr)
+    : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {}
+
+CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept
+    : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {}
+
+CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept {
+    VideoCommon::CachedQueryBase<HostCounter>::operator=(std::move(rhs));
+    cache = rhs.cache;
+    type = rhs.type;
+    return *this;
+}
+
+void CachedQuery::Flush() {
+    // Waiting for a query while another query of the same target is enabled locks Nvidia's driver.
+    // To avoid this disable and re-enable keeping the dependency stream.
+    // But we only have to do this if we have pending waits to be done.
+    auto& stream = cache->Stream(type);
+    const bool slice_counter = WaitPending() && stream.IsEnabled();
+    if (slice_counter) {
+        stream.Update(false);
+    }
+
+    VideoCommon::CachedQueryBase<HostCounter>::Flush();
+
+    if (slice_counter) {
+        stream.Update(true);
+    }
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h
new file mode 100644
index 000000000..d8e7052a1
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_query_cache.h
@@ -0,0 +1,78 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <memory>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/query_cache.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+
+namespace Core {
+class System;
+}
+
+namespace OpenGL {
+
+class CachedQuery;
+class HostCounter;
+class QueryCache;
+class RasterizerOpenGL;
+
+using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>;
+
+class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream,
+                                                            HostCounter, std::vector<OGLQuery>> {
+public:
+    explicit QueryCache(Core::System& system, RasterizerOpenGL& rasterizer);
+    ~QueryCache();
+
+    OGLQuery AllocateQuery(VideoCore::QueryType type);
+
+    void Reserve(VideoCore::QueryType type, OGLQuery&& query);
+
+    bool AnyCommandQueued() const noexcept;
+
+private:
+    RasterizerOpenGL& gl_rasterizer;
+};
+
+class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> {
+public:
+    explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type);
+    ~HostCounter();
+
+    void EndQuery();
+
+private:
+    u64 BlockingQuery() const override;
+
+    QueryCache& cache;
+    const VideoCore::QueryType type;
+    OGLQuery query;
+};
+
+class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> {
+public:
+    explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr,
+                         u8* host_ptr);
+    CachedQuery(CachedQuery&& rhs) noexcept;
+    CachedQuery(const CachedQuery&) = delete;
+
+    CachedQuery& operator=(CachedQuery&& rhs) noexcept;
+    CachedQuery& operator=(const CachedQuery&) = delete;
+
+    void Flush() override;
+
+private:
+    QueryCache* cache;
+    VideoCore::QueryType type;
+};
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index b0eb14c8b..e1965fb21 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -25,6 +25,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_query_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"
@@ -92,8 +93,8 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
                                    ScreenInfo& info)
     : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device},
-      shader_cache{*this, system, emu_window, device}, system{system}, screen_info{info},
-      buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
+      shader_cache{*this, system, emu_window, device}, query_cache{system, *this}, system{system},
+      screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
     shader_program_manager = std::make_unique<GLShader::ProgramManager>();
     state.draw.shader_program = 0;
     state.Apply();
@@ -541,11 +542,16 @@ void RasterizerOpenGL::Clear() {
     } else if (use_stencil) {
         glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
     }
+
+    ++num_queued_commands;
 }
 
 void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     MICROPROFILE_SCOPE(OpenGL_Drawing);
     auto& gpu = system.GPU().Maxwell3D();
+    const auto& regs = gpu.regs;
+
+    query_cache.UpdateCounters();
 
     SyncRasterizeEnable(state);
     SyncColorMask();
@@ -611,7 +617,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
 
     // Setup shaders and their used resources.
     texture_cache.GuardSamplers(true);
-    const auto primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology);
+    const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(gpu.regs.draw.topology);
     SetupShaders(primitive_mode);
     texture_cache.GuardSamplers(false);
 
@@ -638,35 +644,47 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
         glTextureBarrier();
     }
 
+    ++num_queued_commands;
+
     const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance);
     const GLsizei num_instances =
         static_cast<GLsizei>(is_instanced ? gpu.mme_draw.instance_count : 1);
     if (is_indexed) {
-        const GLenum index_format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format);
         const GLint base_vertex = static_cast<GLint>(gpu.regs.vb_element_base);
         const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.index_array.count);
-        glDrawElementsInstancedBaseVertexBaseInstance(
-            primitive_mode, num_vertices, index_format,
-            reinterpret_cast<const void*>(index_buffer_offset), num_instances, base_vertex,
-            base_instance);
+        const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset);
+        const GLenum format = MaxwellToGL::IndexFormat(gpu.regs.index_array.format);
+        if (num_instances == 1 && base_instance == 0 && base_vertex == 0) {
+            glDrawElements(primitive_mode, num_vertices, format, offset);
+        } else if (num_instances == 1 && base_instance == 0) {
+            glDrawElementsBaseVertex(primitive_mode, num_vertices, format, offset, base_vertex);
+        } else if (base_vertex == 0 && base_instance == 0) {
+            glDrawElementsInstanced(primitive_mode, num_vertices, format, offset, num_instances);
+        } else if (base_vertex == 0) {
+            glDrawElementsInstancedBaseInstance(primitive_mode, num_vertices, format, offset,
+                                                num_instances, base_instance);
+        } else if (base_instance == 0) {
+            glDrawElementsInstancedBaseVertex(primitive_mode, num_vertices, format, offset,
+                                              num_instances, base_vertex);
+        } else {
+            glDrawElementsInstancedBaseVertexBaseInstance(primitive_mode, num_vertices, format,
+                                                          offset, num_instances, base_vertex,
+                                                          base_instance);
+        }
     } else {
         const GLint base_vertex = static_cast<GLint>(gpu.regs.vertex_buffer.first);
         const GLsizei num_vertices = static_cast<GLsizei>(gpu.regs.vertex_buffer.count);
-        glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices, num_instances,
-                                          base_instance);
+        if (num_instances == 1 && base_instance == 0) {
+            glDrawArrays(primitive_mode, base_vertex, num_vertices);
+        } else if (base_instance == 0) {
+            glDrawArraysInstanced(primitive_mode, base_vertex, num_vertices, num_instances);
+        } else {
+            glDrawArraysInstancedBaseInstance(primitive_mode, base_vertex, num_vertices,
+                                              num_instances, base_instance);
+        }
     }
 }
 
-bool RasterizerOpenGL::DrawBatch(bool is_indexed) {
-    Draw(is_indexed, false);
-    return true;
-}
-
-bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) {
-    Draw(is_indexed, true);
-    return true;
-}
-
 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
     if (device.HasBrokenCompute()) {
         return;
@@ -707,6 +725,16 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
     state.ApplyProgramPipeline();
 
     glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
+    ++num_queued_commands;
+}
+
+void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) {
+    query_cache.ResetCounter(type);
+}
+
+void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
+                             std::optional<u64> timestamp) {
+    query_cache.Query(gpu_addr, type, timestamp);
 }
 
 void RasterizerOpenGL::FlushAll() {}
@@ -718,6 +746,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
     }
     texture_cache.FlushRegion(addr, size);
     buffer_cache.FlushRegion(addr, size);
+    query_cache.FlushRegion(addr, size);
 }
 
 void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
@@ -728,6 +757,7 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
     texture_cache.InvalidateRegion(addr, size);
     shader_cache.InvalidateRegion(addr, size);
     buffer_cache.InvalidateRegion(addr, size);
+    query_cache.InvalidateRegion(addr, size);
 }
 
 void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
@@ -738,10 +768,18 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
 }
 
 void RasterizerOpenGL::FlushCommands() {
+    // Only flush when we have commands queued to OpenGL.
+    if (num_queued_commands == 0) {
+        return;
+    }
+    num_queued_commands = 0;
     glFlush();
 }
 
 void RasterizerOpenGL::TickFrame() {
+    // Ticking a frame means that buffers will be swapped, calling glFlush implicitly.
+    num_queued_commands = 0;
+
     buffer_cache.TickFrame();
 }
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 0501f3828..68abe9a21 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -24,6 +24,7 @@
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_framebuffer_cache.h"
+#include "video_core/renderer_opengl/gl_query_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_sampler_cache.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
@@ -57,10 +58,11 @@ public:
                               ScreenInfo& info);
     ~RasterizerOpenGL() override;
 
-    bool DrawBatch(bool is_indexed) override;
-    bool DrawMultiBatch(bool is_indexed) override;
+    void Draw(bool is_indexed, bool is_instanced) override;
     void Clear() override;
     void DispatchCompute(GPUVAddr code_addr) override;
+    void ResetCounter(VideoCore::QueryType type) override;
+    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
     void FlushAll() override;
     void FlushRegion(CacheAddr addr, u64 size) override;
     void InvalidateRegion(CacheAddr addr, u64 size) override;
@@ -75,6 +77,11 @@ public:
     void LoadDiskResources(const std::atomic_bool& stop_loading,
                            const VideoCore::DiskResourceLoadCallback& callback) override;
 
+    /// Returns true when there are commands queued to the OpenGL server.
+    bool AnyCommandQueued() const {
+        return num_queued_commands > 0;
+    }
+
 private:
     /// Configures the color and depth framebuffer states.
     void ConfigureFramebuffers();
@@ -102,9 +109,6 @@ private:
     void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
                            std::size_t size);
 
-    /// Syncs all the state, shaders, render targets and textures setting before a draw call.
-    void Draw(bool is_indexed, bool is_instanced);
-
     /// Configures the current textures to use for the draw command.
     void SetupDrawTextures(std::size_t stage_index, const Shader& shader);
 
@@ -180,10 +184,23 @@ private:
     /// Syncs the alpha test state to match the guest state
     void SyncAlphaTest();
 
-    /// Check for extension that are not strictly required
-    /// but are needed for correct emulation
+    /// Check for extension that are not strictly required but are needed for correct emulation
     void CheckExtensions();
 
+    std::size_t CalculateVertexArraysSize() const;
+
+    std::size_t CalculateIndexBufferSize() const;
+
+    /// Updates and returns a vertex array object representing current vertex format
+    GLuint SetupVertexFormat();
+
+    void SetupVertexBuffer(GLuint vao);
+    void SetupVertexInstances(GLuint vao);
+
+    GLintptr SetupIndexBuffer();
+
+    void SetupShaders(GLenum primitive_mode);
+
     const Device device;
     OpenGLState state;
 
@@ -191,6 +208,7 @@ private:
     ShaderCacheOpenGL shader_cache;
     SamplerCacheOpenGL sampler_cache;
     FramebufferCacheOpenGL framebuffer_cache;
+    QueryCache query_cache;
 
     Core::System& system;
     ScreenInfo& screen_info;
@@ -208,19 +226,8 @@ private:
     BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
     BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
 
-    std::size_t CalculateVertexArraysSize() const;
-
-    std::size_t CalculateIndexBufferSize() const;
-
-    /// Updates and returns a vertex array object representing current vertex format
-    GLuint SetupVertexFormat();
-
-    void SetupVertexBuffer(GLuint vao);
-    void SetupVertexInstances(GLuint vao);
-
-    GLintptr SetupIndexBuffer();
-
-    void SetupShaders(GLenum primitive_mode);
+    /// Number of commands queued to the OpenGL driver. Reseted on flush.
+    std::size_t num_queued_commands = 0;
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp
index 5c96c1d46..f0ddfb276 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -207,4 +207,21 @@ void OGLFramebuffer::Release() {
     handle = 0;
 }
 
+void OGLQuery::Create(GLenum target) {
+    if (handle != 0)
+        return;
+
+    MICROPROFILE_SCOPE(OpenGL_ResourceCreation);
+    glCreateQueries(target, 1, &handle);
+}
+
+void OGLQuery::Release() {
+    if (handle == 0)
+        return;
+
+    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
+    glDeleteQueries(1, &handle);
+    handle = 0;
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h
index 3a85a1d4c..514d1d165 100644
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -266,4 +266,29 @@ public:
     GLuint handle = 0;
 };
 
+class OGLQuery : private NonCopyable {
+public:
+    OGLQuery() = default;
+
+    OGLQuery(OGLQuery&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
+    ~OGLQuery() {
+        Release();
+    }
+
+    OGLQuery& operator=(OGLQuery&& o) noexcept {
+        Release();
+        handle = std::exchange(o.handle, 0);
+        return *this;
+    }
+
+    /// Creates a new internal OpenGL resource and stores the handle
+    void Create(GLenum target);
+
+    /// Deletes the internal OpenGL resource
+    void Release();
+
+    GLuint handle = 0;
+};
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index d4b81cd87..cf934b0d8 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -87,6 +87,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format
     {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, false},                             // RG32UI
     {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT, false},                                     // RGBX16F
     {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, false},                             // R32UI
+    {GL_R32I, GL_RED_INTEGER, GL_INT, false},                                       // R32I
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false},                                   // ASTC_2D_8X8
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false},                                   // ASTC_2D_8X5
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false},                                   // ASTC_2D_5X4
@@ -260,6 +261,13 @@ CachedSurface::~CachedSurface() = default;
 void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) {
     MICROPROFILE_SCOPE(OpenGL_Texture_Download);
 
+    if (params.IsBuffer()) {
+        glGetNamedBufferSubData(texture_buffer.handle, 0,
+                                static_cast<GLsizeiptr>(params.GetHostSizeInBytes()),
+                                staging_buffer.data());
+        return;
+    }
+
     SCOPE_EXIT({ glPixelStorei(GL_PACK_ROW_LENGTH, 0); });
 
     for (u32 level = 0; level < params.emulated_levels; ++level) {
@@ -398,24 +406,36 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p
 CachedSurfaceView::~CachedSurfaceView() = default;
 
 void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
-    ASSERT(params.num_layers == 1 && params.num_levels == 1);
+    ASSERT(params.num_levels == 1);
 
-    const auto& owner_params = surface.GetSurfaceParams();
+    const GLuint texture = surface.GetTexture();
+    if (params.num_layers > 1) {
+        // Layered framebuffer attachments
+        UNIMPLEMENTED_IF(params.base_layer != 0);
+
+        switch (params.target) {
+        case SurfaceTarget::Texture2DArray:
+            glFramebufferTexture(target, attachment, texture, params.base_level);
+            break;
+        default:
+            UNIMPLEMENTED();
+        }
+        return;
+    }
 
-    switch (owner_params.target) {
+    const GLenum view_target = surface.GetTarget();
+    switch (surface.GetSurfaceParams().target) {
     case SurfaceTarget::Texture1D:
-        glFramebufferTexture1D(target, attachment, surface.GetTarget(), surface.GetTexture(),
-                               params.base_level);
+        glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
         break;
     case SurfaceTarget::Texture2D:
-        glFramebufferTexture2D(target, attachment, surface.GetTarget(), surface.GetTexture(),
-                               params.base_level);
+        glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level);
         break;
     case SurfaceTarget::Texture1DArray:
     case SurfaceTarget::Texture2DArray:
     case SurfaceTarget::TextureCubemap:
     case SurfaceTarget::TextureCubeArray:
-        glFramebufferTextureLayer(target, attachment, surface.GetTexture(), params.base_level,
+        glFramebufferTextureLayer(target, attachment, texture, params.base_level,
                                   params.base_layer);
         break;
     default:
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 331808113..ef66dd141 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -159,12 +159,13 @@ struct FormatTuple {
     {vk::Format::eR32G32Uint, Attachable | Storage},             // RG32UI
     {vk::Format::eUndefined, {}},                                // RGBX16F
     {vk::Format::eR32Uint, Attachable | Storage},                // R32UI
+    {vk::Format::eR32Sint, Attachable | Storage},                // R32I
     {vk::Format::eAstc8x8UnormBlock, {}},                        // ASTC_2D_8X8
     {vk::Format::eUndefined, {}},                                // ASTC_2D_8X5
     {vk::Format::eUndefined, {}},                                // ASTC_2D_5X4
     {vk::Format::eUndefined, {}},                                // BGRA8_SRGB
     {vk::Format::eBc1RgbaSrgbBlock, {}},                         // DXT1_SRGB
-    {vk::Format::eUndefined, {}},                                // DXT23_SRGB
+    {vk::Format::eBc2SrgbBlock, {}},                             // DXT23_SRGB
     {vk::Format::eBc3SrgbBlock, {}},                             // DXT45_SRGB
     {vk::Format::eBc7SrgbBlock, {}},                             // BC7U_SRGB
     {vk::Format::eR4G4B4A4UnormPack16, Attachable},              // R4G4B4A4U
@@ -363,6 +364,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr
             return vk::Format::eR8G8B8A8Uint;
         case Maxwell::VertexAttribute::Size::Size_32:
             return vk::Format::eR32Uint;
+        case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
+            return vk::Format::eR32G32B32A32Uint;
         default:
             break;
         }
diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp
index 9840f26e5..886bde3b9 100644
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -104,8 +104,11 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
     features.depthBiasClamp = true;
     features.geometryShader = true;
     features.tessellationShader = true;
+    features.occlusionQueryPrecise = true;
     features.fragmentStoresAndAtomics = true;
     features.shaderImageGatherExtended = true;
+    features.shaderStorageImageReadWithoutFormat =
+        is_shader_storage_img_read_without_format_supported;
     features.shaderStorageImageWriteWithoutFormat = true;
     features.textureCompressionASTC_LDR = is_optimal_astc_supported;
 
@@ -117,6 +120,10 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
     bit8_storage.uniformAndStorageBuffer8BitAccess = true;
     SetNext(next, bit8_storage);
 
+    vk::PhysicalDeviceHostQueryResetFeaturesEXT host_query_reset;
+    host_query_reset.hostQueryReset = true;
+    SetNext(next, host_query_reset);
+
     vk::PhysicalDeviceFloat16Int8FeaturesKHR float16_int8;
     if (is_float16_supported) {
         float16_int8.shaderFloat16 = true;
@@ -273,6 +280,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev
         VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME,
         VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME,
         VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME,
+        VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME,
     };
     std::bitset<required_extensions.size()> available_extensions{};
 
@@ -340,6 +348,7 @@ bool VKDevice::IsSuitable(const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDev
         std::make_pair(features.depthBiasClamp, "depthBiasClamp"),
         std::make_pair(features.geometryShader, "geometryShader"),
         std::make_pair(features.tessellationShader, "tessellationShader"),
+        std::make_pair(features.occlusionQueryPrecise, "occlusionQueryPrecise"),
         std::make_pair(features.fragmentStoresAndAtomics, "fragmentStoresAndAtomics"),
         std::make_pair(features.shaderImageGatherExtended, "shaderImageGatherExtended"),
         std::make_pair(features.shaderStorageImageWriteWithoutFormat,
@@ -376,7 +385,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
         }
     };
 
-    extensions.reserve(13);
+    extensions.reserve(14);
     extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
     extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME);
     extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME);
@@ -384,6 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
     extensions.push_back(VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME);
     extensions.push_back(VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME);
     extensions.push_back(VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME);
+    extensions.push_back(VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME);
 
     [[maybe_unused]] const bool nsight =
         std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
@@ -457,6 +467,8 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK
 
 void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) {
     const auto supported_features{physical.getFeatures(dldi)};
+    is_shader_storage_img_read_without_format_supported =
+        supported_features.shaderStorageImageReadWithoutFormat;
     is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi);
 }
 
@@ -511,6 +523,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti
                                         vk::Format::eB10G11R11UfloatPack32,
                                         vk::Format::eR32Sfloat,
                                         vk::Format::eR32Uint,
+                                        vk::Format::eR32Sint,
                                         vk::Format::eR16Sfloat,
                                         vk::Format::eR16G16B16A16Sfloat,
                                         vk::Format::eB8G8R8A8Unorm,
@@ -530,6 +543,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti
                                         vk::Format::eBc6HUfloatBlock,
                                         vk::Format::eBc6HSfloatBlock,
                                         vk::Format::eBc1RgbaSrgbBlock,
+                                        vk::Format::eBc2SrgbBlock,
                                         vk::Format::eBc3SrgbBlock,
                                         vk::Format::eBc7SrgbBlock,
                                         vk::Format::eAstc4x4SrgbBlock,
diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h
index 72603f9f6..2c27ad730 100644
--- a/src/video_core/renderer_vulkan/vk_device.h
+++ b/src/video_core/renderer_vulkan/vk_device.h
@@ -122,6 +122,11 @@ public:
         return properties.limits.maxPushConstantsSize;
     }
 
+    /// Returns true if Shader storage Image Read Without Format supported.
+    bool IsShaderStorageImageReadWithoutFormatSupported() const {
+        return is_shader_storage_img_read_without_format_supported;
+    }
+
     /// Returns true if ASTC is natively supported.
     bool IsOptimalAstcSupported() const {
         return is_optimal_astc_supported;
@@ -227,6 +232,8 @@ private:
     bool ext_depth_range_unrestricted{};       ///< Support for VK_EXT_depth_range_unrestricted.
     bool ext_shader_viewport_index_layer{};    ///< Support for VK_EXT_shader_viewport_index_layer.
     bool nv_device_diagnostic_checkpoints{};   ///< Support for VK_NV_device_diagnostic_checkpoints.
+    bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage
+                                                                ///< image read without format
 
     // Telemetry parameters
     std::string vendor_name;                      ///< Device's driver name.
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp
new file mode 100644
index 000000000..ffbf60dda
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp
@@ -0,0 +1,122 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "video_core/renderer_vulkan/declarations.h"
+#include "video_core/renderer_vulkan/vk_device.h"
+#include "video_core/renderer_vulkan/vk_query_cache.h"
+#include "video_core/renderer_vulkan/vk_resource_manager.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+
+namespace Vulkan {
+
+namespace {
+
+constexpr std::array QUERY_TARGETS = {vk::QueryType::eOcclusion};
+
+constexpr vk::QueryType GetTarget(VideoCore::QueryType type) {
+    return QUERY_TARGETS[static_cast<std::size_t>(type)];
+}
+
+} // Anonymous namespace
+
+QueryPool::QueryPool() : VKFencedPool{GROW_STEP} {}
+
+QueryPool::~QueryPool() = default;
+
+void QueryPool::Initialize(const VKDevice& device_, VideoCore::QueryType type_) {
+    device = &device_;
+    type = type_;
+}
+
+std::pair<vk::QueryPool, std::uint32_t> QueryPool::Commit(VKFence& fence) {
+    std::size_t index;
+    do {
+        index = CommitResource(fence);
+    } while (usage[index]);
+    usage[index] = true;
+
+    return {*pools[index / GROW_STEP], static_cast<std::uint32_t>(index % GROW_STEP)};
+}
+
+void QueryPool::Allocate(std::size_t begin, std::size_t end) {
+    usage.resize(end);
+
+    const auto dev = device->GetLogical();
+    const u32 size = static_cast<u32>(end - begin);
+    const vk::QueryPoolCreateInfo query_pool_ci({}, GetTarget(type), size, {});
+    pools.push_back(dev.createQueryPoolUnique(query_pool_ci, nullptr, device->GetDispatchLoader()));
+}
+
+void QueryPool::Reserve(std::pair<vk::QueryPool, std::uint32_t> query) {
+    const auto it =
+        std::find_if(std::begin(pools), std::end(pools),
+                     [query_pool = query.first](auto& pool) { return query_pool == *pool; });
+    ASSERT(it != std::end(pools));
+
+    const std::ptrdiff_t pool_index = std::distance(std::begin(pools), it);
+    usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false;
+}
+
+VKQueryCache::VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                           const VKDevice& device, VKScheduler& scheduler)
+    : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter,
+                                  QueryPool>{system, rasterizer},
+      device{device}, scheduler{scheduler} {
+    for (std::size_t i = 0; i < static_cast<std::size_t>(VideoCore::NumQueryTypes); ++i) {
+        query_pools[i].Initialize(device, static_cast<VideoCore::QueryType>(i));
+    }
+}
+
+VKQueryCache::~VKQueryCache() = default;
+
+std::pair<vk::QueryPool, std::uint32_t> VKQueryCache::AllocateQuery(VideoCore::QueryType type) {
+    return query_pools[static_cast<std::size_t>(type)].Commit(scheduler.GetFence());
+}
+
+void VKQueryCache::Reserve(VideoCore::QueryType type,
+                           std::pair<vk::QueryPool, std::uint32_t> query) {
+    query_pools[static_cast<std::size_t>(type)].Reserve(query);
+}
+
+HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type)
+    : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache},
+      type{type}, query{cache.AllocateQuery(type)}, ticks{cache.Scheduler().Ticks()} {
+    const auto dev = cache.Device().GetLogical();
+    cache.Scheduler().Record([dev, query = query](vk::CommandBuffer cmdbuf, auto& dld) {
+        dev.resetQueryPoolEXT(query.first, query.second, 1, dld);
+        cmdbuf.beginQuery(query.first, query.second, vk::QueryControlFlagBits::ePrecise, dld);
+    });
+}
+
+HostCounter::~HostCounter() {
+    cache.Reserve(type, query);
+}
+
+void HostCounter::EndQuery() {
+    cache.Scheduler().Record([query = query](auto cmdbuf, auto& dld) {
+        cmdbuf.endQuery(query.first, query.second, dld);
+    });
+}
+
+u64 HostCounter::BlockingQuery() const {
+    if (ticks >= cache.Scheduler().Ticks()) {
+        cache.Scheduler().Flush();
+    }
+
+    const auto dev = cache.Device().GetLogical();
+    const auto& dld = cache.Device().GetDispatchLoader();
+    u64 value;
+    dev.getQueryPoolResults(query.first, query.second, 1, sizeof(value), &value, sizeof(value),
+                            vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait, dld);
+    return value;
+}
+
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h
new file mode 100644
index 000000000..c3092ee96
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_query_cache.h
@@ -0,0 +1,104 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "common/common_types.h"
+#include "video_core/query_cache.h"
+#include "video_core/renderer_vulkan/declarations.h"
+#include "video_core/renderer_vulkan/vk_resource_manager.h"
+
+namespace VideoCore {
+class RasterizerInterface;
+}
+
+namespace Vulkan {
+
+class CachedQuery;
+class HostCounter;
+class VKDevice;
+class VKQueryCache;
+class VKScheduler;
+
+using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>;
+
+class QueryPool final : public VKFencedPool {
+public:
+    explicit QueryPool();
+    ~QueryPool() override;
+
+    void Initialize(const VKDevice& device, VideoCore::QueryType type);
+
+    std::pair<vk::QueryPool, std::uint32_t> Commit(VKFence& fence);
+
+    void Reserve(std::pair<vk::QueryPool, std::uint32_t> query);
+
+protected:
+    void Allocate(std::size_t begin, std::size_t end) override;
+
+private:
+    static constexpr std::size_t GROW_STEP = 512;
+
+    const VKDevice* device = nullptr;
+    VideoCore::QueryType type = {};
+
+    std::vector<UniqueQueryPool> pools;
+    std::vector<bool> usage;
+};
+
+class VKQueryCache final
+    : public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter,
+                                         QueryPool> {
+public:
+    explicit VKQueryCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                          const VKDevice& device, VKScheduler& scheduler);
+    ~VKQueryCache();
+
+    std::pair<vk::QueryPool, std::uint32_t> AllocateQuery(VideoCore::QueryType type);
+
+    void Reserve(VideoCore::QueryType type, std::pair<vk::QueryPool, std::uint32_t> query);
+
+    const VKDevice& Device() const noexcept {
+        return device;
+    }
+
+    VKScheduler& Scheduler() const noexcept {
+        return scheduler;
+    }
+
+private:
+    const VKDevice& device;
+    VKScheduler& scheduler;
+};
+
+class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> {
+public:
+    explicit HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency,
+                         VideoCore::QueryType type);
+    ~HostCounter();
+
+    void EndQuery();
+
+private:
+    u64 BlockingQuery() const override;
+
+    VKQueryCache& cache;
+    const VideoCore::QueryType type;
+    const std::pair<vk::QueryPool, std::uint32_t> query;
+    const u64 ticks;
+};
+
+class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> {
+public:
+    explicit CachedQuery(VKQueryCache&, VideoCore::QueryType, VAddr cpu_addr, u8* host_ptr)
+        : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr} {}
+};
+
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index aada38702..3bf86da87 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -289,25 +289,19 @@ RasterizerVulkan::RasterizerVulkan(Core::System& system, Core::Frontend::EmuWind
                     staging_pool),
       pipeline_cache(system, *this, device, scheduler, descriptor_pool, update_descriptor_queue),
       buffer_cache(*this, system, device, memory_manager, scheduler, staging_pool),
-      sampler_cache(device) {}
-
-RasterizerVulkan::~RasterizerVulkan() = default;
-
-bool RasterizerVulkan::DrawBatch(bool is_indexed) {
-    Draw(is_indexed, false);
-    return true;
+      sampler_cache(device), query_cache(system, *this, device, scheduler) {
+    scheduler.SetQueryCache(query_cache);
 }
 
-bool RasterizerVulkan::DrawMultiBatch(bool is_indexed) {
-    Draw(is_indexed, true);
-    return true;
-}
+RasterizerVulkan::~RasterizerVulkan() = default;
 
 void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
     MICROPROFILE_SCOPE(Vulkan_Drawing);
 
     FlushWork();
 
+    query_cache.UpdateCounters();
+
     const auto& gpu = system.GPU().Maxwell3D();
     GraphicsPipelineCacheKey key{GetFixedPipelineState(gpu.regs)};
 
@@ -362,6 +356,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
 void RasterizerVulkan::Clear() {
     MICROPROFILE_SCOPE(Vulkan_Clearing);
 
+    query_cache.UpdateCounters();
+
     const auto& gpu = system.GPU().Maxwell3D();
     if (!system.GPU().Maxwell3D().ShouldExecute()) {
         return;
@@ -429,6 +425,8 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
     sampled_views.clear();
     image_views.clear();
 
+    query_cache.UpdateCounters();
+
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
     const ComputePipelineCacheKey key{
         code_addr,
@@ -471,17 +469,28 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
     });
 }
 
+void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) {
+    query_cache.ResetCounter(type);
+}
+
+void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
+                             std::optional<u64> timestamp) {
+    query_cache.Query(gpu_addr, type, timestamp);
+}
+
 void RasterizerVulkan::FlushAll() {}
 
 void RasterizerVulkan::FlushRegion(CacheAddr addr, u64 size) {
     texture_cache.FlushRegion(addr, size);
     buffer_cache.FlushRegion(addr, size);
+    query_cache.FlushRegion(addr, size);
 }
 
 void RasterizerVulkan::InvalidateRegion(CacheAddr addr, u64 size) {
     texture_cache.InvalidateRegion(addr, size);
     pipeline_cache.InvalidateRegion(addr, size);
     buffer_cache.InvalidateRegion(addr, size);
+    query_cache.InvalidateRegion(addr, size);
 }
 
 void RasterizerVulkan::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
@@ -602,33 +611,34 @@ bool RasterizerVulkan::WalkAttachmentOverlaps(const CachedSurfaceView& attachmen
 std::tuple<vk::Framebuffer, vk::Extent2D> RasterizerVulkan::ConfigureFramebuffers(
     vk::RenderPass renderpass) {
     FramebufferCacheKey key{renderpass, std::numeric_limits<u32>::max(),
-                            std::numeric_limits<u32>::max()};
+                            std::numeric_limits<u32>::max(), std::numeric_limits<u32>::max()};
 
-    const auto MarkAsModifiedAndPush = [&](const View& view) {
-        if (view == nullptr) {
+    const auto try_push = [&](const View& view) {
+        if (!view) {
             return false;
         }
         key.views.push_back(view->GetHandle());
         key.width = std::min(key.width, view->GetWidth());
         key.height = std::min(key.height, view->GetHeight());
+        key.layers = std::min(key.layers, view->GetNumLayers());
         return true;
     };
 
     for (std::size_t index = 0; index < std::size(color_attachments); ++index) {
-        if (MarkAsModifiedAndPush(color_attachments[index])) {
+        if (try_push(color_attachments[index])) {
             texture_cache.MarkColorBufferInUse(index);
         }
     }
-    if (MarkAsModifiedAndPush(zeta_attachment)) {
+    if (try_push(zeta_attachment)) {
         texture_cache.MarkDepthBufferInUse();
     }
 
     const auto [fbentry, is_cache_miss] = framebuffer_cache.try_emplace(key);
     auto& framebuffer = fbentry->second;
     if (is_cache_miss) {
-        const vk::FramebufferCreateInfo framebuffer_ci({}, key.renderpass,
-                                                       static_cast<u32>(key.views.size()),
-                                                       key.views.data(), key.width, key.height, 1);
+        const vk::FramebufferCreateInfo framebuffer_ci(
+            {}, key.renderpass, static_cast<u32>(key.views.size()), key.views.data(), key.width,
+            key.height, key.layers);
         const auto dev = device.GetLogical();
         const auto& dld = device.GetDispatchLoader();
         framebuffer = dev.createFramebufferUnique(framebuffer_ci, nullptr, dld);
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 7be71e734..4dc8af6e8 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -24,6 +24,7 @@
 #include "video_core/renderer_vulkan/vk_descriptor_pool.h"
 #include "video_core/renderer_vulkan/vk_memory_manager.h"
 #include "video_core/renderer_vulkan/vk_pipeline_cache.h"
+#include "video_core/renderer_vulkan/vk_query_cache.h"
 #include "video_core/renderer_vulkan/vk_renderpass_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
 #include "video_core/renderer_vulkan/vk_sampler_cache.h"
@@ -55,6 +56,7 @@ struct FramebufferCacheKey {
     vk::RenderPass renderpass{};
     u32 width = 0;
     u32 height = 0;
+    u32 layers = 0;
     ImageViewsPack views;
 
     std::size_t Hash() const noexcept {
@@ -65,12 +67,17 @@ struct FramebufferCacheKey {
         }
         boost::hash_combine(hash, width);
         boost::hash_combine(hash, height);
+        boost::hash_combine(hash, layers);
         return hash;
     }
 
     bool operator==(const FramebufferCacheKey& rhs) const noexcept {
-        return std::tie(renderpass, views, width, height) ==
-               std::tie(rhs.renderpass, rhs.views, rhs.width, rhs.height);
+        return std::tie(renderpass, views, width, height, layers) ==
+               std::tie(rhs.renderpass, rhs.views, rhs.width, rhs.height, rhs.layers);
+    }
+
+    bool operator!=(const FramebufferCacheKey& rhs) const noexcept {
+        return !operator==(rhs);
     }
 };
 
@@ -96,7 +103,7 @@ struct ImageView {
     vk::ImageLayout* layout = nullptr;
 };
 
-class RasterizerVulkan : public VideoCore::RasterizerAccelerated {
+class RasterizerVulkan final : public VideoCore::RasterizerAccelerated {
 public:
     explicit RasterizerVulkan(Core::System& system, Core::Frontend::EmuWindow& render_window,
                               VKScreenInfo& screen_info, const VKDevice& device,
@@ -104,10 +111,11 @@ public:
                               VKScheduler& scheduler);
     ~RasterizerVulkan() override;
 
-    bool DrawBatch(bool is_indexed) override;
-    bool DrawMultiBatch(bool is_indexed) override;
+    void Draw(bool is_indexed, bool is_instanced) override;
     void Clear() override;
     void DispatchCompute(GPUVAddr code_addr) override;
+    void ResetCounter(VideoCore::QueryType type) override;
+    void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
     void FlushAll() override;
     void FlushRegion(CacheAddr addr, u64 size) override;
     void InvalidateRegion(CacheAddr addr, u64 size) override;
@@ -140,8 +148,6 @@ private:
 
     static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8;
 
-    void Draw(bool is_indexed, bool is_instanced);
-
     void FlushWork();
 
     Texceptions UpdateAttachments();
@@ -247,6 +253,7 @@ private:
     VKPipelineCache pipeline_cache;
     VKBufferCache buffer_cache;
     VKSamplerCache sampler_cache;
+    VKQueryCache query_cache;
 
     std::array<View, Maxwell::NumRenderTargets> color_attachments;
     View zeta_attachment;
diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
index 0a8ec8398..204b7c39c 100644
--- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
@@ -23,7 +23,14 @@ static std::optional<vk::BorderColor> TryConvertBorderColor(std::array<float, 4>
     } else if (color == std::array<float, 4>{1, 1, 1, 1}) {
         return vk::BorderColor::eFloatOpaqueWhite;
     } else {
-        return {};
+        if (color[0] + color[1] + color[2] > 1.35f) {
+            // If color elements are brighter than roughly 0.5 average, use white border
+            return vk::BorderColor::eFloatOpaqueWhite;
+        }
+        if (color[3] > 0.5f) {
+            return vk::BorderColor::eFloatOpaqueBlack;
+        }
+        return vk::BorderColor::eFloatTransparentBlack;
     }
 }
 
@@ -37,8 +44,6 @@ UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc)
 
     const auto border_color{tsc.GetBorderColor()};
     const auto vk_border_color{TryConvertBorderColor(border_color)};
-    UNIMPLEMENTED_IF_MSG(!vk_border_color, "Unimplemented border color {} {} {} {}",
-                         border_color[0], border_color[1], border_color[2], border_color[3]);
 
     constexpr bool unnormalized_coords{false};
 
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index d66133ad1..92bd6c344 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -6,6 +6,7 @@
 #include "common/microprofile.h"
 #include "video_core/renderer_vulkan/declarations.h"
 #include "video_core/renderer_vulkan/vk_device.h"
+#include "video_core/renderer_vulkan/vk_query_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
 
@@ -139,6 +140,8 @@ void VKScheduler::SubmitExecution(vk::Semaphore semaphore) {
 }
 
 void VKScheduler::AllocateNewContext() {
+    ++ticks;
+
     std::unique_lock lock{mutex};
     current_fence = next_fence;
     next_fence = &resource_manager.CommitFence();
@@ -146,6 +149,10 @@ void VKScheduler::AllocateNewContext() {
     current_cmdbuf = resource_manager.CommitCommandBuffer(*current_fence);
     current_cmdbuf.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit},
                          device.GetDispatchLoader());
+    // Enable counters once again. These are disabled when a command buffer is finished.
+    if (query_cache) {
+        query_cache->UpdateCounters();
+    }
 }
 
 void VKScheduler::InvalidateState() {
@@ -159,6 +166,7 @@ void VKScheduler::InvalidateState() {
 }
 
 void VKScheduler::EndPendingOperations() {
+    query_cache->DisableStreams();
     EndRenderPass();
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h
index bcdffbba0..62fd7858b 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <atomic>
 #include <condition_variable>
 #include <memory>
 #include <optional>
@@ -18,6 +19,7 @@ namespace Vulkan {
 
 class VKDevice;
 class VKFence;
+class VKQueryCache;
 class VKResourceManager;
 
 class VKFenceView {
@@ -67,6 +69,11 @@ public:
     /// Binds a pipeline to the current execution context.
     void BindGraphicsPipeline(vk::Pipeline pipeline);
 
+    /// Assigns the query cache.
+    void SetQueryCache(VKQueryCache& query_cache_) {
+        query_cache = &query_cache_;
+    }
+
     /// Returns true when viewports have been set in the current command buffer.
     bool TouchViewports() {
         return std::exchange(state.viewports, true);
@@ -112,6 +119,11 @@ public:
         return current_fence;
     }
 
+    /// Returns the current command buffer tick.
+    u64 Ticks() const {
+        return ticks;
+    }
+
 private:
     class Command {
     public:
@@ -205,6 +217,8 @@ private:
 
     const VKDevice& device;
     VKResourceManager& resource_manager;
+    VKQueryCache* query_cache = nullptr;
+
     vk::CommandBuffer current_cmdbuf;
     VKFence* current_fence = nullptr;
     VKFence* next_fence = nullptr;
@@ -227,6 +241,7 @@ private:
     Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve;
     std::mutex mutex;
     std::condition_variable cv;
+    std::atomic<u64> ticks = 0;
     bool quit = false;
 };
 
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 24a658dce..2da622d15 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -86,6 +86,7 @@ struct AttributeType {
 
 struct VertexIndices {
     std::optional<u32> position;
+    std::optional<u32> layer;
     std::optional<u32> viewport;
     std::optional<u32> point_size;
     std::optional<u32> clip_distances;
@@ -275,21 +276,29 @@ public:
         AddCapability(spv::Capability::ImageGatherExtended);
         AddCapability(spv::Capability::SampledBuffer);
         AddCapability(spv::Capability::StorageImageWriteWithoutFormat);
+        AddCapability(spv::Capability::DrawParameters);
         AddCapability(spv::Capability::SubgroupBallotKHR);
         AddCapability(spv::Capability::SubgroupVoteKHR);
         AddExtension("SPV_KHR_shader_ballot");
         AddExtension("SPV_KHR_subgroup_vote");
         AddExtension("SPV_KHR_storage_buffer_storage_class");
         AddExtension("SPV_KHR_variable_pointers");
+        AddExtension("SPV_KHR_shader_draw_parameters");
 
-        if (ir.UsesViewportIndex()) {
-            AddCapability(spv::Capability::MultiViewport);
-            if (device.IsExtShaderViewportIndexLayerSupported()) {
+        if (ir.UsesLayer() || ir.UsesViewportIndex()) {
+            if (ir.UsesViewportIndex()) {
+                AddCapability(spv::Capability::MultiViewport);
+            }
+            if (stage != ShaderType::Geometry && device.IsExtShaderViewportIndexLayerSupported()) {
                 AddExtension("SPV_EXT_shader_viewport_index_layer");
                 AddCapability(spv::Capability::ShaderViewportIndexLayerEXT);
             }
         }
 
+        if (device.IsShaderStorageImageReadWithoutFormatSupported()) {
+            AddCapability(spv::Capability::StorageImageReadWithoutFormat);
+        }
+
         if (device.IsFloat16Supported()) {
             AddCapability(spv::Capability::Float16);
         }
@@ -492,9 +501,11 @@ private:
         interfaces.push_back(AddGlobalVariable(Name(out_vertex, "out_vertex")));
 
         // Declare input attributes
-        vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_uint, "vertex_index");
+        vertex_index = DeclareInputBuiltIn(spv::BuiltIn::VertexIndex, t_in_int, "vertex_index");
         instance_index =
-            DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_uint, "instance_index");
+            DeclareInputBuiltIn(spv::BuiltIn::InstanceIndex, t_in_int, "instance_index");
+        base_vertex = DeclareInputBuiltIn(spv::BuiltIn::BaseVertex, t_in_int, "base_vertex");
+        base_instance = DeclareInputBuiltIn(spv::BuiltIn::BaseInstance, t_in_int, "base_instance");
     }
 
     void DeclareTessControl() {
@@ -920,13 +931,22 @@ private:
         VertexIndices indices;
         indices.position = AddBuiltIn(t_float4, spv::BuiltIn::Position, "position");
 
+        if (ir.UsesLayer()) {
+            if (stage != ShaderType::Vertex || device.IsExtShaderViewportIndexLayerSupported()) {
+                indices.layer = AddBuiltIn(t_int, spv::BuiltIn::Layer, "layer");
+            } else {
+                LOG_ERROR(
+                    Render_Vulkan,
+                    "Shader requires Layer but it's not supported on this stage with this device.");
+            }
+        }
+
         if (ir.UsesViewportIndex()) {
             if (stage != ShaderType::Vertex || device.IsExtShaderViewportIndexLayerSupported()) {
                 indices.viewport = AddBuiltIn(t_int, spv::BuiltIn::ViewportIndex, "viewport_index");
             } else {
-                LOG_ERROR(Render_Vulkan,
-                          "Shader requires ViewportIndex but it's not supported on this "
-                          "stage with this device.");
+                LOG_ERROR(Render_Vulkan, "Shader requires ViewportIndex but it's not supported on "
+                                         "this stage with this device.");
             }
         }
 
@@ -1068,9 +1088,12 @@ private:
                     return {OpLoad(t_float, AccessElement(t_in_float, tess_coord, element)),
                             Type::Float};
                 case 2:
-                    return {OpLoad(t_uint, instance_index), Type::Uint};
+                    return {
+                        OpISub(t_int, OpLoad(t_int, instance_index), OpLoad(t_int, base_instance)),
+                        Type::Int};
                 case 3:
-                    return {OpLoad(t_uint, vertex_index), Type::Uint};
+                    return {OpISub(t_int, OpLoad(t_int, vertex_index), OpLoad(t_int, base_vertex)),
+                            Type::Int};
                 }
                 UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
                 return {Constant(t_uint, 0U), Type::Uint};
@@ -1285,6 +1308,13 @@ private:
                 }
                 case Attribute::Index::LayerViewportPointSize:
                     switch (element) {
+                    case 1: {
+                        if (!out_indices.layer) {
+                            return {};
+                        }
+                        const u32 index = out_indices.layer.value();
+                        return {AccessElement(t_out_int, out_vertex, index), Type::Int};
+                    }
                     case 2: {
                         if (!out_indices.viewport) {
                             return {};
@@ -1355,6 +1385,11 @@ private:
             UNIMPLEMENTED();
         }
 
+        if (!target.id) {
+            // On failure we return a nullptr target.id, skip these stores.
+            return {};
+        }
+
         OpStore(target.id, As(Visit(src), target.type));
         return {};
     }
@@ -1748,8 +1783,16 @@ private:
     }
 
     Expression ImageLoad(Operation operation) {
-        UNIMPLEMENTED();
-        return {};
+        if (!device.IsShaderStorageImageReadWithoutFormatSupported()) {
+            return {v_float_zero, Type::Float};
+        }
+
+        const auto& meta{std::get<MetaImage>(operation.GetMeta())};
+
+        const Id coords = GetCoordinates(operation, Type::Int);
+        const Id texel = OpImageRead(t_uint4, GetImage(operation), coords);
+
+        return {OpCompositeExtract(t_uint, texel, meta.element), Type::Uint};
     }
 
     Expression ImageStore(Operation operation) {
@@ -2542,6 +2585,8 @@ private:
 
     Id instance_index{};
     Id vertex_index{};
+    Id base_instance{};
+    Id base_vertex{};
     std::array<Id, Maxwell::NumRenderTargets> frag_colors{};
     Id frag_depth{};
     Id frag_coord{};
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index d3edbe80c..22e3d34de 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -151,6 +151,10 @@ public:
         return params.GetMipHeight(base_level);
     }
 
+    u32 GetNumLayers() const {
+        return num_layers;
+    }
+
     bool IsBufferView() const {
         return buffer_view;
     }
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index 0eeb75559..6ead42070 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -83,14 +83,14 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
 
         const bool input_signed = instr.conversion.is_input_signed;
 
-        if (instr.conversion.src_size == Register::Size::Byte) {
-            const u32 offset = static_cast<u32>(instr.conversion.int_src.selector) * 8;
-            if (offset > 0) {
-                value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed,
-                                        std::move(value), Immediate(offset));
+        if (const u32 offset = static_cast<u32>(instr.conversion.int_src.selector); offset > 0) {
+            ASSERT(instr.conversion.src_size == Register::Size::Byte ||
+                   instr.conversion.src_size == Register::Size::Short);
+            if (instr.conversion.src_size == Register::Size::Short) {
+                ASSERT(offset == 0 || offset == 2);
             }
-        } else {
-            UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
+            value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed,
+                                    std::move(value), Immediate(offset * 8));
         }
 
         value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed);
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 351c8c2f1..bee7d8cad 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -522,68 +522,53 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
                                Node array, Node depth_compare, u32 bias_offset,
                                std::vector<Node> aoffi,
                                std::optional<Tegra::Shader::Register> bindless_reg) {
-    const auto is_array = static_cast<bool>(array);
-    const auto is_shadow = static_cast<bool>(depth_compare);
+    const bool is_array = array != nullptr;
+    const bool is_shadow = depth_compare != nullptr;
     const bool is_bindless = bindless_reg.has_value();
 
-    UNIMPLEMENTED_IF_MSG((texture_type == TextureType::Texture3D && (is_array || is_shadow)) ||
-                             (texture_type == TextureType::TextureCube && is_array && is_shadow),
-                         "This method is not supported.");
+    UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow);
+    ASSERT_MSG(texture_type != TextureType::Texture3D || !is_array || !is_shadow,
+               "Illegal texture type");
 
     const SamplerInfo info{texture_type, is_array, is_shadow, false};
-    Node index_var{};
+    Node index_var;
     const Sampler* sampler = is_bindless ? GetBindlessSampler(*bindless_reg, index_var, info)
                                          : GetSampler(instr.sampler, info);
-    Node4 values;
-    if (sampler == nullptr) {
-        for (u32 element = 0; element < values.size(); ++element) {
-            values[element] = Immediate(0);
-        }
-        return values;
+    if (!sampler) {
+        return {Immediate(0), Immediate(0), Immediate(0), Immediate(0)};
     }
 
     const bool lod_needed = process_mode == TextureProcessMode::LZ ||
                             process_mode == TextureProcessMode::LL ||
                             process_mode == TextureProcessMode::LLA;
-
-    // LOD selection (either via bias or explicit textureLod) not supported in GL for
-    // sampler2DArrayShadow and samplerCubeArrayShadow.
-    const bool gl_lod_supported =
-        !((texture_type == Tegra::Shader::TextureType::Texture2D && is_array && is_shadow) ||
-          (texture_type == Tegra::Shader::TextureType::TextureCube && is_array && is_shadow));
-
-    const OperationCode read_method =
-        (lod_needed && gl_lod_supported) ? OperationCode::TextureLod : OperationCode::Texture;
-
-    UNIMPLEMENTED_IF(process_mode != TextureProcessMode::None && !gl_lod_supported);
+    const OperationCode opcode = lod_needed ? OperationCode::TextureLod : OperationCode::Texture;
 
     Node bias;
     Node lod;
-    if (process_mode != TextureProcessMode::None && gl_lod_supported) {
-        switch (process_mode) {
-        case TextureProcessMode::LZ:
-            lod = Immediate(0.0f);
-            break;
-        case TextureProcessMode::LB:
-            // If present, lod or bias are always stored in the register
-            // indexed by the gpr20 field with an offset depending on the
-            // usage of the other registers
-            bias = GetRegister(instr.gpr20.Value() + bias_offset);
-            break;
-        case TextureProcessMode::LL:
-            lod = GetRegister(instr.gpr20.Value() + bias_offset);
-            break;
-        default:
-            UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode));
-            break;
-        }
+    switch (process_mode) {
+    case TextureProcessMode::None:
+        break;
+    case TextureProcessMode::LZ:
+        lod = Immediate(0.0f);
+        break;
+    case TextureProcessMode::LB:
+        // If present, lod or bias are always stored in the register indexed by the gpr20 field with
+        // an offset depending on the usage of the other registers.
+        bias = GetRegister(instr.gpr20.Value() + bias_offset);
+        break;
+    case TextureProcessMode::LL:
+        lod = GetRegister(instr.gpr20.Value() + bias_offset);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode));
+        break;
     }
 
+    Node4 values;
     for (u32 element = 0; element < values.size(); ++element) {
-        auto copy_coords = coords;
         MetaTexture meta{*sampler, array, depth_compare, aoffi,    {}, {}, bias,
                          lod,      {},    element,       index_var};
-        values[element] = Operation(read_method, meta, std::move(copy_coords));
+        values[element] = Operation(opcode, meta, coords);
     }
 
     return values;
diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp
index 1655ccf16..9707c353d 100644
--- a/src/video_core/surface.cpp
+++ b/src/video_core/surface.cpp
@@ -155,6 +155,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format)
         return PixelFormat::R16I;
     case Tegra::RenderTargetFormat::R32_FLOAT:
         return PixelFormat::R32F;
+    case Tegra::RenderTargetFormat::R32_SINT:
+        return PixelFormat::R32I;
     case Tegra::RenderTargetFormat::R32_UINT:
         return PixelFormat::R32UI;
     case Tegra::RenderTargetFormat::RG32_UINT:
diff --git a/src/video_core/surface.h b/src/video_core/surface.h
index 0d17a93ed..d88109e5a 100644
--- a/src/video_core/surface.h
+++ b/src/video_core/surface.h
@@ -59,47 +59,48 @@ enum class PixelFormat {
     RG32UI = 41,
     RGBX16F = 42,
     R32UI = 43,
-    ASTC_2D_8X8 = 44,
-    ASTC_2D_8X5 = 45,
-    ASTC_2D_5X4 = 46,
-    BGRA8_SRGB = 47,
-    DXT1_SRGB = 48,
-    DXT23_SRGB = 49,
-    DXT45_SRGB = 50,
-    BC7U_SRGB = 51,
-    R4G4B4A4U = 52,
-    ASTC_2D_4X4_SRGB = 53,
-    ASTC_2D_8X8_SRGB = 54,
-    ASTC_2D_8X5_SRGB = 55,
-    ASTC_2D_5X4_SRGB = 56,
-    ASTC_2D_5X5 = 57,
-    ASTC_2D_5X5_SRGB = 58,
-    ASTC_2D_10X8 = 59,
-    ASTC_2D_10X8_SRGB = 60,
-    ASTC_2D_6X6 = 61,
-    ASTC_2D_6X6_SRGB = 62,
-    ASTC_2D_10X10 = 63,
-    ASTC_2D_10X10_SRGB = 64,
-    ASTC_2D_12X12 = 65,
-    ASTC_2D_12X12_SRGB = 66,
-    ASTC_2D_8X6 = 67,
-    ASTC_2D_8X6_SRGB = 68,
-    ASTC_2D_6X5 = 69,
-    ASTC_2D_6X5_SRGB = 70,
-    E5B9G9R9F = 71,
+    R32I = 44,
+    ASTC_2D_8X8 = 45,
+    ASTC_2D_8X5 = 46,
+    ASTC_2D_5X4 = 47,
+    BGRA8_SRGB = 48,
+    DXT1_SRGB = 49,
+    DXT23_SRGB = 50,
+    DXT45_SRGB = 51,
+    BC7U_SRGB = 52,
+    R4G4B4A4U = 53,
+    ASTC_2D_4X4_SRGB = 54,
+    ASTC_2D_8X8_SRGB = 55,
+    ASTC_2D_8X5_SRGB = 56,
+    ASTC_2D_5X4_SRGB = 57,
+    ASTC_2D_5X5 = 58,
+    ASTC_2D_5X5_SRGB = 59,
+    ASTC_2D_10X8 = 60,
+    ASTC_2D_10X8_SRGB = 61,
+    ASTC_2D_6X6 = 62,
+    ASTC_2D_6X6_SRGB = 63,
+    ASTC_2D_10X10 = 64,
+    ASTC_2D_10X10_SRGB = 65,
+    ASTC_2D_12X12 = 66,
+    ASTC_2D_12X12_SRGB = 67,
+    ASTC_2D_8X6 = 68,
+    ASTC_2D_8X6_SRGB = 69,
+    ASTC_2D_6X5 = 70,
+    ASTC_2D_6X5_SRGB = 71,
+    E5B9G9R9F = 72,
 
     MaxColorFormat,
 
     // Depth formats
-    Z32F = 72,
-    Z16 = 73,
+    Z32F = 73,
+    Z16 = 74,
 
     MaxDepthFormat,
 
     // DepthStencil formats
-    Z24S8 = 74,
-    S8Z24 = 75,
-    Z32FS8 = 76,
+    Z24S8 = 75,
+    S8Z24 = 76,
+    Z32FS8 = 77,
 
     MaxDepthStencilFormat,
 
@@ -171,6 +172,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{
     0, // RG32UI
     0, // RGBX16F
     0, // R32UI
+    0, // R32I
     2, // ASTC_2D_8X8
     2, // ASTC_2D_8X5
     2, // ASTC_2D_5X4
@@ -267,6 +269,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{
     1,  // RG32UI
     1,  // RGBX16F
     1,  // R32UI
+    1,  // R32I
     8,  // ASTC_2D_8X8
     8,  // ASTC_2D_8X5
     5,  // ASTC_2D_5X4
@@ -355,6 +358,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{
     1,  // RG32UI
     1,  // RGBX16F
     1,  // R32UI
+    1,  // R32I
     8,  // ASTC_2D_8X8
     5,  // ASTC_2D_8X5
     4,  // ASTC_2D_5X4
@@ -443,6 +447,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{
     64,  // RG32UI
     64,  // RGBX16F
     32,  // R32UI
+    32,  // R32I
     128, // ASTC_2D_8X8
     128, // ASTC_2D_8X5
     128, // ASTC_2D_5X4
@@ -546,6 +551,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table
     SurfaceCompression::None,       // RG32UI
     SurfaceCompression::None,       // RGBX16F
     SurfaceCompression::None,       // R32UI
+    SurfaceCompression::None,       // R32I
     SurfaceCompression::Converted,  // ASTC_2D_8X8
     SurfaceCompression::Converted,  // ASTC_2D_8X5
     SurfaceCompression::Converted,  // ASTC_2D_5X4
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp
index 81fb9f633..cc3ad8417 100644
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -41,7 +41,7 @@ struct Table {
     ComponentType alpha_component;
     bool is_srgb;
 };
-constexpr std::array<Table, 74> DefinitionTable = {{
+constexpr std::array<Table, 75> DefinitionTable = {{
     {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U},
     {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S},
     {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI},
@@ -89,6 +89,7 @@ constexpr std::array<Table, 74> DefinitionTable = {{
 
     {TextureFormat::R32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32F},
     {TextureFormat::R32, C, UINT, UINT, UINT, UINT, PixelFormat::R32UI},
+    {TextureFormat::R32, C, SINT, SINT, SINT, SINT, PixelFormat::R32I},
 
     {TextureFormat::E5B9G9R9_SHAREDEXP, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::E5B9G9R9F},
 
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 84469b7ba..002df414f 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -277,6 +277,10 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,
             SwizzleFunc(MortonSwizzleMode::LinearToMorton, host_ptr, params,
                         staging_buffer.data() + host_offset, level);
         }
+    } else if (params.IsBuffer()) {
+        // Buffers don't have pitch or any fancy layout property. We can just memcpy them to guest
+        // memory.
+        std::memcpy(host_ptr, staging_buffer.data(), guest_memory_size);
     } else {
         ASSERT(params.target == SurfaceTarget::Texture2D);
         ASSERT(params.num_levels == 1);
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index 38b3a4ba8..f00839313 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -84,19 +84,16 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta
     if (entry.IsShadow() && params.type == SurfaceType::ColorTexture) {
         switch (params.pixel_format) {
         case PixelFormat::R16U:
-        case PixelFormat::R16F: {
+        case PixelFormat::R16F:
             params.pixel_format = PixelFormat::Z16;
             break;
-        }
-        case PixelFormat::R32F: {
+        case PixelFormat::R32F:
             params.pixel_format = PixelFormat::Z32F;
             break;
-        }
-        default: {
+        default:
             UNIMPLEMENTED_MSG("Unimplemented shadow convert format: {}",
                               static_cast<u32>(params.pixel_format));
         }
-        }
         params.type = GetFormatType(params.pixel_format);
     }
     params.type = GetFormatType(params.pixel_format);
@@ -168,27 +165,29 @@ SurfaceParams SurfaceParams::CreateForImage(const FormatLookupTable& lookup_tabl
     return params;
 }
 
-SurfaceParams SurfaceParams::CreateForDepthBuffer(
-    Core::System& system, u32 zeta_width, u32 zeta_height, Tegra::DepthFormat format,
-    u32 block_width, u32 block_height, u32 block_depth,
-    Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type) {
+SurfaceParams SurfaceParams::CreateForDepthBuffer(Core::System& system) {
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    regs.zeta_width, regs.zeta_height, regs.zeta.format, regs.zeta.memory_layout.type;
     SurfaceParams params;
-    params.is_tiled = type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear;
+    params.is_tiled = regs.zeta.memory_layout.type ==
+                      Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear;
     params.srgb_conversion = false;
-    params.block_width = std::min(block_width, 5U);
-    params.block_height = std::min(block_height, 5U);
-    params.block_depth = std::min(block_depth, 5U);
+    params.block_width = std::min(regs.zeta.memory_layout.block_width.Value(), 5U);
+    params.block_height = std::min(regs.zeta.memory_layout.block_height.Value(), 5U);
+    params.block_depth = std::min(regs.zeta.memory_layout.block_depth.Value(), 5U);
     params.tile_width_spacing = 1;
-    params.pixel_format = PixelFormatFromDepthFormat(format);
+    params.pixel_format = PixelFormatFromDepthFormat(regs.zeta.format);
     params.type = GetFormatType(params.pixel_format);
-    params.width = zeta_width;
-    params.height = zeta_height;
-    params.target = SurfaceTarget::Texture2D;
-    params.depth = 1;
+    params.width = regs.zeta_width;
+    params.height = regs.zeta_height;
     params.pitch = 0;
     params.num_levels = 1;
     params.emulated_levels = 1;
-    params.is_layered = false;
+
+    const bool is_layered = regs.zeta_layers > 1 && params.block_depth == 0;
+    params.is_layered = is_layered;
+    params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D;
+    params.depth = is_layered ? regs.zeta_layers.Value() : 1U;
     return params;
 }
 
@@ -214,11 +213,13 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz
         params.width = params.pitch / bpp;
     }
     params.height = config.height;
-    params.depth = 1;
-    params.target = SurfaceTarget::Texture2D;
     params.num_levels = 1;
     params.emulated_levels = 1;
-    params.is_layered = false;
+
+    const bool is_layered = config.layers > 1 && params.block_depth == 0;
+    params.is_layered = is_layered;
+    params.depth = is_layered ? config.layers.Value() : 1;
+    params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D;
     return params;
 }
 
diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h
index 9256fd6d9..995cc3818 100644
--- a/src/video_core/texture_cache/surface_params.h
+++ b/src/video_core/texture_cache/surface_params.h
@@ -35,10 +35,7 @@ public:
                                         const VideoCommon::Shader::Image& entry);
 
     /// Creates SurfaceCachedParams for a depth buffer configuration.
-    static SurfaceParams CreateForDepthBuffer(
-        Core::System& system, u32 zeta_width, u32 zeta_height, Tegra::DepthFormat format,
-        u32 block_width, u32 block_height, u32 block_depth,
-        Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type);
+    static SurfaceParams CreateForDepthBuffer(Core::System& system);
 
     /// Creates SurfaceCachedParams from a framebuffer configuration.
     static SurfaceParams CreateForFramebuffer(Core::System& system, std::size_t index);
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index f4c015635..c70e4aec2 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -160,10 +160,7 @@ public:
             SetEmptyDepthBuffer();
             return {};
         }
-        const auto depth_params{SurfaceParams::CreateForDepthBuffer(
-            system, regs.zeta_width, regs.zeta_height, regs.zeta.format,
-            regs.zeta.memory_layout.block_width, regs.zeta.memory_layout.block_height,
-            regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)};
+        const auto depth_params{SurfaceParams::CreateForDepthBuffer(system)};
         auto surface_view = GetSurface(gpu_addr, cache_addr, depth_params, preserve_contents, true);
         if (depth_buffer.target)
             depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
@@ -721,7 +718,6 @@ private:
     std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const CacheAddr cache_addr,
                                           const SurfaceParams& params, bool preserve_contents,
                                           bool is_render) {
-
         // Step 1
         // Check Level 1 Cache for a fast structural match. If candidate surface
         // matches at certain level we are pretty much done.
@@ -733,14 +729,18 @@ private:
                 return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
                                       topological_result);
             }
+
             const auto struct_result = current_surface->MatchesStructure(params);
-            if (struct_result != MatchStructureResult::None &&
-                (params.target != SurfaceTarget::Texture3D ||
-                 current_surface->MatchTarget(params.target))) {
-                if (struct_result == MatchStructureResult::FullMatch) {
-                    return ManageStructuralMatch(current_surface, params, is_render);
-                } else {
-                    return RebuildSurface(current_surface, params, is_render);
+            if (struct_result != MatchStructureResult::None) {
+                const auto& old_params = current_surface->GetSurfaceParams();
+                const bool not_3d = params.target != SurfaceTarget::Texture3D &&
+                                    old_params.target != SurfaceTarget::Texture3D;
+                if (not_3d || current_surface->MatchTarget(params.target)) {
+                    if (struct_result == MatchStructureResult::FullMatch) {
+                        return ManageStructuralMatch(current_surface, params, is_render);
+                    } else {
+                        return RebuildSurface(current_surface, params, is_render);
+                    }
                 }
             }
         }
diff --git a/src/web_service/web_backend.cpp b/src/web_service/web_backend.cpp
index 6683f459f..737ffe409 100644
--- a/src/web_service/web_backend.cpp
+++ b/src/web_service/web_backend.cpp
@@ -73,14 +73,12 @@ struct Client::Impl {
                 if (!parsedUrl.GetPort(&port)) {
                     port = HTTP_PORT;
                 }
-                cli = std::make_unique<httplib::Client>(parsedUrl.m_Host.c_str(), port,
-                                                        TIMEOUT_SECONDS);
+                cli = std::make_unique<httplib::Client>(parsedUrl.m_Host.c_str(), port);
             } else if (parsedUrl.m_Scheme == "https") {
                 if (!parsedUrl.GetPort(&port)) {
                     port = HTTPS_PORT;
                 }
-                cli = std::make_unique<httplib::SSLClient>(parsedUrl.m_Host.c_str(), port,
-                                                           TIMEOUT_SECONDS);
+                cli = std::make_unique<httplib::SSLClient>(parsedUrl.m_Host.c_str(), port);
             } else {
                 LOG_ERROR(WebService, "Bad URL scheme {}", parsedUrl.m_Scheme);
                 return Common::WebResult{Common::WebResult::Code::InvalidURL, "Bad URL scheme"};
@@ -90,6 +88,7 @@ struct Client::Impl {
             LOG_ERROR(WebService, "Invalid URL {}", host + path);
             return Common::WebResult{Common::WebResult::Code::InvalidURL, "Invalid URL"};
         }
+        cli->set_timeout_sec(TIMEOUT_SECONDS);
 
         httplib::Headers params;
         if (!jwt.empty()) {