1 files changed, 519 insertions, 100 deletions
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index cad7f902d..24c858104 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -15,6 +15,7 @@
 #include <vector>
 
 #include <boost/container/small_vector.hpp>
+#include <boost/icl/interval_set.hpp>
 
 #include "common/common_types.h"
 #include "common/div_ceil.h"
@@ -30,6 +31,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
+#include "video_core/surface.h"
 #include "video_core/texture_cache/slot_vector.h"
 #include "video_core/texture_cache/types.h"
 
@@ -41,14 +43,19 @@ MICROPROFILE_DECLARE(GPU_DownloadMemory);
 
 using BufferId = SlotId;
 
+using VideoCore::Surface::PixelFormat;
+using namespace Common::Literals;
+
 constexpr u32 NUM_VERTEX_BUFFERS = 32;
 constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4;
 constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18;
 constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8;
 constexpr u32 NUM_STORAGE_BUFFERS = 16;
+constexpr u32 NUM_TEXTURE_BUFFERS = 16;
 constexpr u32 NUM_STAGES = 5;
 
-using namespace Common::Literals;
+using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>;
+using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>;
 
 template <typename P>
 class BufferCache {
@@ -66,6 +73,7 @@ class BufferCache {
     static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX;
     static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX;
     static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS;
+    static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS;
 
     static constexpr BufferId NULL_BUFFER_ID{0};
 
@@ -77,6 +85,9 @@ class BufferCache {
     using Runtime = typename P::Runtime;
     using Buffer = typename P::Buffer;
 
+    using IntervalSet = boost::icl::interval_set<VAddr>;
+    using IntervalType = typename IntervalSet::interval_type;
+
     struct Empty {};
 
     struct OverlapResult {
@@ -92,6 +103,10 @@ class BufferCache {
         BufferId buffer_id;
     };
 
+    struct TextureBufferBinding : Binding {
+        PixelFormat format; // Pixel format kept alongside the inherited Binding fields
+    };
+
     static constexpr Binding NULL_BINDING{
         .cpu_addr = 0,
         .size = 0,
@@ -129,38 +144,63 @@ public:
 
     void BindHostComputeBuffers();
 
-    void SetEnabledUniformBuffers(size_t stage, u32 enabled);
+    void SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask,
+                                const UniformBufferSizes* sizes);
 
-    void SetEnabledComputeUniformBuffers(u32 enabled);
+    void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes);
 
     void UnbindGraphicsStorageBuffers(size_t stage);
 
     void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
                                    bool is_written);
 
+    void UnbindGraphicsTextureBuffers(size_t stage);
+
+    void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size,
+                                   PixelFormat format, bool is_written, bool is_image);
+
     void UnbindComputeStorageBuffers();
 
     void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset,
                                   bool is_written);
 
+    void UnbindComputeTextureBuffers();
+
+    void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format,
+                                  bool is_written, bool is_image);
+
     void FlushCachedWrites();
 
     /// Return true when there are uncommitted buffers to be downloaded
     [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
 
+    void AccumulateFlushes();
+
     /// Return true when the caller should wait for async downloads
     [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
 
     /// Commit asynchronous downloads
     void CommitAsyncFlushes();
+    void CommitAsyncFlushesHigh();
 
     /// Pop asynchronous downloads
     void PopAsyncFlushes();
 
+    bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount);
+
+    bool DMAClear(GPUVAddr dst_address, u64 amount, u32 value); // amount counts u32 words
+
     /// Return true when a CPU region is modified from the GPU
     [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
 
+    /// Return true when a region is registered on the cache
+    [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
+
+    /// Return true when a CPU region is modified from the CPU
+    [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size);
+
     std::mutex mutex;
+    Runtime& runtime;
 
 private:
     template <typename Func>
@@ -190,6 +230,36 @@ private:
         }
     }
 
+    /// Invokes func(begin, end) for every part of common_ranges that lies inside
+    /// [cpu_addr, cpu_addr + size), with each part clipped to that range
+    template <typename Func>
+    void ForEachWrittenRange(VAddr cpu_addr, u64 size, Func&& func) {
+        const VAddr start_address = cpu_addr;
+        const VAddr end_address = start_address + size;
+        // Clamp to zero on underflow; std::min picked 0 or a wrapped-around value here, so the
+        // lookup below never narrowed the scan
+        const VAddr search_base =
+            static_cast<VAddr>(std::max<s64>(0LL, static_cast<s64>(start_address - size)));
+        const IntervalType search_interval{search_base, search_base + 1};
+        auto it = common_ranges.lower_bound(search_interval);
+        // Nothing at or past the search base: fall back to scanning from the first interval
+        if (it == common_ranges.end()) {
+            it = common_ranges.begin();
+        }
+        for (; it != common_ranges.end(); ++it) {
+            if (it->lower() >= end_address) {
+                break;
+            }
+            if (it->upper() <= start_address) {
+                continue;
+            }
+            // Clip the interval to the requested range before reporting it
+            const VAddr range_begin = std::max<VAddr>(it->lower(), start_address);
+            const VAddr range_end = std::min<VAddr>(it->upper(), end_address);
+            func(range_begin, range_end);
+        }
+    }
+
     static bool IsRangeGranular(VAddr cpu_addr, size_t size) {
         return (cpu_addr & ~Core::Memory::PAGE_MASK) ==
                ((cpu_addr + size) & ~Core::Memory::PAGE_MASK);
@@ -207,12 +277,16 @@ private:
 
     void BindHostGraphicsStorageBuffers(size_t stage);
 
+    void BindHostGraphicsTextureBuffers(size_t stage);
+
     void BindHostTransformFeedbackBuffers();
 
     void BindHostComputeUniformBuffers();
 
     void BindHostComputeStorageBuffers();
 
+    void BindHostComputeTextureBuffers();
+
     void DoUpdateGraphicsBuffers(bool is_indexed);
 
     void DoUpdateComputeBuffers();
@@ -227,6 +301,8 @@ private:
 
     void UpdateStorageBuffers(size_t stage);
 
+    void UpdateTextureBuffers(size_t stage);
+
     void UpdateTransformFeedbackBuffers();
 
     void UpdateTransformFeedbackBuffer(u32 index);
@@ -235,6 +311,8 @@ private:
 
     void UpdateComputeStorageBuffers();
 
+    void UpdateComputeTextureBuffers();
+
     void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size);
 
     [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size);
@@ -272,24 +350,26 @@ private:
 
     void DeleteBuffer(BufferId buffer_id);
 
-    void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id);
-
     void NotifyBufferDeletion();
 
     [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const;
 
+    [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size,
+                                                               PixelFormat format);
+
     [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size);
 
     [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity);
 
     [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;
 
+    void ClearDownload(IntervalType subtract_interval);
+
     VideoCore::RasterizerInterface& rasterizer;
     Tegra::Engines::Maxwell3D& maxwell3d;
     Tegra::Engines::KeplerCompute& kepler_compute;
     Tegra::MemoryManager& gpu_memory;
     Core::Memory::Memory& cpu_memory;
-    Runtime& runtime;
 
     SlotVector<Buffer> slot_buffers;
     DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
@@ -300,20 +380,30 @@ private:
     std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers;
     std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers;
     std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers;
+    std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers;
     std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers;
 
     std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers;
     std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers;
+    std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS> compute_texture_buffers;
+
+    std::array<u32, NUM_STAGES> enabled_uniform_buffer_masks{};
+    u32 enabled_compute_uniform_buffer_mask = 0;
 
-    std::array<u32, NUM_STAGES> enabled_uniform_buffers{};
-    u32 enabled_compute_uniform_buffers = 0;
+    const UniformBufferSizes* uniform_buffer_sizes{};
+    const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{};
 
     std::array<u32, NUM_STAGES> enabled_storage_buffers{};
     std::array<u32, NUM_STAGES> written_storage_buffers{};
     u32 enabled_compute_storage_buffers = 0;
     u32 written_compute_storage_buffers = 0;
 
-    std::array<u32, NUM_STAGES> fast_bound_uniform_buffers{};
+    std::array<u32, NUM_STAGES> enabled_texture_buffers{};
+    std::array<u32, NUM_STAGES> written_texture_buffers{};
+    std::array<u32, NUM_STAGES> image_texture_buffers{};
+    u32 enabled_compute_texture_buffers = 0;
+    u32 written_compute_texture_buffers = 0;
+    u32 image_compute_texture_buffers = 0;
 
     std::array<u32, 16> uniform_cache_hits{};
     std::array<u32, 16> uniform_cache_shots{};
@@ -324,12 +414,16 @@ private:
 
     std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty>
         dirty_uniform_buffers{};
+    std::conditional_t<IS_OPENGL, std::array<u32, NUM_STAGES>, Empty> fast_bound_uniform_buffers{};
+    std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS,
+                       std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>, Empty>
+        uniform_buffer_binding_sizes{};
 
     std::vector<BufferId> cached_write_buffer_ids;
 
-    // TODO: This data structure is not optimal and it should be reworked
-    std::vector<BufferId> uncommitted_downloads;
-    std::deque<std::vector<BufferId>> committed_downloads;
+    IntervalSet uncommitted_ranges;
+    IntervalSet common_ranges;
+    std::deque<IntervalSet> committed_ranges;
 
     size_t immediate_buffer_capacity = 0;
     std::unique_ptr<u8[]> immediate_buffer_alloc;
@@ -347,11 +441,12 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
                             Tegra::Engines::KeplerCompute& kepler_compute_,
                             Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
                             Runtime& runtime_)
-    : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_},
-      gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} {
+    : runtime{runtime_}, rasterizer{rasterizer_}, maxwell3d{maxwell3d_},
+      kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_} {
     // Ensure the first slot is used for the null buffer
     void(slot_buffers.insert(runtime, NullBufferParams{}));
     deletion_iterator = slot_buffers.end();
+    common_ranges.clear();
 }
 
 template <class P>
@@ -422,6 +517,97 @@ void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) {
 }
 
 template <class P>
+void BufferCache<P>::ClearDownload(IntervalType subtract_interval) {
+    // Remove the interval from the pending set and from every committed set
+    const auto drop = [&](IntervalSet& ranges) { ranges.subtract(subtract_interval); };
+    drop(uncommitted_ranges);
+    std::for_each(committed_ranges.begin(), committed_ranges.end(), drop);
+}
+
+template <class P>
+bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount) {
+    const std::optional<VAddr> cpu_src_address = gpu_memory.GpuToCpuAddress(src_address);
+    const std::optional<VAddr> cpu_dest_address = gpu_memory.GpuToCpuAddress(dest_address);
+    if (!cpu_src_address || !cpu_dest_address) {
+        return false; // One of the GPU addresses did not translate to a CPU address
+    }
+    const bool source_dirty = IsRegionRegistered(*cpu_src_address, amount);
+    const bool dest_dirty = IsRegionRegistered(*cpu_dest_address, amount);
+    if (!source_dirty && !dest_dirty) {
+        return false; // Neither range overlaps a registered buffer
+    }
+
+    const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount};
+    ClearDownload(subtract_interval); // Drops the destination from the download sets
+
+    BufferId buffer_a;
+    BufferId buffer_b;
+    do {
+        has_deleted_buffers = false;
+        buffer_a = FindBuffer(*cpu_src_address, static_cast<u32>(amount));
+        buffer_b = FindBuffer(*cpu_dest_address, static_cast<u32>(amount));
+    } while (has_deleted_buffers); // Retry while the flag was raised during the lookups
+    auto& src_buffer = slot_buffers[buffer_a];
+    auto& dest_buffer = slot_buffers[buffer_b];
+    SynchronizeBuffer(src_buffer, *cpu_src_address, static_cast<u32>(amount));
+    SynchronizeBuffer(dest_buffer, *cpu_dest_address, static_cast<u32>(amount));
+    std::array copies{BufferCopy{
+        .src_offset = src_buffer.Offset(*cpu_src_address),
+        .dst_offset = dest_buffer.Offset(*cpu_dest_address),
+        .size = amount,
+    }};
+
+    boost::container::small_vector<IntervalType, 4> tmp_intervals;
+    auto mirror = [&](VAddr base_address, VAddr base_address_end) {
+        const u64 size = base_address_end - base_address;
+        const VAddr diff = base_address - *cpu_src_address; // Offset inside the source range
+        const VAddr new_base_address = *cpu_dest_address + diff;
+        const IntervalType add_interval{new_base_address, new_base_address + size};
+        uncommitted_ranges.add(add_interval); // Same span, relocated to the destination
+        tmp_intervals.push_back(add_interval);
+    };
+    ForEachWrittenRange(*cpu_src_address, amount, mirror);
+    // This subtraction in this order is important for overlapping copies.
+    common_ranges.subtract(subtract_interval);
+    bool atleast_1_download = tmp_intervals.size() != 0;
+    for (const IntervalType add_interval : tmp_intervals) {
+        common_ranges.add(add_interval);
+    }
+
+    runtime.CopyBuffer(dest_buffer, src_buffer, copies);
+    if (atleast_1_download) {
+        dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount);
+    }
+    std::vector<u8> tmp_buffer(amount); // The same bytes are also copied in CPU memory
+    cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount);
+    cpu_memory.WriteBlockUnsafe(*cpu_dest_address, tmp_buffer.data(), amount);
+    return true;
+}
+
+template <class P>
+bool BufferCache<P>::DMAClear(GPUVAddr dst_address, u64 amount, u32 value) {
+    const std::optional<VAddr> cpu_dst_address = gpu_memory.GpuToCpuAddress(dst_address);
+    if (!cpu_dst_address) {
+        return false;
+    }
+    // amount counts u32 words; every range below must be expressed in bytes
+    const size_t size = amount * sizeof(u32);
+    if (!IsRegionRegistered(*cpu_dst_address, size)) {
+        return false;
+    }
+
+    const IntervalType subtract_interval{*cpu_dst_address, *cpu_dst_address + size};
+    ClearDownload(subtract_interval);
+    common_ranges.subtract(subtract_interval);
+
+    const BufferId buffer = FindBuffer(*cpu_dst_address, static_cast<u32>(size));
+    auto& dest_buffer = slot_buffers[buffer];
+    const u32 offset = dest_buffer.Offset(*cpu_dst_address);
+    runtime.ClearBuffer(dest_buffer, offset, size, value);
+    return true;
+}
+
+template <class P>
 void BufferCache<P>::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
                                                u32 size) {
     const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
@@ -476,6 +662,7 @@ void BufferCache<P>::BindHostStageBuffers(size_t stage) {
     MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
     BindHostGraphicsUniformBuffers(stage);
     BindHostGraphicsStorageBuffers(stage);
+    BindHostGraphicsTextureBuffers(stage);
 }
 
 template <class P>
@@ -483,21 +670,30 @@ void BufferCache<P>::BindHostComputeBuffers() {
     MICROPROFILE_SCOPE(GPU_BindUploadBuffers);
     BindHostComputeUniformBuffers();
     BindHostComputeStorageBuffers();
+    BindHostComputeTextureBuffers();
 }
 
 template <class P>
-void BufferCache<P>::SetEnabledUniformBuffers(size_t stage, u32 enabled) {
+void BufferCache<P>::SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask,
+                                            const UniformBufferSizes* sizes) {
     if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
-        if (enabled_uniform_buffers[stage] != enabled) {
-            dirty_uniform_buffers[stage] = ~u32{0};
+        if (enabled_uniform_buffer_masks != mask) { // Any stage mask differs from the stored one
+            if constexpr (IS_OPENGL) {
+                fast_bound_uniform_buffers.fill(0);
+            }
+            dirty_uniform_buffers.fill(~u32{0}); // Every binding of every stage becomes dirty
+            uniform_buffer_binding_sizes.fill({}); // Zero the recorded binding sizes
         }
     }
-    enabled_uniform_buffers[stage] = enabled;
+    enabled_uniform_buffer_masks = mask;
+    uniform_buffer_sizes = sizes; // Only the pointer is kept; it is dereferenced when binding
 }
 
 template <class P>
-void BufferCache<P>::SetEnabledComputeUniformBuffers(u32 enabled) {
-    enabled_compute_uniform_buffers = enabled;
+void BufferCache<P>::SetComputeUniformBufferState(u32 mask,
+                                                  const ComputeUniformBufferSizes* sizes) {
+    enabled_compute_uniform_buffer_mask = mask;
+    compute_uniform_buffer_sizes = sizes; // Only the pointer is kept, not a copy of the sizes
 }
 
 template <class P>
@@ -518,9 +714,29 @@ void BufferCache<P>::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index,
 }
 
 template <class P>
+void BufferCache<P>::UnbindGraphicsTextureBuffers(size_t stage) {
+    // Clear the enabled, written and image masks of this stage together
+    enabled_texture_buffers[stage] = written_texture_buffers[stage] =
+        image_texture_buffers[stage] = 0;
+}
+
+template <class P>
+void BufferCache<P>::BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr,
+                                               u32 size, PixelFormat format, bool is_written,
+                                               bool is_image) {
+    // Flag the slot in each mask it belongs to, then resolve its binding
+    const u32 bit = 1U << tbo_index;
+    enabled_texture_buffers[stage] |= bit;
+    written_texture_buffers[stage] |= is_written ? bit : 0U;
+    image_texture_buffers[stage] |= (SEPARATE_IMAGE_BUFFERS_BINDINGS && is_image) ? bit : 0U;
+    texture_buffers[stage][tbo_index] = GetTextureBufferBinding(gpu_addr, size, format);
+}
+
+template <class P>
 void BufferCache<P>::UnbindComputeStorageBuffers() {
     enabled_compute_storage_buffers = 0;
     written_compute_storage_buffers = 0;
+    image_compute_texture_buffers = 0; // NOTE(review): texture-buffer mask reset here - confirm
 }
 
 template <class P>
@@ -538,6 +754,24 @@ void BufferCache<P>::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index,
 }
 
 template <class P>
+void BufferCache<P>::UnbindComputeTextureBuffers() {
+    // Clear the enabled, written and image masks for compute together
+    enabled_compute_texture_buffers = written_compute_texture_buffers =
+        image_compute_texture_buffers = 0;
+}
+
+template <class P>
+void BufferCache<P>::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size,
+                                              PixelFormat format, bool is_written, bool is_image) {
+    // Flag the slot in each mask it belongs to, then resolve its binding
+    const u32 bit = 1U << tbo_index;
+    enabled_compute_texture_buffers |= bit;
+    written_compute_texture_buffers |= is_written ? bit : 0U;
+    image_compute_texture_buffers |= (SEPARATE_IMAGE_BUFFERS_BINDINGS && is_image) ? bit : 0U;
+    compute_texture_buffers[tbo_index] = GetTextureBufferBinding(gpu_addr, size, format);
+}
+
+template <class P>
 void BufferCache<P>::FlushCachedWrites() {
     for (const BufferId buffer_id : cached_write_buffer_ids) {
         slot_buffers[buffer_id].FlushCachedWrites();
@@ -547,29 +781,30 @@ void BufferCache<P>::FlushCachedWrites() {
 
 template <class P>
 bool BufferCache<P>::HasUncommittedFlushes() const noexcept {
-    return !uncommitted_downloads.empty();
+    return !(uncommitted_ranges.empty() && committed_ranges.empty());
 }
 
 template <class P>
-bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
-    return !committed_downloads.empty() && !committed_downloads.front().empty();
+void BufferCache<P>::AccumulateFlushes() {
+    if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) {
+        uncommitted_ranges.clear();
+        return;
+    }
+    if (!uncommitted_ranges.empty()) {
+        // Exchange rather than a bare move so the pending set is guaranteed to end up empty
+        committed_ranges.emplace_back(std::exchange(uncommitted_ranges, {}));
+    }
 }
 
 template <class P>
-void BufferCache<P>::CommitAsyncFlushes() {
-    // This is intentionally passing the value by copy
-    committed_downloads.push_front(uncommitted_downloads);
-    uncommitted_downloads.clear();
+bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept {
+    return false; // This implementation never asks the caller to wait
 }
 
 template <class P>
-void BufferCache<P>::PopAsyncFlushes() {
-    if (committed_downloads.empty()) {
-        return;
-    }
-    auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); });
-    const std::span<const BufferId> download_ids = committed_downloads.back();
-    if (download_ids.empty()) {
+void BufferCache<P>::CommitAsyncFlushesHigh() {
+    AccumulateFlushes();
+    if (committed_ranges.empty()) {
         return;
     }
     MICROPROFILE_SCOPE(GPU_DownloadMemory);
@@ -577,20 +812,43 @@ void BufferCache<P>::PopAsyncFlushes() {
     boost::container::small_vector<std::pair<BufferCopy, BufferId>, 1> downloads;
     u64 total_size_bytes = 0;
     u64 largest_copy = 0;
-    for (const BufferId buffer_id : download_ids) {
-        slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) {
-            downloads.push_back({
-                BufferCopy{
-                    .src_offset = range_offset,
-                    .dst_offset = total_size_bytes,
-                    .size = range_size,
-                },
-                buffer_id,
+    for (const IntervalSet& intervals : committed_ranges) {
+        for (auto& interval : intervals) {
+            const std::size_t size = interval.upper() - interval.lower();
+            const VAddr cpu_addr = interval.lower();
+            ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) {
+                boost::container::small_vector<BufferCopy, 1> copies;
+                buffer.ForEachDownloadRangeAndClear(
+                    cpu_addr, size, [&](u64 range_offset, u64 range_size) {
+                        const VAddr buffer_addr = buffer.CpuAddr();
+                        const auto add_download = [&](VAddr start, VAddr end) {
+                            const u64 new_offset = start - buffer_addr;
+                            const u64 new_size = end - start;
+                            downloads.push_back({
+                                BufferCopy{
+                                    .src_offset = new_offset,
+                                    .dst_offset = total_size_bytes,
+                                    .size = new_size,
+                                },
+                                buffer_id,
+                            });
+                            // Align up to avoid cache conflicts
+                            constexpr u64 align = 256ULL;
+                            constexpr u64 mask = ~(align - 1ULL);
+                            total_size_bytes += (new_size + align - 1) & mask;
+                            largest_copy = std::max(largest_copy, new_size);
+                        };
+
+                        const VAddr start_address = buffer_addr + range_offset;
+                        const VAddr end_address = start_address + range_size;
+                        ForEachWrittenRange(start_address, range_size, add_download);
+                        const IntervalType subtract_interval{start_address, end_address};
+                        common_ranges.subtract(subtract_interval);
+                    });
             });
-            total_size_bytes += range_size;
-            largest_copy = std::max(largest_copy, range_size);
-        });
+        }
     }
+    committed_ranges.clear();
     if (downloads.empty()) {
         return;
     }
@@ -623,6 +881,19 @@ void BufferCache<P>::PopAsyncFlushes() {
 }
 
 template <class P>
+void BufferCache<P>::CommitAsyncFlushes() {
+    if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) {
+        uncommitted_ranges.clear();
+        committed_ranges.clear();
+        return; // Tracked ranges are simply discarded below high accuracy
+    }
+    CommitAsyncFlushesHigh();
+}
+
+template <class P>
+void BufferCache<P>::PopAsyncFlushes() {} // No-op in this implementation
+
+template <class P>
 bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
     const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
     for (u64 page = addr >> PAGE_BITS; page < page_end;) {
@@ -642,6 +913,46 @@ bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) {
 }
 
 template <class P>
+bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) {
+    const VAddr end_addr = addr + size;
+    const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE);
+    for (u64 page = addr >> PAGE_BITS; page < page_end;) {
+        const BufferId buffer_id = page_table[page];
+        if (!buffer_id) {
+            ++page; // No buffer is mapped on this page
+            continue;
+        }
+        Buffer& buffer = slot_buffers[buffer_id];
+        const VAddr buf_start_addr = buffer.CpuAddr();
+        const VAddr buf_end_addr = buf_start_addr + buffer.SizeBytes();
+        if (buf_start_addr < end_addr && addr < buf_end_addr) {
+            return true; // The buffer overlaps [addr, end_addr)
+        }
+        page = Common::DivCeil(buf_end_addr, PAGE_SIZE); // Resume right after this buffer only
+    }
+    return false;
+}
+
+template <class P>
+bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) {
+    const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
+    for (u64 page = addr >> PAGE_BITS; page < page_end;) {
+        const BufferId buffer_id = page_table[page];
+        if (!buffer_id) {
+            ++page;
+            continue;
+        }
+        Buffer& buffer = slot_buffers[buffer_id];
+        if (buffer.IsRegionCpuModified(addr, size)) {
+            return true;
+        }
+        // Continue from the first page past the end of this buffer
+        page = Common::DivCeil(buffer.CpuAddr() + buffer.SizeBytes(), PAGE_SIZE);
+    }
+    return false;
+}
+
+template <class P>
 void BufferCache<P>::BindHostIndexBuffer() {
     Buffer& buffer = slot_buffers[index_buffer.buffer_id];
     TouchBuffer(buffer);
@@ -649,7 +960,9 @@ void BufferCache<P>::BindHostIndexBuffer() {
     const u32 size = index_buffer.size;
     SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
     if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
-        runtime.BindIndexBuffer(buffer, offset, size);
+        const u32 new_offset = offset + maxwell3d.regs.index_array.first *
+                                            maxwell3d.regs.index_array.FormatSizeInBytes();
+        runtime.BindIndexBuffer(buffer, new_offset, size);
     } else {
         runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format,
                                 maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count,
@@ -683,7 +996,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffers(size_t stage) {
         dirty = std::exchange(dirty_uniform_buffers[stage], 0);
     }
     u32 binding_index = 0;
-    ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
+    ForEachEnabledBit(enabled_uniform_buffer_masks[stage], [&](u32 index) {
         const bool needs_bind = ((dirty >> index) & 1) != 0;
         BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind);
         if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
@@ -697,7 +1010,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
                                                    bool needs_bind) {
     const Binding& binding = uniform_buffers[stage][index];
     const VAddr cpu_addr = binding.cpu_addr;
-    const u32 size = binding.size;
+    const u32 size = std::min(binding.size, (*uniform_buffer_sizes)[stage][index]);
     Buffer& buffer = slot_buffers[binding.buffer_id];
     TouchBuffer(buffer);
     const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID &&
@@ -707,8 +1020,13 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
         if constexpr (IS_OPENGL) {
             if (runtime.HasFastBufferSubData()) {
                 // Fast path for Nvidia
-                if (!HasFastUniformBufferBound(stage, binding_index)) {
+                const bool should_fast_bind =
+                    !HasFastUniformBufferBound(stage, binding_index) ||
+                    uniform_buffer_binding_sizes[stage][binding_index] != size;
+                if (should_fast_bind) {
                     // We only have to bind when the currently bound buffer is not the fast version
+                    fast_bound_uniform_buffers[stage] |= 1U << binding_index;
+                    uniform_buffer_binding_sizes[stage][binding_index] = size;
                     runtime.BindFastUniformBuffer(stage, binding_index, size);
                 }
                 const auto span = ImmediateBufferWithData(cpu_addr, size);
@@ -716,8 +1034,10 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
                 return;
             }
         }
-        fast_bound_uniform_buffers[stage] |= 1U << binding_index;
-
+        if constexpr (IS_OPENGL) {
+            fast_bound_uniform_buffers[stage] |= 1U << binding_index;
+            uniform_buffer_binding_sizes[stage][binding_index] = size;
+        }
         // Stream buffer path to avoid stalling on non-Nvidia drivers or Vulkan
         const std::span<u8> span = runtime.BindMappedUniformBuffer(stage, binding_index, size);
         cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size);
@@ -730,14 +1050,27 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
     }
     ++uniform_cache_shots[0];
 
-    if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) {
-        // Skip binding if it's not needed and if the bound buffer is not the fast version
-        // This exists to avoid instances where the fast buffer is bound and a GPU write happens
+    // Skip binding if it's not needed and if the bound buffer is not the fast version
+    // This exists to avoid instances where the fast buffer is bound and a GPU write happens
+    needs_bind |= HasFastUniformBufferBound(stage, binding_index);
+    if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+        needs_bind |= uniform_buffer_binding_sizes[stage][binding_index] != size;
+    }
+    if (!needs_bind) {
         return;
     }
-    fast_bound_uniform_buffers[stage] &= ~(1U << binding_index);
-
     const u32 offset = buffer.Offset(cpu_addr);
+    if constexpr (IS_OPENGL) {
+        // Fast buffer will be unbound
+        fast_bound_uniform_buffers[stage] &= ~(1U << binding_index);
+
+        // Mark the index as dirty if offset doesn't match
+        const bool is_copy_bind = offset != 0 && !runtime.SupportsNonZeroUniformOffset();
+        dirty_uniform_buffers[stage] |= (is_copy_bind ? 1U : 0U) << index;
+    }
+    if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
+        uniform_buffer_binding_sizes[stage][binding_index] = size;
+    }
     if constexpr (NEEDS_BIND_UNIFORM_INDEX) {
         runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size);
     } else {
@@ -767,6 +1100,28 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
 }
 
 template <class P>
+void BufferCache<P>::BindHostGraphicsTextureBuffers(size_t stage) { // Binds every enabled texture buffer of `stage` through the runtime
+    ForEachEnabledBit(enabled_texture_buffers[stage], [&](u32 index) {
+        const TextureBufferBinding& binding = texture_buffers[stage][index];
+        Buffer& buffer = slot_buffers[binding.buffer_id];
+        const u32 size = binding.size;
+        SynchronizeBuffer(buffer, binding.cpu_addr, size); // Synchronize the bound range before it is bound
+
+        const u32 offset = buffer.Offset(binding.cpu_addr); // Presumably the binding's offset inside `buffer` - TODO confirm
+        const PixelFormat format = binding.format;
+        if constexpr (SEPARATE_IMAGE_BUFFERS_BINDINGS) { // Compile-time flag taken from the backend parameters P
+            if (((image_texture_buffers[stage] >> index) & 1) != 0) { // Bit `index` set: bind as an image buffer
+                runtime.BindImageBuffer(buffer, offset, size, format);
+            } else {
+                runtime.BindTextureBuffer(buffer, offset, size, format);
+            }
+        } else {
+            runtime.BindTextureBuffer(buffer, offset, size, format); // No separate image path: always a texture buffer
+        }
+    });
+}
+
+template <class P>
 void BufferCache<P>::BindHostTransformFeedbackBuffers() {
     if (maxwell3d.regs.tfb_enabled == 0) {
         return;
@@ -788,13 +1143,14 @@ void BufferCache<P>::BindHostComputeUniformBuffers() {
     if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
         // Mark all uniform buffers as dirty
         dirty_uniform_buffers.fill(~u32{0});
+        fast_bound_uniform_buffers.fill(0);
     }
     u32 binding_index = 0;
-    ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
+    ForEachEnabledBit(enabled_compute_uniform_buffer_mask, [&](u32 index) {
         const Binding& binding = compute_uniform_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
         TouchBuffer(buffer);
-        const u32 size = binding.size;
+        const u32 size = std::min(binding.size, (*compute_uniform_buffer_sizes)[index]);
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
         const u32 offset = buffer.Offset(binding.cpu_addr);
@@ -829,6 +1185,28 @@ void BufferCache<P>::BindHostComputeStorageBuffers() {
 }
 
 template <class P>
+void BufferCache<P>::BindHostComputeTextureBuffers() { // Binds every enabled compute texture buffer through the runtime
+    ForEachEnabledBit(enabled_compute_texture_buffers, [&](u32 index) {
+        const TextureBufferBinding& binding = compute_texture_buffers[index];
+        Buffer& buffer = slot_buffers[binding.buffer_id];
+        const u32 size = binding.size;
+        SynchronizeBuffer(buffer, binding.cpu_addr, size); // Synchronize the bound range before it is bound
+
+        const u32 offset = buffer.Offset(binding.cpu_addr); // Presumably the binding's offset inside `buffer` - TODO confirm
+        const PixelFormat format = binding.format;
+        if constexpr (SEPARATE_IMAGE_BUFFERS_BINDINGS) { // Compile-time flag taken from the backend parameters P
+            if (((image_compute_texture_buffers >> index) & 1) != 0) { // Bit `index` set: bind as an image buffer
+                runtime.BindImageBuffer(buffer, offset, size, format);
+            } else {
+                runtime.BindTextureBuffer(buffer, offset, size, format);
+            }
+        } else {
+            runtime.BindTextureBuffer(buffer, offset, size, format); // No separate image path: always a texture buffer
+        }
+    });
+}
+
+template <class P>
 void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
     if (is_indexed) {
         UpdateIndexBuffer();
@@ -838,6 +1216,7 @@ void BufferCache<P>::DoUpdateGraphicsBuffers(bool is_indexed) {
     for (size_t stage = 0; stage < NUM_STAGES; ++stage) {
         UpdateUniformBuffers(stage);
         UpdateStorageBuffers(stage);
+        UpdateTextureBuffers(stage);
     }
 }
 
@@ -845,6 +1224,7 @@ template <class P>
 void BufferCache<P>::DoUpdateComputeBuffers() {
     UpdateComputeUniformBuffers();
     UpdateComputeStorageBuffers();
+    UpdateComputeTextureBuffers(); // Also resolve buffer ids for the enabled compute texture buffers
 }
 
 template <class P>
@@ -863,7 +1243,7 @@ void BufferCache<P>::UpdateIndexBuffer() {
     const GPUVAddr gpu_addr_end = index_array.EndAddress();
     const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin);
     const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin);
-    const u32 draw_size = index_array.count * index_array.FormatSizeInBytes();
+    const u32 draw_size = (index_array.count + index_array.first) * index_array.FormatSizeInBytes();
     const u32 size = std::min(address_size, draw_size);
     if (size == 0 || !cpu_addr) {
         index_buffer = NULL_BINDING;
@@ -914,7 +1294,7 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) {
 
 template <class P>
 void BufferCache<P>::UpdateUniformBuffers(size_t stage) {
-    ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) {
+    ForEachEnabledBit(enabled_uniform_buffer_masks[stage], [&](u32 index) {
         Binding& binding = uniform_buffers[stage][index];
         if (binding.buffer_id) {
             // Already updated
@@ -945,6 +1325,18 @@ void BufferCache<P>::UpdateStorageBuffers(size_t stage) {
 }
 
 template <class P>
+void BufferCache<P>::UpdateTextureBuffers(size_t stage) { // Resolves a buffer id for every enabled texture buffer of `stage`
+    ForEachEnabledBit(enabled_texture_buffers[stage], [&](u32 index) {
+        Binding& binding = texture_buffers[stage][index]; // NOTE(review): elements are read as TextureBufferBinding elsewhere; relies on conversion to Binding& - confirm
+        binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); // Overwrites any previously stored id
+        // Mark buffer as written if needed
+        if (((written_texture_buffers[stage] >> index) & 1) != 0) { // Bit `index` set in this stage's written mask
+            MarkWrittenBuffer(binding.buffer_id, binding.cpu_addr, binding.size);
+        }
+    });
+}
+
+template <class P>
 void BufferCache<P>::UpdateTransformFeedbackBuffers() {
     if (maxwell3d.regs.tfb_enabled == 0) {
         return;
@@ -975,7 +1367,7 @@ void BufferCache<P>::UpdateTransformFeedbackBuffer(u32 index) {
 
 template <class P>
 void BufferCache<P>::UpdateComputeUniformBuffers() {
-    ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) {
+    ForEachEnabledBit(enabled_compute_uniform_buffer_mask, [&](u32 index) {
         Binding& binding = compute_uniform_buffers[index];
         binding = NULL_BINDING;
         const auto& launch_desc = kepler_compute.launch_description;
@@ -996,11 +1388,22 @@ void BufferCache<P>::UpdateComputeStorageBuffers() {
     ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
         // Resolve buffer
         Binding& binding = compute_storage_buffers[index];
-        const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size);
-        binding.buffer_id = buffer_id;
+        binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size);
         // Mark as written if needed
         if (((written_compute_storage_buffers >> index) & 1) != 0) {
-            MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size);
+            MarkWrittenBuffer(binding.buffer_id, binding.cpu_addr, binding.size);
+        }
+    });
+}
+
+template <class P>
+void BufferCache<P>::UpdateComputeTextureBuffers() { // Resolves a buffer id for every enabled compute texture buffer
+    ForEachEnabledBit(enabled_compute_texture_buffers, [&](u32 index) {
+        Binding& binding = compute_texture_buffers[index]; // NOTE(review): elements are read as TextureBufferBinding elsewhere; relies on conversion to Binding& - confirm
+        binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); // Overwrites any previously stored id
+        // Mark as written if needed
+        if (((written_compute_texture_buffers >> index) & 1) != 0) { // Bit `index` set in the compute written mask
+            MarkWrittenBuffer(binding.buffer_id, binding.cpu_addr, binding.size);
         }
     });
 }
@@ -1010,16 +1413,16 @@ void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s
     Buffer& buffer = slot_buffers[buffer_id];
     buffer.MarkRegionAsGpuModified(cpu_addr, size);
 
-    const bool is_accuracy_high = Settings::IsGPULevelHigh();
+    const IntervalType base_interval{cpu_addr, cpu_addr + size};
+    common_ranges.add(base_interval);
+
+    const bool is_accuracy_high =
+        Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High;
     const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
-    if (!is_accuracy_high || !is_async) {
+    if (!is_async && !is_accuracy_high) {
         return;
     }
-    if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) {
-        // Already inserted
-        return;
-    }
-    uncommitted_downloads.push_back(buffer_id);
+    uncommitted_ranges.add(base_interval);
 }
 
 template <class P>
@@ -1103,7 +1506,6 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
     if (!copies.empty()) {
         runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies);
     }
-    ReplaceBufferDownloads(overlap_id, new_buffer_id);
     DeleteBuffer(overlap_id);
 }
 
@@ -1244,14 +1646,29 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si
     boost::container::small_vector<BufferCopy, 1> copies;
     u64 total_size_bytes = 0;
     u64 largest_copy = 0;
-    buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
-        copies.push_back(BufferCopy{
-            .src_offset = range_offset,
-            .dst_offset = total_size_bytes,
-            .size = range_size,
-        });
-        total_size_bytes += range_size;
-        largest_copy = std::max(largest_copy, range_size);
+    buffer.ForEachDownloadRangeAndClear(cpu_addr, size, [&](u64 range_offset, u64 range_size) {
+        const VAddr buffer_addr = buffer.CpuAddr();
+        const auto add_download = [&](VAddr start, VAddr end) {
+            const u64 new_offset = start - buffer_addr;
+            const u64 new_size = end - start;
+            copies.push_back(BufferCopy{
+                .src_offset = new_offset,
+                .dst_offset = total_size_bytes,
+                .size = new_size,
+            });
+            // Align up to avoid cache conflicts
+            constexpr u64 align = 256ULL;
+            constexpr u64 mask = ~(align - 1ULL);
+            total_size_bytes += (new_size + align - 1) & mask;
+            largest_copy = std::max(largest_copy, new_size);
+        };
+
+        const VAddr start_address = buffer_addr + range_offset;
+        const VAddr end_address = start_address + range_size;
+        ForEachWrittenRange(start_address, range_size, add_download);
+        const IntervalType subtract_interval{start_address, end_address};
+        ClearDownload(subtract_interval);
+        common_ranges.subtract(subtract_interval);
     });
     if (total_size_bytes == 0) {
         return;
@@ -1316,21 +1733,10 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) {
 }
 
 template <class P>
-void BufferCache<P>::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) {
-    const auto replace = [old_buffer_id, new_buffer_id](std::vector<BufferId>& buffers) {
-        std::ranges::replace(buffers, old_buffer_id, new_buffer_id);
-        if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) {
-            buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end());
-        }
-    };
-    replace(uncommitted_downloads);
-    std::ranges::for_each(committed_downloads, replace);
-}
-
-template <class P>
 void BufferCache<P>::NotifyBufferDeletion() {
     if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
         dirty_uniform_buffers.fill(~u32{0});
+        uniform_buffer_binding_sizes.fill({});
     }
     auto& flags = maxwell3d.dirty.flags;
     flags[Dirty::IndexBuffer] = true;
@@ -1349,21 +1755,34 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s
     if (!cpu_addr || size == 0) {
         return NULL_BINDING;
     }
-    // HACK(Rodrigo): This is the number of bytes bound in host beyond the guest API's range.
-    // It exists due to some games like Astral Chain operate out of bounds.
-    // Binding the whole map range would be technically correct, but games have large maps that make
-    // this approach unaffordable for now.
-    static constexpr u32 arbitrary_extra_bytes = 0xc000;
-    const u32 bytes_to_map_end = static_cast<u32>(gpu_memory.BytesToMapEnd(gpu_addr));
     const Binding binding{
         .cpu_addr = *cpu_addr,
-        .size = std::min(size + arbitrary_extra_bytes, bytes_to_map_end),
+        .size = size,
         .buffer_id = BufferId{},
     };
     return binding;
 }
 
 template <class P>
+typename BufferCache<P>::TextureBufferBinding BufferCache<P>::GetTextureBufferBinding(
+    GPUVAddr gpu_addr, u32 size, PixelFormat format) { // Builds a texture buffer binding for a GPU range, or a null binding
+    const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
+    TextureBufferBinding binding;
+    if (!cpu_addr || size == 0) { // Address translation yielded no value, or the range is empty: null binding
+        binding.cpu_addr = 0;
+        binding.size = 0;
+        binding.buffer_id = NULL_BUFFER_ID;
+        binding.format = PixelFormat::Invalid;
+    } else {
+        binding.cpu_addr = *cpu_addr;
+        binding.size = size;
+        binding.buffer_id = BufferId{}; // Default-constructed id; presumably assigned later by the Update*TextureBuffers passes - confirm
+        binding.format = format;
+    }
+    return binding;
+}
+
+template <class P>
 std::span<const u8> BufferCache<P>::ImmediateBufferWithData(VAddr cpu_addr, size_t size) {
     u8* const base_pointer = cpu_memory.GetPointer(cpu_addr);
     if (IsRangeGranular(cpu_addr, size) ||